Hi all,
I am modeling count data that is zero-inflated. To model the data, I am using hurdle() function from the R pscl package. I am also refitting the model after the removal of outlier points. I have written a function to remove outliers from the data set. However, I think there is a bug in the code because the nothing as been removed. I noticed that gc and map covariates are not significant for non-zero part of the model, when I omit them I get better results. When I set map as an offset, the gc is very significant for the zero part of the model. I am wondering if this happening because these two covariates explain so little of variance.
Here is the code:
library(pscl)
library(VGAM)
library(data.table)
library(splines)
# Define function
remove.outliers <- function(dat,mod,f=qzanegbin){
q1 <- f(0.025,size = mod$theta, munb=predict(mod,newdata = dat,
dispersion = 1/mod$theta,type='response'),pobs=predprob(mod,newdata=dat)[,1])
q2 <- f(0.975,size = mod$theta, munb=predict(mod,newdata = dat,
dispersion = 1/mod$theta,type='response'),pobs=predprob(mod,newdata=dat)[,1])
q <- sapply(seq(0,0.99,by=0.05),function(k) f(k,size = mod$theta,
munb=predict(mod,newdata = dat,dispersion = 1/mod$theta,type='response'),
pobs=predprob(mod,newdata=dat)[,1]))
H <- apply(q,1,function(x){1.5*IQR(x)})
counts <- dat$counts
counts[counts < q1 - H] <- NA
counts[counts > q2 + H] <- NA
new.dat <- dat[which(!is.na(counts)),]
return(new.dat)
}
# Main body of algorithm
x <- readRDS('WorkinProgress/Data.rds')
nrows <- nrow(x)
Intervals <- seq(0,2e6,by=500)
cls <- findInterval(x$D,Intervals,rightmost.closed = T)
# Binning the distances
bins <- split(seq_len(nrows),cls)
# Sampling 10% of data from each bin
idx <- unlist(lapply(bins,function(x){sample(x,floor(length(x)*0.01),replace = F)}))
dat <- x[idx,]
# Knots
knots <- quantile(dat$D,probs = c(0.25,0.5,0.75))
bdpts <- quantile(dat$D,probs=c(0,1))
df <- 6 # degrees of freedom
mod <- hurdle(counts ~ gc + map + bs(D,df=df,knots=c(knots),
Boundary.knots=c(bdpts))| gc + map + bs(D,df=df,knots=c(knots),
Boundary.knots=c(bdpts)),data=dat,dist='negbin')
# Remove outliers
new.dat <- remove.outliers(dat,mod)
# Refit the model
mod <- hurdle(counts ~ gc + map + bs(D,df=df,knots=c(knots),
Boundary.knots=c(bdpts))| gc + map + bs(D,df=df,knots=c(knots),
Boundary.knots=c(bdpts)),data=new.dat,dist='negbin')
> x
counts D gc map
1: 2 239 -1.6960761 -0.6069708
2: 2 4011 -1.1518852 -0.6399841
3: 3 8875 -0.7788370 -0.5767152
4: 0 14045 -1.1544460 -0.4697568
5: 0 17502 -1.4958545 -0.6374427
---
16053150: 7 16926 0.4619302 -0.6723117
16053151: 10 4429 0.6345688 -0.6723117
16053152: 4 7942 0.9654208 -0.6723117
16053153: 4 3309 1.4065476 -0.6723117
16053154: 20 307 1.2228625 -0.6723117
> length(which(x$counts==0))
[1] 14271361
cor(x,method = 'spearman')
counts D gc map
counts 1.000000000 -0.3445307605 0.003804754 -0.0034717774
D -0.344530761 1.0000000000 -0.006490156 -0.0003908146
gc 0.003804754 -0.0064901556 1.000000000 -0.1263421203
map -0.003471777 -0.0003908146 -0.126342120 1.0000000000
summary(mod)
Call:
hurdle(formula = counts ~ gc + map + bs(D, df = df, knots = c(knots), Boundary.knots = c(bdpts)) |
gc + map + bs(D, df = df, knots = c(knots), Boundary.knots = c(bdpts)), data = new.dat,
dist = "negbin")
Pearson residuals:
Min 1Q Median 3Q Max
-0.9878 -0.2939 -0.1931 -0.1546 25.157
sessionInfo()
R version 3.1.2 (2014-10-31)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] splines stats4 stats graphics grDevices utils datasets methods base
other attached packages:
[1] data.table_1.9.4 VGAM_0.9-7 pscl_1.4.8 lattice_0.20-30 MASS_7.3-39
loaded via a namespace (and not attached):
[1] chron_2.3-45 grid_3.1.2 plyr_1.8.1 Rcpp_0.11.5 reshape2_1.4.1 stringr_0.6.2
[7] tools_3.1.2
