Question: Modeling count data that is zero inflated with negative binomial hurdle regression
0
mrk.carty30 wrote:

Hi all,

I am modeling count data that is zero-inflated. To model the data, I am using hurdle() function from the R pscl package. I am also refitting the model after the removal of outlier points. I have written a function to remove outliers from the data set. However, I think there is a bug in the code because the nothing as been removed. I noticed that gc and map covariates are not significant for non-zero part of the model, when I omit them I get better results. When I set map as an offset, the gc is very significant for the zero part of the model.  I am wondering if this happening because these two covariates explain so little of variance.

Here is the code:

library(pscl)
library(VGAM)
library(data.table)
library(splines)

# Define function

remove.outliers <- function(dat,mod,f=qzanegbin){
q1 <- f(0.025,size = mod$theta, munb=predict(mod,newdata = dat, dispersion = 1/mod$theta,type='response'),pobs=predprob(mod,newdata=dat)[,1])
q2 <- f(0.975,size = mod$theta, munb=predict(mod,newdata = dat, dispersion = 1/mod$theta,type='response'),pobs=predprob(mod,newdata=dat)[,1])
q <- sapply(seq(0,0.99,by=0.05),function(k) f(k,size = mod$theta, munb=predict(mod,newdata = dat,dispersion = 1/mod$theta,type='response'),
pobs=predprob(mod,newdata=dat)[,1]))
H <- apply(q,1,function(x){1.5*IQR(x)})
counts <- dat$counts counts[counts < q1 - H] <- NA counts[counts > q2 + H] <- NA new.dat <- dat[which(!is.na(counts)),] return(new.dat) } # Main body of algorithm x <- readRDS('WorkinProgress/Data.rds') nrows <- nrow(x) Intervals <- seq(0,2e6,by=500) cls <- findInterval(x$D,Intervals,rightmost.closed = T)
# Binning the distances
bins <- split(seq_len(nrows),cls)
# Sampling 10% of data from each bin
idx <- unlist(lapply(bins,function(x){sample(x,floor(length(x)*0.01),replace = F)}))
dat <- x[idx,]
# Knots
knots <- quantile(dat$D,probs = c(0.25,0.5,0.75)) bdpts <- quantile(dat$D,probs=c(0,1))
df <- 6 # degrees of freedom

mod <- hurdle(counts ~ gc + map + bs(D,df=df,knots=c(knots),
Boundary.knots=c(bdpts))| gc + map + bs(D,df=df,knots=c(knots),
Boundary.knots=c(bdpts)),data=dat,dist='negbin')

# Remove outliers
new.dat <- remove.outliers(dat,mod)
# Refit the model
mod <- hurdle(counts ~ gc + map + bs(D,df=df,knots=c(knots),
Boundary.knots=c(bdpts))| gc + map + bs(D,df=df,knots=c(knots),
Boundary.knots=c(bdpts)),data=new.dat,dist='negbin')

> x
counts     D         gc        map
1:      2   239 -1.6960761 -0.6069708
2:      2  4011 -1.1518852 -0.6399841
3:      3  8875 -0.7788370 -0.5767152
4:      0 14045 -1.1544460 -0.4697568
5:      0 17502 -1.4958545 -0.6374427
---
16053150:      7 16926  0.4619302 -0.6723117
16053151:     10  4429  0.6345688 -0.6723117
16053152:      4  7942  0.9654208 -0.6723117
16053153:      4  3309  1.4065476 -0.6723117
16053154:     20   307  1.2228625 -0.6723117
> length(which(x\$counts==0))
 14271361

cor(x,method = 'spearman')
counts             D           gc           map
counts  1.000000000 -0.3445307605  0.003804754 -0.0034717774
D      -0.344530761  1.0000000000 -0.006490156 -0.0003908146
gc      0.003804754 -0.0064901556  1.000000000 -0.1263421203
map    -0.003471777 -0.0003908146 -0.126342120  1.0000000000

summary(mod)

Call:
hurdle(formula = counts ~ gc + map + bs(D, df = df, knots = c(knots), Boundary.knots = c(bdpts)) |
gc + map + bs(D, df = df, knots = c(knots), Boundary.knots = c(bdpts)), data = new.dat,
dist = "negbin")

Pearson residuals:
Min      1Q  Median      3Q     Max
-0.9878 -0.2939 -0.1931 -0.1546 25.157

sessionInfo()
R version 3.1.2 (2014-10-31)
Platform: x86_64-apple-darwin13.4.0 (64-bit)

locale:
 en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
 splines   stats4    stats     graphics  grDevices utils     datasets  methods   base

other attached packages:
 data.table_1.9.4 VGAM_0.9-7       pscl_1.4.8       lattice_0.20-30  MASS_7.3-39

loaded via a namespace (and not attached):
 chron_2.3-45   grid_3.1.2     plyr_1.8.1     Rcpp_0.11.5    reshape2_1.4.1 stringr_0.6.2
 tools_3.1.2  

outliers pscl • 1.5k views  modified 4.7 years ago by Sean Davis21k • written 4.7 years ago by mrk.carty30