This should reproduce the bully example results that are given in lecture. The results in the lecture notes were created using SAS. Some of the results below (i.e., marginal distribution graphs) seem to be a bit off.

library(MASS)  # for negative binomial
library(pscl)  # for zero inflated
## Warning: package 'pscl' was built under R version 4.0.5
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
setwd("D:/Dropbox/edps 589/4 glm")

bully <- read.table("bully_data.txt",header=TRUE)

Exploratory

Some basic descriptive statistics

(n <- length(bully$nom))
## [1] 291
(ybar <- mean(bully$nom))
## [1] 2.487973
(s <- sd(bully$nom))
## [1] 6.423746
(s2 <- var(bully$nom))
## [1] 41.26451
summary(bully$nom)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   2.488   2.000  50.000

Histogram of bully nominations

hist(bully$nom,
     main="Distribution of Number of Bully Nominations",
     xlab="Number of Nominations that a student is Bully",
     breaks=50,
     col="cyan")
text(41.2,200,"N = 291")     
text(40,190,"Mean = 2.49")   
text(41,180,"sd = 6.42")
text(41.1,170,"s2 = 41.26")
text(39.1,160,"min = 0")
text(39.1,150,"max = 50")

Histogram of bully nominations with null poisson overlay

y <- seq(1:51)-1
d.poisson <- dpois(y, ybar)  
     
hist(bully$nom, prob=TRUE, col="cyan", breaks=50, 
     main="Bully Nominations w/ Poisson Overlayed",
     xlab="Number of Bully Nominations")     
#prob=TRUE for probabilities not counts
lines(y,d.poisson, col="darkgreen", lty=2,  lwd=1.5)   # added a density estimate 

Poisson loglinear regression

Although we don’t expect this to fit…

mod1 <- glm(nom ~ bsc, data=bully, family=poisson)
summary(mod1)
## 
## Call:
## glm(formula = nom ~ bsc, family = poisson, data = bully)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -6.4995  -1.8246  -1.5952  -0.1552  11.6806  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.66308    0.08871  -7.475 7.75e-14 ***
## bsc          0.81434    0.03504  23.239  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 2205.2  on 290  degrees of freedom
## Residual deviance: 1774.6  on 289  degrees of freedom
## AIC: 2160.3
## 
## Number of Fisher Scoring iterations: 6

Let’s see how well this is doing

bully$pred1 <- fitted.values(mod1)
pred <- predict(mod1, newdata = NULL,
            type = c("link", "response", "terms"),
            se.fit = TRUE
        )
bully$lower1 <- exp(pred$fit  - 1.96*pred$se.fit)
bully$upper1 <- exp(pred$fit  + 1.96*pred$se.fit)
bully$pred1  <- exp(pred$fit)

bully.sort <- bully[order(bully$bsc),]
plot(bully.sort$bsc,bully.sort$nom,type='p',pch=20, col="blue",
     main='Data, Fitted Line, and 95% Confidence', 
     xlab='Bully Scale Score', ylab='Fitted Count')
lines(bully.sort$bsc,bully.sort$pred1,col="darkgreen",lty=1,lwd=2)
lines(bully.sort$bsc,bully.sort$lower1,col="darkgreen",lty=2)
lines(bully.sort$bsc,bully.sort$upper1,col="darkgreen",lty=2)

Other models – digression

Negative Binomial

library(MASS)

Fit model to individual level data

mod2 <- glm.nb(nom ~ bsc, data=bully)  # dispersion phi = 1/"disperson from R"
summary(mod2)
## 
## Call:
## glm.nb(formula = nom ~ bsc, data = bully, init.theta = 0.2858008788, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6981  -0.9888  -0.9027  -0.0086   2.5282  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.1940     0.3058  -3.904 9.45e-05 ***
## bsc           1.0921     0.1648   6.626 3.44e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.2858) family taken to be 1)
## 
##     Null deviance: 288.32  on 290  degrees of freedom
## Residual deviance: 244.08  on 289  degrees of freedom
## AIC: 998.82
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.2858 
##           Std. Err.:  0.0360 
## 
##  2 x log-likelihood:  -992.8160

Using my grouped data

mod2b <- glm.nb(mnom ~ mbsc, data=grpdata)
## Warning in dpois(y, mu, log = TRUE): non-integer x = 0.891892
## Warning in dpois(y, mu, log = TRUE): non-integer x = 1.700000
## Warning in dpois(y, mu, log = TRUE): non-integer x = 0.437500
## Warning in dpois(y, mu, log = TRUE): non-integer x = 2.212766
## Warning in dpois(y, mu, log = TRUE): non-integer x = 2.272727
## Warning in dpois(y, mu, log = TRUE): non-integer x = 4.800000
## Warning in dpois(y, mu, log = TRUE): non-integer x = 6.463415
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
summary(mod2b)
## 
## Call:
## glm.nb(formula = mnom ~ mbsc, data = grpdata, init.theta = 131312.3787, 
##     link = log)
## 
## Deviance Residuals: 
##        1         2         3         4         5         6         7  
## -0.59433   0.21016  -1.37446   0.25070   0.01697   0.92676  -0.31365  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  -0.7058     0.6915  -1.021  0.30738   
## mbsc          0.8777     0.3008   2.918  0.00352 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(131312.4) family taken to be 1)
## 
##     Null deviance: 11.1723  on 6  degrees of freedom
## Residual deviance:  3.3069  on 5  degrees of freedom
## AIC: 26.715
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  131312 
##           Std. Err.:  7093913 
## Warning while fitting theta: iteration limit reached 
## 
##  2 x log-likelihood:  -20.715
#-- test of fit
1 - pchisq(mod2b$deviance,mod2b$df.residual)   
## [1] 0.6527796
# looks good

S3 method for class ‘glm’

pred <- predict(mod2b, newdata = NULL,
            type = c("link", "response", "terms"),
            se.fit = TRUE         )

lower <- exp(pred$fit  - 1.96*pred$se.fit)
upper <- exp(pred$fit  + 1.96*pred$se.fit)
fit   <- exp(pred$fit)

And a graph

plot(grpdata$mbsc,grpdata$mnom, 
     pch=20, col="blue", 
     main='NegBin to Grouped: Means bully scale by Nominations (95%)',
     xlab='mean bully scale score',
     ylab='observed and fitted number nominations',
     ylim=c(0,35),
     xlim=c(1,4) )
lines(grpdata$mbsc,fit,col='darkgreen',lty=1,lwd=2)
lines(grpdata$mbsc,lower,col='darkgreen',lty=2)
lines(grpdata$mbsc,upper,col='darkgreen',lty=2)                

Doesn’t look same as SAS, but not sure why.

Fitted Marginal distribution

Using back to individual level

This seems to be off a bit off; not sure why.

ss <- seq(1:length(bully$bsc))
yvalues <- seq(1:51)
fitprob2 <- matrix(99,nrow=291,ncol=51)               # For negative binomial
phi = 0.2858
for (i in ss) {
  for (yindex in yvalues) { 
    yobs <- yindex-1
    mu <- exp(-1.1940 + 1.0921*bully$bsc[i] )
    term1 <- lgamma(mu+phi)/(factorial(yobs)*lgamma(phi))
    term2 <- ( phi/(mu+phi))**phi 
    term3 <- ( mu/(mu+phi))**yobs  
    fitprob2[i,yindex] = term1*term2*term3
  }
}

Mean over kids (rows) is what I want to plot

marginals2 <- colSums(fitprob2)/length(bully$bsc)

# Histogram of bully nominations with null poisson overlay
y <- seq(1:51)-1
d.poisson <- dpois(y, ybar)  
     
hist(bully$nom, prob=TRUE, col="cyan", breaks=30, 
     main="Bully Nominations with Fitted Overlayed",
     xlab="Number of Bully Nominations",
     ylim=c(0,1)
     )     # prob=TRUE for probabilities not counts
lines(y,d.poisson, col="darkgreen", lty=2,  lwd=1)         # add a density estimate 
lines(y,marginals, col="black", lty=1, lwd=1)              # mod 1, poisson with predictor
lines(y,marginals2,col="red",lty=1, lwd=2)                 # negative binomial
text(30,0.30,"Data",col="cyan")                            # zip
text(31,0.25,"Poisson no predcitors",col="darkgreen")
text(32.4,0.20,"Poisson nom ~ bullyscale score",col="black")
text(32.4,0.15,"NegBin nom ~ bullyscale score",col="red")

These look a bit off.

ZIP

library(pscl)
summary(mod3 <- zeroinfl(nom ~ bsc | 1, data = bully))
## 
## Call:
## zeroinfl(formula = nom ~ bsc | 1, data = bully)
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8713 -0.7353 -0.7096 -0.1353  9.8156 
## 
## Count model coefficients (poisson with log link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.47682    0.10148   4.699 2.62e-06 ***
## bsc          0.60231    0.03979  15.138  < 2e-16 ***
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)   0.2058     0.1212   1.698   0.0896 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Number of iterations in BFGS optimization: 5 
## Log-likelihood: -789.5 on 3 Df
summary(mod4 <- zeroinfl(nom ~ bsc | bsc, data = bully))
## 
## Call:
## zeroinfl(formula = nom ~ bsc | bsc, data = bully)
## 
## Pearson residuals:
##      Min       1Q   Median       3Q      Max 
## -2.37068 -0.68873 -0.59685 -0.05357  9.87491 
## 
## Count model coefficients (poisson with log link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.49604    0.09987   4.967 6.81e-07 ***
## bsc          0.59609    0.03942  15.120  < 2e-16 ***
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   1.4967     0.3463   4.322 1.54e-05 ***
## bsc          -0.7716     0.1967  -3.924 8.72e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Number of iterations in BFGS optimization: 8 
## Log-likelihood: -780.6 on 4 Df
ss <- seq(1:length(bully$bsc))
yvalues <- seq(1:51)
fitprob4 <- matrix(99,nrow=291,ncol=51)               # 
# For zip (mod4)

for (i in ss) {
  for (yindex in yvalues) { 
    yobs <- yindex-1
    mu.zip <- exp(0.4967 + 0.59609*bully$bsc[i] )
    pie <- exp(1.4967 - 0.7716*bully$bsc[i])/( 1 + exp(1.4967 - 0.7716*bully$bsc[i]))
    if (yobs==0) {
       fitprob4[i,yindex] <- pie + (1-pie)*exp(-mu.zip)
    } else {
       fitprob4[i,yindex] <- (1-pie)*exp(-mu.zip)*(mu.zip**yobs)/factorial(yobs)
    }
  }
}

Mean over kids (rows) is what I want to plot

marginals4 <- colSums(fitprob4)/length(bully$bsc)
y <- seq(1:51)-1
d.poisson <- dpois(y, ybar)  
     
hist(bully$nom, prob=TRUE, col="cyan", breaks=30, 
     main="Bully Nominations with Fitted Overlayed",
     xlab="Number of Bully Nominations",
     ylim=c(0,1)
     )     # prob=TRUE for probabilities not counts
lines(y,d.poisson, col="darkgreen", lty=2,  lwd=1)         # add a density estimate 
lines(y,marginals, col="black", lty=1, lwd=1)              # mod 1, poisson with predictor
lines(y,marginals2,col="red",lty=1, lwd=2)                 # negative binomial
lines(y,marginals4,col="violet", lty=4, lwd=2)
    # zip
text(30,0.35,"Data",col="cyan")  
text(32.4,0.30,"Poisson nom ~ bullyscale score",col="black")
text(32.4,0.25,"NegBin nom ~ bullyscale score",col="red")
text(32.4,0.20,"ZIP nom ~ bullyscale score",col="violet")

The marginal predictions for zip and negbin seem to be a bit off.