Exploratory
Poisson loglinear regression
Other models – digression
- identity link in poisson
- identity link and negative binomial
Negative Binomial
ZIP

This should reproduce the bully example results that are given in lecture. The results in the lecture notes were created using SAS. Some of the results below (i.e., marginal distribution graphs) seem to be a bit off.

library(MASS)  # for negative binomial
library(pscl)  # for zero inflated

## Warning: package 'pscl' was built under R version 4.0.5

## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis

setwd("D:/Dropbox/edps 589/4 glm")

bully <- read.table("bully_data.txt",header=TRUE)

Exploratory

Some basic descriptive statistics

(n <- length(bully$nom))

## [1] 291

(ybar <- mean(bully$nom))

## [1] 2.487973

(s <- sd(bully$nom))

## [1] 6.423746

(s2 <- var(bully$nom))

## [1] 41.26451

summary(bully$nom)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   2.488   2.000  50.000

Histogram of bully nominations

hist(bully$nom,
     main="Distribution of Number of Bully Nominations",
     xlab="Number of Nominations that a student is Bully",
     breaks=50,
     col="cyan")
text(41.2,200,"N = 291")     
text(40,190,"Mean = 2.49")   
text(41,180,"sd = 6.42")
text(41.1,170,"s2 = 41.26")
text(39.1,160,"min = 0")
text(39.1,150,"max = 50")

Histogram of bully nominations with null poisson overlay

y <- seq(1:51)-1
d.poisson <- dpois(y, ybar)  
     
hist(bully$nom, prob=TRUE, col="cyan", breaks=50, 
     main="Bully Nominations w/ Poisson Overlayed",
     xlab="Number of Bully Nominations")     
#prob=TRUE for probabilities not counts
lines(y,d.poisson, col="darkgreen", lty=2,  lwd=1.5)   # added a density estimate

Poisson loglinear regression

Although we don’t expect this to fit…

mod1 <- glm(nom ~ bsc, data=bully, family=poisson)
summary(mod1)

## 
## Call:
## glm(formula = nom ~ bsc, family = poisson, data = bully)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -6.4995  -1.8246  -1.5952  -0.1552  11.6806  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.66308    0.08871  -7.475 7.75e-14 ***
## bsc          0.81434    0.03504  23.239  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 2205.2  on 290  degrees of freedom
## Residual deviance: 1774.6  on 289  degrees of freedom
## AIC: 2160.3
## 
## Number of Fisher Scoring iterations: 6

Let’s see how well this is doing

bully$pred1 <- fitted.values(mod1)
pred <- predict(mod1, newdata = NULL,
            type = c("link", "response", "terms"),
            se.fit = TRUE
        )
bully$lower1 <- exp(pred$fit  - 1.96*pred$se.fit)
bully$upper1 <- exp(pred$fit  + 1.96*pred$se.fit)
bully$pred1  <- exp(pred$fit)

bully.sort <- bully[order(bully$bsc),]
plot(bully.sort$bsc,bully.sort$nom,type='p',pch=20, col="blue",
     main='Data, Fitted Line, and 95% Confidence', 
     xlab='Bully Scale Score', ylab='Fitted Count')
lines(bully.sort$bsc,bully.sort$pred1,col="darkgreen",lty=1,lwd=2)
lines(bully.sort$bsc,bully.sort$lower1,col="darkgreen",lty=2)
lines(bully.sort$bsc,bully.sort$upper1,col="darkgreen",lty=2)

Other models – digression

identity link in poisson

summary(mod.Pidentity <- glm(nom ~ bsc, data=bully, family=poisson("identity")))

## 
## Call:
## glm(formula = nom ~ bsc, family = poisson("identity"), data = bully)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.8270  -1.8970  -1.2308   0.1469  10.9259  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.7471     0.2173  -12.64   <2e-16 ***
## bsc           3.1573     0.1648   19.16   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 2205.2  on 290  degrees of freedom
## Residual deviance: 1713.7  on 289  degrees of freedom
## AIC: 2099.3
## 
## Number of Fisher Scoring iterations: 5

mod.Pidentity$aic

## [1] 2099.334

identity link and negative binomial

summary(nb.indentity <- glm.nb(nom ~ bsc, data=bully, link=identity))

## 
## Call:
## glm.nb(formula = nom ~ bsc, data = bully, link = identity, init.theta = 0.2986099095)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.47966  -1.02677  -0.73090   0.05897   2.62879  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.6421     0.6402  -4.127 3.67e-05 ***
## bsc           3.0739     0.5582   5.507 3.65e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.2986) family taken to be 1)
## 
##     Null deviance: 297.15  on 290  degrees of freedom
## Residual deviance: 244.33  on 289  degrees of freedom
## AIC: 992.09
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.2986 
##           Std. Err.:  0.0380 
## 
##  2 x log-likelihood:  -986.0890

nb.indentity$aic

## [1] 992.0894

But the line is for fitted mean..so another way to look at the goodness of fit is to group data based on bully scale score and re-fit the model, etc.

This is ONLY for assessing fit (fit to individual level is what should be report and interpreted

Group data and get a look-see at how good it may be fitting the data

bully$grp <- cut(bully$bsc, quantile(bully$bsc, c(0, 1/7, 2/7, 3/7, 4/7, 5/7, 6/7, 1)))     
grp.bsc <- aggregate(bully$bsc, by= list(bgrp=bully$grp), FUN=mean)

grp.nom <- aggregate(bully$nom, by= list(bgrp=bully$grp), FUN=mean)

grpdata <- cbind(grp.bsc, grp.nom)
grpdata <- as.data.frame(grpdata)
names(grpdata) <- c("grp","mbsc","grp2","mnom")
plot(grpdata$mbsc,grpdata$mnom)

mod1b <- glm(mnom ~ mbsc, data=grpdata, family=poisson)

## Warning in dpois(y, mu, log = TRUE): non-integer x = 0.891892

## Warning in dpois(y, mu, log = TRUE): non-integer x = 1.700000

## Warning in dpois(y, mu, log = TRUE): non-integer x = 0.437500

## Warning in dpois(y, mu, log = TRUE): non-integer x = 2.212766

## Warning in dpois(y, mu, log = TRUE): non-integer x = 2.272727

## Warning in dpois(y, mu, log = TRUE): non-integer x = 4.800000

## Warning in dpois(y, mu, log = TRUE): non-integer x = 6.463415

summary(mod1b)

## 
## Call:
## glm(formula = mnom ~ mbsc, family = poisson, data = grpdata)
## 
## Deviance Residuals: 
##        1         2         3         4         5         6         7  
## -0.38619   0.21016  -1.07973   0.25070   0.01697   0.92678  -0.31365  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  -0.7058     0.6915  -1.021  0.30738   
## mbsc          0.8777     0.3008   2.918  0.00352 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 10.2451  on 6  degrees of freedom
## Residual deviance:  2.3796  on 5  degrees of freedom
## AIC: Inf
## 
## Number of Fisher Scoring iterations: 4

#-- test of fit
1 - pchisq(mod1b$deviance,mod1b$df.residual)   # look good

## [1] 0.7945152

#  S3 method for class 'glm'
pred <- predict(mod1b, newdata = NULL,
            type = c("link", "response", "terms"),
            se.fit = TRUE
        )

This looks difference from SAS but numerical results are spot on (i.e. equal for all quanties so …

lower <- exp(pred$fit  - 1.96*pred$se.fit)
upper <- exp(pred$fit  + 1.96*pred$se.fit)
fit   <- exp(pred$fit)

plot(grpdata$mbsc,grpdata$mnom, 
     pch=20, col="blue", 
     main='Grouped: Means bully scale by Nominations (95% Confidence)',
     xlab='mean bully scale score',
     ylab='observed and fitted number nominations',
     ylim=c(0,35),
     xlim=c(1,4) )
lines(grpdata$mbsc,fit,col='darkgreen',lty=1,lwd=2)
lines(grpdata$mbsc,lower,col='darkgreen',lty=2)
lines(grpdata$mbsc,upper,col='darkgreen',lty=2)

Fitted Marginal distribution (back to individual level)

ss <- seq(1:length(bully$bsc))
yvalues <- seq(1:51)
fitprob <- matrix(99,nrow=291,ncol=51)
for (i in ss) {
  for (yindex in yvalues) { 
    yobs <- yindex-1
    mu <- exp(-0.66308 + 0.81434*bully$bsc[i] )
    fitprob[i,yindex] = ( exp(-mu) * mu**yobs)/(factorial(yobs))
  }
}
# summing over y values (colums) should all equal 1
rowSums(fitprob)

##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [186] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [223] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [260] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

# mean over kids (rows) is what I want to plot
marginals <- colSums(fitprob)/length(bully$bsc)

# Histogram of bully nominations with null poisson overlay
y <- seq(1:51)-1
d.poisson <- dpois(y, ybar)  
     
hist(bully$nom, prob=TRUE, col="cyan", breaks=50, 
     main="Bully Nominations with Poisson Overlayed",
     xlab="Number of Bully Nominations")     # prob=TRUE for probabilities not counts
lines(y,d.poisson, col="darkgreen", lty=2,  lwd=1)     # added a density estimate 
lines(y,marginals, col="black", lty=1, lwd=2)          # mod 1, poisson with predictor
text(30,0.68,"Data",col="cyan")
text(36,0.62,"Poisson no predictors",col="darkgreen")
text(39.4,0.66,"Poisson nom ~ bullyscale score",col="black")

Negative Binomial

library(MASS)

Fit model to individual level data

mod2 <- glm.nb(nom ~ bsc, data=bully)  # dispersion phi = 1/"disperson from R"
summary(mod2)

## 
## Call:
## glm.nb(formula = nom ~ bsc, data = bully, init.theta = 0.2858008788, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6981  -0.9888  -0.9027  -0.0086   2.5282  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.1940     0.3058  -3.904 9.45e-05 ***
## bsc           1.0921     0.1648   6.626 3.44e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.2858) family taken to be 1)
## 
##     Null deviance: 288.32  on 290  degrees of freedom
## Residual deviance: 244.08  on 289  degrees of freedom
## AIC: 998.82
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.2858 
##           Std. Err.:  0.0360 
## 
##  2 x log-likelihood:  -992.8160

Using my grouped data

mod2b <- glm.nb(mnom ~ mbsc, data=grpdata)

## Warning in dpois(y, mu, log = TRUE): non-integer x = 0.891892

## Warning in dpois(y, mu, log = TRUE): non-integer x = 1.700000

## Warning in dpois(y, mu, log = TRUE): non-integer x = 0.437500

## Warning in dpois(y, mu, log = TRUE): non-integer x = 2.212766

## Warning in dpois(y, mu, log = TRUE): non-integer x = 2.272727

## Warning in dpois(y, mu, log = TRUE): non-integer x = 4.800000

## Warning in dpois(y, mu, log = TRUE): non-integer x = 6.463415

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

summary(mod2b)

## 
## Call:
## glm.nb(formula = mnom ~ mbsc, data = grpdata, init.theta = 131312.3787, 
##     link = log)
## 
## Deviance Residuals: 
##        1         2         3         4         5         6         7  
## -0.59433   0.21016  -1.37446   0.25070   0.01697   0.92676  -0.31365  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  -0.7058     0.6915  -1.021  0.30738   
## mbsc          0.8777     0.3008   2.918  0.00352 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(131312.4) family taken to be 1)
## 
##     Null deviance: 11.1723  on 6  degrees of freedom
## Residual deviance:  3.3069  on 5  degrees of freedom
## AIC: 26.715
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  131312 
##           Std. Err.:  7093913 
## Warning while fitting theta: iteration limit reached 
## 
##  2 x log-likelihood:  -20.715

#-- test of fit
1 - pchisq(mod2b$deviance,mod2b$df.residual)

## [1] 0.6527796

# looks good

S3 method for class ‘glm’

pred <- predict(mod2b, newdata = NULL,
            type = c("link", "response", "terms"),
            se.fit = TRUE         )

lower <- exp(pred$fit  - 1.96*pred$se.fit)
upper <- exp(pred$fit  + 1.96*pred$se.fit)
fit   <- exp(pred$fit)

And a graph

plot(grpdata$mbsc,grpdata$mnom, 
     pch=20, col="blue", 
     main='NegBin to Grouped: Means bully scale by Nominations (95%)',
     xlab='mean bully scale score',
     ylab='observed and fitted number nominations',
     ylim=c(0,35),
     xlim=c(1,4) )
lines(grpdata$mbsc,fit,col='darkgreen',lty=1,lwd=2)
lines(grpdata$mbsc,lower,col='darkgreen',lty=2)
lines(grpdata$mbsc,upper,col='darkgreen',lty=2)

Doesn’t look same as SAS, but not sure why.

Fitted Marginal distribution

Using back to individual level

This seems to be off a bit off; not sure why.

ss <- seq(1:length(bully$bsc))
yvalues <- seq(1:51)
fitprob2 <- matrix(99,nrow=291,ncol=51)               # For negative binomial
phi = 0.2858
for (i in ss) {
  for (yindex in yvalues) { 
    yobs <- yindex-1
    mu <- exp(-1.1940 + 1.0921*bully$bsc[i] )
    term1 <- lgamma(mu+phi)/(factorial(yobs)*lgamma(phi))
    term2 <- ( phi/(mu+phi))**phi 
    term3 <- ( mu/(mu+phi))**yobs  
    fitprob2[i,yindex] = term1*term2*term3
  }
}

Mean over kids (rows) is what I want to plot

marginals2 <- colSums(fitprob2)/length(bully$bsc)

# Histogram of bully nominations with null poisson overlay
y <- seq(1:51)-1
d.poisson <- dpois(y, ybar)  
     
hist(bully$nom, prob=TRUE, col="cyan", breaks=30, 
     main="Bully Nominations with Fitted Overlayed",
     xlab="Number of Bully Nominations",
     ylim=c(0,1)
     )     # prob=TRUE for probabilities not counts
lines(y,d.poisson, col="darkgreen", lty=2,  lwd=1)         # add a density estimate 
lines(y,marginals, col="black", lty=1, lwd=1)              # mod 1, poisson with predictor
lines(y,marginals2,col="red",lty=1, lwd=2)                 # negative binomial
text(30,0.30,"Data",col="cyan")                            # zip
text(31,0.25,"Poisson no predcitors",col="darkgreen")
text(32.4,0.20,"Poisson nom ~ bullyscale score",col="black")
text(32.4,0.15,"NegBin nom ~ bullyscale score",col="red")

These look a bit off.

ZIP

library(pscl)

summary(mod3 <- zeroinfl(nom ~ bsc | 1, data = bully))

## 
## Call:
## zeroinfl(formula = nom ~ bsc | 1, data = bully)
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8713 -0.7353 -0.7096 -0.1353  9.8156 
## 
## Count model coefficients (poisson with log link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.47682    0.10148   4.699 2.62e-06 ***
## bsc          0.60231    0.03979  15.138  < 2e-16 ***
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)   0.2058     0.1212   1.698   0.0896 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Number of iterations in BFGS optimization: 5 
## Log-likelihood: -789.5 on 3 Df

summary(mod4 <- zeroinfl(nom ~ bsc | bsc, data = bully))

## 
## Call:
## zeroinfl(formula = nom ~ bsc | bsc, data = bully)
## 
## Pearson residuals:
##      Min       1Q   Median       3Q      Max 
## -2.37068 -0.68873 -0.59685 -0.05357  9.87491 
## 
## Count model coefficients (poisson with log link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.49604    0.09987   4.967 6.81e-07 ***
## bsc          0.59609    0.03942  15.120  < 2e-16 ***
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   1.4967     0.3463   4.322 1.54e-05 ***
## bsc          -0.7716     0.1967  -3.924 8.72e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Number of iterations in BFGS optimization: 8 
## Log-likelihood: -780.6 on 4 Df

ss <- seq(1:length(bully$bsc))
yvalues <- seq(1:51)
fitprob4 <- matrix(99,nrow=291,ncol=51)               # 
# For zip (mod4)

for (i in ss) {
  for (yindex in yvalues) { 
    yobs <- yindex-1
    mu.zip <- exp(0.4967 + 0.59609*bully$bsc[i] )
    pie <- exp(1.4967 - 0.7716*bully$bsc[i])/( 1 + exp(1.4967 - 0.7716*bully$bsc[i]))
    if (yobs==0) {
       fitprob4[i,yindex] <- pie + (1-pie)*exp(-mu.zip)
    } else {
       fitprob4[i,yindex] <- (1-pie)*exp(-mu.zip)*(mu.zip**yobs)/factorial(yobs)
    }
  }
}