This should reproduce the bully example results that are given in lecture. The results in the lecture notes were created using SAS. Some of the results below (i.e., marginal distribution graphs) seem to be a bit off.
library(MASS) # for negative binomial
library(pscl) # for zero inflated
## Warning: package 'pscl' was built under R version 4.0.5
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
setwd("D:/Dropbox/edps 589/4 glm")
bully <- read.table("bully_data.txt",header=TRUE)
(n <- length(bully$nom))
## [1] 291
(ybar <- mean(bully$nom))
## [1] 2.487973
(s <- sd(bully$nom))
## [1] 6.423746
(s2 <- var(bully$nom))
## [1] 41.26451
summary(bully$nom)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 2.488 2.000 50.000
hist(bully$nom,
main="Distribution of Number of Bully Nominations",
xlab="Number of Nominations that a student is Bully",
breaks=50,
col="cyan")
text(41.2,200,"N = 291")
text(40,190,"Mean = 2.49")
text(41,180,"sd = 6.42")
text(41.1,170,"s2 = 41.26")
text(39.1,160,"min = 0")
text(39.1,150,"max = 50")
y <- seq(1:51)-1
d.poisson <- dpois(y, ybar)
hist(bully$nom, prob=TRUE, col="cyan", breaks=50,
main="Bully Nominations w/ Poisson Overlayed",
xlab="Number of Bully Nominations")
#prob=TRUE for probabilities not counts
lines(y,d.poisson, col="darkgreen", lty=2, lwd=1.5) # added a density estimate
Although we don’t expect this to fit…
mod1 <- glm(nom ~ bsc, data=bully, family=poisson)
summary(mod1)
##
## Call:
## glm(formula = nom ~ bsc, family = poisson, data = bully)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6.4995 -1.8246 -1.5952 -0.1552 11.6806
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.66308 0.08871 -7.475 7.75e-14 ***
## bsc 0.81434 0.03504 23.239 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 2205.2 on 290 degrees of freedom
## Residual deviance: 1774.6 on 289 degrees of freedom
## AIC: 2160.3
##
## Number of Fisher Scoring iterations: 6
Let’s see how well this is doing
bully$pred1 <- fitted.values(mod1)
pred <- predict(mod1, newdata = NULL,
type = c("link", "response", "terms"),
se.fit = TRUE
)
bully$lower1 <- exp(pred$fit - 1.96*pred$se.fit)
bully$upper1 <- exp(pred$fit + 1.96*pred$se.fit)
bully$pred1 <- exp(pred$fit)
bully.sort <- bully[order(bully$bsc),]
plot(bully.sort$bsc,bully.sort$nom,type='p',pch=20, col="blue",
main='Data, Fitted Line, and 95% Confidence',
xlab='Bully Scale Score', ylab='Fitted Count')
lines(bully.sort$bsc,bully.sort$pred1,col="darkgreen",lty=1,lwd=2)
lines(bully.sort$bsc,bully.sort$lower1,col="darkgreen",lty=2)
lines(bully.sort$bsc,bully.sort$upper1,col="darkgreen",lty=2)
summary(mod.Pidentity <- glm(nom ~ bsc, data=bully, family=poisson("identity")))
##
## Call:
## glm(formula = nom ~ bsc, family = poisson("identity"), data = bully)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.8270 -1.8970 -1.2308 0.1469 10.9259
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.7471 0.2173 -12.64 <2e-16 ***
## bsc 3.1573 0.1648 19.16 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 2205.2 on 290 degrees of freedom
## Residual deviance: 1713.7 on 289 degrees of freedom
## AIC: 2099.3
##
## Number of Fisher Scoring iterations: 5
mod.Pidentity$aic
## [1] 2099.334
summary(nb.indentity <- glm.nb(nom ~ bsc, data=bully, link=identity))
##
## Call:
## glm.nb(formula = nom ~ bsc, data = bully, link = identity, init.theta = 0.2986099095)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.47966 -1.02677 -0.73090 0.05897 2.62879
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.6421 0.6402 -4.127 3.67e-05 ***
## bsc 3.0739 0.5582 5.507 3.65e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(0.2986) family taken to be 1)
##
## Null deviance: 297.15 on 290 degrees of freedom
## Residual deviance: 244.33 on 289 degrees of freedom
## AIC: 992.09
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 0.2986
## Std. Err.: 0.0380
##
## 2 x log-likelihood: -986.0890
nb.indentity$aic
## [1] 992.0894
But the line is for fitted mean..so another way to look at the goodness of fit is to group data based on bully scale score and re-fit the model, etc.
This is ONLY for assessing fit (fit to individual level is what should be report and interpreted
Group data and get a look-see at how good it may be fitting the data
bully$grp <- cut(bully$bsc, quantile(bully$bsc, c(0, 1/7, 2/7, 3/7, 4/7, 5/7, 6/7, 1)))
grp.bsc <- aggregate(bully$bsc, by= list(bgrp=bully$grp), FUN=mean)
grp.nom <- aggregate(bully$nom, by= list(bgrp=bully$grp), FUN=mean)
grpdata <- cbind(grp.bsc, grp.nom)
grpdata <- as.data.frame(grpdata)
names(grpdata) <- c("grp","mbsc","grp2","mnom")
plot(grpdata$mbsc,grpdata$mnom)
mod1b <- glm(mnom ~ mbsc, data=grpdata, family=poisson)
## Warning in dpois(y, mu, log = TRUE): non-integer x = 0.891892
## Warning in dpois(y, mu, log = TRUE): non-integer x = 1.700000
## Warning in dpois(y, mu, log = TRUE): non-integer x = 0.437500
## Warning in dpois(y, mu, log = TRUE): non-integer x = 2.212766
## Warning in dpois(y, mu, log = TRUE): non-integer x = 2.272727
## Warning in dpois(y, mu, log = TRUE): non-integer x = 4.800000
## Warning in dpois(y, mu, log = TRUE): non-integer x = 6.463415
summary(mod1b)
##
## Call:
## glm(formula = mnom ~ mbsc, family = poisson, data = grpdata)
##
## Deviance Residuals:
## 1 2 3 4 5 6 7
## -0.38619 0.21016 -1.07973 0.25070 0.01697 0.92678 -0.31365
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.7058 0.6915 -1.021 0.30738
## mbsc 0.8777 0.3008 2.918 0.00352 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 10.2451 on 6 degrees of freedom
## Residual deviance: 2.3796 on 5 degrees of freedom
## AIC: Inf
##
## Number of Fisher Scoring iterations: 4
#-- test of fit
1 - pchisq(mod1b$deviance,mod1b$df.residual) # look good
## [1] 0.7945152
# S3 method for class 'glm'
pred <- predict(mod1b, newdata = NULL,
type = c("link", "response", "terms"),
se.fit = TRUE
)
This looks difference from SAS but numerical results are spot on (i.e. equal for all quanties so …
lower <- exp(pred$fit - 1.96*pred$se.fit)
upper <- exp(pred$fit + 1.96*pred$se.fit)
fit <- exp(pred$fit)
plot(grpdata$mbsc,grpdata$mnom,
pch=20, col="blue",
main='Grouped: Means bully scale by Nominations (95% Confidence)',
xlab='mean bully scale score',
ylab='observed and fitted number nominations',
ylim=c(0,35),
xlim=c(1,4) )
lines(grpdata$mbsc,fit,col='darkgreen',lty=1,lwd=2)
lines(grpdata$mbsc,lower,col='darkgreen',lty=2)
lines(grpdata$mbsc,upper,col='darkgreen',lty=2)
Fitted Marginal distribution (back to individual level)
ss <- seq(1:length(bully$bsc))
yvalues <- seq(1:51)
fitprob <- matrix(99,nrow=291,ncol=51)
for (i in ss) {
for (yindex in yvalues) {
yobs <- yindex-1
mu <- exp(-0.66308 + 0.81434*bully$bsc[i] )
fitprob[i,yindex] = ( exp(-mu) * mu**yobs)/(factorial(yobs))
}
}
# summing over y values (colums) should all equal 1
rowSums(fitprob)
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [186] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [223] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [260] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# mean over kids (rows) is what I want to plot
marginals <- colSums(fitprob)/length(bully$bsc)
# Histogram of bully nominations with null poisson overlay
y <- seq(1:51)-1
d.poisson <- dpois(y, ybar)
hist(bully$nom, prob=TRUE, col="cyan", breaks=50,
main="Bully Nominations with Poisson Overlayed",
xlab="Number of Bully Nominations") # prob=TRUE for probabilities not counts
lines(y,d.poisson, col="darkgreen", lty=2, lwd=1) # added a density estimate
lines(y,marginals, col="black", lty=1, lwd=2) # mod 1, poisson with predictor
text(30,0.68,"Data",col="cyan")
text(36,0.62,"Poisson no predictors",col="darkgreen")
text(39.4,0.66,"Poisson nom ~ bullyscale score",col="black")
library(MASS)
mod2 <- glm.nb(nom ~ bsc, data=bully) # dispersion phi = 1/"disperson from R"
summary(mod2)
##
## Call:
## glm.nb(formula = nom ~ bsc, data = bully, init.theta = 0.2858008788,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6981 -0.9888 -0.9027 -0.0086 2.5282
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.1940 0.3058 -3.904 9.45e-05 ***
## bsc 1.0921 0.1648 6.626 3.44e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(0.2858) family taken to be 1)
##
## Null deviance: 288.32 on 290 degrees of freedom
## Residual deviance: 244.08 on 289 degrees of freedom
## AIC: 998.82
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 0.2858
## Std. Err.: 0.0360
##
## 2 x log-likelihood: -992.8160
Using my grouped data
mod2b <- glm.nb(mnom ~ mbsc, data=grpdata)
## Warning in dpois(y, mu, log = TRUE): non-integer x = 0.891892
## Warning in dpois(y, mu, log = TRUE): non-integer x = 1.700000
## Warning in dpois(y, mu, log = TRUE): non-integer x = 0.437500
## Warning in dpois(y, mu, log = TRUE): non-integer x = 2.212766
## Warning in dpois(y, mu, log = TRUE): non-integer x = 2.272727
## Warning in dpois(y, mu, log = TRUE): non-integer x = 4.800000
## Warning in dpois(y, mu, log = TRUE): non-integer x = 6.463415
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
summary(mod2b)
##
## Call:
## glm.nb(formula = mnom ~ mbsc, data = grpdata, init.theta = 131312.3787,
## link = log)
##
## Deviance Residuals:
## 1 2 3 4 5 6 7
## -0.59433 0.21016 -1.37446 0.25070 0.01697 0.92676 -0.31365
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.7058 0.6915 -1.021 0.30738
## mbsc 0.8777 0.3008 2.918 0.00352 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(131312.4) family taken to be 1)
##
## Null deviance: 11.1723 on 6 degrees of freedom
## Residual deviance: 3.3069 on 5 degrees of freedom
## AIC: 26.715
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 131312
## Std. Err.: 7093913
## Warning while fitting theta: iteration limit reached
##
## 2 x log-likelihood: -20.715
#-- test of fit
1 - pchisq(mod2b$deviance,mod2b$df.residual)
## [1] 0.6527796
# looks good
pred <- predict(mod2b, newdata = NULL,
type = c("link", "response", "terms"),
se.fit = TRUE )
lower <- exp(pred$fit - 1.96*pred$se.fit)
upper <- exp(pred$fit + 1.96*pred$se.fit)
fit <- exp(pred$fit)
And a graph
plot(grpdata$mbsc,grpdata$mnom,
pch=20, col="blue",
main='NegBin to Grouped: Means bully scale by Nominations (95%)',
xlab='mean bully scale score',
ylab='observed and fitted number nominations',
ylim=c(0,35),
xlim=c(1,4) )
lines(grpdata$mbsc,fit,col='darkgreen',lty=1,lwd=2)
lines(grpdata$mbsc,lower,col='darkgreen',lty=2)
lines(grpdata$mbsc,upper,col='darkgreen',lty=2)
Doesn’t look same as SAS, but not sure why.
Using back to individual level
This seems to be off a bit off; not sure why.
ss <- seq(1:length(bully$bsc))
yvalues <- seq(1:51)
fitprob2 <- matrix(99,nrow=291,ncol=51) # For negative binomial
phi = 0.2858
for (i in ss) {
for (yindex in yvalues) {
yobs <- yindex-1
mu <- exp(-1.1940 + 1.0921*bully$bsc[i] )
term1 <- lgamma(mu+phi)/(factorial(yobs)*lgamma(phi))
term2 <- ( phi/(mu+phi))**phi
term3 <- ( mu/(mu+phi))**yobs
fitprob2[i,yindex] = term1*term2*term3
}
}
Mean over kids (rows) is what I want to plot
marginals2 <- colSums(fitprob2)/length(bully$bsc)
# Histogram of bully nominations with null poisson overlay
y <- seq(1:51)-1
d.poisson <- dpois(y, ybar)
hist(bully$nom, prob=TRUE, col="cyan", breaks=30,
main="Bully Nominations with Fitted Overlayed",
xlab="Number of Bully Nominations",
ylim=c(0,1)
) # prob=TRUE for probabilities not counts
lines(y,d.poisson, col="darkgreen", lty=2, lwd=1) # add a density estimate
lines(y,marginals, col="black", lty=1, lwd=1) # mod 1, poisson with predictor
lines(y,marginals2,col="red",lty=1, lwd=2) # negative binomial
text(30,0.30,"Data",col="cyan") # zip
text(31,0.25,"Poisson no predcitors",col="darkgreen")
text(32.4,0.20,"Poisson nom ~ bullyscale score",col="black")
text(32.4,0.15,"NegBin nom ~ bullyscale score",col="red")
These look a bit off.
library(pscl)
summary(mod3 <- zeroinfl(nom ~ bsc | 1, data = bully))
##
## Call:
## zeroinfl(formula = nom ~ bsc | 1, data = bully)
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -0.8713 -0.7353 -0.7096 -0.1353 9.8156
##
## Count model coefficients (poisson with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.47682 0.10148 4.699 2.62e-06 ***
## bsc 0.60231 0.03979 15.138 < 2e-16 ***
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.2058 0.1212 1.698 0.0896 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Number of iterations in BFGS optimization: 5
## Log-likelihood: -789.5 on 3 Df
summary(mod4 <- zeroinfl(nom ~ bsc | bsc, data = bully))
##
## Call:
## zeroinfl(formula = nom ~ bsc | bsc, data = bully)
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -2.37068 -0.68873 -0.59685 -0.05357 9.87491
##
## Count model coefficients (poisson with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.49604 0.09987 4.967 6.81e-07 ***
## bsc 0.59609 0.03942 15.120 < 2e-16 ***
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.4967 0.3463 4.322 1.54e-05 ***
## bsc -0.7716 0.1967 -3.924 8.72e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Number of iterations in BFGS optimization: 8
## Log-likelihood: -780.6 on 4 Df
ss <- seq(1:length(bully$bsc))
yvalues <- seq(1:51)
fitprob4 <- matrix(99,nrow=291,ncol=51) #
# For zip (mod4)
for (i in ss) {
for (yindex in yvalues) {
yobs <- yindex-1
mu.zip <- exp(0.4967 + 0.59609*bully$bsc[i] )
pie <- exp(1.4967 - 0.7716*bully$bsc[i])/( 1 + exp(1.4967 - 0.7716*bully$bsc[i]))
if (yobs==0) {
fitprob4[i,yindex] <- pie + (1-pie)*exp(-mu.zip)
} else {
fitprob4[i,yindex] <- (1-pie)*exp(-mu.zip)*(mu.zip**yobs)/factorial(yobs)
}
}
}
Mean over kids (rows) is what I want to plot
marginals4 <- colSums(fitprob4)/length(bully$bsc)
y <- seq(1:51)-1
d.poisson <- dpois(y, ybar)
hist(bully$nom, prob=TRUE, col="cyan", breaks=30,
main="Bully Nominations with Fitted Overlayed",
xlab="Number of Bully Nominations",
ylim=c(0,1)
) # prob=TRUE for probabilities not counts
lines(y,d.poisson, col="darkgreen", lty=2, lwd=1) # add a density estimate
lines(y,marginals, col="black", lty=1, lwd=1) # mod 1, poisson with predictor
lines(y,marginals2,col="red",lty=1, lwd=2) # negative binomial
lines(y,marginals4,col="violet", lty=4, lwd=2)
# zip
text(30,0.35,"Data",col="cyan")
text(32.4,0.30,"Poisson nom ~ bullyscale score",col="black")
text(32.4,0.25,"NegBin nom ~ bullyscale score",col="red")
text(32.4,0.20,"ZIP nom ~ bullyscale score",col="violet")
The marginal predictions for zip and negbin seem to be a bit off.