These are analyses reported in lecture notes on logistic regression.
library(vcd)
## Loading required package: grid
library(ResourceSelection)
## ResourceSelection 0.3-5 2019-07-22
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(DescTools)
library(texreg)
## Version: 1.37.5
## Date: 2020-06-17
## Author: Philip Leifeld (University of Essex)
##
## Consider submitting praise using the praise or praise_interactive functions.
## Please cite the JSS article in your publications -- see citation("texreg").
setwd("C:/Users/cja/Dropbox/edps 589/5 logistic regression")
hsb <- read.csv(file="hsb_data.csv", header=TRUE, sep=",")
names(hsb)
## [1] "id" "sex" "race" "ses" "sctyp" "hsp"
## [7] "locus" "concpt" "mot" "car" "rdg" "wrtg"
## [13] "math" "sci" "civ" "achieve" "sci_math" "type"
## [19] "xmean" "xsum" "program" "group"
head(hsb)
## id sex race ses sctyp hsp locus concpt mot car rdg wrtg math sci civ
## 1 1 2 1 1 1 3 0.29 0.88 0.67 10 33.6 43.7 40.2 39.0 40.6
## 2 4 2 1 2 1 3 0.06 0.03 0.00 15 38.9 41.1 32.7 41.7 40.6
## 3 23 2 1 1 1 3 -0.22 0.32 0.33 1 35.2 38.5 39.9 34.7 40.6
## 4 26 2 1 2 1 2 1.12 -0.74 0.67 15 31.0 41.1 36.0 36.9 45.6
## 5 29 2 1 2 1 2 -0.90 0.03 0.67 4 36.3 44.3 36.1 33.6 40.6
## 6 31 1 1 2 1 2 -0.21 -1.38 0.00 9 34.2 46.3 44.5 39.0 35.6
## achieve sci_math type xmean xsum program group
## 1 197.1 -1.2 case 39.42 197.1 0 1
## 2 195.0 9.0 case 39.00 195.0 0 1
## 3 188.9 -5.2 case 37.78 188.9 0 1
## 4 190.6 0.9 case 38.12 190.6 1 1
## 5 190.9 -2.5 case 38.18 190.9 1 1
## 6 199.6 -5.5 case 39.92 199.6 1 1
var.levels <- expand.grid(academic=c("no", "yes"),
sctyp=c("public","private"),
ses = c("low","middle","high"))
( hsb3way <- data.frame(var.levels, count=c(91, 40, 4, 4, 138, 111,
14, 36, 44, 82, 1, 35 )))
## academic sctyp ses count
## 1 no public low 91
## 2 yes public low 40
## 3 no private low 4
## 4 yes private low 4
## 5 no public middle 138
## 6 yes public middle 111
## 7 no private middle 14
## 8 yes private middle 36
## 9 no public high 44
## 10 yes public high 82
## 11 no private high 1
## 12 yes private high 35
var.levels2 <- expand.grid(sctyp=c("public","private"),
ses = c("low","middle","high"))
hsb3way.no <- data.frame(var.levels2, no=c(40, 4, 111, 36, 82, 35 ))
hsb3way.yes <- data.frame(var.levels2, yes=c(91,4,136,14,44,1))
hsb3way.wide <- merge(hsb3way.yes,hsb3way.no, by=c("sctyp","ses"))
hsb3way.wide$ncases <- hsb3way.wide$yes + hsb3way.wide$no
(hsb3way.wide)
## sctyp ses yes no ncases
## 1 private high 1 35 36
## 2 private low 4 4 8
## 3 private middle 14 36 50
## 4 public high 44 82 126
## 5 public low 91 40 131
## 6 public middle 136 111 247
summary( i.mod <- glm(cbind(yes,no) ~ 1 , data=hsb3way.wide, family=binomial))
##
## Call:
## glm(formula = cbind(yes, no) ~ 1, family = binomial, data = hsb3way.wide)
##
## Deviance Residuals:
## 1 2 3 4 5 6
## -6.2252 0.0852 -2.9563 -3.0768 4.8559 2.0650
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.06022 0.08182 -0.736 0.462
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 84.81 on 5 degrees of freedom
## Residual deviance: 84.81 on 5 degrees of freedom
## AIC: 111.85
##
## Number of Fisher Scoring iterations: 3
or all 2-way interactions
summary( h.logit <- glm(cbind(yes,no) ~ sctyp + ses , data=hsb3way.wide,family=binomial) )
##
## Call:
## glm(formula = cbind(yes, no) ~ sctyp + ses, family = binomial,
## data = hsb3way.wide)
##
## Deviance Residuals:
## 1 2 3 4 5 6
## -1.8337 0.7284 0.6186 0.5513 -0.1931 -0.2439
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.8587 0.1857 4.623 3.78e-06 ***
## sctypprivate -1.3766 0.2793 -4.929 8.25e-07 ***
## sesmiddle -0.6244 0.2209 -2.826 0.00471 **
## seshigh -1.5848 0.2577 -6.149 7.81e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 84.8101 on 5 degrees of freedom
## Residual deviance: 4.6765 on 2 degrees of freedom
## AIC: 37.72
##
## Number of Fisher Scoring iterations: 4
(X2 <- sum(residuals(h.logit,type="pearson")^2) )
## [1] 3.787392
# As a poisson --- one way to get X2 goodness-of-fit statistics
summary( h.poi <- glm(count ~ sctyp + academic + ses + sctyp*academic + sctyp*ses + ses*academic, data=hsb3way,family=poisson) )
##
## Call:
## glm(formula = count ~ sctyp + academic + ses + sctyp * academic +
## sctyp * ses + ses * academic, family = poisson, data = hsb3way)
##
## Deviance Residuals:
## 1 2 3 4 5 6 7 8
## -0.1075 0.1641 0.5667 -0.4760 -0.1577 0.1777 0.5214 -0.3050
## 9 10 11 12
## 0.4456 -0.3155 -1.7503 0.5027
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.5221 0.1034 43.742 < 2e-16 ***
## sctypprivate -3.4332 0.4064 -8.448 < 2e-16 ***
## academicyes -0.8593 0.1858 -4.625 3.74e-06 ***
## sesmiddle 0.4185 0.1323 3.163 0.00156 **
## seshigh -0.8059 0.1828 -4.409 1.04e-05 ***
## sctypprivate:academicyes 1.3856 0.2792 4.962 6.97e-07 ***
## sctypprivate:sesmiddle 0.9889 0.4034 2.451 0.01423 *
## sctypprivate:seshigh 1.0754 0.4230 2.543 0.01100 *
## academicyes:sesmiddle 0.6113 0.2207 2.769 0.00562 **
## academicyes:seshigh 1.5844 0.2578 6.146 7.94e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 485.7696 on 11 degrees of freedom
## Residual deviance: 4.6219 on 2 degrees of freedom
## AIC: 85.535
##
## Number of Fisher Scoring iterations: 4
(X2 <- sum(residuals(h.poi,type="pearson")^2))
## [1] 3.748049
# This will give both G2 and X2
library(MASS)
summary( h.poi2 <- loglm(count ~ sctyp + academic + ses + sctyp*academic + sctyp*ses + ses*academic, data=hsb3way) )
## Formula:
## count ~ sctyp + academic + ses + sctyp * academic + sctyp * ses +
## ses * academic
## attr(,"variables")
## list(count, sctyp, academic, ses)
## attr(,"factors")
## sctyp academic ses sctyp:academic sctyp:ses academic:ses
## count 0 0 0 0 0 0
## sctyp 1 0 0 1 1 0
## academic 0 1 0 1 0 1
## ses 0 0 1 0 1 1
## attr(,"term.labels")
## [1] "sctyp" "academic" "ses" "sctyp:academic"
## [5] "sctyp:ses" "academic:ses"
## attr(,"order")
## [1] 1 1 1 2 2 2
## attr(,"intercept")
## [1] 1
## attr(,"response")
## [1] 1
## attr(,".Environment")
## <environment: R_GlobalEnv>
## attr(,"predvars")
## list(count, sctyp, academic, ses)
## attr(,"dataClasses")
## count sctyp academic ses
## "numeric" "factor" "factor" "factor"
##
## Statistics:
## X^2 df P(> X^2)
## Likelihood Ratio 4.621902 2 0.0991669
## Pearson 3.747989 2 0.1535093
Different programs may use different ID constraints. Different parameterizations on impact the value of the regression coefficients but not the fitted values, odds ration estimates, residuals, goodness-of-model fit to data, etc.
The first is set to 0,
hsb3way.wide$low <- ifelse(hsb3way.wide$ses=="low",1,0)
hsb3way.wide$mid <- ifelse(hsb3way.wide$ses=="middle",1,0)
hsb3way.wide$hig <- ifelse(hsb3way.wide$ses=="high",1,0)
summary( logit.bin <- glm(cbind(yes,no) ~ mid + hig + sctyp, family = binomial, data = hsb3way.wide) )
##
## Call:
## glm(formula = cbind(yes, no) ~ mid + hig + sctyp, family = binomial,
## data = hsb3way.wide)
##
## Deviance Residuals:
## 1 2 3 4 5 6
## -1.8337 0.7284 0.6186 0.5513 -0.1931 -0.2439
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.8587 0.1857 4.623 3.78e-06 ***
## mid -0.6244 0.2209 -2.826 0.00471 **
## hig -1.5848 0.2577 -6.149 7.81e-10 ***
## sctypprivate -1.3766 0.2793 -4.929 8.25e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 84.8101 on 5 degrees of freedom
## Residual deviance: 4.6765 on 2 degrees of freedom
## AIC: 37.72
##
## Number of Fisher Scoring iterations: 4
By defaults glm in R will set the 1st coefficient to 0, but SAS sets the last to 0. Below we switch to last coefficients
# -- change how Dummy codes are defined
hsb3way.wide$Dsctyp <- ifelse(hsb3way.wide$sctyp=="public",1,0)
hsb3way.wide$Dlow <- ifelse(hsb3way.wide$ses=="low",1,0)
hsb3way.wide$Dmid <- ifelse(hsb3way.wide$ses=="middle",1,0)
summary( logit.bin2 <- glm(cbind(yes,no) ~ Dlow + Dmid + Dsctyp, family = binomial, data = hsb3way.wide) )
##
## Call:
## glm(formula = cbind(yes, no) ~ Dlow + Dmid + Dsctyp, family = binomial,
## data = hsb3way.wide)
##
## Deviance Residuals:
## 1 2 3 4 5 6
## -1.8337 0.7284 0.6186 0.5513 -0.1931 -0.2439
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.1027 0.3059 -6.874 6.25e-12 ***
## Dlow 1.5848 0.2577 6.149 7.81e-10 ***
## Dmid 0.9605 0.2153 4.460 8.19e-06 ***
## Dsctyp 1.3766 0.2793 4.929 8.25e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 84.8101 on 5 degrees of freedom
## Residual deviance: 4.6765 on 2 degrees of freedom
## AIC: 37.72
##
## Number of Fisher Scoring iterations: 4
hsb3way.wide$Esctyp <- ifelse(hsb3way.wide$sctyp=="public",1,-1)
hsb3way.wide$Elow <- ifelse(hsb3way.wide$ses=="low",1,0)
hsb3way.wide$Elow <- ifelse(hsb3way.wide$ses=="high",-1, hsb3way.wide$Elow)
hsb3way.wide$Emid <- ifelse(hsb3way.wide$ses=="middle",1,0)
hsb3way.wide$Emid <- ifelse(hsb3way.wide$ses=="high",-1,hsb3way.wide$Emid)
summary( logit.bin3 <- glm(cbind(yes,no) ~ Elow + Emid + Esctyp, family = binomial, data = hsb3way.wide) )
##
## Call:
## glm(formula = cbind(yes, no) ~ Elow + Emid + Esctyp, family = binomial,
## data = hsb3way.wide)
##
## Deviance Residuals:
## 1 2 3 4 5 6
## -1.8337 0.7284 0.6186 0.5513 -0.1931 -0.2439
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.5660 0.1459 -3.880 0.000105 ***
## Elow 0.7364 0.1430 5.149 2.62e-07 ***
## Emid 0.1120 0.1173 0.955 0.339684
## Esctyp 0.6883 0.1396 4.929 8.25e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 84.8101 on 5 degrees of freedom
## Residual deviance: 4.6765 on 2 degrees of freedom
## AIC: 37.72
##
## Number of Fisher Scoring iterations: 4
screenreg(list(logit.bin, logit.bin2, logit.bin3),
custom.model.names=c("First=0", "Last =0", "Effect"))
##
## ==================================================
## First=0 Last =0 Effect
## --------------------------------------------------
## (Intercept) 0.86 *** -2.10 *** -0.57 ***
## (0.19) (0.31) (0.15)
## mid -0.62 **
## (0.22)
## hig -1.58 ***
## (0.26)
## sctypprivate -1.38 ***
## (0.28)
## Dlow 1.58 ***
## (0.26)
## Dmid 0.96 ***
## (0.22)
## Dsctyp 1.38 ***
## (0.28)
## Elow 0.74 ***
## (0.14)
## Emid 0.11
## (0.12)
## Esctyp 0.69 ***
## (0.14)
## --------------------------------------------------
## AIC 37.72 37.72 37.72
## BIC 36.89 36.89 36.89
## Log Likelihood -14.86 -14.86 -14.86
## Deviance 4.68 4.68 4.68
## Num. obs. 6 6 6
## ==================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
Same model, just different value of the coefficients.
Holding sctype constant: public vs private odds ratios
(odds.dummy1 <- exp(0 - logit.bin$coefficients[4]))
## sctypprivate
## 3.961426
(odds.dummy2 <- exp(logit.bin2$coefficients[4]- 0))
## Dsctyp
## 3.961426
(odds.effect3 <- exp(logit.bin3$coefficients[4]+logit.bin3$coefficients[4]))
## Esctyp
## 3.961426
You can re-code school type to get exactly what are in the lecture notes
SES is definitely ordinal, but first treat is as a factor
hsb3way.wide$xses <- as.numeric(hsb3way.wide$ses)
# -- ses as factor
summary( logit.factor <- glm(cbind(yes,no) ~ ses + sctyp, family = binomial, data = hsb3way.wide) )
##
## Call:
## glm(formula = cbind(yes, no) ~ ses + sctyp, family = binomial,
## data = hsb3way.wide)
##
## Deviance Residuals:
## 1 2 3 4 5 6
## -1.8337 0.7284 0.6186 0.5513 -0.1931 -0.2439
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.8587 0.1857 4.623 3.78e-06 ***
## sesmiddle -0.6244 0.2209 -2.826 0.00471 **
## seshigh -1.5848 0.2577 -6.149 7.81e-10 ***
## sctypprivate -1.3766 0.2793 -4.929 8.25e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 84.8101 on 5 degrees of freedom
## Residual deviance: 4.6765 on 2 degrees of freedom
## AIC: 37.72
##
## Number of Fisher Scoring iterations: 4
Note SES will be numeric (this is special case of when treated as a factor)
summary( logit.numeric <- glm(cbind(yes,no) ~ xses + sctyp, family = binomial, data = hsb3way.wide) )
##
## Call:
## glm(formula = cbind(yes, no) ~ xses + sctyp, family = binomial,
## data = hsb3way.wide)
##
## Deviance Residuals:
## 1 2 3 4 5 6
## -1.9954 0.5755 0.8124 0.1059 -0.6799 0.3775
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.7493 0.2762 6.333 2.40e-10 ***
## xses -0.7972 0.1289 -6.185 6.20e-10 ***
## sctypprivate -1.3604 0.2784 -4.887 1.02e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 84.8101 on 5 degrees of freedom
## Residual deviance: 5.5889 on 3 degrees of freedom
## AIC: 36.633
##
## Number of Fisher Scoring iterations: 4
Compute goodness-of-fit statistics for logit.numeric
hsb3way.wide$pi.numeric <- logit.numeric$fitted
Xsq.yes <- (hsb3way.wide$yes - hsb3way.wide$pi.numeric*hsb3way.wide$ncases)**2 /(hsb3way.wide$pi.numeric*hsb3way.wide$ncases)
Xsq.no <- (hsb3way.wide$no - (1-hsb3way.wide$pi.numeric)*hsb3way.wide$ncases)**2 /((1-hsb3way.wide$pi.numeric)*hsb3way.wide$ncases)
(Xsq <- sum(Xsq.yes + Xsq.no))
## [1] 4.507544
Gsq.yes <- 2*hsb3way.wide$yes *log(hsb3way.wide$yes/(hsb3way.wide$ncases*hsb3way.wide$pi.numeric))
Gsq.no <- 2*hsb3way.wide$no * log(hsb3way.wide$no/(hsb3way.wide$ncases*(1-hsb3way.wide$pi.numeric)))
( Gsq <- sum(Gsq.yes+Gsq.no) )
## [1] 5.588936
# -- LR test of restriction on values for SES
anova(logit.numeric, logit.factor)
## Analysis of Deviance Table
##
## Model 1: cbind(yes, no) ~ xses + sctyp
## Model 2: cbind(yes, no) ~ ses + sctyp
## Resid. Df Resid. Dev Df Deviance
## 1 3 5.5889
## 2 2 4.6765 1 0.91243
Can treat SES as numeric with evenly spaced scores.
Just some here
logit.numeric$coefficients
## (Intercept) xses sctypprivate
## 1.7492951 -0.7972102 -1.3604133
# -- Holding schtyp constant, odds of academic are
exp(logit.numeric$coefficients[2])
## xses
## 0.4505843
# the odds for an increase of SES by 1 unit
# -- odds academic of low vs high ses
exp((3-1)*logit.numeric$coefficients[2])
## xses
## 0.2030262
summary(logit.mod1 <- glm(program ~ math, data=hsb,family = binomial))
##
## Call:
## glm(formula = program ~ math, family = binomial, data = hsb)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2997 -0.9562 0.4355 0.9806 1.9107
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.58539 0.58066 -9.619 <2e-16 ***
## math 0.10933 0.01119 9.770 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 831.35 on 599 degrees of freedom
## Residual deviance: 709.27 on 598 degrees of freedom
## AIC: 713.27
##
## Number of Fisher Scoring iterations: 4
j <- order(hsb$math)
plot(hsb$math[j],logit.mod1$fitted[j],
type='l',
lwd=2,
ylim=c(0,1),
col="blue",
main="Model 1: program ~ math")
hsb$ses <- as.factor(hsb$ses)
summary(logit.mod2 <- glm(program ~ math + ses, data=hsb,family = binomial))
##
## Call:
## glm(formula = program ~ math + ses, family = binomial, data = hsb)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2142 -0.9260 0.4107 0.9469 2.0086
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.52362 0.59008 -9.361 < 2e-16 ***
## math 0.09889 0.01147 8.620 < 2e-16 ***
## ses2 0.35070 0.23577 1.487 0.137
## ses3 1.15033 0.27543 4.177 2.96e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 831.35 on 599 degrees of freedom
## Residual deviance: 689.30 on 596 degrees of freedom
## AIC: 697.3
##
## Number of Fisher Scoring iterations: 3
# - subset for graphing
hsb$fitted2 <- logit.mod2$fitted
low <- hsb[ which(hsb$ses=="1"),]
mid <- hsb[ which(hsb$ses=="2"),]
hi <- hsb[ which(hsb$ses=="3"),]
j <- order(low$math)
plot(low$math[j],low$fitted2[j],
type='l',
lwd=2,
ylim=c(0,1),
xlim=c(30,80),
col="blue",
main="Model 2: program ~ math + ses (as factor) "
)
j <- order(mid$math)
lines(mid$math[j],mid$fitted2[j],col="green",lwd=2)
j <- order(hi$math)
lines(hi$math[j],hi$fitted2[j],col="red",lwd=2)
legend("topleft",legend=c("high","middle","low"),
col=c("red","green","blue"), lty=1,lwd=2, cex=1)
hsb$ses <- as.numeric(hsb$ses)
summary(logit.mod3 <- glm(program ~ math + ses, data=hsb,family = binomial))
##
## Call:
## glm(formula = program ~ math + ses, family = binomial, data = hsb)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1836 -0.9460 0.4366 0.9652 2.0587
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.19142 0.61298 -10.101 < 2e-16 ***
## math 0.09801 0.01140 8.595 < 2e-16 ***
## ses 0.58374 0.13762 4.242 2.22e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 831.35 on 599 degrees of freedom
## Residual deviance: 690.76 on 597 degrees of freedom
## AIC: 696.76
##
## Number of Fisher Scoring iterations: 3
# - subset for graphing
hsb$fitted3 <- logit.mod3$fitted
low <- hsb[ which(hsb$ses=="1"),]
mid <- hsb[ which(hsb$ses=="2"),]
hi <- hsb[ which(hsb$ses=="3"),]
j <- order(low$math)
plot(low$math[j],low$fitted3[j],
type='l',
lwd=2,
ylim=c(0,1),
xlim=c(30,80),
col="blue",
main="Model 3: program ~ math + ses (numeric) "
)
j <- order(mid$math)
lines(mid$math[j],mid$fitted3[j],col="green",lwd=2)
j <- order(hi$math)
lines(hi$math[j],hi$fitted3[j],col="red",lwd=2)
legend("topleft",legend=c("high","middle","low"),
col=c("red","green","blue"), lty=1,lwd=2, cex=1)
hsb$sctyp <- as.factor(hsb$sctyp)
summary(logit.mod4 <- glm(program ~ math + ses + sctyp, data=hsb,family = binomial))
##
## Call:
## glm(formula = program ~ math + ses + sctyp, family = binomial,
## data = hsb)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1063 -0.8883 0.2979 0.8911 2.1101
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.24834 0.62568 -9.986 < 2e-16 ***
## math 0.09858 0.01166 8.454 < 2e-16 ***
## ses 0.49856 0.14069 3.544 0.000395 ***
## sctyp2 1.36466 0.28982 4.709 2.49e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 831.35 on 599 degrees of freedom
## Residual deviance: 665.42 on 596 degrees of freedom
## AIC: 673.42
##
## Number of Fisher Scoring iterations: 4
# - subset for graphing
hsb$fitted4 <- logit.mod4$fitted
low.public <- hsb[ (which(hsb$ses==1 & hsb$sctyp=="1")),]
low.private <- hsb[ (which(hsb$ses==1 & hsb$sctyp=="2")),]
mid.public <- hsb[ (which(hsb$ses==2 & hsb$sctyp=="1")),]
mid.private <- hsb[ (which(hsb$ses==2 & hsb$sctyp=="2")),]
hi.public <- hsb[ (which(hsb$ses==3 & hsb$sctyp=="1")),]
hi.private <- hsb[ (which(hsb$ses==3 & hsb$sctyp=="2")),]
j <- order(low.public$math)
plot(low.public$math[j],low.public$fitted4[j],
type='l',
lwd=2,
ylim=c(0,1),
xlim=c(30,80),
col="blue",
main="Model 4: program ~ math + ses (numeric) + school type "
)
j <- order(low.private$math)
lines(low.private$math[j],low.private$fitted4[j],col="darkblue",lwd=2)
j <- order(mid.public$math)
lines(mid.public$math[j],mid.public$fitted4[j],col="green",lwd=2)
j <- order(mid.private$math)
lines(mid.private$math[j],mid.private$fitted4[j],col="darkgreen",lwd=2)
j <- order(hi.public$math)
lines(hi.public$math[j],hi.public$fitted4[j],col="red",lwd=2)
j <- order(hi.private$math)
lines(hi.private$math[j],hi.private$fitted4[j],col="darkred",lwd=2)
legend("bottomright",legend=c("private high SES", "private middle SES","private low SES","public high SES", "public middle SES", "public low SES "),
col=c("darkred","darkgreen","darkblue","red","green","blue"), lty=1,lwd=2, cex=1)
hsb$sctyp <- as.factor(hsb$sctyp)
summary(logit.mod5 <- glm(program ~ math + ses + sctyp + ses*sctyp, data=hsb,family = binomial))
##
## Call:
## glm(formula = program ~ math + ses + sctyp + ses * sctyp, family = binomial,
## data = hsb)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1263 -0.8776 0.2376 0.8949 2.0717
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.1130 0.6251 -9.780 <2e-16 ***
## math 0.1006 0.0118 8.526 <2e-16 ***
## ses 0.3804 0.1483 2.566 0.0103 *
## sctyp2 -1.1328 1.1223 -1.009 0.3128
## ses:sctyp2 1.1929 0.5293 2.254 0.0242 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 831.35 on 599 degrees of freedom
## Residual deviance: 659.44 on 595 degrees of freedom
## AIC: 669.44
##
## Number of Fisher Scoring iterations: 5
# - subset for graphing
hsb$fitted5 <- logit.mod5$fitted
low.public <- hsb[ (which(hsb$ses==1 & hsb$sctyp=="1")),]
low.private <- hsb[ (which(hsb$ses==1 & hsb$sctyp=="2")),]
mid.public <- hsb[ (which(hsb$ses==2 & hsb$sctyp=="1")),]
mid.private <- hsb[ (which(hsb$ses==2 & hsb$sctyp=="2")),]
hi.public <- hsb[ (which(hsb$ses==3 & hsb$sctyp=="1")),]
hi.private <- hsb[ (which(hsb$ses==3 & hsb$sctyp=="2")),]
j <- order(low.public$math)
plot(low.public$math[j],low.public$fitted5[j],
type='l',
lwd=2,
ylim=c(0,1),
xlim=c(30,80),
col="blue",
main="Model 5: program ~ math + ses + schyp + ses*schtyp "
)
j <- order(low.private$math)
lines(low.private$math[j],low.private$fitted5[j],col="darkblue",lwd=2)
j <- order(mid.public$math)
lines(mid.public$math[j],mid.public$fitted5[j],col="green",lwd=2)
j <- order(mid.private$math)
lines(mid.private$math[j],mid.private$fitted5[j],col="darkgreen",lwd=2)
j <- order(hi.public$math)
lines(hi.public$math[j],hi.public$fitted5[j],col="red",lwd=2)
j <- order(hi.private$math)
lines(hi.private$math[j],hi.private$fitted5[j],col="darkred",lwd=2)
legend("bottomright",legend=c("private high SES", "private middle SES","private low SES","public high SES", "public middle SES", "public low SES "),
col=c("darkred","darkgreen","darkblue","red","green","blue"), lty=1,lwd=2, cex=1)
hoslem.test(logit.mod5$y, fitted(logit.mod5), g=10)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: logit.mod5$y, fitted(logit.mod5)
## X-squared = 7.0278, df = 8, p-value = 0.5336
Using Pearson (adjusted standardized residuals
resid <- rstandard(logit.mod5, pearson=TRUE)
qqnorm(resid,
main="QQ plot of Pearson Residuals",
col="blue")
### ROC and concordance #### Concordance index
Cstat(logit.mod1,resp=hsb$program)
## [1] 0.750556
Cstat(logit.mod2,resp=hsb$program)
## [1] 0.7687856
Cstat(logit.mod3,resp=hsb$program)
## [1] 0.7679294
Cstat(logit.mod4,resp=hsb$program)
## [1] 0.7867873
Cstat(logit.mod5,resp=hsb$program)
## [1] 0.7888888
roc.data5 <-roc(logit.mod5$y , logit.mod5$fitted.values,ci=T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
names(roc.data5)
## [1] "percent" "sensitivities" "specificities"
## [4] "thresholds" "direction" "cases"
## [7] "controls" "fun.sesp" "auc"
## [10] "call" "original.predictor" "original.response"
## [13] "predictor" "response" "levels"
## [16] "ci"
plot((1-roc.data5$specificities),roc.data5$sensitivities,
type='l',
lwd=2,
col="blue",
main="ROC for Models 5, 1 and chance",
xlab="1-sensitivity (false positives)",
ylab="Specificity (true positives)"
)
roc.data1 <- roc(logit.mod1$y, logit.mod1$fitted.values,ci=T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
lines((1-roc.data1$specificities), roc.data1$sensitivities, col="pink", lwd=2)
text(.1,.99,"model 5: c=.79",col="blue")
text(.1,.94,"model 1: c=.75",col="pink")
text(.1,.89,"chance: c=.50",col="black")
lines(c(0,1),c(0,1), lty=2)
legend("bottomright",
legend=c("math + ses + schyp + ses*schtyp", "math","chance"),
col=c("blue","pink","black"), lty=c(1,1,2), lwd=2)
summary(logit.mod6 <- glm(program ~ math + ses + sctyp + ses*sctyp + ses*math + math*sctyp + math*ses*sctyp, data=hsb,family = binomial))
##
## Call:
## glm(formula = program ~ math + ses + sctyp + ses * sctyp + ses *
## math + math * sctyp + math * ses * sctyp, family = binomial,
## data = hsb)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1257 -0.8892 0.2191 0.8954 2.1101
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.820981 1.910324 -3.571 0.000356 ***
## math 0.114402 0.037364 3.062 0.002200 **
## ses 0.751389 0.899646 0.835 0.403602
## sctyp2 -0.290924 11.257039 -0.026 0.979382
## ses:sctyp2 0.525944 5.134676 0.102 0.918415
## math:ses -0.007153 0.017197 -0.416 0.677467
## math:sctyp2 -0.017047 0.227843 -0.075 0.940360
## math:ses:sctyp2 0.013286 0.104381 0.127 0.898716
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 831.35 on 599 degrees of freedom
## Residual deviance: 659.21 on 592 degrees of freedom
## AIC: 675.21
##
## Number of Fisher Scoring iterations: 6
hsb$cmath <- scale(hsb$math,center=TRUE,scale=FALSE)
hsb$cses <- scale(hsb$ses,center=TRUE,scale=FALSE)
hsb$sctyp <- as.numeric(hsb$sctyp)
hsb$csctyp <- scale(hsb$sctyp,center=TRUE,scale=FALSE)
summary(logit.mod7 <- glm(program ~ cmath + cses + csctyp + cses*csctyp + cses*cmath + cmath*csctyp +cmath*cses*csctyp, data=hsb,family = binomial))
##
## Call:
## glm(formula = program ~ cmath + cses + csctyp + cses * csctyp +
## cses * cmath + cmath * csctyp + cmath * cses * csctyp, family = binomial,
## data = hsb)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1257 -0.8892 0.2191 0.8954 2.1101
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.090215 0.098616 0.915 0.360290
## cmath 0.101395 0.012566 8.069 7.08e-16 ***
## cses 0.570854 0.156184 3.655 0.000257 ***
## csctyp 1.301414 0.317330 4.101 4.11e-05 ***
## cses:csctyp 1.214812 0.613601 1.980 0.047725 *
## cmath:cses -0.005071 0.021691 -0.234 0.815147
## cmath:csctyp 0.010035 0.047429 0.212 0.832442
## cmath:cses:csctyp 0.013286 0.104381 0.127 0.898716
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 831.35 on 599 degrees of freedom
## Residual deviance: 659.21 on 592 degrees of freedom
## AIC: 675.21
##
## Number of Fisher Scoring iterations: 6
summary(logit.mod8 <- glm(program ~ math + ses + sctyp + ses*sctyp + rdg + sci + wrtg + civ, data=hsb,family = binomial))
##
## Call:
## glm(formula = program ~ math + ses + sctyp + ses * sctyp + rdg +
## sci + wrtg + civ, family = binomial, data = hsb)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2309 -0.8324 0.1797 0.8307 2.1526
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.41108 1.41007 -4.547 5.45e-06 ***
## math 0.06638 0.01624 4.088 4.35e-05 ***
## ses -0.89589 0.60236 -1.487 0.13694
## sctyp -1.14500 1.13485 -1.009 0.31300
## rdg 0.04145 0.01557 2.662 0.00778 **
## sci -0.03295 0.01488 -2.215 0.02679 *
## wrtg 0.01379 0.01425 0.968 0.33296
## civ 0.04113 0.01318 3.122 0.00180 **
## ses:sctyp 1.22596 0.54124 2.265 0.02351 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 831.35 on 599 degrees of freedom
## Residual deviance: 629.52 on 591 degrees of freedom
## AIC: 647.52
##
## Number of Fisher Scoring iterations: 5
var.names <- c("math","rdg","sci","wrtg","civ")
var.data <- hsb[var.names]
cor.mat <- cor(var.data,method="pearson")
round(cor.mat,2)
## math rdg sci wrtg civ
## math 1.00 0.68 0.65 0.63 0.53
## rdg 0.68 1.00 0.69 0.63 0.59
## sci 0.65 0.69 1.00 0.57 0.52
## wrtg 0.63 0.63 0.57 1.00 0.59
## civ 0.53 0.59 0.52 0.59 1.00
# Note eigen-vectors can be reflected (i.e., multiply by -1)
eigen(cor.mat)
## eigen() decomposition
## $values
## [1] 3.4351681 0.5285769 0.4125608 0.3297962 0.2938980
##
## $vectors
## [,1] [,2] [,3] [,4] [,5]
## [1,] -0.4570127 -0.2941759 -0.2688896 -0.76713446 0.20929173
## [2,] -0.4697851 -0.1837444 0.2013959 0.05109347 -0.83807465
## [3,] -0.4472481 -0.4684827 0.4051036 0.43430444 0.47724613
## [4,] -0.4444552 0.2427263 -0.7384324 0.44293682 0.04547653
## [5,] -0.4157765 0.7754395 0.4216003 -0.15517991 0.15490600
hoslem.test(logit.mod5$y, fitted(logit.mod5), g=10)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: logit.mod5$y, fitted(logit.mod5)
## X-squared = 7.0278, df = 8, p-value = 0.5336
Using Pearson (adjusted standardized residuals
resid <- rstandard(logit.mod5, pearson=TRUE)
qqnorm(resid,
main="QQ plot of Pearson Residuals",
col="blue")
lines(resid,resid)
Concordance indices
Cstat(logit.mod1,resp=hsb$program)
## [1] 0.750556
Cstat(logit.mod2,resp=hsb$program)
## [1] 0.7687856
Cstat(logit.mod3,resp=hsb$program)
## [1] 0.7679294
Cstat(logit.mod4,resp=hsb$program)
## [1] 0.7867873
Cstat(logit.mod5,resp=hsb$program)
## [1] 0.7888888
roc.data5 <-roc(logit.mod5$y , logit.mod5$fitted.values,ci=T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
names(roc.data5)
## [1] "percent" "sensitivities" "specificities"
## [4] "thresholds" "direction" "cases"
## [7] "controls" "fun.sesp" "auc"
## [10] "call" "original.predictor" "original.response"
## [13] "predictor" "response" "levels"
## [16] "ci"
plot((1-roc.data5$specificities),roc.data5$sensitivities,
type='l',
lwd=2,
col="blue",
main="ROC for Models 5, 1 and chance",
xlab="1-sensitivity (false positives)",
ylab="Specificity (true positives)"
)
roc.data1 <- roc(logit.mod1$y, logit.mod1$fitted.values,ci=T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
lines((1-roc.data1$specificities), roc.data1$sensitivities, col="pink", lwd=2)
text(.1,.99,"model 5: c=.79",col="blue")
text(.1,.94,"model 1: c=.75",col="pink")
text(.1,.89,"chance: c=.50",col="black")
lines(c(0,1),c(0,1), lty=2)
legend("bottomright",
legend=c("math + ses + schyp + ses*schtyp", "math","chance"),
col=c("blue","pink","black"), lty=c(1,1,2), lwd=2)
summary(logit.mod6 <- glm(program ~ math + ses + sctyp + ses*sctyp + ses*math + math*sctyp + math*ses*sctyp, data=hsb,family = binomial))
##
## Call:
## glm(formula = program ~ math + ses + sctyp + ses * sctyp + ses *
## math + math * sctyp + math * ses * sctyp, family = binomial,
## data = hsb)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1257 -0.8892 0.2191 0.8954 2.1101
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.53006 11.73324 -0.557 0.578
## math 0.13145 0.23686 0.555 0.579
## ses 0.22544 5.36591 0.042 0.966
## sctyp -0.29092 11.25704 -0.026 0.979
## ses:sctyp 0.52594 5.13468 0.102 0.918
## math:ses -0.02044 0.10855 -0.188 0.851
## math:sctyp -0.01705 0.22784 -0.075 0.940
## math:ses:sctyp 0.01329 0.10438 0.127 0.899
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 831.35 on 599 degrees of freedom
## Residual deviance: 659.21 on 592 degrees of freedom
## AIC: 675.21
##
## Number of Fisher Scoring iterations: 6
# center all variables
hsb$cmath <- scale(hsb$math,center=TRUE,scale=FALSE)
hsb$cses <- scale(hsb$ses,center=TRUE,scale=FALSE)
hsb$sctyp <- as.numeric(hsb$sctyp)
hsb$csctyp <- scale(hsb$sctyp,center=TRUE,scale=FALSE)
summary(logit.mod7 <- glm(program ~ cmath + cses + csctyp + cses*csctyp + cses*cmath + cmath*csctyp +cmath*cses*csctyp, data=hsb,family = binomial))
##
## Call:
## glm(formula = program ~ cmath + cses + csctyp + cses * csctyp +
## cses * cmath + cmath * csctyp + cmath * cses * csctyp, family = binomial,
## data = hsb)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1257 -0.8892 0.2191 0.8954 2.1101
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.090215 0.098616 0.915 0.360290
## cmath 0.101395 0.012566 8.069 7.08e-16 ***
## cses 0.570854 0.156184 3.655 0.000257 ***
## csctyp 1.301414 0.317330 4.101 4.11e-05 ***
## cses:csctyp 1.214812 0.613601 1.980 0.047725 *
## cmath:cses -0.005071 0.021691 -0.234 0.815147
## cmath:csctyp 0.010035 0.047429 0.212 0.832442
## cmath:cses:csctyp 0.013286 0.104381 0.127 0.898716
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 831.35 on 599 degrees of freedom
## Residual deviance: 659.21 on 592 degrees of freedom
## AIC: 675.21
##
## Number of Fisher Scoring iterations: 6
hsb$sctyp <- as.factor(hsb$sctyp)
summary(logit.mod8 <- glm(program ~ math + ses + sctyp + ses*sctyp + rdg + sci + wrtg + civ, data=hsb,family = binomial))
##
## Call:
## glm(formula = program ~ math + ses + sctyp + ses * sctyp + rdg +
## sci + wrtg + civ, family = binomial, data = hsb)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2309 -0.8324 0.1797 0.8307 2.1526
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.55608 0.75424 -10.018 < 2e-16 ***
## math 0.06638 0.01624 4.088 4.35e-05 ***
## ses 0.33006 0.15326 2.154 0.03127 *
## sctyp2 -1.14500 1.13485 -1.009 0.31300
## rdg 0.04145 0.01557 2.662 0.00778 **
## sci -0.03295 0.01488 -2.215 0.02679 *
## wrtg 0.01379 0.01425 0.968 0.33296
## civ 0.04113 0.01318 3.122 0.00180 **
## ses:sctyp2 1.22596 0.54124 2.265 0.02351 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 831.35 on 599 degrees of freedom
## Residual deviance: 629.52 on 591 degrees of freedom
## AIC: 647.52
##
## Number of Fisher Scoring iterations: 5
screenreg(list(logit.mod1,logit.mod5, logit.mod8),
custom.model.names=c("Model 1", "Model 5", "Model 8"))
##
## =====================================================
## Model 1 Model 5 Model 8
## -----------------------------------------------------
## (Intercept) -5.59 *** -6.11 *** -7.56 ***
## (0.58) (0.63) (0.75)
## math 0.11 *** 0.10 *** 0.07 ***
## (0.01) (0.01) (0.02)
## ses 0.38 * 0.33 *
## (0.15) (0.15)
## sctyp2 -1.13 -1.15
## (1.12) (1.13)
## ses:sctyp2 1.19 * 1.23 *
## (0.53) (0.54)
## rdg 0.04 **
## (0.02)
## sci -0.03 *
## (0.01)
## wrtg 0.01
## (0.01)
## civ 0.04 **
## (0.01)
## -----------------------------------------------------
## AIC 713.27 669.44 647.52
## BIC 722.07 691.43 687.10
## Log Likelihood -354.64 -329.72 -314.76
## Deviance 709.27 659.44 629.52
## Num. obs. 600 600 600
## =====================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
# Examine correlations among achievement tests
var.names <- c("math","rdg","sci","wrtg","civ")
var.data <- hsb[var.names]
cor.mat <- cor(var.data,method="pearson")
round(cor.mat,2)
## math rdg sci wrtg civ
## math 1.00 0.68 0.65 0.63 0.53
## rdg 0.68 1.00 0.69 0.63 0.59
## sci 0.65 0.69 1.00 0.57 0.52
## wrtg 0.63 0.63 0.57 1.00 0.59
## civ 0.53 0.59 0.52 0.59 1.00
# Note eigen-vectors can be reflected (i.e., multiply by -1)
eigen(cor.mat)
## eigen() decomposition
## $values
## [1] 3.4351681 0.5285769 0.4125608 0.3297962 0.2938980
##
## $vectors
## [,1] [,2] [,3] [,4] [,5]
## [1,] -0.4570127 -0.2941759 -0.2688896 -0.76713446 0.20929173
## [2,] -0.4697851 -0.1837444 0.2013959 0.05109347 -0.83807465
## [3,] -0.4472481 -0.4684827 0.4051036 0.43430444 0.47724613
## [4,] -0.4444552 0.2427263 -0.7384324 0.44293682 0.04547653
## [5,] -0.4157765 0.7754395 0.4216003 -0.15517991 0.15490600