Set up
- libraries
- Data
Some Logit Models
- Independence
- Homogeneous Associaiton
Logit Models as poisson regression models
Identification Constraints
Nominal versus Ordinal Explanatory Variable?
- Odds Ratio with ordinal/numeric SES
Fitting Data at Individual Level
More predictors and problem of multicolinearity
Even more More predictors and problem of multicolinearity

These are analyses reported in lecture notes on logistic regression.

Set up

libraries

library(vcd)

## Loading required package: grid

library(ResourceSelection)

## ResourceSelection 0.3-5   2019-07-22

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

library(DescTools)
library(texreg)

## Version:  1.37.5
## Date:     2020-06-17
## Author:   Philip Leifeld (University of Essex)
## 
## Consider submitting praise using the praise or praise_interactive functions.
## Please cite the JSS article in your publications -- see citation("texreg").

Data

In long format (data frame):

setwd("C:/Users/cja/Dropbox/edps 589/5 logistic regression")
hsb <- read.csv(file="hsb_data.csv", header=TRUE, sep=",")
names(hsb)

##  [1] "id"       "sex"      "race"     "ses"      "sctyp"    "hsp"     
##  [7] "locus"    "concpt"   "mot"      "car"      "rdg"      "wrtg"    
## [13] "math"     "sci"      "civ"      "achieve"  "sci_math" "type"    
## [19] "xmean"    "xsum"     "program"  "group"

head(hsb)

##   id sex race ses sctyp hsp locus concpt  mot car  rdg wrtg math  sci  civ
## 1  1   2    1   1     1   3  0.29   0.88 0.67  10 33.6 43.7 40.2 39.0 40.6
## 2  4   2    1   2     1   3  0.06   0.03 0.00  15 38.9 41.1 32.7 41.7 40.6
## 3 23   2    1   1     1   3 -0.22   0.32 0.33   1 35.2 38.5 39.9 34.7 40.6
## 4 26   2    1   2     1   2  1.12  -0.74 0.67  15 31.0 41.1 36.0 36.9 45.6
## 5 29   2    1   2     1   2 -0.90   0.03 0.67   4 36.3 44.3 36.1 33.6 40.6
## 6 31   1    1   2     1   2 -0.21  -1.38 0.00   9 34.2 46.3 44.5 39.0 35.6
##   achieve sci_math type xmean  xsum program group
## 1   197.1     -1.2 case 39.42 197.1       0     1
## 2   195.0      9.0 case 39.00 195.0       0     1
## 3   188.9     -5.2 case 37.78 188.9       0     1
## 4   190.6      0.9 case 38.12 190.6       1     1
## 5   190.9     -2.5 case 38.18 190.9       1     1
## 6   199.6     -5.5 case 39.92 199.6       1     1

Long table:

var.levels <- expand.grid(academic=c("no", "yes"),
                          sctyp=c("public","private"),
              ses = c("low","middle","high"))
( hsb3way <- data.frame(var.levels, count=c(91, 40,  4,  4, 138, 111,
                                            14, 36, 44, 82,   1,  35 )))

##    academic   sctyp    ses count
## 1        no  public    low    91
## 2       yes  public    low    40
## 3        no private    low     4
## 4       yes private    low     4
## 5        no  public middle   138
## 6       yes  public middle   111
## 7        no private middle    14
## 8       yes private middle    36
## 9        no  public   high    44
## 10      yes  public   high    82
## 11       no private   high     1
## 12      yes private   high    35

Wide table:

var.levels2 <- expand.grid(sctyp=c("public","private"),
              ses = c("low","middle","high"))
hsb3way.no <- data.frame(var.levels2, no=c(40, 4, 111,  36, 82,   35 ))                       
hsb3way.yes <- data.frame(var.levels2, yes=c(91,4,136,14,44,1))                   
hsb3way.wide <- merge(hsb3way.yes,hsb3way.no, by=c("sctyp","ses"))
hsb3way.wide$ncases <- hsb3way.wide$yes + hsb3way.wide$no
(hsb3way.wide)

##     sctyp    ses yes  no ncases
## 1 private   high   1  35     36
## 2 private    low   4   4      8
## 3 private middle  14  36     50
## 4  public   high  44  82    126
## 5  public    low  91  40    131
## 6  public middle 136 111    247

Some Logit Models

Independence

summary(  i.mod <- glm(cbind(yes,no) ~ 1 , data=hsb3way.wide, family=binomial))

## 
## Call:
## glm(formula = cbind(yes, no) ~ 1, family = binomial, data = hsb3way.wide)
## 
## Deviance Residuals: 
##       1        2        3        4        5        6  
## -6.2252   0.0852  -2.9563  -3.0768   4.8559   2.0650  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.06022    0.08182  -0.736    0.462
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 84.81  on 5  degrees of freedom
## Residual deviance: 84.81  on 5  degrees of freedom
## AIC: 111.85
## 
## Number of Fisher Scoring iterations: 3

Homogeneous Associaiton

or all 2-way interactions

summary(  h.logit <- glm(cbind(yes,no)  ~ sctyp + ses , data=hsb3way.wide,family=binomial)  )

## 
## Call:
## glm(formula = cbind(yes, no) ~ sctyp + ses, family = binomial, 
##     data = hsb3way.wide)
## 
## Deviance Residuals: 
##       1        2        3        4        5        6  
## -1.8337   0.7284   0.6186   0.5513  -0.1931  -0.2439  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    0.8587     0.1857   4.623 3.78e-06 ***
## sctypprivate  -1.3766     0.2793  -4.929 8.25e-07 ***
## sesmiddle     -0.6244     0.2209  -2.826  0.00471 ** 
## seshigh       -1.5848     0.2577  -6.149 7.81e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 84.8101  on 5  degrees of freedom
## Residual deviance:  4.6765  on 2  degrees of freedom
## AIC: 37.72
## 
## Number of Fisher Scoring iterations: 4

(X2 <- sum(residuals(h.logit,type="pearson")^2) )

## [1] 3.787392

Logit Models as poisson regression models

# As a poisson --- one way to get  X2 goodness-of-fit statistics
summary( h.poi <- glm(count ~ sctyp + academic + ses + sctyp*academic + sctyp*ses + ses*academic, data=hsb3way,family=poisson) )

## 
## Call:
## glm(formula = count ~ sctyp + academic + ses + sctyp * academic + 
##     sctyp * ses + ses * academic, family = poisson, data = hsb3way)
## 
## Deviance Residuals: 
##       1        2        3        4        5        6        7        8  
## -0.1075   0.1641   0.5667  -0.4760  -0.1577   0.1777   0.5214  -0.3050  
##       9       10       11       12  
##  0.4456  -0.3155  -1.7503   0.5027  
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                4.5221     0.1034  43.742  < 2e-16 ***
## sctypprivate              -3.4332     0.4064  -8.448  < 2e-16 ***
## academicyes               -0.8593     0.1858  -4.625 3.74e-06 ***
## sesmiddle                  0.4185     0.1323   3.163  0.00156 ** 
## seshigh                   -0.8059     0.1828  -4.409 1.04e-05 ***
## sctypprivate:academicyes   1.3856     0.2792   4.962 6.97e-07 ***
## sctypprivate:sesmiddle     0.9889     0.4034   2.451  0.01423 *  
## sctypprivate:seshigh       1.0754     0.4230   2.543  0.01100 *  
## academicyes:sesmiddle      0.6113     0.2207   2.769  0.00562 ** 
## academicyes:seshigh        1.5844     0.2578   6.146 7.94e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 485.7696  on 11  degrees of freedom
## Residual deviance:   4.6219  on  2  degrees of freedom
## AIC: 85.535
## 
## Number of Fisher Scoring iterations: 4

(X2 <- sum(residuals(h.poi,type="pearson")^2))

## [1] 3.748049

# This will give both G2 and X2
library(MASS)
summary( h.poi2 <- loglm(count ~ sctyp + academic + ses + sctyp*academic + sctyp*ses + ses*academic, data=hsb3way) )

## Formula:
## count ~ sctyp + academic + ses + sctyp * academic + sctyp * ses + 
##     ses * academic
## attr(,"variables")
## list(count, sctyp, academic, ses)
## attr(,"factors")
##          sctyp academic ses sctyp:academic sctyp:ses academic:ses
## count        0        0   0              0         0            0
## sctyp        1        0   0              1         1            0
## academic     0        1   0              1         0            1
## ses          0        0   1              0         1            1
## attr(,"term.labels")
## [1] "sctyp"          "academic"       "ses"            "sctyp:academic"
## [5] "sctyp:ses"      "academic:ses"  
## attr(,"order")
## [1] 1 1 1 2 2 2
## attr(,"intercept")
## [1] 1
## attr(,"response")
## [1] 1
## attr(,".Environment")
## <environment: R_GlobalEnv>
## attr(,"predvars")
## list(count, sctyp, academic, ses)
## attr(,"dataClasses")
##     count     sctyp  academic       ses 
## "numeric"  "factor"  "factor"  "factor" 
## 
## Statistics:
##                       X^2 df  P(> X^2)
## Likelihood Ratio 4.621902  2 0.0991669
## Pearson          3.747989  2 0.1535093

Identification Constraints

Different programs may use different ID constraints. Different parameterizations on impact the value of the regression coefficients but not the fitted values, odds ration estimates, residuals, goodness-of-model fit to data, etc.

Dummy: first set to zer0

The first is set to 0,

hsb3way.wide$low   <- ifelse(hsb3way.wide$ses=="low",1,0)
hsb3way.wide$mid   <- ifelse(hsb3way.wide$ses=="middle",1,0)
hsb3way.wide$hig   <- ifelse(hsb3way.wide$ses=="high",1,0)

summary( logit.bin <- glm(cbind(yes,no) ~  mid + hig + sctyp, family = binomial, data = hsb3way.wide) )

## 
## Call:
## glm(formula = cbind(yes, no) ~ mid + hig + sctyp, family = binomial, 
##     data = hsb3way.wide)
## 
## Deviance Residuals: 
##       1        2        3        4        5        6  
## -1.8337   0.7284   0.6186   0.5513  -0.1931  -0.2439  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    0.8587     0.1857   4.623 3.78e-06 ***
## mid           -0.6244     0.2209  -2.826  0.00471 ** 
## hig           -1.5848     0.2577  -6.149 7.81e-10 ***
## sctypprivate  -1.3766     0.2793  -4.929 8.25e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 84.8101  on 5  degrees of freedom
## Residual deviance:  4.6765  on 2  degrees of freedom
## AIC: 37.72
## 
## Number of Fisher Scoring iterations: 4

Alternate Dummy Codes

By defaults glm in R will set the 1st coefficient to 0, but SAS sets the last to 0. Below we switch to last coefficients

# -- change how Dummy codes are defined
hsb3way.wide$Dsctyp <- ifelse(hsb3way.wide$sctyp=="public",1,0)
hsb3way.wide$Dlow   <- ifelse(hsb3way.wide$ses=="low",1,0)
hsb3way.wide$Dmid   <- ifelse(hsb3way.wide$ses=="middle",1,0)

summary( logit.bin2 <- glm(cbind(yes,no) ~  Dlow + Dmid + Dsctyp, family = binomial, data = hsb3way.wide) )

## 
## Call:
## glm(formula = cbind(yes, no) ~ Dlow + Dmid + Dsctyp, family = binomial, 
##     data = hsb3way.wide)
## 
## Deviance Residuals: 
##       1        2        3        4        5        6  
## -1.8337   0.7284   0.6186   0.5513  -0.1931  -0.2439  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.1027     0.3059  -6.874 6.25e-12 ***
## Dlow          1.5848     0.2577   6.149 7.81e-10 ***
## Dmid          0.9605     0.2153   4.460 8.19e-06 ***
## Dsctyp        1.3766     0.2793   4.929 8.25e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 84.8101  on 5  degrees of freedom
## Residual deviance:  4.6765  on 2  degrees of freedom
## AIC: 37.72
## 
## Number of Fisher Scoring iterations: 4

Effects codes

hsb3way.wide$Esctyp <- ifelse(hsb3way.wide$sctyp=="public",1,-1)
hsb3way.wide$Elow   <- ifelse(hsb3way.wide$ses=="low",1,0)
hsb3way.wide$Elow   <- ifelse(hsb3way.wide$ses=="high",-1, hsb3way.wide$Elow)
hsb3way.wide$Emid   <- ifelse(hsb3way.wide$ses=="middle",1,0)
hsb3way.wide$Emid   <- ifelse(hsb3way.wide$ses=="high",-1,hsb3way.wide$Emid)

summary( logit.bin3 <- glm(cbind(yes,no) ~  Elow + Emid + Esctyp, family = binomial, data = hsb3way.wide) )

## 
## Call:
## glm(formula = cbind(yes, no) ~ Elow + Emid + Esctyp, family = binomial, 
##     data = hsb3way.wide)
## 
## Deviance Residuals: 
##       1        2        3        4        5        6  
## -1.8337   0.7284   0.6186   0.5513  -0.1931  -0.2439  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.5660     0.1459  -3.880 0.000105 ***
## Elow          0.7364     0.1430   5.149 2.62e-07 ***
## Emid          0.1120     0.1173   0.955 0.339684    
## Esctyp        0.6883     0.1396   4.929 8.25e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 84.8101  on 5  degrees of freedom
## Residual deviance:  4.6765  on 2  degrees of freedom
## AIC: 37.72
## 
## Number of Fisher Scoring iterations: 4

Comparison

screenreg(list(logit.bin, logit.bin2, logit.bin3),
          custom.model.names=c("First=0", "Last =0", "Effect"))

## 
## ==================================================
##                 First=0     Last =0     Effect    
## --------------------------------------------------
## (Intercept)       0.86 ***   -2.10 ***   -0.57 ***
##                  (0.19)      (0.31)      (0.15)   
## mid              -0.62 **                         
##                  (0.22)                           
## hig              -1.58 ***                        
##                  (0.26)                           
## sctypprivate     -1.38 ***                        
##                  (0.28)                           
## Dlow                          1.58 ***            
##                              (0.26)               
## Dmid                          0.96 ***            
##                              (0.22)               
## Dsctyp                        1.38 ***            
##                              (0.28)               
## Elow                                      0.74 ***
##                                          (0.14)   
## Emid                                      0.11    
##                                          (0.12)   
## Esctyp                                    0.69 ***
##                                          (0.14)   
## --------------------------------------------------
## AIC              37.72       37.72       37.72    
## BIC              36.89       36.89       36.89    
## Log Likelihood  -14.86      -14.86      -14.86    
## Deviance          4.68        4.68        4.68    
## Num. obs.         6           6           6       
## ==================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05

Same model, just different value of the coefficients.

Same odds ratios

Holding sctype constant: public vs private odds ratios

(odds.dummy1 <- exp(0 - logit.bin$coefficients[4]))

## sctypprivate 
##     3.961426

(odds.dummy2 <-  exp(logit.bin2$coefficients[4]- 0))

##   Dsctyp 
## 3.961426

(odds.effect3 <-  exp(logit.bin3$coefficients[4]+logit.bin3$coefficients[4]))

##   Esctyp 
## 3.961426

You can re-code school type to get exactly what are in the lecture notes

Nominal versus Ordinal Explanatory Variable?

SES is definitely ordinal, but first treat is as a factor

hsb3way.wide$xses  <- as.numeric(hsb3way.wide$ses)

# -- ses as factor
summary( logit.factor <- glm(cbind(yes,no) ~  ses + sctyp, family = binomial, data = hsb3way.wide) )

## 
## Call:
## glm(formula = cbind(yes, no) ~ ses + sctyp, family = binomial, 
##     data = hsb3way.wide)
## 
## Deviance Residuals: 
##       1        2        3        4        5        6  
## -1.8337   0.7284   0.6186   0.5513  -0.1931  -0.2439  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    0.8587     0.1857   4.623 3.78e-06 ***
## sesmiddle     -0.6244     0.2209  -2.826  0.00471 ** 
## seshigh       -1.5848     0.2577  -6.149 7.81e-10 ***
## sctypprivate  -1.3766     0.2793  -4.929 8.25e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 84.8101  on 5  degrees of freedom
## Residual deviance:  4.6765  on 2  degrees of freedom
## AIC: 37.72
## 
## Number of Fisher Scoring iterations: 4

Note SES will be numeric (this is special case of when treated as a factor)

summary( logit.numeric <- glm(cbind(yes,no) ~  xses + sctyp, family = binomial, data = hsb3way.wide) )

## 
## Call:
## glm(formula = cbind(yes, no) ~ xses + sctyp, family = binomial, 
##     data = hsb3way.wide)
## 
## Deviance Residuals: 
##       1        2        3        4        5        6  
## -1.9954   0.5755   0.8124   0.1059  -0.6799   0.3775  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    1.7493     0.2762   6.333 2.40e-10 ***
## xses          -0.7972     0.1289  -6.185 6.20e-10 ***
## sctypprivate  -1.3604     0.2784  -4.887 1.02e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 84.8101  on 5  degrees of freedom
## Residual deviance:  5.5889  on 3  degrees of freedom
## AIC: 36.633
## 
## Number of Fisher Scoring iterations: 4

Compute goodness-of-fit statistics for logit.numeric

hsb3way.wide$pi.numeric <- logit.numeric$fitted
Xsq.yes <- (hsb3way.wide$yes -     hsb3way.wide$pi.numeric*hsb3way.wide$ncases)**2   /(hsb3way.wide$pi.numeric*hsb3way.wide$ncases)
Xsq.no <-  (hsb3way.wide$no   - (1-hsb3way.wide$pi.numeric)*hsb3way.wide$ncases)**2 /((1-hsb3way.wide$pi.numeric)*hsb3way.wide$ncases)
(Xsq <- sum(Xsq.yes + Xsq.no))

## [1] 4.507544

Gsq.yes <- 2*hsb3way.wide$yes *log(hsb3way.wide$yes/(hsb3way.wide$ncases*hsb3way.wide$pi.numeric))
Gsq.no <- 2*hsb3way.wide$no * log(hsb3way.wide$no/(hsb3way.wide$ncases*(1-hsb3way.wide$pi.numeric)))
( Gsq <- sum(Gsq.yes+Gsq.no) )

## [1] 5.588936

# -- LR test of restriction on values for SES
anova(logit.numeric, logit.factor)

## Analysis of Deviance Table
## 
## Model 1: cbind(yes, no) ~ xses + sctyp
## Model 2: cbind(yes, no) ~ ses + sctyp
##   Resid. Df Resid. Dev Df Deviance
## 1         3     5.5889            
## 2         2     4.6765  1  0.91243

Can treat SES as numeric with evenly spaced scores.

Odds Ratio with ordinal/numeric SES

Just some here

logit.numeric$coefficients

##  (Intercept)         xses sctypprivate 
##    1.7492951   -0.7972102   -1.3604133

# -- Holding schtyp constant, odds of academic are 
exp(logit.numeric$coefficients[2])

##      xses 
## 0.4505843

# the odds for an increase of SES by 1 unit

# -- odds academic of low vs high ses
exp((3-1)*logit.numeric$coefficients[2])

##      xses 
## 0.2030262

Fitting Data at Individual Level

(1) Simple: program ~ math

summary(logit.mod1 <- glm(program ~ math, data=hsb,family = binomial))

## 
## Call:
## glm(formula = program ~ math, family = binomial, data = hsb)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2997  -0.9562   0.4355   0.9806   1.9107  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.58539    0.58066  -9.619   <2e-16 ***
## math         0.10933    0.01119   9.770   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 831.35  on 599  degrees of freedom
## Residual deviance: 709.27  on 598  degrees of freedom
## AIC: 713.27
## 
## Number of Fisher Scoring iterations: 4

j <- order(hsb$math)                                    
plot(hsb$math[j],logit.mod1$fitted[j], 
    type='l',
    lwd=2,
    ylim=c(0,1),
    col="blue",
    main="Model 1: program ~ math")

(2) SES as nominal

hsb$ses <- as.factor(hsb$ses)
summary(logit.mod2 <- glm(program ~ math + ses, data=hsb,family = binomial))

## 
## Call:
## glm(formula = program ~ math + ses, family = binomial, data = hsb)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2142  -0.9260   0.4107   0.9469   2.0086  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.52362    0.59008  -9.361  < 2e-16 ***
## math         0.09889    0.01147   8.620  < 2e-16 ***
## ses2         0.35070    0.23577   1.487    0.137    
## ses3         1.15033    0.27543   4.177 2.96e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 831.35  on 599  degrees of freedom
## Residual deviance: 689.30  on 596  degrees of freedom
## AIC: 697.3
## 
## Number of Fisher Scoring iterations: 3

# - subset for graphing
hsb$fitted2 <- logit.mod2$fitted
low <- hsb[ which(hsb$ses=="1"),]
mid <- hsb[ which(hsb$ses=="2"),]
hi  <- hsb[ which(hsb$ses=="3"),]
j <- order(low$math)                                    
plot(low$math[j],low$fitted2[j], 
    type='l',
    lwd=2,
    ylim=c(0,1),
    xlim=c(30,80),
    col="blue",
    main="Model 2: program ~ math + ses (as factor) "
    )
j <- order(mid$math)                                    
lines(mid$math[j],mid$fitted2[j],col="green",lwd=2)
j <- order(hi$math)                                 
lines(hi$math[j],hi$fitted2[j],col="red",lwd=2)
legend("topleft",legend=c("high","middle","low"), 
       col=c("red","green","blue"), lty=1,lwd=2, cex=1)

(3) numeric SES

hsb$ses <- as.numeric(hsb$ses)
summary(logit.mod3 <- glm(program ~ math + ses, data=hsb,family = binomial))

## 
## Call:
## glm(formula = program ~ math + ses, family = binomial, data = hsb)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1836  -0.9460   0.4366   0.9652   2.0587  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.19142    0.61298 -10.101  < 2e-16 ***
## math         0.09801    0.01140   8.595  < 2e-16 ***
## ses          0.58374    0.13762   4.242 2.22e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 831.35  on 599  degrees of freedom
## Residual deviance: 690.76  on 597  degrees of freedom
## AIC: 696.76
## 
## Number of Fisher Scoring iterations: 3

# - subset for graphing
hsb$fitted3 <- logit.mod3$fitted
low <- hsb[ which(hsb$ses=="1"),]
mid <- hsb[ which(hsb$ses=="2"),]
hi  <- hsb[ which(hsb$ses=="3"),]
j <- order(low$math)                                    
plot(low$math[j],low$fitted3[j], 
    type='l',
    lwd=2,
    ylim=c(0,1),
    xlim=c(30,80),
    col="blue",
    main="Model 3: program ~ math + ses (numeric) "
    )
j <- order(mid$math)                                    
lines(mid$math[j],mid$fitted3[j],col="green",lwd=2)
j <- order(hi$math)                                 
lines(hi$math[j],hi$fitted3[j],col="red",lwd=2)
legend("topleft",legend=c("high","middle","low"), 
       col=c("red","green","blue"), lty=1,lwd=2, cex=1)

(4) program ~ math + ses + sctyp

hsb$sctyp <- as.factor(hsb$sctyp)
summary(logit.mod4 <- glm(program ~ math + ses + sctyp, data=hsb,family = binomial))

## 
## Call:
## glm(formula = program ~ math + ses + sctyp, family = binomial, 
##     data = hsb)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1063  -0.8883   0.2979   0.8911   2.1101  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.24834    0.62568  -9.986  < 2e-16 ***
## math         0.09858    0.01166   8.454  < 2e-16 ***
## ses          0.49856    0.14069   3.544 0.000395 ***
## sctyp2       1.36466    0.28982   4.709 2.49e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 831.35  on 599  degrees of freedom
## Residual deviance: 665.42  on 596  degrees of freedom
## AIC: 673.42
## 
## Number of Fisher Scoring iterations: 4

# - subset for graphing
hsb$fitted4 <- logit.mod4$fitted
low.public  <- hsb[ (which(hsb$ses==1 & hsb$sctyp=="1")),]
low.private <- hsb[ (which(hsb$ses==1 & hsb$sctyp=="2")),]
mid.public  <- hsb[ (which(hsb$ses==2 & hsb$sctyp=="1")),]
mid.private <- hsb[ (which(hsb$ses==2 & hsb$sctyp=="2")),]
hi.public   <- hsb[ (which(hsb$ses==3 & hsb$sctyp=="1")),]
hi.private  <- hsb[ (which(hsb$ses==3 & hsb$sctyp=="2")),]
j <- order(low.public$math)                                
plot(low.public$math[j],low.public$fitted4[j], 
    type='l',
    lwd=2,
    ylim=c(0,1),
    xlim=c(30,80),
    col="blue",
    main="Model 4: program ~ math + ses (numeric) + school type  "
    )
j <- order(low.private$math)                                    
lines(low.private$math[j],low.private$fitted4[j],col="darkblue",lwd=2)
j <- order(mid.public$math)                                 
lines(mid.public$math[j],mid.public$fitted4[j],col="green",lwd=2)
j <- order(mid.private$math)                                    
lines(mid.private$math[j],mid.private$fitted4[j],col="darkgreen",lwd=2)
j <- order(hi.public$math)                                  
lines(hi.public$math[j],hi.public$fitted4[j],col="red",lwd=2)
j <- order(hi.private$math)                                 
lines(hi.private$math[j],hi.private$fitted4[j],col="darkred",lwd=2)
legend("bottomright",legend=c("private high SES", "private middle SES","private low SES","public high SES", "public middle SES", "public low SES "), 
       col=c("darkred","darkgreen","darkblue","red","green","blue"), lty=1,lwd=2, cex=1)

(5) An interaction program ~ math + ses + sctyp + ses*sctyp

hsb$sctyp <- as.factor(hsb$sctyp)
summary(logit.mod5 <- glm(program ~ math + ses + sctyp + ses*sctyp, data=hsb,family = binomial))

## 
## Call:
## glm(formula = program ~ math + ses + sctyp + ses * sctyp, family = binomial, 
##     data = hsb)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1263  -0.8776   0.2376   0.8949   2.0717  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -6.1130     0.6251  -9.780   <2e-16 ***
## math          0.1006     0.0118   8.526   <2e-16 ***
## ses           0.3804     0.1483   2.566   0.0103 *  
## sctyp2       -1.1328     1.1223  -1.009   0.3128    
## ses:sctyp2    1.1929     0.5293   2.254   0.0242 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 831.35  on 599  degrees of freedom
## Residual deviance: 659.44  on 595  degrees of freedom
## AIC: 669.44
## 
## Number of Fisher Scoring iterations: 5

# - subset for graphing
hsb$fitted5 <- logit.mod5$fitted
low.public  <- hsb[ (which(hsb$ses==1 & hsb$sctyp=="1")),]
low.private <- hsb[ (which(hsb$ses==1 & hsb$sctyp=="2")),]
mid.public  <- hsb[ (which(hsb$ses==2 & hsb$sctyp=="1")),]
mid.private <- hsb[ (which(hsb$ses==2 & hsb$sctyp=="2")),]
hi.public   <- hsb[ (which(hsb$ses==3 & hsb$sctyp=="1")),]
hi.private  <- hsb[ (which(hsb$ses==3 & hsb$sctyp=="2")),]
j <- order(low.public$math)                                
plot(low.public$math[j],low.public$fitted5[j], 
    type='l',
    lwd=2,
    ylim=c(0,1),
    xlim=c(30,80),
    col="blue",
    main="Model 5: program ~ math + ses + schyp + ses*schtyp "
    )
j <- order(low.private$math)                                    
lines(low.private$math[j],low.private$fitted5[j],col="darkblue",lwd=2)
j <- order(mid.public$math)                                 
lines(mid.public$math[j],mid.public$fitted5[j],col="green",lwd=2)
j <- order(mid.private$math)                                    
lines(mid.private$math[j],mid.private$fitted5[j],col="darkgreen",lwd=2)
j <- order(hi.public$math)                                  
lines(hi.public$math[j],hi.public$fitted5[j],col="red",lwd=2)
j <- order(hi.private$math)                                 
lines(hi.private$math[j],hi.private$fitted5[j],col="darkred",lwd=2)

legend("bottomright",legend=c("private high SES", "private middle SES","private low SES","public high SES", "public middle SES", "public low SES "), 
       col=c("darkred","darkgreen","darkblue","red","green","blue"), lty=1,lwd=2, cex=1)

Diagonistics for model 5

Hosmer-Lemeshow

hoslem.test(logit.mod5$y, fitted(logit.mod5), g=10)

## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  logit.mod5$y, fitted(logit.mod5)
## X-squared = 7.0278, df = 8, p-value = 0.5336

qq plot residuals

Using Pearson (adjusted standardized residuals

resid <- rstandard(logit.mod5, pearson=TRUE)
qqnorm(resid,
       main="QQ plot of  Pearson  Residuals",
         col="blue")

### ROC and concordance #### Concordance index

Cstat(logit.mod1,resp=hsb$program)

## [1] 0.750556

Cstat(logit.mod2,resp=hsb$program)

## [1] 0.7687856

Cstat(logit.mod3,resp=hsb$program)

## [1] 0.7679294

Cstat(logit.mod4,resp=hsb$program)

## [1] 0.7867873

Cstat(logit.mod5,resp=hsb$program)

## [1] 0.7888888

For ROC w/ multivariate logitistic regression

roc.data5 <-roc(logit.mod5$y , logit.mod5$fitted.values,ci=T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

names(roc.data5)

##  [1] "percent"            "sensitivities"      "specificities"     
##  [4] "thresholds"         "direction"          "cases"             
##  [7] "controls"           "fun.sesp"           "auc"               
## [10] "call"               "original.predictor" "original.response" 
## [13] "predictor"          "response"           "levels"            
## [16] "ci"

plot((1-roc.data5$specificities),roc.data5$sensitivities,
      type='l',
      lwd=2,
      col="blue",
     main="ROC for Models 5, 1 and chance",
     xlab="1-sensitivity (false positives)",
     ylab="Specificity (true positives)"
     )
roc.data1 <- roc(logit.mod1$y, logit.mod1$fitted.values,ci=T)

## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

lines((1-roc.data1$specificities), roc.data1$sensitivities, col="pink", lwd=2)
text(.1,.99,"model 5: c=.79",col="blue")
text(.1,.94,"model 1: c=.75",col="pink")
text(.1,.89,"chance:  c=.50",col="black")
lines(c(0,1),c(0,1), lty=2)
legend("bottomright",
        legend=c("math + ses + schyp + ses*schtyp", "math","chance"),
        col=c("blue","pink","black"), lty=c(1,1,2), lwd=2)

Even more More predictors and problem of multicolinearity

summary(logit.mod6  <- glm(program ~ math + ses + sctyp + ses*sctyp + ses*math + math*sctyp + math*ses*sctyp, data=hsb,family = binomial))

## 
## Call:
## glm(formula = program ~ math + ses + sctyp + ses * sctyp + ses * 
##     math + math * sctyp + math * ses * sctyp, family = binomial, 
##     data = hsb)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1257  -0.8892   0.2191   0.8954   2.1101  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)
## (Intercept)    -6.53006   11.73324  -0.557    0.578
## math            0.13145    0.23686   0.555    0.579
## ses             0.22544    5.36591   0.042    0.966
## sctyp          -0.29092   11.25704  -0.026    0.979
## ses:sctyp       0.52594    5.13468   0.102    0.918
## math:ses       -0.02044    0.10855  -0.188    0.851
## math:sctyp     -0.01705    0.22784  -0.075    0.940
## math:ses:sctyp  0.01329    0.10438   0.127    0.899
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 831.35  on 599  degrees of freedom
## Residual deviance: 659.21  on 592  degrees of freedom
## AIC: 675.21
## 
## Number of Fisher Scoring iterations: 6

# center all variables
hsb$cmath <- scale(hsb$math,center=TRUE,scale=FALSE)
hsb$cses <- scale(hsb$ses,center=TRUE,scale=FALSE)
hsb$sctyp <- as.numeric(hsb$sctyp)
hsb$csctyp <- scale(hsb$sctyp,center=TRUE,scale=FALSE)

summary(logit.mod7 <- glm(program ~ cmath + cses + csctyp + cses*csctyp + cses*cmath + cmath*csctyp +cmath*cses*csctyp, data=hsb,family = binomial))

## 
## Call:
## glm(formula = program ~ cmath + cses + csctyp + cses * csctyp + 
##     cses * cmath + cmath * csctyp + cmath * cses * csctyp, family = binomial, 
##     data = hsb)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1257  -0.8892   0.2191   0.8954   2.1101  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        0.090215   0.098616   0.915 0.360290    
## cmath              0.101395   0.012566   8.069 7.08e-16 ***
## cses               0.570854   0.156184   3.655 0.000257 ***
## csctyp             1.301414   0.317330   4.101 4.11e-05 ***
## cses:csctyp        1.214812   0.613601   1.980 0.047725 *  
## cmath:cses        -0.005071   0.021691  -0.234 0.815147    
## cmath:csctyp       0.010035   0.047429   0.212 0.832442    
## cmath:cses:csctyp  0.013286   0.104381   0.127 0.898716    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 831.35  on 599  degrees of freedom
## Residual deviance: 659.21  on 592  degrees of freedom
## AIC: 675.21
## 
## Number of Fisher Scoring iterations: 6

hsb$sctyp <- as.factor(hsb$sctyp)
summary(logit.mod8 <-  glm(program ~ math + ses + sctyp + ses*sctyp + rdg + sci + wrtg + civ, data=hsb,family = binomial))

## 
## Call:
## glm(formula = program ~ math + ses + sctyp + ses * sctyp + rdg + 
##     sci + wrtg + civ, family = binomial, data = hsb)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2309  -0.8324   0.1797   0.8307   2.1526  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -7.55608    0.75424 -10.018  < 2e-16 ***
## math         0.06638    0.01624   4.088 4.35e-05 ***
## ses          0.33006    0.15326   2.154  0.03127 *  
## sctyp2      -1.14500    1.13485  -1.009  0.31300    
## rdg          0.04145    0.01557   2.662  0.00778 ** 
## sci         -0.03295    0.01488  -2.215  0.02679 *  
## wrtg         0.01379    0.01425   0.968  0.33296    
## civ          0.04113    0.01318   3.122  0.00180 ** 
## ses:sctyp2   1.22596    0.54124   2.265  0.02351 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 831.35  on 599  degrees of freedom
## Residual deviance: 629.52  on 591  degrees of freedom
## AIC: 647.52
## 
## Number of Fisher Scoring iterations: 5

screenreg(list(logit.mod1,logit.mod5, logit.mod8),
          custom.model.names=c("Model 1", "Model 5", "Model 8"))

## 
## =====================================================
##                 Model 1      Model 5      Model 8    
## -----------------------------------------------------
## (Intercept)       -5.59 ***    -6.11 ***    -7.56 ***
##                   (0.58)       (0.63)       (0.75)   
## math               0.11 ***     0.10 ***     0.07 ***
##                   (0.01)       (0.01)       (0.02)   
## ses                             0.38 *       0.33 *  
##                                (0.15)       (0.15)   
## sctyp2                         -1.13        -1.15    
##                                (1.12)       (1.13)   
## ses:sctyp2                      1.19 *       1.23 *  
##                                (0.53)       (0.54)   
## rdg                                          0.04 ** 
##                                             (0.02)   
## sci                                         -0.03 *  
##                                             (0.01)   
## wrtg                                         0.01    
##                                             (0.01)   
## civ                                          0.04 ** 
##                                             (0.01)   
## -----------------------------------------------------
## AIC              713.27       669.44       647.52    
## BIC              722.07       691.43       687.10    
## Log Likelihood  -354.64      -329.72      -314.76    
## Deviance         709.27       659.44       629.52    
## Num. obs.        600          600          600       
## =====================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05

# Examine correlations among achievement tests
var.names <- c("math","rdg","sci","wrtg","civ")
var.data <- hsb[var.names]
cor.mat <- cor(var.data,method="pearson")
round(cor.mat,2)

##      math  rdg  sci wrtg  civ
## math 1.00 0.68 0.65 0.63 0.53
## rdg  0.68 1.00 0.69 0.63 0.59
## sci  0.65 0.69 1.00 0.57 0.52
## wrtg 0.63 0.63 0.57 1.00 0.59
## civ  0.53 0.59 0.52 0.59 1.00

# Note eigen-vectors can be reflected (i.e., multiply by -1)
eigen(cor.mat)

## eigen() decomposition
## $values
## [1] 3.4351681 0.5285769 0.4125608 0.3297962 0.2938980
## 
## $vectors
##            [,1]       [,2]       [,3]        [,4]        [,5]
## [1,] -0.4570127 -0.2941759 -0.2688896 -0.76713446  0.20929173
## [2,] -0.4697851 -0.1837444  0.2013959  0.05109347 -0.83807465
## [3,] -0.4472481 -0.4684827  0.4051036  0.43430444  0.47724613
## [4,] -0.4444552  0.2427263 -0.7384324  0.44293682  0.04547653
## [5,] -0.4157765  0.7754395  0.4216003 -0.15517991  0.15490600

Multiple Logstic Regression: HSB

Carolyn J. Anderson

2022-10-17

Set up

libraries

Data

In long format (data frame):

Long table:

Wide table:

Some Logit Models

Independence

Homogeneous Associaiton

Logit Models as poisson regression models

Identification Constraints

Dummy: first set to zer0

Alternate Dummy Codes

Effects codes

Comparison

Same odds ratios

Nominal versus Ordinal Explanatory Variable?

Odds Ratio with ordinal/numeric SES

Fitting Data at Individual Level

(1) Simple: program ~ math

(2) SES as nominal

(3) numeric SES

(4) program ~ math + ses + sctyp

(5) An interaction program ~ math + ses + sctyp + ses*sctyp

Diagonistics for model 5

Hosmer-Lemeshow

qq plot residuals

For ROC w/ multivariate logitistic regression

More predictors and problem of multicolinearity

center all variables

Examine correlations among achievement tests

qq plot residuals

ROC and concordance

For ROC w/ multivariate logitistic regression

Even more More predictors and problem of multicolinearity