We will start with exploratory data analysis, followed by model the data, and assessing model fit and assumptions. Included in this example are location-scale models.

Many of the figures are in the lecture notes on model building. The page numbers are those in the lecture notes (at least a close approximation of their location)

Set up

library(lme4)
library(lmerTest)
library(lattice)
library(ggplot2)
library(stringi)

library(HLMdiag)
## Warning: package 'HLMdiag' was built under R version 4.0.5
library(texreg)
library(optimx)

We will use some of the functions described in lecture on inference of the marginal model (e.g., robust, contrast, hlmRsq, etc.)

source("D:\\Dropbox\\edps587\\lectures\\6 inference1\\All_functions.txt")
#source("C:\\Users\\cja\\Dropbox\\edps587\\lectures\\6 inference1\\All_functions.txt")

NELS Data

nels<-read.table("D:\\Dropbox\\edps587\\lectures\\8 modelbuilding\\nels23_data.txt",header=TRUE)
#nels<-read.table("C:\\Users\\cja\\Dropbox\\edps587\\lectures\\8 modelbuilding\\nels23_data.txt",header=TRUE)

#look at first few rows of data set
head(nels)
##   school student sex race homew schtype   ses pared math classstr schsize
## 1   7472       3   2    4     1       1 -0.13     2   48        2       3
## 2   7472       8   1    4     0       1 -0.39     2   48        2       3
## 3   7472      13   1    4     0       1 -0.80     2   53        2       3
## 4   7472      17   1    4     1       1 -0.72     2   42        2       3
## 5   7472      27   2    4     2       1 -0.74     2   43        2       3
## 6   7472      28   2    4     1       1 -0.58     2   57        2       3
##   urbana geo minority ratio
## 1      2   2        0    19
## 2      2   2        0    19
## 3      2   2        0    19
## 4      2   2        0    19
## 5      2   2        0    19
## 6      2   2        0    19

Read in the data and create some variables we’ll need.

nels$public <- ifelse (nels$schtype==1, 1, 0)
nels$white  <- ifelse (nels$race==4, 1, 0)


schMses <- as.data.frame(aggregate(ses~school, data=nels, FUN="mean"))
names(schMses) <- c('school', 'schMses')
nels <- merge(nels,schMses, by=c('school'))
nels$schCses <- nels$ses - nels$schMses

nels$homewrk <- as.numeric(nels$homew)
nels$school  <- as.factor(nels$school)

head(nels)
##   school student sex race homew schtype   ses pared math classstr schsize
## 1   6053       1   2    4     1       4  0.85     4   50        3       3
## 2   6053       2   2    4     1       4  0.43     3   43        3       3
## 3   6053       4   2    1     3       4 -0.59     3   50        3       3
## 4   6053      11   2    4     1       4  1.02     5   49        3       3
## 5   6053      12   1    4     1       4  0.84     5   62        3       3
## 6   6053      13   2    4     1       4  1.32     6   43        3       3
##   urbana geo minority ratio public white   schMses    schCses homewrk
## 1      1   2        3    18      0     1 0.6997727  0.1502273       1
## 2      1   2        3    18      0     1 0.6997727 -0.2697727       1
## 3      1   2        3    18      0     0 0.6997727 -1.2897727       3
## 4      1   2        3    18      0     1 0.6997727  0.3202273       1
## 5      1   2        3    18      0     1 0.6997727  0.1402273       1
## 6      1   2        3    18      0     1 0.6997727  0.6202273       1

That should be enough for now.

Math as a function of Homework

Linear regression model and overall 95% CI fit to the data

math.homewrk <- lm(math~homewrk, data=nels)
conflm1<-confint(math.homewrk)
plot(nels$homewrk,nels$math,type='p', main='Overall regression with 95% CI')
abline(math.homewrk)
abline(coef=conflm1[,1],lty=2)
abline(coef=conflm1[,2],lty=2)

Smooth (loess) curve fit to data

Fit loess to data are plot

lw1 <- loess(math ~ homewrk,data=nels)
# --- plot data
plot(nels$homewrk,nels$math,
     main="NELS:Math Score Plotted vs Homework \n with smooth (loess) curve",
     xlab="Time Spent Doing homework",
     ylab="Math Scores",
     cex=1.0,
     col="black"
     )  
# --- put in proper order
    j <- order(nels$homewrk)
# --- Add loess to figure   
lines(nels$homewrk[j],lw1$fitted[j],col="blue",lwd=3)

Joint points for each school

This graph is around page 25. We compute means mean of math for each level of time spent doing home for rach school. These are plotted just connecting points (one curve for each school)

math.hmwk <- aggregate(math ~ homewrk + school, data=nels, FUN="mean")

ggplot(math.hmwk, aes(x=homewrk, y=math, col=school,type='l'),
  main='Regressions all in one plot')  + geom_line() +  theme(legend.position="none")

Or an alternative method

plot(nels$homewrk,nels$math,
     main="NELS:Math Score Plotted vs Homework \n joining means for each school",
     xlab="Time Spent Doing homework",
     ylab="School Mean Math Scores",
     cex=1.0,
     col="black",
     type="n"
     )  
# --- put in proper order
    j <- order(math.hmwk$homewrk)
# --- Add to figure 
for (i in unique(math.hmwk$school)) {
  sub <- math.hmwk[which(math.hmwk$school==i),]
  lines(sub$homewrk,sub$math,col=i)
} 

Lattice Plots

page 32 linear regression math~homewrk + race

xyplot(math ~ homew | school, data=nels, col.line='black', 
                      type=c('p'),
                      main='Varability in Math ~ homework relationship')

page 19 - 20

Lattice plot with regressions overlaid

xyplot(math ~ homew | school, data=nels, col.line='black', 
                      type=c('p','r'),
                      main='Varability in Math ~ homework relationship')

page 23-24

Lattice plot with smooth regressions (i.e., loess)

xyplot(math ~ homew | school, data=nels, col.line='black', 
                      type=c('p','smooth'),
                      main='Varability in Math ~ homework relationship')

page 25

compute mean and plot by school just connecting points

math.hmwk <- aggregate(math ~ homewrk + school, data=nels, FUN="mean")

ggplot(math.hmwk, aes(x=homewrk, y=math, col=school,type='l'),
  main='Regressions all in one plot')  + geom_line() +  theme(legend.position="none")

## page 26 All regressions in one figure

ggplot(nels, aes(x=homewrk, y=math, col=school,type='l'),
       main='Regressions all in one plot')  + geom_smooth(method="lm", se=FALSE ) +  theme(legend.position="none")
## `geom_smooth()` using formula = 'y ~ x'

Alternate code to put all regressions in a single plot:

Set up graphic frame:

plot(nels$homew,nels$math,
    type="n", 
    col="blue",
    lwd=2,
    main="NELS: Linear Regression by School \n(for where there is data)",
    xlab="Time Spent Doing Homework",
    ylab="Math Scores"
    )
# --- loop through the schools and add line
for (j in unique(nels$school)) {
  sub <- nels[ which(nels$school==j),]
  fitted <- fitted(lm(math~homew,sub))
  lines(sub$homew,fitted,col=j)
}   

Math ~ homework + white

Need this for next set of graphs,

Draw mean math and plot by white

w <- as.factor(nels$white)
math.white <- aggregate(math ~ homewrk + w, data=nels, FUN="mean")
ggplot(math.white, aes(x=homewrk, y=math, col=w, type='l'),
            main='Regressions all in one plot')  + geom_line() 

Alternate that looks nicer:

plot(math.white$homewrk[1:7],math.white$math[1:7],type="l",
     main="Mean Math by Homework",
     xlab="Time Spent Doing Homework",
     ylab="Mean Math Score",
     col="blue")
lines(math.white$homewrk[8:14],math.white$math[8:14],type="l",col="red")
legend("topleft",legend=c("Non-White", "White"),
       col=c("blue","red"), lty=1, cex=1.2)   

page 29 linear regression math~homewrk + white

plot(math.white$homewrk,math.white$math,type="n",
     main="Regression of Math on Homework",
     xlab="Time Spent Doing Homework",
     ylab="Mean Math Score",
     ylim=c(40,70))
subw <- math.white[which(math.white$w==0),]
lm1 <- lm(math ~ homewrk,data=subw)
lines(subw$homewrk,fitted(lm1),col="red")
subw <- math.white[which(math.white$w==1),]
lm2 <- lm(math ~ homewrk,data=subw)
lines(subw$homewrk,fitted(lm2),col="blue")
legend("topleft",legend=c("Non-White", "White"),
       col=c("red","blue"), lty=1, cex=1.2)   

Regression to whole uncollapsed data set

plot(math.white$homewrk,math.white$math,type="n",
     main="Regression of Math on Homework",
     xlab="Time Spent Doing Homework",
     ylab="Mean Math Score",
     ylim=c(40,70))
subw <- nels[which(nels$white==1),]
abline(lm(math~homewrk,data=subw),col="blue")
subn <- nels[which(nels$white==0),]
abline(lm(math~homewrk,data=subn),col="red")
legend("topleft",legend=c("Non-White", "White"),
       col=c("red","blue"), lty=1, cex=1.2)   

Math mean ~ homework + by race

Need to get means by race

r <- as.factor(nels$race)
math.race <- aggregate(math ~ homewrk + r, data=nels, FUN="mean")

math ~ homework + race (page 32ish)

plot(math.race$homewrk[1:5],math.race$math[1:5],type="l",
     main="Mean Math by Homework",
     xlab="Time Spent Doing Homework",
     ylab="Mean Math Score",
     ylim=c(30,70),
     xlim=c(0,7),
     col="blue")
lines(math.race$homewrk[6:11],math.race$math[6:11],type="l", col="red")
lines(math.race$homewrk[12:18],math.race$math[12:18],type="l", col="darkgreen")
lines(math.race$homewrk[19:26],math.race$math[19:26],type="l", col="purple")
lines(math.race$homewrk[27:28],math.race$math[27:28],type="l", col="cyan")
legend("topleft",legend=c("Asian/PI", "Hispanic","Black","White","Native Amercian"),
       col=c("blue","red","darkgreen","purple","cyan"), lty=1, cex=1.2)   

Some more

page 32 linear regression math~homework + race

regressions extend beyond range where there is data

par(mfrow=c(1,1))
plot(math.race$homewrk[1:5],math.race$math[1:5],type="n",
     main="Mean Math by Homework",
     xlab="Time Spent Doing Homework",
     ylab="Mean Math Score",
     ylim=c(30,70),
     xlim=c(0,7))
abline(lm(math.race$math[1:5]~math.race$homewrk[1:5]),type="l",col="blue")
abline(lm(math.race$math[6:11]~math.race$homewrk[6:11]),type="l",col="red")
abline(lm(math.race$math[12:18]~math.race$homewrk[12:18]),type="l",col="darkgreen")
abline(lm(math.race$math[19:26]~math.race$homewrk[19:26]),type="l",col="purple")
abline(lm(math.race$math[27:28]~math.race$homewrk[27:28]),type="l",col="cyan")
legend("topleft",legend=c("Asian/PI", "Hispanic","Black","White","Native American"),
       col=c("blue","red","darkgreen","purple","cyan"), lty=1, cex=0.9)   

page 34 race x homework to show where there is not much data

Does not extend beyond the range of data. Set up graphic frame and then add stuff

plot(nels$homew,nels$math,
    type="n", 
    col="blue",
    lwd=2,
    main="NELS: Linear Regression by School \n(for where there is data)",
    xlab="Time Spent Doing Homework",
    ylab="Math Scores"
    )
# --- loop through the schools and add line
for (j in nels$race) {
  sub <- nels[ which(nels$race==j),]
  fitted <- fitted(lm(math~homew,sub))
  lines(sub$homew,fitted,col=j)
}
legend("topleft",legend=c("Asian/PI", "Hispanic","Black","White","Native Amercian"),
       col=c("blue","red","darkgreen","purple","cyan"), lty=1, cex=0.9)   

Cross-classification of homew x race

There’s not much data for some races and homework levels

with(nels, table(homew, race))
##      race
## homew   1   2   3   4   5
##     0   0   5   3  34   0
##     1   8  22  30 162   3
##     2   7   4  20  79   1
##     3   2   5   7  33   0
##     4   1   2   4  40   0
##     5   2   1   1  34   0
##     6   0   0   1   5   0
##     7   0   0   0   3   0

Math ~ homework + gender

page 36 mean math by gender

nels$gender <- as.factor(nels$sex)
math.gender <- aggregate(math ~ homewrk + gender, data=nels, FUN="mean")

nels$gender <- as.factor(nels$sex)
math.gender <- aggregate(math ~ homewrk + gender, data=nels, FUN="mean")

For many of these graphs, I also have code for ggplot; however, it doesn’t seem to be working, for example…

nels$gender <- as.factor(nels$sex)
math.gender <- aggregate(math ~ homewrk + gender, data=nels, FUN="mean")

ggplot(math.gender, aes(x=homewrk, y=math, col=gender, type='l'),
            main='Regressions all in one plot')  + geom_line() 
text(0,77,'1=Male,',col='red')
text(1.5,77,'2=Female',col='lightseagreen')

But this works

nels$gender <- as.factor(nels$sex)
math.gender <- aggregate(math ~ homewrk + gender, data=nels, FUN="mean")

plot(math.gender$homewrk,math.gender$math,type="n",
     main="Regression of Math on Homework \n Seperate lines for genders",
     xlab="Time Spent Doing Homework",
     ylab="Mean Math Score")
lines(math.gender$homewrk[1:8],math.gender$math[1:8],col="blue")
lines(math.gender$homewrk[9:15],math.gender$math[9:15],col="red")
legend("topleft",legend=c("Female", "Male"),
       col=c("blue","red"), lty=1, cex=1.2)   

Cross-classification with gender

This is around page 37

with(nels,table(sex, homewrk))
##    homewrk
## sex   0   1   2   3   4   5   6   7
##   1  27 110  46  23  24  14   2   3
##   2  15 115  65  24  23  24   4   0

graph math ~ homework + gender

These lines extend beyond data

plot(math.gender$homewrk,math.gender$math,type="n",
     main="Regression of Mean Math on Homework \n Seperate lines for genders",
     xlab="Time Spent Doing Homework",
     ylab="Mean Math Score",
     ylim=c(45,70))
abline(lm(math[1:7]~homewrk[1:7],data=nels),col="blue")
abline(lm(math[8:14]~homewrk[8:14],data=nels),col="red")
legend("topleft",legend=c("Female", "Male"),
       col=c("blue","red"), lty=1, cex=1.2)   

These do not extend beyond the range of data and fits lines to uncollapsed data

#  --- set up graphic frame:
plot(nels$homew,nels$math,
    type="n", 
    col="blue",
    lwd=2,
    main="NELS: Linear Regression by School by Gender \n(for where there is data)",
    xlab="Time Spent Doing Homework",
    ylab="Math Scores"
    )
# --- loop through the schools and add line
for (j in nels$sex) {
  sub <- nels[ which(nels$sex==j),]
  fitted <- fitted(lm(math~homew,sub))
  lines(sub$homew,fitted,col=j)
}  

SES

page 41-42 Lattice plot with regressions

xyplot(math ~ ses | school, data=nels, col.line='black', 
                      type=c('p','r'),
                      main='Varability in Math ~ ses relationship')

page 43-44 Plot 3: Lattice plot with smooth regressions (i.e., loess)

xyplot(math ~ ses | school, data=nels, col.line='black', 
                      type=c('p','smooth'),
                      main='Varability in Math ~ ses relationship')

page 45 All regressions in one figure

#  --- set up graphic frame:
par(ask=FALSE)
plot(nels$homew,nels$ses,
    type="n", 
    col="blue",
    lwd=2,
    main="NELS: Linear Regression by School \n(for where there is data)",
    xlab="Socio-econmic status (SES)",
    ylab="Math Scores",
    ylim=c(30,70),
    xlim=c(-2.5,2.5)
    )
# --- loop through the schools and add line
for (j in unique(nels$school)) {
  sub <- nels[ which(nels$school==j),]
  sub2 <- sub[order(sub$ses),]
  fitted <- fitted(lm(math~ses,sub2))
  lines(sub2$ses,fitted,col=j)
}     

Create 10 groups with about the same number per group

nels$ses.grp <- as.numeric(cut(nels$ses, 10))
nels$ses.grp <- as.factor(nels$ses.grp)

mean.ses.grp <- aggregate(ses~ses.grp, data=nels, FUN="mean")
mean.math.grp<- aggregate(math~ses.grp, data=nels, FUN="mean")

# Plot grouped data
plot(mean.ses.grp$ses,mean.math.grp$math, type='b',
     main="Grouped SES:  math mean vs ses mean",
     xlab="Mean SES",
     ylab="Mean Math",
     col="blue"  )

# Plot math vs SES
plot(nels$ses,nels$math,type='p')    

Fitting models

First make categorical as factors.

nels$white <- as.factor(nels$white)
nels$sex <- as.factor(nels$sex)
nels$public <- as.factor(nels$public)

Empty/baseline

This is just a simple random intercept model.

summary(model0 <- lmer(math ~ (1  | school), data=nels, REML=FALSE,control = lmerControl(optimizer ="Nelder_Mead")))
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
##   method [lmerModLmerTest]
## Formula: math ~ (1 | school)
##    Data: nels
## Control: lmerControl(optimizer = "Nelder_Mead")
## 
##      AIC      BIC   logLik deviance df.resid 
##   3806.8   3819.5  -1900.4   3800.8      516 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.67819 -0.72760 -0.01965  0.73306  2.67062 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  school   (Intercept) 24.85    4.985   
##  Residual             81.24    9.013   
## Number of obs: 519, groups:  school, 23
## 
## Fixed effects:
##             Estimate Std. Error     df t value Pr(>|t|)    
## (Intercept)   50.757      1.127 23.824   45.05   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(icc <- 24.85/(24.85+81.24))
## [1] 0.2342351
bic.hlm(model0, nels$school)
##        n groups total obs haroninc.mean.nj n random n fixed deviance  bic.new
## school       23       519         17.73371        2       1 3800.776 3813.299
##        bic.harm bic.ngrps bic.ntot      aic
## school 3813.058  3810.183 3819.532 3806.776

Preliminary lmm

The equations for this model can be found on pages 134–135 and the output on page 140.

summary( model1 <- lmer(math ~ homew + schCses + sex + white + schCses*white + schMses + public 
                 + (1 + homew | school), data=nels, REML=FALSE,
                 control = lmerControl(optimizer ="Nelder_Mead")))
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
##   method [lmerModLmerTest]
## Formula: math ~ homew + schCses + sex + white + schCses * white + schMses +  
##     public + (1 + homew | school)
##    Data: nels
## Control: lmerControl(optimizer = "Nelder_Mead")
## 
##      AIC      BIC   logLik deviance df.resid 
##   3622.3   3673.3  -1799.1   3598.3      507 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.27541 -0.68326 -0.00958  0.66476  2.94001 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  school   (Intercept) 45.69    6.760         
##           homew       13.30    3.646    -0.91
##  Residual             51.05    7.145         
## Number of obs: 519, groups:  school, 23
## 
## Fixed effects:
##                 Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept)     45.57373    2.10712  46.31987  21.628  < 2e-16 ***
## homew            1.85341    0.81512  20.69597   2.274  0.03376 *  
## schCses          1.24444    0.94965 492.11090   1.310  0.19067    
## sex2            -0.47049    0.66411 491.36235  -0.708  0.47900    
## white1           2.45950    0.98427 373.16369   2.499  0.01289 *  
## schMses          5.26977    1.81333  25.02604   2.906  0.00755 ** 
## public1          0.09328    2.11261  25.52103   0.044  0.96513    
## schCses:white1   1.40386    1.16134 493.26442   1.209  0.22731    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) homew  schCss sex2   white1 schMss publc1
## homew       -0.644                                          
## schCses      0.076 -0.015                                   
## sex2        -0.112 -0.040  0.117                            
## white1      -0.286 -0.020 -0.292 -0.029                     
## schMses     -0.324  0.003  0.067 -0.038 -0.160              
## public1     -0.575  0.014  0.067 -0.031 -0.093  0.714       
## schCss:wht1 -0.027 -0.006 -0.826 -0.094  0.198 -0.057 -0.076
screenreg(model1, single.row=TRUE)
## 
## ==================================================
##                                Model 1            
## --------------------------------------------------
## (Intercept)                       45.57 (2.11) ***
## homew                              1.85 (0.82) *  
## schCses                            1.24 (0.95)    
## sex2                              -0.47 (0.66)    
## white1                             2.46 (0.98) *  
## schMses                            5.27 (1.81) ** 
## public1                            0.09 (2.11)    
## schCses:white1                     1.40 (1.16)    
## --------------------------------------------------
## AIC                             3622.27           
## BIC                             3673.29           
## Log Likelihood                 -1799.14           
## Num. obs.                        519              
## Num. groups: school               23              
## Var: school (Intercept)           45.69           
## Var: school homew                 13.30           
## Cov: school (Intercept) homew    -22.40           
## Var: Residual                     51.05           
## ==================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
bic.hlm(model1, nels$school)
##        n groups total obs haroninc.mean.nj n random n fixed deviance  bic.new
## school       23       519         17.73371        4       8 3598.271 3660.828
##        bic.harm bic.ngrps bic.ntot      aic
## school   3658.9  3635.897 3673.294 3622.271

Too Complex

This model is too complex and does not converge. It has an additional random slope.

Random 1 + homew + schCses

model1b <- lmer(math ~ homew +  schCses + sex + white + schCses*white + schMses + public  + (1 + homew + schCses| school), data=nels, REML=FALSE, control = lmerControl(optimizer ="Nelder_Mead"))
## boundary (singular) fit: see ?isSingular

Random 1 + homew + sex

model1c <- lmer(math ~ homew + schCses + sex + white + schCses*white + schMses + public + (1 + homew + sex| school), data=nels, REML=FALSE, control = lmerControl(optimizer="Nelder_Mead"))
## boundary (singular) fit: see ?isSingular

Random 1 + homew + white

This converges but has larger BIC.new and AIC compared to our prelininary model.

summary( model1d <- lmer(math ~ homew + schCses + sex + white + schCses*white + schMses + public  + (1 + homew + white| school), data=nels, REML=FALSE, control = lmerControl(optimizer ="Nelder_Mead")))
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
##   method [lmerModLmerTest]
## Formula: math ~ homew + schCses + sex + white + schCses * white + schMses +  
##     public + (1 + homew + white | school)
##    Data: nels
## Control: lmerControl(optimizer = "Nelder_Mead")
## 
##      AIC      BIC   logLik deviance df.resid 
##   3623.5   3687.2  -1796.7   3593.5      504 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.19774 -0.68320 -0.01025  0.65298  2.98147 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr       
##  school   (Intercept) 64.48    8.030               
##           homew       13.11    3.621    -0.86      
##           white1      15.93    3.991    -0.67  0.32
##  Residual             50.01    7.072               
## Number of obs: 519, groups:  school, 23
## 
## Fixed effects:
##                Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)     46.6976     2.3015  30.5500  20.290  < 2e-16 ***
## homew            1.8717     0.8090  20.6538   2.314  0.03108 *  
## schCses          1.1303     0.9638 460.1114   1.173  0.24151    
## sex2            -0.4289     0.6621 490.8794  -0.648  0.51748    
## white1           2.0143     1.3736  11.8180   1.466  0.16862    
## schMses          5.5404     1.6584  21.9468   3.341  0.00297 ** 
## public1         -0.8064     1.9109  22.6738  -0.422  0.67701    
## schCses:white1   1.5025     1.1611 480.4592   1.294  0.19630    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) homew  schCss sex2   white1 schMss publc1
## homew       -0.659                                          
## schCses      0.074 -0.014                                   
## sex2        -0.094 -0.040  0.107                            
## white1      -0.562  0.171 -0.175 -0.022                     
## schMses     -0.286  0.002  0.037 -0.054 -0.110              
## public1     -0.485  0.013  0.041 -0.044 -0.070  0.720       
## schCss:wht1 -0.030 -0.006 -0.829 -0.088  0.125 -0.040 -0.064

Test Random Effects (i.e., \(\tau\)s)

\(H_o: \tau_1^2 = \tau_{10}=0\)

The full model is model 1 (the preliminary model) and we need to fit a null model for this test (i.e., remove ``homew’’ from random part of model)

summary(model2 <- lmer(math ~ homew + schCses + sex + white + schCses*white + schMses + public + (1 | school), data=nels, REML=FALSE,control = lmerControl(optimizer ="Nelder_Mead")))
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
##   method [lmerModLmerTest]
## Formula: math ~ homew + schCses + sex + white + schCses * white + schMses +  
##     public + (1 | school)
##    Data: nels
## Control: lmerControl(optimizer = "Nelder_Mead")
## 
##      AIC      BIC   logLik deviance df.resid 
##   3693.6   3736.1  -1836.8   3673.6      509 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.6152 -0.7115  0.0079  0.6988  3.2779 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  school   (Intercept)  6.921   2.631   
##  Residual             65.946   8.121   
## Number of obs: 519, groups:  school, 23
## 
## Fixed effects:
##                Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)     45.3648     1.6805  41.3323  26.994  < 2e-16 ***
## homew            2.1069     0.2676 518.2015   7.874 2.03e-14 ***
## schCses          1.0502     1.0595 516.0901   0.991  0.32208    
## sex2            -0.7979     0.7352 512.2746  -1.085  0.27835    
## white1           3.0228     1.0728 301.2941   2.818  0.00516 ** 
## schMses          5.5667     1.7580  22.5352   3.167  0.00438 ** 
## public1          0.1596     2.0678  23.8516   0.077  0.93913    
## schCses:white1   2.8420     1.2946 516.4455   2.195  0.02859 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) homew  schCss sex2   white1 schMss publc1
## homew       -0.301                                          
## schCses      0.109 -0.037                                   
## sex2        -0.158 -0.068  0.097                            
## white1      -0.397 -0.038 -0.292 -0.029                     
## schMses     -0.372 -0.051  0.076 -0.042 -0.183              
## public1     -0.692  0.044  0.070 -0.048 -0.104  0.722       
## schCss:wht1 -0.050 -0.029 -0.829 -0.077  0.194 -0.061 -0.074

The test statistic equals the difference between -2lnlike (i.e., deviances)

(lr <- deviance(model2) - deviance(model1)  )
## [1] 75.33025
pvalue <- .5*(pchisq(lr,1,lower.tail=FALSE)) + .5*(pchisq(lr,2,lower.tail=FALSE))
round(pvalue, digits=2)
## [1] 0

Test fixed effects

Mean SES vs sector

Since sector not significant in model 1, we’ll try remove schMses (the predcitor of random intercept )

summary(model3 <- lmer(math ~ homew + schCses + sex + white + schCses*white  + public + (1 + homew | school),        data=nels, REML=FALSE,
control =lmerControl(optimizer="Nelder_Mead")))
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
##   method [lmerModLmerTest]
## Formula: math ~ homew + schCses + sex + white + schCses * white + public +  
##     (1 + homew | school)
##    Data: nels
## Control: lmerControl(optimizer = "Nelder_Mead")
## 
##      AIC      BIC   logLik deviance df.resid 
##   3627.3   3674.1  -1802.7   3605.3      508 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.27212 -0.68624 -0.00829  0.65111  2.92832 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  school   (Intercept) 48.00    6.928         
##           homew       13.65    3.694    -0.87
##  Residual             50.98    7.140         
## Number of obs: 519, groups:  school, 23
## 
## Fixed effects:
##                Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)     47.6883     2.1126  46.3192  22.574  < 2e-16 ***
## homew            1.8258     0.8253  20.7145   2.212  0.03833 *  
## schCses          1.1531     0.9506 491.3680   1.213  0.22568    
## sex2            -0.4083     0.6648 489.8831  -0.614  0.53941    
## white1           2.6606     0.9972 404.4834   2.668  0.00793 ** 
## public1         -4.1864     1.7306  24.2811  -2.419  0.02341 *  
## schCses:white1   1.4822     1.1632 491.6358   1.274  0.20317    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) homew  schCss sex2   white1 publc1
## homew       -0.627                                   
## schCses      0.102 -0.015                            
## sex2        -0.124 -0.039  0.120                     
## white1      -0.352 -0.021 -0.291 -0.035              
## public1     -0.565  0.012  0.024 -0.004  0.030       
## schCss:wht1 -0.046 -0.005 -0.826 -0.097  0.194 -0.046
# for latex in lecture notes: texreg(list(model3), single.row=TRUE,custom.model.names("No Mean SES"), caption=("Why Secter was ns?"))
anova(model3, model1)
## Data: nels
## Models:
## model3: math ~ homew + schCses + sex + white + schCses * white + public + 
## model3:     (1 + homew | school)
## model1: math ~ homew + schCses + sex + white + schCses * white + schMses + 
## model1:     public + (1 + homew | school)
##        npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)   
## model3   11 3627.3 3674.1 -1802.7   3605.3                        
## model1   12 3622.3 3673.3 -1799.1   3598.3 7.0303  1   0.008014 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
bic.hlm(model3, nels$school)
##        n groups total obs haroninc.mean.nj n random n fixed deviance  bic.new
## school       23       519         17.73371        4       7 3605.301 3661.606
##        bic.harm bic.ngrps bic.ntot      aic
## school  3659.92  3639.792 3674.072 3627.301
bic.hlm(model1, nels$school)
##        n groups total obs haroninc.mean.nj n random n fixed deviance  bic.new
## school       23       519         17.73371        4       8 3598.271 3660.828
##        bic.harm bic.ngrps bic.ntot      aic
## school   3658.9  3635.897 3673.294 3622.271

For LR test for gender & public

summary(model1.reduced <- lmer(math ~ homew + schCses + white + schCses*white + schMses + (1 + homew | school), data=nels, REML=FALSE,
control = lmerControl(optimizer="Nelder_Mead")))
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
##   method [lmerModLmerTest]
## Formula: math ~ homew + schCses + white + schCses * white + schMses +  
##     (1 + homew | school)
##    Data: nels
## Control: lmerControl(optimizer = "Nelder_Mead")
## 
##      AIC      BIC   logLik deviance df.resid 
##   3618.8   3661.3  -1799.4   3598.8      509 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.23594 -0.68934 -0.00211  0.65555  2.95528 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  school   (Intercept) 46.07    6.787         
##           homew       13.42    3.663    -0.91
##  Residual             51.08    7.147         
## Number of obs: 519, groups:  school, 23
## 
## Fixed effects:
##                Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)     45.4350     1.7078  33.2900  26.604  < 2e-16 ***
## homew            1.8293     0.8177  20.6623   2.237 0.036434 *  
## schCses          1.3219     0.9411 492.7418   1.405 0.160754    
## white1           2.4396     0.9800 359.5614   2.489 0.013247 *  
## schMses          5.1909     1.2715  23.0642   4.083 0.000456 ***
## schCses:white1   1.3265     1.1529 494.5634   1.151 0.250450    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) homew  schCss white1 schMss
## homew       -0.795                            
## schCses      0.162 -0.011                     
## white1      -0.425 -0.021 -0.286              
## schMses      0.149 -0.010  0.030 -0.135       
## schCss:wht1 -0.103 -0.009 -0.823  0.191 -0.005
anova(model1.reduced, model1)
## Data: nels
## Models:
## model1.reduced: math ~ homew + schCses + white + schCses * white + schMses + 
## model1.reduced:     (1 + homew | school)
## model1: math ~ homew + schCses + sex + white + schCses * white + schMses + 
## model1:     public + (1 + homew | school)
##                npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)
## model1.reduced   10 3618.8 3661.3 -1799.4   3598.8                     
## model1           12 3622.3 3673.3 -1799.1   3598.3 0.5015  2     0.7782
bic.hlm(model1.reduced, nels$school)
##        n groups total obs haroninc.mean.nj n random n fixed deviance  bic.new
## school       23       519         17.73371        4       6 3598.772 3648.826
##        bic.harm bic.ngrps bic.ntot      aic
## school  3647.38  3630.127 3661.291 3618.772
# for lecture notes: texreg(list(model1.reduced),single.row=TRUE, custom.model.names=c("Simpler Model"))
deviance(model1.reduced)
## [1] 3598.772

Drop sector

# page 156 
summary(model4 <- lmer(math ~ homew + schCses + sex + white + schCses*white  + schMses + (1 + homew | school),  data=nels, REML=FALSE,control = lmerControl(optimizer ="Nelder_Mead")))
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
##   method [lmerModLmerTest]
## Formula: math ~ homew + schCses + sex + white + schCses * white + schMses +  
##     (1 + homew | school)
##    Data: nels
## Control: lmerControl(optimizer = "Nelder_Mead")
## 
##      AIC      BIC   logLik deviance df.resid 
##   3620.3   3667.0  -1799.1   3598.3      508 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.27490 -0.68323 -0.01015  0.66496  2.94047 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  school   (Intercept) 45.69    6.759         
##           homew       13.29    3.646    -0.91
##  Residual             51.05    7.145         
## Number of obs: 519, groups:  school, 23
## 
## Fixed effects:
##                Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)     45.6272     1.7245  35.0768  26.458  < 2e-16 ***
## homew            1.8530     0.8149  20.6974   2.274 0.033759 *  
## schCses          1.2416     0.9475 493.6246   1.310 0.190668    
## sex2            -0.4696     0.6638 492.0838  -0.707 0.479633    
## white1           2.4635     0.9800 358.8678   2.514 0.012386 *  
## schMses          5.2127     1.2699  23.0562   4.105 0.000432 ***
## schCses:white1   1.4078     1.1580 495.2388   1.216 0.224638    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) homew  schCss sex2   white1 schMss
## homew       -0.777                                   
## schCses      0.141 -0.016                            
## sex2        -0.158 -0.040  0.119                     
## white1      -0.416 -0.019 -0.288 -0.032              
## schMses      0.151 -0.009  0.027 -0.023 -0.134       
## schCss:wht1 -0.087 -0.005 -0.825 -0.097  0.193 -0.003

Drop interaction

Betwoen gender and white, which has not been significant

summary(model5 <- lmer(math ~ homew + schCses + white + schMses + (1 + homew | school), data=nels, REML=FALSE,control = lmerControl(optimizer ="Nelder_Mead")))
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
##   method [lmerModLmerTest]
## Formula: math ~ homew + schCses + white + schMses + (1 + homew | school)
##    Data: nels
## Control: lmerControl(optimizer = "Nelder_Mead")
## 
##      AIC      BIC   logLik deviance df.resid 
##   3618.1   3656.4  -1800.0   3600.1      510 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.24935 -0.69462 -0.01681  0.64798  2.98486 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  school   (Intercept) 46.61    6.827         
##           homew       13.80    3.715    -0.91
##  Residual             51.12    7.150         
## Number of obs: 519, groups:  school, 23
## 
## Fixed effects:
##             Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)  45.6453     1.7064  32.5449  26.750  < 2e-16 ***
## homew         1.8324     0.8280  20.7671   2.213 0.038228 *  
## schCses       2.2098     0.5343 487.7681   4.136 4.16e-05 ***
## white1        2.2160     0.9637 364.8911   2.300 0.022039 *  
## schMses       5.1797     1.2815  23.1491   4.042 0.000502 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##         (Intr) homew  schCss white1
## homew   -0.800                     
## schCses  0.137 -0.032              
## white1  -0.414 -0.019 -0.233       
## schMses  0.149 -0.009  0.045 -0.136

And some checks

anova(model5, model1.reduced)
## Data: nels
## Models:
## model5: math ~ homew + schCses + white + schMses + (1 + homew | school)
## model1.reduced: math ~ homew + schCses + white + schCses * white + schMses + 
## model1.reduced:     (1 + homew | school)
##                npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)
## model5            9 3618.1 3656.4 -1800.0   3600.1                     
## model1.reduced   10 3618.8 3661.3 -1799.4   3598.8 1.3105  1     0.2523
# For laTex: texreg(list(model5),single.row=TRUE,custom.model.names=c("Final Model"))
bic.hlm(model5,nels$school)
##        n groups total obs haroninc.mean.nj n random n fixed deviance  bic.new
## school       23       519         17.73371        4       5 3600.083 3643.884
##        bic.harm bic.ngrps bic.ntot      aic
## school  3642.68  3628.302  3656.35 3618.083

summary of model fit to data

screenreg(
  list(model0, model1, model2, model3, model4, model5))
## 
## =================================================================================================================
##                                Model 1       Model 2       Model 3       Model 4       Model 5       Model 6     
## -----------------------------------------------------------------------------------------------------------------
## (Intercept)                       50.76 ***     45.57 ***     45.36 ***     47.69 ***     45.63 ***     45.65 ***
##                                   (1.13)        (2.11)        (1.68)        (2.11)        (1.72)        (1.71)   
## homew                                            1.85 *        2.11 ***      1.83 *        1.85 *        1.83 *  
##                                                 (0.82)        (0.27)        (0.83)        (0.81)        (0.83)   
## schCses                                          1.24          1.05          1.15          1.24          2.21 ***
##                                                 (0.95)        (1.06)        (0.95)        (0.95)        (0.53)   
## sex2                                            -0.47         -0.80         -0.41         -0.47                  
##                                                 (0.66)        (0.74)        (0.66)        (0.66)                 
## white1                                           2.46 *        3.02 **       2.66 **       2.46 *        2.22 *  
##                                                 (0.98)        (1.07)        (1.00)        (0.98)        (0.96)   
## schMses                                          5.27 **       5.57 **                     5.21 ***      5.18 ***
##                                                 (1.81)        (1.76)                      (1.27)        (1.28)   
## public1                                          0.09          0.16         -4.19 *                              
##                                                 (2.11)        (2.07)        (1.73)                               
## schCses:white1                                   1.40          2.84 *        1.48          1.41                  
##                                                 (1.16)        (1.29)        (1.16)        (1.16)                 
## -----------------------------------------------------------------------------------------------------------------
## AIC                             3806.78       3622.27       3693.60       3627.30       3620.27       3618.08    
## BIC                             3819.53       3673.29       3736.12       3674.07       3667.04       3656.35    
## Log Likelihood                 -1900.39      -1799.14      -1836.80      -1802.65      -1799.14      -1800.04    
## Num. obs.                        519           519           519           519           519           519       
## Num. groups: school               23            23            23            23            23            23       
## Var: school (Intercept)           24.85         45.69          6.92         48.00         45.69         46.61    
## Var: Residual                     81.24         51.05         65.95         50.98         51.05         51.12    
## Var: school homew                               13.30                       13.65         13.29         13.80    
## Cov: school (Intercept) homew                  -22.40                      -22.16        -22.39        -23.04    
## =================================================================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05

Diagnositics for Model 5

Since model 5 at this points looks like it might be our final model, we’ll do some diagnostic checks.

Graph EBLUPS

Well do this in steps

Step 1

We need to extract them from model object

ranU <- as.data.frame(ranef(model5))

Step 2

Put them in separate objects

U0j <- ranU[which(ranU$term=="(Intercept)"), ]
U1j <- ranU[which(ranU$term=="homew"), ]

Step 3

Sort rows by ranU, which is “condval”

U0j <-U0j[order(U0j$condval), ]
U1j <-U1j[order(U1j$condval), ]

step 4

Add integers for schools so that I can plot in order

U0j$xgrp <- seq(1:23)
U1j$xgrp <- seq(1:23)

Step 5

Draw graph for U0j

plot(U0j$xgrp, U0j$condval,type="p",
     main="Final Model: Random intercepts +/- 1 std error",
     xlab="Schools (sort by Uo)",
     ylab="EBLUP of U0j",
 col="blue"
     )

# Add in the lines for sd errors by
# drawing arrows but with no arrowhead.
# This is a hack solution but it works
arrows(U0j$xgrp, U0j$condval-U0j$condsd,  
       U0j$xgrp, U0j$condval+U0j$condsd, 
         length=0.00)

### Now for U1j: Repeat above but for the random slopes

plot(U1j$xgrp, U1j$condval, type="p",
   main="Final Model: Random slopes +/- 1 std error",
     xlab="Schools (sort by U1j)",
     ylab="EBLUP of U1j",
     col="blue")

# For sd errors, draw arrows but with no arrowhead
arrows(U1j$xgrp, U1j$condval-U1j$condsd,  
       U1j$xgrp, U1j$condval+U1j$condsd, 
         length=0.00)

QQplot for U’s

I will make custom qqplots that include +/- 1 sd error. ### Uoj

qqdata <- qqnorm(U0j$condval)

plot(qqdata$x, qqdata$y, type="p", pch=19, col="red",
main="Normal QQ plot of U0j with 95% Confidence Bands",
     xlab="Theoretical Value",
     ylab="Estimated U0j (EBLUPS)",
     ylim=c(-15,15))
arrows(qqdata$x, qqdata$y - 1.96*U0j$condsd, 
       qqdata$x, qqdata$y + 1.96*U0j$condsd, 
         length=0.00, col="gray")
qqline(U0j$condval,col="steelblue")

### For U1j{

qqdata <- qqnorm(U1j$condval)

plot(qqdata$x,qqdata$y,type="p",pch=19,col="red",
main="Normal QQ plot of U1j with 95% Confidence Bands",
     xlab="Theoretical Value",
     ylab="Estimated Uuj (EBLUPS)",
     ylim=c(-8,8)
     )
arrows(qqdata$x, qqdata$y - 1.96*U1j$condsd, 
       qqdata$x, qqdata$y + 1.96*U1j$condsd, 
         length=0.00, col="gray")
qqline(U1j$condval,col="steelblue")

## Residual Plots

This gives standardized level 1 residuals. These are stadnardized and

res5 <- hlm_resid(model5, standardize=TRUE, include.ls=FALSE)
head(res5)
## # A tibble: 6 x 11
##      id  math homew schCses white schMses school .std.~1 .fitted .chol~2 .mar.~3
##   <dbl> <int> <int>   <dbl> <fct>   <dbl> <fct>    <dbl>   <dbl>   <dbl>   <dbl>
## 1     1    50     1   0.150 1       0.700 6053    -0.595    54.3  -0.451    53.7
## 2     2    43     1  -0.270 1       0.700 6053    -1.44     53.3  -1.13     52.7
## 3     3    50     3  -1.29  0       0.700 6053    -0.440    53.1  -0.286    51.9
## 4     4    49     1   0.320 1       0.700 6053    -0.788    54.6  -0.349    54.0
## 5     5    62     1   0.140 1       0.700 6053     1.09     54.2   1.45     53.6
## 6     6    43     1   0.620 1       0.700 6053    -1.72     55.3  -1.37     54.7
## # ... with abbreviated variable names 1: .std.resid, 2: .chol.mar.resid,
## #   3: .mar.fitted
par(mfrow=c(2,2))
fit <- fitted(model5)
plot(fit, res5$.std.resid, 
     xlab='Conditional Fitted Values',
     ylab='Std Residuals',
     main='Conditional Level1 Residuals')
lines(c(min(fit),max(fit)),c(0,0), col="red", lwd=2)


# qqplot
qqnorm(res5$.std.resid)                    # draws plot
qqline(res5$.std.resid, col='red', lwd=2)  # reference line


# Histogram of standardized residuals with normal overlay
h<- hist(res5$.std.resid, breaks=15, density=20, col="seagreen")   # draw historgram
xfit <- seq(-3, 3, length=50)       # sets range & number of quantiles
yfit <- dnorm(xfit, mean=0, sd=1)   # should be normal(0,1)
yfit <- yfit*diff(h$mids[1:2])*length(res5$.std.resid) # use mid-points 
lines(xfit, yfit, col='blue', lwd=2)                   # draws normal


# Some things to describe the model
plot.new( ) 
text(.5,1.0,'Model 5')
text(.5,0.85,'Devience=3600.1')
text(.5,0.6,'AIC=3618.1')
text(.5,0.4,'BIC=3656.4')

Level 2 or marginal

These are estimated random effects, which we got above using ranef().

res5.margin <- resid_marginal(model5, type="pearson")
head(res5.margin)
##          1          2          3          4          5          6 
## -0.4511534 -1.2016262 -0.2094894 -0.6211822  1.0347516 -1.4447061