We will start with exploratory data analysis, followed by modeling the data and assessing model fit and assumptions. This example also includes location-scale models.
Many of the figures appear in the lecture notes on model building. The page numbers given below refer to the lecture notes (or at least a close approximation of their location).
library(lme4)
library(lmerTest)
library(lattice)
library(ggplot2)
library(stringi)
library(HLMdiag)
## Warning: package 'HLMdiag' was built under R version 4.0.5
library(texreg)
library(optimx)
We will use some of the functions described in the lecture on inference for the marginal model (e.g., robust, contrast, hlmRsq).
source("D:\\Dropbox\\edps587\\lectures\\6 inference1\\All_functions.txt")
#source("C:\\Users\\cja\\Dropbox\\edps587\\lectures\\6 inference1\\All_functions.txt")
nels<-read.table("D:\\Dropbox\\edps587\\lectures\\8 modelbuilding\\nels23_data.txt",header=TRUE)
#nels<-read.table("C:\\Users\\cja\\Dropbox\\edps587\\lectures\\8 modelbuilding\\nels23_data.txt",header=TRUE)
#look at first few rows of data set
head(nels)
## school student sex race homew schtype ses pared math classstr schsize
## 1 7472 3 2 4 1 1 -0.13 2 48 2 3
## 2 7472 8 1 4 0 1 -0.39 2 48 2 3
## 3 7472 13 1 4 0 1 -0.80 2 53 2 3
## 4 7472 17 1 4 1 1 -0.72 2 42 2 3
## 5 7472 27 2 4 2 1 -0.74 2 43 2 3
## 6 7472 28 2 4 1 1 -0.58 2 57 2 3
## urbana geo minority ratio
## 1 2 2 0 19
## 2 2 2 0 19
## 3 2 2 0 19
## 4 2 2 0 19
## 5 2 2 0 19
## 6 2 2 0 19
Having read in the data, we create some variables we'll need.
nels$public <- ifelse (nels$schtype==1, 1, 0)
nels$white <- ifelse (nels$race==4, 1, 0)
schMses <- as.data.frame(aggregate(ses~school, data=nels, FUN="mean"))
names(schMses) <- c('school', 'schMses')
nels <- merge(nels,schMses, by=c('school'))
nels$schCses <- nels$ses - nels$schMses
nels$homewrk <- as.numeric(nels$homew)
nels$school <- as.factor(nels$school)
head(nels)
## school student sex race homew schtype ses pared math classstr schsize
## 1 6053 1 2 4 1 4 0.85 4 50 3 3
## 2 6053 2 2 4 1 4 0.43 3 43 3 3
## 3 6053 4 2 1 3 4 -0.59 3 50 3 3
## 4 6053 11 2 4 1 4 1.02 5 49 3 3
## 5 6053 12 1 4 1 4 0.84 5 62 3 3
## 6 6053 13 2 4 1 4 1.32 6 43 3 3
## urbana geo minority ratio public white schMses schCses homewrk
## 1 1 2 3 18 0 1 0.6997727 0.1502273 1
## 2 1 2 3 18 0 1 0.6997727 -0.2697727 1
## 3 1 2 3 18 0 0 0.6997727 -1.2897727 3
## 4 1 2 3 18 0 1 0.6997727 0.3202273 1
## 5 1 2 3 18 0 1 0.6997727 0.1402273 1
## 6 1 2 3 18 0 1 0.6997727 0.6202273 1
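As an aside, the same group-mean centering can be done without the aggregate/merge steps above; a minimal sketch using base R's ave() (schMses2 and schCses2 are hypothetical names so the merged versions are not overwritten):
# ave() returns the school mean of ses aligned to each row, so no merge is needed
nels$schMses2 <- ave(nels$ses, nels$school, FUN=mean)
nels$schCses2 <- nels$ses - nels$schMses2
# sanity check: should agree with the merged version
all.equal(nels$schMses, nels$schMses2)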
That should be enough for now.
math.homewrk <- lm(math~homewrk, data=nels)
conflm1<-confint(math.homewrk)
plot(nels$homewrk,nels$math,type='p', main='Overall regression with 95% CI')
abline(math.homewrk)
abline(coef=conflm1[,1],lty=2)
abline(coef=conflm1[,2],lty=2)
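Shifting the whole line by the coefficient bounds is only a rough band; a pointwise confidence band from predict() may be preferable. A sketch that adds such a band to the same plot:
# Pointwise 95% confidence band for the fitted regression line
xs <- seq(min(nels$homewrk), max(nels$homewrk), length.out=100)
band <- predict(math.homewrk, newdata=data.frame(homewrk=xs), interval="confidence")
lines(xs, band[,"lwr"], lty=3, col="gray40")
lines(xs, band[,"upr"], lty=3, col="gray40")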
Fit a loess curve to the data and add it to the plot:
lw1 <- loess(math ~ homewrk,data=nels)
# --- plot data
plot(nels$homewrk,nels$math,
main="NELS:Math Score Plotted vs Homework \n with smooth (loess) curve",
xlab="Time Spent Doing homework",
ylab="Math Scores",
cex=1.0,
col="black"
)
# --- put in proper order
j <- order(nels$homewrk)
# --- Add loess to figure
lines(nels$homewrk[j],lw1$fitted[j],col="blue",lwd=3)
This graph is around page 25. We compute the mean of math at each level of time spent doing homework for each school. The means are plotted by simply connecting the points (one curve per school).
math.hmwk <- aggregate(math ~ homewrk + school, data=nels, FUN="mean")
# note: ggplot() ignores main= and aes() has no type aesthetic; use ggtitle() for the title
ggplot(math.hmwk, aes(x=homewrk, y=math, col=school)) + geom_line() +
  ggtitle('Regressions all in one plot') + theme(legend.position="none")
An alternative method:
plot(nels$homewrk,nels$math,
main="NELS:Math Score Plotted vs Homework \n joining means for each school",
xlab="Time Spent Doing homework",
ylab="School Mean Math Scores",
cex=1.0,
col="black",
type="n"
)
# --- put in proper order
j <- order(math.hmwk$homewrk)
# --- Add to figure
for (i in unique(math.hmwk$school)) {
sub <- math.hmwk[which(math.hmwk$school==i),]
lines(sub$homewrk,sub$math,col=i)
}
xyplot(math ~ homew | school, data=nels, col.line='black',
type=c('p'),
main='Variability in Math ~ homework relationship')
Lattice plot with regressions overlaid
xyplot(math ~ homew | school, data=nels, col.line='black',
type=c('p','r'),
main='Variability in Math ~ homework relationship')
Lattice plot with smooth regressions (i.e., loess)
xyplot(math ~ homew | school, data=nels, col.line='black',
type=c('p','smooth'),
main='Variability in Math ~ homework relationship')
## page 26: All regressions in one figure
ggplot(nels, aes(x=homewrk, y=math, col=school)) +
  geom_smooth(method="lm", se=FALSE) + ggtitle('Regressions all in one plot') +
  theme(legend.position="none")
## `geom_smooth()` using formula = 'y ~ x'
Set up graphic frame:
plot(nels$homew,nels$math,
type="n",
col="blue",
lwd=2,
main="NELS: Linear Regression by School \n(for where there is data)",
xlab="Time Spent Doing Homework",
ylab="Math Scores"
)
# --- loop through the schools and add line
for (j in unique(nels$school)) {
sub <- nels[ which(nels$school==j),]
fitted <- fitted(lm(math~homew,sub))
lines(sub$homew,fitted,col=j)
}
We need this for the next set of graphs: compute mean math by homework, separately for white and non-white students, and plot.
w <- as.factor(nels$white)
math.white <- aggregate(math ~ homewrk + w, data=nels, FUN="mean")
ggplot(math.white, aes(x=homewrk, y=math, col=w)) + geom_line() +
  ggtitle('Regressions all in one plot')
An alternative that looks nicer:
plot(math.white$homewrk[1:7],math.white$math[1:7],type="l",
main="Mean Math by Homework",
xlab="Time Spent Doing Homework",
ylab="Mean Math Score",
col="blue")
lines(math.white$homewrk[8:15],math.white$math[8:15],type="l",col="red")
legend("topleft",legend=c("Non-White", "White"),
col=c("blue","red"), lty=1, cex=1.2)
plot(math.white$homewrk,math.white$math,type="n",
main="Regression of Math on Homework",
xlab="Time Spent Doing Homework",
ylab="Mean Math Score",
ylim=c(40,70))
subw <- math.white[which(math.white$w==0),]
lm1 <- lm(math ~ homewrk,data=subw)
lines(subw$homewrk,fitted(lm1),col="red")
subw <- math.white[which(math.white$w==1),]
lm2 <- lm(math ~ homewrk,data=subw)
lines(subw$homewrk,fitted(lm2),col="blue")
legend("topleft",legend=c("Non-White", "White"),
col=c("red","blue"), lty=1, cex=1.2)
plot(math.white$homewrk,math.white$math,type="n",
main="Regression of Math on Homework",
xlab="Time Spent Doing Homework",
ylab="Mean Math Score",
ylim=c(40,70))
subw <- nels[which(nels$white==1),]
abline(lm(math~homewrk,data=subw),col="blue")
subn <- nels[which(nels$white==0),]
abline(lm(math~homewrk,data=subn),col="red")
legend("topleft",legend=c("Non-White", "White"),
col=c("red","blue"), lty=1, cex=1.2)
Need to get means by race
r <- as.factor(nels$race)
math.race <- aggregate(math ~ homewrk + r, data=nels, FUN="mean")
plot(math.race$homewrk[1:5],math.race$math[1:5],type="l",
main="Mean Math by Homework",
xlab="Time Spent Doing Homework",
ylab="Mean Math Score",
ylim=c(30,70),
xlim=c(0,7),
col="blue")
lines(math.race$homewrk[6:11],math.race$math[6:11],type="l", col="red")
lines(math.race$homewrk[12:18],math.race$math[12:18],type="l", col="darkgreen")
lines(math.race$homewrk[19:26],math.race$math[19:26],type="l", col="purple")
lines(math.race$homewrk[27:28],math.race$math[27:28],type="l", col="cyan")
legend("topleft",legend=c("Asian/PI", "Hispanic","Black","White","Native Amercian"),
col=c("blue","red","darkgreen","purple","cyan"), lty=1, cex=1.2)
Page 32: linear regression of math on homework, by race. Note that these regression lines extend beyond the range where there is data.
par(mfrow=c(1,1))
plot(math.race$homewrk[1:5],math.race$math[1:5],type="n",
main="Mean Math by Homework",
xlab="Time Spent Doing Homework",
ylab="Mean Math Score",
ylim=c(30,70),
xlim=c(0,7))
abline(lm(math.race$math[1:5]~math.race$homewrk[1:5]),col="blue")
abline(lm(math.race$math[6:11]~math.race$homewrk[6:11]),col="red")
abline(lm(math.race$math[12:18]~math.race$homewrk[12:18]),col="darkgreen")
abline(lm(math.race$math[19:26]~math.race$homewrk[19:26]),col="purple")
abline(lm(math.race$math[27:28]~math.race$homewrk[27:28]),col="cyan")
legend("topleft",legend=c("Asian/PI", "Hispanic","Black","White","Native American"),
col=c("blue","red","darkgreen","purple","cyan"), lty=1, cex=0.9)
The following does not extend beyond the range of the data. Set up the graphic frame and then add the fitted lines.
plot(nels$homew,nels$math,
type="n",
col="blue",
lwd=2,
main="NELS: Linear Regression by School \n(for where there is data)",
xlab="Time Spent Doing Homework",
ylab="Math Scores"
)
# --- loop through the races and add a line for each
for (j in unique(nels$race)) {
sub <- nels[ which(nels$race==j),]
fitted <- fitted(lm(math~homew,sub))
lines(sub$homew,fitted,col=j)
}
# legend colors use palette positions 1:5 to match col=j above
legend("topleft",legend=c("Asian/PI", "Hispanic","Black","White","Native American"),
col=1:5, lty=1, cex=0.9)
There’s not much data for some races and homework levels
with(nels, table(homew, race))
## race
## homew 1 2 3 4 5
## 0 0 5 3 34 0
## 1 8 22 30 162 3
## 2 7 4 20 79 1
## 3 2 5 7 33 0
## 4 1 2 4 40 0
## 5 2 1 1 34 0
## 6 0 0 1 5 0
## 7 0 0 0 3 0
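The sparsity is easier to see as proportions within each race group; a small sketch (each column sums to 1, and the Native American column is based on only 4 students):
# Column proportions of the homework-by-race cross-tabulation
round(prop.table(with(nels, table(homew, race)), margin=2), 2)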
nels$gender <- as.factor(nels$sex)
math.gender <- aggregate(math ~ homewrk + gender, data=nels, FUN="mean")
For many of these graphs I also have ggplot code; however, it doesn't always work as intended. In the example below, the text() calls are base-graphics functions and so cannot annotate a ggplot figure:
nels$gender <- as.factor(nels$sex)
math.gender <- aggregate(math ~ homewrk + gender, data=nels, FUN="mean")
ggplot(math.gender, aes(x=homewrk, y=math, col=gender, type='l'),
main='Regressions all in one plot') + geom_line()
text(0,77,'1=Male,',col='red')
text(1.5,77,'2=Female',col='lightseagreen')
But this base-graphics version works:
nels$gender <- as.factor(nels$sex)
math.gender <- aggregate(math ~ homewrk + gender, data=nels, FUN="mean")
plot(math.gender$homewrk,math.gender$math,type="n",
main="Regression of Math on Homework \n Seperate lines for genders",
xlab="Time Spent Doing Homework",
ylab="Mean Math Score")
lines(math.gender$homewrk[1:8],math.gender$math[1:8],col="blue")
lines(math.gender$homewrk[9:15],math.gender$math[9:15],col="red")
legend("topleft",legend=c("Female", "Male"),
col=c("blue","red"), lty=1, cex=1.2)
This is around page 37
with(nels,table(sex, homewrk))
## homewrk
## sex 0 1 2 3 4 5 6 7
## 1 27 110 46 23 24 14 2 3
## 2 15 115 65 24 23 24 4 0
These regression lines extend beyond the data:
plot(math.gender$homewrk,math.gender$math,type="n",
main="Regression of Mean Math on Homework \n Separate lines for genders",
xlab="Time Spent Doing Homework",
ylab="Mean Math Score",
ylim=c(45,70))
# fit the lines to the gender-specific means (rows 1:8 are males, 9:15 females)
abline(lm(math~homewrk, data=math.gender[1:8,]), col="blue")
abline(lm(math~homewrk, data=math.gender[9:15,]), col="red")
legend("topleft",legend=c("Male", "Female"),
col=c("blue","red"), lty=1, cex=1.2)
These do not extend beyond the range of the data and fit the lines to the uncollapsed data:
# --- set up graphic frame:
plot(nels$homew,nels$math,
type="n",
col="blue",
lwd=2,
main="NELS: Linear Regression by School by Gender \n(for where there is data)",
xlab="Time Spent Doing Homework",
ylab="Math Scores"
)
# --- loop through the genders and add a line for each
for (j in unique(nels$sex)) {
sub <- nels[ which(nels$sex==j),]
fitted <- fitted(lm(math~homew,sub))
lines(sub$homew,fitted,col=j)
}
xyplot(math ~ ses | school, data=nels, col.line='black',
type=c('p','r'),
main='Variability in Math ~ ses relationship')
xyplot(math ~ ses | school, data=nels, col.line='black',
type=c('p','smooth'),
main='Variability in Math ~ ses relationship')
# --- set up graphic frame:
par(ask=FALSE)
plot(nels$ses,nels$math,
type="n",
col="blue",
lwd=2,
main="NELS: Linear Regression by School \n(for where there is data)",
xlab="Socio-economic status (SES)",
ylab="Math Scores",
ylim=c(30,70),
xlim=c(-2.5,2.5)
)
# --- loop through the schools and add line
for (j in unique(nels$school)) {
sub <- nels[ which(nels$school==j),]
sub2 <- sub[order(sub$ses),]
fitted <- fitted(lm(math~ses,sub2))
lines(sub2$ses,fitted,col=j)
}
nels$ses.grp <- as.numeric(cut(nels$ses, 10))
nels$ses.grp <- as.factor(nels$ses.grp)
mean.ses.grp <- aggregate(ses~ses.grp, data=nels, FUN="mean")
mean.math.grp<- aggregate(math~ses.grp, data=nels, FUN="mean")
# Plot grouped data
plot(mean.ses.grp$ses,mean.math.grp$math, type='b',
main="Grouped SES: math mean vs ses mean",
xlab="Mean SES",
ylab="Mean Math",
col="blue" )
# Plot math vs SES
plot(nels$ses,nels$math,type='p')
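Note that cut(nels$ses, 10) makes ten equal-width intervals, so the extreme groups can be small. If roughly equal-sized groups are preferred, a quantile-based split is an alternative; a sketch (ses.dec is a hypothetical new variable):
# Decile (equal-count) SES groups as an alternative to equal-width bins
brks <- quantile(nels$ses, probs=seq(0, 1, by=0.1), na.rm=TRUE)
nels$ses.dec <- cut(nels$ses, breaks=unique(brks), include.lowest=TRUE)
table(nels$ses.dec)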
First, convert the categorical variables to factors.
nels$white <- as.factor(nels$white)
nels$sex <- as.factor(nels$sex)
nels$public <- as.factor(nels$public)
This is just a simple random intercept model.
summary(model0 <- lmer(math ~ (1 | school), data=nels, REML=FALSE,control = lmerControl(optimizer ="Nelder_Mead")))
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
## method [lmerModLmerTest]
## Formula: math ~ (1 | school)
## Data: nels
## Control: lmerControl(optimizer = "Nelder_Mead")
##
## AIC BIC logLik deviance df.resid
## 3806.8 3819.5 -1900.4 3800.8 516
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.67819 -0.72760 -0.01965 0.73306 2.67062
##
## Random effects:
## Groups Name Variance Std.Dev.
## school (Intercept) 24.85 4.985
## Residual 81.24 9.013
## Number of obs: 519, groups: school, 23
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 50.757 1.127 23.824 45.05 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(icc <- 24.85/(24.85+81.24))
## [1] 0.2342351
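Rather than retyping the printed variances, the components can be pulled from the fitted object; a sketch using lme4's VarCorr():
# as.data.frame(VarCorr(.)) has one row per variance component,
# with the estimates in the 'vcov' column
vc <- as.data.frame(VarCorr(model0))
vc$vcov[vc$grp == "school"] / sum(vc$vcov)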
bic.hlm(model0, nels$school)
## n groups total obs haroninc.mean.nj n random n fixed deviance bic.new
## school 23 519 17.73371 2 1 3800.776 3813.299
## bic.harm bic.ngrps bic.ntot aic
## school 3813.058 3810.183 3819.532 3806.776
The equations for this model can be found on pages 134–135 and the output on page 140.
summary( model1 <- lmer(math ~ homew + schCses + sex + white + schCses*white + schMses + public
+ (1 + homew | school), data=nels, REML=FALSE,
control = lmerControl(optimizer ="Nelder_Mead")))
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
## method [lmerModLmerTest]
## Formula: math ~ homew + schCses + sex + white + schCses * white + schMses +
## public + (1 + homew | school)
## Data: nels
## Control: lmerControl(optimizer = "Nelder_Mead")
##
## AIC BIC logLik deviance df.resid
## 3622.3 3673.3 -1799.1 3598.3 507
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.27541 -0.68326 -0.00958 0.66476 2.94001
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## school (Intercept) 45.69 6.760
## homew 13.30 3.646 -0.91
## Residual 51.05 7.145
## Number of obs: 519, groups: school, 23
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 45.57373 2.10712 46.31987 21.628 < 2e-16 ***
## homew 1.85341 0.81512 20.69597 2.274 0.03376 *
## schCses 1.24444 0.94965 492.11090 1.310 0.19067
## sex2 -0.47049 0.66411 491.36235 -0.708 0.47900
## white1 2.45950 0.98427 373.16369 2.499 0.01289 *
## schMses 5.26977 1.81333 25.02604 2.906 0.00755 **
## public1 0.09328 2.11261 25.52103 0.044 0.96513
## schCses:white1 1.40386 1.16134 493.26442 1.209 0.22731
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) homew schCss sex2 white1 schMss publc1
## homew -0.644
## schCses 0.076 -0.015
## sex2 -0.112 -0.040 0.117
## white1 -0.286 -0.020 -0.292 -0.029
## schMses -0.324 0.003 0.067 -0.038 -0.160
## public1 -0.575 0.014 0.067 -0.031 -0.093 0.714
## schCss:wht1 -0.027 -0.006 -0.826 -0.094 0.198 -0.057 -0.076
screenreg(model1, single.row=TRUE)
##
## ==================================================
## Model 1
## --------------------------------------------------
## (Intercept) 45.57 (2.11) ***
## homew 1.85 (0.82) *
## schCses 1.24 (0.95)
## sex2 -0.47 (0.66)
## white1 2.46 (0.98) *
## schMses 5.27 (1.81) **
## public1 0.09 (2.11)
## schCses:white1 1.40 (1.16)
## --------------------------------------------------
## AIC 3622.27
## BIC 3673.29
## Log Likelihood -1799.14
## Num. obs. 519
## Num. groups: school 23
## Var: school (Intercept) 45.69
## Var: school homew 13.30
## Cov: school (Intercept) homew -22.40
## Var: Residual 51.05
## ==================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
bic.hlm(model1, nels$school)
## n groups total obs haroninc.mean.nj n random n fixed deviance bic.new
## school 23 519 17.73371 4 8 3598.271 3660.828
## bic.harm bic.ngrps bic.ntot aic
## school 3658.9 3635.897 3673.294 3622.271
The next models are too complex and do not properly converge (the fits are singular). Each adds an additional random slope.
model1b <- lmer(math ~ homew + schCses + sex + white + schCses*white + schMses + public + (1 + homew + schCses| school), data=nels, REML=FALSE, control = lmerControl(optimizer ="Nelder_Mead"))
## boundary (singular) fit: see ?isSingular
model1c <- lmer(math ~ homew + schCses + sex + white + schCses*white + schMses + public + (1 + homew + sex| school), data=nels, REML=FALSE, control = lmerControl(optimizer="Nelder_Mead"))
## boundary (singular) fit: see ?isSingular
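As the message's pointer to ?isSingular suggests, a "boundary (singular) fit" means the estimated random-effects covariance matrix sits on the boundary of its parameter space (e.g., a variance near 0 or a correlation of ±1). A quick sketch of how to inspect this:
# isSingular() flags boundary fits; VarCorr() shows which
# variance or correlation collapsed
isSingular(model1b)
VarCorr(model1b)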
This one (a random slope for white) converges but has larger BIC.new and AIC compared to our preliminary model.
summary( model1d <- lmer(math ~ homew + schCses + sex + white + schCses*white + schMses + public + (1 + homew + white| school), data=nels, REML=FALSE, control = lmerControl(optimizer ="Nelder_Mead")))
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
## method [lmerModLmerTest]
## Formula: math ~ homew + schCses + sex + white + schCses * white + schMses +
## public + (1 + homew + white | school)
## Data: nels
## Control: lmerControl(optimizer = "Nelder_Mead")
##
## AIC BIC logLik deviance df.resid
## 3623.5 3687.2 -1796.7 3593.5 504
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.19774 -0.68320 -0.01025 0.65298 2.98147
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## school (Intercept) 64.48 8.030
## homew 13.11 3.621 -0.86
## white1 15.93 3.991 -0.67 0.32
## Residual 50.01 7.072
## Number of obs: 519, groups: school, 23
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 46.6976 2.3015 30.5500 20.290 < 2e-16 ***
## homew 1.8717 0.8090 20.6538 2.314 0.03108 *
## schCses 1.1303 0.9638 460.1114 1.173 0.24151
## sex2 -0.4289 0.6621 490.8794 -0.648 0.51748
## white1 2.0143 1.3736 11.8180 1.466 0.16862
## schMses 5.5404 1.6584 21.9468 3.341 0.00297 **
## public1 -0.8064 1.9109 22.6738 -0.422 0.67701
## schCses:white1 1.5025 1.1611 480.4592 1.294 0.19630
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) homew schCss sex2 white1 schMss publc1
## homew -0.659
## schCses 0.074 -0.014
## sex2 -0.094 -0.040 0.107
## white1 -0.562 0.171 -0.175 -0.022
## schMses -0.286 0.002 0.037 -0.054 -0.110
## public1 -0.485 0.013 0.041 -0.044 -0.070 0.720
## schCss:wht1 -0.030 -0.006 -0.829 -0.088 0.125 -0.040 -0.064
The full model is model 1 (the preliminary model), and we need to fit a null model for this test (i.e., remove “homew” from the random part of the model).
summary(model2 <- lmer(math ~ homew + schCses + sex + white + schCses*white + schMses + public + (1 | school), data=nels, REML=FALSE,control = lmerControl(optimizer ="Nelder_Mead")))
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
## method [lmerModLmerTest]
## Formula: math ~ homew + schCses + sex + white + schCses * white + schMses +
## public + (1 | school)
## Data: nels
## Control: lmerControl(optimizer = "Nelder_Mead")
##
## AIC BIC logLik deviance df.resid
## 3693.6 3736.1 -1836.8 3673.6 509
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.6152 -0.7115 0.0079 0.6988 3.2779
##
## Random effects:
## Groups Name Variance Std.Dev.
## school (Intercept) 6.921 2.631
## Residual 65.946 8.121
## Number of obs: 519, groups: school, 23
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 45.3648 1.6805 41.3323 26.994 < 2e-16 ***
## homew 2.1069 0.2676 518.2015 7.874 2.03e-14 ***
## schCses 1.0502 1.0595 516.0901 0.991 0.32208
## sex2 -0.7979 0.7352 512.2746 -1.085 0.27835
## white1 3.0228 1.0728 301.2941 2.818 0.00516 **
## schMses 5.5667 1.7580 22.5352 3.167 0.00438 **
## public1 0.1596 2.0678 23.8516 0.077 0.93913
## schCses:white1 2.8420 1.2946 516.4455 2.195 0.02859 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) homew schCss sex2 white1 schMss publc1
## homew -0.301
## schCses 0.109 -0.037
## sex2 -0.158 -0.068 0.097
## white1 -0.397 -0.038 -0.292 -0.029
## schMses -0.372 -0.051 0.076 -0.042 -0.183
## public1 -0.692 0.044 0.070 -0.048 -0.104 0.722
## schCss:wht1 -0.050 -0.029 -0.829 -0.077 0.194 -0.061 -0.074
The test statistic equals the difference between the models' −2 log-likelihoods (i.e., their deviances). Because the null hypothesis puts the slope variance on the boundary of the parameter space, the p-value below uses a 50:50 mixture of chi-square distributions with 1 and 2 degrees of freedom (the extra df comes from the intercept-slope covariance).
(lr <- deviance(model2) - deviance(model1) )
## [1] 75.33025
pvalue <- .5*(pchisq(lr,1,lower.tail=FALSE)) + .5*(pchisq(lr,2,lower.tail=FALSE))
round(pvalue, digits=2)
## [1] 0
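Since this mixture p-value comes up whenever a random slope is tested, it may be convenient to wrap it in a small helper (pmix is a hypothetical name):
# 50:50 chi-square mixture p-value for a boundary LRT; here the slope
# variance and the intercept-slope covariance give df of 1 and 2
pmix <- function(lr, df1, df2) {
  0.5 * pchisq(lr, df1, lower.tail=FALSE) +
    0.5 * pchisq(lr, df2, lower.tail=FALSE)
}
pmix(lr, 1, 2)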
Since sector (public) was not significant in model 1, we'll try removing schMses (the level-2 predictor of the random intercept) to see whether school mean SES was masking the sector effect.
summary(model3 <- lmer(math ~ homew + schCses + sex + white + schCses*white + public + (1 + homew | school), data=nels, REML=FALSE,
control =lmerControl(optimizer="Nelder_Mead")))
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
## method [lmerModLmerTest]
## Formula: math ~ homew + schCses + sex + white + schCses * white + public +
## (1 + homew | school)
## Data: nels
## Control: lmerControl(optimizer = "Nelder_Mead")
##
## AIC BIC logLik deviance df.resid
## 3627.3 3674.1 -1802.7 3605.3 508
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.27212 -0.68624 -0.00829 0.65111 2.92832
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## school (Intercept) 48.00 6.928
## homew 13.65 3.694 -0.87
## Residual 50.98 7.140
## Number of obs: 519, groups: school, 23
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 47.6883 2.1126 46.3192 22.574 < 2e-16 ***
## homew 1.8258 0.8253 20.7145 2.212 0.03833 *
## schCses 1.1531 0.9506 491.3680 1.213 0.22568
## sex2 -0.4083 0.6648 489.8831 -0.614 0.53941
## white1 2.6606 0.9972 404.4834 2.668 0.00793 **
## public1 -4.1864 1.7306 24.2811 -2.419 0.02341 *
## schCses:white1 1.4822 1.1632 491.6358 1.274 0.20317
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) homew schCss sex2 white1 publc1
## homew -0.627
## schCses 0.102 -0.015
## sex2 -0.124 -0.039 0.120
## white1 -0.352 -0.021 -0.291 -0.035
## public1 -0.565 0.012 0.024 -0.004 0.030
## schCss:wht1 -0.046 -0.005 -0.826 -0.097 0.194 -0.046
# for latex in lecture notes: texreg(list(model3), single.row=TRUE, custom.model.names=c("No Mean SES"), caption="Why sector was n.s.?")
anova(model3, model1)
## Data: nels
## Models:
## model3: math ~ homew + schCses + sex + white + schCses * white + public +
## model3: (1 + homew | school)
## model1: math ~ homew + schCses + sex + white + schCses * white + schMses +
## model1: public + (1 + homew | school)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## model3 11 3627.3 3674.1 -1802.7 3605.3
## model1 12 3622.3 3673.3 -1799.1 3598.3 7.0303 1 0.008014 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
bic.hlm(model3, nels$school)
## n groups total obs haroninc.mean.nj n random n fixed deviance bic.new
## school 23 519 17.73371 4 7 3605.301 3661.606
## bic.harm bic.ngrps bic.ntot aic
## school 3659.92 3639.792 3674.072 3627.301
bic.hlm(model1, nels$school)
## n groups total obs haroninc.mean.nj n random n fixed deviance bic.new
## school 23 519 17.73371 4 8 3598.271 3660.828
## bic.harm bic.ngrps bic.ntot aic
## school 3658.9 3635.897 3673.294 3622.271
summary(model1.reduced <- lmer(math ~ homew + schCses + white + schCses*white + schMses + (1 + homew | school), data=nels, REML=FALSE,
control = lmerControl(optimizer="Nelder_Mead")))
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
## method [lmerModLmerTest]
## Formula: math ~ homew + schCses + white + schCses * white + schMses +
## (1 + homew | school)
## Data: nels
## Control: lmerControl(optimizer = "Nelder_Mead")
##
## AIC BIC logLik deviance df.resid
## 3618.8 3661.3 -1799.4 3598.8 509
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.23594 -0.68934 -0.00211 0.65555 2.95528
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## school (Intercept) 46.07 6.787
## homew 13.42 3.663 -0.91
## Residual 51.08 7.147
## Number of obs: 519, groups: school, 23
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 45.4350 1.7078 33.2900 26.604 < 2e-16 ***
## homew 1.8293 0.8177 20.6623 2.237 0.036434 *
## schCses 1.3219 0.9411 492.7418 1.405 0.160754
## white1 2.4396 0.9800 359.5614 2.489 0.013247 *
## schMses 5.1909 1.2715 23.0642 4.083 0.000456 ***
## schCses:white1 1.3265 1.1529 494.5634 1.151 0.250450
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) homew schCss white1 schMss
## homew -0.795
## schCses 0.162 -0.011
## white1 -0.425 -0.021 -0.286
## schMses 0.149 -0.010 0.030 -0.135
## schCss:wht1 -0.103 -0.009 -0.823 0.191 -0.005
anova(model1.reduced, model1)
## Data: nels
## Models:
## model1.reduced: math ~ homew + schCses + white + schCses * white + schMses +
## model1.reduced: (1 + homew | school)
## model1: math ~ homew + schCses + sex + white + schCses * white + schMses +
## model1: public + (1 + homew | school)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## model1.reduced 10 3618.8 3661.3 -1799.4 3598.8
## model1 12 3622.3 3673.3 -1799.1 3598.3 0.5015 2 0.7782
bic.hlm(model1.reduced, nels$school)
## n groups total obs haroninc.mean.nj n random n fixed deviance bic.new
## school 23 519 17.73371 4 6 3598.772 3648.826
## bic.harm bic.ngrps bic.ntot aic
## school 3647.38 3630.127 3661.291 3618.772
# for lecture notes: texreg(list(model1.reduced),single.row=TRUE, custom.model.names=c("Simpler Model"))
deviance(model1.reduced)
## [1] 3598.772
# page 156
summary(model4 <- lmer(math ~ homew + schCses + sex + white + schCses*white + schMses + (1 + homew | school), data=nels, REML=FALSE,control = lmerControl(optimizer ="Nelder_Mead")))
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
## method [lmerModLmerTest]
## Formula: math ~ homew + schCses + sex + white + schCses * white + schMses +
## (1 + homew | school)
## Data: nels
## Control: lmerControl(optimizer = "Nelder_Mead")
##
## AIC BIC logLik deviance df.resid
## 3620.3 3667.0 -1799.1 3598.3 508
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.27490 -0.68323 -0.01015 0.66496 2.94047
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## school (Intercept) 45.69 6.759
## homew 13.29 3.646 -0.91
## Residual 51.05 7.145
## Number of obs: 519, groups: school, 23
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 45.6272 1.7245 35.0768 26.458 < 2e-16 ***
## homew 1.8530 0.8149 20.6974 2.274 0.033759 *
## schCses 1.2416 0.9475 493.6246 1.310 0.190668
## sex2 -0.4696 0.6638 492.0838 -0.707 0.479633
## white1 2.4635 0.9800 358.8678 2.514 0.012386 *
## schMses 5.2127 1.2699 23.0562 4.105 0.000432 ***
## schCses:white1 1.4078 1.1580 495.2388 1.216 0.224638
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) homew schCss sex2 white1 schMss
## homew -0.777
## schCses 0.141 -0.016
## sex2 -0.158 -0.040 0.119
## white1 -0.416 -0.019 -0.288 -0.032
## schMses 0.151 -0.009 0.027 -0.023 -0.134
## schCss:wht1 -0.087 -0.005 -0.825 -0.097 0.193 -0.003
Neither gender nor the schCses-by-white interaction has been significant, so we drop both:
summary(model5 <- lmer(math ~ homew + schCses + white + schMses + (1 + homew | school), data=nels, REML=FALSE,control = lmerControl(optimizer ="Nelder_Mead")))
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
## method [lmerModLmerTest]
## Formula: math ~ homew + schCses + white + schMses + (1 + homew | school)
## Data: nels
## Control: lmerControl(optimizer = "Nelder_Mead")
##
## AIC BIC logLik deviance df.resid
## 3618.1 3656.4 -1800.0 3600.1 510
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.24935 -0.69462 -0.01681 0.64798 2.98486
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## school (Intercept) 46.61 6.827
## homew 13.80 3.715 -0.91
## Residual 51.12 7.150
## Number of obs: 519, groups: school, 23
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 45.6453 1.7064 32.5449 26.750 < 2e-16 ***
## homew 1.8324 0.8280 20.7671 2.213 0.038228 *
## schCses 2.2098 0.5343 487.7681 4.136 4.16e-05 ***
## white1 2.2160 0.9637 364.8911 2.300 0.022039 *
## schMses 5.1797 1.2815 23.1491 4.042 0.000502 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) homew schCss white1
## homew -0.800
## schCses 0.137 -0.032
## white1 -0.414 -0.019 -0.233
## schMses 0.149 -0.009 0.045 -0.136
And some checks
anova(model5, model1.reduced)
## Data: nels
## Models:
## model5: math ~ homew + schCses + white + schMses + (1 + homew | school)
## model1.reduced: math ~ homew + schCses + white + schCses * white + schMses +
## model1.reduced: (1 + homew | school)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## model5 9 3618.1 3656.4 -1800.0 3600.1
## model1.reduced 10 3618.8 3661.3 -1799.4 3598.8 1.3105 1 0.2523
# For laTex: texreg(list(model5),single.row=TRUE,custom.model.names=c("Final Model"))
bic.hlm(model5,nels$school)
## n groups total obs haroninc.mean.nj n random n fixed deviance bic.new
## school 23 519 17.73371 4 5 3600.083 3643.884
## bic.harm bic.ngrps bic.ntot aic
## school 3642.68 3628.302 3656.35 3618.083
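If interval estimates for model 5 are wanted, lme4's profile-likelihood intervals are one option (this refits the model repeatedly, so it can take a little while); a sketch:
# Profile-likelihood 95% CIs for the variance components and fixed effects
# (oldNames=FALSE gives readable labels instead of .sig01, .sig02, ...)
confint(model5, oldNames=FALSE)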
screenreg(
list(model0, model1, model2, model3, model4, model5))
##
## =================================================================================================================
## Model 1 Model 2 Model 3 Model 4 Model 5 Model 6
## -----------------------------------------------------------------------------------------------------------------
## (Intercept) 50.76 *** 45.57 *** 45.36 *** 47.69 *** 45.63 *** 45.65 ***
## (1.13) (2.11) (1.68) (2.11) (1.72) (1.71)
## homew 1.85 * 2.11 *** 1.83 * 1.85 * 1.83 *
## (0.82) (0.27) (0.83) (0.81) (0.83)
## schCses 1.24 1.05 1.15 1.24 2.21 ***
## (0.95) (1.06) (0.95) (0.95) (0.53)
## sex2 -0.47 -0.80 -0.41 -0.47
## (0.66) (0.74) (0.66) (0.66)
## white1 2.46 * 3.02 ** 2.66 ** 2.46 * 2.22 *
## (0.98) (1.07) (1.00) (0.98) (0.96)
## schMses 5.27 ** 5.57 ** 5.21 *** 5.18 ***
## (1.81) (1.76) (1.27) (1.28)
## public1 0.09 0.16 -4.19 *
## (2.11) (2.07) (1.73)
## schCses:white1 1.40 2.84 * 1.48 1.41
## (1.16) (1.29) (1.16) (1.16)
## -----------------------------------------------------------------------------------------------------------------
## AIC 3806.78 3622.27 3693.60 3627.30 3620.27 3618.08
## BIC 3819.53 3673.29 3736.12 3674.07 3667.04 3656.35
## Log Likelihood -1900.39 -1799.14 -1836.80 -1802.65 -1799.14 -1800.04
## Num. obs. 519 519 519 519 519 519
## Num. groups: school 23 23 23 23 23 23
## Var: school (Intercept) 24.85 45.69 6.92 48.00 45.69 46.61
## Var: Residual 81.24 51.05 65.95 50.98 51.05 51.12
## Var: school homew 13.30 13.65 13.29 13.80
## Cov: school (Intercept) homew -22.40 -22.16 -22.39 -23.04
## =================================================================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
Since model 5 at this point looks like it might be our final model, we'll do some diagnostic checks. We'll do this in steps. First we need to extract the random effects from the model object.
ranU <- as.data.frame(ranef(model5))
Put them in separate objects
U0j <- ranU[which(ranU$term=="(Intercept)"), ]
U1j <- ranU[which(ranU$term=="homew"), ]
Sort the rows by the estimated values (the “condval” column).
U0j <-U0j[order(U0j$condval), ]
U1j <-U1j[order(U1j$condval), ]
Add integers for schools so that I can plot in order
U0j$xgrp <- seq(1:23)
U1j$xgrp <- seq(1:23)
Draw graph for U0j
plot(U0j$xgrp, U0j$condval,type="p",
main="Final Model: Random intercepts +/- 1 std error",
xlab="Schools (sort by Uo)",
ylab="EBLUP of U0j",
col="blue"
)
# Add in the lines for sd errors by
# drawing arrows but with no arrowhead.
# This is a hack solution but it works
arrows(U0j$xgrp, U0j$condval-U0j$condsd,
U0j$xgrp, U0j$condval+U0j$condsd,
length=0.00)
### Now for U1j: Repeat above but for the random slopes
plot(U1j$xgrp, U1j$condval, type="p",
main="Final Model: Random slopes +/- 1 std error",
xlab="Schools (sort by U1j)",
ylab="EBLUP of U1j",
col="blue")
# For sd errors, draw arrows but with no arrowhead
arrows(U1j$xgrp, U1j$condval-U1j$condsd,
U1j$xgrp, U1j$condval+U1j$condsd,
length=0.00)
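As a quick check on the hand-built plots above, lattice (already loaded) also provides a one-line "caterpillar plot" of the EBLUPs with intervals based on the conditional standard deviations:
# dotplot method for ranef objects; condVar=TRUE attaches the
# conditional standard deviations used for the intervals
dotplot(ranef(model5, condVar=TRUE))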
I will make custom qqplots that include 95% confidence bands (±1.96 conditional standard errors).
### U0j
qqdata <- qqnorm(U0j$condval)
plot(qqdata$x, qqdata$y, type="p", pch=19, col="red",
main="Normal QQ plot of U0j with 95% Confidence Bands",
xlab="Theoretical Value",
ylab="Estimated U0j (EBLUPS)",
ylim=c(-15,15))
arrows(qqdata$x, qqdata$y - 1.96*U0j$condsd,
qqdata$x, qqdata$y + 1.96*U0j$condsd,
length=0.00, col="gray")
qqline(U0j$condval,col="steelblue")
### For U1j
qqdata <- qqnorm(U1j$condval)
plot(qqdata$x,qqdata$y,type="p",pch=19,col="red",
main="Normal QQ plot of U1j with 95% Confidence Bands",
xlab="Theoretical Value",
ylab="Estimated Uuj (EBLUPS)",
ylim=c(-8,8)
)
arrows(qqdata$x, qqdata$y - 1.96*U1j$condsd,
qqdata$x, qqdata$y + 1.96*U1j$condsd,
length=0.00, col="gray")
qqline(U1j$condval,col="steelblue")
## Residual Plots
This gives standardized level-1 (conditional) residuals from HLMdiag's hlm_resid().
res5 <- hlm_resid(model5, standardize=TRUE, include.ls=FALSE)
head(res5)
## # A tibble: 6 x 11
## id math homew schCses white schMses school .std.~1 .fitted .chol~2 .mar.~3
## <dbl> <int> <int> <dbl> <fct> <dbl> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 1 50 1 0.150 1 0.700 6053 -0.595 54.3 -0.451 53.7
## 2 2 43 1 -0.270 1 0.700 6053 -1.44 53.3 -1.13 52.7
## 3 3 50 3 -1.29 0 0.700 6053 -0.440 53.1 -0.286 51.9
## 4 4 49 1 0.320 1 0.700 6053 -0.788 54.6 -0.349 54.0
## 5 5 62 1 0.140 1 0.700 6053 1.09 54.2 1.45 53.6
## 6 6 43 1 0.620 1 0.700 6053 -1.72 55.3 -1.37 54.7
## # ... with abbreviated variable names 1: .std.resid, 2: .chol.mar.resid,
## # 3: .mar.fitted
par(mfrow=c(2,2))
fit <- fitted(model5)
plot(fit, res5$.std.resid,
xlab='Conditional Fitted Values',
ylab='Std Residuals',
main='Conditional Level1 Residuals')
lines(c(min(fit),max(fit)),c(0,0), col="red", lwd=2)
# qqplot
qqnorm(res5$.std.resid) # draws plot
qqline(res5$.std.resid, col='red', lwd=2) # reference line
# Histogram of standardized residuals with normal overlay
h<- hist(res5$.std.resid, breaks=15, density=20, col="seagreen") # draw historgram
xfit <- seq(-3, 3, length=50) # sets range & number of quantiles
yfit <- dnorm(xfit, mean=0, sd=1) # should be normal(0,1)
yfit <- yfit*diff(h$mids[1:2])*length(res5$.std.resid) # use mid-points
lines(xfit, yfit, col='blue', lwd=2) # draws normal
# Some things to describe the model
plot.new( )
text(.5,1.0,'Model 5')
text(.5,0.85,'Deviance=3600.1')
text(.5,0.6,'AIC=3618.1')
text(.5,0.4,'BIC=3656.4')
Finally, marginal (Pearson) residuals; unlike the conditional residuals, these do not use the estimated random effects (which we extracted above with ranef()).
res5.margin <- resid_marginal(model5, type="pearson")
head(res5.margin)
## 1 2 3 4 5 6
## -0.4511534 -1.2016262 -0.2094894 -0.6211822 1.0347516 -1.4447061
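A quick use of these: plot the marginal residuals against the marginal fitted values (the .mar.fitted column from hlm_resid() above); under a well-specified fixed-effects structure there should be no systematic trend. A sketch:
# Marginal (Pearson) residuals vs marginal fitted values
par(mfrow=c(1,1))
plot(res5$.mar.fitted, res5.margin,
     xlab="Marginal Fitted Values",
     ylab="Pearson Marginal Residuals",
     main="Marginal Residuals for Model 5")
abline(h=0, col="red", lwd=2)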