This example uses NELS data, but with only 10 schools.
First, the libraries that we'll use
library(lme4)
## Loading required package: Matrix
library(lattice)
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
Read in the data and take a look
setwd("D:/Dropbox/edps587/lectures/2 anovas")
nels10 <- read.table(file="NELS10.txt", header=TRUE)
names(nels10)
## [1] "sch_id" "stud_id" "sex" "race" "homework" "schtyp"
## [7] "ses" "par_edu" "math" "clastruc" "size" "urban"
## [13] "geograph" "perminor" "ratio"
head(nels10)
## sch_id stud_id sex race homework schtyp ses par_edu math clastruc size
## 1 7472 3 2 4 1 1 -0.13 2 48 2 3
## 2 7472 8 1 4 0 1 -0.39 2 48 2 3
## 3 7472 13 1 4 0 1 -0.80 2 53 2 3
## 4 7472 17 1 4 1 1 -0.72 2 42 2 3
## 5 7472 27 2 4 2 1 -0.74 2 43 2 3
## 6 7472 28 2 4 1 1 -0.58 2 57 2 3
## urban geograph perminor ratio
## 1 2 2 0 19
## 2 2 2 0 19
## 3 2 2 0 19
## 4 2 2 0 19
## 5 2 2 0 19
## 6 2 2 0 19
tail(nels10)
## sch_id stud_id sex race homework schtyp ses par_edu math clastruc size
## 255 72292 85 2 4 0 1 -0.56 3 46 4 2
## 256 72292 89 1 4 2 1 -0.23 3 43 4 2
## 257 72292 92 1 4 0 1 -1.11 1 45 4 2
## 258 72292 97 1 4 1 1 -0.29 3 47 4 2
## 259 72292 98 1 4 1 1 -1.19 1 34 4 2
## 260 72292 99 2 4 1 1 0.45 4 53 4 2
## urban geograph perminor ratio
## 255 1 1 1 17
## 256 1 1 1 17
## 257 1 1 1 17
## 258 1 1 1 17
## 259 1 1 1 17
## 260 1 1 1 17
I tend to be a bit lazy, so I attach the data frame; then we can refer to a variable by name without also typing the data frame name.
attach(nels10)
We could use base R
summary(math)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 31.0 42.0 49.5 51.3 62.0 71.0
mean(math)
## [1] 51.3
var(math)
## [1] 124.0023
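If you prefer not to attach() the data (attach can lead to masking problems when other objects share the same names), the same summaries can be computed with with() or the $ operator, e.g.,
with(nels10, mean(math))
mean(nels10$math)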
# Summary statistics by school
mn <- aggregate(math~sch_id, data=nels10, mean)
sd <- aggregate(math~sch_id, data=nels10, sd)
med <- aggregate(math~sch_id, data=nels10, median)
simple <- cbind(mn, sd[,2], med[,2])
round(simple,digits=2)
## sch_id math sd[, 2] med[, 2]
## 1 7472 45.74 7.53 47.0
## 2 7829 42.15 8.32 41.0
## 3 7930 53.25 11.52 52.5
## 4 24725 43.55 10.01 41.0
## 5 25456 49.86 8.44 50.5
## 6 25642 46.40 4.32 46.5
## 7 62821 62.82 5.68 63.0
## 8 68448 49.67 10.34 46.0
## 9 68493 46.33 9.55 43.0
## 10 72292 47.85 11.30 44.5
Or the stargazer package
stargazer(nels10, type="text")
##
## ===================================================================
## Statistic N Mean St. Dev. Min Pctl(25) Pctl(75) Max
## -------------------------------------------------------------------
## sch_id 260 41,023.790 25,855.920 7,472 7,930 62,821 72,292
## stud_id 260 49.869 28.943 0 24.8 73.2 99
## sex 260 1.492 0.501 1 1 2 2
## race 260 3.577 0.780 1 3 4 4
## homework 260 2.023 1.550 0 1 3 7
## schtyp 260 1.773 1.315 1 1 4 4
## ses 260 -0.073 0.970 -2.410 -0.770 0.812 1.850
## par_edu 260 3.177 1.517 1 2 4 6
## math 260 51.300 11.136 31 42 62 71
## clastruc 260 3.562 0.761 2 3 4 5
## size 260 3.269 1.377 2 2 3.2 6
## urban 260 1.773 0.929 1 1 3 3
## geograph 260 2.331 0.614 1 2 3 3
## perminor 260 2.835 2.266 0 1 5 7
## ratio 260 14.535 3.948 10 10 18 22
## -------------------------------------------------------------------
and by school,
by(nels10, nels10$sch_id, stargazer, type = 'text')
We’ll plot school mean math scores with +/- one standard deviation.
# -- numbers 1 to 10 as an index variable
simple$index <- 1:10
plot(simple$index, simple$math, pch=19,
xlab="School",
ylab="Math",
xlim=c(min(simple$index),max(simple$index)),
ylim=c(30,70),
main="School Mean Math Score +/- one Standard Deviation"
)
lines(rbind(1:10,1:10,NA),rbind(simple$math-simple[,3],simple$math+simple[,3],NA))
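The rbind() trick with an NA row draws one vertical bar per school; an equivalent and perhaps more readable way to do this uses segments()
# vertical bars from (mean - sd) to (mean + sd) for each school
segments(simple$index, simple$math - simple[,3],
         simple$index, simple$math + simple[,3])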
This is just plain old linear regression with no predictors; the intercept estimate is simply the overall mean of math.
summary(linreg <- lm(math ~ 1, data=nels10))
##
## Call:
## lm(formula = math ~ 1, data = nels10)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.3 -9.3 -1.8 10.7 19.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 51.3000 0.6906 74.28 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.14 on 259 degrees of freedom
anova(linreg) # to get ANOVA table
## Analysis of Variance Table
##
## Response: math
## Df Sum Sq Mean Sq F value Pr(>F)
## Residuals 259 32117 124
plot(linreg, las=1) # diagnostic plots (residuals vs. fitted values, etc.)
## hat values (leverages) are all = 0.003846154
## and there are no factor predictors; no plot no. 5
Here the predictor is school (as a factor), so each school gets its own mean (a fixed-effects ANOVA)
id <- as.factor(sch_id)
summary( fixedANOVA <- lm(math ~ id, data=nels10,REML=TRUE) )
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'REML' will be disregarded
##
## Call:
## lm(formula = math ~ id, data = nels10, REML = TRUE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.2500 -5.7052 -0.6423 4.4909 24.6667
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 45.7391 1.7735 25.790 < 2e-16 ***
## id7829 -3.5891 2.6005 -1.380 0.16877
## id7930 7.5109 2.4819 3.026 0.00273 **
## id24725 -2.1937 2.5365 -0.865 0.38795
## id25456 4.1245 2.5365 1.626 0.10519
## id25642 0.6609 2.6005 0.254 0.79960
## id62821 17.0818 2.0555 8.310 6.13e-15 ***
## id68448 3.9275 2.5672 1.530 0.12730
## id68493 0.5942 2.5672 0.231 0.81715
## id72292 2.1109 2.6005 0.812 0.41773
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.506 on 250 degrees of freedom
## Multiple R-squared: 0.4369, Adjusted R-squared: 0.4166
## F-statistic: 21.55 on 9 and 250 DF, p-value: < 2.2e-16
anova(fixedANOVA)
## Analysis of Variance Table
##
## Response: math
## Df Sum Sq Mean Sq F value Pr(>F)
## id 9 14030 1558.95 21.549 < 2.2e-16 ***
## Residuals 250 18086 72.34
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Note the warning: lm() disregards the REML argument, so this is an ordinary least squares fit and gives the same results as a standard ANOVA.
As a check, refit with REML=FALSE and confirm that the results are identical to those above with REML=TRUE (they should be, since the argument is ignored either way).
id <- as.factor(sch_id)
summary( fixedANOVA2 <- lm(math ~ id, data=nels10, REML=FALSE) )
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'REML' will be disregarded
##
## Call:
## lm(formula = math ~ id, data = nels10, REML = FALSE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.2500 -5.7052 -0.6423 4.4909 24.6667
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 45.7391 1.7735 25.790 < 2e-16 ***
## id7829 -3.5891 2.6005 -1.380 0.16877
## id7930 7.5109 2.4819 3.026 0.00273 **
## id24725 -2.1937 2.5365 -0.865 0.38795
## id25456 4.1245 2.5365 1.626 0.10519
## id25642 0.6609 2.6005 0.254 0.79960
## id62821 17.0818 2.0555 8.310 6.13e-15 ***
## id68448 3.9275 2.5672 1.530 0.12730
## id68493 0.5942 2.5672 0.231 0.81715
## id72292 2.1109 2.6005 0.812 0.41773
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.506 on 250 degrees of freedom
## Multiple R-squared: 0.4369, Adjusted R-squared: 0.4166
## F-statistic: 21.55 on 9 and 250 DF, p-value: < 2.2e-16
anova(fixedANOVA2)
## Analysis of Variance Table
##
## Response: math
## Df Sum Sq Mean Sq F value Pr(>F)
## id 9 14030 1558.95 21.549 < 2.2e-16 ***
## Residuals 250 18086 72.34
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
REML is the default when using lmer
summary( ranintREML <- lmer(math ~ 1 + (1 | sch_id), data=nels10, REML=TRUE) )
## Linear mixed model fit by REML ['lmerMod']
## Formula: math ~ 1 + (1 | sch_id)
## Data: nels10
##
## REML criterion at convergence: 1871.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.34027 -0.65554 -0.08304 0.54211 2.87452
##
## Random effects:
## Groups Name Variance Std.Dev.
## sch_id (Intercept) 34.01 5.832
## Residual 72.26 8.500
## Number of obs: 260, groups: sch_id, 10
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 48.861 1.927 25.35
And when using MLE,
summary( ranintMLE <- lmer(math ~ 1 + (1 | sch_id), data=nels10, REML=FALSE) )
## Linear mixed model fit by maximum likelihood ['lmerMod']
## Formula: math ~ 1 + (1 | sch_id)
## Data: nels10
##
## AIC BIC logLik deviance df.resid
## 1880.8 1891.5 -937.4 1874.8 257
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.33638 -0.65775 -0.08406 0.54767 2.87201
##
## Random effects:
## Groups Name Variance Std.Dev.
## sch_id (Intercept) 30.54 5.526
## Residual 72.24 8.499
## Number of obs: 260, groups: sch_id, 10
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 48.872 1.835 26.63
Computing the ICC by cutting and pasting the variance estimates from the random intercept model (ICC = between-school variance / total variance),
(icc <- (30.54)/(30.54 + 72.24))
## [1] 0.2971395
Or without cutting and pasting,
s <- summary(ranintMLE)
# -- see what's available...we want what's in "varcor" as a data frame
names(s)
## [1] "methTitle" "objClass" "devcomp" "isLmer" "useScale"
## [6] "logLik" "family" "link" "ngrps" "coefficients"
## [11] "sigma" "vcov" "varcor" "AICtab" "call"
## [16] "residuals" "fitMsgs" "optinfo"
(s <- as.data.frame(s$varcor))
## grp var1 var2 vcov sdcor
## 1 sch_id (Intercept) <NA> 30.54173 5.526457
## 2 Residual <NA> <NA> 72.23582 8.499166
(icc2 <- s$vcov[1] / (s$vcov[1] + s$vcov[2]))
## [1] 0.2971634
Later we’ll write a little function to do this.
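As a preview, a minimal sketch of such a function might look like the following (the name icc.lmer is just a placeholder); it pulls the variance components out of a fitted lmer model and returns the intraclass correlation
# -- extract variance components from a fitted lmer model and compute the ICC
icc.lmer <- function(model) {
  vc <- as.data.frame(VarCorr(model))        # variance components
  tau00  <- vc$vcov[vc$grp != "Residual"][1] # between-group (intercept) variance
  sigma2 <- vc$vcov[vc$grp == "Residual"]    # within-group (residual) variance
  tau00 / (tau00 + sigma2)
}
icc.lmer(ranintMLE)   # should reproduce icc2 above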
First, a model with an intercept and homework as a predictor
multreg <- lm(math ~ 1 + homework, data=nels10)
summary(multreg)
##
## Call:
## lm(formula = math ~ 1 + homework, data = nels10)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.9331 -6.6457 0.3543 7.0669 20.9261
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 44.0739 0.9886 44.58 <2e-16 ***
## homework 3.5719 0.3882 9.20 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.682 on 258 degrees of freedom
## Multiple R-squared: 0.247, Adjusted R-squared: 0.2441
## F-statistic: 84.64 on 1 and 258 DF, p-value: < 2.2e-16
anova(multreg)
## Analysis of Variance Table
##
## Response: math
## Df Sum Sq Mean Sq F value Pr(>F)
## homework 1 7933.8 7933.8 84.644 < 2.2e-16 ***
## Residuals 258 24182.8 93.7
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fitmath <- fitted(multreg)
plot(homework, math,
type='p',
main='Linear Regression of Math on Time Spent Doing Homework',
xlab='Time Spent Doing Homework',
ylab='Fitted Math Scores from Linear Regression')
lines(homework,fitmath,col="blue")
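Since all of the fitted values lie on a single straight line, an equivalent way to draw it is directly from the model object
abline(multreg, col="blue")   # same fitted line, without computing fitted values first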
But we should look at the relationship within each school
# Lattice plot with linear regressions
xyplot(math ~ homework | id, col.line='black',
type=c('p','r'),
cex.xlab=1.5,
cex.ylab=1.5,
xlab="Time Spent Doing Homework",
ylab="Math Scores",
cex.main=1.2,
main='Variability in Math ~ Homework relationship')
This basically fits a separate linear regression (intercept and slope) to each school; the formula below is equivalent to math ~ homework * id
mathbysch <- lm(formula = math ~ 1*id + homework * id, data = nels10)
summary(mathbysch)
##
## Call:
## lm(formula = math ~ 1 * id + homework * id, data = nels10)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.3049 -3.4192 -0.2385 4.2495 16.8703
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 50.6835 2.2157 22.875 < 2e-16 ***
## homework -3.5538 1.2523 -2.838 0.004931 **
## id7829 -1.6713 3.7914 -0.441 0.659757
## id7930 -11.9335 3.4121 -3.497 0.000560 ***
## id24725 -16.2897 3.0526 -5.336 2.19e-07 ***
## id25456 3.2551 3.0693 1.061 0.289974
## id25642 -1.4246 3.4066 -0.418 0.676187
## id62821 8.5267 2.8185 3.025 0.002755 **
## id68448 -14.6282 3.7803 -3.870 0.000141 ***
## id68493 -12.1635 3.3997 -3.578 0.000419 ***
## id72292 -12.9696 3.1476 -4.121 5.21e-05 ***
## id7829:homework 0.6337 1.7006 0.373 0.709765
## id7930:homework 11.4629 1.7428 6.577 2.97e-10 ***
## id24725:homework 9.1465 1.5759 5.804 2.04e-08 ***
## id25456:homework -1.1646 2.2340 -0.521 0.602629
## id25642:homework 1.0677 2.2365 0.477 0.633499
## id62821:homework 4.6484 1.3372 3.476 0.000603 ***
## id68448:homework 10.0501 1.7994 5.585 6.31e-08 ***
## id68493:homework 9.4138 2.0381 4.619 6.29e-06 ***
## id72292:homework 9.8888 1.6367 6.042 5.77e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.564 on 240 degrees of freedom
## Multiple R-squared: 0.678, Adjusted R-squared: 0.6525
## F-statistic: 26.59 on 19 and 240 DF, p-value: < 2.2e-16
fschmath <- fitted(mathbysch)
We can plot results
school.list <- unique(id)
length(school.list)
## [1] 10
# Pull out intercepts and slopes from the model object
coef <- coefficients(mathbysch)
# school effects are deviations from the reference school (the first level of id),
# so the reference school itself gets 0
intercepts <- c(0, coef[3:11])
slopes <- c(0, coef[12:20])
# Plot regressions
plot(data = nels10, math~homework,type = 'n',
ylim = c(min(math),max(math)),
xlim = c(min(homework),max(homework)),
cex.main = 1.25,
xlab = 'Time Spent Doing Homework', ylab = "Math Scores",
main = "Separate Regression for Each School")
colors <- c("red", "orange", "darkgoldenrod1","magenta",
"green", "blue", "purple",
"white", "grey", "black")
for(i in 1:length(school.list)){
abline(a = (coef[1] + intercepts[i]), b = (coef[2]+slopes[i]),
col = colors[i])
par<- par(new=F)
}
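With ten colors it is hard to tell the schools apart (and one of the colors is white, which is invisible on the default background), so a legend keyed to the factor levels of id may help
legend("topleft", legend=levels(id), col=colors, lty=1, cex=0.7, title="School")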
We’ll use lmer to fit what is analogous to fitting a regression to each school, but with the school effects treated as random. In the plot below, the black line is the average (population) regression and the colored lines are the conditional regressions (i.e., they include the school-specific estimates of intercept & slope).
summary( model2 <- lmer(math ~ homework + ( homework | id), data=nels10, REML=FALSE) )
## Linear mixed model fit by maximum likelihood ['lmerMod']
## Formula: math ~ homework + (homework | id)
## Data: nels10
##
## AIC BIC logLik deviance df.resid
## 1781.4 1802.7 -884.7 1769.4 254
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.49760 -0.54240 0.02171 0.60686 2.57103
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## id (Intercept) 61.81 7.862
## homework 19.98 4.470 -0.80
## Residual 43.07 6.563
## Number of obs: 260, groups: id, 10
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 44.773 2.603 17.199
## homework 2.049 1.472 1.392
##
## Correlation of Fixed Effects:
## (Intr)
## homework -0.803
school.list <- unique(id)
length(school.list)
## [1] 10
# Pull the fixed and random effects
fixed.effects <- fixef(model2) # these are gammas
rand.effects <- ranef(model2) # these are Us for schools
# these are used to get conditional regressions (conditional on school)
params.student <- cbind( (rand.effects$id[1]+fixed.effects[1]),
(rand.effects$id[2]+fixed.effects[2]) )
plot(data = nels10, math~homework,type = 'n',
ylim = c(min(math),max(math)),
xlim = c(min(homework),max(homework)),
cex.main = .75,
xlab = 'Time Spent Doing Homework', ylab = "Math Scores",
main = "Estimated Conditional Models from Random Intercepts & Slope Model")
abline(a=fixed.effects[1], b= fixed.effects[2], col='black', lwd=2)
for(i in 1:length(school.list)){
abline(a = params.student[i,1], b = params.student[i,2],col = i)
par<- par(new=F)
}
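As a check, coef() on the fitted lmer object returns the school-specific intercepts and slopes directly (fixed effects plus the school-level random effects), which should match params.student computed above
coef(model2)$id   # conditional (school-specific) intercepts and slopes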