This example uses NELS data, but with only 10 schools.

Set Up

First, the libraries that we'll use

library(lme4)
## Loading required package: Matrix
library(lattice)
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer

Read in the data and take a look

setwd("D:/Dropbox/edps587/lectures/2 anovas")

nels10 <- read.table(file="NELS10.txt", header=TRUE)
names(nels10)
##  [1] "sch_id"   "stud_id"  "sex"      "race"     "homework" "schtyp"  
##  [7] "ses"      "par_edu"  "math"     "clastruc" "size"     "urban"   
## [13] "geograph" "perminor" "ratio"
head(nels10)
##   sch_id stud_id sex race homework schtyp   ses par_edu math clastruc size
## 1   7472       3   2    4        1      1 -0.13       2   48        2    3
## 2   7472       8   1    4        0      1 -0.39       2   48        2    3
## 3   7472      13   1    4        0      1 -0.80       2   53        2    3
## 4   7472      17   1    4        1      1 -0.72       2   42        2    3
## 5   7472      27   2    4        2      1 -0.74       2   43        2    3
## 6   7472      28   2    4        1      1 -0.58       2   57        2    3
##   urban geograph perminor ratio
## 1     2        2        0    19
## 2     2        2        0    19
## 3     2        2        0    19
## 4     2        2        0    19
## 5     2        2        0    19
## 6     2        2        0    19
tail(nels10)
##     sch_id stud_id sex race homework schtyp   ses par_edu math clastruc size
## 255  72292      85   2    4        0      1 -0.56       3   46        4    2
## 256  72292      89   1    4        2      1 -0.23       3   43        4    2
## 257  72292      92   1    4        0      1 -1.11       1   45        4    2
## 258  72292      97   1    4        1      1 -0.29       3   47        4    2
## 259  72292      98   1    4        1      1 -1.19       1   34        4    2
## 260  72292      99   2    4        1      1  0.45       4   53        4    2
##     urban geograph perminor ratio
## 255     1        1        1    17
## 256     1        1        1    17
## 257     1        1        1    17
## 258     1        1        1    17
## 259     1        1        1    17
## 260     1        1        1    17

I tend to be a bit lazy, so I attach the data frame; that way I can use a variable name without also prefixing it with the data frame name.

attach(nels10)
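
If you'd rather not attach, the same thing can be done explicitly with $ indexing or with(); a quick sketch:

# Alternatives to attach(): reference variables through the data frame
mean(nels10$math)            # explicit $ indexing
with(nels10, mean(math))     # with() evaluates the expression inside nels10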

A First Look

Descriptive Statistics

We could use base R

summary(math)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    31.0    42.0    49.5    51.3    62.0    71.0
mean(math)
## [1] 51.3
var(math)
## [1] 124.0023
# Summary statistics by school

mn <- aggregate(math~sch_id, data=nels10,  mean)
sd <- aggregate(math~sch_id, data=nels10,  sd)
med <- aggregate(math~sch_id, data=nels10, median)
simple <- cbind(mn, sd[,2], med[,2])
round(simple,digits=2)
##    sch_id  math sd[, 2] med[, 2]
## 1    7472 45.74    7.53     47.0
## 2    7829 42.15    8.32     41.0
## 3    7930 53.25   11.52     52.5
## 4   24725 43.55   10.01     41.0
## 5   25456 49.86    8.44     50.5
## 6   25642 46.40    4.32     46.5
## 7   62821 62.82    5.68     63.0
## 8   68448 49.67   10.34     46.0
## 9   68493 46.33    9.55     43.0
## 10  72292 47.85   11.30     44.5
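
The column names above are a bit ugly; a variant (stored as simple2 so the simple object used below is unchanged) that labels them:

# School-level summaries with informative column names
simple2 <- aggregate(math ~ sch_id, data = nels10,
                     FUN = function(x) c(mean = mean(x), sd = sd(x), median = median(x)))
simple2 <- do.call(data.frame, simple2)   # flatten the matrix column
names(simple2) <- c("sch_id", "mean", "sd", "median")
simple2[, -1] <- round(simple2[, -1], digits = 2)
simple2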

Or the stargazer package

stargazer(nels10, type="text")
## 
## ===================================================================
## Statistic  N     Mean     St. Dev.   Min   Pctl(25) Pctl(75)  Max  
## -------------------------------------------------------------------
## sch_id    260 41,023.790 25,855.920 7,472   7,930    62,821  72,292
## stud_id   260   49.869     28.943     0      24.8     73.2     99  
## sex       260   1.492      0.501      1       1        2       2   
## race      260   3.577      0.780      1       3        4       4   
## homework  260   2.023      1.550      0       1        3       7   
## schtyp    260   1.773      1.315      1       1        4       4   
## ses       260   -0.073     0.970    -2.410  -0.770   0.812   1.850 
## par_edu   260   3.177      1.517      1       2        4       6   
## math      260   51.300     11.136     31      42       62      71  
## clastruc  260   3.562      0.761      2       3        4       5   
## size      260   3.269      1.377      2       2       3.2      6   
## urban     260   1.773      0.929      1       1        3       3   
## geograph  260   2.331      0.614      1       2        3       3   
## perminor  260   2.835      2.266      0       1        5       7   
## ratio     260   14.535     3.948      10      10       18      22  
## -------------------------------------------------------------------

and by school (output not shown),

by(nels10, nels10$sch_id, stargazer, type = 'text')

Plot the data

We’ll plot the school mean math scores with +/- one standard deviation.

# -- numbers 1 to 10 as an index variable for plotting
simple$index <- 1:10

plot(simple$index, simple$math, pch=19,
     xlab="School",
     ylab="Math",
     xlim=c(min(simple$index),max(simple$index)),
     ylim=c(30,70),
     main="School Mean Math Score +/- one Standard Deviation"
     )
lines(rbind(1:10,1:10,NA),rbind(simple$math-simple[,3],simple$math+simple[,3],NA))
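
The lines(rbind(...)) trick draws each bar as a vertical segment; the more common base R idiom is arrows() with flat heads, which gives the same picture:

# Equivalent +/- 1 SD bars using arrows(); angle = 90 gives flat caps
arrows(simple$index, simple$math - simple[, 3],
       simple$index, simple$math + simple[, 3],
       angle = 90, code = 3, length = 0.05)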

Some Models

No Predictors – just the mean of math

This is plain old linear regression with no predictors, so the fitted value is simply the overall mean of math; note below that the residual mean square (124) equals the sample variance of math.

summary(linreg <- lm(math ~ 1, data=nels10))
## 
## Call:
## lm(formula = math ~ 1, data = nels10)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -20.3   -9.3   -1.8   10.7   19.7 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  51.3000     0.6906   74.28   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.14 on 259 degrees of freedom
anova(linreg)                    # to get ANOVA table
## Analysis of Variance Table
## 
## Response: math
##            Df Sum Sq Mean Sq F value Pr(>F)
## Residuals 259  32117     124
plot(linreg, las=1)              # diagnostic plots (residuals vs. fitted, etc.)

## hat values (leverages) are all = 0.003846154
##  and there are no factor predictors; no plot no. 5

ANOVA – least squares

Here the predictor is school, so each school has its own mean

id <- as.factor(sch_id)
summary( fixedANOVA <- lm(math ~ id, data=nels10,REML=TRUE) )
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'REML' will be disregarded
## 
## Call:
## lm(formula = math ~ id, data = nels10, REML = TRUE)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.2500  -5.7052  -0.6423   4.4909  24.6667 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  45.7391     1.7735  25.790  < 2e-16 ***
## id7829       -3.5891     2.6005  -1.380  0.16877    
## id7930        7.5109     2.4819   3.026  0.00273 ** 
## id24725      -2.1937     2.5365  -0.865  0.38795    
## id25456       4.1245     2.5365   1.626  0.10519    
## id25642       0.6609     2.6005   0.254  0.79960    
## id62821      17.0818     2.0555   8.310 6.13e-15 ***
## id68448       3.9275     2.5672   1.530  0.12730    
## id68493       0.5942     2.5672   0.231  0.81715    
## id72292       2.1109     2.6005   0.812  0.41773    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.506 on 250 degrees of freedom
## Multiple R-squared:  0.4369, Adjusted R-squared:  0.4166 
## F-statistic: 21.55 on 9 and 250 DF,  p-value: < 2.2e-16
anova(fixedANOVA)
## Analysis of Variance Table
## 
## Response: math
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## id          9  14030 1558.95  21.549 < 2.2e-16 ***
## Residuals 250  18086   72.34                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Note the warning: lm() has no REML argument and simply disregards it, so the estimation here is ordinary least squares, which gives the standard fixed-effects ANOVA results.
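
For comparison, aov() produces the same least-squares ANOVA table directly:

# Same fixed-effects ANOVA via aov(); identical least-squares fit
summary(aov(math ~ id, data = nels10))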

ANOVA via MLE

Check whether these results differ from those above with REML=TRUE. They don't: lm() disregards the REML argument in both calls, so each is the identical least-squares fit.

id <- as.factor(sch_id)
summary( fixedANOVA2 <- lm(math ~ id, data=nels10, REML=FALSE) )
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'REML' will be disregarded
## 
## Call:
## lm(formula = math ~ id, data = nels10, REML = FALSE)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.2500  -5.7052  -0.6423   4.4909  24.6667 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  45.7391     1.7735  25.790  < 2e-16 ***
## id7829       -3.5891     2.6005  -1.380  0.16877    
## id7930        7.5109     2.4819   3.026  0.00273 ** 
## id24725      -2.1937     2.5365  -0.865  0.38795    
## id25456       4.1245     2.5365   1.626  0.10519    
## id25642       0.6609     2.6005   0.254  0.79960    
## id62821      17.0818     2.0555   8.310 6.13e-15 ***
## id68448       3.9275     2.5672   1.530  0.12730    
## id68493       0.5942     2.5672   0.231  0.81715    
## id72292       2.1109     2.6005   0.812  0.41773    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.506 on 250 degrees of freedom
## Multiple R-squared:  0.4369, Adjusted R-squared:  0.4166 
## F-statistic: 21.55 on 9 and 250 DF,  p-value: < 2.2e-16
anova(fixedANOVA2)
## Analysis of Variance Table
## 
## Response: math
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## id          9  14030 1558.95  21.549 < 2.2e-16 ***
## Residuals 250  18086   72.34                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
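
To actually see ML versus REML differ for a fixed-effects model, gls() from the nlme package (an assumption here; nlme is not loaded above) takes a method argument. The ML residual standard error is smaller because ML divides the residual sum of squares by n rather than n - p:

# ML vs. REML residual SD for the fixed-effects ANOVA, via nlme::gls()
library(nlme)
gls(math ~ id, data = nels10, method = "ML")$sigma
gls(math ~ id, data = nels10, method = "REML")$sigma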

Random Intercept HLM (Random Effects ANOVA)

REML is the default when using lmer. The random intercept model is math_ij = gamma_00 + U_0j + R_ij, with U_0j ~ N(0, tau_00) and R_ij ~ N(0, sigma^2).

summary( ranintREML <- lmer(math ~ 1 + (1 | sch_id), data=nels10, REML=TRUE) )
## Linear mixed model fit by REML ['lmerMod']
## Formula: math ~ 1 + (1 | sch_id)
##    Data: nels10
## 
## REML criterion at convergence: 1871.7
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.34027 -0.65554 -0.08304  0.54211  2.87452 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  sch_id   (Intercept) 34.01    5.832   
##  Residual             72.26    8.500   
## Number of obs: 260, groups:  sch_id, 10
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)   48.861      1.927   25.35

And when using MLE; note that the school variance estimate below (30.54) is smaller than the REML estimate (34.01) because ML does not account for the degrees of freedom used in estimating the fixed effects,

summary( ranintMLE <- lmer(math ~ 1 + (1 | sch_id), data=nels10, REML=FALSE) )
## Linear mixed model fit by maximum likelihood  ['lmerMod']
## Formula: math ~ 1 + (1 | sch_id)
##    Data: nels10
## 
##      AIC      BIC   logLik deviance df.resid 
##   1880.8   1891.5   -937.4   1874.8      257 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.33638 -0.65775 -0.08406  0.54767  2.87201 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  sch_id   (Intercept) 30.54    5.526   
##  Residual             72.24    8.499   
## Number of obs: 260, groups:  sch_id, 10
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)   48.872      1.835   26.63

Intra-class Correlation

Just cutting and pasting the variance estimates from the random intercept model into ICC = tau_00 / (tau_00 + sigma^2)

(icc <- (30.54)/(30.54 + 72.24))
## [1] 0.2971395

Or without cutting and pasting,

s <- summary(ranintMLE)
# -- see what's available; we want what's in "varcor" as a data frame
names(s)
##  [1] "methTitle"    "objClass"     "devcomp"      "isLmer"       "useScale"    
##  [6] "logLik"       "family"       "link"         "ngrps"        "coefficients"
## [11] "sigma"        "vcov"         "varcor"       "AICtab"       "call"        
## [16] "residuals"    "fitMsgs"      "optinfo"
(s <- as.data.frame(s$varcor))
##        grp        var1 var2     vcov    sdcor
## 1   sch_id (Intercept) <NA> 30.54173 5.526457
## 2 Residual        <NA> <NA> 72.23582 8.499166
(icc2 <- s$vcov[1] / (s$vcov[1] + s$vcov[2]))
## [1] 0.2971634

Later we’ll write a little function to do this.
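
A minimal sketch of what such a function might look like (the name icc.lmer is just illustrative, and it assumes the model's only random effect is the intercept, so the first variance component is tau_00):

# ICC for a random-intercept-only lmer fit: tau_00 / (tau_00 + sigma^2)
icc.lmer <- function(model) {
  vc <- as.data.frame(VarCorr(model))   # variance components as a data frame
  vc$vcov[1] / sum(vc$vcov)
}
icc.lmer(ranintMLE)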

Linear Regressions

First, a model with an intercept and homework

multreg <- lm(math ~ 1 + homework, data=nels10)
summary(multreg)
## 
## Call:
## lm(formula = math ~ 1 + homework, data = nels10)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -28.9331  -6.6457   0.3543   7.0669  20.9261 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  44.0739     0.9886   44.58   <2e-16 ***
## homework      3.5719     0.3882    9.20   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.682 on 258 degrees of freedom
## Multiple R-squared:  0.247,  Adjusted R-squared:  0.2441 
## F-statistic: 84.64 on 1 and 258 DF,  p-value: < 2.2e-16
anova(multreg)
## Analysis of Variance Table
## 
## Response: math
##            Df  Sum Sq Mean Sq F value    Pr(>F)    
## homework    1  7933.8  7933.8  84.644 < 2.2e-16 ***
## Residuals 258 24182.8    93.7                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fitmath <- fitted(multreg)
plot(homework, math, 
      type='p', 
      main='Linear Regression of Math on Time Spent Doing Homework',
      xlab='Time Spent Doing Homework', 
      ylab='Fitted Math Scores from Linear Regression')
lines(homework,fitmath,col="blue")
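
Since the fit is a single straight line, abline() can also draw it directly from the model object, without relying on the order of the homework values:

# Equivalent: draw the fitted line straight from the lm object
abline(multreg, col = "blue")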

By School

But we should look at things for each school

# Lattice plot with linear regressions
xyplot(math ~ homework | id, col.line='black', 
       type=c('p','r'),
       cex.xlab=1.5,
       cex.ylab=1.5,
       xlab="Time Spent Doing Homework",
       ylab="Math Scores",
       cex.main=1.2,
       main='Variability in Math ~ Homework relationship')

This will basically fit a linear regression to each school

# the "1*id" term is redundant; this is equivalent to math ~ homework * id,
# i.e., a separate intercept and homework slope for each school
mathbysch <- lm(formula = math ~ 1*id + homework * id, data = nels10)
summary(mathbysch)
## 
## Call:
## lm(formula = math ~ 1 * id + homework * id, data = nels10)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.3049  -3.4192  -0.2385   4.2495  16.8703 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       50.6835     2.2157  22.875  < 2e-16 ***
## homework          -3.5538     1.2523  -2.838 0.004931 ** 
## id7829            -1.6713     3.7914  -0.441 0.659757    
## id7930           -11.9335     3.4121  -3.497 0.000560 ***
## id24725          -16.2897     3.0526  -5.336 2.19e-07 ***
## id25456            3.2551     3.0693   1.061 0.289974    
## id25642           -1.4246     3.4066  -0.418 0.676187    
## id62821            8.5267     2.8185   3.025 0.002755 ** 
## id68448          -14.6282     3.7803  -3.870 0.000141 ***
## id68493          -12.1635     3.3997  -3.578 0.000419 ***
## id72292          -12.9696     3.1476  -4.121 5.21e-05 ***
## id7829:homework    0.6337     1.7006   0.373 0.709765    
## id7930:homework   11.4629     1.7428   6.577 2.97e-10 ***
## id24725:homework   9.1465     1.5759   5.804 2.04e-08 ***
## id25456:homework  -1.1646     2.2340  -0.521 0.602629    
## id25642:homework   1.0677     2.2365   0.477 0.633499    
## id62821:homework   4.6484     1.3372   3.476 0.000603 ***
## id68448:homework  10.0501     1.7994   5.585 6.31e-08 ***
## id68493:homework   9.4138     2.0381   4.619 6.29e-06 ***
## id72292:homework   9.8888     1.6367   6.042 5.77e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.564 on 240 degrees of freedom
## Multiple R-squared:  0.678,  Adjusted R-squared:  0.6525 
## F-statistic: 26.59 on 19 and 240 DF,  p-value: < 2.2e-16
fschmath <- fitted(mathbysch)
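
As a check, lme4 also provides lmList(), which literally fits a separate lm() per school and collects the coefficients; these should match the per-school intercepts and slopes implied by the interaction model:

# Per-school regressions via lmList() from lme4
sepfits <- lmList(math ~ homework | id, data = nels10)
coef(sepfits)   # one intercept and one homework slope per school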

We can plot the results

school.list <- unique(id)
length(school.list)
## [1] 10
# Pull out intercepts and slopes from model objects
coef <- coefficients(mathbysch)
intercepts <- c(0, coef[3:11])
slopes <- c(0, coef[12:20])

# Plot regressions
plot(data = nels10, math~homework,type = 'n', 
     ylim = c(min(math),max(math)),
     xlim = c(min(homework),max(homework)),
     cex.main = 1.25,
     xlab = 'Time Spent Doing Homework', ylab = "Math Scores",
     main = "Separate Regression for Each School")
colors <- c("red", "orange", "darkgoldenrod1", "magenta",
            "green", "blue", "purple",
            "brown", "grey", "black")   # "brown" rather than "white", which is invisible on a white background
for(i in 1:length(school.list)){
  abline(a = (coef[1] + intercepts[i]), b = (coef[2] + slopes[i]),
         col = colors[i])
}
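
If we want a key for the colors, note that the coefficients are in factor-level order, so levels(id), not unique(id) (which is in data order), supplies the matching labels:

# Legend mapping the line colors to schools
legend("topleft", legend = levels(id), col = colors, lty = 1, cex = 0.7)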

We’ll use lmer to fit what is analogous to fitting a regression to each school, but with the school effects treated as random. In the plot below, the black line is the average (population) regression and the multicolored lines are the conditional regressions (i.e., they include the school-specific estimates of intercept & slope).

summary( model2 <- lmer(math ~ homework + ( homework | id), data=nels10, REML=FALSE) )
## Linear mixed model fit by maximum likelihood  ['lmerMod']
## Formula: math ~ homework + (homework | id)
##    Data: nels10
## 
##      AIC      BIC   logLik deviance df.resid 
##   1781.4   1802.7   -884.7   1769.4      254 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.49760 -0.54240  0.02171  0.60686  2.57103 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  id       (Intercept) 61.81    7.862         
##           homework    19.98    4.470    -0.80
##  Residual             43.07    6.563         
## Number of obs: 260, groups:  id, 10
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)   44.773      2.603  17.199
## homework       2.049      1.472   1.392
## 
## Correlation of Fixed Effects:
##          (Intr)
## homework -0.803
school.list <- unique(id)
length(school.list)
## [1] 10
# Pull the fixed and random effects
fixed.effects <- fixef(model2)    # these are gammas
rand.effects <- ranef(model2)     # these are Us for schools
# these are used to get the conditional regressions (conditional on school)
params.school <- cbind( (rand.effects$id[1]+fixed.effects[1]),
                        (rand.effects$id[2]+fixed.effects[2]) )
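
Equivalently, coef() on an lmer fit returns the fixed-plus-random coefficients for each school in one step, which should match params.school above:

# coef() combines the fixed and random effects per school
coef(model2)$id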

plot(data = nels10, math~homework,type = 'n', 
     ylim = c(min(math),max(math)),
     xlim = c(min(homework),max(homework)),
     cex.main = .75,
     xlab = 'Time Spent Doing Homework', ylab = "Math Scores",
     main = "Estimated Conditional Models from Random Intercepts & Slope Model")

abline(a=fixed.effects[1], b= fixed.effects[2], col='black', lwd=2)

for(i in 1:length(school.list)){
  abline(a = params.school[i,1], b = params.school[i,2], col = i)
}