#  Edps/Psych/Stat 587
#  Spring 2020
#  Carolyn J. Anderson
#
#  R-code to reproduce material in Lecture notes on Random Intercept and Slope Models
#    for NELS N=23 data
#  

# Variables and coding of nels23 data
#
#      school    = 'SCHOOL ID'
#      id        = 'STUDENT ID'
#      gender    = 'STUDENT GENDER'         1 = 'MALE'     2 = 'FEMALE'         . = 'MISSING'
#      race      = 'STUDENT RACE'           1 = 'ASIAN/PI' 2 = 'HISPANIC'        3 = 'BLACK' 
#                                           4 = 'WHITE'    5 = 'NATIVE AMERICAN' . = 'MISSING'
#      homew     = 'TIME ON MATH HOMEWORK' 
#            0 = 'NONE' 1 = 'LT 1 HOUR'    2 = '1 HOUR'      3 = '2 HOURS' 4 = '3 HOURS' 5 = '4 TO 6 HOURS'
#            6 = '7 TO 9 HOURS' 7 = 'GE 10 HOURS' . = 'MISSING';
#      schtyp   = 'SCHOOL TYPE'
#            1 = 'PUBLIC'      2 = 'CATHOLIC' 3 = 'PRIVATE RELIGIOUS'  4 = 'PRIVATE'   . = 'MISSING'
#      ses       = 'SOCIOECONOMIC STATUS'
#      pared     = 'PARENTAL EDUCATION'
#                    1 = 'NO HS DEGREE'  2 = 'HS GRAD / GED' 3 = 'HS / LT 4 YEAR COLLEGE'   
#                    4 = 'COLLEGE GRAD'  5 = "MASTER'S"      6 = 'DOCTORATE'     . = 'MISSING'
#      math      = 'MATH SCORE'
#      classstr  = 'CLASS STRUCTURE'
#      schsize   = 'SCHOOL SIZE'
#                   1 = '1 TO 199'       2 = '200 TO 399'     3 = '400 TO 599'   4 = '600 TO 799'    
#                   5 = '800 TO 999'     6 = '1000 TO 1199'   7 = '1200+'        . = 'MISSING'
#      urban     = 'URBANICITY'    1 = 'URBAN'   2 = 'SUBURBAN'   3 = 'RURAL'    . = 'MISSING'
#      geo       = 'GEOGRAPHIC REGION'
#                   1 = 'NORTHEAST'    2 = 'NORTH CENTRAL'    3 = 'SOUTH'    4 = 'WEST'    . = 'MISSING'
#      minority  = 'PERCENT MINORITY'
#            0 = '0%'       1 = '1-5%'       2 = '6-10%'      3 = '11-20%'   4 = '21-40%'   
#            5 = '41-60%'   6 = '61-90%'    7 = '91-100%'    . = 'MISSING'
#      ratio     = 'STUDENT-TEACHER RATIO' 10 = '  LE 10'    . = 'MISSING'
#      public    = 'PUBLIC SCHOOL BINARY'   0 = 'ALL OTHERS'   1 = 'PUBLIC'   . = 'MISSING'

#
# Load packages that we'll need 
#
library(lme4)
library(lmerTest)
library(lattice)
library(texreg)   # nice lmer output

# Set working dircetory
setwd('D/Dropbox/edps587/lectures/4 randomslope')

# Read in data
nels23 <- read.table(file="school23_data.txt", header=FALSE)
names(nels23) <- c('id', 'student', 'gender', 'race', 'homework', 'schtype',
                   'ses', 'pared', 'math', 'classstr', 'schsize', 'urban', 'geo',
                   'minority', 'ratio')

nels23 <- nels23[order(id),]

#
# Names of variables and Look at first 23
#
names(nels23)
head(nels23,25)
#
# compute mean ses and center ----------- *********schtyp gets messed up somehow*********
#
meanses <- aggregate(ses~id, data=nels23, "mean")
names(meanses) <- c('id','meanses')
nels23 <- merge(nels23,meanses, by = c('id'))

cses <- nels23$ses - nels23$meanses
nels23 <- cbind(nels23,cses)

#
#  Recheck data after making new variables---do this 
#  everytime after you muck with the data.
#
names(nels23)
head(nels23)

#
# Recode race as a binary variable in new column
# create  new column

nels23$white <- NA

# The recoding

nels23$white[nels23$race==4] <- 1
nels23$white[nels23$race<=3] <- 0
nels23$white[nels23$race>=5] <- 0

#
# ReCode schtype to private
#

nels23$private <- NA
nels23$private[nels23$schtype==1] <- 0
nels23$private[nels23$schtype>=2] <- 1

names(nels23)
head(nels23)


#####################################################
# Result in lecture notes
#####################################################
#####################################
# First look at data: one per school#
#####################################
par(mfrow=c(1,1)) 


xyplot(nels23$math ~ nels23$homework | as.factor(nels23$id),type=c('p','r'))

#
#  All regressions in one 
#

# You only need to do the following once...this gives you various things in
# Data set-up

# sort by schools:
nels23 <- nels23[order(nels23$id) , ]

# Get information on number schools & create consecutive integer school.id
school <- unique(nels23$id)                        ##  
N <- length(school)                                  ## need N and school.id
school.int <- as.data.frame(cbind(school,seq(1:N)))  ##  for dataList & model  
names(school.int) <- c("id", "school.id")        ##
nels23 <- merge(nels23,school.int,by="id")           ##

# total number of students
n <- length(nels23$math)                               ## need n for dataList & model

# Double-check set-up
head(nels23,n=50)

# You don't need this but might want it (sample size per school)
nj <- matrix(999,nrow=N,ncol=1)
for (j in 1:N) {
  nj[j] <- nrow(subset(nels23,nels23$school.id==j))
}  

# Graph to illustrate need for random intercept and slope
plot(nels23$homew,nels23$math,
	type="n", 
	col="blue",
	lwd=2,
	main="NELS: Linear Regression by School \n(for where there is data)",
	xlab="Time Spent Doing Homework",
	ylab="Math Scores"
	)
for (j in 1:N) {
  sub <- subset(nels23,nels23$school.id==j)
  fitted <- fitted(lm(math~homework,sub))
  lines(sub$homework,fitted,col=j)
}  

########################
require(lmerTest)
#######################

#p25 --- number of schools
nschools <- length(unique(nels23$id))
nschools

#
# page 27   Null model = random effects ANOVA
#
# Fit model and get default summary)
summary(model.null <- lmer(math ~ 1 + (1|id), data=nels23, REML=FALSE))

# Nicer (summary)
# - with se for coeffiecents
screenreg(model.null,single.row=TRUE,custom.model.names=c("Null"))
# - with 95% CI rather than se for coefficients
screenreg(model.null,single.row=TRUE,custom.model.names=c("Null"),ci.force=TRUE)

# for more information on using screenreg see
# https://www.rdocumentation.org/packages/texreg/versions/1.36.18/topics/texreg

#
# Computing ICC (one of many)
#
vars <- as.data.frame(VarCorr(model.null))[4]
total <- sum(vars)
tau00 <- vars[1,1]
icc.0 <- tau00/total
round(icc.0,digits=4)

#
# page 30 Random intercept: homework
#
summary(model.ri <- lmer(math ~ 1 + homework + (1|id), data=nels23, REML=FALSE))

vars <- as.data.frame(VarCorr(model.ri))[4]
total <- sum(vars)
tau00 <- vars[1,1]
icc1 <- tau00/total
round(icc1,digits=4)

#
#p 35 Random intercept and slope homework
#
require(lmerTest)
summary(model.ris <- lmer(math ~ 1 + homework + (1 + nels23$homew|id), data=nels23, REML=FALSE))


# if you want the rather than correlation:  cov(U0j,U1j) = corr(U0j,U1j)*std(U0j)*st(U1j)
covU0U1 <- -.83*7.700*4.097
covU0U1

# Or if you don't want to compute cov(U0j,U1j) but want it...
as.data.frame(VarCorr(model.ris))[4]

#
# Comparison and summary so far....note that covariance(U0,U1) is given
#
screenreg(list(model.null,model.ri,model.ris),single.row=TRUE,custom.model.names=c("Null","Random Intercept", "Random Int & Slope"))
#
#

#
#p 40  Plot of school mean math +/1 1 standard deviation
#
mavg <- aggregate(math~id, data=nels23, "mean")
msdev <- aggregate(math~id, data=nels23, "sd")
sch.id <- seq(1:23)

plot(sch.id, mavg[,2], type='p',
    ylim=range(c(mavg[,2]-msdev[,2], mavg[,2]+msdev[,2])),
    pch=19, 
    xlab="Schools", 
     ylab="Mean Math +/- SD",
     main="School Mean Math +/- 1 Std.Dev."
 )

# hack: we draw arrows but with very special "arrowheads"
# For this graph, thanks to http://stackoverflow.com/questions/13032777/scatter-plot-with-error-bars

arrows(sch.id, mavg[,2]-msdev[,2], sch.id, mavg[,2]+msdev[,2], length=0.05, angle=90, code=3)


#
#page 41  Graph of overall regression
# Plot population average regression
#
intercept <- fixef(model.ris)[1]
slope <- fixef(model.ris)[2]

random <- ranef(model.ris)
student.int <- random$id[1]
student.slope <- random$id[2]


plot(math ~ homework,type = 'n', data = nels23,
     cex.main = 1.5,
     xlab = 'Time Spent Doing Homework',  
     ylab = "Fitted Math Scores",
     main = "Population Average Regression "
     )
# Population average line
for(i in min(nels23$homework):max(nels23$homework)){
   abline(a=intercept, b=slope, col='black',lwd=4)
}


#
#page 42  Population and School Specific
#
school.list <- unique(nels23$id)

plot(math~ nels23$homew,type = 'n', data = nels23, 
     ylim = c(30, 80),
     xlim = c(0, 7),
     cex.main = 1.5,
     xlab = 'Time Spent Doing Homework',  
     ylab = "Fitted Math Scores",
     main = "Population & School Specific Regressions "
     )
# draw in school specific line
for (i in 1:length(school.list)){
   abline(a=(intercept+student.int[i,]), b=(slope+student.slope[i,]), type='l', col=i)
  }

# Population average line
for(i in min(nels23$homew):max(nels23$homew)){
   abline(a=intercept, b=slope, col='black',lwd=4)
}

#
#p 44 School specific for random intercept model
#

intercept <- fixef(model.ri)[1]
slope <- fixef(model.ri)[2]

random <- ranef(model.ri)
student.int <- random$id[1]

plot(math~ homework,type = 'n', data = nels23, 
     ylim = c(30, 80),
     xlim = c(0, 7),
     cex.main = 1.5,
     xlab = 'Time Spent Doing Homework',  
     ylab = "Fitted Math Scores",
     main = "Population & School Specific: Random Intercept "
     )
# draw in school specific line
for (i in 1:length(school.list)){
   abline(a=(intercept+student.int[i,]), b=(slope), type='l', col=i)
  }


#
#page 45  Graph of estimated variances of random intercept and intercept & slope
#
hmwk <- seq(from=1, to=7, by=.1)
vars.ri <- as.data.frame(VarCorr(model.ri))[4]
total.ri <- rep(sum(vars.ri), 61)

vars.ris <- as.data.frame(VarCorr(model.ris))[4]
total.ris<- vars.ris[1,] + 2*vars.ris[3,]*hmwk + vars.ris[2,]*(hmwk**2) + vars.ris[4,]

plot(hmwk,total.ris,type = 'l', 
     cex.main = 1.5,
     col='blue',
     xlim=c(1,7),
     ylim=c(0,700),
     xlab = 'Time Spent Doing Homework',  
     ylab = "Estimated Variance of Math Scores",
     main = "Variance Function"
     )
# random inercept
     abline(a=sum(vars.ri), b=0, col='red')

legend(1,690, 
       c('Random Intercept & Slope','Random Intercept'),
       lty=c(1,1),
       lwd=c(2.5,2.5),
       col=c('blue','red'))


#
# p87 Random intercept and slope homework, but 2 more level 1 variables
#


summary(model1 <- lmer(math ~ homework + private + white + (1 + homework | id), data=nels23, REML=FALSE))

# and if you want cov(U0j,U1j) ...

as.data.frame(VarCorr(model1))[4]


#
# p92 Random slope for white & homework--- Model 2
#

summary(model2 <- lmer(math ~ homework + private + white + (1 + homework + white | id), data=nels23, REML=FALSE))
as.data.frame(VarCorr(model2))[4]


#
# p96 Random slope for private & homework --- bad model but warning isn't convincing
#

summary(model3 <- lmer(math ~ homework + private + white + (1 + homework + private | id), data=nels23, REML=FALSE))
as.data.frame(VarCorr(model3))[4]


#
# p72 --- working with Model 2,  plot of average schools for different levels of discrete variables
#

par(mfrow=c(1,1)) 

fixed <- fixef(model2)

prv_wht <- fixed[1] + fixed[2]*nels23$homework + fixed[3] + fixed[4]


plot(nels23$homework, prv_wht, type='l',
     col='black',
     ylim = c(40, 70),
     xlim = c(1,7),
     xlab='Time Spent Doing Homework',
     ylab='Fitted Math Scores',
     main='Population Average Regressions (Model 2)'
)
abline(a=(fixed[1] + fixed[4]), b=fixed[2], col='red')
abline(a=(fixed[1] + fixed[3]), b=fixed[2], col='blue')
abline(a=(fixed[1]), b=fixed[2], col='green')
legend(1,70,
       c('Private, White', 'Public, White', 'Private, Non-White', 'Public, Non-White'),
       lty=c(1,1,1,1),
       col=c('black', 'red', 'blue', 'green')
    )

#
# Page 99 --- working with Model 2, plot of the 23 school regressions (HLM)
#
# Population and School Specific
#
# These are model based and don't take into acount where the data are (lectures notes
#   figures are better)
#

par(mfrow=c(2,2)) 

random <- ranef(model2, level=0:2)
U0j <- random$id[,1]
U1j <- random$id[,2]
U2j <- random$id[,3]

plot(nels23$math ~ nels23$homework, type = 'n', 
     cex.main = 1.5,
     xlab = 'Time Spent Doing Homework',  
     ylab = "Fitted Math Scores",
     main = "Private, White "
     )
# draw in scho0l specific lines:  Private, white
for (i in 1:23){
    abline(a=(fixed[1]+fixed[3]*homework +fixed[4] + U0j[i]+ U2j[i]), b=(fixed[2]+U1j[i]), type='l', col=i)
  }

plot(nels23$math ~ nels23$homework, type = 'n', 
     cex.main = 1.5,
     xlab = 'Time Spent Doing Homework',  
     ylab = "Fitted Math Scores",
     main = "Private, Non-White "
     )
# draw in scho0l specific lines:  Private, non-white
for (i in 1:23){
    abline(a=(fixed[1]+fixed[3] + U0j[i]), b=(fixed[2]+U1j[i]), type='l', col='blue')
  }


plot(math ~ homework, type = 'n', 
     cex.main = 1.5,
     xlab = 'Time Spent Doing Homework',  
     ylab = "Fitted Math Scores",
     main = "Public, White "
     )
# draw in scho0l specific lines:  Public, white
for (i in 1:23){
    abline(a=(fixed[1]+fixed[4] + U0j[i] + U2j[i]), b=(fixed[2]+U1j[i]), type='l', col='blue')
  }


plot(math ~ homework, type = 'n', 
     cex.main = 1.5,
     xlab = 'Time Spent Doing Homework',  
     ylab = "Fitted Math Scores",
     main = "Public, non-White "
     )
# draw in scho0l specific lines:  Public, non-white
for (i in 1:23){
    abline(a=(fixed[1]+ U0j[i]), b=(fixed[2]+U1j[i]), type='l', col='blue')
  }


##################################################################
# Different kinds of centering                                   #
##################################################################

nels23$homework.mean <- rep(mean(nels23$homework), 519)

# Grand mean center homew
nels23$homew.GMC <- nels23$homework - mean(nels23$homework)


# Group Mean center
homew.GpM <- aggregate(homework~id, data=nels23, "mean")
names(homew.GpM) <- c('id','homew.GpM')
nels23 <- merge(nels23,homew.GpM, by = c('id'))

nels23$homew.GpMC <- nels23$homework - nels23$homew.GpM


###############################################################
# Random Intercept Models with different types of centering   #
###############################################################

#
# Raw Score
#

summary(model.raw <- lmer(math ~ 1 + homework + (1 |id), nels23, REML=FALSE))
(n2lnlike <- -2* logLik(model.raw))
# or 
(dev.raw <- deviance(model.raw))

#
# Grand mean centered
#
model.GMC <- lmer(math ~ 1 + nels23$homew.GMC + (1 |id), nels23, REML=FALSE)
summary(model.GMC)
(dev.GMC <- deviance(model.GMC))



#
# Group (school) mean centered
#
model.GpMC <- lmer(math ~ 1 + nels23$homew.GpMC + (1 |id), nels23, REML=FALSE)
summary(model.GpMC)
deviance(model.GpMC)



#
# Raw Score + School Mean
#
model.RGpm <- lmer(math ~ 1 + homework + nels23$homew.GpM + (1 |id), nels23, REML=FALSE)
summary(model.RGpm)
deviance(model.RGpm)



#
# School mean centered + School Mean
#

summary(model.GpMC <- lmer(math ~ 1 + nels23$homew.GpMC + nels23$homew.GpM + (1 |id), nels23, REML=FALSE))
deviance(model.GpMC)

#######################################################################
# Random Intercept & Slope Models with different types of centering   #
#######################################################################

#
# Raw Score
#

summary(model.raw2 <- lmer(math ~ 1 + homework + (1 + homework |id), nels23, REML=FALSE))

#
# Grand mean centered
#
summary(model.GMC <- lmer(math ~ 1 + nels23$homew.GMC + (1 + nels23$homew.GMC|id), nels23, REML=FALSE))

#
# Group (school) mean centered
#
summary(model.GpMC2 <- lmer(math ~ 1 + nels23$homew.GpMC + (1 + nels23$homew.GpMC|id), nels23, REML=FALSE))

#
# Raw Score + School Mean
#
summary(model.RGpm2 <- lmer(math ~ 1  + homework + nels23$homew.GpM + (1  + homework |id), nels23, REML=FALSE))

#
# School mean centered + School Mean
#
summary(model.GpMC2b <- lmer(math ~ 1 + nels23$homew.GpMC + nels23$homew.GpM + (1 + nels23$homew.GpMC|id), nels23, REML=FALSE))

screenreg(list(model.raw2,model.GMC,model.GpMC2,model.RGpm2,model.GpMC2b),
               custom.model.names=c("Raw Score C","Grand Mean C", "School Mean C","Raw+School Mean","School mean C & mean"))