# Edpsy 589
# Fall 2021
# C.J. Anderson
#
# Lecture 5 (logistic regression): High School and Beyond
#
# Read in data in csv format
# Fit the following models:
#    linear probability
#    logit
#    probit
# Various nice graphs
#
# HSB data (N=600)
#
#   id     = 'ID number'
#   sex    = 'Gender: 1=male, 2=female'
#   ses    = 'Socio-Economic Status'
#   sctyp  = 'School Type: 1=public, 2=private'
#   hsp    = 'High School Program'
#   locus  = 'Locus of Control'
#   concpt = 'Self Concept'
#   mot    = 'Motivation'
#   car    = 'Career Choice'
#   rdg    = 'Reading T-Score'
#   wrtg   = 'Writing T-Score'
#   math   = 'Math T-Score'
#   sci    = 'Science T-Score'
#   civ    = 'Civics T-Score'
#
# Variables I created for lecture:
#   Group   = group based on the sum of the 5 achievement variables
#   Program = 1 for academic, 0 for general and vo-tech
#   xmean   = mean of the 5 achievement variables
#   ...and some miscellaneous ones as needed

library(data.table)        # maybe I don't need this
library(dplyr)
library(MASS)              # needed for negative binomial glm
library(vcd)
library(vcdExtra)
library(ggplot2)
library(ResourceSelection) # Hosmer-Lemeshow statistic
library(pROC)              # ROC curve/data
library(DescTools)         # concordance index
library(car)               # influence diagnostics

########################################################
#               Set up and read in data                #
########################################################
#setwd("D:\\Dropbox\\edps 589\\5 logistic regression")
setwd("C:\\Users\\cja\\Dropbox\\edps 589\\5 logistic regression")

hsb <- read.csv(file="hsb_data.csv", header=TRUE, sep=",")
names(hsb)
head(hsb)

# achieve in the data set is the sum of the 5 achievement tests...turn this into a mean
hsb$achieve <- hsb$achieve/5

###################################################
#  Descriptive statistics and first look at data  #
###################################################
mean(hsb$achieve)
sd(hsb$achieve)
length(hsb$achieve)        # sample size
table(hsb$program)         # 1 is academic

# A look at the data:
#  -- ungrouped data with a loess (smooth) fit plotted
lw1 <- loess(program ~ achieve, data=hsb)   # fit loess to data

plot(hsb$achieve, jitter(hsb$program, 0.1),
     main="Whether in Academic Program vs Mean Achievement",
     xlab="Mean of 5 Achievement Tests",
     ylab="Probability in Academic Program",
     cex=1.75,
     col="black",
     pch=20)
j <- order(hsb$achieve)
lines(hsb$achieve[j], lw1$fitted[j], col="cyan", lwd=3)
text(64, .7, "Fitted Loess Curve", col="cyan")

######################################################
#                     Fit models                     #
######################################################
# Linear probability model
summary( linear.mod <- glm(program ~ achieve, data=hsb,
                           family=binomial(link="identity")) )
# R objects to this:
#   `Error: no valid set of coefficients has been found: please supply starting values'

# Logit model
summary( logit.mod <- glm(program ~ achieve, data=hsb,
                          family=binomial(link="logit")) )

hsb$logit.fit <- fitted.values(logit.mod)
j <- order(hsb$achieve)
lines(hsb$achieve[j], hsb$logit.fit[j], col="blue", lwd=3)
text(64, .77, "Logit", col="blue")
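# The header above lists a probit model among the models to fit, but the
# script never actually fits one. A minimal sketch (not part of the original
# script): the same glm() call with a probit link. Probit coefficients are
# on a different scale, roughly the logit coefficients divided by 1.6-1.8,
# while the fitted probabilities are usually nearly identical.
summary( probit.mod <- glm(program ~ achieve, data=hsb,
                           family=binomial(link="probit")) )
# Compare fitted probabilities from the logit and probit fits
head(cbind(logit=fitted(logit.mod), probit=fitted(probit.mod)))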
####################################################################
# Group data as done in lecture notes -- better than I did in SAS  #
# (requires dplyr)                                                 #
#                                                                  #
# Basically makes the table on page 19 of the notes -- the         #
# grouping is not identical but it is close.                       #
####################################################################
levels <- c(-Inf, 40, 45, 47, 49, 51, 53, 55, 57, 60, 65, Inf)
labels <- c("<40","40-45","45-47","47-49","49-51","51-53","53-55","55-57","57-60","60-65",">65")
hsb$grp.achieve <- cut(hsb$achieve, levels, labels=labels)

# - mean per group
(grp.mean <- aggregate(hsb$achieve, by=list(hsb$grp.achieve), FUN=mean))

# - number of cases per group
(grp.n <- table(hsb$grp.achieve))

# - number of non-academic and academic students per group
(data.table <- table(hsb$grp.achieve, hsb$program))

# - observed proportion
(obs.p <- data.table[,2]/(data.table[,1] + data.table[,2]))

# - sum of predicted probabilities per group
prob.sum <- aggregate(hsb$logit.fit, by=list(hsb$grp.achieve), FUN=sum)
(prob.sum <- prob.sum[,2]/grp.n)

# - predicted probability from the fitted equation applied to the group means
(p.eq <- exp(-7.05477 + 0.13691*grp.mean[,2])/(1 + exp(-7.05477 + 0.13691*grp.mean[,2])))

# - predicted number per group
(pred.n <- p.eq*grp.n)
pred.n <- matrix(pred.n, nrow=11, ncol=1)

# - put into a table like the one in the lecture notes
summary.p19 <- cbind(grp.n, grp.mean[,2], data.table[,2], obs.p, prob.sum, p.eq, pred.n)
summary.p19 <- as.data.frame(summary.p19)
names(summary.p19) <- c("# cases","mean","# academic","obs p","sum pi","equation pi","pred freq")
# ---- This doesn't exactly match the lecture notes, which show SAS
#      results; the cutting was a bit different between R and SAS.
summary.p19

## Now use just math (because that's what I did in the lecture notes)
(summary(logit.mod <- glm(program ~ math, data=hsb, family=binomial)))
# or achieve
(summary(logit.mod <- glm(program ~ achieve, data=hsb, family=binomial)))

#########################################
#            Interpretation             #
#########################################
# - linear interpretation at mean achieve = 65
(p <- (subset(hsb, hsb$achieve==65)$logit.fit))
exp(-7.0548 + .1369*(65))/(1 + exp(-7.0548 + .1369*(65)))

# slope of the tangent line equals
(slope <- logit.mod$coefficients[2] * p * (1-p))

####
nhsb <- hsb[order(hsb$achieve),]
plot(nhsb$achieve, nhsb$logit.fit, type='l', col="blue",
     main="Linear Interpretation at Achievement = 65",
     xlab="Mean of 5 Achievement Tests",
     ylab="Probability in Academic given Mean Achievement",
     ylim=c(0,1))
abline(h=p)
abline(v=65)
# line tangent at (65, p)
xlin <- c(55, 65, 70)
ylin <- c(-slope*10 + p, p, slope*5 + p)
lines(xlin, ylin, col="red")
text(40, .9, "p = .8635")
text(63, 0.05, "x = 65")
text(50, 0.7, "slope = 0.0161")

#########################################################
# Confidence intervals for coefficients and odds ratios #
#########################################################
# 95% CI for beta
(lower <- logit.mod$coefficients[2] - 1.96*0.01327)
(upper <- logit.mod$coefficients[2] + 1.96*0.01327)
# 95% confidence interval for exp(beta) = odds ratio
exp(lower)
exp(upper)

#########################################################
#                 Tests of coefficients                 #
#########################################################
# Wald test = z^2, which has the same p-value as the z test
(wald <- (logit.mod$coefficients[2]/.01327)^2)
1 - pchisq(wald, 1)

# likelihood ratio test
(lr <- logit.mod$null.deviance - logit.mod$deviance)
1 - pchisq(lr, 1)

# or an easier way
anova(logit.mod, test="LRT")
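# The interval and Wald test above hand-copy the standard error (0.01327)
# from the printed output. A sketch (not part of the original script) that
# pulls the estimate and SE directly from the fitted model, so the numbers
# stay in sync if the model changes:
(b  <- coef(logit.mod)["achieve"])
(se <- sqrt(diag(vcov(logit.mod)))["achieve"])
b + c(-1, 1)*1.96*se           # Wald 95% CI for beta
exp(b + c(-1, 1)*1.96*se)      # Wald 95% CI for exp(beta) = odds ratio
# Profile-likelihood intervals (confint.glm, via MASS) are generally
# preferred to Wald intervals for logistic regression:
confint(logit.mod)
exp(confint(logit.mod))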
########################################################
#      "Tests" of model goodness-of-fit to data        #
########################################################
#### Fit to individuals then group -- we already did the grouping
pred.acd <- summary.p19[,7]
pred.not <- summary.p19[,1] - pred.acd
number.attend <- summary.p19[,3]
number.not <- summary.p19[,1] - number.attend

Xsq <- (number.attend - pred.acd)**2 / pred.acd + (number.not - pred.not)**2 / pred.not
(Xsq <- sum(Xsq))

Gsq <- number.attend * log(number.attend/pred.acd) + number.not * log(number.not/pred.not)
(Gsq <- 2*sum(Gsq))

#### Group the data then fit the model (to assess whether it is a good model)
grp.data <- cbind(number.attend, number.not, summary.p19[,2])
grp.data <- as.data.frame(grp.data)
(grp.data$ncases <- grp.data$number.attend + grp.data$number.not)
grp.data$p <- grp.data$number.attend/grp.data$ncases

(logit.mod2 <- glm(p ~ V3, weights=ncases, data=grp.data, family=binomial))
1 - pchisq(logit.mod2$deviance, logit.mod2$df.residual)
anova(logit.mod2, test="LRT")
# - Fit statistics are a bit different from what is in the lecture notes
#   because I used slightly different groups.
# - Note that the fitted probabilities are very similar.

# - observed proportions
grp.data$pobs <- grp.data$number.attend/grp.data$ncases
# - fitted probabilities
grp.data$phat <- logit.mod2$fitted
# - fitted counts
grp.data$predFreq <- grp.data$phat*grp.data$ncases

# graph comparing observed proportions and fitted probabilities
plot(grp.data$V3, grp.data$pobs, type='p', pch=1, col="blue", lwd=2,
     xlab="Achievement",
     ylab="Proportions and Predictions",
     main="Logit Model Fit to Grouped Data")
lines(grp.data$V3, grp.data$phat, col="red")
smooth <- loess(grp.data$pobs ~ grp.data$V3)
lines(grp.data$V3, smooth$fit, col="purple")
legend("topleft",
       legend=c("Observed proportions","Fitted probabilities","Loess"),
       col=c("blue","red","purple"), lty=1, lwd=2, cex=1)

#---- CI for fitted probabilities
(min.ach <- min(hsb$achieve))
(max.ach <- max(hsb$achieve))
achieve.range <- seq(from=32, to=70, by=.01)

logit.modt <- predict(logit.mod, newdata=data.frame(achieve=achieve.range),
                      type="response", se.fit=TRUE)
names(logit.modt)
upper <- logit.modt$fit + 1.96*logit.modt$se.fit
lower <- logit.modt$fit - 1.96*logit.modt$se.fit

# ---- ungrouped data with a loess (smooth) fit plotted
lw1 <- loess(program ~ achieve, data=hsb)   # fit loess to data
plot(hsb$achieve, fitted(logit.mod), type='n',
     main="Data (Loess) & Logit Fitted w/ 95% Bands",
     xlab="Mean of 5 Achievement Tests",
     ylab="Probability in Academic Program",
     cex=1.75,
     pch=20)
polygon(c(achieve.range, rev(achieve.range)), c(upper, rev(lower)),
        col=rgb(0,0,1,0.2), border=NA)
j <- order(hsb$achieve)
lines(hsb$achieve[j], lw1$fitted[j], col="green", lwd=3)
lines(hsb$achieve[j], fitted(logit.mod)[j], col="blue")
legend("topleft", legend=c("Loess","Logit","95% CI"),
       col=c("green","blue","azure"), lty=1, cex=1)

#####################################
#          Hosmer-Lemeshow          #
#   Uses ResourceSelection package  #
#####################################
hoslem.test(logit.mod$y, fitted(logit.mod), g=10)

#####################################
#     Concordance index and ROC     #
#  Uses DescTools and pROC packages #
#####################################
# - decision tables
pred.decision <- ifelse(hsb$logit.fit >= .5, 1, 0)
table(hsb$program)
table(pred.decision)
addmargins(cross <- table(pred.decision, hsb$program))
addmargins(prop.table(cross))

# - Sensitivity
(sensitivity <- 227/308)
# - Specificity
(specificity <- 198/292)
# - proportion correct
.3300 + .37833
# - or
(198 + 227)/600
# - or
sensitivity*(308/600) + specificity*(292/600)

# Concordance index
# --- uses package DescTools
Cstat(logit.mod, resp=hsb$program)

# Plot of ROC curve
# Uses package pROC
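# The script loads pROC and announces the ROC plot, but the plotting code is
# missing here. A minimal sketch using pROC (object name roc.out is mine,
# not from the original script):
roc.out <- roc(response=hsb$program, predictor=fitted(logit.mod))
plot(roc.out, main="ROC Curve for Logit Model")
auc(roc.out)   # area under the curve; should match the concordance index above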
#######################
#  qqplot residuals   #
#######################
# - what comes directly from glm (working residuals)
qqnorm(logit.mod$residuals)

# Pearson standardized residuals --- seem to be about the same
resid <- rstandard(logit.mod, type="pearson")
rstud <- rstudent(logit.mod)
rdev  <- rstandard(logit.mod, type="deviance")

qqnorm(resid, main="QQ Plot of Pearson Standardized Residuals")
ylin <- c(-2, 3)
lines(ylin, ylin)

qqnorm(rstud, main="QQ Plot of Studentized Residuals")
lines(ylin, ylin)

qqnorm(rdev, main="QQ Plot of Deviance Standardized Residuals")
lines(ylin, ylin)
summary(rdev)

#################
#   Influence   #
#################
inf <- influence.measures(logit.mod)
names(inf)
summary(inf)    # will pick out potentially influential cases

# --- requires the car package
influencePlot(logit.mod)
influenceIndexPlot(logit.mod, vars=c("Cook","Studentized","hat"), id.n=5)

# To see what is in the influence object:
head(inf$infmat)
# --- others are possible but require a bit more work.

########################################
# What can you get (i.e., 'extract')?  #
########################################
names(logit.mod)

# ---- Values in the lecture notes (from SAS) are a bit different...
#      probably different case numbering.
hsb[307,]
# -- predicted probability = pi(x)
logit.mod$fitted.values[307]
# -- linear predictor = x'beta
logit.mod$linear.predictors[307]

###############################
#  CI for pi is a bit harder  #
###############################
summary(hsb$achieve)
achievement.range <- seq(from=32, to=70, by=1)

logit.modt <- predict(logit.mod, newdata=data.frame(achieve=achievement.range),
                      type="response", se.fit=TRUE)
names(logit.modt)
upper <- logit.modt$fit + 1.96*logit.modt$se.fit
lower <- logit.modt$fit - 1.96*logit.modt$se.fit

lw1 <- loess(program ~ achieve, data=hsb)   # fit loess to represent smoothed data

# -- set up plotting frame without anything in it
plot(hsb$achieve, logit.mod$fitted, type='n',
     main="Logit Model Fitted Values and 95% Bands",
     xlab="Mean of 5 Achievement Tests",
     ylab="Probability in Academic Program",
     cex=1.75,
     pch=20)
polygon(c(achievement.range, rev(achievement.range)), c(upper, rev(lower)),
        col=rgb(0,0,1,0.2), border=NA)
j <- order(hsb$achieve)
lines(hsb$achieve[j], fitted(logit.mod)[j], col="blue")
lines(hsb$achieve[j], lw1$fitted[j], col="green", lwd=3)
legend("topleft", legend=c("Logit","95% CI","Loess/smoothed data"),
       col=c("blue","azure","green"), lty=1, cex=1)
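# The bands above use +/- 1.96*SE on the probability (response) scale, which
# can stray outside [0,1] near the ends of the achievement range. A common
# alternative (a sketch, not part of the original script): compute the
# interval on the logit (link) scale and back-transform with the inverse
# link, which keeps the band inside [0,1].
pred.link <- predict(logit.mod, newdata=data.frame(achieve=achievement.range),
                     type="link", se.fit=TRUE)
upper.link <- plogis(pred.link$fit + 1.96*pred.link$se.fit)
lower.link <- plogis(pred.link$fit - 1.96*pred.link$se.fit)
# add the link-scale bands to the plot above for comparison
lines(achievement.range, upper.link, col="blue", lty=2)
lines(achievement.range, lower.link, col="blue", lty=2)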