# Edps 589 # Fall 2021 # c.j.anderson # # Titanic data and multiple logistic regression # # ###################################################### # Setup # ###################################################### # library(vcd) library(ResourceSelection) library(pROC) library(DescTools) library(rms) # an alternative package that does logistic regression # Will be especially useful for ordinal data. For now # I am using it to get Rsq. #setwd("C:\\Users\\cja\\Documents\\Dropbox\\edps 589\\5 logistic regression") #setwd("C:\\Users\\cja\\Dropbox\\edps 589\\5 logistic regression") setwd("D:\\Dropbox\\edps 589\\5 logistic regression") ##################################################### # Read in data # ##################################################### t <- read.csv(file="titanic_data_R.csv", header=TRUE,sep=",") names(t) head(t) dim(t) ##################################################### # Some descriptive statistics # ##################################################### N <- length(t$pclass) table(t$survived) table(t$pclass) table(t$sex) mean(t$age) summary(t$age) t$pclass <- as.factor(t$pclass) plot(t$age, jitter(t$survived, .1), type="p", pch=19, main="Titanic: Survived by Age (& Lowess)", xlab="Age", ylab="Survived (=1)") smooth <- loess(survived ~ age, data=t) j <- order(t$age) lines(t$age[j], smooth$fitted[j], col="cyan") female <- t[which(t$sex=="female"),] male <- t[which(t$sex=="male"),] plot(female$age, jitter(female$survived, .1), type="p", main="Titanic: Survived by Age & Gender", xlab="Age", ylab="Survived (=1)", col="red") lines(male$age, jitter(male$survived, .1), type="p") smooth <- loess(survived ~ age, data=female) j <- order(female$age) lines(female$age[j], smooth$fitted[j], col="red") smooth <- loess(survived ~ age, data=male) j <- order(male$age) lines(male$age[j], smooth$fitted[j], col="black") legend("right", legend=c("female", "male"), col=c("red","black"), lty=c(1,1,2), lwd=2) #################################################### # Models # #################################################### # I will use default R coding (i.e., dummy coding with 1st value # the reference category) t$pclass <- as.factor(t$pclass) # saturated summary( model0 <- glm(survived ~ pclass + sex + age + pclass*sex + pclass*age + sex*age + pclass*sex*age,data=t,family=binomial) ) # all 2-way summary( model1 <- glm(survived ~ pclass + sex + age + pclass*sex + pclass*age + sex*age, data=t, family=binomial) ) hoslem.test(model1$y, fitted(model1), g=10) anova(model1,model0) ( model1.alt <- lrm(survived ~ pclass + sex + age + pclass*sex + pclass*age + sex*age,data=t) ) # PS,PA summary( model2 <- glm(survived ~ pclass + sex + age + pclass*sex + pclass*age, data=t, family=binomial) ) hoslem.test(model2$y, fitted(model2), g=10) anova(model2,model1) ( model2.alt <- lrm(survived ~ pclass + sex + age + pclass*sex + pclass*age,data=t) ) # PS,SA summary( model3 <- glm(survived ~ pclass + sex + age + pclass*sex + sex*age, data=t, family=binomial) ) hoslem.test(model3$y, fitted(model3), g=10) anova(model3,model1) ( model3.alt <- lrm(survived ~ pclass + sex + age + pclass*sex + sex*age,data=t) ) # PA,SA summary( model4 <- glm(survived ~ pclass + sex + age + pclass*age + sex*age, data=t, family=binomial) ) hoslem.test(model4$y, fitted(model4), g=10) anova(model4,model1) ( model4.alt <- lrm(survived ~ pclass + sex + age + pclass*age + sex*age ,data=t) ) # P,S,A summary( model5 <- glm(survived ~ pclass + sex + age , data=t, family=binomial) ) ( model5.alt <- lrm(survived ~ pclass + sex + age,data=t) ) #### Graph the best model: model1 # sub-set data and fitted values, t$fitted <- model1$fitted female1 <- t[ (which(t$sex=="female" & t$pclass=="1")),] female2 <- t[ (which(t$sex=="female" & t$pclass=="2")),] female3 <- t[ (which(t$sex=="female" & t$pclass=="3")),] male1 <- t[ (which(t$sex=="male" & t$pclass=="1")),] male2 <- t[ (which(t$sex=="male" & t$pclass=="2")),] male3 <- t[ (which(t$sex=="male" & t$pclass=="3")),] j <- order(female1$age) plot(female1$age[j],female1$fitted[j], type='l', lwd=2, ylim=c(0,1), xlim=c(1,80), col="blue", xlab="Age of Passenger", ylab="Fitted Probability of Survival", main="Titanic: survived~pclass+sex+age+pclass*sex+pclass*age+sex*age" ) j <- order(female2$age) lines(female2$age[j],female2$fitted[j],col="darkblue",lwd=2) j <- order(female3$age) lines(female3$age[j],female3$fitted[j],col="cyan",lwd=2) j <- order(male1$age) lines(male1$age[j],male1$fitted[j],col="orange",lwd=2) j <- order(male2$age) lines(male2$age[j],male2$fitted[j],col="red",lwd=2) j <- order(male3$age) lines(male3$age[j],male3$fitted[j],col="green",lwd=2) legend(57.5,.92,legend=c("female 1st", "female 2nd","female 3rd","male 1st", "male 2nd", "male 3rd"), col=c("blue","darkblue","cyan","orange","red","green"), lty=1,lwd=2, cex=1) ############################### # Concordance index # ############################### # --- these are a bit different from lrm...but very close (Cstat(model1,resp=y)) (Cstat(model2,resp=y)) (Cstat(model3,resp=y)) (Cstat(model4,resp=y)) (Cstat(model5,resp=y)) roc.data5 <-roc(model5$y , model5$fitted.values,ci=T) names(roc.data5) plot((1-roc.data5$specificities),roc.data5$sensitivities, type='l', lwd=2, col="blue", main="ROC for Models 5, 1 and chance", xlab="1-sensitivity (false positives)", ylab="Specificity (true positives)" ) roc.data1 <- roc(model1$y, model1$fitted.values,ci=T) lines((1-roc.data1$specificities), roc.data1$sensitivities, col="pink", lwd=2) text(.1,.94,"model 5: c=.84",col="blue") text(.1,.99,"model 1: c=.86",col="pink") text(.1,.89,"chance: c=.50",col="black") lines(c(0,1),c(0,1), lty=2) legend("bottomright", legend=c("pclass + sex + age", "pclass + sex + age + pclass*sex + pclass*age + sex*age","chance"), col=c("blue","pink","black"), lty=c(1,1,2), lwd=2) ######################## # 12 a Hosmer-Lemeshow # ######################## hoslem.test(model1$y, fitted(model1), g=10) ######################## # 12b qq plot residuals--- looks different from SAS# ######################## # Pearson (adjusted standardized residuals resid <- rstandard(model1, pearson=TRUE) qqnorm(resid, main="QQ plot of Pearson Residuals", col="blue") ################# # Influence # ################# inf <- influence.measures(model1) names(inf) summary(inf) # will pick out potentially influencial ones # --- requires car package influencePlot(model1) influenceIndexPlot(model1, vars=c("Cook","Studentized","hat"), id.n=10) # To see what is here: head(inf$infmat)