# Edps 589
# Fall 2021
# c.j.anderson
#
# Titanic data and multiple logistic regression
#
#
######################################################
# Setup                                              #
######################################################
#
library(vcd)
library(ResourceSelection)
library(pROC)               
library(DescTools)

library(rms)       # an alternative package that does logistic regression
                   # Will be especially useful for ordinal data.  For now
				   # I am using it to get Rsq.

#setwd("C:\\Users\\cja\\Documents\\Dropbox\\edps 589\\5 logistic regression")
#setwd("C:\\Users\\cja\\Dropbox\\edps 589\\5 logistic regression")
setwd("D:\\Dropbox\\edps 589\\5 logistic regression")


#####################################################
#  Read in data                                     #
#####################################################

t <- read.csv(file="titanic_data_R.csv", header=TRUE,sep=",")
names(t)
head(t)
dim(t)


#####################################################
# Some descriptive statistics                       #
#####################################################

N <- length(t$pclass)   
table(t$survived)
table(t$pclass)
table(t$sex)
mean(t$age)
summary(t$age)

t$pclass <- as.factor(t$pclass)

plot(t$age, jitter(t$survived, .1), type="p", pch=19, main="Titanic: Survived by Age (& Lowess)", xlab="Age", ylab="Survived (=1)")
smooth <- loess(survived ~ age, data=t)
   j <- order(t$age)	                                
   lines(t$age[j], smooth$fitted[j], col="cyan")
   
female <- t[which(t$sex=="female"),]
male <- t[which(t$sex=="male"),]
  
plot(female$age, jitter(female$survived, .1), type="p", main="Titanic: Survived by Age & Gender", xlab="Age", ylab="Survived (=1)", col="red")
lines(male$age, jitter(male$survived, .1), type="p")
smooth <- loess(survived ~ age, data=female)
   j <- order(female$age)	                                
   lines(female$age[j], smooth$fitted[j], col="red")
smooth <- loess(survived ~ age, data=male)
   j <- order(male$age)	                                
   lines(male$age[j], smooth$fitted[j], col="black")
legend("right",
        legend=c("female", "male"),
		col=c("red","black"), lty=c(1,1,2), lwd=2)


####################################################
# Models                                           #
####################################################

# I will use default R coding (i.e., dummy coding with 1st value
#    the reference category)

t$pclass <- as.factor(t$pclass)

# saturated
summary( model0 <- glm(survived ~ pclass + sex + age + pclass*sex + pclass*age + sex*age + pclass*sex*age,data=t,family=binomial) )

# all 2-way 
summary( model1 <- glm(survived ~ pclass + sex + age + pclass*sex + pclass*age + sex*age, data=t, family=binomial) )
hoslem.test(model1$y, fitted(model1), g=10)
anova(model1,model0)

( model1.alt <- lrm(survived ~ pclass + sex + age + pclass*sex + pclass*age + sex*age,data=t) )

# PS,PA
summary( model2 <- glm(survived ~ pclass + sex + age + pclass*sex + pclass*age, data=t, family=binomial) )
hoslem.test(model2$y, fitted(model2), g=10)
anova(model2,model1)

( model2.alt <- lrm(survived ~ pclass + sex + age + pclass*sex + pclass*age,data=t) )

# PS,SA
summary( model3 <- glm(survived ~ pclass + sex + age + pclass*sex + sex*age, data=t, family=binomial) )
hoslem.test(model3$y, fitted(model3), g=10)
anova(model3,model1)


( model3.alt <- lrm(survived ~ pclass + sex + age + pclass*sex + sex*age,data=t) )

# PA,SA
summary( model4 <- glm(survived ~ pclass + sex + age + pclass*age + sex*age, data=t, family=binomial) )
hoslem.test(model4$y, fitted(model4), g=10)
anova(model4,model1)

( model4.alt <- lrm(survived ~ pclass + sex + age + pclass*age + sex*age ,data=t) )

# P,S,A
summary( model5 <- glm(survived ~ pclass + sex + age , data=t, family=binomial) )

( model5.alt <- lrm(survived ~ pclass + sex + age,data=t) )


#### Graph the best model:  model1
# sub-set data and fitted values,

t$fitted <- model1$fitted
female1 <- t[ (which(t$sex=="female" & t$pclass=="1")),]
female2 <- t[ (which(t$sex=="female" & t$pclass=="2")),]
female3 <- t[ (which(t$sex=="female" & t$pclass=="3")),]
male1 <- t[ (which(t$sex=="male" & t$pclass=="1")),]
male2 <- t[ (which(t$sex=="male" & t$pclass=="2")),]
male3 <- t[ (which(t$sex=="male" & t$pclass=="3")),]

j <- order(female1$age)
plot(female1$age[j],female1$fitted[j], 
    type='l',
	lwd=2,
	ylim=c(0,1),
	xlim=c(1,80),
	col="blue",
	xlab="Age of Passenger",
	ylab="Fitted Probability of Survival",
	main="Titanic: survived~pclass+sex+age+pclass*sex+pclass*age+sex*age"
	)
j <- order(female2$age)	                                
lines(female2$age[j],female2$fitted[j],col="darkblue",lwd=2)
j <- order(female3$age)	                                
lines(female3$age[j],female3$fitted[j],col="cyan",lwd=2)
j <- order(male1$age)	                                
lines(male1$age[j],male1$fitted[j],col="orange",lwd=2)
j <- order(male2$age)	                                
lines(male2$age[j],male2$fitted[j],col="red",lwd=2)
j <- order(male3$age)	                                
lines(male3$age[j],male3$fitted[j],col="green",lwd=2)
legend(57.5,.92,legend=c("female 1st", "female 2nd","female 3rd","male 1st", "male 2nd", "male 3rd"), 
       col=c("blue","darkblue","cyan","orange","red","green"), lty=1,lwd=2, cex=1)
	   

###############################
# Concordance index           #
###############################
# --- these are a bit different from lrm...but very close
(Cstat(model1,resp=y))
(Cstat(model2,resp=y))
(Cstat(model3,resp=y))
(Cstat(model4,resp=y))
(Cstat(model5,resp=y))

roc.data5 <-roc(model5$y , model5$fitted.values,ci=T)
names(roc.data5)
plot((1-roc.data5$specificities),roc.data5$sensitivities,
      type='l',
	  lwd=2,
	  col="blue",
     main="ROC for Models 5, 1 and chance",
	 xlab="1-sensitivity (false positives)",
	 ylab="Specificity (true positives)"
	 )
roc.data1 <- roc(model1$y, model1$fitted.values,ci=T)
lines((1-roc.data1$specificities), roc.data1$sensitivities, col="pink", lwd=2)
text(.1,.94,"model 5: c=.84",col="blue")
text(.1,.99,"model 1: c=.86",col="pink")
text(.1,.89,"chance:  c=.50",col="black")
lines(c(0,1),c(0,1), lty=2)
legend("bottomright",
        legend=c("pclass + sex + age", "pclass + sex + age + pclass*sex + pclass*age + sex*age","chance"),
		col=c("blue","pink","black"), lty=c(1,1,2), lwd=2)


########################
# 12 a Hosmer-Lemeshow #
########################
hoslem.test(model1$y, fitted(model1), g=10)

########################
# 12b qq plot residuals--- looks different from SAS#
########################
# Pearson (adjusted standardized residuals
resid <- rstandard(model1, pearson=TRUE)
qqnorm(resid,
       main="QQ plot of Pearson  Residuals",
	   col="blue")


#################
# Influence     #
#################
inf <- influence.measures(model1)
names(inf)

summary(inf)    # will pick out potentially influencial ones

# --- requires car package

influencePlot(model1)

influenceIndexPlot(model1, vars=c("Cook","Studentized","hat"), id.n=10)

# To see what is here:
head(inf$infmat)