# Edpsy 590BAY
# Fall 2021
# C.J.Anderson
#
# Comparing 2 proportions:  US Presdients 1789-1926 vs 1928-2016
#
# Data file has missing values that are coded at 9999 (either uncontested 
#   or height is unknown
#
#
setwd("D:\\Dropbox\\edps 590BAY\\Lectures\\2 Proportion")

elections <- read.table(file="all_presidential_data.txt",header=T)


elections <- na.omit(elections)                      # delete the ones with data missing
                                                     # In the data set missing is indicated
                                                     # by NA

elections$oldnew <- ifelse(elections$year<1928,1,0)  # create a variable used to split the data

elections$oldnew <- as.factor(elections$oldnew)      # new variable needs to be a factor
head(elections)                                      # look at data

split.data <- split(elections,elections$oldnew)

new <- split.data[1]
new <- as.data.frame(new)
names(new) <- c("year","winner","wcm","loser","lcm","wintaller","winlefty","oldnew")

old <- split.data[2]
old <- as.data.frame(old)
names(old) <- c("year","winner","wcm","loser","lcm","wintaller","winlefty","oldnew")


# Find the posteriors
table(new$wintaller)
y.new <- 16            # looking at result of table
n.new <- nrow(new)

a.new <- y.new + 1          # posterior is beta with a.new and b.new
b.new <- n.new - y.new + 1
theta.new <- (a.new-1)/(a.new+b.new-2)   


table(old$wintaller)   # posterior is beta with a.old and b.old
y.old <- 15
n.old <- nrow(old)


a.old <- y.new + 1
b.old <- n.old - y.old +1
theta.old <- (a.old-1)/(a.old+b.old-2)   


# Monte carlo 
S= 1e4
theta.new <- rbeta(S,a.new, b.new)
theta.old <- rbeta(S,a.old, b.old)

mean(theta.new>theta.old)
cor(theta.new, theta.old)

# Take look at the simulated posterior distributions
par(mfrow=c(2,2))
hist(theta.new,  main="1e4 samples from posterior 1928-2016")
hist(theta.old, main="1e4 samples from posterior 1879-1924")
plot(theta.new, theta.old, main="scatter plot of simulated values",
       xlim=c(0,1), ylim=c(0,1))
lines(c(0,1), c(0,1), col="blue")

plot.new( )
text(.5,.5,"r(theta1,theta2)=.0112")
text(.49,.4,"Pr(theta1>theta2)=  .9369")
# Find proportion/probability of times that theta1 > theta2
mean(theta1>theta2)