# http://doingbayesiandataanalysis.blogspot.com/2012/06/mixture-of-normal-distributions.html # # Fitting a mixture to the anorexia data # # Mixture of Normal Distributions # In this post I show a simple illustration of a # mixture of normal distributions. For the examples, # we assume we have metric values that we suppose # are generated by a mixture of two different normal # distributions, which I'll call clusters. We don't # know which datum came from each cluster. Our goal # is to estimate the probability that each score came # from each of the two clusters, and the means and SD # of the normal distributions that describe the clusters. #The model specification (for JAGS): The assumes that # the clusters have the same standard deviation, but different means. # library(coda) library(rjags) library(runjags) library(jagsUI) setwd("D:/Dropbox/edps 590BAY/Lectures/5 Gibbs Sampling") df <- read.table("getting_what_paid_for_data.txt", header=TRUE) names(df) head(df) model ="{ # Likelihood: for( i in 1 : N ) { y[i] ~ dnorm( mu[i] , tau ) mu[i] <- muOfClust[ clust[i] ] clust[i] ~ dcat( pClust[1:Nclust] ) } # Prior: tau ~ dgamma( 0.01 , 0.01 ) for ( clustIdx in 1: Nclust ) { muOfClust[clustIdx] ~ dnorm( 0 , 1.0E-10 ) } pClust[1:Nclust] ~ ddirch( onesRepNclust ) } " # Data specification # At least one value needed to be put in each cluster y <- df$ave_tot # less typing N <- nrow(df) # sample size Nclust <- 2 # number of clusters clust <- rep(NA,N) clust[which.min(y)] <- 1 # put smallest into cluster 1 clust[which.max(y)] <- 2 # put largest into cluster 2 dataList <- list( y = df$ave_tot, N = N, Nclust = Nclust, clust=clust, onesRepNclust = rep(1,Nclust) ) #The model specification (for JAGS): The assumes that # the clusters have the same standard deviation, but # different means. mixmodel = "model{ # Likelihood: for( i in 1 : N ) { y[i] ~ dnorm( mu[i] , precision ) mu[i] <- muOfClust[ clust[i] ] clust[i] ~ dcat( pClust[1:Nclust] ) } # Prior: precision ~ dgamma( 0.01 , 0.01 ) for ( clustIdx in 1: Nclust ) { muOfClust[clustIdx] ~ dnorm( 0 , 1.0E-10 ) } pClust[1:Nclust] ~ ddirch( onesRepNclust ) } " writeLines(mixmodel, con="MixtureModel.txt") inmu <- c(-10, 15) inprec <- 1/var(y) inpClust <- c(.6, .4) initslist <- list(muOfClust = inmu, precision=inprec, pClust=inpClust) try1 <- jags.model(file="MixtureModel.txt", # compiles and intializes model data=dataList, inits=initslist, n.chains=4, n.adapt=2000) # iterations adaption phase Samples <- coda.samples(try1, variable.names=c("muOfClust","pClust","precision"), n.iter=20000) plot(Samples) options(scipen=999) round(summary(Samples) # What does posterior for data look like? ymin <- min(y) ymax <- max(y) sigma <- sqrt(1/.0008864) t <- summary(Samples) stats <- t$statistics mu1 <- stats[1,1] sd.mu1 <- stats[1,2] mu2 <- stats[2,1] sd.mu2 <- stats[2,2] p1 <- stats[3,1] sd.p1 <- stats[3,2] p2 <- stats[4,1] sd.p2 <- stats[4,2] precision <- stats[5,1] sigma <- 1/sqrt(precision) n <- length(y) replications <- 600 yrep <- matrix(NA, nrow=n, ncol=replications) for (s in 1:replications) { mean1 <- rnorm(1, mu1, sd.mu1) mean2 <- rnorm(1, mu2, sd.mu2) for (i in 1:n){ yrep[i,s] <- p1*mean1 + p2*mean2 + rnorm(1,0,sigma) } } yhats <- apply(yrep,2,"mean") hist(yhats) x <- seq(from=ymin, to=ymax, length.out=600) (yhat1 <- rnorm(x, mu1, sigma)) (yhat2 <- rnorm(x, mu2, sigma)) c1 <- rgb(173,216,230,max = 255, alpha = 80, names = "lt.blue") c2 <- rgb(255,192,203, max = 255, alpha = 80, names = "lt.pink") b <- min(c(yhat1,yhat2)) -.001 # Set the minimum for the breakpoints e <- max(c(yhat1,yhat2)) # Set the maximum for the breakpoints ax <- pretty(b:e, n = 20) # Make a neat vector for the breakpoints hg1 <- hist(yhat1, breaks = ax, plot = FALSE) # Save first histogram data hg2 <- hist(yhat2, breaks = ax, plot = FALSE) # Save first histogram data # Plot 1st histogram using a transparent color plot(hg1, col = c1, main="Fitted values for 2 clusters") plot(hg2, col = c2, add = TRUE) # Add 2nd histogram using different color # Plot data hist(df$ave_tot, breaks=ax) par(mfrow=c(1,2)) # Plot 1st histogram using a transparent color plot(hg1, col = c1, main="Histrograms of fitted values for 2 clusters") plot(hg2, col = c2, add = TRUE) # Add 2nd histogram using different color # Plot data hist(df$ave_tot, breaks=ax) den1 <- dnorm(x, mu1, sd.mu1) den2 <- dnorm(x, mu2, sd.mu2) plot(x,den1, type='l', main='Posterior densities of means') lines(x,den2, type='l') h =hist(y, breaks = 10, density = 10, col = "darkblue", xlab = "State Average SAT Total", ylab = "Frequency", main="Normal curve s Over histogram") den1 <- den1 * diff(h$mids[1:2]) * 100 lines(density(den1), col = "black", lwd = 2) #+++++++++++++++++++++++++++++++++++++++++\# mixmodel = "model{ # Likelihood: for( i in 1 : N ) { y[i] ~ dnorm( mu[i] , precision ) mu[i] <- muOfClust[ clust[i] ] clust[i] ~ dcat( pClust[1:Nclust] ) res[i] <- y[i] - mu[i] # residuals emp.new[i] <- dnorm( mu[i] , precision ) # New predictions res.new[i] <- emp.new[i] - mu[i] } # Prior: precision ~ dgamma( 0.01 , 0.01 ) for ( clustIdx in 1: Nclust ) { muOfClust[clustIdx] ~ dnorm( 0 , 1.0E-10 ) } pClust[1:Nclust] ~ ddirch( onesRepNclust ) # Derived parameters # fit <- sum(set[]) # sum of data residuals # fit.new <- sum(res.new[]) # sum of predicted residuals # fit.mean <- mean(emp.new[]) # mean of postierior predictions } " writeLines(mixmodel, con="MixtureModel.txt") inmu <- c(-10, 15) inprec <- 1/var(y) inpClust <- c(.6, .4) initslist <- list(muOfClust = inmu, precision=inprec, pClust=inpClust) try1 <- jags.model(file="MixtureModel.txt", # compiles and intializes model data=dataList, inits=initslist, n.chains=4, n.adapt=2000) # iterations adaption phase Samples <- coda.samples(try1, variable.names=c("muOfClust","pClust","precision","emp.new","res.new"), n.iter=2000)