#.libPaths("C:/Program Files/R/R-4.1.1/library")

Set up

These lines are needed to make this run on my machine; they will be dropped when this is put into the master set of examples.

library(rstan)
## Loading required package: StanHeaders
## Loading required package: ggplot2
## rstan (Version 2.21.2, GitRev: 2e1f913d3ca3)
## For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores()).
## To avoid recompilation of unchanged Stan programs, we recommend calling
## rstan_options(auto_write = TRUE)
## Do not specify '-march=native' in 'LOCAL_CPPFLAGS' or a Makevars file
library(loo)
## This is loo version 2.4.1
## - Online documentation and vignettes at mc-stan.org/loo
## - As of v2.0.0 loo defaults to 1 core but we recommend using as many as possible. Use the 'cores' argument or set options(mc.cores = NUM_CORES) for an entire session.
## - Windows 10 users: loo may be very slow if 'mc.cores' is set in your .Rprofile file (see https://github.com/stan-dev/loo/issues/94).
## 
## Attaching package: 'loo'
## The following object is masked from 'package:rstan':
## 
##     loo
library(brms)
## Loading required package: Rcpp
## Loading 'brms' package (version 2.16.1). Useful instructions
## can be found by typing help('brms'). A more detailed introduction
## to the package is available through vignette('brms_overview').
## 
## Attaching package: 'brms'
## The following object is masked from 'package:rstan':
## 
##     loo
## The following object is masked from 'package:stats':
## 
##     ar
library(coda)
## 
## Attaching package: 'coda'
## The following object is masked from 'package:rstan':
## 
##     traceplot
setwd("D:/Dropbox/edps 590BAY/Lectures/10 Ham_Stan_brms")

nels <- read.table("school23_data.txt",header=TRUE)
names(nels)
##  [1] "school"   "student"  "sex"      "race"     "homew"    "schtype" 
##  [7] "ses"      "pared"    "math"     "classtr"  "schsize"  "urban"   
## [13] "geo"      "minority" "ratio"
# sort by schools (may speed things up for large data sets):
nels <- nels[order(nels$school,nels$student) , ]


options(mc.cores=parallel::detectCores())
rstan_options(auto_write = TRUE)

Data

We need the number of schools and a consecutive school index, so create school.id:

school <- unique(nels$school)                           # unique school codes
N <- length(school)                                     # number of schools
school.int <- as.data.frame(cbind(school, seq_len(N)))  # map school code -> 1:N
names(school.int) <- c("school", "school.id")
nels <- merge(nels, school.int, by="school")            # add school.id to the data

# total number of students
n <- length(nels$math)                    

Random Intercept (null/empty HLM)

brm

hlm1 <- brm(math ~ 1 + (1 | school.id),
            data=nels,
            warmup=2000,
            iter=4000,
            chains=4,
            cores=4,
            seed=11121
            )
## Compiling Stan program...
## Start sampling

Stan data and code

The corresponding Stan data and code can be extracted from the fitted model:

dataList <- standata(hlm1)

hlm1.stan <- stancode(hlm1)
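If you want to look at what was extracted (a quick aside I am adding, not part of the original notes), you can inspect the structure of the data list and print the generated Stan program:

str(dataList)   # structure of the data list passed to Stan
hlm1.stan       # printing the object shows the generated Stan program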

This provides trace plots and densities:

plot(hlm1)             

If you want the diagnostics from coda (the ones we used with Gibbs sampling), you need to extract the samples from brm as an mcmc object. Note that as.mcmc is deprecated, but I haven’t found the function that can be used instead; one workaround is sketched after the next chunk.

hlm1.mcmc <- as.mcmc(hlm1)
## Warning: as.mcmc.brmsfit is deprecated and will eventually be removed.
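One workaround that avoids as.mcmc (a sketch I am adding, using only the as.array method for brmsfit objects and coda's constructors; hlm1.mcmc2 is just an illustrative name) is to build the mcmc.list yourself:

post <- as.array(hlm1)   # draws: iterations x chains x parameters
hlm1.mcmc2 <- coda::mcmc.list(
  lapply(seq_len(dim(post)[2]), function(ch) coda::mcmc(post[, ch, ]))
)
# hlm1.mcmc2 should work with the same coda functions used on hlm1.mcmc below.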
traceplot(hlm1.mcmc)       

densplot(hlm1.mcmc)

gelman.diag(hlm1.mcmc)
## Potential scale reduction factors:
## 
##                           Point est. Upper C.I.
## b_Intercept                        1       1.01
## sd_school.id__Intercept            1       1.01
## sigma                              1       1.00
## r_school.id[1,Intercept]           1       1.01
## r_school.id[2,Intercept]           1       1.00
## r_school.id[3,Intercept]           1       1.00
## r_school.id[4,Intercept]           1       1.00
## r_school.id[5,Intercept]           1       1.00
## r_school.id[6,Intercept]           1       1.00
## r_school.id[7,Intercept]           1       1.00
## r_school.id[8,Intercept]           1       1.00
## r_school.id[9,Intercept]           1       1.00
## r_school.id[10,Intercept]          1       1.00
## r_school.id[11,Intercept]          1       1.00
## r_school.id[12,Intercept]          1       1.00
## r_school.id[13,Intercept]          1       1.01
## r_school.id[14,Intercept]          1       1.00
## r_school.id[15,Intercept]          1       1.00
## r_school.id[16,Intercept]          1       1.00
## r_school.id[17,Intercept]          1       1.00
## r_school.id[18,Intercept]          1       1.01
## r_school.id[19,Intercept]          1       1.00
## r_school.id[20,Intercept]          1       1.00
## r_school.id[21,Intercept]          1       1.00
## r_school.id[22,Intercept]          1       1.00
## r_school.id[23,Intercept]          1       1.00
## lp__                               1       1.00
## 
## Multivariate psrf
## 
## 1
gelman.plot(hlm1.mcmc)

autocorr.plot(hlm1.mcmc)

geweke.diag(hlm1.mcmc)
## [[1]]
## 
## Fraction in 1st window = 0.1
## Fraction in 2nd window = 0.5 
## 
##               b_Intercept   sd_school.id__Intercept                     sigma 
##                   0.10854                  -0.05424                   1.04189 
##  r_school.id[1,Intercept]  r_school.id[2,Intercept]  r_school.id[3,Intercept] 
##                  -0.27439                  -0.16852                   1.18729 
##  r_school.id[4,Intercept]  r_school.id[5,Intercept]  r_school.id[6,Intercept] 
##                   0.68336                  -1.06509                   0.30441 
##  r_school.id[7,Intercept]  r_school.id[8,Intercept]  r_school.id[9,Intercept] 
##                  -0.10863                   0.02425                  -0.92654 
## r_school.id[10,Intercept] r_school.id[11,Intercept] r_school.id[12,Intercept] 
##                   0.43256                   0.70673                  -0.49497 
## r_school.id[13,Intercept] r_school.id[14,Intercept] r_school.id[15,Intercept] 
##                  -0.23715                  -1.19675                  -0.59860 
## r_school.id[16,Intercept] r_school.id[17,Intercept] r_school.id[18,Intercept] 
##                  -0.81785                   1.69402                  -0.10737 
## r_school.id[19,Intercept] r_school.id[20,Intercept] r_school.id[21,Intercept] 
##                  -0.04788                  -0.42454                  -0.34229 
## r_school.id[22,Intercept] r_school.id[23,Intercept]                      lp__ 
##                  -0.22318                   0.30522                   1.00040 
## 
## 
## [[2]]
## 
## Fraction in 1st window = 0.1
## Fraction in 2nd window = 0.5 
## 
##               b_Intercept   sd_school.id__Intercept                     sigma 
##                   -1.0902                   -1.3315                   -0.7543 
##  r_school.id[1,Intercept]  r_school.id[2,Intercept]  r_school.id[3,Intercept] 
##                    0.8718                   -0.7431                    1.7296 
##  r_school.id[4,Intercept]  r_school.id[5,Intercept]  r_school.id[6,Intercept] 
##                    1.1987                    0.6886                   -0.4700 
##  r_school.id[7,Intercept]  r_school.id[8,Intercept]  r_school.id[9,Intercept] 
##                    1.1250                    1.0406                    1.4821 
## r_school.id[10,Intercept] r_school.id[11,Intercept] r_school.id[12,Intercept] 
##                    1.6837                    1.5428                    0.6244 
## r_school.id[13,Intercept] r_school.id[14,Intercept] r_school.id[15,Intercept] 
##                    1.7808                    1.2711                    1.4200 
## r_school.id[16,Intercept] r_school.id[17,Intercept] r_school.id[18,Intercept] 
##                    0.3766                    1.5801                    1.6146 
## r_school.id[19,Intercept] r_school.id[20,Intercept] r_school.id[21,Intercept] 
##                    1.2005                    0.9612                    0.8120 
## r_school.id[22,Intercept] r_school.id[23,Intercept]                      lp__ 
##                    1.6317                    1.3010                   -0.6127 
## 
## 
## [[3]]
## 
## Fraction in 1st window = 0.1
## Fraction in 2nd window = 0.5 
## 
##               b_Intercept   sd_school.id__Intercept                     sigma 
##                    1.4669                   -0.6414                   -3.2995 
##  r_school.id[1,Intercept]  r_school.id[2,Intercept]  r_school.id[3,Intercept] 
##                   -0.7993                   -0.5237                   -1.4105 
##  r_school.id[4,Intercept]  r_school.id[5,Intercept]  r_school.id[6,Intercept] 
##                   -1.0622                   -1.0344                   -1.2764 
##  r_school.id[7,Intercept]  r_school.id[8,Intercept]  r_school.id[9,Intercept] 
##                   -0.2900                   -0.6783                   -1.4330 
## r_school.id[10,Intercept] r_school.id[11,Intercept] r_school.id[12,Intercept] 
##                   -0.9485                   -0.7540                   -1.4864 
## r_school.id[13,Intercept] r_school.id[14,Intercept] r_school.id[15,Intercept] 
##                   -1.1929                    0.2303                   -0.8430 
## r_school.id[16,Intercept] r_school.id[17,Intercept] r_school.id[18,Intercept] 
##                   -1.1772                   -0.8660                   -0.9336 
## r_school.id[19,Intercept] r_school.id[20,Intercept] r_school.id[21,Intercept] 
##                   -0.6953                   -0.9655                   -0.8315 
## r_school.id[22,Intercept] r_school.id[23,Intercept]                      lp__ 
##                   -0.3003                   -0.7386                    0.2180 
## 
## 
## [[4]]
## 
## Fraction in 1st window = 0.1
## Fraction in 2nd window = 0.5 
## 
##               b_Intercept   sd_school.id__Intercept                     sigma 
##                   0.58021                  -0.56919                  -0.33500 
##  r_school.id[1,Intercept]  r_school.id[2,Intercept]  r_school.id[3,Intercept] 
##                  -0.57214                  -0.40371                  -0.88240 
##  r_school.id[4,Intercept]  r_school.id[5,Intercept]  r_school.id[6,Intercept] 
##                   0.56548                  -0.44539                  -0.17710 
##  r_school.id[7,Intercept]  r_school.id[8,Intercept]  r_school.id[9,Intercept] 
##                  -0.80998                  -0.03684                   0.30853 
## r_school.id[10,Intercept] r_school.id[11,Intercept] r_school.id[12,Intercept] 
##                  -0.11912                  -0.15905                  -0.22563 
## r_school.id[13,Intercept] r_school.id[14,Intercept] r_school.id[15,Intercept] 
##                   0.06420                  -0.37986                  -0.55414 
## r_school.id[16,Intercept] r_school.id[17,Intercept] r_school.id[18,Intercept] 
##                  -0.21614                  -0.19762                  -0.66882 
## r_school.id[19,Intercept] r_school.id[20,Intercept] r_school.id[21,Intercept] 
##                  -0.66693                  -0.25080                  -0.20310 
## r_school.id[22,Intercept] r_school.id[23,Intercept]                      lp__ 
##                  -0.73856                  -0.38457                  -0.40697
effectiveSize(hlm1.mcmc)
##               b_Intercept   sd_school.id__Intercept                     sigma 
##                  1719.398                  2157.094                 14075.964 
##  r_school.id[1,Intercept]  r_school.id[2,Intercept]  r_school.id[3,Intercept] 
##                  3199.767                  7765.179                  9031.354 
##  r_school.id[4,Intercept]  r_school.id[5,Intercept]  r_school.id[6,Intercept] 
##                  4287.016                  4553.169                  5889.439 
##  r_school.id[7,Intercept]  r_school.id[8,Intercept]  r_school.id[9,Intercept] 
##                  4267.457                  5182.562                  4650.450 
## r_school.id[10,Intercept] r_school.id[11,Intercept] r_school.id[12,Intercept] 
##                  4491.417                  4358.061                  4976.168 
## r_school.id[13,Intercept] r_school.id[14,Intercept] r_school.id[15,Intercept] 
##                  4700.320                  5547.942                  4365.282 
## r_school.id[16,Intercept] r_school.id[17,Intercept] r_school.id[18,Intercept] 
##                  4461.206                  5198.408                  2919.780 
## r_school.id[19,Intercept] r_school.id[20,Intercept] r_school.id[21,Intercept] 
##                  4600.983                  5149.036                  3835.855 
## r_school.id[22,Intercept] r_school.id[23,Intercept]                      lp__ 
##                  4460.820                  6603.583                  1702.233

And if all looks good, examine the model summary:

summary(hlm1)
##  Family: gaussian 
##   Links: mu = identity; sigma = identity 
## Formula: math ~ 1 + (1 | school.id) 
##    Data: nels (Number of observations: 519) 
##   Draws: 4 chains, each with iter = 4000; warmup = 2000; thin = 1;
##          total post-warmup draws = 8000
## 
## Group-Level Effects: 
## ~school.id (Number of levels: 23) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     5.40      0.99     3.80     7.69 1.00     2057     2991
## 
## Population-Level Effects: 
##           Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept    50.77      1.19    48.41    53.16 1.00     1732     2638
## 
## Family Specific Parameters: 
##       Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma     9.03      0.29     8.48     9.61 1.00    13779     5974
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
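As a worked example of using the posterior draws (something I am adding here, not part of the original notes): for the empty model the intraclass correlation can be computed draw by draw from sd(Intercept) and sigma. The column names below are the ones that appeared in the coda output above.

draws <- as.data.frame(hlm1)   # one row per post-warmup draw
icc <- draws[["sd_school.id__Intercept"]]^2 /
  (draws[["sd_school.id__Intercept"]]^2 + draws[["sigma"]]^2)
mean(icc)
quantile(icc, c(0.025, 0.975))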

Random Intercept and Slope

hlm2 <- brm(math ~ 1 + homew + ses + (1 + homew| school.id),
            data=nels,
            warmup=2000,
            iter=4000,
            chains=4,
            cores=4,
            seed=11121
            )
## Compiling Stan program...
## Start sampling
plot(hlm2)              

summary(hlm2)
##  Family: gaussian 
##   Links: mu = identity; sigma = identity 
## Formula: math ~ 1 + homew + ses + (1 + homew | school.id) 
##    Data: nels (Number of observations: 519) 
##   Draws: 4 chains, each with iter = 4000; warmup = 2000; thin = 1;
##          total post-warmup draws = 8000
## 
## Group-Level Effects: 
## ~school.id (Number of levels: 23) 
##                      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## sd(Intercept)            7.41      1.37     5.18    10.57 1.00     1546
## sd(homew)                4.02      0.77     2.76     5.75 1.00     1400
## cor(Intercept,homew)    -0.83      0.08    -0.94    -0.62 1.00     1779
##                      Tail_ESS
## sd(Intercept)            2746
## sd(homew)                2516
## cor(Intercept,homew)     3374
## 
## Population-Level Effects: 
##           Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept    47.01      1.70    43.65    50.23 1.00     1535     2879
## homew         1.83      0.91    -0.00     3.64 1.00     1389     2463
## ses           2.76      0.51     1.77     3.76 1.00     6906     6471
## 
## Family Specific Parameters: 
##       Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma     7.19      0.23     6.75     7.67 1.00     8430     5980
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
plot(conditional_effects(hlm2), ask = FALSE)

#or
# plot(conditional_effects(hlm2), points=TRUE, ask = FALSE)

pp_check(hlm2)  # shows dens_overlay plot by default
## Using 10 posterior draws for ppc type 'dens_overlay' by default.

There are many possible posterior predictive checks; I suggest checking the brms manual. A few of them are shown below (not run here).

pp_check(hlm2, type = "error_hist", ndraws = 11)
pp_check(hlm2, type = "scatter_avg", ndraws = 100)
pp_check(hlm2, type = "stat_2d")
pp_check(hlm2, type = "loo_pit")

Level 2: Multiple Predictors, Factors and Cross-level Interactions

brm handles factors like lm or glm (i.e., dummy codes).  We need to declare the variables as factors.

nels$urban <- as.factor(nels$urban)
nels$schtype <- as.factor(nels$schtype)
nels$classtr <- as.factor(nels$classtr)

Then, just as above,

hlm3 <- brm(math ~ 1 + homew + ses +  urban + ses*urban + (1 + homew| school.id),
            data=nels,
            warmup=2000,
            iter=4000,
            chains=4,
            cores=4,
            seed=11121
            )
## Compiling Stan program...
## Start sampling
plot(hlm3)              # trace plots & density

summary(hlm3)
##  Family: gaussian 
##   Links: mu = identity; sigma = identity 
## Formula: math ~ 1 + homew + ses + urban + ses * urban + (1 + homew | school.id) 
##    Data: nels (Number of observations: 519) 
##   Draws: 4 chains, each with iter = 4000; warmup = 2000; thin = 1;
##          total post-warmup draws = 8000
## 
## Group-Level Effects: 
## ~school.id (Number of levels: 23) 
##                      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## sd(Intercept)            7.63      1.37     5.34    10.75 1.00     2607
## sd(homew)                3.92      0.71     2.75     5.57 1.00     2385
## cor(Intercept,homew)    -0.81      0.09    -0.93    -0.58 1.00     2599
##                      Tail_ESS
## sd(Intercept)            4302
## sd(homew)                3713
## cor(Intercept,homew)     4497
## 
## Population-Level Effects: 
##            Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept     48.22      2.19    43.92    52.52 1.00     2484     3616
## homew          1.86      0.89     0.10     3.65 1.00     2388     3632
## ses            1.43      0.79    -0.14     2.98 1.00     4691     5137
## urban2        -2.90      2.47    -7.70     2.03 1.00     2763     3779
## urban3        -1.03      2.37    -5.70     3.70 1.00     2606     3448
## ses:urban2     2.14      1.31    -0.42     4.77 1.00     5664     6167
## ses:urban3     2.61      1.20     0.20     4.93 1.00     6732     6231
## 
## Family Specific Parameters: 
##       Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma     7.15      0.23     6.71     7.63 1.00     9954     5841
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
plot(conditional_effects(hlm3), ask = FALSE)

pp_check(hlm3)  # shows dens_overlay plot by default
## Using 10 posterior draws for ppc type 'dens_overlay' by default.

From the Stan functions reference:

real normal_lccdf(reals y | reals mu, reals sigma)

The log of the complementary cumulative normal distribution of y given location mu and scale sigma; normal_lccdf will overflow to 0 for (y − μ)/σ below −37.5 and underflow to −∞ for (y − μ)/σ above 8.25; see the Stan documentation for discussion of Phi_approx as an alternative.

Specifying QR Decomposition

By default centering is used. If you want QR decomposition (which is especially good for correlated predictors), this can be requested using brmsformula. I am including this because we talked about it for Stan.

The model formula can be specified separately and then put into the brm call. There are a number of options and two that I added here are center (which is TRUE by default) and QR decomposition.

# Options for the formula
fm3 <- brmsformula(math ~ 1 + homew + ses +  urban + ses*urban + (1 + homew| school.id),
          center=TRUE,
          decomp="QR"
          )

The rest is repeating what we did above, except using fm3 as the formula.

hlm3.alt <- brm(fm3,
            data=nels,
            warmup=2000,
            iter=4000,
            chains=4,
            cores=4,
            seed=11121
            )
## Compiling Stan program...
## Start sampling
plot(hlm3.alt)              # trace plots & density

summary(hlm3.alt)
##  Family: gaussian 
##   Links: mu = identity; sigma = identity 
## Formula: math ~ 1 + homew + ses + urban + ses * urban + (1 + homew | school.id) 
##    Data: nels (Number of observations: 519) 
##   Draws: 4 chains, each with iter = 4000; warmup = 2000; thin = 1;
##          total post-warmup draws = 8000
## 
## Group-Level Effects: 
## ~school.id (Number of levels: 23) 
##                      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## sd(Intercept)            7.63      1.35     5.38    10.61 1.00     2208
## sd(homew)                3.94      0.72     2.74     5.60 1.00     2028
## cor(Intercept,homew)    -0.81      0.09    -0.93    -0.57 1.00     1978
##                      Tail_ESS
## sd(Intercept)            3902
## sd(homew)                3904
## cor(Intercept,homew)     3346
## 
## Population-Level Effects: 
##            Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept     48.28      2.15    44.09    52.50 1.00     1912     3556
## homew          1.85      0.87     0.13     3.56 1.00     2082     3695
## ses            1.44      0.78    -0.07     2.99 1.00     5423     5439
## urban2        -3.00      2.49    -7.88     1.96 1.00     1826     3007
## urban3        -1.12      2.40    -5.89     3.64 1.00     1843     2913
## ses:urban2     2.13      1.30    -0.42     4.72 1.00     6703     6253
## ses:urban3     2.60      1.16     0.35     4.87 1.00     7134     6215
## 
## Family Specific Parameters: 
##       Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma     7.15      0.24     6.70     7.64 1.00     8693     6292
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
plot(conditional_effects(hlm3.alt), ask = FALSE)

Comparing Models

Criteria for comparing models, ordered roughly from worst to best, are the Bayes factor, DIC, WAIC, LOO, and PSIS-LOO.

The criterion “waic” (widely applicable information criterion) is asymptotically equivalent to loo (leave-one-out cross-validation), and some work has shown that loo is preferred for model comparison because it is more robust and comes with useful diagnostics. For these data and models, R throws warnings when I ask for “waic” and suggests using loo instead.
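For reference, this is how WAIC would be requested (a sketch, not run here; with these models it just produces those warnings):

hlm2 <- add_criterion(hlm2, criterion = "waic")
hlm2$criteria$waic
# or directly:
waic(hlm2)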

loo is noisy, and a suggested solution is Pareto smoothed importance sampling LOO, or PSIS-LOO (Vehtari, Gelman & Gabry, “Practical Bayesian model evaluation using leave-one-out cross-validation and WAIC”, http://www.stat.columbia.edu/~gelman/research/unpublished/loo_stan.pdf). Since I didn’t get this at first, I spell it out here: the commands below compute PSIS-LOO for each of our 3 models and then compare the values.

For more information, a good starting place is https://mc-stan.org/loo/reference/loo-glossary.html

Smaller values of looic are better (just like other information criteria), and for elpd larger is better. Note that looic = −2 × elpd_loo; for example, for hlm1 below, elpd_loo = −1889.8 and looic = −2 × (−1889.8) = 3779.6.

hlm1 <- add_criterion(hlm1, criterion="loo")
hlm1$criteria$loo
## 
## Computed from 8000 by 519 log-likelihood matrix
## 
##          Estimate   SE
## elpd_loo  -1889.8 13.7
## p_loo        22.0  1.3
## looic      3779.6 27.3
## ------
## Monte Carlo SE of elpd_loo is 0.0.
## 
## All Pareto k estimates are good (k < 0.5).
## See help('pareto-k-diagnostic') for details.
hlm2 <- add_criterion(hlm2, criterion="loo")
## Warning: Found 2 observations with a pareto_k > 0.7 in model 'hlm2'. It is
## recommended to set 'moment_match = TRUE' in order to perform moment matching for
## problematic observations.
hlm2$criteria$loo
## 
## Computed from 8000 by 519 log-likelihood matrix
## 
##          Estimate   SE
## elpd_loo  -1779.9 14.6
## p_loo        36.0  2.3
## looic      3559.9 29.2
## ------
## Monte Carlo SE of elpd_loo is NA.
## 
## Pareto k diagnostic values:
##                          Count Pct.    Min. n_eff
## (-Inf, 0.5]   (good)     514   99.0%   1423      
##  (0.5, 0.7]   (ok)         3    0.6%   1943      
##    (0.7, 1]   (bad)        2    0.4%   397       
##    (1, Inf)   (very bad)   0    0.0%   <NA>      
## See help('pareto-k-diagnostic') for details.
hlm3 <- add_criterion(hlm3, criterion="loo")
hlm3$criteria$loo
## 
## Computed from 8000 by 519 log-likelihood matrix
## 
##          Estimate   SE
## elpd_loo  -1778.8 14.7
## p_loo        39.0  2.5
## looic      3557.6 29.3
## ------
## Monte Carlo SE of elpd_loo is 0.1.
## 
## Pareto k diagnostic values:
##                          Count Pct.    Min. n_eff
## (-Inf, 0.5]   (good)     512   98.7%   1504      
##  (0.5, 0.7]   (ok)         7    1.3%   354       
##    (0.7, 1]   (bad)        0    0.0%   <NA>      
##    (1, Inf)   (very bad)   0    0.0%   <NA>      
## 
## All Pareto k estimates are ok (k < 0.7).
## See help('pareto-k-diagnostic') for details.

If you want to compare the models, this is a better way to do it:

loo(hlm1, hlm2, hlm3, compare=TRUE, moment_match=TRUE, save_psis=TRUE)
## Output of model 'hlm1':
## 
## Computed from 8000 by 519 log-likelihood matrix
## 
##          Estimate   SE
## elpd_loo  -1889.8 13.7
## p_loo        22.0  1.3
## looic      3779.6 27.3
## ------
## Monte Carlo SE of elpd_loo is 0.0.
## 
## All Pareto k estimates are good (k < 0.5).
## See help('pareto-k-diagnostic') for details.
## 
## Output of model 'hlm2':
## 
## Computed from 8000 by 519 log-likelihood matrix
## 
##          Estimate   SE
## elpd_loo  -1779.9 14.6
## p_loo        36.0  2.3
## looic      3559.9 29.2
## ------
## Monte Carlo SE of elpd_loo is NA.
## 
## Pareto k diagnostic values:
##                          Count Pct.    Min. n_eff
## (-Inf, 0.5]   (good)     514   99.0%   1423      
##  (0.5, 0.7]   (ok)         3    0.6%   1943      
##    (0.7, 1]   (bad)        2    0.4%   397       
##    (1, Inf)   (very bad)   0    0.0%   <NA>      
## See help('pareto-k-diagnostic') for details.
## 
## Output of model 'hlm3':
## 
## Computed from 8000 by 519 log-likelihood matrix
## 
##          Estimate   SE
## elpd_loo  -1778.8 14.7
## p_loo        39.0  2.5
## looic      3557.6 29.3
## ------
## Monte Carlo SE of elpd_loo is 0.1.
## 
## Pareto k diagnostic values:
##                          Count Pct.    Min. n_eff
## (-Inf, 0.5]   (good)     512   98.7%   1504      
##  (0.5, 0.7]   (ok)         7    1.3%   354       
##    (0.7, 1]   (bad)        0    0.0%   <NA>      
##    (1, Inf)   (very bad)   0    0.0%   <NA>      
## 
## All Pareto k estimates are ok (k < 0.7).
## See help('pareto-k-diagnostic') for details.
## 
## Model comparisons:
##      elpd_diff se_diff
## hlm3    0.0       0.0 
## hlm2   -1.2       2.6 
## hlm1 -111.0      12.7
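An equivalent way to get just the comparison table, using the criteria already stored by add_criterion() (a short sketch I am adding; loo_compare is the brms/loo comparison function):

loo_compare(hlm1, hlm2, hlm3, criterion = "loo")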

Note that hlm2 and hlm3 are essentially indistinguishable (the elpd difference of −1.2 is small relative to its standard error of 2.6), while hlm1 fits much worse than either.

Changing Stan code

Although most of the time you can modify your models within brms, you might want to modify the Stan code and run it using rstan (e.g., for a simulation study where only the data change, or as a starting point for your own model). This is also a good way to learn Stan coding.

For the data input to Stan by brms,

data_1 <- standata(hlm1)

and for the model code,

stan_code <- stancode(hlm1)
stan_code
## // generated with brms 2.16.1
## functions {
## }
## data {
##   int<lower=1> N;  // total number of observations
##   vector[N] Y;  // response variable
##   // data for group-level effects of ID 1
##   int<lower=1> N_1;  // number of grouping levels
##   int<lower=1> M_1;  // number of coefficients per level
##   int<lower=1> J_1[N];  // grouping indicator per observation
##   // group-level predictor values
##   vector[N] Z_1_1;
##   int prior_only;  // should the likelihood be ignored?
## }
## transformed data {
## }
## parameters {
##   real Intercept;  // temporary intercept for centered predictors
##   real<lower=0> sigma;  // dispersion parameter
##   vector<lower=0>[M_1] sd_1;  // group-level standard deviations
##   vector[N_1] z_1[M_1];  // standardized group-level effects
## }
## transformed parameters {
##   vector[N_1] r_1_1;  // actual group-level effects
##   r_1_1 = (sd_1[1] * (z_1[1]));
## }
## model {
##   // likelihood including constants
##   if (!prior_only) {
##     // initialize linear predictor term
##     vector[N] mu = Intercept + rep_vector(0.0, N);
##     for (n in 1:N) {
##       // add more terms to the linear predictor
##       mu[n] += r_1_1[J_1[n]] * Z_1_1[n];
##     }
##     target += normal_lpdf(Y | mu, sigma);
##   }
##   // priors including constants
##   target += student_t_lpdf(Intercept | 3, 51, 13.3);
##   target += student_t_lpdf(sigma | 3, 0, 13.3)
##     - 1 * student_t_lccdf(0 | 3, 0, 13.3);
##   target += student_t_lpdf(sd_1 | 3, 0, 13.3)
##     - 1 * student_t_lccdf(0 | 3, 0, 13.3);
##   target += std_normal_lpdf(z_1[1]);
## }
## generated quantities {
##   // actual population-level intercept
##   real b_Intercept = Intercept;
## }
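If you wanted to run this generated (and possibly edited) program directly with rstan, a minimal sketch, mirroring the call used with new priors below, would be (hlm1.rstan0 is just an illustrative name):

hlm1.rstan0 <- stan(model_code = stan_code,
                    data = data_1,
                    iter = 4000,
                    chains = 4,
                    cores = 4)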

Change Priors

Even if you don’t want to look at the Stan code, you may want to know what priors were used to fit a model. For example,

prior_summary(hlm2)
##                   prior     class      coef     group resp dpar nlpar bound
##                  (flat)         b                                          
##                  (flat)         b     homew                                
##                  (flat)         b       ses                                
##  student_t(3, 51, 13.3) Intercept                                          
##    lkj_corr_cholesky(1)         L                                          
##    lkj_corr_cholesky(1)         L           school.id                      
##   student_t(3, 0, 13.3)        sd                                          
##   student_t(3, 0, 13.3)        sd           school.id                      
##   student_t(3, 0, 13.3)        sd     homew school.id                      
##   student_t(3, 0, 13.3)        sd Intercept school.id                      
##   student_t(3, 0, 13.3)     sigma                                          
##        source
##       default
##  (vectorized)
##  (vectorized)
##       default
##       default
##  (vectorized)
##       default
##  (vectorized)
##  (vectorized)
##  (vectorized)
##       default

Note that for sigma this is actually a half t-distribution (a t-distribution truncated at zero, since sigma must be positive). Other distributions can be used here. Gelman recommends a half-Cauchy distribution (a truncated Cauchy):

\[ f(y| \mu, \sigma) = \left\{ \begin{array}{ll} \frac{2}{\pi\sigma} \frac{1}{1+(y-\mu)^2/\sigma^2} & y \ge \mu \\ 0 & \mbox{otherwise} \end{array}\right. \]
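In brms a half-Cauchy prior on the residual standard deviation could be requested like this (a sketch I am adding; the scale of 5 is an arbitrary choice for illustration, and brms applies the truncation at zero automatically because sigma is bounded below):

prior_sigma <- prior(cauchy(0, 5), class = "sigma")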

How do you change your priors? You can define priors and ask brms to create the corresponding Stan code.

prior1 <- prior(normal(0, 5), class = "Intercept")
prior2 <- prior(cauchy(0, 1), class = "sd")
hlm1priors <- c(prior1, prior2)

hlm1.stan <- make_stancode(math ~ 1 + (1 | school.id), data=nels, prior=hlm1priors)

data1 <- standata(hlm1)
hlm1.rstan <- stan(model_code = hlm1.stan,
                   model_name = "Nels23: math ~ 1 + (1 |school.id)",
                   data = data1,
                   iter = 4000,
                   chains = 4,
                   cores = 4,
                   verbose = FALSE
                   )
## Warning: There were 318 divergent transitions after warmup. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## to find out why this is a problem and how to eliminate them.
## Warning: There were 2 chains where the estimated Bayesian Fraction of Missing Information was low. See
## http://mc-stan.org/misc/warnings.html#bfmi-low
## Warning: Examine the pairs() plot to diagnose sampling problems
## Warning: The largest R-hat is 1.25, indicating chains have not mixed.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#r-hat
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#bulk-ess
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#tail-ess

Based on the following diagnostics, my changes made things pretty bad: some of the chains are not mixing well, there are large autocorrelations, and the posteriors look strange.

stan_trace(hlm1.rstan)
## 'pars' not specified. Showing first 10 parameters by default.

stan_dens(hlm1.rstan)
## 'pars' not specified. Showing first 10 parameters by default.

stan_hist(hlm1.rstan)
## 'pars' not specified. Showing first 10 parameters by default.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

stan_ac(hlm1.rstan)
## 'pars' not specified. Showing first 10 parameters by default.

print(hlm1.rstan)
## Inference for Stan model: Nels23: math ~ 1 + (1 |school.id).
## 4 chains, each with iter=4000; warmup=2000; thin=1; 
## post-warmup draws per chain=2000, total post-warmup draws=8000.
## 
##                 mean se_mean    sd     2.5%      25%      50%      75%    97.5%
## Intercept      38.73    5.58 12.73    10.76    27.70    45.81    47.80    49.90
## sigma           9.04    0.01  0.29     8.50     8.84     9.03     9.23     9.61
## sd_1[1]        14.61    4.87 13.25     4.26     5.71     7.28    23.65    48.53
## z_1[1,1]        1.32    0.03  0.31     0.72     1.12     1.29     1.51     1.99
## z_1[1,2]        1.25    0.01  0.43     0.42     1.00     1.23     1.49     2.16
## z_1[1,3]        1.08    0.01  0.48     0.04     0.81     1.10     1.37     2.06
## z_1[1,4]        0.41    0.15  0.46    -0.53     0.08     0.45     0.80     1.09
## z_1[1,5]        0.10    0.21  0.58    -1.02    -0.33     0.08     0.65     0.99
## z_1[1,6]        0.99    0.01  0.33     0.29     0.79     1.01     1.20     1.65
## z_1[1,7]        0.59    0.11  0.40    -0.28     0.32     0.63     0.90     1.20
## z_1[1,8]       -0.28    0.29  0.76    -1.64    -0.88    -0.38     0.51     0.87
## z_1[1,9]        0.94    0.02  0.31     0.26     0.75     0.97     1.14     1.53
## z_1[1,10]       0.39    0.16  0.47    -0.59     0.05     0.44     0.79     1.10
## z_1[1,11]      -0.14    0.26  0.70    -1.40    -0.68    -0.22     0.56     0.92
## z_1[1,12]       0.56    0.12  0.41    -0.31     0.28     0.61     0.89     1.19
## z_1[1,13]       0.18    0.20  0.56    -0.91    -0.24     0.17     0.69     1.02
## z_1[1,14]       1.25    0.02  0.35     0.62     1.03     1.22     1.45     2.02
## z_1[1,15]       1.21    0.02  0.32     0.64     1.01     1.19     1.39     1.89
## z_1[1,16]       0.69    0.09  0.38    -0.17     0.45     0.75     0.97     1.28
## z_1[1,17]      -0.41    0.32  0.82    -1.84    -1.06    -0.55     0.47     0.82
## z_1[1,18]       2.11    0.19  0.63     0.97     1.54     2.17     2.58     3.25
## z_1[1,19]       0.54    0.12  0.42    -0.35     0.26     0.60     0.87     1.18
## z_1[1,20]       0.17    0.20  0.55    -0.92    -0.24     0.15     0.68     1.01
## z_1[1,21]       0.73    0.08  0.34    -0.04     0.53     0.79     0.98     1.28
## z_1[1,22]       0.34    0.16  0.49    -0.68    -0.02     0.38     0.77     1.07
## z_1[1,23]       1.08    0.01  0.36     0.36     0.87     1.09     1.29     1.81
## r_1_1[1]       17.26    5.66 12.97     4.96     7.99    10.38    28.36    45.54
## r_1_1[2]       17.10    5.93 13.91     2.31     7.04    10.46    28.85    47.71
## r_1_1[3]       15.67    6.08 13.90     0.21     5.46     9.49    27.02    46.49
## r_1_1[4]        9.63    5.59 12.83    -2.79     0.48     2.96    20.66    38.02
## r_1_1[5]        7.05    5.53 12.72    -5.29    -1.97     0.52    17.69    35.22
## r_1_1[6]       14.63    5.67 13.16     1.53     5.20     7.99    25.82    43.80
## r_1_1[7]       11.13    5.61 12.90    -1.48     1.92     4.51    22.06    39.55
## r_1_1[8]        3.76    5.44 12.56    -8.41    -5.17    -2.59    13.82    31.68
## r_1_1[9]       14.14    5.69 13.05     1.35     4.74     7.45    25.35    42.74
## r_1_1[10]       9.50    5.60 12.90    -3.15     0.28     2.93    20.18    38.29
## r_1_1[11]       5.03    5.52 12.65    -7.26    -3.98    -1.49    15.48    33.11
## r_1_1[12]      10.96    5.59 12.91    -1.65     1.73     4.34    21.99    39.23
## r_1_1[13]       7.70    5.54 12.78    -4.76    -1.39     1.13    18.35    35.97
## r_1_1[14]      16.87    5.78 13.39     3.46     7.24    10.18    27.72    45.93
## r_1_1[15]      16.42    5.70 13.18     3.57     7.00     9.59    27.66    45.25
## r_1_1[16]      12.02    5.65 13.00    -0.92     2.70     5.47    23.09    40.71
## r_1_1[17]       2.65    5.43 12.51    -9.58    -6.19    -3.70    12.85    30.48
## r_1_1[18]      23.72    5.67 12.99    11.61    14.43    16.76    35.01    52.12
## r_1_1[19]      10.76    5.57 12.87    -1.87     1.53     4.21    21.42    38.96
## r_1_1[20]       7.63    5.57 12.76    -4.86    -1.41     1.05    18.41    35.76
## r_1_1[21]      12.35    5.64 12.87    -0.21     3.16     5.75    23.58    40.65
## r_1_1[22]       9.05    5.56 12.81    -3.56    -0.15     2.56    19.86    37.16
## r_1_1[23]      15.46    5.76 13.32     1.94     5.79     8.97    27.08    44.32
## b_Intercept    38.73    5.58 12.73    10.76    27.70    45.81    47.80    49.90
## lp__        -1950.70    6.73 16.82 -1971.82 -1963.13 -1957.72 -1934.03 -1917.86
##             n_eff Rhat
## Intercept       5 1.39
## sigma         962 1.00
## sd_1[1]         7 1.30
## z_1[1,1]       89 1.04
## z_1[1,2]      844 1.01
## z_1[1,3]     1961 1.00
## z_1[1,4]        9 1.17
## z_1[1,5]        7 1.22
## z_1[1,6]      727 1.01
## z_1[1,7]       13 1.11
## z_1[1,8]        7 1.25
## z_1[1,9]      234 1.03
## z_1[1,10]       9 1.16
## z_1[1,11]       7 1.25
## z_1[1,12]      12 1.12
## z_1[1,13]       8 1.20
## z_1[1,14]     369 1.01
## z_1[1,15]     344 1.01
## z_1[1,16]      17 1.08
## z_1[1,17]       7 1.27
## z_1[1,18]      11 1.19
## z_1[1,19]      12 1.12
## z_1[1,20]       8 1.21
## z_1[1,21]      19 1.08
## z_1[1,22]       9 1.17
## z_1[1,23]    1305 1.00
## r_1_1[1]        5 1.38
## r_1_1[2]        5 1.36
## r_1_1[3]        5 1.37
## r_1_1[4]        5 1.38
## r_1_1[5]        5 1.38
## r_1_1[6]        5 1.37
## r_1_1[7]        5 1.38
## r_1_1[8]        5 1.37
## r_1_1[9]        5 1.38
## r_1_1[10]       5 1.37
## r_1_1[11]       5 1.38
## r_1_1[12]       5 1.37
## r_1_1[13]       5 1.37
## r_1_1[14]       5 1.37
## r_1_1[15]       5 1.37
## r_1_1[16]       5 1.38
## r_1_1[17]       5 1.37
## r_1_1[18]       5 1.38
## r_1_1[19]       5 1.37
## r_1_1[20]       5 1.38
## r_1_1[21]       5 1.38
## r_1_1[22]       5 1.37
## r_1_1[23]       5 1.37
## b_Intercept     5 1.39
## lp__            6 1.33
## 
## Samples were drawn using NUTS(diag_e) at Thu Nov 04 16:13:30 2021.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at 
## convergence, Rhat=1).
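Given the divergent transitions and low effective sample sizes reported above, one way to follow up (a sketch I am adding, not something done in these notes) is to look at pairs plots for the worst-behaved parameters, rerun with a stricter adapt_delta target, and reconsider whether a normal(0, 5) prior on the intercept is sensible when the observed math scores are centered near 50. The object name hlm1.rstan2 is just illustrative.

pairs(hlm1.rstan, pars = c("Intercept", "sd_1", "sigma"))

hlm1.rstan2 <- stan(model_code = hlm1.stan,
                    data = data1,
                    iter = 4000,
                    chains = 4,
                    cores = 4,
                    control = list(adapt_delta = 0.99))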