Instructions

Grading scheme

Suggested readings

Required R packages


As always, load the required packages and change the seed to your last name.

library(TeachingDemos)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0     ✔ purrr   0.3.0
## ✔ tibble  2.0.1     ✔ dplyr   0.7.8
## ✔ tidyr   0.8.2     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(rstan)
## Loading required package: StanHeaders
## rstan (Version 2.18.2, GitRev: 2e1f913d3ca3)
## For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores()).
## To avoid recompilation of unchanged Stan programs, we recommend calling
## rstan_options(auto_write = TRUE)
##
## Attaching package: 'rstan'
## The following object is masked from 'package:tidyr':
##
##     extract
library(ggmcmc)
library(bridgesampling)
library(polspline)

# set cores to use to the total number of cores (minimally 4)
# NB: detectCores() is documented to return NA on some platforms; without
# na.rm = TRUE that NA would propagate through max() and silently disable
# parallel sampling.
options(mc.cores = max(parallel::detectCores(), 4, na.rm = TRUE))
# save a compiled version of the Stan model file
rstan_options(auto_write = TRUE)

lastname <- "YOURLASTNAME"

char2seed(lastname)
library(TeachingDemos)
library(tidyverse)
library(rstan)
library(ggmcmc)
library(bridgesampling)
library(polspline)

# set cores to use to the total number of cores (minimally 4)
# (same NA guard as above)
options(mc.cores = max(parallel::detectCores(), 4, na.rm = TRUE))
# save a compiled version of the Stan model file
rstan_options(auto_write = TRUE)


# seed the RNG deterministically from the last name so results are reproducible
lastname <- "bayes"

char2seed(lastname)

Here is some data from a forgetting experiment. t is the time point, and y is the proportion of people who correctly recalled the stimulus.

# Forgetting-curve data: proportion correctly recalled (y) at each
# retention interval (t, in time units used in the lecture).
y <- c(0.94, 0.77, 0.40, 0.26, 0.24, 0.16)
t <- c(1, 3, 6, 9, 12, 18)
# Convert proportions to counts of correct recalls out of N = 100 people.
obs <- 100 * y

forgetting_data <- tibble(
  obs = obs,
  N = 100,
  t = t
)

1. Implement models

Below is the code for the power model of forgetting we discussed in class: \[P(\text{recall} | t; c, d) = ct^{-d}\] where \(0 < c, d < 1.5\)

a. (1 point)

Implement an alternative model based on this exponential model for each participant:

\[P(\text{recall} | t; a, b) = a \exp(-bt)\] where \(0 < a, b < 1.5\)

# Power model of forgetting from the lecture: P(recall | t) = c * t^(-d),
# with c, d ~ Uniform(0, 1.5) priors and obs[i] ~ Binomial(100, theta[i]).
power_model_str <- "
// power model of forgetting data
data {
  int obs[6] ;
  real t[6] ;
}
parameters {
  real<lower=0,upper=1.5> c ;
  real<lower=0,upper=1.5> d ;
}
transformed parameters {
  real theta[6];
  for (i in 1:6) {
    theta[i] = c * t[i]^(-d) ;
  }
}
model {
  // increment target log score with
  // probability of current samples from prior
  target += uniform_lpdf(c | 0, 1.5);
  target += uniform_lpdf(d | 0, 1.5);
  // increment target log score with
  // likelihood of data given current parameter samples
  target += binomial_lpmf(obs | 100, theta);
}

"

# Template to be filled in by the student: exponential model
# P(recall | t) = a * exp(-b * t), with a, b ~ Uniform(0, 1.5).
exponential_model_str <- "
// exponential model of forgetting data
// STAN CODE HERE

"
# SOLUTION:

# (Solution) Power model of forgetting, identical to the version above;
# repeated here because the solution chunk re-defines it.
power_model_str <- "
// power model of forgetting data
data {
  int obs[6] ;
  real t[6] ;
}
parameters {
  real<lower=0,upper=1.5> c ;
  real<lower=0,upper=1.5> d ;
}
transformed parameters {
  real theta[6];
  for (i in 1:6) {
    theta[i] = c * t[i]^(-d) ;
  }
}
model {
  // increment target log score with
  // probability of current samples from prior
  target += uniform_lpdf(c | 0, 1.5);
  target += uniform_lpdf(d | 0, 1.5);
  // increment target log score with
  // likelihood of data given current parameter samples
  target += binomial_lpmf(obs | 100, theta);
}

"

# (Solution) Exponential model of forgetting: theta[i] = a * exp(-b * t[i]);
# same uniform priors and binomial likelihood structure as the power model.
exponential_model_str <- "
// exponential model of forgetting data
data {
  int obs[6] ;
  real t[6] ;
}
parameters {
  real<lower=0,upper=1.5> a ;
  real<lower=0,upper=1.5> b ;
}
transformed parameters {
  real theta[6];
  for (i in 1:6) {
    theta[i] = a * exp(-b * t[i]) ;
  }
}
model {
  // increment target log score with
  // probability of current samples from prior
  target += uniform_lpdf(a | 0, 1.5);
  target += uniform_lpdf(b | 0, 1.5);
  // increment target log score with
  // likelihood of data given current parameter samples
  target += binomial_lpmf(obs | 100, theta);
}

"

2. Parameter inference

a. (2 points)

For both models, complete the code below to draw samples from the posterior distribution. NB: you should use a large number of samples (as indicated) so that later bridge sampling is reliable. We use the control parameter here to fine-tune the inference for these models. Remember to set eval = TRUE.

# complete the code below
# NOTE: template code — intentionally incomplete (pars and data are left
# blank), so this chunk is not valid R until the gaps are filled; the
# completed version follows in the solution below.
stanfit_exponential <- stan(# specify model
                            iter = 50000,
                            warmup = 10000,
                            control = list(adapt_delta = 0.999),
                            pars = , # replace NA with parameters to inspect
                            data =  ## provide data
                            )

stanfit_power <- stan(# specify model
                      iter = 50000,
                      warmup = 10000,
                      control = list(adapt_delta = 0.999),
                      pars = , # replace NA with parameters to inspect
                      data =  ## provide data
                      )
# SOLUTION:

# Draw posterior samples for the exponential model. The large sample size
# (4 chains, 40k post-warmup draws each) keeps the later bridge-sampling
# estimates reliable; adapt_delta = 0.999 reduces divergent transitions.
stanfit_exponential <- stan(
  model_code = exponential_model_str,
  data = forgetting_data,
  pars = c("a", "b"),
  iter = 50000,
  warmup = 10000,
  control = list(adapt_delta = 0.999)
)

# Same sampler settings for the power model.
stanfit_power <- stan(
  model_code = power_model_str,
  data = forgetting_data,
  pars = c("c", "d"),
  iter = 50000,
  warmup = 10000,
  control = list(adapt_delta = 0.999)
)

b. (1 point)

Show a summary for each fit

# YOUR CODE HERE
# SOLUTION:

# Posterior summaries for both fits. All parameters have Rhat = 1 and large
# n_eff, indicating the chains converged and mixed well.
show(stanfit_exponential)
## Inference for Stan model: ebcc794d3a0629c00c6cbdfedb95d92f.
## 4 chains, each with iter=50000; warmup=10000; thin=1;
## post-warmup draws per chain=40000, total post-warmup draws=160000.
##
##        mean se_mean   sd   2.5%    25%    50%    75%  97.5% n_eff Rhat
## a      1.06       0 0.03   0.99   1.04   1.06   1.08   1.12 48108    1
## b      0.13       0 0.01   0.11   0.12   0.13   0.14   0.15 48505    1
## lp__ -23.78       0 1.00 -26.46 -24.16 -23.47 -23.07 -22.81 44400    1
##
## Samples were drawn using NUTS(diag_e) at Fri Mar  1 22:48:15 2019.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
show(stanfit_power)
## Inference for Stan model: 91ad2c5bbd7f83d5685ce9546dab87f1.
## 4 chains, each with iter=50000; warmup=10000; thin=1;
## post-warmup draws per chain=40000, total post-warmup draws=160000.
##
##        mean se_mean   sd   2.5%    25%    50%    75%  97.5% n_eff Rhat
## c      0.95       0 0.02   0.90   0.93   0.95   0.96   0.98 57181    1
## d      0.50       0 0.03   0.43   0.47   0.50   0.52   0.56 68283    1
## lp__ -30.69       0 1.00 -33.38 -31.08 -30.38 -29.98 -29.72 47769    1
##
## Samples were drawn using NUTS(diag_e) at Fri Mar  1 22:49:28 2019.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).

c. (1 point)

Make traceplots for the samples for both models

# YOUR CODE HERE
# SOLUTION:

# Traceplots for visual convergence checking: convert each fit to a tidy
# ggs object, then plot the chains per parameter.
ggs_traceplot(ggs(stanfit_exponential))

ggs_traceplot(ggs(stanfit_power))

d. (1 point)

Plot the density for posterior samples for both models

# YOUR CODE HERE
# SOLUTION:

# Marginal posterior densities for each model's parameters.
plot(stanfit_exponential, plotfun = "dens")

plot(stanfit_power, plotfun = "dens")

3. Bridge Sampling

a. (1 point)

Approximate the marginal likelihood of each model using the bridgesampling package (Hint: use bridgesampling::bridge_sampler() as we did in the lecture, and use option silent = T to suppress output in the markdown).

# YOUR CODE HERE
# SOLUTION:
# approximate marginal likelihoods with bridge sampling
# silent = TRUE suppresses the per-iteration progress output in the knitted
# document, as the task hint requests (the original used the unsafe `F`
# shorthand and left the output on).
bridge_exponential <- bridge_sampler(samples = stanfit_exponential, silent = TRUE)
## Warning: 46 of the 80000 log_prob() evaluations on the proposal draws
## produced -Inf/Inf.
bridge_power <- bridge_sampler(samples = stanfit_power, silent = TRUE)
## Warning: 225 of the 80000 log_prob() evaluations on the proposal draws
## produced -Inf/Inf.

b. (1 point)

Determine the error of the estimated marginal likelihoods using bridgesampling::error_measures.

# YOUR CODE HERE
# SOLUTION:
# get approximate error measures for marginal likelihood estimates;
# the percentage errors (~0.04-0.05%) show both estimates are very precise
error_measures(bridge_exponential)
## $re2
## [1] 1.42645e-07
##
## $cv
## [1] 0.0003776838
##
## $percentage
## [1] "0.0378%"
error_measures(bridge_power)
## $re2
## [1] 2.794152e-07
##
## $cv
## [1] 0.0005285974
##
## $percentage
## [1] "0.0529%"

c. (1 point)

The previous results only give you an estimate for the marginal likelihood of the data given each model. But you can use these estimates to compute the Bayes factor in favour of the exponential model.

# YOUR CODE HERE
# SOLUTION:
# approximate Bayes factor
# ratio of the bridge-sampled marginal likelihoods, exponential over power
bayes_factor(bridge_exponential, bridge_power)
## Estimated Bayes factor in favor of x1 over x2: 1220.39183

d. (1 point)

Interpret the Bayes factor. What is the value? What does it mean?

ANSWER:

SOLUTION: The Bayes factor is approximately 1220 in favor of the exponential model. This means that a rational agent should update their relative beliefs by a factor of about 1220 in favor of the exponential model, compared to the power model. This is massive empirical evidence in favor of the exponential model.

4. Savage-Dickey

Use the Savage-Dickey method to compute the Bayes factor in favor of the exponential model, when compared against a special case (properly nested model) of the exponential model where a = 1.

a. (1 point)

Fill in the required code:

# NB: we take only a subset of the samples because otherwise polspline will not work
post_samples <- filter(ggs(stanfit_exponential),
                       Parameter == "a" & Iteration <= 1000)$value

# fit a smooth log-spline density to the posterior samples of a,
# bounded to the prior's support [0, 1.5]
fit.posterior <- polspline::oldlogspline(post_samples,
                                         ubound=1.5, lbound=0)

# estimated posterior density at the nested value a = 1
posterior_a <- polspline::doldlogspline(1, fit.posterior)

# calculate the Bayes factor via the Savage-Dickey method
# YOUR CODE HERE
# SOLUTION:

# NB: use only a subset of the samples, because otherwise polspline will not work
exp_samples <- ggs(stanfit_exponential)
post_samples <- filter(exp_samples,
                       Parameter == "a" & Iteration <= 1000)$value

# fit a log-spline density to the posterior samples of a on the prior's support
fit.posterior <- polspline::oldlogspline(post_samples,
                                         ubound = 1.5, lbound = 0)

# height of the posterior distribution at a = 1 (blue)
posterior_a <- polspline::doldlogspline(1, fit.posterior)

# height of the prior distribution at a = 1 (red)
prior_a <- dunif(x = 1, min = 0, max = 1.5)

# plot the posterior density with prior and posterior heights at a = 1

filter(exp_samples, Parameter == "a") %>%
  ggplot(aes(x = value)) +
  geom_density() +
  geom_hline(yintercept = prior_a, linetype = "dashed") +
  geom_point(x = 1, y = posterior_a, color = "blue") +
  geom_point(x = 1, y = prior_a, color = "red")

# Savage-Dickey: the BF in favor of the nested model (a = 1) is the ratio of
# posterior to prior density at the nested value
savage_dickey_bf <- posterior_a / prior_a

paste0("Approximate BF: ",
       round(savage_dickey_bf, 3))
## [1] "Approximate BF: 3.114"

b. (1 point)

Interpret the result

Is this strong or mild evidence in favor of one of the two models?

ANSWER:

SOLUTION: A Bayes factor of about 3 is mild evidence for the nested model. As can be seen in the graph, after conditioning on the observed data, the posterior density at a = 1 is higher than the prior density at that point.

5. Posterior predictive distribution

We want to approximate the posterior predictive distribution. For this, we draw samples from the posterior, using basically the same scripts as above for posterior estimation. Additionally we generate a “sample imaginary prediction” of data that we’d expect to see in an exact repetition of the experiment, for each sampled vector of parameter values. To do so, we use the generated quantities block at the end of the Stan model code.

a. (2 points)

Complete the pseudo-code below for the exponential and the power model with actual Stan code which does that. (Hint: use command binomial_rng to take samples from a binomial distribution; check the Stan manual for its syntax.)

# Template: power model extended with a generated quantities block that draws
# one posterior predictive data set per posterior sample (solution below).
power_pred_model_str <- "
// Stan code here

// fill in the gaps and replace the pseudo-code
data {
 // as before
}
parameters {
 // as before
}
transformed parameters {
  // as before
}
model {
  // as before
}
## PSEUDO-CODE!
generated quantities {
 // fill in for yourself
 // DECLARE VARIABLE 'post_predict_sample' WITH SAME SHAPE AS THE DATA
 // post_predict_sample = SAMPLE FROM BINOMIAL DISTRIBUTION, WITH CURRENT PARAMETER ESTIMATES
}

"

# Same template for the exponential model.
exponential_pred_model_str <- "
// Stan code here

// fill in the gaps and replace the pseudo-code
data {
 // as before
}
parameters {
 // as before
}
transformed parameters {
  // as before
}
model {
  // as before
}
## PSEUDO-CODE!
generated quantities {
 // fill in for yourself
 // DECLARE VARIABLE 'post_predict_sample' WITH SAME SHAPE AS THE DATA
 //  post_predict_sample = SAMPLE FROM BINOMIAL DISTRIBUTION, WITH CURRENT PARAMETER ESTIMATES
}

"
# SOLUTION:

# (Solution) Power model plus a generated quantities block: binomial_rng
# draws one simulated data set per posterior draw, giving samples from the
# posterior predictive distribution. (Fixes the "incement" typo in the
# embedded Stan comment.)
power_pred_model_str <- "
// power model of forgetting data
data {
  int obs[6] ;
  real t[6] ;
}
parameters {
  real<lower=0,upper=1.5> c ;
  real<lower=0,upper=1.5> d ;
}
transformed parameters {
  real theta[6];
  for (i in 1:6) {
    theta[i] = c * t[i]^(-d) ;
  }
}
model {
  // increment target log score with
  // probability of current samples from prior
  target += uniform_lpdf(c | 0, 1.5);
  target += uniform_lpdf(d | 0, 1.5);
  // increment target log score with
  // likelihood of data given current parameter samples
  target += binomial_lpmf(obs | 100, theta);
}
generated quantities {
  int post_predict_sample[6];
  post_predict_sample = binomial_rng(100, theta);
}

"

# (Solution) Exponential model plus a generated quantities block for
# posterior predictive samples. (Fixes the "incermenten" typo in the
# embedded Stan comment.)
exponential_pred_model_str <- "
// exponential model of forgetting data
data {
  int obs[6] ;
  real t[6] ;
}
parameters {
  real<lower=0,upper=1.5> a ;
  real<lower=0,upper=1.5> b ;
}
transformed parameters {
  real theta[6];
  for (i in 1:6) {
    theta[i] = a * exp(-b * t[i]) ;
  }
}
model {
  // increment target log score with
  // probability of current samples from prior
  target += uniform_lpdf(a | 0, 1.5);
  target += uniform_lpdf(b | 0, 1.5);
  // increment target log score with
  // likelihood of data given current parameter samples
  target += binomial_lpmf(obs | 100, theta);
}
generated quantities {
  int post_predict_sample[6];
  post_predict_sample = binomial_rng(100, theta);
}


"

b. (2 points)

Plot the data and a visual posterior predictive check

Use the samples from variable post_predict_sample to generate plots for a visual posterior predictive check, which should be similar to what we had in the lecture. Use y limits of 0 to 100.

Hint: use geom_point() for the real data and geom_pointrange() for the predicted values and 95% credible intervals.

# YOUR CODE HERE
# SOLUTION:

# Fit the predictive version of the exponential model, keeping only the
# posterior predictive samples.
stanfit_exponential_pred <- stan(model_code = exponential_pred_model_str,
                            iter = 50000,
                            warmup = 10000,
                            control = list(adapt_delta = 0.999),
                            pars = c("post_predict_sample"),
                            data = forgetting_data)

# Summary table: one row per time point, with the mean and the
# 2.5%/97.5% quantiles of the predicted counts.
exp_summary <- summary(stanfit_exponential_pred,
                       pars = c("post_predict_sample"))$summary %>%
               as_tibble()

# Observed counts (points) vs posterior predictive means and 95% credible
# intervals (pointranges).
# NOTE(review): x/y/ymin/ymax are passed as length-6 vectors outside aes();
# this works because their length matches the data, but mapping them via
# aes() on a summary data frame would be more robust — confirm if upgrading
# ggplot2.
ggplot(forgetting_data, aes(x = t, y = obs)) +
  geom_point() +
  geom_pointrange(x = t, y = exp_summary$mean,
           ymin = exp_summary$`2.5%`,
           ymax = exp_summary$`97.5%`,
           color = "skyblue",
           alpha = 0.5) +
  ylim(0,100)

# Fit the predictive version of the power model, same settings as above.
stanfit_power_pred <- stan(model_code = power_pred_model_str,
                            iter = 50000,
                            warmup = 10000,
                            control = list(adapt_delta = 0.999),
                            pars = c("post_predict_sample"),
                            data = forgetting_data)

# Posterior predictive summary per time point (mean and 95% interval).
power_summary <- summary(stanfit_power_pred,
                       pars = c("post_predict_sample"))$summary %>%
               as_tibble()

# Visual posterior predictive check for the power model; same
# vector-as-parameter caveat as in the exponential plot above.
ggplot(forgetting_data, aes(x = t, y = obs)) +
  geom_point() +
  geom_pointrange(x = t, y = power_summary$mean,
                  ymin = power_summary$`2.5%`,
                  ymax = power_summary$`97.5%`,
                  color = "red",
                  alpha = 0.5) +
  ylim(0,100)

6. Computing posterior predictive \(p\)-value

To compute the posterior predictive \(p\)-value most efficiently (in the sense of having to write least code; not necessarily in the sense of fastest run time), we will expand the Stan code even more. Based on your previous code which computes samples from the posterior predictive distribution, expand the Stan code even more to also generate two more quantities. The first is the likelihood of the observed data under the currently sampled parameter values. The second is the likelihood of the post_predict_sample under the currently sampled parameter value. The posterior predictive \(p\)-value is approximated by the mean of the variable post_pred_p_value_flag in the pseudo-code below.

a. (2 points)

Complete the code accordingly.

# Templates for the posterior predictive p-value models; the pseudo-code in
# generated quantities is to be replaced by the student (solution below).
# (Fixes the "BIOMIAL" typos in the embedded pseudo-code.)
power_ppp_model_str <- "
// fill in the gaps and replace the pseudo-code
data {
 // as before
}
parameters {
 // as before
}
transformed parameters {
  // as before
}
model {
  // as before
}
## PSEUDO-CODE!
generated quantities {
  // fill in for yourself
  // DECLARE VARIABLE 'post_predict_sample' WITH SAME SHAPE AS THE DATA
  // post_predict_sample = SAMPLE FROM BINOMIAL DISTRIBUTION, WITH CURRENT PARAMETER ESTIMATES
  // likelihood_data = binomial_lpmf( ... FILL ME ... )
  // likelihood_post_predict_sample = binomial_lpmf( ... FILL ME ... )
  // post_pred_p_value_flag = likelihood_data >= likelihood_post_predict_sample;
}

"

# Same template for the exponential model.
exponential_ppp_model_str <- "
// fill in the gaps and replace the pseudo-code
data {
 // as before
}
parameters {
 // as before
}
transformed parameters {
  // as before
}
model {
  // as before
}
## PSEUDO-CODE!
generated quantities {
  // fill in for yourself
  // DECLARE VARIABLE 'post_predict_sample' WITH SAME SHAPE AS THE DATA
  // post_predict_sample = SAMPLE FROM BINOMIAL DISTRIBUTION, WITH CURRENT PARAMETER ESTIMATES
  // likelihood_data = binomial_lpmf( ... FILL ME ... )
  // likelihood_post_predict_sample = binomial_lpmf( ... FILL ME ... )
  // post_pred_p_value_flag = likelihood_data >= likelihood_post_predict_sample;
}

"
# (Solution) Power model with generated quantities computing, per draw:
# a posterior predictive sample, the log likelihood of the observed data,
# the log likelihood of the predictive sample, and a 0/1 flag whose mean
# over draws approximates the posterior predictive p-value.
power_model_ppp_str <- "
// power model of forgetting data
data {
  int obs[6] ;
  real t[6] ;
}
parameters {
  real<lower=0,upper=1.5> c ;
  real<lower=0,upper=1.5> d ;
}
transformed parameters {
  real theta[6];
  for (i in 1:6) {
    theta[i] = c * t[i]^(-d) ;
  }
}
model {
  // increment target log score with
  // probability of current samples from prior
  target += uniform_lpdf(c | 0, 1.5);
  target += uniform_lpdf(d | 0, 1.5);

  // increment target log score with
  // likelihood of data given current parameter samples
  target += binomial_lpmf(obs | 100, theta);
}
generated quantities {
  // declare variables
  int post_predict_sample[6];
  real likelihood_data;
  real likelihood_post_predict_sample;
  int post_predict_p_value_flag;

  // generate predictions
  post_predict_sample = binomial_rng(100, theta);

  // calculate likelihoods
  likelihood_data = binomial_lpmf(obs | 100, theta);
  likelihood_post_predict_sample = binomial_lpmf(post_predict_sample | 100, theta);

  // compare likelihoods
  post_predict_p_value_flag = likelihood_data >= likelihood_post_predict_sample;
}


"

# (Solution) Exponential model with the same generated quantities as the
# power model. The vectorized binomial_lpmf over the 6-element arrays
# returns the sum of the element-wise log pmfs, so it is numerically
# identical to the original accumulation loops and keeps the two models'
# Stan code consistent.
exponential_model_ppp_str <- "
// exponential model of forgetting data
data {
  int obs[6] ;
  real t[6] ;
}
parameters {
  real<lower=0,upper=1.5> a ;
  real<lower=0,upper=1.5> b ;
}
transformed parameters {
  real theta[6];
  for (i in 1:6) {
    theta[i] = a * exp(-b * t[i]) ;
  }
}
model {

  // increment target log score with
  // probability of current samples from prior
  target += uniform_lpdf(a | 0, 1.5);
  target += uniform_lpdf(b | 0, 1.5);

  // increment target log score with
  // likelihood of data given current parameter samples
  target += binomial_lpmf(obs | 100, theta);
}
generated quantities {

  // declare variables
  int post_predict_sample[6];
  real likelihood_data;
  real likelihood_post_predict_sample;
  int post_predict_p_value_flag;

  // generate predictions
  post_predict_sample = binomial_rng(100, theta);

  // calculate likelihoods; the vectorized lpmf sums over the six
  // time points, matching the power model above
  likelihood_data = binomial_lpmf(obs | 100, theta);
  likelihood_post_predict_sample = binomial_lpmf(post_predict_sample | 100, theta);

  // compare likelihoods
  post_predict_p_value_flag = likelihood_data >= likelihood_post_predict_sample;
}


"

b. (1 point)

Retrieve estimates of the posterior predictive \(p\)-values for both the exponential and power models.

# YOUR CODE HERE
# Run the stan models and retrieve an estimate of the posterior predictive
# p-value for both exponential and power model.
# The mean of the 0/1 flag over all post-warmup draws approximates the
# posterior predictive p-value.
stanfit_exponential_ppp <- stan(model_code = exponential_model_ppp_str,
                            iter = 50000,
                            warmup = 10000,
                            control = list(adapt_delta = 0.999),
                            pars = c("post_predict_p_value_flag"),
                            data = forgetting_data)

exponential_p_summary <- summary(stanfit_exponential_ppp,
                           pars = c("post_predict_p_value_flag"))$summary %>%
  as_tibble()

# posterior predictive p-value for the exponential model
exponential_p_summary$mean
## [1] 0.0670625
stanfit_power_ppp <- stan(model_code = power_model_ppp_str,
                            iter = 50000,
                            warmup = 10000,
                            control = list(adapt_delta = 0.999),
                            pars = c("post_predict_p_value_flag"),
                            data = forgetting_data)
## Warning: There were 1 divergent transitions after warmup. Increasing adapt_delta above 0.999 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## Warning: Examine the pairs() plot to diagnose sampling problems
power_p_summary <- summary(stanfit_power_ppp,
                           pars = c("post_predict_p_value_flag"))$summary %>%
  as_tibble()

# posterior predictive p-value for the power model
power_p_summary$mean
## [1] 0.00015

Can either of the models be criticised based on these values? If so, which and why?

ANSWER:

SOLUTION: Yes, the power model could be criticised because the data seems surprising to the model as the p-value is below the conventional alpha level of 0.05.


End of assignment