rmarkdown
working in order to ‘knit’ the HTML output.ctrl/cmd
+ shift
+ K
in RStudio) to produce a HTML file.rmarkdown
(comes with RStudio)tidyverse
(or ggplot2
, dplyr
, purrr
, tibble
)TeachingDemos
ggm
maps
rstan
ggmcmc
___As always, load the required packages and change the seed to your last name.
library(TeachingDemos)
library(ggm)
library(maps)
library(tidyverse)
library(rstan)
library(ggmcmc)
# set cores to use to the total number of cores (minimally 4)
options(mc.cores = max(parallel::detectCores(), 4))
# save a compiled version of the Stan model file
rstan_options(auto_write = TRUE)
lastname <- "YOURLASTNAME"
char2seed(lastname)
library(TeachingDemos)
library(ggm)
library(maps)
library(tidyverse)
library(rstan)
library(ggmcmc)
# set cores to use to the total number of cores (minimally 4)
options(mc.cores = max(parallel::detectCores(), 4))
# save a compiled version of the Stan model file
rstan_options(auto_write = TRUE)
lastname <- "bayes"
char2seed(lastname)
We say in class that we can (ab-)use Stan to obtain samples from the prior predictive distribution. Here is a wild script which completely sidesteps the model
block (the variable theta_dummy
is only included because otherwise Stan would complain about an empty model
block). The code generates (feedforward) samples in the generated quantities
block. The code is trying but failing to do something sensible, in fact something that we have already computed during the course.
data {
int<lower=0> N ;
int<lower=0, upper = N> k ;
}
parameters {
real<lower=0,upper=1> theta_dummy;
}
model {
theta_dummy ~ beta(1,1);
}
generated quantities {
real<lower=0,upper=1> theta_prior;
int<lower=0> prior_pred_k;
real<upper=1> llh_rep;
real<upper=1> llh_hypo;
int<lower=0,upper=1> at_least_as_extreme;
theta_prior = 0.5;
prior_pred_k = binomial_rng(N, theta_prior);
llh_rep = binomial_lpmf(prior_pred_k | N, theta_prior);
llh_hypo = binomial_lpmf(k | N, theta_prior);
at_least_as_extreme = llh_rep >= llh_hypo ? 1 : 0;
}
ANSWER:
SOLUTION: The code is trying to compute the p-value for seeing k=7 successes in N=24 flips for a null hypothesis that assumes that the binomial sampling is fair (theta = 0.5).
data {
int<lower=0> N ;
int<lower=0, upper = N> k ;
}
parameters {
real<lower=0,upper=1> theta_dummy;
}
model {
theta_dummy ~ beta(1,1);
}
generated quantities {
real<lower=0,upper=1> theta_prior;
int<lower=0> prior_pred_k;
real<upper=1> llh_rep;
real<upper=1> llh_hypo;
int<lower=0,upper=1> at_least_as_extreme;
theta_prior = 0.5;
prior_pred_k = binomial_rng(N, theta_prior);
llh_rep = binomial_lpmf(prior_pred_k | N, theta_prior);
llh_hypo = binomial_lpmf(k | N, theta_prior);
at_least_as_extreme = llh_rep >= llh_hypo ? 1 : 0;
}
data {
int<lower=0> N ;
int<lower=0, upper = N> k ;
}
parameters {
real<lower=0,upper=1> theta_dummy;
}
model {
theta_dummy ~ beta(1,1);
}
generated quantities {
real<lower=0,upper=1> theta_prior;
int<lower=0> prior_pred_k;
real<upper=1> llh_rep;
real<upper=1> llh_hypo;
int<lower=0,upper=1> at_least_as_extreme;
theta_prior = 0.5;
prior_pred_k = binomial_rng(N, theta_prior);
llh_rep = binomial_lpmf(prior_pred_k | N, theta_prior);
llh_hypo = binomial_lpmf(k | N, theta_prior);
at_least_as_extreme = llh_rep <= llh_hypo ? 1 : 0; // typo here!!! <= not >= !!!
}
model_string = '''
paste the code here after fixing the typo
'''
dataList <- tibble(N = 24, k = 7)
stan_fit <- stan(model_code = model_string,
data = dataList,
iter = 50000)
# get posterior probability of flag `at_least_as_extreme` to be 1
ggmcmc::ggs(stan_fit) %>% filter(Parameter %in% c("at_least_as_extreme")) %>%
group_by(Parameter) %>% summarise(value_of_interest = mean(value)) %>% show()
model_string = "
data {
int<lower=0> N ;
int<lower=0, upper = N> k ;
}
parameters {
real<lower=0,upper=1> theta_dummy;
}
model {
theta_dummy ~ beta(1,1);
}
generated quantities {
real<lower=0,upper=1> theta_prior;
int<lower=0> prior_pred_k;
real<upper=1> llh_rep;
real<upper=1> llh_hypo;
int<lower=0,upper=1> at_least_as_extreme;
theta_prior = 0.5;
prior_pred_k = binomial_rng(N, theta_prior);
llh_rep = binomial_lpmf(prior_pred_k | N, theta_prior);
llh_hypo = binomial_lpmf(k | N, theta_prior);
at_least_as_extreme = llh_rep <= llh_hypo ? 1 : 0; // typo here!!! <= not >= !!!
}
"
dataList <- tibble(N = 24, k = 7)
stan_fit <- stan(model_code = model_string,
data = dataList,
iter = 50000)
# get posterior probability of flag `at_least_as_extreme` to be 1
ggmcmc::ggs(stan_fit) %>% filter(Parameter %in% c("at_least_as_extreme")) %>%
group_by(Parameter) %>% summarise(value_of_interest = mean(value)) %>% show()
## # A tibble: 1 x 2
## Parameter value_of_interest
## <fct> <dbl>
## 1 at_least_as_extreme 0.0626
ANSWER:
SOLUTION: The output approximates the exact p-value calculated earlier pretty well.
theta
at level 0.5. Change the Stan model code to compute the same kind of output value, but for a model which assumes that theta
is sampled from a uniform distribution over the unit interval. (Hint: you only need to modify one line very slightly (the resulting code is not pretty or the best code to communicate the intend, but it suffices).)model_string <- '''
paste the code here after fixing the typo and adapting
it to a Bayesian model with maximal uncertainty about theta
'''
dataList <- tibble(N = 24, k = 7)
stan_fit <- stan(model_code = model_string,
data = dataList,
iter = 50000)
# get posterior probability of flag `at_least_as_extreme` to be 1
ggmcmc::ggs(stan_fit) %>% filter(Parameter %in% c("at_least_as_extreme")) %>%
group_by(Parameter) %>% summarise(value_of_interest = mean(value)) %>% show()
model_string <- "
data {
int<lower=0> N ;
int<lower=0, upper = N> k ;
}
parameters {
real<lower=0,upper=1> theta_dummy;
}
model {
theta_dummy ~ beta(1,1);
}
generated quantities {
real<lower=0,upper=1> theta_prior;
int<lower=0> prior_pred_k;
real<upper=1> llh_rep;
real<upper=1> llh_hypo;
int<lower=0,upper=1> at_least_as_extreme;
theta_prior = theta_dummy;
prior_pred_k = binomial_rng(N, theta_prior);
llh_rep = binomial_lpmf(prior_pred_k | N, theta_prior);
llh_hypo = binomial_lpmf(k | N, theta_prior);
at_least_as_extreme = llh_rep <= llh_hypo ? 1 : 0; // typo here!!! <= not >= !!!
}
"
dataList <- tibble(N = 24, k = 7)
stan_fit <- stan(model_code = model_string,
data = dataList,
iter = 50000)
# get posterior probability of flag `at_least_as_extreme` to be 1
ggmcmc::ggs(stan_fit) %>% filter(Parameter %in% c("at_least_as_extreme")) %>%
group_by(Parameter) %>% summarise(value_of_interest = mean(value)) %>% show()
## # A tibble: 1 x 2
## Parameter value_of_interest
## <fct> <dbl>
## 1 at_least_as_extreme 0.163
ANSWER:
SOLUTION: Technically, we have just computed a prior predictive p-value. It gives the probability of observing a hypothetical data point at least as extreme as what we actually observed, if the assumed (Bayesian) model (with prior uncertainty) is true. This is a generalization of the p-value that also incorporates modeller uncertainty. If the prior predictive p-value is low, the data is unexpected under the model we believed in a priori. We might want to discard it (replace it with another, e.g., use the posterior distribution instead). The number observed here (ca. 0.16) is no compelling reason to abandon the model, however. (This, by standard p-value logic.)
End of assignment