- Metropolis Hastings (recap)
- Hamiltonian Monte Carlo
- Stan
\[ \definecolor{firebrick}{RGB}{178,34,34} \newcommand{\red}[1]{{\color{firebrick}{#1}}} \] \[ \definecolor{mygray}{RGB}{178,34,34} \newcommand{\mygray}[1]{{\color{mygray}{#1}}} \] \[ \newcommand{\set}[1]{\{#1\}} \] \[ \newcommand{\tuple}[1]{\langle#1\rangle} \] \[\newcommand{\States}{{T}}\] \[\newcommand{\state}{{t}}\] \[\newcommand{\pow}[1]{{\mathcal{P}(#1)}}\]

- potential energy: the negative log of the (unnormalized) posterior density
- kinetic energy: the energy of the auxiliary momentum variables, usually Gaussian (one HMC step is sketched below)
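Below is a minimal, illustrative sketch of a single HMC step in R. The names (`hmc_step`, `U`, `grad_U`, `eps`, `n_leapfrog`) are made up for this example and do not come from any package; `U(q)` plays the role of the potential energy (negative log posterior) and `sum(p^2)/2` is the kinetic energy of the auxiliary momenta.

# minimal sketch of one HMC step (illustrative, not a production sampler)
hmc_step <- function(q, U, grad_U, eps = 0.1, n_leapfrog = 20) {
  p <- rnorm(length(q))                       # resample auxiliary momenta
  q_prop <- q
  p_prop <- p - eps / 2 * grad_U(q_prop)      # half step for momenta
  for (l in 1:n_leapfrog) {
    q_prop <- q_prop + eps * p_prop           # full step for position
    if (l < n_leapfrog) {
      p_prop <- p_prop - eps * grad_U(q_prop) # full step for momenta
    }
  }
  p_prop <- p_prop - eps / 2 * grad_U(q_prop) # final half step for momenta
  # Metropolis accept/reject on the total energy H = U + K
  H_current  <- U(q)      + sum(p^2) / 2
  H_proposal <- U(q_prop) + sum(p_prop^2) / 2
  if (runif(1) < exp(H_current - H_proposal)) q_prop else q
}

# toy usage: sample from a standard normal, where U(q) = q^2 / 2
U      <- function(q) sum(q^2) / 2
grad_U <- function(q) q
samples <- numeric(2000)
for (i in 2:2000) {
  samples[i] <- hmc_step(samples[i - 1], U, grad_U)
}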

- actively developed by Andrew Gelman, Bob Carpenter, and others
- uses HMC and NUTS, but can also do variational Bayes and MAP estimation
- written in C++
- cross-platform
- interfaces: command line, Python, Julia, R, Matlab, Stata, Mathematica
- many additional packages/goodies
require('rjags')

ms = "
model{
  theta ~ dbeta(1,1)
  k ~ dbinom(theta, N)
}"

# prepare data for JAGS
dataList = list(k = 14, N = 20)

# set up and run model
jagsModel = jags.model(
  file = textConnection(ms),
  data = dataList,
  n.chains = 2)
update(jagsModel, n.iter = 5000)
codaSamples = coda.samples(
  jagsModel,
  variable.names = c("theta"),
  n.iter = 5000)

require('rstan')
ms = "
data {
int<lower=0> N ;
int<lower=0> k ;
}
parameters {
real<lower=0,upper=1> theta ;
}
model {
theta ~ beta(1,1) ;
k ~ binomial(N, theta) ;
}"
# prepare data for Stan
dataList = list(k = 14, N = 20)
# set up and run
mDso = stan_model(model_code = ms)
stanFit = sampling(object = mDso,
data = dataList,
chains = 2,
iter = 10000,
warmup = 5000)JAGS
coda::effectiveSize(codaSamples)
##   theta 
## 10063.5
require('ggmcmc')
ms = ggs(codaSamples)
ggs_density(ms)
Stan
coda::effectiveSize(As.mcmc.list(stanFit))[1]
##    theta 
## 3858.451
ms = ggs(stanFit)
ggs_density(ms)
plot(stanFit)
print(stanFit, digits_summary = 1)
## Inference for Stan model: 23b975ca8e7a249bab8c58ed69033d00.
## 2 chains, each with iter=10000; warmup=5000; thin=1;
## post-warmup draws per chain=5000, total post-warmup draws=10000.
##
##        mean se_mean  sd  2.5%   25%   50%   75% 97.5% n_eff Rhat
## theta   0.7       0 0.1   0.5   0.6   0.7   0.7   0.9  3648    1
## lp__  -14.3       0 0.7 -16.2 -14.4 -14.0 -13.8 -13.8  3781    1
##
## Samples were drawn using NUTS(diag_e) at Tue Jul 4 16:08:16 2017.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
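The same numbers can also be pulled out of the fit object programmatically; a small sketch, with column names as in rstan's summary output above:

summary(stanFit)$summary["theta", c("mean", "n_eff", "Rhat")]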
bayesplot package
require(bayesplot)
posterior <- as.array(stanFit)
color_scheme_set("red")
mcmc_areas(
  posterior,
  pars = c("theta"),
  prob = 0.8,        # 80% intervals
  prob_outer = 0.99, # 99%
  point_est = "mean"
)

more on the bayesplot package: here
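Another handy bayesplot view (a sketch reusing the `posterior` array created above) is a per-chain traceplot, as a quick visual convergence check:

mcmc_trace(posterior, pars = c("theta"))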
problem
Stan needs gradients / derivatives of the log posterior, so it cannot sample discrete latent parameters directly
(sometimes) possible solution
marginalize out the discrete variables
15 people answered 40 exam questions. Here are their scores:
k <- c(21,17,21,18,22,31,31,34,
       34,35,35,36,39,36,35)
p <- length(k)  # number of people
n <- 40         # number of questions
Lee & Wagenmakers, Chapter 6.1
model{
  for (i in 1:p){
    z[i] ~ dbern(0.5)
  }
  psi = 0.5
  phi ~ dunif(0.5,1)
  for (i in 1:p){
    theta[i] = ifelse(z[i]==0,psi,phi)
    k[i] ~ dbin(theta[i],n)
  }
}

show(summary(codaSamples)[[1]][,1:2])
##            Mean         SD
## phi   0.8632095 0.01716929
## z[1]  0.0000000 0.00000000
## z[2]  0.0000000 0.00000000
## z[3]  0.0000000 0.00000000
## z[4]  0.0000000 0.00000000
## z[5]  0.0000000 0.00000000
## z[6]  0.9952000 0.06911901
## z[7]  0.9938000 0.07849953
## z[8]  1.0000000 0.00000000
## z[9]  1.0000000 0.00000000
## z[10] 1.0000000 0.00000000
## z[11] 1.0000000 0.00000000
## z[12] 1.0000000 0.00000000
## z[13] 1.0000000 0.00000000
## z[14] 1.0000000 0.00000000
## z[15] 1.0000000 0.00000000
model{
  for (i in 1:p){
    z[i] ~ dbern(0.5)
  }
  psi = 0.5
  phi ~ dunif(0.5,1)
  for (i in 1:p){
    theta[i] = ifelse(z[i]==0,
                      psi,phi)
    k[i] ~ dbin(theta[i],n)
  }
}
JAGS \(\uparrow\)
Stan \(\downarrow\)
data {
  int<lower=1> p;
  int<lower=0> k[p];
  int<lower=1> n;
}
transformed data {
  real psi;
  psi <- .5;
}
parameters {
  real<lower=.5,upper=1> phi;
}
transformed parameters {
  vector[2] log_alpha[p];
  for (i in 1:p) {
    log_alpha[i,1] <- log(.5) + binomial_log(k[i], n, phi);
    log_alpha[i,2] <- log(.5) + binomial_log(k[i], n, psi);
  }
}
model {
  for (i in 1:p)
    increment_log_prob(log_sum_exp(log_alpha[i]));
}
generated quantities {
  int<lower=0,upper=1> z[p];
  for (i in 1:p) {
    z[i] <- bernoulli_rng(softmax(log_alpha[i])[1]);
  }
}

log score
\[\log P(k_i, n, z_i, \phi) = \log \text{Bern}(z_i \mid 0.5) + \log \text{Binom}(k_i, n, \text{ifelse}(z_i=1, \phi, 0.5))\]
marginalizing out
\[ \begin{align*} P(k_i, n, \phi) & = \sum_{z_i \in \set{0,1}} \text{Bern}(z_i \mid 0.5) \ \text{Binom}(k_i, n, \text{ifelse}(z_i=1, \phi, 0.5)) \\ & = \text{Bern}(0 \mid 0.5) \ \text{Binom}(k_i, n, 0.5) + \\ & \ \ \ \ \ \ \ \ \ \ \text{Bern}(1 \mid 0.5) \ \text{Binom}(k_i, n, \phi) \\ & = \underbrace{0.5 \ \text{Binom}(k_i, n, 0.5)}_{\alpha_0} + \underbrace{0.5 \ \text{Binom}(k_i, n, \phi)}_{\alpha_1} \\ \log P(k_i, n, \phi) & = \log \sum_{j=0}^1 \exp \log \alpha_j \end{align*}\]
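The log-sum-exp marginalization can be checked directly in R; the sketch below mirrors the \(\alpha_0\) and \(\alpha_1\) terms above (the helper names and the example values for `k_i`, `n`, and `phi` are made up for illustration):

# numerically stable log-sum-exp
log_sum_exp <- function(x) max(x) + log(sum(exp(x - max(x))))

# marginal log-likelihood for one person i, summing over z_i in {0, 1}
marginal_log_lik <- function(k_i, n, phi, psi = 0.5) {
  log_alpha <- c(log(0.5) + dbinom(k_i, n, psi, log = TRUE),  # z_i = 0 branch
                 log(0.5) + dbinom(k_i, n, phi, log = TRUE))  # z_i = 1 branch
  log_sum_exp(log_alpha)
}

marginal_log_lik(k_i = 35, n = 40, phi = 0.86)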
idea
approximate the posterior distribution by a well-behaved parametric function
concretely
find a parametric function \(F(\theta \mid \phi)\) and look for the best-fitting parameters:
\[\phi^* = \arg\min_{\phi} \text{KL}(F(\theta \mid \phi) \ \ || \ \ P(\theta \mid D) ) \]
where KL is the Kullback-Leibler divergence:
\[ \text{KL}(P || Q) = \int P(x) \log \frac{P(x)}{Q(x)} \ \text{d}x \]
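In rstan, a variational approximation of this kind is available via the `vb()` function; a sketch, reusing the compiled model `mDso` and `dataList` from the beta-binomial example above:

# variational approximation instead of MCMC sampling
vbFit = vb(object = mDso, data = dataList)
print(vbFit)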
the MAP is the most likely parameter vector under the posterior distribution:
\[ \theta^* = \arg\max_{\theta} P(\theta \mid D) \]
for flat (possibly improper) priors, the MAP coincides with the MLE
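rstan can compute this point estimate directly with its built-in optimizer; a sketch, again reusing `mDso` and `dataList` from above:

# MAP / posterior mode via optimization
mapFit = optimizing(object = mDso, data = dataList)
mapFit$par  # posterior mode of theta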
|  | JAGS | Stan |
|---|---|---|
| syntax-style | BUGS, R | C++, R, BUGS |
| sampler | MH, Gibbs | Hamiltonian |
| paradigm | declarative | procedural |
| precompilation | no | yes |
| latent discrete variables | yes | hard (or impossible) |
| added goodies | DIC | variational Bayes, MAP, improper priors, rstanarm, loo, … |