- Metropolis Hastings (recap)
- Hamiltonian Monte Carlo
- Stan
\[ \definecolor{firebrick}{RGB}{178,34,34} \newcommand{\red}[1]{{\color{firebrick}{#1}}} \] \[ \definecolor{mygray}{RGB}{178,34,34} \newcommand{\mygray}[1]{{\color{mygray}{#1}}} \] \[ \newcommand{\set}[1]{\{#1\}} \] \[ \newcommand{\tuple}[1]{\langle#1\rangle} \] \[\newcommand{\States}{{T}}\] \[\newcommand{\state}{{t}}\] \[\newcommand{\pow}[1]{{\mathcal{P}(#1)}}\]
[Figure: Hamiltonian Monte Carlo — the total energy decomposes into potential and kinetic energy]
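For intuition, here is a minimal sketch (in R, not Stan's actual implementation) of one HMC transition: draw a random momentum, simulate the Hamiltonian dynamics with a leapfrog integrator, then accept or reject based on the change in total energy (potential + kinetic). The helper functions `U` (negative log posterior) and `grad_U` (its gradient) are hypothetical placeholders that the user would supply.

```r
# one HMC transition (leapfrog integrator); q = current parameter vector
# U(q): negative log posterior (potential energy), grad_U(q): its gradient
hmc_step <- function(q, U, grad_U, eps = 0.1, L = 20) {
  p <- rnorm(length(q))                  # fresh momentum (kinetic energy)
  q_new <- q
  p_new <- p - eps / 2 * grad_U(q_new)   # half step for momentum
  for (l in 1:L) {
    q_new <- q_new + eps * p_new         # full step for position
    if (l < L) p_new <- p_new - eps * grad_U(q_new)
  }
  p_new <- p_new - eps / 2 * grad_U(q_new)
  # Metropolis accept/reject on the change in total energy H = U + K
  H_old <- U(q) + sum(p^2) / 2
  H_new <- U(q_new) + sum(p_new^2) / 2
  if (runif(1) < exp(H_old - H_new)) q_new else q
}
```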
actively developed by Andrew Gelman, Bob Carpenter and others
uses HMC and NUTS, but can also do variational Bayes and MAP estimation
written in C++
cross-platform
interfaces: command line, Python, Julia, R, Matlab, Stata, Mathematica
many additional packages/goodies
```r
require('rjags')

ms = "
model{
  theta ~ dbeta(1,1)
  k ~ dbinom(theta, N)
}"

# prepare data for JAGS
dataList = list(k = 14, N = 20)

# set up and run model
jagsModel = jags.model(file = textConnection(ms),
                       data = dataList,
                       n.chains = 2)
update(jagsModel, n.iter = 5000)
codaSamples = coda.samples(jagsModel,
                           variable.names = c("theta"),
                           n.iter = 5000)
```
```r
require('rstan')

ms = "
data {
  int<lower=0> N ;
  int<lower=0> k ;
}
parameters {
  real<lower=0,upper=1> theta ;
}
model {
  theta ~ beta(1,1) ;
  k ~ binomial(N, theta) ;
}"

# prepare data for Stan
dataList = list(k = 14, N = 20)

# set up and run
mDso = stan_model(model_code = ms)
stanFit = sampling(object = mDso,
                   data = dataList,
                   chains = 2,
                   iter = 10000,
                   warmup = 5000)
```
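Once both samplers have run, the posterior draws of `theta` can be pulled out and compared directly; a small sketch, assuming the `codaSamples` and `stanFit` objects created above:

```r
# posterior draws of theta from JAGS (mcmc.list) and Stan (stanfit)
theta_jags <- as.matrix(codaSamples)[, "theta"]
theta_stan <- rstan::extract(stanFit)$theta
c(jags = mean(theta_jags), stan = mean(theta_stan))  # should be very close
```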
JAGS
coda::effectiveSize(codaSamples)
```
##   theta 
## 10063.5
```
```r
require('ggmcmc')
ms = ggs(codaSamples)
ggs_density(ms)
```
Stan
coda::effectiveSize(As.mcmc.list(stanFit))[1]
```
##    theta 
## 3858.451
```
```r
ms = ggs(stanFit)
ggs_density(ms)
```
plot(stanFit)
print(stanFit, digits_summary = 1)
```
## Inference for Stan model: 23b975ca8e7a249bab8c58ed69033d00.
## 2 chains, each with iter=10000; warmup=5000; thin=1; 
## post-warmup draws per chain=5000, total post-warmup draws=10000.
## 
##        mean se_mean  sd  2.5%   25%   50%   75% 97.5% n_eff Rhat
## theta   0.7       0 0.1   0.5   0.6   0.7   0.7   0.9  3648    1
## lp__  -14.3       0 0.7 -16.2 -14.4 -14.0 -13.8 -13.8  3781    1
## 
## Samples were drawn using NUTS(diag_e) at Tue Jul 4 16:08:16 2017.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at 
## convergence, Rhat=1).
```
the `bayesplot` package
```r
require(bayesplot)
posterior <- as.array(stanFit)
color_scheme_set("red")
mcmc_areas(
  posterior,
  pars = c("theta"),
  prob = 0.8,        # 80% intervals
  prob_outer = 0.99, # 99%
  point_est = "mean"
)
```
more on the `bayesplot` package: here
problem
Stan needs gradients / derivatives of the (log) posterior, so it cannot sample discrete parameters directly
(sometimes) possible solution
marginalize out the discrete variables
15 people answered 40 exam questions. Here are their scores:
```r
k <- c(21,17,21,18,22,31,31,34,
       34,35,35,36,39,36,35)
p <- length(k)  # number of people
n <- 40         # number of questions
```
Lee & Wagenmakers, Chapter 6.1
```
model{
  for (i in 1:p){
    z[i] ~ dbern(0.5)
  }
  psi = 0.5
  phi ~ dunif(0.5,1)
  for (i in 1:p){
    theta[i] = ifelse(z[i]==0, psi, phi)
    k[i] ~ dbin(theta[i], n)
  }
}
```
```r
k <- c(21,17,21,18,22,31,31,34,34,35,35,36,39,36,35)
show(summary(codaSamples)[[1]][,1:2])
```
```
##            Mean         SD
## phi   0.8632095 0.01716929
## z[1]  0.0000000 0.00000000
## z[2]  0.0000000 0.00000000
## z[3]  0.0000000 0.00000000
## z[4]  0.0000000 0.00000000
## z[5]  0.0000000 0.00000000
## z[6]  0.9952000 0.06911901
## z[7]  0.9938000 0.07849953
## z[8]  1.0000000 0.00000000
## z[9]  1.0000000 0.00000000
## z[10] 1.0000000 0.00000000
## z[11] 1.0000000 0.00000000
## z[12] 1.0000000 0.00000000
## z[13] 1.0000000 0.00000000
## z[14] 1.0000000 0.00000000
## z[15] 1.0000000 0.00000000
```
```
model{
  for (i in 1:p){
    z[i] ~ dbern(0.5)
  }
  psi = 0.5
  phi ~ dunif(0.5,1)
  for (i in 1:p){
    theta[i] = ifelse(z[i]==0, psi, phi)
    k[i] ~ dbin(theta[i], n)
  }
}
```
JAGS \(\uparrow\)
Stan \(\downarrow\)
```
data {
  int<lower=1> p;
  int<lower=0> k[p];
  int<lower=1> n;
}
transformed data {
  real psi;
  psi <- .5;
}
parameters {
  real<lower=.5,upper=1> phi;
}
transformed parameters {
  vector[2] log_alpha[p];
  for (i in 1:p) {
    log_alpha[i,1] <- log(.5) + binomial_log(k[i], n, phi);
    log_alpha[i,2] <- log(.5) + binomial_log(k[i], n, psi);
  }
}
model {
  for (i in 1:p)
    increment_log_prob(log_sum_exp(log_alpha[i]));
}
generated quantities {
  int<lower=0,upper=1> z[p];
  for (i in 1:p) {
    z[i] <- bernoulli_rng(softmax(log_alpha[i])[1]);
  }
}
```
log score
\[\log P(k_i, n, z_i, \phi) = \log \text{Bern}(z_i \mid 0.5) + \log \text{Binom}(k_i, n, \text{ifelse}(z_i=1, \phi, 0.5))\]
marginalizing out
\[ \begin{align*} P(k_i, n, \phi) & = \sum_{z_i \in \set{0,1}} \text{Bern}(z_i \mid 0.5) \ \text{Binom}(k_i, n, \text{ifelse}(z_i=1, \phi, 0.5)) \\ & = \text{Bern}(0 \mid 0.5) \ \text{Binom}(k_i, n, 0.5) + \\ & \ \ \ \ \ \ \ \ \ \ \text{Bern}(1 \mid 0.5) \ \text{Binom}(k_i, n, \phi) \\ & = \underbrace{0.5 \ \text{Binom}(k_i, n, 0.5)}_{\alpha_0} + \underbrace{0.5 \ \text{Binom}(k_i, n, \phi)}_{\alpha_1} \\ \log P(k_i, n, \phi) & = \log \sum_{j=0}^1 \exp \log \alpha_j \end{align*}\]
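To see the same computation numerically, here is a small base-R sketch of the marginal log-likelihood for a single score \(k_i\), mirroring the `log_sum_exp` construction in the Stan model above (`log_marginal` is just an illustrative helper, not part of the model code):

```r
log_marginal <- function(k_i, n, phi, psi = 0.5) {
  log_alpha <- c(
    log(0.5) + dbinom(k_i, n, phi, log = TRUE),  # z_i = 1 component (alpha_1)
    log(0.5) + dbinom(k_i, n, psi, log = TRUE)   # z_i = 0 component (alpha_0)
  )
  m <- max(log_alpha)                 # log-sum-exp trick for numerical stability
  m + log(sum(exp(log_alpha - m)))
}
log_marginal(21, 40, phi = 0.86)      # marginal log-likelihood of one observed score
```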
idea
approximate posterior distribution by a well-behaved function
concretely
find a parametric function \(F(\theta \mid \phi)\) and look for the best-fitting parameters:
\[\phi^* = \arg\min_{\phi} \text{KL}(F(\theta \mid \phi) \ \ || \ \ P(\theta \mid D) ) \]
where KL is the Kullback-Leibler divergence:
\[ \text{KL}(P || Q) = \int P(x) \log \frac{P(x)}{Q(x)} \ \text{d}x \]
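For instance, if \(P = \mathcal{N}(\mu_1, \sigma_1^2)\) and \(Q = \mathcal{N}(\mu_2, \sigma_2^2)\) are univariate Gaussians, this integral has a closed form:

\[ \text{KL}(P || Q) = \log \frac{\sigma_2}{\sigma_1} + \frac{\sigma_1^2 + (\mu_1 - \mu_2)^2}{2\sigma_2^2} - \frac{1}{2} \]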
the MAP is the most likely parameter vector under the posterior distribution:
\[ \theta^* = \arg\max_{\theta} P(\theta \mid D) \]
for flat (possibly improper) priors the MAP is the MLE
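rstan exposes both of these for any compiled model; a sketch reusing the beta-binomial model object `mDso` and `dataList` from above (output details may vary across rstan versions):

```r
# MAP / posterior mode (penalized maximum likelihood) via optimization
mapFit <- rstan::optimizing(mDso, data = dataList)
mapFit$par    # point estimate of theta (here 14/20 = 0.7, since the prior is flat)

# variational Bayes (ADVI) approximation to the posterior
vbFit <- rstan::vb(mDso, data = dataList)
```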
 | JAGS | Stan |
---|---|---|
syntax-style | BUGS, R | C++, R, BUGS |
sampler | MH, Gibbs | Hamiltonian |
paradigm | declarative | procedural |
precompilation | no | yes |
latent discrete variables | yes | hard (or impossible) |
added goodies | DIC | variational Bayes, MAP, improper priors, rstanarm, loo, … |
Friday
Tuesday