iq_data = readr::read_csv('data/03_Kruschke_TwoGroupIQ.csv')
iq_data
## # A tibble: 120 x 2
## Score Group
## <int> <chr>
## 1 102 Smart Drug
## 2 107 Smart Drug
## 3 92 Smart Drug
## 4 101 Smart Drug
## 5 110 Smart Drug
## 6 68 Smart Drug
## 7 119 Smart Drug
## 8 106 Smart Drug
## 9 99 Smart Drug
## 10 103 Smart Drug
## # ... with 110 more rows
possible research questions?
ggplot(iq_data, aes(x=Group, y = Score)) +
geom_boxplot()
data from Kruschke (2015, Chapter 16)
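as a quick numerical complement to the boxplot, we can summarize each group (a minimal sketch, assuming dplyr is loaded alongside the other tidyverse functions used here):
# group-wise sample size, mean and standard deviation of the IQ scores
iq_data %>%
  group_by(Group) %>%
  summarize(n = n(),
            mean_score = mean(Score),
            sd_score = sd(Score))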
base_plot = iq_data %>% ggplot(aes(x = Score))
density_plot = base_plot + geom_density()
histogram_plot = base_plot + geom_histogram()
cowplot::plot_grid(density_plot, histogram_plot, labels = c("A", "B"))
group_avg = mean(iq_data$Score)
density_plot + geom_vline(xintercept = group_avg, color = "firebrick")
\[ Z = \frac{\bar{X} - \mu_0}{\sigma / \sqrt{n}} \]
\[ Z = \frac{\bar{X} - \mu_0}{\sigma / \sqrt{n}} \ \ \ \ \ \ \ Z \sim \mathcal{N}(0,1) \]
\[ z' = \frac{\bar{x}' - \mu_0}{\sigma / \sqrt{n}} \]
for a concrete realization of \(z'\), we can compute a \(p\)-value based on knowledge of its distribution
alternatively, we can look up the critical value \(c\) such that:
\[ P(-c \le Z \le c) = 1-\alpha \]
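here \(c\) is simply the \((1 - \alpha/2)\)-quantile of the standard normal distribution, \(c = \Phi^{-1}(1 - \alpha/2)\); for \(\alpha = 0.05\) this gives \(c \approx 1.96\), which is what qnorm(0.975) computes below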
d = iq_data$Score
mu_0 = 100 # null-hypothesis
sigma_X = 25 # pure speculation for sake of example!
z_prime = (mean(d) - mu_0) / (sigma_X / sqrt(length(d)))
tibble("upper-bound c" = qnorm(0.975),
"test statistic" = z_prime,
"p-value" = (1-pnorm(z_prime))*2 )
## variable value
## 1 upper-bound c 1.9600
## 2 test statistic 1.8111
## 3 p-value 0.0701
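since \(|z'| \approx 1.81\) does not exceed the critical value \(1.96\) (equivalently, \(p \approx 0.07 > 0.05\)), the null hypothesis \(\mu = 100\) is not rejected at the 5% level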
z.test
z.test is not part of base R, but is supplied in the package BSDA
BSDA::z.test(x = iq_data$Score, mu = 100, sigma.x = 25)
##
## One-sample z-Test
##
## data: iq_data$Score
## z = 1.8111, p-value = 0.07012
## alternative hypothesis: true mean is not equal to 100
## 95 percent confidence interval:
## 99.66035 108.60632
## sample estimates:
## mean of x
## 104.1333
compare this with our hand-made results:
## variable value
## 1 upper-bound c 1.9600
## 2 test statistic 1.8111
## 3 p-value 0.0701
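the 95% confidence interval reported by z.test can also be reproduced by hand (a small sketch, using the same speculative \(\sigma = 25\)):
# hand-computed 95% CI: sample mean +/- critical value * standard error
mean(iq_data$Score) + c(-1, 1) * qnorm(0.975) * 25 / sqrt(nrow(iq_data))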
if \(X \sim \mathcal{N}(0,1)\) and \(S \sim \chi^2(n)\) are independent, then
\[ T = \frac{X}{\sqrt{S / n}} \]
follows a \(t\)-distribution with \(n\) degrees of freedom, whose density is
\[ f(t \mid n) \propto \left(1 + \frac{t^2}{n} \right)^{- \frac{n+1}{2}} \]
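to get an intuition for this density, we can plot it against the standard normal for a smallish \(n\) (an illustrative sketch; the choice df = 5 and the colors are arbitrary):
# Student's t (df = 5, firebrick) has heavier tails than the standard normal (grey)
ggplot(tibble(x = c(-4, 4)), aes(x = x)) +
  stat_function(fun = dnorm, color = "darkgrey") +
  stat_function(fun = dt, args = list(df = 5), color = "firebrick")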
\[ T = \frac{\bar{X} - \mu_0}{S / \sqrt{n}} \]
\[ T = \frac{\bar{X} - \mu_0}{S / \sqrt{n}} \ \ \ \ \ \ \ \ \ \ T \sim t(df = n-1) \]
\[ t' = \frac{\bar{x}' - \mu_0}{s' / \sqrt{n}} \]
for a concrete realization of \(t'\), we can compute a \(p\)-value based on knowledge of its distribution
alternatively, we can look up the critical value \(c\) such that:
\[ P(-c \le T \le c) = 1-\alpha \]
d = iq_data$Score
sigma_prime = sd(d) # now we use the sample to estimate SD!
test_value = (mean(d) - 100) / sigma_prime * sqrt(length(d))
out = data.frame(variable = c("upper-bound c", "test statistic", "p-value"),
                 value = c(round(qt(0.975, df = length(d)-1), 4),
                           round(test_value, 4),
                           round((1 - pt(test_value, df = length(d)-1)) * 2, 4)))
out
## variable value
## 1 upper-bound c 1.9801
## 2 test statistic 2.0182
## 3 p-value 0.0458
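this time \(|t'| \approx 2.02\) exceeds the critical value \(\approx 1.98\) (equivalently, \(p \approx 0.046 < 0.05\)), so the null hypothesis \(\mu = 100\) is rejected at the 5% level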
t.test
t.test(iq_data$Score, mu = 100)
##
## One Sample t-test
##
## data: iq_data$Score
## t = 2.0182, df = 119, p-value = 0.04582
## alternative hypothesis: true mean is not equal to 100
## 95 percent confidence interval:
## 100.0780 108.1887
## sample estimates:
## mean of x
## 104.1333
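note that the 95% confidence interval does not contain the null value 100; for a two-sided test this is equivalent to the rejection of the null hypothesis at the 5% level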
murder_data = readr::read_csv('data/02_murder_rates.csv') %>%
rename(murder_rate = annual_murder_rate_per_million_inhabitants,
low_income = percentage_low_income,
unemployment = percentage_unemployment) %>%
select(murder_rate, low_income, unemployment, population)
murder_data %>% head
## # A tibble: 6 x 4
## murder_rate low_income unemployment population
## <dbl> <dbl> <dbl> <int>
## 1 11.2 16.5 6.20 587000
## 2 13.4 20.5 6.40 643000
## 3 40.7 26.3 9.30 635000
## 4 5.30 16.5 5.30 692000
## 5 24.8 19.2 7.30 1248000
## 6 12.7 16.5 5.90 643000
GGally::ggpairs(murder_data, title = "Murder rate data")
murder_sub = select(murder_data, murder_rate, low_income)
rate_income_plot = ggplot(murder_sub, aes(x = low_income, y = murder_rate)) + geom_point()
rate_income_plot
given the data above, the question is: which linear function, in terms of intercept \(\beta_0\) and slope \(\beta_1\), approximates the data best?
\[ y_{\text{pred}} = \beta_0 + \beta_1 x \]
“geometric” approach
find \(\beta_0\) and \(\beta_1\) that minimize the squared error between predicted \(y_\text{pred}\) and observed \(y\)
mse = function(y, x, beta_0, beta_1) {
  yPred = beta_0 + x * beta_1   # model predictions for given intercept & slope
  mean((y - yPred)^2)           # mean squared error
}
fit_mse = optim(par = c(0, 1),
fn = function(par) with(murder_data,
mse(murder_rate,low_income,
par[1], par[2])))
fit_mse$par
## [1] -29.905188 2.559588
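we can overlay the best-fitting line on the scatter plot (a quick sketch, reusing the coefficients just estimated via optim):
# add the MSE-minimizing regression line to the scatter plot
rate_income_plot +
  geom_abline(intercept = fit_mse$par[1], slope = fit_mse$par[2], color = "firebrick")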
maximum likelihood approach
find \(\beta_0\) and \(\beta_1\) and \(\sigma\) that maximize the likelihood of observed \(y\):
\[ y_{\text{pred}} = \beta_0 + \beta_1 x \ \ \ \ \ \ \ \ \ \ y \sim \mathcal{N}(\mu = y_{\text{pred}}, \sigma) \]
nll = function(y, x, beta_0, beta_1, sd) {
  if (sd <= 0) { return( Inf ) }                       # rule out invalid standard deviations
  yPred = beta_0 + x * beta_1                          # model predictions
  -sum(dnorm(y, mean = yPred, sd = sd, log = TRUE))    # negative log-likelihood
}
fit_lh = optim(par = c(0, 1, 1),
fn = function(par) with(murder_data,
nll(murder_rate, low_income,
par[1], par[2], par[3])))
fit_lh$par
## [1] -29.901445 2.559601 5.234126
fit_lm = lm(formula = murder_rate ~ low_income, data = murder_data)
tibble(parameter = c("beta_0", "beta_1"),
manual_mse = fit_mse$par,
manual_lh = fit_lh$par[1:2],
lm = fit_lm %>% coefficients) %>% show
## # A tibble: 2 x 4
## parameter manual_mse manual_lh lm
## <chr> <dbl> <dbl> <dbl>
## 1 beta_0 -29.9 -29.9 -29.9
## 2 beta_1 2.56 2.56 2.56
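the maximum-likelihood fit additionally estimates \(\sigma\) (the third element of fit_lh$par); up to numerical optimization error it is slightly smaller than lm's residual standard error, because the ML estimate divides the residual sum of squares by \(n\) rather than \(n - 2\) (a quick check):
# compare the ML estimate of sigma with lm's residual standard error
c(ml_sigma = fit_lh$par[3], lm_sigma = sigma(fit_lm))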
lm
lm(formula = murder_rate ~ low_income, data = murder_data) %>% broom::tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) -29.90116 7.7891841 -3.838805 1.202748e-03
## 2 low_income 2.55939 0.3900129 6.562320 3.637535e-06
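interval estimates for both coefficients are also available directly via confint:
# 95% confidence intervals for intercept and slope
confint(fit_lm, level = 0.95)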
n_virtual_exps = 5000 # number of virtual experiments
sample_and_check = function(sample_size) {
  map_lgl(1:n_virtual_exps, function(i) {
    X = runif(n = sample_size, 0, 1000)                # random predictor values
    Y = rnorm(n = sample_size, mean = 0 * X, sd = 30)  # outcome unrelated to X (true slope is 0)
    summary(lm(Y ~ X))$coefficients[2, 4] < 0.05       # does the slope come out 'significant'?
  }) %>% sum() / n_virtual_exps                        # proportion of false positives
}
results = tibble(sample_size = c(15, 25, 100),
false_pos_rate = map_dbl(sample_size, sample_and_check))
results
## # A tibble: 3 x 2
## sample_size false_pos_rate
## <dbl> <dbl>
## 1 15. 0.0506
## 2 25. 0.0512
## 3 100. 0.0456
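all three estimated false-positive rates are close to the nominal \(\alpha = 0.05\); deviations of this size are expected, since the Monte Carlo standard error of each estimate is roughly \(\sqrt{0.05 \cdot 0.95 / 5000} \approx 0.003\)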