freely available online: R for Data Science
read more
from R for Data Science
special-purpose programming language for data science and statistical computing
the authorities say you should not think of R as a programming language!
free (GNU General Public License)
interpreted language
6 * 7
## [1] 42
# Arithmetic is vectorised: the scalar 1 is added to every element of x.
x <- c(1, 2, 3)
x + 1
## [1] 2 3 4
# matrix() fills column by column, so the first row of m holds 1 and 3.
m <- matrix(c(1, 2, 3, 4), nrow = 2)
m[1, ]
## [1] 1 3
supports object-oriented, procedural & functional style
prefers functional style over loops
# (usually) slow: x starts out empty and is extended by one element on
# every pass through the loop
x <- c()
for (i in seq_len(10000)) {
  x[i] <- crazyFunction(i)
}
# (usually) faster: sapply() walks over the indices and collects the results
sapply(seq_len(10000), function(i) crazyFunction(i))
convenient interfaces to other languages
assignment in both directions possible
# leftward assignment: the name on the left receives the value on the right
x <- 3
# rightward assignment: the value on the left is stored under the name on the right
3 -> y
# both names now hold the same value, so the comparison is TRUE
x == y
## [1] TRUE
a lot of innovation and development takes place in packages
go browse some 12,000 packages on CRAN
install packages (only once)
install.packages('tidyverse')
load packages (for every session)
library(tidyverse)
base R functionality is always available
# 5 samples from a normal distribution with mean 10;
# sd is spelled out here, 1 is also its default value
rnorm(n = 5, mean = 10, sd = 1)
## [1] 9.208483 9.652608 10.813054 9.724418 9.919161
packages bring extra functions
library(mvtnorm)
# 3 samples from a multivariate normal whose mean vector is five 10s
mvtnorm::rmvnorm(n = 3, mean = rep(10, times = 5))
## [,1] [,2] [,3] [,4] [,5]
## [1,] 10.365201 9.824756 9.502824 9.455536 10.553926
## [2,] 11.757588 10.121859 11.122611 10.619925 11.393111
## [3,] 9.769219 9.930861 10.394108 11.623506 9.759871
help('rmvnorm')
Mvnorm {mvtnorm} R Documentation
Multivariate Normal Density and Random Deviates
Description
These functions provide the density function and a random number generator
for the multivariate normal distribution with mean equal to mean and
covariance matrix sigma.
Usage
dmvnorm(x, mean = rep(0, p), sigma = diag(p), log = FALSE)
rmvnorm(n, mean = rep(0, nrow(sigma)), sigma = diag(length(mean)),
method=c("eigen", "svd", "chol"), pre0.9_9994 = FALSE)
integrated development environment for R
this course will focus (entirely?) on rectangular data
not covered:
library(nycflights13)
nycflights13::flights
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2. 830
## 2 2013 1 1 533 529 4. 850
## 3 2013 1 1 542 540 2. 923
## 4 2013 1 1 544 545 -1. 1004
## 5 2013 1 1 554 600 -6. 812
## 6 2013 1 1 554 558 -4. 740
## 7 2013 1 1 555 600 -5. 913
## 8 2013 1 1 557 600 -3. 709
## 9 2013 1 1 557 600 -3. 838
## 10 2013 1 1 558 600 -2. 753
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
study Chapters 5 and 12 from R for Data Science
this is untidy
# one row per student, one column per exam: the exam names sit in the
# column headers instead of in a variable of their own
grades <- tribble(
  ~name,     ~midterm, ~final,
  "Michael", 3.7,      4.0,
  "Noa",     1.0,      1.3,
  "Obioma",  1.3,      1.0
)
grades
## # A tibble: 3 x 3
## name midterm final
## <chr> <dbl> <dbl>
## 1 Michael 3.70 4.00
## 2 Noa 1.00 1.30
## 3 Obioma 1.30 1.00
to tidy up, we need to gather columns which are not variables into a new column
# stack the two exam columns: their names go into `exam`, their cells into
# `grade` (tidyr >= 1.0.0 offers pivot_longer() as the successor of gather())
grades %>%
  gather(key = "exam", value = "grade", midterm, final)
## # A tibble: 6 x 3
## name exam grade
## <chr> <chr> <dbl>
## 1 Michael midterm 3.70
## 2 Noa midterm 1.00
## 3 Obioma midterm 1.30
## 4 Michael final 4.00
## 5 Noa final 1.30
## 6 Obioma final 1.00
this is untidy too
# each student contributes two rows, one per kind of measurement named in
# `what`; the measured number itself sits in `howmuch`
results <- tibble(
  name = rep(c("Michael", "Noa", "Obioma"), times = 2),
  what = rep(c("grade", "points"), each = 3),
  howmuch = c(3.7, 1.0, 1.0, 55, 99, 99)
)
results
## # A tibble: 6 x 3
## name what howmuch
## <chr> <chr> <dbl>
## 1 Michael grade 3.70
## 2 Noa grade 1.00
## 3 Obioma grade 1.00
## 4 Michael points 55.0
## 5 Noa points 99.0
## 6 Obioma points 99.0
to tidy up, we need to spread cells from a row out over several columns
# turn every distinct entry of `what` into its own column, filled from `howmuch`
results %>%
  spread(key = what, value = howmuch)
## # A tibble: 3 x 3
## name grade points
## <chr> <dbl> <dbl>
## 1 Michael 3.70 55.
## 2 Noa 1.00 99.
## 3 Obioma 1.00 99.
“Some of the circles are black.”
dummy
# read the CSV file into a tibble; read_csv() comes from the 'readr' package
d <- readr::read_csv(file = "../data/00_typicality_some.csv")
## Parsed with column specification:
## cols(
## id = col_integer(),
## language = col_character(),
## rt = col_integer(),
## type = col_character(),
## response = col_integer(),
## nr_black = col_integer(),
## variant = col_character(),
## comments = col_character()
## )
d
## # A tibble: 5,112 x 8
## id language rt type response nr_black variant comments
## <int> <chr> <int> <chr> <int> <int> <chr> <chr>
## 1 1 English 3930 filler 1 5 C No
## 2 1 English 3108 most 0 5 C No
## 3 1 English 2599 filler 1 8 C No
## 4 1 English 4405 many 1 7 C No
## 5 1 English 2574 some 1 6 C No
## 6 1 English 1917 filler 1 3 C No
## 7 1 English 2471 filler 0 3 C No
## 8 1 English 2495 many 0 6 C No
## 9 1 English 2093 some 1 9 C No
## 10 1 English 1767 filler 0 2 C No
## # ... with 5,102 more rows
# first 20 distinct comments in sorted order
sort(unique(d$comments))[1:20]
## [1] "bonuses always help with a toddler in the home ;)"
## [2] "Cheers."
## [3] "cool!"
## [4] "Easy HIT thanks!"
## [5] "Everything worked fine, thanks"
## [6] "fun"
## [7] "Fun and interactive. Thank you!"
## [8] "fun fun fun"
## [9] "Fun study"
## [10] "Fun study, thanks"
## [11] "fun survey"
## [12] "Fun, thanks!"
## [13] "good hit"
## [14] "Good luck with your research!"
## [15] "great hit"
## [16] "Great hit, good luck with your research."
## [17] "Great HIT!"
## [18] "Great survey. Thank you!"
## [19] "Hi"
## [20] "I accidentally clicked \"false\" on one of the \"some are black\" statements. It was the one where around half were black"
table(d$language)
##
## American English Egnlish Enblish Englashi
## 8 13 8 8
## english English ENGLISH englsih
## 2410 2457 103 26
## englush Enlglish FRENCH Japanese
## 13 8 13 8
## Russian Spanish Tamil white
## 8 8 8 13
# drop every row whose language entry is one of the six listed values,
# then tabulate the entries that remain
d <- d %>%
  dplyr::filter(!(language %in% c("FRENCH", "Japanese", "Russian",
                                  "Spanish", "Tamil", "white")))
table(d$language)
##
## American English Egnlish Enblish Englashi
## 8 13 8 8
## english English ENGLISH englsih
## 2410 2457 103 26
## englush Enlglish
## 13 8
# keep only the rows of type "some", then drop the three columns
# language, comments and type
d <- d %>%
  dplyr::filter(type == "some") %>%
  dplyr::select(-c(language, comments, type))
d
## # A tibble: 1,449 x 5
## id rt response nr_black variant
## <int> <int> <int> <int> <chr>
## 1 1 2574 1 6 C
## 2 1 2093 1 9 C
## 3 1 2543 1 3 C
## 4 2 1857 4 5 B
## 5 2 11454 4 10 B
## 6 2 2053 4 3 B
## 7 3 1479 1 10 D
## 8 3 1640 0 0 D
## 9 3 1199 1 7 D
## 10 4 4828 6 6 B
## # ... with 1,439 more rows
# the column nr_black is called condition from here on
d <- dplyr::rename(d, condition = nr_black)
d
## # A tibble: 1,449 x 5
## id rt response condition variant
## <int> <int> <int> <int> <chr>
## 1 1 2574 1 6 C
## 2 1 2093 1 9 C
## 3 1 2543 1 3 C
## 4 2 1857 4 5 B
## 5 2 11454 4 10 B
## 6 2 2053 4 3 B
## 7 3 1479 1 10 D
## 8 3 1640 0 0 D
## 9 3 1199 1 7 D
## 10 4 4828 6 6 B
## # ... with 1,439 more rows
# derive two columns from the variant code, then drop the code itself:
#   variants A and B -> dependent.measure "ordinal", all others "binary"
#   variants A and C -> alternatives "present",      all others "absent"
d <- d %>%
  dplyr::mutate(
    dependent.measure = ifelse(variant %in% c("A", "B"), "ordinal", "binary"),
    alternatives = factor(
      ifelse(variant %in% c("A", "C"), "present", "absent")
    )
  ) %>%
  dplyr::select(-variant)
d
## # A tibble: 1,449 x 6
## id rt response condition dependent.measure alternatives
## <int> <int> <int> <int> <chr> <fct>
## 1 1 2574 1 6 binary present
## 2 1 2093 1 9 binary present
## 3 1 2543 1 3 binary present
## 4 2 1857 4 5 ordinal absent
## 5 2 11454 4 10 ordinal absent
## 6 2 2053 4 3 ordinal absent
## 7 3 1479 1 10 binary absent
## 8 3 1640 0 0 binary absent
## 9 3 1199 1 7 binary absent
## 10 4 4828 6 6 ordinal absent
## # ... with 1,439 more rows
# Rescale ordinal responses with (response - 1) / 6 and leave binary
# responses unchanged (presumably this maps a 1-7 scale onto [0, 1] -- confirm).
# if_else() operates on whole columns, so no per-row map2_dbl() callback is
# needed; as.numeric() gives both branches the same double type, so the
# column is <dbl> exactly as before.
d <- d %>%
  dplyr::mutate(
    response = dplyr::if_else(
      dependent.measure == "ordinal",
      (response - 1) / 6,
      as.numeric(response)
    )
  )
d
## # A tibble: 1,449 x 6
## id rt response condition dependent.measure alternatives
## <int> <int> <dbl> <int> <chr> <fct>
## 1 1 2574 1.00 6 binary present
## 2 1 2093 1.00 9 binary present
## 3 1 2543 1.00 3 binary present
## 4 2 1857 0.500 5 ordinal absent
## 5 2 11454 0.500 10 ordinal absent
## 6 2 2053 0.500 3 ordinal absent
## 7 3 1479 1.00 10 binary absent
## 8 3 1640 0. 0 binary absent
## 9 3 1199 1.00 7 binary absent
## 10 4 4828 0.833 6 ordinal absent
## # ... with 1,439 more rows
# mean response for each kind of dependent measure
dplyr::summarize(
  dplyr::group_by(d, dependent.measure),
  mean.response = mean(response)
)
## # A tibble: 2 x 2
## dependent.measure mean.response
## <chr> <dbl>
## 1 binary 0.785
## 2 ordinal 0.600
# mean response for every combination of dependent measure, alternatives and
# condition; ungroup() removes the grouping by dependent.measure and
# alternatives that summarize() would otherwise leave on the result, so later
# verbs applied to resp.summary do not silently operate per group
resp.summary <- d %>%
  dplyr::group_by(dependent.measure, alternatives, condition) %>%
  dplyr::summarize(mean.response = mean(response)) %>%
  dplyr::ungroup()
resp.summary
## # A tibble: 44 x 4
## dependent.measure alternatives condition mean.response
## <chr> <fct> <int> <dbl>
## 1 binary absent 0 0.0909
## 2 binary absent 1 0.478
## 3 binary absent 2 0.778
## 4 binary absent 3 0.958
## 5 binary absent 4 0.964
## 6 binary absent 5 1.00
## 7 binary absent 6 0.938
## 8 binary absent 7 0.980
## 9 binary absent 8 0.929
## 10 binary absent 9 0.967
## # ... with 34 more rows
# empty plot: no data, no aesthetics, no layers
ggplot()
# points: mean response against condition
ggplot(
  data = resp.summary,
  mapping = aes(x = condition, y = mean.response)
) +
  geom_point()
# same points, coloured by `alternatives`
ggplot(
  data = resp.summary,
  mapping = aes(x = condition, y = mean.response, color = alternatives)
) +
  geom_point()
# add connecting lines and one panel per dependent measure
ggplot(
  data = resp.summary,
  mapping = aes(x = condition, y = mean.response, color = alternatives)
) +
  geom_point() +
  geom_line() +
  facet_grid(. ~ dependent.measure)
# final version: axis labels, x-axis breaks at 0 through 10, manual colours
ggplot(
  data = resp.summary,
  mapping = aes(x = condition, y = mean.response, color = alternatives)
) +
  geom_point() +
  geom_line() +
  facet_grid(. ~ dependent.measure) +
  labs(x = "number of black balls", y = "mean response") +
  scale_x_continuous(breaks = 0:10) +
  scale_color_manual(values = c("darkgrey", "firebrick"))