library(dplyr) # Data manipulation library(magrittr) # Pipes library(stringr) # For counting substrings library(ggplot2) # Plotting suite
Statistical Programming with R
library(dplyr) # Data manipulation library(magrittr) # Pipes library(stringr) # For counting substrings library(ggplot2) # Plotting suite
With Monte Carlo methods we can compute expectations: what do we expect to happen to a particular sample quantity (such as the mean or a p-value) upon repeated sampling?
It builds upon the principles of inferential statistics and needs:
It works by repetitively sampling from a population while varying the input, but keeping the method consistent.
Who should be most confident about their estimate of delay probability?
In repeated independent experiments
true probability \(p\) of a particular outcome in each experimenttrue probability \(p\)So if we replicate the same procedure an infinite number of times, the difference between our estimate and the true value would be zero.
The experiments must be independent, i.e., the event probability in one trial does not depend on other trials.
set.seed(123) x <- sample(1:6, 10, prob = rep(1/6, 6), replace = TRUE) prop.table(table(x))
## x ## 1 2 3 4 5 6 ## 0.3 0.1 0.1 0.2 0.2 0.1
x <- sample(1:6, 100, prob = rep(1/6, 6), replace = TRUE) prop.table(table(x))
## x ## 1 2 3 4 5 6 ## 0.15 0.17 0.16 0.21 0.14 0.17
x <- sample(1:6, 10000, prob = rep(1/6, 6), replace = TRUE) prop.table(table(x))
## x ## 1 2 3 4 5 6 ## 0.1616 0.1683 0.1663 0.1713 0.1663 0.1662
x <- sample(1:6, 1000000, prob = rep(1/6, 6), replace = TRUE) prop.table(table(x))
## x ## 1 2 3 4 5 6 ## 0.166610 0.167284 0.166763 0.166686 0.166458 0.166199
charx <- paste(x, collapse = "") estprob <- str_count(charx, "123") / 1e6 trueprob <- (1/6)^3 cat(estprob,"\n", trueprob, sep = "")
## 0.004635 ## 0.00462963
# Initialise results object
result <- list()
# repeat K times
for (i in 1:100) {
# generate a dataset of interest
dataset <- sample(1:6, 100000, prob = rep(1/6, 6), replace = TRUE)
# optional: perform your method on this dataset
# Save the values you are interested in
result[[i]] <- dataset
}
# Aggregate your results
probs <- sapply(result, function(x) prop.table(table(x)))
# Display output
rowMeans(probs)
## 1 2 3 4 5 6 ## 0.1665194 0.1666168 0.1668015 0.1666164 0.1666844 0.1667615
result <- list()
for (i in 1:1000) {
result[[i]] <- sample(1:6,
size = 1000,
prob = c(.01, .04, .1, .15, .2, .5),
replace = TRUE)
} # 1000 trials
probs <- sapply(result, function(x) prop.table(table(x)))
rowMeans(probs)
## 1 2 3 4 5 6 ## 0.010030 0.040065 0.099689 0.149490 0.200011 0.500715
The probability of detecting an effect if there is in fact a true effect.
What happens to the power of an independent-samples t-test if we measure on a 5-point scale instead of a continuous outcome?
cuts <- c(-Inf, -1.2, -0.4, 0.4, 1.2, Inf) curve(dnorm, from = -3, to = 3) abline(v = cuts)
set.seed(123) effect_size <- 0.7 treatment <- rnorm(24, mean = effect_size) control <- rnorm(24, mean = 0) treatCut <- as.numeric(cut(treatment, cuts)) contrCut <- as.numeric(cut(control, cuts)) t.test(treatment, control)$p.value
## [1] 0.02382654
t.test(treatCut, contrCut)$p.value
## [1] 0.09583241
How confident are we in this result?
# Initialise results object
result <- matrix(FALSE, nrow = 100000, ncol = 2)
for (i in 1:100000) {
# generate a dataset of interest
treatment <- rnorm(24, .7)
control <- rnorm(24)
treatCut <- as.numeric(cut(treatment, cuts))
contrCut <- as.numeric(cut(control, cuts))
# perform your method on this dataset
treatp <- t.test(treatment, control)$p.value
contrp <- t.test(treatCut, contrCut)$p.value
# Save the values you are interested in
result[i,1] <- treatp < 0.05
result[i,2] <- contrp < 0.05
}
colMeans(result)
## [1] 0.66311 0.62295