library(dplyr) # Data manipulation library(magrittr) # Pipes library(stringr) # For counting substrings library(ggplot2) # Plotting suite
Statistical Programming with R
library(dplyr) # Data manipulation library(magrittr) # Pipes library(stringr) # For counting substrings library(ggplot2) # Plotting suite
With Monte Carlo methods we can compute expectations: what do we expect to happen to a particular sample quantity (such as the mean or a p-value) upon repeated sampling?
It builds upon the principles of inferential statistics and needs:
It works by repetitively sampling from a population while varying the input, but keeping the method consistent.
Who should be most confident about their estimate of delay probability?
In repeated independent
experiments
true
probability \(p\) of a particular outcome in each experimenttrue
probability \(p\)So if we replicate the same procedure an infinite number of times, the difference between our estimate and the true value would be zero.
The experiments must be independent, i.e., the event probability in one trial does not depend on other trials.
set.seed(123) x <- sample(1:6, 10, prob = rep(1/6, 6), replace = TRUE) prop.table(table(x))
## x ## 1 2 3 4 5 6 ## 0.3 0.1 0.1 0.2 0.2 0.1
x <- sample(1:6, 100, prob = rep(1/6, 6), replace = TRUE) prop.table(table(x))
## x ## 1 2 3 4 5 6 ## 0.15 0.17 0.16 0.21 0.14 0.17
x <- sample(1:6, 10000, prob = rep(1/6, 6), replace = TRUE) prop.table(table(x))
## x ## 1 2 3 4 5 6 ## 0.1616 0.1683 0.1663 0.1713 0.1663 0.1662
x <- sample(1:6, 1000000, prob = rep(1/6, 6), replace = TRUE) prop.table(table(x))
## x ## 1 2 3 4 5 6 ## 0.166610 0.167284 0.166763 0.166686 0.166458 0.166199
charx <- paste(x, collapse = "") estprob <- str_count(charx, "123") / 1e6 trueprob <- (1/6)^3 cat(estprob,"\n", trueprob, sep = "")
## 0.004635 ## 0.00462963
# Initialise results object result <- list() # repeat K times for (i in 1:100) { # generate a dataset of interest dataset <- sample(1:6, 100000, prob = rep(1/6, 6), replace = TRUE) # optional: perform your method on this dataset # Save the values you are interested in result[[i]] <- dataset } # Aggregate your results probs <- sapply(result, function(x) prop.table(table(x))) # Display output rowMeans(probs)
## 1 2 3 4 5 6 ## 0.1665194 0.1666168 0.1668015 0.1666164 0.1666844 0.1667615
result <- list() for (i in 1:1000) { result[[i]] <- sample(1:6, size = 1000, prob = c(.01, .04, .1, .15, .2, .5), replace = TRUE) } # 1000 trials probs <- sapply(result, function(x) prop.table(table(x))) rowMeans(probs)
## 1 2 3 4 5 6 ## 0.010030 0.040065 0.099689 0.149490 0.200011 0.500715
The probability of detecting an effect if there is in fact a true effect.
What happens to the power of an independent-samples t-test if we measure on a 5-point scale instead of a continuous outcome?
cuts <- c(-Inf, -1.2, -0.4, 0.4, 1.2, Inf) curve(dnorm, from = -3, to = 3) abline(v = cuts)
set.seed(123) effect_size <- 0.7 treatment <- rnorm(24, mean = effect_size) control <- rnorm(24, mean = 0) treatCut <- as.numeric(cut(treatment, cuts)) contrCut <- as.numeric(cut(control, cuts)) t.test(treatment, control)$p.value
## [1] 0.02382654
t.test(treatCut, contrCut)$p.value
## [1] 0.09583241
How confident are we in this result?
# Initialise results object result <- matrix(FALSE, nrow = 100000, ncol = 2) for (i in 1:100000) { # generate a dataset of interest treatment <- rnorm(24, .7) control <- rnorm(24) treatCut <- as.numeric(cut(treatment, cuts)) contrCut <- as.numeric(cut(control, cuts)) # perform your method on this dataset treatp <- t.test(treatment, control)$p.value contrp <- t.test(treatCut, contrCut)$p.value # Save the values you are interested in result[i,1] <- treatp < 0.05 result[i,2] <- contrp < 0.05 } colMeans(result)
## [1] 0.66311 0.62295