Data manipulation
Basic analysis (correlation & t-test)
Pipes
Statistical Programming in R
Data manipulation
Basic analysis (correlation & t-test)
Pipes
library(MASS) # for the cats data library(dplyr) # data manipulation library(haven) # in/exporting data library(magrittr) # pipes library(ggplot2) # Plotting device
library(mice) # missing data
## ## Attaching package: 'mice'
## The following object is masked from 'package:stats': ## ## filter
## The following objects are masked from 'package:base': ## ## cbind, rbind
head(cats)
## Sex Bwt Hwt ## 1 F 2.0 7.0 ## 2 F 2.0 7.4 ## 3 F 2.0 9.5 ## 4 F 2.1 7.2 ## 5 F 2.1 7.3 ## 6 F 2.1 7.6
str(cats)
## 'data.frame': 144 obs. of 3 variables: ## $ Sex: Factor w/ 2 levels "F","M": 1 1 1 1 1 1 1 1 1 1 ... ## $ Bwt: num 2 2 2 2.1 2.1 2.1 2.1 2.1 2.1 2.1 ... ## $ Hwt: num 7 7.4 9.5 7.2 7.3 7.6 8.1 8.2 8.3 8.5 ...
fem.cats <- cats[cats$Sex == "F", ] dim(fem.cats)
## [1] 47 3
head(fem.cats)
## Sex Bwt Hwt ## 1 F 2.0 7.0 ## 2 F 2.0 7.4 ## 3 F 2.0 9.5 ## 4 F 2.1 7.2 ## 5 F 2.1 7.3 ## 6 F 2.1 7.6
heavy.cats <- cats[cats$Bwt > 3, ] dim(heavy.cats)
## [1] 36 3
head(heavy.cats)
## Sex Bwt Hwt ## 109 M 3.1 9.9 ## 110 M 3.1 11.5 ## 111 M 3.1 12.1 ## 112 M 3.1 12.5 ## 113 M 3.1 13.0 ## 114 M 3.1 14.3
heavy.cats <- subset(cats, Bwt > 3) dim(heavy.cats)
## [1] 36 3
head(heavy.cats)
## Sex Bwt Hwt ## 109 M 3.1 9.9 ## 110 M 3.1 11.5 ## 111 M 3.1 12.1 ## 112 M 3.1 12.5 ## 113 M 3.1 13.0 ## 114 M 3.1 14.3
dplyr
filter(cats, Bwt > 2, Bwt < 2.2, Sex == "F")
## Sex Bwt Hwt ## 1 F 2.1 7.2 ## 2 F 2.1 7.3 ## 3 F 2.1 7.6 ## 4 F 2.1 8.1 ## 5 F 2.1 8.2 ## 6 F 2.1 8.3 ## 7 F 2.1 8.5 ## 8 F 2.1 8.7 ## 9 F 2.1 9.8
class(cats$Sex)
## [1] "factor"
levels(cats$Sex)
## [1] "F" "M"
levels(cats$Sex) <- c("Female", "Male") table(cats$Sex)
## ## Female Male ## 47 97
head(cats)
## Sex Bwt Hwt ## 1 Female 2.0 7.0 ## 2 Female 2.0 7.4 ## 3 Female 2.0 9.5 ## 4 Female 2.1 7.2 ## 5 Female 2.1 7.3 ## 6 Female 2.1 7.6
lm(Hwt ~ Bwt + Sex, data = cats)
## ## Call: ## lm(formula = Hwt ~ Bwt + Sex, data = cats) ## ## Coefficients: ## (Intercept) Bwt SexMale ## -0.4150 4.0758 -0.0821
cats$Sex <- relevel(cats$Sex, ref = "Male") lm(Hwt ~ Bwt + Sex, data = cats)
## ## Call: ## lm(formula = Hwt ~ Bwt + Sex, data = cats) ## ## Coefficients: ## (Intercept) Bwt SexFemale ## -0.4970 4.0758 0.0821
order(cats$Bwt)
## [1] 1 2 3 48 49 4 5 6 7 8 9 10 11 12 50 13 14 15 ## [19] 16 17 18 51 52 53 54 55 56 57 58 19 20 21 22 23 24 25 ## [37] 26 27 28 29 30 59 31 32 33 34 60 61 62 63 64 35 36 65 ## [55] 66 67 68 69 70 71 72 37 38 39 73 74 75 76 77 78 40 41 ## [73] 42 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 43 ## [91] 44 45 95 96 97 98 99 46 47 100 101 102 103 104 105 106 107 108 ## [109] 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 ## [127] 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
sorted.cats <- cats[order(cats$Bwt), ] head(sorted.cats)
## Sex Bwt Hwt ## 1 Female 2.0 7.0 ## 2 Female 2.0 7.4 ## 3 Female 2.0 9.5 ## 48 Male 2.0 6.5 ## 49 Male 2.0 6.5 ## 4 Female 2.1 7.2
arrange()
sorted.cats2 <- arrange(cats, Bwt) head(sorted.cats)
## Sex Bwt Hwt ## 1 Female 2.0 7.0 ## 2 Female 2.0 7.4 ## 3 Female 2.0 9.5 ## 48 Male 2.0 6.5 ## 49 Male 2.0 6.5 ## 4 Female 2.1 7.2
head(sorted.cats2)
## Sex Bwt Hwt ## 1 Female 2.0 7.0 ## 2 Female 2.0 7.4 ## 3 Female 2.0 9.5 ## 4 Male 2.0 6.5 ## 5 Male 2.0 6.5 ## 6 Female 2.1 7.2
arrange()
cats.2 <- arrange(cats, Bwt, desc(Hwt)) head(cats.2, n = 10)
## Sex Bwt Hwt ## 1 Female 2.0 9.5 ## 2 Female 2.0 7.4 ## 3 Female 2.0 7.0 ## 4 Male 2.0 6.5 ## 5 Male 2.0 6.5 ## 6 Male 2.1 10.1 ## 7 Female 2.1 9.8 ## 8 Female 2.1 8.7 ## 9 Female 2.1 8.5 ## 10 Female 2.1 8.3
arrange()
cats.2 <- arrange(cats, desc(Hwt), Bwt) head(cats.2, n = 10)
## Sex Bwt Hwt ## 1 Male 3.9 20.5 ## 2 Male 3.5 17.2 ## 3 Male 3.8 16.8 ## 4 Male 3.5 15.7 ## 5 Male 3.5 15.6 ## 6 Male 3.3 15.4 ## 7 Male 3.6 15.0 ## 8 Male 3.3 14.9 ## 9 Male 3.6 14.8 ## 10 Male 3.8 14.8
cor(cats[, -1])
## Bwt Hwt ## Bwt 1.0000000 0.8041274 ## Hwt 0.8041274 1.0000000
With [, -1]
we exclude the first column
cor.test(cats$Bwt, cats$Hwt)
## ## Pearson's product-moment correlation ## ## data: cats$Bwt and cats$Hwt ## t = 16.119, df = 142, p-value < 2.2e-16 ## alternative hypothesis: true correlation is not equal to 0 ## 95 percent confidence interval: ## 0.7375682 0.8552122 ## sample estimates: ## cor ## 0.8041274
What do we conclude?
plot(cats$Bwt, cats$Hwt)
Test the null hypothesis that the difference in mean heart weight between male and female cats is 0
t.test(formula = Hwt ~ Sex, data = cats)
## ## Welch Two Sample t-test ## ## data: Hwt by Sex ## t = 6.5179, df = 140.61, p-value = 1.186e-09 ## alternative hypothesis: true difference in means between group Male and group Female is not equal to 0 ## 95 percent confidence interval: ## 1.477352 2.763753 ## sample estimates: ## mean in group Male mean in group Female ## 11.322680 9.202128
plot(formula = Hwt ~ Sex, data = cats)
boys <- read_sav("boys.sav") %>% head()
It effectively replaces head(read_sav("boys.sav"))
.
Let’s assume that we want to load data, change a variable, filter cases and select columns. Without a pipe, this would look like
boys <- read_sav("boys.sav") boys2 <- transform(boys, hgt = hgt / 100) boys3 <- filter(boys2, age > 15) boys4 <- subset(boys3, select = c(hgt, wgt, bmi))
With the pipe:
boys <- read_sav("boys.sav") %>% transform(hgt = hgt/100) %>% filter(age > 15) %>% subset(select = c(hgt, wgt, bmi))
Benefit: a single object in memory that is easy to interpret
Your code becomes more readable:
boys %>% head()
## hgt wgt bmi ## 1 1.880 91.6 25.91 ## 2 1.806 58.0 17.78 ## 3 1.855 62.7 18.22 ## 4 1.785 54.1 16.97 ## 5 1.725 64.0 21.50 ## 6 1.781 74.5 23.48
f(x)
becomes x %>% f()
rnorm(10) %>% mean()
## [1] 0.5604495
f(x, y)
becomes x %>% f(y)
boys %>% cor(use = "pairwise.complete.obs")
## hgt wgt bmi ## hgt 1.0000000 0.6100784 0.1758781 ## wgt 0.6100784 1.0000000 0.8841304 ## bmi 0.1758781 0.8841304 1.0000000
h(g(f(x)))
becomes x %>% f %>% g %>% h
boys %>% subset(select = wgt) %>% na.omit() %>% max()
## [1] 117.4
%>%
pipe%$%
pipe%T>%
pipe.
in a pipeIn a %>% b(arg1, arg2, arg3)
, a
will become arg1
. With .
we can change this.
set.seed(123) 1:5 %>% mean() %>% rnorm(10)
## [1] 9.439524 9.769823 11.558708
VS
set.seed(123) 1:5 %>% mean() %>% rnorm(n = 10, mean = .)
## [1] 2.439524 2.769823 4.558708 3.070508 3.129288 4.715065 3.460916 1.734939 ## [9] 2.313147 2.554338
The .
can be used as a placeholder in the pipe.
Remember: sample()
takes a random sample from a vector
sample(x = c(1, 1, 2, 3, 5, 8), size = 2)
## [1] 1 3
Sample 3 positions from the alphabet and show the position and the letter
set.seed(123) 1:26 %>% sample(3) %>% paste(., LETTERS[.])
## [1] "15 O" "19 S" "14 N"
If you don’t know what’s going on, run each statement separately!
set.seed(123) 1:26
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 ## [26] 26
set.seed(123) 1:26 %>% sample(3)
## [1] 15 19 14
set.seed(123) 1:26 %>% sample(3) %>% paste(., LETTERS[.])
## [1] "15 O" "19 S" "14 N"
cats %$% t.test(Hwt ~ Sex)
## ## Welch Two Sample t-test ## ## data: Hwt by Sex ## t = 6.5179, df = 140.61, p-value = 1.186e-09 ## alternative hypothesis: true difference in means between group Male and group Female is not equal to 0 ## 95 percent confidence interval: ## 1.477352 2.763753 ## sample estimates: ## mean in group Male mean in group Female ## 11.322680 9.202128
is the same as
t.test(Hwt ~ Sex, data = cats)
cats.test <- cats %$% t.test(Bwt ~ Sex) cats.test
## ## Welch Two Sample t-test ## ## data: Bwt by Sex ## t = 8.7095, df = 136.84, p-value = 8.831e-15 ## alternative hypothesis: true difference in means between group Male and group Female is not equal to 0 ## 95 percent confidence interval: ## 0.4177242 0.6631268 ## sample estimates: ## mean in group Male mean in group Female ## 2.900000 2.359574
boys %<>% arrange(desc(bmi)) head(boys)
## hgt wgt bmi ## 1 1.923 117.4 31.74 ## 2 1.740 94.9 31.34 ## 3 1.825 102.0 30.62 ## 4 1.943 113.0 29.93 ## 5 1.809 94.4 28.84 ## 6 1.808 93.8 28.69
boys %$% lm(bmi ~ .*., data = .)
## ## Call: ## lm(formula = bmi ~ . * ., data = .) ## ## Coefficients: ## (Intercept) hgt wgt hgt:wgt ## 11.5686 -6.3493 0.7470 -0.2442
hist()
: histogramplot()
: R’s plotting devicebarplot()
: bar plot functionboxplot()
: box plot functiondensity()
: function that calculates the densityggplot()
: ggplot’s plotting deviceSource: Anscombe, F. J. (1973). “Graphs in Statistical Analysis”. American Statistician. 27 (1): 17–21.
Source: https://www.autodeskresearch.com/publications/samestats
base
graphics in R
ggplot2
graphicshist(boys$hgt, main = "Histogram", xlab = "Height")
dens <- density(boys$hgt, na.rm = TRUE) plot(dens, main = "Density plot", xlab = "Height", bty = "L")
plot(x = boys$hgt, y = boys$wgt, main = "Scatter plot", xlab = "Height", ylab = "Weight", bty = "L")
boxplot(boys$hgt ~ boys$reg, main = "Boxplot", xlab = "Region", ylab = "Height")
boxplot(hgt ~ reg, boys, main = "Boxplot", xlab = "Region", ylab = "Height", lwd = 2, notch = TRUE, col = rainbow(5))
boys %>% md.pattern(rotate.names = TRUE) # from mice
## age reg wgt hgt bmi hc gen phb tv ## 223 1 1 1 1 1 1 1 1 1 0 ## 19 1 1 1 1 1 1 1 1 0 1 ## 1 1 1 1 1 1 1 1 0 1 1 ## 1 1 1 1 1 1 1 0 1 0 2 ## 437 1 1 1 1 1 1 0 0 0 3 ## 43 1 1 1 1 1 0 0 0 0 4 ## 16 1 1 1 0 0 1 0 0 0 5 ## 1 1 1 1 0 0 0 0 0 0 6 ## 1 1 1 0 1 0 1 0 0 0 5 ## 1 1 1 0 0 0 1 1 1 1 3 ## 1 1 1 0 0 0 0 1 1 1 4 ## 1 1 1 0 0 0 0 0 0 0 7 ## 3 1 0 1 1 1 1 0 0 0 4 ## 0 3 4 20 21 46 503 503 522 1622
plot()
methodresult <- lm(age~wgt, boys) plot(result, which = 1)
plot()
methodresult <- lm(age~wgt, boys) plot(result, which = 2)
plot()
methodresult <- lm(age~wgt, boys) plot(result, which = 5)
ggplot2
?Layered plotting based on the book The Grammer of Graphics by Leland Wilkinsons.
With ggplot2
you
ggplot2
then takes care of the details
1: Provide the data
boys %>% ggplot()
2: map variable to aesthetics
boys %>% ggplot(aes(x = age, y = bmi))
3: state which geometric object to display
boys %>% ggplot(aes(x = age, y = bmi)) + geom_point()
Create the plot
gg <- boys %>% ggplot(aes(x = age, y = bmi)) + geom_point(col = "dark green")
Add another layer (smooth fit line)
gg <- gg + geom_smooth(col = "dark blue")
Give it some labels and a nice look
gg <- gg + labs(x = "Age", y = "BMI", title = "BMI trend for boys") + theme_minimal()
plot(gg)
gg <- boys %>% filter(!is.na(reg)) %>% ggplot(aes(x = age, y = bmi, size = hc, colour = reg)) + geom_point(alpha = 0.5) + labs(title = "BMI trend for boys", x = "Age", y = "BMI", size = "Head circumference", colour = "Region") + theme_minimal()
plot(gg)
geom_point
geom_bar
geom_line
geom_smooth
geom_histogram
geom_boxplot
geom_density