Data manipulation
Basic analysis (correlation & t-test)
Pipes
Statistical Programming with R
Data manipulation
Basic analysis (correlation & t-test)
Pipes
# Packages used throughout these slides (one call per line so each one runs)
library(MASS)     # for the cats data
library(dplyr)    # data manipulation
library(haven)    # in/exporting data
library(magrittr) # pipes
transform(): changing and adding columns
dplyr::filter(): row-wise selection (of cases)
table(): frequency tables
class(): object class
levels(): levels of a factor
order(): data entries in increasing order
haven::read_sav(): import SPSS data
cor(): bivariate correlation
sample(): drawing a sample
t.test(): t-test
head(cats)
## Sex Bwt Hwt ## 1 F 2.0 7.0 ## 2 F 2.0 7.4 ## 3 F 2.0 9.5 ## 4 F 2.1 7.2 ## 5 F 2.1 7.3 ## 6 F 2.1 7.6
str(cats)
## 'data.frame': 144 obs. of 3 variables: ## $ Sex: Factor w/ 2 levels "F","M": 1 1 1 1 1 1 1 1 1 1 ... ## $ Bwt: num 2 2 2 2.1 2.1 2.1 2.1 2.1 2.1 2.1 ... ## $ Hwt: num 7 7.4 9.5 7.2 7.3 7.6 8.1 8.2 8.3 8.5 ...
# Keep only the rows where Sex equals "F", then check the dimensions
fem.cats <- cats[cats$Sex == "F", ]
dim(fem.cats)
## [1] 47 3
head(fem.cats)
## Sex Bwt Hwt ## 1 F 2.0 7.0 ## 2 F 2.0 7.4 ## 3 F 2.0 9.5 ## 4 F 2.1 7.2 ## 5 F 2.1 7.3 ## 6 F 2.1 7.6
# Select the rows with Bwt above 3 through logical row indexing
heavy.cats <- cats[cats$Bwt > 3, ]
dim(heavy.cats)
## [1] 36 3
head(heavy.cats)
## Sex Bwt Hwt ## 109 M 3.1 9.9 ## 110 M 3.1 11.5 ## 111 M 3.1 12.1 ## 112 M 3.1 12.5 ## 113 M 3.1 13.0 ## 114 M 3.1 14.3
# Same selection with subset(): the condition is evaluated inside `cats`
heavy.cats <- subset(cats, Bwt > 3)
dim(heavy.cats)
## [1] 36 3
head(heavy.cats)
## Sex Bwt Hwt ## 109 M 3.1 9.9 ## 110 M 3.1 11.5 ## 111 M 3.1 12.1 ## 112 M 3.1 12.5 ## 113 M 3.1 13.0 ## 114 M 3.1 14.3
dplyr
filter(cats, Bwt > 2, Bwt < 2.2, Sex == "F")
## Sex Bwt Hwt ## 1 F 2.1 7.2 ## 2 F 2.1 7.3 ## 3 F 2.1 7.6 ## 4 F 2.1 8.1 ## 5 F 2.1 8.2 ## 6 F 2.1 8.3 ## 7 F 2.1 8.5 ## 8 F 2.1 8.7 ## 9 F 2.1 9.8
class(cats$Sex)
## [1] "factor"
levels(cats$Sex)
## [1] "F" "M"
# Relabel the two factor levels ("F", "M") and tabulate the result
levels(cats$Sex) <- c("Female", "Male")
table(cats$Sex)
## ## Female Male ## 47 97
head(cats)
## Sex Bwt Hwt ## 1 Female 2.0 7.0 ## 2 Female 2.0 7.4 ## 3 Female 2.0 9.5 ## 4 Female 2.1 7.2 ## 5 Female 2.1 7.3 ## 6 Female 2.1 7.6
# order() returns the row positions that sort Bwt increasingly;
# indexing with them reorders the whole data frame
sorted.cats <- cats[order(cats$Bwt), ]
head(sorted.cats)
## Sex Bwt Hwt ## 1 Female 2.0 7.0 ## 2 Female 2.0 7.4 ## 3 Female 2.0 9.5 ## 48 Male 2.0 6.5 ## 49 Male 2.0 6.5 ## 4 Female 2.1 7.2
# Bind the two numeric columns into a matrix with new column names.
# NOTE(review): Hwt is the heart weight elsewhere in these slides, so the
# label "Height" looks like a misnomer; it is kept to match the shown output.
cats.numbers <- cbind(Weight = cats$Bwt, Height = cats$Hwt)
head(cats.numbers)
## Weight Height ## [1,] 2.0 7.0 ## [2,] 2.0 7.4 ## [3,] 2.0 9.5 ## [4,] 2.1 7.2 ## [5,] 2.1 7.3 ## [6,] 2.1 7.6
rbind(cats[1:3, ], cats[1:5, ])
## Sex Bwt Hwt ## 1 Female 2.0 7.0 ## 2 Female 2.0 7.4 ## 3 Female 2.0 9.5 ## 4 Female 2.0 7.0 ## 5 Female 2.0 7.4 ## 6 Female 2.0 9.5 ## 7 Female 2.1 7.2 ## 8 Female 2.1 7.3
cor(cats[, -1])
## Bwt Hwt ## Bwt 1.0000000 0.8041274 ## Hwt 0.8041274 1.0000000
With [, -1] we exclude the first column (Sex), which is a factor and cannot be correlated.
cor.test(cats$Bwt, cats$Hwt)
## ## Pearson's product-moment correlation ## ## data: cats$Bwt and cats$Hwt ## t = 16.119, df = 142, p-value < 2.2e-16 ## alternative hypothesis: true correlation is not equal to 0 ## 95 percent confidence interval: ## 0.7375682 0.8552122 ## sample estimates: ## cor ## 0.8041274
What do we conclude?
plot(cats$Bwt, cats$Hwt)
Test the null hypothesis that the difference in mean heart weight between male and female cats is 0
t.test(formula = Hwt ~ Sex, data = cats)
## ## Welch Two Sample t-test ## ## data: Hwt by Sex ## t = -6.5179, df = 140.61, p-value = 1.186e-09 ## alternative hypothesis: true difference in means is not equal to 0 ## 95 percent confidence interval: ## -2.763753 -1.477352 ## sample estimates: ## mean in group Female mean in group Male ## 9.202128 11.322680
plot(formula = Hwt ~ Sex, data = cats)
boys <- read_sav("boys.sav") %>% head()
This effectively replaces head(read_sav("boys.sav")).
Let’s assume that we want to load data, change a variable, filter cases and select columns. Without a pipe, this would look like
# Step-by-step version: every stage is stored in its own intermediate object
boys  <- read_sav("boys.sav")
boys2 <- transform(boys, hgt = hgt / 100)
boys3 <- filter(boys2, age > 15)
boys4 <- subset(boys3, select = c(hgt, wgt, bmi))
With the pipe:
# The same workflow as one pipeline: import, rescale hgt, filter, select
boys <-
  read_sav("boys.sav") %>%
  transform(hgt = hgt / 100) %>%
  filter(age > 15) %>%
  subset(select = c(hgt, wgt, bmi))
Benefit: a single object in memory that is easy to interpret
Your code becomes more readable:
f(x) becomes x %>% f()
rnorm(10) %>% mean()
## [1] 0.4798629
f(x, y) becomes x %>% f(y)
boys %>% cor(use = "pairwise.complete.obs")
## hgt wgt bmi ## hgt 1.0000000 0.6100784 0.1758781 ## wgt 0.6100784 1.0000000 0.8841304 ## bmi 0.1758781 0.8841304 1.0000000
h(g(f(x))) becomes x %>% f %>% g %>% h
boys %>% subset(select = wgt) %>% na.omit() %>% max()
## [1] 117.4
nrow(cats)
## [1] 144
# Keep the rows whose Hwt lies strictly within 3 SDs of the mean, then recount
cats.outl <- cats %>%
  filter(
    Hwt < mean(Hwt) + 3 * sd(Hwt),
    Hwt > mean(Hwt) - 3 * sd(Hwt)
  )
nrow(cats.outl)
## [1] 143
cats %>% filter(Hwt > mean(Hwt) + 3 * sd(Hwt))
## Sex Bwt Hwt ## 1 Male 3.9 20.5
%>% pipe
%$% pipe
%T>% pipe
. in a pipe
In a %>% b(arg1, arg2, arg3), a will become arg1. With . we can change this.
set.seed(123)
# mean(1:5) is 3 and lands in rnorm()'s first argument (n): rnorm(3, 10)
1:5 %>%
  mean() %>%
  rnorm(10)
## [1] 9.439524 9.769823 11.558708
VS
set.seed(123)
# The `.` placeholder routes the piped value (3) to `mean` instead of `n`
1:5 %>%
  mean() %>%
  rnorm(n = 10, mean = .)
## [1] 2.439524 2.769823 4.558708 3.070508 3.129288 4.715065 3.460916 ## [8] 1.734939 2.313147 2.554338
The . can be used as a placeholder in the pipe.
Remember: sample() takes a random sample from a vector
sample(x = c(1, 1, 2, 3, 5, 8), size = 2)
## [1] 1 3
Sample 3 positions from the alphabet and show the position and the letter
set.seed(123)
# `.` appears twice: as the sampled positions and as the index into LETTERS
1:26 %>%
  sample(3) %>%
  paste(., LETTERS[.])
## [1] "15 O" "19 S" "14 N"
If you don’t know what’s going on, run each statement separately!
# Step 1: the input vector on its own
set.seed(123)
1:26
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 ## [24] 24 25 26
# Step 2: draw three positions from the vector
set.seed(123)
1:26 %>%
  sample(3)
## [1] 15 19 14
# Step 3: paste each sampled position next to the letter at that position
set.seed(123)
1:26 %>%
  sample(3) %>%
  paste(., LETTERS[.])
## [1] "15 O" "19 S" "14 N"
cats %$% t.test(Hwt ~ Sex)
## ## Welch Two Sample t-test ## ## data: Hwt by Sex ## t = -6.5179, df = 140.61, p-value = 1.186e-09 ## alternative hypothesis: true difference in means is not equal to 0 ## 95 percent confidence interval: ## -2.763753 -1.477352 ## sample estimates: ## mean in group Female mean in group Male ## 9.202128 11.322680
is the same as
t.test(Hwt ~ Sex, data = cats)
# Store the test result in an object, then print it
cats.test <- cats %$% t.test(Bwt ~ Sex)
cats.test
## ## Welch Two Sample t-test ## ## data: Bwt by Sex ## t = -8.7095, df = 136.84, p-value = 8.831e-15 ## alternative hypothesis: true difference in means is not equal to 0 ## 95 percent confidence interval: ## -0.6631268 -0.4177242 ## sample estimates: ## mean in group Female mean in group Male ## 2.359574 2.900000