From slides
# Per-chick average growth: weight gained above the chick's minimum weight,
# divided by elapsed Time. At Time == 0 this is 0 / 0, which is why the first
# row of the output is NaN.
ChickWeight |>
  group_by(Chick) |>
  mutate(
    growth = (weight - min(weight)) / Time
  )
## # A tibble: 578 × 5
## # Groups: Chick [50]
## weight Time Chick Diet growth
## <dbl> <dbl> <ord> <fct> <dbl>
## 1 42 0 1 1 NaN
## 2 51 2 1 1 4.5
## 3 59 4 1 1 4.25
## 4 64 6 1 1 3.67
## 5 76 8 1 1 4.25
## 6 93 10 1 1 5.1
## 7 106 12 1 1 5.33
## 8 125 14 1 1 5.93
## 9 149 16 1 1 6.69
## 10 171 18 1 1 7.17
## # ℹ 568 more rows
Try single = for logical comparison and see what happens!
# Mean weight at Time 10 for each Diet. `==` is the comparison operator;
# .groups = "drop" returns an ungrouped tibble.
ChickWeight |>
  filter(Time == 10) |>
  group_by(Diet) |>
  summarize(
    weight_10 = mean(weight),
    .groups = "drop"
  )
## # A tibble: 4 × 2
## Diet weight_10
## <fct> <dbl>
## 1 1 93.1
## 2 2 108.
## 3 3 117.
## 4 4 126
Use the dplyr functions: mutate, filter, group_by, summarize, select, arrange to answer the following questions.
How many species of penguin are in the dataset `penguins`? How many observations of each species?
penguins |> count(species)
## # A tibble: 3 × 2
## species n
## <fct> <int>
## 1 Adelie 152
## 2 Chinstrap 68
## 3 Gentoo 124
# Naive flipper-length range: missing values propagate through max() and
# min(), so both summaries come back NA.
penguins |>
  summarize(
    max_flipper = max(flipper_length_mm),
    min_flipper = min(flipper_length_mm)
  )
## # A tibble: 1 × 2
## max_flipper min_flipper
## <int> <int>
## 1 NA NA
# Same summary, ignoring missing values via na.rm = TRUE. n() still counts
# every row, including rows whose flipper length is missing.
penguins |>
  summarize(
    max_flipper = max(flipper_length_mm, na.rm = TRUE),
    min_flipper = min(flipper_length_mm, na.rm = TRUE),
    count = n()
  )
## # A tibble: 1 × 3
## max_flipper min_flipper count
## <int> <int> <int>
## 1 231 172 344
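Use `na.omit()` or `filter(!is.na())` to remove missing data. Note that `na.omit()` drops every row with a missing value in any column, while `filter(!is.na(x))` only drops rows where `x` is missing, so the two approaches can leave different row counts.
penguins |> filter(!is.na(flipper_length_mm)) |>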
summarize(max_flipper = max(flipper_length_mm),
min_flipper = min(flipper_length_mm),
count = n())
## # A tibble: 1 × 3
## max_flipper min_flipper count
## <int> <int> <int>
## 1 231 172 342
# na.omit() removes every row that has an NA in any column before the
# summary is computed, so the row count reflects complete cases only.
penguins |>
  na.omit() |>
  summarize(
    max_flipper = max(flipper_length_mm),
    min_flipper = min(flipper_length_mm),
    count = n()
  )
## # A tibble: 1 × 3
## max_flipper min_flipper count
## <int> <int> <int>
## 1 231 172 333
# Flipper-length summaries for each species/sex combination, after dropping
# rows with a missing flipper length. Rows with a missing sex form their own
# <NA> group.
penguins |>
  filter(!is.na(flipper_length_mm)) |>
  group_by(species, sex) |>
  summarize(
    max_flipper = max(flipper_length_mm),
    median_flipper = median(flipper_length_mm),
    min_flipper = min(flipper_length_mm),
    count = n()
  )
## `summarise()` has grouped output by 'species'. You can override using the
## `.groups` argument.
## # A tibble: 8 × 6
## # Groups: species [3]
## species sex max_flipper median_flipper min_flipper count
## <fct> <fct> <int> <dbl> <int> <int>
## 1 Adelie female 202 188 172 73
## 2 Adelie male 210 193 178 73
## 3 Adelie <NA> 193 186 179 5
## 4 Chinstrap female 202 192 178 34
## 5 Chinstrap male 212 200. 187 34
## 6 Gentoo female 222 212 203 58
## 7 Gentoo male 231 221 208 61
## 8 Gentoo <NA> 217 216 214 4
# Without na.rm = TRUE, max() returns NA because the column has missing values.
penguins |>
  summarize(
    max_flipper_length_mm = max(flipper_length_mm)
  )
## # A tibble: 1 × 1
## max_flipper_length_mm
## <int>
## 1 NA
# Overall maximum flipper length, ignoring missing values.
penguins |>
  summarize(
    max_flipper_length_mm = max(flipper_length_mm, na.rm = TRUE)
  )
## # A tibble: 1 × 1
## max_flipper_length_mm
## <int>
## 1 231
# Keep the row(s) whose flipper length equals the overall maximum, so all
# columns of that observation are shown rather than just the maximum itself.
penguins |>
  filter(
    flipper_length_mm == max(flipper_length_mm, na.rm = TRUE)
  )
## # A tibble: 1 × 8
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Gentoo Biscoe 54.3 15.7 231 5650
## # ℹ 2 more variables: sex <fct>, year <int>
# With a grouped tibble, max() inside filter() is evaluated per group, so this
# keeps the longest-flippered row(s) for each species/island combination.
penguins |>
  group_by(species, island) |>
  filter(
    flipper_length_mm == max(flipper_length_mm, na.rm = TRUE)
  )
## # A tibble: 5 × 8
## # Groups: species, island [5]
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Dream 40.8 18.9 208 4300
## 2 Adelie Biscoe 41 20 203 4725
## 3 Adelie Torgersen 44.1 18 210 4000
## 4 Gentoo Biscoe 54.3 15.7 231 5650
## 5 Chinstrap Dream 49 19.6 212 4300
## # ℹ 2 more variables: sex <fct>, year <int>
# Longest-flippered row(s) within each species.
penguins |>
  group_by(species) |>
  filter(
    flipper_length_mm == max(flipper_length_mm, na.rm = TRUE)
  )
## # A tibble: 3 × 8
## # Groups: species [3]
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Torgersen 44.1 18 210 4000
## 2 Gentoo Biscoe 54.3 15.7 231 5650
## 3 Chinstrap Dream 49 19.6 212 4300
## # ℹ 2 more variables: sex <fct>, year <int>
# Longest-flippered row(s) on each island.
penguins |>
  group_by(island) |>
  filter(
    flipper_length_mm == max(flipper_length_mm, na.rm = TRUE)
  )
## # A tibble: 3 × 8
## # Groups: island [3]
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Torgersen 44.1 18 210 4000
## 2 Gentoo Biscoe 54.3 15.7 231 5650
## 3 Chinstrap Dream 49 19.6 212 4300
## # ℹ 2 more variables: sex <fct>, year <int>
# Number of observations for every island/species combination that occurs
# in the data ("long" format: one row per combination).
penguins |>
  count(
    island,
    species
  )
## # A tibble: 5 × 3
## island species n
## <fct> <fct> <int>
## 1 Biscoe Adelie 44
## 2 Biscoe Gentoo 124
## 3 Dream Adelie 56
## 4 Dream Chinstrap 68
## 5 Torgersen Adelie 52
We will talk next week about shifting this “long” format table into a “wide” format table that has one row per island and one column per species.
# Histogram of the bill length-to-depth ratio over all penguins.
penguins |>
  mutate(ratio = bill_length_mm / bill_depth_mm) |>
  # Optional step, left disabled: keep only the columns involved.
  # select(species, island, bill_length_mm, bill_depth_mm, ratio) |>
  ggplot(aes(x = ratio)) +
  geom_histogram(bins = 25)
## Warning: Removed 2 rows containing non-finite values (`stat_bin()`).
# Same histogram, with bars filled according to species.
penguins |>
  mutate(ratio = bill_length_mm / bill_depth_mm) |>
  # Optional step, left disabled: keep only the columns involved.
  # select(species, island, bill_length_mm, bill_depth_mm, ratio) |>
  ggplot(aes(x = ratio, fill = species)) +
  geom_histogram(bins = 25)
## Warning: Removed 2 rows containing non-finite values (`stat_bin()`).
# Same histogram, with bars filled according to island.
penguins |>
  mutate(ratio = bill_length_mm / bill_depth_mm) |>
  # Optional step, left disabled: keep only the columns involved.
  # select(species, island, bill_length_mm, bill_depth_mm, ratio) |>
  ggplot(aes(x = ratio, fill = island)) +
  geom_histogram(bins = 25)
## Warning: Removed 2 rows containing non-finite values (`stat_bin()`).
# Body-mass statistics per species and sex. Later arguments of summarize()
# can refer to columns created earlier in the same call, so the standard
# error and coefficient of variation reuse sd_body_mass, n and mean_body_mass.
penguins |>
  filter(!is.na(body_mass_g)) |>
  group_by(species, sex) |>
  summarize(
    mean_body_mass = mean(body_mass_g),
    sd_body_mass = sd(body_mass_g),
    n = n(),
    se_mean_body_mass = sd_body_mass / sqrt(n),
    cv_body_mass = sd_body_mass / mean_body_mass
  )
## `summarise()` has grouped output by 'species'. You can override using the
## `.groups` argument.
## # A tibble: 8 × 7
## # Groups: species [3]
## species sex mean_body_mass sd_body_mass n se_mean_body_mass cv_body_mass
## <fct> <fct> <dbl> <dbl> <int> <dbl> <dbl>
## 1 Adelie fema… 3369. 269. 73 31.5 0.0800
## 2 Adelie male 4043. 347. 73 40.6 0.0858
## 3 Adelie <NA> 3540 477. 5 213. 0.135
## 4 Chinst… fema… 3527. 285. 34 48.9 0.0809
## 5 Chinst… male 3939. 362. 34 62.1 0.0919
## 6 Gentoo fema… 4680. 282. 58 37.0 0.0602
## 7 Gentoo male 5485. 313. 61 40.1 0.0571
## 8 Gentoo <NA> 4588. 338. 4 169. 0.0737
(To get the lowest values instead of the highest, use `slice_tail` instead of `slice_head`, or remove the `-` in the `arrange` function.)
diamonds |>
mutate(price_per_carat = price / carat) |>
group_by(color, clarity, cut) |>
summarise(median_price_per_carat = median(price_per_carat),
n = n(),
.groups = "drop") |>
arrange(-median_price_per_carat) |>
group_by(cut) |>
slice_head(n=2) |>
# filter(median_price_per_carat == max(median_price_per_carat)) |>
ungroup() |>
arrange(-median_price_per_carat)
## # A tibble: 10 × 5
## color clarity cut median_price_per_carat n
## <ord> <ord> <ord> <dbl> <int>
## 1 D IF Good 14932. 9
## 2 D IF Premium 11057. 10
## 3 D IF Very Good 10202. 23
## 4 D IF Ideal 7162. 28
## 5 J VVS1 Premium 5336. 24
## 6 J VVS2 Very Good 5227. 29
## 7 H IF Good 5100. 4
## 8 E VVS1 Fair 4921. 3
## 9 G VS2 Fair 4838 45
## 10 H SI1 Ideal 4469. 763
How many species of penguin are in the dataset `penguins`? How many observations of each species?
# This pipeline and the two that follow all produce the same per-species
# counts; this one spells out group_by() + summarize().
penguins |>
  group_by(species) |>
  summarize(n = n())
## # A tibble: 3 × 2
## species n
## <fct> <int>
## 1 Adelie 152
## 2 Chinstrap 68
## 3 Gentoo 124
# Same counts via group_by() + count(); unlike the other variants, the result
# is still grouped by species.
penguins |>
  group_by(species) |>
  count()
## # A tibble: 3 × 2
## # Groups: species [3]
## species n
## <fct> <int>
## 1 Adelie 152
## 2 Chinstrap 68
## 3 Gentoo 124
# count() groups and tallies in one step. The first statement stores the
# table in count_table; the second prints the same table.
count_table <- penguins |>
  count(species)
penguins |>
  count(species)
## # A tibble: 3 × 2
## species n
## <fct> <int>
## 1 Adelie 152
## 2 Chinstrap 68
## 3 Gentoo 124
What is the maximum and minimum flipper length?
# Overall flipper-length range after na.omit(), which drops rows with a
# missing value in any column (not only flipper_length_mm).
penguins |>
  na.omit() |>
  summarize(
    min_flipper_length = min(flipper_length_mm),
    max_flipper_length = max(flipper_length_mm)
  )
## # A tibble: 1 × 2
## min_flipper_length max_flipper_length
## <int> <int>
## 1 172 231
Another way of omitting missing data.
# Overall flipper-length range, dropping only the rows where the flipper
# length itself is missing.
penguins |>
  filter(!is.na(flipper_length_mm)) |>
  summarize(
    min_flipper_length = min(flipper_length_mm),
    max_flipper_length = max(flipper_length_mm)
  )
## # A tibble: 1 × 2
## min_flipper_length max_flipper_length
## <int> <int>
## 1 172 231
Do this for each species:
# Flipper-length range within each species: one output row per species.
penguins |>
  filter(!is.na(flipper_length_mm)) |>
  group_by(species) |>
  summarize(
    min_flipper_length = min(flipper_length_mm),
    max_flipper_length = max(flipper_length_mm)
  )
## # A tibble: 3 × 3
## species min_flipper_length max_flipper_length
## <fct> <int> <int>
## 1 Adelie 172 210
## 2 Chinstrap 178 212
## 3 Gentoo 203 231
Find the observation with the largest flipper length for each species and sex.
# Two-step version: attach each species/sex group's maximum flipper length to
# every row with mutate(), then keep only the rows that attain it.
penguins |>
  filter(!is.na(flipper_length_mm)) |>
  group_by(species, sex) |>
  mutate(max_flipper_length = max(flipper_length_mm)) |>
  filter(flipper_length_mm == max_flipper_length) |>
  select(
    species,
    sex,
    flipper_length_mm,
    max_flipper_length
  )
## # A tibble: 8 × 4
## # Groups: species, sex [8]
## species sex flipper_length_mm max_flipper_length
## <fct> <fct> <int> <int>
## 1 Adelie <NA> 193 193
## 2 Adelie female 202 202
## 3 Adelie male 210 210
## 4 Gentoo male 231 231
## 5 Gentoo female 222 222
## 6 Gentoo <NA> 217 217
## 7 Chinstrap male 212 212
## 8 Chinstrap female 202 202
Combine two steps
# One-step version: compute the per-group maximum directly inside filter(),
# so no helper column is needed.
penguins |>
  filter(!is.na(flipper_length_mm)) |>
  group_by(species, sex) |>
  filter(flipper_length_mm == max(flipper_length_mm)) |>
  select(
    species,
    sex,
    flipper_length_mm
  )
## # A tibble: 8 × 3
## # Groups: species, sex [8]
## species sex flipper_length_mm
## <fct> <fct> <int>
## 1 Adelie <NA> 193
## 2 Adelie female 202
## 3 Adelie male 210
## 4 Gentoo male 231
## 5 Gentoo female 222
## 6 Gentoo <NA> 217
## 7 Chinstrap male 212
## 8 Chinstrap female 202
Make a table showing the number of observations of each combination of species and island.
# Observations per species/island combination, using count() directly.
penguins |>
  count(
    species,
    island
  )
## # A tibble: 5 × 3
## species island n
## <fct> <fct> <int>
## 1 Adelie Biscoe 44
## 2 Adelie Dream 56
## 3 Adelie Torgersen 52
## 4 Chinstrap Dream 68
## 5 Gentoo Biscoe 124
# Same table via group_by() + summarize(). summarize() removes only the last
# grouping variable, so the result remains grouped by species.
penguins |>
  group_by(species, island) |>
  summarize(n = n())
## `summarise()` has grouped output by 'species'. You can override using the
## `.groups` argument.
## # A tibble: 5 × 3
## # Groups: species [3]
## species island n
## <fct> <fct> <int>
## 1 Adelie Biscoe 44
## 2 Adelie Dream 56
## 3 Adelie Torgersen 52
## 4 Chinstrap Dream 68
## 5 Gentoo Biscoe 124
Follow up with a second `summarize(n = n())` to see what happens when the island grouping is dropped. This answers the question: How many islands was each species found on?
# The first summarize() leaves one row per species/island pair, still grouped
# by species. The second summarize() counts those rows within each species,
# i.e. the number of islands on which each species was observed.
penguins |>
  group_by(species, island) |>
  summarize(n = n()) |>
  summarize(n = n())
## `summarise()` has grouped output by 'species'. You can override using the
## `.groups` argument.
## # A tibble: 3 × 2
## species n
## <fct> <int>
## 1 Adelie 3
## 2 Chinstrap 1
## 3 Gentoo 1
Compute the ratio of the bill length to bill depth and make a histogram with these data.
# Histogram of bill length / bill depth. Rows whose ratio is NA are removed
# before plotting.
penguins |>
  mutate(bill_ratio = bill_length_mm / bill_depth_mm) |>
  filter(!is.na(bill_ratio)) |>
  ggplot(aes(x = bill_ratio)) +
  geom_histogram(binwidth = 0.1)
Repeat that calculation for each species.
# Same bill-ratio histogram, with the fill aesthetic mapped to species so
# each species' contribution to a bin is coloured separately.
penguins |>
  mutate(bill_ratio = bill_length_mm / bill_depth_mm) |>
  filter(!is.na(bill_ratio)) |>
  ggplot(
    aes(x = bill_ratio, fill = species)
  ) +
  geom_histogram(binwidth = 0.1)
Compute the mean and standard deviation of body mass for each species and sex.