Summarizing data practice

From slides

# Growth of each chick relative to its own minimum weight, per unit of Time.
ChickWeight |> 
  # Grouping makes min(weight) below the minimum within each Chick.
  group_by(Chick) |>
  # When Time is 0 and weight equals the group minimum this is 0 / 0, which
  # R reports as NaN (see the first row of the output).
  mutate(growth = (weight-min(weight)) / Time)

## # A tibble: 578 × 5
## # Groups:   Chick [50]
##    weight  Time Chick Diet  growth
##     <dbl> <dbl> <ord> <fct>  <dbl>
##  1     42     0 1     1     NaN   
##  2     51     2 1     1       4.5 
##  3     59     4 1     1       4.25
##  4     64     6 1     1       3.67
##  5     76     8 1     1       4.25
##  6     93    10 1     1       5.1 
##  7    106    12 1     1       5.33
##  8    125    14 1     1       5.93
##  9    149    16 1     1       6.69
## 10    171    18 1     1       7.17
## # ℹ 568 more rows

Try using a single = instead of == for the logical comparison in filter() below and see what happens!

# Mean weight for each Diet, using only the observations taken at Time 10.
ChickWeight |> 
  # `==` tests for equality and keeps the rows where Time is exactly 10.
  filter(Time == 10) |> 
  group_by(Diet) |> 
  # .groups = "drop" returns an ungrouped tibble with one row per Diet.
  summarize(weight_10 = mean(weight), .groups = "drop")

## # A tibble: 4 × 2
##   Diet  weight_10
##   <fct>     <dbl>
## 1 1          93.1
## 2 2         108. 
## 3 3         117. 
## 4 4         126

Use the dplyr functions: mutate, filter, group_by, summarize, select, arrange to answer the following questions.

How many species of penguin are in the penguins dataset? How many observations are there of each species?

# Tally the rows for each species; the counts are returned in column n.
penguins |>
  count(species)

## # A tibble: 3 × 2
##   species       n
##   <fct>     <int>
## 1 Adelie      152
## 2 Chinstrap    68
## 3 Gentoo      124

What are the maximum and minimum flipper lengths?

# Largest and smallest flipper length over all penguins.
# Neither call sets na.rm, so missing values in the column make both NA.
penguins |>
  summarize(
    max_flipper = max(flipper_length_mm),
    min_flipper = min(flipper_length_mm)
  )

## # A tibble: 1 × 2
##   max_flipper min_flipper
##         <int>       <int>
## 1          NA          NA

# Same summary, but max() and min() skip missing flipper lengths.
# n() still counts every row, including those with a missing value.
penguins |>
  summarize(
    max_flipper = max(flipper_length_mm, na.rm = TRUE),
    min_flipper = min(flipper_length_mm, na.rm = TRUE),
    count = n()
  )

## # A tibble: 1 × 3
##   max_flipper min_flipper count
##         <int>       <int> <int>
## 1         231         172   344

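Use na.omit() or filter(!is.na()) to remove missing data. Note that filter(!is.na(flipper_length_mm)) drops only the rows with a missing flipper length, while na.omit() drops every row that has a missing value in any column, so the counts below differ.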

# Drop the rows with a missing flipper length first, so that max(), min()
# and n() are all computed on the same rows.
penguins |>
  filter(!is.na(flipper_length_mm)) |>
  summarize(
    max_flipper = max(flipper_length_mm),
    min_flipper = min(flipper_length_mm),
    count = n()
  )

## # A tibble: 1 × 3
##   max_flipper min_flipper count
##         <int>       <int> <int>
## 1         231         172   342

# na.omit() removes every row that has a missing value in any column, so
# fewer rows are counted here than when filtering on flipper length alone.
penguins |>
  na.omit() |>
  summarize(
    max_flipper = max(flipper_length_mm),
    min_flipper = min(flipper_length_mm),
    count = n()
  )

## # A tibble: 1 × 3
##   max_flipper min_flipper count
##         <int>       <int> <int>
## 1         231         172   333

Find the max, median, and min flipper length for each species and sex.

# Flipper length summary within each species-by-sex group.
# Penguins whose sex is missing form their own <NA> groups.
penguins |>
  filter(!is.na(flipper_length_mm)) |>
  group_by(species, sex) |>
  summarize(
    max_flipper = max(flipper_length_mm),
    median_flipper = median(flipper_length_mm),
    min_flipper = min(flipper_length_mm),
    count = n()
  )

## `summarise()` has grouped output by 'species'. You can override using the
## `.groups` argument.

## # A tibble: 8 × 6
## # Groups:   species [3]
##   species   sex    max_flipper median_flipper min_flipper count
##   <fct>     <fct>        <int>          <dbl>       <int> <int>
## 1 Adelie    female         202           188          172    73
## 2 Adelie    male           210           193          178    73
## 3 Adelie    <NA>           193           186          179     5
## 4 Chinstrap female         202           192          178    34
## 5 Chinstrap male           212           200.         187    34
## 6 Gentoo    female         222           212          203    58
## 7 Gentoo    male           231           221          208    61
## 8 Gentoo    <NA>           217           216          214     4

Find the observation with the largest flipper length.

# Overall maximum flipper length. na.rm is not set, so the missing values in
# the column make the result NA.
penguins |>
  summarize(max_flipper_length_mm = max(flipper_length_mm))

## # A tibble: 1 × 1
##   max_flipper_length_mm
##                   <int>
## 1                    NA

# Overall maximum flipper length, skipping missing values.
penguins |>
  summarize(max_flipper_length_mm = max(flipper_length_mm, na.rm = TRUE))

## # A tibble: 1 × 1
##   max_flipper_length_mm
##                   <int>
## 1                   231

# Keep the full row(s) whose flipper length equals the overall maximum,
# rather than reporting only the maximum value.
penguins |>
  filter(flipper_length_mm == max(flipper_length_mm, na.rm = TRUE))

## # A tibble: 1 × 8
##   species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##   <fct>   <fct>           <dbl>         <dbl>             <int>       <int>
## 1 Gentoo  Biscoe           54.3          15.7               231        5650
## # ℹ 2 more variables: sex <fct>, year <int>

Repeat for each combination of species and island, then for each species, and then for each island.

# After group_by(), max() is evaluated within each species-island group, so
# the filter keeps the longest-flippered penguin(s) of every group.
penguins |>
  group_by(species, island) |>
  filter(flipper_length_mm == max(flipper_length_mm, na.rm = TRUE))

## # A tibble: 5 × 8
## # Groups:   species, island [5]
##   species   island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##   <fct>     <fct>              <dbl>         <dbl>             <int>       <int>
## 1 Adelie    Dream               40.8          18.9               208        4300
## 2 Adelie    Biscoe              41            20                 203        4725
## 3 Adelie    Torgersen           44.1          18                 210        4000
## 4 Gentoo    Biscoe              54.3          15.7               231        5650
## 5 Chinstrap Dream               49            19.6               212        4300
## # ℹ 2 more variables: sex <fct>, year <int>

# Longest-flippered penguin(s) within each species.
penguins |>
  group_by(species) |>
  filter(flipper_length_mm == max(flipper_length_mm, na.rm = TRUE))

## # A tibble: 3 × 8
## # Groups:   species [3]
##   species   island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##   <fct>     <fct>              <dbl>         <dbl>             <int>       <int>
## 1 Adelie    Torgersen           44.1          18                 210        4000
## 2 Gentoo    Biscoe              54.3          15.7               231        5650
## 3 Chinstrap Dream               49            19.6               212        4300
## # ℹ 2 more variables: sex <fct>, year <int>

# Longest-flippered penguin(s) on each island.
penguins |>
  group_by(island) |>
  filter(flipper_length_mm == max(flipper_length_mm, na.rm = TRUE))

## # A tibble: 3 × 8
## # Groups:   island [3]
##   species   island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##   <fct>     <fct>              <dbl>         <dbl>             <int>       <int>
## 1 Adelie    Torgersen           44.1          18                 210        4000
## 2 Gentoo    Biscoe              54.3          15.7               231        5650
## 3 Chinstrap Dream               49            19.6               212        4300
## # ℹ 2 more variables: sex <fct>, year <int>

Make a table showing the number of observations of each combination of species and island.

# One row per island-species combination that occurs in the data, with the
# number of observations in column n.
penguins |> 
  count(island, species)

## # A tibble: 5 × 3
##   island    species       n
##   <fct>     <fct>     <int>
## 1 Biscoe    Adelie       44
## 2 Biscoe    Gentoo      124
## 3 Dream     Adelie       56
## 4 Dream     Chinstrap    68
## 5 Torgersen Adelie       52

We will talk next week about shifting this “long” format table into a “wide” format table that has one row per island and one column per species.

Compute the ratio of the bill length to bill depth and make a histogram with these data.

# Histogram of the bill length to bill depth ratio for all penguins.
penguins |> 
  mutate(ratio = bill_length_mm / bill_depth_mm) |>
  # Uncomment the next line to inspect just the columns used in the ratio.
  # select(species, island, bill_length_mm, bill_depth_mm, ratio) |>
  # ggplot layers are combined with `+`, not the pipe.
  ggplot(aes(x = ratio)) +
  geom_histogram(bins = 25)

## Warning: Removed 2 rows containing non-finite values (`stat_bin()`).

Repeat the histogram, coloring the bars by species and then by island.

# Same histogram, with the bars filled according to species.
penguins |> 
  mutate(ratio = bill_length_mm / bill_depth_mm) |>
  # select(species, island, bill_length_mm, bill_depth_mm, ratio) |>
  ggplot(aes(x = ratio, fill = species)) +
  geom_histogram(bins = 25)

## Warning: Removed 2 rows containing non-finite values (`stat_bin()`).

# Same histogram, with the bars filled according to island.
penguins |> 
  mutate(ratio = bill_length_mm / bill_depth_mm) |>
  # select(species, island, bill_length_mm, bill_depth_mm, ratio) |>
  ggplot(aes(x = ratio, fill = island)) +
  geom_histogram(bins = 25)

## Warning: Removed 2 rows containing non-finite values (`stat_bin()`).

Compute the mean and standard deviation of body mass for each species and sex.

# Body mass statistics for each species-by-sex group.
# Later entries in summarize() reuse the columns created just above them, so
# the standard error and coefficient of variation must come after sd, n and
# mean.
penguins |>
  filter(!is.na(body_mass_g)) |>
  group_by(species, sex) |>
  summarize(
    mean_body_mass = mean(body_mass_g),
    sd_body_mass = sd(body_mass_g),
    n = n(),
    se_mean_body_mass = sd_body_mass / sqrt(n),
    cv_body_mass = sd_body_mass / mean_body_mass
  )

## `summarise()` has grouped output by 'species'. You can override using the
## `.groups` argument.

## # A tibble: 8 × 7
## # Groups:   species [3]
##   species sex   mean_body_mass sd_body_mass     n se_mean_body_mass cv_body_mass
##   <fct>   <fct>          <dbl>        <dbl> <int>             <dbl>        <dbl>
## 1 Adelie  fema…          3369.         269.    73              31.5       0.0800
## 2 Adelie  male           4043.         347.    73              40.6       0.0858
## 3 Adelie  <NA>           3540          477.     5             213.        0.135 
## 4 Chinst… fema…          3527.         285.    34              48.9       0.0809
## 5 Chinst… male           3939.         362.    34              62.1       0.0919
## 6 Gentoo  fema…          4680.         282.    58              37.0       0.0602
## 7 Gentoo  male           5485.         313.    61              40.1       0.0571
## 8 Gentoo  <NA>           4588.         338.     4             169.        0.0737

Decode this complex calculation from the course notes by executing just one line at a time, making a prediction about the result, and checking your understanding. Change the calculation to make it do something similar but different to test your understanding. (For example, try using slice_tail instead of slice_head, or remove the - in the arrange function.)

# For each cut, find the two color/clarity combinations with the highest
# median price per carat.
diamonds |>
  mutate(price_per_carat = price / carat) |>
  # One group for every color-clarity-cut combination.
  group_by(color, clarity, cut) |>
  # Collapse each group to its median price per carat and its size, and
  # return an ungrouped table.
  summarise(median_price_per_carat = median(price_per_carat), 
            n = n(),
            .groups = "drop") |>
  # The minus sign sorts from largest to smallest.
  arrange(-median_price_per_carat) |>
  group_by(cut) |>
  # After the sort above, the first two rows of each cut are its two largest.
  slice_head(n=2) |>
  # Alternative: keep only the single largest row of each cut.
  # filter(median_price_per_carat == max(median_price_per_carat)) |>
  ungroup() |>
  # Sort again, since slicing by group need not keep the overall ordering.
  arrange(-median_price_per_carat)

## # A tibble: 10 × 5
##    color clarity cut       median_price_per_carat     n
##    <ord> <ord>   <ord>                      <dbl> <int>
##  1 D     IF      Good                      14932.     9
##  2 D     IF      Premium                   11057.    10
##  3 D     IF      Very Good                 10202.    23
##  4 D     IF      Ideal                      7162.    28
##  5 J     VVS1    Premium                    5336.    24
##  6 J     VVS2    Very Good                  5227.    29
##  7 H     IF      Good                       5100.     4
##  8 E     VVS1    Fair                       4921.     3
##  9 G     VS2     Fair                       4838     45
## 10 H     SI1     Ideal                      4469.   763

Solutions from a previous year

How many species of penguin are in the penguins dataset? How many observations are there of each species?

# This example and the next two all produce the same counts per species.
# Here the count is spelled out with group_by() and summarize().
penguins |>
  group_by(species) |>
  summarize(n = n())

## # A tibble: 3 × 2
##   species       n
##   <fct>     <int>
## 1 Adelie      152
## 2 Chinstrap    68
## 3 Gentoo      124

# count() on an already grouped table counts the rows of each group; the
# result is still grouped by species.
penguins |>
  group_by(species) |>
  count()

## # A tibble: 3 × 2
## # Groups:   species [3]
##   species       n
##   <fct>     <int>
## 1 Adelie      152
## 2 Chinstrap    68
## 3 Gentoo      124

# Assigning the result stores the table in count_table without printing it.
count_table <- penguins |> count(species)
# Running the same pipeline without an assignment prints the table.
penguins |> count(species)

## # A tibble: 3 × 2
##   species       n
##   <fct>     <int>
## 1 Adelie      152
## 2 Chinstrap    68
## 3 Gentoo      124

What are the maximum and minimum flipper lengths?

# Smallest and largest flipper length, after na.omit() has removed every row
# that has a missing value in any column.
penguins |> 
  na.omit() |>
  summarize(min_flipper_length = min(flipper_length_mm),
            max_flipper_length = max(flipper_length_mm))

## # A tibble: 1 × 2
##   min_flipper_length max_flipper_length
##                <int>              <int>
## 1                172                231

Another way of omitting missing data.

# Smallest and largest flipper length, after removing only the rows where
# flipper_length_mm itself is missing.
penguins |> 
  filter(!is.na(flipper_length_mm)) |>
  summarize(min_flipper_length = min(flipper_length_mm),
            max_flipper_length = max(flipper_length_mm))

## # A tibble: 1 × 2
##   min_flipper_length max_flipper_length
##                <int>              <int>
## 1                172                231

Do this for each species:

# Smallest and largest flipper length within each species.
penguins |> 
  filter(!is.na(flipper_length_mm)) |>
  # Grouping makes summarize() return one row per species.
  group_by(species) |>
  summarize(min_flipper_length = min(flipper_length_mm),
            max_flipper_length = max(flipper_length_mm))

## # A tibble: 3 × 3
##   species   min_flipper_length max_flipper_length
##   <fct>                  <int>              <int>
## 1 Adelie                   172                210
## 2 Chinstrap                178                212
## 3 Gentoo                   203                231

Find the observation with the largest flipper length for each species and sex.

# Longest-flippered penguin(s) in each species-by-sex group, in two steps.
penguins |>
  filter(!is.na(flipper_length_mm)) |>
  group_by(species, sex) |>
  # Step 1: attach the group's maximum to every row of that group.
  mutate(max_flipper_length = max(flipper_length_mm)) |>
  # Step 2: keep the rows whose own value equals the group's maximum.
  filter(flipper_length_mm == max_flipper_length) |>
  select(species, sex, flipper_length_mm, max_flipper_length)

## # A tibble: 8 × 4
## # Groups:   species, sex [8]
##   species   sex    flipper_length_mm max_flipper_length
##   <fct>     <fct>              <int>              <int>
## 1 Adelie    <NA>                 193                193
## 2 Adelie    female               202                202
## 3 Adelie    male                 210                210
## 4 Gentoo    male                 231                231
## 5 Gentoo    female               222                222
## 6 Gentoo    <NA>                 217                217
## 7 Chinstrap male                 212                212
## 8 Chinstrap female               202                202

Combine the two steps by computing the maximum inside filter().

# Longest-flippered penguin(s) in each species-by-sex group, in one step.
penguins |>
  filter(!is.na(flipper_length_mm)) |>
  group_by(species, sex) |>
  # max() is evaluated within each group, so no helper column is needed.
  filter(flipper_length_mm == max(flipper_length_mm)) |>
  select(species, sex, flipper_length_mm)

## # A tibble: 8 × 3
## # Groups:   species, sex [8]
##   species   sex    flipper_length_mm
##   <fct>     <fct>              <int>
## 1 Adelie    <NA>                 193
## 2 Adelie    female               202
## 3 Adelie    male                 210
## 4 Gentoo    male                 231
## 5 Gentoo    female               222
## 6 Gentoo    <NA>                 217
## 7 Chinstrap male                 212
## 8 Chinstrap female               202

Make a table showing the number of observations of each combination of species and island.

# Number of observations for each species-island combination in the data.
penguins |>
  count(species, island)

## # A tibble: 5 × 3
##   species   island        n
##   <fct>     <fct>     <int>
## 1 Adelie    Biscoe       44
## 2 Adelie    Dream        56
## 3 Adelie    Torgersen    52
## 4 Chinstrap Dream        68
## 5 Gentoo    Biscoe      124

# The same counts, written with group_by() and summarize(). The result
# remains grouped by species.
penguins |>
  group_by(species, island) |>
  summarize(n = n())

## `summarise()` has grouped output by 'species'. You can override using the
## `.groups` argument.

## # A tibble: 5 × 3
## # Groups:   species [3]
##   species   island        n
##   <fct>     <fct>     <int>
## 1 Adelie    Biscoe       44
## 2 Adelie    Dream        56
## 3 Adelie    Torgersen    52
## 4 Chinstrap Dream        68
## 5 Gentoo    Biscoe      124

Follow up with a second summarize(n = n()) to see what happens when the island grouping is dropped. This answers the question: How many islands was each species found on?

# The first summarize() gives one row per species-island pair and leaves the
# table grouped by species. The second summarize() then counts those rows,
# i.e. the number of islands for each species.
penguins |>
  group_by(species, island) |>
  summarize(n = n()) |>
  summarize(n = n())

## `summarise()` has grouped output by 'species'. You can override using the
## `.groups` argument.

## # A tibble: 3 × 2
##   species       n
##   <fct>     <int>
## 1 Adelie        3
## 2 Chinstrap     1
## 3 Gentoo        1

Compute the ratio of the bill length to bill depth and make a histogram with these data.

# Histogram of the bill length to bill depth ratio, in bins of width 0.1.
# Rows with a missing ratio are removed before plotting.
penguins |>
  mutate(bill_ratio = bill_length_mm / bill_depth_mm) |>
  filter(!is.na(bill_ratio)) |>
  ggplot(aes(x = bill_ratio)) +
  geom_histogram(binwidth = 0.1)

Repeat that calculation for each species.

# Same histogram, with the bars filled according to species.
penguins |> 
  mutate(bill_ratio = bill_length_mm / bill_depth_mm) |>
  # Remove missing ratios before plotting.
  filter(!is.na(bill_ratio)) |>
  ggplot(aes(x = bill_ratio, fill = species)) +
  geom_histogram(binwidth = 0.1)

Compute the mean and standard deviation of body mass for each species and sex.

Summarizing data practice

AJ Irwin

2024-02-01

Solutions from a previous year