In this demonstration, we will practice adding smooths (regression lines) to plots, including using colour and facets to create multiple lines.

Life expectancy by year

Plot life expectancy by year, facet on continent, add regression line by continent, show a thin line for each country. Possibly show a regression line for each country instead of data.

gapminder |> ggplot(aes(x = year, y = lifeExp)) +
  # geom_point() +
  facet_wrap(~ continent) +
  geom_smooth(method = "lm", formula = y ~ x, aes(group = country), se = FALSE,
              linewidth = 0.5, color = "lightgray", alpha = 0.1) +
  geom_smooth(method = "lm", formula = y ~ x) +
  theme_bw()

Quantile regression

Use geom_quantile and quantiles = ... to do quantile regression. Try a polynomial instead of a straight line.

gapminder |> ggplot(aes(x = year, y = lifeExp)) +
  # geom_point() +
  facet_wrap(~ continent) +
  geom_quantile(method = "rqss", formula = y ~ x, quantiles = c(0.25, 0.75)) +
  theme_bw()
## Warning in .recacheSubclasses(def@className, def, env): undefined subclass
## "ndiMatrix" of class "replValueSp"; definition not updated
## Warning in sparse.model.matrix(object, data = data, contrasts.arg =
## contrasts.arg, : non-list contrasts argument ignored

## Warning in sparse.model.matrix(object, data = data, contrasts.arg =
## contrasts.arg, : non-list contrasts argument ignored

## Warning in sparse.model.matrix(object, data = data, contrasts.arg =
## contrasts.arg, : non-list contrasts argument ignored

## Warning in sparse.model.matrix(object, data = data, contrasts.arg =
## contrasts.arg, : non-list contrasts argument ignored

## Warning in sparse.model.matrix(object, data = data, contrasts.arg =
## contrasts.arg, : non-list contrasts argument ignored

## Warning in sparse.model.matrix(object, data = data, contrasts.arg =
## contrasts.arg, : non-list contrasts argument ignored

## Warning in sparse.model.matrix(object, data = data, contrasts.arg =
## contrasts.arg, : non-list contrasts argument ignored

## Warning in sparse.model.matrix(object, data = data, contrasts.arg =
## contrasts.arg, : non-list contrasts argument ignored

## Warning in sparse.model.matrix(object, data = data, contrasts.arg =
## contrasts.arg, : non-list contrasts argument ignored

## Warning in sparse.model.matrix(object, data = data, contrasts.arg =
## contrasts.arg, : non-list contrasts argument ignored

Logarithmic increases

Show increase in life expectancy as a function of GDP per capita. Add an appropriate regression line.

gapminder |> ggplot(aes(gdpPercap, lifeExp)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x) +
  scale_x_log10()

gapminder |> ggplot(aes(gdpPercap, lifeExp)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ log(x)) 

Compare regression on the mean (ordinary least squares, linear model, lm) with quantile regression on the median (geom_quantile, quantiles = 0.5).

CO2

Plot full data and add a smooth.

co2 <- read_table("https://www.esrl.noaa.gov/gmd/webdata/ccgg/trends/co2/co2_mm_mlo.txt",
                  comment = "#", col_names = FALSE) |>
  rename(year = X1,
         decimal_year = X3, 
         co2 = X4)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   X1 = col_double(),
##   X2 = col_double(),
##   X3 = col_double(),
##   X4 = col_double(),
##   X5 = col_double(),
##   X6 = col_double(),
##   X7 = col_double(),
##   X8 = col_double()
## )
co2 |> ggplot(aes(decimal_year, co2)) + geom_line()

Show interannual cycle for CO2 after subtracting annual mean, plot as a function of the fraction of the year elapsed, smooth with a LOESS or GAM.

co2 |> group_by(year) |>
  mutate(annual_mean = mean(co2),
         co2_season = co2 - annual_mean) |>
  ggplot(aes(decimal_year, co2_season)) + 
  geom_line()

co2 |> group_by(year) |>
  mutate(annual_mean = mean(co2),
         co2_season = co2 - annual_mean) |>
  ggplot(aes(factor(decimal_year - year) , co2_season)) + 
  geom_boxplot()

co2 |> group_by(year) |>
  mutate(annual_mean = mean(co2),
         co2_season = co2 - annual_mean) |>
  ggplot(aes(factor(X2) , co2_season)) + 
  geom_boxplot()

Try some boxplots, facet by decade, other variations.

Examples from previous year

Life expectancy by year

Build up a complex plot piece by piece.

gapminder |> ggplot(aes(year, lifeExp)) + geom_point()

gapminder |> ggplot(aes(year, lifeExp)) + geom_point() + 
  facet_grid(continent ~ .)

gapminder |> ggplot(aes(year, lifeExp)) + 
  geom_smooth() + 
  facet_grid(continent ~ .)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

gapminder |> ggplot(aes(year, lifeExp)) + 
  geom_smooth(method = "lm") + 
  facet_grid(continent ~ .)
## `geom_smooth()` using formula = 'y ~ x'

gapminder |> ggplot(aes(year, lifeExp)) + 
  geom_smooth(aes(group = country), se = FALSE, color = "#00000040", size = 0.4) + 
  geom_smooth(color = "blue") + 
  facet_grid(continent ~ .) + 
  theme_bw()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Quantile regression

gapminder |> ggplot(aes(year, lifeExp)) + 
  geom_quantile() + 
  facet_grid(continent ~ .)
## Smoothing formula not specified. Using: y ~ x
## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x

gapminder |> ggplot(aes(year, lifeExp)) + 
  geom_quantile(quantiles = c(0.1, 0.5, 0.9), formula = y ~ poly(x,2)) + 
  facet_grid(continent ~ .)
## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique

## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique

## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique

Compare regression on the mean (ordinary least squares, linear model, lm) with quantile regression on the median (tau = 0.5).

gapminder |> 
  # filter(continent %in% c("Africa", "Americas", "Asia")) |>
  ggplot(aes(gdpPercap, lifeExp)) + 
  scale_x_log10() + 
  geom_quantile(quantiles = 0.5, formula = y ~ x) +
  geom_smooth(method = "lm", formula = y ~ x, color = "green") + 
  facet_grid(continent ~ .)

  # facet_wrap(continent ~ ., ncol=1)

CO2

Get data from previous lesson (reading data):

co2 <- read_table("https://www.esrl.noaa.gov/gmd/webdata/ccgg/trends/co2/co2_mm_mlo.txt",
                  comment = "#", col_names = FALSE)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   X1 = col_double(),
##   X2 = col_double(),
##   X3 = col_double(),
##   X4 = col_double(),
##   X5 = col_double(),
##   X6 = col_double(),
##   X7 = col_double(),
##   X8 = col_double()
## )
co2 |> ggplot(aes(X3, X4)) + geom_point(size = 0.4) + 
  labs(x = "Year", y = "Atmospheric pCO2 (ppmv)")

co2 |> ggplot(aes(X3, X4)) + geom_point(size = 0.4) +
  geom_smooth() + 
  labs(x = "Year", y = "Atmospheric pCO2 (ppmv)")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

co2 |> ggplot(aes(X3, X4)) +
  geom_smooth() + 
  labs(x = "Year", y = "Atmospheric pCO2 (ppmv)")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

What’s the annual cycle? Subtract the mean CO2 computed from each year. Compute the fractional part of th e year.

co2 |> group_by(X1) |>
  mutate(fractional_year = X3 - X1, 
         mean_co2 = mean(X4),
         co2_deviation = X4 - mean_co2) |>
  ungroup() |>
  ggplot(aes(fractional_year, co2_deviation)) + 
  geom_point()

co2_modified <- co2 |> group_by(X1) |>
  mutate(fractional_year = X3 - X1, 
         mean_co2 = mean(X4),
         co2_deviation = X4 - mean_co2) |>
  ungroup() 
co2_modified |>
  ggplot(aes(factor(X2), co2_deviation)) + 
  geom_boxplot()

co2_modified |>
  ggplot(aes(fractional_year, co2_deviation)) + 
  geom_smooth() + theme_bw()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

lubridate::date_decimal(2022.10)
## [1] "2022-02-06 11:59:59 UTC"
lubridate::date_decimal(2022.77)
## [1] "2022-10-09 01:11:59 UTC"
lubridate::decimal_date(ymd("2022-10-11"))
## [1] 2022.775

Make a decade by decade plot

co2 |> mutate(decade = 10*floor(X1/10),
              decimal_decade = X3 - decade) |>
  ggplot(aes(decimal_decade, X4)) + 
  geom_point() + 
  facet_grid(decade ~ . , scale = "free_y")

co2 |> mutate(decade = 10*floor(X1/10),
              decimal_decade = X3 - decade) |>
  ggplot(aes(decimal_decade, X4)) + 
  geom_smooth(method = "loess", span = 0.1) + 
  geom_point(size = 0.2) + 
  facet_grid(decade ~ . , scale = "free_y")
## `geom_smooth()` using formula = 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : span too small.  fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 8.1939
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 0.093768
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : There are other near singularities as well. 0.0082751
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : span too small.  fewer
## data values than degrees of freedom.
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used at
## 8.1939
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 0.093768
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal condition
## number 0
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other near
## singularities as well. 0.0082751
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

co2 |> mutate(decade = 10*floor(X1/10),
              decimal_decade = X3 - decade) |>
  ggplot(aes(decimal_decade, X4)) + 
  geom_quantile(quantiles = c(0.1, 0.9)) + 
  geom_point(size = 0.2) + 
  facet_grid(decade ~ . , scale = "free_y")
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x

GAM is just linear regression

co2 |> filter(X1 > 2019) |>
  ggplot(aes(X3, X4)) + 
  geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs")) +
  geom_point()

gam1 <- mgcv::gam(X4 ~ s(dd, bs = "cc") + s(X1), 
             data = co2 |> filter(X1 > 2010) |> mutate(dd = X3 - X1) )
broom::augment(gam1, 
        co2 |> filter(X1 > 2010) ) |> 
  ggplot() + 
  geom_line(aes(X3, .fitted)) +
  geom_point(aes(X3, X4))