Demonstration of a PCA.
Try the midwest data:
glimpse(midwest)
## Rows: 437
## Columns: 28
## $ PID <int> 561, 562, 563, 564, 565, 566, 567, 568, 569, 570,…
## $ county <chr> "ADAMS", "ALEXANDER", "BOND", "BOONE", "BROWN", "…
## $ state <chr> "IL", "IL", "IL", "IL", "IL", "IL", "IL", "IL", "…
## $ area <dbl> 0.052, 0.014, 0.022, 0.017, 0.018, 0.050, 0.017, …
## $ poptotal <int> 66090, 10626, 14991, 30806, 5836, 35688, 5322, 16…
## $ popdensity <dbl> 1270.9615, 759.0000, 681.4091, 1812.1176, 324.222…
## $ popwhite <int> 63917, 7054, 14477, 29344, 5264, 35157, 5298, 165…
## $ popblack <int> 1702, 3496, 429, 127, 547, 50, 1, 111, 16, 16559,…
## $ popamerindian <int> 98, 19, 35, 46, 14, 65, 8, 30, 8, 331, 51, 26, 17…
## $ popasian <int> 249, 48, 16, 150, 5, 195, 15, 61, 23, 8033, 89, 3…
## $ popother <int> 124, 9, 34, 1139, 6, 221, 0, 84, 6, 1596, 20, 7, …
## $ percwhite <dbl> 96.71206, 66.38434, 96.57128, 95.25417, 90.19877,…
## $ percblack <dbl> 2.57527614, 32.90043290, 2.86171703, 0.41225735, …
## $ percamerindan <dbl> 0.14828264, 0.17880670, 0.23347342, 0.14932156, 0…
## $ percasian <dbl> 0.37675897, 0.45172219, 0.10673071, 0.48691813, 0…
## $ percother <dbl> 0.18762294, 0.08469791, 0.22680275, 3.69733169, 0…
## $ popadults <int> 43298, 6724, 9669, 19272, 3979, 23444, 3583, 1132…
## $ perchsd <dbl> 75.10740, 59.72635, 69.33499, 75.47219, 68.86152,…
## $ percollege <dbl> 19.63139, 11.24331, 17.03382, 17.27895, 14.47600,…
## $ percprof <dbl> 4.355859, 2.870315, 4.488572, 4.197800, 3.367680,…
## $ poppovertyknown <int> 63628, 10529, 14235, 30337, 4815, 35107, 5241, 16…
## $ percpovertyknown <dbl> 96.27478, 99.08714, 94.95697, 98.47757, 82.50514,…
## $ percbelowpoverty <dbl> 13.151443, 32.244278, 12.068844, 7.209019, 13.520…
## $ percchildbelowpovert <dbl> 18.011717, 45.826514, 14.036061, 11.179536, 13.02…
## $ percadultpoverty <dbl> 11.009776, 27.385647, 10.852090, 5.536013, 11.143…
## $ percelderlypoverty <dbl> 12.443812, 25.228976, 12.697410, 6.217047, 19.200…
## $ inmetro <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0…
## $ category <chr> "AAR", "LHR", "AAR", "ALU", "AAR", "AAR", "LAR", …
Make a subset with only quantitative data.
midwest_ss <- midwest |> slice_sample(n = 100) |>
select(-category, -PID, -county, -state)
midwest_pca <- prcomp(midwest_ss,
center = TRUE,
scale. = TRUE)
Get a text summary of the PCA results showing the standard deviation associated with each principal component and the proportion of variance for each PC.
summary(midwest_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 3.3269 2.0880 1.7017 1.22797 1.07614 0.89038 0.77355
## Proportion of Variance 0.4612 0.1817 0.1207 0.06283 0.04825 0.03303 0.02493
## Cumulative Proportion 0.4612 0.6428 0.7635 0.82632 0.87458 0.90761 0.93254
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.66319 0.57011 0.49974 0.46927 0.36043 0.30950 0.25191
## Proportion of Variance 0.01833 0.01354 0.01041 0.00918 0.00541 0.00399 0.00264
## Cumulative Proportion 0.95087 0.96441 0.97482 0.98399 0.98941 0.99340 0.99604
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.22394 0.19452 0.07302 0.02761 0.02462 0.01721 0.005019
## Proportion of Variance 0.00209 0.00158 0.00022 0.00003 0.00003 0.00001 0.000000
## Cumulative Proportion 0.99813 0.99971 0.99993 0.99996 0.99999 1.00000 1.000000
## PC22 PC23 PC24
## Standard deviation 0.001508 3.248e-09 5.275e-16
## Proportion of Variance 0.000000 0.000e+00 0.000e+00
## Cumulative Proportion 1.000000 1.000e+00 1.000e+00
Show a biplot using ggplot (from ggfortify package)
autoplot(midwest_pca, data = midwest_ss, loadings = TRUE,
loadings.label = TRUE, label = FALSE, loadings.label.size = 3,
variance_percentage = TRUE)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the ggfortify package.
## Please report the issue at <https://github.com/sinhrks/ggfortify/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Show the row number of each observation instead of a dot:
autoplot(midwest_pca, data = midwest_ss,
shape = FALSE, label = TRUE,
loadings = TRUE, loadings.label = TRUE,
variance_percentage = TRUE)
What is row 44?
midwest_ss[44,]
## # A tibble: 1 × 24
## area poptotal popdensity popwhite popblack popamerindian popasian popother
## <dbl> <int> <dbl> <int> <int> <int> <int> <int>
## 1 0.021 14931 711 14695 10 139 38 49
## # ℹ 16 more variables: percwhite <dbl>, percblack <dbl>, percamerindan <dbl>,
## # percasian <dbl>, percother <dbl>, popadults <int>, perchsd <dbl>,
## # percollege <dbl>, percprof <dbl>, poppovertyknown <int>,
## # percpovertyknown <dbl>, percbelowpoverty <dbl>, percchildbelowpovert <dbl>,
## # percadultpoverty <dbl>, percelderlypoverty <dbl>, inmetro <int>
Show a similar plot using a different function:
biplot(midwest_pca)
Show the variances associated with each PC.
screeplot(midwest_pca)
Show the vector of each PC in terms of the orginal variables:
tidy(midwest_pca, "loadings") |> pivot_wider(names_from = PC, values_from = value, names_prefix = "PC ")
## # A tibble: 24 × 25
## column `PC 1` `PC 2` `PC 3` `PC 4` `PC 5` `PC 6` `PC 7` `PC 8`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 area 0.0420 0.0616 -0.0618 0.599 0.200 0.431 0.597 0.00598
## 2 poptotal 0.294 0.0360 0.104 0.0242 -0.0617 -0.0241 -0.00195 0.0177
## 3 popdens… 0.289 -0.0103 0.0800 -0.0469 -0.00930 -0.0835 -0.0350 0.0349
## 4 popwhite 0.294 0.0172 0.0912 0.0101 -0.0396 -0.0333 0.0132 0.0333
## 5 popblack 0.286 0.0686 0.121 0.0407 -0.0877 -0.00262 -0.0282 -0.0205
## 6 popamer… 0.279 0.00747 0.0327 0.141 0.0294 -0.125 -0.0603 0.122
## 7 popasian 0.286 0.0617 0.121 0.0612 -0.124 -0.0230 -0.0193 0.0165
## 8 popother 0.282 0.0694 0.133 0.0639 -0.128 -0.0135 -0.0310 0.0111
## 9 percwhi… -0.230 -0.0553 0.172 0.135 -0.422 -0.224 0.170 0.189
## 10 percbla… 0.200 0.0473 -0.142 -0.323 0.415 0.312 -0.0700 -0.285
## # ℹ 14 more rows
## # ℹ 16 more variables: `PC 9` <dbl>, `PC 10` <dbl>, `PC 11` <dbl>,
## # `PC 12` <dbl>, `PC 13` <dbl>, `PC 14` <dbl>, `PC 15` <dbl>, `PC 16` <dbl>,
## # `PC 17` <dbl>, `PC 18` <dbl>, `PC 19` <dbl>, `PC 20` <dbl>, `PC 21` <dbl>,
## # `PC 22` <dbl>, `PC 23` <dbl>, `PC 24` <dbl>
Alternative function to do the same computation.
midwest_pca2 <- princomp(midwest_ss, cor = TRUE)
summary(midwest_pca2)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 3.3269058 2.0880105 1.7017079 1.22796524 1.07613792
## Proportion of Variance 0.4611793 0.1816578 0.1206587 0.06282911 0.04825303
## Cumulative Proportion 0.4611793 0.6428371 0.7634958 0.82632493 0.87457797
## Comp.6 Comp.7 Comp.8 Comp.9 Comp.10
## Standard deviation 0.89037827 0.77355291 0.66318849 0.57010921 0.49974125
## Proportion of Variance 0.03303223 0.02493267 0.01832579 0.01354269 0.01040589
## Cumulative Proportion 0.90761019 0.93254287 0.95086866 0.96441134 0.97481723
## Comp.11 Comp.12 Comp.13 Comp.14
## Standard deviation 0.469274090 0.360426030 0.30950076 0.251910585
## Proportion of Variance 0.009175757 0.005412788 0.00399128 0.002644123
## Cumulative Proportion 0.983992989 0.989405777 0.99339706 0.996041180
## Comp.15 Comp.16 Comp.17 Comp.18
## Standard deviation 0.223936611 0.194523202 0.0730240050 2.761447e-02
## Proportion of Variance 0.002089484 0.001576636 0.0002221877 3.177328e-05
## Cumulative Proportion 0.998130663 0.999707300 0.9999294877 9.999613e-01
## Comp.19 Comp.20 Comp.21 Comp.22
## Standard deviation 2.461687e-02 1.721290e-02 5.019034e-03 1.507589e-03
## Proportion of Variance 2.524959e-05 1.234516e-05 1.049613e-06 9.470097e-08
## Cumulative Proportion 9.999865e-01 9.999989e-01 9.999999e-01 1.000000e+00
## Comp.23 Comp.24
## Standard deviation 2.867046e-08 3.735374e-09
## Proportion of Variance 3.424981e-17 5.813759e-19
## Cumulative Proportion 1.000000e+00 1.000000e+00
biplot(midwest_pca2)
ggplot plots
autoplot(midwest_pca2, data = midwest_ss, loadings = TRUE,
loadings.label=TRUE)
What happens if you don’t scale the variables? You might choose to not scale data if all quantitiative variables are measured in the same units and the relative magnitudes of the values are important for qualitative analysis. You rarely want to do this!
midwest_ss <- midwest |> slice_sample(n = 100) |>
select(-category, -PID, -county, -state)
midwest_pca3 <- prcomp(midwest_ss,
center = TRUE,
scale. = FALSE)
autoplot(midwest_pca3)
screeplot(midwest_pca3)
tidy(midwest_pca3, "loadings") |> pivot_wider(names_from = PC, values_from = value, names_prefix = "PC ")
## # A tibble: 24 × 25
## column `PC 1` `PC 2` `PC 3` `PC 4` `PC 5` `PC 6` `PC 7`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 area 3.17e-10 -5.82e-11 -4.63e-7 -8.65e-7 -2.63e-6 -9.81e-7 1.19e-6
## 2 poptotal 5.67e- 1 -1.86e- 1 4.29e-1 -6.29e-2 -1.39e-1 1.38e-2 4.78e-1
## 3 popdensity 1.76e- 2 -5.23e- 3 3.33e-1 4.46e-1 8.03e-1 1.70e-1 -7.77e-2
## 4 popwhite 4.85e- 1 5.20e- 1 2.55e-3 -4.29e-1 2.91e-1 -1.57e-1 7.80e-2
## 5 popblack 6.26e- 2 -8.16e- 1 -3.66e-2 -2.63e-1 2.09e-1 -1.24e-1 8.93e-2
## 6 popamerindi… 1.30e- 3 3.31e- 3 -1.31e-2 -3.32e-2 -9.40e-2 4.77e-3 -5.96e-2
## 7 popasian 1.29e- 2 7.20e- 2 1.86e-1 6.10e-1 -2.63e-1 -4.94e-1 2.84e-1
## 8 popother 4.72e- 3 3.37e- 2 2.91e-1 5.26e-2 -2.82e-1 7.84e-1 8.57e-2
## 9 percwhite -7.12e- 6 1.40e- 4 -1.96e-4 -2.24e-4 6.24e-4 -9.44e-4 1.79e-5
## 10 percblack 7.52e- 6 -1.31e- 4 8.59e-5 -1.23e-4 2.81e-5 6.31e-4 3.70e-4
## # ℹ 14 more rows
## # ℹ 17 more variables: `PC 8` <dbl>, `PC 9` <dbl>, `PC 10` <dbl>,
## # `PC 11` <dbl>, `PC 12` <dbl>, `PC 13` <dbl>, `PC 14` <dbl>, `PC 15` <dbl>,
## # `PC 16` <dbl>, `PC 17` <dbl>, `PC 18` <dbl>, `PC 19` <dbl>, `PC 20` <dbl>,
## # `PC 21` <dbl>, `PC 22` <dbl>, `PC 23` <dbl>, `PC 24` <dbl>
library(palmerpenguins)
##
## Attaching package: 'palmerpenguins'
## The following objects are masked from 'package:datasets':
##
## penguins, penguins_raw
my_penguins_raw = penguins_raw |> select(-`Sample Number`) |>
select(Species, where(is.numeric) ) |> na.omit() |>
mutate(Species = str_remove(Species, "\\(.*\\)"))
pca2 <- my_penguins_raw |> select(-Species) |> prcomp(scale=TRUE)
autoplot(pca2, data = my_penguins_raw,
loadings=TRUE, loadings.label = TRUE,
loadings.label.colour = "black", loadings.colour = "black",
colour = 'Species') + xlim(-0.15, 0.15) + ylim(-0.15, 0.15)
pca2a <- my_penguins_raw |> select(-Species) |> prcomp(scale=FALSE)
autoplot(pca2a, data = my_penguins_raw,
loadings=TRUE, loadings.label = TRUE,
loadings.label.colour = "black", loadings.colour = "black",
colour = 'Species') + xlim(-0.15, 0.15) + ylim(-0.15, 0.15)
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).