Demonstration of a PCA.

  1. Start with a dataset with several quantitative variables
  2. Standardize each variable to have mean 0 and standard deviation 1, or use “correlation” option below
  3. Compute PCA
  4. Visualize PCA as biplot, screeplot

Try the midwest data:

glimpse(midwest)
## Rows: 437
## Columns: 28
## $ PID                  <int> 561, 562, 563, 564, 565, 566, 567, 568, 569, 570,…
## $ county               <chr> "ADAMS", "ALEXANDER", "BOND", "BOONE", "BROWN", "…
## $ state                <chr> "IL", "IL", "IL", "IL", "IL", "IL", "IL", "IL", "…
## $ area                 <dbl> 0.052, 0.014, 0.022, 0.017, 0.018, 0.050, 0.017, …
## $ poptotal             <int> 66090, 10626, 14991, 30806, 5836, 35688, 5322, 16…
## $ popdensity           <dbl> 1270.9615, 759.0000, 681.4091, 1812.1176, 324.222…
## $ popwhite             <int> 63917, 7054, 14477, 29344, 5264, 35157, 5298, 165…
## $ popblack             <int> 1702, 3496, 429, 127, 547, 50, 1, 111, 16, 16559,…
## $ popamerindian        <int> 98, 19, 35, 46, 14, 65, 8, 30, 8, 331, 51, 26, 17…
## $ popasian             <int> 249, 48, 16, 150, 5, 195, 15, 61, 23, 8033, 89, 3…
## $ popother             <int> 124, 9, 34, 1139, 6, 221, 0, 84, 6, 1596, 20, 7, …
## $ percwhite            <dbl> 96.71206, 66.38434, 96.57128, 95.25417, 90.19877,…
## $ percblack            <dbl> 2.57527614, 32.90043290, 2.86171703, 0.41225735, …
## $ percamerindan        <dbl> 0.14828264, 0.17880670, 0.23347342, 0.14932156, 0…
## $ percasian            <dbl> 0.37675897, 0.45172219, 0.10673071, 0.48691813, 0…
## $ percother            <dbl> 0.18762294, 0.08469791, 0.22680275, 3.69733169, 0…
## $ popadults            <int> 43298, 6724, 9669, 19272, 3979, 23444, 3583, 1132…
## $ perchsd              <dbl> 75.10740, 59.72635, 69.33499, 75.47219, 68.86152,…
## $ percollege           <dbl> 19.63139, 11.24331, 17.03382, 17.27895, 14.47600,…
## $ percprof             <dbl> 4.355859, 2.870315, 4.488572, 4.197800, 3.367680,…
## $ poppovertyknown      <int> 63628, 10529, 14235, 30337, 4815, 35107, 5241, 16…
## $ percpovertyknown     <dbl> 96.27478, 99.08714, 94.95697, 98.47757, 82.50514,…
## $ percbelowpoverty     <dbl> 13.151443, 32.244278, 12.068844, 7.209019, 13.520…
## $ percchildbelowpovert <dbl> 18.011717, 45.826514, 14.036061, 11.179536, 13.02…
## $ percadultpoverty     <dbl> 11.009776, 27.385647, 10.852090, 5.536013, 11.143…
## $ percelderlypoverty   <dbl> 12.443812, 25.228976, 12.697410, 6.217047, 19.200…
## $ inmetro              <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0…
## $ category             <chr> "AAR", "LHR", "AAR", "ALU", "AAR", "AAR", "LAR", …

Make a subset with only quantitative data.

midwest_ss <- midwest |> slice_sample(n = 100) |>
  select(-category, -PID, -county, -state)
midwest_pca <- prcomp(midwest_ss, 
                      center = TRUE,
                      scale. = TRUE)

Get a text summary of the PCA results showing the standard deviation associated with each principal component and the proportion of variance for each PC.

summary(midwest_pca)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6     PC7
## Standard deviation     3.3269 2.0880 1.7017 1.22797 1.07614 0.89038 0.77355
## Proportion of Variance 0.4612 0.1817 0.1207 0.06283 0.04825 0.03303 0.02493
## Cumulative Proportion  0.4612 0.6428 0.7635 0.82632 0.87458 0.90761 0.93254
##                            PC8     PC9    PC10    PC11    PC12    PC13    PC14
## Standard deviation     0.66319 0.57011 0.49974 0.46927 0.36043 0.30950 0.25191
## Proportion of Variance 0.01833 0.01354 0.01041 0.00918 0.00541 0.00399 0.00264
## Cumulative Proportion  0.95087 0.96441 0.97482 0.98399 0.98941 0.99340 0.99604
##                           PC15    PC16    PC17    PC18    PC19    PC20     PC21
## Standard deviation     0.22394 0.19452 0.07302 0.02761 0.02462 0.01721 0.005019
## Proportion of Variance 0.00209 0.00158 0.00022 0.00003 0.00003 0.00001 0.000000
## Cumulative Proportion  0.99813 0.99971 0.99993 0.99996 0.99999 1.00000 1.000000
##                            PC22      PC23      PC24
## Standard deviation     0.001508 3.248e-09 5.275e-16
## Proportion of Variance 0.000000 0.000e+00 0.000e+00
## Cumulative Proportion  1.000000 1.000e+00 1.000e+00

Show a biplot using ggplot (from ggfortify package)

autoplot(midwest_pca, data = midwest_ss, loadings = TRUE, 
         loadings.label = TRUE, label = FALSE, loadings.label.size = 3,
         variance_percentage = TRUE)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the ggfortify package.
##   Please report the issue at <https://github.com/sinhrks/ggfortify/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Show the row number of each observation instead of a dot:

autoplot(midwest_pca, data = midwest_ss, 
         shape = FALSE, label = TRUE,
         loadings = TRUE, loadings.label = TRUE, 
         variance_percentage = TRUE)

What is row 44?

midwest_ss[44,]
## # A tibble: 1 × 24
##    area poptotal popdensity popwhite popblack popamerindian popasian popother
##   <dbl>    <int>      <dbl>    <int>    <int>         <int>    <int>    <int>
## 1 0.021    14931        711    14695       10           139       38       49
## # ℹ 16 more variables: percwhite <dbl>, percblack <dbl>, percamerindan <dbl>,
## #   percasian <dbl>, percother <dbl>, popadults <int>, perchsd <dbl>,
## #   percollege <dbl>, percprof <dbl>, poppovertyknown <int>,
## #   percpovertyknown <dbl>, percbelowpoverty <dbl>, percchildbelowpovert <dbl>,
## #   percadultpoverty <dbl>, percelderlypoverty <dbl>, inmetro <int>

Show a similar plot using a different function:

biplot(midwest_pca)

Show the variances associated with each PC.

screeplot(midwest_pca)

Show the vector of each PC in terms of the orginal variables:

tidy(midwest_pca, "loadings") |> pivot_wider(names_from = PC, values_from = value, names_prefix = "PC ")
## # A tibble: 24 × 25
##    column    `PC 1`   `PC 2`  `PC 3`  `PC 4`   `PC 5`   `PC 6`   `PC 7`   `PC 8`
##    <chr>      <dbl>    <dbl>   <dbl>   <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
##  1 area      0.0420  0.0616  -0.0618  0.599   0.200    0.431    0.597    0.00598
##  2 poptotal  0.294   0.0360   0.104   0.0242 -0.0617  -0.0241  -0.00195  0.0177 
##  3 popdens…  0.289  -0.0103   0.0800 -0.0469 -0.00930 -0.0835  -0.0350   0.0349 
##  4 popwhite  0.294   0.0172   0.0912  0.0101 -0.0396  -0.0333   0.0132   0.0333 
##  5 popblack  0.286   0.0686   0.121   0.0407 -0.0877  -0.00262 -0.0282  -0.0205 
##  6 popamer…  0.279   0.00747  0.0327  0.141   0.0294  -0.125   -0.0603   0.122  
##  7 popasian  0.286   0.0617   0.121   0.0612 -0.124   -0.0230  -0.0193   0.0165 
##  8 popother  0.282   0.0694   0.133   0.0639 -0.128   -0.0135  -0.0310   0.0111 
##  9 percwhi… -0.230  -0.0553   0.172   0.135  -0.422   -0.224    0.170    0.189  
## 10 percbla…  0.200   0.0473  -0.142  -0.323   0.415    0.312   -0.0700  -0.285  
## # ℹ 14 more rows
## # ℹ 16 more variables: `PC 9` <dbl>, `PC 10` <dbl>, `PC 11` <dbl>,
## #   `PC 12` <dbl>, `PC 13` <dbl>, `PC 14` <dbl>, `PC 15` <dbl>, `PC 16` <dbl>,
## #   `PC 17` <dbl>, `PC 18` <dbl>, `PC 19` <dbl>, `PC 20` <dbl>, `PC 21` <dbl>,
## #   `PC 22` <dbl>, `PC 23` <dbl>, `PC 24` <dbl>

Alternative function to do the same computation.

midwest_pca2 <- princomp(midwest_ss, cor = TRUE)
summary(midwest_pca2)
## Importance of components:
##                           Comp.1    Comp.2    Comp.3     Comp.4     Comp.5
## Standard deviation     3.3269058 2.0880105 1.7017079 1.22796524 1.07613792
## Proportion of Variance 0.4611793 0.1816578 0.1206587 0.06282911 0.04825303
## Cumulative Proportion  0.4611793 0.6428371 0.7634958 0.82632493 0.87457797
##                            Comp.6     Comp.7     Comp.8     Comp.9    Comp.10
## Standard deviation     0.89037827 0.77355291 0.66318849 0.57010921 0.49974125
## Proportion of Variance 0.03303223 0.02493267 0.01832579 0.01354269 0.01040589
## Cumulative Proportion  0.90761019 0.93254287 0.95086866 0.96441134 0.97481723
##                            Comp.11     Comp.12    Comp.13     Comp.14
## Standard deviation     0.469274090 0.360426030 0.30950076 0.251910585
## Proportion of Variance 0.009175757 0.005412788 0.00399128 0.002644123
## Cumulative Proportion  0.983992989 0.989405777 0.99339706 0.996041180
##                            Comp.15     Comp.16      Comp.17      Comp.18
## Standard deviation     0.223936611 0.194523202 0.0730240050 2.761447e-02
## Proportion of Variance 0.002089484 0.001576636 0.0002221877 3.177328e-05
## Cumulative Proportion  0.998130663 0.999707300 0.9999294877 9.999613e-01
##                             Comp.19      Comp.20      Comp.21      Comp.22
## Standard deviation     2.461687e-02 1.721290e-02 5.019034e-03 1.507589e-03
## Proportion of Variance 2.524959e-05 1.234516e-05 1.049613e-06 9.470097e-08
## Cumulative Proportion  9.999865e-01 9.999989e-01 9.999999e-01 1.000000e+00
##                             Comp.23      Comp.24
## Standard deviation     2.867046e-08 3.735374e-09
## Proportion of Variance 3.424981e-17 5.813759e-19
## Cumulative Proportion  1.000000e+00 1.000000e+00
biplot(midwest_pca2)

ggplot plots

autoplot(midwest_pca2, data = midwest_ss, loadings = TRUE,
         loadings.label=TRUE)

Scaling experiment

What happens if you don’t scale the variables? You might choose to not scale data if all quantitiative variables are measured in the same units and the relative magnitudes of the values are important for qualitative analysis. You rarely want to do this!

midwest_ss <- midwest |> slice_sample(n = 100) |>
  select(-category, -PID, -county, -state)
midwest_pca3 <- prcomp(midwest_ss, 
                      center = TRUE,
                      scale. = FALSE)
autoplot(midwest_pca3)

screeplot(midwest_pca3)

tidy(midwest_pca3, "loadings") |> pivot_wider(names_from = PC, values_from = value, names_prefix = "PC ")
## # A tibble: 24 × 25
##    column          `PC 1`    `PC 2`   `PC 3`   `PC 4`   `PC 5`   `PC 6`   `PC 7`
##    <chr>            <dbl>     <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
##  1 area          3.17e-10 -5.82e-11 -4.63e-7 -8.65e-7 -2.63e-6 -9.81e-7  1.19e-6
##  2 poptotal      5.67e- 1 -1.86e- 1  4.29e-1 -6.29e-2 -1.39e-1  1.38e-2  4.78e-1
##  3 popdensity    1.76e- 2 -5.23e- 3  3.33e-1  4.46e-1  8.03e-1  1.70e-1 -7.77e-2
##  4 popwhite      4.85e- 1  5.20e- 1  2.55e-3 -4.29e-1  2.91e-1 -1.57e-1  7.80e-2
##  5 popblack      6.26e- 2 -8.16e- 1 -3.66e-2 -2.63e-1  2.09e-1 -1.24e-1  8.93e-2
##  6 popamerindi…  1.30e- 3  3.31e- 3 -1.31e-2 -3.32e-2 -9.40e-2  4.77e-3 -5.96e-2
##  7 popasian      1.29e- 2  7.20e- 2  1.86e-1  6.10e-1 -2.63e-1 -4.94e-1  2.84e-1
##  8 popother      4.72e- 3  3.37e- 2  2.91e-1  5.26e-2 -2.82e-1  7.84e-1  8.57e-2
##  9 percwhite    -7.12e- 6  1.40e- 4 -1.96e-4 -2.24e-4  6.24e-4 -9.44e-4  1.79e-5
## 10 percblack     7.52e- 6 -1.31e- 4  8.59e-5 -1.23e-4  2.81e-5  6.31e-4  3.70e-4
## # ℹ 14 more rows
## # ℹ 17 more variables: `PC 8` <dbl>, `PC 9` <dbl>, `PC 10` <dbl>,
## #   `PC 11` <dbl>, `PC 12` <dbl>, `PC 13` <dbl>, `PC 14` <dbl>, `PC 15` <dbl>,
## #   `PC 16` <dbl>, `PC 17` <dbl>, `PC 18` <dbl>, `PC 19` <dbl>, `PC 20` <dbl>,
## #   `PC 21` <dbl>, `PC 22` <dbl>, `PC 23` <dbl>, `PC 24` <dbl>

Same thing with penguins

library(palmerpenguins)
## 
## Attaching package: 'palmerpenguins'
## The following objects are masked from 'package:datasets':
## 
##     penguins, penguins_raw
my_penguins_raw = penguins_raw |> select(-`Sample Number`) |>
  select(Species, where(is.numeric) ) |> na.omit() |>
  mutate(Species = str_remove(Species, "\\(.*\\)"))
pca2 <- my_penguins_raw |> select(-Species) |> prcomp(scale=TRUE)
autoplot(pca2, data = my_penguins_raw, 
         loadings=TRUE, loadings.label = TRUE,
         loadings.label.colour = "black", loadings.colour = "black",
         colour = 'Species') + xlim(-0.15, 0.15) + ylim(-0.15, 0.15)

pca2a <- my_penguins_raw |> select(-Species) |> prcomp(scale=FALSE)
autoplot(pca2a, data = my_penguins_raw, 
         loadings=TRUE, loadings.label = TRUE,
         loadings.label.colour = "black", loadings.colour = "black",
         colour = 'Species') + xlim(-0.15, 0.15) + ylim(-0.15, 0.15)
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).