Demonstration of a PCA.

  1. Start with a dataset with several quantitative variables
  2. Standardize each variable to have mean 0 and standard deviation 1, or use “correlation” option below
  3. Compute PCA
  4. Visualize PCA as biplot, screeplot

Try the midwest data:

glimpse(midwest)
## Rows: 437
## Columns: 28
## $ PID                  <int> 561, 562, 563, 564, 565, 566, 567, 568, 569, 570,…
## $ county               <chr> "ADAMS", "ALEXANDER", "BOND", "BOONE", "BROWN", "…
## $ state                <chr> "IL", "IL", "IL", "IL", "IL", "IL", "IL", "IL", "…
## $ area                 <dbl> 0.052, 0.014, 0.022, 0.017, 0.018, 0.050, 0.017, …
## $ poptotal             <int> 66090, 10626, 14991, 30806, 5836, 35688, 5322, 16…
## $ popdensity           <dbl> 1270.9615, 759.0000, 681.4091, 1812.1176, 324.222…
## $ popwhite             <int> 63917, 7054, 14477, 29344, 5264, 35157, 5298, 165…
## $ popblack             <int> 1702, 3496, 429, 127, 547, 50, 1, 111, 16, 16559,…
## $ popamerindian        <int> 98, 19, 35, 46, 14, 65, 8, 30, 8, 331, 51, 26, 17…
## $ popasian             <int> 249, 48, 16, 150, 5, 195, 15, 61, 23, 8033, 89, 3…
## $ popother             <int> 124, 9, 34, 1139, 6, 221, 0, 84, 6, 1596, 20, 7, …
## $ percwhite            <dbl> 96.71206, 66.38434, 96.57128, 95.25417, 90.19877,…
## $ percblack            <dbl> 2.57527614, 32.90043290, 2.86171703, 0.41225735, …
## $ percamerindan        <dbl> 0.14828264, 0.17880670, 0.23347342, 0.14932156, 0…
## $ percasian            <dbl> 0.37675897, 0.45172219, 0.10673071, 0.48691813, 0…
## $ percother            <dbl> 0.18762294, 0.08469791, 0.22680275, 3.69733169, 0…
## $ popadults            <int> 43298, 6724, 9669, 19272, 3979, 23444, 3583, 1132…
## $ perchsd              <dbl> 75.10740, 59.72635, 69.33499, 75.47219, 68.86152,…
## $ percollege           <dbl> 19.63139, 11.24331, 17.03382, 17.27895, 14.47600,…
## $ percprof             <dbl> 4.355859, 2.870315, 4.488572, 4.197800, 3.367680,…
## $ poppovertyknown      <int> 63628, 10529, 14235, 30337, 4815, 35107, 5241, 16…
## $ percpovertyknown     <dbl> 96.27478, 99.08714, 94.95697, 98.47757, 82.50514,…
## $ percbelowpoverty     <dbl> 13.151443, 32.244278, 12.068844, 7.209019, 13.520…
## $ percchildbelowpovert <dbl> 18.011717, 45.826514, 14.036061, 11.179536, 13.02…
## $ percadultpoverty     <dbl> 11.009776, 27.385647, 10.852090, 5.536013, 11.143…
## $ percelderlypoverty   <dbl> 12.443812, 25.228976, 12.697410, 6.217047, 19.200…
## $ inmetro              <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0…
## $ category             <chr> "AAR", "LHR", "AAR", "ALU", "AAR", "AAR", "LAR", …

Make a subset with only quantitative data.

midwest_ss <- midwest |> slice_sample(n = 100) |>
  select(-category, -PID, -county, -state)
midwest_pca <- prcomp(midwest_ss, 
                      center = TRUE,
                      scale. = TRUE)

Get a text summary of the PCA results showing the standard deviation associated with each principal component and the proportion of variance for each PC.

summary(midwest_pca)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6     PC7
## Standard deviation     3.1420 2.3020 1.6001 1.22650 1.01983 0.93262 0.85173
## Proportion of Variance 0.4113 0.2208 0.1067 0.06268 0.04334 0.03624 0.03023
## Cumulative Proportion  0.4113 0.6321 0.7388 0.80151 0.84485 0.88109 0.91132
##                            PC8     PC9    PC10   PC11    PC12    PC13    PC14
## Standard deviation     0.81704 0.64727 0.52493 0.4724 0.40268 0.33418 0.30394
## Proportion of Variance 0.02781 0.01746 0.01148 0.0093 0.00676 0.00465 0.00385
## Cumulative Proportion  0.93913 0.95659 0.96807 0.9774 0.98412 0.98878 0.99263
##                           PC15    PC16    PC17    PC18    PC19    PC20    PC21
## Standard deviation     0.26166 0.18934 0.17909 0.17174 0.10169 0.02413 0.01288
## Proportion of Variance 0.00285 0.00149 0.00134 0.00123 0.00043 0.00002 0.00001
## Cumulative Proportion  0.99548 0.99697 0.99831 0.99954 0.99997 0.99999 1.00000
##                            PC22      PC23      PC24
## Standard deviation     0.004393 1.556e-09 3.781e-16
## Proportion of Variance 0.000000 0.000e+00 0.000e+00
## Cumulative Proportion  1.000000 1.000e+00 1.000e+00

Show a biplot using ggplot (from ggfortify package)

autoplot(midwest_pca, data = midwest_ss, loadings = TRUE, 
         loadings.label = TRUE, label = FALSE, loadings.label.size = 3,
         variance_percentage = TRUE)

Show the row number of each observation instead of a dot:

autoplot(midwest_pca, data = midwest_ss, 
         shape = FALSE, label = TRUE,
         loadings = TRUE, loadings.label = TRUE, 
         variance_percentage = TRUE)

What is row 44?

midwest_ss[44,]
## # A tibble: 1 × 24
##    area poptotal popdensity popwhite popblack popamerindian popasian popother
##   <dbl>    <int>      <dbl>    <int>    <int>         <int>    <int>    <int>
## 1 0.024    38413      1601.    37765      330           138      114       66
## # ℹ 16 more variables: percwhite <dbl>, percblack <dbl>, percamerindan <dbl>,
## #   percasian <dbl>, percother <dbl>, popadults <int>, perchsd <dbl>,
## #   percollege <dbl>, percprof <dbl>, poppovertyknown <int>,
## #   percpovertyknown <dbl>, percbelowpoverty <dbl>, percchildbelowpovert <dbl>,
## #   percadultpoverty <dbl>, percelderlypoverty <dbl>, inmetro <int>

Show a similar plot using a different function:

biplot(midwest_pca)

Show the variances associated with each PC.

screeplot(midwest_pca)

Show the vector of each PC in terms of the orginal variables:

tidy(midwest_pca, "loadings") |> pivot_wider(names_from = PC, values_from = value, names_prefix = "PC ")
## # A tibble: 24 × 25
##    column      `PC 1`  `PC 2`  `PC 3`   `PC 4`  `PC 5`   `PC 6`  `PC 7`   `PC 8`
##    <chr>        <dbl>   <dbl>   <dbl>    <dbl>   <dbl>    <dbl>   <dbl>    <dbl>
##  1 area       -0.0436  0.0464  0.319   0.00114  0.661   2.56e-1 -0.467  -0.172  
##  2 poptotal   -0.302  -0.0934 -0.119   0.0257   0.0958 -6.07e-3  0.0427  0.0233 
##  3 popdensity -0.287  -0.112  -0.184   0.0318   0.0331  1.81e-5  0.108   0.0502 
##  4 popwhite   -0.306  -0.0720 -0.0880  0.0139   0.0814 -3.28e-2  0.0203  0.00300
##  5 popblack   -0.248  -0.159  -0.230   0.0729   0.159   8.11e-2  0.150   0.0646 
##  6 popamerin… -0.159  -0.223   0.222  -0.278    0.168   2.98e-1  0.0420  0.00727
##  7 popasian   -0.297  -0.0755 -0.0232  0.0952   0.0130 -2.81e-2 -0.0556  0.239  
##  8 popother   -0.268  -0.0668 -0.0987 -0.0739  -0.190   1.10e-1 -0.315   0.245  
##  9 percwhite   0.0707  0.334  -0.217   0.292    0.160  -4.94e-2 -0.202   0.146  
## 10 percblack  -0.166  -0.199  -0.0962  0.212   -0.176   9.80e-2 -0.0246 -0.642  
## # ℹ 14 more rows
## # ℹ 16 more variables: `PC 9` <dbl>, `PC 10` <dbl>, `PC 11` <dbl>,
## #   `PC 12` <dbl>, `PC 13` <dbl>, `PC 14` <dbl>, `PC 15` <dbl>, `PC 16` <dbl>,
## #   `PC 17` <dbl>, `PC 18` <dbl>, `PC 19` <dbl>, `PC 20` <dbl>, `PC 21` <dbl>,
## #   `PC 22` <dbl>, `PC 23` <dbl>, `PC 24` <dbl>

Alternative function to do the same computation.

midwest_pca2 <- princomp(midwest_ss, cor = TRUE)
summary(midwest_pca2)
## Importance of components:
##                           Comp.1    Comp.2    Comp.3     Comp.4     Comp.5
## Standard deviation     3.1420381 2.3020005 1.6001192 1.22650079 1.01983193
## Proportion of Variance 0.4113501 0.2208003 0.1066826 0.06267934 0.04333572
## Cumulative Proportion  0.4113501 0.6321504 0.7388330 0.80151232 0.84484803
##                            Comp.6     Comp.7     Comp.8     Comp.9    Comp.10
## Standard deviation     0.93262487 0.85173069 0.81704223 0.64727278 0.52492500
## Proportion of Variance 0.03624121 0.03022688 0.02781492 0.01745675 0.01148109
## Cumulative Proportion  0.88108925 0.91131613 0.93913105 0.95658780 0.96806889
##                           Comp.11     Comp.12     Comp.13     Comp.14
## Standard deviation     0.47238559 0.402680289 0.334177286 0.303941265
## Proportion of Variance 0.00929784 0.006756309 0.004653102 0.003849179
## Cumulative Proportion  0.97736673 0.984123040 0.988776143 0.992625322
##                            Comp.15     Comp.16     Comp.17     Comp.18
## Standard deviation     0.261658181 0.189336913 0.179094559 0.171744730
## Proportion of Variance 0.002852708 0.001493686 0.001336453 0.001229011
## Cumulative Proportion  0.995478030 0.996971716 0.998308169 0.999537179
##                             Comp.19      Comp.20      Comp.21      Comp.22
## Standard deviation     0.1016872398 2.413132e-02 1.287572e-02 4.393143e-03
## Proportion of Variance 0.0004308456 2.426337e-05 6.907670e-06 8.041544e-07
## Cumulative Proportion  0.9999680248 9.999923e-01 9.999992e-01 1.000000e+00
##                             Comp.23 Comp.24
## Standard deviation     2.134365e-08       0
## Proportion of Variance 1.898130e-17       0
## Cumulative Proportion  1.000000e+00       1
biplot(midwest_pca2)

ggplot plots

autoplot(midwest_pca2, data = midwest_ss, loadings = TRUE,
         loadings.label=TRUE)

Scaling experiment

Don’t scale the variables. You rarely really want to do this!

midwest_ss <- midwest |> slice_sample(n = 100) |>
  select(-category, -PID, -county, -state)
midwest_pca3 <- prcomp(midwest_ss, 
                      center = TRUE,
                      scale. = FALSE)
autoplot(midwest_pca3)

screeplot(midwest_pca3)

tidy(midwest_pca3, "loadings") |> pivot_wider(names_from = PC, values_from = value, names_prefix = "PC ")
## # A tibble: 24 × 25
##    column          `PC 1`    `PC 2`   `PC 3`   `PC 4`   `PC 5`   `PC 6`   `PC 7`
##    <chr>            <dbl>     <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
##  1 area          -2.86e-9  -1.95e-8  1.73e-8  3.10e-7  4.17e-6 -4.42e-6  6.81e-6
##  2 poptotal      -5.92e-1   3.22e-2  8.16e-2  6.26e-1  1.81e-1  1.54e-1  1.49e-1
##  3 popdensity    -1.12e-2   3.96e-2 -1.35e-1 -9.20e-2 -4.86e-1  8.49e-1  1.24e-1
##  4 popwhite      -3.67e-1  -4.58e-1 -6.45e-1  3.93e-2  1.10e-1  1.64e-2 -2.33e-1
##  5 popblack      -1.65e-1   8.41e-1 -1.26e-1  8.79e-2  1.10e-1  4.64e-2 -2.33e-1
##  6 popamerindian -1.34e-3   6.82e-3 -1.68e-2  3.02e-2  8.31e-2 -7.54e-2  8.70e-1
##  7 popasian      -1.97e-2  -1.04e-1  2.47e-1  2.82e-1 -2.11e-2  4.24e-2  4.78e-2
##  8 popother      -3.91e-2  -2.53e-1  6.22e-1  1.87e-1 -1.01e-1  1.25e-1 -3.02e-1
##  9 percwhite      5.13e-6  -4.35e-5  1.50e-4 -3.86e-4 -5.97e-4 -1.04e-3 -2.23e-3
## 10 percblack     -4.06e-6   5.37e-5 -1.42e-4  1.24e-4  4.06e-4  1.11e-3  4.14e-5
## # ℹ 14 more rows
## # ℹ 17 more variables: `PC 8` <dbl>, `PC 9` <dbl>, `PC 10` <dbl>,
## #   `PC 11` <dbl>, `PC 12` <dbl>, `PC 13` <dbl>, `PC 14` <dbl>, `PC 15` <dbl>,
## #   `PC 16` <dbl>, `PC 17` <dbl>, `PC 18` <dbl>, `PC 19` <dbl>, `PC 20` <dbl>,
## #   `PC 21` <dbl>, `PC 22` <dbl>, `PC 23` <dbl>, `PC 24` <dbl>

Same thing with penguins

library(palmerpenguins)
my_penguins_raw = penguins_raw |> select(-`Sample Number`) |>
  select(Species, where(is.numeric) ) |> na.omit() |>
  mutate(Species = str_remove(Species, "\\(.*\\)"))
pca2 <- my_penguins_raw |> select(-Species) |> prcomp(scale=TRUE)
autoplot(pca2, data = my_penguins_raw, 
         loadings=TRUE, loadings.label = TRUE,
         loadings.label.colour = "black", loadings.colour = "black",
         colour = 'Species') + xlim(-0.15, 0.15) + ylim(-0.15, 0.15)

pca2a <- my_penguins_raw |> select(-Species) |> prcomp(scale=FALSE)
autoplot(pca2a, data = my_penguins_raw, 
         loadings=TRUE, loadings.label = TRUE,
         loadings.label.colour = "black", loadings.colour = "black",
         colour = 'Species') + xlim(-0.15, 0.15) + ylim(-0.15, 0.15)
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).