Demonstration of a PCA.
Try the midwest
data:
glimpse(midwest)
## Rows: 437
## Columns: 28
## $ PID <int> 561, 562, 563, 564, 565, 566, 567, 568, 569, 570,…
## $ county <chr> "ADAMS", "ALEXANDER", "BOND", "BOONE", "BROWN", "…
## $ state <chr> "IL", "IL", "IL", "IL", "IL", "IL", "IL", "IL", "…
## $ area <dbl> 0.052, 0.014, 0.022, 0.017, 0.018, 0.050, 0.017, …
## $ poptotal <int> 66090, 10626, 14991, 30806, 5836, 35688, 5322, 16…
## $ popdensity <dbl> 1270.9615, 759.0000, 681.4091, 1812.1176, 324.222…
## $ popwhite <int> 63917, 7054, 14477, 29344, 5264, 35157, 5298, 165…
## $ popblack <int> 1702, 3496, 429, 127, 547, 50, 1, 111, 16, 16559,…
## $ popamerindian <int> 98, 19, 35, 46, 14, 65, 8, 30, 8, 331, 51, 26, 17…
## $ popasian <int> 249, 48, 16, 150, 5, 195, 15, 61, 23, 8033, 89, 3…
## $ popother <int> 124, 9, 34, 1139, 6, 221, 0, 84, 6, 1596, 20, 7, …
## $ percwhite <dbl> 96.71206, 66.38434, 96.57128, 95.25417, 90.19877,…
## $ percblack <dbl> 2.57527614, 32.90043290, 2.86171703, 0.41225735, …
## $ percamerindan <dbl> 0.14828264, 0.17880670, 0.23347342, 0.14932156, 0…
## $ percasian <dbl> 0.37675897, 0.45172219, 0.10673071, 0.48691813, 0…
## $ percother <dbl> 0.18762294, 0.08469791, 0.22680275, 3.69733169, 0…
## $ popadults <int> 43298, 6724, 9669, 19272, 3979, 23444, 3583, 1132…
## $ perchsd <dbl> 75.10740, 59.72635, 69.33499, 75.47219, 68.86152,…
## $ percollege <dbl> 19.63139, 11.24331, 17.03382, 17.27895, 14.47600,…
## $ percprof <dbl> 4.355859, 2.870315, 4.488572, 4.197800, 3.367680,…
## $ poppovertyknown <int> 63628, 10529, 14235, 30337, 4815, 35107, 5241, 16…
## $ percpovertyknown <dbl> 96.27478, 99.08714, 94.95697, 98.47757, 82.50514,…
## $ percbelowpoverty <dbl> 13.151443, 32.244278, 12.068844, 7.209019, 13.520…
## $ percchildbelowpovert <dbl> 18.011717, 45.826514, 14.036061, 11.179536, 13.02…
## $ percadultpoverty <dbl> 11.009776, 27.385647, 10.852090, 5.536013, 11.143…
## $ percelderlypoverty <dbl> 12.443812, 25.228976, 12.697410, 6.217047, 19.200…
## $ inmetro <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0…
## $ category <chr> "AAR", "LHR", "AAR", "ALU", "AAR", "AAR", "LAR", …
Make a subset with only quantitative data.
midwest_ss <- midwest |> slice_sample(n = 100) |>
select(-category, -PID, -county, -state)
midwest_pca <- prcomp(midwest_ss,
center = TRUE,
scale. = TRUE)
Get a text summary of the PCA results showing the standard deviation associated with each principal component and the proportion of variance for each PC.
summary(midwest_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 3.1420 2.3020 1.6001 1.22650 1.01983 0.93262 0.85173
## Proportion of Variance 0.4113 0.2208 0.1067 0.06268 0.04334 0.03624 0.03023
## Cumulative Proportion 0.4113 0.6321 0.7388 0.80151 0.84485 0.88109 0.91132
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.81704 0.64727 0.52493 0.4724 0.40268 0.33418 0.30394
## Proportion of Variance 0.02781 0.01746 0.01148 0.0093 0.00676 0.00465 0.00385
## Cumulative Proportion 0.93913 0.95659 0.96807 0.9774 0.98412 0.98878 0.99263
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.26166 0.18934 0.17909 0.17174 0.10169 0.02413 0.01288
## Proportion of Variance 0.00285 0.00149 0.00134 0.00123 0.00043 0.00002 0.00001
## Cumulative Proportion 0.99548 0.99697 0.99831 0.99954 0.99997 0.99999 1.00000
## PC22 PC23 PC24
## Standard deviation 0.004393 1.556e-09 3.781e-16
## Proportion of Variance 0.000000 0.000e+00 0.000e+00
## Cumulative Proportion 1.000000 1.000e+00 1.000e+00
Show a biplot using ggplot (from ggfortify package)
autoplot(midwest_pca, data = midwest_ss, loadings = TRUE,
loadings.label = TRUE, label = FALSE, loadings.label.size = 3,
variance_percentage = TRUE)
Show the row number of each observation instead of a dot:
autoplot(midwest_pca, data = midwest_ss,
shape = FALSE, label = TRUE,
loadings = TRUE, loadings.label = TRUE,
variance_percentage = TRUE)
What is row 44?
midwest_ss[44,]
## # A tibble: 1 × 24
## area poptotal popdensity popwhite popblack popamerindian popasian popother
## <dbl> <int> <dbl> <int> <int> <int> <int> <int>
## 1 0.024 38413 1601. 37765 330 138 114 66
## # ℹ 16 more variables: percwhite <dbl>, percblack <dbl>, percamerindan <dbl>,
## # percasian <dbl>, percother <dbl>, popadults <int>, perchsd <dbl>,
## # percollege <dbl>, percprof <dbl>, poppovertyknown <int>,
## # percpovertyknown <dbl>, percbelowpoverty <dbl>, percchildbelowpovert <dbl>,
## # percadultpoverty <dbl>, percelderlypoverty <dbl>, inmetro <int>
Show a similar plot using a different function:
biplot(midwest_pca)
Show the variances associated with each PC.
screeplot(midwest_pca)
Show the vector of each PC in terms of the orginal variables:
tidy(midwest_pca, "loadings") |> pivot_wider(names_from = PC, values_from = value, names_prefix = "PC ")
## # A tibble: 24 × 25
## column `PC 1` `PC 2` `PC 3` `PC 4` `PC 5` `PC 6` `PC 7` `PC 8`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 area -0.0436 0.0464 0.319 0.00114 0.661 2.56e-1 -0.467 -0.172
## 2 poptotal -0.302 -0.0934 -0.119 0.0257 0.0958 -6.07e-3 0.0427 0.0233
## 3 popdensity -0.287 -0.112 -0.184 0.0318 0.0331 1.81e-5 0.108 0.0502
## 4 popwhite -0.306 -0.0720 -0.0880 0.0139 0.0814 -3.28e-2 0.0203 0.00300
## 5 popblack -0.248 -0.159 -0.230 0.0729 0.159 8.11e-2 0.150 0.0646
## 6 popamerin… -0.159 -0.223 0.222 -0.278 0.168 2.98e-1 0.0420 0.00727
## 7 popasian -0.297 -0.0755 -0.0232 0.0952 0.0130 -2.81e-2 -0.0556 0.239
## 8 popother -0.268 -0.0668 -0.0987 -0.0739 -0.190 1.10e-1 -0.315 0.245
## 9 percwhite 0.0707 0.334 -0.217 0.292 0.160 -4.94e-2 -0.202 0.146
## 10 percblack -0.166 -0.199 -0.0962 0.212 -0.176 9.80e-2 -0.0246 -0.642
## # ℹ 14 more rows
## # ℹ 16 more variables: `PC 9` <dbl>, `PC 10` <dbl>, `PC 11` <dbl>,
## # `PC 12` <dbl>, `PC 13` <dbl>, `PC 14` <dbl>, `PC 15` <dbl>, `PC 16` <dbl>,
## # `PC 17` <dbl>, `PC 18` <dbl>, `PC 19` <dbl>, `PC 20` <dbl>, `PC 21` <dbl>,
## # `PC 22` <dbl>, `PC 23` <dbl>, `PC 24` <dbl>
Alternative function to do the same computation.
midwest_pca2 <- princomp(midwest_ss, cor = TRUE)
summary(midwest_pca2)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 3.1420381 2.3020005 1.6001192 1.22650079 1.01983193
## Proportion of Variance 0.4113501 0.2208003 0.1066826 0.06267934 0.04333572
## Cumulative Proportion 0.4113501 0.6321504 0.7388330 0.80151232 0.84484803
## Comp.6 Comp.7 Comp.8 Comp.9 Comp.10
## Standard deviation 0.93262487 0.85173069 0.81704223 0.64727278 0.52492500
## Proportion of Variance 0.03624121 0.03022688 0.02781492 0.01745675 0.01148109
## Cumulative Proportion 0.88108925 0.91131613 0.93913105 0.95658780 0.96806889
## Comp.11 Comp.12 Comp.13 Comp.14
## Standard deviation 0.47238559 0.402680289 0.334177286 0.303941265
## Proportion of Variance 0.00929784 0.006756309 0.004653102 0.003849179
## Cumulative Proportion 0.97736673 0.984123040 0.988776143 0.992625322
## Comp.15 Comp.16 Comp.17 Comp.18
## Standard deviation 0.261658181 0.189336913 0.179094559 0.171744730
## Proportion of Variance 0.002852708 0.001493686 0.001336453 0.001229011
## Cumulative Proportion 0.995478030 0.996971716 0.998308169 0.999537179
## Comp.19 Comp.20 Comp.21 Comp.22
## Standard deviation 0.1016872398 2.413132e-02 1.287572e-02 4.393143e-03
## Proportion of Variance 0.0004308456 2.426337e-05 6.907670e-06 8.041544e-07
## Cumulative Proportion 0.9999680248 9.999923e-01 9.999992e-01 1.000000e+00
## Comp.23 Comp.24
## Standard deviation 2.134365e-08 0
## Proportion of Variance 1.898130e-17 0
## Cumulative Proportion 1.000000e+00 1
biplot(midwest_pca2)
ggplot plots
autoplot(midwest_pca2, data = midwest_ss, loadings = TRUE,
loadings.label=TRUE)
Don’t scale the variables. You rarely really want to do this!
midwest_ss <- midwest |> slice_sample(n = 100) |>
select(-category, -PID, -county, -state)
midwest_pca3 <- prcomp(midwest_ss,
center = TRUE,
scale. = FALSE)
autoplot(midwest_pca3)
screeplot(midwest_pca3)
tidy(midwest_pca3, "loadings") |> pivot_wider(names_from = PC, values_from = value, names_prefix = "PC ")
## # A tibble: 24 × 25
## column `PC 1` `PC 2` `PC 3` `PC 4` `PC 5` `PC 6` `PC 7`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 area -2.86e-9 -1.95e-8 1.73e-8 3.10e-7 4.17e-6 -4.42e-6 6.81e-6
## 2 poptotal -5.92e-1 3.22e-2 8.16e-2 6.26e-1 1.81e-1 1.54e-1 1.49e-1
## 3 popdensity -1.12e-2 3.96e-2 -1.35e-1 -9.20e-2 -4.86e-1 8.49e-1 1.24e-1
## 4 popwhite -3.67e-1 -4.58e-1 -6.45e-1 3.93e-2 1.10e-1 1.64e-2 -2.33e-1
## 5 popblack -1.65e-1 8.41e-1 -1.26e-1 8.79e-2 1.10e-1 4.64e-2 -2.33e-1
## 6 popamerindian -1.34e-3 6.82e-3 -1.68e-2 3.02e-2 8.31e-2 -7.54e-2 8.70e-1
## 7 popasian -1.97e-2 -1.04e-1 2.47e-1 2.82e-1 -2.11e-2 4.24e-2 4.78e-2
## 8 popother -3.91e-2 -2.53e-1 6.22e-1 1.87e-1 -1.01e-1 1.25e-1 -3.02e-1
## 9 percwhite 5.13e-6 -4.35e-5 1.50e-4 -3.86e-4 -5.97e-4 -1.04e-3 -2.23e-3
## 10 percblack -4.06e-6 5.37e-5 -1.42e-4 1.24e-4 4.06e-4 1.11e-3 4.14e-5
## # ℹ 14 more rows
## # ℹ 17 more variables: `PC 8` <dbl>, `PC 9` <dbl>, `PC 10` <dbl>,
## # `PC 11` <dbl>, `PC 12` <dbl>, `PC 13` <dbl>, `PC 14` <dbl>, `PC 15` <dbl>,
## # `PC 16` <dbl>, `PC 17` <dbl>, `PC 18` <dbl>, `PC 19` <dbl>, `PC 20` <dbl>,
## # `PC 21` <dbl>, `PC 22` <dbl>, `PC 23` <dbl>, `PC 24` <dbl>
library(palmerpenguins)
my_penguins_raw = penguins_raw |> select(-`Sample Number`) |>
select(Species, where(is.numeric) ) |> na.omit() |>
mutate(Species = str_remove(Species, "\\(.*\\)"))
pca2 <- my_penguins_raw |> select(-Species) |> prcomp(scale=TRUE)
autoplot(pca2, data = my_penguins_raw,
loadings=TRUE, loadings.label = TRUE,
loadings.label.colour = "black", loadings.colour = "black",
colour = 'Species') + xlim(-0.15, 0.15) + ylim(-0.15, 0.15)
pca2a <- my_penguins_raw |> select(-Species) |> prcomp(scale=FALSE)
autoplot(pca2a, data = my_penguins_raw,
loadings=TRUE, loadings.label = TRUE,
loadings.label.colour = "black", loadings.colour = "black",
colour = 'Species') + xlim(-0.15, 0.15) + ylim(-0.15, 0.15)
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).