Text

Cleaning up input, for example for data entered in a spreadsheet.

# Messy input: padded at both ends, doubled spaces inside, and a line break
# in the middle of the sentence.
my_string <- "   A cat  is a small 
                    and furry animal.  "
# str_squish() trims both ends and collapses every internal run of whitespace
# (the line break included) to a single space.
my_string2 <- my_string |> str_squish()
my_string2
## [1] "A cat is a small and furry animal."
# Case conversions applied to the cleaned string.
my_string2 |> str_to_lower()
## [1] "a cat is a small and furry animal."
my_string2 |> str_to_upper()
## [1] "A CAT IS A SMALL AND FURRY ANIMAL."
my_string2 |> str_to_sentence()
## [1] "A cat is a small and furry animal."
my_string2 |> str_to_title()
## [1] "A Cat Is A Small And Furry Animal."

Converting text to numbers

# A character column mixing a word, whole numbers, decimals, and "Inf".
text_and_numbers <- tibble(
  text = c("Andrew", "33", "12.45", "-1.00", "Inf")
)
# Coerce the same text to double and to integer in a single mutate();
# entries that cannot be converted become NA and raise a coercion warning.
text_and_numbers |>
  mutate(
    numbers = as.numeric(text),
    integers = as.integer(text)
  ) |>
  kable()
## Warning: There were 3 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `numbers = as.numeric(text)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
text numbers integers
Andrew NA NA
33 33.00 33
12.45 12.45 12
-1.00 -1.00 -1
Inf Inf NA

Extracting information from longer labels

# Labels made of one capital letter followed by one digit.
sets <- c("A1", "A2", "B1", "B4", "C5")
# First digit found in each label.
sets |> str_extract("[0-9]")
## [1] "1" "2" "1" "4" "5"
# First capital letter found in each label.
sets |> str_extract("[A-Z]")
## [1] "A" "A" "B" "B" "C"

Inserting and extracting data from longer messages

library(glue)
library(unglue)
# Values to interpolate into the message.
a <- 1
b <- 6
c <- 15.63
# glue() evaluates each {expression} and substitutes its value into the text.
my_string3 <- glue(
  "The numbers a, b, and c are {a}, {b}, and {c}, respectively. Their sum is {a+b+c}."
)
my_string3
## The numbers a, b, and c are 1, 6, and 15.63, respectively. Their sum is 22.63.
# unglue() goes the other way: each {name} in the pattern captures the matching
# piece of the string and becomes a column of the result.
my_string3 |>
  unglue("The numbers a, b, and c are {a}, {b}, and {c}, respectively. Their sum is {d}.")
## $`1`
##   a b     c     d
## 1 1 6 15.63 22.63
# One greeting per row, all following the same sentence template.
my_strings1 <- tibble(
  greeting = c(
    "My name is Andrew.",
    "My name is Li.",
    "My name is Emily."
  )
)
# Capture {name} from the greeting column into a new column; remove = FALSE
# keeps the original greeting column next to it.
my_strings1 |>
  unglue_unnest(greeting, "My name is {name}.", remove = FALSE) |>
  kable()
greeting name
My name is Andrew. Andrew
My name is Li. Li
My name is Emily. Emily

Plots and text labels (factors)

# Boxplots of `cty` for each `trans` value, using the default ordering of
# `trans` on the y axis.
ggplot(mpg, aes(x = cty, y = trans)) +
  geom_boxplot()

# Same plot, but with the `trans` groups ordered by their minimum `cty`
# (descending) instead of the default order.
ggplot(
  mpg,
  aes(
    x = cty,
    y = fct_reorder(trans, cty, min, .desc = TRUE)
  )
) +
  geom_boxplot()

Simplify the number of categories

# Keep the 4 most frequent `trans` values and pool the rest into one "Other"
# level, then order the groups by median `cty`.
# fct_lump_n() is the specific replacement for the general fct_lump(), which
# forcats documents as superseded. Explicit axis titles are set so the axis
# does not show the deparsed factor expression.
mpg |>
  ggplot(aes(
    y = cty,
    x = fct_lump_n(trans, n = 4) |> fct_reorder(cty, median)
  )) +
  geom_boxplot() +
  labs(x = "trans (4 most common, rest lumped)", y = "cty")

# Same data with the axes swapped, giving horizontal boxplots.
mpg |>
  ggplot(aes(
    x = cty,
    y = fct_lump_n(trans, n = 4) |> fct_reorder(cty, median)
  )) +
  geom_boxplot() +
  labs(x = "cty", y = "trans (4 most common, rest lumped)")

Dates and times

Using the lubridate package

# Current date, then current date-time, in the session's time zone.
today()
## [1] "2024-03-26"
now()
## [1] "2024-03-26 11:17:39 ADT"
# The same instant shown in other time zones. The argument of now() is
# `tzone`; writing `tz` only works through R's partial argument matching,
# so it is spelled out in full here.
now(tzone = "UTC")
## [1] "2024-03-26 14:17:39 UTC"
now(tzone = "America/Toronto")
## [1] "2024-03-26 10:17:39 EDT"
now(tzone = "Asia/Shanghai")
## [1] "2024-03-26 22:17:39 CST"

List of timezones: https://en.wikipedia.org/wiki/List_of_tz_database_time_zones

Text to date

# Parse year-month-day text into Date values. The last entry cannot be parsed,
# so ymd() returns NA for it and emits the warning below.
dt1 <- tibble(
  text_date = c(
    "1999-01-31", "2000-02-28", "2010-06-28", "2024-03-14", "2021-02-29"
  ),
  date = ymd(text_date)
)
## Warning: 1 failed to parse.
# Sort by the parsed date; the unparsed (NA) row ends up last.
arrange(dt1, date)
## # A tibble: 5 × 2
##   text_date  date      
##   <chr>      <date>    
## 1 1999-01-31 1999-01-31
## 2 2000-02-28 2000-02-28
## 3 2010-06-28 2010-06-28
## 4 2024-03-14 2024-03-14
## 5 2021-02-29 NA

Crazy formats!

# mdy() handles many month-day-year spellings: abbreviated or full month
# names, a leading weekday, different separators, and two-digit years.
tibble(
  date = c(
    "Jan 5, 1999", "Saturday May 16, 70", "8-8-88",
    "December 31/99", "Jan 1, 01"
  ),
  decoded = mdy(date)
) |>
  kable()
date decoded
Jan 5, 1999 1999-01-05
Saturday May 16, 70 1970-05-16
8-8-88 1988-08-08
December 31/99 1999-12-31
Jan 1, 01 2001-01-01

With times

# Parse date + hour:minute text as date-times in the America/Halifax zone.
# The last two entries carry AM/PM markers.
# NOTE(review): the four hourly 2023-03-10 values look like a daylight-saving
# probe; confirm that this is the intended date.
dt2 <- tibble(
  text_date = c(
    "1999-01-31 09:14", "2000-02-28 12:15", "2010-06-28 23:45",
    "2023-03-10 00:15", "2023-03-10 01:15",
    "2023-03-10 02:15", "2023-03-10 03:15",
    "2024-03-14 07:00 AM", "2021-03-01 6:16 PM"
  ),
  date_time = ymd_hm(text_date, tz = "America/Halifax")
)
kable(dt2)
text_date date_time
1999-01-31 09:14 1999-01-31 09:14:00
2000-02-28 12:15 2000-02-28 12:15:00
2010-06-28 23:45 2010-06-28 23:45:00
2023-03-10 00:15 2023-03-10 00:15:00
2023-03-10 01:15 2023-03-10 01:15:00
2023-03-10 02:15 2023-03-10 02:15:00
2023-03-10 03:15 2023-03-10 03:15:00
2024-03-14 07:00 AM 2024-03-14 07:00:00
2021-03-01 6:16 PM 2021-03-01 18:16:00

Converting dates to numbers

# Capture the current instant once so every component below comes from the
# same moment.
t1 <- now()
t1 |> year()
## [1] 2024
t1 |> day()
## [1] 26
t1 |> hour()
## [1] 11
# The date-time expressed as a fractional year.
t1 |> decimal_date()
## [1] 2024.233
# Day of the year.
t1 |> yday()
## [1] 86
# Reverse conversion: fractional year back to a date-time.
date_decimal(2022.95)
## [1] "2022-12-13 18:00:00 UTC"

Plotting with dates and times

# Add one random normal draw per row (mean 20, sd 3) and plot it against the
# parsed date-times, labelling the x axis as year-month-day.
dt2 |>
  mutate(r = rnorm(n(), mean = 20, sd = 3)) |>
  ggplot(aes(date_time, r)) +
  geom_point() +
  # Alternative, more detailed label format:
  # scale_x_datetime(date_labels = "%Y %Z\n%b-%d %H:%M:%S")
  scale_x_datetime(date_labels = "%Y-%b-%d")