Cleaning the WHO TB data

We need the tidyverse metapackage

library(tidyverse)

Coalesce the code columns into ‘key’ and ‘cases’ (slide 28)

who1 <- who |> pivot_longer(cols = new_sp_m014:newrel_f65,
                    names_to = "key",
                    values_to = "cases",
                    values_drop_na = TRUE)
who1
## # A tibble: 76,046 × 6
##    country     iso2  iso3   year key          cases
##    <chr>       <chr> <chr> <int> <chr>        <int>
##  1 Afghanistan AF    AFG    1997 new_sp_m014      0
##  2 Afghanistan AF    AFG    1997 new_sp_m1524    10
##  3 Afghanistan AF    AFG    1997 new_sp_m2534     6
##  4 Afghanistan AF    AFG    1997 new_sp_m3544     3
##  5 Afghanistan AF    AFG    1997 new_sp_m4554     5
##  6 Afghanistan AF    AFG    1997 new_sp_m5564     2
##  7 Afghanistan AF    AFG    1997 new_sp_m65       0
##  8 Afghanistan AF    AFG    1997 new_sp_f014      5
##  9 Afghanistan AF    AFG    1997 new_sp_f1524    38
## 10 Afghanistan AF    AFG    1997 new_sp_f2534    36
## # … with 76,036 more rows

Rename newrel to new_rel then separate

who2 <- who1 |> mutate(key = stringr::str_replace(key, "newrel", "new_rel"))
who3 <- who2 |> separate(key, c("new", "type", "sexage"), sep = "_")
who3
## # A tibble: 76,046 × 8
##    country     iso2  iso3   year new   type  sexage cases
##    <chr>       <chr> <chr> <int> <chr> <chr> <chr>  <int>
##  1 Afghanistan AF    AFG    1997 new   sp    m014       0
##  2 Afghanistan AF    AFG    1997 new   sp    m1524     10
##  3 Afghanistan AF    AFG    1997 new   sp    m2534      6
##  4 Afghanistan AF    AFG    1997 new   sp    m3544      3
##  5 Afghanistan AF    AFG    1997 new   sp    m4554      5
##  6 Afghanistan AF    AFG    1997 new   sp    m5564      2
##  7 Afghanistan AF    AFG    1997 new   sp    m65        0
##  8 Afghanistan AF    AFG    1997 new   sp    f014       5
##  9 Afghanistan AF    AFG    1997 new   sp    f1524     38
## 10 Afghanistan AF    AFG    1997 new   sp    f2534     36
## # … with 76,036 more rows

Drop redundant columns

who4 <- who3 |> select(-new, -iso2, -iso3)
who4
## # A tibble: 76,046 × 5
##    country      year type  sexage cases
##    <chr>       <int> <chr> <chr>  <int>
##  1 Afghanistan  1997 sp    m014       0
##  2 Afghanistan  1997 sp    m1524     10
##  3 Afghanistan  1997 sp    m2534      6
##  4 Afghanistan  1997 sp    m3544      3
##  5 Afghanistan  1997 sp    m4554      5
##  6 Afghanistan  1997 sp    m5564      2
##  7 Afghanistan  1997 sp    m65        0
##  8 Afghanistan  1997 sp    f014       5
##  9 Afghanistan  1997 sp    f1524     38
## 10 Afghanistan  1997 sp    f2534     36
## # … with 76,036 more rows

Separate sex and age

who5 <- who4 |> separate(sexage, c("sex", "age"), 1)
who5
## # A tibble: 76,046 × 6
##    country      year type  sex   age   cases
##    <chr>       <int> <chr> <chr> <chr> <int>
##  1 Afghanistan  1997 sp    m     014       0
##  2 Afghanistan  1997 sp    m     1524     10
##  3 Afghanistan  1997 sp    m     2534      6
##  4 Afghanistan  1997 sp    m     3544      3
##  5 Afghanistan  1997 sp    m     4554      5
##  6 Afghanistan  1997 sp    m     5564      2
##  7 Afghanistan  1997 sp    m     65        0
##  8 Afghanistan  1997 sp    f     014       5
##  9 Afghanistan  1997 sp    f     1524     38
## 10 Afghanistan  1997 sp    f     2534     36
## # … with 76,036 more rows

Doing all the above in a single pipeline

who_tidy <- who |>
  pivot_longer(cols = new_sp_m014:newrel_f65,
               names_to = "key",
               values_to = "cases",
               values_drop_na = TRUE) |>
  mutate(key = stringr::str_replace(key, "newrel", "new_rel")) |>
  separate(key, c("new", "type", "sexage"), sep = "_") |>
  select(-new, -iso2, -iso3) |>
  separate(sexage, c("sex", "age"), 1)
who_tidy
## # A tibble: 76,046 × 6
##    country      year type  sex   age   cases
##    <chr>       <int> <chr> <chr> <chr> <int>
##  1 Afghanistan  1997 sp    m     014       0
##  2 Afghanistan  1997 sp    m     1524     10
##  3 Afghanistan  1997 sp    m     2534      6
##  4 Afghanistan  1997 sp    m     3544      3
##  5 Afghanistan  1997 sp    m     4554      5
##  6 Afghanistan  1997 sp    m     5564      2
##  7 Afghanistan  1997 sp    m     65        0
##  8 Afghanistan  1997 sp    f     014       5
##  9 Afghanistan  1997 sp    f     1524     38
## 10 Afghanistan  1997 sp    f     2534     36
## # … with 76,036 more rows