You will need the tidyverse metapackage for this homework.

library(tidyverse)

Problem 1: Airquality

# Ozone against wind speed for the built-in `airquality` data, with points
# coloured by whether the day was hotter or colder than the median temperature
# and a single linear fit drawn over all points.
airquality |>
  mutate(hotorcold = if_else(Temp > median(Temp), "Hotter", "Colder")) |>
  ggplot(mapping = aes(x = Wind, y = Ozone)) +
    # Ozone has missing readings; na.rm = TRUE drops them deliberately instead
    # of relying on ggplot2's "removed rows" warning. Axis ranges are unchanged.
    geom_point(mapping = aes(color = hotorcold), na.rm = TRUE) +
    # Spell out the model so geom_smooth() does not have to announce the
    # formula it picked; y ~ x is what method = "lm" uses by default.
    geom_smooth(method = "lm", formula = y ~ x, na.rm = TRUE) +
    scale_color_manual(values = c("Colder" = "cadetblue3", "Hotter" = "tomato")) +
    labs(title = "Ozone and Wind in NYC, 1973",
         x = "Wind (mph)", y = "Ozone (ppb)", color = "") +
    theme_classic()

Problem 2: Derangement

In lecture 8, we plotted the approximate probability that a permutation of 100 elements is a derangement.

# Run the simulation
set.seed(100)

# Draw one random permutation of 1..n and report whether it is a derangement,
# i.e. whether no element lands in its original position.
#   n: number of elements to permute (default 100, the size used in lecture).
# Returns a single TRUE/FALSE.
is_deranged <- function(n = 100L) {
  # sample.int(n) draws the same permutation that sample(1:n) would.
  !any(sample.int(n) == seq_len(n))
}

# One TRUE/FALSE per simulated permutation.
result <- replicate(2000, is_deranged())
# Proportion of derangements among the first k replications, for every k.
# The denominator is derived from `result` so the two can never disagree.
running_means <- cumsum(result) / seq_along(result)

# Plot the result
# ggplot() works on data frames, so pair each replication count with the
# running mean reached at that point.
result <- tibble(
  no_reps = seq_along(running_means),
  p = running_means
)
# Running estimate as a line; the red reference line sits at 1/e.
result |>
  ggplot(aes(x = no_reps, y = p)) +
  geom_line() +
  geom_hline(yintercept = 1 / exp(1), color = "red") +
  labs(title = "Probability of a derangement", x = "Number of replications") +
  theme_classic()

Problem 3: World Health Organization

Part 1

For each country, year, and sex, compute the total number of cases of TB. Put the result into a tibble with 4 columns.

# Total TB cases for every country / year / sex combination.
# Result: an ungrouped tibble with columns country, year, sex, total_cases.
who_total <- who_tidy |>
  # Discard incomplete rows first so sum() never meets an NA.
  drop_na(country, year, sex, cases) |>
  group_by(country, year, sex) |>
  # .groups = "drop" hands back a plain tibble. Without it the result stays
  # grouped by country and year, and summarise() prints a note saying so.
  summarize(total_cases = sum(cases), .groups = "drop")

Part 2

# Jittered scatter of yearly TB totals, one panel per sex, with the largest
# country-year totals labelled in red.
who_total |>
  ggplot(aes(x = year, y = total_cases)) +
  geom_jitter(width = 0.3, alpha = 0.4) +
  ggrepel::geom_text_repel(
    mapping = aes(label = paste(country, year)),
    # Only the extreme points get a label; the cutoff is lower for women.
    data = who_total |>
      filter(
        (sex == "m" & total_cases > 7e5) |
          (sex == "f" & total_cases > 4e5)
      ),
    color = "red"
  ) +
  scale_x_continuous(breaks = seq(1980, 2015, by = 5)) +
  scale_y_continuous(labels = scales::label_comma()) +
  facet_wrap(
    vars(sex),
    labeller = labeller(sex = c("f" = "Women", "m" = "Men"))
  ) +
  labs(
    title = "Tuberculosis Cases in Countries by Year",
    subtitle = "Dramatic increase in case count since mid 90s",
    caption = "Source: World Health Organization",
    x = "",
    y = "Total Cases"
  ) +
  theme_minimal()

Problem 4: Pew Research Center

The following is data from the Pew Research Center about religion and income. It ships with the tidyr package, which is part of the tidyverse metapackage.

# Print the survey table as it ships with tidyr: one row per religion and one
# column per income bracket.
relig_income
## # A tibble: 18 x 11
##    religion `<$10k` `$10-20k` `$20-30k` `$30-40k` `$40-50k` `$50-75k` `$75-100k`
##    <chr>      <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>      <dbl>
##  1 Agnostic      27        34        60        81        76       137        122
##  2 Atheist       12        27        37        52        35        70         73
##  3 Buddhist      27        21        30        34        33        58         62
##  4 Catholic     418       617       732       670       638      1116        949
##  5 Don’t k~      15        14        15        11        10        35         21
##  6 Evangel~     575       869      1064       982       881      1486        949
##  7 Hindu          1         9         7         9        11        34         47
##  8 Histori~     228       244       236       238       197       223        131
##  9 Jehovah~      20        27        24        24        21        30         15
## 10 Jewish        19        19        25        25        30        95         69
## 11 Mainlin~     289       495       619       655       651      1107        939
## 12 Mormon        29        40        48        51        56       112         85
## 13 Muslim         6         7         9        10         9        23         16
## 14 Orthodox      13        17        23        32        32        47         38
## 15 Other C~       9         7        11        13        13        14         18
## 16 Other F~      20        33        40        46        49        63         46
## 17 Other W~       5         2         3         4         2         7          3
## 18 Unaffil~     217       299       374       365       341       528        407
## # ... with 3 more variables: `$100-150k` <dbl>, `>150k` <dbl>,
## #   `Don't know/refused` <dbl>

Part 1

In a short sentence or two, explain why this dataset is not tidy. Answer: The dataset is not tidy because the income levels, which are values of a variable, are used as column names. To make it tidy, there should be a single income column that holds the levels as values.

Part 2

Tidy the dataset and store the result in relig_income_tidy.

# Reshape wide to long: every column except `religion` becomes a row, with the
# old column name stored in `income` and the cell value in `frequency`.
relig_income_tidy <- pivot_longer(
  relig_income,
  cols = !religion,
  names_to = "income",
  values_to = "frequency"
)
# Show the first four rows.
relig_income_tidy |> head(n = 4)
## # A tibble: 4 x 3
##   religion income  frequency
##   <chr>    <chr>       <dbl>
## 1 Agnostic <$10k          27
## 2 Agnostic $10-20k        34
## 3 Agnostic $20-30k        60
## 4 Agnostic $30-40k        81

Part 3

# Horizontal bar chart of the total frequency per religion, sorted so the
# largest group sits at the top.
relig_income_tidy |>
  group_by(religion) |>
  summarise(count = sum(frequency)) |>
  ggplot(aes(x = count, y = reorder(religion, count), fill = religion)) +
  # Each bar is already labelled on the axis, so the fill legend is left out.
  geom_col(show.legend = FALSE) +
  theme_minimal() +
  labs(
    title = "Participants in Pew Research Survey",
    caption = "Source: Pew Research Center",
    x = "",
    y = ""
  )