Last week: Pipes and dplyr

Recall: using pipes (%>%)

Tidyverse functions are at their best when composed together using the pipe (%>%) operator!

library(tidyverse) 
mtcars %>% 
  filter((mpg >= 14 & disp >= 200) | (drat <= 3)) %>% 
  head(2)
##    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## 1 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## 2 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2

Shortcut: use ctrl + shift + m in RStudio as a shortcut for typing %>%

Part I

Mastering the tidyr verbs

tidyr verbs

Our tidyr journey starts of with learning the following verbs (functions):

Key takeaway: as with dplyr, think of data frames as nouns and tidyr verbs as actions that you apply to manipulate them—especially natural when using pipes

pivot_longer()

Use pivot_longer() to make “wide” data longer:

# devtools::install_github("rstudio/EDAWR")
library(EDAWR) # Load some nice data sets

EDAWR::cases %>% 
  head(3)
##   country  2011  2012  2013
## 1      FR  7000  6900  7000
## 2      DE  5800  6000  6200
## 3      US 15000 14000 13000
EDAWR::cases %>% 
  pivot_longer(names_to = "year", values_to = "n", cols = 2:4) %>% 
  head(5)
## # A tibble: 5 x 3
##   country year      n
##   <chr>   <chr> <dbl>
## 1 FR      2011   7000
## 2 FR      2012   6900
## 3 FR      2013   7000
## 4 DE      2011   5800
## 5 DE      2012   6000


# Different approach to do the same thing
EDAWR::cases %>% 
  pivot_longer(names_to = "year", values_to = "n", -country) %>% 
  head(5)
## # A tibble: 5 x 3
##   country year      n
##   <chr>   <chr> <dbl>
## 1 FR      2011   7000
## 2 FR      2012   6900
## 3 FR      2013   7000
## 4 DE      2011   5800
## 5 DE      2012   6000
# Could also do: 
# EDAWR::cases %>% 
#   pivot_longer(names_to = "year", values_to = "n", c(`2011`, `2012`, `2013`)) 
# or: 
# EDAWR::cases %>% 
#   pivot_longer(names_to = "year", values_to = "n",  `2011`:`2013`) 

pivot_wider()

Use pivot_wider() to make “long” data wider:

EDAWR::pollution %>% 
  head(5)
##       city  size amount
## 1 New York large     23
## 2 New York small     14
## 3   London large     22
## 4   London small     16
## 5  Beijing large    121
EDAWR::pollution %>% 
  pivot_wider(names_from = "size", 
              values_from = "amount")
## # A tibble: 3 x 3
##   city     large small
##   <chr>    <dbl> <dbl>
## 1 New York    23    14
## 2 London      22    16
## 3 Beijing    121    56

When could I used these?

Additionally, if you find yourself getting stuck (with nuance situations) they are more complicated functions like pivot_wider_spec, etc. for these cases (see Manual Specs)

separate()

Use separate() to split a single column into multiple ones:

EDAWR::storms %>% 
  head(3)
##     storm wind pressure       date
## 1 Alberto  110     1007 2000-08-03
## 2    Alex   45     1009 1998-07-27
## 3 Allison   65     1005 1995-06-03
storms2 <- EDAWR::storms %>% 
  separate(date, c("y", "m", "d")) # sep = "-"

unite()

Use unite() to combine multiple columns into a single column:

storms2 %>%
  unite(date, y, m, d, sep = "-")
## # A tibble: 6 x 4
##   storm    wind pressure date      
##   <chr>   <int>    <int> <chr>     
## 1 Alberto   110     1007 2000-08-03
## 2 Alex       45     1009 1998-07-27
## 3 Allison    65     1005 1995-06-03
## 4 Ana        40     1013 1997-06-30
## 5 Arlene     50     1010 1999-06-11
## 6 Arthur     45     1010 1996-06-17

Note that unite() and separate() are inverse operations

Part II

dplyr: group_by() and summarize()

Recall: dplyr and SQL

group_by()

Use group_by() to define a grouping of rows based on a column:

mtcars %>% 
  group_by(cyl) %>%
  head(4)
## # A tibble: 4 x 11
## # Groups:   cyl [2]
##     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1  21       6   160   110  3.9   2.62  16.5     0     1     4     4
## 2  21       6   160   110  3.9   2.88  17.0     0     1     4     4
## 3  22.8     4   108    93  3.85  2.32  18.6     1     1     4     1
## 4  21.4     6   258   110  3.08  3.22  19.4     1     0     3     1
mtcars %>% 
  group_by(cyl) %>%
  head(4) %>% class
## [1] "grouped_df" "tbl_df"     "tbl"        "data.frame"

summarize()

Use summarise() (or summarize() for us Americans) to apply functions to rows—ungrouped or grouped—of a data frame:

# Ungrouped
mtcars %>% 
  summarize(mpg = mean(mpg),
            hp = mean(hp))
##        mpg       hp
## 1 20.09062 146.6875
# Grouped by number of cylinders
mtcars %>%
  group_by(cyl) %>%
  summarize(mpg = mean(mpg),
            hp = mean(hp))
## # A tibble: 3 x 3
##     cyl   mpg    hp
##   <dbl> <dbl> <dbl>
## 1     4  26.7  82.6
## 2     6  19.7 122. 
## 3     8  15.1 209.

mtcars %>%
  group_by(cyl) %>%
  summarize(mpg_mean = mean(mpg),
            mpg_max = max(mpg),
            hp_mean = mean(hp),
            hp_max = max(hp))
## # A tibble: 3 x 5
##     cyl mpg_mean mpg_max hp_mean hp_max
##   <dbl>    <dbl>   <dbl>   <dbl>  <dbl>
## 1     4     26.7    33.9    82.6    113
## 2     6     19.7    21.4   122.     175
## 3     8     15.1    19.2   209.     335

ungroup()

Use ungroup() to remove groupings structure from a data frame:

mtcars %>%
  group_by(cyl) %>%
  ungroup() %>%
  summarize(hp = mean(hp),
            mpg = mean(mpg))
## # A tibble: 1 x 2
##      hp   mpg
##   <dbl> <dbl>
## 1  147.  20.1

Beyond summarize() (bonus material)

summarize() in dplyr only allows you to calculate single numerical summaries, but you could image that you want more than that.

For example:

mtcars %>% 
  pull(hp) %>% tapply(INDEX = mtcars$cyl, FUN = summary)
## $`4`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   52.00   65.50   91.00   82.64   96.00  113.00 
## 
## $`6`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   105.0   110.0   110.0   122.3   123.0   175.0 
## 
## $`8`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   150.0   176.2   192.5   209.2   241.2   335.0

There are tools to do this in tidyverse, specifically nest/unnesting data into columns and purrr::map function. (please see the first section of this lecture (Rmd, html) for some notes). These are pretty useful if you are using a lot of the tidyverse paradigm

mtcars %>%
  group_by(cyl) %>%
  nest() %>% # creates a column with the data conditional of subset
  mutate(sum = purrr::map(data, function(df) summary(df$hp)),
         sum_df = purrr::map(sum, broom::tidy)) %>% # unravel things to be data.frames
  select(cyl, sum_df) %>%
  unnest(cols = sum_df) 
## # A tibble: 3 x 7
## # Groups:   cyl [3]
##     cyl minimum    q1 median  mean    q3 maximum
##   <dbl>   <dbl> <dbl>  <dbl> <dbl> <dbl>   <dbl>
## 1     6     105 110     110  122.   123      175
## 2     4      52  65.5    91   82.6   96      113
## 3     8     150 176.    192. 209.   241.     335

Part III

dplyr: left_join() and inner_join()

Join operations

A “join” operation in database terminology is a merging of two data frames for us. There are 4 types of joins:

Column values that cannot be filled in are assigned NA values


It helps to visualize the join types:

Two toy data frames

has_kids_tab1 <- data.frame(name = c("Robert Downey, Jr", "Scarlett Johansson", "Chris Hemsworth"), 
                  children = c(3, 1, 3),
                  stringsAsFactors = FALSE)
americans_tab2 <- data.frame(name = c("Chris Evans", "Robert Downey, Jr", "Scarlett Johansson"),
                  age = c(38, 54, 34),
                  stringsAsFactors = FALSE)
has_kids_tab1
##                 name children
## 1  Robert Downey, Jr        3
## 2 Scarlett Johansson        1
## 3    Chris Hemsworth        3
americans_tab2
##                 name age
## 1        Chris Evans  38
## 2  Robert Downey, Jr  54
## 3 Scarlett Johansson  34

inner_join()

Suppose we want to join tab1 and tab2 by name, but keep only actors in intersection (aka in both tables):

inner_join(x = has_kids_tab1, y = americans_tab2, by = "name")
##                 name children age
## 1  Robert Downey, Jr        3  54
## 2 Scarlett Johansson        1  34

left_join()

Suppose we want to join tab1 and tab2 by name, but keep all actors from tab1:

left_join(x = has_kids_tab1, y = americans_tab2, by = c("name" = "name"))
##                 name children age
## 1  Robert Downey, Jr        3  54
## 2 Scarlett Johansson        1  34
## 3    Chris Hemsworth        3  NA

right_join()

Suppose we want to join tab1 and tab2 by name, but keep all actors from tab2:

right_join(x = has_kids_tab1, y = americans_tab2, by = "name")
##                 name children age
## 1        Chris Evans       NA  38
## 2  Robert Downey, Jr        3  54
## 3 Scarlett Johansson        1  34

full_join()

Finally, suppose we want to join tab1 and tab2 by name, and keep all actors from both:

full_join(x = has_kids_tab1, y = americans_tab2, by = "name")
##                 name children age
## 1  Robert Downey, Jr        3  54
## 2 Scarlett Johansson        1  34
## 3    Chris Hemsworth        3  NA
## 4        Chris Evans       NA  38

More nuanced structure

my_peeps %T>% print() %>% dim()
##    pol gender state  IQ
## 1    D      F    PA  94
## 2    D      M    PA 102
## 3    R      F    PA  93
## 4    D      F    AZ 107
## 5    D      F    AZ 106
## 6    D      F    AZ  88
## 7    D      F    PA  99
## 8    R      M    AZ  88
## 9    D      M    AZ 102
## 10   D      F    AZ  96
## [1] 10  4
politics %T>% print() %>% dim()
##          senator pol gender STATE
## 1 Kyrsten Sinema   D      F    AZ
## 2 Martha McSally   R      F    AZ
## 3     Pat Toomey   R      M    PA
## 4  Boy Casey Jr.   D      M    PA
## [1] 4 4

my_peeps %>% left_join(politics,by = c("state" = "STATE",
                                       "pol" = "pol")) %>% head(6)
##   pol gender.x state  IQ        senator gender.y
## 1   D        F    PA  94  Boy Casey Jr.        M
## 2   D        M    PA 102  Boy Casey Jr.        M
## 3   R        F    PA  93     Pat Toomey        M
## 4   D        F    AZ 107 Kyrsten Sinema        F
## 5   D        F    AZ 106 Kyrsten Sinema        F
## 6   D        F    AZ  88 Kyrsten Sinema        F

Notice:

Summary

References