- Quick Review: Split-Apply-Combine
- base R functions: *apply(), by(), aggregate(), split()
5 November 2014
These are tools that, when put together, give us three general steps:
x <- split(strikes, strikes$country) y <- lapply(x, strikes_vs_left, coefficients.only=TRUE) coefs <- do.call(rbind, y)
*apply
in base Rapply()
– arrayslapply()
– list or vector, output listsapply()
– list or vector, simplify to vectorvapply()
– list or vector, safer simplify to vectortapply()
, by()
, aggregate()
– data frames (tables)mapply()
– multiple vectors (special case of 2d array)Useful set, but some issues: inconsistent output
plyr
We have three common data structures beyond vectors: arrays (matrices), data frames, lists.
#install.packages ("plyr") library(plyr)
Functions within: ?*ply()
, replace ?
with character denoting types:
a
, d
, l
a
, d
, l
, or "drop" _
a*ply()
– input arraysprocessed <- a*ply(.data, .margins, .fun, ...)
.data
– an array.margins
– subscripts which the function gets applied over.fun
– the function to be applied...
– additional arguments to functionLooks like
processed <- apply (data, margin, function)
a*ply()
– input arraysLet's test it out!
my.boring.array <- array(1:27, c(3,3,3)) rownames(my.boring.array) <- c("Curly", "Larry", "Moe") colnames(my.boring.array) <- c("Groucho", "Harpo", "Zeppo") dimnames(my.boring.array)[[3]] <- c("Bart", "Lisa", "Maggie") adply (my.boring.array, 1, sum)
## X1 V1 ## 1 Curly 117 ## 2 Larry 126 ## 3 Moe 135
a*ply()
– input arraysalply (my.boring.array, 2, sum)
## $`1` ## [1] 99 ## ## $`2` ## [1] 126 ## ## $`3` ## [1] 153 ## ## attr(,"split_type") ## [1] "array" ## attr(,"split_labels") ## X1 ## 1 Groucho ## 2 Harpo ## 3 Zeppo
a*ply()
– input arraysaaply (my.boring.array, 3, sum)
## Bart Lisa Maggie ## 45 126 207
a*ply()
– input arraysadply (my.boring.array, 1:2, sum)
## X1 X2 V1 ## 1 Curly Groucho 30 ## 2 Larry Groucho 33 ## 3 Moe Groucho 36 ## 4 Curly Harpo 39 ## 5 Larry Harpo 42 ## 6 Moe Harpo 45 ## 7 Curly Zeppo 48 ## 8 Larry Zeppo 51 ## 9 Moe Zeppo 54
a*ply()
– input arraysalply (my.boring.array, 2:3, sum)
## $`1` ## [1] 6 ## ## $`2` ## [1] 15 ## ## $`3` ## [1] 24 ## ## $`4` ## [1] 33 ## ## $`5` ## [1] 42 ## ## $`6` ## [1] 51 ## ## $`7` ## [1] 60 ## ## $`8` ## [1] 69 ## ## $`9` ## [1] 78 ## ## attr(,"split_type") ## [1] "array" ## attr(,"split_labels") ## X1 X2 ## 1 Groucho Bart ## 2 Harpo Bart ## 3 Zeppo Bart ## 4 Groucho Lisa ## 5 Harpo Lisa ## 6 Zeppo Lisa ## 7 Groucho Maggie ## 8 Harpo Maggie ## 9 Zeppo Maggie
a*ply()
– input arraysaaply (my.boring.array, c(1,3), sum)
## X2 ## X1 Bart Lisa Maggie ## Curly 12 39 66 ## Larry 15 42 69 ## Moe 18 45 72
Return no values, but use outcomes in secondary functions.
pdf("lotsaplots.pdf") a_ply (my.boring.array, 2:3, plot) dev.off()
## pdf ## 2
processed <- d*ply(.data, .(splitvariable), .fun, ...)
.data
– an array.(...)
– arguments to split by.fun
– the function to be applied...
– additional arguments to functionLooks like
processed <- by (data, data$splitvariable, function)
Strikes example again:
strikes <- read.csv("strikes.csv") my.strike.lm.better <- function (country.df) { lm(strike.volume ~ left.parliament + unemployment + inflation, data=country.df)$coefficients }
Syntax: for data frame inputs, include the variables for the splitting in a stand-in function .()
– syntax like by()
.
Output to array:
results <- daply (strikes, .(country), my.strike.lm.better) results[1:5,]
## ## country (Intercept) left.parliament unemployment inflation ## Australia 157.9191 0.5658674 -1.1181489 30.4666061 ## Austria 600.6778 -11.2441604 -10.9216990 -0.5923972 ## Belgium -243.4823 12.4516118 0.3578217 10.2673539 ## Canada 167.0712 13.4386408 -48.1790264 27.2180651 ## Denmark -1331.7517 33.1605982 -8.1864655 5.1826201
Output to data frame: in this case, same as array
results <- ddply (strikes, .(country), my.strike.lm.better) results[1:5,]
## country (Intercept) left.parliament unemployment inflation ## 1 Australia 157.9191 0.5658674 -1.1181489 30.4666061 ## 2 Austria 600.6778 -11.2441604 -10.9216990 -0.5923972 ## 3 Belgium -243.4823 12.4516118 0.3578217 10.2673539 ## 4 Canada 167.0712 13.4386408 -48.1790264 27.2180651 ## 5 Denmark -1331.7517 33.1605982 -8.1864655 5.1826201
Output to list: what you would expect.
results <- dlply (strikes, .(country), my.strike.lm.better) results[1:3]
## $Australia ## (Intercept) left.parliament unemployment inflation ## 157.9191118 0.5658674 -1.1181489 30.4666061 ## ## $Austria ## (Intercept) left.parliament unemployment inflation ## 600.6777769 -11.2441604 -10.9216990 -0.5923972 ## ## $Belgium ## (Intercept) left.parliament unemployment inflation ## -243.4822938 12.4516118 0.3578217 10.2673539
processed <- l*ply(.data, , .fun, ...)
.data
– an array.fun
– the function to be applied...
– additional arguments to functionLooks like
processed <- lapply (data, function, ...)
For demo sake:
strikes.split <- split (strikes, strikes$country)
results <- laply (strikes.split, my.strike.lm.better) results[1:5,]
## (Intercept) left.parliament unemployment inflation ## [1,] 157.9191 0.5658674 -1.1181489 30.4666061 ## [2,] 600.6778 -11.2441604 -10.9216990 -0.5923972 ## [3,] -243.4823 12.4516118 0.3578217 10.2673539 ## [4,] 167.0712 13.4386408 -48.1790264 27.2180651 ## [5,] -1331.7517 33.1605982 -8.1864655 5.1826201
results <- ldply (strikes.split, my.strike.lm.better) results[1:5,]
## .id (Intercept) left.parliament unemployment inflation ## 1 Australia 157.9191 0.5658674 -1.1181489 30.4666061 ## 2 Austria 600.6778 -11.2441604 -10.9216990 -0.5923972 ## 3 Belgium -243.4823 12.4516118 0.3578217 10.2673539 ## 4 Canada 167.0712 13.4386408 -48.1790264 27.2180651 ## 5 Denmark -1331.7517 33.1605982 -8.1864655 5.1826201
results <- llply (strikes.split, my.strike.lm.better) results[1:3]
## $Australia ## (Intercept) left.parliament unemployment inflation ## 157.9191118 0.5658674 -1.1181489 30.4666061 ## ## $Austria ## (Intercept) left.parliament unemployment inflation ## 600.6777769 -11.2441604 -10.9216990 -0.5923972 ## ## $Belgium ## (Intercept) left.parliament unemployment inflation ## -243.4822938 12.4516118 0.3578217 10.2673539
Recall the NHL data from last time:
print(load ("all-team-seasons-20132014.RData"))
## [1] "teamtable"
head (teamtable)
## FAC_WIN FAC_LOSE PENL_TAKEN PENL_DRAWN HIT HIT_TAKEN CF CA FF FA SF ## MTL 4 1 2 1 3 6 12 5 10 4 4 ## MTL1 3 2 0 0 0 3 11 9 7 5 6 ## MTL2 6 7 2 2 9 7 12 13 11 9 10 ## MTL3 5 2 2 1 5 6 21 12 16 8 9 ## MTL4 2 1 3 0 0 0 2 1 2 1 2 ## MTL5 3 2 2 2 0 1 3 1 1 1 1 ## SA GF GA Off Neu Def TOI team opponent season gcode scorediffcat ## MTL 3 1 0 2 3 0 632.0 MTL TOR 20132014 20001 1 ## MTL1 3 0 1 1 3 1 459.5 MTL TOR 20132014 20001 2 ## MTL2 8 1 0 3 4 6 641.0 MTL TOR 20132014 20001 3 ## MTL3 5 0 1 3 2 2 641.0 MTL TOR 20132014 20001 4 ## MTL4 1 0 0 2 1 0 91.0 MTL TOR 20132014 20001 2 ## MTL5 1 0 1 4 1 0 190.5 MTL TOR 20132014 20001 3 ## gamestate home ## MTL 1 1 ## MTL1 1 1 ## MTL2 1 1 ## MTL3 1 1 ## MTL4 2 1 ## MTL5 2 1
teamtable <- subset (teamtable, scorediffcat < 6)
Split by team name, version 1:
teamperf <- ddply (teamtable[,c("GF","GA","HIT","HIT_TAKEN","team")], .(team), function(dd) colSums(dd[,-5]))
Split by team name:
teamperf <- ddply (teamtable, .(team), function(dd) colSums(dd[,c("GF","GA","HIT","HIT_TAKEN")])) head(teamperf)
## team GF GA HIT HIT_TAKEN ## 1 ANA 298 240 2515 2512 ## 2 BOS 287 197 2441 2466 ## 3 BUF 150 243 1992 1537 ## 4 CAR 204 226 1807 1965 ## 5 CBJ 241 234 2901 2109 ## 6 CGY 201 238 1708 1463
Split by team name and home/away:
teamperf <- ddply (teamtable, .(team, home), function(dd) colSums(dd[,c("GF","GA","HIT","HIT_TAKEN")])) head(teamperf)
## team home GF GA HIT HIT_TAKEN ## 1 ANA 0 136 129 1079 1245 ## 2 ANA 1 162 111 1436 1267 ## 3 BOS 0 137 113 1106 1270 ## 4 BOS 1 150 84 1335 1196 ## 5 BUF 0 65 124 1039 906 ## 6 BUF 1 85 119 953 631
Split by team name and game ID: The lethal case from last time
teamperf <- ddply (teamtable, .(team, gcode), function(dd) colSums(dd[,c("GF","GA","HIT","HIT_TAKEN")])) dim(teamperf) ## worst case: 30*1323 rows.
## [1] 2646 6
head(teamperf)
## team gcode GF GA HIT HIT_TAKEN ## 1 ANA 20006 1 6 13 21 ## 2 ANA 20029 4 3 20 23 ## 3 ANA 20033 3 2 21 25 ## 4 ANA 20057 6 0 22 26 ## 5 ANA 20078 4 1 17 28 ## 6 ANA 20095 3 2 21 22
plyr
comes with a number of helper functions:
teamtable <- mutate (teamtable, blocksfor = CF - FF, blocksagainst = CA - FA, missesfor = FF - SF, missesagainst = FA - SA) head(teamtable)
## FAC_WIN FAC_LOSE PENL_TAKEN PENL_DRAWN HIT HIT_TAKEN CF CA FF FA SF ## MTL 4 1 2 1 3 6 12 5 10 4 4 ## MTL1 3 2 0 0 0 3 11 9 7 5 6 ## MTL2 6 7 2 2 9 7 12 13 11 9 10 ## MTL3 5 2 2 1 5 6 21 12 16 8 9 ## MTL4 2 1 3 0 0 0 2 1 2 1 2 ## MTL5 3 2 2 2 0 1 3 1 1 1 1 ## SA GF GA Off Neu Def TOI team opponent season gcode scorediffcat ## MTL 3 1 0 2 3 0 632.0 MTL TOR 20132014 20001 1 ## MTL1 3 0 1 1 3 1 459.5 MTL TOR 20132014 20001 2 ## MTL2 8 1 0 3 4 6 641.0 MTL TOR 20132014 20001 3 ## MTL3 5 0 1 3 2 2 641.0 MTL TOR 20132014 20001 4 ## MTL4 1 0 0 2 1 0 91.0 MTL TOR 20132014 20001 2 ## MTL5 1 0 1 4 1 0 190.5 MTL TOR 20132014 20001 3 ## gamestate home blocksfor blocksagainst missesfor missesagainst ## MTL 1 1 2 1 6 1 ## MTL1 1 1 4 4 1 2 ## MTL2 1 1 1 4 1 1 ## MTL3 1 1 5 4 7 3 ## MTL4 2 1 0 0 0 0 ## MTL5 2 1 2 0 0 0
plyr
comes with a number of helper functions:
teambits <- summarize (teamtable, blocksfor = CF - FF, blocksagainst = CA - FA, missesfor = FF - SF, missesagainst = FA - SA) head(teambits)
## blocksfor blocksagainst missesfor missesagainst ## 1 2 1 6 1 ## 2 4 4 1 2 ## 3 1 4 1 1 ## 4 5 4 7 3 ## 5 0 0 0 0 ## 6 2 0 0 0