More on Apply

Statistical Computing, 36-350

Friday October 14, 2016

lapply(), elements of a list or vector

The lapply() function takes inputs as in: lapply(x, FUN=my.fun), to apply my.fun() across elements of a list or vector x. The output is a list

# A named list holding a numeric, a character, and a logical vector
my.list = list(
  nums  = seq(from=0.1, to=0.6, by=0.1),
  chars = letters[1:12],
  bools = sample(c(TRUE,FALSE), size=6, replace=TRUE)
)
my.list
## $nums
## [1] 0.1 0.2 0.3 0.4 0.5 0.6
## 
## $chars
##  [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l"
## 
## $bools
## [1]  TRUE  TRUE  TRUE FALSE  TRUE  TRUE
# mean() is not defined for character data, so the chars element triggers a
# warning and comes back as NA
lapply(my.list, FUN=mean)
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
## $nums
## [1] 0.35
## 
## $chars
## [1] NA
## 
## $bools
## [1] 0.8333333

The return value is always a list

With lapply() and say, FUN=my.fun, we’ll get back a list, no matter the output type of my.fun()

# summary() returns a differently shaped result for each element, yet
# lapply() still collects the results in a list
lapply(my.list, FUN=summary)
## $nums
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.100   0.225   0.350   0.350   0.475   0.600 
## 
## $chars
##    Length     Class      Mode 
##        12 character character 
## 
## $bools
##    Mode   FALSE    TRUE    NA's 
## logical       1       5       0

Custom functions, “on-the-fly” functions, extra arguments

These all work like they do in apply(). E.g., can use: lapply(x, FUN=my.fun, extra.arg.1, extra.arg.2), for two extra arguments extra.arg.1, extra.arg.2 to be passed to my.fun()

# How to compute the leave-one-out means, also called jackknife means,
# without a for() loop?

# Mean of vec after dropping its i-th entry
#   i:   index of the single entry to leave out
#   vec: numeric vector
mean.omitting.one = function(i, vec) { return(mean(vec[-i])) }
my.vec = state.x77[,"Frost"] # Frost variable in the states data
n = length(my.vec)
# seq_len(n) rather than 1:n, so that an empty vector gives no iterations
my.vec.jack = lapply(seq_len(n), FUN=mean.omitting.one, vec=my.vec)
head(my.vec.jack) # It's a list, and here are the first 6 elements
## [[1]]
## [1] 106.1837
## 
## [[2]]
## [1] 103.4898
## 
## [[3]]
## [1] 106.2857
## 
## [[4]]
## [1] 105.2653
## 
## [[5]]
## [1] 106.1837
## 
## [[6]]
## [1] 103.2041

sapply(), elements of a list or vector

The sapply() function works just like lapply(), but tries to simplify the return value whenever possible. The most common case is the conversion from a list to a vector

# Leave-one-out means again, this time letting sapply() simplify the result;
# seq_len(n) rather than 1:n, so that n = 0 gives no iterations
my.vec.jack = sapply(seq_len(n), FUN=mean.omitting.one, vec=my.vec)
head(my.vec.jack) # Now this is a vector, with same elements as before
## [1] 106.1837 103.4898 106.2857 105.2653 106.1837 103.2041
# Jackknife standard error: sd of the leave-one-out means, rescaled by
# sqrt((n-1)^2/n)
sqrt((n-1)^2/n) * sd(my.vec.jack)
## [1] 7.351202
# Compare to "usual" standard error of the mean; the printed values match
sd(my.vec)/sqrt(n)
## [1] 7.351202

tapply(), levels of a factor vector

The function tapply() takes inputs as in: tapply(x, INDEX=my.index, FUN=my.fun), to apply my.fun() to subsets of entries in x that share a common level in my.index

head(state.x77) # First rows of the states matrix (50 states x 8 variables)
##            Population Income Illiteracy Life Exp Murder HS Grad Frost
## Alabama          3615   3624        2.1    69.05   15.1    41.3    20
## Alaska            365   6315        1.5    69.31   11.3    66.7   152
## Arizona          2212   4530        1.8    70.55    7.8    58.1    15
## Arkansas         2110   3378        1.9    70.66   10.1    39.9    65
## California      21198   5114        1.1    71.71   10.3    62.6    20
## Colorado         2541   4884        0.7    72.06    6.8    63.9   166
##              Area
## Alabama     50708
## Alaska     566432
## Arizona    113417
## Arkansas    51945
## California 156361
## Colorado   103766
head(state.region) # First entries of the factor of regions for the 50 states
## [1] South West  West  South West  West 
## Levels: Northeast South North Central West
# Now, let's average the Frost variable within each region
tapply(state.x77[,"Frost"], INDEX=state.region, FUN=mean)
##     Northeast         South North Central          West 
##      132.7778       64.6250      138.8333      102.1538

split(), split by levels of a factor

Sometimes we want to split up the rows of a data frame or entries of a vector by levels of a factor. The function split() does this, as in: split(x, f=my.index) to split a data frame or vector x according to levels of my.index

# Let's split up the state.x77 matrix according to region

# First cast into a data frame
state.x77.df = data.frame(state.x77)
# Now split the rows by region
state.x77.by.reg = split(x=state.x77.df, f=state.region)

# The result is a list
class(state.x77.by.reg)
## [1] "list"
# This has 4 elements for the 4 regions
names(state.x77.by.reg)
## [1] "Northeast"     "South"         "North Central" "West"
# Each element is a data frame
class(state.x77.by.reg[[1]])
## [1] "data.frame"

(Continued)

# For each region, display the first 3 rows of the data frame; the extra
# argument 3 is passed along by lapply() to head()
lapply(state.x77.by.reg, FUN=head, 3)
## $Northeast
##               Population Income Illiteracy Life.Exp Murder HS.Grad Frost
## Connecticut         3100   5348        1.1    72.48    3.1    56.0   139
## Maine               1058   3694        0.7    70.39    2.7    54.7   161
## Massachusetts       5814   4755        1.1    71.83    3.3    58.5   103
##                Area
## Connecticut    4862
## Maine         30920
## Massachusetts  7826
## 
## $South
##          Population Income Illiteracy Life.Exp Murder HS.Grad Frost  Area
## Alabama        3615   3624        2.1    69.05   15.1    41.3    20 50708
## Arkansas       2110   3378        1.9    70.66   10.1    39.9    65 51945
## Delaware        579   4809        0.9    70.06    6.2    54.6   103  1982
## 
## $`North Central`
##          Population Income Illiteracy Life.Exp Murder HS.Grad Frost  Area
## Illinois      11197   5107        0.9    70.14   10.3    52.6   127 55748
## Indiana        5313   4458        0.7    70.88    7.1    52.9   122 36097
## Iowa           2861   4628        0.5    72.56    2.3    59.0   140 55941
## 
## $West
##            Population Income Illiteracy Life.Exp Murder HS.Grad Frost
## Alaska            365   6315        1.5    69.31   11.3    66.7   152
## Arizona          2212   4530        1.8    70.55    7.8    58.1    15
## California      21198   5114        1.1    71.71   10.3    62.6    20
##              Area
## Alaska     566432
## Arizona    113417
## California 156361

(Continued)

# For each region, average each of the 8 numeric variables.
# sapply() over the columns of each data frame averages one column at a time,
# whereas apply() would first coerce the whole data frame to a matrix
lapply(state.x77.by.reg, FUN=function(df) { sapply(df, FUN=mean) })
## $Northeast
##   Population       Income   Illiteracy     Life.Exp       Murder 
##  5495.111111  4570.222222     1.000000    71.264444     4.722222 
##      HS.Grad        Frost         Area 
##    53.966667   132.777778 18141.000000 
## 
## $South
##  Population      Income  Illiteracy    Life.Exp      Murder     HS.Grad 
##  4208.12500  4011.93750     1.73750    69.70625    10.58125    44.34375 
##       Frost        Area 
##    64.62500 54605.12500 
## 
## $`North Central`
##  Population      Income  Illiteracy    Life.Exp      Murder     HS.Grad 
##  4803.00000  4611.08333     0.70000    71.76667     5.27500    54.51667 
##       Frost        Area 
##   138.83333 62652.00000 
## 
## $West
##   Population       Income   Illiteracy     Life.Exp       Murder 
## 2.915308e+03 4.702615e+03 1.023077e+00 7.123462e+01 7.215385e+00 
##      HS.Grad        Frost         Area 
## 6.200000e+01 1.021538e+02 1.344630e+05