Statistical Computing, 36-350
Monday October 12, 2015
How to get the positions in a vector / columns in a matrix / rows in a data frame, matching some condition?
# Load the cats data set (cat body/heart weights) from the MASS package
data(cats,package="MASS")
# Heart weights of the male cats, via a logical index on the Hwt column
head(cats$Hwt[cats$Sex=="M"])
## [1] 6.5 6.5 10.1 7.2 7.6 7.9
# Same result: logical row index plus column name on the data frame
head(cats[cats$Sex=="M","Hwt"])
## [1] 6.5 6.5 10.1 7.2 7.6 7.9
cats$Sex=="M"
is a Boolean vector, the same length as cats$Sex
# A random half of the rows of cats (sampled without replacement)
cats.subset = cats[sample(1:nrow(cats),size=nrow(cats)/2),]
head(cats.subset)
## Sex Bwt Hwt
## 134 M 3.5 15.7
## 97 M 2.9 10.6
## 14 F 2.2 8.7
## 70 M 2.5 11.0
## 65 M 2.5 7.9
## 53 M 2.2 7.9
# The subset is a data frame too, so column indexing still works
head(cats.subset[,"Bwt"])
## [1] 3.5 2.9 2.2 2.5 2.5 2.2
# Index vectors can be saved and reused: a logical mask of the male
# cats, and a vector of random row numbers
males = cats$Sex=="M"
row.inds = sample(1:nrow(cats),size=nrow(cats)/2)
# (Now go forth and use these to index)
apply() tricks below.
n = 50; set.seed(0)
# Simulated movie data: n gross figures, with random genres and leads
movies = data.frame(gross=round(runif(n,1,500)),
genre=sample(c("comedy","action","vampire"),n,replace=T),
lead=sample(c("Morgan Freeman","Justin Bieber"),n,replace=T))
# Indexing by a value (not a logical mask or position) just returns NA
movies$lead["Justin Bieber"] # Not going to work!
## [1] <NA>
## Levels: Justin Bieber Morgan Freeman
# Inefficient and clumsy!
# Growing gross.bieber with c() copies the whole vector on each append
gross.bieber = c()
for (i in 1:nrow(movies)) {
if (movies$lead[i]=="Justin Bieber") {
gross.bieber = c(gross.bieber, movies$gross[i])
}
}
gross.bieber
## [1] 448 133 454 102 315 32 104 89 359 389 467 134 8 171 242 94 414
## [18] 335 397 55 362 206 411 324 265 239
# One-line logical subsetting replaces the whole loop above
movies$gross[movies$lead=="Justin Bieber"] # Much better
## [1] 448 133 454 102 315 32 104 89 359 389 467 134 8 171 242 94 414
## [18] 335 397 55 362 206 411 324 265 239
movies[movies$lead=="Justin Bieber","gross"] # Equivalent to above
## [1] 448 133 454 102 315 32 104 89 359 389 467 134 8 171 242 94 414
## [18] 335 397 55 362 206 411 324 265 239
mean(movies$gross[movies$lead=="Justin Bieber"]) # Bieber avg
## [1] 251.8846
mean(movies$gross[movies$lead!="Justin Bieber"]) # Freeman avg
## [1] 287.1667
# is.na() is vectorized: it tests every cell of the data frame at once
dim(is.na(movies)) # checks each element for being NA
## [1] 50 3
# log() is vectorized too, so this is the mean of the log grosses
mean(log(movies$gross))
## [1] 5.352082
# mean and sd are vectors: the i-th draw uses mean[i] and sd[i]
rnorm(n=5,mean=-2:2,sd=0.001*(1:5)) # Check that you understand this!
## [1] -1.9988990309 -0.9997124570 -0.0003532608 0.9963517265 1.9928120688
grep, regexp (regular expression) matching;
the apply() family of functions: sapply() or lapply()
# Leave-one-out mean: the average of x with its i-th element removed
mean.omitting.one <- function(i, x) {
  mean(x[-i])
}
# Jackknife: entry i is the mean of Bwt computed with row i left out
jackknifed.means = sapply(1:nrow(cats),mean.omitting.one,x=cats$Bwt)
(n = length(jackknifed.means))
## [1] 144
# Jackknife estimate of the standard error of the sample mean
sd(jackknifed.means)*sqrt((n-1)^2/n)
## [1] 0.04044222
sapply() tries to return a vector or an array (with one column per entry in the original vector); if that is not possible, use lapply(), which just returns a list.
To apply FUN to every row of an array or data frame X: apply(X,1,FUN)
# Insert a few NA values at random positions in each column
movies$gross[sample(1:nrow(movies),2)] = NA
movies$genre[sample(1:nrow(movies),1)] = NA
movies$lead[sample(1:nrow(movies),1)] = NA
# For each row of movies: TRUE if any cell in that row is NA
rows.with.NAs = apply(is.na(movies),1,any)
length(rows.with.NAs)
## [1] 50
sum(rows.with.NAs)
## [1] 4
apply() tries to return a vector or an array; it will return a list if it can't. apply() assumes FUN will work on a row of X; you might need to write a little adapter function to make this work.
# Let's rewrite the books and pretend Bieber grossed more $$...
# apply() coerces each data-frame row to a single type (here, string),
# so r[1] (gross) must be converted back with as.numeric(); r[3] is lead
gross.fake = apply(movies,1, function(r) {
return(ifelse(r[3]=="Justin Bieber",
1.1*as.numeric(r[1]),
0.9*as.numeric(r[1])))})
movies$gross = gross.fake
# na.rm=T skips the NA grosses inserted above
mean(movies$gross[movies$lead=="Justin Bieber"],na.rm=T) # New Bieber avg
## [1] 286.748
mean(movies$gross[movies$lead!="Justin Bieber"],na.rm=T) # New Freeman avg
## [1] 263.4955
(Why did we have to do such trickery? Turns out apply() is mostly meant to work with matrices, and will cast each row to be a single data type, here a string)
FUN()
to every column of an array or data frame X
apply(X,2,FUN)
# Column-wise medians of body weight (Bwt) and heart weight (Hwt)
apply(cats[,2:3],2,median)
## Bwt Hwt
## 2.7 10.1
Given a function f which takes 2+ arguments, and vectors x, y, ..., z, suppose we want f(x[1],y[1],...,z[1]), f(x[2],y[2],...,z[2]), etc. How? Use mapply(FUN=f,x,y,z)
The variables in the data are often either not what’s most relevant to the analysis, or they’re not arranged conveniently, or both
(Note: satisfying model assumptions is a big issue here)
Hence we often want to transform the data to make it closer to the data we wish we had to start with
Because
\[Y = f(X)g(Z) \iff \log{Y} = \log{f(X)} + \log{g(Z)},\]
taking logs lets us use (say) linear models when the real relationship is multiplicative
Sometimes the variance of our random variables might grow with the mean, and this would not be good for (say) a linear model
E.g., Poisson distribution: if \(Y \sim \mathrm{Poisson}(\lambda)\), then \(\mathbb{E}(Y)=\lambda\) and \(\mathrm{Var}(Y)=\lambda\). To stabilize the variance, we transform to \(Z=\sqrt{Y}\), which has
\[ \mathbb{E}(Z) \approx \sqrt{\lambda}, \;\;\;\text{and}\;\;\; \mathrm{Var}(Z) \approx 1/4 \]
(This is a result of what is called the delta method, which you can view as a first-order Taylor series approximation)
# Demonstrate the Poisson mean-variance relationship and the
# square-root variance-stabilizing transform
lambda = (1:5)^2
num = 10000
# Column j of Y holds num draws from Poisson(lambda[j])
Y = matrix(rpois(num*length(lambda),
lambda=rep(lambda,each=num)),
nrow=num)
# Sample means and variances both track lambda = 1, 4, 9, 16, 25
apply(Y,2,mean)
## [1] 1.0047 4.0016 8.9696 16.0497 25.1090
apply(Y,2,var)
## [1] 1.010179 3.943792 8.871563 15.980028 25.595079
# After sqrt: means near sqrt(lambda), variances all near 1/4
apply(sqrt(Y),2,mean)
## [1] 0.7731338 1.9229928 2.9512973 3.9741548 4.9851141
apply(sqrt(Y),2,var)
## [1] 0.4070049 0.3037290 0.2594699 0.2558191 0.2576628
Centering and scaling: scale(x,center=T,scale=F)
subtracts the mean from each column of x
; scale(x,center=F,scale=T)
divides each column by its sd; scale(x,center=T,scale=T)
does both
Successive differences: diff()
; lags are possible; higher-order differences are possible; vectorizes over columns of a matrix
Cumulative functions: cumsum()
, cumprod()
, cummax()
, cummin()
Rolling means: rollmean()
from the zoo
package; see Recipe 14.10 in the R Cookbook
rank(x) outputs the rank of each element of x within the vector, with 1 being the smallest:
n = 100; set.seed(0)
# n uniform draws on (-5, 5)
x = runif(n,-5,5)
head(rank(x))
## [1] 93 22 32 56 95 14
# A monotone (cubic) signal plus Gaussian noise
y = x^3 + rnorm(n,sd=2)
cor(x,y)
## [1] 0.903845
plot(x,y)
# Correlating the ranks captures the monotone association better
cor(rank(x),rank(y)) # Called Spearman's correlation
## [1] 0.9830423
Apply a function to groups within a vector X, where the groups are given by the levels of INDEX, with tapply(X,INDEX,FUN):
tapply(cats$Hwt,cats$Sex,max)
## F M
## 13.0 20.5
# summary() returns several numbers per group, so tapply gives a list
tapply(cats$Hwt,cats$Sex,summary)
## $F
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.300 8.350 9.100 9.202 10.100 13.000
##
## $M
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.50 9.40 11.40 11.32 12.80 20.50
order() takes in a vector, and returns the vector of indices which would put it in increasing order; use the decreasing=TRUE option to get decreasing order. The output of order() can be saved to re-order multiple data frames the same way.
head(cats,4)
## Sex Bwt Hwt
## 1 F 2.0 7.0
## 2 F 2.0 7.4
## 3 F 2.0 9.5
## 4 F 2.1 7.2
# Row numbers of cats, from smallest to largest heart weight
head(order(cats$Hwt))
## [1] 31 48 49 1 13 4
# Use those indices to re-order the whole data frame by heart weight
head(cats[order(cats$Hwt),],4)
## Sex Bwt Hwt
## 31 F 2.4 6.3
## 48 M 2.0 6.5
## 49 M 2.0 6.5
## 1 F 2.0 7.0
rank() and order() are effectively inverses of each other. sort() returns the sorted vector, not the ordering:
head(sort(cats$Hwt))
## [1] 6.3 6.5 6.5 7.0 7.1 7.2
To get just the index of the smallest or largest element, use which.min or which.max:
# which.min gives the position of the minimum directly -- the same
# index that order() puts first
which.min(cats$Hwt) == order(cats$Hwt)[1]
## [1] TRUE
t(x) transposes a matrix x (and also works on data frames, like movies); aperm() does the same for higher-dimensional arrays.
Suppose you have two data frames X, Y, and you want to combine them.
Simplest case: the data frames have exactly the same number of rows, the rows represent exactly the same units, and you want all columns from both; just use data.frame(X,Y)
Next best case: you know that the two data frames have the same rows, but you only want certain columns from each; just use, e.g., data.frame(X$col1,X$col5,Y$favcol)
Next best case: same number of rows but in different order; put one of them in same order as the other, with order()
. Alternatively, use merge()
Worse cases: different numbers of rows … hard to line up rows …
Seems reasonable that people in larger cities (larger areas) would drive more. Is this true?
Distance driven, and city population: [http://www.fhwa.dot.gov/policyinformation/statistics/2011/hm71.cfm]
# Read the FHA driving table; force City to character and the three
# numeric columns to double
fha = read.csv("http://www.stat.cmu.edu/~ryantibs/statcomp-F15/lectures/fha.csv",
na.strings="NA",
colClasses=c("character","double","double","double"))
nrow(fha)
## [1] 498
colnames(fha)
## [1] "City" "Population" "Miles.of.Road"
## [4] "Daily.Miles.Traveled"
Area and population of “urbanized areas” [http://www2.census.gov/geo/ua/ua_list_all.txt]:
# Read the Census urbanized-areas table; fields are semicolon-separated
ua = read.csv("http://www.stat.cmu.edu/~ryantibs/statcomp-F15/lectures/ua.txt",sep=";")
nrow(ua)
## [1] 3598
colnames(ua)
## [1] "UACE" "NAME" "POP" "HU"
## [5] "AREALAND" "AREALANDSQMI" "AREAWATER" "AREAWATERSQMI"
## [9] "POPDEN" "LSADC"
This isn't a simple case, because:
fha orders cities by population, while ua is alphabetical by name.
But both use the same Census figures for population! And it turns out every settlement (in the top 498) has a unique Census population:
# Sanity checks: population values are unique within fha, and the top
# nrow(fha) populations agree exactly between the two tables
length(unique(fha$Population)) == nrow(fha)
## [1] TRUE
max(abs(fha$Population - sort(ua$POP,decreasing=TRUE)[1:nrow(fha)]))
## [1] 0
Option 1: re-order the 2nd table by population
# Re-order ua by population, decreasing, so its rows line up with fha
ua = ua[order(ua$POP,decreasing=TRUE),]
# Rows now correspond, so just bind on the land-area column
df1 = data.frame(fha, area=ua$AREALANDSQMI[1:nrow(fha)])
# Neaten up names
colnames(df1) = c("City","Population","Roads","Mileage","Area")
nrow(df1)
## [1] 498
head(df1)
## City Population Roads Mileage Area
## 1 New York--Newark, NY--NJ--CT 18351295 43893 286101 3450.20
## 2 Los Angeles--Long Beach--Anaheim, CA 12150996 24877 270807 1736.02
## 3 Chicago, IL--IN 8608208 25905 172708 2442.75
## 4 Miami, FL 5502379 15641 125899 1238.61
## 5 Philadelphia, PA--NJ--DE--MD 5441567 19867 99190 1981.37
## 6 Dallas--Fort Worth--Arlington, TX 5121892 21610 125389 1779.13
Option 2: Use the merge()
function
# Merge on the shared population figures (fha$Population vs ua$POP)
df2 = merge(x=fha,y=ua,by.x="Population",by.y="POP")
nrow(df2)
## [1] 498
# merge() sorts by the key, so the largest cities end up last
tail(df2,3)
## Population City Miles.of.Road
## 496 8608208 Chicago, IL--IN 25905
## 497 12150996 Los Angeles--Long Beach--Anaheim, CA 24877
## 498 18351295 New York--Newark, NY--NJ--CT 43893
## Daily.Miles.Traveled UACE NAME
## 496 172708 16264 Chicago, IL--IN
## 497 270807 51445 Los Angeles--Long Beach--Anaheim, CA
## 498 286101 63217 New York--Newark, NY--NJ--CT
## HU AREALAND AREALANDSQMI AREAWATER AREAWATERSQMI POPDEN LSADC
## 496 3459257 6326686332 2442.75 105649916 40.79 3524.0 75
## 497 4217448 4496266014 1736.02 61141327 23.61 6999.3 75
## 498 7263095 8935981360 3450.20 533176599 205.86 5318.9 75
In merge(), the arguments by.x and by.y say which columns need to match to do a merge.
In database terms, merge() is doing a JOIN:
# Merging on city name instead drops rows whose names don't match
df3 = merge(x=fha,y=ua,by.x="City", by.y="NAME")
nrow(df3)
## [1] 492
We can force unmatched rows of either data frame to be included, with NA
values as appropriate:
# all.x=TRUE keeps every fha row (a left join), filling NAs where ua
# has no matching name
df4 = merge(x=fha,y=ua,by.x="City",by.y="NAME",all.x=TRUE)
nrow(df4)
## [1] 498
Where are the mis-matches?
# Cities present in fha but unmatched in ua (NA in ua's columns)
df4$City[is.na(df4$POP)]
## [1] "Aguadilla--Isabela--San Sebastián, PR"
## [2] "Danville, VA – NC"
## [3] "Florida--Imbéry--Barceloneta, PR"
## [4] "Juana Díaz, PR"
## [5] "Mayagüez, PR"
## [6] "San Germán--Cabo Rojo--Sabana Grande, PR"
(On investigation, fha.csv
and ua.txt
use 2 different encodings for accent characters, and one writes things like VA -- NC
and the other says VA--NC
)
order() and manual tricks versus merge(): merge() takes some learning, but merge() handles many columns, and merge() handles tables of different sizes.
# Convert 1,000s of miles to miles
# Mileage is recorded in 1,000s of miles; rescale to plain miles
df1$Mileage = 1000*df1$Mileage
# Plot daily miles per person vs. area, with a log-scale x-axis.
# The formula is Mileage/Population ~ Area, so the x-axis is city
# area and the y-axis is miles driven per person (the original
# labels were swapped).
plot(Mileage/Population ~ Area, data=df1, log="x",
xlab="City area in square miles",
ylab="Miles driven per person per day")
# Impressively flat regression line: area barely predicts driving
abline(lm(Mileage/Population ~ Area, data=df1), col="blue")
Summary: use apply() and friends for doing the same thing to all parts of the data; to combine data frames, use order() (the basic way) or merge() (more advanced).