Lecture 9: Transforming and Reshaping Data

Statistical Computing, 36-350

Monday October 12, 2015

In previous episodes

Agenda

Access tricks

How to get the positions in a vector / columns in a matrix / rows in a data frame, matching some condition?

data(cats,package="MASS")
head(cats$Hwt[cats$Sex=="M"])
## [1]  6.5  6.5 10.1  7.2  7.6  7.9
head(cats[cats$Sex=="M","Hwt"])
## [1]  6.5  6.5 10.1  7.2  7.6  7.9
cats.subset = cats[sample(1:nrow(cats),size=nrow(cats)/2),]
head(cats.subset)
##     Sex Bwt  Hwt
## 134   M 3.5 15.7
## 97    M 2.9 10.6
## 14    F 2.2  8.7
## 70    M 2.5 11.0
## 65    M 2.5  7.9
## 53    M 2.2  7.9
head(cats.subset[,"Bwt"])
## [1] 3.5 2.9 2.2 2.5 2.5 2.2
males = cats$Sex=="M"
row.inds = sample(1:nrow(cats),size=nrow(cats)/2)
# (Now go forth and use these to index)
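
If you want the positions themselves rather than a logical mask, which() converts between the two (a small aside, not from the original slides):

male.inds = which(males)     # positions where cats$Sex=="M" is TRUE
head(male.inds)
identical(cats$Hwt[male.inds], cats$Hwt[males]) # should be TRUE: same subset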

Access tricks: don’t do these

n = 50; set.seed(0)
movies = data.frame(gross=round(runif(n,1,500)),
                    genre=sample(c("comedy","action","vampire"),n,replace=T),
                    lead=sample(c("Morgan Freeman","Justin Bieber"),n,replace=T))
movies$lead["Justin Bieber"] # Not going to work! (Looks for an element *named* "Justin Bieber")
## [1] <NA>
## Levels: Justin Bieber Morgan Freeman
# Inefficient and clumsy! (grows a vector one element at a time)
gross.bieber = c()
for (i in 1:nrow(movies)) {
  if (movies$lead[i]=="Justin Bieber") {
    gross.bieber = c(gross.bieber, movies$gross[i])
  }
}
gross.bieber
##  [1] 448 133 454 102 315  32 104  89 359 389 467 134   8 171 242  94 414
## [18] 335 397  55 362 206 411 324 265 239
movies$gross[movies$lead=="Justin Bieber"]       # Much better
##  [1] 448 133 454 102 315  32 104  89 359 389 467 134   8 171 242  94 414
## [18] 335 397  55 362 206 411 324 265 239
movies[movies$lead=="Justin Bieber","gross"]     # Equivalent to above
##  [1] 448 133 454 102 315  32 104  89 359 389 467 134   8 171 242  94 414
## [18] 335 397  55 362 206 411 324 265 239
mean(movies$gross[movies$lead=="Justin Bieber"]) # Bieber avg
## [1] 251.8846
mean(movies$gross[movies$lead!="Justin Bieber"]) # Freeman avg
## [1] 287.1667
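
Both group means can also be computed in one call (an aside; tapply() is covered later in this lecture):

tapply(movies$gross, movies$lead, mean) # both group averages at once; matches the two means above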

Apply tricks, continued

dim(is.na(movies)) # checks each element for being NA
## [1] 50  3
mean(log(movies$gross))
## [1] 5.352082
rnorm(n=5,mean=-2:2,sd=0.001*(1:5)) # Check that you understand this: draw i has mean (-2:2)[i] and sd 0.001*i
## [1] -1.9988990309 -0.9997124570 -0.0003532608  0.9963517265  1.9928120688

Apply tricks, vectors

mean.omitting.one = function(i,x) { return(mean(x[-i])) } # leave-one-out mean
jackknifed.means = sapply(1:nrow(cats),mean.omitting.one,x=cats$Bwt)
(n = length(jackknifed.means))
## [1] 144
sd(jackknifed.means)*sqrt((n-1)^2/n) # jackknife estimate of the standard error of the mean
## [1] 0.04044222
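
As a sanity check (mine, not in the original slides): for the sample mean, the jackknife standard error reduces exactly to the classical formula, so the line below should reproduce the value above:

sd(cats$Bwt)/sqrt(n) # classical SE of the mean; agrees exactly with the jackknife SE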

Apply tricks, rows

movies$gross[sample(1:nrow(movies),2)] = NA
movies$genre[sample(1:nrow(movies),1)] = NA
movies$lead[sample(1:nrow(movies),1)] = NA
rows.with.NAs = apply(is.na(movies),1,any)
length(rows.with.NAs)
## [1] 50
sum(rows.with.NAs)
## [1] 4
# Let's rewrite the books and pretend Bieber grossed more $$...
gross.fake = apply(movies,1, function(r) { 
  return(ifelse(r[3]=="Justin Bieber",
                1.1*as.numeric(r[1]),
                0.9*as.numeric(r[1])))})
movies$gross = gross.fake
mean(movies$gross[movies$lead=="Justin Bieber"],na.rm=T) # New Bieber avg
## [1] 286.748
mean(movies$gross[movies$lead!="Justin Bieber"],na.rm=T) # New Freeman avg
## [1] 263.4955

(Why did we have to do such trickery? It turns out apply() is mostly meant to work with matrices, and so it coerces each row to a single data type; here, every row becomes a character vector.)
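
In fact the trickery can be avoided altogether: ifelse() is vectorized, so the same adjustment (applied to the original gross column) is one line. A sketch, not from the original slides:

# Choose the multiplier elementwise; no row-by-row coercion to character
gross.fake = ifelse(movies$lead=="Justin Bieber", 1.1, 0.9) * movies$gross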

Apply tricks, columns

apply(cats[,2:3],2,median)
##  Bwt  Hwt 
##  2.7 10.1
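
Since a data frame is a list of columns, sapply() gives the same result without coercing to a matrix first (an aside, not in the original slides):

sapply(cats[,2:3], median) # iterates over the columns directly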

Apply tricks, multiple arguments

mapply(FUN=f,x,y,z) # calls f(x[1],y[1],z[1]), f(x[2],y[2],z[2]), etc.
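
A toy illustration (my example, not from the slides): a function of three arguments, applied elementwise across three vectors.

f = function(x,y,z) { x + 2*y + 3*z }
mapply(FUN=f, 1:3, 4:6, 7:9) # f(1,4,7), f(2,5,8), f(3,6,9): gives 30, 36, 42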

Transformations

The variables in the data are often either not what’s most relevant to the analysis, or they’re not arranged conveniently, or both
(Note: satisfying model assumptions is a big issue here)

Hence we often want to transform the data to make it closer to the data we wish we had to start with

Lossless versus lossy

Log transforms

Because

\[Y = f(X)g(Z) \iff \log{Y} = \log{f(X)} + \log{g(Z)},\]

taking logs lets us use (say) linear models when the real relationship is multiplicative
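
For instance (a simulated sketch, assuming a power-law relationship \(Y = 2X^{1.5}\) with multiplicative noise), fitting a linear model on the log scale recovers the exponent:

set.seed(1)
x = runif(100, 1, 10)
y = 2 * x^1.5 * exp(rnorm(100, sd=0.1)) # multiplicative errors
coef(lm(log(y) ~ log(x)))               # intercept near log(2), slope near 1.5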

Variance stabilizing transforms

Sometimes the variance of our random variables might grow with the mean, and this would not be good for (say) a linear model

E.g., Poisson distribution: if \(Y \sim \mathrm{Poisson}(\lambda)\), then \(\mathbb{E}(Y)=\lambda\) and \(\mathrm{Var}(Y)=\lambda\). To stabilize the variance, we transform to \(Z=\sqrt{Y}\), which has

\[ \mathbb{E}(Z) \approx \sqrt{\lambda}, \;\;\;\text{and}\;\;\; \mathrm{Var}(Z) \approx 1/4 \]

(This is a result of what is called the delta method, which you can view as a first-order Taylor series approximation)
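
In one line: expanding \(\sqrt{Y}\) to first order around \(\lambda\),

\[ \mathrm{Var}(\sqrt{Y}) \approx \left(\frac{1}{2\sqrt{\lambda}}\right)^2 \mathrm{Var}(Y) = \frac{\lambda}{4\lambda} = \frac{1}{4} \]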

lambda = (1:5)^2 # means 1, 4, 9, 16, 25
num = 10000      # draws per value of lambda
Y = matrix(rpois(num*length(lambda),
                 lambda=rep(lambda,each=num)),
           nrow=num) # column j holds num draws from Poisson(lambda[j])
apply(Y,2,mean)
## [1]  1.0047  4.0016  8.9696 16.0497 25.1090
apply(Y,2,var)
## [1]  1.010179  3.943792  8.871563 15.980028 25.595079
apply(sqrt(Y),2,mean)
## [1] 0.7731338 1.9229928 2.9512973 3.9741548 4.9851141
apply(sqrt(Y),2,var)
## [1] 0.4070049 0.3037290 0.2594699 0.2558191 0.2576628

Note how the variances of sqrt(Y) settle near 1/4 as lambda grows, just as the delta method predicts.

Some common numerical transforms

Ranks

n = 100; set.seed(0)
x = runif(n,-5,5)
head(rank(x))
## [1] 93 22 32 56 95 14
y = x^3 + rnorm(n,sd=2)
cor(x,y)
## [1] 0.903845
plot(x,y)

cor(rank(x),rank(y)) # Called Spearman's correlation
## [1] 0.9830423
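
Equivalently (a note beyond the slides), cor() computes this directly:

cor(x, y, method="spearman") # same as cor(rank(x), rank(y)) when there are no ties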

Summarizing subsets

tapply(cats$Hwt,cats$Sex,max)
##    F    M 
## 13.0 20.5
tapply(cats$Hwt,cats$Sex,summary)
## $F
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   6.300   8.350   9.100   9.202  10.100  13.000 
## 
## $M
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    6.50    9.40   11.40   11.32   12.80   20.50
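
The same group summary can be written with aggregate(), which returns a data frame rather than a vector or list (an aside, not in the original slides):

aggregate(Hwt ~ Sex, data=cats, FUN=max) # data-frame version of the tapply above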

Re-organizing

Re-ordering

head(cats,4)
##   Sex Bwt Hwt
## 1   F 2.0 7.0
## 2   F 2.0 7.4
## 3   F 2.0 9.5
## 4   F 2.1 7.2
head(order(cats$Hwt))
## [1] 31 48 49  1 13  4
head(cats[order(cats$Hwt),],4)
##    Sex Bwt Hwt
## 31   F 2.4 6.3
## 48   M 2.0 6.5
## 49   M 2.0 6.5
## 1    F 2.0 7.0
head(sort(cats$Hwt))
## [1] 6.3 6.5 6.5 7.0 7.1 7.2
which.min(cats$Hwt) == order(cats$Hwt)[1]
## [1] TRUE
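
order() also accepts several vectors, with later ones breaking ties in earlier ones; negating a numeric column sorts it in decreasing order (my illustration):

head(cats[order(cats$Sex, -cats$Hwt),]) # by sex, then by descending heart weight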

Flipping arrays
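
(A minimal sketch of what this heading covers, assuming the usual tools: t() transposes a matrix, and aperm() generalizes this to higher-dimensional arrays.)

m = matrix(1:6, nrow=2)
dim(t(m))                # rows and columns swapped: 3 by 2
a = array(1:24, dim=c(2,3,4))
dim(aperm(a, c(3,1,2)))  # dimensions permuted: 4 by 2 by 3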

Merging data frames

Suppose you have two data frames X, Y, and you want to combine them

Example: big city drivers

Seems reasonable that people in larger cities (larger areas) would drive more. Is this true?

Distance driven, and city population: [http://www.fhwa.dot.gov/policyinformation/statistics/2011/hm71.cfm]

fha = read.csv("http://www.stat.cmu.edu/~ryantibs/statcomp-F15/lectures/fha.csv",
                na.strings="NA",
                colClasses=c("character","double","double","double"))
nrow(fha)
## [1] 498
colnames(fha)
## [1] "City"                 "Population"           "Miles.of.Road"       
## [4] "Daily.Miles.Traveled"

Area and population of “urbanized areas” [http://www2.census.gov/geo/ua/ua_list_all.txt]:

ua = read.csv("http://www.stat.cmu.edu/~ryantibs/statcomp-F15/lectures/ua.txt",sep=";")
nrow(ua)
## [1] 3598
colnames(ua)
##  [1] "UACE"          "NAME"          "POP"           "HU"           
##  [5] "AREALAND"      "AREALANDSQMI"  "AREAWATER"     "AREAWATERSQMI"
##  [9] "POPDEN"        "LSADC"

This isn’t a simple case, because the two tables are different sizes (498 vs. 3598 rows), share no ID column, and (as we’ll see) write city names differently.

But both use the same Census figures for population! And it turns out every settlement (in the top 498) has a unique Census population:

length(unique(fha$Population)) == nrow(fha)
## [1] TRUE
max(abs(fha$Population - sort(ua$POP,decreasing=TRUE)[1:nrow(fha)]))
## [1] 0

First way to merge

Option 1: re-order the 2nd table by population

ua = ua[order(ua$POP,decreasing=TRUE),]
df1 = data.frame(fha, area=ua$AREALANDSQMI[1:nrow(fha)])
# Neaten up names
colnames(df1) = c("City","Population","Roads","Mileage","Area")
nrow(df1)
## [1] 498
head(df1)
##                                   City Population Roads Mileage    Area
## 1         New York--Newark, NY--NJ--CT   18351295 43893  286101 3450.20
## 2 Los Angeles--Long Beach--Anaheim, CA   12150996 24877  270807 1736.02
## 3                      Chicago, IL--IN    8608208 25905  172708 2442.75
## 4                            Miami, FL    5502379 15641  125899 1238.61
## 5         Philadelphia, PA--NJ--DE--MD    5441567 19867   99190 1981.37
## 6    Dallas--Fort Worth--Arlington, TX    5121892 21610  125389 1779.13

Second way to merge

Option 2: Use the merge() function

df2 = merge(x=fha,y=ua,by.x="Population",by.y="POP")
nrow(df2)
## [1] 498
tail(df2,3)
##     Population                                 City Miles.of.Road
## 496    8608208                      Chicago, IL--IN         25905
## 497   12150996 Los Angeles--Long Beach--Anaheim, CA         24877
## 498   18351295         New York--Newark, NY--NJ--CT         43893
##     Daily.Miles.Traveled  UACE                                 NAME
## 496               172708 16264                      Chicago, IL--IN
## 497               270807 51445 Los Angeles--Long Beach--Anaheim, CA
## 498               286101 63217         New York--Newark, NY--NJ--CT
##          HU   AREALAND AREALANDSQMI AREAWATER AREAWATERSQMI POPDEN LSADC
## 496 3459257 6326686332      2442.75 105649916         40.79 3524.0    75
## 497 4217448 4496266014      1736.02  61141327         23.61 6999.3    75
## 498 7263095 8935981360      3450.20 533176599        205.86 5318.9    75

merge()

Merging on names doesn’t work cleanly here; we silently lose rows:

df3 = merge(x=fha,y=ua,by.x="City", by.y="NAME")
nrow(df3)
## [1] 492

We can force unmatched rows of either data frame to be included, with NA values as appropriate:

df4 = merge(x=fha,y=ua,by.x="City",by.y="NAME",all.x=TRUE)
nrow(df4)
## [1] 498

Where are the mismatches?

df4$City[is.na(df4$POP)]
## [1] "Aguadilla--Isabela--San Sebastián, PR"   
## [2] "Danville, VA – NC"                       
## [3] "Florida--Imbéry--Barceloneta, PR"        
## [4] "Juana Díaz, PR"                          
## [5] "Mayagüez, PR"                            
## [6] "San Germán--Cabo Rojo--Sabana Grande, PR"

(On investigation, fha.csv and ua.txt use two different encodings for accented characters, and one writes things like VA -- NC where the other says VA--NC)

Using order() and manual tricks versus merge()

So, do bigger cities mean more driving?

# Convert 1,000s of miles to miles
df1$Mileage = 1000*df1$Mileage
# Plot daily miles per person vs. area
plot(Mileage/Population ~ Area, data=df1, log="x",
     xlab="City area in square miles",
     ylab="Miles driven per person per day")
# Impressively flat regression line
abline(lm(Mileage/Population ~ Area, data=df1), col="blue")

Summary