Some Demos of Bootstrapping

36-402, Spring 2019, Section A

12 February 2019

An example: Let’s predict the stock market!

# Load some data
  # Loads as a time-series structure which we'll undo in a moment
require(pdfetch)
sp <- pdfetch_YAHOO("SPY", fields="adjclose",
  from=as.Date("1993-02-09"), to=as.Date("2019-02-09"))
# Compute log returns for each day
  # as.numeric() gets rid of the time-series structure
sp <- as.numeric(diff(log(sp)))
# need to drop the initial NA which makes difficulties later
sp <- sp[-1]
# Make into a two-column data frame, one column for day $t$, other for day $t+1$
  # So columns should have all but the last entry in sp vs. all but the first
sp.df <- data.frame(Today=head(sp,-1), Tomorrow=tail(sp,-1))
# Fit a linear model, see the coefficients
sp.lm <- lm(Tomorrow ~ Today, data=sp.df)
coefficients(sp.lm)
##   (Intercept)         Today 
##  0.0003705107 -0.0598583227
# Plot tomorrow versus today
plot(Tomorrow ~ Today, data=sp.df, pch=19, col="darkgrey")
# Add a nice horizontal line
abline(h=0,lty=3)
# And the estimated model
abline(sp.lm,lwd=2)

Let’s try putting some error bars on this, with bootstrapping

What’s going on in bootstrapping?

We want to get the distribution of \(\tau(X)\) when \(X \sim P\) by simulating \(\tilde{X} \sim \hat{P}\), and calculating \(\tau(\tilde{X})\) repeatedly

Three parts to every bootstrapping exercise

  1. A way to simulate \(\hat{P}\) to get \(\tilde{X}\)
  2. A way to calculate \(\tau\) on \(\tilde{X}\)
  3. A way to summarize the distribution of \(\tau(\tilde{X})\)

Some functions for bootstrapping

bootstrap
## function (tboots, summarizer, ...) 
## {
##     summaries <- apply(tboots, 1, summarizer, ...)
##     return(t(summaries))
## }
## <bytecode: 0x7f97ce6e65c8>
rboot
## function (statistic, simulator, B) 
## {
##     tboots <- replicate(B, statistic(simulator()))
##     if (is.null(dim(tboots))) {
##         tboots <- array(tboots, dim = c(1, B))
##     }
##     return(tboots)
## }
## <bytecode: 0x7f97ce4ffd90>
bootstrap.se
## function (statistic, simulator, B) 
## {
##     bootstrap(rboot(statistic, simulator, B), summarizer = sd)
## }

Start with rboot

rboot
## function (statistic, simulator, B) 
## {
##     tboots <- replicate(B, statistic(simulator()))
##     if (is.null(dim(tboots))) {
##         tboots <- array(tboots, dim = c(1, B))
##     }
##     return(tboots)
## }
## <bytecode: 0x7f97ce4ffd90>

Simulate the linear model of stock returns

resample
## function (x) 
## {
##     sample(x, size = length(x), replace = TRUE)
## }
## <bytecode: 0x7f97cf185b10>
# After resample.residuals.penn in the textbook
resample.residuals.sp <- function() {
    new.frame <- sp.df
    new.Tomorrows <- fitted(sp.lm) + resample(residuals(sp.lm))
    new.frame$Tomorrow <- new.Tomorrows
    return(new.frame)
}

Example with that simulator

plot(Tomorrow ~ Today, data=sp.df, pch=19, col="darkgrey")

Example with that simulator

plot(Tomorrow ~ Today, data=sp.df, pch=19, col="darkgrey")
sim.1 <- resample.residuals.sp()
points(x=sim.1$Today, y=sim.1$Tomorrow, pch=19, col="orange", cex=0.5)

Example with that simulator

plot(Tomorrow ~ Today, data=sp.df, pch=19, col="darkgrey")
sim.1 <- resample.residuals.sp()
points(x=sim.1$Today, y=sim.1$Tomorrow, pch=19, col="orange", cex=0.5)
sim.2 <- resample.residuals.sp()
points(x=sim.2$Today, y=sim.2$Tomorrow, pch=19, col="darkorange", cex=0.5)

What do we want to calculate?

An example of a statistic

selected.today.returns <- data.frame(Today=seq(from=-0.1,
                                               to=0.1,
                                               length.out=5))

sp.selected.preds <- function(data) {
    mdl <- lm(Tomorrow ~ Today, data=data)
    preds <- predict(mdl, newdata=selected.today.returns)
}
signif(sp.selected.preds(sp.df), 3) # How'd you check this?
##         1         2         3         4         5 
##  0.006360  0.003360  0.000371 -0.002620 -0.005620
signif(sp.selected.preds(sim.1), 3)
##         1         2         3         4         5 
##  0.004780  0.002490  0.000191 -0.002110 -0.004400

Put these together

# You don't need to run this inside system.time()
system.time(my.first.bootstrap <- rboot(statistic=sp.selected.preds,
                                    simulator=resample.residuals.sp,
                                    B=800))
##    user  system elapsed 
##   3.340   0.053   3.399
dim(my.first.bootstrap)
## [1]   5 800
# Rows are different dimensions of the statistic
# Columns are different bootstrap runs

What does this look like?

plot(Tomorrow ~ Today, data=sp.df, pch=19, col="darkgrey",
     ylim=range(my.first.bootstrap))
abline(sp.lm, lwd=2)
for (repnum in 1:ncol(my.first.bootstrap)) {
    lines(x=selected.today.returns$Today,
           y=my.first.bootstrap[,repnum], lwd=0.1,
           col="blue")
}    

sp.selected.preds.ses <- bootstrap(tboots = my.first.bootstrap,
                                summarizer=sd)
signif(sp.selected.preds.ses[1,],3)
##        1        2        3        4        5 
## 0.001200 0.000610 0.000142 0.000612 0.001200

plot(Tomorrow ~ Today, data=sp.df, pch=19, col="darkgrey")
abline(sp.lm, lwd=2)
segments(x0=selected.today.returns$Today,
         y0=sp.selected.preds(sp.df)-2*sp.selected.preds.ses,
         x1=selected.today.returns$Today,
         y1=sp.selected.preds(sp.df)+2*sp.selected.preds.ses,
         lwd=2)

We can use a different simulator

resample.data.frame
## function (data) 
## {
##     sample.rows <- resample(1:nrow(data))
##     return(data[sample.rows, ])
## }
## <bytecode: 0x7fceeb13df40>
resample.sp <- function() { resample.data.frame(sp.df) }

Example of simulating by resampling

plot(Tomorrow ~ Today, data=sp.df, pch=19, col="darkgrey")

Example of simulating by resampling

plot(Tomorrow ~ Today, data=sp.df, pch=19, col="darkgrey")
sim.3 <- resample.sp()
# Jitter just a little to see multiple points
points(x=jitter(sim.3$Today), y=jitter(sim.3$Tomorrow), pch=19, col="lightgreen", cex=0.5)

Example of simulating by resampling

plot(Tomorrow ~ Today, data=sp.df, pch=19, col="darkgrey")
sim.3 <- resample.sp()
# Jitter just a little to see multiple points
points(x=jitter(sim.3$Today), y=jitter(sim.3$Tomorrow), pch=19, col="lightgreen", cex=0.5)
sim.4 <- resample.sp()
points(x=jitter(sim.4$Today), y=jitter(sim.4$Tomorrow), pch=19, col="darkgreen",
       cex=0.3)

DISCUSSION

SOLUTION

system.time(my.second.bootstrap <- rboot(statistic=sp.selected.preds,
                                    simulator=resample.sp,
                                    B=800))
##    user  system elapsed 
##   3.911   0.054   3.967
sp.selected.preds.ses.resampled <- bootstrap(tboots = my.second.bootstrap,
                                summarizer=sd)

signif(sp.selected.preds.ses.resampled, 3)
##            1       2        3       4       5
## [1,] 0.00231 0.00118 0.000138 0.00112 0.00225

plot(Tomorrow ~ Today, data=sp.df, pch=19, col="darkgrey")
abline(sp.lm)
segments(x0=selected.today.returns$Today,
         y0=sp.selected.preds(sp.df)-2*sp.selected.preds.ses.resampled,
         x1=selected.today.returns$Today,
         y1=sp.selected.preds(sp.df)+2*sp.selected.preds.ses.resampled,
         lwd=2)