36-402, Spring 2019, Section A
12 February 2019
# Load some data
# Loads as a time-series structure which we'll undo in a moment
require(pdfetch)
sp <- pdfetch_YAHOO("SPY", fields="adjclose",
from=as.Date("1993-02-09"), to=as.Date("2019-02-09"))
# Compute log returns for each day
# as.numeric() gets rid of the time-series structure
sp <- as.numeric(diff(log(sp)))
# need to drop the initial NA which makes difficulties later
sp <- sp[-1]
# Make into a two-column data frame, one column for day $t$, other for day $t+1$
# So columns should have all but the last entry in sp vs. all but the first
sp.df <- data.frame(Today=head(sp,-1), Tomorrow=tail(sp,-1))
# Fit a linear model, see the coefficients
sp.lm <- lm(Tomorrow ~ Today, data=sp.df)
coefficients(sp.lm)
## (Intercept) Today
## 0.0003705107 -0.0598583227
# Plot tomorrow versus today
plot(Tomorrow ~ Today, data=sp.df, pch=19, col="darkgrey")
# Add a nice horizontal line
abline(h=0,lty=3)
# And the estimated model
abline(sp.lm,lwd=2)
Let’s try putting some error bars on this, with bootstrapping
We want to get the distribution of \(\tau(X)\) when \(X \sim P\) by simulating \(\tilde{X} \sim \hat{P}\), and calculating \(\tau(\tilde{X})\) repeatedly
## function (tboots, summarizer, ...)
## {
## summaries <- apply(tboots, 1, summarizer, ...)
## return(t(summaries))
## }
## <bytecode: 0x7f97ce6e65c8>
## function (statistic, simulator, B)
## {
## tboots <- replicate(B, statistic(simulator()))
## if (is.null(dim(tboots))) {
## tboots <- array(tboots, dim = c(1, B))
## }
## return(tboots)
## }
## <bytecode: 0x7f97ce4ffd90>
## function (statistic, simulator, B)
## {
## bootstrap(rboot(statistic, simulator, B), summarizer = sd)
## }
rboot
## function (statistic, simulator, B)
## {
## tboots <- replicate(B, statistic(simulator()))
## if (is.null(dim(tboots))) {
## tboots <- array(tboots, dim = c(1, B))
## }
## return(tboots)
## }
## <bytecode: 0x7f97ce4ffd90>
simulator
function to make pseudo-data (generates \(\tilde{X})\)statistic
function to calculate on each simulation (calculates \(\tau)\)## function (x)
## {
## sample(x, size = length(x), replace = TRUE)
## }
## <bytecode: 0x7f97cf185b10>
statistic
selected.today.returns <- data.frame(Today=seq(from=-0.1,
to=0.1,
length.out=5))
sp.selected.preds <- function(data) {
mdl <- lm(Tomorrow ~ Today, data=data)
preds <- predict(mdl, newdata=selected.today.returns)
}
## 1 2 3 4 5
## 0.006360 0.003360 0.000371 -0.002620 -0.005620
## 1 2 3 4 5
## 0.004780 0.002490 0.000191 -0.002110 -0.004400
# You don't need to run this inside system.time()
system.time(my.first.bootstrap <- rboot(statistic=sp.selected.preds,
simulator=resample.residuals.sp,
B=800))
## user system elapsed
## 3.340 0.053 3.399
## [1] 5 800
plot(Tomorrow ~ Today, data=sp.df, pch=19, col="darkgrey",
ylim=range(my.first.bootstrap))
abline(sp.lm, lwd=2)
for (repnum in 1:ncol(my.first.bootstrap)) {
lines(x=selected.today.returns$Today,
y=my.first.bootstrap[,repnum], lwd=0.1,
col="blue")
}
sp.selected.preds.ses <- bootstrap(tboots = my.first.bootstrap,
summarizer=sd)
signif(sp.selected.preds.ses[1,],3)
## 1 2 3 4 5
## 0.001200 0.000610 0.000142 0.000612 0.001200
## function (data)
## {
## sample.rows <- resample(1:nrow(data))
## return(data[sample.rows, ])
## }
## <bytecode: 0x7fceeb13df40>
plot(Tomorrow ~ Today, data=sp.df, pch=19, col="darkgrey")
sim.3 <- resample.sp()
# Jitter just a little to see multiple points
points(x=jitter(sim.3$Today), y=jitter(sim.3$Tomorrow), pch=19, col="lightgreen", cex=0.5)
sim.4 <- resample.sp()
points(x=jitter(sim.4$Today), y=jitter(sim.4$Tomorrow), pch=19, col="darkgreen",
cex=0.3)
system.time(my.second.bootstrap <- rboot(statistic=sp.selected.preds,
simulator=resample.sp,
B=800))
## user system elapsed
## 3.911 0.054 3.967
sp.selected.preds.ses.resampled <- bootstrap(tboots = my.second.bootstrap,
summarizer=sd)
signif(sp.selected.preds.ses.resampled, 3)
## 1 2 3 4 5
## [1,] 0.00231 0.00118 0.000138 0.00112 0.00225