36-350
15 October 2014
# Not actual code
big.job <- function(lots.of.arguments) {
  intermediate.result <- first.step(some.of.the.args)
  final.result <- second.step(intermediate.result,rest.of.the.args)
  return(final.result)
}
One mode of abstraction is refactoring
The metaphor: numbers can be factored in many different ways; pick ones which emphasize the common factors
\[ \begin{eqnarray*} 144 & = & 9\times 16 = 3\times 3 \times 4 \times 2 \times 2\\ 360 & = & 6 \times 60 = 3 \times 3 \times 4 \times 2 \times 5 \end{eqnarray*} \]
Then you can re-use the common part of the work
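Here the common factor is \( 3 \times 3 \times 4 \times 2 = 72 \):
\[ 144 = 72 \times 2, \qquad 360 = 72 \times 5 \]
so the work of producing the 72 is done once and shared, rather than repeated for each number.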
Once we have some code, and it (more or less) works, re-write it to emphasize commonalities:
Re-factoring tends to make code look more like the result of top-down design
This is no accident
The jackknife: to get the standard error of an estimator, omit each data point in turn, re-estimate on the remaining \( n-1 \) points, take the variance of the \( n \) re-estimates, and scale it by \( (n-1)^2/n \). (Why \( (n-1)^2 / n \)? Think about just getting the standard error of the mean)
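Working the hint out: with \( \bar{x}_{(-i)} \) the mean after omitting point \( i \),
\[ \bar{x}_{(-i)} = \frac{n\bar{x} - x_i}{n-1} \]
so the sample variance of the \( n \) re-estimated means is \( s^2/(n-1)^2 \), where \( s^2 \) is the sample variance of the data; multiplying by \( (n-1)^2/n \) gives back \( s^2/n \), the usual squared standard error of the mean.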
mean.jackknife <- function(a_vector) {
  n <- length(a_vector)
  jackknife.ests <- vector(length=n)
  for (omitted.point in 1:n) {
    jackknife.ests[omitted.point] <- mean(a_vector[-omitted.point])
  }
  variance.of.ests <- var(jackknife.ests)
  jackknife.var <- ((n-1)^2/n)*variance.of.ests
  jackknife.stderr <- sqrt(jackknife.var)
  return(jackknife.stderr)
}
some_normals <- rnorm(100,mean=7,sd=5)
mean(some_normals)
[1] 6.932
(formula_se_of_mean <- sd(some_normals)/sqrt(length(some_normals)))
[1] 0.4961
all.equal(formula_se_of_mean,mean.jackknife(some_normals))
[1] TRUE
Recall our friend the method of moments estimator:
gamma.est <- function(the_data) {
  m <- mean(the_data)
  v <- var(the_data)
  a <- m^2/v  # shape
  s <- v/m    # scale
  return(c(a=a,s=s))
}
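These formulas come from matching moments: a gamma distribution with shape \( a \) and scale \( s \) has mean \( as \) and variance \( as^2 \), so setting \( m = as \) and \( v = as^2 \) and solving gives
\[ a = \frac{m^2}{v}, \qquad s = \frac{v}{m} \]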
gamma.jackknife <- function(a_vector) {
  n <- length(a_vector)
  jackknife.ests <- matrix(NA,nrow=2,ncol=n)
  rownames(jackknife.ests) = c("a","s")
  for (omitted.point in 1:n) {
    fit <- gamma.est(a_vector[-omitted.point])
    jackknife.ests["a",omitted.point] <- fit["a"]
    jackknife.ests["s",omitted.point] <- fit["s"]
  }
  variance.of.ests <- apply(jackknife.ests,1,var)
  jackknife.vars <- ((n-1)^2/n)*variance.of.ests
  jackknife.stderrs <- sqrt(jackknife.vars)
  return(jackknife.stderrs)
}
data("cats",package="MASS")
gamma.est(cats$Hwt)
      a       s 
19.0653  0.5576 
gamma.jackknife(cats$Hwt)
      a       s 
2.74062 0.07829 
jackknife.lm <- function(df,formula,p) {
  # p = number of coefficients in the model
  n <- nrow(df)
  jackknife.ests <- matrix(0,nrow=p,ncol=n)
  for (omit in 1:n) {
    new.coefs <- lm(as.formula(formula),data=df[-omit,])$coefficients
    jackknife.ests[,omit] <- new.coefs
  }
  variance.of.ests <- apply(jackknife.ests,1,var)
  jackknife.var <- ((n-1)^2/n)*variance.of.ests
  jackknife.stderr <- sqrt(jackknife.var)
  return(jackknife.stderr)
}
cats.lm <- lm(Hwt~Bwt,data=cats)
coefficients(cats.lm)
(Intercept)         Bwt 
    -0.3567      4.0341 
# "Official" standard errors
sqrt(diag(vcov(cats.lm)))
(Intercept)         Bwt 
     0.6923      0.2503 
jackknife.lm(df=cats,formula="Hwt~Bwt",p=2)
[1] 0.8314 0.3167
Omitting one point or row is a common sub-task
The general pattern:
- figure out the size of the data
- for each case:
    - omit that case
    - repeat some estimation and get a vector of numbers
- take variances across cases
- scale up variances
- take the square roots
Refactor by extracting the common “omit one” operation
Refactor by defining a general “jackknife” operation
Problem: Omit one particular data point from a larger structure
Difficulty: Do we need a comma in the index or not? (the_data[-i] for a vector, but the_data[-i,] for a matrix or data frame)
Solution: The following works for vectors, lists, 1D and 2D arrays, matrices, and data frames:
omit.case <- function(the_data,omitted_point) {
  data_dims <- dim(the_data)
  if (is.null(data_dims) || (length(data_dims)==1)) {
    return(the_data[-omitted_point])
  } else {
    return(the_data[-omitted_point,])
  }
}
Exercise: Modify so it also handles higher-dimensional arrays
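One possible sketch for the exercise, dropping cases along the first dimension of an array of any dimension (omit.case.nd is a made-up name, and the index-list trick is just one way to do it):
omit.case.nd <- function(the_data,omitted_point) {
  data_dims <- dim(the_data)
  if (is.null(data_dims) || (length(data_dims)==1)) {
    return(the_data[-omitted_point])
  }
  # one index per dimension: drop along the first, keep all of the rest
  indices <- c(list(-omitted_point), rep(list(TRUE), length(data_dims)-1))
  return(do.call(`[`, c(list(the_data), indices)))
}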
jackknife <- function(estimator,the_data) {
  if (is.null(dim(the_data))) { n <- length(the_data) }
  else { n <- nrow(the_data) }
  omit_and_est <- function(omit) {
    estimator(omit.case(the_data,omit))
  }
  jackknife.ests <- matrix(sapply(1:n, omit_and_est), ncol=n)
  var.of.reestimates <- apply(jackknife.ests,1,var)
  jackknife.var <- ((n-1)^2/n)*var.of.reestimates
  jackknife.stderr <- sqrt(jackknife.var)
  return(jackknife.stderr)
}
Could allow other arguments to estimator, spin off finding n as its own function, etc.
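For instance, extra arguments could be forwarded to estimator with R's ... mechanism; a sketch (jackknife.dots is a hypothetical name, not part of the code that follows):
jackknife.dots <- function(estimator,the_data,...) {
  if (is.null(dim(the_data))) { n <- length(the_data) }
  else { n <- nrow(the_data) }
  omit_and_est <- function(omit) {
    estimator(omit.case(the_data,omit),...)  # pass the extra arguments along
  }
  jackknife.ests <- matrix(sapply(1:n, omit_and_est), ncol=n)
  var.of.reestimates <- apply(jackknife.ests,1,var)
  sqrt(((n-1)^2/n)*var.of.reestimates)
}
Something like jackknife.dots(estimator=mean,the_data=some_normals,trim=0.1) would then jackknife a trimmed mean.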
jackknife(estimator=mean,the_data=some_normals)
[1] 0.4961
all.equal(jackknife(estimator=mean,the_data=some_normals),
          mean.jackknife(some_normals))
[1] TRUE
all.equal(jackknife(estimator=gamma.est,the_data=cats$Hwt),
          gamma.jackknife(cats$Hwt))
[1] "names for current but not for target"
all.equal(jackknife(estimator=gamma.est,the_data=cats$Hwt),
          gamma.jackknife(cats$Hwt), check.names=FALSE)
[1] TRUE
Exercise: Have jackknife() figure out component names for its output, if estimator has named components
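One way to start on that exercise (a sketch; jackknife.named is a made-up wrapper, and it calls estimator one extra time just to read off the names):
jackknife.named <- function(estimator,the_data) {
  stderrs <- jackknife(estimator,the_data)
  names(stderrs) <- names(estimator(omit.case(the_data,1)))
  return(stderrs)
}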
est.coefs <- function(the_data) {
  return(lm(Hwt~Bwt,data=the_data)$coefficients)
}
est.coefs(cats)
(Intercept)         Bwt 
    -0.3567      4.0341 
all.equal(est.coefs(cats), coefficients(cats.lm))
[1] TRUE
jackknife(estimator=est.coefs,the_data=cats)
[1] 0.8314 0.3167
all.equal(jackknife(estimator=est.coefs,the_data=cats),
          jackknife.lm(df=cats,formula="Hwt~Bwt",p=2))
[1] TRUE
We have just tested the new code against the old to make sure we've not added errors
i.e., we have done regression testing
Top-down design is a recursive heuristic for coding
Leads to multiple short functions, each solving a limited problem
Disciplines you to think algorithmically
Once you have working code, re-factor it to make it look more like it came from a top-down design
The code for jackknife() is still a bit clunky: the if-else for finding n, and the scale-and-square-root steps at the end, deserve to be functions of their own
data_size <- function(the_data) {
  if (is.null(dim(the_data))) { n <- length(the_data) }
  else { n <- nrow(the_data) }
  return(n)  # make the return value explicit
}
scale_and_sqrt_vars <- function(jackknife.ests,n) {
  var.of.reestimates <- apply(jackknife.ests,1,var)
  jackknife.var <- ((n-1)^2/n)*var.of.reestimates
  jackknife.stderr <- sqrt(jackknife.var)
  return(jackknife.stderr)
}
Now jackknife() just invokes those functions:
jackknife <- function(estimator,the_data) {
  n <- data_size(the_data)
  omit_and_est <- function(omit) {
    estimator(omit.case(the_data,omit))
  }
  jackknife.ests <- matrix(sapply(1:n, omit_and_est), ncol=n)
  return(scale_and_sqrt_vars(jackknife.ests,n))
}
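As before, we should regression-test the refactored version against the earlier results; since only the structure of the code has changed, both comparisons should still come back TRUE:
all.equal(jackknife(estimator=mean,the_data=some_normals),
          mean.jackknife(some_normals))
all.equal(jackknife(estimator=est.coefs,the_data=cats),
          jackknife.lm(df=cats,formula="Hwt~Bwt",p=2))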