Minimal Advice on Programming in R

36-402, Section A

29 January 2019 (Lecture 5)

library(knitr)
opts_chunk$set(size="small", cache=TRUE,
               autodep=TRUE, warnings=TRUE, error=TRUE)

Why do you need to do programming?

Agenda for today

  1. Style
  2. Functions
  3. Debugging
  4. Avoiding iteration
  5. Design advice

Style: Why?

Style: Commandments

Functions

Functions: Example with Pareto distribution

# alpha = 2.33, x_0 = 6e8, p = 0.5
6e8 * (1-0.5)^(-1/(2.33-1))
## [1] 1010391288
# alpha = 2.33, x_0 = 6e8, p = 0.4
6e8 * (1-0.4)^(-1/(2.33-1))
## [1] 880957225
# alpha = 2.5, x_0 = 1e6, p = 0.92
1e6 * (1-0.92)^(-1/(2.5-1))
## [1] 5386087

Functions: Pareto example

# Calculate quantiles of the Pareto distribution
# Inputs: desired quantile (p)
  # exponent of the distribution (exponent)
  # lower threshold of the distribution (threshold)
# Outputs: the pth quantile
qpareto.1 <- function(p, exponent, threshold) {
  q <- threshold*((1-p)^(-1/(exponent-1)))
  return(q)
}
qpareto.1(p=0.5, exponent=2.33, threshold=6e8)
## [1] 1010391288
qpareto.1(p=0.4, exponent=2.33, threshold=6e8)
## [1] 880957225
qpareto.1(p=0.92, exponent=2.5, threshold=1e6)
## [1] 5386087

Parts of a function

Parts of a function: Pareto quantiles cont’d

# Calculate quantiles of the Pareto distribution
# Inputs: desired quantile (p)
  # exponent of the distribution (exponent)
  # lower threshold of the distribution (threshold)
  # flag for whether to give lower or upper quantiles (lower.tail)
# Outputs: the pth quantile
qpareto.2 <- function(p, exponent, threshold, lower.tail=TRUE) {
  if(lower.tail==FALSE) {
    p <- 1-p
  }
  q <- threshold*((1-p)^(-1/(exponent-1)))
  return(q)
}
qpareto.2(p=0.92, exponent=2.5, threshold=1e6)
## [1] 5386087
qpareto.2(p=0.92, exponent=2.5, threshold=1e6, lower.tail=FALSE)
## [1] 1057162

When should we write functions?

Chaining functions together

Calling functions from other functions

# Calculate quantiles of the Pareto distribution
# Inputs: desired quantile (p)
  # exponent of the distribution (exponent)
  # lower threshold of the distribution (threshold)
  # flag for whether to give lower or upper quantiles (lower.tail)
# Outputs: the pth quantile
qpareto.3 <- function(p, exponent, threshold, lower.tail=TRUE) {
  if(lower.tail==FALSE) {
    p <- 1-p
  }
  # Call qpareto.1() to do the actual calculation
  q <- qpareto.1(p, exponent, threshold)
  return(q)
}

Functions calling other functions (cont’d)

Debugging

Debugging: Test cases

Debugging: Some tools

Debugging: Pareto example continued

# Calculate quantiles of the Pareto distribution
# Inputs: desired quantile (p)
  # exponent of the distribution (exponent)
  # lower threshold of the distribution (threshold)
  # flag for whether to give lower or upper quantiles (lower.tail)
# Outputs: the pth quantile
qpareto.4 <- function(p, exponent, threshold, lower.tail=TRUE) {
  stopifnot(p >= 0, p <= 1, exponent > 1, threshold > 0)
  q <- qpareto.3(p,exponent,threshold,lower.tail)
  return(q)
}
qpareto.4(p=0.92,exponent=2.5,threshold=-1,lower.tail=FALSE)
## Error: threshold > 0 is not TRUE

Debugging: Pareto example continued

# Generate random numbers from the Pareto distribution
# Inputs: number of random draws (n)
  # exponent of the distribution (exponent)
  # lower threshold of the distribution (threshold)
# Outputs: vector of random numbers
rpareto <- function(n,exponent,threshold) {
  x <- vector(length=n)
  for (i in 1:n) {
    x[i] <- qpareto.4(p=rnorm(1),exponent=exponent,threshold=threshold)
  }
  return(x)
}

Debugging: Pareto example continued

rpareto(10)
## Error in qpareto.4(p = rnorm(1), exponent = exponent, threshold = threshold): argument "exponent" is missing, with no default
rpareto(n=10,exponent=2.5,threshold=1)
## Error: p >= 0 is not TRUE

Debugging: The in-class exercise

EXERCISE: Where is the bug? Why doesn’t rpareto(n=10,exponent=2.5,threshold=1) work?

qpareto.1 <- function(p, exponent, threshold) {
  q <- threshold*((1-p)^(-1/(exponent-1)))
  return(q)
}

qpareto.3 <- function(p, exponent, threshold, lower.tail=TRUE) {
  if(lower.tail==FALSE) {
    p <- 1-p
  }
  q <- qpareto.1(p, exponent, threshold)
  return(q)
}

qpareto.4 <- function(p, exponent, threshold, lower.tail=TRUE) {
  stopifnot(p >= 0, p <= 1, exponent > 1, threshold > 0)
  q <- qpareto.3(p,exponent,threshold,lower.tail)
  return(q)
}

rpareto <- function(n,exponent,threshold) {
  x <- vector(length=n)
  for (i in 1:n) {
    x[i] <- qpareto.4(p=rnorm(1),exponent=exponent,threshold=threshold)
  }
  return(x)
}

Exercise: Solution

Avoid iteration

Avoid iteration: Pareto example cont’d

# Generate random numbers from the Pareto distribution
# Inputs: number of random draws (n)
  # exponent of the distribution (exponent)
  # lower threshold of the distribution (threshold)
# Outputs: vector of random numbers
rpareto.1 <- function(n,exponent,threshold) {
  x <- c()
  for (i in 1:n) {
    x <- c(x,qpareto.4(p=runif(1),exponent=exponent,threshold=threshold))
  }
  return(x)
}
system.time(rpareto.1(n=1e4,exponent=2.5,threshold=1))
##    user  system elapsed 
##   0.323   0.018   0.343

Avoid iteration: Pareto example cont’d

# Generate random numbers from the Pareto distribution
# Inputs: number of random draws (n)
  # exponent of the distribution (exponent)
  # lower threshold of the distribution (threshold)
# Outputs: vector of random numbers
rpareto.2 <- function(n,exponent,threshold) {
  x = replicate(n, qpareto.4(p=runif(1),exponent=exponent,threshold=threshold))
  return(x)
}


system.time(rpareto.2(n=1e4,exponent=2.5,threshold=1))
##    user  system elapsed 
##   0.122   0.001   0.123

Avoiding iteration: Vectorizing

qpareto.4(c(.1,.2,.3), 2.5, 1)
## [1] 1.072766 1.160397 1.268434
# Generate random numbers from the Pareto distribution
# Inputs: number of random draws (n)
  # exponent of the distribution (exponent)
  # lower threshold of the distribution (threshold)
# Outputs: vector of random numbers
rpareto.3 <- function(n,exponent,threshold) {
  x = qpareto.4(p=runif(n), exponent=exponent, threshold=threshold)
  return(x)
}
system.time(rpareto.3(10000,exponent=2.5,threshold=1))
##    user  system elapsed 
##   0.000   0.000   0.001

Avoiding iteration: ifelse

# Calculate Huber's loss function
# Input: vector of numbers x
# Return: x^2 for |x|<1, 2|x|-1 otherwise
huber <- function(x) {
  return(ifelse(abs(x) <= 1, x^2, 2*abs(x)-1))
}
x = seq(from=-3,to=3,length.out=100)
plot(x,huber(x), type="l")

Avoiding iteration: matrix and vector operations

# Get some long vectors
n = 1e6
a = rnorm(n)
b = rnorm(n)
# Find the inner product with a for() loop
system.time({
  total = 0
  for(i in 1:length(a)){total = total + a[i]*b[i]}
})
##    user  system elapsed 
##   0.058   0.000   0.058
# Find the inner product with matrix operations
system.time({
a %*% b
})
##    user  system elapsed 
##   0.005   0.000   0.005

Avoiding iteration: apply and friends

#Make a 100x100 matrix of uniform [0,1] variables
X = matrix(runif(10000),nrow=100,ncol=100)
#Find the maximum of each column
column.maxs = apply(X, MARGIN = 2, FUN = max)
#Plot a histogram.  Do you know the distribution?
hist(column.maxs,20)

Do not be afraid of iteration

sim.ar <- function(n,x1,phi=0,sd=1) {
  x <- vector(length=n)
  x[1] <- x1
  for (t in 2:n) {
    x[t] <- phi*x[t-1]+rnorm(1,mean=0,sd=sd)
  }
  return(x)
}

Returning vectors and lists

rpareto.3 <- function(n,exponent,threshold) {
  x = qpareto.4(p=runif(n), exponent=exponent, threshold=threshold)
  return(x) #If n>1, this is a vector!
}
  return(list(best.bw=best.bw,CV_MSEs=CV_MSEs,
              fold_MSEs=fold_MSEs))

Design advice

What you need to remember

  1. Use meaningful names and Comment your code
  2. Write functions to bundle related commands together
  3. Split big jobs into a bunch of related functions
  4. Practice debugging
  5. Avoid iteration, in favor of acting on whole objects

Aside: Style: Things not to do with your main data frame

# Not what you want to read:
data <- read.csv("http://www.stat.cmu.edu/~cshalizi/uADA/19/hw/02/uval.csv")
data.lm <- lm(growth~log(gdp)+underval, data=data)

Aside: No-argument functions

# Function to return the biggest correlation of 5 length-10 Gaussian vectors
# Takes no arguments, so it can be used in a simulation
random.correlation.max = function(){
  #Draw a 10x5 matrix of Gaussians
  X = matrix(rnorm(50),nrow=10,ncol=5)
  #Compute the correlation matrix
  cor.matrix = cor(X)
  #Find the biggest entry, making sure to avoid the diagonal (which is all 1)
  return(max(cor.matrix[upper.tri(cor.matrix)]))
}
hist(replicate(100,random.correlation.max()),20,col='grey', main="Max. Correlation",xlab='correlation')

Aside: Functions refer to the current definition of other functions

add.one = function(x){
  x+1
}
add.stuff = function(x){
  return(add.one(x))
}
add.stuff(3)
## [1] 4
add.one = function(x){
  x+2
}
add.stuff(3)
## [1] 5