28 February 2017

The logistic curve

library(faraway) # for logit and ilogit functions
curve(ilogit(x), from=-10, to=10, xlab="log odds", ylab="probability")
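
A quick check that the two are inverses of each other (an illustration, not in the original):

logit(0.9)          # log(0.9/0.1), about 2.2
ilogit(logit(0.9))  # recovers 0.9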

The logistic regression model

\[ g(p) \equiv \log{\frac{p}{1-p}} = \beta_0 + \beta\cdot x \]
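
Solving for \(p\) gives \(p = e^{\beta_0+\beta\cdot x}/(1+e^{\beta_0+\beta\cdot x})\), which is exactly what ilogit computes. A one-line illustration (with made-up coefficient values):

b.0 <- -0.5; b <- 1; x.val <- 2   # arbitrary illustrative values
ilogit(b.0 + b*x.val)             # the model's probability that Y=1 at x.val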

curve(ilogit(x), from=-10, to=10, xlab=expression(beta[0]+beta*x), ylab="probability")
x <- (-2):2; colors <- c("red", "orange", "yellow", "green", "blue")
points(x, ilogit(x), col=colors, pch=15)
points(x+0.5, ilogit(x+0.5), col=colors, pch=16)
points(2*x, ilogit(2*x), col=colors, pch=17)
points(2*x+0.5, ilogit(2*x+0.5), col=colors, pch=18)
legend("topleft", legend=c(expression(paste(beta[0]==0, ", ", beta==1)),
                           expression(paste(beta[0]==0.5, ", ", beta==1)),
                           expression(paste(beta[0]==0, ", ", beta==2)),
                           expression(paste(beta[0]==0.5, ", ", beta==2))), pch=15:18)

In 2D

Let's begin with a simulation from the model:

# Simulate data from a logistic regression
# Inputs: matrix of x values (x)
  # intercept (beta.0)
  # vector of slopes (beta)
  # should the output be bound in an array, or just a vector of y's? (bind)
# output: either a binary vector or an array with columns of x and y values
sim.logistic <- function(x, beta.0,beta,bind=FALSE) {
  require(faraway) # For accessible logit and inverse-logit functions
  linear.parts <- beta.0+(x%*%beta)
  # Actually simulate in next line
  y <- rbinom(nrow(x),size=1,prob=ilogit(linear.parts))
  # Output either an array (x values, then y), or just a vector of y's
  if (bind) { return(cbind(x,y)) } else { return(y) }
}
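
A quick sanity check of the simulator (an aside, not in the original): at x = 0 the linear part reduces to beta.0, so with beta.0 = 0 every response is a fair coin flip.

sim.logistic(matrix(0,nrow=5,ncol=2), beta.0=0, beta=c(1,1))
# five draws, each with success probability ilogit(0) = 0.5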

And a function to plot such simulations:

# Simulate and plot a 2D logistic regression
  # The model's probabilities are plotted as a contour map, with
  # labeled points overlaid
# Inputs: matrix of x values (x)
  # intercept (beta.0)
  # vector of slopes (beta)
  # number of grid points for evaluating probabilities per axis (n.grid)
  # font size for labels on contour map (labcex)
  # color for contour map (col)
  # other graphical parameters as desired (...)
# Output: the simulated binary labels of the points (invisibly)
# Presumes: x has 2 columns, with values in [-1, 1]
# Presumes: beta has two entries
# Presumes: x, beta.0, beta are all numeric
plot.logistic.sim <- function(x, beta.0, beta, n.grid=50,
                              labcex=0.3, col="grey", ...) {
  grid.seq <- seq(from=-1,to=1,length.out=n.grid)
  plot.grid <- as.matrix(expand.grid(grid.seq,grid.seq))
  require(faraway)
  # Calculate the model's probability at each grid point
  p <- matrix(ilogit(beta.0 + (plot.grid %*% beta )),nrow=n.grid)
  # Lay down the contour map of probabilities
  contour(x=grid.seq,y=grid.seq,z=p, xlab=expression(x[1]),
          ylab=expression(x[2]),main="",labcex=labcex,col=col)
  # Now toss some coins at the points given by x
  y <- sim.logistic(x,beta.0,beta,bind=FALSE)
  # Add those points to the plot, with 1 being blue+, 0 being red-
  points(x[,1],x[,2],pch=ifelse(y==1,"+","-"),col=ifelse(y==1,"blue","red"))
  invisible(y)
}

Now let's make up some x values and see what happens:

x <- matrix(runif(n=50*2,min=-1,max=1),ncol=2)
par(mfrow=c(2,2))
plot.logistic.sim(x,beta.0=-0.1,beta=c(-0.2,0.2))
y.1 <- plot.logistic.sim(x,beta.0=-0.5,beta=c(-1,1))
plot.logistic.sim(x,beta.0=-2.5,beta=c(-5,5))
plot.logistic.sim(x,beta.0=-2.5e2,beta=c(-5e2,5e2))

We'll take one of those and run a logistic regression:

df <- data.frame(y=y.1, x1=x[,1], x2=x[,2])
logr <- glm(y ~ x1 + x2, data=df, family="binomial")

Deliberately, the output looks very similar to that of lm:

print(summary(logr),digits=2,signif.stars=FALSE)
## 
## Call:
## glm(formula = y ~ x1 + x2, family = "binomial", data = df)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
##  -1.75   -1.01   -0.58    1.03    1.69  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)    -0.60       0.36    -1.6     0.10
## x1             -0.67       0.55    -1.2     0.22
## x2              1.55       0.66     2.4     0.02
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 68.994  on 49  degrees of freedom
## Residual deviance: 60.994  on 47  degrees of freedom
## AIC: 66.99
## 
## Number of Fisher Scoring iterations: 4
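
The coefficients live on the log-odds scale; exponentiating turns each one into a multiplicative effect on the odds (an interpretive aside, not in the slides):

exp(coef(logr))
# e.g., a one-unit increase in x2 multiplies the odds of y=1 by about exp(1.55)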

Classification

We classify each point as "should be \(y=1\)" or "should be \(y=0\)"

mean(ifelse(fitted(logr)<0.5,0,1) != df$y)
## [1] 0.36

Is this good or bad?
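
One benchmark (an addition, not from the slides): the error rate from always predicting the more common class. If the logistic regression can't beat this, it isn't adding much.

# Error rate of always predicting the majority class
min(mean(df$y), 1-mean(df$y))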

Generalized additive model

\[ g(p) = \alpha + \sum_{j=1}^{d}{f_j(x_j)} \]

The \(f_j\) are still partial response functions, one per predictor, but now for the log-odds of \(Y=1\), not for \(Y\) itself

Syntax: smooth terms as in additive models, plus a family argument as in glm

library(mgcv)
(gam.1 <- gam(y~s(x1)+s(x2), data=df, family="binomial"))
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## y ~ s(x1) + s(x2)
## 
## Estimated degrees of freedom:
## 1.65 1.00  total = 3.65 
## 
## UBRE score: 0.3240966

plot(gam.1,residuals=TRUE,pages=1)

Real-data example: weather forecasting for Snoqualmie Falls, WA

Daily precipitation records, in 1/100 inch, for a tourist destination in Washington State, from 1948 through 1983

(Tedious re-shaping of the data hidden in the next code chunk)

Result: snoqualmie is one long vector of daily precipitation values; snoq is a data frame pairing today's precipitation (today) with tomorrow's (tomorrow)
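
A minimal sketch of what that re-shaping might look like (assuming snoqualmie is already the full vector of daily values in order, and glossing over year boundaries, which the hidden chunk may handle more carefully):

snoq.sketch <- data.frame(today=snoqualmie[-length(snoqualmie)],
                          tomorrow=snoqualmie[-1])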

Marginal distribution of precipitation

hist(snoqualmie,breaks=50,probability=TRUE,xlab="Precipitation (1/100 inch)")
rug(snoqualmie,col="grey")

Lots of rain, lots of dry days

Tomorrow's weather vs. today's

plot(tomorrow~today,data=snoq,
     xlab="Precipitation today (1/100 inch)",
     ylab="Precipitation tomorrow (1/100 inch)",cex=0.1)
rug(snoq$today,side=1,col="grey")
rug(snoq$tomorrow,side=2,col="grey")

Fit a logistic regression!

snoq.logistic <- glm((tomorrow > 0) ~ today, data=snoq, family=binomial)

What's the outcome we're predicting? What are we predicting it from?

How does our model look?

plot((tomorrow>0)~today,data=snoq,xlab="Precipitation today (1/100 inch)",
     ylab="Positive precipitation tomorrow?")
rug(snoq$today,side=1,col="grey")
data.plot <- data.frame(today=(0:500))
# Add a model's predicted probabilities, with approximate confidence bands,
# to an existing plot
# Inputs: fitted logistic model (mdl), data frame of new predictor values (data),
  # line color (col), multiplier for the standard errors (mult)
pred.bands <- function(mdl,data,col="black",mult=1.96) {
    # Predict on the link (log-odds) scale, with standard errors
    preds <- predict(mdl,newdata=data,se.fit=TRUE)
    # Transform the fit and the band limits back to the probability scale
    lines(data[,1],ilogit(preds$fit),col=col)
    lines(data[,1],ilogit(preds$fit+mult*preds$se.fit),col=col,lty="dashed")
    lines(data[,1],ilogit(preds$fit-mult*preds$se.fit),col=col,lty="dashed")
}
pred.bands(snoq.logistic,data.plot)

How good is that?

plot((tomorrow>0)~today,data=snoq,xlab="Precipitation today (1/100 inch)",
     ylab="Positive precipitation tomorrow?")
rug(snoq$today,side=1,col="grey")
data.plot <- data.frame(today=(0:500))
pred.bands(snoq.logistic,data.plot)
snoq.spline <- smooth.spline(x=snoq$today,y=(snoq$tomorrow>0))
lines(snoq.spline,col="red")

How about a GAM?

plot((tomorrow>0)~today,data=snoq,xlab="Precipitation today (1/100 inch)",
     ylab="Positive precipitation tomorrow?")
rug(snoq$today,side=1,col="grey")
pred.bands(snoq.logistic,data.plot)
lines(snoq.spline,col="red")
snoq.gam <- gam((tomorrow>0)~s(today),data=snoq,family=binomial)
pred.bands(snoq.gam,data.plot,"blue")

Maybe dry days are special?

snoq2 <- data.frame(snoq,dry=ifelse(snoq$today==0,1,0))
snoq2.logistic <- glm((tomorrow > 0) ~ today + dry,data=snoq2,family=binomial)
snoq2.gam <- gam((tomorrow > 0) ~ s(today) + dry,data=snoq2,family=binomial)
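
A quick numerical comparison of the four models (an aside; AIC works for glm and mgcv's gam fits alike, since both supply log-likelihoods):

AIC(snoq.logistic, snoq.gam, snoq2.logistic, snoq2.gam)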

plot((tomorrow>0)~today,data=snoq,xlab="Precipitation today (1/100 inch)",
  ylab="Positive precipitation tomorrow?")
rug(snoq$today,side=1,col="grey")
data.plot <- data.frame(data.plot,dry=ifelse(data.plot$today==0,1,0))
lines(snoq.spline,col="red")
pred.bands(snoq2.logistic,data.plot)
pred.bands(snoq2.gam,data.plot,"blue")

Checking calibration

frequency.vs.probability <- function(p.lower,p.upper=p.lower+0.01,
  model=snoq2.logistic,events=(snoq$tomorrow>0)) {
  fitted.probs <- fitted(model)
  indices <- (fitted.probs >= p.lower) & (fitted.probs < p.upper)
  matching.probs <- fitted.probs[indices]
  ave.prob <- mean(matching.probs)
  frequency <- mean(events[indices])
  # "Law of total variance": Var[Y]=E[Var[Y|X]] + Var[E[Y|X]]
  total.var <- mean(matching.probs*(1-matching.probs)) + var(matching.probs)
  se <- sqrt(total.var/sum(indices))
  return(c(frequency=frequency,ave.prob=ave.prob,se=se))
}

(Can you add comments?)

Now apply the function a bunch of times (why these numbers?)

f.vs.p <- sapply(c(0.28, (63:100)/100), frequency.vs.probability)
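
(A hint for the parenthetical question, as an aside: look at where the fitted probabilities actually fall.)

summary(fitted(snoq2.logistic))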

This is "turned on its side" from a data frame, so let's fix. (Can you find a more elegant way to do this?)

f.vs.p <- data.frame(frequency=f.vs.p["frequency",],
                      ave.prob=f.vs.p["ave.prob",],
                      se=f.vs.p["se",])
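
A more compact alternative to the three lines above (it would replace them, not follow them): transpose the matrix, then convert.

f.vs.p <- as.data.frame(t(f.vs.p))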

plot(frequency~ave.prob,data=f.vs.p,xlim=c(0,1),ylim=c(0,1),
  xlab="Predicted probabilities",ylab="Observed frequencies")
rug(fitted(snoq2.logistic),col="grey")
abline(0,1,col="grey") # the line of perfect calibration
# Vertical bars: the range of frequencies (ave.prob +/- 1.96 se) we would
# expect to see if the model were perfectly calibrated
segments(x0=f.vs.p$ave.prob,y0=f.vs.p$ave.prob-1.96*f.vs.p$se,
  y1=f.vs.p$ave.prob+1.96*f.vs.p$se)