library(faraway) # for logit and ilogit functions
curve(ilogit(x), from=-10, to=10, xlab="log odds ratio", ylab="probability")
28 February 2017
library(faraway) # for logit and ilogit functions
curve(ilogit(x), from=-10, to=10, xlab="log odds ratio", ylab="probability")
\[ g(p) \equiv \log{\frac{p}{1-p}} = \beta_0 + \beta\cdot x \]
curve(ilogit(x), from=-10, to=10, xlab=expression(beta[0]+beta*x), ylab="probability")
curve(ilogit(x), from=-10, to=10, xlab=expression(beta[0]+beta*x), ylab="probability") x = (-2):2; colors = c("red", "orange", "yellow", "green", "blue") points(x, ilogit(x), col=colors, pch=15) points(x+0.5, ilogit(x+.5), col=colors, pch=16) points(2*x, ilogit(2*x), col=colors, pch=17) points(2*x+0.5, ilogit(2*x+.5), col=colors, pch=18) legend("topleft", legend=c(expression(paste(beta[0]==0, beta==1)), expression(paste(beta[0]==.5, beta==1)), expression(paste(beta[0]==0, beta==2)), expression(paste(beta[0]==.5, beta==2))), pch=15:18)
Let's begin with a simulation from the model:
# Simulate data from a logistic regression # Inputs: matrix of x values (x) # intercept (beta.0) # vector of slopes (beta) # should the output be bound in an array, or just a vector of y's? (bind) # output: either a binary vector or an array with columns of x and y values sim.logistic <- function(x, beta.0,beta,bind=FALSE) { require(faraway) # For accessible logit and inverse-logit functions linear.parts <- beta.0+(x%*%beta) # Actually simulate in next line y <- rbinom(nrow(x),size=1,prob=ilogit(linear.parts)) # Output either an array (x values, then y), or just a vector of y's if (bind) { return(cbind(x,y)) } else { return(y) } }
And plotting of simulations:
# Simulate and plot a 2D logistic regression # The model's probabilities are plotted as a contour map, with # labeled points overlayed # Inputs: matrix of x values (x) # intercept (beta.0) # vector of slopes (beta) # number of grid points foir evaluating probabilities per axis (n.grid) # font size for labels on contour map (labcex) # color for contour map (col) # other graphical parameters as desired (...) # Output: Labels of the points (invisibly) # Presumes: x has 2 columns, with values in [-1, 1] # Presumes: beta has two entries # Presumes: x, beta.0, beta are all numbers plot.logistic.sim <- function(x, beta.0, beta, n.grid=50, labcex=0.3, col="grey", ...) { grid.seq <- seq(from=-1,to=1,length.out=n.grid) plot.grid <- as.matrix(expand.grid(grid.seq,grid.seq)) require(faraway) # Calculate the model's probability at each grid point p <- matrix(ilogit(beta.0 + (plot.grid %*% beta )),nrow=n.grid) # Lay down the contour map of probabilities contour(x=grid.seq,y=grid.seq,z=p, xlab=expression(x[1]), ylab=expression(x[2]),main="",labcex=labcex,col=col) # Now toss some coins at the points given by x y <- sim.logistic(x,beta.0,beta,bind=FALSE) # Add those points to the plot, with 1 being blue+, 0 being red- points(x[,1],x[,2],pch=ifelse(y==1,"+","-"),col=ifelse(y==1,"blue","red")) invisible(y) }
Now let's make up some x values and see what happens:
x <- matrix(runif(n=50*2,min=-1,max=1),ncol=2) par(mfrow=c(2,2)) plot.logistic.sim(x,beta.0=-0.1,beta=c(-0.2,0.2)) y.1 <- plot.logistic.sim(x,beta.0=-0.5,beta=c(-1,1)) plot.logistic.sim(x,beta.0=-2.5,beta=c(-5,5)) plot.logistic.sim(x,beta.0=-2.5e2,beta=c(-5e2,5e2))
We'll take one of those and run a logistic regression:
df <- data.frame(y=y.1, x1=x[,1], x2=x[,2]) logr <- glm(y ~ x1 + x2, data=df, family="binomial")
Deliberately, the output looks very similar to that of lm
:
print(summary(logr),digits=2,signif.stars=FALSE)
## ## Call: ## glm(formula = y ~ x1 + x2, family = "binomial", data = df) ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## -1.75 -1.01 -0.58 1.03 1.69 ## ## Coefficients: ## Estimate Std. Error z value Pr(>|z|) ## (Intercept) -0.60 0.36 -1.6 0.10 ## x1 -0.67 0.55 -1.2 0.22 ## x2 1.55 0.66 2.4 0.02 ## ## (Dispersion parameter for binomial family taken to be 1) ## ## Null deviance: 68.994 on 49 degrees of freedom ## Residual deviance: 60.994 on 47 degrees of freedom ## AIC: 66.99 ## ## Number of Fisher Scoring iterations: 4
We classify each point as "should be \(y=1\)" or "should be \(y=0\)"
mean(ifelse(fitted(logr)<0.5,0,1) != df$y)
## [1] 0.36
Is this good or bad?
\[ g(p) = \alpha + \sum_{j=1}^{p}{f_j(x_j)} \]
\(f_j\) are still partial response functions, but for log-odds of \(Y=1\), not for \(Y\)
Syntax: additive models + glm
and glm
library(mgcv) (gam.1 <- gam(y~s(x1)+s(x2), data=df, family="binomial"))
## ## Family: binomial ## Link function: logit ## ## Formula: ## y ~ s(x1) + s(x2) ## ## Estimated degrees of freedom: ## 1.65 1.00 total = 3.65 ## ## UBRE score: 0.3240966
plot(gam.1,residuals=TRUE,pages=1)
Daily precipitation records, in 1/100 inch, for a tourist destination in Washington State, from 1948 through 1983
(Tedious re-shaping of the data hidden in the next code chunk)
Outcome: snoqualmie
is a big vector of precipitation day by day, snoq
records today's precipitation vs. tomorrow's
hist(snoqualmie,n=50,probability=TRUE,xlab="Precipitation (1/100 inch)") rug(snoqualmie,col="grey")
Lots of rain, lots of dry days
plot(tomorrow~today,data=snoq, xlab="Precipitation today (1/100 inch)", ylab="Precipitation tomorrow (1/100 inch)",cex=0.1) rug(snoq$today,side=1,col="grey") rug(snoq$tomorrow,side=2,col="grey")
snoq.logistic <- glm((tomorrow > 0) ~ today, data=snoq, family=binomial)
What's the outcome we're predicting? What are we predicting it from?
plot((tomorrow>0)~today,data=snoq,xlab="Precipitation today (1/100 inch)", ylab="Positive precipitation tomorrow?") rug(snoq$today,side=1,col="grey") data.plot <- data.frame(today=(0:500)) pred.bands <- function(mdl,data,col="black",mult=1.96) { preds <- predict(mdl,newdata=data,se.fit=TRUE) lines(data[,1],ilogit(preds$fit),col=col) lines(data[,1],ilogit(preds$fit+mult*preds$se.fit),col=col,lty="dashed") lines(data[,1],ilogit(preds$fit-mult*preds$se.fit),col=col,lty="dashed") } pred.bands(snoq.logistic,data.plot)
plot((tomorrow>0)~today,data=snoq,xlab="Precipitation today (1/100 inch)", ylab="Positive precipitation tomorrow?") rug(snoq$today,side=1,col="grey") data.plot <- data.frame(today=(0:500)) pred.bands(snoq.logistic,data.plot) snoq.spline <- smooth.spline(x=snoq$today,y=(snoq$tomorrow>0)) lines(snoq.spline,col="red")
plot((tomorrow>0)~today,data=snoq,xlab="Precipitation today (1/100 inch)", ylab="Positive precipitation tomorrow?") rug(snoq$today,side=1,col="grey") pred.bands(snoq.logistic,data.plot) lines(snoq.spline,col="red") snoq.gam <- gam((tomorrow>0)~s(today),data=snoq,family=binomial) pred.bands(snoq.gam,data.plot,"blue")
snoq2 <- data.frame(snoq,dry=ifelse(snoq$today==0,1,0)) snoq2.logistic <- glm((tomorrow > 0) ~ today + dry,data=snoq2,family=binomial) snoq2.gam <- gam((tomorrow > 0) ~ s(today) + dry,data=snoq2,family=binomial)
plot((tomorrow>0)~today,data=snoq,xlab="Precipitation today (1/100 inch)", ylab="Positive precipitation tomorrow?") rug(snoq$today,side=1,col="grey") data.plot=data.frame(data.plot,dry=ifelse(data.plot$today==0,1,0)) lines(snoq.spline,col="red") pred.bands(snoq2.logistic,data.plot) pred.bands(snoq2.gam,data.plot,"blue")
frequency.vs.probability <- function(p.lower,p.upper=p.lower+0.01, model=snoq2.logistic,events=(snoq$tomorrow>0)) { fitted.probs <- fitted(model) indices <- (fitted.probs >= p.lower) & (fitted.probs < p.upper) matching.probs <- fitted.probs[indices] ave.prob <- mean(matching.probs) frequency <- mean(events[indices]) # "Law of total variance": Var[Y]=E[Var[Y|X]] + Var[E[Y|X]] total.var <- mean(matching.probs*(1-matching.probs)) + var(matching.probs) se <- sqrt(total.var/sum(indices)) return(c(frequency=frequency,ave.prob=ave.prob,se=se)) }
(Can you add comments?)
Now apply the function a bunch (why these numbers?)
f.vs.p <- sapply(c(0.28, (63:100)/100), frequency.vs.probability)
This is "turned on its side" from a data frame, so let's fix. (Can you find a more elegant way to do this?)
f.vs.p <- data.frame(frequency=f.vs.p["frequency",], ave.prob=f.vs.p["ave.prob",], se=f.vs.p["se",])
plot(frequency~ave.prob,data=f.vs.p,xlim=c(0,1),ylim=c(0,1), xlab="Predicted probabilities",ylab="Observed frequencies") rug(fitted(snoq2.logistic),col="grey") abline(0,1,col="grey") segments(x0=f.vs.p$ave.prob,y0=f.vs.p$ave.prob-1.96*f.vs.p$se, y1=f.vs.p$ave.prob+1.96*f.vs.p$se)