#' --- #' title: "Regression and Other Stories: Elections Economy" #' author: "Andrew Gelman, Jennifer Hill, Aki Vehtari" #' date: "`r format(Sys.Date())`" #' output: #' html_document: #' theme: readable #' toc: true #' toc_depth: 2 #' toc_float: true #' code_download: true #' --- #' Predicting presidential vote share from the economy. See Chapters #' 1, 7, 8, 9, and 22 in Regression and Other Stories. #' #' ------------- #' #+ setup, include=FALSE knitr::opts_chunk$set(message=FALSE, error=FALSE, warning=FALSE, comment=NA) # switch this to TRUE to save figures in separate files savefigs <- FALSE #' #### Load packages library("rprojroot") root<-has_file(".ROS-Examples-root")$make_fix_file() library("rstanarm") library("arm") library("ggplot2") library("bayesplot") theme_set(bayesplot::theme_default(base_family = "sans")) #' #### Load data hibbs <- read.table(root("ElectionsEconomy/data","hibbs.dat"), header=TRUE) head(hibbs) #' ## Graphing the bread and peace model #+ eval=FALSE, include=FALSE if (savefigs) pdf(root("ElectionsEconomy/figs","hibbsdots.pdf"), height=4.5, width=7.5, colormodel="gray") #+ n <- nrow(hibbs) par(mar=c(0,0,1.2,0)) left <- -.3 right <- -.28 center <- -.07 f <- .17 plot(c(left-.31,center+.23), c(-3.3,n+3), type="n", bty="n", xaxt="n", yaxt="n", xlab="", ylab="", xaxs="i", yaxs="i") mtext("Forecasting elections from the economy", 3, 0, cex=1.2) with(hibbs, { for (i in 1:n){ ii <- order(growth)[i] text(left-.3, i, paste (inc_party_candidate[ii], " vs. ", other_candidate[ii], " (", year[ii], ")", sep=""), adj=0, cex=.8) points(center+f*(vote[ii]-50)/10, i, pch=20) if (i>1){ if (floor(growth[ii]) != floor(growth[order(growth)[i-1]])){ lines(c(left-.3,center+.22), rep(i-.5,2), lwd=.5, col="darkgray") } } } }) lines(center+f*c(-.65,1.3), rep(0,2), lwd=.5) for (tick in seq(-.5,1,.5)){ lines(center + f*rep(tick,2), c(0,-.2), lwd=.5) text(center + f*tick, -.5, paste(50+10*tick,"%",sep=""), cex=.8) } lines(rep(center,2), c(0,n+.5), lty=2, lwd=.5) text(center+.05, n+1.5, "Incumbent party's share of the popular vote", cex=.8) lines(c(center-.088,center+.19), rep(n+1,2), lwd=.5) text(right, n+1.5, "Income growth", adj=.5, cex=.8) lines(c(right-.05,right+.05), rep(n+1,2), lwd=.5) text(right, 16.15, "more than 4%", cex=.8) text(right, 14, "3% to 4%", cex=.8) text(right, 10.5, "2% to 3%", cex=.8) text(right, 7, "1% to 2%", cex=.8) text(right, 3.5, "0% to 1%", cex=.8) text(right, .85, "negative", cex=.8) text(left-.3, -2.3, "Above matchups are all listed as incumbent party's candidate vs.\ other party's candidate.\nIncome growth is a weighted measure over the four years preceding the election. Vote share excludes third parties.", adj=0, cex=.7) #+ eval=FALSE, include=FALSE if (savefigs) dev.off() #+ eval=FALSE, include=FALSE if (savefigs) pdf(root("ElectionsEconomy/figs","hibbsscatter.pdf"), height=4.5, width=5, colormodel="gray") #+ par(mar=c(3,3,2,.1), mgp=c(1.7,.5,0), tck=-.01) plot(c(-.7, 4.5), c(43,63), type="n", xlab="Avg recent growth in personal income", ylab="Incumbent party's vote share", xaxt="n", yaxt="n", mgp=c(2,.5,0), main="Forecasting the election from the economy ", bty="l") axis(1, 0:4, paste(0:4,"%",sep=""), mgp=c(2,.5,0)) axis(2, seq(45,60,5), paste(seq(45,60,5),"%",sep=""), mgp=c(2,.5,0)) with(hibbs, text(growth, vote, year, cex=.8)) abline(50, 0, lwd=.5, col="gray") #+ eval=FALSE, include=FALSE if (savefigs) dev.off() #' ## Linear regression #' #' The option `refresh = 0` supresses the default Stan sampling #' progress output. This is useful for small data with fast #' computation. For more complex models and bigger data, it can be #' useful to see the progress. M1 <- stan_glm(vote ~ growth, data = hibbs, refresh = 0) #' Print default summary of the fitted model print(M1) #' Print summary of the priors used prior_summary(M1) #' Almost all models in Regression and Other Stories have very good #' sampling behavior. `summary()` function can be used to obtain the #' summary of the convergence diagnostics for MCMC sampling. summary(M1) #' #### Posterior interval round(posterior_interval(M1),1) #' #### Plot regression line #+ eval=FALSE, include=FALSE if (savefigs) pdf(root("ElectionsEconomy/figs","hibbsline.pdf"), height=4.5, width=5, colormodel="gray") #+ par(mar=c(3,3,2,.1), mgp=c(1.7,.5,0), tck=-.01) plot(c(-.7, 4.5), c(43,63), type="n", xlab="Average recent growth in personal income", ylab="Incumbent party's vote share", xaxt="n", yaxt="n", mgp=c(2,.5,0), main="Data and linear fit", bty="l") axis(1, 0:4, paste(0:4,"%",sep=""), mgp=c(2,.5,0)) axis(2, seq(45,60,5), paste(seq(45,60,5),"%",sep=""), mgp=c(2,.5,0)) with(hibbs, points(growth, vote, pch=20)) abline(50, 0, lwd=.5, col="gray") abline(coef(M1), col="gray15") text(2.7, 53.5, paste("y =", fround(coef(M1)[1],1), "+", fround(coef(M1)[2],1), "x"), adj=0, col="gray15") #+ eval=FALSE, include=FALSE if (savefigs) dev.off() #' #### Plot prediction given 2% growth #+ eval=FALSE, include=FALSE if (savefigs) pdf(root("ElectionsEconomy/figs","hibbspredict.pdf"), height=3.5, width=6.5, colormodel="gray") #+ par(mar=c(3,3,3,1), mgp=c(1.7,.5,0), tck=-.01) mu <- 52.3 sigma <- 3.9 curve (dnorm(x,mu,sigma), ylim=c(0,.103), from=35, to=70, bty="n", xaxt="n", yaxt="n", yaxs="i", xlab="Clinton share of the two-party vote", ylab="", main="Probability forecast of Hillary Clinton vote share in 2016,\nbased on 2% rate of economic growth", cex.main=.9) x <- seq (50,65,.1) polygon(c(min(x),x,max(x)), c(0,dnorm(x,mu,sigma),0), col="darkgray", border="black") axis(1, seq(40,65,5), paste(seq(40,65,5),"%",sep="")) text(50.7, .025, "Predicted\n72% chance\nof Clinton victory", adj=0) #+ eval=FALSE, include=FALSE if (savefigs) dev.off() #' #### Plot data and linear fit #+ eval=FALSE, include=FALSE if (savefigs) pdf(root("ElectionsEconomy/figs","hibbsline2a.pdf"), height=4.5, width=5, colormodel="gray") #+ par(mar=c(3,3,2,.1), mgp=c(1.7,.5,0), tck=-.01) plot(c(-.7, 4.5), c(43,63), type="n", xlab="x", ylab="y", xaxt="n", yaxt="n", mgp=c(2,.5,0), main="Data and linear fit", bty="l", cex.lab=1.3, cex.main=1.3) axis(1, 0:4, cex.axis=1.3) axis(2, seq(45, 60, 5), cex.axis=1.3) abline(coef(M1), col="gray15") with(hibbs, points(growth, vote, pch=20)) text(2.7, 53.5, paste("y =", fround(coef(M1)[1],1), "+", fround(coef(M1)[2],1), "x"), adj=0, col="gray15", cex=1.3) #+ eval=FALSE, include=FALSE if (savefigs) dev.off() #' #### Plot data and range of possible linear fits #+ eval=FALSE, include=FALSE if (savefigs) pdf(root("ElectionsEconomy/figs","hibbsline2b.pdf"), height=4.5, width=5, colormodel="gray") #+ par(mar=c(3,3,2,.1), mgp=c(1.7,.5,0), tck=-.01) plot(c(-.7, 4.5), c(43,63), type="n", xlab="x", ylab="y", xaxt="n", yaxt="n", mgp=c(2,.5,0), main="Data and range of possible linear fits", bty="l", cex.lab=1.3, cex.main=1.3) axis(1, 0:4, cex.axis=1.3) axis(2, seq(45, 60, 5), cex.axis=1.3) sims <- as.matrix(M1) n_sims <- nrow(sims) for (s in sample(n_sims, 50)) abline(sims[s,1], sims[s,2], col="gray50", lwd=0.5) with(hibbs, points(growth, vote, pch=20)) #+ eval=FALSE, include=FALSE if (savefigs) dev.off() #' ## Illustrate computations #' #### Extract the simulations sims <- as.matrix(M1) a <- sims[,1] b <- sims[,2] sigma <- sims[,3] n_sims <- nrow(sims) #' #### Median and mean absolute deviation (MAD_SD) Median <- apply(sims, 2, median) MAD_SD <- apply(sims, 2, mad) print(cbind(Median, MAD_SD)) #' #### Median and mean absolute deviation (MAD_SD) for a derived quantity a/b a <- sims[,1] b <- sims[,2] z <- a/b print(median(z)) print(mad(z)) #' #### Point prediction given 2% growth new <- data.frame(growth=2.0) y_point_pred <- predict(M1, newdata=new) #' #### Alternative way to compute the point prediction a_hat <- coef(M1)[1] b_hat <- coef(M1)[2] y_point_pred <- a_hat + b_hat*as.numeric(new) #' #### Uncertainty in prediction given 2% growth y_linpred <- posterior_linpred(M1, newdata=new) #' #### Do same computation "manually" a <- sims[,1] b <- sims[,2] y_linpred <- a + b*as.numeric(new) #' #### Predictive uncertainty y_pred <- posterior_predict(M1, newdata=new) #' #### Predictive uncertainty manually sigma <- sims[,3] n_sims <- nrow(sims) y_pred <- a + b*as.numeric(new) + rnorm(n_sims, 0, sigma) #' #### Summarize predictions Median <- median(y_pred) MAD_SD <- mad(y_pred) win_prob <- mean(y_pred > 50) cat("Predicted Clinton percentage of 2-party vote: ", round(Median,1), ", with s.e. ", round(MAD_SD, 1), "\nPr (Clinton win) = ", round(win_prob, 2), sep="") #' #### Summarize predictions graphically hist(y_pred) #' #### Predict for many new values new_grid <- data.frame(growth=seq(-2.0, 4.0, 0.5)) y_point_pred_grid <- predict(M1, newdata=new_grid) y_linpred_grid <- posterior_linpred(M1, newdata=new_grid) y_pred_grid <- posterior_predict(M1, newdata=new_grid) #' #### Plots #+ eval=FALSE, include=FALSE if (savefigs) pdf(root("ElectionsEconomy/figs","hibbspredict_bayes_1.pdf"), height=4, width=10, colormodel="gray") #+ par(mfrow=c(1,2), mar=c(3,2,3,0), mgp=c(1.5,.5,0), tck=-.01) hist(a, ylim=c(0,0.25*n_sims), xlab="a", ylab="", main="Posterior simulations of the intercept, a,\nand posterior median +/- 1 and 2 std err", cex.axis=.9, cex.lab=.9, yaxt="n", col="gray90") abline(v=median(a), lwd=2) arrows(median(a) - 1.483*median(abs(a - median(a))), 550, median(a) + 1.483*median(abs(a - median(a))), 550, length=.1, code=3, lwd=2) arrows(median(a) - 2*1.483*median(abs(a - median(a))), 250, median(a) + 2*1.483*median(abs(a - median(a))), 250, length=.1, code=3, lwd=2) hist(b, ylim=c(0,0.27*n_sims), xlab="b", ylab="", main="Posterior simulations of the slope, b,\nand posterior median +/- 1 and 2 std err", cex.axis=.9, cex.lab=.9, yaxt="n", col="gray90") abline(v=median(b), lwd=2) arrows(median(b) - 1.483*median(abs(b - median(b))), 550, median(b) + 1.483*median(abs(b - median(b))), 550, length=.1, code=3, lwd=2) arrows(median(b) - 2*1.483*median(abs(b - median(b))), 250, median(b) + 2*1.483*median(abs(b - median(b))), 250, length=.1, code=3, lwd=2) #+ eval=FALSE, include=FALSE if (savefigs) dev.off() #+ eval=FALSE, include=FALSE if (savefigs) pdf(root("ElectionsEconomy/figs","hibbspredict_bayes_2a.pdf"), height=4.5, width=5) #+ par(mar=c(3,3,2,.1), mgp=c(1.7,.5,0), tck=-.01) plot(a, b, xlab="a", ylab="b", main="Posterior draws of the regression coefficients a, b ", bty="l", pch=20, cex=.2) #+ eval=FALSE, include=FALSE if (savefigs) dev.off() #' #### ggplot version ggplot(data.frame(a = sims[, 1], b = sims[, 2]), aes(a, b)) + geom_point(size = 1) + labs(title = "Posterior draws of the regression coefficients a, b") #' #### More plotting #+ eval=FALSE, include=FALSE if (savefigs) pdf(root("ElectionsEconomy/figs","hibbspredict_bayes_2b.pdf"), height=4.5, width=5, colormodel="gray") #+ par(mar=c(3,3,2,.1), mgp=c(1.7,.5,0), tck=-.01) plot(c(-.7, 4.5), c(43,63), type="n", xlab="Average recent growth in personal income", ylab="Incumbent party's vote share", xaxt="n", yaxt="n", mgp=c(2,.5,0), main="Data and 100 posterior draws of the line, y = a + bx ", bty="l") axis(1, 0:4, paste(0:4,"%",sep=""), mgp=c(2,.5,0)) axis(2, seq(45,60,5), paste(seq(45,60,5),"%",sep=""), mgp=c(2,.5,0)) for (i in 1:100){ abline(a[i], b[i], lwd=.5) } abline(50, 0, lwd=.5, col="gray") with(hibbs, { points(growth, vote, pch=20, cex=1.7, col="white") points(growth, vote, pch=20) }) #+ eval=FALSE, include=FALSE if (savefigs) dev.off() #' #### ggplot version ggplot(hibbs, aes(x = growth, y = vote)) + geom_abline( intercept = sims[1:100, 1], slope = sims[1:100, 2], size = 0.1 ) + geom_abline( intercept = mean(sims[, 1]), slope = mean(sims[, 2]) ) + geom_point(color = "white", size = 3) + geom_point(color = "black", size = 2) + labs( x = "Avg recent growth in personal income", y ="Incumbent party's vote share", title = "Data and 100 posterior draws of the line, y = a + bx" ) + scale_x_continuous( limits = c(-.7, 4.5), breaks = 0:4, labels = paste(0:4, "%", sep = "") ) + scale_y_continuous( limits = c(43, 63), breaks = seq(45, 60, 5), labels = paste(seq(45, 60, 5), "%", sep = "") ) #' #### Add more uncertainty x <- rnorm(n_sims, 2.0, 0.3) y_hat <- a + b*x y_pred <- rnorm(n_sims, y_hat, sigma) Median <- median(y_pred) MAD_SD <- 1.483*median(abs(y_pred - median(y_pred))) win_prob <- mean(y_pred > 50) cat("Predicted Clinton percentage of 2-party vote: ", round(Median, 1), ", with s.e. ", round(MAD_SD, 1), "\nPr (Clinton win) = ", round(win_prob, 2), sep="", "\n") #' #### More plotting #+ eval=FALSE, include=FALSE if (savefigs) pdf(root("ElectionsEconomy/figs","hibbspredict_bayes_3.pdf"), height=3.5, width=6) #+ par(mar=c(3,3,3,1), mgp=c(1.7,.5,0), tck=-.01) hist(y_pred, breaks=seq(floor(min(y_pred)), ceiling(max(y_pred)),1), xlim=c(35,70), xaxt="n", yaxt="n", yaxs="i", bty="n", xlab="Clinton share of the two-party vote", ylab="", main="Bayesian simulations of Hillary Clinton vote share,\nbased on 2% rate of economic growth") axis(1, seq(40,65,5), paste(seq(40,65,5),"%",sep="")) #+ eval=FALSE, include=FALSE if (savefigs) dev.off() #' #### ggplot version qplot(y_pred, binwidth = 1) + labs( x ="Clinton share of the two-party vote", title = "Simulations of Hillary Clinton vote share,\nbased on 2% rate of economic growth" ) + theme(axis.line.y = element_blank()) #' #### Bayesian inference and prior information #' #' Combining information from a forecast and a poll. #' Hypothetical forecast and data. theta_hat_prior <- 0.524 se_prior <- 0.041 n <- 400 y <- 190 theta_hat_data <- y/n se_data <- sqrt((y/n)*(1-y/n)/n) theta_hat_bayes <- (theta_hat_prior / se_prior^2 + theta_hat_data / se_data^2) / (1 / se_prior^2 + 1 / se_data^2) se_bayes <- sqrt(1/(1/se_prior^2 + 1/se_data^2)) #' #### Ramp up the data variance se_data <- .075 print((theta_hat_prior/se_prior^2 + theta_hat_data/se_data^2)/(1/se_prior^2 + 1/se_data^2)) #' ## Comparison to `lm()` M1a <- lm(vote ~ growth, data=hibbs) print(M1a) summary(M1a)