#' --- #' title: "Regression and Other Stories: Earnings" #' author: "Andrew Gelman, Jennifer Hill, Aki Vehtari" #' date: "`r format(Sys.Date())`" #' output: #' html_document: #' theme: readable #' toc: true #' toc_depth: 2 #' toc_float: true #' code_download: true #' --- #' Predict respondents' yearly earnings using survey data from #' 1990. See Chapters 6, 9 and 12 in Regression and Other Stories. #' #' ------------- #' #+ setup, include=FALSE knitr::opts_chunk$set(message=FALSE, error=FALSE, warning=FALSE, comment=NA) # switch this to TRUE to save figures in separate files savefigs <- FALSE #' #### Load packages library("rprojroot") root<-has_file(".ROS-Examples-root")$make_fix_file() library("rstanarm") library("ggplot2") library("bayesplot") theme_set(bayesplot::theme_default(base_family = "sans")) #+ eval=FALSE, include=FALSE # grayscale figures for the book if (savefigs) color_scheme_set(scheme = "gray") #' Set random seed for reproducibility SEED <- 7783 #' #### Load data earnings <- read.csv(root("Earnings/data","earnings.csv")) head(earnings) n <- nrow(earnings) height_jitter_add <- runif(n, -.2, .2) #' ## Normal linear regression #' #' #### Predict earnings in dollars fit_0 <- stan_glm(earn ~ height, data=earnings, seed = SEED, refresh = 0) print(fit_0) #' #### Plot linear model draws #+ eval=FALSE, include=FALSE if (savefigs) pdf(root("Earnings/figs","heights.simple0a.pdf"), height=3, width=4.2, colormodel="gray") #+ sims_0 <- as.matrix(fit_0) n_sims <- nrow(sims_0) keep <- earnings$earn <= 2e5 par(mar=c(3,3,2,0), mgp=c(1.7,.5,0), tck=-.01) plot((earnings$height + height_jitter_add)[keep], earnings$earn[keep], xlab="height", ylab="earnings", pch=20, yaxt="n", col="gray10", bty="l", cex=.4) mtext("Fitted linear model", 3, 1) axis(2, seq(0, 2e5, 1e5), c("0", "100000", "200000")) for (i in sample(n_sims, 10)){ curve(sims_0[i,1] + sims_0[i,2]*x, lwd=0.5, col="gray30", add=TRUE) } curve(coef(fit_0)[1] + coef(fit_0)[2]*x, add=TRUE) #+ eval=FALSE, include=FALSE dev.off() #' #### Plot linear model draws with x-axis extended to 0 #+ eval=FALSE, include=FALSE if (savefigs) pdf(root("Earnings/figs","heights.intercept.pdf"), height=3, width=4.2, colormodel="gray") #+ keep <- earnings$earn <= 2e5 par(mar=c(3,3,2,0), mgp=c(1.7,.5,0), tck=-.01) plot((earnings$height + height_jitter_add)[keep], earnings$earn[keep], xlab="height", ylab="earnings", pch=20, yaxt="n", col="gray10", bty="l", cex=.4, xlim=c(0, max(earnings$height)), ylim=c(-1e5, 2.2e5)) mtext("x-axis extended to 0", 3, 1) axis(2, seq(-1e5, 2e5, 1e5), c("-100000", "0", "100000", "200000")) for (i in sample(n_sims, 10)){ curve(sims_0[i,1] + sims_0[i,2]*x, lwd=0.5, col="gray30", add=TRUE) } curve(coef(fit_0)[1] + coef(fit_0)[2]*x, add=TRUE) #+ eval=FALSE, include=FALSE if (savefigs) dev.off() #' #### Predict earnings in thousands dollars #' #' By scaling the earnings, the model coefficients are scaled, but the #' results don't change otherwise. earnings$earnk <- earnings$earn/1000 # (earnk is actually already included in the data frame `earnings` for # convenience for running examples in different arts of the book) fit_1 <- stan_glm(earnk ~ height, data = earnings, seed = SEED, refresh = 0) print(fit_1) #' for plotting scale back to dollar scale coef1 <- coef(fit_1)*1000 #' #### Plot linear model, ggplot version gg_earnings <- ggplot(subset(earnings, subset=earn<2e5), aes(x = jitter(height, amount=0.2), y = earn)) + geom_point(alpha = 0.75) + geom_hline(yintercept = 0, color = "darkgray") + geom_abline(intercept = coef1[1], slope = coef1[2], size = 1) + labs(x = "height", y = "earnings", title = "Fitted linear model") gg_earnings #' #### Plot extrapolation, ggplot version #' modifying the gg_earnings object we already created gg_earnings + ylim(-70000, 200000) + xlim(0, 80) + labs(title = "Extrapolation") #' #### Include male/female fit_2 <- stan_glm(earnk ~ height + male, data = earnings, seed = SEED, refresh = 0) print(fit_2) #' for plotting scale back to dollar scale coef2 <- coef(fit_2)*1000 #' #### Include male/female, ggplot version ggplot(earnings, aes(height, earn)) + geom_blank() + geom_abline( intercept = c(coef2[1], coef2[1] + coef2[3]), slope = coef2[2], color = c("red", "blue") ) + coord_cartesian( ylim = range(predict(fit_2)*1000), xlim = range(earnings$height) ) + annotate( "text", x = c(68, 68), y = c(coef2[1] + coef2[2] * 65, coef2[1] + coef2[3] + coef2[2] * 65), label = c("women:\ny = -11 000 + 450x", "men:\ny = -2 000 + 450x"), color = c("red", "blue"), size = 5, hjust = 0 ) + labs( x = "height", y = "predicted earnings", title = "Fitted regression, displayed as\nseparate lines for men and women" ) #' #### Include interaction fit_3 <- stan_glm(earnk ~ height + male + height:male, data = earnings, seed = SEED, refresh = 0) print(fit_3) #' for plotting scale back to dollar scale coef3 <- coef(fit_3)*1000 #' #### Include interaction, ggplot version ggplot(subset(earnings, subset=earn>0), aes(height, earn)) + geom_blank() + geom_abline( intercept = c(coef3[1], coef3[1] + coef3[3]), slope = c(coef3[2], coef3[2] + coef3[4]), color = c("red", "blue") ) + coord_cartesian( ylim = range(predict(fit_3)*1000), xlim = range(earnings$height) ) + annotate( "text", x = c(62, 68), y = c(coef3[1] + coef3[2] * 80, coef3[1]+coef3[3] + (coef3[2]+coef3[4])*66), label = c("women:\ny = -7 000 + 180x", "men:\ny = -22 000 + 740x"), color = c("red", "blue"), size = 5, hjust = 0 ) + labs( x = "height", y = "predicted earnings", title = "Fitted regression with interactions,\nseparate lines for men and women" ) #' ## Linear regression on log scale #' #### Models on log scale logmodel_1 <- stan_glm(log(earn) ~ height, data = earnings, subset = earn>0, seed = SEED, refresh = 0) print(logmodel_1, digits=2) #' #### Model on log10 scale log10model_1 <- stan_glm(log10(earn) ~ height, data = earnings, subset = earn>0, seed = SEED, refresh = 0) print(log10model_1, digits=3) #' #### Model on log scale with two predictors logmodel_2 <- stan_glm(log(earn) ~ height + male, data = earnings, subset = earn>0, seed = SEED, refresh = 0) print(logmodel_2, digits=2) #' #### Model on log scale for the target and one predictor loglogmodel_2 <- stan_glm(log(earn) ~ log(height) + male, data = earnings, subset = earn>0, seed = SEED, refresh = 0) print(loglogmodel_2, digits=2) #' #### Model on log scale with two predictors and interaction logmodel_3 <- stan_glm(log(earn) ~ height + male + height:male, data = earnings, subset = earn>0, seed = SEED, refresh = 0) print(logmodel_3, digits=2) #' #### Model on log scale with standardized interaction earnings$z_height <- with(earnings, (height - mean(height))/sd(height)) logmodel_3a <- stan_glm(log(earn) ~ z_height + male + z_height:male, data = earnings, subset = earn>0, seed = SEED, refresh = 0) print(logmodel_3a, digits=2) #' #### PLot log models #' get posterior draws sims <- as.matrix(logmodel_1) n_sims <- nrow(sims) #' #### Plot log model on log scale #+ eval=FALSE, include=FALSE if (savefigs) pdf(root("Earnings/figs","heights.log1a.pdf"), height=3, width=4.2, colormodel="gray") #+ keep <- earnings$earn > 0 par(mar=c(3,3,2,0), mgp=c(1.7,.5,0), tck=-.01) plot((earnings$height + height_jitter_add)[keep], log(earnings$earn)[keep], xlab="height", ylab="log (earnings)", pch=20, yaxt="n", col="gray10", bty="l", cex=.4) mtext("Log regression plotted on log scale", 3, 1) axis(2, seq(6,12,2)) for (i in sample(n_sims, 10)){ curve(sims[i,1] + sims[i,2]*x, lwd=0.5, col="gray30", add=TRUE) } curve(coef(logmodel_1)[1] + coef(logmodel_1)[2]*x, add=TRUE) #+ eval=FALSE, include=FALSE if (savefigs) dev.off() #' #### Plot posterior draws of linear model on log scale, ggplot version sims_display <- sample(n_sims, 10) ggplot(subset(earnings, subset=earn>0), aes(height, log(earn))) + geom_jitter(height = 0, width = 0.25) + geom_abline( intercept = sims[sims_display, 1], slope = sims[sims_display, 2], color = "darkgray" ) + geom_abline( intercept = coef(logmodel_1)[1], slope = coef(logmodel_1)[2] ) + labs( x = "height", y = "log(earnings)", title = "Log regression, plotted on log scale" ) #' #### Plot log model on linear scale #+ eval=FALSE, include=FALSE if (savefigs) pdf(root("Earnings/figs","heights.log1b.pdf"), height=3, width=4.2, colormodel="gray") #+ keep <- earnings$earn > 0 & earnings$earn <= 2e5 par(mar=c(3,3,2,0), mgp=c(1.7,.5,0), tck=-.01) plot((earnings$height + height_jitter_add)[keep], earnings$earn[keep], xlab="height", ylab="earnings", pch=20, yaxt="n", col="gray10", bty="l", cex=.4) mtext("Log regression plotted on original scale", 3, 1) axis(2, seq(0, 2e5, 1e5), c("0", "100000", "200000")) for (i in sample(n_sims, 10)){ curve(exp(sims[i,1] + sims[i,2]*x), lwd=0.5, col="gray30", add=TRUE) } curve(exp(coef(logmodel_1)[1] + coef(logmodel_1)[2]*x), add=TRUE) #+ eval=FALSE, include=FALSE if (savefigs) dev.off() #' ## Posterior predictive checking #' #### Posterior predictive checking for model in linear scale
#' for fair comparison refit the linear scale model only for non.zero earnings yrep_0 <- posterior_predict(fit_0) n_sims <- nrow(yrep_0) sims_display <- sample(n_sims, 100) ppc_0 <- ppc_dens_overlay(earnings$earn, yrep_0[sims_display,]) + theme(axis.line.y = element_blank()) #' #### Posterior predictive checking for model in log scale yrep_log_1 <- posterior_predict(logmodel_1) n_sims <- nrow(yrep_log_1) sims_display <- sample(n_sims, 100) ppc_log_1 <- ppc_dens_overlay(log(earnings$earn[earnings$earn>0]), yrep_log_1[sims_display,]) + theme(axis.line.y = element_blank()) bpg <- bayesplot_grid( ppc_0, ppc_log_1, grid_args = list(ncol = 2), titles = c("earn", "log(earn)")) bpg #+ eval=FALSE, include=FALSE ggsave(root("Earnings/figs","earnings_ppc.pdf"), bpg, height=3, width=9, colormodel="gray") #' #### Posterior predictive checking for model in linear scale fit_2b <- stan_glm(earn ~ height + male, data = earnings, subset=earn>0, seed = SEED, refresh = 0) yrep_2 <- posterior_predict(fit_2b) n_sims <- nrow(yrep_2) sims_display <- sample(n_sims, 100) ppc_dens_overlay(earnings$earn[earnings$earn>0], yrep_2[sims_display,]) #' #### Posterior predictive checking for model in log scale yrep_log_2 <- posterior_predict(logmodel_2) n_sims <- nrow(yrep_log_2) sims_display <- sample(n_sims, 100) ppc_dens_overlay(log(earnings$earn[earnings$earn>0]), yrep_log_2[sims_display,]) #' #### Posterior predictive checking for model in log-log scale yrep_loglog_2 <- posterior_predict(loglogmodel_2) n_sims <- nrow(yrep_loglog_2) sims_display <- sample(n_sims, 100) ppc_dens_overlay(log(earnings$earn[earnings$earn>0]), yrep_loglog_2[sims_display,])