#' ---
#' title: "Regression and Other Stories: Earnings"
#' author: "Andrew Gelman, Jennifer Hill, Aki Vehtari"
#' date: "`r format(Sys.Date())`"
#' output:
#'   html_document:
#'     theme: readable
#'     toc: true
#'     toc_depth: 2
#'     toc_float: true
#'     code_download: true
#' ---

#' Predict respondents' yearly earnings using survey data from
#' 1990. See Chapters 6, 9 and 12 in Regression and Other Stories.
#' 
#' -------------
#' 

#+ setup, include=FALSE
knitr::opts_chunk$set(message=FALSE, error=FALSE, warning=FALSE, comment=NA)
# switch this to TRUE to save figures in separate files
savefigs <- FALSE

#' #### Load packages
library("rprojroot")
root<-has_file(".ROS-Examples-root")$make_fix_file()
library("rstanarm")
library("ggplot2")
library("bayesplot")
theme_set(bayesplot::theme_default(base_family = "sans"))
#+ eval=FALSE, include=FALSE
# grayscale figures for the book
if (savefigs) color_scheme_set(scheme = "gray")

#' Set random seed for reproducibility
SEED <- 7783

#' #### Load data
earnings <- read.csv(root("Earnings/data","earnings.csv"))
head(earnings)
n <- nrow(earnings)
height_jitter_add <- runif(n, -.2, .2)

#' ## Normal linear regression
#' 
#' #### Predict earnings in dollars
fit_0 <- stan_glm(earn ~ height, data=earnings,
                  seed = SEED, refresh = 0)
print(fit_0)

#' #### Plot linear model draws
#+ eval=FALSE, include=FALSE
if (savefigs) pdf(root("Earnings/figs","heights.simple0a.pdf"), height=3, width=4.2, colormodel="gray")
#+
sims_0 <- as.matrix(fit_0)
n_sims <- nrow(sims_0)
keep <- earnings$earn <= 2e5
par(mar=c(3,3,2,0), mgp=c(1.7,.5,0), tck=-.01)
plot((earnings$height + height_jitter_add)[keep], earnings$earn[keep], xlab="height", ylab="earnings", pch=20, yaxt="n", col="gray10", bty="l", cex=.4)
mtext("Fitted linear model", 3, 1)
axis(2, seq(0, 2e5, 1e5), c("0", "100000", "200000"))
for (i in sample(n_sims, 10)){
 curve(sims_0[i,1] + sims_0[i,2]*x, lwd=0.5, col="gray30", add=TRUE)
}
curve(coef(fit_0)[1] + coef(fit_0)[2]*x, add=TRUE)
#+ eval=FALSE, include=FALSE
dev.off()

#' #### Plot linear model draws with x-axis extended to 0
#+ eval=FALSE, include=FALSE
if (savefigs) pdf(root("Earnings/figs","heights.intercept.pdf"), height=3, width=4.2, colormodel="gray")
#+
keep <- earnings$earn <= 2e5
par(mar=c(3,3,2,0), mgp=c(1.7,.5,0), tck=-.01)
plot((earnings$height + height_jitter_add)[keep], earnings$earn[keep], xlab="height", ylab="earnings", pch=20, yaxt="n", col="gray10", bty="l", cex=.4, xlim=c(0, max(earnings$height)), ylim=c(-1e5, 2.2e5))
mtext("x-axis extended to 0", 3, 1)
axis(2, seq(-1e5, 2e5, 1e5), c("-100000", "0", "100000", "200000"))
for (i in sample(n_sims, 10)){
 curve(sims_0[i,1] + sims_0[i,2]*x, lwd=0.5, col="gray30", add=TRUE)
}
curve(coef(fit_0)[1] + coef(fit_0)[2]*x, add=TRUE)
#+ eval=FALSE, include=FALSE
if (savefigs) dev.off()

#' #### Predict earnings in thousands dollars
#'
#' By scaling the earnings, the model coefficients are scaled, but the
#' results don't change otherwise.
earnings$earnk <- earnings$earn/1000
# (earnk is actually already included in the data frame `earnings` for
# convenience for running examples in different arts of the book)
fit_1 <- stan_glm(earnk ~ height, data = earnings,
                  seed = SEED, refresh = 0)
print(fit_1)
#' for plotting scale back to dollar scale
coef1 <- coef(fit_1)*1000

#' #### Plot linear model, ggplot version
gg_earnings <- ggplot(subset(earnings, subset=earn<2e5), aes(x = jitter(height, amount=0.2), y = earn)) +
  geom_point(alpha = 0.75) +
  geom_hline(yintercept = 0, color = "darkgray") +
  geom_abline(intercept = coef1[1], slope = coef1[2], size = 1) +
  labs(x = "height", y = "earnings",
       title = "Fitted linear model")
gg_earnings

#' #### Plot extrapolation, ggplot version
#' modifying the gg_earnings object we already created
gg_earnings +
  ylim(-70000, 200000) +
  xlim(0, 80) +
  labs(title = "Extrapolation")

#' #### Include male/female
fit_2 <- stan_glm(earnk ~ height + male, data = earnings,
                  seed = SEED, refresh = 0)
print(fit_2)
#' for plotting scale back to dollar scale
coef2 <- coef(fit_2)*1000

#' #### Include male/female, ggplot version
ggplot(earnings, aes(height, earn)) +
  geom_blank() +
  geom_abline(
    intercept = c(coef2[1], coef2[1] + coef2[3]),
    slope = coef2[2],
    color = c("red", "blue")
  ) +
  coord_cartesian(
    ylim = range(predict(fit_2)*1000),
    xlim = range(earnings$height)
  ) +
  annotate(
    "text",
    x = c(68, 68),
    y = c(coef2[1] + coef2[2] * 65, coef2[1] + coef2[3] + coef2[2] * 65),
    label = c("women:\ny = -11 000 + 450x", "men:\ny = -2 000 + 450x"),
    color = c("red", "blue"),
    size = 5, hjust = 0
  ) +
  labs(
    x = "height",
    y = "predicted earnings",
    title = "Fitted regression, displayed as\nseparate lines for men and women"
  )


#' #### Include interaction
fit_3 <- stan_glm(earnk ~ height + male + height:male, data = earnings,
                  seed = SEED, refresh = 0)
print(fit_3)
#' for plotting scale back to dollar scale
coef3 <- coef(fit_3)*1000

#' #### Include interaction, ggplot version
ggplot(subset(earnings, subset=earn>0), aes(height, earn)) +
  geom_blank() +
  geom_abline(
    intercept = c(coef3[1], coef3[1] + coef3[3]),
    slope = c(coef3[2], coef3[2] + coef3[4]),
    color = c("red", "blue")
  ) +
  coord_cartesian(
    ylim = range(predict(fit_3)*1000),
    xlim = range(earnings$height)
  ) +
  annotate(
    "text",
    x = c(62, 68),
    y = c(coef3[1] + coef3[2] * 80, coef3[1]+coef3[3] + (coef3[2]+coef3[4])*66),
    label = c("women:\ny = -7 000 + 180x", "men:\ny = -22 000 + 740x"),
    color = c("red", "blue"),
    size = 5, hjust = 0
  ) +
  labs(
    x = "height",
    y = "predicted earnings",
    title = "Fitted regression with interactions,\nseparate lines for men and women"
  )

#' ## Linear regression on log scale

#' #### Models on log scale
logmodel_1 <- stan_glm(log(earn) ~ height, data = earnings,
                       subset = earn>0,
                       seed = SEED, refresh = 0)
print(logmodel_1, digits=2)

#' #### Model on log10 scale
log10model_1 <- stan_glm(log10(earn) ~ height, data = earnings,
                         subset = earn>0,
                         seed = SEED, refresh = 0)
print(log10model_1, digits=3)

#' #### Model on log scale with two predictors
logmodel_2 <- stan_glm(log(earn) ~ height + male, data = earnings,
                       subset = earn>0,
                       seed = SEED, refresh = 0)
print(logmodel_2, digits=2)

#' #### Model on log scale for the target and one predictor
loglogmodel_2 <- stan_glm(log(earn) ~ log(height) + male, data = earnings,
                          subset = earn>0,
                          seed = SEED, refresh = 0)
print(loglogmodel_2, digits=2)

#' #### Model on log scale with two predictors and interaction
logmodel_3 <- stan_glm(log(earn) ~ height + male + height:male, data = earnings,
                       subset = earn>0,
                       seed = SEED, refresh = 0)
print(logmodel_3, digits=2)

#' #### Model on log scale with standardized interaction
earnings$z_height <- with(earnings, (height - mean(height))/sd(height))
logmodel_3a <- stan_glm(log(earn) ~ z_height + male + z_height:male,
                        data = earnings, subset = earn>0,
                        seed = SEED, refresh = 0)
print(logmodel_3a, digits=2)

#' #### PLot log models

#' get posterior draws
sims <- as.matrix(logmodel_1)
n_sims <- nrow(sims)

#' #### Plot log model on log scale
#+ eval=FALSE, include=FALSE
if (savefigs) pdf(root("Earnings/figs","heights.log1a.pdf"), height=3, width=4.2, colormodel="gray")
#+
keep <- earnings$earn > 0
par(mar=c(3,3,2,0), mgp=c(1.7,.5,0), tck=-.01)
plot((earnings$height + height_jitter_add)[keep], log(earnings$earn)[keep], xlab="height", ylab="log (earnings)", pch=20, yaxt="n", col="gray10", bty="l", cex=.4)
mtext("Log regression plotted on log scale", 3, 1)
axis(2, seq(6,12,2))
for (i in sample(n_sims, 10)){
 curve(sims[i,1] + sims[i,2]*x, lwd=0.5, col="gray30", add=TRUE)
}
curve(coef(logmodel_1)[1] + coef(logmodel_1)[2]*x, add=TRUE)
#+ eval=FALSE, include=FALSE
if (savefigs) dev.off()

#' #### Plot posterior draws of linear model on log scale, ggplot version
sims_display <- sample(n_sims, 10)
ggplot(subset(earnings, subset=earn>0), aes(height, log(earn))) +
  geom_jitter(height = 0, width = 0.25) +
  geom_abline(
    intercept = sims[sims_display, 1],
    slope = sims[sims_display, 2],
    color = "darkgray"
  ) +
  geom_abline(
    intercept = coef(logmodel_1)[1],
    slope = coef(logmodel_1)[2]
  ) +
  labs(
    x = "height",
    y = "log(earnings)",
    title = "Log regression, plotted on log scale"
  )

#' #### Plot log model on linear scale
#+ eval=FALSE, include=FALSE
if (savefigs) pdf(root("Earnings/figs","heights.log1b.pdf"), height=3, width=4.2, colormodel="gray")
#+
keep <- earnings$earn > 0 & earnings$earn <= 2e5
par(mar=c(3,3,2,0), mgp=c(1.7,.5,0), tck=-.01)
plot((earnings$height + height_jitter_add)[keep], earnings$earn[keep], xlab="height", ylab="earnings", pch=20, yaxt="n", col="gray10", bty="l", cex=.4)
mtext("Log regression plotted on original scale", 3, 1)
axis(2, seq(0, 2e5, 1e5), c("0", "100000", "200000"))
for (i in sample(n_sims, 10)){
 curve(exp(sims[i,1] + sims[i,2]*x), lwd=0.5, col="gray30", add=TRUE)
}
curve(exp(coef(logmodel_1)[1] + coef(logmodel_1)[2]*x), add=TRUE)
#+ eval=FALSE, include=FALSE
if (savefigs) dev.off()

#' ## Posterior predictive checking

#' #### Posterior predictive checking for model in linear scale<br>
#' for fair comparison refit the linear scale model only for non.zero earnings
yrep_0 <- posterior_predict(fit_0)
n_sims <- nrow(yrep_0)
sims_display <- sample(n_sims, 100)
ppc_0 <- ppc_dens_overlay(earnings$earn, yrep_0[sims_display,]) +
  theme(axis.line.y = element_blank())
#' #### Posterior predictive checking for model in log scale
yrep_log_1 <- posterior_predict(logmodel_1)
n_sims <- nrow(yrep_log_1)
sims_display <- sample(n_sims, 100)
ppc_log_1 <- ppc_dens_overlay(log(earnings$earn[earnings$earn>0]), yrep_log_1[sims_display,]) +
    theme(axis.line.y = element_blank())
bpg <- bayesplot_grid(
  ppc_0, ppc_log_1,
  grid_args = list(ncol = 2),
  titles = c("earn", "log(earn)"))
bpg
#+ eval=FALSE, include=FALSE
ggsave(root("Earnings/figs","earnings_ppc.pdf"), bpg, height=3, width=9, colormodel="gray")

#' #### Posterior predictive checking for model in linear scale
fit_2b <- stan_glm(earn ~ height + male, data = earnings, subset=earn>0,
                   seed = SEED, refresh = 0)
yrep_2 <- posterior_predict(fit_2b)
n_sims <- nrow(yrep_2)
sims_display <- sample(n_sims, 100)
ppc_dens_overlay(earnings$earn[earnings$earn>0], yrep_2[sims_display,])

#' #### Posterior predictive checking for model in log scale
yrep_log_2 <- posterior_predict(logmodel_2)
n_sims <- nrow(yrep_log_2)
sims_display <- sample(n_sims, 100)
ppc_dens_overlay(log(earnings$earn[earnings$earn>0]), yrep_log_2[sims_display,])

#' #### Posterior predictive checking for model in log-log scale
yrep_loglog_2 <- posterior_predict(loglogmodel_2)
n_sims <- nrow(yrep_loglog_2)
sims_display <- sample(n_sims, 100)
ppc_dens_overlay(log(earnings$earn[earnings$earn>0]), yrep_loglog_2[sims_display,])