---
title: "Regression and Other Stories: Mesquite"
author: "Andrew Gelman, Jennifer Hill, Aki Vehtari"
date: "`r format(Sys.Date())`"
output:
  html_document:
    theme: readable
    toc: true
    toc_depth: 2
    toc_float: true
    code_download: true
---
Predicting the yields of mesquite bushes. See Chapter 12 in
Regression and Other Stories.
-------------
```{r setup, include=FALSE}
# Global chunk options: hide messages and warnings, and print output
# without the default "##" comment prefix.
knitr::opts_chunk$set(message = FALSE, error = FALSE, warning = FALSE,
                      comment = NA)
# Switch this to TRUE to save figures in separate files
savefigs <- FALSE
```
#### Load packages
```{r }
library("rprojroot")
# Locate the project root by searching upward for the marker file, so data
# and figure paths work regardless of the knitting directory.
root <- has_file(".ROS-Examples-root")$make_fix_file()
library("foreign")
library("rstanarm")
library("loo")
library("ggplot2")
library("bayesplot")
theme_set(bayesplot::theme_default(base_family = "sans"))
```
```{r eval=FALSE, include=FALSE}
# Grayscale figures for the book (this chunk is not run when knitting)
if (savefigs) {
  color_scheme_set(scheme = "gray")
}
```
Set random seed for reproducibility.
```{r }
# Fixed random seed, passed to every stan_glm() call below for reproducibility
SEED <- 4587
```
#### Load data
```{r }
# Read the mesquite data (whitespace-delimited text file, one row per bush)
mesquite <- read.table(root("Mesquite/data", "mesquite.dat"), header = TRUE)
head(mesquite)
```
## Predict weight given all the predictors
```{r }
# Linear regression of weight on all raw predictors
fit_1 <- stan_glm(
  weight ~ diam1 + diam2 + canopy_height + total_height + density + group,
  data = mesquite, seed = SEED, refresh = 0
)
print(fit_1)
loo_1 <- loo(fit_1)
loo_1
```
We get warnings about high Pareto k values, which indicates that
the importance sampling approximation used in loo is in this case
unreliable. We thus use more robust K-fold-CV.
```{r }
# 10-fold cross-validation: slower than PSIS-LOO but reliable when
# Pareto k diagnostics are high
kfold_1 <- kfold(fit_1, K = 10)
kfold_1
```
## Predict log(weight) given log transformed predictors
```{r }
# Same regression on the log scale (a multiplicative model for weight)
fit_2 <- stan_glm(
  log(weight) ~ log(diam1) + log(diam2) + log(canopy_height) +
    log(total_height) + log(density) + group,
  data = mesquite, seed = SEED, refresh = 0
)
loo_2 <- loo(fit_2)
loo_2
```
#### Jacobian correction to make the models comparable
Jacobian correction is needed as model 1 is modeling y and model 2
is modeling log(y).
```{r }
loo_2_with_jacobian <- loo_2
# Change of variables: log p(y) = log p(log y) - log(y), so subtract
# log(weight) from each pointwise elpd value.
# NOTE(review): only $pointwise is adjusted; the loo object's $estimates
# table is left stale, which is why the total is summed by hand below.
loo_2_with_jacobian$pointwise[, 1] <-
  loo_2_with_jacobian$pointwise[, 1] - log(mesquite$weight)
(elpd_loo_2_with_jacobian <- sum(loo_2_with_jacobian$pointwise[, 1]))
```
There will be a warning that the response data are not the same;
this is OK because we have applied the Jacobian correction.
```{r }
# kfold_1 scores model 1 on the original scale of weight, and
# loo_2_with_jacobian has been corrected to that same scale above,
# so the comparison is meaningful despite the warning.
loo_compare(kfold_1, loo_2_with_jacobian)
```
#### Posterior predictive checking for model in original scale
```{r }
yrep_1 <- posterior_predict(fit_1)
n_sims <- nrow(yrep_1)
# Display a random subset of 100 posterior predictive replicates
# (no set.seed() here, so the subset differs between knits)
sims_display <- sample(n_sims, 100)
ppc_1 <- ppc_dens_overlay(mesquite$weight, yrep_1[sims_display, ]) +
  theme(axis.line.y = element_blank())
```
#### Posterior predictive checking for model in log scale
```{r }
yrep_2 <- posterior_predict(fit_2)
# Compare replicates against log(weight), reusing the same draw subset
ppc_2 <- ppc_dens_overlay(log(mesquite$weight), yrep_2[sims_display, ]) +
  theme(axis.line.y = element_blank())
# Put the two PPC plots side by side
bpg <- bayesplot_grid(
  ppc_1, ppc_2,
  grid_args = list(ncol = 2),
  titles = c("Model for weight", "Model for log(weight)")
)
```
#### Display posterior predictive checking plots
```{r }
# Render the side-by-side posterior predictive checking grid built above
bpg
```
```{r eval=FALSE, include=FALSE}
if (savefigs) {
  ggsave(root("Mesquite/figs", "mesquite_ppc.pdf"), bpg,
         height = 3, width = 9, colormodel = "gray")
}
```
#### Plot marginal posteriors
```{r fig.height=3, fig.width=6}
# Marginal posteriors of the log-predictor and group coefficients
draws_2 <- as.matrix(fit_2)
mcmc_areas(draws_2, regex_pars = "^log|^gro")
```
```{r eval=FALSE, include=FALSE}
if (savefigs) {
  # No plot argument: ggsave() saves the most recently displayed plot
  ggsave(root("Mesquite/figs", "mesquite_areas.pdf"),
         height = 3.5, width = 5, colormodel = "gray")
}
```
#### Plot joint marginal posterior for log(canopy_height) and log(total_height)
```{r }
# Joint posterior of the two height coefficients, with reference lines at zero
mcmc_scatter(as.matrix(fit_2),
             pars = c("log(canopy_height)", "log(total_height)"),
             size = 1) +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = 0) +
  labs(x = "coef of log(canopy_height)", y = "coef of log(total_height)")
```
```{r eval=FALSE, include=FALSE}
if (savefigs) {
  # No plot argument: ggsave() saves the most recently displayed plot
  ggsave(root("Mesquite/figs", "mesquite_scatter.pdf"),
         height = 3.5, width = 5, colormodel = "gray")
}
```
## Additional transformed variables
```{r }
# Derived size variables; transform() appends the columns in the order given,
# matching the original repeated mesquite$... assignments
mesquite <- transform(mesquite,
  canopy_volume = diam1 * diam2 * canopy_height,
  canopy_area   = diam1 * diam2,
  canopy_shape  = diam1 / diam2
)
```
## A model with a canopy volume variable
```{r }
# A simpler model: log weight explained by log canopy volume alone
fit_3 <- stan_glm(log(weight) ~ log(canopy_volume),
                  data = mesquite, seed = SEED, refresh = 0)
print(fit_3)
loo_3 <- loo(fit_3)
```
Both models are modeling log(y) and can be compared directly.
```{r }
# Both elpds are on the log(weight) scale, so no Jacobian correction is needed
loo_compare(loo_2, loo_3)
```
#### Compare also LOO-R^2
```{r }
# Median LOO-R^2: full log model vs the volume-only model
round(median(loo_R2(fit_2)), 2)
round(median(loo_R2(fit_3)), 2)
```
#### Compare Bayesian R^2
```{r }
# Median Bayesian R^2 for the same two models
round(median(bayes_R2(fit_2)), 2)
round(median(bayes_R2(fit_3)), 2)
```
## Add canopy area and canopy shape
```{r }
# Replace the raw size measurements with volume, area, and shape,
# keeping the remaining predictors
fit_4 <- stan_glm(
  log(weight) ~ log(canopy_volume) + log(canopy_area) + log(canopy_shape) +
    log(total_height) + log(density) + group,
  data = mesquite, seed = SEED, refresh = 0
)
print(fit_4)
loo_4 <- loo(fit_4)
loo_4
loo_compare(loo_2, loo_4)
round(median(loo_R2(fit_4)), 2)
round(median(bayes_R2(fit_4)), 2)
```
#### Plot Bayesian R^2
```{r results='hide', fig.height=3, fig.width=6}
# Posterior distribution of Bayesian R^2 for the expanded model
mcmc_hist(data.frame(bayes_R2(fit_4)), binwidth = 0.005) +
  xlab('Bayesian R^2') +
  scale_y_continuous(breaks = NULL)
```
#### Plot marginals
```{r fig.height=3, fig.width=6}
# Marginal posterior densities for all coefficients of the expanded model
mcmc_areas(as.matrix(fit_4))
```
#### Plot pairwise joint marginals
Strong collinearity between canopy volume and canopy area is obvious
```{r fig.width=8, fig.height=8}
# Pairwise joint posteriors of the continuous coefficients.
# canopy_volume = canopy_area * canopy_height by construction, which
# produces the strong dependence between their coefficients.
mcmc_pairs(as.matrix(fit_4),
           pars = c("log(canopy_volume)", "log(canopy_area)",
                    "log(canopy_shape)", "log(total_height)",
                    "log(density)"))
```
## A model with just canopy volume and canopy shape
```{r }
# Drop the collinear canopy_area and the weaker predictors
fit_5 <- stan_glm(
  log(weight) ~ log(canopy_volume) + log(canopy_shape) + group,
  data = mesquite, seed = SEED, refresh = 0
)
loo_5 <- loo(fit_5)
loo_5
loo_compare(loo_4, loo_5)
round(median(loo_R2(fit_5)), 2)
round(median(bayes_R2(fit_5)), 2)
```