---
title: "Regression and Other Stories: Arsenic"
author: "Andrew Gelman, Jennifer Hill, Aki Vehtari"
date: "`r format(Sys.Date())`"
output:
  html_document:
    theme: readable
    toc: true
    toc_depth: 2
    toc_float: true
    code_download: true
---
Building a logistic regression model: wells in Bangladesh. See
Chapter 13 in Regression and Other Stories.

This version uses algorithm='optimizing', which finds the posterior
mode, computes Hessian and uses a normal distribution approximation
of the posterior. This can work well for non-hierarchical
generalized linear models when the number of observations is much
higher than the number of covariates.

-------------


```{r setup, include=FALSE}
knitr::opts_chunk$set(message=FALSE, error=FALSE, warning=FALSE, comment=NA)
# switch this to TRUE to save figures in separate files
savefigs <- FALSE
```

#### Load packages

```{r }
library("rprojroot")
root<-has_file(".ROS-Examples-root")$make_fix_file()
library("rstanarm")
library("loo")
invlogit <- plogis
```

#### Load data

```{r }
wells <- read.csv(root("Arsenic/data","wells.csv"))
wells$y <- wells$switch
n <- nrow(wells)
```

## Null model

#### Log-score for coin flipping

```{r }
prob <- 0.5
round(log(prob)*sum(wells$y) + log(1-prob)*sum(1-wells$y),1)
```

#### Log-score for intercept model

```{r }
round(prob <- mean(wells$y),2)
round(log(prob)*sum(wells$y) + log(1-prob)*sum(1-wells$y),1)
```

## A single predictor

#### Fit a model using distance to the nearest safe well

```{r results='hide'}
fit_1 <- stan_glm(y ~ dist, family = binomial(link = "logit"), data=wells, algorithm='optimizing')
```


```{r }
print(fit_1, digits=3)
summary(fit_1, digits=3)
```

#### LOO log score

```{r }
(loo1 <- loo(fit_1))
```

#### Histogram of distances

```{r eval=FALSE, include=FALSE}
if (savefigs) postscript(root("Arsenic/figs","arsenic.distances.bnew.ps"),
                         height=3, width=4, horizontal=TRUE)
```
```{r }
hist(wells$dist, breaks=seq(0,10+max(wells$dist),10), freq=TRUE,
     xlab="Distance (in meters) to nearest safe well", ylab="", main="", mgp=c(2,.5,0))
```
```{r eval=FALSE, include=FALSE}
if (savefigs) dev.off()
```

#### Scale distance in meters to distance in 100 meters

```{r }
wells$dist100 <- wells$dist/100
```

#### Fit a model using scaled distance to the nearest safe well

```{r results='hide'}
fit_2 <- stan_glm(y ~ dist100, family = binomial(link = "logit"), data=wells, algorithm='optimizing')
```


```{r }
print(fit_2, digits=2)
summary(fit_2, digits=2)
```

#### LOO log score

```{r }
(loo2 <- loo(fit_2, save_psis = TRUE))
```

#### Plot model fit

```{r }
jitter_binary <- function(a, jitt=.05){
  a + (1-2*a)*runif(length(a),0,jitt)
}
```
```{r eval=FALSE, include=FALSE}
if (savefigs) postscript(root("Arsenic/figs","arsenic.logitfit.1new.a.ps"),
                         height=3.5, width=4, horizontal=TRUE)
```
```{r }
plot(c(0,max(wells$dist, na.rm=TRUE)*1.02), c(0,1),
     xlab="Distance (in meters) to nearest safe well", ylab="Pr (switching)",
     type="n", xaxs="i", yaxs="i", mgp=c(2,.5,0))
curve(invlogit(coef(fit_1)[1]+coef(fit_1)[2]*x), lwd=1, add=TRUE)
points(wells$dist, jitter_binary(wells$y), pch=20, cex=.1)
```
```{r eval=FALSE, include=FALSE}
if (savefigs) dev.off()
```

#### Plot uncertainty in the estimated coefficients

```{r eval=FALSE, include=FALSE}
if (savefigs) postscript(root("Arsenic/figs","arsenic.logitfit.scatterplot.ps"),
                         height=3.5, width=3.5, horizontal=TRUE)
```
```{r }
sims <- as.matrix(fit_2)
par(pty="s")
plot(sims[1:500,1], sims[1:500,2], xlim=c(.4,.8), ylim=c(-1,0),
     xlab=expression(beta[0]), ylab=expression(beta[1]), mgp=c(1.5,.5,0),
     pch=20, cex=.5, xaxt="n", yaxt="n")
axis(1, seq(.4,.8,.2), mgp=c(1.5,.5,0))
axis(2, seq(-1,0,.5), mgp=c(1.5,.5,0))
```
```{r eval=FALSE, include=FALSE}
if (savefigs) dev.off()
```

#### Plot uncertainty in the estimated predictions

```{r eval=FALSE, include=FALSE}
if (savefigs) postscript(root("Arsenic/figs","arsenic.logitfit.1new.b.ps"),
                         height=3.5, width=4, horizontal=TRUE)
```
```{r }
plot(c(0,max(wells$dist, na.rm=T)*1.02), c(0,1),
     xlab="Distance (in meters) to nearest safe well", ylab="Pr (switching)",
     type="n", xaxs="i", yaxs="i", mgp=c(2,.5,0))
for (j in 1:20) {
    curve (invlogit(sims[j,1]+sims[j,2]*x/100), lwd=.5,
           col="darkgray", add=TRUE)
}
curve(invlogit(coef(fit_2)[1]+coef(fit_2)[2]*x/100), lwd=1, add=T)
points(wells$dist, jitter_binary(wells$y), pch=20, cex=.1)
```
```{r eval=FALSE, include=FALSE}
if (savefigs) dev.off()
```

## Two predictors

#### Histogram of arsenic levels

```{r eval=FALSE, include=FALSE}
if (savefigs) postscript(root("Arsenic/figs","arsenic.levels.a.ps"),
                         height=3, width=4, horizontal=TRUE)
```
```{r }
hist(wells$arsenic, breaks=seq(0,.25+max(wells$arsenic),.25), freq=TRUE,
     xlab="Arsenic concentration in well water", ylab="", main="", mgp=c(2,.5,0))
```
```{r eval=FALSE, include=FALSE}
if (savefigs) dev.off()
```

#### Fit a model using scaled distance and arsenic level

```{r results='hide'}
fit_3 <- stan_glm(y ~ dist100 + arsenic, family = binomial(link = "logit"),
                  data=wells, algorithm='optimizing')
```


```{r }
print(fit_3, digits=2)
summary(fit_3, digits=2)
```

#### LOO log score

```{r }
(loo3 <- loo(fit_3, save_psis = TRUE))
```

#### Compare models

```{r }
loo_compare(loo2, loo3)
```

#### Average improvement in LOO predictive probabilities<br>
from dist100 to dist100 + arsenic

```{r }
pred2 <- loo_predict(fit_2, psis_object = loo2$psis_object)$value
pred3 <- loo_predict(fit_3, psis_object = loo3$psis_object)$value
round(mean(c(pred3[wells$y==1]-pred2[wells$y==1],pred2[wells$y==0]-pred3[wells$y==0])),3)
```

#### Plot model fits

```{r eval=FALSE, include=FALSE}
if (savefigs) postscript(root("Arsenic/figs","arsenic.2variables.a.ps"),
                         height=3.5, width=4, horizontal=TRUE)
```
```{r }
plot(c(0,max(wells$dist,na.rm=T)*1.02), c(0,1),
     xlab="Distance (in meters) to nearest safe well", ylab="Pr (switching)",
     type="n", xaxs="i", yaxs="i", mgp=c(2,.5,0))
points(wells$dist, jitter_binary(wells$y), pch=20, cex=.1)
curve(invlogit(coef(fit_3)[1]+coef(fit_3)[2]*x/100+coef(fit_3)[3]*.50), lwd=.5, add=T)
curve(invlogit(coef(fit_3)[1]+coef(fit_3)[2]*x/100+coef(fit_3)[3]*1.00), lwd=.5, add=T)
text(50, .27, "if As = 0.5", adj=0, cex=.8)
text(75, .50, "if As = 1.0", adj=0, cex=.8)
```
```{r eval=FALSE, include=FALSE}
if (savefigs) dev.off()
```
```{r eval=FALSE, include=FALSE}
if (savefigs) postscript(root("Arsenic/figs","arsenic.2variables.b.ps"),
                         height=3.5, width=4, horizontal=TRUE)
```
```{r }
plot(c(0,max(wells$arsenic,na.rm=T)*1.02), c(0,1),
     xlab="Arsenic concentration in well water", ylab="Pr (switching)",
     type="n", xaxs="i", yaxs="i", mgp=c(2,.5,0))
points(wells$arsenic, jitter_binary(wells$y), pch=20, cex=.1)
curve(invlogit(coef(fit_3)[1]+coef(fit_3)[2]*0+coef(fit_3)[3]*x), from=0.5, lwd=.5, add=T)
curve(invlogit(coef(fit_3)[1]+coef(fit_3)[2]*0.5+coef(fit_3)[3]*x), from=0.5, lwd=.5, add=T)
text(.5, .78, "if dist = 0", adj=0, cex=.8)
text(2, .6, "if dist = 50", adj=0, cex=.8)
```
```{r eval=FALSE, include=FALSE}
if (savefigs) dev.off()
```

## Interaction

#### Fit a model using scaled distance, arsenic level, and an interaction

```{r results='hide'}
fit_4 <- stan_glm(y ~ dist100 + arsenic + dist100:arsenic,
                  family = binomial(link="logit"), data = wells, algorithm='optimizing')
```


```{r }
print(fit_4, digits=2)
summary(fit_4, digits=2)
```

#### LOO log score

```{r }
(loo4 <- loo(fit_4))
```

#### Compare models

```{r }
loo_compare(loo3, loo4)
```

#### Centering the input variables

```{r }
wells$c_dist100 <- wells$dist100 - mean(wells$dist100)
wells$c_arsenic <- wells$arsenic - mean(wells$arsenic)
fit_5 <- stan_glm(y ~ c_dist100 + c_arsenic + c_dist100:c_arsenic,
                  family = binomial(link="logit"), data = wells, algorithm='optimizing')
```


```{r }
print(fit_5, digits=2)
summary(fit_5, digits=2)
```

#### Plot model fits

```{r eval=FALSE, include=FALSE}
if (savefigs) postscript(root("Arsenic/figs","arsenic.interact.a.ps"),
                         height=3.5, width=4, horizontal=TRUE)
```
```{r }
plot(c(0,max(wells$dist,na.rm=T)*1.02), c(0,1),
     xlab="Distance (in meters) to nearest safe well", ylab="Pr (switching)",
     type="n", xaxs="i", yaxs="i", mgp=c(2,.5,0))
points(wells$dist, jitter_binary(wells$y), pch=20, cex=.1)
curve(invlogit(coef(fit_4)[1]+coef(fit_4)[2]*x/100+coef(fit_4)[3]*.50+coef(fit_4)[4]*x/100*.50), lwd=.5, add=T)
curve(invlogit(coef(fit_4)[1]+coef(fit_4)[2]*x/100+coef(fit_4)[3]*1.00+coef(fit_4)[4]*x/100*1.00), lwd=.5, add=T)
text (50, .29, "if As = 0.5", adj=0, cex=.8)
text (75, .50, "if As = 1.0", adj=0, cex=.8)
```
```{r eval=FALSE, include=FALSE}
if (savefigs) dev.off()
```
```{r eval=FALSE, include=FALSE}
if (savefigs) postscript(root("Arsenic/figs","arsenic.interact.b.ps"),
                         height=3.5, width=4, horizontal=TRUE)
```
```{r }
plot(c(0,max(wells$arsenic,na.rm=T)*1.02), c(0,1),
     xlab="Arsenic concentration in well water", ylab="Pr (switching)",
     type="n", xaxs="i", yaxs="i", mgp=c(2,.5,0))
points(wells$arsenic, jitter_binary(wells$y), pch=20, cex=.1)
curve(invlogit(coef(fit_4)[1]+coef(fit_4)[2]*0+coef(fit_4)[3]*x+coef(fit_4)[4]*0*x), from=0.5, lwd=.5, add=T)
curve(invlogit(coef(fit_4)[1]+coef(fit_4)[2]*0.5+coef(fit_4)[3]*x+coef(fit_4)[4]*0.5*x), from=0.5, lwd=.5, add=T)
text (.5, .78, "if dist = 0", adj=0, cex=.8)
text (2, .6, "if dist = 50", adj=0, cex=.8)
```
```{r eval=FALSE, include=FALSE}
if (savefigs) dev.off()
```

## More predictors

#### Adding social predictors

```{r results='hide'}
fit_6 <- stan_glm(y ~ dist100 + arsenic + educ4 + assoc,
                  family = binomial(link="logit"), data = wells, algorithm='optimizing')
```


```{r }
print(fit_6, digits=2)
summary(fit_6, digits=2)
```

#### LOO log score

```{r }
(loo6 <- loo(fit_6))
```

#### Compare models

```{r }
loo_compare(loo4, loo6)
```

#### Remove assoc

```{r results='hide'}
fit_7 <- stan_glm(y ~ dist100 + arsenic + educ4,
                  family = binomial(link="logit"), data = wells, algorithm='optimizing')
```


```{r }
print(fit_7, digits=2)
summary(fit_7, digits=2)
```

#### LOO log score

```{r }
(loo7 <- loo(fit_7))
```

#### Compare models

```{r }
loo_compare(loo4, loo7)
loo_compare(loo6, loo7)
```

#### Add interactions with education

```{r results='hide'}
wells$c_educ4 <- wells$educ4 - mean(wells$educ4)
fit_8 <- stan_glm(y ~ c_dist100 + c_arsenic + c_educ4 +
                      c_dist100:c_educ4 + c_arsenic:c_educ4,
                  family = binomial(link="logit"), data = wells, algorithm='optimizing')
```


```{r }
print(fit_8, digits=2)
summary(fit_8, digits=2)
```

#### LOO log score

```{r }
(loo8 <- loo(fit_8, save_psis=TRUE))
```

#### Compare models

```{r }
loo_compare(loo3, loo8)
loo_compare(loo7, loo8)
```

#### Average improvement in LOO predictive probabilities</br>
from dist100 + arsenic to dist100 + arsenic + educ4 + dist100:educ4 + arsenic:educ4

```{r }
pred8 <- loo_predict(fit_8, psis_object = loo8$psis_object)$value
round(mean(c(pred8[wells$y==1]-pred3[wells$y==1],pred3[wells$y==0]-pred8[wells$y==0])),3)
```

## Transformation of variable

#### Fit a model using scaled distance and log arsenic level

```{r }
wells$log_arsenic <- log(wells$arsenic)
```
```{r results='hide'}
fit_3a <- stan_glm(y ~ dist100 + log_arsenic, family = binomial(link = "logit"),
                   data = wells, algorithm='optimizing')
```


```{r }
print(fit_3a, digits=2)
summary(fit_3a, digits=2)
```

#### LOO log score

```{r }
(loo3a <- loo(fit_3a))
```

#### Compare models

```{r }
loo_compare(loo3, loo3a)
```

#### Fit a model using scaled distance, log arsenic level, and an interaction<br>

```{r results='hide'}
fit_4a <- stan_glm(y ~ dist100 + log_arsenic + dist100:log_arsenic,
                  family = binomial(link = "logit"), data = wells, algorithm='optimizing')
```


```{r }
print(fit_4a, digits=2)
summary(fit_4a, digits=2)
```

#### LOO log score

```{r }
(loo4a <- loo(fit_4a))
```

#### Compare models

```{r }
loo_compare(loo3a, loo4a)
```

#### Add interactions with education

```{r }
wells$c_log_arsenic <- wells$log_arsenic - mean(wells$log_arsenic)
```
```{r results='hide'}
fit_8a <- stan_glm(y ~ c_dist100 + c_log_arsenic + c_educ4 +
                      c_dist100:c_educ4 + c_log_arsenic:c_educ4,
                  family = binomial(link="logit"), data = wells, algorithm='optimizing')
```


```{r }
print(fit_8a, digits=2)
summary(fit_8a, digits=2)
```

#### LOO log score

```{r }
(loo8a <- loo(fit_8a, save_psis=TRUE))
```

#### Compare models

```{r }
loo_compare(loo8, loo8a)
```