---
title: "Regression and Other Stories: Simple regression"
author: "Andrew Gelman, Jennifer Hill, Aki Vehtari"
date: "`r format(Sys.Date())`"
output:
  html_document:
    theme: readable
    toc: true
    toc_depth: 2
    toc_float: true
    code_download: true
---
Linear regression with a single predictor. See Chapters 6 and 7 in
Regression and Other Stories.

-------------


```{r setup, include=FALSE}
knitr::opts_chunk$set(message=FALSE, error=FALSE, warning=FALSE, comment=NA)
# switch this to TRUE to save figures in separate files
savefigs <- FALSE
```

#### Load packages

```{r }
library("rprojroot")
root<-has_file(".ROS-Examples-root")$make_fix_file()
library("rstanarm")
```

## Fitting a regression using a data frame in R

#### Simulate fake data

```{r }
x <- 1:20
n <- length(x)
a <- 0.2
b <- 0.3
sigma <- 0.5
# set the random seed to get reproducible results
# change the seed to experiment with variation due to random noise
set.seed(2141) 
y <- a + b*x + sigma*rnorm(n)
fake <- data.frame(x, y)
```

#### Linear regression model

The option `refresh = 0` supresses the default Stan sampling
progress output. This is useful for small data with fast
computation. For more complex models and bigger data, it can be
useful to see the progress.

```{r }
fit_1 <- stan_glm(y ~ x, data = fake, seed=2141, refresh = 0)
print(fit_1, digits=2)
```

#### Plot for the book

```{r eval=FALSE, include=FALSE}
if (savefigs) pdf(root("Simplest/figs","simple.pdf"), height=4, width=5.5)
```
```{r }
par(mar=c(3,3,1,1), mgp=c(1.7,.5,0), tck=-.01)
plot(fake$x, fake$y, main="Data and fitted regression line", bty="l", pch=20,
     xlab = "x", ylab = "y")
a_hat <- coef(fit_1)[1]
b_hat <- coef(fit_1)[2]
abline(a_hat, b_hat)
x_bar <- mean(fake$x)
text(x_bar, a_hat + b_hat*x_bar, paste("   y =", round(a_hat, 2), "+", round(b_hat, 2), "* x"), adj=0)
```
```{r eval=FALSE, include=FALSE}
if (savefigs) dev.off()
```

## Formulating comparisons as regression models

#### Simulate fake data

```{r }
n_0 <- 200
# set the random seed to get reproducible results
# change the seed to experiment with variation due to random noise
set.seed(2141)
y_0 <- rnorm(n_0, 2.0, 5.0)
fake_0 <- data.frame(y_0)
round(y_0, 1)
round(mean(y_0), 2)
round(sd(y_0)/sqrt(n_0), 2)
```

#### Estimating the mean is the same as regressing on a constant term

```{r results='hide'}
fit_2 <- stan_glm(y_0 ~ 1, data = fake_0, seed=2141, refresh = 0,
                  prior_intercept = NULL, prior = NULL, prior_aux = NULL)
```
```{r }
print(fit_2)
```

#### Simulate fake data

```{r }
n_1 <- 300
# set the random seed to get reproducible results
# change the seed to experiment with variation due to random noise
set.seed(2141)
y_1 <- rnorm(n_1, 8, 5)
diff <- mean(y_1) - mean(y_0)
se_0 <- sd(y_0)/sqrt(n_0)
se_1 <- sd(y_1)/sqrt(n_1)
se <- sqrt(se_0^2 + se_1^2)
print(diff)

print(se)
```

#### Estimating a difference is the same as regressing on an indicator variable

```{r }
n <- n_0 + n_1
y <- c(y_0, y_1)
x <- c(rep(0, n_0), rep(1, n_1))
fake <- data.frame(y, x)
fit_3 <- stan_glm(y ~ x, data = fake, seed=2141, refresh = 0, 
                  prior_intercept = NULL, prior = NULL, prior_aux = NULL)
print(fit_3)
```

#### Plot for the book

```{r eval=FALSE, include=FALSE}
if (savefigs) pdf(root("Simplest/figs","simplest_1.pdf"), height=4, width=5)
```
```{r }
par(mar=c(3,3,3,2), mgp=c(1.7,.5,0), tck=-.01)
plot(x, y, xlab="Indicator, x", ylab="y", bty="l", xaxt="n", main="Regression on an indicator is the same\nas computing a difference in means",  pch=19, cex=.5)
axis(1, c(0, 1))
abline(h=mean(y[x==0]), lty=2, col="gray50")
abline(h=mean(y[x==1]), lty=2, col="gray50")
abline(coef(fit_3)[1], coef(fit_3)[2])
text(.5, -1 + coef(fit_3)[1] + .5*coef(fit_3)[2], paste("y =", round(coef(fit_3)[1], 2), "+", round(coef(fit_3)[2], 2), "x"), cex=.9, adj=0)
text(.05, -1 + mean(y[x==0]), bquote(paste(bar(y)[0], " = ", .(round(mean(y[x==0]), 2)))), col="gray30", cex=.9, adj=0)
text(.95, 1 + mean(y[x==1]), bquote(paste(bar(y)[1], " = ", .(round(mean(y[x==1]), 2)))), col="gray30", cex=.9, adj=1)
```
```{r eval=FALSE, include=FALSE}
if (savefigs) dev.off()
```

#### Repeat with flat priors

```{r }
fit_3b <- stan_glm(y ~ x, data = fake, seed=2141, refresh = 0,
                  prior=NULL, prior_intercept=NULL, prior_aux=NULL)
print(fit_3b)
```