--- title: "Regression and Other Stories: Simple regression" author: "Andrew Gelman, Jennifer Hill, Aki Vehtari" date: "`r format(Sys.Date())`" output: html_document: theme: readable toc: true toc_depth: 2 toc_float: true code_download: true --- Linear regression with a single predictor. See Chapters 6 and 7 in Regression and Other Stories. ------------- ```{r setup, include=FALSE} knitr::opts_chunk$set(message=FALSE, error=FALSE, warning=FALSE, comment=NA) # switch this to TRUE to save figures in separate files savefigs <- FALSE ``` #### Load packages ```{r } library("rprojroot") root<-has_file(".ROS-Examples-root")$make_fix_file() library("rstanarm") ``` ## Fitting a regression using a data frame in R #### Simulate fake data ```{r } x <- 1:20 n <- length(x) a <- 0.2 b <- 0.3 sigma <- 0.5 # set the random seed to get reproducible results # change the seed to experiment with variation due to random noise set.seed(2141) y <- a + b*x + sigma*rnorm(n) fake <- data.frame(x, y) ``` #### Linear regression model The option `refresh = 0` supresses the default Stan sampling progress output. This is useful for small data with fast computation. For more complex models and bigger data, it can be useful to see the progress. ```{r } fit_1 <- stan_glm(y ~ x, data = fake, seed=2141, refresh = 0) print(fit_1, digits=2) ``` #### Plot for the book ```{r eval=FALSE, include=FALSE} if (savefigs) pdf(root("Simplest/figs","simple.pdf"), height=4, width=5.5) ``` ```{r } par(mar=c(3,3,1,1), mgp=c(1.7,.5,0), tck=-.01) plot(fake$x, fake$y, main="Data and fitted regression line", bty="l", pch=20, xlab = "x", ylab = "y") a_hat <- coef(fit_1)[1] b_hat <- coef(fit_1)[2] abline(a_hat, b_hat) x_bar <- mean(fake$x) text(x_bar, a_hat + b_hat*x_bar, paste(" y =", round(a_hat, 2), "+", round(b_hat, 2), "* x"), adj=0) ``` ```{r eval=FALSE, include=FALSE} if (savefigs) dev.off() ``` ## Formulating comparisons as regression models #### Simulate fake data ```{r } n_0 <- 200 # set the random seed to get reproducible results # change the seed to experiment with variation due to random noise set.seed(2141) y_0 <- rnorm(n_0, 2.0, 5.0) fake_0 <- data.frame(y_0) round(y_0, 1) round(mean(y_0), 2) round(sd(y_0)/sqrt(n_0), 2) ``` #### Estimating the mean is the same as regressing on a constant term ```{r results='hide'} fit_2 <- stan_glm(y_0 ~ 1, data = fake_0, seed=2141, refresh = 0, prior_intercept = NULL, prior = NULL, prior_aux = NULL) ``` ```{r } print(fit_2) ``` #### Simulate fake data ```{r } n_1 <- 300 # set the random seed to get reproducible results # change the seed to experiment with variation due to random noise set.seed(2141) y_1 <- rnorm(n_1, 8, 5) diff <- mean(y_1) - mean(y_0) se_0 <- sd(y_0)/sqrt(n_0) se_1 <- sd(y_1)/sqrt(n_1) se <- sqrt(se_0^2 + se_1^2) print(diff) print(se) ``` #### Estimating a difference is the same as regressing on an indicator variable ```{r } n <- n_0 + n_1 y <- c(y_0, y_1) x <- c(rep(0, n_0), rep(1, n_1)) fake <- data.frame(y, x) fit_3 <- stan_glm(y ~ x, data = fake, seed=2141, refresh = 0, prior_intercept = NULL, prior = NULL, prior_aux = NULL) print(fit_3) ``` #### Plot for the book ```{r eval=FALSE, include=FALSE} if (savefigs) pdf(root("Simplest/figs","simplest_1.pdf"), height=4, width=5) ``` ```{r } par(mar=c(3,3,3,2), mgp=c(1.7,.5,0), tck=-.01) plot(x, y, xlab="Indicator, x", ylab="y", bty="l", xaxt="n", main="Regression on an indicator is the same\nas computing a difference in means", pch=19, cex=.5) axis(1, c(0, 1)) abline(h=mean(y[x==0]), lty=2, col="gray50") abline(h=mean(y[x==1]), lty=2, col="gray50") abline(coef(fit_3)[1], coef(fit_3)[2]) text(.5, -1 + coef(fit_3)[1] + .5*coef(fit_3)[2], paste("y =", round(coef(fit_3)[1], 2), "+", round(coef(fit_3)[2], 2), "x"), cex=.9, adj=0) text(.05, -1 + mean(y[x==0]), bquote(paste(bar(y)[0], " = ", .(round(mean(y[x==0]), 2)))), col="gray30", cex=.9, adj=0) text(.95, 1 + mean(y[x==1]), bquote(paste(bar(y)[1], " = ", .(round(mean(y[x==1]), 2)))), col="gray30", cex=.9, adj=1) ``` ```{r eval=FALSE, include=FALSE} if (savefigs) dev.off() ``` #### Repeat with flat priors ```{r } fit_3b <- stan_glm(y ~ x, data = fake, seed=2141, refresh = 0, prior=NULL, prior_intercept=NULL, prior_aux=NULL) print(fit_3b) ```