#' --- #' title: "Regression and Other Stories: Simple regression" #' author: "Andrew Gelman, Jennifer Hill, Aki Vehtari" #' date: "`r format(Sys.Date())`" #' output: #' html_document: #' theme: readable #' toc: true #' toc_depth: 2 #' toc_float: true #' code_download: true #' --- #' Linear regression with a single predictor. See Chapters 6 and 7 in #' Regression and Other Stories. #' #' ------------- #' #+ setup, include=FALSE knitr::opts_chunk$set(message=FALSE, error=FALSE, warning=FALSE, comment=NA) # switch this to TRUE to save figures in separate files savefigs <- FALSE #' #### Load packages library("rprojroot") root<-has_file(".ROS-Examples-root")$make_fix_file() library("rstanarm") #' ## Fitting a regression using a data frame in R #' #' #### Simulate fake data x <- 1:20 n <- length(x) a <- 0.2 b <- 0.3 sigma <- 0.5 # set the random seed to get reproducible results # change the seed to experiment with variation due to random noise set.seed(2141) y <- a + b*x + sigma*rnorm(n) fake <- data.frame(x, y) #' #### Linear regression model #' #' The option `refresh = 0` supresses the default Stan sampling #' progress output. This is useful for small data with fast #' computation. For more complex models and bigger data, it can be #' useful to see the progress. fit_1 <- stan_glm(y ~ x, data = fake, seed=2141, refresh = 0) print(fit_1, digits=2) #' #### Plot for the book #+ eval=FALSE, include=FALSE if (savefigs) pdf(root("Simplest/figs","simple.pdf"), height=4, width=5.5) #+ par(mar=c(3,3,1,1), mgp=c(1.7,.5,0), tck=-.01) plot(fake$x, fake$y, main="Data and fitted regression line", bty="l", pch=20, xlab = "x", ylab = "y") a_hat <- coef(fit_1)[1] b_hat <- coef(fit_1)[2] abline(a_hat, b_hat) x_bar <- mean(fake$x) text(x_bar, a_hat + b_hat*x_bar, paste(" y =", round(a_hat, 2), "+", round(b_hat, 2), "* x"), adj=0) #+ eval=FALSE, include=FALSE if (savefigs) dev.off() #' ## Formulating comparisons as regression models #' #' #### Simulate fake data n_0 <- 200 # set the random seed to get reproducible results # change the seed to experiment with variation due to random noise set.seed(2141) y_0 <- rnorm(n_0, 2.0, 5.0) fake_0 <- data.frame(y_0) round(y_0, 1) round(mean(y_0), 2) round(sd(y_0)/sqrt(n_0), 2) #' #### Estimating the mean is the same as regressing on a constant term #+ results='hide' fit_2 <- stan_glm(y_0 ~ 1, data = fake_0, seed=2141, refresh = 0, prior_intercept = NULL, prior = NULL, prior_aux = NULL) #+ print(fit_2) #' #### Simulate fake data n_1 <- 300 # set the random seed to get reproducible results # change the seed to experiment with variation due to random noise set.seed(2141) y_1 <- rnorm(n_1, 8, 5) diff <- mean(y_1) - mean(y_0) se_0 <- sd(y_0)/sqrt(n_0) se_1 <- sd(y_1)/sqrt(n_1) se <- sqrt(se_0^2 + se_1^2) print(diff) print(se) #' #### Estimating a difference is the same as regressing on an indicator variable n <- n_0 + n_1 y <- c(y_0, y_1) x <- c(rep(0, n_0), rep(1, n_1)) fake <- data.frame(y, x) fit_3 <- stan_glm(y ~ x, data = fake, seed=2141, refresh = 0, prior_intercept = NULL, prior = NULL, prior_aux = NULL) print(fit_3) #' #### Plot for the book #+ eval=FALSE, include=FALSE if (savefigs) pdf(root("Simplest/figs","simplest_1.pdf"), height=4, width=5) #+ par(mar=c(3,3,3,2), mgp=c(1.7,.5,0), tck=-.01) plot(x, y, xlab="Indicator, x", ylab="y", bty="l", xaxt="n", main="Regression on an indicator is the same\nas computing a difference in means", pch=19, cex=.5) axis(1, c(0, 1)) abline(h=mean(y[x==0]), lty=2, col="gray50") abline(h=mean(y[x==1]), lty=2, col="gray50") abline(coef(fit_3)[1], coef(fit_3)[2]) text(.5, -1 + coef(fit_3)[1] + .5*coef(fit_3)[2], paste("y =", round(coef(fit_3)[1], 2), "+", round(coef(fit_3)[2], 2), "x"), cex=.9, adj=0) text(.05, -1 + mean(y[x==0]), bquote(paste(bar(y)[0], " = ", .(round(mean(y[x==0]), 2)))), col="gray30", cex=.9, adj=0) text(.95, 1 + mean(y[x==1]), bquote(paste(bar(y)[1], " = ", .(round(mean(y[x==1]), 2)))), col="gray30", cex=.9, adj=1) #+ eval=FALSE, include=FALSE if (savefigs) dev.off() #' #### Repeat with flat priors fit_3b <- stan_glm(y ~ x, data = fake, seed=2141, refresh = 0, prior=NULL, prior_intercept=NULL, prior_aux=NULL) print(fit_3b)