#' ---
#' title: "Regression and Other Stories: Newcomb"
#' author: "Andrew Gelman, Jennifer Hill, Aki Vehtari"
#' date: "`r format(Sys.Date())`"
#' output:
#'   html_document:
#'     theme: readable
#'     toc: true
#'     toc_depth: 2
#'     toc_float: true
#'     code_download: true
#' ---

#' Posterior predictive checking of Normal model for Newcomb's speed
#' of light data. See Chapter 11 in Regression and Other Stories.
#' 
#' -------------
#' 

#+ setup, include=FALSE
knitr::opts_chunk$set(message=FALSE, error=FALSE, warning=FALSE, comment=NA)
# switch this to TRUE to save figures in separate files
savefigs <- FALSE

#' #### Load packages
library("rprojroot")
root<-has_file(".ROS-Examples-root")$make_fix_file()
library("rstanarm")
library("ggplot2")
library("bayesplot")
theme_set(bayesplot::theme_default(base_family = "sans"))
#+ eval=FALSE, include=FALSE
# grayscale figures for the book
color_scheme_set(scheme = "gray")

#' #### Load Data
#' 
#' Simon Newcomb's measurements of the speed of light, from Stigler
#' (1977).  The data are recorded as deviations from $24,\!800$
#' nanoseconds.
newcomb <- read.table(root("Newcomb/data","newcomb.txt"), header = TRUE)
head(newcomb)

#' #### Histogram of the data
#+ eval=FALSE, include=FALSE
if (savefigs) pdf(root("Newcomb/figs","newcomb_hist.pdf"), height=4, width=5)
#+
hist(newcomb$y, main=NULL, ylab="", xlab="", yaxt="n", breaks=30)
#+ eval=FALSE, include=FALSE
if (savefigs) dev.off()

#' #### Histogram of the data with bayesplot
mcmc_hist(newcomb, pars="y") + xlab("")

#' #### Fit a regression model with just the intercept term
#' 
#' The option `refresh = 0` supresses the default Stan sampling
#' progress output. This is useful for small data with fast
#' computation. For more complex models and bigger data, it can be
#' useful to see the progress.
fit <- stan_glm(y ~ 1, data=newcomb, refresh=0)

#' #### Simulate from the predictive distribution
sims <- as.matrix(fit)
n_sims <- nrow(sims)
n <- length(newcomb$y)
y_rep <- array(NA, c(n_sims, n))
for (s in 1:n_sims)
    y_rep[s,] <- rnorm(n, sims[s,1], sims[s,2])

#' #### Plot histogram of 20 replicates
#+ eval=FALSE, include=FALSE
if (savefigs) pdf(root("Newcomb/figs","newcomb_rep_hist.pdf"), height=4, width=8)
#+
par(mfrow=c(4,5), mar=rep(2,4))
for (s in sample(n_sims, 20)) {
  hist(y_rep[s,], main=NULL, ylab="", xlab="", yaxt="n") }
#+ eval=FALSE, include=FALSE
if (savefigs) dev.off()

#' #### Simulate using built-in function
y_rep <- posterior_predict(fit)
#' #### Plot data and 19 replications using built-in function
ppc_hist(newcomb$y, y_rep[1:19, ], binwidth = 8)
#+ eval=FALSE, include=FALSE
if (savefigs) ggsave(root("Newcomb/figs","newcomb_ppc_hist.pdf"), width = 9, height = 4)

#' #### Plot kernel density estimate of data and 100 replications using built-in function
ppc_dens_overlay(newcomb$y, y_rep[1:100, ]) + scale_y_continuous(breaks=NULL)
#+ eval=FALSE, include=FALSE
if (savefigs) ggsave(root("Newcomb/figs","newcomb_ppc_dens_overlay.pdf"), width = 6, height = 2.5)

#' #### Plot test statistic for data and replicates
test <- function (y) {
  min(y) }
test_rep <- apply(y_rep, 1, test)
#+ eval=FALSE, include=FALSE
if (savefigs) pdf(root("Newcomb/figs","newcomb_test_hist.pdf"), height=4, width=5)
#+
hist(test_rep, xlim=c(-50,20), breaks=20, yaxt="n",
     xlab="", ylab="", main=NULL)
lines(rep(test(newcomb$y),2), c(0,n_sims), lwd=3)
#+ eval=FALSE, include=FALSE
if (savefigs) dev.off()

#' #### Plot test statistic for data and replicates using built-in function
ppc_stat(newcomb$y, y_rep, stat = "min", binwidth = 2)
#+ eval=FALSE, include=FALSE
if (savefigs) ggsave(root("Newcomb/figs","newcomb_ppc_stat.pdf"), width = 6, height = 4)