--- title: "Regression and Other Stories: Poststratification" author: "Andrew Gelman, Jennifer Hill, Aki Vehtari" date: "`r format(Sys.Date())`" output: html_document: theme: readable toc: true toc_depth: 2 toc_float: true code_download: true --- Poststratification after estimation. See Chapter 17 in Regression and Other Stories. The CBS News poll conducted from 12--16 October 2016 reported that, among likely voters who preferred one of the two major-party candidates, 45\% intended to vote for Donald Trump and 55\% for Hillary Clinton. Of these respondents, Party ID 33\% Republican, 40\% Republican, 27\% independent. source: http://www.cbsnews.com/news/cbs-poll-clintons-lead-over-trump-widens-with-three-weeks-to-go/ and https://www.scribd.com/document/327938789/CBS-News-Poll-10-17-toplines Effective sample size of likely voters ``` 254 Republican, 282 Democrat, 242 Independent ``` Compare to: ``` exit polls 2012 32 38 29 exit polls 2016 33 36 31 ``` ``` Republicans: 77% Trump, 8% Clinton (must normalize to 100%) Democrats: 5% Trump, 89% Clinton (must normalize to 100%) Independents: 36% Trump, 38% Clinton (must normalize to 100%) ``` ------------- ````{r setup, include=FALSE} knitr::opts_chunk$set(message=FALSE, error=FALSE, warning=FALSE, comment=NA) ```` #### Load packages ````{r } library("rprojroot") root<-has_file(".ROS-Examples-root")$make_fix_file() library("rstanarm") ```` ## Simulate fake data ````{r } n_pid <- c(254, 282, 242) n <- sum(n_pid) pid_names <- c("Republican", "Democrat", "Independent") pid <- rep(pid_names, n_pid) n_vote <- as.list(rep(NA, 3)) n_vote[[1]] <- round(c(0.77, 0.08)*n_pid[1]) n_vote[[2]] <- round(c(0.05, 0.89)*n_pid[2]) n_vote[[3]] <- round(c(0.36, 0.38)*n_pid[3]) vote <- NULL y_bar_cells <- rep(NA, 3) for (j in 1:3){ n_vote[[j]]<- c(n_vote[[j]], n_pid[j] - sum(n_vote[[j]])) vote <- c(vote, rep(c(1, 0, NA), n_vote[[j]])) y_bar_cells[j] <- mean(vote[pid==pid_names[j]], na.rm=TRUE) round(y_bar_cells[j], 3) } poll <- data.frame(vote, pid) # write.csv(poll, root("Poststrat/data","poll.csv"), row.names=FALSE) # poll <- read.csv(root("Poststrat/data","poll.csv")) head(poll) summary(poll) ```` ## Simple poststrat ````{r } poststrat_data <- data.frame(pid=c("Republican", "Democrat", "Independent"), N=c(0.33, 0.36, 0.31)) round(sum(poststrat_data$N * y_bar_cells), 3) ```` ## Linear model #### Raw estimate ````{r } round(mean(poll$vote, na.rm=TRUE), 3) ```` #### stan_glm ````{r } fit_1 <- stan_glm(vote ~ factor(pid), data = poll, refresh = 0) print(fit_1, digits=2) ```` #### Poststrat using posterior_linpred() ````{r } epred_1 <- posterior_epred(fit_1, newdata=poststrat_data) poststrat_est_1 <- epred_1 %*% poststrat_data$N/sum(poststrat_data$N) print(c(mean(poststrat_est_1), mad(poststrat_est_1)), digits=2) ```` #### Add extra uncertainty ````{r } n_sim <- nrow(epred_1) poststrat_est_2 <- poststrat_est_1 + rnorm(n_sim, 0, 0.02) print(c(mean(poststrat_est_2), mad(poststrat_est_2)), digits=2) ```` ## Logistic model #### Fit the regression ````{r } fit <- stan_glm(vote ~ factor(pid), family=binomial(link="logit"), data = poll, refresh = 0) print(fit, digits=2) ```` #### Raw estimate ````{r } round(mean(poll$vote, na.rm=TRUE), 3) ```` #### Poststrat using the predict function ````{r } X_population <- data.frame(pid=c("Republican", "Democrat", "Independent")) N_population <- c(0.33, 0.36, 0.31) predict_poststrat <- colMeans(posterior_epred(fit, newdata=X_population)) poststrat_est_2 <- sum(N_population*predict_poststrat)/sum(N_population) round(poststrat_est_2, digits=3) 
````

#### Just to compare, poststrat using the original data

````{r }
predict_a <- predict(fit, type="response")
round(mean(predict_a), 3)
````

This doesn't work: it just gives back the raw estimate, because it makes no
use of the external population information, and that external information is
what makes poststratification work.
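#### Poststrat as a weighted mean of the data

To make that point concrete, here is an added sketch (not part of the
original example) that brings in the same external population information
without a regression: weight each respondent by the ratio of population
share to sample share for their party-ID cell and take a weighted mean of
the raw responses. This reproduces the simple poststratified estimate
computed above. The names `n_cells`, `pop_share`, `samp_share`, `w_cell`,
and `w` are introduced here for illustration.

````{r }
# Added sketch, not in the original: poststratification as a weighted mean.
# Sample counts by cell, among respondents with an observed two-party preference
n_cells <- table(poll$pid[!is.na(poll$vote)])
# Assumed population shares, matched to the cell names
pop_share <- setNames(poststrat_data$N, poststrat_data$pid)
# Sample shares of the same cells
samp_share <- as.numeric(n_cells) / sum(n_cells)
names(samp_share) <- names(n_cells)
# Cell weight: population share divided by sample share
w_cell <- pop_share[names(samp_share)] / samp_share
# Assign each respondent the weight of their cell and take the weighted mean
w <- w_cell[as.character(poll$pid)]
round(weighted.mean(poll$vote, w, na.rm=TRUE), 3)
````

The reweighting uses exactly the same external information, the assumed
population party-ID shares, as the model-based poststratification above; the
regression becomes important when there are many small cells and the cell
estimates need to be regularized.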