---
title: "Regression and Other Stories: Poststratification 2"
author: "Andrew Gelman, Jennifer Hill, Aki Vehtari"
date: "`r format(Sys.Date())`"
output:
  html_document:
    theme: readable
    toc: true
    toc_depth: 2
    toc_float: true
    code_download: true
---
Demonstrate poststratification with simulated census and poll data
See Chapter 17 in Regression and Other Stories.


-------------

#### Load packages

```{r setup, message=FALSE, error=FALSE, warning=FALSE}
library("rstanarm")
invlogit <- plogis
```

## Simulate fake data

Create an empty poststratification table, with rows for each of the
$2 \times 4 \times 4$ sorts of people.

```{r }
J <- c(2, 4, 4)
poststrat <- as.data.frame(array(NA, c(prod(J), length(J)+1)))
colnames(poststrat) <- c("sex", "age", "eth", "N")
count <- 0
for (i1 in 1:J[1]){
  for (i2 in 1:J[2]){
    for (i3 in 1:J[3]){
      count <- count + 1
      poststrat[count, 1:3] <- c(i1, i2, i3)
    }
  }
}
```

make up numbers for the populations of the cells

```{r }
p_sex <- c(0.52, 0.48)
p_age <- c(0.2, 0.25, 0.3, 0.25)
p_eth <- c(0.7, 0.1, 0.1, 0.1)
for (j in 1:prod(J)){
  poststrat$N[j] <- 250e6 * p_sex[poststrat[j,1]] * p_age[poststrat[j,2]] *
  p_eth[poststrat[j,3]]
}
```

Hypothesize a nonresponse pattern in which women, older people, and
whites are more likely to respond than men, younger people, and
minorities

```{r }
p_response_baseline <- 0.1
p_response_sex <- c(1, 0.8)
p_response_age <- c(1, 1.2, 1.6, 2.5)
p_response_eth <- c(1, 0.8, 0.7, 0.6)
p_response <- rep(NA, prod(J))
for (j in 1:prod(J)){
  p_response[j] <- p_response_baseline * p_response_sex[poststrat[j,1]] *
    p_response_age[poststrat[j,2]] * p_response_eth[poststrat[j,3]]
}
```

Sample from the assumed population with the assumed nonresponse probabilities

```{r }
n <- 1000
people <- sample(prod(J), n, replace=TRUE, prob=poststrat$N*p_response)
# For respondent i, people[i] is that person's poststrat cell,
# some number between 1 and 32
n_cell <- rep(NA, prod(J))
for (j in 1:prod(J)){
  n_cell[j] <- sum(people==j)
}
print(cbind(poststrat, n_cell/n, poststrat$N/sum(poststrat$N)))
```

Assume the survey responses come from a logistic regression with
these coefficients

```{r }
coef_intercept <- 0.6
coef_sex <- c(0, -0.2)
coef_age <- c(0, -0.2, -0.3, -0.4)
coef_eth <- c(0, 0.6, 0.3, 0.3)
```

The probabilities are:

```{r }
prob_yes <- rep(NA, prod(J))
for (j in 1:prod(J)){
  prob_yes[j] <- invlogit(coef_intercept + coef_sex[poststrat[j,1]] +
  coef_age[poststrat[j,2]] + coef_eth[poststrat[j,3]])
}
```

Simulate the fake data:

```{r }
y <- rbinom(n, 1, prob_yes[people])
```

## Linear model

```{r }
sex <- poststrat[people,1]
age <- poststrat[people,2]
eth <- poststrat[people,3]
fake <- data.frame(y, sex, age, eth)
fit <- stan_glm(y ~ factor(sex) + factor(age) + factor(eth),
                family=binomial(link="logit"), data=fake, refresh=0)
print(fit)
```

#### Prediction

```{r }
pred_sim <- posterior_epred(fit, newdata=as.data.frame(poststrat))
pred_est <- colMeans(pred_sim)
round(cbind(poststrat, prob_yes, pred_est), 2)
```

#### Poststratification

```{r }
poststrat_est <- sum(poststrat$N*pred_est)/sum(poststrat$N)
round(poststrat_est, 2)
```

plus uncertainty

```{r }
poststrat_sim <- pred_sim %*% poststrat$N / sum(poststrat$N)
round(c(mean(poststrat_sim), sd(poststrat_sim)), 3)
```