--- title: "Regression and Other Stories: Poststratification 2" author: "Andrew Gelman, Jennifer Hill, Aki Vehtari" date: "`r format(Sys.Date())`" output: html_document: theme: readable toc: true toc_depth: 2 toc_float: true code_download: true --- Demonstrate poststratification with simulated census and poll data See Chapter 17 in Regression and Other Stories. ------------- #### Load packages ```{r setup, message=FALSE, error=FALSE, warning=FALSE} library("rstanarm") invlogit <- plogis ``` ## Simulate fake data Create an empty poststratification table, with rows for each of the $2 \times 4 \times 4$ sorts of people. ```{r } J <- c(2, 4, 4) poststrat <- as.data.frame(array(NA, c(prod(J), length(J)+1))) colnames(poststrat) <- c("sex", "age", "eth", "N") count <- 0 for (i1 in 1:J[1]){ for (i2 in 1:J[2]){ for (i3 in 1:J[3]){ count <- count + 1 poststrat[count, 1:3] <- c(i1, i2, i3) } } } ``` make up numbers for the populations of the cells ```{r } p_sex <- c(0.52, 0.48) p_age <- c(0.2, 0.25, 0.3, 0.25) p_eth <- c(0.7, 0.1, 0.1, 0.1) for (j in 1:prod(J)){ poststrat$N[j] <- 250e6 * p_sex[poststrat[j,1]] * p_age[poststrat[j,2]] * p_eth[poststrat[j,3]] } ``` Hypothesize a nonresponse pattern in which women, older people, and whites are more likely to respond than men, younger people, and minorities ```{r } p_response_baseline <- 0.1 p_response_sex <- c(1, 0.8) p_response_age <- c(1, 1.2, 1.6, 2.5) p_response_eth <- c(1, 0.8, 0.7, 0.6) p_response <- rep(NA, prod(J)) for (j in 1:prod(J)){ p_response[j] <- p_response_baseline * p_response_sex[poststrat[j,1]] * p_response_age[poststrat[j,2]] * p_response_eth[poststrat[j,3]] } ``` Sample from the assumed population with the assumed nonresponse probabilities ```{r } n <- 1000 people <- sample(prod(J), n, replace=TRUE, prob=poststrat$N*p_response) # For respondent i, people[i] is that person's poststrat cell, # some number between 1 and 32 n_cell <- rep(NA, prod(J)) for (j in 1:prod(J)){ n_cell[j] <- sum(people==j) } print(cbind(poststrat, n_cell/n, poststrat$N/sum(poststrat$N))) ``` Assume the survey responses come from a logistic regression with these coefficients ```{r } coef_intercept <- 0.6 coef_sex <- c(0, -0.2) coef_age <- c(0, -0.2, -0.3, -0.4) coef_eth <- c(0, 0.6, 0.3, 0.3) ``` The probabilities are: ```{r } prob_yes <- rep(NA, prod(J)) for (j in 1:prod(J)){ prob_yes[j] <- invlogit(coef_intercept + coef_sex[poststrat[j,1]] + coef_age[poststrat[j,2]] + coef_eth[poststrat[j,3]]) } ``` Simulate the fake data: ```{r } y <- rbinom(n, 1, prob_yes[people]) ``` ## Linear model ```{r } sex <- poststrat[people,1] age <- poststrat[people,2] eth <- poststrat[people,3] fake <- data.frame(y, sex, age, eth) fit <- stan_glm(y ~ factor(sex) + factor(age) + factor(eth), family=binomial(link="logit"), data=fake, refresh=0) print(fit) ``` #### Prediction ```{r } pred_sim <- posterior_epred(fit, newdata=as.data.frame(poststrat)) pred_est <- colMeans(pred_sim) round(cbind(poststrat, prob_yes, pred_est), 2) ``` #### Poststratification ```{r } poststrat_est <- sum(poststrat$N*pred_est)/sum(poststrat$N) round(poststrat_est, 2) ``` plus uncertainty ```{r } poststrat_sim <- pred_sim %*% poststrat$N / sum(poststrat$N) round(c(mean(poststrat_sim), sd(poststrat_sim)), 3) ```