#' --- #' title: "Regression and Other Stories: Poststratification 2" #' author: "Andrew Gelman, Jennifer Hill, Aki Vehtari" #' date: "`r format(Sys.Date())`" #' output: #' html_document: #' theme: readable #' toc: true #' toc_depth: 2 #' toc_float: true #' code_download: true #' --- #' Demonstrate poststratification with simulated census and poll data #' See Chapter 17 in Regression and Other Stories. #' #' #' ------------- #' #' #### Load packages #+ setup, message=FALSE, error=FALSE, warning=FALSE library("rstanarm") invlogit <- plogis #' ## Simulate fake data #' #' Create an empty poststratification table, with rows for each of the #' $2 \times 4 \times 4$ sorts of people. J <- c(2, 4, 4) poststrat <- as.data.frame(array(NA, c(prod(J), length(J)+1))) colnames(poststrat) <- c("sex", "age", "eth", "N") count <- 0 for (i1 in 1:J[1]){ for (i2 in 1:J[2]){ for (i3 in 1:J[3]){ count <- count + 1 poststrat[count, 1:3] <- c(i1, i2, i3) } } } #' make up numbers for the populations of the cells p_sex <- c(0.52, 0.48) p_age <- c(0.2, 0.25, 0.3, 0.25) p_eth <- c(0.7, 0.1, 0.1, 0.1) for (j in 1:prod(J)){ poststrat$N[j] <- 250e6 * p_sex[poststrat[j,1]] * p_age[poststrat[j,2]] * p_eth[poststrat[j,3]] } #' Hypothesize a nonresponse pattern in which women, older people, and #' whites are more likely to respond than men, younger people, and #' minorities p_response_baseline <- 0.1 p_response_sex <- c(1, 0.8) p_response_age <- c(1, 1.2, 1.6, 2.5) p_response_eth <- c(1, 0.8, 0.7, 0.6) p_response <- rep(NA, prod(J)) for (j in 1:prod(J)){ p_response[j] <- p_response_baseline * p_response_sex[poststrat[j,1]] * p_response_age[poststrat[j,2]] * p_response_eth[poststrat[j,3]] } #' Sample from the assumed population with the assumed nonresponse probabilities n <- 1000 people <- sample(prod(J), n, replace=TRUE, prob=poststrat$N*p_response) # For respondent i, people[i] is that person's poststrat cell, # some number between 1 and 32 n_cell <- rep(NA, prod(J)) for (j in 1:prod(J)){ n_cell[j] <- sum(people==j) } print(cbind(poststrat, n_cell/n, poststrat$N/sum(poststrat$N))) #' Assume the survey responses come from a logistic regression with #' these coefficients coef_intercept <- 0.6 coef_sex <- c(0, -0.2) coef_age <- c(0, -0.2, -0.3, -0.4) coef_eth <- c(0, 0.6, 0.3, 0.3) #' The probabilities are: prob_yes <- rep(NA, prod(J)) for (j in 1:prod(J)){ prob_yes[j] <- invlogit(coef_intercept + coef_sex[poststrat[j,1]] + coef_age[poststrat[j,2]] + coef_eth[poststrat[j,3]]) } #' Simulate the fake data: y <- rbinom(n, 1, prob_yes[people]) #' ## Linear model sex <- poststrat[people,1] age <- poststrat[people,2] eth <- poststrat[people,3] fake <- data.frame(y, sex, age, eth) fit <- stan_glm(y ~ factor(sex) + factor(age) + factor(eth), family=binomial(link="logit"), data=fake, refresh=0) print(fit) #' #### Prediction pred_sim <- posterior_epred(fit, newdata=as.data.frame(poststrat)) pred_est <- colMeans(pred_sim) round(cbind(poststrat, prob_yes, pred_est), 2) #' #### Poststratification poststrat_est <- sum(poststrat$N*pred_est)/sum(poststrat$N) round(poststrat_est, 2) #' plus uncertainty poststrat_sim <- pred_sim %*% poststrat$N / sum(poststrat$N) round(c(mean(poststrat_sim), sd(poststrat_sim)), 3)