#' --- #' title: "Regression and Other Stories: Poststratification" #' author: "Andrew Gelman, Jennifer Hill, Aki Vehtari" #' date: "`r format(Sys.Date())`" #' output: #' html_document: #' theme: readable #' toc: true #' toc_depth: 2 #' toc_float: true #' code_download: true #' --- #' Poststratification after estimation. See Chapter 17 in Regression #' and Other Stories. #' #' The CBS News poll conducted from 12--16 October 2016 reported that, #' among likely voters who preferred one of the two major-party #' candidates, 45\% intended to vote for Donald Trump and 55\% for #' Hillary Clinton. Of these respondents, Party ID 33\% Republican, #' 40\% Republican, 27\% independent. #' #' source: http://www.cbsnews.com/news/cbs-poll-clintons-lead-over-trump-widens-with-three-weeks-to-go/ #' and https://www.scribd.com/document/327938789/CBS-News-Poll-10-17-toplines #' #' Effective sample size of likely voters #' ``` #' 254 Republican, 282 Democrat, 242 Independent #' ``` #' #' Compare to: #' ``` #' exit polls 2012 32 38 29 #' exit polls 2016 33 36 31 #' ``` #' #' ``` #' Republicans: 77% Trump, 8% Clinton (must normalize to 100%) #' Democrats: 5% Trump, 89% Clinton (must normalize to 100%) #' Independents: 36% Trump, 38% Clinton (must normalize to 100%) #' ``` #' ------------- #' #+ setup, include=FALSE knitr::opts_chunk$set(message=FALSE, error=FALSE, warning=FALSE, comment=NA) #' #### Load packages library("rprojroot") root<-has_file(".ROS-Examples-root")$make_fix_file() library("rstanarm") #' ## Simulate fake data n_pid <- c(254, 282, 242) n <- sum(n_pid) pid_names <- c("Republican", "Democrat", "Independent") pid <- rep(pid_names, n_pid) n_vote <- as.list(rep(NA, 3)) n_vote[[1]] <- round(c(0.77, 0.08)*n_pid[1]) n_vote[[2]] <- round(c(0.05, 0.89)*n_pid[2]) n_vote[[3]] <- round(c(0.36, 0.38)*n_pid[3]) vote <- NULL y_bar_cells <- rep(NA, 3) for (j in 1:3){ n_vote[[j]]<- c(n_vote[[j]], n_pid[j] - sum(n_vote[[j]])) vote <- c(vote, rep(c(1, 0, NA), n_vote[[j]])) y_bar_cells[j] <- mean(vote[pid==pid_names[j]], na.rm=TRUE) round(y_bar_cells[j], 3) } poll <- data.frame(vote, pid) # write.csv(poll, root("Poststrat/data","poll.csv"), row.names=FALSE) # poll <- read.csv(root("Poststrat/data","poll.csv")) head(poll) summary(poll) #' ## Simple poststrat poststrat_data <- data.frame(pid=c("Republican", "Democrat", "Independent"), N=c(0.33, 0.36, 0.31)) round(sum(poststrat_data$N * y_bar_cells), 3) #' ## Linear model #' #### Raw estimate round(mean(poll$vote, na.rm=TRUE), 3) #' #### stan_glm fit_1 <- stan_glm(vote ~ factor(pid), data = poll, refresh = 0) print(fit_1, digits=2) #' #### Poststrat using posterior_linpred() epred_1 <- posterior_epred(fit_1, newdata=poststrat_data) poststrat_est_1 <- epred_1 %*% poststrat_data$N/sum(poststrat_data$N) print(c(mean(poststrat_est_1), mad(poststrat_est_1)), digits=2) #' #### Add extra uncertainty n_sim <- nrow(epred_1) poststrat_est_2 <- poststrat_est_1 + rnorm(n_sim, 0, 0.02) print(c(mean(poststrat_est_2), mad(poststrat_est_2)), digits=2) #' ## Logistic model #' #### Fit the regression fit <- stan_glm(vote ~ factor(pid), family=binomial(link="logit"), data = poll, refresh = 0) print(fit, digits=2) #' #### Raw estimate round(mean(poll$vote, na.rm=TRUE), 3) #' #### Poststrat using the predict function X_population <- data.frame(pid=c("Republican", "Democrat", "Independent")) N_population <- c(0.33, 0.36, 0.31) predict_poststrat <- colMeans(posterior_epred(fit, newdata=X_population)) poststrat_est_2 <- sum(N_population*predict_poststrat)/sum(N_population) round(poststrat_est_2, digits=3) #' #### Just to compare, poststrat using the original data predict_a <- predict(fit, type="response") round(mean(predict_a), 3) #' This doesn't work--it just spits back the raw estimate--because it's not using the external population info which is what makes poststrat work. #'