--- title: "Regression and Other Stories: Imputation" author: "Andrew Gelman, Jennifer Hill, Aki Vehtari" date: "`r format(Sys.Date())`" output: html_document: theme: readable toc: true toc_depth: 2 toc_float: true code_download: true --- Regression-based imputation for the Social Indicators Survey. See Chapter 17 in Regression and Other Stories.
dplyr + ggplot version. ------------- ```{r setup, include=FALSE} knitr::opts_chunk$set(message=FALSE, error=FALSE, warning=FALSE, comment=NA) # switch this to TRUE to save figures in separate files savefigs <- FALSE ``` **Load packages** ```{r } library("rprojroot") root<-has_file(".ROS-Examples-root")$make_fix_file() library("rstanarm") library("dplyr") library("ggplot2") theme_set(bayesplot::theme_default(base_family = "sans")) ``` **Load data** ```{r } SIS <- read.csv(root("Imputation/data","SIS.csv")) head(SIS) summary(SIS) ``` **Imputation helper functions**
Create a completed data vector using imputations ```{r } impute <- function(a, a_impute) { ifelse(is.na(a), a_impute, a) } ``` Top code function ```{r } topcode <- function(a, top) { ifelse(a>top, top, a) } ``` se missing codes to NA ```{r } na_fix <- function(a) { ifelse(a<0 | a==999999, NA, a) } is_any <- function(a) { any_a <- ifelse(a>0, 1, 0) any_a[is.na(a)] <- 0 any_a } ``` ### 1. Deterministic imputation **Impute 0 earnings using the logical rule (if worked 0 months and 0 hrs/wk)** ```{r } SIS <- mutate(SIS, zero_earnings = (workhrs_top==0 & workmos==0), earnings_top = if_else(zero_earnings, 0, topcode(earnings, 100))) ``` **Create a little dataset with all our redefined variables** ```{r } SIS <- select(SIS, earnings, earnings_top, interest, male, over65, white, immig, educ_r, workmos, workhrs_top, any_ssi, any_welfare, any_charity) ``` **Impute subset of earnings that are nonzero: linear scale** ```{r } fit_imp_1 <- stan_glm( earnings ~ male + over65 + white + immig + educ_r + workmos + workhrs_top + any_ssi + any_welfare + any_charity, data = SIS, subset = earnings > 0, refresh = 0 ) print(fit_imp_1) SIS_predictors <- select(SIS, -starts_with("earnings")) pred_1 <- colMeans(posterior_linpred(fit_imp_1, newdata = SIS_predictors)) SIS <- mutate(SIS, earnings_imp_1 = impute(earnings, pred_1)) ``` **Impute subset of earnings that are nonzero: square root scale and topcoding** ```{r } fit_imp_2 <- stan_glm( sqrt(earnings_top) ~ male + over65 + white + immig + educ_r + workmos + workhrs_top + any_ssi + any_welfare + any_charity, data = SIS, subset = earnings > 0, refresh = 0 ) print(fit_imp_2) ``` point predictions ```{r } pred_2_sqrt <- colMeans(posterior_linpred(fit_imp_2, newdata = SIS_predictors)) pred_2 <- topcode(pred_2_sqrt^2, 100) SIS <- mutate(SIS, pred_2 = pred_2, earnings_imp_2 = impute(earnings_top, pred_2)) ``` ### 2. Random imputation **Linear scale (use fitted model fit_imp_1)** ```{r } pred_3 <- posterior_predict(fit_imp_1, newdata = SIS_predictors, draws = 1) SIS <- mutate(SIS, earnings_imp_3 = impute(earnings, pred_3)) ``` **Square root scale and topcoding (use fitted model fit_imp_2)** ```{r } pred_4_sqrt <- posterior_predict(fit_imp_2, newdata = SIS_predictors, draws = 1) pred_4 <- topcode(pred_4_sqrt^2, 100) SIS <- mutate(SIS, earnings_imp_4 = impute(earnings_top, pred_4)) ``` ### 3. Histograms and scatterplots of data and imputations Observed earnings (excluding 0's) ```{r } qplot(earnings_top, data=filter(SIS, !is.na(earnings)), breaks=seq(0,100,10), fill=I("white"), col=I("black")) + xlab("earnings") + ggtitle("Observed earnings (excluding 0's)") ``` ```{r eval=FALSE, include=FALSE} if (savefigs) ggsave(root("Imputation/figs","impute_hist2_gg.pdf"), width = 5, height = 4) qplot(earnings_imp_2, data = filter(SIS, !is.na(earnings)), breaks=seq(0,100,10), fill=I("white"), col=I("black")) + xlab("earnings") + ggtitle("Deterministic imputation of earnings") ``` ```{r eval=FALSE, include=FALSE} if (savefigs) ggsave(root("Imputation/figs","impute_hist3_gg.pdf"), width = 5, height = 4) ``` Random imputation of earnings ```{r } qplot(earnings_imp_4, data = filter(SIS, !is.na(earnings)), breaks=seq(0,100,10), fill=I("white"), col=I("black")) + xlab("earnings") + ggtitle("Random imputation of earnings") ``` ```{r eval=FALSE, include=FALSE} if (savefigs) ggsave(root("Imputation/figs","impute_hist4_gg.pdf"), width = 5, height = 4) ``` Deterministic imputation scatter plot ```{r } ggplot() + geom_point(aes(x=earnings_imp_2, y=earnings_imp_2), shape=19, data = filter(SIS, !is.na(earnings))) + geom_point(aes(x=pred_2, y=earnings_top), shape=20, color="darkgrey", data = filter(SIS, !is.na(earnings))) + xlab("Regression prediction") + ylab("Earnings") + ggtitle("Deterministic imputation") ``` ```{r eval=FALSE, include=FALSE} if (savefigs) ggsave(root("Imputation/figs","impute_scat1_gg.pdf"), width = 5, height = 4) ``` Random imputation scatter plot ```{r } ggplot() + geom_point(aes(x=earnings_imp_2, y=earnings_imp_4), shape=19, data = filter(SIS, !is.na(earnings))) + geom_point(aes(x=pred_2, y=earnings_top), shape=20, color="darkgrey", data = filter(SIS, !is.na(earnings))) + xlab("Regression prediction") + ylab("Earnings") + ggtitle("Random imputation") ``` ```{r eval=FALSE, include=FALSE} if (savefigs) ggsave(root("Imputation/figs","impute_scat2_gg.pdf"), width = 5, height = 4) ``` ### 4. Two-stage imputation model **Fit the 2 models** ```{r } fit_positive <- stan_glm((earnings>0) ~ male + over65 + white + immig + educ_r + any_ssi + any_welfare + any_charity, data=SIS, family=binomial(link=logit), refresh=0) print(fit_positive, digits=2) # 2nd model was alraedy fitted before fit_positive_sqrt <- fit_imp_2 print(fit_positive_sqrt) ``` **Predict the sign and then the earnings (if positive)** ```{r } pred_pos <- posterior_predict(fit_positive, newdata = SIS_predictors, draws = 1) pred_ifpos_sqrt <- posterior_predict(fit_positive_sqrt, newdata = SIS_predictors, draws = 1) pred_ifpos <- topcode(pred_ifpos_sqrt^2, 100) SIS <- mutate(SIS, earnings_imp = impute(earnings, pred_pos*pred_ifpos)) ``` ### 5. Iterative regression imputation **Starting values** ```{r } random_imp <- function (a){ missing <- is.na(a) n_missing <- sum(missing) a_obs <- a[!missing] imputed <- a imputed[missing] <- sample(a_obs, n_missing) imputed } SIS <- mutate(SIS, interest_imp = random_imp(interest), earnings_imp = random_imp(earnings)) ``` **Simplest regression imputation** ```{r } n_sims <- 10 for (s in 1:n_sims) { # Predict earnings output <- capture.output( fit <- stan_glm(earnings ~ interest_imp + male + over65 + white + immig + educ_r + workmos + workhrs_top + any_ssi + any_welfare + any_charity, data = SIS, cores = 1, open_progress = FALSE, refresh=0)) pred1 <- posterior_predict(fit, draws = 1) # Impute earnings SIS <- mutate(SIS, earnings_imp = impute(earnings, pred1)) # Predict interest output <- capture.output( fit <- stan_glm(interest ~ earnings_imp + male + over65 + white + immig + educ_r + workmos + workhrs_top + any_ssi + any_welfare + any_charity, data = SIS, cores = 1, open_progress = FALSE, refresh=0)) pred2 <- posterior_predict(fit, draws = 1) # Impute interest SIS <- mutate(SIS, interest_imp = impute(interest, pred2)) } ```