--- title: "Regression and Other Stories: Imputation" author: "Andrew Gelman, Jennifer Hill, Aki Vehtari" date: "`r format(Sys.Date())`" output: html_document: theme: readable toc: true toc_depth: 2 toc_float: true code_download: true --- Regression-based imputation for the Social Indicators Survey. See Chapter 17 in Regression and Other Stories. ------------- ```{r setup, include=FALSE} knitr::opts_chunk$set(message=FALSE, error=FALSE, warning=FALSE, comment=NA) # switch this to TRUE to save figures in separate files savefigs <- FALSE ``` #### Load packages ```{r } library("rprojroot") root<-has_file(".ROS-Examples-root")$make_fix_file() library("rstanarm") ``` #### Load data ```{r } SIS <- read.csv(root("Imputation/data","SIS.csv")) head(SIS) summary(SIS) ``` #### Imputation helper functions
Create a completed data vector using imputations ```{r } impute <- function(a, a_impute) { ifelse(is.na(a), a_impute, a) } ``` Top code function ```{r } topcode <- function(a, top) { ifelse(a > top, top, a) } ``` ## Deterministic imputation #### Impute 0 earnings using the logical rule (if worked 0 months and 0 hrs/wk) ```{r } SIS$earnings_top <- topcode(SIS$earnings, 100) SIS$earnings_top[SIS$workhrs_top==0 & SIS$workmos==0] <- 0 ``` #### Create a dataset with all predictor variables ```{r } n <- nrow(SIS) SIS_predictors <- SIS[,c("male","over65","white","immig","educ_r","workmos", "workhrs_top","any_ssi","any_welfare","any_charity")] ``` #### Impute subset of earnings that are nonzero: linear scale ```{r } fit_imp_1 <- stan_glm( earnings ~ male + over65 + white + immig + educ_r + workmos + workhrs_top + any_ssi + any_welfare + any_charity, data = SIS, subset = earnings > 0, refresh = 0 ) print(fit_imp_1) ``` point predictions ```{r } pred_1 <- colMeans(posterior_linpred(fit_imp_1, newdata = SIS_predictors)) SIS$earnings_imp_1 <- impute(SIS$earnings, pred_1) ``` #### Impute subset of earnings that are nonzero: square root scale and topcoding ```{r } fit_imp_2 <- stan_glm( sqrt(earnings_top) ~ male + over65 + white + immig + educ_r + workmos + workhrs_top + any_ssi + any_welfare + any_charity, data = SIS, subset = earnings > 0, refresh = 0 ) print(fit_imp_2) ``` point predictions ```{r } pred_2_sqrt <- colMeans(posterior_linpred(fit_imp_2, newdata = SIS_predictors)) pred_2 <- topcode(pred_2_sqrt^2, 100) SIS$earnings_imp_2 <- impute(SIS$earnings_top, pred_2) ``` ## One random imputation #### Linear scale (use fitted model fit_imp_1) ```{r } pred_3 <- posterior_predict(fit_imp_1, newdata = SIS_predictors, draws = 1) SIS$earnings_imp_3 <- impute(SIS$earnings, pred_3) ``` #### Square root scale and topcoding (use fitted model fit_imp_2) ```{r } pred_4_sqrt <- posterior_predict(fit_imp_2, newdata = SIS_predictors, draws = 1) pred_4 <- topcode(pred_4_sqrt^2, 100) SIS$earnings_imp_4 <- impute(SIS$earnings_top, pred_4) ``` ### 3. Histograms and scatterplots of data and imputations ```{r eval=FALSE, include=FALSE} if (savefigs) pdf(root("Imputation/figs","impute_hist2.pdf"), height=4, width=5.5) ``` ```{r } par(mar=c(3,3,1,1), mgp=c(1.7,.5,0), tck=-.01) hist(SIS$earnings_top[SIS$earnings>0], breaks=seq(0,100,10), xlab="earnings", ylab="", main="Observed earnings (excluding 0's)") ``` ```{r eval=FALSE, include=FALSE} if (savefigs) dev.off() ``` ```{r eval=FALSE, include=FALSE} if (savefigs) pdf(root("Imputation/figs","impute_hist3.pdf"), height=4, width=5.5) ``` ```{r } par(mar=c(3,3,1,1), mgp=c(1.7,.5,0), tck=-.01) hist(SIS$earnings_imp_2[is.na(SIS$earnings)], breaks=seq(0,100,10), xlab="earnings", ylab="", ylim=c(0,48), main="Deterministic imputation of earnings") ``` ```{r eval=FALSE, include=FALSE} if (savefigs) dev.off() ``` ```{r eval=FALSE, include=FALSE} if (savefigs) pdf(root("Imputation/figs","impute_hist4.pdf"), height=4, width=5.5) ``` ```{r } par(mar=c(3,3,1,1), mgp=c(1.7,.5,0), tck=-.01) hist(SIS$earnings_imp_4[is.na(SIS$earnings)], breaks=seq(0,100,10), xlab="earnings", ylab="", ylim=c(0,48), main="Random imputation of earnings") ``` ```{r eval=FALSE, include=FALSE} if (savefigs) dev.off() ``` ```{r eval=FALSE, include=FALSE} if (savefigs) pdf(root("Imputation/figs","impute_scat_1.pdf"), height=4, width=5) ``` ```{r } par(mar=c(3,3,2,1), mgp=c(1.7,.5,0), tck=-.01) plot(range(SIS$earnings_imp_2[is.na(SIS$earnings)]), c(0,100), xlab="Regression prediction", ylab="Earnings", main="Deterministic imputation", type="n", bty="l") points(SIS$earnings_imp_2[is.na(SIS$earnings)], SIS$earnings_imp_2[is.na(SIS$earnings)], pch=19, cex=.5) points(pred_2, SIS$earnings_top, pch=20, col="darkgray", cex=.5) ``` ```{r eval=FALSE, include=FALSE} if (savefigs) dev.off() ``` ```{r eval=FALSE, include=FALSE} if (savefigs) pdf(root("Imputation/figs","impute_scat_2.pdf"), height=4, width=5) ``` ```{r } par(mar=c(3,3,2,1), mgp=c(1.7,.5,0), tck=-.01) plot(range(SIS$earnings_imp_2[is.na(SIS$earnings)]), c(0,100), xlab="Regression prediction", ylab="Earnings", main="Random imputation", type="n", bty="l") points(SIS$earnings_imp_2[is.na(SIS$earnings)], SIS$earnings_imp_4[is.na(SIS$earnings)], pch=19, cex=.5) points(pred_2, SIS$earnings_top, pch=20, col="darkgray", cex=.5) ``` ```{r eval=FALSE, include=FALSE} if (savefigs) dev.off() ``` ## Two-stage imputation model #### Fit the 2 models ```{r } fit_positive <- stan_glm((earnings>0) ~ male + over65 + white + immig + educ_r + any_ssi + any_welfare + any_charity, data=SIS, family=binomial(link=logit), refresh = 0) print(fit_positive) fit_positive_sqrt <- stan_glm(sqrt(earnings_top) ~ male + over65 + white + immig + educ_r + any_ssi + any_welfare + any_charity, data=SIS, subset=earnings>0, refresh = 0) # (same as fit_imp_2 from above) print(fit_positive_sqrt) ``` #### Predict the sign and then the earnings (if positive) ```{r } # one random imp pred_sign <- posterior_predict(fit_positive, newdata = SIS_predictors, draws = 1) # one random imp pred_pos_sqrt <- posterior_predict(fit_positive_sqrt, newdata = SIS_predictors, draws = 1) pred_pos <- topcode(pred_pos_sqrt^2, 100) SIS$earnings_imp <- impute(SIS$earnings, pred_sign*pred_pos) ``` ## Iterative regression imputation #### Starting values ```{r } random_imp <- function (a){ missing <- is.na(a) n_missing <- sum(missing) a_obs <- a[!missing] imputed <- a imputed[missing] <- sample(a_obs, n_missing) imputed } SIS$interest_imp <- random_imp(SIS$interest) SIS$earnings_imp <- random_imp(SIS$earnings) ``` #### Simplest regression imputation ```{r } n_loop <- 10 for (s in 1:n_loop){ fit <- stan_glm(earnings ~ interest_imp + male + over65 + white + immig + educ_r + workmos + workhrs_top + any_ssi + any_welfare + any_charity, data=SIS, refresh = 0) SIS_predictors <- SIS[,c("male","over65","white","immig","educ_r","workmos", "workhrs_top","any_ssi","any_welfare","any_charity", "interest_imp", "earnings_imp")] pred1 <- posterior_predict(fit, newdata = SIS_predictors, draws = 1) SIS$earnings_imp <- impute(SIS$earnings, pred1) fit <- stan_glm(interest ~ earnings_imp + male + over65 + white + immig + educ_r + workmos + workhrs_top + any_ssi + any_welfare + any_charity, data=SIS, refresh = 0) SIS_predictors <- SIS[,c("male","over65","white","immig","educ_r","workmos", "workhrs_top","any_ssi","any_welfare","any_charity", "interest_imp", "earnings_imp")] pred2 <- posterior_predict(fit, newdata = SIS_predictors, draws = 1) SIS$interest_imp <- impute(SIS$interest, pred2) } ```