--- title: "Regression and Other Stories: ChildCare" author: "Andrew Gelman, Jennifer Hill, Aki Vehtari" date: "`r format(Sys.Date())`" output: html_document: theme: readable toc: true toc_depth: 2 toc_float: true code_download: true --- Code and figures for Infant Health and Development Program (IHDP) example. See Chapter 20 in Regression and Other Stories. The intervention for low-birth-weight children is described by - Brooks-Gunn, J., Liaw, F. R., and Klebanov, P. K. (1992). Effects of early intervention on cognitive function of low birth weight preterm infants. Journal of Pediatrics 120, 350–359. - Hill, J. L., Brooks-Gunn, J., and Waldfogel, J. (2003). Sustained effects of high participation in an early intervention for low-birth-weight premature infants. Developmental Psychology 39, 730–744. ------------- ```{r setup, include=FALSE} knitr::opts_chunk$set(message=FALSE, error=FALSE, warning=FALSE, comment=NA) # switch this to TRUE to save figures in separate files savefigs <- FALSE ``` #### Load packages ```{r } library("rprojroot") root<-has_file(".ROS-Examples-root")$make_fix_file() library("rstan") library("arm") library("rstanarm") library("survey") source(root("Childcare/library","matching.R")) source(root("Childcare/library","balance.R")) source(root("Childcare/library","estimation.R")) ``` #### Load data ```{r } cc2 <- read.csv(root("Childcare/data","cc2.csv")) head(cc2) ``` #### Figure: displays a scatter plot of birthweight versus test scores at age 3 with observations displayed in different colors. ```{r eval=FALSE, include=FALSE} if (savefigs) pdf(root("Childcare/figs","ppvt.bw.pdf"), height=3.6,width=4.2) ``` ```{r } par(mfrow=c(1,1)) tmp <- lm(ppvtr.36~bw+treat,data=cc2)$coef plot(cc2$bw, cc2$ppvtr.36, xlab="birth weight", ylab="test score at age 3", mgp=c(2,.5,0), main="",type="n",xlim=c(1500,5000),cex.axis=.75,cex.lab=.8,lab=c(3,5,7),xaxt="n") axis(side=1,at=c(2000,3000,4000,5000),cex.axis=.75) points(cc2$bw[cc2$treat==0]+runif(sum(cc2$treat==0),-.5,5), cc2$ppvtr.36[cc2$treat==0], col="darkgrey", pch=20, cex=.3) points(cc2$bw[cc2$treat==1]+runif(sum(cc2$treat==1),-.5,5), cc2$ppvtr.36[cc2$treat==1], pch=20, cex=.3) curve(tmp[1]+tmp[2]*x,add=T,lty=2) curve(tmp[1]+tmp[3]+tmp[2]*x,add=T) ``` ```{r eval=FALSE, include=FALSE} if (savefigs) dev.off() ``` ## Plots #### Figure: Overlap plot of mother's education & age of child (months). ```{r eval=FALSE, include=FALSE} if (savefigs) pdf(root("Childcare/figs","age.educ.freq.AZC.f20.11.pdf"), height=4, width=6) ``` ```{r } par(mfrow=c(1,2)) hist(cc2$educ[cc2$treat==0],xlim=c(0,5),main="",border="darkgrey",breaks=c(.5,1.5,2.5,3.5,4.5),mgp=c(2,.5,0),xlab="mother's education",freq=TRUE) hist(cc2$educ[cc2$treat==1],xlim=c(0,5),xlab="education",breaks=c(.5,1.5,2.5,3.5,4.5),freq=TRUE,add=T) hist(cc2$age[cc2$treat==0],xlim=c(0,110),main="",xlab="age of child (months)",border="darkgrey",breaks=seq(0,110,10),mgp=c(2,.5,0), freq=TRUE) hist(cc2$age[cc2$treat==1],xlim=c(0,110),xlab="",breaks=seq(0,110,10),freq=TRUE, add=T) ``` ```{r eval=FALSE, include=FALSE} if (savefigs) dev.off() ``` ## Subclassification / Stratification #### Figure: Table of stratified treatment effect estimate table. ```{r } edu <- list(cc2$lths, cc2$hs, cc2$ltcoll, cc2$college) # mean difference y.trt <- sapply(edu, FUN=function(x) { tapply(cc2$ppvtr.36, list(cc2$treat, x), mean)[4] }) y.ctrl <- sapply(edu, FUN=function(x) { tapply(cc2$ppvtr.36, list(cc2$treat, x), mean)[3] }) te <- y.trt - y.ctrl # sample sizes n.trt <- sapply(edu, FUN=function(x) { tapply(rep(1, nrow(cc2)), list(cc2$treat, x), sum)[4] }) n.ctrl <- sapply(edu, FUN=function(x) { sum(tapply(rep(1, nrow(cc2)), list(cc2$treat, x), sum)[3]) }) # std errors var.trt <- sapply(edu, FUN=function(x) { tapply(cc2$ppvtr.36, list(cc2$treat, x), var)[4] }) var.ctrl <- sapply(edu, FUN=function(x) { tapply(cc2$ppvtr.36, list(cc2$treat, x), var)[3] }) se <- sqrt(var.trt/n.trt + var.ctrl/n.ctrl) tes <- matrix(c(te, n.trt, n.ctrl, se), nrow=4) rownames(tes) <- c('lths', 'hs', 'ltcoll', 'college') colnames(tes) <- c('te', 'n.trt', 'n.ctrl', 'se') round(tes, 1) ``` ## Estimands and subclassification #### ATE ```{r } (9.3* 1358 + 4.1 * 1820 + 7.9* 837 + 4.6* 366) / (1358+1820+837+366) ``` standard error ```{r } (1.5^2* 1358^2 + 1.9^2 * 1738^2 + 2.4^2* 789^2 + 2.3^2 * 366^2) / ((1358+1820+837+366)^2) ``` #### ATT ```{r } round((9.3* 126 + 4.1 * 82 + 7.9* 48 + 4.6* 34) / (126+82+48+34), 1) ``` standard error ```{r } round((1.5^2* 126^2 + 1.9^2 * 82^2 + 2.4^2* 48^2 + 2.3^2 * 34^2) / ((126+82+48+34)^2),2) ``` ## Propensity Score Matching #### Step 2: Estimating the propensity score these are the no redundancy covariates with and without state covariates ```{r } covs.nr <- c('bwg', 'hispanic', 'black', 'b.marr', 'lths', 'hs', 'ltcoll', 'work.dur', 'prenatal', 'sex', 'first', 'bw', 'preterm', 'momage', 'dayskidh') covs.nr.st <- c(covs.nr, 'st5', 'st9', 'st12', 'st25', 'st36', 'st42', 'st53') ``` pscore estimation formula with original covariates ```{r } ps_spec <- formula(treat ~ bw + bwg + hispanic + black + b.marr + lths + hs + ltcoll + work.dur + prenatal + sex + first + preterm + momage + dayskidh + income) ``` pscore estimation formula with states added ```{r } ps_spec.st <- update(ps_spec, . ~ . + st5 + st9 + st12 + st25 + st36 + st42 + st48 + st53) ``` #### Estimation of pscores using stan_glm ```{r } set.seed(1234) ps_fit_1 <- stan_glm(ps_spec, family=binomial(link='logit'), data=cc2, algorithm='optimizing', refresh=0) ps_fit_1.st <- stan_glm(ps_spec.st, family=binomial(link='logit'), data=cc2, algorithm='optimizing', refresh=0) ``` extracting (logit) pscores from the fit ```{r } pscores <- apply(posterior_linpred(ps_fit_1), 2, mean) pscores.st <- apply(posterior_linpred(ps_fit_1.st), 2, mean) ``` #### Step 3: Matching matching without replacement, original formula ```{r } matches <- matching(z=cc2$treat, score=pscores, replace=FALSE) matched <- cc2[matches$match.ind,] ``` matching with replacement, original formula ```{r } matches.wr <- matching(z=cc2$treat, score=pscores, replace=TRUE) wts.wr <- matches.wr$cnts ``` matching without replacement, pscores that include state indicators ```{r } matches.st <- matching(z=cc2$treat, score=pscores.st, replace=FALSE) matched.st <- cc2[matches.st$match.ind,] ``` matching with replacement, pscores that include state indicators ```{r } matches.st.wr <- matching(z=cc2$treat, score=pscores.st, replace=TRUE) wts.st.wr <- matches.st.wr$cnts ``` #### Step 4: Balance and overlap Balance plots for all covariates specified in STEP 1 (in the book) ```{r } covs <- c('bw', 'preterm', 'dayskidh', 'sex', 'first', 'age', 'black', 'hispanic', 'white', 'b.marr', 'lths', 'hs', 'ltcoll', 'college', 'work.dur', 'prenatal', 'momage') cov_names <- c('birth weight', 'weeks preterm', 'days in hospital', 'male', 'first born', 'age', 'black', 'hispanic', 'white', 'unmarried at birth', 'less than high school', 'high school graduate', 'some college', 'college graduate', 'worked during pregnancy', 'had no prenatal care', 'age at birth') bal_nr <- balance(rawdata=cc2[,covs], treat=cc2$treat, matched=matches$cnts, estimand='ATT') bal_nr.wr <- balance(rawdata=cc2[,covs], treat=cc2$treat, matched=matches.wr$cnts, estimand='ATT') bal_nr.wr.st <- balance(rawdata=cc2[, union(covs, covs.nr.st)], treat=cc2$treat, matched=matches.st.wr$cnts, estimand='ATT') ``` ## Balance plot Labeled cov names, ps_fit_1 MwoR. No state variables included. Balance for all covariates, not just those used in propensity score model ```{r eval=FALSE, include=FALSE} if (savefigs) pdf(root("Childcare/figs","balance.both.azc.pdf"), width=11, height=8.5) ``` ```{r } pts <- bal_nr$diff.means.raw[,4] pts2 <- bal_nr$diff.means.matched[,4] K <- length(pts) idx <- 1:K main <- 'Absolute Standardized Difference in Means' mar <- c(8, 6, 6, 7) par(mar=mar) maxchar <- max(sapply(cov_names, nchar)) min.mar <- par('mar') mar[2] <- max(min.mar[2], trunc(mar[2] + maxchar/10)) + mar[2] + 0.5 par(mar=mar) pts <- rev(pts) pts2 <- rev(pts2) longcovnames <- rev(cov_names) plot(c(pts,pts2), c(idx,idx), bty='n', xlab='', ylab='', xaxt='n', yaxt='n', type='n', main=main, cex.main=1.2) abline(v=0, lty=2) points(pts, idx, cex=1) points(pts2, idx, pch=19, cex=1) axis(3) axis(2, at=1:K, labels=longcovnames[1:K], las=2, hadj=1, lty=0, cex.axis=1.2) ``` ```{r eval=FALSE, include=FALSE} if (savefigs) dev.off() ``` #### Step 4: Diagnostics for balance and overlap Separate balance plots for continuous and binary variables. Labelled cov names, ps_fit_1 MWR. Manual plot code based on the code in balance.R. #### Figure ```{r eval=FALSE, include=FALSE} if (savefigs) pdf(root("Childcare/figs","balance.cont.binary.AZC.pdf"), width=10, height=6) ``` ```{r } par(mfrow=c(1,2)) mar1 <- c(5, 4, 6, 2) mar2 <- c(5, 3, 6, 4) pts <- bal_nr.wr$diff.means.raw[bal_nr.wr$binary==TRUE,4] pts2 <- bal_nr.wr$diff.means.matched[bal_nr.wr$binary==TRUE,4] K <- length(pts) idx <- 1:K main <- 'Absolute Difference in Means' longcovnames <- cov_names[bal_nr.wr$binary==TRUE] mar <- mar1 par(mar=mar) maxchar <- max(sapply(longcovnames, nchar)) min.mar <- par('mar') mar[2] <- max(min.mar[2], trunc(mar[2] + maxchar/10)) + mar[2] + 0.5 par(mar=mar) pts <- rev(pts) pts2 <- rev(pts2) longcovnames <- rev(longcovnames) plot(c(pts,pts2), c(idx,idx), bty='n', xlab='', ylab='', xaxt='n', yaxt='n', type='n', main=main, cex.main=1.2, xlim=c(0,.55)) abline(v=0, lty=2) points(pts, idx, cex=1) points(pts2, idx, pch=19, cex=1) axis(3, at=seq(0,.5,.1), xpd=TRUE) axis(2, at=1:K, labels=longcovnames[1:K], las=2, hadj=1, lty=0, cex.axis=1) pts <- bal_nr.wr$diff.means.raw[bal_nr.wr$binary==FALSE,4] pts2 <- bal_nr.wr$diff.means.matched[bal_nr.wr$binary==FALSE,4] # AZC: hack to fix spacing of binary covariates against x axis # the spacing of how spaced apart the ticks are changes as the number of covariates change. It's frustratingly hard, maybe impossible, to get the spacing to match between the continuous and binary plots with different number of covariates in each, so, I'll add fake data that won't show up pts <- c(pts, rep(NA, 7)) pts2 <- c(pts2, rep(NA, 7)) K <- length(pts) idx <- 1:K main <- 'Absolute Standardized Difference in Means' longcovnames <- cov_names[bal_nr.wr$binary==FALSE] # add extra names to match above longcovnames <- c(longcovnames, rep('', 7)) mar <- mar2 par(mar=mar) maxchar <- max(sapply(longcovnames, nchar)) min.mar <- par('mar') mar[2] <- max(min.mar[2], trunc(mar[2] + maxchar/10)) + mar[2] + 0.5 par(mar=mar) pts <- rev(pts) pts2 <- rev(pts2) longcovnames <- rev(longcovnames) plot(c(pts,pts2), c(idx,idx), bty='n', xlab='', ylab='', xaxt='n', yaxt='n', type='n', main=main, cex.main=1.2) segments(x0=0, y0=13, x1=0, y1=7.5, lty=2) points(pts, idx, cex=1) points(pts2, idx, pch=19, cex=1) axis(3) axis(2, at=8:12, labels=longcovnames[8:12], las=2, hadj=1, lty=0, cex.axis=1) ``` ```{r eval=FALSE, include=FALSE} if (savefigs) dev.off() ``` #### Figure: Overlap of propensity scores before/after matching with replacement. Pscores from original model. ```{r eval=FALSE, include=FALSE} if (savefigs) pdf(root("Childcare/figs","ps.overlap.dens.AZC.pdf"), width=11, height=8.5) ``` ```{r } par(mfrow=c(1,2), cex.main=1.3, cex.lab=1.3) # Plot the overlapping histograms for pscores before matching, density par(mar=c(16,8,2,2)) hist(pscores[cc2$treat==0], xlim=c(-20,5), ylim=c(0,.28), main="before matching", border="darkgrey", mgp=c(2,.5,0), xlab="logit propensity scores", freq=FALSE) hist(pscores[cc2$treat==1], freq=FALSE, add=TRUE) # Plot the overlapping histograms for pscores after matching, frequency par(mar=c(16,3,2,8)) hist(pscores[cc2[matches.wr$match.ind, 'treat']==0], xlim=c(-20,6), ylim=c(0,.28), main="after matching", border="darkgrey", mgp=c(2,.5,0), xlab="logit propensity scores", freq=FALSE) hist(pscores[cc2[matches.wr$match.ind, 'treat']==1], freq=FALSE, add=TRUE) ``` ```{r eval=FALSE, include=FALSE} if (savefigs) dev.off() ``` how many pscores[cc2$treat==0] left out of plot? ```{r } sum(pscores[cc2$treat==0] < -20) ``` pscore matching check ```{r } sum(pscores[cc2$treat==1] > max(pscores[cc2$treat==0])) ``` #### Figure: Example: good overlap, bad pscore. ```{r } set.seed(20) ps3.mod <- glm(treat ~ unemp.rt, data=cc2,family=binomial) pscores3 <- predict(ps3.mod, type="link") ``` ```{r eval=FALSE, include=FALSE} if (savefigs) pdf(root("Childcare/figs","bad.pscore.overlap.AZC.pdf"), width=11, height=8.5) ``` ```{r } par(mar=c(8,3,4,3), cex=1.4, cex.lab=1.2) # Plot the overlapping histograms for pscore3, density hist(pscores3[cc2$treat==0], xlim=range(pscores3), ylim=c(0,8), main="", border="darkgrey", mgp=c(2,.5,0), xlab="logit propensity scores",freq=FALSE) hist(pscores3[cc2$treat==1], freq=FALSE, add=TRUE) ``` ```{r eval=FALSE, include=FALSE} if (savefigs) dev.off() ``` #### Step 5: Estimating a treatment effect using the restructured data. ```{r } te_spec_nr <- update(ps_spec, ppvtr.36 ~ treat + .) ``` treatment effect without replacement ```{r } set.seed(20) reg_ps <- stan_glm(te_spec_nr, data=cc2[matches$match.ind,], algorithm='optimizing') ``` treatment effect with replacement ```{r } set.seed(20) reg_ps.wr <- stan_glm(te_spec_nr, data=cc2, weight=matches.wr$cnts, algorithm='optimizing') ps_fit_1_design <- svydesign(ids=~1, weights=matches.wr$cnts, data=cc2) reg_ps.wr_svy <- svyglm(te_spec_nr, design=ps_fit_1_design, data=cc2) summary(reg_ps)['treat', 1:2] summary(reg_ps.wr)['treat', 1:2] summary(reg_ps.wr_svy)$coef['treat', 1:2] ``` Geographic information, covs_nr.st ```{r } te_spec_nr.st <- update(ps_spec.st, ppvtr.36 ~ treat + . + st5 + st9 + st12 + st25 + st36 + st42 + st48 + st53) ``` treatment effect without replacement ```{r } set.seed(20) reg_ps.st <- stan_glm(te_spec_nr.st, data=cc2[matches.st$match.ind,], algorithm='optimizing') ``` treatment effect with replacement ```{r } set.seed(20) reg_ps.st.wr <- stan_glm(te_spec_nr.st, data=cc2, weight=matches.st.wr$cnts, algorithm='optimizing') ps_fit_1.st_design <- svydesign(ids=~1, weights=matches.st.wr$cnts, data=cc2) reg_ps.st.wr_svy <- svyglm(te_spec_nr.st, design=ps_fit_1.st_design, data=cc2) summary(reg_ps.st)['treat', 1:2] summary(reg_ps.st.wr)['treat', 1:2] summary(reg_ps.st.wr_svy)$coef['treat', 1:2] ``` standard regression estimate of treatment effect ```{r } set.seed(20) reg_te <- stan_glm(te_spec_nr, data=cc2, algorithm='optimizing') ``` standard regression estimate of treatment effect with state ```{r } set.seed(20) reg_te.st <- stan_glm(te_spec_nr.st, data=cc2, algorithm='optimizing') summary(reg_te)['treat', 1:2] summary(reg_te.st)['treat', 1:2] ``` ## Improved pscore Model Improved pscore model using an interaction or squared term. transformed variables ```{r } cc2$bwT = (cc2$bw-1500)^2 cc2$dayskidT = log(cc2$dayskidh+1) cc2$pretermT = (cc2$preterm+8)^2 cc2$momageT = (cc2$momage^2) ``` New ps-spec from psFitR(21) ```{r } ps_spec2 <- formula(treat ~ bwg*as.factor(educ) + as.factor(ethnic)*b.marr + work.dur + prenatal + preterm + momage + sex + first + bw + dayskidT + pretermT + momageT + black*(bw + preterm + dayskidT) + b.marr*(bw + preterm + dayskidT) + bw*income) ps_spec_i21 <- formula(treat~bw+preterm+dayskidh+sex+hispanic+b.marr+lths+hs+ltcoll+work.dur+prenatal+momage+income+bwT+pretermT+income+black:dayskidT+b.marr:bw+b.marr:preterm+b.marr:dayskidT+bw:income) ps_spec2 <- ps_spec_i21 set.seed(8) ps_fit_2 <- stan_glm(ps_spec2, family=binomial(link="logit"), data=cc2, algorithm='optimizing') pscores_2 <- apply(posterior_linpred(ps_fit_2, type='link'), 2, mean) matches2 <- matching(z=cc2$treat, score=pscores_2, replace=FALSE) matched2 <- cc2[matches2$match.ind,] matches2_wr <- matching(z=cc2$treat, score=pscores_2, replace=TRUE) matched2_wr <- cc2[matches2_wr$match.ind,] bal_2 <- balance(rawdata=cc2[,covs], cc2$treat, matched=matches2$cnts, estimand='ATT') bal_2.wr <- balance(rawdata=cc2[,covs], cc2$treat, matched=matches2_wr$cnts, estimand='ATT') ``` look at some balance plots ```{r } par(mfrow=c(1,1)) plot.balance(bal_nr, which.covs='cont', main='ps_fit_1') plot.balance(bal_2.wr, which.covs='cont', main='ps_fit_2') plot.balance(bal_nr, which.covs='binary', main='ps_fit_1') plot.balance(bal_2.wr, which.covs='binary', main='ps_fit_2') ``` #### Figure: side by side binary/continuous ```{r } plot.balance(bal_2.wr, longcovnames=cov_names) ``` #### Treatment effect ```{r } te_spec2 <- formula(ppvtr.36 ~ treat + hispanic + black + b.marr + lths + hs + ltcoll + work.dur + prenatal + momage + sex + first + preterm + dayskidh + bw + income) te_spec2 <- te_spec_nr set.seed(8) # MwoR reg_ps2 <- stan_glm(te_spec2, data=matched2, algorithm='optimizing') # MwR reg_ps2.design <- svydesign(ids=~1, weights=matches2_wr$cnts, data=cc2) reg_ps2.wr <- svyglm(te_spec2, design=reg_ps2.design, data=cc2) summary(reg_ps2)['treat', 1:2] summary(reg_ps2.wr)$coef['treat', 1:2] ``` #### Geographic information using ps_spec2 ```{r } ps_spec2.st <- update(ps_spec2, . ~ . + st5 + st9 + st12 + st25 + st36 + st42 + st48 + st53) set.seed(8) ps_fit_2.st <- stan_glm(ps_spec2.st, family=binomial(link='logit'), data=cc2, algorithm='optimizing') pscores_2.st <- apply(posterior_linpred(ps_fit_2.st, type='link'), 2, mean) matches2.st <- matching(z=cc2$treat, score=pscores_2.st, replace=FALSE) matched2.st <- cc2[matches2.st$match.ind,] matches2.st_wr <- matching(z=cc2$treat, score=pscores_2.st, replace=TRUE) ``` #### Treatment effect estimate ```{r } te_spec2.st <- update(te_spec2, . ~ . + st5 + st9 + st12 + st25 + st36 + st42 + st48 + st53) set.seed(8) # MwoR reg_ps2.st <- stan_glm(te_spec2.st, data=matched2.st, algorithm='optimizing') reg_ps2.st_design <- svydesign(ids=~1, weights=matches2.st_wr$cnts, data=cc2) reg_ps2.st.wr <- svyglm(te_spec2.st, design=reg_ps2.st_design, data=cc2) summary(reg_ps2.st)['treat', 1:2] summary(reg_ps2.st.wr)$coef['treat', 1:2] ``` #### IPTW (including state indicators) ```{r } wt.iptw <- inv.logit(pscores) / (1 - inv.logit(pscores)) wt.iptw[cc2$treat==0] <- wt.iptw[cc2$treat==0] * (sum(wt.iptw[cc2$treat==0]) / sum(cc2$treat==0)) wt.iptw[cc2$treat==1] <- 1 set.seed(20) ps_fit_iptw_design <- svydesign(ids=~1, weights=wt.iptw, data=cc2) reg_ps.iptw <- svyglm(te_spec_nr, design=ps_fit_iptw_design, data=cc2) summary(reg_ps.iptw)$coef['treat', 1:2] ``` ## Beyond balance in means Table of ratio of standard deviations across treatment & control groups for unmatched, MWOR, MWR. ```{r } cont_vars <- c('bw', 'preterm', 'dayskidh', 'momage', 'income', 'age') sds.um <- sapply(cont_vars, function(x){ tapply(cc2[,x], cc2$treat, sd) }) sds.mwor <- sapply(cont_vars, function(x){ tapply(matched[,x], matched$treat, sd) }) mwr.ind <- rep(cc2$row.names, times=matches.wr$cnts) matched.wr <- cc2[mwr.ind, ] sds.mwr <- sapply(cont_vars, function(x){ tapply(matched.wr[,x], matched.wr$treat, sd) }) ```