#' --- #' title: "Regression and Other Stories: ChildCare" #' author: "Andrew Gelman, Jennifer Hill, Aki Vehtari" #' date: "`r format(Sys.Date())`" #' output: #' html_document: #' theme: readable #' toc: true #' toc_depth: 2 #' toc_float: true #' code_download: true #' --- #' Code and figures for Infant Health and Development Program (IHDP) #' example. See Chapter 20 in Regression and Other Stories. #' #' The intervention for low-birth-weight children is described by #' #' - Brooks-Gunn, J., Liaw, F. R., and Klebanov, P. K. (1992). Effects #' of early intervention on cognitive function of low birth weight #' preterm infants. Journal of Pediatrics 120, 350–359. #' - Hill, J. L., Brooks-Gunn, J., and Waldfogel, J. (2003). Sustained #' effects of high participation in an early intervention for #' low-birth-weight premature infants. Developmental Psychology 39, #' 730–744. #' #' ------------- #' #+ setup, include=FALSE knitr::opts_chunk$set(message=FALSE, error=FALSE, warning=FALSE, comment=NA) # switch this to TRUE to save figures in separate files savefigs <- FALSE #' #### Load packages library("rprojroot") root<-has_file(".ROS-Examples-root")$make_fix_file() library("rstan") library("arm") library("rstanarm") library("survey") source(root("Childcare/library","matching.R")) source(root("Childcare/library","balance.R")) source(root("Childcare/library","estimation.R")) #' #### Load data cc2 <- read.csv(root("Childcare/data","cc2.csv")) head(cc2) #' #### Figure: displays a scatter plot of birthweight versus test scores #' at age 3 with observations displayed in different colors. #+ eval=FALSE, include=FALSE if (savefigs) pdf(root("Childcare/figs","ppvt.bw.pdf"), height=3.6,width=4.2) #+ par(mfrow=c(1,1)) tmp <- lm(ppvtr.36~bw+treat,data=cc2)$coef plot(cc2$bw, cc2$ppvtr.36, xlab="birth weight", ylab="test score at age 3", mgp=c(2,.5,0), main="",type="n",xlim=c(1500,5000),cex.axis=.75,cex.lab=.8,lab=c(3,5,7),xaxt="n") axis(side=1,at=c(2000,3000,4000,5000),cex.axis=.75) points(cc2$bw[cc2$treat==0]+runif(sum(cc2$treat==0),-.5,5), cc2$ppvtr.36[cc2$treat==0], col="darkgrey", pch=20, cex=.3) points(cc2$bw[cc2$treat==1]+runif(sum(cc2$treat==1),-.5,5), cc2$ppvtr.36[cc2$treat==1], pch=20, cex=.3) curve(tmp[1]+tmp[2]*x,add=T,lty=2) curve(tmp[1]+tmp[3]+tmp[2]*x,add=T) #+ eval=FALSE, include=FALSE if (savefigs) dev.off() #' ## Plots #' #' #### Figure: Overlap plot of mother's education & age of child (months). #+ eval=FALSE, include=FALSE if (savefigs) pdf(root("Childcare/figs","age.educ.freq.AZC.f20.11.pdf"), height=4, width=6) #+ par(mfrow=c(1,2)) hist(cc2$educ[cc2$treat==0],xlim=c(0,5),main="",border="darkgrey",breaks=c(.5,1.5,2.5,3.5,4.5),mgp=c(2,.5,0),xlab="mother's education",freq=TRUE) hist(cc2$educ[cc2$treat==1],xlim=c(0,5),xlab="education",breaks=c(.5,1.5,2.5,3.5,4.5),freq=TRUE,add=T) hist(cc2$age[cc2$treat==0],xlim=c(0,110),main="",xlab="age of child (months)",border="darkgrey",breaks=seq(0,110,10),mgp=c(2,.5,0), freq=TRUE) hist(cc2$age[cc2$treat==1],xlim=c(0,110),xlab="",breaks=seq(0,110,10),freq=TRUE, add=T) #+ eval=FALSE, include=FALSE if (savefigs) dev.off() #' ## Subclassification / Stratification #' #### Figure: Table of stratified treatment effect estimate table. edu <- list(cc2$lths, cc2$hs, cc2$ltcoll, cc2$college) # mean difference y.trt <- sapply(edu, FUN=function(x) { tapply(cc2$ppvtr.36, list(cc2$treat, x), mean)[4] }) y.ctrl <- sapply(edu, FUN=function(x) { tapply(cc2$ppvtr.36, list(cc2$treat, x), mean)[3] }) te <- y.trt - y.ctrl # sample sizes n.trt <- sapply(edu, FUN=function(x) { tapply(rep(1, nrow(cc2)), list(cc2$treat, x), sum)[4] }) n.ctrl <- sapply(edu, FUN=function(x) { sum(tapply(rep(1, nrow(cc2)), list(cc2$treat, x), sum)[3]) }) # std errors var.trt <- sapply(edu, FUN=function(x) { tapply(cc2$ppvtr.36, list(cc2$treat, x), var)[4] }) var.ctrl <- sapply(edu, FUN=function(x) { tapply(cc2$ppvtr.36, list(cc2$treat, x), var)[3] }) se <- sqrt(var.trt/n.trt + var.ctrl/n.ctrl) tes <- matrix(c(te, n.trt, n.ctrl, se), nrow=4) rownames(tes) <- c('lths', 'hs', 'ltcoll', 'college') colnames(tes) <- c('te', 'n.trt', 'n.ctrl', 'se') round(tes, 1) #' ## Estimands and subclassification #' #' #### ATE (9.3* 1358 + 4.1 * 1820 + 7.9* 837 + 4.6* 366) / (1358+1820+837+366) #' standard error (1.5^2* 1358^2 + 1.9^2 * 1738^2 + 2.4^2* 789^2 + 2.3^2 * 366^2) / ((1358+1820+837+366)^2) #' #### ATT round((9.3* 126 + 4.1 * 82 + 7.9* 48 + 4.6* 34) / (126+82+48+34), 1) #' standard error round((1.5^2* 126^2 + 1.9^2 * 82^2 + 2.4^2* 48^2 + 2.3^2 * 34^2) / ((126+82+48+34)^2),2) #' ## Propensity Score Matching #' #' #### Step 2: Estimating the propensity score #' #' these are the no redundancy covariates with and without state covariates covs.nr <- c('bwg', 'hispanic', 'black', 'b.marr', 'lths', 'hs', 'ltcoll', 'work.dur', 'prenatal', 'sex', 'first', 'bw', 'preterm', 'momage', 'dayskidh') covs.nr.st <- c(covs.nr, 'st5', 'st9', 'st12', 'st25', 'st36', 'st42', 'st53') #' pscore estimation formula with original covariates ps_spec <- formula(treat ~ bw + bwg + hispanic + black + b.marr + lths + hs + ltcoll + work.dur + prenatal + sex + first + preterm + momage + dayskidh + income) #' pscore estimation formula with states added ps_spec.st <- update(ps_spec, . ~ . + st5 + st9 + st12 + st25 + st36 + st42 + st48 + st53) #' #### Estimation of pscores using stan_glm set.seed(1234) ps_fit_1 <- stan_glm(ps_spec, family=binomial(link='logit'), data=cc2, algorithm='optimizing', refresh=0) ps_fit_1.st <- stan_glm(ps_spec.st, family=binomial(link='logit'), data=cc2, algorithm='optimizing', refresh=0) #' extracting (logit) pscores from the fit pscores <- apply(posterior_linpred(ps_fit_1), 2, mean) pscores.st <- apply(posterior_linpred(ps_fit_1.st), 2, mean) #' #### Step 3: Matching #' #' matching without replacement, original formula matches <- matching(z=cc2$treat, score=pscores, replace=FALSE) matched <- cc2[matches$match.ind,] #' matching with replacement, original formula matches.wr <- matching(z=cc2$treat, score=pscores, replace=TRUE) wts.wr <- matches.wr$cnts #' matching without replacement, pscores that include state indicators matches.st <- matching(z=cc2$treat, score=pscores.st, replace=FALSE) matched.st <- cc2[matches.st$match.ind,] #' matching with replacement, pscores that include state indicators matches.st.wr <- matching(z=cc2$treat, score=pscores.st, replace=TRUE) wts.st.wr <- matches.st.wr$cnts #' #### Step 4: Balance and overlap #' #' Balance plots for all covariates specified in STEP 1 (in the book) covs <- c('bw', 'preterm', 'dayskidh', 'sex', 'first', 'age', 'black', 'hispanic', 'white', 'b.marr', 'lths', 'hs', 'ltcoll', 'college', 'work.dur', 'prenatal', 'momage') cov_names <- c('birth weight', 'weeks preterm', 'days in hospital', 'male', 'first born', 'age', 'black', 'hispanic', 'white', 'unmarried at birth', 'less than high school', 'high school graduate', 'some college', 'college graduate', 'worked during pregnancy', 'had no prenatal care', 'age at birth') bal_nr <- balance(rawdata=cc2[,covs], treat=cc2$treat, matched=matches$cnts, estimand='ATT') bal_nr.wr <- balance(rawdata=cc2[,covs], treat=cc2$treat, matched=matches.wr$cnts, estimand='ATT') bal_nr.wr.st <- balance(rawdata=cc2[, union(covs, covs.nr.st)], treat=cc2$treat, matched=matches.st.wr$cnts, estimand='ATT') #' ## Balance plot #' #' Labeled cov names, ps_fit_1 MwoR. #' No state variables included. #' #' Balance for all covariates, not just those used in propensity score model #+ eval=FALSE, include=FALSE if (savefigs) pdf(root("Childcare/figs","balance.both.azc.pdf"), width=11, height=8.5) #+ pts <- bal_nr$diff.means.raw[,4] pts2 <- bal_nr$diff.means.matched[,4] K <- length(pts) idx <- 1:K main <- 'Absolute Standardized Difference in Means' mar <- c(8, 6, 6, 7) par(mar=mar) maxchar <- max(sapply(cov_names, nchar)) min.mar <- par('mar') mar[2] <- max(min.mar[2], trunc(mar[2] + maxchar/10)) + mar[2] + 0.5 par(mar=mar) pts <- rev(pts) pts2 <- rev(pts2) longcovnames <- rev(cov_names) plot(c(pts,pts2), c(idx,idx), bty='n', xlab='', ylab='', xaxt='n', yaxt='n', type='n', main=main, cex.main=1.2) abline(v=0, lty=2) points(pts, idx, cex=1) points(pts2, idx, pch=19, cex=1) axis(3) axis(2, at=1:K, labels=longcovnames[1:K], las=2, hadj=1, lty=0, cex.axis=1.2) #+ eval=FALSE, include=FALSE if (savefigs) dev.off() #' #### Step 4: Diagnostics for balance and overlap #' #' Separate balance plots for continuous and binary variables. #' Labelled cov names, ps_fit_1 MWR. #' Manual plot code based on the code in balance.R. #' #' #### Figure #+ eval=FALSE, include=FALSE if (savefigs) pdf(root("Childcare/figs","balance.cont.binary.AZC.pdf"), width=10, height=6) #+ par(mfrow=c(1,2)) mar1 <- c(5, 4, 6, 2) mar2 <- c(5, 3, 6, 4) pts <- bal_nr.wr$diff.means.raw[bal_nr.wr$binary==TRUE,4] pts2 <- bal_nr.wr$diff.means.matched[bal_nr.wr$binary==TRUE,4] K <- length(pts) idx <- 1:K main <- 'Absolute Difference in Means' longcovnames <- cov_names[bal_nr.wr$binary==TRUE] mar <- mar1 par(mar=mar) maxchar <- max(sapply(longcovnames, nchar)) min.mar <- par('mar') mar[2] <- max(min.mar[2], trunc(mar[2] + maxchar/10)) + mar[2] + 0.5 par(mar=mar) pts <- rev(pts) pts2 <- rev(pts2) longcovnames <- rev(longcovnames) plot(c(pts,pts2), c(idx,idx), bty='n', xlab='', ylab='', xaxt='n', yaxt='n', type='n', main=main, cex.main=1.2, xlim=c(0,.55)) abline(v=0, lty=2) points(pts, idx, cex=1) points(pts2, idx, pch=19, cex=1) axis(3, at=seq(0,.5,.1), xpd=TRUE) axis(2, at=1:K, labels=longcovnames[1:K], las=2, hadj=1, lty=0, cex.axis=1) pts <- bal_nr.wr$diff.means.raw[bal_nr.wr$binary==FALSE,4] pts2 <- bal_nr.wr$diff.means.matched[bal_nr.wr$binary==FALSE,4] # AZC: hack to fix spacing of binary covariates against x axis # the spacing of how spaced apart the ticks are changes as the number of covariates change. It's frustratingly hard, maybe impossible, to get the spacing to match between the continuous and binary plots with different number of covariates in each, so, I'll add fake data that won't show up pts <- c(pts, rep(NA, 7)) pts2 <- c(pts2, rep(NA, 7)) K <- length(pts) idx <- 1:K main <- 'Absolute Standardized Difference in Means' longcovnames <- cov_names[bal_nr.wr$binary==FALSE] # add extra names to match above longcovnames <- c(longcovnames, rep('', 7)) mar <- mar2 par(mar=mar) maxchar <- max(sapply(longcovnames, nchar)) min.mar <- par('mar') mar[2] <- max(min.mar[2], trunc(mar[2] + maxchar/10)) + mar[2] + 0.5 par(mar=mar) pts <- rev(pts) pts2 <- rev(pts2) longcovnames <- rev(longcovnames) plot(c(pts,pts2), c(idx,idx), bty='n', xlab='', ylab='', xaxt='n', yaxt='n', type='n', main=main, cex.main=1.2) segments(x0=0, y0=13, x1=0, y1=7.5, lty=2) points(pts, idx, cex=1) points(pts2, idx, pch=19, cex=1) axis(3) axis(2, at=8:12, labels=longcovnames[8:12], las=2, hadj=1, lty=0, cex.axis=1) #+ eval=FALSE, include=FALSE if (savefigs) dev.off() #' #### Figure: Overlap of propensity scores before/after matching with replacement. #' #' Pscores from original model. #+ eval=FALSE, include=FALSE if (savefigs) pdf(root("Childcare/figs","ps.overlap.dens.AZC.pdf"), width=11, height=8.5) #+ par(mfrow=c(1,2), cex.main=1.3, cex.lab=1.3) # Plot the overlapping histograms for pscores before matching, density par(mar=c(16,8,2,2)) hist(pscores[cc2$treat==0], xlim=c(-20,5), ylim=c(0,.28), main="before matching", border="darkgrey", mgp=c(2,.5,0), xlab="logit propensity scores", freq=FALSE) hist(pscores[cc2$treat==1], freq=FALSE, add=TRUE) # Plot the overlapping histograms for pscores after matching, frequency par(mar=c(16,3,2,8)) hist(pscores[cc2[matches.wr$match.ind, 'treat']==0], xlim=c(-20,6), ylim=c(0,.28), main="after matching", border="darkgrey", mgp=c(2,.5,0), xlab="logit propensity scores", freq=FALSE) hist(pscores[cc2[matches.wr$match.ind, 'treat']==1], freq=FALSE, add=TRUE) #+ eval=FALSE, include=FALSE if (savefigs) dev.off() #' how many pscores[cc2$treat==0] left out of plot? sum(pscores[cc2$treat==0] < -20) #' pscore matching check sum(pscores[cc2$treat==1] > max(pscores[cc2$treat==0])) #' #### Figure: Example: good overlap, bad pscore. set.seed(20) ps3.mod <- glm(treat ~ unemp.rt, data=cc2,family=binomial) pscores3 <- predict(ps3.mod, type="link") #+ eval=FALSE, include=FALSE if (savefigs) pdf(root("Childcare/figs","bad.pscore.overlap.AZC.pdf"), width=11, height=8.5) #+ par(mar=c(8,3,4,3), cex=1.4, cex.lab=1.2) # Plot the overlapping histograms for pscore3, density hist(pscores3[cc2$treat==0], xlim=range(pscores3), ylim=c(0,8), main="", border="darkgrey", mgp=c(2,.5,0), xlab="logit propensity scores",freq=FALSE) hist(pscores3[cc2$treat==1], freq=FALSE, add=TRUE) #+ eval=FALSE, include=FALSE if (savefigs) dev.off() #' #### Step 5: Estimating a treatment effect using the restructured data. te_spec_nr <- update(ps_spec, ppvtr.36 ~ treat + .) #' treatment effect without replacement set.seed(20) reg_ps <- stan_glm(te_spec_nr, data=cc2[matches$match.ind,], algorithm='optimizing') #' treatment effect with replacement set.seed(20) reg_ps.wr <- stan_glm(te_spec_nr, data=cc2, weight=matches.wr$cnts, algorithm='optimizing') ps_fit_1_design <- svydesign(ids=~1, weights=matches.wr$cnts, data=cc2) reg_ps.wr_svy <- svyglm(te_spec_nr, design=ps_fit_1_design, data=cc2) summary(reg_ps)['treat', 1:2] summary(reg_ps.wr)['treat', 1:2] summary(reg_ps.wr_svy)$coef['treat', 1:2] #' Geographic information, covs_nr.st te_spec_nr.st <- update(ps_spec.st, ppvtr.36 ~ treat + . + st5 + st9 + st12 + st25 + st36 + st42 + st48 + st53) #' treatment effect without replacement set.seed(20) reg_ps.st <- stan_glm(te_spec_nr.st, data=cc2[matches.st$match.ind,], algorithm='optimizing') #' treatment effect with replacement set.seed(20) reg_ps.st.wr <- stan_glm(te_spec_nr.st, data=cc2, weight=matches.st.wr$cnts, algorithm='optimizing') ps_fit_1.st_design <- svydesign(ids=~1, weights=matches.st.wr$cnts, data=cc2) reg_ps.st.wr_svy <- svyglm(te_spec_nr.st, design=ps_fit_1.st_design, data=cc2) summary(reg_ps.st)['treat', 1:2] summary(reg_ps.st.wr)['treat', 1:2] summary(reg_ps.st.wr_svy)$coef['treat', 1:2] #' standard regression estimate of treatment effect set.seed(20) reg_te <- stan_glm(te_spec_nr, data=cc2, algorithm='optimizing') #' standard regression estimate of treatment effect with state set.seed(20) reg_te.st <- stan_glm(te_spec_nr.st, data=cc2, algorithm='optimizing') summary(reg_te)['treat', 1:2] summary(reg_te.st)['treat', 1:2] #' ## Improved pscore Model #' #' Improved pscore model using an interaction or squared term. #' #' transformed variables cc2$bwT = (cc2$bw-1500)^2 cc2$dayskidT = log(cc2$dayskidh+1) cc2$pretermT = (cc2$preterm+8)^2 cc2$momageT = (cc2$momage^2) #' New ps-spec from psFitR(21) ps_spec2 <- formula(treat ~ bwg*as.factor(educ) + as.factor(ethnic)*b.marr + work.dur + prenatal + preterm + momage + sex + first + bw + dayskidT + pretermT + momageT + black*(bw + preterm + dayskidT) + b.marr*(bw + preterm + dayskidT) + bw*income) ps_spec_i21 <- formula(treat~bw+preterm+dayskidh+sex+hispanic+b.marr+lths+hs+ltcoll+work.dur+prenatal+momage+income+bwT+pretermT+income+black:dayskidT+b.marr:bw+b.marr:preterm+b.marr:dayskidT+bw:income) ps_spec2 <- ps_spec_i21 set.seed(8) ps_fit_2 <- stan_glm(ps_spec2, family=binomial(link="logit"), data=cc2, algorithm='optimizing') pscores_2 <- apply(posterior_linpred(ps_fit_2, type='link'), 2, mean) matches2 <- matching(z=cc2$treat, score=pscores_2, replace=FALSE) matched2 <- cc2[matches2$match.ind,] matches2_wr <- matching(z=cc2$treat, score=pscores_2, replace=TRUE) matched2_wr <- cc2[matches2_wr$match.ind,] bal_2 <- balance(rawdata=cc2[,covs], cc2$treat, matched=matches2$cnts, estimand='ATT') bal_2.wr <- balance(rawdata=cc2[,covs], cc2$treat, matched=matches2_wr$cnts, estimand='ATT') #' look at some balance plots par(mfrow=c(1,1)) plot.balance(bal_nr, which.covs='cont', main='ps_fit_1') plot.balance(bal_2.wr, which.covs='cont', main='ps_fit_2') plot.balance(bal_nr, which.covs='binary', main='ps_fit_1') plot.balance(bal_2.wr, which.covs='binary', main='ps_fit_2') #' #### Figure: side by side binary/continuous #' plot.balance(bal_2.wr, longcovnames=cov_names) #' #### Treatment effect te_spec2 <- formula(ppvtr.36 ~ treat + hispanic + black + b.marr + lths + hs + ltcoll + work.dur + prenatal + momage + sex + first + preterm + dayskidh + bw + income) te_spec2 <- te_spec_nr set.seed(8) # MwoR reg_ps2 <- stan_glm(te_spec2, data=matched2, algorithm='optimizing') # MwR reg_ps2.design <- svydesign(ids=~1, weights=matches2_wr$cnts, data=cc2) reg_ps2.wr <- svyglm(te_spec2, design=reg_ps2.design, data=cc2) summary(reg_ps2)['treat', 1:2] summary(reg_ps2.wr)$coef['treat', 1:2] #' #### Geographic information using ps_spec2 ps_spec2.st <- update(ps_spec2, . ~ . + st5 + st9 + st12 + st25 + st36 + st42 + st48 + st53) set.seed(8) ps_fit_2.st <- stan_glm(ps_spec2.st, family=binomial(link='logit'), data=cc2, algorithm='optimizing') pscores_2.st <- apply(posterior_linpred(ps_fit_2.st, type='link'), 2, mean) matches2.st <- matching(z=cc2$treat, score=pscores_2.st, replace=FALSE) matched2.st <- cc2[matches2.st$match.ind,] matches2.st_wr <- matching(z=cc2$treat, score=pscores_2.st, replace=TRUE) #' #### Treatment effect estimate te_spec2.st <- update(te_spec2, . ~ . + st5 + st9 + st12 + st25 + st36 + st42 + st48 + st53) set.seed(8) # MwoR reg_ps2.st <- stan_glm(te_spec2.st, data=matched2.st, algorithm='optimizing') reg_ps2.st_design <- svydesign(ids=~1, weights=matches2.st_wr$cnts, data=cc2) reg_ps2.st.wr <- svyglm(te_spec2.st, design=reg_ps2.st_design, data=cc2) summary(reg_ps2.st)['treat', 1:2] summary(reg_ps2.st.wr)$coef['treat', 1:2] #' #### IPTW (including state indicators) wt.iptw <- inv.logit(pscores) / (1 - inv.logit(pscores)) wt.iptw[cc2$treat==0] <- wt.iptw[cc2$treat==0] * (sum(wt.iptw[cc2$treat==0]) / sum(cc2$treat==0)) wt.iptw[cc2$treat==1] <- 1 set.seed(20) ps_fit_iptw_design <- svydesign(ids=~1, weights=wt.iptw, data=cc2) reg_ps.iptw <- svyglm(te_spec_nr, design=ps_fit_iptw_design, data=cc2) summary(reg_ps.iptw)$coef['treat', 1:2] #' ## Beyond balance in means #' #' Table of ratio of standard deviations across treatment & control #' groups for unmatched, MWOR, MWR. cont_vars <- c('bw', 'preterm', 'dayskidh', 'momage', 'income', 'age') sds.um <- sapply(cont_vars, function(x){ tapply(cc2[,x], cc2$treat, sd) }) sds.mwor <- sapply(cont_vars, function(x){ tapply(matched[,x], matched$treat, sd) }) mwr.ind <- rep(cc2$row.names, times=matches.wr$cnts) matched.wr <- cc2[mwr.ind, ] sds.mwr <- sapply(cont_vars, function(x){ tapply(matched.wr[,x], matched.wr$treat, sd) })