#' ---
#' title: "Regression and Other Stories: Beauty and Teaching Quality"
#' author: "Andrew Gelman, Jennifer Hill, Aki Vehtari"
#' date: "`r format(Sys.Date())`"
#' output:
#'   html_document:
#'     theme: readable
#'     toc: true
#'     toc_depth: 2
#'     toc_float: true
#'     code_download: true
#' ---

#' Hamermesh and Parker (2005) data on student evaluations of
#' instructors’ beauty and teaching quality for several courses at the
#' University of Texas. See Chapter 10 in Regression and Other
#' Stories.
#'
#' Hamermesh, D. S., and Parker, A. M. (2005).  Beauty in the
#' classroom: Instructors' pulchritude and putative pedagogical
#' productivity.  Economics of Education Review, 24:369-376.
#' 
#' -------------
#' 

#+ setup, include=FALSE
knitr::opts_chunk$set(message=FALSE, error=FALSE, warning=FALSE, comment=NA)

#' #### Load packages
library("rprojroot")
root<-has_file(".ROS-Examples-root")$make_fix_file()
library("rstanarm")
library("ggplot2")
library("bayesplot")
theme_set(bayesplot::theme_default(base_family = "sans"))

#' #### Load data
beauty <- read.csv(root("Beauty/data","beauty.csv"))
head(beauty)

#' ## Do more beautiful profs get higher evaluations?
#'

#' #### Make a scatterplot of data
par(mar=c(3,3,1,1), mgp=c(1.7, .5, 0), tck=-.01)
plot(beauty$beauty, beauty$eval)

#' #### Fit a linear regression
fit_1 <- stan_glm(eval ~ beauty, data=beauty, refresh=0)
print(fit_1, digits=2)

#' #### Make a scatterplot with regression lines
# Labeling the axes
plot(beauty$beauty, beauty$eval, xlab="Beauty", ylab="Average teaching evaluation")
# Display the regression line, added onto the scatterplot (add=TRUE)
coefs <- coef(fit_1)
curve(coefs[1] + coefs[2]*x, add=TRUE)
# Add dotted lines to show +/- 1 standard deviation
sigma <- sigma(fit_1)
curve(coefs[1] + coefs[2]*x + sigma, lty=2, add=TRUE)
curve(coefs[1] + coefs[2]*x - sigma, lty=2, add=TRUE)

#' #### ggplot version
ggplot(data=beauty, aes(beauty, eval)) +
  geom_point(size = 2, alpha = 0.75) +
  geom_abline(
    slope = rep(coefs[2], 3),
    intercept = c(coefs[1], coefs[1] - sigma, coefs[1] + sigma),
    linetype = c(1, 2, 2),
    color = "darkgray",
    size = 1
  ) +
  labs(
    x = "Beauty",
    y = "Average teaching evaluation"
  )

#' ## Do things differ for male and female profs?  

#' #### Parallel regression lines
fit_2 <- stan_glm(eval ~ beauty + female, data=beauty, refresh=0)
print(fit_2, digits=2)
coefs2 <- coef(fit_2)

#' #### Make several subplots
# Set up a 2x2 grid of plots
par(mfrow=c(2,2))
# Make separate plot for men, ...
plot(beauty$beauty[beauty$female==0], beauty$eval[beauty$female==0], xlim=range(beauty$beauty), ylim=range(beauty$eval),
     xlab="Beauty", ylab="Average teaching evaluation", main="Men")
curve(coefs2[1] + coefs2[2]*x + coefs2[3]*0, add=TRUE)
# ... women, ...
plot(beauty$beauty[beauty$female==1], beauty$eval[beauty$female==1], xlim=range(beauty$beauty), ylim=range(beauty$eval),
      xlab="Beauty", ylab="Average teaching evaluation", main="Women")
curve(coefs2[1] + coefs2[2]*x + coefs2[3]*1, add=TRUE)
# ... and both sexes on the same plot
# First make the plot with type="n" (which displays axes but does not plot
#   the points), then plot the points and lines separately for each sex
plot(beauty$beauty, beauty$eval, xlab="Beauty", ylab="Average teaching evaluation",
      main="Both sexes", type="n")
points(beauty$beauty[beauty$female==0], beauty$eval[beauty$female==0], col="blue")
curve(coefs2[1] + coefs2[2]*x + coefs2[3]*0, add=TRUE, col="blue")
points(beauty$beauty[beauty$female==1], beauty$eval[beauty$female==1], col="red")
curve(coefs2[1] + coefs2[2]*x + coefs2[3]*1, add=TRUE, col="red")

#' #### ggplot versions
# Men 
gg_male <-
  ggplot(subset(beauty, female == 0), aes(beauty, eval)) +
  geom_point() +
  geom_abline(slope = coefs2[2], intercept = coefs2[1], color = "darkgray")
# Women
gg_female <-
  ggplot(subset(beauty, female == 1), aes(beauty, eval)) +
  geom_point() +
  geom_abline(slope = coefs2[2], intercept = coefs2[1] + coefs2[3], color = "darkgray")
# Both
gg_both <-
  ggplot(data=beauty, aes(beauty, eval)) +
  geom_point(aes(color = factor(female)), show.legend = FALSE) +
  scale_color_manual(values = c("red", "blue")) +
  geom_abline(
    slope = coefs2[2],
    intercept = c(coefs2[1], coefs2[1] + coefs2[3]),
    color = c("blue3", "red3"),
    size = 1
  )
# Put them in a grid
bayesplot_grid(
  gg_male, gg_female, gg_both,
  grid_args = list(ncol = 2),
  xlim = range(beauty$beauty),
  ylim = range(beauty$eval),
  titles = c("Men", "Women", "Both sexes")
)

#' ## Do things differ for male and female profs?  

#' #### Non-parallel regression lines
fit_3 <- stan_glm(eval ~ beauty + female + beauty*female, data=beauty, refresh=0)
print(fit_3, digits=2)
coefs3 <- coef(fit_3)

#' #### Make two subplots
# Set up a new 1x2 grid of plots
par(mfrow=c(1,2))
# Display the parallel regression lines in gray and the non-parallel lines
# in heavy black
# Make separate plot for men ...
plot(beauty$beauty[beauty$female==0], beauty$eval[beauty$female==0], xlim=range(beauty$beauty), ylim=range(beauty$eval),
      xlab="Beauty", ylab="Average teaching evaluation", main="Men")
curve(coefs2[1] + coefs2[2]*x + coefs2[3]*0,
       lwd=.5, col="gray", add=TRUE)
curve(coefs3[1] + coefs3[2]*x + coefs3[3]*0 + coefs3[4]*x*0,
       lwd=2, col="black", add=TRUE)
# ... and women
plot (beauty$beauty[beauty$female==1], beauty$eval[beauty$female==1], xlim=range(beauty$beauty), ylim=range(beauty$eval),
      xlab="Beauty", ylab="Average teaching evaluation", main="Women")
curve(coefs2[1] +coefs2[2]*x +coefs2[3]*1,
       lwd=.5, col="gray", add=TRUE)
curve(coefs3[1] + coefs3[2]*x + coefs3[3]*1 +coefs3[4]*x*1,
       lwd=2, col="black", add=TRUE)

#' #### ggplot version
# we can add to the gg_male and gg_female plots we already made above
gg_male2 <- gg_male + geom_abline(intercept = coefs3[1], slope = coefs3[2], size = 1)
gg_female2 <- gg_female + geom_abline(intercept = coefs3[1] + coefs3[3], slope = coefs3[2] + coefs3[4], size = 1)
# Put them in a grid
bayesplot_grid(
  gg_male2, gg_female2,
  grid_args = list(ncol = 2),
  xlim = range(beauty$beauty),
  ylim = range(beauty$eval),
  titles = c("Men", "Women")
)

#' ## More models

#' #### Add age
fit_4 <- stan_glm(eval ~ beauty + female + age, data=beauty, refresh=0)
print(fit_4, digits=2)

#' #### Add minority
fit_5 <- stan_glm(eval ~ beauty + female + minority, data=beauty, refresh=0)
print(fit_5, digits=2)

#' #### Add nonenglish
fit_6 <- stan_glm(eval ~ beauty + female + nonenglish, data=beauty, refresh=0)
print(fit_6, digits=2)

#' #### Add nonenglish and lower
fit_7 <- stan_glm(eval ~ beauty + female + nonenglish + lower,
                  data=beauty, refresh=0)
print(fit_7, digits=2)

#' ## Simple model with course indicators

#' #### Include course indicators in a regression
fit_8 <- stan_glm(eval ~ beauty + factor(course_id), data=beauty, refresh=0)
print(fit_8, digits=2)