\documentclass{amsart}
\usepackage{hyperref,verbatim}
\usepackage{Sweave}
\begin{document}
\SweaveOpts{concordance=TRUE}

\title{Variable Selection}
\maketitle

This exercise concerns the data set SMSA:
<<>>=
smsa = na.omit(read.table("http://pages.uoregon.edu/dlevin/DATA/smsa.txt",
                          sep="\t", header=T, row.names=1))
@
Set aside $1/3$ of the data for testing and use the remainder for training. We want to predict \verb|Mortality| from the other variables. Omit the variable \verb|NOx| when fitting the largest model, as otherwise the model matrix may be singular. (What does this mean?)

Search for low-BIC models using:
\begin{itemize}
\item Forward selection
\item Backward stepwise selection
\item All-subsets regression
\end{itemize}
Then also find the ridge regression model with $\lambda$ chosen to minimize cross-validation error. Use $3$ folds for cross-validation because of the small sample size. Finally, find the model with the lowest $C_p$.

Test these models on the testing data, and report the root-mean-squared error of prediction for each choice of model.
<<>>=
set.seed(1)
n = dim(smsa)[1]
train = sample(1:n, 2*n/3)   # 2/3 of the rows for training
g = lm(Mortality ~ . - NOx, data=smsa[train,])
@
See \url{http://pages.uoregon.edu/dlevin/DATA/SMSA.html} for a description of the variables.

Some notes:
\begin{itemize}
\item To do forward selection using \verb|step|, use
\begin{verbatim}
step(f0, formula(f1), direction="forward", k=log(39))
\end{verbatim}
where \verb|f0| is the model with only an intercept and \verb|f1| is the largest model. Setting \verb|k=log(39)| uses BIC instead of AIC, where the sample size is 39.
\item Use \verb|regsubsets| from the package \verb|leaps| to search all subsets:
\begin{verbatim}
all.mods = regsubsets(Mortality ~ . - NOx, data=smsatrain, nvmax=17)
\end{verbatim}
where \verb|smsatrain| is the training data.
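\item Backward stepwise selection can be run with \verb|step| as well; a minimal sketch, assuming \verb|g| is the largest fitted model from above:
\begin{verbatim}
step(g, direction="backward", k=log(39))
\end{verbatim}
\item The summary of a \verb|regsubsets| fit reports BIC and Mallows' $C_p$ for the best model of each size, so the lowest-$C_p$ model can be read off directly (again a sketch, assuming the \verb|all.mods| object above):
\begin{verbatim}
s = summary(all.mods)
which.min(s$cp)             # size of the best model by Cp
s$which[which.min(s$cp), ]  # variables included in that model
\end{verbatim}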
\end{itemize}
<<>>=
library(glmnet)

# Ridge regression: training-data model matrix without the intercept column
xsmsatr = model.matrix(g)[,-1]
g2 = glmnet(xsmsatr, smsa$Mortality[train], alpha=0,
            lambda=10^seq(4,-2,length.out=100))
plot(g2, xvar="lambda")

# Choose lambda by 3-fold cross-validation
g3 = cv.glmnet(xsmsatr, smsa$Mortality[train], alpha=0,
               lambda=10^seq(4,-2,length.out=100), nfolds=3)
plot(g3)

# Test-set RMSE for ridge at the CV-minimizing lambda
xtest = as.matrix(smsa[-train, -c(5,16)])   # predictor columns only
pg3 = predict(g3, s=g3$lambda.min, newx=xtest)
print(sqrt(mean((pg3 - smsa$Mortality[-train])^2)))

# Test-set RMSE for the full least-squares model
pg = predict(g, newdata=smsa[-train,])
print(sqrt(mean((pg - smsa$Mortality[-train])^2)))

# Test-set RMSE for a selected submodel
g4 = lm(Mortality ~ JanTemp + JulyTemp + Rain + Education + PopDensity +
          X.NonWhite + X.WC + pop.house + income, data=smsa[train,])
pg4 = predict(g4, newdata=smsa[-train,])
print(sqrt(mean((pg4 - smsa$Mortality[-train])^2)))

# Test-set RMSE for a smaller selected submodel
g5 = lm(Mortality ~ JanTemp + Rain + PopDensity + X.NonWhite + X.WC,
        data=smsa[train,])
pg5 = predict(g5, newdata=smsa[-train,])
print(sqrt(mean((pg5 - smsa$Mortality[-train])^2)))

# step(lm(Mortality~1, data=smsa[train,]), formula(g), direction="forward", k=log(39))
@
\end{document}