Math 158, Spring 2016
Jo Hardin

Shrinkage Methods R code: Ridge Regression & LASSO

The Data

The following dataset is from Hastie, Tibshirani and Friedman (2009), from a study by Stamey et al. (1989) of prostate cancer, measuring the correlation between the level of prostate-specific antigen and some covariates. The covariates are

lcavol : log cancer volume
lweight : log prostate weight
age : age of patient
lbph : log amount of benign prostatic hyperplasia
svi : seminal vesicle invasion
lcp : log capsular penetration
gleason : Gleason score, see http://en.wikipedia.org/wiki/Gleason_grading_system
pgg45 : percent of Gleason scores 4 or 5

lpsa is the response variable, log PSA.

require(readr); require(car); require(tidyr)
require(glmnet)   # glmnet is where the ridge regression and LASSO functions live

pcancer <- read_delim(file="http://www-stat.stanford.edu/~tibs/elemstatlearn/datasets/prostate.data",
                      delim="\t")
head(pcancer)

## Source: local data frame [6 x 11]
##
##      NA     lcavol  lweight   age      lbph   svi       lcp gleason pgg45
##   (int)      (dbl)    (dbl) (int)     (dbl) (int)     (dbl)   (int) (int)
## 1     1 -0.5798185 2.769459    50 -1.386294     0 -1.386294       6     0
## 2     2 -0.9942523 3.319626    58 -1.386294     0 -1.386294       6     0
## 3     3 -0.5108256 2.691243    74 -1.386294     0 -1.386294       7    20
## 4     4 -1.2039728 3.282789    58 -1.386294     0 -1.386294       6     0
## 5     5  0.7514161 3.432373    62 -1.386294     0 -1.386294       6     0
## 6     6 -1.0498221 3.228826    50 -1.386294     0 -1.386294       6     0
## Variables not shown: lpsa (dbl), train (lgl)

# They provide the breakdown of training / test observations
pr.train <- pcancer[which(pcancer$train), 2:10]
pr.test <- pcancer[-which(pcancer$train), 2:10]
Remembering that shrinkage methods are best when covariates are correlated, we investigate whether we expect collinearity to be an issue with our data. As is often the case with biological data, there is some degree of correlation between the variables, but here the collinearity does not seem to be extreme.

pairs(pr.train[,1:9], pch=19, cex=.25)

[Figure: scatterplot matrix of the nine training variables (lcavol, lweight, age, lbph, svi, lcp, gleason, pgg45, lpsa)]

round(cor(pr.train[,1:9]), 3)

##         lcavol lweight   age   lbph    svi    lcp gleason  pgg45  lpsa
## lcavol   1.000   0.300 0.286  0.063  0.593  0.692   0.426  0.483 0.733
## lweight  0.300   1.000 0.317  0.437  0.181  0.157   0.024  0.074 0.485
## age      0.286   0.317 1.000  0.287  0.129  0.173   0.366  0.276 0.228
## lbph     0.063   0.437 0.287  1.000 -0.139 -0.089   0.033 -0.030 0.263
## svi      0.593   0.181 0.129 -0.139  1.000  0.671   0.307  0.481 0.557
## lcp      0.692   0.157 0.173 -0.089  0.671  1.000   0.476  0.663 0.489
## gleason  0.426   0.024 0.366  0.033  0.307  0.476   1.000  0.757 0.342
## pgg45    0.483   0.074 0.276 -0.030  0.481  0.663   0.757  1.000 0.448
## lpsa     0.733   0.485 0.228  0.263  0.557  0.489   0.342  0.448 1.000
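As a complementary numeric check (not in the original output), we could also look at variance inflation factors from the full least squares fit; the car package loaded above provides vif(). A minimal sketch:

# variance inflation factors for the full model; VIFs near 1 suggest little
# collinearity, while values above roughly 5-10 are usually taken as a red flag
vif(lm(lpsa ~ ., data = pr.train))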
The full linear model:

pr.ls <- lm(lpsa ~ ., data=pr.train)
summary(pr.ls)$coef

##                 Estimate Std. Error    t value     Pr(>|t|)
## (Intercept)  0.429170133 1.55358810  0.2762445 7.833423e-01
## lcavol       0.576543185 0.10743794  5.3662905 1.469415e-06
## lweight      0.614020004 0.22321593  2.7507894 7.917895e-03
## age         -0.019001022 0.01361193 -1.3959090 1.680626e-01
## lbph         0.144848082 0.07045669  2.0558456 4.430784e-02
## svi          0.737208645 0.29855507  2.4692552 1.650539e-02
## lcp         -0.206324227 0.11051627 -1.8669126 6.697085e-02
## gleason     -0.029502884 0.20113609 -0.1466812 8.838923e-01
## pgg45        0.009465162 0.00544651  1.7378397 8.754628e-02

What if n is small? No good! We can't estimate all of our coefficients (three are not even defined), and there are no standard errors for the rest. (Here, I randomly selected 6 points; remember p = 9.)

set.seed(47)
fewpts = sample(c(TRUE, FALSE), 67, replace=TRUE, prob=c(.1,.9))
pr.ls.sub <- lm(lpsa ~ ., data=pr.train, subset=fewpts)
summary(pr.ls.sub)

##
## Call:
## lm(formula = lpsa ~ ., data = pr.train, subset = fewpts)
##
## Residuals:
## ALL 6 residuals are 0: no residual degrees of freedom!
##
## Coefficients: (3 not defined because of singularities)
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.82644         NA      NA       NA
## lcavol       1.96675         NA      NA       NA
## lweight      0.54273         NA      NA       NA
## age         -0.03449         NA      NA       NA
## lbph         0.21285         NA      NA       NA
## svi               NA         NA      NA       NA
## lcp         -2.92338         NA      NA       NA
## gleason           NA         NA      NA       NA
## pgg45             NA         NA      NA       NA
##
## Residual standard error: NaN on 0 degrees of freedom
## Multiple R-squared:     1,  Adjusted R-squared:  NaN
## F-statistic:  NaN on 5 and 0 DF,  p-value: NA
The model breaks down because $X^TX$ is not invertible, so $(X^TX)^{-1}$ does not exist:

x.fewpts <- as.matrix(pr.train[fewpts,-9])
dim(x.fewpts)

## [1] 6 8

solve(t(x.fewpts) %*% x.fewpts)

## Error in solve.default(t(x.fewpts) %*% x.fewpts): Lapack routine dgesv: system is exactly singular: U[5,5] = 0

x.full <- as.matrix(pr.train[,-9])
solve(t(x.full) %*% x.full)   # note that it is invertible with all the data

##                lcavol       lweight           age          lbph
## lcavol   2.184705e-02 -0.0032645189 -1.743289e-04 -0.0018930899
## lweight -3.264519e-03  0.0715909474 -1.966555e-03 -0.0071984168
## age     -1.743289e-04 -0.0019665550  3.441990e-04 -0.0001557976
## lbph    -1.893090e-03 -0.0071984168 -1.557976e-04  0.0086891817
## svi     -1.500244e-02 -0.0183358230 -4.018146e-05  0.0083083917
## lcp     -1.028210e-02  0.0002114462  2.298114e-04  0.0010251226
## gleason -7.230883e-04 -0.0203649756 -2.260220e-03  0.0058365117
## pgg45    1.453606e-05  0.0005665740  1.482164e-05 -0.0001179802
##                   svi           lcp       gleason         pgg45
## lcavol  -1.500244e-02 -0.0102821032 -0.0007230883  1.453606e-05
## lweight -1.833582e-02  0.0002114462 -0.0203649756  5.665740e-04
## age     -4.018146e-05  0.0002298114 -0.0022602200  1.482164e-05
## lbph     8.308392e-03  0.0010251226  0.0058365117 -1.179802e-04
## svi      1.752383e-01 -0.0216018727  0.0078926473 -3.289651e-04
## lcp     -2.160187e-02  0.0240671351  0.0029937536 -4.717413e-04
## gleason  7.892647e-03  0.0029937536  0.0353064467 -6.328419e-04
## pgg45   -3.289651e-04 -0.0004717413 -0.0006328419  4.501992e-05
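One way to preview why ridge regression helps here (not part of the original handout): adding λ times the identity matrix to $X^TX$ makes the matrix invertible even with only 6 observations. A minimal sketch, where λ = 0.5 is an arbitrary value chosen only for illustration:

# X^T X from the 6-point subset is singular, but X^T X + lambda*I is not
lambda <- 0.5                              # arbitrary illustrative value
xtx <- t(x.fewpts) %*% x.fewpts
solve(xtx + lambda * diag(ncol(xtx)))      # this inverse now exists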
AIC model building

Remembering how we used AIC to build models with forward and backward selection, we can build two different models.

pr.fwd <- step(lm(lpsa ~ 1, data=pr.train), ~lcavol+lweight+age+lbph+svi+lcp+gleason+pgg45,
               data=pr.train, direction="forward")
pr.bck <- step(pr.ls, direction="backward")

The AIC forward and backward models:

coef(pr.fwd)

## (Intercept)      lcavol     lweight         svi        lbph
##  -0.3259212   0.5055209   0.5388292   0.6718487   0.1400111

coef(pr.bck)

##  (Intercept)       lcavol      lweight          age         lbph
##  0.259061747  0.573930391  0.619208833 -0.019479879  0.144426474
##          svi          lcp        pgg45
##  0.741781258 -0.205416986  0.008944996
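If we want to compare the two selected models head to head, we could also ask for their AIC values directly. A small sketch, not in the original handout; note that AIC() and the extractAIC() used inside step() differ by an additive constant, so only the ranking is comparable:

# AIC for the forward- and backward-selected models (smaller is better)
AIC(pr.fwd, pr.bck)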
Ridge Regression model building

The vignette for glmnet is at http://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html and is very useful. Note: all the logs, throughout, are natural logs.

The basic ridge regression model code is as follows:

lambda.grid = 10^seq(5, -2, length=100)

# note the form of the regression function is glmnet(x, y, ...)
# alpha = 0 gives RR (ridge regression), alpha = 1 gives Lasso
pr.ridge <- glmnet(as.matrix(pr.train[,1:8]), pr.train$lpsa, alpha=0,
                   lambda=lambda.grid, standardize=TRUE)

Choosing λ

What happens to the coefficients and predictions when λ is large (8.4975344 × 10^4) versus when λ is small (0.0509414)?

dim(coef(pr.ridge))   # 9 coef, 100 lambda values

## [1]   9 100

lambda.grid[2]

## [1] 84975.34

coef(pr.ridge)[,2]   # note glmnet unstandardizes the variables

##  (Intercept)       lcavol      lweight          age         lbph
## 2.452169e+00 1.005294e-05 1.734629e-05 5.169843e-07 3.060858e-06
##          svi          lcp      gleason        pgg45
## 2.259182e-05 5.950457e-06 8.230396e-06 2.605237e-07

lambda.grid[90]

## [1] 0.05094138

coef(pr.ridge)[,90]   # note glmnet unstandardizes the variables

##  (Intercept)       lcavol      lweight          age         lbph
##  0.200684117  0.522928641  0.607850026 -0.016427788  0.140824424
##          svi          lcp      gleason        pgg45
##  0.700735303 -0.147679802  0.002898093  0.007850570
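The question above also asks about predictions. A quick way to see the effect (not in the original output) is to predict the training responses at the same two λ values; with the huge λ every fitted value is shrunk essentially to the overall mean of lpsa, while the small λ gives fits close to least squares. A sketch:

# fitted values at a very large and a fairly small lambda
head(predict(pr.ridge, newx = as.matrix(pr.train[,1:8]), s = lambda.grid[2]))
head(predict(pr.ridge, newx = as.matrix(pr.train[,1:8]), s = lambda.grid[90]))
mean(pr.train$lpsa)   # roughly what every prediction collapses to at large lambda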
Each error bar in the cross validation plot summarizes the MSE values for the held-out folds. That is, given a particular λ value, the data are partitioned into folds; each held-out fold is predicted (and its MSE calculated) from the model fit to the rest of the data. The red dot gives the average of those MSE values, and the bar extends one standard error on either side. In the following figure, the lower (green) vertical line gives the λ with the smallest cross-validated MSE. The other dashed line gives the most regularized (that is: smaller coefficients!) λ whose cross-validated MSE is within one SE of the minimum.

pr.ridge.cv <- cv.glmnet(as.matrix(pr.train[,1:8]), pr.train$lpsa, alpha=0,
                         lambda=lambda.grid, standardize=TRUE)
plot(pr.ridge.cv)
abline(v=log(pr.ridge.cv$lambda.min), col="green")

[Figure: cross-validated MSE versus log(lambda) for the ridge fits, with the lambda.min and lambda.1se lines marked]

colors <- rainbow(8)
plot(pr.ridge, xvar="lambda", xlim=c(-6,10), col=colors)
pr.ridge.cv$lambda.min

## [1] 0.1149757

abline(v=log(pr.ridge.cv$lambda.min))
abline(h=0, lty=2)
text(rep(-6.5, 8), coef(pr.ridge)[-1,length(lambda.grid)], colnames(pr.train)[-9],
     pos=4, col=colors)
[Figure: ridge coefficient paths versus log(lambda), with each covariate labeled in color; the vertical line marks lambda.min]

plot(pr.ridge, xvar="dev", col=colors, label=TRUE)
abline(h=0, lty=2)

[Figure: ridge coefficient paths versus the fraction of deviance explained]
plot(pr.ridge, xvar="norm", col=colors, label=TRUE)
abline(h=0, lty=2)

[Figure: ridge coefficient paths versus the L1 norm of the coefficient vector]

The Ridge Model

Using the λ value that minimizes the cross-validated MSE, we output the coefficients / model associated with the best λ for the ridge regression model. Ridge regression does not do variable selection.

coef(pr.ridge.cv, s = "lambda.min")

## 9 x 1 sparse Matrix of class "dgCMatrix"
##                        1
## (Intercept)  0.037185265
## lcavol       0.473676896
## lweight      0.595911319
## age         -0.013758114
## lbph         0.135886907
## svi          0.665331969
## lcp         -0.098068934
## gleason      0.025077397
## pgg45        0.006641498
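Two small follow-ups we could add here (neither is in the original output): putting the least squares and ridge coefficients side by side makes the shrinkage visible, and the cv.glmnet object also stores the more conservative 1-SE choice of λ. A sketch, reusing pr.ls from above:

# least squares vs ridge (lambda.min) coefficients, side by side
cbind(ls = coef(pr.ls),
      ridge = as.vector(coef(pr.ridge.cv, s = "lambda.min")))

# the most regularized lambda whose CV error is within one SE of the minimum
pr.ridge.cv$lambda.1se
coef(pr.ridge.cv, s = "lambda.1se")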
Lasso model building

# note the form of the regression function is glmnet(x, y, ...)
# alpha = 0 gives RR (ridge regression), alpha = 1 gives Lasso
pr.lasso <- glmnet(as.matrix(pr.train[,1:8]), pr.train$lpsa, alpha=1,
                   lambda=lambda.grid, standardize=TRUE)

Choosing λ

Again, the goal is to find the λ value which will minimize MSE. Doing this via cross validation avoids overfitting the model. What happens to the coefficients and predictions when λ is large (8.4975344 × 10^4) versus when λ is small (0.0509414)?

dim(coef(pr.lasso))   # 9 coef, 100 lambda values

## [1]   9 100

lambda.grid[2]

## [1] 84975.34

coef(pr.lasso)[,2]   # note glmnet unstandardizes the variables

## (Intercept)      lcavol     lweight         age        lbph         svi
##    2.452345    0.000000    0.000000    0.000000    0.000000    0.000000
##         lcp     gleason       pgg45
##    0.000000    0.000000    0.000000

lambda.grid[90]

## [1] 0.05094138

coef(pr.lasso)[,90]   # note glmnet unstandardizes the variables

##  (Intercept)       lcavol      lweight          age         lbph
## -0.122804781  0.469844436  0.530595527 -0.002673565  0.106750458
##          svi          lcp      gleason        pgg45
##  0.488806935  0.000000000  0.000000000  0.003428396
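Because the lasso zeroes out coefficients as λ grows, it can be useful to track how many coefficients are nonzero along the λ grid; glmnet stores this count in the df component of the fit. A quick sketch (not in the original output; the row indices below are arbitrary, chosen just to span the grid):

# number of nonzero coefficients (excluding the intercept) at selected lambdas
cbind(lambda = pr.lasso$lambda, df = pr.lasso$df)[c(2, 50, 70, 90, 100), ]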
In the following figure, the lower (green) vertical line gives the λ with the smallest cross-validated MSE. The other dashed line gives the most regularized (that is: smaller coefficients!) λ whose cross-validated MSE is within one SE of the minimum.

pr.lasso.cv <- cv.glmnet(as.matrix(pr.train[,1:8]), pr.train$lpsa, alpha=1,
                         lambda=lambda.grid, standardize=TRUE)
plot(pr.lasso.cv)
abline(v=log(pr.lasso.cv$lambda.min), col="green")

[Figure: cross-validated MSE versus log(lambda) for the lasso fits, with the number of nonzero coefficients shown along the top axis]

plot(pr.lasso, xvar="lambda", xlim=c(-6,10), col=colors)
pr.lasso.cv$lambda.min

## [1] 0.01176812

abline(v=log(pr.lasso.cv$lambda.min))
abline(h=0, lty=2)
text(rep(-6.5, 8), coef(pr.lasso)[-1,length(lambda.grid)], colnames(pr.train)[-9],
     pos=4, col=colors)
[Figure: lasso coefficient paths versus log(lambda), with each covariate labeled in color; the vertical line marks lambda.min]

plot(pr.lasso, xvar="dev", col=colors, label=TRUE)
abline(h=0, lty=2)

[Figure: lasso coefficient paths versus the fraction of deviance explained]
plot(pr.lasso, xvar="norm", col=colors, label=TRUE)
abline(h=0, lty=2)

[Figure: lasso coefficient paths versus the L1 norm of the coefficient vector]

The Lasso Model

Using the λ value that minimizes the cross-validated MSE, we output the coefficients / model associated with the best λ for the lasso model. Note that here (possibly because there wasn't much collinearity!), only one coefficient was set to zero. Lasso does do variable selection!

coef(pr.lasso.cv, s = "lambda.min")

## 9 x 1 sparse Matrix of class "dgCMatrix"
##                        1
## (Intercept)  0.175589982
## lcavol       0.547524290
## lweight      0.598528511
## age         -0.015529964
## lbph         0.135996816
## svi          0.677973839
## lcp         -0.152032851
## gleason      .
## pgg45        0.007564082
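If we want the selected variables programmatically rather than by reading the printout, or want to see the sparser model at the more conservative 1-SE λ, a small sketch (not in the original output):

# names of the variables the lasso keeps at lambda.min
lasso.coef <- coef(pr.lasso.cv, s = "lambda.min")
rownames(lasso.coef)[as.vector(lasso.coef) != 0][-1]   # drop the intercept

# the 1-SE lambda typically zeroes out more coefficients
pr.lasso.cv$lambda.1se
coef(pr.lasso.cv, s = "lambda.1se")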
Comparing Models

forw.pred <- predict(pr.fwd, newdata = pr.test)
back.pred <- predict(pr.bck, newdata = pr.test)
ridge.pred <- predict(pr.ridge.cv, newx = as.matrix(pr.test[,1:8]), s = "lambda.min")
lasso.pred <- predict(pr.lasso.cv, newx = as.matrix(pr.test[,1:8]), s = "lambda.min")

sum((forw.pred - pr.test$lpsa)^2)

## [1] 13.68996

sum((back.pred - pr.test$lpsa)^2)

## [1] 15.4954

sum((ridge.pred - pr.test$lpsa)^2)

## [1] 14.73585

sum((lasso.pred - pr.test$lpsa)^2)

## [1] 14.8724

Forward Won!
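To make the comparison easier to scan, we could collect the four test-set sums of squared errors into one table and also report them per observation as MSEs. A sketch, not in the original handout:

# test-set SSE and MSE for each model, in one table
sse <- c(forward  = sum((forw.pred - pr.test$lpsa)^2),
         backward = sum((back.pred - pr.test$lpsa)^2),
         ridge    = sum((ridge.pred - pr.test$lpsa)^2),
         lasso    = sum((lasso.pred - pr.test$lpsa)^2))
cbind(SSE = sse, MSE = sse / nrow(pr.test))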