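36-401 Modern Regression HW #8 Solutions

Problem 1 [25 points]

(a) DUE: /0/207 at 3PM

This is still a linear regression model, the simplest possible one. That being the case, the solution we derived before, $\hat{\beta} = (X^T X)^{-1} X^T Y$, still holds. The design matrix is $X = (1, \ldots, 1)^T \in \mathbb{R}^{n \times 1}$, so

$$\hat{\beta} = \underbrace{(X^T X)^{-1}}_{1/n}\, \underbrace{X^T Y}_{\sum_{i=1}^n Y_i} = \frac{1}{n} \sum_{i=1}^n Y_i = \bar{Y}.$$

(b)

$$H = X (X^T X)^{-1} X^T = \begin{pmatrix} 1 \\ \vdots \\ 1 \end{pmatrix} (n)^{-1} \begin{pmatrix} 1 & \cdots & 1 \end{pmatrix} = \begin{pmatrix} \frac{1}{n} & \cdots & \frac{1}{n} \\ \vdots & \ddots & \vdots \\ \frac{1}{n} & \cdots & \frac{1}{n} \end{pmatrix}.$$

So $h_{ii} = \frac{1}{n}$ for all $i = 1, \ldots, n$.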
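(c)

$$\hat{Y} = HY = \begin{pmatrix} \frac{1}{n} \sum_{i=1}^n Y_i \\ \vdots \\ \frac{1}{n} \sum_{i=1}^n Y_i \end{pmatrix} = \begin{pmatrix} \bar{Y} \\ \vdots \\ \bar{Y} \end{pmatrix}.$$

(d) Here $\hat{Y}_{i(-i)} = \frac{1}{n-1} \sum_{j \neq i} Y_j$ is the prediction of $Y_i$ from the fit that leaves out observation $i$, so

$$t_i = \frac{Y_i - \hat{Y}_{i(-i)}}{s_i} = \frac{Y_i - \frac{1}{n-1} \sum_{j \neq i} Y_j}{\sqrt{\mathrm{Var}\left(Y_i - \hat{Y}_{i(-i)}\right)}} = \frac{Y_i - \frac{1}{n-1} \left( \sum_{j=1}^n Y_j - Y_i \right)}{\sqrt{\mathrm{Var}(Y_i) + \mathrm{Var}\left(\hat{Y}_{i(-i)}\right)}} = \frac{Y_i - \frac{1}{n-1} \left( n\bar{Y} - Y_i \right)}{\sqrt{\sigma^2 + \frac{\sigma^2}{n-1}}} = \frac{Y_i - \frac{1}{n-1} \left( n\bar{Y} - Y_i \right)}{\sqrt{\frac{n \sigma^2}{n-1}}} = \frac{\frac{n}{n-1} \left( Y_i - \bar{Y} \right)}{\sqrt{\frac{n \sigma^2}{n-1}}}.$$

(e)

$$\lim_{Y_i \to \infty} t_i = \lim_{Y_i \to \infty} \frac{\frac{n}{n-1} \left( Y_i - \bar{Y} \right)}{\sqrt{\frac{n \sigma^2}{n-1}}} = \frac{\frac{n}{n-1} \lim_{Y_i \to \infty} \left( Y_i - \bar{Y} \right)}{\sqrt{\frac{n \sigma^2}{n-1}}} = \infty.$$

2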
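Problem 2 [20 points]

Since $H$ is idempotent, $H = HH$:

$$\begin{pmatrix} h_{11} & h_{12} & \cdots & h_{1n} \\ h_{21} & h_{22} & \cdots & h_{2n} \\ \vdots & \vdots & \ddots & \vdots \\ h_{n1} & h_{n2} & \cdots & h_{nn} \end{pmatrix} = \begin{pmatrix} h_{11} & h_{12} & \cdots & h_{1n} \\ h_{21} & h_{22} & \cdots & h_{2n} \\ \vdots & \vdots & \ddots & \vdots \\ h_{n1} & h_{n2} & \cdots & h_{nn} \end{pmatrix} \begin{pmatrix} h_{11} & h_{12} & \cdots & h_{1n} \\ h_{21} & h_{22} & \cdots & h_{2n} \\ \vdots & \vdots & \ddots & \vdots \\ h_{n1} & h_{n2} & \cdots & h_{nn} \end{pmatrix}.$$

So for each diagonal element of $H$ we have (using the symmetry $h_{ij} = h_{ji}$)

$$h_{ii} = h_{ii}^2 + \sum_{j \neq i} h_{ij}^2.$$

We have expressed $h_{ii}$ as a sum of squares, so $h_{ii} \geq 0$. Furthermore, $h_{ii} \geq h_{ii}^2$, so $h_{ii} \leq 1$.

3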
Problem 3 [20 points] Comments omitted. One of the points in data set four produces NaN's because it has leverage 1. See equation (2) of Lecture Notes 20. attach(anscombe) names(anscombe) ## [1] "x1" "x2" "x3" "x4" "y1" "y2" "y3" "y4" Y 4 5 6 7 8 9 0 Residuals 2 0 4 6 8 0 2 4 4 6 8 0 2 4 2 0 4 6 8 0 2 4 4 6 8 0 2 4 2 4 6 8 0 Figure 1: Data Set 1 4
Y 3 4 5 6 7 8 9 Residuals 2.0.5.0 0.5 0.0 0.5.0 4 6 8 0 2 4 4 6 8 0 2 4 2.0.5.0 0.5 0.0 0.5.0 4 6 8 0 2 4 4 6 8 0 2 4 2 4 6 8 0 Figure 2: Data Set 2 5
Y 6 8 0 2 Residuals 0 2 3 4 6 8 0 2 4 4 6 8 0 2 4 0 200 400 600 800 000 200 4 6 8 0 2 4 4 6 8 0 2 4 2 4 6 8 0 Figure 3: Data Set 3 6
Y 6 8 0 2 Residuals.5 0.5 0.0 0.5.0.5 8 0 2 4 6 8 8 0 2 4 6 8.5.0 0.5 0.0 0.5.0.5 8 0 2 4 6 8 8 0 2 4 6 8 2 4 6 8 0 Figure 4: Data Set 4 7
Problem 4 [25 points] Discussions omitted (i) 20 40 60 80 00 40 80 220 y 0.0 0.2 0.4 0.6 0.8.0 20 40 60 80 00 age tri 50 250 350 450 40 80 220 chol 0.0 0.2 0.4 0.6 0.8.0 50 250 350 450 (ii) model <- lm(y ~ age + tri + chol, data = data) 8
(iii) 0 2 3 4 5 20 40 60 80 00 20 40 60 80 00 Age 5 0 5 20 25 Age 0 2 3 4 5 50 200 250 300 350 400 450 50 200 250 300 350 400 450 Triglycerides 5 0 5 20 25 Triglycerides 0 2 3 4 5 40 60 80 200 220 240 40 60 80 200 220 240 Cholesterol 5 0 5 20 25 Cholesterol 9
(iv) model <- lm(y ~ age + tri + chol + I(age^2) + I(tri^2) + I(chol^2), data = data) 0 0 20 30 40 50 20 40 60 80 00 20 40 60 80 00 Age 5 0 5 20 25 Age 0 0 20 30 40 50 50 200 250 300 350 400 450 50 200 250 300 350 400 450 Triglycerides 5 0 5 20 25 Triglycerides 0 0 20 30 40 50 40 60 80 200 220 240 40 60 80 200 220 240 Cholesterol 5 0 5 20 25 Cholesterol 0
which(rstudent(model) > 30) ## 25 ## 25 (v) data2 <- data[-25,] model <- lm(y ~ age + tri + chol + I(age^2) + I(tri^2) + I(chol^2), data = data2) 3 2 0 2 20 30 40 50 60 70 80 20 30 40 50 60 70 80 Age 5 0 5 20 Age 3 2 0 2 50 200 250 300 350 400 450 50 200 250 300 350 400 450 Triglycerides 5 0 5 20 Triglycerides
3 2 0 2 40 60 80 200 220 240 40 60 80 200 220 240 Cholesterol 5 0 5 20 Cholesterol library(knitr) kable(summary(model)$coefficients) Estimate Std. Error t value Pr(> t ) (Intercept) -0.095302 0.00005-862.43588 0.0000000 age 0.000047 0.000005 28.0644703 0.0000000 tri 0.000036 0.0000003 385.7563999 0.0000000 chol 0.000242 0.000002 03.582206 0.0000000 I(ageˆ2) 0.000032 0.0000000 6932.5827948 0.0000000 I(triˆ2) 0.0000000 0.0000000 -.4373 0.288524 I(cholˆ2) 0.0000000 0.0000000-0.272432 0.78857 kable(confint(model, parm = 2:6, level = - 0.05 / 6)) 0.47 % 99.583 % age 0.0000373 0.0000462 tri 0.000028 0.000044 chol 0.000206 0.000278 I(ageˆ2) 0.000032 0.000033 I(triˆ2) 0.0000000 0.0000000 2
Problem 5 [10 points] d = read.table("http://stat.cmu.edu/~larry/secretdata.txt") y = d[,] x = d[,2] x2 = d[,3] x3 = d[,4] x4 = d[,5] (a) 0.5 0.0 0.5 0.5 0.0 0.5 2.0 0.0 0.5 3 y 0.0 0.5.0 x 0.5 0.0 0.5 0.5 x2 0.5.5 x3 3 2 0 2 0.5 0.0 0.5 Figure 5: Secret Data (b) model5 <- lm(y ~ x + x2 + x3 + x4) Summarize the fitted model. 3.5 0.5.5 0.5 x4 0.5.5
4
5
# Figure 6: The above residual plot is a spoof of this scene from The Simpsons
#
# Appendix
#
# Residuals-vs-predictor panels (2 x 2 grid) for the linear fit of `y` on the
# four predictors, saved to "residuals.png".
#
# The text extraction of this listing dropped every digit "1", which left
# unparseable calls (`axis(side =, ...)`, `cex =,`, `c(.5,,2,)`) and silently
# shrank other values (`cex.axis =.25`, `lwd =.2`, `line = -.2`, `+ 0.`).
# The digits are re-inserted below. The listing writes fractions with a
# leading zero everywhere else (`0.5`, `0.25`), so a bare `.25` is read as a
# truncated `1.25`.
#
# NOTE(review): the following restored values are best guesses, because the
# truncated originals were also valid R -- confirm against the original PDF:
#   quartz(height = 12, width = 16)   (extracted as height = 2, width = 6)
#   pch = 19                          (extracted as pch = 9)
# `addtrans("orange", 20)` is kept exactly as extracted (it may have been 120).
#
# `addtrans()` is not defined in this listing; it is presumably provided
# elsewhere. `y`, `x`, `x2`, `x3`, `x4` are the vectors read in Problem 5.

# Draw one residual panel: empty frame, custom axes, dashed grid, zero line,
# translucent + outlined points, and a lowess smooth.
#
# predictor: numeric vector plotted on the horizontal axis.
# resid:     residuals plotted on the vertical axis.
# x_limits:  length-2 numeric vector passed to `xlim`.
draw_residual_panel <- function(predictor, resid, x_limits) {
  x_ticks <- seq(-2.5, 2.5, 0.25)
  y_ticks <- seq(-3, 2.5, 0.5)

  # col = NA sets up the coordinate system without drawing any points, so the
  # grid lines added below end up underneath the points.
  plot(predictor, resid,
       col = NA, pch = 19, axes = FALSE, ylab = "Residuals",
       font.lab = 3, xlab = "", xlim = x_limits, cex.lab = 2)
  axis(side = 1, at = x_ticks, labels = as.character(x_ticks),
       font = 5, cex.axis = 1.25)
  axis(side = 2, at = y_ticks, labels = as.character(y_ticks),
       font = 5, cex.axis = 1.25)

  abline(h = y_ticks, col = "gray75", lty = 2)
  abline(v = x_ticks, col = "gray80", lty = 2)
  abline(0, 0, lty = 2, col = "gray45")

  points(predictor, resid, col = addtrans("orange", 20), pch = 19)
  points(predictor, resid, col = "orange")
  panel.smooth(predictor, resid,
               col = "orange", cex = 1, lwd = 1.2,
               col.smooth = "seagreen", span = 2 / 3, iter = 3)
}

quartz(height = 12, width = 16)
par(mar = c(4, 4.5, 2, 2) + 0.1)
par(oma = c(1.5, 1, 2, 1) + 0.1)
par(mfrow = c(2, 2))
par(bg = "azure")

out <- lm(y ~ x + x2 + x3 + x4)

# Every panel shares the horizontal range of x4, as in the original listing.
for (predictor in list(x, x2, x3, x4)) {
  draw_residual_panel(predictor, residuals(out), x_limits = range(x4))
}

mtext("linear Regression Residuals vs. Predictors",
      side = 3, line = -1.2, outer = TRUE, font = 3, cex = 2)
quartz.save(file = "residuals.png", type = "png")
graphics.off()
# Residuals-vs-fitted plot for the fitted model `out`, saved to "homer.png".
#
# The text extraction of this listing dropped every digit "1", which left
# unparseable calls (`plot(out, which =, ...)`, `axis(side =, ...)`,
# `par(mfrow=c(,))`) and silently shrank other values (`cex.axis =.5`,
# `+ 0.`). The digits are re-inserted below. `which = 1` is the
# residuals-vs-fitted panel of plot.lm, matching the
# points(fitted(out), residuals(out)) overlay drawn afterwards.
#
# NOTE(review): the following restored values are best guesses, because the
# truncated originals were also valid R -- confirm against the original PDF:
#   quartz(height = 12, width = 16)   (extracted as height = 2, width = 6)
#   pch = 19                          (extracted as pch = 9)
# `addtrans("orange", 20)` is kept exactly as extracted (it may have been 120).
#
# `out` (the lm fit) and `addtrans()` are not defined in this listing; they
# must already exist when it runs.

quartz(height = 12, width = 16)
par(mar = c(7, 4.5, 2, 2) + 0.1)
par(bg = "azure")
par(mfrow = c(1, 1))

# col = NA sets up the residuals-vs-fitted frame without drawing points,
# captions, or the default smoother; everything visible is added by hand below.
plot(out, which = 1, col = NA, pch = 19, axes = FALSE, add.smooth = FALSE,
     caption = "", font.lab = 3, sub.caption = "", cex.lab = 2,
     labels.id = NA)

x_ticks <- seq(-3, 2.5, 0.25)
y_ticks <- seq(-3.5, 2.5, 0.5)
axis(side = 1, at = x_ticks, labels = as.character(x_ticks),
     font = 5, cex.axis = 1.5)
axis(side = 2, at = y_ticks, labels = as.character(y_ticks),
     font = 5, cex.axis = 1.5)

# The grid ranges differ slightly from the axis tick ranges, as in the
# original listing.
abline(h = seq(-3, 2.5, 0.5), col = "gray75", lty = 2)
abline(v = seq(-2.5, 2.5, 0.25), col = "gray80", lty = 2)
abline(0, 0, lty = 2, col = "gray45")

points(fitted(out), residuals(out),
       col = addtrans("orange", 20), pch = 19, cex = 0.8)
points(fitted(out), residuals(out), col = "orange", cex = 0.8)

quartz.save(file = "homer.png", type = "png")