wants <- c("car", "leaps")
has <- wants %in% rownames(installed.packages())
if(any(!has)) install.packages(wants[!has])
set.seed(123)
N <- 100
X1 <- rnorm(N, 175, 7)
X2 <- rnorm(N, 30, 8)
X3 <- abs(rnorm(N, 60, 30))
Y <- 0.5*X1 - 0.3*X2 - 0.4*X3 + 10 + rnorm(N, 0, 7)
dfRegr <- data.frame(X1, X2, X3, Y)
Call:
lm(formula = Y ~ X1 + X2, data = dfRegr)
Coefficients:
(Intercept) X1 X2
-47.5012 0.6805 -0.2965
Call:
lm(formula = scale(Y) ~ scale(X1) + scale(X2), data = dfRegr)
Coefficients:
(Intercept) scale(X1) scale(X2)
5.053e-16 2.972e-01 -1.568e-01
Error in scatter3d(Y ~ X1 + X2, fill = FALSE, data = dfRegr): rgl package missing
(Intercept) X1 X2
-47.5011799 0.6804857 -0.2964717
1 2 3 4 5 6
-28.853937 -18.923898 -3.540015 -12.154867 3.585806 7.677999
1 2 3 4 5 6
61.70483 60.98398 70.69952 63.84983 65.56255 70.96601
Call:
lm(formula = Y ~ X1 + X2 + X3, data = dfRegr)
Coefficients:
(Intercept) X1 X2 X3
19.1588 0.4445 -0.2596 -0.4134
Call:
lm(formula = Y ~ X2 + X3, data = dfRegr)
Coefficients:
(Intercept) X2 X3
98.5347 -0.2763 -0.4261
Call:
lm(formula = Y ~ X1, data = dfRegr)
Coefficients:
(Intercept) X1
-59.2628 0.6983
[1] 0.7545767
[1] 0.7469072
[1] 7.360588
[1] 815.6499
[1] 2.0000 529.8622
[1] 2.0000 535.0725
cv.glm()
function from package boot
, see crossvalidation
Call:
lm(formula = Y ~ X1 + X2, data = dfRegr)
Residuals:
Min 1Q Median 3Q Max
-31.857 -6.765 1.967 9.270 37.943
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -47.5012 39.0472 -1.217 0.22674
X1 0.6805 0.2187 3.112 0.00244 **
X2 -0.2965 0.1806 -1.641 0.10395
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 13.89 on 97 degrees of freedom
Multiple R-squared: 0.1175, Adjusted R-squared: 0.09931
F-statistic: 6.458 on 2 and 97 DF, p-value: 0.002328
2.5 % 97.5 %
(Intercept) -124.9990783 29.99671860
X1 0.2464808 1.11449071
X2 -0.6549521 0.06200877
(Intercept) X1 X2
(Intercept) 1524.684429 -8.455382007 -1.294237738
X1 -8.455382 0.047817791 0.001956354
X2 -1.294238 0.001956354 0.032623539
Single term additions
Model:
Y ~ X1
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 19222 529.86
X2 1 519.5 18702 529.12 2.6942 0.104
X3 1 13622.6 5599 408.52 236.0033 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Analysis of Variance Table
Model 1: Y ~ X1
Model 2: Y ~ X1 + X2 + X3
Res.Df RSS Df Sum of Sq F Pr(>F)
1 98 19221.6
2 96 5201.1 2 14020 129.39 < 2.2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
As an alternative, see package lmSubsets
, also described here.
GNP.deflator GNP Unemployed Armed.Forces Population Year Employed
1947 83.0 234.289 235.6 159.0 107.608 1947 60.323
1948 88.5 259.426 232.5 145.6 108.632 1948 61.122
1949 88.2 258.054 368.2 161.6 109.773 1949 60.171
1950 89.5 284.599 335.1 165.0 110.929 1950 61.187
1951 96.2 328.975 209.9 309.9 112.075 1951 63.221
1952 98.1 346.999 193.2 359.4 113.270 1952 63.639
library(leaps)
subs <- regsubsets(GNP.deflator ~ ., data=longley)
summary(subs, matrix.logical=TRUE)
Subset selection object
Call: regsubsets.formula(GNP.deflator ~ ., data = longley)
6 Variables (and intercept)
Forced in Forced out
GNP FALSE FALSE
Unemployed FALSE FALSE
Armed.Forces FALSE FALSE
Population FALSE FALSE
Year FALSE FALSE
Employed FALSE FALSE
1 subsets of each size up to 6
Selection Algorithm: exhaustive
GNP Unemployed Armed.Forces Population Year Employed
1 ( 1 ) TRUE FALSE FALSE FALSE FALSE FALSE
2 ( 1 ) FALSE FALSE TRUE FALSE TRUE FALSE
3 ( 1 ) TRUE TRUE FALSE TRUE FALSE FALSE
4 ( 1 ) TRUE TRUE TRUE TRUE FALSE FALSE
5 ( 1 ) TRUE TRUE TRUE TRUE TRUE FALSE
6 ( 1 ) TRUE TRUE TRUE TRUE TRUE TRUE
Xmat <- data.matrix(subset(longley, select=c("GNP", "Unemployed", "Population", "Year")))
(leapFits <- leaps(Xmat, longley$GNP.deflator, method="Cp"))
$which
1 2 3 4
1 TRUE FALSE FALSE FALSE
1 FALSE FALSE FALSE TRUE
1 FALSE FALSE TRUE FALSE
1 FALSE TRUE FALSE FALSE
2 FALSE TRUE FALSE TRUE
2 FALSE FALSE TRUE TRUE
2 TRUE FALSE FALSE TRUE
2 TRUE FALSE TRUE FALSE
2 TRUE TRUE FALSE FALSE
2 FALSE TRUE TRUE FALSE
3 TRUE TRUE TRUE FALSE
3 TRUE FALSE TRUE TRUE
3 FALSE TRUE TRUE TRUE
3 TRUE TRUE FALSE TRUE
4 TRUE TRUE TRUE TRUE
$label
[1] "(Intercept)" "1" "2" "3" "4"
$size
[1] 2 2 2 2 3 3 3 3 3 3 4 4 4 4 5
$Cp
[1] 9.934664 11.077014 42.000853 793.075625 8.960981 9.177695
[7] 9.430473 10.982967 10.985247 37.402418 3.000512 5.963360
[13] 8.781825 10.821894 5.000000
plot(leapFits$size, leapFits$Cp, xlab="model size", pch=20, col="blue",
ylab="Mallows' Cp", main="Mallows' Cp against model size")
abline(a=0, b=1)
X1new <- c(177, 150, 192, 189, 181)
dfNew <- data.frame(X1=X1new)
predict(fit1, dfNew, interval="prediction", level=0.95)
fit lwr upr
1 64.33003 36.39261 92.26744
2 45.47689 15.38202 75.57176
3 74.80400 45.97114 103.63686
4 72.70920 44.17348 101.24492
5 67.12309 39.09370 95.15248
hOrd <- order(X1)
par(lend=2)
plot(X1, Y, pch=20, xlab="Predictor", ylab="Dependent variable and prediction",
xaxs="i", main="Data and regression prediction")
polygon(c(X1[hOrd], X1[rev(hOrd)]),
c(predOrg[hOrd, "lwr"], predOrg[rev(hOrd), "upr"]),
border=NA, col=rgb(0.7, 0.7, 0.7, 0.6))
abline(fit1, col="black")
legend(x="bottomright", legend=c("Data", "prediction", "confidence region"),
pch=c(20, NA, NA), lty=c(NA, 1, 1), lwd=c(NA, 1, 8),
col=c("black", "blue", "gray"))
R markdown - markdown - R code - all posts