# Regression diagnostics

## TODO

• link to regression, regressionLogistic

## Install required packages

wants <- c("car", "lmtest", "mvoutlier", "perturb", "robustbase", "tseries")
has   <- wants %in% rownames(installed.packages())
if(any(!has)) install.packages(wants[!has])

## Extreme values and outliers

### Univariate assessment of outliers

set.seed(123)
N  <- 100
X1 <- rnorm(N, 175, 7)
X2 <- rnorm(N,  30, 8)
X3 <- 0.3*X1 - 0.2*X2 + rnorm(N, 0, 5)
Y  <- 0.5*X1 - 0.3*X2 - 0.4*X3 + 10 + rnorm(N, 0, 5)
dfRegr <- data.frame(X1, X2, X3, Y)
library(robustbase)
xyMat <- data.matrix(dfRegr)
robXY <- covMcd(xyMat)
XYz   <- scale(xyMat, center=robXY$center, scale=sqrt(diag(robXY$cov)))
summary(XYz)
       X1                 X2                 X3                Y
Min.   :-2.53818   Min.   :-1.96554   Min.   :-2.4625   Min.   :-2.45112
1st Qu.:-0.63344   1st Qu.:-0.68918   1st Qu.:-0.5474   1st Qu.:-0.61660
Median :-0.05045   Median :-0.10278   Median : 0.1471   Median :-0.04010
Mean   :-0.02039   Mean   : 0.01779   Mean   : 0.1410   Mean   :-0.04869
3rd Qu.: 0.61065   3rd Qu.: 0.60431   3rd Qu.: 0.7874   3rd Qu.: 0.60444
Max.   : 2.17984   Max.   : 3.43113   Max.   : 2.9812   Max.   : 2.45654  

### Multivariate assessment of outliers

Mahalanobis distance with robust estimate for the covariance matrix

mahaSq <- mahalanobis(xyMat, center=robXY$center, cov=robXY$cov)
summary(sqrt(mahaSq))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
0.6179  1.3452  1.7461  1.8444  2.2515  3.9331 

Nonparametric multivariate outlier detection with package mvoutlier

library(mvoutlier)
aqRes <- aq.plot(xyMat)
Projection to the first and second robust principal components.
Proportion of total variation (explained variance): 0.63336

Where any outliers found?

## Multicollinearity

### Pairwise correlations between predictor variables

X   <- data.matrix(subset(dfRegr, select=c("X1", "X2", "X3")))
(Rx <- cor(X))
            X1          X2         X3
X1  1.00000000 -0.04953215  0.2700393
X2 -0.04953215  1.00000000 -0.2929049
X3  0.27003928 -0.29290486  1.0000000

### Variance inflation factor

library(car)
vif(fit)
      X1       X2       X3
1.079770 1.094974 1.178203 

### Condition indexes

$$\kappa$$

fitScl <- lm(scale(Y) ~ scale(X1) + scale(X2) + scale(X3), data=dfRegr)
kappa(fitScl, exact=TRUE)
[1] 1.508749
library(perturb)
colldiag(fit, scale=TRUE, center=TRUE)
Condition
Index   Variance Decomposition Proportions
X1    X2    X3
1  1.000 0.162 0.181 0.280
2  1.224 0.528 0.440 0.000
3  1.509 0.311 0.379 0.720

### Using package perturb

pRes <- perturb(fit, pvars=c("X1", "X2", "X3"), prange=c(1, 1, 1))
summary(pRes)
$formula [1] "Y ~ X1 + X2 + X3"$pvars
[1] "X1" "X2" "X3"

$prange [1] 1 1 1$ptrans2
character(0)

$formula2 [1] "Y ~ X1.1 + X2.1 + X3.1"$distribution
[1] "normal"

$summ mean s.d. min max (Intercept) 17.4182516 2.74597430 10.5755610 23.9483451 X1 0.4616296 0.01774141 0.4149206 0.5012884 X2 -0.2734070 0.01414048 -0.3074216 -0.2358462 X3 -0.4342440 0.02780265 -0.5184953 -0.3792446$dec.places
[1] 3

$full [1] FALSE$dots
[1] ""

attr(,"class")
[1] "summary.perturb"

## Detach (automatically) loaded packages (if possible)

try(detach(package:tseries))
try(detach(package:lmtest))
try(detach(package:zoo))
try(detach(package:perturb))
try(detach(package:mvoutlier))
try(detach(package:robustbase))
try(detach(package:sgeostat))
try(detach(package:car))
try(detach(package:carData))