is.na()
In R, missing values are coded as NA (not available).
vec <- c(2, 5, 7)
vec[5]
[1] NA
(vec1 <- c(10, 20, NA, 40, 50, NA))
[1] 10 20 NA 40 50 NA
length(vec1)
[1] 6
Identify missing values with is.na()
is.na(vec1)
[1] FALSE FALSE TRUE FALSE FALSE TRUE
any(is.na(vec1))
[1] TRUE
which(is.na(vec1))
[1] 3 6
sum(is.na(vec1))
[1] 2
vec2 <- c(NA, 7, 9, 10, 1, 8)
(matNA <- rbind(vec1, vec2))
[,1] [,2] [,3] [,4] [,5] [,6]
vec1 10 20 NA 40 50 NA
vec2 NA 7 9 10 1 8
is.na(matNA)
[,1] [,2] [,3] [,4] [,5] [,6]
vec1 FALSE FALSE TRUE FALSE FALSE TRUE
vec2 TRUE FALSE FALSE FALSE FALSE FALSE
NA in different situations
LETTERS[c(1, NA, 3)]
[1] "A" NA "C"
factor(LETTERS[c(1, NA, 3)])
[1] A <NA> C
Levels: A C
factor(LETTERS[c(1, NA, 3)], exclude=NULL)
[1] A <NA> C
Levels: A C <NA>
NA & TRUE
[1] NA
TRUE | NA
[1] TRUE
vecNA <- c(-3, 2, 0, NA, -7, 5)
(logIdx <- vecNA > 0)
[1] FALSE TRUE FALSE NA FALSE TRUE
vecNA[logIdx]
[1] 2 NA 5
vecNA[which(logIdx)]
[1] 2 5
NA
When data is entered in other applications (spreadsheets, SPSS, etc.), missing values are often coded as a reserved numeric value, e.g., 99 or 9999. These values need to be replaced with NA.
vec <- c(30, 25, 23, 21, -999, 999)
is.na(vec) <- vec %in% c(-999, 999)
vec
[1] 30 25 23 21 NA NA
(mat <- matrix(c(30, 25, 23, 21, -999, 999), nrow=2, ncol=3))
[,1] [,2] [,3]
[1,] 30 23 -999
[2,] 25 21 999
is.na(mat) <- mat %in% c(-999, 999)
mat
[,1] [,2] [,3]
[1,] 30 23 NA
[2,] 25 21 NA
vecNA <- c(-3, 2, 0, NA, -7, 5)
mean(vecNA)
[1] NA
goodIdx <- !is.na(vecNA)
mean(vecNA[goodIdx])
[1] -0.6
sd(na.omit(vecNA))
[1] 4.615192
sum(vecNA, na.rm=TRUE)
[1] -3
ageNA <- c(18, NA, 27, 22)
DV1 <- c(NA, 1, 5, -3)
DV2 <- c(9, 4, 2, 7)
(matNA <- cbind(ageNA, DV1, DV2))
ageNA DV1 DV2
[1,] 18 NA 9
[2,] NA 1 4
[3,] 27 5 2
[4,] 22 -3 7
apply(matNA, 1, FUN=mean)
[1] NA NA 11.333333 8.666667
apply(matNA, 1, FUN=mean, na.rm=TRUE)
[1] 13.500000 2.500000 11.333333 8.666667
(rowNAidx <- apply(is.na(matNA), 1, any))
[1] TRUE TRUE FALSE FALSE
matNA[!rowNAidx, ]
ageNA DV1 DV2
[1,] 27 5 2
[2,] 22 -3 7
na.omit(matNA)
ageNA DV1 DV2
[1,] 27 5 2
[2,] 22 -3 7
attr(,"na.action")
[1] 2 1
attr(,"class")
[1] "omit"
colMeans(na.omit(matNA))
ageNA DV1 DV2
24.5 1.0 4.5
cov(matNA, use="complete.obs")
ageNA DV1 DV2
ageNA 12.5 20 -12.5
DV1 20.0 32 -20.0
DV2 -12.5 -20 12.5
all(cov(matNA, use="complete.obs") == cov(na.omit(matNA)))
[1] TRUE
Set casewise deletion as a permanent option for statistical functions (another choice is "na.fail").
options(na.action="na.omit")
rowMeans(matNA)
[1] NA NA 11.333333 8.666667
rowMeans(mat, na.rm=TRUE)
[1] 26.5 23.0
cov(matNA, use="pairwise.complete.obs")
ageNA DV1 DV2
ageNA 20.33333 20 -16.000000
DV1 20.00000 16 -10.000000
DV2 -16.00000 -10 9.666667
Multiple imputation is supported by functions in packages Hmisc, Amelia II, and mice.
R markdown - markdown - R code - all posts