Transform data frames

TODO

  • link to strings for grep(), dfSplitMerge, dfReshape

Add variables to a data frame

set.seed(123)
N      <- 12
sex    <- sample(c("f", "m"), N, replace=TRUE)
group  <- sample(rep(c("CG", "WL", "T"), 4), N, replace=FALSE)
age    <- sample(18:35, N, replace=TRUE)
IQ     <- round(rnorm(N, mean=100, sd=15))
rating <- round(runif(N, min=0, max=6))
(myDf1 <- data.frame(id=1:N, sex, group, age, IQ, rating))
   id sex group age  IQ rating
1   1   f     T  29 111      4
2   2   m    CG  30  93      1
3   3   f    WL  27  84      2
4   4   m     T  28  97      2
5   5   m    CG  23  85      5
6   6   f    CG  20  89      3
7   7   m    WL  35  91      5
8   8   m    WL  34  75      5
9   9   m    CG  30 113      5
10 10   f     T  32 102      3
11 11   m     T  18  83      5
12 12   f    WL  26 119      4
isSingle <- sample(c(TRUE, FALSE), nrow(myDf1), replace=TRUE)
myDf2    <- myDf1
myDf2$isSingle1    <- isSingle
myDf2["isSingle2"] <- isSingle
myDf3 <- cbind(myDf1, isSingle)
head(myDf3)
  id sex group age  IQ rating isSingle
1  1   f     T  29 111      4    FALSE
2  2   m    CG  30  93      1     TRUE
3  3   f    WL  27  84      2     TRUE
4  4   m     T  28  97      2     TRUE
5  5   m    CG  23  85      5     TRUE
6  6   f    CG  20  89      3    FALSE
myDf4 <- transform(myDf3, rSq=rating^2)
head(myDf4)
  id sex group age  IQ rating isSingle rSq
1  1   f     T  29 111      4    FALSE  16
2  2   m    CG  30  93      1     TRUE   1
3  3   f    WL  27  84      2     TRUE   4
4  4   m     T  28  97      2     TRUE   4
5  5   m    CG  23  85      5     TRUE  25
6  6   f    CG  20  89      3    FALSE   9

Remove variables from a data frame

Remove one variable.

dfTemp <- myDf1
dfTemp$group <- NULL
head(dfTemp)
  id sex age  IQ rating
1  1   f  29 111      4
2  2   m  30  93      1
3  3   f  27  84      2
4  4   m  28  97      2
5  5   m  23  85      5
6  6   f  20  89      3

Remove multiple variables.

delVars <- c("sex", "IQ")
dfTemp[delVars] <- list(NULL)
head(dfTemp)
  id age rating
1  1  29      4
2  2  30      1
3  3  27      2
4  4  28      2
5  5  23      5
6  6  20      3

Sort data frames

(idx1 <- order(myDf1$rating))
 [1]  2  3  4  6 10  1 12  5  7  8  9 11
myDf1[idx1, ]
   id sex group age  IQ rating
2   2   m    CG  30  93      1
3   3   f    WL  27  84      2
4   4   m     T  28  97      2
6   6   f    CG  20  89      3
10 10   f     T  32 102      3
1   1   f     T  29 111      4
12 12   f    WL  26 119      4
5   5   m    CG  23  85      5
7   7   m    WL  35  91      5
8   8   m    WL  34  75      5
9   9   m    CG  30 113      5
11 11   m     T  18  83      5
(idx2 <- order(myDf1$group, myDf1$IQ))
 [1]  5  6  2  9 11  4 10  1  8  3  7 12
myDf1[idx2, ]
   id sex group age  IQ rating
5   5   m    CG  23  85      5
6   6   f    CG  20  89      3
2   2   m    CG  30  93      1
9   9   m    CG  30 113      5
11 11   m     T  18  83      5
4   4   m     T  28  97      2
10 10   f     T  32 102      3
1   1   f     T  29 111      4
8   8   m    WL  34  75      5
3   3   f    WL  27  84      2
7   7   m    WL  35  91      5
12 12   f    WL  26 119      4
(idx3 <- order(myDf1$group, -myDf1$rating))
 [1]  5  9  6  2 11  1 10  4  7  8 12  3
myDf1[idx3, ]
   id sex group age  IQ rating
5   5   m    CG  23  85      5
9   9   m    CG  30 113      5
6   6   f    CG  20  89      3
2   2   m    CG  30  93      1
11 11   m     T  18  83      5
1   1   f     T  29 111      4
10 10   f     T  32 102      3
4   4   m     T  28  97      2
7   7   m    WL  35  91      5
8   8   m    WL  34  75      5
12 12   f    WL  26 119      4
3   3   f    WL  27  84      2

Select subsets of data

Select cases and variables using index vectors

(idxLog <- myDf1$sex == "f")
 [1]  TRUE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE
[12]  TRUE
(idxNum <- which(idxLog))
[1]  1  3  6 10 12
myDf1[idxNum, ]
   id sex group age  IQ rating
1   1   f     T  29 111      4
3   3   f    WL  27  84      2
6   6   f    CG  20  89      3
10 10   f     T  32 102      3
12 12   f    WL  26 119      4
(idx2 <- (myDf1$sex == "m") & (myDf1$rating > 2))
 [1] FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE  TRUE  TRUE FALSE  TRUE
[12] FALSE
myDf1[which(idx2), ]
   id sex group age  IQ rating
5   5   m    CG  23  85      5
7   7   m    WL  35  91      5
8   8   m    WL  34  75      5
9   9   m    CG  30 113      5
11 11   m     T  18  83      5
(idx3 <- (myDf1$IQ < 90) | (myDf1$IQ > 110))
 [1]  TRUE FALSE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE
[12]  TRUE
myDf1[which(idx3), ]
   id sex group age  IQ rating
1   1   f     T  29 111      4
3   3   f    WL  27  84      2
5   5   m    CG  23  85      5
6   6   f    CG  20  89      3
8   8   m    WL  34  75      5
9   9   m    CG  30 113      5
11 11   m     T  18  83      5
12 12   f    WL  26 119      4
myDf1[1:3, c("group", "IQ")]
  group  IQ
1     T 111
2    CG  93
3    WL  84
myDf1[1:3, 2:4]
  sex group age
1   f     T  29
2   m    CG  30
3   f    WL  27
dfTemp <- myDf1
(names(dfTemp) <- paste(rep(c("A", "B"), each=3), 100:102, sep=""))
[1] "A100" "A101" "A102" "B100" "B101" "B102"
(colIdx <- grep("^B.*$", names(dfTemp)))
[1] 4 5 6
dfTemp[1:3, colIdx]
  B100 B101 B102
1   29  111    4
2   30   93    1
3   27   84    2

See ?Extract for help on this topic.

Select cases and variables using subset()

subset(myDf1, sex == "f")
   id sex group age  IQ rating
1   1   f     T  29 111      4
3   3   f    WL  27  84      2
6   6   f    CG  20  89      3
10 10   f     T  32 102      3
12 12   f    WL  26 119      4
subset(myDf1, sex == "f", select=-2)
   id group age  IQ rating
1   1     T  29 111      4
3   3    WL  27  84      2
6   6    CG  20  89      3
10 10     T  32 102      3
12 12    WL  26 119      4
subset(myDf1, (sex == "m") & (rating > 2))
   id sex group age  IQ rating
5   5   m    CG  23  85      5
7   7   m    WL  35  91      5
8   8   m    WL  34  75      5
9   9   m    CG  30 113      5
11 11   m     T  18  83      5
subset(myDf1, (IQ < 90) | (IQ > 110))
   id sex group age  IQ rating
1   1   f     T  29 111      4
3   3   f    WL  27  84      2
5   5   m    CG  23  85      5
6   6   f    CG  20  89      3
8   8   m    WL  34  75      5
9   9   m    CG  30 113      5
11 11   m     T  18  83      5
12 12   f    WL  26 119      4
subset(myDf1, group %in% c("CG", "WL"))
   id sex group age  IQ rating
2   2   m    CG  30  93      1
3   3   f    WL  27  84      2
5   5   m    CG  23  85      5
6   6   f    CG  20  89      3
7   7   m    WL  35  91      5
8   8   m    WL  34  75      5
9   9   m    CG  30 113      5
12 12   f    WL  26 119      4

Select variables based on conditions with Filter()

Only numerical variables.

myDfNum <- Filter(is.numeric, myDf1)
head(myDfNum)
  id age  IQ rating
1  1  29 111      4
2  2  30  93      1
3  3  27  84      2
4  4  28  97      2
5  5  23  85      5
6  6  20  89      3

Remove duplicated cases

myDfDouble <- rbind(myDf1, myDf1[sample(1:nrow(myDf1), 4), ])
duplicated(myDfDouble)
 [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[12] FALSE  TRUE  TRUE  TRUE  TRUE
myDfUnique <- unique(myDfDouble)

Treat missing values

myDfNA           <- myDf1
myDfNA$IQ[4]     <- NA
myDfNA$rating[5] <- NA
is.na(myDfNA)[1:5, c("age", "IQ", "rating")]
       age    IQ rating
[1,] FALSE FALSE  FALSE
[2,] FALSE FALSE  FALSE
[3,] FALSE FALSE  FALSE
[4,] FALSE  TRUE  FALSE
[5,] FALSE FALSE   TRUE
apply(is.na(myDfNA), 2, any)
    id    sex  group    age     IQ rating 
 FALSE  FALSE  FALSE  FALSE   TRUE   TRUE 
complete.cases(myDfNA)
 [1]  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[12]  TRUE
subset(myDfNA, !complete.cases(myDfNA))
  id sex group age IQ rating
4  4   m     T  28 NA      2
5  5   m    CG  23 85     NA
head(na.omit(myDfNA))
  id sex group age  IQ rating
1  1   f     T  29 111      4
2  2   m    CG  30  93      1
3  3   f    WL  27  84      2
6  6   f    CG  20  89      3
7  7   m    WL  35  91      5
8  8   m    WL  34  75      5

Useful packages

Package dplyr provides very handy functions for the split-apply-combine approach to aggregating data frames. In order to work with data frames like with a database, use sqldf. You can then use standard SQL commands to select data. data.table provides an alternative (compatible) class of data frames with better performance for large amounts of data.

Get the article source from GitHub

R markdown - markdown - R code - all posts