grep(), dfSplitMerge, dfReshapeset.seed(123)
N <- 12
sex <- sample(c("f", "m"), N, replace=TRUE)
group <- sample(rep(c("CG", "WL", "T"), 4), N, replace=FALSE)
age <- sample(18:35, N, replace=TRUE)
IQ <- round(rnorm(N, mean=100, sd=15))
rating <- round(runif(N, min=0, max=6))
(myDf1 <- data.frame(id=1:N, sex, group, age, IQ, rating)) id sex group age IQ rating
1 1 f T 29 111 4
2 2 m CG 30 93 1
3 3 f WL 27 84 2
4 4 m T 28 97 2
5 5 m CG 23 85 5
6 6 f CG 20 89 3
7 7 m WL 35 91 5
8 8 m WL 34 75 5
9 9 m CG 30 113 5
10 10 f T 32 102 3
11 11 m T 18 83 5
12 12 f WL 26 119 4isSingle <- sample(c(TRUE, FALSE), nrow(myDf1), replace=TRUE)
myDf2 <- myDf1
myDf2$isSingle1 <- isSingle
myDf2["isSingle2"] <- isSingle
myDf3 <- cbind(myDf1, isSingle)
head(myDf3) id sex group age IQ rating isSingle
1 1 f T 29 111 4 FALSE
2 2 m CG 30 93 1 TRUE
3 3 f WL 27 84 2 TRUE
4 4 m T 28 97 2 TRUE
5 5 m CG 23 85 5 TRUE
6 6 f CG 20 89 3 FALSEmyDf4 <- transform(myDf3, rSq=rating^2)
head(myDf4) id sex group age IQ rating isSingle rSq
1 1 f T 29 111 4 FALSE 16
2 2 m CG 30 93 1 TRUE 1
3 3 f WL 27 84 2 TRUE 4
4 4 m T 28 97 2 TRUE 4
5 5 m CG 23 85 5 TRUE 25
6 6 f CG 20 89 3 FALSE 9Remove one variable.
dfTemp <- myDf1
dfTemp$group <- NULL
head(dfTemp) id sex age IQ rating
1 1 f 29 111 4
2 2 m 30 93 1
3 3 f 27 84 2
4 4 m 28 97 2
5 5 m 23 85 5
6 6 f 20 89 3Remove multiple variables.
delVars <- c("sex", "IQ")
dfTemp[delVars] <- list(NULL)
head(dfTemp) id age rating
1 1 29 4
2 2 30 1
3 3 27 2
4 4 28 2
5 5 23 5
6 6 20 3(idx1 <- order(myDf1$rating)) [1] 2 3 4 6 10 1 12 5 7 8 9 11myDf1[idx1, ] id sex group age IQ rating
2 2 m CG 30 93 1
3 3 f WL 27 84 2
4 4 m T 28 97 2
6 6 f CG 20 89 3
10 10 f T 32 102 3
1 1 f T 29 111 4
12 12 f WL 26 119 4
5 5 m CG 23 85 5
7 7 m WL 35 91 5
8 8 m WL 34 75 5
9 9 m CG 30 113 5
11 11 m T 18 83 5(idx2 <- order(myDf1$group, myDf1$IQ)) [1] 5 6 2 9 11 4 10 1 8 3 7 12myDf1[idx2, ] id sex group age IQ rating
5 5 m CG 23 85 5
6 6 f CG 20 89 3
2 2 m CG 30 93 1
9 9 m CG 30 113 5
11 11 m T 18 83 5
4 4 m T 28 97 2
10 10 f T 32 102 3
1 1 f T 29 111 4
8 8 m WL 34 75 5
3 3 f WL 27 84 2
7 7 m WL 35 91 5
12 12 f WL 26 119 4(idx3 <- order(myDf1$group, -myDf1$rating)) [1] 5 9 6 2 11 1 10 4 7 8 12 3myDf1[idx3, ] id sex group age IQ rating
5 5 m CG 23 85 5
9 9 m CG 30 113 5
6 6 f CG 20 89 3
2 2 m CG 30 93 1
11 11 m T 18 83 5
1 1 f T 29 111 4
10 10 f T 32 102 3
4 4 m T 28 97 2
7 7 m WL 35 91 5
8 8 m WL 34 75 5
12 12 f WL 26 119 4
3 3 f WL 27 84 2(idxLog <- myDf1$sex == "f") [1] TRUE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE
[12] TRUE(idxNum <- which(idxLog))[1] 1 3 6 10 12myDf1[idxNum, ] id sex group age IQ rating
1 1 f T 29 111 4
3 3 f WL 27 84 2
6 6 f CG 20 89 3
10 10 f T 32 102 3
12 12 f WL 26 119 4(idx2 <- (myDf1$sex == "m") & (myDf1$rating > 2)) [1] FALSE FALSE FALSE FALSE TRUE FALSE TRUE TRUE TRUE FALSE TRUE
[12] FALSEmyDf1[which(idx2), ] id sex group age IQ rating
5 5 m CG 23 85 5
7 7 m WL 35 91 5
8 8 m WL 34 75 5
9 9 m CG 30 113 5
11 11 m T 18 83 5(idx3 <- (myDf1$IQ < 90) | (myDf1$IQ > 110)) [1] TRUE FALSE TRUE FALSE TRUE TRUE FALSE TRUE TRUE FALSE TRUE
[12] TRUEmyDf1[which(idx3), ] id sex group age IQ rating
1 1 f T 29 111 4
3 3 f WL 27 84 2
5 5 m CG 23 85 5
6 6 f CG 20 89 3
8 8 m WL 34 75 5
9 9 m CG 30 113 5
11 11 m T 18 83 5
12 12 f WL 26 119 4myDf1[1:3, c("group", "IQ")] group IQ
1 T 111
2 CG 93
3 WL 84myDf1[1:3, 2:4] sex group age
1 f T 29
2 m CG 30
3 f WL 27dfTemp <- myDf1
(names(dfTemp) <- paste(rep(c("A", "B"), each=3), 100:102, sep=""))[1] "A100" "A101" "A102" "B100" "B101" "B102"(colIdx <- grep("^B.*$", names(dfTemp)))[1] 4 5 6dfTemp[1:3, colIdx] B100 B101 B102
1 29 111 4
2 30 93 1
3 27 84 2See ?Extract for help on this topic.
subset()subset(myDf1, sex == "f") id sex group age IQ rating
1 1 f T 29 111 4
3 3 f WL 27 84 2
6 6 f CG 20 89 3
10 10 f T 32 102 3
12 12 f WL 26 119 4subset(myDf1, sex == "f", select=-2) id group age IQ rating
1 1 T 29 111 4
3 3 WL 27 84 2
6 6 CG 20 89 3
10 10 T 32 102 3
12 12 WL 26 119 4subset(myDf1, (sex == "m") & (rating > 2)) id sex group age IQ rating
5 5 m CG 23 85 5
7 7 m WL 35 91 5
8 8 m WL 34 75 5
9 9 m CG 30 113 5
11 11 m T 18 83 5subset(myDf1, (IQ < 90) | (IQ > 110)) id sex group age IQ rating
1 1 f T 29 111 4
3 3 f WL 27 84 2
5 5 m CG 23 85 5
6 6 f CG 20 89 3
8 8 m WL 34 75 5
9 9 m CG 30 113 5
11 11 m T 18 83 5
12 12 f WL 26 119 4subset(myDf1, group %in% c("CG", "WL")) id sex group age IQ rating
2 2 m CG 30 93 1
3 3 f WL 27 84 2
5 5 m CG 23 85 5
6 6 f CG 20 89 3
7 7 m WL 35 91 5
8 8 m WL 34 75 5
9 9 m CG 30 113 5
12 12 f WL 26 119 4Filter()Only numerical variables.
myDfNum <- Filter(is.numeric, myDf1)
head(myDfNum) id age IQ rating
1 1 29 111 4
2 2 30 93 1
3 3 27 84 2
4 4 28 97 2
5 5 23 85 5
6 6 20 89 3myDfDouble <- rbind(myDf1, myDf1[sample(1:nrow(myDf1), 4), ])
duplicated(myDfDouble) [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[12] FALSE TRUE TRUE TRUE TRUEmyDfUnique <- unique(myDfDouble)myDfNA <- myDf1
myDfNA$IQ[4] <- NA
myDfNA$rating[5] <- NAis.na(myDfNA)[1:5, c("age", "IQ", "rating")] age IQ rating
[1,] FALSE FALSE FALSE
[2,] FALSE FALSE FALSE
[3,] FALSE FALSE FALSE
[4,] FALSE TRUE FALSE
[5,] FALSE FALSE TRUEapply(is.na(myDfNA), 2, any) id sex group age IQ rating
FALSE FALSE FALSE FALSE TRUE TRUE complete.cases(myDfNA) [1] TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
[12] TRUEsubset(myDfNA, !complete.cases(myDfNA)) id sex group age IQ rating
4 4 m T 28 NA 2
5 5 m CG 23 85 NAhead(na.omit(myDfNA)) id sex group age IQ rating
1 1 f T 29 111 4
2 2 m CG 30 93 1
3 3 f WL 27 84 2
6 6 f CG 20 89 3
7 7 m WL 35 91 5
8 8 m WL 34 75 5Package dplyr provides very handy functions for the split-apply-combine approach to aggregating data frames. In order to work with data frames like with a database, use sqldf. You can then use standard SQL commands to select data. data.table provides an alternative (compatible) class of data frames with better performance for large amounts of data.
R markdown - markdown - R code - all posts