# Fragment preserved from the original text (presumably cross-references
# to related topics -- confirm against the source page):
# grep(), dfSplitMerge, dfReshape

# Simulate a small example data set with one row per person.
# The seed is fixed first so that every random draw below is reproducible;
# the order of the draws must not change, or the data will differ.
set.seed(123)
N <- 12  # number of simulated observations

# Two-level factor, drawn with replacement
sex <- factor(sample(c("f", "m"), N, replace = TRUE), levels = c("f", "m"))

# Random permutation of 4 x 3 group labels, so each group occurs 4 times
group <- factor(sample(rep(c("CG", "WL", "T"), 4), N, replace = FALSE),
                levels = c("CG", "WL", "T"))

# Integer ages from 18 to 35, drawn with replacement
age <- sample(18:35, N, replace = TRUE)

# Normally distributed scores (mean 100, sd 15), rounded to whole numbers
IQ <- round(rnorm(N, mean = 100, sd = 15))

# Uniform draws on [0, 6], rounded to whole numbers
rating <- round(runif(N, min = 0, max = 6))

# The outer parentheses make R print the data frame right after assignment
(myDf1 <- data.frame(id = 1:N, sex, group, age, IQ, rating))
id sex group age IQ rating
1 1 f T 25 95 5
2 2 f T 24 84 5
3 3 f CG 27 99 3
4 4 m WL 26 116 5
5 5 f T 21 98 4
6 6 m WL 31 83 4
7 7 m CG 34 88 0
8 8 m CG 28 110 3
9 9 f T 24 95 1
10 10 f WL 29 80 2
11 11 m CG 32 91 4
12 12 m WL 27 98 2
[1] "id" "sex" "group" "age" "IQ" "rating"
[1] "id" "sex" "fac" "age" "IQ" "rating"
[1] "id" "sex" "group" "age" "IQ" "rating"
group IQ
1 T 95
2 T 84
3 CG 99
4 WL 116
5 T 98
6 WL 83
7 CG 88
8 CG 110
9 T 95
10 WL 80
11 CG 91
12 WL 98
[1] 1 5
id IQ
1 1 95
2 2 84
3 3 99
4 4 116
5 5 98
6 6 83
7 7 88
8 8 110
9 9 95
10 10 80
11 11 91
12 12 98
Only numeric variables
id sex group age IQ rating
TRUE FALSE FALSE TRUE TRUE TRUE
id age IQ rating
1 1 25 95 5
2 2 24 84 5
3 3 27 99 3
4 4 26 116 5
5 5 21 98 4
6 6 31 83 4
7 7 34 88 0
8 8 28 110 3
9 9 24 95 1
10 10 29 80 2
11 11 32 91 4
12 12 27 98 2
More elegant
id age IQ rating
1 1 25 95 5
2 2 24 84 5
3 3 27 99 3
4 4 26 116 5
5 5 21 98 4
6 6 31 83 4
id group age rating
1 1 T 25 5
2 2 T 24 5
3 3 CG 27 3
4 4 WL 26 5
5 5 T 21 4
6 6 WL 31 4
7 7 CG 34 0
8 8 CG 28 3
9 9 T 24 1
10 10 WL 29 2
11 11 CG 32 4
12 12 WL 27 2
Remove variables - 1 variable
id sex age IQ rating
1 1 f 25 95 5
2 2 f 24 84 5
3 3 f 27 99 3
Remove variables - multiple variables
id age rating
1 1 25 5
2 2 24 5
3 3 27 3
Simple conditions
id sex group age IQ rating
1 1 f T 25 95 5
2 2 f T 24 84 5
3 3 f CG 27 99 3
5 5 f T 21 98 4
9 9 f T 24 95 1
10 10 f WL 29 80 2
id sex group age IQ rating
3 3 f CG 27 99 3
Combined conditions - logical AND
id sex group age IQ rating
4 4 m WL 26 116 5
6 6 m WL 31 83 4
8 8 m CG 28 110 3
11 11 m CG 32 91 4
Combined conditions - logical OR
id sex group age IQ rating
2 2 f T 24 84 5
4 4 m WL 26 116 5
6 6 m WL 31 83 4
7 7 m CG 34 88 0
10 10 f WL 29 80 2
id sex group age IQ rating
3 3 f CG 27 99 3
4 4 m WL 26 116 5
6 6 m WL 31 83 4
7 7 m CG 34 88 0
8 8 m CG 28 110 3
10 10 f WL 29 80 2
11 11 m CG 32 91 4
12 12 m WL 27 98 2
# Draw four row indices of myDf1 at random and append those rows to the
# original data, so that the combined data frame contains duplicated cases
dupIdx <- sample(seq_len(nrow(myDf1)), size=4)
myDfDouble <- rbind(myDf1, myDf1[dupIdx, ])

# Flag every row that occurs more than once: scanning from the top marks
# later copies, scanning from the bottom (fromLast=TRUE) marks earlier ones
dupFromTop    <- duplicated(myDfDouble)
dupFromBottom <- duplicated(myDfDouble, fromLast=TRUE)
dupFromTop | dupFromBottom
[1] FALSE FALSE FALSE TRUE FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE
[13] TRUE TRUE TRUE TRUE
[1] FALSE
Count missings
id sex group age IQ rating
[1,] FALSE FALSE FALSE FALSE FALSE FALSE
[2,] FALSE FALSE FALSE FALSE TRUE FALSE
[3,] FALSE FALSE FALSE FALSE FALSE TRUE
id sex group age IQ rating
FALSE FALSE FALSE FALSE TRUE TRUE
Count missings / present observations by variable
id sex group age IQ rating
0 0 0 0 1 1
id sex group age IQ rating
12 12 12 12 11 11
Drop cases with missings
[1] TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
id sex group age IQ rating
1 1 f T 25 95 5
4 4 m WL 26 116 5
5 5 f T 21 98 4
6 6 m WL 31 83 4
FALSE TRUE
2 10
Show only cases with missings
id sex group age IQ rating
2 2 f T 24 NA 5
3 3 f CG 27 99 NA
# One random logical value per row of myDf1
nObs <- nrow(myDf1)
married <- sample(c(TRUE, FALSE), size=nObs, replace=TRUE)

# Work on a copy, then add the same vector as a new variable in two ways
myDf2 <- myDf1
myDf2$married1 <- married       # variant 1: $ with a new variable name
myDf2["married2"] <- married    # variant 2: [ ] with the name as a string

# Show the first six rows of the extended data frame
head(myDf2, n=6)
id sex group age IQ rating married1 married2
1 1 f T 25 95 5 FALSE FALSE
2 2 f T 24 84 5 TRUE TRUE
3 3 f CG 27 99 3 TRUE TRUE
4 4 m WL 26 116 5 FALSE FALSE
5 5 f T 21 98 4 TRUE TRUE
6 6 m WL 31 83 4 FALSE FALSE
id sex group age IQ rating married
1 1 f T 25 95 5 FALSE
2 2 f T 24 84 5 TRUE
3 3 f CG 27 99 3 TRUE
id sex group age IQ rating married rSq IQgrp
1 1 f T 25 95 5 FALSE 25 (0,100]
2 2 f T 24 84 5 TRUE 25 (0,100]
3 3 f CG 27 99 3 TRUE 9 (0,100]
[1] 7 9 10 12 3 8 5 6 11 1 2 4
id sex group age IQ rating
7 7 m CG 34 88 0
9 9 f T 24 95 1
10 10 f WL 29 80 2
12 12 m WL 27 98 2
3 3 f CG 27 99 3
8 8 m CG 28 110 3
5 5 f T 21 98 4
6 6 m WL 31 83 4
11 11 m CG 32 91 4
1 1 f T 25 95 5
2 2 f T 24 84 5
4 4 m WL 26 116 5
[1] 7 11 3 8 10 6 12 4 2 1 9 5
id sex group age IQ rating
7 7 m CG 34 88 0
11 11 m CG 32 91 4
3 3 f CG 27 99 3
8 8 m CG 28 110 3
10 10 f WL 29 80 2
6 6 m WL 31 83 4
12 12 m WL 27 98 2
4 4 m WL 26 116 5
2 2 f T 24 84 5
1 1 f T 25 95 5
9 9 f T 24 95 1
5 5 f T 21 98 4
To work with data frames as you would with a database, use the sqldf package. You can then use standard SQL commands to select data. The data.table package provides an alternative (compatible) class of data frames with better performance for large amounts of data.
R markdown - markdown - R code - all posts