# Aggregate data frames - base R

## Simulate data

set.seed(123)
N      <- 12
sex    <- factor(sample(c("f", "m"), N, replace=TRUE), levels=c("f", "m"))
group  <- factor(sample(rep(c("CG", "WL", "T"), 4), N, replace=FALSE), levels=c("CG", "WL", "T"))
age    <- sample(18:35, N, replace=TRUE)
IQ     <- round(rnorm(N, mean=100, sd=15))
rating <- round(runif(N, min=0, max=6))
(myDf1 <- data.frame(id=1:N, sex, group, age, IQ, rating))
   id sex group age  IQ rating
1   1   f     T  25  95      5
2   2   f     T  24  84      5
3   3   f    CG  27  99      3
4   4   m    WL  26 116      5
5   5   f     T  21  98      4
6   6   m    WL  31  83      4
7   7   m    CG  34  88      0
8   8   m    CG  28 110      3
9   9   f     T  24  95      1
10 10   f    WL  29  80      2
11 11   m    CG  32  91      4
12 12   m    WL  27  98      2

## Apply the same function to different variables in a data frame

lapply(myDf1[ , c("age", "IQ", "rating")], mean)
$age [1] 27.33333$IQ
[1] 94.75

$rating [1] 3.166667 sapply(myDf1[ , c("age", "IQ", "rating")], range)  age IQ rating [1,] 21 80 0 [2,] 34 116 5 (numIdx <- sapply(myDf1, is.numeric))  id sex group age IQ rating TRUE FALSE FALSE TRUE TRUE TRUE  dataNum <- myDf1[ , numIdx] head(dataNum)  id age IQ rating 1 1 25 95 5 2 2 24 84 5 3 3 27 99 3 4 4 26 116 5 5 5 21 98 4 6 6 31 83 4 ## Apply the same function to pairs of variables from two data frames N <- 100 x1 <- rnorm(N, 10, 10) y1 <- rnorm(N, 10, 10) x2 <- x1 + rnorm(N, 5, 4) y2 <- y1 + rnorm(N, 10, 4) tDf1 <- data.frame(x1, y1) tDf2 <- data.frame(x2, y2) mapply(t.test, tDf1, tDf2, MoreArgs=list(alternative="less", var.equal=TRUE))  x1 statistic -3.659217 parameter 198 p.value 0.0001621933 conf.int Numeric,2 estimate Numeric,2 null.value 0 stderr 1.336687 alternative "less" method " Two Sample t-test" data.name "dots[[1L]][[1L]] and dots[[2L]][[1L]]" y1 statistic -5.918708 parameter 198 p.value 7.044864e-09 conf.int Numeric,2 estimate Numeric,2 null.value 0 stderr 1.598287 alternative "less" method " Two Sample t-test" data.name "dots[[1L]][[2L]] and dots[[2L]][[2L]]" ## Separate descriptive statistics for each group for many variables ### split() -> sapply() (splitRes <- split(myDf1, myDf1$group))
$CG id sex group age IQ rating 3 3 f CG 27 99 3 7 7 m CG 34 88 0 8 8 m CG 28 110 3 11 11 m CG 32 91 4$WL
id sex group age  IQ rating
4   4   m    WL  26 116      5
6   6   m    WL  31  83      4
10 10   f    WL  29  80      2
12 12   m    WL  27  98      2

$T id sex group age IQ rating 1 1 f T 25 95 5 2 2 f T 24 84 5 5 5 f T 21 98 4 9 9 f T 24 95 1 sapply(splitRes, function(x) mean(x$IQ))
   CG    WL     T
97.00 94.25 93.00 

Here equivalent to

tapply(myDf1$IQ, myDf1$group, FUN=mean)
   CG    WL     T
97.00 94.25 93.00 

### aggregate()

numDf <- subset(myDf1, select=c("age", "IQ", "rating"))
aggregate(numDf, list(myDf1$sex, myDf1$group), FUN=mean)
  Group.1 Group.2      age       IQ   rating
1       f      CG 27.00000 99.00000 3.000000
2       m      CG 31.33333 96.33333 2.333333
3       f      WL 29.00000 80.00000 2.000000
4       m      WL 28.00000 99.00000 3.666667
5       f       T 23.50000 93.00000 3.750000
aggregate(cbind(age, IQ, rating) ~ sex + group, FUN=mean, data=myDf1)
  sex group      age       IQ   rating
1   f    CG 27.00000 99.00000 3.000000
2   m    CG 31.33333 96.33333 2.333333
3   f    WL 29.00000 80.00000 2.000000
4   m    WL 28.00000 99.00000 3.666667
5   f     T 23.50000 93.00000 3.750000
aggregate(cbind(age, IQ, rating) ~ 1, FUN=mean, data=myDf1)
       age    IQ   rating
1 27.33333 94.75 3.166667

Calculate different measures by group

(groupN   <- as.data.frame(xtabs(~ sex + group, data=myDf1)))
  sex group Freq
1   f    CG    1
2   m    CG    3
3   f    WL    1
4   m    WL    3
5   f     T    4
6   m     T    0
(groupM   <- aggregate(age ~ sex + group, data=myDf1, FUN=mean))
  sex group      age
1   f    CG 27.00000
2   m    CG 31.33333
3   f    WL 29.00000
4   m    WL 28.00000
5   f     T 23.50000
(groupSD  <- aggregate(age ~ sex + group, data=myDf1, FUN=sd))
  sex group      age
1   f    CG       NA
2   m    CG 3.055050
3   f    WL       NA
4   m    WL 2.645751
5   f     T 1.732051

Merge all results into one data frame

(groupMSD <- merge(groupM, groupSD, by=c("sex", "group"),
suffixes=c(".M", ".SD")))
  sex group    age.M   age.SD
1   f    CG 27.00000       NA
2   f     T 23.50000 1.732051
3   f    WL 29.00000       NA
4   m    CG 31.33333 3.055050
5   m    WL 28.00000 2.645751
(groupMSDN <- merge(groupMSD, groupN, by=c("sex", "group")))
  sex group    age.M   age.SD Freq
1   f    CG 27.00000       NA    1
2   f     T 23.50000 1.732051    4
3   f    WL 29.00000       NA    1
4   m    CG 31.33333 3.055050    3
5   m    WL 28.00000 2.645751    3

### by()

by(numDf, list(myDf1$sex, myDf1$group), FUN=function(x) sapply(x, mean))
: f
: CG
age     IQ rating
27     99      3
------------------------------------------------------------
: m
: CG
age        IQ    rating
31.333333 96.333333  2.333333
------------------------------------------------------------
: f
: WL
age     IQ rating
29     80      2
------------------------------------------------------------
: m
: WL
age        IQ    rating
28.000000 99.000000  3.666667
------------------------------------------------------------
: f
: T
age     IQ rating
23.50  93.00   3.75
------------------------------------------------------------
: m
: T
NULL