Aggregate data

Separate descriptive statistics for each group

# Simulate a balanced two-factor layout: every sex x group cell gets Njk observations.
# Njk: observations per cell
Njk   <- 2
# P: number of sex levels ("f", "m")
P     <- 2
# Q: number of group levels ("T", "WL", "CG")
Q     <- 3
# One rounded normal draw per observation; no set.seed() call precedes this in
# the visible code, so the values (and the output below) change between runs.
IQ    <- round(rnorm(Njk*P*Q, mean=100, sd=15))
# times= repeats the whole vector, so sex alternates f, m, f, m, ...
sex   <- factor(rep(c("f", "m"),       times=Q*Njk))
# each= repeats every label P*Njk times in a row, giving contiguous group blocks
group <- factor(rep(c("T", "WL", "CG"), each=P*Njk))
# Cross-tabulate the two factors to check the cell counts
table(sex, group)
   group
sex CG T WL
  f  2 2  2
  m  2 2  2

ave()

# ave() returns a vector as long as IQ in which every value is replaced by the
# mean of the sex group it belongs to.
ave(IQ, sex, FUN=mean)
 [1] 107.8 102.5 107.8 102.5 107.8 102.5 107.8 102.5 107.8 102.5 107.8
[12] 102.5

tapply()

# Mean IQ per level of group: one value per factor level.
tapply(IQ, group, FUN=mean)
   CG     T    WL 
 95.5 108.8 111.2 
# Passing a list of factors gives the mean IQ for every sex x group combination,
# arranged as a sex-by-group table.
tapply(IQ, list(sex, group), FUN=mean)
    CG     T    WL
f 91.5 111.5 120.5
m 99.5 106.0 102.0

Aggregate data frames

Simulate data

# Fix the RNG seed so the simulated data frame can be reproduced.
# The order of the random draws below matters for the resulting values.
set.seed(123)
N      <- 12
# Drawn with replacement, so the numbers of "f" and "m" are not forced to be equal
sex    <- sample(c("f", "m"), N, replace=TRUE)
# A random permutation of 4 copies of each label: exactly 4 observations per group
group  <- sample(rep(c("CG", "WL", "T"), 4), N, replace=FALSE)
age    <- sample(18:35, N, replace=TRUE)
IQ     <- round(rnorm(N, mean=100, sd=15))
# Rounded uniform draws on [0, 6], i.e. integer ratings between 0 and 6
rating <- round(runif(N, min=0, max=6))
# The outer parentheses print the data frame while assigning it
(myDf1 <- data.frame(id=1:N, sex, group, age, IQ, rating))
   id sex group age  IQ rating
1   1   f     T  29 111      4
2   2   m    CG  30  93      1
3   3   f    WL  27  84      2
4   4   m     T  28  97      2
5   5   m    CG  23  85      5
6   6   f    CG  20  89      3
7   7   m    WL  35  91      5
8   8   m    WL  34  75      5
9   9   m    CG  30 113      5
10 10   f     T  32 102      3
11 11   m     T  18  83      5
12 12   f    WL  26 119      4

Apply the same function to different variables in a data frame

# lapply() treats the selected data frame columns as list components and
# returns a list holding the mean of each column.
lapply(myDf1[ , c("age", "IQ", "rating")], mean)
$age
[1] 27.67

$IQ
[1] 95.17

$rating
[1] 3.667
# sapply() simplifies the per-column results: range() returns c(min, max) for
# each column, so the result is a matrix with two rows (min, max).
sapply(myDf1[ , c("age", "IQ", "rating")], range)
     age  IQ rating
[1,]  18  75      1
[2,]  35 119      5
# Named logical vector marking which columns of myDf1 are numeric
# (printed because of the outer parentheses).
(numIdx <- sapply(myDf1, is.numeric))
    id    sex  group    age     IQ rating 
  TRUE  FALSE  FALSE   TRUE   TRUE   TRUE 
# Keep only the columns flagged as numeric by numIdx
dataNum <- myDf1[ , numIdx]
# Show the first rows of the reduced data frame
head(dataNum)
  id age  IQ rating
1  1  29 111      4
2  2  30  93      1
3  3  27  84      2
4  4  28  97      2
5  5  23  85      5
6  6  20  89      3

Apply the same function to pairs of variables from two data frames

# Build two data frames with matching column order so that column i of tDf1
# is compared against column i of tDf2.
# Note: N is reassigned here and no longer holds the sample size of myDf1.
N    <- 100
x1   <- rnorm(N, 10, 10)
y1   <- rnorm(N, 10, 10)
# Second-sample values are the first-sample values plus noise with positive mean
x2   <- x1 + rnorm(N, 5, 4)
y2   <- y1 + rnorm(N, 10, 4)
tDf1 <- data.frame(x1, y1)
tDf2 <- data.frame(x2, y2)
# mapply() calls t.test() once per column pair (x1 vs. x2, y1 vs. y2);
# MoreArgs supplies the arguments that are identical for every call.
# The result is a matrix with one column of test components per pair.
mapply(t.test, tDf1, tDf2, MoreArgs=list(alternative="less", var.equal=TRUE))
            x1                                     
statistic   -4.246                                 
parameter   198                                    
p.value     1.673e-05                              
conf.int    Numeric,2                              
estimate    Numeric,2                              
null.value  0                                      
alternative "less"                                 
method      " Two Sample t-test"                   
data.name   "dots[[1L]][[1L]] and dots[[2L]][[1L]]"
            y1                                     
statistic   -6.568                                 
parameter   198                                    
p.value     2.192e-10                              
conf.int    Numeric,2                              
estimate    Numeric,2                              
null.value  0                                      
alternative "less"                                 
method      " Two Sample t-test"                   
data.name   "dots[[1L]][[2L]] and dots[[2L]][[2L]]"

Separate descriptive statistics for each group for many variables

# Mean of a single variable per group, using columns of the data frame
tapply(myDf1$IQ, myDf1$group, FUN=mean)
   CG     T    WL 
95.00 98.25 92.25 
# Means of several variables for each sex x group combination.
# The grouping factors are passed as an unnamed list, so the result labels
# them Group.1 and Group.2.
aggregate(myDf1[ , c("age", "IQ", "rating")],
          list(myDf1$sex, myDf1$group), FUN=mean)
  Group.1 Group.2   age    IQ rating
1       f      CG 20.00  89.0  3.000
2       m      CG 27.67  97.0  3.667
3       f       T 30.50 106.5  3.500
4       m       T 23.00  90.0  3.500
5       f      WL 26.50 101.5  3.000
6       m      WL 34.50  83.0  5.000
# Same aggregation via the formula interface: cbind() on the left-hand side
# lists the outcome variables, the right-hand side lists the grouping factors,
# and the grouping columns keep their original names in the result.
aggregate(cbind(age, IQ, rating) ~ sex + group, FUN=mean, data=myDf1)
  sex group   age    IQ rating
1   f    CG 20.00  89.0  3.000
2   m    CG 27.67  97.0  3.667
3   f     T 30.50 106.5  3.500
4   m     T 23.00  90.0  3.500
5   f    WL 26.50 101.5  3.000
6   m    WL 34.50  83.0  5.000
# by() splits the selected columns by sex x group and passes each sub-data-frame
# to FUN. colMeans() is used instead of mean() because mean() on a data frame
# has been defunct since R 3.0.0 and would return NA with a warning for every
# cell; colMeans() yields the named vector of column means per cell.
by(myDf1[ , c("age", "IQ", "rating")], list(myDf1$sex, myDf1$group), FUN=colMeans)
: f
: CG
   age     IQ rating 
    20     89      3 
-------------------------------------------------------- 
: m
: CG
   age     IQ rating 
27.667 97.000  3.667 
-------------------------------------------------------- 
: f
: T
   age     IQ rating 
  30.5  106.5    3.5 
-------------------------------------------------------- 
: m
: T
   age     IQ rating 
  23.0   90.0    3.5 
-------------------------------------------------------- 
: f
: WL
   age     IQ rating 
  26.5  101.5    3.0 
-------------------------------------------------------- 
: m
: WL
   age     IQ rating 
  34.5   83.0    5.0 

Useful packages

Package plyr provides more functions for efficiently and consistently aggregating data with the split-apply-combine approach.

Get the article source from GitHub

R markdown - markdown - R code - all posts