# Frequency tables

## Install required packages

# Packages used in this document; install whichever ones are missing.
wants <- c("DescTools", "dplyr")
# system.file(package=) returns "" for a package that is not installed.
# This avoids installed.packages(), which scans every installed package and
# is documented as unsuitable for checking whether a named package exists.
has   <- vapply(wants, function(pkg) nzchar(system.file(package=pkg)),
                logical(1), USE.NAMES=FALSE)
if(any(!has)) install.packages(wants[!has])

## Category frequencies for one variable

### Absolute frequencies

set.seed(123)
(myLetters <- sample(LETTERS[1:5], 12, replace=TRUE))
 [1] "C" "C" "B" "B" "C" "E" "D" "A" "B" "C" "E" "C"
(tab <- table(myLetters))
myLetters
A B C D E
1 3 5 1 2 
names(tab)
[1] "A" "B" "C" "D" "E"
tab["B"]
B
3 
barplot(tab, main="Counts")

### (Cumulative) relative frequencies

(relFreq <- prop.table(tab))
myLetters
A          B          C          D          E
0.08333333 0.25000000 0.41666667 0.08333333 0.16666667 
cumsum(relFreq)
         A          B          C          D          E
0.08333333 0.33333333 0.75000000 0.83333333 1.00000000 

### Counting non-existent categories

letFac <- factor(myLetters, levels=c(LETTERS[1:5], "Q"))
letFac
 [1] C C B B C E D A B C E C
Levels: A B C D E Q
table(letFac)
letFac
A B C D E Q
1 3 5 1 2 0 

## Counting runs

(vec <- rep(rep(c("f", "m"), 3), c(1, 3, 2, 4, 1, 2)))
 [1] "f" "m" "m" "m" "f" "f" "m" "m" "m" "m" "f" "m" "m"
(res <- rle(vec))
Run Length Encoding
lengths: int [1:6] 1 3 2 4 1 2
values : chr [1:6] "f" "m" "f" "m" "f" "m"
length(res$lengths)
[1] 6
inverse.rle(res)
 [1] "f" "m" "m" "m" "f" "f" "m" "m" "m" "m" "f" "m" "m"

## Contingency tables for two or more variables

### Absolute frequencies using table()

N    <- 10
(sex <- factor(sample(c("f", "m"), N, replace=TRUE)))
 [1] f f m f f f f m m f
Levels: f m
(work <- factor(sample(c("home", "office"), N, replace=TRUE)))
 [1] office home   office home   office office home   home   home   home
Levels: home office
(cTab <- table(sex, work))
   work
sex home office
f    4      3
m    2      1
summary(cTab)
Number of cases in table: 10
Number of factors: 2
Test for independence of all factors:
Chisq = 0.07937, df = 1, p-value = 0.7782
Chi-squared approximation may be incorrect
barplot(cTab, beside=TRUE, legend.text=rownames(cTab), ylab="absolute frequency")

### Using xtabs()

counts   <- sample(0:5, N, replace=TRUE)
(persons <- data.frame(sex, work, counts))
   sex   work counts
1    f office      3
2    f   home      1
3    m office      4
4    f   home      0
5    f office      0
6    f office      1
7    f   home      2
8    m   home      3
9    m   home      4
10   f   home      4
xtabs(~ sex + work, data=persons)
   work
sex home office
f    4      3
m    2      1
xtabs(counts ~ sex + work, data=persons)
   work
sex home office
f    7      4
m    7      4

### Marginal sums and means

rowSums(cTab)
f m
7 3 
colMeans(cTab)
  home office
     3      2 
addmargins(cTab, c(1, 2), FUN=mean)
Margins computed over dimensions
in the following order:
1: sex
2: work
      work
sex    home office mean
f     4.0    3.0  3.5
m     2.0    1.0  1.5
mean  3.0    2.0  2.5

### Relative frequencies

(relFreq <- prop.table(cTab))
   work
sex home office
f  0.4    0.3
m  0.2    0.1

### Conditional relative frequencies

prop.table(cTab, margin=1)
   work
sex      home    office
f 0.5714286 0.4285714
m 0.6666667 0.3333333
prop.table(cTab, margin=2)
   work
sex      home    office
f 0.6666667 0.7500000
m 0.3333333 0.2500000

### Flat contingency tables for more than two variables

(group <- factor(sample(c("A", "B"), 10, replace=TRUE)))
 [1] A B B A B A A B B A
Levels: A B
ftable(work, sex, group, row.vars="work", col.vars=c("sex", "group"))
       sex   f   m
       group A B A B
work
home         3 1 0 2
office       2 1 0 1

## Recovering the original data from contingency tables

Individual-level data frame

library(DescTools)
Untable(cTab)
   sex   work
1    f   home
2    f   home
3    f   home
4    f   home
5    m   home
6    m   home
7    f office
8    f office
9    f office
10   m office

Group-level data frame

as.data.frame(cTab, stringsAsFactors=TRUE)
  sex   work Freq
1   f   home    4
2   m   home    2
3   f office    3
4   m office    1

## Percentile rank

(vec <- round(rnorm(10), 2))
 [1]  0.90  0.88  0.82  0.69  0.55 -0.06 -0.31 -0.38 -0.69 -0.21
library(DescTools)
100*PercentRank(vec)
 [1] 100  90  80  70  60  50  30  20  10  40

Using base R

Fn <- ecdf(vec)
Fn(vec)
 [1] 1.0 0.9 0.8 0.7 0.6 0.5 0.3 0.2 0.1 0.4
100 * Fn(0.1)
[1] 50
Fn(sort(vec))
 [1] 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0
knots(Fn)
 [1] -0.69 -0.38 -0.31 -0.21 -0.06  0.55  0.69  0.82  0.88  0.90
plot(Fn, main="cumulative frequencies")

## Using package dplyr

Data set

# Simulated data set with N observations: sex, group, age, IQ and rating.
# The random draws are kept in the same order so the RNG stream is unchanged.
N      <- 12
sex    <- factor(sample(c("f", "m"), size=N, replace=TRUE),
                 levels=c("f", "m"))
# Sampling without replacement from 4 copies of each label permutes them,
# so every group occurs exactly 4 times.
group  <- factor(sample(rep(c("CG", "WL", "T"), times=4), size=N),
                 levels=c("CG", "WL", "T"))
age    <- sample(18:35, size=N, replace=TRUE)
IQ     <- round(rnorm(N, 100, 15))
rating <- round(runif(N, 0, 6))
myDf1  <- data.frame(id=seq_len(N), sex=sex, group=group, age=age,
                     IQ=IQ, rating=rating)

### Absolute frequencies

count() returns the frequency of each combination of the given variables in a new column, which is named n by default.

library(dplyr)
myDf1 %>%
  count(sex, group, name="n")
  sex group n
1   f    CG 1
2   f    WL 1
3   f     T 3
4   m    CG 3
5   m    WL 3
6   m     T 1

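By default, count() drops groups / combinations of groups with no entries. Use option .drop=FALSE to include entries with frequency 0. In this data set every combination of sex and group occurs, so the result below is the same as before.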

myDf1 %>%
  count(sex, group, .drop=FALSE)
  sex group n
1   f    CG 1
2   f    WL 1
3   f     T 3
4   m    CG 3
5   m    WL 3
6   m     T 1

add_count() adds the corresponding count as a new column to the existing data frame, keeping all rows.

myDf1 %>%
  add_count(sex, group)
   id sex group age  IQ rating n
1   1   m    WL  29 106      1 3
2   2   m     T  31 112      3 1
3   3   f     T  20  98      2 3
4   4   f     T  31  97      4 3
5   5   m    CG  24  86      2 3
6   6   f    WL  20  89      2 1
7   7   m    CG  32  79      3 3
8   8   m    CG  22 109      4 3
9   9   f    CG  25 100      1 1
10 10   m    WL  27  90      2 3
11 11   m    WL  35  96      2 3
12 12   f     T  27  88      4 3

### Relative frequencies

# Cell count divided by the total of all cell counts
myDf1 %>%
  count(sex, group) %>%
  mutate(freq_rel=n / sum(n))
  sex group n   freq_rel
1   f    CG 1 0.08333333
2   f    WL 1 0.08333333
3   f     T 3 0.25000000
4   m    CG 3 0.25000000
5   m    WL 3 0.25000000
6   m     T 1 0.08333333
# Row-level variant: column n (cell count) divided by n() (number of rows)
myDf1 %>%
  add_count(sex, group) %>%
  mutate(freq_rel=n / n())
   id sex group age  IQ rating n   freq_rel
1   1   m    WL  29 106      1 3 0.25000000
2   2   m     T  31 112      3 1 0.08333333
3   3   f     T  20  98      2 3 0.25000000
4   4   f     T  31  97      4 3 0.25000000
5   5   m    CG  24  86      2 3 0.25000000
6   6   f    WL  20  89      2 1 0.08333333
7   7   m    CG  32  79      3 3 0.25000000
8   8   m    CG  22 109      4 3 0.25000000
9   9   f    CG  25 100      1 1 0.08333333
10 10   m    WL  27  90      2 3 0.25000000
11 11   m    WL  35  96      2 3 0.25000000
12 12   f     T  27  88      4 3 0.25000000

### Conditional relative frequencies

# Relative frequency of each group within sex
myDf1 %>%
  count(sex, group, name="n_sex_group") %>%
  group_by(sex) %>%
  mutate(n_sex=sum(n_sex_group),
         freq_cond_rel=n_sex_group / n_sex) %>%
  ungroup()
# A tibble: 6 x 5
sex   group n_sex_group n_sex freq_cond_rel
<fct> <fct>       <int> <int>         <dbl>
1 f     CG              1     5         0.2
2 f     WL              1     5         0.2
3 f     T               3     5         0.6
4 m     CG              3     7         0.429
5 m     WL              3     7         0.429
6 m     T               1     7         0.143
# Row-level variant: attach both counts to every row, then divide
myDf1 %>%
  add_count(sex, name="n_sex") %>%
  add_count(sex, group, name="n_sex_group") %>%
  mutate(freq_cond_rel=n_sex_group / n_sex) %>%
  select(id, sex, group, n_sex, n_sex_group, freq_cond_rel)
   id sex group n_sex n_sex_group freq_cond_rel
1   1   m    WL     7           3     0.4285714
2   2   m     T     7           1     0.1428571
3   3   f     T     5           3     0.6000000
4   4   f     T     5           3     0.6000000
5   5   m    CG     7           3     0.4285714
6   6   f    WL     5           1     0.2000000
7   7   m    CG     7           3     0.4285714
8   8   m    CG     7           3     0.4285714
9   9   f    CG     5           1     0.2000000
10 10   m    WL     7           3     0.4285714
11 11   m    WL     7           3     0.4285714
12 12   f     T     5           3     0.6000000

### Percent rank

# percent_rank() result multiplied by 100 to express it in percent
myDf1 %>%
  mutate(rating_pr=100*percent_rank(rating))
   id sex group age  IQ rating rating_pr
1   1   m    WL  29 106      1   0.00000
2   2   m     T  31 112      3  63.63636
3   3   f     T  20  98      2  18.18182
4   4   f     T  31  97      4  81.81818
5   5   m    CG  24  86      2  18.18182
6   6   f    WL  20  89      2  18.18182
7   7   m    CG  32  79      3  63.63636
8   8   m    CG  22 109      4  81.81818
9   9   f    CG  25 100      1   0.00000
10 10   m    WL  27  90      2  18.18182
11 11   m    WL  35  96      2  18.18182
12 12   f     T  27  88      4  81.81818

## Detach (automatically) loaded packages (if possible)

# Each detach() is wrapped in try() so that an error from one call
# (e.g., because the package is not attached) does not stop the script.
try(detach(package:DescTools))
try(detach(package:dplyr))