####*--------------------------------------------------------------------------*
####*--------------------------------------------------------------------------*
## Befehle zum Buch "Grundlagen der Datenanalyse mit R" (6. Aufl., Springer)
##
## Daniel Wollschläger <contact [at] dwoll.de>
##
## http://www.dwoll.de/r/
##
## Stand: 12.05.2026
##
## Vgl. Abschnitt 4.1, um die folgenden Befehle auszuführen.
##
## Befehle, die eine Benutzerinteraktion notwendig machen, oder
## nicht erwünschte Nebenwirkungen haben, sind auskommentiert.
##
## Um alle Teile des Skripts ausführen zu können, müssen die Pakete
##
## car, coin, colorspace, DescTools, dplyr, duckdb, effectsize, emmeans,
## forcats, ggbeeswarm, ggplot2, glmnet, haven, Hmisc, lmtest, MASS,
## multcomp, mvtnorm, pROC, pscl, readxl, rgl, tidyr, VGAM
##
## samt Abhängigkeiten installiert sein, vgl. Abschnitt 1.3.1.
##
pWants <- c("car", "coin", "colorspace",
            "DescTools", "dplyr", "duckdb",
            "effectsize", "emmeans",
            "forcats",
            "ggbeeswarm", "ggplot2", "glmnet",
            "haven", "Hmisc",
            "lmtest",
            "MASS", "multcomp", "mvtnorm",
            "patchwork", "pROC", "pscl",
            "readxl", "rgl",
            "tidyr",
            "VGAM")

pHas <- pWants %in% rownames(installed.packages())
if(any(!pHas)) {
    install.packages(pWants[!pHas], repos="https://cloud.r-project.org/")
}
##
####*--------------------------------------------------------------------------*
####*--------------------------------------------------------------------------*

####*--------------------------------------------------------------------------*
### 1 Erste Schritte ----
####*--------------------------------------------------------------------------*

####*--------------------------------------------------------------------------*
#### 1.1 Vorstellung ----
##### 1.1.1 Pro und Contra R ----
##### 1.1.2 Typografische Konventionen ----
1 + 1

##### 1.1.3 R installieren ----
##### 1.1.4 Grafische Benutzeroberflächen ----
##### 1.1.5 Weiterführende Informationsquellen und Literatur ----

#####*-------------------------------------------------------------------------*
#### 1.2 Grundlegende Elemente ----
##### 1.2.1 R starten, beenden und die Konsole verwenden ----
1 + 1

data(Duncan, package="carData")
head(Duncan)

boxplot(income ~ type, data=Duncan, main="Anteil hoher Einkommen pro Berufsgruppe")
stripchart(income ~ type, data=Duncan, pch=20, vert=TRUE, add=TRUE)

aggregate(cbind(income, education, prestige) ~ type, FUN=mean, data=Duncan)
with(Duncan, cor(cbind(income, education, prestige)))
anova(lm(prestige ~ type, data=Duncan))
BCandWC <- with(Duncan, (type == "bc") | (type == "wc"))
t.test(education ~ type, alternative="less", data=Duncan, subset=BCandWC)

2 * (4
     -5)
## q()

##### 1.2.2 Einstellungen ----
getOption("width")
## op <- options(width=70)
## options(op)

##### 1.2.3 Umgang mit dem Workspace ----
history()

ls()
ls(".GlobalEnv", pattern="C")

##### 1.2.4 Einfache Arithmetik ----
12^2 + 1.5*10
sin(pi/2) + sqrt(abs(-4))
exp(1)^((0+1i)*pi)
exp(1)^(-pi/2) - (0+1i)^(0+1i)
sqrt(-1)
sqrt(-1+0i)

is.infinite(1/0)
is.nan(0/0)

##### 1.2.5 Funktionen mit Argumenten aufrufen ----
## q()
"/"(1, 10)
round(1.271)
round(pi, digits=2)

##### 1.2.6 Hilfe-Funktionen ----
## help.start()
## ?"/"
## help()

##### 1.2.7 Empfehlungen und typische Fehlerquellen ----

####*--------------------------------------------------------------------------*
#### 1.3 Zusatzpakete verwenden ----
##### 1.3.1 Zusatzpakete installieren: Grundlagen ----
## install.packages("colorspace")
.libPaths()
find.package("colorspace")

##### 1.3.2 Zusatzpakete installieren: Erweiterte Optionen ----
##### 1.3.3 Zusatzpakete laden----
library(colorspace)
installed.packages()
library()
search()

help(package="colorspace")
vignette(package="colorspace")
vignette("hcl-colors", package="colorspace")
vignette()

loadedNamespaces()
detach(package:colorspace)

data(package="colorspace")
data()
library(colorspace)
help(max_chroma_table)
sessionInfo()

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:colorspace))

##### 1.3.4 Hinweise zum Arbeiten mit Zusatzpaketen ----

####*--------------------------------------------------------------------------*
#### 1.4 Datenstrukturen: Klassen, Objekte, Datentypen ----
class(mtcars)
inherits(euro.cross, "matrix")
as.vector(euro.cross)
length(euro)
mode(euro.cross)
attributes(euro)
attr(euro, which="names")
str(euro.cross)

##### 1.4.1 Objekte benennen ----
conflicts(detail=TRUE)
exists("mean")

##### 1.4.2 Zuweisungen an Objekte ----
x1 <-  4.5
x1  =  4.5
4.5 -> x1
x2 <- x3 <- 10
x2
x3
x1 * 2
x1^x1 - x2

##### 1.4.3 Objekte ausgeben ----
print(x1)
get("x1")
x1
(x1 <- 4.5)

varName <- "x1"
get(varName)
.Last.value

##### 1.4.4 Objekte anzeigen lassen, umbenennen und entfernen ----
ls()
ls.str()
varNew1    <- x1
newNameVar <- "varNew2"
assign(newNameVar, x1)
varNew2
## rm(list=ls(all.names=TRUE))
age <- 22
rm(age)
## age

##### 1.4.5 Datentypen ----
NULL
TRUE
FALSE
T
F
3.14
3.14 + 1i
"Hello"
charVar <- "asdf"
mode(charVar)
5L
.Machine
.Machine$integer.max
.Machine$double.eps
is.character(1.23)
as.character(1.23)
as.logical(2)
as.logical("abc")
as.logical("TRUE")
as.logical("FALSE")

##### 1.4.6 Logische Werte, Operatoren und Verknüpfungen ----
TRUE == TRUE
TRUE == FALSE
!TRUE
!FALSE
TRUE != TRUE
TRUE != FALSE
 TRUE & TRUE
 TRUE & FALSE
FALSE & FALSE
FALSE & TRUE
 TRUE | TRUE
 TRUE | FALSE
FALSE | FALSE
FALSE | TRUE
xor(TRUE, FALSE)
xor(TRUE, TRUE)
4 <= 8
7 <  3

"A" < "B"

identical(4L, 4)
4L == 4

NaN  == TRUE
NULL == TRUE
isTRUE(NaN)
isTRUE(NULL)

0.1 + 0.2 == 0.3
1 %/% 0.1
sin(pi)
1 - ((1/49) * 49)
1 - ((1/48) * 48)

isTRUE(all.equal(0.123450001, 0.123450000))
0.123400001 == 0.123400000
all.equal(0.12345001, 0.12345000)
isTRUE(all.equal(0.12345001,  0.12345000))

####*--------------------------------------------------------------------------*
### 2 Elementare Dateneingabe und -verarbeitung ----
####*--------------------------------------------------------------------------*

####*--------------------------------------------------------------------------*
#### 2.1 Vektoren ----
##### 2.1.1 Vektoren erzeugen ----
numeric(4)
character(2)
(age    <- c(18, 20, 30, 24, 23, 21))
addAge  <- c(27, 21, 19)
(ageNew <- c(age, addAge))
length(age)
(chars <- c("lorem", "ipsum", "dolor", ""))
length(chars)
LETTERS[c(1, 2, 3)]
letters[c(4, 5, 6)]

##### 2.1.2 Elemente auswählen und verändern ----
age[4]
(ageLast <- age[length(age)])
age[length(age) + 1]
c(11, 12, 13, 14)[2]

idx <- c(1, 2, 4)
age[idx]
age[c(3, 5, 6)]
age[c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6)]
age[c(4, NA, 1)]
age[-3]
age[c(-1, -2, -4)]
age[-c(1, 2, 4)]
age[-idx]

age[4]   <- 25
age
age[idx] <- c(17, 30, 25)
age
age[] <- c(1, 2)
age
charVec1 <- c("Z", "Y", "X")
charVec1[c(4, 5, 6)] <- c("W", "V", "U")
charVec1
(charVec2 <- c(charVec1, "T", "S", "R"))
(charVec3 <- append(charVec2, c("Q", "P", "O")))
append(charVec1, c("A", "B", "C"), after=0)

##### 2.1.3 Datentypen in Vektoren ----
charVec4 <- "word"
numVec   <- c(10, 20, 30)
(combVec <- c(charVec4, numVec))
mode(combVec)

##### 2.1.4 Elemente benennen ----
(namedVec1 <- c(elem1="first", elem2="second"))
namedVec1["elem1"]
names(namedVec1)
(namedVec2 <- c(val1=10, val2=-12, val3=33))
(names2    <- names(namedVec2))
(namedVec3 <- setNames(namedVec2, toupper(names2)))
unname(namedVec3)

##### 2.1.5 Elemente löschen ----
vec <- c(10, 20, 30, 40, 50)
vec <- vec[-c(4, 5)]
vec
vec <- c(1, 2, 3, 4, 5)
length(vec) <- 3
vec

####*--------------------------------------------------------------------------*
#### 2.2 Logische Operatoren ----
##### 2.2.1 Vektoren mit logischen Indexvektoren vergleichen ----
age <- c(17, 30, 30, 24, 23, 21)
age < 24
x <- c(2, 4, 8)
y <- c(3, 4, 5)
x == y
x < y
(age <= 20) | (age >= 30)
(age > 20)  & (age < 30)
TRUE   && TRUE
FALSE  || FALSE
c(1, 2) == c(1L, 2L)
identical(c(1, 2), c(1L, 2L))

x <- c(4, 5, 6)
y <- c(4, 5, 6)
z <- c(1, 2, 3)
all.equal(x, y)
all.equal(y, z)
isTRUE(all.equal(y, z))

all(numeric(0))
any(numeric(0))

res <- age > 30
any(res)
any(age < 18)
all(x == y)
res <- age < 24
sum(res)
which(age < 24)
length(which(age < 24))

##### 2.2.2 Logische Indexvektoren ----
age[c(TRUE, FALSE, TRUE, TRUE, FALSE, TRUE)]
(idx <- (age <= 20) | (age >= 30))
age[idx]
age[(age <= 20) | (age >= 30)]

age[c(TRUE, FALSE)]
age[c(TRUE, FALSE, TRUE, FALSE, TRUE, FALSE)]

vecNA <- c(-3, 2, 0, NA, -7, 5)
(ok   <- vecNA > 0)
vecNA[ok]
vecNA[ok & !is.na(ok)]

(numIdx <- which(ok))

####*--------------------------------------------------------------------------*
#### 2.3 Mengen ----
##### 2.3.1 Doppelt auftretende Werte finden ----
x <- c(1, 2, 3, 1, 4, 5)
duplicated(x)
duplicated(x, fromLast=TRUE)
duplicated(x) | duplicated(x, fromLast=TRUE)
unique(c(1, 1, 1, 3, 3, 4, 4))
length(unique(c("A", "B", "C", "C", "B", "B", "A", "C", "C", "A")))

##### 2.3.2 Mengenoperationen ----
x <- c(2, 1, 3, 2, 1)
y <- c(5, 3, 1, 3, 4, 4)
union(x, y)
union(y, x)
intersect(x, y)
intersect(y, x)
setequal(c(1, 1, 2, 2), c(2, 1))
setdiff(x, y)
setdiff(y, x)

is.element(c(29, 23, 30, 17, 30, 10), c(30, 23))
c("A", "Z", "B") %in% c("A", "B", "C", "D", "E")
A <- c(4, 5, 6)
B <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
(AinB <- all(A %in% B))
(BinA <- all(B %in% A))
AinB && !BinA

##### 2.3.3 Kombinatorik ----
myN <- 5
myK <- 4
choose(myN, myK)
factorial(myN) / (factorial(myK)*factorial(myN-myK))

combn(c("a", "b", "c", "d", "e"), myK)
combn(c(1, 2, 3, 4), 3)
combn(c(1, 2, 3, 4), 3, FUN=sum)
combn(c(1, 2, 3, 4), 3, weighted.mean, w=c(0.5, 0.2, 0.3))

library(DescTools)
Permn(1:3)

var1 <- c("control", "treatment")
var2 <- c("f", "m")
var3 <- c(1, 2)
expand.grid(IV1=var1, IV2=var2, IV3=var3)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:DescTools))

####*--------------------------------------------------------------------------*
#### 2.4 Systematische und zufällige Wertefolgen erzeugen ----
##### 2.4.1 Numerische Sequenzen erstellen ----
20:26
5.4:2.1
-4:2
-(4:2)
seq_len(4)
target_length <- 0
seq_len(target_length)
1:target_length

seq(from=2, to=12, by=2)
seq(from=2, to=11, by=2)
seq(from=0, to=-1, length.out=5)

age <- c(18, 20, 30, 24, 23, 21)
seq_along(age)
vec <- numeric(0)
length(vec)
1:length(vec)
seq_along(vec)

##### 2.4.2 Wertefolgen wiederholen ----
rep(1:3, times=5)
rep(c("A", "B", "C"), times=c(2, 3, 4))
rep(age, each=2)
rep(list(data.frame(A=1:2, B=c("B", "Q"))), 3)

##### 2.4.3 Zufällig aus einer Urne ziehen ----
sample(1:6, size=20, replace=TRUE)
sample(c("rot", "grün", "blau"), size=8, replace=TRUE)
x <- c(2, 4, 6, 8)
sample(x[(x %% 4) == 0])
sample(x[(x %% 8) == 0])

##### 2.4.4 Zufallszahlen aus bestimmten Verteilungen erzeugen ----
runif(5, min=1, max=6)
rbinom(20, size=5, prob=0.3)
rnorm(3, mean=c(0, 50, 100), sd=c(1, 5, 10))

####*--------------------------------------------------------------------------*
#### 2.5 Daten transformieren ----
##### 2.5.1 Werte sortieren ----
vec <- c(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20)
rev(vec)
vec <- c(10, 12, 1, 12, 7, 16, 6, 19, 10, 19)
sort(vec)
(idxDec <- order(vec, decreasing=TRUE))
vec[idxDec]
sort(c("D", "E", "10", "A", "F", "E", "D", "4", "E", "A"))

##### 2.5.2 Werte in zufällige Reihenfolge bringen ----
myColors  <- c("red", "green", "blue", "yellow", "black")
(randCols <- sample(myColors, size=length(myColors), replace=FALSE))

P   <- 3
Nj  <- c(4, 3, 5)
(IV <- rep(seq_len(P), times=Nj))
(IVrand <- sample(IV, size=length(IV), replace=FALSE))

P <- 3
N <- 20
(sample(seq_len(N), size=N, replace=FALSE) %% P) + 1

##### 2.5.3 Teilmengen von Daten auswählen ----
vec <- rep(c("rot", "grün", "blau"), each=10)
sample(vec, size=5, replace=FALSE)

selIdx1 <- seq(1, length(vec), by=10)
vec[selIdx1]

selIdx2 <- rbinom(length(vec), size=1, prob=0.1) == 1
vec[selIdx2]

##### 2.5.4 Daten umrechnen ----
age <- c(18, 20, 30, 24, 23, 21)
age/10
(age/2) + 5
vec1 <- c( 3, 4,  5, 6)
vec2 <- c(-2, 2, -1, 3)
vec1*vec2
vec3 <- c(10, 100, 1000, 10000)
(vec1 + vec2) / vec3

##### 2.5.4.1 Zyklische Verlängerung von Vektoren (Recycling) ----
age  <- c(18, 20, 30, 24, 23, 21)
vec1 <- c(2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24)
vec2 <- c(2, 4, 6, 8, 10)
c(length(age), length(vec1), length(vec2))
vec1*age
vec2*age

##### 2.5.4.2 z-Transformation ----
(zAge <- (age - mean(age)) / sd(age))
(zAge <- scale(age))
as.vector(zAge)
newSd   <- 15
newMean <- 100
(newAge <- (as.vector(zAge)*newSd) + newMean)
mean(newAge)
sd(newAge)

##### 2.5.4.3 Rangtransformation ----
rank(c(3, 1, 2, 3))

##### 2.5.5 Neue aus bestehenden Variablen bilden ----
height <- c(1.78, 1.91, 1.89, 1.83, 1.64)
weight <- c(65, 89, 91, 75, 73)
(bmi   <- weight / (height^2))

quest1  <- c(FALSE, FALSE, FALSE, TRUE,  FALSE, TRUE, FALSE, TRUE)
quest2  <- c(TRUE,  FALSE, FALSE, FALSE, TRUE,  TRUE, TRUE,  FALSE)
quest3  <- c(TRUE,  TRUE,  TRUE,  TRUE,  FALSE, TRUE, FALSE, FALSE)
(sumVar <- quest1 + quest2 + quest3)
(avgVar <- (quest1 + quest2 + quest3) / 3)

##### 2.5.6 Werte ersetzen oder recodieren ----
myColors <- c("red", "purple", "blue", "blue", "green", "red", "green")

farben <- replace(myColors, myColors == "red",    "rot")
farben <- replace(farben,   farben   == "purple", "lila")
farben <- replace(farben,   farben   == "blue",   "blau")
farben <- replace(farben,   farben   == "green",  "gruen")
farben

repl <- c(red="rot", purple="lila", blue="blau", green="gruen")
repl[myColors]
repl[c(myColors, "violet")]

library(dplyr)

## Lösung mit case_match()
## ab dplyr Version 1.2 nicht mehr bevorzugt
case_match(myColors,
           "red"    ~ "rot",
           "blue"   ~ "blau",
           "purple" ~ "lila",
           "green"  ~ "gruen")

case_match(myColors,
           c("red", "blue") ~ "basic",
           .default="complex")

orgVec <- c(5, 9, 11, 8, 9, 3, 1, 13, 9, 12, 5, 12)
case_match(orgVec,
            0:4  ~ "V00-04",
            5:10 ~ "V05-10",
           11:20 ~ "V11-20")

## ab dplyr Version 1.2 bevorzugte Lösung mit recode_values()
recode_values(myColors,
              "red"    ~ "rot",
              "blue"   ~ "blau",
              "purple" ~ "lila",
              "green"  ~ "gruen")

recode_values(myColors,
              c("red", "blue") ~ "basic",
              default="complex")

recode_values(orgVec,
               0:4  ~ "V00-04",
               5:10 ~ "V05-10",
              11:20 ~ "V11-20")

cutoff <- 10
(reVec <- ifelse(orgVec < cutoff, 0, 1))

targetSet <- c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K")
response  <- c("Z", "E", "O", "W", "H", "C", "I", "G")
(respRec  <- ifelse(response %in% targetSet, "yes", "no"))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:dplyr))

##### 2.5.7 Kontinuierliche Variablen in Kategorien einteilen ----
IQ    <- c(112, 103, 87, 86, 90, 101, 90, 89, 122, 103)
IQcls <- numeric(length(IQ))
IQcls[IQ <= 100]                <- 1
IQcls[(IQ > 100) & (IQ <= 115)] <- 2
IQcls[IQ > 115]                 <- 3
IQcls

library(dplyr)
case_when((IQ <= 100)              ~ 1,
          (IQ > 100) & (IQ <= 115) ~ 2,
          (IQ > 115)               ~ 3)

ifelse(IQ >= 100, "hi", "lo")

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:dplyr))

####*--------------------------------------------------------------------------*
#### 2.6 Gruppierungsfaktoren ----
##### 2.6.1 Ungeordnete Faktoren ----
sex     <- c("m", "f", "f", "m", "m", "m", "f", "f")
(sexFac <- factor(sex))
factor(c(1, 1, 3, 3, 4, 4), levels=1:5)
(sexNum <- rbinom(10, size=1, prob=0.5))
factor(sexNum, labels=c("man", "woman"))
factor(c(1, 2, 3, 4), levels=1:4, labels=c("A", "A", "B", "B"))

nlevels(sexFac)
summary(sexFac)
levels(sexFac)
str(sexFac)
unclass(sexFac)
unclass(factor(10:15))
as.character(sexFac)

##### 2.6.2 Faktoren kombinieren ----
(fac1 <- factor(sample(LETTERS[1:5], 4), levels=LETTERS[1:5]))
(fac2 <- factor(sample(letters[1:5], 3), levels=letters[1:5]))
c(fac1, fac2)

rep(fac1, times=2)

(IV1 <- factor(rep(c("lo", "hi"), each=6)))
(IV2 <- factor(rep(1:3,          times=4)))
interaction(IV1, IV2)

##### 2.6.3 Faktorstufen nachträglich ändern ----
library(forcats)

levels(sexFac) <- c("female", "male")
sexFac
fct_recode(sexFac, "F"="female", "M"="male")

(status   <- factor(c("hi", "lo", "hi")))
status[4] <- "mid"
status
levels(status) <- c(levels(status), "mid")
status[4] <- "mid"
status
fct_expand(status, "new_level")

hiNotHi <- status
levels(hiNotHi) <- list(hi="hi", notHi=c("mid", "lo"))
hiNotHi
fct_collapse(status, notHi=c("mid", "lo"))

status[1:2]
(newStatus <- droplevels(status[1:2]))
fct_drop(status[1:2], "mid")

(fac   <- factor(c("A", "B", NA, "A", NA)))
(facNA <- factor(c("A", "B", NA, "A", NA), exclude=NULL))
table(fac)
table(addNA(fac))

(facNA_lvl <- fct_na_value_to_level(fac, "(N/A)"))
fct_na_level_to_value(facNA_lvl, "(N/A)")

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
detach(package:forcats)

##### 2.6.4 Geordnete Faktoren ----
(ordStat <- ordered(status, levels=c("lo", "mid", "hi")))
ordStat[1] > ordStat[2]
sample(ordStat, size=length(ordStat), replace=FALSE)

##### 2.6.5 Reihenfolge von Faktorstufen kontrollieren ----
vec <- c(3, 4, 3, 2, 1, 4, 1, 1)
factor(vec)
factor(vec, levels=c(4, 3, 2, 1))

(abcde <- factor(sample(LETTERS[1:5], 4), levels=LETTERS[1:5]))
relevel(abcde, ref="E")

fac1 <- factor(rep(LETTERS[1:3], each=5))
vec  <- rnorm(15, rep(c(10, 5, 15), each=5), 3)
reorder(fac1, vec, FUN=mean)
tapply(vec, fac1, FUN=mean)

fct_relevel(fac1, "A", after=1)
fct_relevel(fac1, "A", after=Inf)

(fac2 <- factor(sample(1:2, 10, replace=TRUE), labels=c("B", "A")))
sort(fac2)
sort(as.character(fac2))

##### 2.6.6 Faktoren nach Muster erstellen ----
(fac1 <- factor(rep(c("A", "B"), times=c(5, 5))))
(fac2 <- gl(2, 5, labels=c("less", "more"), ordered=TRUE))

expand.grid(IV1=gl(2, 2, labels=c("a", "b")), IV2=gl(3, 1))

##### 2.6.7 Quantitative in kategoriale Variablen umwandeln ----
IQ    <- rnorm(100, mean=100, sd=15)
IQfac <- cut(IQ, breaks=c(0, 85, 115, Inf), labels=c("lo", "mid", "hi"))
IQfac[1:5]

medSplit <- cut(IQ, breaks=c(-Inf, median(IQ), Inf))
summary(medSplit)

IQdiscr <- cut(IQ, quantile(IQ), include.lowest=TRUE)
summary(IQdiscr)

####*--------------------------------------------------------------------------*
#### 2.7 Deskriptive Kennwerte numerischer Daten ----
age <- c(17, 30, 30, 25, 23, 21)
summary(age)

##### 2.7.1 Summen, Differenzen und Produkte ----
sum(age)
cumsum(age)

diff(age)
diff(age, lag=2)

prod(age)
cumprod(age)
factorial(5)

##### 2.7.2 Extremwerte ----
max(age)
range(c(17, 30, 30, 25, 23, 21))

which.min(age)
vec <- c(-5, -8, -2, 10, 9)
which.max(vec > 0)
which.min(vec %in% c(-5, 10))

val <- 0
which.min(abs(vec-val))

diff(range(c(17, 30, 30, 25, 23, 21)))
vec1 <- c(5, 2, 0, 7)
vec2 <- c(3, 3, 9, 2)
pmax(vec1, vec2)
pmin(vec1, vec2)

##### 2.7.3 Mittelwert, Median und Modalwert ----
mean(1, 3, 7)
mean(c(1, 3, 7))

age <- c(17, 30, 30, 25, 23, 21)
mean(age)
weights <- c(0.6, 0.6, 0.3, 0.2, 0.4, 0.6)
weighted.mean(age, w=weights)

sort(age)
median(age)

ageNew <- c(age, 22)
sort(ageNew)
median(ageNew)

vec <- c(11, 22, 22, 33, 33, 33, 33)
library(DescTools)
Mode(vec)

(tab    <- table(vec))
(modIdx <- which.max(tab))
sort(unique(vec))[modIdx]

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:DescTools))

##### 2.7.4 Robuste Maße der zentralen Tendenz ----
age <- c(17, 30, 30, 25, 23, 21)
mean(age)
mean(age, trim=0.2)

library(DescTools)
border   <- quantile(age, probs=c(0.2, 0.8))
(ageWins <- Winsorize(age, val=border))
mean(ageWins)

HodgesLehmann(age)
pairM <- outer(age, age, FUN="+") / 2
median(pairM[lower.tri(pairM, diag=TRUE)])

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:DescTools))

##### 2.7.5 Prozentrang, Quartile und Quantile ----
library(DescTools)
age <- c(17, 30, 30, 25, 23, 21)
PercentRank(age) * 100
sum(age <= 25)
100 * (sum(age <= 25) / length(age))
100 * (sum(age <= 30) / length(age))

(quant <- quantile(age))
quant[c("25%", "50%")]

vec <- sample(seq(0, 1, by=0.01), 1000, replace=TRUE)
quantile(vec, probs=c(0, 0.2, 0.4, 0.6, 0.8, 1))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:DescTools))

##### 2.7.6 Varianz, Streuung, Schiefe und Wölbung ----
age <- c(17, 30, 30, 25, 23, 21)
N   <- length(age)
M   <- mean(age)
var(age)
sum((age-M)^2) / (N-1)

((N-1) / N) * var(age)
sum((age-M)^2) / N

sqrt(var(age))
sd(age)
sqrt((N-1) / N) * sd(age)
sqrt(sum((age-M)^2) / N)

##### 2.7.7 Diversität kategorialer Daten ----
fac <- factor(c("C", "D", "A", "D", "E", "D", "C", "E", "E", "B", "E"),
              levels=c(LETTERS[1:5], "Q"))
P   <- nlevels(fac)
(Fj <- proportions(table(fac)))

library(DescTools)
shannonIdx <- Entropy(Fj, base=exp(1))
(H <- (1/log(P)) * shannonIdx)

keep <- Fj > 0
-(1/log(P)) * sum(Fj[keep] * log(Fj[keep]))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:DescTools))

##### 2.7.8 Kovarianz und Korrelation ----
x <- c(17, 30, 30, 25, 23, 21)
y <- c(1, 12, 8, 10, 5, 3)
cov(x, y)

N  <- length(x)
Mx <- mean(x)
My <- mean(y)
sum((x-Mx) * (y-My)) / (N-1)
((N-1) / N) * cov(x, y)
sum((x-Mx) * (y-My)) / N

cor(x, y)
cov(x, y) / (sd(x) * sd(y))

NN <- 100
zz <- runif(NN)
xx <- zz + rnorm(NN, 0, 0.5)
yy <- zz + rnorm(NN, 0, 0.5)
## für Daten des Buches diese Datei laden:
# load("data_regrParCor.Rdata")
(cor(xx, yy)-(cor(xx, zz) * cor(yy, zz))) / sqrt((1-cor(xx, zz)^2) * (1-cor(yy, zz)^2))
(cor(xx, yy)-(cor(xx, zz) * cor(yy, zz))) / sqrt( 1-cor(xx, zz)^2)

##### 2.7.9 Robuste Streuungsmaße und Kovarianzschätzer ----
age <- c(17, 30, 30, 25, 23, 21)
sd(age)
quantile(age)
IQR(age)
mean(abs(age-mean(age)))
mean(abs(age-median(age)))
mad(age)

library(DescTools)
border  <- quantile(age, probs=c(0.2, 0.8))
ageWins <- Winsorize(age, val=border)
var(ageWins)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:DescTools))

##### 2.7.10 Kennwerte getrennt nach Gruppen berechnen ----
Njk   <- 2
P     <- 2
Q     <- 3
sex   <- factor(rep(c("f", "m"),       times=Q*Njk))
group <- factor(rep(c("T", "WL", "CG"), each=P*Njk))
xtabs(~ sex + group)

IQ <- round(rnorm(Njk*P*Q, mean=100, sd=15))
ave(IQ, sex, FUN=mean)

(tapRes <- tapply(IQ, group, FUN=mean))
tapply(IQ, list(sex, group), FUN=mean)

tapply(IQ, sex, FUN=identity)
split(IQ, sex)
IQ[sex == "f"]
IQ[sex == "m"]

##### 2.7.11 Funktionen auf geordnete Paare von Werten anwenden ----
outer(1:5, 1:5, FUN="*")

####*--------------------------------------------------------------------------*
#### 2.8 Matrizen ----
age <- c(17, 30, 30, 25, 23, 21)
matrix(age, nrow=3, ncol=2, byrow=FALSE)
(ageMat <- matrix(age, nrow=2, ncol=3, byrow=TRUE))

##### 2.8.1 Datentypen in Matrizen ----
##### 2.8.2 Dimensionierung, Zeilen und Spalten ----
age    <- c(17, 30, 30, 25, 23, 21)
ageMat <- matrix(age, nrow=2, ncol=3, byrow=FALSE)
dim(ageMat)
nrow(ageMat)
ncol(ageMat)
length(ageMat)

t(ageMat)

as.matrix(1:3)
c(ageMat)

P       <- 2
Q       <- 3
(pqMat  <- matrix(seq_len(P*Q), nrow=P, ncol=Q))
(rowMat <- row(pqMat))
(colMat <- col(pqMat))

cbind(rowIdx=c(rowMat), colIdx=c(colMat), val=c(pqMat))

mat <- matrix(sample(1:10, 16, replace=TRUE), 4, 4)
col(mat) >= row(mat)

##### 2.8.3 Elemente auswählen und verändern ----
ageMat
ageMat[2, 2]
ageMat[2, 2] <- 24
ageMat[2, 2]

ageMatCopy   <- ageMat
ageMatCopy[] <- c(1, 2, 3)
ageMatCopy

ageMat[2,  ]
ageMat[ , 1]
ageMat[ ,  ]
ageMat[ , -1]
ageMat[ , 1, drop=FALSE]
ageMat[FALSE, FALSE]

rowMat <- matrix(1:3, nrow=1)
rowMat
rowMat[]
rowMat[ , ]

ageMat[ , 2:3]
ageMat[ , c(1, 3)]

ageMatNew   <- ageMat
(replaceMat <- matrix(c(11, 21, 12, 22), nrow=2, ncol=2))
ageMatNew[ , c(1, 3)] <- replaceMat
ageMatNew

##### 2.8.4 Weitere Wege, Elemente auszuwählen und zu verändern ----
ageMat
idxVec <- c(1, 3, 4)
ageMat[idxVec]

(idxMatLog <- ageMat >= 25)
ageMat[idxMatLog]

(idxMatNum <- which(idxMatLog, arr.ind=TRUE))
ageMat[idxMatNum]
(idxMat <- arrayInd(idxVec, dim(ageMat)))

diag(ageMat)

##### 2.8.5 Matrizen verbinden ----
vec1  <- c(19, 19, 31, 19, 24)
vec2  <- c(95, 76, 94, 76, 76)
vec3  <- c(197, 178, 189, 184, 173)
rbind(vec1, vec2, vec3)
(mat <- cbind(age=vec1, weight=vec2, height=vec3))

##### 2.8.6 Matrizen sortieren ----
(rowOrder1 <- order(mat[ , "age"]))
mat[rowOrder1, ]

rowOrder2 <- order(mat[ , "age"], partial=mat[ , "weight"])
mat[rowOrder2, ]

rowOrder3 <- order(mat[ , "weight"], -mat[ , "height"])
mat[rowOrder3, ]

##### 2.8.7 Randkennwerte berechnen ----
sum(mat)
rowSums(mat)
mean(mat)
colMeans(mat)

##### 2.8.8 Beliebige Funktionen auf Matrizen anwenden ----
apply(mat, 2, sum)
apply(mat, 1, max)
apply(mat, 1, range)
apply(mat, 2, mean, trim=0.1)

##### 2.8.9 Matrix zeilen- oder spaltenweise mit Kennwerten verrechnen ----
Mj <- rowMeans(mat)
Mk <- colMeans(mat)
dimnames(mat) <- list(obs=NULL, vars=c("age", "weight", "height"))
sweep(mat, "vars", Mk, "-")
scale(mat, center=TRUE, scale=FALSE)

sweep(mat, 1, Mj, "-")
t(scale(t(mat), center=TRUE, scale=FALSE))

##### 2.8.10 Kovarianz- und Korrelationsmatrizen ----
cov(mat)
cor(mat)
cov.wt(mat, method="ML")
diag(cov(mat))

vec <- rnorm(nrow(mat))
cor(mat, vec)

####*--------------------------------------------------------------------------*
#### 2.9 Arrays ----
(myArr1 <- array(1:12,
                 dim=c(2, 3, 2),
                 dimnames=list(row=c("f", "m"), column=c("CG", "WL", "T"),
                               layer=c("high", "low"))))
myArr1[1, 3, 2]
myArr2 <- myArr1*2
myArr2[ , , "high"]
aperm(myArr1, perm=c("column", "row", "layer"))

####*--------------------------------------------------------------------------*
#### 2.10 Listen ----
myList1 <- list(c(1, 3), c(12, 8, 29, 5))
length(myList1)
lengths(myList1)
vector("list", 2)

##### 2.10.1 Komponenten auswählen und verändern ----
myList1[[2]]
idx <- 2
myList1[[idx]]
myList1[[2]][3]

(myList2 <- list(1:4, matrix(1:4, 2, 2), c("Lorem", "ipsum")))
myList2[[2]][1, 2]
myList2[[2]]
mode(myList2[[2]])
myList2[2]
mode(myList2[2])

(myList3 <- list(numvec=1:5, word="dolor"))
myList3[["word"]]
hasName(myList3, "numbers")
(myList4 <- setNames(myList3, c("numbers", "chars")))

myList3$numvec
mat     <- cbind(1:10, sample(-10:10, 10, replace=FALSE))
retList <- cov.wt(mat, method="ML")
names(retList)
retList$cov
retList[["center"]]
component <- "n.obs"
retList[[component]]
retList$n
retList$c

list2env(myList3, .GlobalEnv)
word

##### 2.10.2 Komponenten hinzufügen und entfernen ----
myList1[[3]]          <- LETTERS[1:5]
myList1[["neuKomp2"]] <- letters[1:5]
myList1$neuKomp3      <- 100:105
myList1
myListJoin <- c(myList1, myList2)

myList1$neuKomp3 <- NULL
myListJoin[c("neuKomp2", "neuKomp3")] <- NULL

##### 2.10.3 Liste mit mehreren Ebenen ----
myListAA <- list(AAA=c(1, 2), AAB=c("AAB1", "AAB2", "AAB3"))
myMatAB  <- matrix(1:8, nrow=2)
myListBA <- list(BAA=matrix(rnorm(10), ncol=2), BAB=c("BAB1", "BAB2"))
myVecBB  <- sample(1:10, 5)
myListA  <- list(AA=myListAA, AB=myMatAB)
myListB  <- list(BA=myListBA, BB=myVecBB)
myList4  <- list(A=myListA, B=myListB)
str(myList4)
str(myList4$B)
str(myList4$B$BA)
myList4$B$BA$BAA[4, 2]

myList5 <- list(c(1, 2, 3), c("A", "B"), matrix(5:12, 2))
unlist(myList5)

myList6 <- list(L1=list(L1a=1:3,      L1b=LETTERS[1:3]),
                L2=list(L2a=rnorm(2), L2b=letters[7:8]))

str(myList6)
unlist(myList6, recursive=FALSE)

####*--------------------------------------------------------------------------*
#### 2.11 Datensätze ----
N      <- 12
sex    <- sample(c("f", "m"), N, replace=TRUE)
group  <- sample(rep(c("CG", "WL", "T"), 4), N, replace=FALSE)
age    <- sample(18:35, N, replace=TRUE)
IQ     <- round(rnorm(N, mean=100, sd=15))
rating <- round(runif(N, min=0, max=6))
(myDf1 <- data.frame(id=1:N, sex, group, age, IQ, rating))

## für Daten des Buches diese Datei laden:
# load("data_myDf1.Rdata")
dim(myDf1)
nrow(myDf1)
ncol(myDf1)
summary(myDf1)

##### 2.11.1 Datentypen in Datensätzen ----
## für Daten des Buches diese Datei laden:
# load("data_myDf1.Rdata")
str(myDf1)

group_fac <- factor(group)
myDf2     <- data.frame(group_fac, IQ)
str(myDf2)

##### 2.11.2 Elemente auswählen und verändern ----
## für Daten des Buches diese Datei laden:
# load("data_myDf1.Rdata")
myDf1[[3]][2]
myDf1$rating
myDf1$age[4]
myDf1$IQ[10:12] <- c(99, 110, 89)
myDf1[3, 4]
myDf1[4, "group"]
myDf1[2, ]
myDf1[ , "age"]
myDf1[1:5, 4, drop=FALSE]

##### 2.11.3 Namen von Variablen und Beobachtungen ----
## für Daten des Buches diese Datei laden:
# load("data_myDf1.Rdata")
dimnames(myDf1)
names(myDf1)

(rows <- paste("Z", 1:12, sep=""))
rownames(myDf1) <- rows
head(myDf1)

rownames(myDf1) <- NULL

##### 2.11.4 Datensätze in den Suchpfad einfügen ----
## für Daten des Buches diese Datei laden:
# load("data_myDf1.Rdata")
with(myDf1, tapply(IQ, group, FUN=mean))
xtabs(~ sex + group, data=myDf1)
## IQ[3]
attach(myDf1)
IQ[3]
search()[1:4]

IQ[3] <- 130
IQ[3]
myDf1$IQ[3]
detach(myDf1)
## IQ

####*--------------------------------------------------------------------------*
#### 2.12 Häufigkeiten bestimmen ----
##### 2.12.1 Einfache Tabellen absoluter und relativer Häufigkeiten ----
(myLetters <- sample(LETTERS[1:5], size=12, replace=TRUE))
## für Daten des Buches diese Datei laden:
# load("data_myLetters.Rdata")
table(myLetters)
(tab <- xtabs(~ myLetters))
names(tab)
tab["B"]

(relFreq <- proportions(tab))
cumsum(relFreq)

letFac <- factor(myLetters, levels=c(LETTERS[1:5], "Q"))
letFac
xtabs(~ letFac)

##### 2.12.2 Iterationen zählen ----
(vec <- rep(rep(c("f", "m"), times=3), times=c(1, 3, 2, 4, 1, 2)))
(res <- rle(vec))
length(res$lengths)
inverse.rle(res)

##### 2.12.3 Absolute und (bedingte) relative Häufigkeiten in Kreuztabellen ----
N <- 10
(persons <- data.frame(sex  =factor(sample(c("home", "office"), size=N, replace=TRUE)),
                       work =factor(sample(c("f", "m"),         size=N, replace=TRUE)),
                       group=factor(sample(c("A", "B"),         size=N, replace=TRUE)),
                       counts=      sample(0:5,                 size=N, replace=TRUE)))

## für Daten des Buches diese Datei laden:
# load("data_xtabs.Rdata")
(absFreq <- xtabs(~ sex + work, data=persons))
table(persons$sex, persons$work)
xtabs(counts ~ sex + work, data=persons)

(relFreq <- proportions(absFreq))
proportions(absFreq, 1)
proportions(absFreq, "work")

rSums <- rowSums(relFreq)
cSums <- colSums(relFreq)
sweep(relFreq, 1, rSums, "/")
sweep(relFreq, 2, cSums, "/")

## für Daten des Buches diese Datei laden:
# load("data_xtabs.Rdata")
(cTab3D <- xtabs(~ work + sex + group, data=persons))

with(persons,
     ftable(work, sex, group, row.vars="work", col.vars=c("sex", "group")))

ftable(cTab3D, row.vars="work", col.vars="sex")

summary(xtabs(~ sex + work, data=persons))

##### 2.12.4 Randkennwerte von Kreuztabellen ----
## für Daten des Buches diese Datei laden:
# load("data_xtabs.Rdata")
colMeans(  xtabs(~ sex + work, data=persons))
addmargins(xtabs(~ sex + work, data=persons), c(1, 2), FUN=mean)

margin.table(cTab3D, margin=c("work", "sex"))

##### 2.12.5 Datensätze aus Häufigkeitstabellen erstellen ----
## für Daten des Buches diese Datei laden:
# load("data_xtabs.Rdata")
cTab <- xtabs(~ sex + work, data=persons)

library(DescTools)
Untable(cTab)

as.data.frame(cTab)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:DescTools))

##### 2.12.6 Kumulierte relative Häufigkeiten und Prozentrang ----
(vec <- round(rnorm(10), 2))
Fn   <- ecdf(vec)
Fn(vec)
100 * Fn(0.1)
100 * (sum(vec <= 0.1) / length(vec))
Fn(sort(vec))
knots(Fn)

####*--------------------------------------------------------------------------*
#### 2.13 Fehlende Werte behandeln  ----
##### 2.13.1 Fehlende Werte codieren und identifizieren ----
c(1, 2, 3)[NA]

(vec1 <- c(10, 20, NA, 40, 50, NA))
length(vec1)

factor(c("A", "NA", "C"))[c(NA, 2, 3)]
LETTERS[c(1, NA, 3)]
factor(LETTERS[c(1, NA, 3)])

is.na(NaN)
is.na(vec1)

vec2   <- c(NA, 7, 9, 10, 1, 8)
(datNA <- data.frame(vec1, vec2))
is.na(datNA)

anyNA(vec1)
sum(is.na(vec1))
which(is.na(vec1))
which(is.na(datNA), arr.ind=TRUE)

##### 2.13.2 Fehlende Werte ersetzen und umcodieren ----
vec <- c(30, 25, 23, 21, -999, 999)
is.na(vec) <- vec %in% c(-999, 999)
vec

##### 2.13.3 Behandlung fehlender Werte bei der Berechnung einfacher Kennwerte ----
TRUE | NA

sd(na.omit(vec))
keep <- !is.na(vec)
mean(vec[keep])

sum(vec)
sum(vec, na.rm=TRUE)

##### 2.13.4 Behandlung fehlender Werte in Matrizen ----
##### 2.13.4.1 Zeilenweiser (fallweiser) Ausschluss ----
ageNA  <- c(18, NA, 27, 22)
DV1    <- c(NA, 1, 5, -3)
DV2    <- c(9, 4, 2, 7)
(matNA <- cbind(ageNA, DV1, DV2))

na.omit(matNA)
colMeans(na.omit(matNA))

cov(matNA, use="complete.obs")
all.equal(cov(matNA, use="complete.obs"), cov(na.omit(matNA)))

##### 2.13.4.2 Paarweiser Ausschluss ----
rowMeans(matNA)
rowMeans(matNA, na.rm=TRUE)
cov(matNA, use="pairwise.complete.obs")

##### 2.13.5 Behandlung fehlender Werte beim Sortieren von Daten ----
##### 2.13.6 Behandlung fehlender Werte in inferenzstatistischen Tests ----
##### 2.13.7 Multiple Imputation ----

####*--------------------------------------------------------------------------*
#### 2.14 Zeichenketten verarbeiten ----
##### 2.14.1 Objekte in Zeichenketten umwandeln ----
randVals <- round(rnorm(5), 2)
toString(randVals)
formatC(3, digits=5, format="d")
formatC(c(1, 2.345), width=5, format="f")

##### 2.14.2 Zeichenketten erstellen und ausgeben ----
length("ABCDEF")
nchar("ABCDEF")
nchar(c("A", "BC", "DEF"))

##### 2.14.2.1 Zeichenketten nach Muster erstellen ----
paste("group", LETTERS[1:5], sep="_")
paste(1:5, palette()[1:5], sep=": ")
paste(1:5, letters[1:5], sep=".", collapse=" ")
paste(1, NA, 2, NULL, 3, character(0), sep="_")

N   <- 20
grp <- "A"
M   <- 14.2
sprintf("For %d particpants in group %s, the mean was %f", N, grp, M)
sprintf("%.3f", 1.23456)
sprintf("%.4d", c(1, 52, 712))

cVar <- "A string"
print(cVar, quote=FALSE)
noquote(cVar)

##### 2.14.2.2 Zeichenketten verbinden ----
cat(cVar, "with\n", 4, "\nwords\n", sep="+")

##### 2.14.3 Zeichenketten manipulieren ----
tolower(c("A", "BC", "DEF"))
toupper(c("ghi", "jk", "i"))
strtrim("AfairlyLongString", width=6)
abbreviate("AfairlyLongString", minlength=6)
trimws(c("  Quattuor ", "   quinque   "), which="both")
strrep(c("A", "B", "C"), 1:3)

library(DescTools)
StrRev(c("Lorem", "ipsum", "dolor", "sit"))

strsplit(c("abc_def_ghi", "jkl_mno"), split="_")
strsplit("Xylophon", split=NULL)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:DescTools))

##### 2.14.4 Zeichenfolgen finden ----
 match(c("abc", "de", "f", "h"), c("abcde", "abc", "de", "fg", "ih"))
pmatch(c("abc", "de", "f", "h"), c("abcde", "abc", "de", "fg", "ih"))
startsWith(c("Train", "Station"), "T")
  endsWith(c("Train", "Station"), "n")
grep( "A[BC][[:blank:]]", c("AB ", "AB", "AC ", "A "))
grep( "A[BC][[:blank:]]", c("AB ", "AB", "AC ", "A "),
     value=TRUE)

grepl("A[BC][[:blank:]]", c("AB ", "AB", "AC ", "A "))

##### 2.14.5 Zeichenfolgen extrahieren ----
substring(c("ABCDEF", "GHIJK", "LMNO", "PQR"), first=4, last=5)

pat    <- "[[:upper:]]+"
txt    <- c("abcDEFG", "ABCdefg", "abcdefg")
(start <- regexpr(pat, txt))
regmatches(txt, start)

txt2   <- c("abcDEFGhijKL", "ABCdefgHIJ", "abcdefg")
start2 <- gregexpr(pat, txt2)
regmatches(txt2, start2)

glob2rx("asdf*.txt")

##### 2.14.6 Zeichenfolgen ersetzen ----
charVec <- c("ABCDEF", "GHIJK", "LMNO", "PQR")
substring(charVec, 4, 5) <- c("..", "xx", "++", "**")
charVec

 sub("em", "XX", "Lorem ipsum dolor sit Lorem ipsum")
gsub("em", "XX", "Lorem ipsum dolor sit Lorem ipsum")

gsub("^[[:alpha:]]+-([[:digit:]]+)-[[:alpha:]]+$", "\\1", "abc-412-def")

##### 2.14.7 Zeichenketten als Befehl ausfühen ----
obj1 <- parse(text="3 + 4")
obj2 <- parse(text=c("vec <- c(1, 2, 3)", "vec^2"))
eval(obj1)
eval(obj2)

####*--------------------------------------------------------------------------*
#### 2.15 Datum und Uhrzeit ----
##### 2.15.1 Datumsangaben erstellen und formatieren  ----
Sys.Date()

(myDate <- as.Date("01.11.1974", format="%d.%m.%Y"))
format(myDate, format="%d.%m.%Y")

(negDate <- as.Date(-374, origin="1910-12-16"))
as.numeric(negDate)

##### 2.15.2 Uhrzeit ----
Sys.time()
date()

(myTime   <- as.POSIXct("2009-02-07 09:23:02"))
charDates <- c("05.08.1972, 03:37", "31.03.1981, 12:44")
(lDates   <- strptime(charDates, format="%d.%m.%Y, %H:%M"))
as.POSIXct(678295.25, origin=as.Date("1970-01-01"))
lDates$mday
lDates$hour
ISOdate(2010, 6, 30, 17, 32, 10, tz="CET")

format(myTime, "%H:%M:%S")
format(lDates, "%d.%m.%Y")

weekdays(lDates)
months(lDates)

Sys.getenv("locale")

##### 2.15.3 Mit Datum und Uhrzeit rechnen ----
(myDate <- as.Date("01.11.1974", format="%d.%m.%Y"))
myDate + 365

(diffDate <- as.Date("1976-06-19") - myDate)
as.numeric(diffDate)
myDate + diffDate

lDates + c(60, 120)
(diff21 <- lDates[2] - lDates[1])
lDates[1] + diff21

round(lDates, units="days")
trunc(lDates, units="years")

seq(ISOdate(2010, 5, 1), ISOdate(2013, 5, 1), by="years")
seq(ISOdate(1997, 10, 22), by="2 weeks", length.out=4)

secsPerDay <- 60 * 60 * 24
randDates  <- ISOdate(1995, 6, 13) + sample(0:(28*secsPerDay), 100, replace=TRUE)
head(round(randDates, units="days"), n=4)
randWeeks  <- cut(randDates, breaks="week")
summary(randWeeks)

####*--------------------------------------------------------------------------*
### 3 Datensätze ----
####*--------------------------------------------------------------------------*

####*--------------------------------------------------------------------------*
#### 3.1 Daten importieren und exportieren ----
##### 3.1.1 Datentabellen im Textformat ----
## (xDf <- read.table(file=stdin(), header=TRUE))
txt <- 'X Y Z
123 "A B C" 34.8
99 "D E F" 3543.2'
read.table(text=txt, header=TRUE)

myDf <- data.frame(IV=factor(rep(c("A", "B"), 5)),
                   DV=rnorm(10))
write.table(myDf, file="data.txt", row.names=FALSE)
myDf <- read.table("data.txt", header=TRUE)
str(myDf)

##### 3.1.2 R-Objekte ----
myDf <- data.frame(IV=factor(rep(c("A", "B"), 5)),
                   DV=rnorm(10))
save(myDf, file="data.RData")
load("data.RData")
saveRDS(myDf, file="data.rda")
myDf <- readRDS("data.rda")
dump("myDf", file="dumpMyDf.txt")
source("dumpMyDf.txt")

##### 3.1.3 Daten mit anderen Programmen austauschen ----
##### 3.1.3.1 Programme zur Tabellenkalkulation ----´
## myDf <- read.table("clipboard", sep="\t", dec=",")
library(readxl)
d <- read_xlsx("data.xlsx", sheet="Datenblatt", skip=2,
               col_names=c("ID", "Gruppe", "Wert"),
               col_types=c("text", "text", "numeric"))

as.data.frame(d)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:readxl))

##### 3.1.3.2 SPSS, Stata und SAS ----
## load("data.RData")
library(haven)
myDf_sav <- read_sav("myDf.sav")
head(myDf_sav, n=4)
str(myDf_sav)
myDf_org <- transform(myDf, IV=as_factor(IV))
str(myDf_org)

myDf <- data.frame(IV=factor(rep(c("A", "B"), 5)),
                   DV=rnorm(10),
                   intVar=sample(1L:3L, 10, replace=TRUE))
attr(myDf$IV, "label") <- "Gruppierungsfaktor"
attr(myDf$DV, "label") <- "Messwerte"
class(myDf$intVar) <- "haven_labelled"
attr(myDf$intVar, "labels") <- c("Val1"=1L, "Val2"=2L, "Val3"=3L)
write_sav(myDf, "myDf.sav")

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:haven))

##### 3.1.3.3 Datenbanken ----
library(duckdb)
drv <- dbDriver("duckdb")
con <- dbConnect(drv, dbname=":memory:") # "myDf.db")

IQ     <- rnorm(2*100, mean=100, sd=15)
rating <- sample(LETTERS[1:3], 2*50, replace=TRUE)
sex    <- factor(rep(c("f", "m"), times=50))
myDf   <- data.frame(sex, IQ, rating, stringsAsFactors=FALSE)
## für Daten des Buches diese Datei laden:
# load("data_db.Rdata")

dbWriteTable(con, name="MyDataFrame", value=myDf, row.names=FALSE)
dbListTables(con)
dbListFields(con, "MyDataFrame")
out <- dbReadTable(con, "MyDataFrame")
head(out, n=4)
dbGetQuery(con, "SELECT sex, AVG(IQ) AS mIQ, SUM(IQ) AS sIQ FROM MyDataFrame GROUP BY sex")
res <- dbSendQuery(con, "SELECT IQ, rating FROM MyDataFrame WHERE rating = 'A'")
while(!dbHasCompleted(res)) {
    partial <- dbFetch(res, n=3)
    print(partial)
}

dbClearResult(res)
dbRemoveTable(con, "MyDataFrame")
dbDisconnect(con, shutdown=TRUE)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:duckdb))
try(detach(package:DBI))

##### 3.1.4 Daten in der Konsole einlesen ----
## vec <- scan()
## charVec <- scan(what=character())

##### 3.1.5 Unstrukturierte Textdateien nutzen ----
x <- scan("scan.txt", what=list(A=character(), B=numeric()))
x
as.data.frame(x)

####*--------------------------------------------------------------------------*
#### 3.2 Dateien verwalten ----
##### 3.2.1 Dateien auswählen ----
getwd()
## paths <- list.files(path="d:/files", pattern="\\.txt$", full.names=TRUE)
## DFlist <- lapply(paths, function(f) {
##    read.table(f, header=TRUE) })

## DFall <- do.call(rbind, DFlist)

## texFiles1 <- Sys.glob("../r_gddmr/*.tex")
## head(texFiles1, n=4)
## texFiles2 <- Sys.glob("d:/work/r_gddmr/*.tex")
## head(texFiles2, n=2)

##### 3.2.2 Dateipfade manipulieren ----
basename("c:/path/to/file.txt")
dirname("c:/path/to/file.txt")

library(tools)
file_ext("c:/path/to/file.txt")
file_path_sans_ext("c:/path/to/file.txt")

##### 3.2.3 Dateien verändern ----
## dir.create("newDir")
## file.create("newDir/newFile.txt")
## file.copy("newDir/newFile.txt", to="newDir/fileA.txt")
## file.rename("newDir/fileA.txt", to="newDir/fileB.txt")
## file.remove("newDir/newFile.txt")
## file.exists("newDir/newFile.txt")
## file.exists("newDir/fileB.txt")

####*--------------------------------------------------------------------------*
#### 3.3 Datensätze aufbereiten und aggregieren ----
## für Daten des Buches diese Datei laden:
# load("data_myDf1.Rdata")
head(myDf1)

##### 3.3.1 Verkettung von Befehlen mit der Pipe ----
## für Daten des Buches diese Datei laden:
# load("data_myDf1.Rdata")
myDf2 <- setNames(myDf1, c("ID", "SEX", "GROUP", "AGE", "IQ", "RATING"))
myDf3 <- transform(myDf2, IQ_Z=scale(IQ),
                   group2=forcats::fct_collapse(GROUP, CG_WL=c("CG", "WL")))
myDf4 <- subset(myDf3, group2 == "CG_WL")
sort_by(myDf4, ~ RATING)

df_mod <- myDf1 |>
    setNames(c("ID", "SEX", "GROUP", "AGE", "IQ", "RATING")) |>
    transform(IQ_Z=scale(IQ),
              group2=forcats::fct_collapse(GROUP, CG_WL=c("CG", "WL"))) |>
    subset(group2 == "CG_WL") |>
    sort_by(~ RATING)

head(df_mod, n=4)

myDf1 |>
    lm(IQ ~ age + rating, data=_)

##### 3.3.2 Variablen umbenennen ----
## für Daten des Buches diese Datei laden:
# load("data_myDf1.Rdata")
(names1 <- names(myDf1))
hasName(myDf1, "ID")

names1[3] <- "fac"
df_mod1   <- setNames(myDf1, names1)
head(df_mod1, 3)

names1[names1 == "fac"] <- "cond"
df_mod2 <- setNames(myDf1, names1)
head(df_mod2, 3)

df_mod3 <- setNames(myDf1, toupper(names1))
head(df_mod3, n=3)

##### 3.3.3 Teilmengen von Daten auswählen ----
##### 3.3.3.1 Variablen auswählen ----
## für Daten des Buches diese Datei laden:
# load("data_myDf1.Rdata")
subset(myDf1, select=c(group, IQ))
subset(myDf1, select=c(-sex, -IQ))
(colIdx <- grep("^i.+", names(myDf1), ignore.case=TRUE))
subset(myDf1, select=colIdx)

(colIdx_num <- vapply(myDf1, is.numeric, logical(1)))
subset(myDf1, select=colIdx_num)

##### 3.3.3.2 Beobachtungen auswählen ----
subset(myDf1, sex == "f")
subset(myDf1, id == rating)
subset(myDf1, (sex == "m") & (rating > 2))
subset(myDf1, (IQ < 90) | (IQ > 110))
subset(myDf1, group %in% c("CG", "WL"))

##### 3.3.4 Doppelte und fehlende Werte ausschließen ----
## für Daten des Buches diese Datei laden:
# load("data_myDf1.Rdata")
myDfDouble <- rbind(myDf1, myDf1[sample(seq_len(nrow(myDf1)), 4), ])
duplicated(myDfDouble) | duplicated(myDfDouble, fromLast=TRUE)
myDfUnique <- unique(myDfDouble)
any(duplicated(myDfUnique))

myDfNA           <- myDf1
myDfNA$IQ[2]     <- NA
myDfNA$rating[3] <- NA

is.na(myDfNA)[1:3, ]
apply(myDfNA, 2, anyNA)
apply(myDfNA, 2, function(x) { sum(is.na(x)) })
apply(myDfNA, 2, function(x) { length(na.omit(x)) })

complete.cases(myDfNA)
table(complete.cases(myDfNA))
subset(myDfNA, !complete.cases(myDfNA))
idx_cmpl_IQ <- with(myDfNA, complete.cases(IQ))
subset(myDfNA, idx_cmpl_IQ)

head(na.omit(myDfNA), n=4)

##### 3.3.5 Variablen entfernen, hinzufügen und transformieren ----
## für Daten des Buches diese Datei laden:
# load("data_myDf1.Rdata")
dfTemp <- myDf1
dfTemp$group <- NULL
head(dfTemp, n=3)

dfTemp[c("sex", "IQ")] <- list(NULL)
head(dfTemp, n=3)

married <- sample(c(TRUE, FALSE), nrow(myDf1), replace=TRUE)
myDf2   <- myDf1
myDf2$married1    <- married
myDf2["married2"] <- married
myDf3 <- cbind(myDf1, married)
head(myDf3, n=3)
myDf4 <- transform(myDf3,
                   rSq=rating^2,
                   IQgrp=cut(IQ, breaks=c(0, 100, Inf)))
head(myDf4, n=3)

##### 3.3.6 Datensätze sortieren ----
## für Daten des Buches diese Datei laden:
# load("data_myDf1.Rdata")
sort_by(myDf1, ~ rating)
sort_by(myDf1, ~ group + rating)

##### 3.3.7 Datensätze aufteilen ----
## für Daten des Buches diese Datei laden:
# load("data_myDf1.Rdata")
split(myDf1, myDf1$group)
split(myDf1, ~ group)
split(myDf1, ~ sex + group)

##### 3.3.8 Datensätze zeilen oder spaltenweise verbinden  ----
(dfNew <- data.frame(id=13:15,
                     group=c("CG", "WL", "T"),
                     sex=c("f", "f", "m"),
                     age=c(18, 31, 21),
                     IQ=c(116, 101, 99),
                     rating=c(4, 4, 1)))
dfComb <- rbind(myDf1, dfNew)
dfComb[11:15, ]

##### 3.3.9 Datensätze zusammenführen ----
(IDDV <- data.frame(ID=factor(rep(1:3, each=2)),
                    DV=round(rnorm(6, 100, 15))))

(IV <- data.frame(ID=factor(1:3),
                  IV=factor(c("A", "B", "A")),
                  sex=factor(c("f", "f", "m"))))

merge(IDDV, IV)

(dfA <- data.frame(ID=1:4,
                   initials=c("AB", "CD", "EF", "GH"),
                   IV1=c("-", "-", "+", "+"),
                   DV1=c(10, 10, 11, 14)))

(dfB <- data.frame(ID_mod=1:4,
                   initials=c("AB", "CD", "EF", "GH"),
                   IV2=c("A", "B", "A", "B"),
                   DV2=c(91, 89, 92, 79)))

merge(dfA, dfB, by.x="ID", by.y="ID_mod")

(dfC <- data.frame(ID=3:6,
                   initials=c("EF", "GH", "IJ", "KL"),
                   IV2=c("A", "B", "A", "B"),
                   DV2=c(92, 79, 101, 81)))
merge(dfA, dfC)
merge(dfA, dfC,             all.y=TRUE)
merge(dfA, dfC, all.x=TRUE, all.y=TRUE)

##### 3.3.10 Organisationsform einfacher Datensätze ändern ----
vec1  <- sample(1:10, 3, replace=TRUE)
vec2  <- sample(1:10, 2, replace=TRUE)
vec3  <- sample(1:10, 1, replace=TRUE)
(lTmp <- list(cond1=vec1, cond2=vec2, cond3=vec3))
(res  <- stack(lTmp))
str(res)
unstack(res)

Nj        <- 3
res$IVnew <- factor(sample(rep(c("A", "B"), Nj), 2*Nj, replace=FALSE))
res$DVnew <- sample(100:200, 2*Nj)
unstack(res, form=DVnew ~ IVnew)

##### 3.3.11 Organisationsform komplexer Datensätze ändern ----
##### 3.3.11.1 Vorgehen bei einem Messwiederholungsfaktor ----
Nj    <- 2
P     <- 2
Q     <- 3
id    <- 1:(P*Nj)
DV_t1 <- round(rnorm(P*Nj, -1, 1), 2)
DV_t2 <- round(rnorm(P*Nj,  0, 1), 2)
DV_t3 <- round(rnorm(P*Nj,  1, 1), 2)
IVbtw <- factor(rep(c("A", "B"), Nj))
(dfW1 <- data.frame(id, IVbtw, DV_t1, DV_t2, DV_t3))

idL    <- rep(id, Q)
DVl    <- c(DV_t1, DV_t2, DV_t3)
IVwth  <- factor(rep(1:3, each=P*Nj))
IVbtwL <- rep(IVbtw, times=Q)
dfL1a  <- data.frame(id=idL, IVbtw=IVbtwL, IVwth=IVwth, DV=DVl)
sort_by(dfL1a, ~ id)

dfL1b <- reshape(dfW1, varying=c("DV_t1", "DV_t2", "DV_t3"),
                 direction="long", idvar=c("id", "IVbtw"), v.names="DV")
head(sort_by(dfL1b, ~ id))
reshape(dfL1a, v.names="DV", timevar="IVwth",
        idvar=c("id", "IVbtw"), direction="wide")

##### 3.3.11.2 Vorgehen bei mehreren Messwiederholungsfaktoren ----
N    <- 4
id   <- seq_len(N)
t_11 <- round(rnorm(N,  8, 2), 2)
t_12 <- round(rnorm(N, 10, 2), 2)
t_21 <- round(rnorm(N, 13, 2), 2)
t_22 <- round(rnorm(N, 15, 2), 2)
t_31 <- round(rnorm(N, 13, 2), 2)
t_32 <- round(rnorm(N, 15, 2), 2)
(dfW2 <- data.frame(id, t_11, t_12, t_21, t_22, t_31, t_32))

(dfL2_IV1 <- reshape(dfW2, varying=list(c("t_11", "t_21", "t_31"),
                                        c("t_12", "t_22", "t_32")),
                     direction="long", timevar="IV1", idvar="id",
                     v.names=c("IV2-1", "IV2-2")))

dfL2_IV1_IV2 <- reshape(dfL2_IV1, varying=c("IV2-1", "IV2-2"),
                        direction="long", timevar="IV2", idvar=c("id", "IV1"),
                        v.names="DV")
head(dfL2_IV1_IV2)

dfW2_IV2 <- reshape(dfL2_IV1_IV2, v.names="DV", timevar="IV2",
                    idvar=c("id", "IV1"), direction="wide")

(dfW2_IV1_IV2 <- reshape(dfW2_IV2, v.names=c("DV.1", "DV.2"),
                         timevar="IV1", idvar="id", direction="wide"))

##### 3.3.12 Datensätze getrennt nach Gruppen auswerten und aggregieren ----
## für Daten des Buches diese Datei laden:
# load("data_myDf1.Rdata")
(dat_spl <- split(myDf1, ~ group))
sapply(dat_spl, function(x) { c(M=mean(x$IQ)) })

(m_sd <- sapply(dat_spl, function(x) { c(M=mean(x$IQ), SD=sd(x$IQ)) }))
t(m_sd)

dat_spl2 <- split(myDf1, list(myDf1$sex, myDf1$group), drop=TRUE)
sapply(dat_spl2, function(x) { c(M=mean(x$IQ), SD=sd(x$IQ)) })

aggr_fun <- function(x) {
    data.frame(Sex=unique(x$sex),
               Group=unique(x$group),
               M=mean(x$IQ),
               SD=sd(x$IQ))
}

m_sdL <- lapply(dat_spl2, aggr_fun)
do.call(rbind, m_sdL)

aggregate(cbind(age, IQ) ~ sex + group, FUN=mean, data=myDf1)
aggregate(cbind(age, IQ) ~ 1, FUN=mean, data=myDf1)

d_mean <- aggregate(cbind(age, IQ) ~ sex + group, FUN=mean, data=myDf1)
d_sd   <- aggregate(cbind(age, IQ) ~ sex + group, FUN=sd,   data=myDf1)
merge(d_mean, d_sd, by=c("sex", "group"), suffixes = c(".M",".SD"))

##### 3.3.13 Funktionen auf Variablen anwenden ----
## für Daten des Buches diese Datei laden:
# load("data_myDf1.Rdata")
numDf <- subset(myDf1, select=c(age, IQ, rating))
(myL  <- lapply(numDf, mean))
sapply(numDf, range)
(numIdx <- sapply(myDf1, is.numeric))
dataNum <- subset(myDf1, select=numIdx)
head(dataNum)
data.matrix(dataNum)

do.call(c, myL)
work   <- factor(sample(c("home", "office"), 20, replace=TRUE))
hiLo   <- factor(sample(c("hi", "lo"),       20, replace=TRUE))
group  <- factor(sample(c("A", "B"),         20, replace=TRUE))
tab    <- table(work, hiLo, group)
argLst <- list(tab, row.vars="work", col.vars=c("hiLo", "group"))
do.call(ftable, argLst)

##### 3.3.14 Funktionen für mehrere Variablen anwenden ----
N     <- 100
x1    <- rnorm(N, 10, 10)
y1    <- rnorm(N, 10, 10)
x2    <- x1 + rnorm(N, 5, 4)
y2    <- y1 + rnorm(N, 10, 4)
myDf2 <- data.frame(x1, y1)
myDf3 <- data.frame(x2, y2)
mapply(t.test, myDf2, myDf3, MoreArgs=list(alternative="less", var.equal=TRUE))

####*--------------------------------------------------------------------------*
#### 3.4 Datensätze aufbereiten und aggregieren mit dplyr ----
## für Daten des Buches diese Datei laden:
# load("data_myDf1.Rdata")
head(myDf1)

##### 3.4.1 Besonderheiten ----
##### 3.4.2 Variablen umbenennen ----
## für Daten des Buches diese Datei laden:
# load("data_myDf1.Rdata")
library(dplyr)

myDf1 |>
    rename(score=rating,
           fac=group)

## entferne die (ggf. automatisch) geladenen Zusatzpakete
try(detach(package:dplyr))

##### 3.4.3 Teilmengen von Daten auswählen ----
##### 3.4.3.1 Variablen auswählen ----
## für Daten des Buches diese Datei laden:
# load("data_myDf1.Rdata")
library(dplyr)

myDf1 |>
    select(group, IQ)

myDf1 |>
    select(matches("^i.+"))

myDf1 |>
    select(where(is.numeric))

myDf1 |>
    select(!where(is.numeric) & starts_with("g"))

myDf1 |>
    select(-sex, -matches("a.+"))

myDf1 |>
    select(-(group:IQ))

##### 3.4.3.2 Beobachtungen auswählen ----
myDf1 |>
    filter(group %in% c("CG", "WL"),
           id > rating,
           IQ > 90)

myDf1 |>
    filter((IQ < 90) | (IQ > 110))

myDf1 |>
    slice(5:7)

## entferne die (ggf. automatisch) geladenen Zusatzpakete
try(detach(package:dplyr))

##### 3.4.4 Doppelte und fehlende Werte ausschließen ----
## für Daten des Buches diese Datei laden:
# load("data_myDf1.Rdata")
library(dplyr)

myDf1 |>
    select(sex, group) |>
    distinct()

myDf1 |>
    distinct(sex, group)

myDf1 |>
    n_distinct()

myDf9999           <- myDf1
myDf9999$IQ[2]     <- 9999
myDf9999$rating[3] <- 9999

(myDfNA <- myDf9999 |>
        mutate(IQ=na_if(IQ, 9999),
               rating=na_if(rating, 9999)))

myDfNA |>
    na.omit()

## entferne die (ggf. automatisch) geladenen Zusatzpakete
try(detach(package:dplyr))

##### 3.4.5 Variablen entfernen, hinzufügen und transformieren ----
## für Daten des Buches diese Datei laden:
# load("data_myDf1.Rdata")
library(dplyr)
library(forcats)

myDf1 |>
    mutate(married=sample(c(TRUE, FALSE), n(), replace=TRUE))

myDf1 |>
    mutate(group=fct_collapse(group, CG_WL=c("CG", "WL")),
           ratingSq=rating^2,
           ratingSqZ=scale(ratingSq))

myDf1 |>
    mutate(age_even=if_else((age %% 2) == 0, TRUE, FALSE),
           sex_IQ=case_when(
               ((sex == "f") & (IQ <  100)) ~ "female_lo",
               ((sex == "f") & (IQ >= 100)) ~ "female_hi",
               ((sex == "m") & (IQ <  100)) ~ "male_lo",
               ((sex == "m") & (IQ >= 100)) ~ "male_hi",
               TRUE                         ~ "other"))

## entferne die (ggf. automatisch) geladenen Zusatzpakete
try(detach(package:dplyr))
try(detach(package:forcats))

##### 3.4.6 Datensätze sortieren ----
## für Daten des Buches diese Datei laden:
# load("data_myDf1.Rdata")
library(dplyr)

myDf1 |>
    arrange(rating)

myDf1 |>
    arrange(group, desc(IQ))

myDf1 |>
    select(group, age, everything())

myDf1 |>
    relocate(group, .after=rating)

## entferne die (ggf. automatisch) geladenen Zusatzpakete
try(detach(package:dplyr))

##### 3.4.7 Datensätze zeilen- oder spaltenweise verbinden ----
## für Daten des Buches diese Datei laden:
# load("data_myDf1.Rdata")
library(dplyr)

myDf2 <- data.frame(id=21,
                    sex=factor("f", levels=c("f", "m")),
                    age=48,
                    rating=3)

bind_rows(myDf1, myDf2) |>
    tail(n=3)

## entferne die (ggf. automatisch) geladenen Zusatzpakete
try(detach(package:dplyr))

##### 3.4.8 Datensätze zusammenführen ----
library(dplyr)

(IDDV <- data.frame(ID=factor(rep(1:3, each=2)),
                    DV=round(rnorm(6, 100, 15))))

(IV <- data.frame(ID=factor(1:3),
                  IV=factor(c("A", "B", "A")),
                  sex=factor(c("f", "f", "m"))))

merge(IDDV, IV)
IDDV |>
    left_join(IV, by=join_by(ID))

(dfA <- data.frame(ID=1:4,
                   initials=c("AB", "CD", "EF", "GH"),
                   IV1=c("-", "-", "+", "+"),
                   DV1=c(10, 10, 11, 14),
                   stringsAsFactors=FALSE))

(dfB <- data.frame(ID_mod=3:6,
                   initials=c("EF", "GH", "IJ", "KL"),
                   IV2=c("A", "B", "A", "B"),
                   DV2=c(92, 79, 101, 81),
                   stringsAsFactors=FALSE))

merge(dfA, dfB, all.x=TRUE, by.x=c("ID", "initials"), by.y=c("ID_mod", "initials"))
dfA |>
    left_join(dfB, by=join_by(ID == ID_mod, initials))

merge(dfA, dfB, all.y=TRUE, by.x=c("ID", "initials"), by.y=c("ID_mod", "initials"))
dfA |>
    right_join(dfB, by=join_by(ID == ID_mod, initials))

merge(dfA, dfB, all.x=TRUE, all.y=TRUE, by.x=c("ID", "initials"), by.y=c("ID_mod", "initials"))
dfA |>
    full_join(dfB, by=join_by(ID == ID_mod, initials))

merge(dfA, dfB, by.x=c("ID", "initials"), by.y=c("ID_mod", "initials"))
dfA |>
    inner_join(dfB, by=join_by(ID == ID_mod, initials))

## entferne die (ggf. automatisch) geladenen Zusatzpakete
try(detach(package:dplyr))

##### 3.4.9 Organisationsform komplexer Datensätze ändern ----
library(dplyr)
library(tidyr)

Nj    <- 2
P     <- 2
Q     <- 3
id    <- 1:(P*Nj)
DV_t1 <- round(rnorm(P*Nj, -1, 1), 2)
DV_t2 <- round(rnorm(P*Nj,  0, 1), 2)
DV_t3 <- round(rnorm(P*Nj,  1, 1), 2)
IVbtw <- factor(rep(c("A", "B"), Nj))
(datW <- data.frame(id, IVbtw, DV_t1, DV_t2, DV_t3))

datL <- datW |>
    pivot_longer(cols=starts_with("DV_"),
                 names_to="time", values_to="DV",
                 names_prefix="DV_")

datL

datL |>
    pivot_wider(id_cols=c(id, IVbtw),
                names_from=time, values_from=DV,
                names_prefix="DV_")

datL |>
    mutate(DVsq=DV^2) |>
    pivot_wider(id_cols=c(id, IVbtw),
                names_from=time, values_from=c(DV, DVsq))

## entferne die (ggf. automatisch) geladenen Zusatzpakete
try(detach(package:dplyr))
try(detach(package:tidyr))

##### 3.4.10 Datensätze getrennt nach Gruppen auswerten und aggregieren ----
## für Daten des Buches diese Datei laden:
# load("data_myDf1.Rdata")
library(dplyr)

myDf1_grp <- myDf1 |>
    group_by(sex) |>
    mutate(IQ_rank=rank(IQ)) |>
    arrange(sex, IQ)

myDf1_grp

myDf1_grp |>
    ungroup()

myDf1_grp |>
    slice_head(n=2)

myDf1_grp |>
    head(n=2)

myDf1 |>
    group_by(group) |>
    summarise(age_M=mean(age),
              age_SD=sd(age),
              IQ_M=mean(IQ),
              IQ_SD=sd(IQ),
              n=n())

df_aggr <- myDf1 |>
    group_by(sex, group) |>
    summarise(rating_M=mean(rating))

df_aggr

df_aggr |>
    summarise(n=n())

myDf1 |>
    group_by(sex) |>
    summarise(n=n())

myDf1 |>
    reframe(Qs_age=quantile(age, probs=c(0.25, 0.5, 0.75)),
            Qs_IQ =quantile(IQ,  probs=c(0.25, 0.5, 0.75))) |>
    mutate(Q=sprintf("Q%.2f", c(0.25, 0.5, 0.75)))

## entferne die (ggf. automatisch) geladenen Zusatzpakete
try(detach(package:dplyr))

##### 3.4.11 Funktionen auf Gruppen von Variablen anwenden ----
##### 3.4.11.1 Variablen umbenennen ----
## für Daten des Buches diese Datei laden:
# load("data_myDf1.Rdata")
library(dplyr)

myDf1 |>
    rename_with(tolower, matches("^i.+"))

myDf1 |>
    rename_with(~paste0("fac_", .), where(is.character))

myDf1 |>
    rename_with(toupper)

##### 3.4.11.2 Variablen auswählen ----
##### 3.4.11.3 Beobachtungen auswählen ----
getNA <- function(x, prob=c(0.7, 0.3)) {
    NAval <- x[length(x) + 1]
    if_else(sample(c(TRUE, FALSE), length(x), replace=TRUE, prob=prob), x, NAval)
}

myDf1NA <- myDf1 |>
    mutate(group=getNA(group),
           age=getNA(age),
           IQ=getNA(IQ),
           rating=getNA(rating))

## für Daten des Buches diese Datei laden:
# load("data_myDf1.Rdata")
myDf1NA

myDf1NA |>
    filter(if_any(group:IQ, is.na))

myDf1NA |>
    filter(if_all(group:IQ, is.na))

myDf1NA |>
    filter(if_all(where(is.numeric), ~(. %% 2) == 1L))

##### 3.4.11.4 Variablen transformieren ----
myDf1 |>
    mutate(across(where(is.numeric),
                  ~scale(., center=TRUE, scale=FALSE)))

myDf1 |>
    mutate(across(age:IQ, list(ctr=~scale(., center=TRUE, scale=FALSE),
                               scl=~scale(., center=FALSE, scale=TRUE))))

myDf1 |>
    mutate(across(everything(), getNA))

##### 3.4.11.5 Daten getrennt nach Gruppen auswerten ----
myDf1 |>
    group_by(across(starts_with("s"))) |>
    summarise(M_IQ=mean(IQ))

myDf1 |>
    group_by(across(where(is.character))) |>
    summarise(M_IQ=mean(IQ))

##### 3.4.11.6 Daten aggregieren ----
vars_sel <- c("sex", "rating")
myDf1 |>
    group_by(group) |>
    summarise(across(all_of(vars_sel), n_distinct))

myDf1NA |>
    group_by(group) |>
    summarise(across(where(is.numeric), ~mean(., na.rm=TRUE)))

myDf1NA |>
    group_by(sex) |>
    summarise(across(where(is.numeric),
                     list(median=~median(., na.rm=TRUE))))

## entferne die (ggf. automatisch) geladenen Zusatzpakete
try(detach(package:dplyr))

##### 3.4.12 Häufigkeiten bestimmen ----
## für Daten des Buches diese Datei laden:
# load("data_myDf1.Rdata")
library(dplyr)

myDf1 |>
    count(sex, group)

as.data.frame(xtabs(~ sex + group, data=myDf1))

myDf1 |>
    count(sex, group, .drop=FALSE)

myDf1 |>
    add_count(sex, group)

myDf1 |>
    count(sex, group) |>
    mutate(freq_rel=n / sum(n))

myDf1 |>
    add_count(sex, group) |>
    mutate(freq_rel=n / n())

myDf1 |>
    count(sex, group, name="n_sex_group") |>
    group_by(sex) |>
    mutate(n_sex=sum(n_sex_group),
           freq_cond_rel=n_sex_group / n_sex) |>
    ungroup()

proportions(xtabs(~ sex + group, data=myDf1), margin="sex")

myDf1 |>
    add_count(sex, name="n_sex") |>
    add_count(sex, group, name="n_sex_group") |>
    mutate(freq_cond_rel=n_sex_group / n_sex) |>
    select(id, sex, group, n_sex, n_sex_group, freq_cond_rel)

## entferne die (ggf. automatisch) geladenen Zusatzpakete
try(detach(package:dplyr))

####*--------------------------------------------------------------------------*
### 4 Zuverlässige und reproduzierbare Datenauswertung ----
####*--------------------------------------------------------------------------*

####*--------------------------------------------------------------------------*
#### 4.1 Befehlssequenzen im Editor bearbeiten ----

####*--------------------------------------------------------------------------*
#### 4.2 Dokumente erstellen mit Quarto ----
##### 4.2.1 Grundprinzip ----
## s. Dokument r_gddmr_basic.qmd

##### 4.2.2 Aufbau eines Quelldokuments ----
## s. Dokument r_gddmr_basic.qmd

##### 4.2.3 Arbeitsschritte ----
##### 4.2.4 Auf Elemente verweisen ----
## s. Dokument r_gddmr_verweise.qmd

##### 4.2.5 Fortgeschrittene Möglichkeiten ----
## s. Dokument r_gddmr_extended.qmd

##### 4.2.6 Besondere Ausgabeformate und Erweiterungen ----
##### 4.2.7 R-Markdown ----
## s. Dokument rmarkdown.Rmd

####*--------------------------------------------------------------------------*
#### 4.3 Datenqualität prüfen ----

####*--------------------------------------------------------------------------*
#### 4.4 Reproduzierbare Auswertungen sicherstellen ----
##### 4.4.1 Potentielle Probleme und Maßnahmen ----
## op <- options(conflicts.policy="strict")
## options(op)

##### 4.4.2 Allgemeine Empfehlungen ----

getOption("na.action")
getOption("contrasts")
getOption("stringsAsFactors")
RNGkind()

####*--------------------------------------------------------------------------*
### 5 Hilfsmittel für die Inferenzstatistik ----
####*--------------------------------------------------------------------------*

####*--------------------------------------------------------------------------*
#### 5.1 Wichtige Begriffe inferenzstatistischer Tests ----

####*--------------------------------------------------------------------------*
#### 5.2 Lineare Modelle formulieren ----
Y ~ X1
Y ~ X1 + X2 - 1
Y ~ F1
Y ~ F1 + F2 + F1:F2
Y ~ X1 + F1
Y ~ X1*X2
Y ~ (X1 + X2 + X3)^2

terms(Y ~ X1*X2*X3)
terms(Y ~ (X1 + X2 + X3)^2 - X1 - X2:X3)

log(Y) ~ abs(X)
Y ~ I(X*2)

####*--------------------------------------------------------------------------*
#### 5.3 Funktionen von Zufallsvariablen ----
##### 5.3.1 Dichtefunktion ----
dbinom(7, size=10, prob=0.5)
choose(10, 7) * 0.5^7 * (1-0.5)^(10-7)

##### 5.3.2 Verteilungsfunktion ----
pbinom(7, size=10, prob=0.5)
sum(dbinom(0:7, size=10, prob=0.5))
pnorm(c(-Inf, 0, Inf), mean=0, sd=1)
  pnorm(1.645, mean=0, sd=1, lower.tail=FALSE)
1-pnorm(1.645, mean=0, sd=1, lower.tail=TRUE)
m <- 100
s <- 15
pnorm(m+s, mean=m, sd=s) - pnorm(m-s, mean=m, sd=s)
diff(pnorm(c(m-s, m+s), mean=m, sd=s))

##### 5.3.3 Quantilfunktion ----
qnorm(pnorm(0))
qnorm(1-(0.05/2), 0, 1)
qnorm(0.05/2, 0, 1, lower.tail=FALSE)
qt(0.01, 18, 0, lower.tail=FALSE)

####*--------------------------------------------------------------------------*
### 6 Lineare Regression ----
####*--------------------------------------------------------------------------*

####*--------------------------------------------------------------------------*
#### 6.1 Test des Korrelationskoeffizienten ----
N   <- 20
DV1 <- rnorm(N, 100, 15)
DV2 <- DV1 + rnorm(N, 0, 50)
## für Daten des Buches diese Datei laden:
# load("data_corTest.Rdata")
cor.test(~ DV1 + DV2)

r     <- cor(DV1, DV2)
(tVal <- sqrt(N-2) * r / sqrt(1-r^2))
(pVal <- 2*pt(tVal, N-2, lower.tail=FALSE))

fishZ   <- 0.5 * log((1+r)/(1-r))
fishV   <- 1 / (N-3)
Zcrit   <- qnorm(0.05/2, 0, 1, lower.tail=FALSE)
Zlo     <- fishZ - Zcrit*sqrt(fishV)
Zup     <- fishZ + Zcrit*sqrt(fishV)
(ciLoUp <- tanh(c(Zlo, Zup)))

####*--------------------------------------------------------------------------*
#### 6.2 Regressionsmodell anpassen ----
N      <- 100
height <- rnorm(N, 175, 7)
age    <- rnorm(N,  30, 8)
sport  <- abs(rnorm(N, 60, 30))
weight <- 0.5*height - 0.3*age - 0.4*sport + 10 + rnorm(N, 0, 3)
## für Daten des Buches diese Datei laden:
# load("data_regrMult.Rdata")

(fitHA <- lm(weight ~ height + age))

lm(scale(weight) ~ scale(height) + scale(age))

####*--------------------------------------------------------------------------*
#### 6.3 Koeffzienten, Residuen und Vorhersage extrahieren ----
X     <- model.matrix(fitHA)
Xplus <- solve(t(X) %*% X) %*% t(X)
(b    <- Xplus %*% weight)
H     <- X %*% Xplus
Yhat  <- H %*% weight
all.equal(fitted(fitHA), c(Yhat), check.attributes=FALSE)

S <- cov(cbind(height, age))
(1/sd(weight)) * diag(sqrt(diag(S))) %*% b[-1]

####*--------------------------------------------------------------------------*
#### 6.4 Regression grafisch darstellen ----
# load("data_regrMult.Rdata")
fitH <- lm(weight ~ height)
plot(weight ~ height, xlab="height [cm]", ylab="weight [kg]", pch=20,
     main="Daten mit Zentroid, Regressionsgerade & Modellgerade")
abline(fitH, col="blue", lwd=2)
abline(a=10, b=0.4, col="gray", lwd=2)
points(mean(weight) ~ mean(height), col="red", pch=4, lwd=3, cex=1.5)
legend(x="topleft", legend=c("Daten", "Zentroid", "Regressionsgerade", "Modellgerade"),
       col=c("black", "red", "blue", "gray"), lwd=c(1, 3, 2, 2),
       pch=c(20, 4, NA, NA), lty=c(NA, NA, 1, 1))

library(car)
scatter3d(weight ~ height + age, fill=FALSE)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:car))
try(detach(package:carData))

####*--------------------------------------------------------------------------*
#### 6.5 Regressionsanalyse ----
# load("data_regrMult.Rdata")
(sumHA <- summary(fitHA))

## options(show.signif.stars=FALSE)

coef(sumHA)
(sdCoef <- sqrt(diag(vcov(fitHA))))
(tVals  <- coef(fitHA) / sdCoef)

P      <- 2
(dfSSE <- N - (P+1))
(SSE   <- sum(residuals(fitHA)^2))
deviance(fitHA)
MSE    <- SSE / dfSSE
sqrt(MSE)

Yhat <- fitted(fit)
(rSq <- cor(Yhat, weight)^2)
1 - ((N-1)/(N-P-1)) * (1-rSq)

SScrit   <- sum((weight-mean(weight))^2)
dfSScrit <- N-1
1 - (MSE / (SScrit / dfSScrit))

MSpred <- sum((Yhat - mean(Yhat))^2) / P
(Fval  <- MSpred / MSE)
(pVal  <- pf(Fval, P, dfSSE, lower.tail=FALSE))

confint(fitHA)

extractAIC(fitHA)
extractAIC(fitHA, k=log(N))

N * log(SSE / N) + 2*(2+1)

AIC(fitHA)
N * (log(2*pi) + log(SSE / N) + 1) + 2*(2+1+1)

####*--------------------------------------------------------------------------*
#### 6.6 Regressionsmodell verändern ----
## für Daten des Buches diese Datei laden:
# load("data_regrMult.Rdata")
(fitHAS <- update(fitHA,  . ~ . + sport))
(fitAS  <- update(fitHAS, . ~ . - height))
(fitH   <- update(fitHA,  . ~ . - age))

####*--------------------------------------------------------------------------*
#### 6.7 Modelle vergleichen und auswählen ----
## für Daten des Buches diese Datei laden:
# load("data_regrMult.Rdata")
step(fitH, scope="~ height + age + sport", direction="forward")
step(fitHAS, direction="backward")
add1(fitH, . ~ . + age + sport, test="F")
(rssH  <- sum(residuals(lm(weight ~ height        ))^2))
(rssHA <- sum(residuals(lm(weight ~ height + age  ))^2))
(rssHS <- sum(residuals(lm(weight ~ height + sport))^2))

dfEH  <-          N - (1+1)
dfEHA <- dfEHS <- N - (2+1)

MSha  <- (rssH - rssHA) / (dfEH - dfEHA)
MSEha <- rssHA / dfEHA
(Fha  <- MSha / MSEha)

MShs  <- (rssH - rssHS) / (dfEH - dfEHS)
MSEhs <- rssHS / dfEHS
(Fhs  <- MShs / MSEhs)

anova(fitH, fitHAS)

#### 6.8 Moderierte Regression ----
## für Daten des Buches diese Datei laden:
# load("data_regrMult.Rdata")
heightC <- c(scale(height, center=TRUE, scale=FALSE))
ageC    <- c(scale(age,    center=TRUE, scale=FALSE))
fitHAi  <- lm(weight ~ heightC + ageC + heightC:ageC)
coef(summary(fitHAi))

####*--------------------------------------------------------------------------*
#### 6.9 Regressionsmodelle auf andere Daten anwenden ----
## für Daten des Buches diese Datei laden:
# load("data_regrMult.Rdata")
newHeight <- c(177, 150, 192, 189, 181)
newDf     <- data.frame(height=newHeight)
predict(fitH, newDf, interval="prediction", level=0.95)

(coeffs <- coef(fitH))
coeffs[2]*newHeight + coeffs[1]

plot(weight ~ height, pch=20, xlab="Prädiktor",
     ylab="Kriterium und Vorhersage",
     xaxs="i", main="Daten und Vorhersage durch Regression")

predOrg <- predict(fitH, interval="confidence", level=0.95)
hOrd    <- order(height)
polygon(c(height[hOrd],         height[rev(hOrd)]),
        c(predOrg[hOrd, "lwr"], predOrg[rev(hOrd), "upr"]),
        border=NA, col=rgb(0.7, 0.7, 0.7, 0.6))
abline(fitH, col="blue")
legend(x="bottomright", legend=c("Daten", "Vorhersage", "Vertrauensbereich"),
       pch=c(20, NA, NA), lty=c(NA, 1, 1), lwd=c(NA, 1, 8),
       col=c("black", "blue", "gray"))

####*--------------------------------------------------------------------------*
#### 6.10 Regressionsdiagnostik ----
##### 6.10.1 Extremwerte, Ausreißer und Einfluss ----
## für Daten des Buches diese Datei laden:
# load("data_regrMult.Rdata")
Xpred <- cbind(height, age, sport)
Xz    <- scale(Xpred)
boxplot(Xz, main="Verteilung standardisierte Prädiktoren")
summary(Xz)

ctrX   <- colMeans(Xpred)
sX     <- cov(Xpred)
mahaSq <- mahalanobis(Xpred, ctrX, sX)
summary(sqrt(mahaSq))

fitHAS <- lm(weight ~ height + age + sport)
h      <- hatvalues(fitHAS)
hist(h, main="Histogramm der Hebelwerte")
summary(h)

all.equal(mahaSq, (N-1)*(h-(1/N)), check.attributes=FALSE)

inflRes <- influence.measures(fitHAS)
summary(inflRes)

cooksDst <- cooks.distance(fitHAS)
plot(cooksDst, main="Cooks Distanz", type="h")

P   <- 3
E   <- residuals(fitHAS)
MSE <- sum(E^2) / (N - (P+1))
CD  <- (E^2 / (MSE * (1-h)^2)) * (h / (P+1))
all.equal(cooksDst, CD)

##### 6.10.2 Verteilungseigenschaften der Residuen ----
## für Daten des Buches diese Datei laden:
# load("data_regrMult.Rdata")
library(car)
residualPlots(fitHAS)

Estnd <- rstandard(fitHAS)
all.equal(Estnd, E / sqrt(MSE * (1-h)))

Estud <- rstudent(fitHAS)
all.equal(Estud, E / (lm.influence(fitHAS)$sigma * sqrt(1-h)))

hist(Estud, main="Histogramm studentisierte Residuen", freq=FALSE)
curve(dnorm(x, mean=0, sd=1), col="red", lwd=2, add=TRUE)

qqnorm(Estud, main="Q-Q-Plot studentisierte Residuen")
qqline(Estud, col="red", lwd=2)

shapiro.test(Estud)

plot(fitted(fitHAS), Estud, pch=20, ylab="studentisierte Residuen",
     xlab="Vorhersage", main="Spread-Level-Plot")
abline(h=0, col="red", lwd=2)

lamObj  <- powerTransform(fitHAS, family="bcPower")
(lambda <- coef(lamObj))
wTrans  <- bcPower(weight, lambda)
all.equal(wTrans, ((weight^lambda) - 1) / lambda)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:car))
try(detach(package:carData))

##### 6.10.3 Multikollinearität ----
## für Daten des Buches diese Datei laden:
# load("data_regrMult.Rdata")
(Rx <- cor(cbind(height, age, sport)))

library(car)
vif(fitHAS)

fitHeight <- lm(height ~ age    + sport)
fitAge    <- lm(age    ~ height + sport)
fitSport  <- lm(sport  ~ height + age)

1 / (1-summary(fitHeight)$r.squared)
1 / (1-summary(fitAge   )$r.squared)
1 / (1-summary(fitSport )$r.squared)

diag(solve(Rx))

lmScl <- lm(scale(weight) ~ scale(height) + scale(age) + scale(sport))
kappa(lmScl, exact=TRUE)
X        <- model.matrix(lmScl)
(eigVals <- eigen(t(X) %*% X)$values)
sqrt(eigVals / min(eigVals[eigVals >= .Machine$double.eps]))

vif(lmScl)
kappa(lm(weight ~ height + age + sport), exact=TRUE)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:car))
try(detach(package:carData))

####*--------------------------------------------------------------------------*
#### 6.11 Erweiterungen der linearen Regression ----
##### 6.11.1 Robuste Regression ----
## für Daten des Buches diese Datei laden:
# load("data_regrMult.Rdata")
library(robustbase)
fitLMR <- lmrob(weight ~ height + age + sport, setting="KS2014")
summary(fitLMR)

library(sandwich)
library(lmtest)
hcSE <- vcovHC(fitHAS, type="HC3")
coeftest(fitHAS, vcov=hcSE)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:robustbase))
try(detach(package:sandwich))
try(detach(package:lmtest))
try(detach(package:zoo))

##### 6.11.2 Penalisierte Regression ----
## für Daten des Buches diese Datei laden:
# load("data_regrMult.Rdata")
library(MASS)
lambdas  <- 10^(seq(-2, 4, length=100))
ridgeGCV <- lm.ridge(scale(weight) ~ scale(height) + scale(age) + scale(sport),
                     lambda=lambdas)
select(ridgeGCV)

plot(x=log(ridgeGCV$lambda), y=ridgeGCV$GCV, main="Ridge",
     xlab="log(lambda)", ylab="GCV")

lambda   <- ridgeGCV$lambda[ridgeGCV$GCV == min(ridgeGCV$GCV)]
ridgeSel <- lm.ridge(scale(weight) ~ scale(height) + scale(age) + scale(sport),
                     lambda=lambda)
coef(ridgeSel)

library(glmnet)
matScl  <- scale(cbind(weight, height, age, sport))
ridgeCV <- cv.glmnet(matScl[ , c("height", "age", "sport")], matScl[ , "weight"],
                     nfolds=10, alpha=0)

plot(ridgeCV$glmnet.fit, xvar="lambda", label=TRUE, lwd=2)
title("Ridge", line=3, col="blue")

ridge <- glmnet(matScl[ , c("height", "age", "sport")], matScl[ , "weight"],
                lambda=ridgeCV$lambda.min, alpha=0)
coef(ridge)
coef(ridgeCV, s=ridgeCV$lambda.min)

lassoCV <- cv.glmnet(matScl[ , c("height", "age", "sport")], matScl[ , "weight"],
                     nfolds=10, alpha=1)

plot(lassoCV$glmnet.fit, xvar="norm", label=FALSE, lwd=2)
title("LASSO", line=3, col="blue")

lasso <- glmnet(matScl[ , c("height", "age", "sport")], matScl[ , "weight"],
                lambda=lassoCV$lambda.min, alpha=1)
coef(lasso)
coef(lassoCV, s=lassoCV$lambda.min)

elNetCV <- cv.glmnet(matScl[ , c("height", "age", "sport")], matScl[ , "weight"],
                     nfolds=10, alpha=0.5)
plot(elNetCV$glmnet.fit, xvar="dev", label=FALSE, lwd=2)
title("Elastic Net", line=3, col="blue")
legend(x="bottomleft", legend=c("height", "age", "sport"), lwd=2,
       col=c("black", "red", "green"), bg="white")
elNet <- glmnet(matScl[ , c("height", "age", "sport")], matScl[ , "weight"],
                lambda=elNetCV$lambda.min, alpha=0.5)
coef(elNet)
coef(elNetCV, s=elNetCV$lambda.min)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:MASS))
try(detach(package:glmnet))
try(detach(package:Matrix))

##### 6.11.3 Nichtlineare Zusammenhänge ----
##### 6.11.4 Abhängige Fehler bei Messwiederholung oder Clusterung ----
##### 6.11.5 Beta-Regression für natürliche Anteile ----
##### 6.11.6 Regressionsmodelle für mehrere Verteilungsparameter ----

####*--------------------------------------------------------------------------*
#### 6.12 Partialkorrelation und Semipartialkorrelation ----
## für Daten des Buches diese Datei laden:
# load("data_regrMult.Rdata")
weight.S <- residuals(lm(weight ~ sport))
height.S <- residuals(lm(height ~ sport))
cor(weight.S, height.S)
(cor(weight, height) - (cor(weight, sport) * cor(height, sport))) /
     sqrt((1-cor(weight, sport)^2) * (1-cor(height, sport)^2))

R        <- cor(cbind(weight, height, sport))
Rinv     <- solve(R)
DsqrtInv <- diag(1/sqrt(diag(Rinv)))
Rpart    <- -(DsqrtInv %*% Rinv %*% DsqrtInv)
Rpart[upper.tri(Rpart)]

Dinv <- diag(1/diag(Rinv))
bz   <- -(Dinv %*% Rinv)
bz[upper.tri(bz)]
fit  <- lm(scale(weight) ~ scale(height) + scale(sport))
zapsmall(coef(fit))

cor(weight, height.S)
(cor(weight, height) - (cor(weight, sport) * cor(height, sport))) /
     sqrt(1-cor(height, sport)^2)

weight.AS   <- residuals(lm(weight ~ age + sport))
height.AS   <- residuals(lm(height ~ age + sport))
(pcorWH.AS  <- cor(weight.AS, height.AS))
(spcorWH.AS <- cor(weight, height.AS))

spcorWH.AS^2
fitAS  <- lm(weight ~ age + sport)
fitHAS <- lm(weight ~ height + age + sport)
summary(fitHAS)$r.squared - summary(fitAS)$r.squared

fitHAS
cov(weight, height.AS) / var(height.AS)

####*--------------------------------------------------------------------------*
### 7 Parametrische Tests für Dispersions- und Lageparameter von Verteilungen ----
####*--------------------------------------------------------------------------*

####*--------------------------------------------------------------------------*
#### 7.1 Tests auf Normalverteilung ----

library(nortest)
DV <- rnorm(100, mean=100, sd=15)^2
lillie.test(DV)
ad.test(DV)
shapiro.test(DV)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:nortest))

####*--------------------------------------------------------------------------*
#### 7.2 Tests auf Varianzhomogenität ----
##### 7.2.1 F-Test auf Varianzhomogenität für zwei Stichproben ----
n1    <- 110
n2    <- 90
DV1   <- rnorm(n1, mean=100, sd=15)
DV2   <- rnorm(n2, mean=100, sd=13)
varDf <- stack(list(grp1=DV1, grp2=DV2))

## für Daten des Buches diese Datei laden:
# load("data_varTest.Rdata")
var.test(values ~ ind, data=varDf)

var1   <- var(DV1)
var2   <- var(DV2)
Fval   <- var1/var2
(pVal  <- 2*pf(Fval, n1-1, n2-1, lower.tail=FALSE))

Fcrit <- qf(c(0.025, 0.975), n1-1, n2-1, lower.tail=FALSE)
(ci   <- var1 / (Fcrit*var2))

##### 7.2.2 Levene-Test für mehr als zwei Stichproben ----
Nj    <- c(22, 18, 20)
N     <- sum(Nj)
P     <- length(Nj)
DV    <- sample(0:100, N, replace=TRUE)
IV    <- factor(sample(rep(1:P, Nj), N, replace=FALSE))
levDf <- data.frame(IV=IV, DV=DV)
## für Daten des Buches diese Datei laden:
# load("data_levene.Rdata")
library(car)
leveneTest(DV ~ IV, data=levDf)

absDiff <- abs(DV - ave(DV, IV, FUN=median))
anova(lm(absDiff ~ IV))

absErr <- abs(DV - ave(DV, IV, FUN=mean))
anova(lm(absErr ~ IV))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:car))
try(detach(package:carData))

##### 7.2.3 Fligner-Killeen-Test für mehr als zwei Stichproben ----
## für Daten des Buches diese Datei laden:
# load("data_levene.Rdata")
fligner.test(DV ~ IV, data=levDf)

absDiff <- abs(DV - ave(DV, IV, FUN=median))
quants  <- qnorm((0.5 + rank(absDiff) / (2*(N+1))), 0, 1)
MQj     <- tapply(quants, IV, FUN=mean)
(FK     <- sum(Nj * ((MQj - mean(quants))^2)) / var(quants))
(pVal   <- pchisq(FK, P-1, lower.tail=FALSE))

####*--------------------------------------------------------------------------*
#### 7.3 t-Tests ----
##### 7.3.1 t-Test für eine Stichprobe ----
N    <- 100
DV   <- rnorm(N, 5, 20)
## für Daten des Buches diese Datei laden:
# load("data_tOne.Rdata")
muH0 <- 0
t.test(DV ~ 1, alternative="two.sided", mu=muH0)

M     <- mean(DV)
s     <- sd(DV)
(tVal <- (M-muH0) / (s/sqrt(N)))
(pVal <- 2*pt(tVal, N-1, lower.tail=FALSE))
tCrit <- qt(c(0.025, 0.975), N-1, lower.tail=FALSE)
(ci   <- M - tCrit * s/sqrt(N))

library(effectsize)
cohens_d(DV)

(d <- (mean(DV) - muH0) / sd(DV))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:effectsize))

##### 7.3.2 t-Test für zwei unabhängige Stichproben ----
##### 7.3.2.1 Test mit Annahme von Varianzhomogenität und Schätzung der Effektstärke ----
n1  <- 18
n2  <- 21
DVm <- rnorm(n1, 180, 10)
DVf <- rnorm(n2, 175, 6)
tDf <- stack(list(m=DVm, f=DVf))

## für Daten des Buches diese Datei laden:
# load("data_tTwoInd.Rdata")
t.test(values ~ ind, alternative="greater", var.equal=TRUE, data=tDf)

sdPool     <- sqrt(((n1-1)*var(DVm) + (n2-1)*var(DVf)) / (n1+n2-2))
estSigDiff <- sqrt((n1+n2)/(n1*n2)) * sdPool
(tVal      <- (mean(DVm)-mean(DVf)) / estSigDiff )
(pVal      <- pt(tVal, n1+n2-2, lower.tail=FALSE))
tCrit      <- qt(0.05, n1+n2-2, lower.tail=FALSE)
(ciLo      <- mean(DVm)-mean(DVf) - tCrit*estSigDiff)

library(effectsize)
cohens_d(DVm, DVf)
(d <- (mean(DVm) - mean(DVf)) / sdPool)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:effectsize))

##### 7.3.2.2 Test ohne Annahme von Varianzhomogenität ----
## für Daten des Buches diese Datei laden:
# load("data_tTwoInd.Rdata")
t.test(values ~ ind, alternative="greater", var.equal=FALSE, data=tDf)

varM     <- var(DVm)
varF     <- var(DVf)
num      <- (varM/n1 + varF/n2)^2
denom    <- varM^2/((n1-1)*n1^2) + varF^2/((n2-1)*n2^2)
(dfWelch <- num/denom)
(tValW   <- (mean(DVm)-mean(DVf)) / sqrt(varM/n1 + varF/n2))
(pValW   <- pt(tValW, dfWelch, lower.tail=FALSE))

##### 7.3.3 t-Test für zwei abhängige Stichproben ----
N      <- 20
DVpre  <- rnorm(N, mean=90,  sd=15)
DVpost <- rnorm(N, mean=100, sd=15)
## für Daten des Buches diese Datei laden:
# load("data_tTwoDep.Rdata")
DVdat <- data.frame(DVpre=DVpre, DVpost=DVpost)
t.test(Pair(DVpre, DVpost) ~ 1, alternative="less", data=DVdat)

DVdiff <- DVpre-DVpost
t.test(DVdiff, alternative="less")

library(effectsize)
cohens_d(DVdiff)
(d <- mean(DVdiff) / sd(DVdiff))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:effectsize))

####*--------------------------------------------------------------------------*
#### 7.4 Einfaktorielle Varianzanalyse (CR-p) ----
##### 7.4.1 Auswertung mit oneway.test() ----
P     <- 4
Nj    <- c(41, 37, 42, 40)
IVeff <- c(0, 0.3, 0.6, 1)
IV    <- factor(rep(LETTERS[1:P], Nj))
DV    <- IVeff[unclass(IV)] + rnorm(sum(Nj), 0, 1)
dfCRp <- data.frame(IV, DV)
## für Daten des Buches diese Datei laden:
# load("data_crp.Rdata")
oneway.test(DV ~ IV, data=dfCRp, var.equal=TRUE)

N     <- sum(Nj)
Vj    <- tapply(DV, IV, FUN=var)
Mj    <- tapply(DV, IV, FUN=mean)
M     <- sum((Nj/N) * Mj)
SSw   <- sum((Nj-1) * Vj)
# SSw <- sum((DV - ave(DV, IV, FUN=mean))^2)
SSb   <- sum(Nj * (Mj-M)^2)
MSw   <- SSw / (N-P)
MSb   <- SSb / (P-1)
(Fval <- MSb / MSw)
(pVal <- pf(Fval, P-1, N-P, lower.tail=FALSE))

oneway.test(DV ~ IV, data=dfCRp, var.equal=FALSE)

##### 7.4.2 Auswertung mit aov() ----
## für Daten des Buches diese Datei laden:
# load("data_crp.Rdata")
aovCRp <- aov(DV ~ IV, data=dfCRp)
summary(aovCRp)
model.tables(aovCRp, type="means")

library(ggplot2)
library(ggbeeswarm)

ggplot(dfCRp, aes(x=IV, y=DV, fill=IV)) +
    geom_boxplot(width=0.5, outlier.shape=NULL) +
    geom_beeswarm(size=2, alpha=0.5, cex=2) +
    labs(title="Daten je Gruppe") +
    theme(legend.position="none")

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:ggbeeswarm))
try(detach(package:ggplot2))

##### 7.4.3 Auswertung mit anova() ----
## für Daten des Buches diese Datei laden:
# load("data_crp.Rdata")
(anovaCRp <- anova(lm(DV ~ IV, data=dfCRp)))
anovaCRp["Residuals", "Sum Sq"]

fitR <- lm(DV ~ 1, data=dfCRp)  # eingeschränktes Modell
fitU <- lm(DV ~ IV, data=dfCRp) # umfassendes Modell
anova(fitR, fitU)

##### 7.4.4 Effektstärke schätzen ----
## für Daten des Buches diese Datei laden:
# load("data_crp.Rdata")
library(effectsize)
eta_squared(aovCRp, partial=FALSE)
omega_squared(aovCRp)

dfSSb <- anovaCRp["IV",        "Df"]
SSb   <- anovaCRp["IV",        "Sum Sq"]
MSb   <- anovaCRp["IV",        "Mean Sq"]
SSw   <- anovaCRp["Residuals", "Sum Sq"]
MSw   <- anovaCRp["Residuals", "Mean Sq"]

(etaSq   <- SSb / (SSb + SSw))
(omegaSq <- dfSSb * (MSb-MSw) / (SSb + SSw + MSw))
(f       <- sqrt(etaSq / (1-etaSq)))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:effectsize))

##### 7.4.5 Voraussetzungen grafisch prüfen ----
## für Daten des Buches diese Datei laden:
# load("data_crp.Rdata")
Estnd <- rstandard(aovCRp)

plot(Estnd ~ fitted(aovCRp), pch=20,
     xlab="Vorhersage", ylab="standardisierte Residuen",
     main="standardis. Residuen vs. Vorhersage", cex=2)
abline(h=0, col="gray60", lwd=2)
plot(Estnd ~ aovCRp$model$IV, main="Residuen vs. Stufen")
qqnorm(Estnd, pch=20, cex=2)
qqline(Estnd, col="gray60", lwd=2)

##### 7.4.6 Einzelvergleiche (Kontraste) ----
##### 7.4.6.1 Beliebige a-priori Kontraste ----
## für Daten des Buches diese Datei laden:
# load("data_crp.Rdata")
library(multcomp)
cntrMat <- rbind("(A+B)-(C+D)"=c(1/2, 1/2, -1/2, -1/2))
glhtRes <- glht(aovCRp, linfct=mcp(IV=cntrMat), alternative="less")
summary(glhtRes)

P       <- nlevels(dfCRp$IV)
Mj      <- tapply(dfCRp$DV, dfCRp$IV, FUN=mean)
Nj      <- table(dfCRp$IV)
dfSSw   <- sum(Nj)-P
SSw     <- sum((DV - ave(DV, IV, FUN=mean))^2)
MSw     <- SSw / dfSSw
(psiHat <- sum(cntrMat[1, ]   * Mj))
lenSq   <- sum(cntrMat[1, ]^2 / Nj)
(tStat  <- psiHat / sqrt(lenSq*MSw))
(pVal   <- pt(abs(tStat), dfSSw, lower.tail=FALSE))

confint(glhtRes)
(tCrit  <- qt(0.05, dfSSw, lower.tail=FALSE))
(ciUp   <- psiHat + tCrit*sqrt(lenSq*MSw))

cntrMat <- rbind("A-D"          =c(  1,   0,   0,  -1),
                 "1/3*(A+B+C)-D"=c(1/3, 1/3, 1/3,  -1),
                 "B-C"          =c(  0,   1,  -1,   0))
glhtRes <- glht(aovCRp, linfct=mcp(IV=cntrMat), alternative="less")
plot(glhtRes)
(sumRes <- summary(glhtRes, test=adjusted("none")))

psiHats <- cntrMat   %*% Mj
lenSqs  <- cntrMat^2 %*% (1/Nj)
tStats  <- psiHats / sqrt(lenSqs*MSw)
pVals   <- pt(abs(tStats), dfSSw, lower.tail=FALSE)
data.frame(psiHats, tStats, pVals)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:multcomp))
try(detach(package:mvtnorm))
try(detach(package:TH.data))
try(detach(package:survival))
try(detach(package:MASS))

##### 7.4.6.2 Beliebige post-hoc Kontraste nach Scheffé ----
## für Daten des Buches diese Datei laden:
# load("data_crp.Rdata")
library(DescTools)
ScheffeTest(aovCRp, which="IV", contrasts=t(cntrMat))

dfSSb  <- P-1
(Fstat <- sumRes$test$tstat^2)
(Fcrit <- dfSSb*qf(0.05, df1=dfSSb, df2=dfSSw, lower.tail=FALSE))
(pVal  <- pf(Fstat/dfSSb, dfSSb, dfSSw, lower.tail=FALSE))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:DescTools))

##### 7.4.6.3 Paarvergleiche mit t-Tests und alpha-Adjustierung ----
## für Daten des Buches diese Datei laden:
# load("data_crp.Rdata")
DV <- dfCRp$DV
IV <- dfCRp$IV
pairwise.t.test(DV, IV, p.adjust.method="bonferroni")

##### 7.4.6.4 Simultane Konfidenzintervalle nach Tukey ----
## für Daten des Buches diese Datei laden:
# load("data_crp.Rdata")
(tHSD <- TukeyHSD(aovCRp))

library(multcomp)
tukey <- glht(aovCRp, linfct=mcp(IV="Tukey"))
summary(tukey)
confint(tukey)

Mj <- tapply(dfCRp$DV, dfCRp$IV, FUN=mean)
Nj <- table(dfCRp$IV)

diffMat <- outer(Mj, Mj, FUN="-")
(diffs  <- diffMat[lower.tri(diffMat)])
NjMat   <- sqrt(outer(1/Nj, 1/Nj, FUN="+"))
NjFac   <- NjMat[lower.tri(NjMat)]
qTs     <- abs(diffs) / (sqrt(MSw/2) * NjFac)
(pVals  <- ptukey(qTs,  P, dfSSw, lower.tail=FALSE))
tWidth  <- qtukey(0.05, P, dfSSw, lower.tail=FALSE) * sqrt(MSw/2) * NjFac
diffs - tWidth
diffs + tWidth

plot(tHSD)

try(detach(package:multcomp))
try(detach(package:mvtnorm))
try(detach(package:TH.data))
try(detach(package:survival))
try(detach(package:MASS))

####*--------------------------------------------------------------------------*
#### 7.5 Einfaktorielle Varianzanalyse mit abhängigen Gruppen (RB-p) ----
##### 7.5.1 Univariat formuliert auswerten und Effektstärke schätzen ----
N      <- 10
P      <- 4
id     <- factor(rep(1:N, times=P))
IV     <- factor(rep(1:P,  each=N))
DV_t1  <- round(rnorm(N, -0.3, 1), 2)
DV_t2  <- round(rnorm(N, -0.2, 1), 2)
DV_t3  <- round(rnorm(N,  0.1, 1), 2)
DV_t4  <- round(rnorm(N,  0.4, 1), 2)
DV     <- c(DV_t1, DV_t2, DV_t3, DV_t4)
dfRBpL <- data.frame(id, IV, DV)
## für Daten des Buches diese Datei laden:
# load("data_rbp.Rdata")
aovRBp <- aov(DV ~ IV + Error(id/IV), data=dfRBpL)
summary(aovRBp)

anova(lm(DV ~ id + IV + id:IV, data=dfRBpL))

##### 7.5.1.1 Manuelle Kontrolle ----
## für Daten des Buches diese Datei laden:
# load("data_rbp.Rdata")
MiL   <- ave(DV, id, FUN=mean)
MjL   <- ave(DV, IV, FUN=mean)
DVctr <- DV - MiL
MjCtr <- tapply(DVctr, IV, FUN=mean)
SSb   <- sum(N * MjCtr^2)
M     <- mean(DV)
IDxIV <- DV - MiL - MjL + M
# IDxIV <- DVctr - ave(DVctr, IV, FUN=mean)
SSE   <- sum(IDxIV^2)
dfSSb <- P-1
dfSSE <- (N-1) * (P-1)

(MSb  <- SSb / dfSSb)
(MSE  <- SSE / dfSSE)
(Fval <- MSb / MSE)
(pVal <- pf(Fval, dfSSb, dfSSE, lower.tail=FALSE))

##### 7.5.1.2 Effektstärke schätzen ----
## für Daten des Buches diese Datei laden:
# load("data_rbp.Rdata")
library(effectsize)
eta_squared(aovRBp, generalized=TRUE)
eta_squared(aovRBp, partial=TRUE)

anRes   <- anova(lm(DV ~ IV*id, data=dfRBpL))
SSEid   <- anRes["id",    "Sum Sq"]
SSEIVid <- anRes["IV:id", "Sum Sq"]
SSb     <- anRes["IV",    "Sum Sq"]
SSEtot  <- SSEid + SSEIVid
(gEtaSq <- SSb / (SSb + SSEtot))
(pEtaSq <- SSb / (SSb + SSEIVid))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:effectsize))

##### 7.5.2 Zirkularität der Kovarianzmatrix prüfen ----
## für Daten des Buches diese Datei laden:
# load("data_rbp.Rdata")
DVmat  <- cbind(DV_t1, DV_t2, DV_t3, DV_t4)
N      <- nrow(DVmat)
S      <- var(DVmat)
P      <- nrow(S)
mdS    <- mean(diag(S))
mS     <- mean(S)
mSr    <- rowMeans(S)
num    <- P^2 * (mdS-mS)^2
den    <- (P-1)*(sum(S^2) - 2*P*sum(mSr^2) + P^2*mS^2)
(epsGG <- num / den)

dfId   <- N-1
(epsHF <- ((dfId+1) * (P-1) * epsGG - 2) / ((P-1) * (dfId - (P-1)*epsGG)))

epsHF   <- min(c(1, epsHF))
(pEpsGG <- pf(Fval, dfSSb * epsGG, dfSSE * epsGG, lower.tail=FALSE))
(pEpsHF <- pf(Fval, dfSSb * epsHF, dfSSE * epsHF, lower.tail=FALSE))

dfRBpL <- cbind(dfRBpL, IDxIV)
errMat <- data.matrix(unstack(dfRBpL, IDxIV ~ IV))
Serr   <- cov(errMat)
(epsGG <- (1 / (P-1)) * sum(diag(Serr))^2 / sum(Serr^2))

##### 7.5.3 Multivariat formuliert auswerten mit Anova() ----
## für Daten des Buches diese Datei laden:
# load("data_rbp.Rdata")
dfRBpW <- data.frame(DV_t1, DV_t2, DV_t3, DV_t4)
fitRBp <- lm(cbind(DV_t1, DV_t2, DV_t3, DV_t4) ~ 1, data=dfRBpW)
inRBp  <- data.frame(IV=gl(P, 1))

library(car)
AnovaRBp <- Anova(fitRBp, idata=inRBp, idesign=~IV)
summary(AnovaRBp, multivariate=FALSE, univariate=TRUE)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:car))
try(detach(package:carData))

##### 7.5.4 Multivariat formuliert auswerten mit anova() ----
## für Daten des Buches diese Datei laden:
# load("data_rbp.Rdata")
anova(fitRBp, M=~IV, X=~1, idata=inRBp, test="Spherical")
mauchly.test(fitRBp, M=~IV, X=~1, idata=inRBp)

##### 7.5.5 Einzelvergleiche und alternative Auswertungsmöglichkeiten ----

####*--------------------------------------------------------------------------*
#### 7.6 Zweifaktorielle Varianzanalyse (CRF-pq) ----
##### 7.6.1 Auswertung und Schätzung der Effektstärke ----
Njk     <- 8
P       <- 2
Q       <- 3
IV1     <- factor(rep(1:P, times=Njk*Q))
IV2     <- factor(rep(1:Q,  each=Njk*P))
IVcomb  <- interaction(IV1, IV2)
IVeff   <- c(0.5, -0.5, 0, -1, 1, 0)
DV      <- IVeff[unclass(IVcomb)] + rnorm(Njk*P*Q, 0, 1)
dfCRFpq <- data.frame(IV1, IV2, IVcomb, DV)
## für Daten des Buches diese Datei laden:
# load("data_crfpq.Rdata")

aovCRFpq <- aov(DV ~ IV1*IV2, data=dfCRFpq)
summary(aovCRFpq)

##### 7.6.1.1 Mittelwertsdiagramme ----
library(emmeans)
library(ggplot2)
library(dplyr)

dfAggr1 <- dfCRFpq |>
    group_by(IV1) |>
    summarise(M=mean(DV)) |>
    rename(group=IV1) |>
    mutate(IV="IV1")
    
dfAggr2 <- dfCRFpq |>
    group_by(IV2) |>
    summarise(M=mean(DV)) |>
    rename(group=IV2) |>
    mutate(IV="IV2")

dfAggr <- bind_rows(dfAggr1, dfAggr2)

ggplot(dfAggr, aes(x=group, y=M, color=IV)) +
    geom_point(size=3) +
    facet_wrap(~ IV, scales="free_x") +
    labs(title="Randmittelwerte") +
    theme(legend.position="none")

emmip(aovCRFpq, IV2 ~ IV1, style="factor") +
    labs(title="Mittelwertsverläufe")

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:emmeans))
try(detach(package:ggplot2))
try(detach(package:dplyr))

##### 7.6.1.2 Effektstärke schätzen ----
## für Daten des Buches diese Datei laden:
# load("data_crfpq.Rdata")
library(effectsize)
eta_squared(aovCRFpq, partial=FALSE)
eta_squared(aovCRFpq, partial=TRUE)

anRes <- anova(lm(DV ~ IV1*IV2, data=dfCRFpq))
SS1   <- anRes["IV1",       "Sum Sq"]
SS2   <- anRes["IV2",       "Sum Sq"]
SSI   <- anRes["IV1:IV2",   "Sum Sq"]
SSE   <- anRes["Residuals", "Sum Sq"]

(etaSq1 <- SS1 / (SS1 + SS2 + SSI + SSE))
(etaSq2 <- SS2 / (SS1 + SS2 + SSI + SSE))
(etaSqI <- SSI / (SS1 + SS2 + SSI + SSE))

(pEtaSq1 <- SS1 / (SS1 + SSE))
(pEtaSq2 <- SS2 / (SS2 + SSE))
(pEtaSqI <- SSI / (SSI + SSE))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:effectsize))

##### 7.6.2 Quadratsummen vom Typ I, II, III ----
mdP   <- 3
mdQ   <- 3
g11   <- c(41, 43, 50)
g12   <- c(51, 43, 53, 54, 46)
g13   <- c(45, 55, 56, 60, 58, 62, 62)
g21   <- c(56, 47, 45, 46, 49)
g22   <- c(58, 54, 49, 61, 52, 62)
g23   <- c(59, 55, 68, 63)
g31   <- c(43, 56, 48, 46, 47)
g32   <- c(59, 46, 58, 54)
g33   <- c(55, 69, 63, 56, 62, 67)
mdDV  <- c(g11, g12, g13, g21, g22, g23, g31, g32, g33)
mdIV1 <- factor(rep(1:mdP, c(3+5+7, 5+6+4, 5+4+6)))
mdIV2 <- factor(rep(rep(1:mdQ, mdP), c(3,5,7, 5,6,4, 5,4,6)))
dfMD  <- data.frame(IV1=mdIV1, IV2=mdIV2, DV=mdDV)

anova(lm(DV ~ IV1 + IV2 + IV1:IV2, data=dfMD))

SSI1 <- anova(lm(DV ~ 1,                   data=dfMD),
              lm(DV ~ IV1,                 data=dfMD))
SSI2 <- anova(lm(DV ~ IV1,                 data=dfMD),
              lm(DV ~ IV1 + IV2,           data=dfMD))
SSIi <- anova(lm(DV ~ IV1 + IV2,           data=dfMD),
              lm(DV ~ IV1 + IV2 + IV1:IV2, data=dfMD))

SSI1[2, "Sum of Sq"]
SSI2[2, "Sum of Sq"]
SSIi[2, "Sum of Sq"]

SSIt <- anova(lm(DV ~ 1,                   data=dfMD),
              lm(DV ~ IV1 + IV2 + IV1:IV2, data=dfMD))
SSIt[2, "Sum of Sq"]
SSI1[2, "Sum of Sq"] + SSI2[2, "Sum of Sq"] + SSIi[2, "Sum of Sq"]

fitIII <- lm(DV ~ IV1 + IV2 + IV1:IV2, data=dfMD,
             contrasts=list(IV1=contr.sum, IV2=contr.sum))
drop1(fitIII, ~ ., test="F")

library(car)
Anova(fitIII, type="III")

(MjkMD  <- tapply(mdDV, list(mdIV1, mdIV2), FUN=mean))
(NjkMD  <- table(mdIV1, mdIV2))
Mj      <- rowMeans(MjkMD)
Mk      <- colMeans(MjkMD)
effNj   <- 1 / rowMeans(1/NjkMD)
effNk   <- 1 / colMeans(1/NjkMD)
gM1     <- sum(effNj*Mj) / sum(effNj)
gM2     <- sum(effNk*Mk) / sum(effNk)
(SSIII1 <- mdP * sum(effNj * (Mj-gM1)^2))
(SSIII2 <- mdQ * sum(effNk * (Mk-gM2)^2))
(SSE    <- sum((mdDV - ave(mdDV, mdIV1, mdIV2, FUN=mean))^2))

dfSS1  <- mdP-1
dfSS2  <- mdQ-1
dfSSE  <- sum(NjkMD-1)
(Fval1 <- (SSIII1/dfSS1) / (SSE/dfSSE))
(Fval2 <- (SSIII2/dfSS2) / (SSE/dfSSE))
(pVal1 <- pf(Fval1, dfSS1, dfSSE, lower.tail=FALSE))
(pVal2 <- pf(Fval2, dfSS2, dfSSE, lower.tail=FALSE))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:car))
try(detach(package:carData))

##### 7.6.3 Bedingte Haupteffekte ----
##### 7.6.4 Beliebige a-priori Kontraste ----
## für Daten des Buches diese Datei laden:
# load("data_crfpq.Rdata")
CRFpq1 <- anova(lm(DV ~ IVcomb,  data=dfCRFpq))
CRFpq2 <- anova(lm(DV ~ IV1*IV2, data=dfCRFpq))
all.equal(CRFpq1["Residuals", "Mean Sq"],
          CRFpq2["Residuals", "Mean Sq"])

library(multcomp)
aovComb <- aov(DV ~ IVcomb, data=dfCRFpq)
cntrMat <- rbind("contr 01"=c(1/2, -1/4, 1/2, -1/4, -1/4, -1/4))
cntrEst <- glht(aovComb, linfct=mcp(IVcomb=cntrMat), alternative="greater")
summary(cntrEst)
confint(cntrEst)

DV      <- dfCRFpq$DV
IVcomb  <- dfCRFpq$IVcomb
Mjk     <- tapply(DV, IVcomb, FUN=mean)
dfSSE   <- (Njk-1)*P*Q
SSE     <- sum((DV - ave(DV, IVcomb, FUN=mean))^2)
MSE     <- SSE / dfSSE
(psiHat <- sum(cntrMat[1, ]   * Mjk))
lenSq   <- sum(cntrMat[1, ]^2 / Njk)
(tStat  <- psiHat / sqrt(lenSq*MSE))

(tCrit <- qt(0.05, dfSSE, lower.tail=FALSE))
(pVal  <- pt(abs(tStat), dfSSE, lower.tail=FALSE))
(ciLo  <- psiHat - tCrit*sqrt(lenSq*MSE))

cntrMat <- rbind("contr 01"=c( 1/2, -1/4,  1/2, -1/4, -1/4, -1/4),
                 "contr 02"=c(   0,    0,    1,    0,   -1,    0),
                 "contr 03"=c(-1/2, -1/2,  1/4,  1/4,  1/4,  1/4))

(sumRes <- summary(glht(aovComb, linfct=mcp(IVcomb=cntrMat),
                        alternative="greater"),
                   test=adjusted("none")))

psiHats <- cntrMat   %*% Mjk
lenSqs  <- cntrMat^2 %*% (1/rep(Njk, ncol(cntrMat)))
tStats  <- psiHats / sqrt(lenSqs*MSE)
pVals   <- pt(abs(tStats), dfSSE, lower.tail=FALSE)
data.frame(psiHats, tStats, pVals)

meansA   <- tapply(dfCRFpq$DV, dfCRFpq$IV1, FUN=mean)
cntrVecA <- c(1, -1)
psiHatA  <- sum(cntrVecA   * meansA)
lenSqA   <- sum(cntrVecA^2 / (Q*Njk))
(statA   <- psiHatA / sqrt(lenSqA*MSE))

meansB   <- tapply(dfCRFpq$DV, dfCRFpq$IV2, FUN=mean)
cntrVecB <- c(1, -1/2, -1/2)
psiHatB  <- sum(cntrVecB   * meansB)
lenSqB   <- sum(cntrVecB^2 / (P*Njk))
(statB   <- psiHatB / sqrt(lenSqB*MSE))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:multcomp))
try(detach(package:mvtnorm))
try(detach(package:TH.data))
try(detach(package:survival))
try(detach(package:MASS))

##### 7.6.5 Beliebige post-hoc Kontraste nach Scheffé ----
## für Daten des Buches diese Datei laden:
# load("data_crfpq.Rdata")
library(DescTools)
aovComb <- aov(DV ~ IVcomb, data=dfCRFpq)
ScheffeTest(aovComb, which="IVcomb", contrasts=t(cntrMat))

(Fstat <- sumRes$test$tstat^2)
dfSSba <- P*Q - 1
(Fcrit <- dfSSba*qf(0.05,  df1=dfSSba, df2=dfSSE, lower.tail=FALSE))
(pVal  <- pf(Fstat/dfSSba, df1=dfSSba, df2=dfSSE, lower.tail=FALSE))

# B-Kontrast
ScheffeTest(aovCRFpq, which="IV2", contrasts=c(-1, 1/2, 1/2))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:DescTools))

##### 7.6.6 Marginale Paarvergleiche nach Tukey ----
## für Daten des Buches diese Datei laden:
# load("data_crfpq.Rdata")
aovCRF <- aov(DV ~ IV1 + IV2, data=dfCRFpq)
TukeyHSD(aovCRF, which="IV2")

library(multcomp)
tukey <- glht(aovCRF, linfct=mcp(IV2="Tukey"))
summary(tukey)
confint(tukey)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:multcomp))
try(detach(package:mvtnorm))
try(detach(package:TH.data))
try(detach(package:survival))
try(detach(package:MASS))

####*--------------------------------------------------------------------------*
#### 7.7 Zweifaktorielle Varianzanalyse mit zwei Intra-Gruppen Faktoren (RBF-pq) ----
##### 7.7.1 Univariat formuliert auswerten und Effektstärke schätzen ----
N        <- 10
P        <- 2
Q        <- 3
id       <- factor(rep(1:N,              times=P*Q))
IV1      <- factor(rep(rep(1:P, each=N), times=Q))
IV2      <- factor(rep(rep(1:Q,           each=N*P)))
DV_t11   <- round(rnorm(N, -0.8, 1), 2)
DV_t12   <- round(rnorm(N, -0.7, 1), 2)
DV_t13   <- round(rnorm(N,  0.0, 1), 2)
DV_t21   <- round(rnorm(N,  0.2, 1), 2)
DV_t22   <- round(rnorm(N,  0.3, 1), 2)
DV_t23   <- round(rnorm(N,  1.0, 1), 2)
DV       <- c(DV_t11, DV_t21, DV_t12, DV_t22, DV_t13, DV_t23)
dfRBFpqL <- data.frame(id, IV1, IV2, DV)
## für Daten des Buches diese Datei laden:
# load("data_rbfpq.Rdata")

aovRBFpq <- aov(DV ~ IV1*IV2 + Error(id/(IV1*IV2)), data=dfRBFpqL)
summary(aovRBFpq)

anova(lm(DV ~ IV1*IV2*id, data=dfRBFpqL))

##### 7.7.1.1 Manuelle Kontrolle ----
## für Daten des Buches diese Datei laden:
# load("data_rbfpq.Rdata")
dfG  <- dfRBFpqL
mDf1 <- aggregate(DV ~ id + IV1, data=dfG, FUN=mean)
mDf2 <- aggregate(DV ~ id + IV2, data=dfG, FUN=mean)
dDfI <- aggregate(DV ~ id + IV2, data=dfG, FUN=diff)

summary(aov(DV ~ IV1 + Error(id/IV1), data=mDf1))
summary(aov(DV ~ IV2 + Error(id/IV2), data=mDf2))
summary(aov(DV ~ IV2 + Error(id/IV2), data=dDfI))

Mjk <- tapply(dfG$DV, list(dfG$IV1, dfG$IV2), FUN=mean)
Mj  <- tapply(dfG$DV, dfG$IV1, FUN=mean)
Mk  <- tapply(dfG$DV, dfG$IV2, FUN=mean)
M   <- mean(Mjk)

IV1xIV2 <- c(sweep(sweep(Mjk, 1, Mj, "-"), 2, Mk, "-")) + M
(SSI    <- N * sum(IV1xIV2^2))

MjkL <- ave(dfG$DV,         dfG$IV1, dfG$IV2, FUN=mean)
MijL <- ave(dfG$DV, dfG$id, dfG$IV1,          FUN=mean)
MikL <- ave(dfG$DV, dfG$id,          dfG$IV2, FUN=mean)
MiL  <- ave(dfG$DV, dfG$id,                   FUN=mean)
MjL  <- ave(dfG$DV,         dfG$IV1,          FUN=mean)
MkL  <- ave(dfG$DV,                  dfG$IV2, FUN=mean)

IDxIV1xIV2 <- dfG$DV - MijL - MikL - MjkL + MiL + MjL + MkL - M
(SSE       <- sum(IDxIV1xIV2^2))

dfSSI <- (P-1) * (Q-1)
dfSSE <- (P-1) * (Q-1) * (N-1)

(FvalI <- (SSI/dfSSI) / (SSE/dfSSE))
(pValI <- pf(FvalI, dfSSI, dfSSE, lower.tail=FALSE))

##### 7.7.1.2 Effektstärke schätzen ----
## für Daten des Buches diese Datei laden:
# load("data_rbfpq.Rdata")
library(effectsize)
eta_squared(aovRBFpq, partial=FALSE)
eta_squared(aovRBFpq, partial=TRUE)
eta_squared(aovRBFpq, generalized=TRUE)

anRes <- anova(lm(DV ~ IV1*IV2*id, data=dfRBFpqL))

SSEid       <- anRes["id",         "Sum Sq"]
SSEIV1id    <- anRes["IV1:id",     "Sum Sq"]
SSEIV2id    <- anRes["IV2:id",     "Sum Sq"]
SSEIV1IV2id <- anRes["IV1:IV2:id", "Sum Sq"]

SSEtot <- SSEid + SSEIV1id + SSEIV2id + SSEIV1IV2id
SS1    <- anRes["IV1",        "Sum Sq"]
SS2    <- anRes["IV2",        "Sum Sq"]
SSI    <- anRes["IV1:IV2",    "Sum Sq"]

(etaSq1  <- SS1 / (SS1 + SS2 + SSI + SSEtot))
(etaSq2  <- SS2 / (SS1 + SS2 + SSI + SSEtot))
(etaSqI  <- SSI / (SS1 + SS2 + SSI + SSEtot))

(pEtaSq1 <- SS1 / (SS1 + SSEIV1id))
(pEtaSq2 <- SS2 / (SS2 + SSEIV2id))
(pEtaSqI <- SSI / (SSI + SSEIV1IV2id))

(gEtaSq1 <- SS1 / (SS1 + SSEtot))
(gEtaSq2 <- SS2 / (SS2 + SSEtot))
(gEtaSqI <- SSI / (SSI + SSEtot))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:effectsize))

##### 7.7.2 Zirkularität der Kovarianzmatrizen prüfen ----
## für Daten des Buches diese Datei laden:
# load("data_rbfpq.Rdata")
DVmat <- with(dfG, tapply(DV, list(id, IV2), mean))
N     <- nrow(DVmat)
S     <- var(DVmat)
Q     <- nrow(S)
mdS   <- mean(diag(S))
mS    <- mean(S)
mSr   <- rowMeans(S)
num   <- Q^2 * (mdS-mS)^2
den   <- (Q-1)*(sum(S^2) - 2*Q*sum(mSr^2) + Q^2*mS^2)

(epsGG2 <- num/den)
(epsHF2 <- (N*(Q-1)*epsGG2 - 2) / ((Q-1) * ((N-1) - ((Q-1)*epsGG2))))

dfG    <- cbind(dfG, IDxIV1xIV2)
errMat <- data.matrix(unstack(dfG, IDxIV1xIV2 ~ IV2))
Serr   <- cov(errMat)

(epsGGi <- (1 / (Q-1)) * sum(diag(Serr))^2 / sum(Serr^2))
dfId    <- N-1
(epsHFi <- ((dfId+1)*(Q-1)*epsGGi - 2) / ((Q-1) * (dfId - ((Q-1)*epsGGi))))

##### 7.7.3 Multivariat formuliert auswerten ----
## für Daten des Buches diese Datei laden:
# load("data_rbfpq.Rdata")
dfRBFpqW <- data.frame(DV_t11, DV_t21, DV_t12, DV_t22, DV_t13, DV_t23)
fitRBFpq <- lm(cbind(DV_t11, DV_t21, DV_t12, DV_t22, DV_t13, DV_t23) ~ 1,
               data=dfRBFpqW)
(inRBFpq <- expand.grid(IV1=gl(P, 1), IV2=gl(Q, 1)))

library(car)
AnovaRBFpq <- Anova(fitRBFpq, idata=inRBFpq, idesign=~IV1*IV2)
summary(AnovaRBFpq, multivariate=FALSE, univariate=TRUE)

anova(fitRBFpq, M=~IV1,                 X=~1,         idata=inRBFpq, test="Spherical")
anova(fitRBFpq, M=~IV1 + IV2,           X=~IV1,       idata=inRBFpq, test="Spherical")
anova(fitRBFpq, M=~IV1 + IV2 + IV1:IV2, X=~IV1 + IV2, idata=inRBFpq, test="Spherical")

## Mauchly-Test IV1 hier unnötig, da P=2 -> Zirkularität liegt automatisch vor
mauchly.test(fitRBFpq, M=~IV1,                 X=~1,         idata=inRBFpq)
mauchly.test(fitRBFpq, M=~IV1 + IV2,           X=~IV1,       idata=inRBFpq)
mauchly.test(fitRBFpq, M=~IV1 + IV2 + IV1:IV2, X=~IV1 + IV2, idata=inRBFpq)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:car))
try(detach(package:carData))

##### 7.7.4 Einzelvergleiche (Kontraste) ----

####*--------------------------------------------------------------------------*
#### 7.8 Zweifaktorielle Varianzanalyse mit Split-Plot-Design (SPF-p.q) ----
##### 7.8.1 Univariat formuliert auswerten und Effektstärke schätzen ----
Nj       <- 10
P        <- 3
Q        <- 3
id       <- factor(rep(1:(P*Nj),     times=Q))
IVbtw    <- factor(rep(LETTERS[1:P], times=Q*Nj))
IVwth    <- factor(rep(1:Q,           each=P*Nj))
DV_t1    <- round(rnorm(P*Nj, -0.5, 1), 2)
DV_t2    <- round(rnorm(P*Nj,  0,   1), 2)
DV_t3    <- round(rnorm(P*Nj,  0.5, 1), 2)
DV       <- c(DV_t1, DV_t2, DV_t3)
dfSPFpqL <- data.frame(id, IVbtw, IVwth, DV)
## für Daten des Buches diese Datei laden:
# load("data_spfpq.Rdata")

aovSPFpq <- aov(DV ~ IVbtw*IVwth + Error(id/IVwth), data=dfSPFpqL)
summary(aovSPFpq)

anova(lm(DV ~ IVbtw*IVwth*id, data=dfSPFpqL))

##### 7.8.1.1 Effektstärke schätzen ----
## für Daten des Buches diese Datei laden:
# load("data_spfpq.Rdata")
library(effectsize)
eta_squared(aovSPFpq, partial=FALSE)
eta_squared(aovSPFpq, partial=TRUE)
eta_squared(aovSPFpq, generalized=TRUE)

anRes    <- anova(lm(DV ~ IVbtw*IVwth*id, data=dfSPFpqL))
SSEid    <- anRes["id", "Sum Sq"]
SSEIVwid <- anRes["IVwth:id", "Sum Sq"]
SSEtot   <- SSEid + SSEIVwid
SSbtw    <- anRes["IVbtw", "Sum Sq"]
SSwth    <- anRes["IVwth", "Sum Sq"]
SSI      <- anRes["IVbtw:IVwth", "Sum Sq"]

(etaSqB  <- SSbtw / (SSbtw + SSwth + SSI + SSEtot))
(etaSqW  <- SSbtw / (SSbtw + SSwth + SSI + SSEtot))
(etaSqI  <- SSbtw / (SSbtw + SSwth + SSI + SSEtot))

(pEtaSqB <- SSbtw / (SSbtw + SSEid))
(pEtaSqW <- SSwth / (SSwth + SSEIVwid))
(pEtaSqI <- SSI   / (SSI   + SSEIVwid))

(gEtaSqB <- SSbtw / (SSbtw + SSEtot))
(gEtaSqW <- SSwth / (SSwth + SSEtot))
(gEtaSqI <- SSI   / (SSI   + SSEtot))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:effectsize))

##### 7.8.2 Voraussetzungen und Prüfen der Zirkularität ----
## für Daten des Buches diese Datei laden:
# load("data_spfpq.Rdata")
mDf <- aggregate(DV ~ id + IVbtw, data=dfSPFpqL, FUN=mean)
summary(aov(DV ~ IVbtw, data=mDf))

Mjk <- ave(dfSPFpqL$DV,              dfSPFpqL$IVbtw, dfSPFpqL$IVwth, FUN=mean)
Mi  <- ave(dfSPFpqL$DV, dfSPFpqL$id,                                 FUN=mean)
Mj  <- ave(dfSPFpqL$DV,              dfSPFpqL$IVbtw,                 FUN=mean)

IDxIV    <- dfSPFpqL$DV - Mi - Mjk + Mj
sum(IDxIV^2)
dfSPFpqL <- cbind(dfSPFpqL, IDxIV)

errMat <- data.matrix(unstack(dfSPFpqL, IDxIV ~ IVwth))
Serr   <- cov(errMat)
(epsGG <- (1 / (Q-1)) * sum(diag(Serr))^2 / sum(Serr^2))

dfId   <- (Nj-1) * P
(epsHF <- ((dfId+1) * (Q-1) * epsGG - 2) / ((Q-1) * (dfId - (Q-1)*epsGG)))

##### 7.8.3 Multivariat formuliert auswerten ----
## für Daten des Buches diese Datei laden:
# load("data_spfpq.Rdata")
IVbtwW   <- factor(rep(LETTERS[1:P], Nj))
dfSPFpqW <- data.frame(IVbtwW, DV_t1, DV_t2, DV_t3)

fitSPFpq <- lm(cbind(DV_t1, DV_t2, DV_t3) ~ IVbtwW, data=dfSPFpqW)
inSPFpq  <- data.frame(IVwth=gl(Q, 1))

library(car)
AnovaSPFpq <- Anova(fitSPFpq, idata=inSPFpq, idesign=~IVwth)
summary(AnovaSPFpq, multivariate=FALSE, univariate=TRUE)

anova(fitSPFpq, M=~1,     X=~0, idata=inSPFpq, test="Spherical")
anova(fitSPFpq, M=~IVwth, X=~1, idata=inSPFpq, test="Spherical")
mauchly.test(fitSPFpq, M=~IVwth, X=~1, idata=inSPFpq)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:car))
try(detach(package:carData))

##### 7.8.4 Einzelvergleiche (Kontraste) ----
## für Daten des Buches diese Datei laden:
# load("data_spfpq.Rdata")
aovRes  <- aov(DV ~ IVbtw, data=mDf)
cntrMat <- rbind("-0.5*(A+B)+C"=c(-1/2, -1/2, 1))
library(multcomp)
summary(glht(aovRes, linfct=mcp(IVbtw=cntrMat), alternative="greater"))

P       <- nlevels(mDf$IVbtw)
Nj      <- table(mDf$IVbtw)
Mj      <- tapply(mDf$DV, mDf$IVbtw, FUN=mean)
SSw     <- sum((mDf$DV - ave(mDf$DV, mDf$IVbtw, FUN=mean))^2)
MSw     <- SSw / (sum(Nj) - P)
(psiHat <- sum(cntrMat[1, ]   * Mj))
lenSq   <- sum(cntrMat[1, ]^2 / Nj)
(tStat  <- psiHat / sqrt(lenSq*MSw))
(pVal   <- pt(tStat, sum(Nj) - P, lower.tail=FALSE))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:multcomp))
try(detach(package:mvtnorm))
try(detach(package:TH.data))
try(detach(package:survival))
try(detach(package:MASS))

##### 7.8.5 Erweiterung auf dreifaktorielles SPF-p.qr Design ----
##### 7.8.5.1 Univariat formulierte Auswertung mit aov() ----
Nj     <- 10
P      <- 2
Q      <- 3
R      <- 2
id     <- factor(rep(1:(P*Nj),            times=Q*R))
IVbtw  <- factor(rep(LETTERS[1:P],        times=Q*R*Nj))
IVwth1 <- factor(rep(1:Q,                  each=P*R*Nj))
IVwth2 <- factor(rep(rep(1:R, each=P*Nj), times=Q))
DV_t11 <- round(rnorm(P*Nj,  8, 2), 2)
DV_t21 <- round(rnorm(P*Nj, 13, 2), 2)
DV_t31 <- round(rnorm(P*Nj, 13, 2), 2)
DV_t12 <- round(rnorm(P*Nj, 10, 2), 2)
DV_t22 <- round(rnorm(P*Nj, 15, 2), 2)
DV_t32 <- round(rnorm(P*Nj, 15, 2), 2)
DV     <- c(DV_t11, DV_t12, DV_t21, DV_t22, DV_t31, DV_t32)
dfSPFp.qrL <- data.frame(id, IVbtw, IVwth1, IVwth2, DV)

aovSPFp.qr <- aov(DV ~ IVbtw*IVwth1*IVwth2 + Error(id/(IVwth1*IVwth2)), data=dfSPFp.qrL)
summary(aovSPFp.qr)

##### Effektstärke: generalisiertes eta-Quadrat
library(effectsize)
eta_squared(aovSPFp.qr, partial=FALSE)
eta_squared(aovSPFp.qr, partial=TRUE)
eta_squared(aovSPFp.qr, generalized=TRUE)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:effectsize))

##### 7.8.5.2 Multivariat formulierte Auswertung ----
IVbtwW     <- factor(rep(LETTERS[1:P], Nj))
dfSPFp.qrW <- data.frame(IVbtwW, DV_t11, DV_t21, DV_t31, DV_t12, DV_t22, DV_t32)

fitSPFp.qr <- lm(cbind(DV_t11, DV_t21, DV_t31, DV_t12, DV_t22, DV_t32) ~ IVbtwW,
                 data=dfSPFp.qrW)
inSPFp.qr  <- expand.grid(IVwth1=gl(Q, 1), IVwth2=gl(R, 1))

library(car)
AnovaSPFp.qr <- Anova(fitSPFp.qr, idata=inSPFp.qr, idesign=~IVwth1*IVwth2)
summary(AnovaSPFp.qr, multivariate=FALSE, univariate=TRUE)

anova(fitSPFp.qr, M=~1,                               X=~0,
      idata=inSPFp.qr, test="Spherical")
anova(fitSPFp.qr, M=~IVwth1,                          X=~1,
      idata=inSPFp.qr, test="Spherical")
anova(fitSPFp.qr, M=~IVwth1 + IVwth2,                 X=~IVwth1,
      idata=inSPFp.qr, test="Spherical")
anova(fitSPFp.qr, M=~IVwth1 + IVwth2 + IVwth1:IVwth2, X=~IVwth1 + IVwth2,
      idata=inSPFp.qr, test="Spherical")

mauchly.test(fitSPFp.qr, M=~IVwth1,                          X=~1,
             idata=inSPFp.qr)
## Mauchly-Test IVwth2 hier unnötig, da R=2 -> Zirkularität liegt automatisch vor
mauchly.test(fitSPFp.qr, M=~IVwth1 + IVwth2,                 X=~IVwth1,
             idata=inSPFp.qr)
mauchly.test(fitSPFp.qr, M=~IVwth1 + IVwth2 + IVwth1:IVwth2, X=~IVwth1 + IVwth2,
             idata=inSPFp.qr)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:car))
try(detach(package:carData))

##### 7.8.6 Erweiterung auf dreifaktorielles SPF-pq.r Design ----
##### 7.8.6.1 Univariat formulierte Auswertung mit aov() ----
Njk    <- 10
P      <- 2
Q      <- 2
R      <- 3
id     <- factor(rep(1:(P*Q*Njk),          times=R))
IVbtw1 <- factor(rep(1:P,                  times=Q*R*Njk))
IVbtw2 <- factor(rep(rep(1:Q, each=P*Njk), times=R))
IVwth  <- factor(rep(1:R,                   each=P*Q*Njk))
DV_t1  <- round(rnorm(P*Q*Njk, -3, 2), 2)
DV_t2  <- round(rnorm(P*Q*Njk,  1, 2), 2)
DV_t3  <- round(rnorm(P*Q*Njk,  2, 2), 2)
DV     <- c(DV_t1, DV_t2, DV_t3)
dfSPFpq.rL <- data.frame(id, IVbtw1, IVbtw2, IVwth, DV)

aovSPFpq.r <- aov(DV ~ IVbtw1*IVbtw2*IVwth + Error(id/IVwth), data=dfSPFpq.rL)
summary(aovSPFpq.r)

##### Effektstärke: generalisiertes eta-Quadrat
library(effectsize)
eta_squared(aovSPFpq.r, partial=FALSE)
eta_squared(aovSPFpq.r, partial=TRUE)
eta_squared(aovSPFpq.r, generalized=TRUE)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:effectsize))

##### 7.8.6.2 Multivariat formulierte Auswertung ----
IVbtw1W    <- factor(rep(LETTERS[1:P], times=Q*Njk))
IVbtw2W    <- factor(rep(c("+", "-"),   each=P*Njk))
dfSPFpq.rW <- data.frame(IVbtw1W, IVbtw2W, DV_t1, DV_t2, DV_t3)
fitSPFpq.r <- lm(cbind(DV_t1, DV_t2, DV_t3) ~ IVbtw1W*IVbtw2W, data=dfSPFpq.rW)
inSPFpq.r  <- data.frame(IVwth=gl(R, 1))

library(car)
AnovaSPFpq.r <- Anova(fitSPFpq.r, idata=inSPFpq.r, idesign=~IVwth)
summary(AnovaSPFpq.r, multivariate=FALSE, univariate=TRUE)

anova(fitSPFpq.r, M=~1,     X=~0, idata=inSPFpq.r, test="Spherical")
anova(fitSPFpq.r, M=~IVwth, X=~1, idata=inSPFpq.r, test="Spherical")
mauchly.test(fitSPFpq.r, M=~IVwth, X=~1, idata=inSPFpq.r)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:car))
try(detach(package:carData))

####*--------------------------------------------------------------------------*
#### 7.9 Kovarianzanalyse ----
##### 7.9.1 Effekte testen und Effekstärke schätzen ----
SSRIpre  <- c(18, 16, 16, 15, 14, 20, 14, 21, 25, 11)
SSRIpost <- c(12,  0, 10,  9,  0, 11,  2,  4, 15, 10)
PlacPre  <- c(18, 16, 15, 14, 20, 25, 11, 25, 11, 22)
PlacPost <- c(11,  4, 19, 15,  3, 14, 10, 16, 10, 20)
WLpre    <- c(15, 19, 10, 29, 24, 15,  9, 18, 22, 13)
WLpost   <- c(17, 25, 10, 22, 23, 10,  2, 10, 14,  7)

P        <- 3
Nj       <- rep(length(SSRIpre), times=P)
IV       <- factor(rep(1:P, Nj), labels=c("SSRI", "Placebo", "WL"))
DVpre    <- c(SSRIpre,  PlacPre,  WLpre)
DVpost   <- c(SSRIpost, PlacPost, WLpost)
dfAncova <- data.frame(id=1:sum(Nj), IV, DVpre, DVpost)

library(ggplot2)
library(dplyr)
dfAncovaL <- reshape(dfAncova,
                     direction="long",
                     varying=c("DVpre", "DVpost"),
                     v.names="DV",
                     timevar="time") |>
    mutate(time=factor(time, levels=1:2, labels=c("pre", "post")))

ggplot(dfAncovaL, aes(x=IV, y=DV)) +
    geom_boxplot() +
    facet_grid(~ time) +
    labs(title="Prä- / Post-Scores je Gruppe")

fitFull <- lm(DVpost ~ IV + DVpre, data=dfAncova)
fitGrp  <- lm(DVpost ~ IV,         data=dfAncova)
fitRegr <- lm(DVpost ~      DVpre, data=dfAncova)

anova(fitGrp)
(anova_full <- anova(fitFull))

library(effectsize)
eta_squared(fitFull, partial=TRUE)

SSEfull <- anova_full["Residuals", "Sum Sq"]
SSregr  <- anova_full["DVpre", "Sum Sq"]
SSgrp   <- anova_full["IV",    "Sum Sq"]
SSgrp  / (SSgrp + SSEfull)
SSregr / (SSregr + SSEfull)
cor(residuals(lm(DVpre ~ IV)), residuals(lm(DVpost ~ IV)))^2

summary(fitFull)

library(emmeans)
library(ggplot2)
dfAncova$tvar <- dfAncova$IV
emmip(fitFull, IV ~ DVpre, at=list(DVpre=0:30), style="numeric") +
    geom_point(data=dfAncova, aes(x=DVpre, y=DVpost)) +
    labs(title="Rohdaten und Regressionsgerade pro Gruppe")

summary(lm(DVpost ~ IV + DVpre + IV:DVpre, data=dfAncova))

try(detach(package:emmeans))
try(detach(package:ggplot2))
try(detach(package:effectsize))
try(detach(package:dplyr))

##### 7.9.2 Quadratsummen vom Typ III ----
## für Daten des Buches diese Datei laden:
# load("data_ancova.Rdata")
library(car)
fitIII <- lm(DVpost ~ IV + DVpre,
             contrasts=list(IV=contr.sum), data=dfAncova)
Anova(fitIII, type="III")
anova(fitRegr, fitFull)
anova(fitGrp,  fitFull)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:car))
try(detach(package:carData))

##### 7.9.3 Beliebige a-priori Kontraste ----
## für Daten des Buches diese Datei laden:
# load("data_ancova.Rdata")
library(emmeans)
aovAncova <- aov(DVpost ~ IV + DVpre, data=dfAncova)
emmeans(aovAncova, ~IV)

cntrMat <- rbind("SSRI-Placebo"  = c(-1,  1, 0),
                 "SSRI-WL"       = c(-1,  0, 1),
                 "SSRI-0.5(P+WL)"= c(-2,  1, 1))

library(multcomp)
(sumRes <- summary(glht(aovAncova, linfct=mcp(IV=cntrMat),
                        alternative="greater"),
                   test=adjusted("none")))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:emmeans))
try(detach(package:multcomp))
try(detach(package:mvtnorm))
try(detach(package:TH.data))
try(detach(package:survival))
try(detach(package:MASS))

##### 7.9.4 Beliebige post-hoc Kontraste nach Scheffé ----
## für Daten des Buches diese Datei laden:
# load("data_ancova.Rdata")
N         <- nrow(dfAncova)
dfSSgrp   <- P-1
dfSSEfull <- N-P-1
Fstats    <- sumRes$test$tstat^2
(pVals    <- pf(Fstats/dfSSgrp, dfSSgrp, dfSSEfull, lower.tail=FALSE))

##### manuelle Rechnung
X    <- DVpre
Y    <- DVpost
XMj  <- tapply(X, IV, FUN=mean)
YMj  <- tapply(Y, IV, FUN=mean)
N    <- length(Y)
Xctr <- X - ave(X, IV, FUN=mean)
Yctr <- Y - ave(Y, IV, FUN=mean)

bFull     <- cov(Xctr, Yctr) / var(Xctr)
aFull     <- YMj - bFull*XMj
YhatFull  <- bFull*X + aFull[IV]
SSEfull   <- sum((Y-YhatFull)^2)
dfSSEfull <- N-P-1
MSEfull   <- SSEfull / dfSSEfull

bRegr     <- cov(X, Y) / var(X)
aRegr     <- mean(Y) - bRegr*mean(X)
YhatRegr  <- bRegr*X + aRegr
SSEregr   <- sum((Y-YhatRegr)^2)
dfSSEregr <- N-2
MSEregr   <- SSEregr / dfSSEregr

Vj        <- tapply(Y, IV, FUN=var)
M         <- sum((Nj/N) * YMj)
SSEgrp    <- sum((Nj-1) * Vj)
dfSSEgrp  <- N-P
MSEgrp    <- SSEgrp / dfSSEgrp

SSregr    <- SSEgrp - SSEfull
dfRegr    <- dfSSEgrp - dfSSEfull
MSregr    <- SSregr / dfRegr
(Fregr    <- MSregr / MSEfull)

SSgrp     <- SSEregr - SSEfull
dfGrp     <- dfSSEregr - dfSSEfull
MSgrp     <- SSgrp / dfGrp
(Fgrp     <- MSgrp / MSEfull)

####*--------------------------------------------------------------------------*
#### 7.10 Adjustierte Mittelwerte und marginale Effekte ----
Njk    <- 10
P      <- 2
Q      <- 3
F1     <- factor(letters[rep(1:P, times=Njk*Q)])
F2     <- factor(LETTERS[rep(1:Q,  each=Njk*P)])
X      <- rnorm(Njk*P*Q, 10, 1)
eF1    <- c(0.5, -0.5)
eF2    <- c(-0.5, 0, 0.5)
eF1F2  <- c(0.2, -0.2, -0.1, 0.1, -0.1, 0.1)
muF1.  <- eF1[unclass(F1)]
muF2.  <- eF2[unclass(F2)] * 0.2*X
muF1F2 <- eF1F2[unclass(interaction(F1, F2))]
mu     <- -0.2*X + muF1. + muF2. + muF1F2
Y      <- rnorm(Njk*P*Q, mu, 1)
dEMM   <- data.frame(F1, F2, X, Y)

## für Daten des Buches diese Datei laden:
# load("data_emm.Rdata")
lm_fit <- lm(Y ~ F1 + F2 + X + F1:F2 + F2:X, data=dEMM)
summary(lm_fit)

##### 7.10.1 Adjustierte Mittelwerte ----
##### 7.10.1.1 Adjustierte Mittelwerte berechnen ----
## für Daten des Buches diese Datei laden:
# load("data_emm.Rdata")
library(emmeans)
emmeans(lm_fit, specs=~ F1)
emmeans(lm_fit, specs=~ F1 + F2)
emmeans(lm_fit, specs=~ F1 | F2)
emmeans(lm_fit, specs=~ F1 | X, at=list(X=c(5, 10)))
emmeans(lm_fit, specs=~ F1,     at=list(F1=c("a")))
emmeans(lm_fit, specs=~ X,      at=list(X=c(5, 10, 15)))

##### 7.10.1.2 Adjustierte Mittelwerte grafisch darstellen ----
## für Daten des Buches diese Datei laden:
# load("data_emm.Rdata")
library(ggplot2)
emm <- emmeans(lm_fit, specs=~ F2 | F1)
plot(emm) +
    labs(title="Adjustierte Mittelwerte",
         x="Estimated Marginal Mean")

d_ip <- emmip(lm_fit, F2 ~ X,
              at=list(X=seq(5, 15, length=20),
                      F2=c("A", "B", "C")),
              CIs=TRUE, plotit=FALSE)

ggplot(d_ip, aes(x=X, y=yvar, group=F2,
                 color=F2, fill=F2, ymin=LCL, ymax=UCL)) +
    geom_ribbon(alpha=0.4, linetype="blank") +
    geom_line() +
    labs(title="Moderated Regression", y="Predicted Slope")

##### 7.10.1.3 Adjustierte Mittelwerte testen ----
## für Daten des Buches diese Datei laden:
# load("data_emm.Rdata")
emm <- emmeans(lm_fit, specs=~ F2 | F1)
summary(emm, infer=TRUE)

pairs(emm, simple="F2", by="F1", adjust="Tukey")

##### 7.10.1.4 Bedingte Effekte in der zweifaktoriellen ANOVA ----
## für Daten des Buches diese Datei laden:
# load("data_crfpq.Rdata")
str(dfCRFpq)
aovCRFpq <- aov(DV ~ IV1*IV2, data=dfCRFpq)
(emm     <- emmeans(aovCRFpq, ~ IV1 + IV2))
joint_tests(emm, by="IV1")

## manuelle Berechnung
CRFp1 <- anova(lm(DV ~ IV2, data=dfCRFpq, subset=(IV1=="1")))
CRFp2 <- anova(lm(DV ~ IV2, data=dfCRFpq, subset=(IV1=="2")))

SSp1  <- CRFp1["IV2", "Sum Sq"]
SSp2  <- CRFp2["IV2", "Sum Sq"]

CRFpq <- anova(lm(DV ~ IV1*IV2, data=dfCRFpq))
SSB   <- CRFpq["IV2",       "Sum Sq"]
SSI   <- CRFpq["IV1:IV2",   "Sum Sq"]
SSE   <- CRFpq["Residuals", "Sum Sq"]
dfSSB <- CRFpq["IV2",       "Df"]
dfSSE <- CRFpq["Residuals", "Df"]

all.equal(SSp1 + SSp2, SSB + SSI)

(Fp1  <- (SSp1/dfSSB) / (SSE/dfSSE))
(Fp2  <- (SSp2/dfSSB) / (SSE/dfSSE))

(pP1  <- pf(Fp1, dfSSB, dfSSE, lower.tail=FALSE))
(pP2  <- pf(Fp2, dfSSB, dfSSE, lower.tail=FALSE))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:emmeans))
try(detach(package:ggplot2))

##### 7.10.2 Marginale Effekte ----
##### 7.10.2.1 Marginale Effekte berechnen ----
## für Daten des Buches diese Datei laden:
# load("data_emm.Rdata")
library(emmeans)
(emt1 <- emtrends(lm_fit, ~ F2, var="X", at=list(F2=c("A", "B"))))

##### 7.10.2.2 Marginale Effekte grafisch darstellen ----
## für Daten des Buches diese Datei laden:
# load("data_emm.Rdata")
library(ggplot2)
plot(emt1) +
    labs(title="Marginal Effects",
         x="Simple Slope von X")

##### 7.10.2.3 Marginale Effekte testen ----
emt2 <- emtrends(lm_fit, ~ F2, var="X", at=list(F2=c("A", "B", "C")))
summary(emt2, infer=TRUE)

##### 7.10.2.4 Moderierte Regression ----
ageC_vals <- c(-sd(ageC), 0, sd(ageC))
emt3 <- emtrends(fitHAi, ~ageC, var="heightC",
                 at=list(ageC=ageC_vals), infer=TRUE)

summary(emt3, infer=TRUE)

## manuelle Berechnung
coeffs <- coef(fitHAi)
b0 <- coeffs[1]
b1 <- coeffs[2]
b2 <- coeffs[3]
b3 <- coeffs[4]

b0 + b2*(mean(ageC) - sd(ageC))
b0 + b2* mean(ageC)
b0 + b2*(mean(ageC) + sd(ageC))
b1 + b3*(mean(ageC) - sd(ageC))
b1 + b3* mean(ageC)
b1 + b3*(mean(ageC) + sd(ageC))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:emmeans))
try(detach(package:ggplot2))

####*--------------------------------------------------------------------------*
#### 7.11 Power, Effektstärke und notwendige Stichprobengröße ----
##### 7.11.1 Binomialtest ----
N      <- 7
pH0    <- 0.25
pH1    <- 0.7
alpha  <- 0.05
(critB <- qbinom(alpha, N, pH0, lower.tail=FALSE))
(powB  <- pbinom(critB, N, pH1, lower.tail=FALSE))
sum(dbinom((critB+1):N, N, pH1))

par(mfrow=c(1, 2))
dH0 <- dbinom(0:N, N, pH0)
dH1 <- dbinom(0:N, N, pH1)
mat <- rbind(dH0, dH1)
rownames(mat) <- c("H0 (p=0.25)", "H1 (p=0.7)")
colnames(mat) <- 0:N
barsX <- barplot(mat, beside=TRUE, ylim=c(0, 0.35), xlab="Anzahl Treffer",
                 ylab="Wahrscheinlichkeit",
                 main="Binomialvert. unter H0 und H1 (N=7)",
                 col=c(rgb(1, 0.2, 0.2, 0.7), rgb(0.3, 0.3, 1, 0.6)),
                 names.arg=colnames(mat), legend.text=rownames(mat))
barplot(mat[ , 1:(critB+1)], beside=TRUE, ylim=c(0, 0.35),
        col=c("red", "blue"), add=TRUE)
xx <- barsX[2, critB+1] + (barsX[1, critB+2] - barsX[2, critB+1])/4
abline(v=xx, col="green", lwd=2)
text(xx - 0.4, 0.34, adj=1, labels="kritischer Wert", cex=1.7)

Nvec     <- 2:15
critBvec <- qbinom(alpha,    size=Nvec, prob=pH0, lower.tail=FALSE)
powBvec  <- pbinom(critBvec, size=Nvec, prob=pH1, lower.tail=FALSE)

plot(Nvec, critBvec, main="Power und kritischer Wert Binomialtest",
     xlab="N", xaxt="n", yaxt="n", lwd=2, type="s", pch=16, col="red",
     ylab="kritischer Wert")
axis(side=1, at=seq(Nvec[1], Nvec[length(Nvec)], by=1))
axis(side=2, at=seq(min(critBvec), max(critBvec), by=1), col="red")
par(new=TRUE)
plot(Nvec, powBvec, ylim=c(0, 1), type="b", lwd=2, pch=16, col="blue",
     xlab=NA, ylab=NA, axes=FALSE)
axis(side=4, at=seq(0, 1, by=0.1), col="blue")
mtext(text="Power", side=4, line=3, cex=1.7)

##### 7.11.2 t-Test ----
N      <- 10
muH0   <- 0
muH1   <- 1.6
alpha  <- 0.05
sigma  <- 2

(d     <- (muH1-muH0) / sigma)
(delta <- (muH1-muH0) / (sigma/sqrt(N)))
(tCrit <- qt(alpha, N-1, lower.tail=FALSE))
(powT  <- pt(tCrit, N-1, delta, lower.tail=FALSE))
xLims  <- c(-5, 10)

tLeft  <- seq(xLims[1], tCrit, length.out=100)
tRight <- seq(tCrit, xLims[2], length.out=100)
yH0r   <- dt(tRight, N-1, 0)
yH1l   <- dt(tLeft,  N-1, delta)
yH1r   <- dt(tRight, N-1, delta)

curve(dt(x, N-1, 0), xlim=xLims, lwd=2, col="red", xlab="t", ylab="Dichte",
      main="Verteilung von t unter H0 und H1", ylim=c(0, 0.4), xaxs="i")
curve(dt(x, N-1, delta), lwd=2, col="blue", add=TRUE)
polygon(c(tRight, rev(tRight)), c(yH0r, numeric(length(tRight))), border=NA,
        col=rgb(1, 0.3, 0.3, 0.6))
polygon(c(tLeft,  rev(tLeft)),  c(yH1l, numeric(length(tLeft))),  border=NA,
        col=rgb(0.3, 0.3, 1, 0.6))
polygon(c(tRight, rev(tRight)), c(yH1r, numeric(length(tRight))), border=NA,
        density=5, lty=2, lwd=2, angle=45, col="darkgray")
abline(v=tCrit, lty=1, lwd=3, col="red")
text(tCrit+0.2, 0.4,  adj=0, labels="kritischer Wert", cex=1.4)
text(tCrit-1.5, 0.4,  adj=1, labels="Verteilung unter H0", cex=1.4)
text(tCrit+1.5, 0.3,  adj=0, labels="Verteilung unter H1", cex=1.4)
text(tCrit+0.8, 0.08, adj=0, labels="Power", cex=1.4)
text(tCrit-0.7, 0.05,  str2expression("beta"), cex=1.4)
text(tCrit+0.5, 0.015, str2expression("alpha"), cex=1.4)

power.t.test(n=NULL, delta=muH1-muH0, sd=sigma, sig.level=0.05, power=0.9,
             type="one.sample", alternative="one.sided")

##### 7.11.3 Einfaktorielle Varianzanalyse (CR-p) ----
muJ   <- c(100, 110, 115)
sigma <- 15
power.anova.test(groups=3, n=NULL, between.var=var(muJ), within.var=sigma^2,
                 sig.level=0.05, power=0.9)

P       <- length(muJ)
Nj      <- c(21, 17, 19)
N       <- sum(Nj)
mu      <- sum(Nj * muJ) / N
alphaJ  <- muJ - mu
varMUj  <- sum(Nj * alphaJ^2) / N

(fSq    <- varMUj /  sigma^2)
(etaSq  <- varMUj / (sigma^2 + varMUj))
(lambda <- sum(Nj * alphaJ^2) / sigma^2)
(Fcrit  <- qf(0.05,  P-1, N-P, lower.tail=FALSE))
(powF   <- pf(Fcrit, P-1, N-P, lambda, lower.tail=FALSE))

fVals   <- seq(0, 1.2, length.out=100)
nn      <- seq(10, 25, by=5)

getFPow <- function(n) {
    Fcrit <- qf(0.05, P-1, P*n - P, lower.tail=FALSE)
    pf(Fcrit, P-1, P*n - P, P*n * fVals^2, lower.tail=FALSE)
}

powsF <- sapply(nn, getFPow)

par(mfrow=c(1, 2))
yStr <- "Wahrscheinlichkeit Annahme von H1"
expr <- str2expression("Power~als~Funktion~von~f~(alpha == 0.05)")
matplot(fVals, powsF, type="l", lty=1, lwd=2, xlab="f", ylab=yStr,
        xlim=c(-0.05, 1.1), main=expr, col=c("blue", "red", "darkgreen", "green"))
lines(c(-2, 0, 0), c(0.05, 0.05, -2), lty=2, lwd=2)
abline(h=1, lty=2, lwd=2)
mtext("f=0", side=1, at=0)
mtext(str2expression("alpha == 0.05"), side=2, at=0.05, las=1)
legend(x="bottomright", legend=paste("Nj =", c(10, 15, 20, 25)), lwd=2,
       col=c("blue", "red", "darkgreen", "green"))

getOneFn <- function(pp, varB) {
    res <- power.anova.test(groups=P, n=NULL, between.var=varB,
                            within.var=sigma^2, sig.level=0.05, power=pp)
    res$n
}

getManyFn <- function(ff, powF) {
    varB <- ff^2 * (P/(P-1)) * sigma^2
    sapply(powF, getOneFn, varB)
}

fVals <- seq(0.1, 0.85, length.out=100)
pows  <- seq(0.8, 0.95, by=0.05)
minN  <- sapply(fVals, getManyFn, pows)
yStr  <- "Mindeststichprobengröße (pro Gruppe)"
expr  <- str2expression("Mindest-N~als~Funktion~von~f~(alpha==0.05)")
matplot(fVals, t(minN), type="l", lty=1, lwd=2, xlab="f", ylab=yStr,
        xlim=c(0.1, 0.8), main=expr, col=c("blue", "red", "darkgreen", "green"))
legend(x="topright", legend=paste("Power = ", c(0.80, 0.85, 0.90, 0.95)),
       lwd=2, col=c("blue", "red", "darkgreen", "green"))

####*--------------------------------------------------------------------------*
### 8 Regressionsmodelle für kategoriale Daten ----
####*--------------------------------------------------------------------------*

####*--------------------------------------------------------------------------*
#### 8.1 Logistische Regression ----
##### 8.1.1 Modell für dichotome Daten anpassen ----
SSRIpre  <- c(18, 16, 16, 15, 14, 20, 14, 21, 25, 11)
SSRIpost <- c(12,  0, 10,  9,  0, 11,  2,  4, 15, 10)
PlacPre  <- c(18, 16, 15, 14, 20, 25, 11, 25, 11, 22)
PlacPost <- c(11,  4, 19, 15,  3, 14, 10, 16, 10, 20)
WLpre    <- c(15, 19, 10, 29, 24, 15,  9, 18, 22, 13)
WLpost   <- c(17, 25, 10, 22, 23, 10,  2, 10, 14,  7)

P        <- 3
Nj       <- rep(length(SSRIpre), times=P)
IV       <- factor(rep(1:P, Nj), labels=c("SSRI", "Placebo", "WL"))
DVpre    <- c(SSRIpre,  PlacPre,  WLpre)
DVpost   <- c(SSRIpost, PlacPost, WLpost)
dfAncova <- data.frame(id=1:sum(Nj), IV, DVpre, DVpost)

## für Daten des Buches diese Datei laden:
# load("data_ancova.Rdata")
dfAncova <- transform(dfAncova,
                      postFac=cut(DVpost,
                                  breaks=c(-Inf, median(DVpost), Inf),
                                  labels=c("lo", "hi")))

cdplot(postFac ~ DVpre, data=dfAncova, subset=(IV == "SSRI"),
       main="Geschätzte Kategorien-Wkt. SSRI")
cdplot(postFac ~ DVpre, data=dfAncova, subset=(IV == "Placebo"),
       main="Geschätzte Kateg.-Wkt. Placebo")
cdplot(postFac ~ DVpre, data=dfAncova, subset=(IV == "WL"),
       main="Geschätzte Kategorien-Wkt. WL")

(glmFit <- glm(postFac ~ DVpre + IV,
               family=binomial(link="logit"),
               data=dfAncova))

exp(coef(glmFit))
exp(confint(glmFit))

##### 8.1.2 Modell für binomiale Daten anpassen ----
N      <- 100
x1     <- rnorm(N, 100, 15)
x2     <- rnorm(N, 10, 3)
total  <- sample(40:60, N, replace=TRUE)
hits   <- rbinom(N, total, prob=0.4)
hitMat <- cbind(hits, total-hits)
glm(hitMat ~ x1 + x2, family=binomial(link="logit"))

relHits <- hits/total
glm(relHits ~ x1 + x2, weights=total, family=binomial(link="logit"))

##### 8.1.3 Anpassungsgüte ----
## für Daten des Buches diese Datei laden:
# load("data_ancova.Rdata")
(Dev <- deviance(glmFit))
sum(residuals(glmFit)^2)

as.vector(-2 * logLik(glmFit))
all.equal(AIC(glmFit), Dev + 2*(3+1))

library(DescTools)
PseudoR2(glmFit, which=c("McFadden", "CoxSnell", "Nagelkerke"))

glm0 <- update(glmFit, . ~ 1)
LLf  <- logLik(glmFit)
LL0  <- logLik(glm0)

N <- nobs(glmFit)
as.vector( 1 - (LLf / LL0))
as.vector( 1 - exp((2/N) * (LL0 - LLf)))
as.vector((1 - exp((2/N) * (LL0 - LLf))) / (1 - exp(LL0)^(2/N)))

PseudoR2(glmFit, which="Tjur")

Phat <- fitted(glmFit)
(PhatLoHi <- aggregate(Phat ~ postFac, FUN=mean, data=dfAncova))
abs(diff(PhatLoHi$Phat))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:DescTools))

##### 8.1.4 Signifikanztests für Parameter und Modell ----
## für Daten des Buches diese Datei laden:
# load("data_ancova.Rdata")
summary(glmFit)

OR    <- exp(coef(glmFit))
varB  <- diag(vcov(glmFit))
(seOR <- sqrt(OR^2 * varB))[-1]

glm0 <- update(glmFit, . ~ 1)
anova(glm0, glmFit, test="Chisq")

chisqStat <- glmFit$null.deviance - deviance(glmFit)
chisqDf   <- glmFit$df.null       - df.residual(glmFit)
(pVal     <- pchisq(chisqStat, chisqDf, lower.tail=FALSE))

glmPre <- update(glmFit, . ~ . -IV)
anova(glmPre, glmFit, test="Chisq")
anova(glm0,   glmFit, test="Chisq")

##### 8.1.5 Vorhersage, Klassifikation, Kalibrierung und Anwendung auf neue Daten ----
## für Daten des Buches diese Datei laden:
# load("data_ancova.Rdata")
Phat <- fitted(glmFit)
predict(glmFit, type="response")
logitHat <- predict(glmFit, type="link")
all.equal(logitHat, log(Phat / (1-Phat)))
all.equal(logitHat, qlogis(Phat))
mean(Phat)
proportions(xtabs(~ postFac, data=dfAncova))

n_grp_calib <- 3
Phat_cut <- cut(Phat,
                breaks=quantile(Phat, probs=seq(from=0, to=1, length.out=n_grp_calib+1)),
                include.lowest=TRUE)

Phat_cut_avg <- tapply(Phat, Phat_cut, FUN=mean)
obs_cut_avg  <- tapply(dfAncova$postFac == "hi", Phat_cut, FUN=mean)
cbind(p_observed=obs_cut_avg,
      p_predicted=Phat_cut_avg)

thresh <- 0.5
facHat <- cut(Phat, breaks=c(-Inf, thresh, Inf), labels=c("lo", "hi"))
cTab   <- xtabs(~ postFac + facHat, data=dfAncova)
addmargins(cTab)
(CCR   <- sum(diag(cTab)) / sum(cTab))

plot(logitHat, pch=c(1, 16)[unclass(dfAncova$postFac)], cex=1.5, lwd=2,
     main="(Fehl-) Klassifikation durch Vorhersage",
     ylab="vorhergesagte logits")
abline(h=0)
legend(x="bottomright", legend=c("lo", "hi"), pch=c(1, 16), cex=1.5,
       lty=NA, lwd=2, bg="white")

Nnew  <- 3
dfNew <- data.frame(DVpre=rnorm(Nnew, 20, sd=7),
                    IV=factor(rep("SSRI", Nnew), levels=levels(dfAncova$IV)))
predict(glmFit, newdata=dfNew, type="response")

##### 8.1.6 Andere Link-Funktionen ----
##### 8.1.7 Modelle für Overdispersion ----
## für Daten des Buches diese Datei laden:
# load("data_ancova.Rdata")
glmFitQB <- glm(postFac ~ DVpre + IV,
                family=quasibinomial(link="logit"),
                data=dfAncova)

summary(glmFitQB)

res_pearson <- residuals(glmFitQB, type="pearson")
(phi <- sum(res_pearson^2) / glmFitQB$df.residual)

##### 8.1.8 Mögliche Probleme bei der Modellanpassung ----

####*--------------------------------------------------------------------------*
#### 8.2 Ordinale Regression ----
##### 8.2.1 Modellanpassung ----
N      <- 100
X1     <- rnorm(N, 175, 7)
X2     <- rnorm(N,  30, 8)
Ycont  <- 0.5*X1 - 0.3*X2 + 10 + rnorm(N, 0, 6)
Yord   <- cut(Ycont, breaks=quantile(Ycont), include.lowest=TRUE,
              labels=c("--", "-", "+", "++"), ordered=TRUE)
Ycateg <- factor(Yord, ordered=FALSE)
dfOrd  <- data.frame(X1, X2, Yord, Ycateg)
## für Daten des Buches diese Datei laden:
# load("data_regrOrd.Rdata")

library(VGAM)
vglmFit <- vglm(Yord ~ X1 + X2, family=propodds, model=TRUE, data=dfOrd)
exp(VGAM::coef(vglmFit))

##### 8.2.2 Anpassungsgüte ----
## für Daten des Buches diese Datei laden:
# load("data_regrOrd.Rdata")
library(DescTools)

VGAM::deviance(vglmFit)
VGAM::AIC(vglmFit)
PseudoR2(vglmFit, which="Nagelkerke")

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:DescTools))

##### 8.2.3 Signifikanztests für Parameter und Modell ----
## für Daten des Buches diese Datei laden:
# load("data_regrOrd.Rdata")
library(VGAM)
sumOrd   <- VGAM::summary(vglmFit)
(coefOrd <- VGAM::coef(sumOrd))
VGAM::confint(vglmFit, method="profile")

vglmR <- vglm(Yord ~ X1, family=propodds, data=dfOrd)
anova(vglmR, vglmFit, type="I")

vglm0 <- vglm(Yord ~ 1,  family=propodds, data=dfOrd)
anova(vglm0, vglmFit, type="I")

vglmP  <- vglm(Yord ~ X1 + X2, family=cumulative(parallel=TRUE),
               data=dfOrd)
vglmNP <- vglm(Yord ~ X1 + X2, family=cumulative(parallel=FALSE),
               data=dfOrd)
anova(vglmNP, vglmP, type="I")

##### 8.2.4 Vorhersage, Klassifikation und Anwendung auf neue Daten ----
## für Daten des Buches diese Datei laden:
# load("data_regrOrd.Rdata")
PhatCateg <- predict(vglmFit, type="response")
head(PhatCateg, n=3)
categHat <- levels(dfOrd$Yord)[max.col(PhatCateg)]
head(categHat)

facHat <- factor(categHat, levels=levels(dfOrd$Yord))
cTab   <- table(dfOrd$Yord, facHat, dnn=c("Yord", "facHat"))
addmargins(cTab)
(CCR   <- sum(diag(cTab)) / sum(cTab))

Nnew  <- 3
dfNew <- data.frame(X1=rnorm(Nnew, 175, 7),
                    X2=rnorm(Nnew,  30, 8))

predict(vglmFit, newdata=dfNew, type="response")

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:VGAM))
try(detach(package:splines))
try(detach(package:stats4))

####*--------------------------------------------------------------------------*
#### 8.3 Multinomiale Regression ----
##### 8.3.1 Modellanpassung ----
## für Daten des Buches diese Datei laden:
# load("data_regrOrd.Rdata")
library(VGAM)
vglmFitMN <- vglm(Ycateg ~ X1 + X2, family=multinomial(refLevel=1),
                  model=TRUE, data=dfOrd)
exp(VGAM::coef(vglmFitMN))

##### 8.3.2 Anpassungsgüte ----
## für Daten des Buches diese Datei laden:
# load("data_regrOrd.Rdata")
library(DescTools)

VGAM::deviance(vglmFitMN)
VGAM::AIC(vglmFitMN)
PseudoR2(vglmFitMN, which="Nagelkerke")

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:DescTools))

##### 8.3.3 Signifikanztests für Parameter und Modell ----
## für Daten des Buches diese Datei laden:
# load("data_regrOrd.Rdata")
sumMN   <- VGAM::summary(vglmFitMN)
(coefMN <- VGAM::coef(sumMN))
confint(vglmFitMN, method="profile")

vglmFitR <- vglm(Ycateg ~ X1, family=multinomial(refLevel=1), data=dfOrd)
anova(vglmFitR, vglmFitMN, type="I")

##### 8.3.4 Vorhersage, Klassifikation und Anwendung auf neue Daten ----
## für Daten des Buches diese Datei laden:
# load("data_regrOrd.Rdata")
PhatCateg <- predict(vglmFitMN, type="response")
head(PhatCateg, n=3)
colMeans(PhatCateg)
proportions(xtabs(~ Ycateg, data=dfOrd))

categHat <- levels(dfOrd$Ycateg)[max.col(PhatCateg)]
head(categHat)

facHat <- factor(categHat, levels=levels(dfOrd$Ycateg))
cTab   <- table(dfOrd$Ycateg, facHat, dnn=c("Ycateg", "facHat"))
addmargins(cTab)
(CCR   <- sum(diag(cTab)) / sum(cTab))

Nnew  <- 3
dfNew <- data.frame(X1=rnorm(Nnew, 175, 7),
                    X2=rnorm(Nnew,  30, 8))

predict(vglmFitMN, newdata=dfNew, type="response")

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:VGAM))
try(detach(package:splines))
try(detach(package:stats4))

####*--------------------------------------------------------------------------*
#### 8.4 Regression für Zähldaten ----
##### 8.4.1 Poisson-Regression ----
library(mvtnorm)
N     <- 200
sigma <- matrix(c(4,2,-3, 2,16,-1, -3,-1,8), byrow=TRUE, ncol=3)
mu    <- c(-3, 2, 4)
XY    <- rmvnorm(N, mean=mu, sigma=sigma)
Y     <- round(XY[ , 3] - 1.5)
Y[Y < 0] <- 0
dfCount <- data.frame(X1=XY[ , 1], X2=XY[ , 2], Y)
## für Daten des Buches diese Datei laden:
# load("data_regrCount.Rdata")

glmFitP <- glm(Y ~ X1 + X2, family=poisson(link="log"), data=dfCount)
exp(coef(glmFitP))
exp(confint(glmFitP))
summary(glmFitP)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:mvtnorm))

##### 8.4.2 Ereignisraten analysieren ----
Nt   <- 100
Ti   <- sample(20:40, Nt, replace=TRUE)
Xt   <- rnorm(Nt, 100, 15)
Yt   <- rbinom(Nt, size=Ti, prob=0.5)
fitT <- glm(Yt ~ Xt, family=poisson(link="log"), offset=log(Ti))
summary(fitT)

summary(glm(Yt ~ Xt + offset(log(Ti)), family=poisson(link="log")))

##### 8.4.3 Adjustierte Poisson-Regression und Negativ-Binomial-Regression ----
## für Daten des Buches diese Datei laden:
# load("data_regrCount.Rdata")
glmFitQP <- glm(Y ~ X1 + X2, family=quasipoisson(link="log"), data=dfCount)
summary(glmFitQP)

res_pearson <- residuals(glmFitQP, type="pearson")
(phi <- sum(res_pearson^2) / glmFitQP$df.residual)

library(sandwich)
hcSE <- vcovHC(glmFitP, type="HC0")
library(lmtest)
coeftest(glmFitP, vcov=hcSE)

library(MASS)
glmFitNB <- glm.nb(Y ~ X1 + X2, data=dfCount)
summary(glmFitNB)

library(pscl)
odTest(glmFitNB)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:sandwich))
try(detach(package:lmtest))
try(detach(package:pscl))
try(detach(package:MASS))
try(detach(package:zoo))

##### 8.4.4 Zero-Inflated Poisson-Regression ----
## für Daten des Buches diese Datei laden:
# load("data_regrCount.Rdata")
library(pscl)
ziFitP <- zeroinfl(Y ~ X1 + X2 | 1, dist="poisson", data=dfCount)
summary(ziFitP)

ziFitNB <- zeroinfl(Y ~ X1 + X2 | 1, dist="negbin", data=dfCount)
summary(ziFitNB)

vuong(ziFitP,  glmFitP)
vuong(ziFitNB, glmFitNB)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:pscl))

##### 8.4.5 Zero-truncated Poisson-Regression ----

####*--------------------------------------------------------------------------*
#### 8.5 Loglineare Modelle ----
##### 8.5.1 Modell ----
##### 8.5.2 Modell mit loglm() anpassen ----
library(MASS)
str(UCBAdmissions)

(llFit <- loglm(~ Admit + Dept + Gender, data=UCBAdmissions))

UCBAdf <- as.data.frame(UCBAdmissions)
head(UCBAdf, n=3)
loglm(Freq ~ Admit + Dept + Gender, data=UCBAdf)
(llCoef <- coef(llFit))

library(vcd)
mosaic(~ Admit + Dept + Gender, shade=TRUE, data=UCBAdmissions)

sum(residuals(llFit, type="deviance")^2)
sum(residuals(llFit, type="pearson")^2)

loglm(~ Admit + Dept + Gender + Admit:Dept + Dept:Gender, data=UCBAdmissions)

##### 8.5.3 Modell mit glm() anpassen ----
glmFitT   <- glm(Freq ~ Admit + Dept + Gender, family=poisson(link="log"), data=UCBAdf)
(glmTcoef <- coef(glmFitT))
all.equal(c(fitted(llFit)), fitted(glmFitT), check.attributes=FALSE)

glmFitT_sat <- glm(Freq ~ Admit*Dept*Gender, family=poisson(link="log"), data=UCBAdf)
anova(glmFitT, glmFitT_sat)

glmTcoef["(Intercept)"]
llCoef$`(Intercept)` + llCoef$Admit["Admitted"] + llCoef$Gender["Male"]  + llCoef$Dept["A"]

glmTcoef["(Intercept)"] + glmTcoef["DeptC"] + glmTcoef["GenderFemale"]
llCoef$`(Intercept)` + llCoef$Admit["Admitted"] + llCoef$Dept["C"] + llCoef$Gender["Female"]

glmFitE <- glm(Freq ~ Admit + Dept + Gender, family=poisson(link="log"),
               contrasts=list(Admit=contr.sum,
                               Dept=contr.sum,
                             Gender=contr.sum), data=UCBAdf)
coef(summary(glmFitE))
exp(confint(glmFitE))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:MASS))
try(detach(package:vcd))
try(detach(package:grid))

####*--------------------------------------------------------------------------*
### 9 Survival-Analyse ----
####*--------------------------------------------------------------------------*

####*--------------------------------------------------------------------------*
#### 9.1 Verteilung von Ereigniszeiten ----

####*--------------------------------------------------------------------------*
#### 9.2 Zensierte und gestutzte Ereigniszeiten ----
##### 9.2.1 Zeitlich konstante Prädiktoren ----
N     <- 180
P     <- 3
sex   <- factor(sample(c("f", "m"), N, replace=TRUE))
X     <- rnorm(N, 0, 1)
IV    <- factor(rep(LETTERS[1:P], each=N/P))
IVeff <- c(0, -1, 1.5)
Xbeta <- 0.7*X + IVeff[unclass(IV)] + rnorm(N, 0, 2)

weibA  <- 1.5
weibB  <- 100
U      <- runif(N, 0, 1)
eventT <- ceiling((-log(U)*(weibB^weibA)*exp(-Xbeta))^(1/weibA))
obsLen <- 120

plot(ecdf(eventT), xlim=c(0, 200), main="Kumulative Überlebenszeit-Verteilung",
     xlab="t", ylab="F(t)", cex.lab=1.4)
abline(v=obsLen, col="blue", lwd=2)
text(obsLen-5, 0.2, adj=1, labels="Ende Beobachtungszeit", cex=1.4)

censT  <- rep(obsLen, N)
obsT   <- pmin(eventT, censT)
status <- eventT <= censT
dfSurv <- data.frame(obsT, status, sex, X, IV)
## für Daten des Buches diese Datei laden:
# load("data_survival.Rdata")

library(survival)
Surv(obsT, status)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:survival))

##### 9.2.2 Daten in Zählprozess-Darstellung ----
##### 9.2.2.1 Wiederkehrende Ereignisse ----
## für Daten des Buches diese Datei laden:
# load("data_survival.Rdata")
library(survival)
dfSurvCP <- survSplit(Surv(obsT, status) ~ ., cut=seq(30, 90, by=30),
                      start="start", id="ID", zero=0, data=dfSurv)

head(sort_by(dfSurvCP, ~ ID + start), n=7)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:survival))

##### 9.2.2.2 Zeitabhängige Prädiktoren ----

####*--------------------------------------------------------------------------*
#### 9.3 Kaplan-Meier-Analyse ----
##### 9.3.1 Survival-Funktion schätzen ----
## für Daten des Buches diese Datei laden:
# load("data_survival.Rdata")
library(survival)
KM0 <- survfit(Surv(obsT, status) ~ 1,  type="kaplan-meier", conf.type="log", data=dfSurv)
(KM <- survfit(Surv(obsT, status) ~ IV, type="kaplan-meier", conf.type="log", data=dfSurv))
quantile(KM0, probs=c(0.25, 0.5, 0.75), conf.int=TRUE)
print(KM0, print.rmean=TRUE)
summary(KM0, times=c(20, 50, 100))

##### 9.3.2 Survival, kumulative Inzidenz und kumulatives Hazard darstellen ----
plot(KM0, main=str2expression("KM-Schätzer~hat(S)(t)~mit~CI"),
     xlab="t", ylab="Survival", lwd=2)

plot(KM0, main=str2expression("KM-Schätzer~1-hat(S)(t)~mit~CI"),
     xlab="t", ylab="kumulative Inzidenz", fun=function(x) { 1-x }, lwd=2)

plot(KM, main=str2expression("KM-Schätzer~hat(S)[g](t)~für~Gruppen"),
     xlab="t", ylab="Survival", lwd=2, col=1:3, lty=1:3)

legend(x="topright", col=1:3, lwd=2, lty=1:3, legend=LETTERS[1:3])

plot(KM0, main=str2expression("KM-Schätzer~hat(Lambda)(t)"),
     xlab="t", ylab="kumulatives hazard", fun="cumhaz", lwd=2)

##### 9.3.3 Log-Rank Test auf gleiche Survival-Funktionen ----
## für Daten des Buches diese Datei laden:
# load("data_survival.Rdata")
survdiff(Surv(obsT, status) ~ IV, data=dfSurv)
survdiff(Surv(obsT, status) ~ IV + strata(sex), data=dfSurv)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:survival))

####*--------------------------------------------------------------------------*
#### 9.4 Cox Proportional Hazards Modell ----
## für Daten des Buches diese Datei laden:
# load("data_survival.Rdata")
library(survival)
(fitCPH <- coxph(Surv(obsT, status) ~ X + IV, data=dfSurv))
coxph(Surv(start, obsT, status) ~ X + IV, data=dfSurvCP)
summary(fitCPH)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:survival))

##### 9.4.1 Anpassungsgüte und Modelltests ----
## für Daten des Buches diese Datei laden:
# load("data_survival.Rdata")
library(survival)
extractAIC(fitCPH)

fitCPH1 <- coxph(Surv(obsT, status) ~ X, data=dfSurv)
anova(fitCPH1, fitCPH)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:survival))

##### 9.4.2 Survival-Funktion, Baseline Hazard und kumulatives Hazard schätzen ----
## für Daten des Buches diese Datei laden:
# load("data_survival.Rdata")
library(survival)
(CPH <- survfit(fitCPH, conf.type="log"))
quantile(CPH, probs=c(0.25, 0.5, 0.75), conf.int=FALSE)

dfNew <- data.frame(sex=factor(c("f", "f"), levels=levels(dfSurv$sex)),
                      X=c(-2, -2),
                     IV=factor(c("A", "C"), levels=levels(dfSurv$IV)))
CPHnew <- survfit(fitCPH, newdata=dfNew)
with(CPHnew, head(data.frame(t=time, surv=surv, cumhaz=cumhaz), n=3))

plot(CPH, main=str2expression("Cox~PH-Schätzung~hat(S)(t)~mit~CI"),
     xlab="t", ylab="Survival", lwd=2)

lines(CPHnew$time, CPHnew$surv[ , 1], lwd=2, col="blue")
lines(CPHnew$time, CPHnew$surv[ , 2], lwd=2, col="red")
legend(x="topright", lwd=2, col=c("black", "blue", "red"),
       legend=c("pseudo-Beobachtung", "sex=f, X=-2, IV=A", "sex=f, X=-2, IV=C"))

with(CPHnew, head(data.frame(time, cum_haz=-log(surv)), n=4))

plot(CPHnew, fun="cumhaz", col=c("blue", "red"),
     main=str2expression("Cox~PH-Schätzung~hat(Lambda)[g](t)"),
     ylab="kumulatives hazard", lwd=2)
legend(x="bottomright", lwd=2, col=c("blue", "red"),
       legend=c("sex=f, X=-2, IV=A", "sex=f, X=-2, IV=C"))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:survival))

##### 9.4.3 Modelldiagnostik ----
## für Daten des Buches diese Datei laden:
# load("data_survival.Rdata")
library(survival)
dfSurv <- transform(dfSurv, Xcut=cut(X, breaks=c(-Inf, median(X), Inf), labels=c("lo", "hi")))

KMiv   <- survfit(Surv(obsT, status) ~ IV,   type="kaplan-meier", data=dfSurv)
KMxcut <- survfit(Surv(obsT, status) ~ Xcut, type="kaplan-meier", data=dfSurv)

par(mfrow=c(1, 2))
plot(KMiv, fun="cloglog", main="cloglog-Plot für IV1", xlab="ln t",
     ylab=str2expression("ln(-ln(hat(S)[g](t)))"),
     col=c("black", "blue", "red"), lty=1:3)

legend(x="topleft", col=c("black", "blue", "red"), lwd=2, lty=1:3, legend=LETTERS[1:3])

plot(KMxcut, fun="cloglog", main="cloglog-Plot für Xcut", xlab="ln t",
     ylab=str2expression("ln(-ln(hat(S)[g](t)))"),
     col=c("black", "blue"), lty=1:2)

legend(x="topleft", col=c("black", "blue"), lwd=2, lty=1:2, legend=c("lo", "hi"))

(czph <- cox.zph(fitCPH))

par(mfrow=c(2, 2))
plot(czph)

dfbetas <- residuals(fitCPH, type="dfbetas")
plot(dfbetas[ , 1], type="h", main="DfBETAS für X",    ylab="DfBETAS", lwd=2)
plot(dfbetas[ , 2], type="h", main="DfBETAS für IV-B", ylab="DfBETAS", lwd=2)
plot(dfbetas[ , 3], type="h", main="DfBETAS für IV-C", ylab="DfBETAS", lwd=2)

resMart <- residuals(fitCPH, type="martingale")
plot(dfSurv$X, resMart, main="Martingal-Residuen bzgl. X",
     xlab="X", ylab="Residuen", pch=20)

lines(loess.smooth(dfSurv$X, resMart), lwd=2, col="blue")
legend(x="bottomleft", col="blue", lwd=2, legend="LOESS fit")

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:survival))

##### 9.4.4 Vorhersage und Anwendung auf neue Daten ----
## für Daten des Buches diese Datei laden:
# load("data_survival.Rdata")
library(survival)
pred_risk <- predict(fitCPH, type="risk")
head(pred_risk, n=4)

coef_fit <- coef(fitCPH)
haz_mean <- exp(coef_fit["X"] *mean(dfSurv$X)
              + coef_fit["IVB"]*0
              + coef_fit["IVC"]*0)

haz_obs <- exp(coef_fit["X"] *dfSurv$X
             + coef_fit["IVB"]*(dfSurv$IV == "B")
             + coef_fit["IVC"]*(dfSurv$IV == "C"))

all.equal(haz_obs / haz_mean, pred_risk)

dfNew <- data.frame( X=c(-1, 1),
                    IV=factor(c("B", "C"), levels=levels(dfSurv$IV)))

predict(fitCPH, newdata=dfNew, type="risk")

haz_new <- exp(coef_fit["X"] *dfNew$X
             + coef_fit["IVB"]*(dfNew$IV == "B")
             + coef_fit["IVC"]*(dfNew$IV == "C"))

haz_new / haz_mean

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:survival))

##### 9.4.5 Erweiterungen des Cox Proportional Hazards Modells ----

####*--------------------------------------------------------------------------*
#### 9.5 Parametrische Proportional Hazards Modelle ----
##### 9.5.1 Darstellung über die Hazard-Funktion ----
##### 9.5.2 Darstellung als Accelerated Failure Time Modell ----
##### 9.5.3 Anpassungsgüte und Modelltests ----
## für Daten des Buches diese Datei laden:
# load("data_survival.Rdata")
library(survival)
fitWeib <- survreg(Surv(obsT, status) ~ X + IV, dist="weibull", data=dfSurv)
summary(fitWeib)

(betaHat <- -coef(fitWeib) / fitWeib$scale)

fitExp <- survreg(Surv(obsT, status) ~ X + IV, dist="exponential", data=dfSurv)
anova(fitExp, fitWeib)

fitR <- survreg(Surv(obsT, status) ~ X, dist="weibull", data=dfSurv)
anova(fitR, fitWeib)

##### 9.5.4 Survival-Funktion schätzen ----
## für Daten des Buches diese Datei laden:
# load("data_survival.Rdata")
dfNew <- data.frame(sex=factor(c("m", "m"), levels=levels(dfSurv$sex)),
                      X=c(0, 0),
                     IV=factor(c("A", "C"), levels=levels(dfSurv$IV)))

percs <- (1:99)/100
FWeib <- predict(fitWeib, newdata=dfNew, type="quantile", p=percs, se=TRUE)

matplot(cbind(FWeib$fit[1, ],
              FWeib$fit[1, ] - 2*FWeib$se.fit[1, ],
              FWeib$fit[1, ] + 2*FWeib$se.fit[1, ]), 1-percs, type="l",
        main=str2expression("Weibull-Fit~hat(S)(t)~mit~SE"),
        xlab="t", ylab="Survival", lty=c(1, 2, 2), lwd=2, col="blue")

matlines(cbind(FWeib$fit[2, ],
               FWeib$fit[2, ] - 2*FWeib$se.fit[2, ],
               FWeib$fit[2, ] + 2*FWeib$se.fit[2, ]), 1-percs, col="red", lwd=2)

legend(x="topright", lwd=2, lty=c(1, 2, 1, 2),
       col=c("blue", "blue", "red", "red"),
       legend=c("sex=m, X=0, IV=A", "+- 2*SE",
                "sex=m, X=0, IV=C", "+- 2*SE"))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:survival))

####*--------------------------------------------------------------------------*
### 10 Klassische nonparametrische Verfahren ----
####*--------------------------------------------------------------------------*

####*--------------------------------------------------------------------------*
#### 10.1 Anpassungstests ----
##### 10.1.1 Binomialtest ----
draws <- 7
hits  <- 5
pH0   <- 0.25
binom.test(hits, draws, p=pH0, alternative="greater", conf.level=0.95)

(pVal <- pbinom(hits-1, draws, pH0, lower.tail=FALSE))

resG <- binom.test(hits, draws, p=pH0, alternative="greater")
resG$p.value
resL <- binom.test(draws-hits, draws, p=pH0, alternative="less")
resL$p.value
2 * min(c(resG$p.value, resL$p.value))

##### 10.1.2 Test auf Zufälligkeit (Runs Test) ----
queue <- c("f", "m", "m", "f", "m", "f", "f", "f")
Nj    <- xtabs(~ queue)
(runs <- rle(queue))
(rr   <- length(runs$lengths))
(rr1  <- xtabs(~ runs$values)[1])
(rr2  <- xtabs(~ runs$values)[2])

getP <- function(r1, r2, n1, n2) {
    # Anzahl Iterationen einer Gruppe höchstens Gruppengröße
    if((r1 > n1) | (r2 > n2)) { return(0) }

    # Punktwahrscheinlichkeit für r1+r2 ungerade
    p <- (choose(n1-1, r1-1) * choose(n2-1, r2-1)) / choose(n1+n2, n1)

    # Punktwahrscheinlichkeit für r1+r2 gerade: das doppelte von ungerade
    ifelse(((r1+r2) %% 2) == 0, 2*p, p)
}

n1    <- Nj[1]
n2    <- Nj[2]
N     <- sum(Nj)
rMin  <- 2
(rMax <- ifelse(n1 == n2, N, 2*min(c(n1, n2)) + 1))

p3.2 <- getP(3, 2, n1, n2)
p2.3 <- getP(2, 3, n1, n2)
p3.3 <- getP(3, 3, n1, n2)
p4.3 <- getP(4, 3, n1, n2)

(pGrEq <- p3.2 + p2.3 + p3.3 + p4.3)

p2.2 <- getP(2, 2, n1, n2)
p1.2 <- getP(1, 2, n1, n2)
p2.1 <- getP(2, 1, n1, n2)
p1.1 <- getP(1, 1, n1, n2)

(pLess <- p2.2 + p1.2 + p2.1 + p1.1)

pGrEq + pLess

muR   <- 1 + ((2*n1*n2) / N)
varR  <- (2*n1*n2*(2*n1*n2 - N)) / (N^2 * (N-1))
rZ    <- (rr-muR) / sqrt(varR)
(pVal <- pnorm(rZ, lower.tail=FALSE))

##### 10.1.3 Kolmogorov-Smirnov-Anpassungstest ----
DV <- rnorm(8, mean=1.5, sd=3)
## für Daten des Buches diese Datei laden:
# load("data_ksOne.Rdata")
ks.test(DV, y=pnorm, mean=1, sd=2, alternative="two.sided")

d_DV <- data.frame(Y=DV)
ks.test(Y ~ 1, y=pnorm, data=d_DV, mean=1, sd=2, alternative="two.sided")

Fn      <- ecdf(DV)
sortDV  <- sort(DV)
emp     <- Fn(sortDV)
theo    <- pnorm(sortDV, mean=1, sd=2)

diff1   <- emp-theo
diff2   <- c(0, emp[-length(emp)]) - theo

(DtwoS  <- max(abs(c(diff1, diff2))))
(Dless  <- abs(min(c(diff1, diff2))))
(Dgreat <- abs(max(c(diff1, diff2))))

ks.test(DV, pnorm, mean=1, sd=2, alternative="less")$statistic
ks.test(DV, pnorm, mean=1, sd=2, alternative="greater")$statistic

plot(Fn, main="Kolmogorov-Smirnov-Anpassungstest", xlab=NA)
curve(pnorm(x, mean=1, sd=2), n=200, add=TRUE)
matlines(rbind(sortDV, sortDV), rbind(emp, theo), col=rgb(0, 0, 1, 0.7),
         lty=1, lwd=2)
matlines(rbind(sortDV, sortDV), rbind(c(0, emp[1:(length(emp)-1)]), theo),
         col=rgb(1, 0, 0, 0.5), lty=1, lwd=2)
legend(x="topleft", legend=c("direkte Differenzen", "verschobene Differenzen"),
       col=c("blue", "red"), lwd=2)

##### 10.1.4 chi^2-Test auf eine feste Verteilung ----
nRolls <- 50
nCateg <- 6
pH0    <- rep(1/nCateg, nCateg)
myData <- sample(seq_len(nCateg), nRolls, replace=TRUE)
## für Daten des Buches diese Datei laden:
# load("data_chisqDistrFix.Rdata")
(tab   <- xtabs(~ myData))
chisq.test(tab, p=pH0)

expected   <- pH0 * nRolls
(statChisq <- sum((tab-expected)^2 / expected))
(pVal      <- pchisq(statChisq, nCateg-1, lower.tail=FALSE))

##### 10.1.5 chi^2-Test auf eine Verteilungsklasse ----
DV   <- rnorm(100, mean=100, sd=15)
nCls <- 6
## für Daten des Buches diese Datei laden:
# load("data_chisqDistrClass.Rdata")
library(DescTools)
PearsonTest(DV, n.classes=nCls, adjust=TRUE)

limits    <- qnorm(seq(1/nCls, (nCls-1)/nCls, length.out=nCls-1), mean(DV), sd(DV))
DVcut     <- cut(DV, c(-Inf, limits, Inf))
(intFreq  <- xtabs(~ DVcut))
(resChisq <- chisq.test(intFreq, p=rep(1/nCls, nCls)))
statChisq <- resChisq$statistic
dfChisq   <- resChisq$parameter - 2
(pval     <- pchisq(statChisq, dfChisq, lower.tail=FALSE))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:DescTools))

####*--------------------------------------------------------------------------*
#### 10.2 Analyse von (gemeinsamen) Häufigkeiten kategorialer Variablen ----
##### 10.2.1 chi^2-Test auf Unabhängigkeit ----
N        <- 50
smokes   <- factor(sample(c("no", "yes"), N, replace=TRUE))
siblings <- factor(round(abs(rnorm(N, 1, 0.5))))
## für Daten des Buches diese Datei laden:
# load("data_chisqInd.Rdata")
cTab     <- xtabs(~ smokes + siblings)
addmargins(cTab)
chisq.test(cTab)

P          <- nlevels(smokes)
Q          <- nlevels(siblings)
expected   <- outer(rowSums(cTab), colSums(cTab)) / sum(cTab)
(statChisq <- sum((cTab-expected)^2 / expected))
(pVal      <- pchisq(statChisq, (P-1)*(Q-1), lower.tail=FALSE))

##### 10.2.2 chi^2-Test auf Gleichheit von Verteilungen ----
voteX   <- rep(LETTERS[1:5], c(3,  8, 12, 19, 8))
voteY   <- rep(LETTERS[1:5], c(8, 17, 16,  7, 2))
vote    <- c(voteX, voteY)
studies <- factor(rep(c("X", "Y"), c(length(voteX), length(voteY))))
cTab    <- xtabs(~ studies + vote)
addmargins(cTab)
chisq.test(cTab, simulate.p.value=TRUE)

##### 10.2.3 chi^2-Test für mehrere Auftretenswahrscheinlichkeiten ----
total <- c(5000, 5000, 5000)
hits  <- c(585, 610, 539)
prop.test(hits, total)

(mat <- cbind(hits, total-hits))
prop.test(mat)
chisq.test(mat)

##### 10.2.4 Fishers exakter Test auf Unabhängigkeit ----
disease <- factor(rep(c("no", "yes"),   c(10, 5)))
diagN   <- rep(c("isHealthy", "isIll"), c( 8, 2))
diagY   <- rep(c("isHealthy", "isIll"), c( 1, 4))
diagT   <- factor(c(diagN, diagY))
contT1  <- xtabs(~ disease + diagT)
addmargins(contT1)
fisher.test(contT1, alternative="greater")

(p1   <- dhyper(8, 10, 5, 9))
(p2   <- dhyper(9, 10, 5, 9))
(pVal <- p1+p2)

##### 10.2.5 Fishers exakter Test auf Gleichheit von Verteilungen ----
N          <- 20
smokesFem  <- rbinom(N, size=1, prob=0.6)
smokesMale <- rbinom(N, size=1, prob=0.4)
smokes     <- factor(c(smokesFem, smokesMale), labels=c("no", "yes"))
sex        <- factor(rep(c("f", "m"), each=N))
## für Daten des Buches diese Datei laden:
# load("data_fisherEq.Rdata")
contT2     <- xtabs(~ sex + smokes)
addmargins(contT2)
fisher.test(contT2, alternative="less")

##### 10.2.6 Kennwerte von (2 x 2)-Konfusionsmatrizen ----
## für Daten des Buches diese Datei laden:
# load("data_fisherInd.Rdata")
addmargins(contT1)
TN <- contT1[1, 1]
TP <- contT1[2, 2]
FP <- contT1[1, 2]
FN <- contT1[2, 1]

##### 10.2.6.1 Sensitivität, Spezifität und Relevanz ----
(prevalence  <- sum(contT1[2, ])  / sum(contT1))
(sensitivity <- recall    <- TP   / (TP+FN))
(specificity <- TN                / (TN+FP))
(relevance   <- precision <- TP   / (TP+FP))
(CCR         <- sum(diag(contT1)) / sum(contT1))
(Fval        <- 1 / mean(1 / c(precision, recall)))

##### 10.2.6.2 Odds Ratio,Yules Q und relatives Risiko ----
library(DescTools)
OddsRatio(contT1, conf.level=0.95)
aa  <- contT1[1, 1]
bb  <- contT1[1, 2]
cc  <- contT1[2, 1]
dd  <- contT1[2, 2]
(OR <- (aa/bb) / (cc/dd))

YuleQ(contT1)
(Q <- (aa*dd - bb*cc) / (aa*dd + bb*cc))
(OR-1) / (OR+1)

(risk <- proportions(contT1, margin="disease"))
RelRisk(contT1)

(relRisk <- risk[1, 1] / risk[2, 1])

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:DescTools))

##### 10.2.7 ROC-Kurve und AUC ----
N      <- 100
height <- rnorm(N, 175, 7)
age    <- rnorm(N, 30, 8)
weight <- 0.4*height + 0.3*age + rnorm(N, 0, 3)
wFac   <- cut(weight, breaks=c(-Inf, median(weight), Inf),
              labels=c("lo", "hi"))
regDf  <- data.frame(wFac, height, age)
## für Daten des Buches diese Datei laden:
# load("data_regrLog.Rdata")

library(pROC)
fit_glm <- glm(wFac ~ height + age, family=binomial(link="logit"), data=regDf)
(rocRes <- roc(fit_glm$y ~ fit_glm$fitted.values,
               levels=c(0, 1),
               direction="<",
               plot=TRUE, ci=TRUE, main="ROC-Kurve",
               xlab="1-Spezifität (TN / (TN+FP))", ylab="Sensitivität (TP / (TP+FN))"))

rocCI <- ci.se(rocRes)
plot(rocCI, type="shape")

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:pROC))

####*--------------------------------------------------------------------------*
#### 10.3 Maße für Zusammenhang und Übereinstimmung ----
##### 10.3.1 Zusammenhang stetiger ordinaler Variablen: Spearmans rho und Kendalls tau ----
DV1 <- c(100, 76, 56, 99, 50, 62, 36, 69, 55,  17)
DV2 <- c(42,  74, 22, 99, 73, 44, 10, 68, 19, -34)
cor(DV1, DV2, method="spearman")
cor(rank(DV1), rank(DV2))

cor(DV1, DV2, method="kendall")
cmpMat1 <- outer(DV1, DV1, FUN=">")
cmpMat2 <- outer(DV2, DV2, FUN=">")
selMat  <- upper.tri(cmpMat1)
(nCP    <- sum((cmpMat1 == cmpMat2)[selMat]))
nDP     <- sum((cmpMat1 != cmpMat2)[selMat])
N       <- length(DV1)
nPairs  <- choose(N, 2)
(tau    <- (nCP-nDP) / nPairs)

cor.test(~ DV1 + DV2, method="spearman")
sum((rank(DV1)-rank(DV2))^2)

cor.test(~ DV1 + DV2, method="kendall")

##### 10.3.2 Zusammenhang kategorialer Variablen: Phi, Cramérs V, Kontingenzkoeffizient ----
P    <- 2
Q    <- 3
DV1  <- cut(c(100, 76, 56, 99, 50, 62, 36, 69, 55,  17), breaks=P)
DV2  <- cut(c(42,  74, 22, 99, 73, 44, 10, 68, 19, -34), breaks=Q)
cTab <- xtabs(~ DV1 + DV2)
N    <- sum(cTab)

library(DescTools)
Assocs(cTab)
TschuprowT(cTab)

expected <- outer(rowSums(cTab), colSums(cTab)) / N
chisqVal <- sum((cTab-expected)^2 / expected)
(phiVal  <- sqrt(chisqVal / N))

L        <- min(dim(cTab))
phisqMax <- L-1
chisqMax <- N*phisqMax

(CrV   <- sqrt(chisqVal     / chisqMax))
(CC    <- sqrt(chisqVal     / (N+chisqVal)))
(TschT <- sqrt((chisqVal/N) / sqrt((P-1)*(Q-1))))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:DescTools))

##### 10.3.3 Inter-Rater-Übereinstimmung ----
##### 10.3.3.1 Prozentuale Übereinstimmung ----
categ <- c("V", "N", "P")
lvls  <- factor(categ, levels=categ)
rtr1  <- rep(lvls, c(60, 30, 10))
rtr2  <- rep(rep(lvls, nlevels(lvls)), c(53,5,2, 11,14,5, 1,6,3))
cTab  <- xtabs(~ rtr1 + rtr2)
addmargins(cTab)

library(DescTools)
Agree(cbind(rtr1, rtr2))

(agree <- sum(diag(proportions(cTab))))

rtr3 <- rep(rep(lvls, nlevels(lvls)), c(48,8,3, 15,10,7, 3,4,2))

cTab3 <- xtabs(~ rtr1 + rtr2 + rtr3)
Agree(cbind(rtr1, rtr2, rtr3))

sum(diag(apply(proportions(cTab3), 3, diag)))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:DescTools))

##### 10.3.3.2 Cohens kappa ----
## für Daten des Buches diese Datei laden:
# load("data_irr.Rdata")
library(DescTools)
cTab <- xtabs(~ rtr1 + rtr2)
CohenKappa(cTab, conf.level=0.95)

fObs    <- sum(diag(proportions(cTab)))
fExp    <- sum(rowSums(proportions(cTab)) * colSums(proportions(cTab)))
(Ckappa <- (fObs-fExp) / (1-fExp))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:DescTools))

##### 10.3.3.3 Fleiss' kappa ----
rtr1 <- letters[c(4,2,2,5,2, 1,3,1,1,5, 1,1,2,1,2, 3,1,1,2,1, 5,2,2,1,1, 2,1,2,1,5)]
rtr2 <- letters[c(4,2,3,5,2, 1,3,1,1,5, 4,2,2,4,2, 3,1,1,2,3, 5,4,2,1,4, 2,1,2,3,5)]
rtr3 <- letters[c(4,2,3,5,2, 3,3,3,4,5, 4,4,2,4,4, 3,1,1,4,3, 5,4,4,4,4, 2,1,4,3,5)]
rtr4 <- letters[c(4,5,3,5,4, 3,3,3,4,5, 4,4,3,4,4, 3,4,1,4,5, 5,4,5,4,4, 2,1,4,3,5)]
rtr5 <- letters[c(4,5,3,5,4, 3,5,3,4,5, 4,4,3,4,4, 3,5,1,4,5, 5,4,5,4,4, 2,5,4,3,5)]
rtr6 <- letters[c(4,5,5,5,4, 3,5,4,4,5, 4,4,3,4,5, 5,5,2,4,5, 5,4,5,4,5, 4,5,4,3,5)]

rateMat <- cbind(rtr1, rtr2, rtr3, rtr4, rtr5, rtr6)
library(DescTools)
KappaM(rateMat, conf.level=0.95)

nRtr    <- ncol(rateMat)
nObs    <- nrow(rateMat)
ratings <- c(rtr1, rtr2, rtr3, rtr4, rtr5, rtr6)
obsFac  <- factor(rep(1:nObs, nRtr))
cTab    <- xtabs(~ obsFac + ratings)
rateTab <- proportions(xtabs(~ ratings))
fExp    <- sum(rateTab^2)
fObsAll <- apply(cTab, 1, function(x) { sum(x*(x-1)) / (nRtr*(nRtr-1)) } )
fObs    <- mean(fObsAll)
(Fkappa <- (fObs-fExp) / (1-fExp))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:DescTools))

##### 10.3.3.4 Gewichtetes Cohens kappa ----
categ <- c("<10%", "11-20%", "21-30%", "31-40%", "41-50%", ">50%")
lvls  <- factor(categ, levels=categ)
tv1   <- rep(lvls, c(22, 21, 23, 16, 10, 8))
tv2   <- rep(rep(lvls, nlevels(lvls)), c(5,8,1,2,4,2, 3,5,3,5,5,0, 1,2,6,11,2,1,
                                         0,1,5,4,3,3, 0,0,1,2,5,2, 0,0,1, 2,1,4))
cTab  <- xtabs(~ tv1 + tv2)
addmargins(cTab)
library(DescTools)
CohenKappa(cTab, weights="Equal-Spacing", conf.level=0.95)

P           <- ncol(cTab)
expected    <- outer(rowSums(cTab), colSums(cTab)) / sum(cTab)
(myWeights  <- seq(0, 1, length.out=P))
(weightsMat <- outer(1:P, 1:P, function(x, y) { 1 - ((abs(x-y)) / (P-1)) } ))
    
wfObs   <- sum(    cTab * weightsMat) / sum(cTab)
wfExp   <- sum(expected * weightsMat) / sum(cTab)
(wKappa <- (wfObs-wfExp) / (1-wfExp))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:DescTools))

##### 10.3.3.5 Kendalls W ----
rtr1    <- c(1, 6, 3, 2, 5, 4)
rtr2    <- c(1, 5, 6, 2, 4, 3)
rtr3    <- c(2, 3, 6, 5, 4, 1)
ratings <- cbind(rtr1, rtr2, rtr3)

library(DescTools)
KendallW(ratings, test=TRUE)

## äquivalenter Friedman-Test
m     <- nrow(t(ratings))
n     <- ncol(t(ratings))
d_rtr <- data.frame(ratings=c(rtr1, rtr2, rtr3),
                    obj    =rep(paste0("O", seq_along(rtr1)), times=m),
                    rater  =rep(c("rtr1", "rtr2", "rtr3"), each=n))

ft <- friedman.test(ratings ~ obj | rater, data=d_rtr)
ft$p.value                # gleicher p-Wert
ft$statistic / (m*(n-1))  # gleich W

KW <- KendallW(ratings, test=TRUE)
(W <- KW$estimate)

## durchschnittliche paarweise Spearman Korrelation
c01 <- cor.test(ratings[, 1], ratings[, 2], method="spearman")$estimate
c02 <- cor.test(ratings[, 1], ratings[, 3], method="spearman")$estimate
c03 <- cor.test(ratings[, 2], ratings[, 3], method="spearman")$estimate
mean(c(c01, c02, c03))
(m*W-1) / (m-1)
(ft$statistic - n + 1) / ((n-1)*(m-1))

## manuelle Prüfung
rankMat  <- ratings
nObj     <- nrow(rankMat)
nRtr     <- ncol(rankMat)
(rankSum <- rowSums(rankMat))

(S <- sum((rankSum - (nRtr*(nObj+1) / 2))^2))
(W <- (12 / (nRtr^2 * nObj*(nObj^2 - 1))) * S)

(chisqVal <- nRtr * (nObj-1) * W)
(pVal     <- pchisq(chisqVal, nObj-1, lower.tail=FALSE))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:DescTools))

##### 10.3.3.6 Intra-Klassen-Korrelation ----
nRtr <- 4
nObs <- 6
rtr1 <- c(9, 6, 8, 7, 10, 6)
rtr2 <- c(2, 1, 4, 1,  5, 2)
rtr3 <- c(5, 3, 6, 2,  6, 4)
rtr4 <- c(8, 2, 8, 6,  9, 7)

library(DescTools)
ICC(cbind(rtr1, rtr2, rtr3, rtr4))

ratings <- c(rtr1, rtr2, rtr3, rtr4)
rateFac <- factor(rep(paste("rater", 1:nRtr,  sep=""),  each=nObs))
obsFac  <- factor(rep(paste("obj",   1:nObs , sep=""), times=nRtr))

(anObs  <- anova(lm(ratings ~ obsFac)))
(anBoth <- anova(lm(ratings ~ obsFac + rateFac)))

MSobs   <-  anObs["obsFac",    "Mean Sq"]
MSEobs  <-  anObs["Residuals", "Mean Sq"]
MSrtr   <- anBoth["rateFac",   "Mean Sq"]
MSEboth <- anBoth["Residuals", "Mean Sq"]

(ICC1  <- (MSobs - MSEobs)  / (MSobs + (nRtr-1)*MSEobs))
(ICC2  <- (MSobs - MSEboth) / (MSobs + (nRtr-1)*MSEboth + ((nRtr/nObs) * (MSrtr - MSEboth))))
(ICC3  <- (MSobs - MSEboth) / (MSobs + (nRtr-1)*MSEboth))
(ICC1k <- (MSobs - MSEobs)  / (MSobs))
(ICC2k <- (MSobs - MSEboth) / (MSobs + ((1/nObs) * (MSrtr - MSEboth))))
(ICC3k <- (MSobs - MSEboth) /  MSobs)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:DescTools))

##### 10.3.3.7 Bland-Altman Diagramm ----
N       <- 100
method1 <- rnorm(N, 100, 15)
method2 <- method1 + rnorm(N, 5, 5)
m_i     <- (method1 + method2) / 2
d_i     <- method1 - method2

xLoUp <- 1.05*range(m_i)
plot(m_i, d_i, pch=16, main="Bland-Altman Diagramm",
     xlab="paarweiser Mittelwert", ylab="Methode 1 - Methode 2",
     xlim=xLoUp, xaxs="i")

m_di   <- mean(d_i)
var_di <- var(d_i)
alpha  <- 0.05
LoA_z  <- qnorm(1-(alpha/2), mean=0, sd=1)
LoAlo  <- m_di - LoA_z*sqrt(var_di)
LoAup  <- m_di + LoA_z*sqrt(var_di)
rect(xLoUp[1], LoAlo, xLoUp[2], LoAup, col="#cccccc99", border=NA)
abline(h=c(LoAlo, LoAup), col="darkgray", lwd=2, lty=2)
abline(h=0, col="darkgray", lwd=2)
abline(h=m_di, col="darkgray", lwd=2, lty=2)

tCrit      <- qt(alpha/2, df=N-1, lower.tail=FALSE)
mdi_CIlo   <- m_di  - tCrit*sqrt(var_di/N)
mdi_CIup   <- m_di  + tCrit*sqrt(var_di/N)
var_LoA    <- (var(d_i) / N) + (LoA_z^2 * var_di / (2*(N-1)))
LoAlo_CIlo <- LoAlo - tCrit*sqrt(var_LoA)
LoAlo_CIup <- LoAlo + tCrit*sqrt(var_LoA)
LoAup_CIlo <- LoAup - tCrit*sqrt(var_LoA)
LoAup_CIup <- LoAup + tCrit*sqrt(var_LoA)

green <- "#ddffddaa"
rect(xLoUp[1], LoAlo_CIlo, xLoUp[2], LoAlo_CIup, col=green, border=NA)
rect(xLoUp[1], mdi_CIlo,   xLoUp[2], mdi_CIup,   col=green, border=NA)
rect(xLoUp[1], LoAup_CIlo, xLoUp[2], LoAup_CIup, col=green, border=NA)

points(m_i, d_i, pch=16)

####*--------------------------------------------------------------------------*
#### 10.4 Tests auf gleiche Variabilität ----
##### 10.4.1 Mood-Test ----
DV1 <- c(12, 13, 29, 30)
DV2 <- c(15, 17, 18, 24, 25, 26)
Nj  <- c(length(DV1), length(DV2))
DV  <- c(DV1, DV2)
IV  <- factor(rep(1:2, Nj), labels=c("A", "B"))

mood.test(DV ~ IV, alternative="greater")

gX <- (rank(DV) - mean(rank(DV)))^2
MN <- sum(gX[IV == "A"])

N     <- sum(Nj)
muMN  <- (Nj[1] * (N^2 - 1)) / 12
varMN <- (Nj[1]*Nj[2] * (N+1) * (N^2 - 4)) / 180
(MNz  <- (MN - muMN) / sqrt(varMN))
(pVal <- pnorm(MNz, 0, 1, lower.tail=FALSE))

##### 10.4.2 Ansari-Bradley-Test ----
## für Daten des Buches diese Datei laden:
# load("data_mood.Rdata")
ansari.test(DV ~ IV, alternative="greater", exact=FALSE)

gX  <- mean(rank(DV)) - abs(rank(DV) - mean(rank(DV)))
(AB <- sum(gX[IV == "A"]))

muABe  <- (Nj[1] * (N+2)) / 4
muABo  <- (Nj[1] * (N+1)^2) / (4*N)
varABe <- (Nj[1]*Nj[2] * (N^2 - 4)) / (48 * (N-1))
varABo <- (Nj[1]*Nj[2] * (N+1) * (3 + N^2)) / (48 * N^2)

muAB  <- ifelse(N %% 2, muABo,  muABe)
varAB <- ifelse(N %% 2, varABo, varABe)
ABz   <- (AB - muAB) / sqrt(varAB)
pnorm(ABz, mean=0, sd=1)

####*--------------------------------------------------------------------------*
#### 10.5 Tests auf Übereinstimmung von Verteilungen ----
##### 10.5.1 Kolmogorov-Smirnov-Test für zwei Stichproben ----
DV1 <- round(rnorm(8, mean=1, sd=2), 2)
DV2 <- round(rnorm(8, mean=3, sd=2), 2)
## für Daten des Buches diese Datei laden:
# load("data_ksTwo.Rdata")
d_DV <- data.frame(DV=c(DV1, DV2),
                   IV=rep(c("G1", "G2"), each=8))

ks.test(DV ~ IV, data=d_DV, alternative="greater")

sortBoth <- sort(c(DV1, DV2))
both1    <- ecdf(DV1)(sortBoth)
both2    <- ecdf(DV2)(sortBoth)
diff1    <- both1-both2
diff2    <- c(0, both1[1:(length(both1)-1)]) - both2
diff2    <- c(0, both1[-length(both1)]) - both2
diffBoth <- c(diff1, diff2)

(DtwoS   <- max(abs(diffBoth)))
(Dless   <- abs(min(diffBoth)))
(Dgreat  <- abs(max(diffBoth)))

ks.test(DV1, DV2, alternative="less"     )$statistic
ks.test(DV1, DV2, alternative="two.sided")$statistic

xRange <- c(min(sortBoth)-1, max(sortBoth)+1)

plot(ecdf(DV1), xlim=xRange, main="Kolmogorov-Smirnov-Test für zwei Stichproben",
     xlab=NA, col.points="blue", col.hor="blue", lwd=2)
par(new=TRUE)
plot(ecdf(DV2), xlim=xRange, main=NA, xaxt="n", xlab=NA, ylab=NA, lwd=2,
     col.points="red", col.hor="red")
X  <- rbind(sortBoth, sortBoth)
Y1 <- rbind(both1, both2)
Y2 <- rbind(c(0, both1[1:(length(both1)-1)]), both2)
matlines(X, Y1, col="darkgray", lty=1, lwd=2)
matlines(X, Y2, col="darkgray", lty=1, lwd=2)
legend(x="bottomright", col=c("blue", "red"), lwd=2,
       legend=c("kumulierte relative Häufigkeiten 1", "kumulierte relative Häufigkeiten 2"))

##### 10.5.2 Vorzeichen-Test ----
medH0 <- 30
DV    <- sample(0:100, 20, replace=TRUE)
## für Daten des Buches diese Datei laden:
# load("data_signTest.Rdata")
library(DescTools)
SignTest(DV, mu=medH0)

N     <- length(DV)
DVsel <- DV[DV != medH0]
(obs  <- sum(DVsel > medH0))

(pGreater  <- pbinom(obs-1, N, 0.5, lower.tail=FALSE))
(pTwoSided <- 2 * pGreater)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:DescTools))

##### 10.5.3 Wilcoxon-Vorzeichen-Rang-Test für eine Stichprobe ----
IQ    <- c(99, 131, 118, 112, 128, 136, 120, 107, 134, 122)
medH0 <- 110

wilcox.test(IQ, alternative="greater", mu=medH0, conf.int=TRUE)

IQdat <- data.frame(IQ=IQ)
wilcox.test(IQ ~ 1, alternative="greater", data=IQdat, mu=medH0, conf.int=TRUE)

(diffIQ   <- IQ-medH0)
(idx      <- diffIQ > 0)
(rankDiff <- rank(abs(diffIQ)))
(V        <- sum(rankDiff[idx]))
(pVal     <- psignrank(V-1, n=length(IQ), lower.tail=FALSE))

pairM  <- outer(IQ, IQ, FUN="+") / 2
(HLloc <- median(pairM[lower.tri(pairM, diag=TRUE)]))

##### 10.5.4 Wilcoxon-Rangsummen-Test / Mann-Whitney-U-Test für zwei unabhängige Stichproben ----
rtCtrl <- c(85, 106, 118, 81, 138, 90, 112, 119, 107, 95, 88, 103)
rtDrug <- c(96, 105, 104, 108, 86, 84, 99, 101, 78, 124, 121, 97, 129, 87, 109)
rtAll  <- c(rtCtrl, rtDrug)
Nj     <- c(length(rtCtrl), length(rtDrug))
IV     <- factor(rep(1:2, Nj), labels=c("control", "drug"))
wilcox.test(rtAll ~ IV, alternative="greater", conf.int=TRUE)

gX    <- rank(rtAll)
(W    <- sum(gX[IV == "control"]) - sum(1:Nj[1]))
(pVal <- pwilcox(W-1, m=Nj[1], n=Nj[2], lower.tail=FALSE))

pairD <- outer(rtCtrl, rtDrug, FUN="-")
(HLdl <- median(pairD))
(U    <- sum(outer(rtCtrl, rtDrug, FUN=">=")))

##### 10.5.5 Wilcoxon-Test für zwei abhängige Stichproben ----
N      <- 20
DVpre  <- rnorm(N, mean=90,  sd=15)
DVpost <- rnorm(N, mean=100, sd=15)
IV     <- factor(rep(0:1, each=N), labels=c("pre", "post"))
DV     <- c(DVpre, DVpost)
## für Daten des Buches diese Datei laden:
# load("data_tTwoDep.Rdata")
DVdat <- data.frame(DVpre=DVpre, DVpost=DVpost)
wilcox.test(Pair(DVpre, DVpost) ~ 1, alternative="less",
            conf.int=TRUE, data=DVdat)

DVdiff <- DVpre-DVpost
wilcox.test(DVdiff, alternative="less", conf.int=TRUE)

##### 10.5.6 Kruskal-Wallis-H-Test für unabhängige Stichproben ----
IQ1  <- c( 99, 131, 118, 112, 128, 136, 120, 107, 134, 122)
IQ2  <- c(134, 103, 127, 121, 139, 114, 121, 132)
IQ3  <- c(120, 133, 110, 141, 118, 124, 111, 138, 120)
IQ4  <- c(117, 125, 140, 109, 128, 137, 110, 138, 127, 141, 119, 148)

IQ1  <- c( 99, 131, 118, 112,  92, 136, 120, 107, 134, 122)
IQ2  <- c(134, 103, 127, 121, 139, 114, 121, 132)
IQ3  <- c(120, 133, 110, 141, 118, 124, 111, 138, 120)
IQ4  <- c(120, 128, 143, 116, 131, 140, 113, 141, 130, 144, 122, 151)

DV   <- c(IQ1, IQ2, IQ3, IQ4)
Nj   <- c(length(IQ1), length(IQ2), length(IQ3), length(IQ4))
N    <- sum(Nj)
IV   <- factor(rep(1:4, Nj), labels=c("I", "II", "III", "IV"))
KWdf <- data.frame(IV, DV)

kruskal.test(DV ~ IV, data=KWdf)

(rankSumI <- tapply(rank(DV), IV, FUN=sum))
(H        <- (12 / (N*(N+1))) * sum(rankSumI^2/Nj) - 3*(N+1))
(pVal     <- pchisq(H, nlevels(IV)-1, lower.tail=FALSE))

##### 10.5.7 Friedman-Rangsummen-Test für abhängige Stichproben ----
DVcaff <- c(14, 13, 12, 11, 10)
DVds   <- c(11, 12, 13, 14, 15)
DVdd   <- c(16, 15, 14, 13, 12)
DVplac <- c(13, 12, 11, 10,  9)
DV     <- c(DVcaff, DVds, DVdd, DVplac)
nBl    <- length(DVcaff)
P      <- 4
IV     <- factor(rep(1:P, each=nBl),
                 labels=c("Caffeine", "Single", "Double", "Placebo"))
blocks <- factor(rep(1:nBl, times=P))
fDf    <- data.frame(IV, DV, blocks)
friedman.test(DV ~ IV | blocks, data=fDf)

(DVmat    <- cbind(DVcaff, DVds, DVdd, DVplac))
(rankMat  <- t(apply(DVmat, 1, rank)))
(rankSumJ <- colSums(rankMat))
(S        <- (12 / (nBl*P*(P+1))) * sum(rankSumJ^2) - 3*nBl*(P+1))
(pVal     <- pchisq(S, P-1, lower.tail=FALSE))

##### 10.5.8 Cochran Q-Test für abhängige Stichproben ----
pref  <- c(1,1,0,1,0, 0,1,0,0,1, 1,0,1,0,0, 1,1,1,1,1, 0,1,0,0,0,
           1,0,1,1,1, 0,0,0,0,0, 1,1,1,1,0, 0,1,0,1,1, 1,0,1,0,0)
N     <- 10
year  <- factor(rep(1981:1985, times=N))
P     <- nlevels(year)
id    <- factor(rep(1:N, each=P))

library(coin)
symmetry_test(pref ~ year | id, teststat="quad")

prefMat <- matrix(pref, nrow=N, ncol=P, byrow=TRUE)
rSum    <- rowSums(prefMat)
cSum    <- colSums(prefMat)
(Q      <- (P*(P-1)*sum((cSum-mean(cSum))^2)) / (P*sum(rSum) - sum(rSum^2)))
(pVal   <- pchisq(Q, P-1, lower.tail=FALSE))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:coin))
try(detach(package:survival))

##### 10.5.9 Bowker-Test für zwei abhängige Stichproben ----
categ <- factor(1:3, labels=c("lo", "med", "hi"))
Q     <- nlevels(categ)
drug  <- rep(categ, c(30, 50, 20))
plac  <- rep(rep(categ, length(categ)), c(14,7,9, 5,26,19, 1,7,12))
DV    <- c(drug, plac)
IV    <- factor(rep(c("drug", "plac"), each=100))
ID    <- factor(sprintf("P%.03d", rep(seq_len(100), 2)))

cTabBow <- xtabs(~ drug + plac)
addmargins(cTabBow)

mcnemar.test(cTabBow)

sqDiffs   <- (cTabBow-t(cTabBow))^2 / (cTabBow+t(cTabBow))
(chisqVal <- sum(sqDiffs[upper.tri(cTabBow)]))
(bowDf    <- choose(Q, 2))
(pVal     <- pchisq(chisqVal, bowDf, lower.tail=FALSE))

##### 10.5.10 McNemar-Test für zwei abhängige Stichproben ----
N       <- 20
pre     <- rbinom(N, size=1, prob=0.6)
post    <- rbinom(N, size=1, prob=0.4)
preFac  <- factor(pre,  labels=c("no", "yes"))
postFac <- factor(post, labels=c("no", "yes"))
P       <- nlevels(preFac)
## für Daten des Buches diese Datei laden:
# load("data_mcnemar.Rdata")
cTab <- xtabs(~ preFac + postFac)
addmargins(cTab)
mcnemar.test(cTab, correct=FALSE)

library(coin)
symmetry_test(cTab, teststat="quad")

(chisqVal <- (cTab[1, 2] - cTab[2, 1])^2 / (cTab[1, 2] + cTab[2, 1]))
(mcnDf    <- choose(P, 2))
(pVal     <- pchisq(chisqVal, mcnDf, lower.tail=FALSE))

binom.test(c(cTab[1, 2], cTab[2, 1]), p=0.5)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:coin))
try(detach(package:survival))

##### 10.5.11 Stuart-Maxwell-Test für zwei abhängige Stichproben ----
## für Daten des Buches diese Datei laden:
# load("data_bowker.Rdata")
library(coin)
mh_test(cTabBow)

addmargins(cTabBow)
(Nij  <- ((cTabBow+t(cTabBow)) / 2)[upper.tri(cTabBow)])
(d    <- rowSums(cTabBow) - colSums(cTabBow))
num   <- sum(Nij * rev(d^2))
denom <- 2 * sum(apply(combn(Nij, 2), 2, prod))

(chisqVal <- num / denom)
(smmhDf   <- nrow(cTabBow)-1)
(pVal     <- pchisq(chisqVal, smmhDf, lower.tail=FALSE))

S         <- -(cTabBow + t(cTabBow))
diag(S)   <- rowSums(cTabBow) + colSums(cTabBow) - 2*diag(cTabBow)
keep      <- seq_len(nrow(cTabBow)-1)
(chisqVal <- t(d[keep]) %*% solve(S[keep, keep]) %*% d[keep])

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:coin))
try(detach(package:survival))

####*--------------------------------------------------------------------------*
### 11 Resampling-Verfahren ----
####*--------------------------------------------------------------------------*

####*--------------------------------------------------------------------------*
#### 11.1 Nonparametrisches Bootstrapping ----
##### 11.1.1 Replikationen erstellen ----
N    <- 200
muH0 <- 100
sdH0 <- 40
DV   <- rnorm(N, muH0, sdH0)

getM <- function(orgDV, idx) {
    n     <- length(orgDV[idx])
    bsM   <- mean(orgDV[idx])
    bsS2M <- (((n-1) / n) * var(orgDV[idx])) / n
    c(bsM, bsS2M)
}

library(boot)
nR    <- 999
bsRes <- boot(DV, statistic=getM, R=nR)
## für Daten des Buches diese Datei laden:
# load("data_bootMu.Rdata")
bsRes

(M      <- mean(DV))
(S2M    <- (((N-1)/N) * var(DV)) / N)
Mstar   <- bsRes$t[ , 1]
S2Mstar <- bsRes$t[ , 2]
(biasM  <- mean(Mstar) - M)
mean(S2Mstar) - S2M
c(sd(Mstar), sd(S2Mstar))

par(mfrow=c(1, 2))
hist(Mstar-M, freq=FALSE, breaks="FD")
rug(jitter(Mstar-M))
curve(dnorm(x, mean(Mstar-M), sd(Mstar-M)), lwd=2, col="blue", add=TRUE)
lines(density(Mstar-M), lwd=2, col="red")
qqnorm(Mstar-M, pch=16)
qqline(Mstar-M, lwd=2, col="blue")

bootIdx <- boot.array(bsRes, indices=TRUE)
bootIdx[1:3, 1:10]

repl1Idx <- bootIdx[1, ]
repl1DV  <- DV[repl1Idx]
head(repl1DV, n=5)

##### 11.1.2 Bootstrap-Vertauensintervalle für mu ----
## für Daten des Buches diese Datei laden:
# load("data_bootMu.Rdata")
alpha <- 0.05
boot.ci(bsRes, conf=1-alpha, type=c("basic", "perc", "norm", "stud", "bca"))

(idx  <- trunc((nR + 1) * c(alpha/2, 1 - alpha/2)))
tStar <- (Mstar-M) / sqrt(S2Mstar)
tCrit <- sort(tStar)[idx]
zCrit <- qnorm(c(alpha/2, 1 - alpha/2))

(ciBasic <- 2*M - sort(Mstar)[idx])
(ciPerc  <- sort(Mstar)[idx])
(ciNorm  <- M-biasM - zCrit*sd(Mstar))
(ciT     <- M - tCrit*sqrt(S2M))

res <- replicate(nR, getM(DV, sample(seq_along(DV), replace=TRUE)))
## für Daten des Buches diese Datei laden:
# load("data_bootMu.Rdata")
Mstar  <- res[1, ]
SMstar <- sqrt(res[2, ])
tStar  <- (Mstar-mean(DV)) / SMstar

plot(tStar, ecdf(tStar)(tStar), col="gray60", pch=1, xlab="t* bzw. t",
     ylab="P(T <= t)", main="t*: Kumulierte rel. Häufigkeiten und Verteilungsfunktion")
curve(pt(x, N-1), lwd=2, add=TRUE)
legend(x="topleft", lty=c(NA, 1), pch=c(1, NA), lwd=c(2, 2),
       col=c("gray60", "black"), legend=c("t*", "t"))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:boot))

##### 11.1.3 Bootstrap-Vertauensintervalle für mu2-mu1 ----
n1  <- 18
n2  <- 21
DVm <- rnorm(n1, 180, 10)
DVf <- rnorm(n2, 175, 6)
tDf <- stack(list(m=DVm, f=DVf))
## für Daten des Buches diese Datei laden:
# load("data_tTwoInd.Rdata")
getDM <- function(dat, idx) {
    Mfm <- aggregate(values ~ ind, data=dat, subset=idx, FUN=mean)
    -diff(Mfm$values)
}

library(boot)
bsTind <- boot(tDf, statistic=getDM, strata=tDf$ind, R=999)
boot.ci(bsTind, conf=0.95, type=c("basic", "bca"))

tt <- t.test(values ~ ind, alternative="two.sided", var.equal=TRUE, data=tDf)
tt$conf.int

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:boot))

##### 11.1.4 Lineare Modelle: Case Resampling ----
## für Daten des Buches diese Datei laden:
# load("data_regrMult.Rdata")
sqrt(diag(vcov(fitHAS)))
confint(fitHAS)

getRegr <- function(dat, idx) {
    bsFit <- lm(weight ~ height + age + sport, subset=idx, data=dat)
    coef(bsFit)
}

library(boot)
nR <- 999
regrDf  <- data.frame(weight, height, age, sport)
(bsRegr <- boot(regrDf, statistic=getRegr, R=nR))

boot.ci(bsRegr, conf=0.95, type="bca", index=1)$bca
boot.ci(bsRegr, conf=0.95, type="bca", index=2)$bca
boot.ci(bsRegr, conf=0.95, type="bca", index=3)$bca
boot.ci(bsRegr, conf=0.95, type="bca", index=4)$bca

boot.ci(bsRegr, conf=0.95, type="perc", index=1)$percent

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:boot))

##### 11.1.5 Lineare Modelle: Model-Based Resampling ----
## Daten erzeugen wie in 7.3.1
P     <- 4
Nj    <- c(41, 37, 42, 40)
DVa   <- rnorm(Nj[1], 0,   1)
DVb   <- rnorm(Nj[2], 0.3, 1)
DVc   <- rnorm(Nj[3], 0.6, 1)
DVd   <- rnorm(Nj[4], 1.0, 1)
DV    <- c(DVa, DVb, DVc, DVd)
IV    <- factor(rep(LETTERS[1:P], Nj))
dfCRp <- data.frame(IV, DV)
## für Daten des Buches diese Datei laden:
# load("data_bootAnova.Rdata")
anBase <- anova(lm(DV ~ IV))
Fbase  <- anBase["IV", "F value"]
(pBase <- anBase["IV", "Pr(>F)"])

E    <- residuals(lm(DV ~ 1))
Yhat <- fitted(lm(DV ~ 1))

getAnova <- function(dat, idx) {
    Ystar <- Yhat + E[idx]
    anBS  <- anova(lm(Ystar ~ IV, data=dat))
    anBS["IV", "F value"]
}

library(boot)
nR       <- 999
(bsAnova <- boot(dfCRp, statistic=getAnova, R=nR))
Fstar    <- bsAnova$t
tol      <- .Machine$double.eps^0.5
FsIsGEQ  <- (Fstar > Fbase) | (abs(Fstar-Fbase) < tol)
(pValBS  <- (sum(FsIsGEQ) + 1) / (length(Fstar) + 1))

plot(Fstar, ecdf(Fstar)(Fstar), col="gray60", pch=1, xlab="f* bzw. f",
     ylab="P(F <= f)", main="F*: Kumulierte rel. Häufigkeiten und Verteilungsfunktion")
curve(pf(x, P-1, sum(Nj) - P), lwd=2, add=TRUE)
legend(x="topleft", lty=c(NA, 1), pch=c(1, NA), lwd=c(2, 2),
       col=c("gray60", "black"), legend=c("F*", "F"))

##### 11.1.6 Lineare Modelle: Wild Bootstrap ----
## für Daten des Buches diese Datei laden:
# load("data_bootAnova.Rdata")
fit0 <- lm(DV ~ 1)                           ## Anpassung H0-Modell
E    <- rstandard(fit0)                      ## standardisierte Residuen
Yhat <- fitted(fit0)                         ## ursprüngliche Vorhersage

## Funktion für Replikations-F* mit wild bootstrap resampling
getAnovaWild <- function(dat, idx) {
    n  <- length(idx)                        ## Größe der Replikation
    ## Rademacher-Variable
    Ur <- sample(c(-1, 1), size=n, replace=TRUE, prob=c(0.5, 0.5))

    ## zweite Variante der Zufallsvariable U
    Uf <- sample(c(-(sqrt(5) - 1)/2, (sqrt(5) + 1)/2), size=n, replace=TRUE,
                 prob=c((sqrt(5) + 1)/(2*sqrt(5)), (sqrt(5) - 1)/(2*sqrt(5))))

    Ystar <- Yhat + (E*Ur)[idx]              ## für E* mit Rademacher-Variablen
    # Ystar <- Yhat + (E*Uf)[idx]            ## für E* mit zweiter Variante
    anBS  <- anova(lm(Ystar ~ IV, data=dat))
    anBS["IV", "F value"]
}

## fortfahren wie oben
library(boot)
nR       <- 999
bsAnovaW <- boot(dfCRp, statistic=getAnovaWild, R=nR)
FstarW   <- bsAnovaW$t
tol      <- .Machine$double.eps^0.5
FsIsGEQ  <- (FstarW > Fbase) | (abs(FstarW-Fbase) < tol)
(pValBSw <- (sum(FsIsGEQ) + 1) / (length(FstarW) + 1))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:boot))

####*--------------------------------------------------------------------------*
#### 11.2 Parametrisches Bootstrapping ----
##### 11.2.1 Bootstrap-Vertrauensintervalle für mu2 - mu1 ----
n1  <- 18
n2  <- 21
DVm <- rnorm(n1, 180, 10)
DVf <- rnorm(n2, 175, 6)
tDf <- stack(list(m=DVm, f=DVf))
## für Daten des Buches diese Datei laden:
# load("data_tTwoInd.Rdata")
getSDML <- function(x) {
    c(sqrt(cov.wt(as.matrix(x), method="ML")$cov))
}

MSD <- list( M=ave(tDf$values, tDf$ind, FUN=mean),
            SD=ave(tDf$values, tDf$ind, FUN=getSDML))

rGenMD <- function(dat, fm) {
    out <- dat
    out$values <- fm$M + rnorm(length(fm$M), mean=0, sd=fm$SD)
    return(out)
}

getMD <- function(dat) {
    Mfm <- aggregate(values ~ ind, data=dat, FUN=mean)
    -diff(Mfm$values)
}

library(boot)
nR   <- 999
bsMD <- boot(tDf, statistic=getMD, R=nR,
             sim="parametric", mle=MSD, ran.gen=rGenMD)

## für Daten des Buches diese Datei laden:
# load("data_bootParamMD.Rdata")
boot.ci(bsMD, conf=0.95, type="basic")$basic

tt <- t.test(values ~ ind, alt="two.sided", var.equal=FALSE, data=tDf)
tt$conf.int

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:boot))

##### 11.2.2 Verallgemeinerte lineare Modelle ----
## für Daten des Buches diese Datei laden:
# load("data_regrCount.Rdata")
rGenPois <- function(dat, mle) {
    out   <- dat
    out$Y <- simulate(mle)[[1]]
    return(out)
}

getPois <- function(dat) {
    glmFit <- glm(Y ~ X1 + X2, family=poisson(link="log"), data=dat)
    coef(glmFit)
}

library(boot)
nR <- 999
bsPois <- boot(dfCount, statistic=getPois, R=nR,
               sim="parametric", mle=glmFitP, ran.gen=rGenPois)

boot.ci(bsPois, conf=0.95, type="basic", index=1)$basic
boot.ci(bsPois, conf=0.95, type="basic", index=2)$basic
boot.ci(bsPois, conf=0.95, type="basic", index=3)$basic

confint(glmFitP)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:boot))

####*--------------------------------------------------------------------------*
#### 11.3 Permutationstests ----
##### 11.3.1 Test auf gleiche Lageparameter in unabhängigen Stichproben ----
Nj  <- c(7, 8)
DVa <- round(rnorm(Nj[1], 100, 20))
DVb <- round(rnorm(Nj[2], 110, 20))
## für Daten des Buches diese Datei laden:
# load("data_permTest.Rdata")
dat <- stack(list(a=DVa, b=DVb))

library(coin)
(ot <- oneway_test(values ~ ind, alternative="less", distribution="exact", data=dat))

par(mfrow=c(1, 2))
supp <- support(ot)
dens <- sapply(supp, dperm, object=ot)
plot(supp, dens, xlab="Support", ylab=NA, pch=20, main="Dichte Permutationsverteilung")

qEmp <- sapply(ppoints(supp), qperm, object=ot)
qqnorm(qEmp, xlab="Quantile Normalverteilung", ylab="Permutations-Quantile",
       pch=20, main="Permutations- vs. theoretische NV-Quantile")
abline(a=0, b=1, lwd=2, col="blue")

tt <- t.test(values ~ ind, alternative="less", var.equal=TRUE, data=dat)
tt$p.value

idx      <- seq_len(sum(Nj))
idxA     <- combn(idx, Nj[1])
getDM    <- function(x) { mean(dat[x, "values"]) - mean(dat[-x, "values"]) }
DMstar   <- apply(idxA, 2, getDM)
DMbase   <- mean(DVa) - mean(DVb)
tol      <- .Machine$double.eps^0.5
DMsIsLEQ <- (DMstar < DMbase) | (abs(DMstar-DMbase) < tol)
(pVal    <- sum(DMsIsLEQ) / length(DMstar))

par(mfrow=c(1, 2))
hist(DMstar, freq=FALSE, breaks="FD", xlab="Mittelwertsdifferenzen",
     main="Permutationstest: Histogramm Mittelwertsdifferenzen")
curve(dnorm(x, 0, 20/sqrt(Nj[1]) + 20/sqrt(Nj[2])), lwd=2, add=TRUE)
legend(x="topright", lty=1, lwd=2,
       legend=str2expression("N(0, sigma[1]^2 / n[1] + sigma[2]^2 / n[2])"))

plot(DMstar, ecdf(DMstar)(DMstar), col="gray60", pch=16,
     xlab="Mittelwertsdifferenzen", ylab="kumulierte relative Häufigkeit",
     main="Kumulierte relative Häufigkeiten")
curve(pnorm(x, 0, 20/sqrt(Nj[1]) + 20/sqrt(Nj[2])), lwd=2, add=TRUE)
legend(x="bottomright", lty=c(NA, 1), pch=c(16, NA), lwd=c(1, 2),
       col=c("gray60", "black"),
       legend=c("Permutationen",
       str2expression("N(0, sigma[1]^2 / n[1] + sigma[2]^2 / n[2])")))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:coin))
try(detach(package:survival))

##### 11.3.2 Test auf gleiche Lageparameter in abhängigen Stichproben ----
N      <- 12
DVpre  <- round(rnorm(N, 100, 20))
DVpost <- round(rnorm(N, 110, 20))
## für Daten des Buches diese Datei laden:
# load("data_permTest.Rdata")
datPP <- stack(list(pre=DVpre, post=DVpost))
datPP$id <- factor(rep(1:N, times=2))

library(coin)
oneway_test(values ~ ind | id, alternative="less",
            distribution=approximate(B=9999), data=datPP)

tt <- t.test(Pair(DVpre, DVpost) ~ 1, alternative="less", data=datPP)
tt$p.value

DVd    <- DVpre-DVpost
sgnLst <- lapply(numeric(N), function(x) { c(-1, 1) } )
sgnMat <- data.matrix(expand.grid(sgnLst))
getMD  <- function(x) { mean(abs(DVd) * x) }
MDstar <- apply(sgnMat, 1, getMD)
MDbase <- mean(DVd)

tol      <- .Machine$double.eps^0.5
MDsIsLEQ <- (MDstar < MDbase) | (abs(MDstar-MDbase) < tol)
(pVal    <- sum(MDsIsLEQ) / length(MDstar))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:coin))
try(detach(package:survival))

##### 11.3.3 Test auf Unabhängigkeit von zwei Variablen ----
Nf  <- 8
DV1 <- rbinom(Nf, size=1, prob=0.5)
DV2 <- rbinom(Nf, size=1, prob=0.5)
## für Daten des Buches diese Datei laden:
# load("data_permTest.Rdata")
fisher.test(DV1, DV2, alternative="greater")$p.value

library(DescTools)
permIdx <- Permn(seq_len(Nf))

getAgree <- function(idx) {
    sum(diag(table(DV1, DV2[idx])))
}

resAgree <- apply(permIdx, 1, getAgree)
agree12  <- sum(diag(table(DV1, DV2)))
(pVal    <- sum(resAgree >= agree12) / length(resAgree))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:DescTools))

####*--------------------------------------------------------------------------*
### 12 Multivariate Verfahren ----
####*--------------------------------------------------------------------------*

####*--------------------------------------------------------------------------*
#### 12.1 Lineare Algebra ----
##### 12.1.1 Matrix-Algebra ----
N  <- 4
p  <- 2
(X <- matrix(c(20, 26, 10, 19, 29, 27, 20, 12), nrow=N, ncol=p))
t(X)

diag(cov(X))
diag(1:3)
diag(2)

c(1, 2) %*% rbind(c(1, 2, 3), c(4, 5, 6))
rbind(c(1, 2, 3), c(4, 5, 6)) %*% c(7, 8, 9)
(Xc <- diag(N) - matrix(rep(1/N, N^2), nrow=N))
all.equal(Xc %*% Xc, Xc)
(Xdot <- Xc %*% X)
(SSP  <- t(Xdot) %*% Xdot)
crossprod(Xdot)

(1/(N-1)) * SSP
(S <- cov(X))
Dsi <- diag(1/sqrt(diag(S)))
Dsi %*% S %*% Dsi
cov2cor(S)

b <- 2
a <- c(-2, 1)
sweep(b*X, 2, a, "+")
colLens <- sqrt(colSums(X^2))
sweep(X, 2, colLens, "/")
X %*% diag(1/colLens)

##### 12.1.2 Lineare Gleichungssysteme lösen ----
Y     <- matrix(c(1, 1, 1, -1), nrow=2)
(Yinv <- solve(Y))
Y %*% Yinv

A  <- matrix(c(9, 1, -5, 0), nrow=2)
b  <- c(5, -3)
(x <- solve(A, b))
A %*% x

##### 12.1.3 Norm und Abstand von Vektoren und Matrizen ----
a1 <- c(3, 4, 1, 8, 2)
sqrt(crossprod(a1))
sqrt(sum(a1^2))

a2 <- c(6, 9, 10, 8, 7)
A  <- cbind(a1, a2)
sqrt(diag(crossprod(A)))
sqrt(colSums(A^2))

norm(A, type="F")
sqrt(crossprod(c(A)))

B <- matrix(sample(-20:20, 12, replace=TRUE), ncol=3)
dist(B, diag=TRUE, upper=TRUE)
sqrt(crossprod(B[1, ] - B[2, ]))

##### 12.1.4 Mahalanobistransformation und Mahalanobisdistanz ----
sigma <- matrix(c(4,2,-3, 2,16,-1, -3,-1,9), byrow=TRUE, ncol=3)
mu    <- c(-3, 2, 4)
N     <- 100

library(mvtnorm)
X     <- round(rmvnorm(N, mean=mu, sigma=sigma))
## für Daten des Buches diese Datei laden:
# load("data_mahaDist.Rdata")
ctr   <- colMeans(X)
S     <- cov(X)

Seig  <- eigen(S)
sqrtD <- sqrt(Seig$values)
SsqrtInv <- Seig$vectors %*% diag(1/sqrtD) %*% t(Seig$vectors)

Xdot  <- scale(X, center=ctr, scale=FALSE)
Xmt   <- t(SsqrtInv %*% t(Xdot))
cov(Xmt)
colMeans(Xmt)

ideal <- c(1, 2, 3)
x     <- X[1, ]
y     <- X[2, ]
mat   <- rbind(x, y)

mahalanobis(mat, ideal, S)

Sinv <- solve(S)
t(x-ideal) %*% Sinv %*% (x-ideal)
t(y-ideal) %*% Sinv %*% (y-ideal)

mDist   <- mahalanobis(X, ideal, S)
min(mDist)
(idxMin <- which.min(mDist))
X[idxMin, ]

idealM <- t(SsqrtInv %*% (ideal - ctr))
crossprod(Xmt[1, ] - t(idealM))
crossprod(Xmt[2, ] - t(idealM))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:mvtnorm))

##### 12.1.5 Kennwerte von Matrizen ----
(A <- matrix(c(9, 1, 1, 4), nrow=2))
sum(diag(A))
sum(diag(t(A) %*% A))
sum(diag(A %*% t(A)))
sum(A^2)

det(A)
B <- matrix(c(-3, 4, -1, 7), nrow=2)
all.equal(det(A %*% B), det(A) * det(B))
det(diag(1:4))
Ainv <- solve(A)
all.equal(1/det(A), det(Ainv))

qrA <- qr(A)
qrA$rank

(eigA <- eigen(A))
zapsmall(eigA$vectors %*% t(eigA$vectors))
sum(eigA$values)
prod(eigA$values)

X     <- matrix(c(20, 26, 10, 19, 29, 27, 20, 12, 17, 23, 27, 25), nrow=4, ncol=3)
kappa(X, exact=TRUE)
Xplus <- solve(t(X) %*% X) %*% t(X)
norm(X, type="2") * norm(Xplus, type="2")

evX <- eigen(t(X) %*% X)$values
sqrt(max(evX) / min(evX[evX >= .Machine$double.eps]))
sqrt(evX / min(evX[evX >= .Machine$double.eps]))

##### 12.1.6 Zerlegung von Matrizen ----
X    <- matrix(c(20, 26, 10, 19, 29, 27, 20, 12, 17, 23, 27, 25), nrow=4, ncol=3)
S    <- cov(X)
eigS <- eigen(S)
G    <- eigS$vectors
D    <- diag(eigS$values)
all.equal(S, G %*% D %*% t(G))

sqrtD <- diag(sqrt(eigS$values))
A     <- G %*% sqrtD %*% t(G)
all.equal(S, A %*% A)

N <- eigS$vectors %*% sqrt(diag(eigS$values))
all.equal(S, N %*% t(N))

svdX <- svd(X)
all.equal(X, svdX$u %*% diag(svdX$d) %*% t(svdX$v))
all.equal(sqrt(eigen(t(X) %*% X)$values), svdX$d)

R <- chol(S)
all.equal(S, t(R) %*% R)

qrX <- qr(X)
Q   <- qr.Q(qrX)
R   <- qr.R(qrX)
all.equal(X, Q %*% R)

##### 12.1.7 Orthogonale Projektion ----
##### 12.1.7.1 Eigenschaften ----
##### 12.1.7.2 Beispiele ----
X    <- matrix(c(20, 26, 10, 19, 29, 27, 20, 12, 17, 23, 27, 25), nrow=4, ncol=3)
ones <- rep(1, nrow(X))
P1   <- ones %*% solve(t(ones) %*% ones) %*% t(ones)
P1x  <- P1 %*% X
head(P1x, n=3)

colMeans(X)

a  <- ones / sqrt(c(crossprod(ones)))
P2 <- a %*% t(a)
all.equal(P1, P2)

IP1  <- diag(nrow(X)) - P1
IP1x <- IP1 %*% X
all.equal(IP1x, scale(X, center=colMeans(X), scale=FALSE), check.attributes=FALSE)

A   <- cbind(c(1, 0, 0), c(0, 1, 0))
P3  <- A %*% solve(t(A) %*% A) %*% t(A)
Px3 <- t(P3 %*% t(X))
head(Px3, n=3)

qrX   <- qr(X)
Q     <- qr.Q(qrX)
R     <- qr.R(qrX)
Xplus <- solve(t(X) %*% X) %*% t(X)
all.equal(Xplus, solve(R) %*% t(Q))
all.equal(X %*% Xplus, Q %*% t(Q))

####*--------------------------------------------------------------------------*
#### 12.2 Hauptkomponentenanalyse ----
##### 12.2.1 Berechnung ----
sigma <- matrix(c(4, 2, 2, 3), ncol=2)
mu    <- c(1, 2)
N     <- 50
library(mvtnorm)
X     <- rmvnorm(N, mean=mu, sigma=sigma)
## für Daten des Buches diese Datei laden:
# load("data_pca.Rdata")
(pca  <- prcomp(X))

eig    <- eigen(cov(X))
eigVal <- eig$values
sqrt(eigVal)
(G     <- eig$vectors)

B    <- G %*% diag(sqrt(eigVal))
ctr  <- colMeans(X)
xMat <- rbind(ctr[1] - B[1, ], ctr[1])
yMat <- rbind(ctr[2] - B[2, ], ctr[2])

ab1 <- solve(cbind(1, xMat[ , 1]), yMat[ , 1])
ab2 <- solve(cbind(1, xMat[ , 2]), yMat[ , 2])

par(mfrow=c(1, 2), lend=1)
plot(X, xlab="x", ylab="y", pch=19, asp=1,
     main="Datenwolke und Hauptkomponenten")
abline(coef=ab1, lwd=2, col="gray")
abline(coef=ab2, lwd=2, col="gray")
matlines(xMat, yMat, lty=1, lwd=6, col="blue")
points(ctr[1], ctr[2], pch=16, col="red", cex=3)
legend(x="topleft",
       legend=c("Daten", "Achsen HK", "Streuungen HK", "Zentroid"),
       pch=c(20, NA, NA, 16), lty=c(NA, 1, 1, NA), lwd=c(NA, 2, 2, NA),
       col=c("black", "gray", "blue", "red"), bg="white")

Y    <- predict(pca)
head(Y)
(Ysd <- apply(Y, 2, sd))

head(cmdscale(dist(X)))

Xdot    <- scale(X, center=TRUE, scale=FALSE)
Y_XdotG <- Xdot %*% G
head(Y_XdotG, n=3)

svd_Xdot <- svd(Xdot)
Y_svd    <- svd_Xdot$u %*% diag(svd_Xdot$d)
head(Y_svd, n=3)

Xnew <- matrix(1:4, ncol=2)
predict(pca, newdata=Xnew)

summary(pca)
Ysd^2 / sum(diag(cov(X)))

##### 12.2.2 Dimensionsreduktion ----
B    <- G %*% diag(sqrt(eigVal))
H    <- scale(Y)
HB   <- H %*% t(B)
repr <- sweep(HB, 2, ctr, "+")
all.equal(X, repr)

HB1   <- H[ , 1, drop=FALSE] %*% t(B[ , 1, drop=FALSE])
repr1 <- sweep(HB1, 2, ctr, "+")
sum((X-repr1)^2)
qr(scale(repr1, center=TRUE, scale=FALSE))$rank

plot(X, xlab="x", ylab="y", pch=19, asp=1,
     main="Originaldaten und Approximation")
abline(coef=ab1, lwd=2, col="gray")
abline(coef=ab2, lwd=2, col="gray")
segments(X[ , 1], X[ , 2], repr1[ , 1], repr1[ , 2])
points(repr1, pch=1, lwd=2, col="blue", cex=2)
points(ctr[1], ctr[2], pch=16, col="red", cex=3)
legend(x="topleft",
       legend=c("Daten", "Achsen HK", "Zentroid", "Approximation"),
       pch=c(20, NA, 16, 1), lty=c(NA, 1, NA, NA), lwd=c(NA, 2, NA, 2),
       col=c("black", "gray", "red", "blue"), bg="white")

B %*% t(B)
cov(X)
B[ , 1] %*% t(B[ , 1])

(pcaPrin <- princomp(X))
Sml <- cov.wt(Y, method="ML")$cov
sqrt(diag(Sml))
pcaPrin$loadings

##### 12.2.3 Visualisierung ----
plot(pca, type="b")
biplot(pca)
(f <- svd_Xdot$d)
sqrt(eigVal)*sqrt(N)
head(Y %*% diag(1/f), n=3)
G %*% diag(f)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:mvtnorm))

####*--------------------------------------------------------------------------*
#### 12.3 Faktorenanalyse ----
N <- 200
P <- 6
Q <- 2
(Lambda <- matrix(c(0.7,-0.4, 0.8,0, -0.2,0.9, -0.3,0.4, 0.3,0.7, -0.8,0.1),
                  nrow=P, ncol=Q, byrow=TRUE))
Kf <- diag(Q)
mu <- c(5, 15)
library(mvtnorm)
FF <- rmvnorm(N, mean=mu, sigma=Kf)
E  <- rmvnorm(N, rep(0, P), diag(P))
X  <- FF %*% t(Lambda) + E

## für Daten des Buches diese Datei laden:
# load("data_factanal.Rdata")
(fa <- factanal(X, factors=2, scores="regression"))

Lhat <- fa$loadings
colSums(Lhat^2)
colSums(Lhat^2) / sum(diag(cor(X)))
head(fa$scores)

1 - fa$uniquenesses
rowSums(Lhat^2)

plot(Lhat, xlab="Faktor 1", ylab="Faktor 2",
     xlim=c(-1.1, 1.1), ylim=c(-1.1, 1.1), pch=20, asp=1,
     main="Faktorenanalyse: Ladung & Faktoren")
abline(h=0, v=0)
arrows(0, 0, c(1, 0), c(0, 1), col="blue", lwd=2)
angles <- seq(0, 2*pi, length.out=200)
circ   <- cbind(cos(angles), sin(angles))
lines(circ)

KxEst <- Lhat %*% t(Lhat) + diag(fa$uniquenesses)
ang   <- pi/3
G     <- matrix(c(cos(ang), sin(ang), -sin(ang), cos(ang)), nrow=2)
(Lrot <- Lhat %*% G)
KxEstRot <- Lrot %*% t(Lrot) + diag(fa$uniquenesses)
all.equal(KxEst, KxEstRot)

arrows(0, 0, G[1, ], G[2, ], col="green", lwd=2)
segments(0, 0, -G[1, ], -G[2, ])
text(Lhat[ , 1] , Lhat[ , 2]+0.08, labels=paste("Var", 1:Q))
legend(x="bottomright",
       legend=c("Ladungen", "Faktoren", "rotierte Faktoren"),
       pch=c(20, NA, NA), lty=c(NA, 1, 1), col=c("black", "blue", "green"),
       bg="white")

plot(eigen(cor(X))$values, type="b", ylab="Eigenwert", main="Scree-Plot")
abline(h=1)

G         <- matrix(c(0.7071, 0.7071, -0.866, 0.5), nrow=2)
KfRot     <- t(G) %*% diag(Q) %*% G
Lrot      <- Lhat %*% solve(t(G))
facStruct <- Lhat %*% diag(Q) %*% G
KxEstRot  <- Lrot %*% KfRot %*% t(Lrot) + diag(fa$uniquenesses)
all.equal(KxEst, KxEstRot)

## Faktorenanalyse ohne Rotation: aufgeklärte Varianz = zugehöriger
## Eigenwert der geschätzten reduzierten Korrelationsmatrix
faNoRot   <- factanal(X, factors=2, rotation="none", scores="regression")
LambdaHat <- faNoRot$loadings        ## geschätzte Ladungsmatrix
colSums(LambdaHat^2)                 ## aufgeklärte Varianz
Rhat <- LambdaHat %*% t(LambdaHat)   ## geschätzte reduzierte Korrelationsmatrix
zapsmall(eigen(Rhat)$values)         ## Eigenwerte

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:mvtnorm))

####*--------------------------------------------------------------------------*
#### 12.4 Multidimensionale Skalierung ----
cities  <- c("Augsburg", "Berlin", "Dresden", "Hamburg", "Hannover",
             "Karlsruhe", "Kiel", "München", "Rostock", "Stuttgart")
n       <- length(cities)
dstMat  <- matrix(numeric(n^2), nrow=n)
##             B   DD   HH    H   KA   KI    M  HRO    S
cityDst <- c(596, 467, 743, 599, 226, 838,  65, 782, 160, ## AUG
                  194, 288, 286, 673, 353, 585, 231, 633, ## B
                       477, 367, 550, 542, 465, 420, 510, ## DD
                            157, 623,  96, 775, 187, 665, ## HH
                                 480, 247, 632, 330, 512, ## H
                                      723, 298, 805,  80, ## KA
                                           872, 206, 752, ## KI
                                                777, 220, ## M
                                                     824) ## HRO

dstMat[upper.tri(dstMat)] <- rev(cityDst)
dstMat <- t(dstMat[ , n:1])[ , n:1]
dstMat[lower.tri(dstMat)] <- t(dstMat)[lower.tri(dstMat)]
dimnames(dstMat) <- list(city=cities, city=cities)

(mds <- cmdscale(dstMat, k=2))

xLims <- range(mds[ , 1]) + c(0, 250)
plot(mds, xlim=xLims, xlab="Nord-Süd", ylab="Ost-West", pch=16,
     main="Anordnung der Städte nach MDS")
text(mds[ , 1]+50, mds[ , 2], adj=0, labels=cities, cex=1.3)

####*--------------------------------------------------------------------------*
#### 12.5 Multivariate multiple Regression ----
N      <- 100
height <- rnorm(N, 175, 7)
age    <- rnorm(N, 30, 8)
sport  <- abs(rnorm(N, 60, 30))
weight <- 0.5*height - 0.3*age - 0.4*sport + 10 + rnorm(N, 0, 3)
health <- -0.3*age + 0.6*sport + rnorm(N, 4)
## für Daten des Buches diese Datei laden:
# load("data_regrMultMV.Rdata")
Y      <- cbind(weight, health)

(fitM <- lm(Y ~ height + age + sport))
coef(lm(weight ~ height + age + sport))
coef(lm(health ~ height + age + sport))

summary(manova(fitM), test="Hotelling-Lawley")
summary(manova(fitM), test="Wilks")
summary(manova(fitM), test="Roy")
summary(manova(fitM), test="Pillai")

####*--------------------------------------------------------------------------*
#### 12.6 Hotellings T^2 ----
##### 12.6.1 Test für eine Stichprobe ----
muH0  <- c(-1, 2)
sigma <- matrix(c(16,-2, -2,9), byrow=TRUE, ncol=2)
mu11  <- c(-4, 4)
Nj    <- c(15, 25, 20)
library(mvtnorm)
Y11   <- round(rmvnorm(Nj[1], mean=mu11, sigma=sigma))

## für Daten des Buches diese Datei laden:
# load("data_manova.Rdata")
Y11ctr <- scale(Y11, center=muH0, scale=FALSE)
(anRes <- anova(lm(Y11ctr ~ 1), test="Hotelling-Lawley"))

n     <- nrow(Y11)
ctr   <- colMeans(Y11)
(T2   <- n * (t(ctr-muH0) %*% solve(cov(Y11)) %*% (ctr-muH0)))
n * mahalanobis(ctr, muH0, cov(Y11))
Tr_HL <- anRes[1, "Hotelling-Lawley"]
(n-1) * Tr_HL
r     <- ncol(Y11)
(F_HL <- ((n-r) / r) * Tr_HL)
(pVal <- pf(F_HL, r, n-r, lower.tail=FALSE))

##### 12.6.2 Test für zwei unabhängige Stichproben ----
mu21 <- c(3, 3)
library(mvtnorm)
Y21  <- round(rmvnorm(Nj[2], mean=mu21, sigma=sigma))

## für Daten des Buches diese Datei laden:
# load("data_manova.Rdata")
Yht  <- rbind(Y11, Y21)
IVht <- factor(rep(1:2, Nj[1:2]))
anova(lm(Yht ~ IVht), test="Hotelling-Lawley")
(sumRes <- summary(manova(Yht ~ IVht), test="Hotelling-Lawley"))

n1    <- nrow(Y11)
n2    <- nrow(Y21)
ctr1  <- colMeans(Y11)
ctr2  <- colMeans(Y21)
S1    <- cov.wt(Y11, method="ML")$cov
S2    <- cov.wt(Y21, method="ML")$cov
Su    <- (1 / (n1+n2-2)) * (n1*S1 + n2*S2)
(T2   <- ((n1*n2) / (n1+n2)) * (t(ctr2-ctr1) %*% solve(Su) %*% (ctr2-ctr1)))
((n1*n2) / (n1+n2)) * mahalanobis(ctr1, ctr2, Su)
Tr_HL <- sumRes$stats["IVht", "Hotelling-Lawley"]
(n1+n2-2) * Tr_HL
r     <- ncol(Y11)
(F_HL <- ((n1+n2-r-1) / r) * Tr_HL)
(pVal <- pf(F_HL, r, n1+n2-r-1, lower.tail=FALSE))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:mvtnorm))

##### 12.6.3 Test für zwei abhängige Stichproben ----
N    <- 20
P    <- 2
Y1t0 <- rnorm(N, mean=90,  sd=15)
Y1t1 <- rnorm(N, mean=100, sd=15)
Y2t0 <- rnorm(N, mean=85,  sd=15)
Y2t1 <- rnorm(N, mean=105, sd=15)
IV   <- factor(rep(1:P, each=N), labels=c("t0", "t1"))
id   <- factor(rep(1:N, times=P))
Ydf  <- data.frame(id, Y1=c(Y1t0, Y1t1), Y2=c(Y2t0, Y2t1), IV)
## für Daten des Buches diese Datei laden:
# load("data_tSq2dep.Rdata")

dfDiff <- aggregate(cbind(Y1, Y2) ~ id, data=Ydf, FUN=diff)
DVdiff <- data.matrix(dfDiff[ , -1])
anova(lm(DVdiff ~ 1), test="Hotelling-Lawley")

##### 12.6.4 Univariate Varianzanalyse mit abhängigen Gruppen RB-p ----
## Daten erzeugen wie in 7.4.1
N      <- 10
P      <- 4
id     <- factor(rep(1:N, times=P))
IV     <- factor(rep(1:P,  each=N))
DV_t1  <- round(rnorm(N, -0.3, 1), 2)
DV_t2  <- round(rnorm(N, -0.2, 1), 2)
DV_t3  <- round(rnorm(N,  0.1, 1), 2)
DV_t4  <- round(rnorm(N,  0.4, 1), 2)
DV     <- c(DV_t1, DV_t2, DV_t3, DV_t4)
dfRBpL <- data.frame(id, IV, DV)

## für Daten des Buches diese Datei laden:
# load("data_rbp.Rdata")
P       <- 4
DVw     <- cbind(DV_t1, DV_t2, DV_t3, DV_t4)
diffMat <- combn(1:P, 2, function(x) { DVw[ , x[1]] - DVw[ , x[2]] } )
DVdiff  <- diffMat[ , 1:(P-1), drop=FALSE]
anova(lm(DVdiff ~ 1), test="Hotelling-Lawley")

fitRBp   <- lm(DVw ~ 1)
intraRBp <- data.frame(IV=gl(P, 1))

library(car)
AnovaRBp <- Anova(fitRBp, idata=intraRBp, idesign=~IV)
summary(AnovaRBp, multivariate=TRUE, univariate=FALSE)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:car))
try(detach(package:carData))

####*--------------------------------------------------------------------------*
#### 12.7 Multivariate Varianzanalyse (MANOVA) ----
##### 12.7.1 Einfaktorielle MANOVA ----
mu31 <- c(1, -1)
library(mvtnorm)
Y31  <- round(rmvnorm(Nj[3], mean=mu31, sigma=sigma))
## für Daten des Buches diese Datei laden:
# load("data_manova.Rdata")
Ym1   <- rbind(Y11, Y21, Y31)
IVman <- factor(rep(1:3, Nj))

manRes1 <- manova(Ym1 ~ IVman)
summary(manRes1, test="Wilks")
summary(manRes1, test="Roy")
summary(manRes1, test="Pillai")
summary(manRes1, test="Hotelling-Lawley")

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:mvtnorm))

##### 12.7.2 Zweifaktorielle MANOVA ----
mu12 <- c(-1,  4)
mu22 <- c( 4,  8)
mu32 <- c( 4,  0)
library(mvtnorm)
Y12  <- round(rmvnorm(Nj[1], mean=mu12, sigma=sigma))
Y22  <- round(rmvnorm(Nj[2], mean=mu22, sigma=sigma))
Y32  <- round(rmvnorm(Nj[3], mean=mu32, sigma=sigma))
## für Daten des Buches diese Datei laden:
# load("data_manova.Rdata")
Ym2  <- rbind(Ym1, Y12, Y22, Y32)
IV1  <- rep(IVman, times=2)
IV2  <- factor(rep(1:2, each=sum(Nj)))

manRes2 <- manova(Ym2 ~ IV1*IV2)
summary(manRes2, test="Pillai")
summary(manRes2, test="Wilks")
summary(manRes2, test="Roy")
summary(manRes2, test="Hotelling-Lawley")

fitIII <- lm(Ym2 ~ IV1*IV2,
             contrasts=list(IV1=contr.sum, IV2=contr.sum))

library(car)
Manova(fitIII, type="III")

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:mvtnorm))
try(detach(package:car))
try(detach(package:carData))

####*--------------------------------------------------------------------------*
#### 12.8 Diskriminanzanalyse ----
Nj    <- c(15, 25, 20)
sigma <- matrix(c(16,-2, -2,9), byrow=TRUE, ncol=2)
mu11  <- c(-4, 4)
mu21  <- c(3, 3)
mu31  <- c(1, -1)
library(mvtnorm)
Y11   <- round(rmvnorm(Nj[1], mean=mu11, sigma=sigma))
Y21   <- round(rmvnorm(Nj[2], mean=mu21, sigma=sigma))
Y31   <- round(rmvnorm(Nj[3], mean=mu31, sigma=sigma))
Ym1   <- rbind(Y11, Y21, Y31)
IVman <- factor(rep(1:3, Nj))
## für Daten des Buches diese Datei laden:
# load("data_manova.Rdata")
Ydf1 <- data.frame(IVman, DV1=Ym1[ , 1], DV2=Ym1[ , 2])

library(MASS)
(ldaRes <- lda(IVman ~ DV1 + DV2, data=Ydf1))
ldaP    <- lda(IVman ~ DV1 + DV2, CV=TRUE, data=Ydf1)
ldaP$posterior

ldaPred <- predict(ldaRes, Ydf1)
head(ldaPred$x)
head(ldaPred$class)

cTab <- table(IVman, ldaPred$class, dnn=c("IVman", "ldaPred"))
addmargins(cTab)
sum(diag(cTab)) / sum(cTab)

## berechne Matrizen B und W wie in 11.9.9
X  <- model.matrix(~ IVman)
Pf <- X  %*% solve(t(X)  %*% X)  %*% t(X)
Pu <- Pf
X0 <- X[ , 1, drop=FALSE]
P0 <- X0 %*% solve(t(X0) %*% X0) %*% t(X0)
Pr <- P0
Id <- diag(sum(Nj))
BB <- t(Ym1) %*% (Pu-Pr) %*% Ym1
WW <- t(Ym1) %*% (Id-Pf) %*% Ym1

eigWinvB <- eigen(solve(WW) %*% BB)
eigVec   <- eigWinvB$vectors
eigVal   <- eigWinvB$values
p        <- nlevels(IVman)
My       <- colMeans(Ym1)
N        <- sum(Nj)

eigVal / sum(diag(solve(WW) %*% BB))
eigVal / sum(eigVal)

scl <- sqrt((N-p) / diag(t(eigVec) %*% WW %*% eigVec))
b0  <- -scl * t(eigVec) %*% My
(bk <- eigVec %*% diag(scl))
ld  <- sweep(Ym1 %*% bk, 2, b0, "+")
all.equal(ld, ldaPred$x, check.attributes=FALSE)

anova(lm(ld[ , 1] ~ IVman))
anova(lm(ld[ , 2] ~ IVman))

((N-p) / (p-1)) * eigVal

priorP <- rep(1/nlevels(IVman), nlevels(IVman))
ldaEq  <- lda(IVman ~ DV1 + DV2, prior=priorP, data=Ydf1)
predEq <- predict(ldaEq, Ydf1)
LDmat  <- predEq$x
ctrDf  <- aggregate(cbind(LD1, LD2) ~ IVman, FUN=mean, data=LDmat)
ctrLD  <- data.matrix(ctrDf[ , -1])

diffMat1 <- scale(LDmat, center=ctrLD[1,], scale=FALSE)
diffMat2 <- scale(LDmat, center=ctrLD[2,], scale=FALSE)
diffMat3 <- scale(LDmat, center=ctrLD[3,], scale=FALSE)
dst1     <- sqrt(diag(tcrossprod(diffMat1)))
dst2     <- sqrt(diag(tcrossprod(diffMat2)))
dst3     <- sqrt(diag(tcrossprod(diffMat3)))
dstMat   <- cbind(dst1, dst2, dst3)
dstPred  <- apply(dstMat, 1, which.min)
head(dstPred)
all(dstPred == unclass(predEq$class))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:MASS))
try(detach(package:mvtnorm))

####*--------------------------------------------------------------------------*
#### 12.9 Das allgemeine lineare Modell ----
##### 12.9.1 Modell der multiplen linearen Regression ----
##### 12.9.2 Modell der einfaktoriellen Varianzanalyse ----
contr.treatment(4)
solve(cbind(1, contr.treatment(4)))
getOption("contrasts")

contr.sum(4)
solve(cbind(1, contr.sum(4)))
options(contrasts=c("contr.sum",       "contr.poly"))
options(contrasts=c("contr.treatment", "contr.poly"))

IV   <- gl(3, 5)
(IVe <- C(IV, contr.sum))
contrasts(IV)

DV <- rnorm(15)
lm(DV ~ IV, contrasts=list(IV=contr.treatment))

##### 12.9.3 Modell der zweifaktoriellen Varianzanalyse ----
##### 12.9.4 Parameterschätzungen, Vorhersage und Residuen ----
##### 12.9.5 Hypothesen über parametrische Funktionen testen ----
##### 12.9.6 Lineare Hypothesen als Modellvergleiche formulieren ----
##### 12.9.7 Lineare Hypothesen testen ----
##### 12.9.8 Beispiel: Multivariate multiple Regression ----
## Daten erzeugen wie in 11.5
N      <- 100
height <- rnorm(N, 175, 7)
age    <- rnorm(N, 30, 8)
sport  <- abs(rnorm(N, 60, 30))
weight <- 0.5*height - 0.3*age - 0.4*sport + 10 + rnorm(N, 0, 3)
health <- -0.3*age + 0.6*sport + rnorm(N, 4)

## für Daten des Buches diese Datei laden:
# load("data_regrMultMV.Rdata")
Y  <- cbind(weight, health)
X  <- cbind(1, height, age, sport)
XR <- model.matrix(~ height + age + sport)
all.equal(X, XR, check.attributes=FALSE)

Xplus <- solve(t(X) %*% X) %*% t(X)
B     <- Xplus %*% Y
Pf    <- X %*% Xplus
Yhat  <- Pf %*% Y

fit <- lm(Y ~ height + age + sport)
all.equal(B,    coef(fit),   check.attributes=FALSE)
all.equal(Yhat, fitted(fit), check.attributes=FALSE)

X0  <- X[ , 1, drop=FALSE]
P0  <- X0 %*% solve(t(X0) %*% X0) %*% t(X0)

Id  <- diag(N)
WW  <- t(Y) %*% (Id - Pf) %*% Y

Xr1 <- X0
Pr1 <- P0

Xu1 <- X[ , c(1, 2)]
Pu1 <- Xu1 %*% solve(t(Xu1) %*% Xu1) %*% t(Xu1)
B1  <- t(Y) %*% (Pu1 - Pr1) %*% Y

Xr2 <- Xu1
Pr2 <- Pu1

Xu2 <- X[ , c(1, 2, 3)]
Pu2 <- Xu2 %*% solve(t(Xu2) %*% Xu2) %*% t(Xu2)
B2  <- t(Y) %*% (Pu2 - Pr2) %*% Y

Xr3 <- Xu2
Pr3 <- Pu2

Xu3 <- X
Pu3 <- Pf
B3  <- t(Y) %*% (Pu3 - Pr3) %*% Y

(WL1   <- det(WW) / det(B1 + WW))
(WL2   <- det(WW) / det(B2 + WW))
(WL3   <- det(WW) / det(B3 + WW))

(RLRl1 <- max(eigen(solve(WW) %*% B1)$values))
(RLRl2 <- max(eigen(solve(WW) %*% B2)$values))
(RLRl3 <- max(eigen(solve(WW) %*% B3)$values))

(RLRt1 <- max(eigen(solve(B1 + WW) %*% B1)$values))
(RLRt2 <- max(eigen(solve(B2 + WW) %*% B2)$values))
(RLRt3 <- max(eigen(solve(B3 + WW) %*% B3)$values))

(PBT1  <- sum(diag(solve(B1 + WW) %*% B1)))
(PBT2  <- sum(diag(solve(B2 + WW) %*% B2)))
(PBT3  <- sum(diag(solve(B3 + WW) %*% B3)))

(HLT1  <- sum(diag(solve(WW) %*% B1)))
(HLT2  <- sum(diag(solve(WW) %*% B2)))
(HLT3  <- sum(diag(solve(WW) %*% B3)))

## Kontrolle
summary(manova(fit), test="Wilks")
summary(manova(fit), test="Roy")
summary(manova(fit), test="Pillai")
summary(manova(fit), test="Hotelling-Lawley")

##### 12.9.9 Beispiel: Einfaktorielle MANOVA ----
## für Daten des Buches diese Datei laden:
# load("data_manova.Rdata")
XstarP <- cbind(as.numeric(IVman == 1),
                as.numeric(IVman == 2),
                as.numeric(IVman == 3))

Ct   <- contr.treatment(ncol(XstarP))
Xpm1 <- XstarP %*% Ct
X    <- cbind(1, Xpm1)
XR   <- model.matrix(~ IVman)
all.equal(X, XR, check.attributes=FALSE)

Pf  <- X  %*% solve(t(X)  %*% X)  %*% t(X)
Pu  <- Pf

X0  <- X[ , 1, drop=FALSE]
P0  <- X0 %*% solve(t(X0) %*% X0) %*% t(X0)
Pr  <- P0

(Bt <- coef(lm(Ym1 ~ IVman, contrasts=list(IVman=contr.treatment))))
(Mj <- aggregate(cbind(X1, X2) ~ IVman, data=data.frame(Ym1), FUN=mean))

Mj[2, c("X1", "X2")] - Mj[1, c("X1", "X2")]
Mj[3, c("X1", "X2")] - Mj[1, c("X1", "X2")]

(BstarPt <- Ct %*% Bt[-1, ])

(Be <- coef(lm(Ym1 ~ IVman, contrasts=list(IVman=contr.sum))))
Mj_num <- Mj[ , c("X1", "X2")]
colMeans(Mj_num)
scale(Mj_num, colMeans(Mj_num), scale=FALSE)

Ce       <- contr.sum(ncol(XstarP))
(BstarPe <- Ce %*% Be[-1, ])
colSums(BstarPe)

(Bcm <- coef(lm(Ym1 ~ IVman - 1)))

Yhat <- Pf %*% Ym1
unique(Yhat)

Id <- diag(sum(Nj))
BB <- t(Ym1) %*% (Pu-Pr) %*% Ym1
WW <- t(Ym1) %*% (Id-Pf) %*% Ym1
TT <- t(Ym1) %*% (Id-P0) %*% Ym1

all.equal(BB, (sum(Nj)-1) * cov((Pu-Pr) %*% Ym1))
all.equal(WW, (sum(Nj)-1) * cov((Id-Pf) %*% Ym1))
all.equal(TT, (sum(Nj)-1) * cov((Id-P0) %*% Ym1))
all.equal(TT, (sum(Nj)-1) * cov(Ym1))
all.equal(TT, BB + WW)

(WL   <- det(WW) / det(BB + WW))
(RLRl <- max(eigen(solve(WW)      %*% BB)$values))
(RLRt <- max(eigen(solve(BB + WW) %*% BB)$values))
(PBT  <- sum( diag(solve(BB + WW) %*% BB)))
(HLT  <- sum( diag(solve(WW)      %*% BB)))

## Kontrolle
fit <- manova(Ym1 ~ IVman)
summary(fit, test="Wilks")
summary(fit, test="Roy")
summary(fit, test="Pillai")
summary(fit, test="Hotelling-Lawley")

##### 12.9.10 Beispiel: Zweifaktorielle MANOVA ----
## für Daten des Buches diese Datei laden:
# load("data_manova.Rdata")
Xstar1  <- cbind(as.numeric(IV1 == 1),
                 as.numeric(IV1 == 2),
                 as.numeric(IV1 == 3))
Xstar2  <- cbind(as.numeric(IV2 == 1),
                 as.numeric(IV2 == 2))
Xstar12 <- cbind(as.numeric(IV1 == 1) * as.numeric(IV2 == 1),
                 as.numeric(IV1 == 2) * as.numeric(IV2 == 1),
                 as.numeric(IV1 == 3) * as.numeric(IV2 == 1),
                 as.numeric(IV1 == 1) * as.numeric(IV2 == 2),
                 as.numeric(IV1 == 2) * as.numeric(IV2 == 2),
                 as.numeric(IV1 == 3) * as.numeric(IV2 == 2))

C1  <- contr.treatment(ncol(Xstar1))
C2  <- contr.treatment(ncol(Xstar2))
C12 <- kronecker(C2, C1)

X1  <- Xstar1  %*% C1
X2  <- Xstar2  %*% C2
X12 <- Xstar12 %*% C12

X  <- cbind(1, X1, X2, X12)
XR <- model.matrix(~ IV1*IV2)
all.equal(X, XR, check.attributes=FALSE)

X0 <- X[ , 1, drop=FALSE]
P0 <- X0 %*% solve(t(X0) %*% X0) %*% t(X0)
Pf <- X  %*% solve(t(X)  %*% X)  %*% t(X)

Id <- diag(nrow(Ym2))
TT <- t(Ym2) %*% (Id - P0) %*% Ym2
WW <- t(Ym2) %*% (Id - Pf) %*% Ym2

Yhat <- Pf %*% Ym2
unique(Yhat)
aggregate(cbind(X1, X2) ~ IV1 + IV2, data=data.frame(Ym2), FUN=mean)

Xr1 <- X0
Pr1 <- P0

Xu1 <- X[ , c(1, 2, 3)]
Pu1 <- Xu1 %*% solve(t(Xu1) %*% Xu1) %*% t(Xu1)
B1  <- t(Ym2) %*% (Pu1 - Pr1) %*% Ym2

Xr2 <- Xu1
Pr2 <- Pu1

Xu2 <- X[ , c(1, 2, 3, 4)]
Pu2 <- Xu2 %*% solve(t(Xu2) %*% Xu2) %*% t(Xu2)
B2  <- t(Ym2) %*% (Pu2 - Pr2) %*% Ym2

Xr12 <- Xu2
Pr12 <- Pu2

Xu12 <- X
Pu12 <- Pf
B12  <- t(Ym2) %*% (Pu12 - Pr12) %*% Ym2

all.equal(TT, B1 + B2 + B12 + WW)

(WL1    <- det(WW) / det(B1  + WW))
(WL2    <- det(WW) / det(B2  + WW))
(WL12   <- det(WW) / det(B12 + WW))

(RLRl1  <- max(eigen(solve(WW) %*% B1 )$values))
(RLRl2  <- max(eigen(solve(WW) %*% B2 )$values))
(RLRl12 <- max(eigen(solve(WW) %*% B12)$values))

(RLRt1  <- max(eigen(solve(B1  + WW) %*% B1 )$values))
(RLRt2  <- max(eigen(solve(B2  + WW) %*% B2 )$values))
(RLRt12 <- max(eigen(solve(B12 + WW) %*% B12)$values))

(PBT1   <- sum(diag(solve(B1  + WW) %*% B1 )))
(PBT2   <- sum(diag(solve(B2  + WW) %*% B2 )))
(PBT12  <- sum(diag(solve(B12 + WW) %*% B12)))

(HLT1   <- sum(diag(solve(WW) %*% B1 )))
(HLT2   <- sum(diag(solve(WW) %*% B2 )))
(HLT12  <- sum(diag(solve(WW) %*% B12)))

## Kontrolle
fit <- manova(Ym2 ~ IV1 * IV2)
summary(fit, test="Wilks")
summary(fit, test="Roy")
summary(fit, test="Pillai")
summary(fit, test="Hotelling-Lawley")

##### 12.9.11 Beispiel: MANCOVA ----
## für Daten des Buches diese Datei laden:
# load("data_manova.Rdata")
IQ <- rnorm(sum(Nj), mean=100, sd=15)

Xp  <- cbind(as.numeric(IVman == 1),
             as.numeric(IVman == 2),
             as.numeric(IVman == 3))
Xpi <- cbind(as.numeric(IVman == 1) * IQ,
             as.numeric(IVman == 2) * IQ,
             as.numeric(IVman == 3) * IQ)

Ct    <- contr.treatment(ncol(Xp))
Xpm1  <- Xp  %*% Ct
Xpm1i <- Xpi %*% Ct
X     <- cbind(1, Xpm1, IQ, Xpm1i)
XR    <- model.matrix(~ IVman * IQ)
all.equal(X, XR, check.attributes=FALSE)

X0 <- X[ , 1, drop=FALSE]
P0 <- X0 %*% solve(t(X0) %*% X0) %*% t(X0)
Pf <- X  %*% solve(t(X)  %*% X)  %*% t(X)

Id <- diag(nrow(Ym1))
TT <- t(Ym1) %*% (Id - P0) %*% Ym1
WW <- t(Ym1) %*% (Id - Pf) %*% Ym1

Xr1 <- X0
Pr1 <- P0

Xu1 <- X[ , c(1, 2, 3)]
Pu1 <- Xu1 %*% solve(t(Xu1) %*% Xu1) %*% t(Xu1)
B1  <- t(Ym1) %*% (Pu1 - Pr1) %*% Ym1

Xr2 <- Xu1
Pr2 <- Pu1

Xu2 <- X[ , c(1, 2, 3, 4)]
Pu2 <- Xu2 %*% solve(t(Xu2) %*% Xu2) %*% t(Xu2)
B2  <- t(Ym1) %*% (Pu2 - Pr2) %*% Ym1

Xr12 <- Xu2
Pr12 <- Pu2

Xu12 <- X
Pu12 <- Pf
B12  <- t(Ym1) %*% (Pu12 - Pr12) %*% Ym1

all.equal(TT, B1 + B2 + B12 + WW)

(WL1    <- det(WW) / det(B1  + WW))
(WL2    <- det(WW) / det(B2  + WW))
(WL12   <- det(WW) / det(B12 + WW))

(RLRl1  <- max(eigen(solve(WW) %*% B1 )$values))
(RLRl2  <- max(eigen(solve(WW) %*% B2 )$values))
(RLRl12 <- max(eigen(solve(WW) %*% B12)$values))

(RLRt1  <- max(eigen(solve(B1  + WW) %*% B1 )$values))
(RLRt2  <- max(eigen(solve(B2  + WW) %*% B2 )$values))
(RLRt12 <- max(eigen(solve(B12 + WW) %*% B12)$values))

(PBT1   <- sum(diag(solve(B1  + WW) %*% B1 )))
(PBT2   <- sum(diag(solve(B2  + WW) %*% B2 )))
(PBT12  <- sum(diag(solve(B12 + WW) %*% B12)))

(HLT1   <- sum(diag(solve(WW) %*% B1 )))
(HLT2   <- sum(diag(solve(WW) %*% B2 )))
(HLT12  <- sum(diag(solve(WW) %*% B12)))

## Kontrolle
fit <- manova(Ym1 ~ IVman * IQ)
summary(fit, test="Wilks")
summary(fit, test="Roy")
summary(fit, test="Pillai")
summary(fit, test="Hotelling-Lawley")

####*--------------------------------------------------------------------------*
### 13 Vorhersagegüte prädiktiver Modelle ----
####*--------------------------------------------------------------------------*

####*--------------------------------------------------------------------------*
#### 13.1 Kreuzvalidierung von Regressionsmodellen ----
##### 13.1.1 k-fache Kreuzvalidierung ----
## für Daten des Buches diese Datei laden:
# load("data_regrMult.Rdata")
regrDf <- data.frame(weight, height, age, sport)
glmFit <- glm(weight ~ height + age + sport, data=regrDf,
              family=gaussian(link="identity"))

library(boot)
k    <- 3
kfCV <- cv.glm(data=regrDf, glmfit=glmFit, K=k)
kfCV$delta

doCV <- function(idxTst) {
    fitTrn  <- lm(weight ~ height + age + sport, data=regrDf,
                  subset=-idxTst)
    predTst <- predict(fitTrn, newdata=regrDf[idxTst, ])
    mean((predTst - regrDf$weight[idxTst])^2)
}

tstGrp <- sample(seq_len(N), N, replace=FALSE) %% k
## für Daten des Buches diese Datei laden:
# load("data_regrMult.Rdata")
foldsL <- split(seq_along(tstGrp), tstGrp)
(cveL  <- lapply(foldsL, doCV))
mean(unlist(cveL))

library(boot)
n_repl   <- 5
rep_kfCV <- replicate(n_repl,
                      cv.glm(data=regrDf, glmfit=glmFit, K=k),
                      simplify=FALSE)

(deltas <- vapply(rep_kfCV, function(x) { x$delta[2] }, numeric(1)))
mean(deltas)

##### 13.1.2 Leave-One-Out Kreuzvalidierung ----
## für Daten des Buches diese Datei laden:
# load("data_regrMult.Rdata")
LOOCV <- cv.glm(data=regrDf, glmfit=glmFit, K=N)
LOOCV$delta

lmFit <- lm(weight ~ height + age + sport, data=regrDf)
PRESS <- rstandard(lmFit, type="predictive")^2
mean(PRESS)

mean((residuals(lmFit) / (1-hatvalues(lmFit)))^2)

idx <- seq_len(N)
res <- sapply(idx, function(x) { doCV(idx == x) } )
mean(res)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:boot))

####*--------------------------------------------------------------------------*
#### 13.2 Kreuzvalidierung verallgemeinerter linearer Modelle ----
## für Daten des Buches diese Datei laden:
# load("data_ancova.Rdata")
glmLR <- glm(postFac ~ DVpre, family=binomial(link="logit"), data=dfAncova)

brierA <- function(y, pHat) {
    mean(((y == 1) * pHat)^2 + ((y == 0) * (1-pHat))^2)
}

library(boot)
B1 <- cv.glm(data=dfAncova, glmfit=glmLR, cost=brierA, K=10)
B1$delta

brierB <- function(y, pHat) {
    mean((y-pHat)^2)
}

set.seed(1234)
B2 <- cv.glm(data=dfAncova, glmfit=glmLR, cost=brierB, K=10)
B2$delta

set.seed(1234)
B3 <- cv.glm(data=dfAncova, glmfit=glmLR, K=10)
B3$delta

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:boot))

####*--------------------------------------------------------------------------*
#### 13.3 Kreuzvalidierung von Survival-Modellen ----

####*--------------------------------------------------------------------------*
#### 13.4 Bootstrap-Vorhersagefehler ----
## für Daten des Buches diese Datei laden:
# load("data_ancova.Rdata")
glmLR <- glm(postFac ~ DVpre, family=binomial(link="logit"), data=dfAncova)

getBSB <- function(dat, idx) {
    op <- options(warn=2)
    on.exit(options(op))

    bsFit <- try(glm(postFac ~ DVpre, family=binomial(link="logit"), subset=idx, data=dat))
    fail  <- inherits(bsFit, "try-error")
    if(fail || !bsFit$converged) {
        return(NA)
    } else {
        BbsTrn <- brierB(bsFit$y, predict(bsFit, type="response"))
        BbsTst <- brierB(as.numeric(dat$postFac)-1, predict(bsFit, newdata=dat, type="response"))
        return(BbsTrn - BbsTst)
    }
}

library(boot)
nR    <- 999
bsRes <- boot(dfAncova, statistic=getBSB, R=nR)
sum(is.na(bsRes$t))
(Btrain   <- brierB(glmLR$y, predict(glmLR, type="response")))
(optimism <- mean(bsRes$t, na.rm=TRUE))
(predErr  <- Btrain - optimism)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:boot))

####*--------------------------------------------------------------------------*
### 14 Diagramme erstellen ----
####*--------------------------------------------------------------------------*

####*--------------------------------------------------------------------------*
#### 14.1 Graphik Devices ----
##### 14.1.1 Aufbau und Verwaltung von Grafik-Devices ----
dev.new(); dev.new(); dev.new()
dev.list()
dev.cur()
dev.set(3)
dev.set(dev.next())
dev.off()
graphics.off()

##### 14.1.2 Grafiken speichern ----
pdf("pdf_test.pdf")
plot(1:10, rnorm(10))
dev.off()

plot(1:10, rnorm(10))
dev.copy(jpeg, filename="copied.jpg", quality=90)
graphics.off()

####*--------------------------------------------------------------------------*
#### 14.2 Streu- und Liniendiagramme ----
##### 14.2.1 Streudiagramme mit plot() ----
vec <- rnorm(10)
par(mfrow=c(2, 3), mar=c(3.5, 4, 4, 2)+0.1)
plot(vec, type="p", xlab=NA, main="type p", cex=1.5)
plot(vec, type="l", xlab=NA, main="type l", cex=1.5)
plot(vec, type="b", xlab=NA, main="type b", cex=1.5)
plot(vec, type="o", xlab=NA, main="type o", cex=1.5)
plot(vec, type="s", xlab=NA, main="type s", cex=1.5)
plot(vec, type="h", xlab=NA, main="type h", cex=1.5)

##### 14.2.2 Datenpunkte eines Streudiagramms identifizieren ----
vec <- rnorm(10)
plot(vec)
## identify(vec)

##### 14.2.3 Streudiagramme mit matplot() ----
vec <- seq(from=-2*pi, to=2*pi, length.out=50)
mat <- cbind(2*sin(vec), sin(vec-(pi/4)), 0.5*sin(vec-(pi/2)))
matplot(vec, mat, type="b", xlab=NA, ylab=NA, pch=1:3, main="Sinuskurven")

####*--------------------------------------------------------------------------*
#### 14.3 Diagramme formatieren ----
##### 14.3.1 Grafikelemente formatieren ----
par(mfrow=c(1, 2))
op <- par(col="gray60", lwd=2, pch=19)
plot(rnorm(10), main="Grau, fett, gefüllte Kreise")
par(op)
plot(rnorm(10), main="Standardformatierung")

X <- row(matrix(numeric(6*11), nrow=6, ncol=11))
Y <- col(matrix(numeric(6*11), nrow=6, ncol=11))

par(mar=c(1, 1, 4, 2))
plot(0:6, seq(1, 11, length.out=7), type="n", axes=FALSE,
     main="pch Datenpunkt-Symbole und lty Linientypen")
points(X[1:26], Y[1:26], pch=0:25, bg="gray", cex=2)
matlines(X[ , 6:11], Y[ , 6:11], lty=6:1, lwd=2, col="black")
text(X[1:26]-0.3, Y[1:26],    labels=0:25)
text(rep(0.7, 6), Y[1, 6:11], labels=6:1)
text(0, 9, labels="Linientypen für lty", srt=90, cex=1.4)
text(0, 3, labels="Symbole für pch",     srt=90, cex=1.4)

##### 14.3.2 Farben spezifizieren ----
palette("default")
rgb(0, 1, 1)
rgb(t(col2rgb("red")/255))
hsv(0.1666, 1, 1)
gray(0.5)
## palette(rainbow(10))

##### 14.3.3 Achsen formatieren ----

####*--------------------------------------------------------------------------*
#### 14.4 Säulen- und Punktdiagramme ----
##### 14.4.1 Einfache Säulendiagramme ----
dice  <- sample(1:6, 100, replace=TRUE)
## für Daten des Buches diese Datei laden:
# load("data_barplot.Rdata")
(dTab <- xtabs(~ dice))

par(mfrow=c(1, 2))
barplot(dTab, ylim=c(0, 30), xlab="Augenzahl", ylab="N", col="black",
        main="Absolute Häufigkeiten")

pTab    <- proportions(dTab)
(d_dice <- as.data.frame(pTab))

barplot(Freq ~ dice, data=d_dice,
        xlim=c(0, 0.25), horiz=TRUE,
        xlab="relative Häufigkeit",
        ylab="Augenzahl",
        col="gray50", main="Relative Häufigkeiten")

##### 14.4.2 Gruppierte und gestapelte Säulendiagramme ----
## für Daten des Buches diese Datei laden:
# load("data_barplot.Rdata")
series   <- rep(c("first", "second"), each=50)
(rollAll <- xtabs(~ series + dice))

barplot(rollAll, beside=FALSE, legend.text=TRUE, xlab="Augenzahl", ylab="N",
        main="Abs. Häufigkeiten in zwei Stichproben")

d_rolls <- as.data.frame(rollAll)
head(d_rolls, n=4)
barplot(Freq ~ series + dice, data=d_rolls,
        beside=TRUE, ylim=c(0, 15), col=c("red", "green"),
        legend.text=TRUE, xlab="Augenzahl", ylab="N",
        main="Abs. Häufigkeiten in zwei Stichproben")

N      <- 100
age    <- sample(18:45, N, replace=TRUE)
drinks <- c("beer", "red wine", "white wine")
pref   <- factor(sample(drinks, N, replace=TRUE))
xRange <- round(range(age), -1) + c(-10, 10)
lims   <- seq(xRange[1], xRange[2], by=10)
## für Daten des Buches diese Datei laden:
# load("data_barplot.Rdata")
spineplot(x=age, y=pref, xlab="Altersstufe", ylab="Getränk", breaks=lims,
          main="Bevorzugte Getränke in Altersstufe")

##### 14.4.3 Dotchart ----
Nj  <- 5
DV1 <- rnorm(Nj, 20, 2)
DV2 <- rnorm(Nj, 25, 2)
DV  <- c(DV1, DV2)
IV  <- gl(2, Nj)
Mj  <- tapply(DV, IV, FUN=mean)

dotchart(DV, gdata=Mj, pch=rep(c(17, 19), each=Nj),
         color=rep(c("red", "blue"), each=Nj),
         gcolor="black", labels=rep(LETTERS[1:Nj], 2), groups=IV,
         xlab="Messwerte", ylab="Gruppen", cex=1.3,
         main="Individuelle Ergebnisse und Mittel aus 2 Gruppen")

####*--------------------------------------------------------------------------*
#### 14.5 Elemente einem bestehenden Diagramm hinzufügen ----
##### 14.5.1 Koordinaten in einem Diagramm identifizieren ----
plot(rnorm(10))
## (xy <- locator(n=3))

##### 14.5.2 In beliebige Diagrammbereiche zeichnen ----
library(Hmisc)
par(xpd=NA, mar=c(5, 5, 2, 2))
plot(rnorm(10), xlab=NA, ylab=NA, pch=20)
pt1 <- cnvrt.coords(0, 0, input="fig")
pt1$usr
points(pt1$usr$x + 0.4, pt1$usr$y + 0.14, pch=4, lwd=5, cex=5, col="darkgray")
text(pt1$usr$x + 0.8, pt1$usr$y + 0.05, adj=c(0, 0),
     labels="Kreuz links-unten Figure-Region", cex=2)

pt2 <- cnvrt.coords(c(0.05, 0.95), c(0.95, 0.05), input="tdev")
pt2$usr
arrows(x0=pt2$usr$x[1], y0=pt2$usr$y[1],
       x1=pt2$usr$x[2], y1=pt2$usr$y[2], lwd=4, code=3, angle=90, lend=2,
       col="darkgray")
text(pt2$usr$x[1] + 0.5, pt2$usr$y[1], adj=c(0, 0),
     labels="Pfeil über gesamte Device-Region", cex=2)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:Hmisc))

##### 14.5.3 Punkte ----
xA <- seq(-15, 15, length.out=200)
yA <- sin(xA) / xA
plot(xA, yA, type="l", xlab="x", ylab="sinc(x)",
     main="Punkte und Linien einfügen", lwd=1.6)

xB <- seq(-15, 15, length.out=30)
yB <- sin(xB) / xB
points(xB, yB, col="red", pch=20)

##### 14.5.4 Linien ----
yC <- sin(pi * xA) / (pi * xA)
lines(xA, yC, col="blue", type="l", lwd=1.6)
abline(h=0, v=0, col="green", lwd=1.6)

height <- rnorm(20, 175, 7)
weight <- 0.5*height + 10 + rnorm(20, 0, 4)
fit    <- lm(weight ~ height)
pred   <- fitted(fit)
## für Daten des Buches diese Datei laden:
# load("data_addGridArrows.Rdata")

plot(weight ~ height, asp=1, col="blue", pch=16,
     main="Gitter, Segmente und Pfeile einfügen")
abline(fit, lwd=2)
grid(lwd=3, col="darkgray")
segments(x0=height, y0=pred, x1=height, y1=weight, col="gray")
arrows(x0=c(height[1]-3,   height[3]),
       y0=c(weight[1],     weight[3]+3),
       x1=c(height[1]-0.5, height[3]),
       y1=c(weight[1],     weight[3]+0.5),
       col="red", lwd=2)
arrows(x0=height[4]+0.1*(height[8]-height[4]),
       y0=weight[4]+0.1*(weight[8]-weight[4]),
       x1=height[4]+0.9*(height[8]-height[4]),
       y1=weight[4]+0.9*(weight[8]-weight[4]), code=3, col="red", lwd=2)

##### 14.5.5 Polygone ----
n      <- 7
len    <- 1/n
colsR  <- rep(seq(0.9, 0.2, length.out=n),  each=n)
colsG  <- rep(seq(0.9, 0.2, length.out=n), times=n)
cols   <- rgb(colsR, colsG, 0)
## als Graustufen
# grays  <- rep(1:n, n) - rep(0:(n-1), each=n)
# grays  <- ((grays-min(grays)) / (max(grays)-min(grays)) * 0.8) + 0.1
# cols   <- gray(grays)
xLeft  <- rep(seq(0,   1-len, by=len), times=n)
yBot   <- rep(seq(0,   1-len, by=len),  each=n)
xRight <- rep(seq(len, 1,     by=len), times=n)
yTop   <- rep(seq(len, 1,     by=len),  each=n)

par(mfrow=c(1, 2), oma=c(1, 1, 1, 1), mar=c(0, 0, 1, 0))
plot(c(0, 1), c(0, 1), axes=FALSE, type="n", asp=1, main="Farbverlauf")
rect(xLeft, yBot, xRight, yTop, border=NA, col=cols)
idx     <- c(10, 27)
xText   <- xLeft[idx] + (xRight[idx]-xLeft[idx])/2
yText   <- yBot[idx]  + (yTop[idx]  - yBot[idx])/2
colVals <- substring(cols, 2, nchar(cols)) # just the color values, minus the "#"
text(xText, yText, labels=colVals[idx])
shuffled <- sample(seq_along(cols), length(cols), replace=FALSE)
idxS     <- c(which(shuffled == idx[1]), which(shuffled == idx[2]))
plot(c(0, 1), c(0, 1), axes=FALSE, type="n", asp=1,
     main="Dieselben Farben zufällig angeordnet")
rect(xLeft, yBot, xRight, yTop, border=NA, col=cols[shuffled])
xTextS   <- xLeft[idxS] + (xRight[idxS]-xLeft[idxS])/2
yTextS   <- yBot[idxS]  + (yTop[idxS]  - yBot[idxS])/2
text(xTextS, yTextS, labels=colVals[idx])

mu    <- 0
sigma <- 3
xLims <- c(mu-4*sigma, mu+4*sigma)
X     <- seq(xLims[1], xLims[2], length.out=100)
Y     <- dnorm(X, mu, sigma)
selX  <- seq(mu-sigma, mu+sigma, length.out=100)
selY  <- dnorm(selX, mu, sigma)
cdf   <- pnorm(X, mu, sigma)

par(mar=c(5, 4, 4, 5))
plot(X, Y, type="n", xlim=xLims-c(-2, 2), xlab=NA, ylab=NA,
     main="Dichtefunktion und Verteilungsfunktion N(0, 3)")
box(which="plot", col="gray", lwd=2)
polygon(c(selX, rev(selX)), c(selY, rep(-1, length(selX))), border=NA, col="lightgray")
lines(X, Y, lwd=2)
par(new=TRUE)
plot(X, cdf, xlim=xLims-c(-2, 2), type="l", lwd=2, col="blue", xlab="x",
     ylab=NA, axes=FALSE)
axis(side=4, at=seq(0, 1, by=0.1), col="blue")
segments(x0=c(mu-sigma, mu, mu+sigma),
         y0=c(-1, -1, -1),
         x1=c(mu-sigma, mu, mu+sigma),
         y1=c(pnorm(mu-sigma, mu, sigma), pnorm(mu, mu, sigma), pnorm(mu+sigma, mu, sigma)),
         lwd=2, col=c("darkgreen", "red", "darkgreen"), lty=2)
segments(x0=c(mu-sigma, mu, mu+sigma),
         y0=c(pnorm(mu-sigma, mu, sigma), pnorm(mu, mu, sigma), pnorm(mu+sigma, mu, sigma)),
         x1=xLims[2]+10,
         y1=c(pnorm(mu-sigma, mu, sigma), pnorm(mu, mu, sigma), pnorm(mu+sigma, mu, sigma)),
         lwd=2, col=c("darkgreen", "red", "darkgreen"), lty=2)
arrows(x0=c(mu-sigma+0.2, mu+sigma-0.2), y0=-0.02,
       x1=c(mu-0.2, mu+0.2),             y1=-0.02,
       code=3, angle=90, length=0.05, lwd=2, col="darkgreen")
mtext(text="F(x)", side=4, line=3, cex=1.4)
rect(-8.5, 0.92, -5.5, 1.0, col="lightgray", border=NA)
text(-7.2, 0.9, labels="Wahrscheinlichkeit")
math_str <- "des~Intervalls~group('[',list(-sigma,sigma),']')"
text(-7.1, 0.86, str2expression(math_str))
text(mu-sigma/2, 0,    str2expression("sigma"), col="darkgreen", cex=1.2)
text(mu+sigma/2, 0,    str2expression("sigma"), col="darkgreen", cex=1.2)
text(mu+0.5,     0.02, str2expression("mu"),    col="red",       cex=1.2)

##### 14.5.6 Funktionsgraphen ----
mu    <- 0
sigma <- 2
curve(dnorm(x, mean=1, sd=1), from=-7, to=7, col="blue", lwd=2)
curve((1/(sigma*sqrt(2*pi))) * exp(-0.5*(((x-mu)/sigma)^2)), add=TRUE, lwd=2, lty=2)

##### 14.5.7 Text, legende und mathematische Formeln ----
title(main="zwei Normalverteilungskurven", sub="Untertitel")
legend(x="topleft", legend=c("N(1, 1)", "N(0, 2)"), col=c("blue", "black"),
       lty=c(1, 2))
text(x=3.6,  y=0.35, labels="Normalverteilung\nN(1, 1)")
text(x=-3.5, y=0.1 , labels="N(0, 2)")
mtext(text="Wahrscheinlichkeitsdichte", side=3)
math_str <- paste0('frac(1, sigma*sqrt(2*pi))',
                   '~ exp*bgroup("(", -frac(1, 2)',
                   '~ bgroup("(", frac(x-mu, sigma), ")")^2, ")")')
text(-4, 0.3, str2expression(math_str))

##### 14.5.8 Achsen ----
vec <- seq(from=-2*pi, to=2*pi, length.out=200)
mat <- cbind(sin(vec), cos(vec), tan(vec))
mat <- ifelse(abs(mat) > 2, NA, mat)

matplot(vec, mat, lwd=2, col=c(12, 14, 17), type="l", lty=c(1, 2, 4), xaxt="n",
        xlab=NA, ylab=NA, main="Trigonometrische Funktionen")
xTicks  <- seq(from=-2*pi, to=2*pi, by=pi/2)
xLabels <- c("-2*pi", "-3*pi/2", "-pi", "-pi/2", "0", "pi/2", "pi", "3*pi/2", "2*pi")
axis(side=1, at=xTicks, labels=xLabels)
abline(h=c(-1, 0, 1), v=seq(from=-3*pi/2, to=3*pi/2, by=pi/2), col="gray", lty=3, lwd=2)
abline(h=0, v=0, lwd=2)
legend(x="bottomleft", legend=c("sin(x)", "cos(x)", "tan(x)"), lty=1,
       col=c(12, 14, 17))

##### 14.5.9 Fehlerbalken ----
Nj <- c(15, 20, 18, 22)
P  <- length(Nj)
DV <- rnorm(sum(Nj), rep(c(30, 20, 25, 15), Nj), 6)
IV <- factor(rep(1:P, Nj))
Mj <- tapply(DV, IV, FUN=mean)
Sj <- tapply(DV, IV, FUN=sd)
ciWidths <- qt(0.975, df=Nj-1) * Sj / sqrt(Nj)

library(DescTools)
stripchart(DV ~ IV, method="jitter", xlab="Gruppe",
           main="Rohdaten und Konfidenzintervalle", xaxt="n", col="darkgray",
           ylim=c(0, 40), pch=16, vert=TRUE)
ErrBars(from=Mj-ciWidths, to=Mj+ciWidths, pos=1:P, length=0.1,
        col="blue", col.pch="blue", lwd=2, pch=19)
axis(side=1, at=1:P, labels=LETTERS[1:P])

Mj1  <- c(2, 3, 6, 3, 5)
Sj1  <- c(1.7, 1.8, 1.7, 1.9, 1.8)
Mj2  <- c(4, 3, 2, 1, 3)
Sj2  <- c(1.4, 1.7, 1.7, 1.3, 1.5)
Q    <- length(Mj1)
xOff <- 0.1
plot(c((1:Q)-xOff, (1:Q)+xOff), c(Mj1, Mj2), pch=19,
     main="Mittelwerte und SDs im 5x2 Design",
     xlab="Faktor A", ylab="Mittelwert",
     col=rep(c("blue", "red"), each=5), ylim=c(0, 8))
ErrBars(from=c(Mj1, Mj2) - c(Sj1, Sj2),
          to=c(Mj1, Mj2) + c(Sj1, Sj2),
        pos=c((1:Q)-xOff, (1:Q)+xOff),
        lty=rep(1:2, each=Q),
        col=rep(c("blue", "red"), each=Q),
        col.pch=rep(c("blue", "red"), each=Q))
legend(x="topleft", legend=c("B-1", "B-2"), pch=19, col=c("blue", "red"))

barsX <- barplot(height=Mj, ylim=c(0, 40), xaxt="n", xlab="Gruppe",
                 ylab="Mittelwert", main="Mittelwerte und Konfidenzintervalle")
axis(side=1, at=barsX, labels=LETTERS[1:P])
limHi <- Mj + ciWidths
limLo <- Mj - ciWidths
arrows(x0=barsX, y0=limLo, x1=barsX, y1=limHi, code=3, angle=90, length=0.1,
       col="blue")

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:DescTools))

##### 14.5.10 Rastergrafiken ----
pxSq  <- 6
colsR <- rep(0.4, pxSq^2)
colsG <- rep(seq(0, 1, length.out=pxSq), times=pxSq)
colsB <- rep(seq(0, 1, length.out=pxSq), each=pxSq)
arrSq <- array(c(colsR, colsG, colsB), c(pxSq, pxSq, 3))
sqIm  <- as.raster(arrSq)

pxG    <- 500
alpha  <- 0
beta   <- min(c(1-alpha, 1+alpha))
freq   <- 3.5
vals   <- rep(seq(-2*pi, 2*pi, length.out=pxG), pxG)
x      <- matrix(vals, nrow=pxG, byrow=TRUE)
y      <- matrix(vals, nrow=pxG, byrow=FALSE)
phi    <- alpha*x + beta*y
cosMat <- 0.5*cos(freq*phi) + 0.5

library(mvtnorm)
mu       <- c(0, 0)
sigma    <- diag(2)*10
gaussVal <- dmvnorm(cbind(c(x), c(y)), mu, sigma)
gaussMat <- matrix(gaussVal, nrow=pxG) / max(gaussVal)
gabIm    <- as.raster(cosMat*gaussMat)

plot(c(0, 1), c(0, 1), type="n", main="Bitmaps", xlab="", ylab="", asp=1)
rasterImage(sqIm,  0,   0,   0.3, 0.3, angle=0,  interpolate=FALSE)
rasterImage(gabIm, 0.5, 0.3, 1.1, 0.9, angle=10, interpolate=TRUE)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:mvtnorm))

####*--------------------------------------------------------------------------*
#### 14.6 Verteilungsdiagramme ----
##### 14.6.1 Histogramm und Schätzung der Dichtefunktion ----
height <- rnorm(100, 175, 7)
hist(height, xlab="height [cm]", ylab="N")

fromTo <- round(range(height), -1) + c(-10, 10)
limits <- seq(from=fromTo[1], to=fromTo[2], 5)
hist(height, freq=FALSE, xlim=fromTo, xlab="height [cm]", ylab="relative Häufigkeit",
     breaks=limits, main="Histogramm und Normalverteilung")
rug(jitter(height))
curve(dnorm(x, mean(height), sd(height)), lwd=2, col="blue", add=TRUE)

hist(height, freq=FALSE, xlim=fromTo, xlab="height [cm]",
     main="Histogramm und Dichte-Schätzung")
lines(density(height, bw="SJ"), lwd=2, col="blue")
rug(jitter(height))

##### 14.6.2 Stamm-Blatt-Diagramm ----
stem(rnorm(100, mean=175, sd=7))

##### 14.6.3 Boxplot ----
Nj <- 40
P  <- 3
DV <- rnorm(P*Nj, mean=100, sd=15)
IV <- gl(P, Nj, labels=c("Control", "Group A", "Group B"))
Mj <- tapply(DV, IV, FUN=mean)

boxplot(DV ~ IV, ylab="Score", col=c("red", "blue", "green"),
        main="Boxplots der Scores in 3 Gruppen")
points(1:P, Mj, pch=16, cex=2)

library(beeswarm)
DVrange <- round(range(DV), digits=-1)
boxplot(DV ~ IV, ylab="Score", col=c("red", "blue", "green"),
        main="Boxplots und Scores in 3 Gruppen",
        outline=FALSE, ylim=DVrange)

beeswarm(DV ~ IV, add=TRUE, pch=16, col="#00000077")

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:beeswarm))

##### 14.6.4 Stripchart ----
Nj   <- 25
P    <- 4
dice <- sample(1:6, P*Nj, replace=TRUE)
IV   <- gl(P, Nj)
## für Daten des Buches diese Datei laden:
# load("data_stripchart.Rdata")

stripchart(dice ~ IV, xlab="Augenzahl", ylab="Gruppe", pch=1,  col="blue",
           main="Würfelwürfe - 4 Gruppen", sub="jitter-Methode", method="jitter")
stripchart(dice ~ IV, xlab="Augenzahl", ylab="Gruppe", pch=16, col="red",
           main="Würfelwürfe - 4 Gruppen", sub="stack-Methode",  method="stack")

##### 14.6.5 Quantil-Quantil-Diagramm ----
##### 14.6.5.1 Vergleich zweier Stichproben ----
DV1 <- rnorm(200)
DV2 <- rf(100, df1=3, df2=15)

qqplot(DV1, DV2, xlab="Quantile N(0, 1)", ylab="Quantile F(3, 15)",
       main="Quantile N(0, 1) vs. F(3, 15)")

##### 14.6.5.2 Vergleich mit theoretischer Verteilung ----
cProb <- ppoints(height)
qTheo <- qt(cProb, df=10)
qEmp  <- sort(height)
plot(qTheo, qEmp, xlab="Quantile t-Verteilung", ylab="empirische Quantile",
     pch=20, main="Empirische ~ theoretische t-Quantile")

height <- rnorm(100, mean=175, sd=7)
qqnorm(height)
qqline(height, col="red", lwd=2)

##### 14.6.6 Empirische kumulierte Häufigkeitsverteilung ----
vec <- round(rnorm(10), 1)
Fn  <- ecdf(vec)
plot(Fn, main="Empirische kumulierte Häufigkeitsverteilung")
curve(pnorm, add=TRUE, col="gray", lwd=2)

##### 14.6.7 Kreisdiagramm ----
dice <- sample(1:6, 100, replace=TRUE)
dTab <- table(dice)

par(mar=c(1, 1, 4, 1))
pie(dTab, col=c("blue", "red", "yellow", "pink", "green", "orange"),
    main="Relative Häufigkeiten beim Würfeln")

dTabFreq <- proportions(dTab)
textRad  <- 0.5
angles   <- dTabFreq * 2 * pi
csAngles <- cumsum(angles)
csAngles <- csAngles - angles/2
textX    <- textRad * cos(csAngles)
textY    <- textRad * sin(csAngles)
text(x=textX, y=textY, labels=dTabFreq)

##### 14.6.8 Gemeinsame Verteilung zweier Variablen ----
N  <- 200
P  <- 2
x  <- rnorm(N, 100, 15)
y  <- 0.5*x + rnorm(N, 0, 10)
IV <- gl(P, N/P, labels=LETTERS[1:P])
## für Daten des Buches diese Datei laden:
# load("data_distr2var.Rdata")

plot(x, y, pch=c(4, 16)[unclass(IV)], lwd=2, col=c("black", "darkgray")[unclass(IV)],
     main="Gemeinsame Verteilung getrennt nach Gruppen")
legend(x="topleft", legend=c("Gruppe A", "Gruppe B"), pch=c(4, 16),
       col=c("black", "darkgray"))

library(car)
mat <- cbind(x, y)
ctr <- colMeans(mat)
plot(mat, xlab="x", ylab="y", asp=1,
     main="Gemeinsame Verteilung zweier Variablen")

ellipse(ctr, shape=cov(mat), radius=1, col="blue")
hullIdx <- chull(x, y)
polygon(x[hullIdx], y[hullIdx])
legend(x="bottomright", legend=c("Datenpunkte", "Zentroid", "Streuungsellipse",
       "Konvexe Hülle"), pch=c(1, 19, NA, NA),
       lty=c(NA, NA, 1, 1), col=c("black", "blue", "blue", "black"))

vec1 <- sample(1:10, 100, replace=TRUE)
vec2 <- sample(1:10, 100, replace=TRUE)
plot(vec2 ~ vec1, main="Punktwolke")
plot(jitter(vec2) ~ vec1, main="Punktwolke mit jitter")

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:car))
try(detach(package:carData))

####*--------------------------------------------------------------------------*
#### 14.7 Multivariate Daten visualisieren ----
##### 14.7.1 Höhenlinien und variable Datenpunktsymbole ----
mu    <- c(1, 3)
sigma <- matrix(c(1, 0.6, 0.6, 1), nrow=2)
rng   <- 2.5
N     <- 50
X     <- seq(from=mu[1] - rng*sigma[1, 1], to=mu[1] + rng*sigma[1, 1], length.out=N)
Y     <- seq(from=mu[2] - rng*sigma[2, 2], to=mu[2] + rng*sigma[2, 2], length.out=N)

library(mvtnorm)
genZ <- function(x, y) { dmvnorm(cbind(x, y), mu, sigma) }
matZ <- outer(X, Y, FUN="genZ")

contour(X, Y, matZ, main="Höhenlinien für 2D-NV Dichte")
filled.contour(X, Y, matZ, main="Farbige Höhenlinien")

N      <- 10
age    <- rnorm(N, 30, 8)
sport  <- abs(-0.25*age + rnorm(N, 60, 40))
weight <- -0.3*age -0.4*sport + 100 + rnorm(N, 0, 3)
wScale <- (weight-min(weight)) * (0.8 / abs(diff(range(weight)))) + 0.2
## für Daten des Buches diese Datei laden:
# load("data_symbols.Rdata")

symbols(age, sport, circles=wScale, inch=0.6, fg=NULL, bg=rainbow(N),
        main="Gewicht vs. Alter und Sport")

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:mvtnorm))

##### 14.7.2 Dreidimensionale Gitter und Streudiagramme ----
## für Daten des Buches diese Datei laden:
# load("data_persp.Rdata")
par(mar=c(2, 2, 4, 2) + 0.1)
persp(X, Y, matZ, xlab="x", ylab="y", zlab="Dichte", theta=5, phi=35,
      main="Dichte 2D Normalverteilung")

library(rgl)
vecX <- rep(seq(-10, 10, length.out=10), times=10)
vecY <- rep(seq(-10, 10, length.out=10),  each=10)
vecZ <- vecX*vecY
open3d(windowRect=c(34, 57, 290, 313))
plot3d(vecX, vecY, vecZ, main="3D Scatterplot", col="blue", type="h", aspect=TRUE,
       cex=1.7)
spheres3d(vecX, vecY, vecZ, col="red", radius=2)
grid3d(c("x", "y+", "z"))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:rgl))

##### 14.7.3 Matrix aus Streudiagrammen ----
N      <- 20
P      <- 2
IV     <- factor(rep(c("CG", "T"), each=N/P))
age    <- sample(18:35, N, replace=TRUE)
IQ     <- round(rnorm(N, mean=rep(c(100, 115), each=N/P), sd=15))
rating <- round(0.4*IQ - 30 + rnorm(N, 0, 10), 1)
score  <- round(-0.3*IQ + 0.7*age + rnorm(N, 0, 8), 1)
## für Daten des Buches diese Datei laden:
# load("data_pairs.Rdata")
mvDf   <- data.frame(IV, age, IQ, rating, score)

pairs(~ age + IQ + rating + score, main="Streudiagramm-Matrix", data=mvDf,
      pch=c(4, 16)[unclass(IV)], col=c("red", "blue")[unclass(IV)])

myHist <- function(x, ...) { par(new=TRUE); hist(x, ..., main="") }

library(car)
myEll  <- function(x, y, ...) {
    splLL <- split(data.frame(x, y), mvDf$IV)
    matCG <- data.matrix(splLL$CG)
    matT  <- data.matrix(splLL$T)
    ctrCG <- colMeans(matCG)
    ctrT  <- colMeans(matT)
    ellipse(ctrCG, shape=cov(matCG), col="red",  radius=1)
    ellipse(ctrT,  shape=cov(matT),  col="blue", radius=1)
}

## für Daten des Buches diese Datei laden:
# load("data_pairs.Rdata")
pairs(~ age + IQ + rating + score, diag.panel=myHist,
      upper.panel=myEll, main="Streudiagramm-Matrix", pch=16,
      col=c("red", "blue")[unclass(IV)])

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:car))
try(detach(package:carData))

#### 14.7.4 Heatmap ----
corMat <- cor(data.matrix(mvDf))
round(corMat, digits=3)

cmCol  <- rev(heat.colors(nrow(corMat)))
cmRamp <- colorRampPalette(cmCol)
cols   <- cmRamp(length(corMat))
heatmap(corMat, symm=TRUE, revC=TRUE, Rowv=NA, Colv=NA,
        col=cols, RowSideColors=cmCol,
        main="Heatmap einer Korrelationsmatrix")

####*--------------------------------------------------------------------------*
#### 14.8 Mehrere Diagramme in einem Grafik-Device darstellen ----
##### 14.8.1 layout() ----
(mat1 <- matrix(1:4, 2, 2))
layout(mat1)
par(lwd=3, cex=3)
layout.show(4)

layout(mat1, widths=c(1, 2), heights=c(1, 2))
par(lwd=3, cex=3)
layout.show(4)

layout(mat1, widths=c(1, 2), heights=c(1, 2))
barplot(table(round(rnorm(100))), horiz=TRUE, main="Barplot")
boxplot(rt(100, 5), main="Boxplot")
stripchart(sample(1:20, 40, replace=TRUE), method="stack", main="Stripchart")
pie(table(sample(1:6, 20, replace=TRUE)), main="Kreisdiagramm")

(mat2 <- matrix(c(1, 0, 1, 2), 2, 2))
layout(mat2)
stripchart(sample(1:20, 40, replace=TRUE), method="stack", main="Stripchart")
barplot(table(round(rnorm(100))), main="Säulendiagramm")

##### 14.8.2 par(mfrow, mfcol, fig) ----
dev.new(width=7, height=4)
par(mfrow=c(1, 2))
boxplot(rt(100, 5), xlab=NA, notch=TRUE, main="Boxplot")
plot(rnorm(10), pch=16, xlab=NA, ylab=NA, main="Streudiagramm")

resBinom <- rbinom(1000, size=10, prob=0.3)
facBinom <- factor(resBinom, levels=0:10)
tabBinom <- table(facBinom)

par(fig=c(0, 1, 0.10, 1))
plot(tabBinom, type="h", bty="n", xaxt="n", xlim=c(0, 10), xlab=NA, ylab="Häufigkeit",
     main="Ergebnisse 1000*10 Bernoulli Experimente (p=0.3)")
points(names(tabBinom), tabBinom, pch=16, col="gray", cex=2)
par(fig=c(0, 1, 0, 0.35), bty="n", new=TRUE)
boxplot(resBinom, horizontal=TRUE, ylim=c(0, 10), notch=TRUE, col="gray",
        xlab="Anzahl der Erfolge")

##### 14.8.3 split.screen() ----
splitMat <- rbind(c(0, 0.5, 0, 0.5), c(0.15, 0.85, 0.15, 0.85), c(0.5, 1, 0.5, 1))
split.screen(splitMat)
screen(1)
barplot(table(round(rnorm(100))), main="Säulendiagramm")
screen(2)
boxplot(sample(1:20, 100, replace=TRUE) ~ gl(4, 25, labels=LETTERS[1:4]),
        col=rainbow(4), notch=TRUE, main="Boxplot")
screen(3)
plot(sample(1:20, 40, replace=TRUE), pch=20, xlab=NA, ylab=NA,
     main="Streudiagramm")
close.screen(all.screens=TRUE)

####*--------------------------------------------------------------------------*
### 15 Diagramme mit ggplot2 ----
####*--------------------------------------------------------------------------*

#### 15.1 Grundprinzip ----
##### 15.1.1 Grundschicht ----
Njk    <- c(20, 38, 25, 33, 22, 42)
sex    <- sample(factor(rep(c("f", "m"), times=c(78, 102))))
group  <- sample(factor(rep(c("cntrl", "placebo", "treat"),
                            times=c(77, 43, 60))))

sgComb <- interaction(sex, group)
mood   <- round(rnorm(sum(Njk), mean=c(85, 80, 110, 90, 130, 100)[sgComb],
                      sd=25))

height <- rnorm(sum(Njk), mean=c(170, 180)[sex], sd=7)
rating <- factor(rbinom(sum(Njk), size=2,
                        prob=c(c(0.1, 0.25, 0.4, 0.6, 0.75, 0.9))[sgComb]),
                 levels=0:2,
                 labels=c("Disagree", "Neutral", "Agree"))

myDf   <- data.frame(sex, group, sgComb, mood, height, rating)

library(ggplot2)
ggplot(myDf, aes(x=height, y=mood, colour=sex, shape=group))

##### 15.1.2 Diagramme speichern ----

####*--------------------------------------------------------------------------*
#### 15.2 Diagrammtypen ----
##### 15.2.1 Punkt- und Streudiagramm ----
## für Daten des Buches diese Datei laden:
# load("data_ggplot.Rdata")
ggplot(myDf, aes(x=height, y=mood, colour=sex, shape=group)) +
    geom_point(size=3) +
    theme(legend.position="bottom")

(groupN  <- as.data.frame(xtabs(~ group, data=myDf)))
(groupM  <- aggregate(mood ~ group, data=myDf, FUN=mean))
(groupNM <- merge(groupN, groupM, by="group"))

ggplot(groupNM, aes(x=group, y=mood, size=Freq)) +
    geom_point() +
    scale_size_area(max_size=15)

##### 15.2.2 Linien- und Flächendiagramm ----
## für Daten des Buches diese Datei laden:
# load("data_ggplot.Rdata")
(sex_groupM <- aggregate(mood ~ sex + group, data=myDf, FUN=mean))
ggplot(sex_groupM, aes(x=group, y=mood, color=sex, shape=sex, group=sex)) +
    geom_point(size=2) +
    geom_line(linewidth=0.8)

ggplot(sex_groupM, aes(x=group, y=mood, color=sex, fill=sex,
                       shape=sex, group=sex)) +
    geom_area(position=position_identity(), alpha=0.4) +
    geom_point(size=3) +
    geom_line(linewidth=0.8)

##### 15.2.3 Säulendiagramm ----
## für Daten des Buches diese Datei laden:
# load("data_ggplot.Rdata")
ggplot(myDf, aes(x=rating)) +
    geom_bar()

ggplot(myDf, aes(x=rating)) +
    geom_bar(aes(y=after_stat(count / sum(count)))) +
    labs(y="Relative Frequency")

(myDf_freq <- as.data.frame(xtabs(~ rating, data=myDf)))

ggplot(myDf_freq, aes(x=Freq, y=rating)) +
    geom_col()

ggplot(myDf, aes(x=rating, group=sex, fill=sex)) +
    geom_bar(position=position_stack())

ggplot(myDf, aes(x=rating, group=sex, fill=sex)) +
    geom_bar(aes(y=after_stat(count / sum(count))),
             position=position_fill()) +
    labs(y="Relative Frequency")

tab_freq  <- xtabs(~ sex + rating, data=myDf)
(tab_crel <- proportions(tab_freq, margin=1))
myDf_crel <- as.data.frame(tab_crel) |>
    transform(Freq_round=round(Freq, 2))

ggplot(myDf_crel, aes(x=sex, y=Freq, group=rating, fill=rating)) +
    geom_col(position=position_dodge(width=0.8), width=0.8)

##### 15.2.4 Histogramm und Kerndichteschätzer ----
## für Daten des Buches diese Datei laden:
# load("data_ggplot.Rdata")
ggplot(myDf, aes(x=mood)) +
    geom_histogram(bins=30)

ggplot(myDf, aes(x=mood)) +
    geom_histogram(aes(y=after_stat(density))) +
    geom_density(color="darkgrey", fill="grey", alpha=0.6)

ggplot(myDf, aes(x=mood, y=height)) +
    geom_density2d() +
    geom_point()
    
ggplot(myDf, aes(x=mood, y=height)) +
    geom_density2d_filled() +
    geom_point() +
    theme(legend.position="none")

##### 15.2.5 Boxplot ----
## für Daten des Buches diese Datei laden:
# load("data_ggplot.Rdata")
ggplot(myDf, aes(x=1, y=mood)) +
    geom_boxplot() +
    theme(axis.title.x=element_blank(),
          axis.text.x =element_blank())

ggplot(myDf, aes(x=sex, y=height, fill=sex)) +
    geom_boxplot(notch=TRUE) +
    theme(legend.position="none")

library(ggbeeswarm)
ggplot(myDf, aes(x=sex, y=height, fill=sex)) +
    geom_boxplot(width=0.5, outlier.shape=NULL) +
    geom_beeswarm(size=2, alpha=0.5, cex=2) +
    theme(legend.position="none")

ggplot(myDf, aes(x=mood, y=group, fill=group)) +
    geom_boxplot() +
    geom_violin(alpha=0.5) +
    theme(legend.position="none")

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:ggbeeswarm))

##### 15.2.6 Quantil-Quantil Diagramm ----
## für Daten des Buches diese Datei laden:
# load("data_ggplot.Rdata")
ggplot(myDf, aes(sample=height)) +
    geom_qq() +
    geom_qq_line(color="blue")

parL <- list(df=10, ncp=0)
ggplot(myDf, aes(sample=height)) +
    geom_qq(     distribution=qt, dparams=parL) +
    geom_qq_line(distribution=qt, dparams=parL, color="blue")

probs <- ppoints(50)
qDat  <- data.frame(qmood=quantile(myDf$mood, probs),
                    qheight=quantile(myDf$height, probs))

head(qDat)
ggplot(qDat, aes(x=qmood, y=qheight)) +
    geom_point() +
    labs(x="Quantiles mood", y="Quantiles height")

####*--------------------------------------------------------------------------*
#### 15.3 Bedingte Diagramme in Panels darstellen ----
## für Daten des Buches diese Datei laden:
# load("data_ggplot.Rdata")
ggplot(myDf, aes(x=sex, y=height, fill=sex)) +
    geom_boxplot() +
    facet_grid(~ group) +
    theme(legend.position="none")

ggplot(myDf, aes(x=height, y=mood)) +
    geom_point() +
    facet_wrap(~ sex + group, ncol=2, scales="free_y") +
    guides(x=guide_axis(angle=90))

p1 <- ggplot(myDf, aes(x=sex, y=mood)) +
    geom_violin()

p2 <- ggplot(myDf, aes(x=rating)) +
    geom_bar() +
    guides(x=guide_axis(angle=90))

library(patchwork)
(p1 + p2) + plot_layout(ncol=2, widths=c(1, 2)) +
    plot_annotation(tag_levels="a",
                    tag_suffix=")",
                    "Titel der zusammengesetzten Abbildung")

p3 <- ggplot(myDf, aes(x=height)) +
    geom_density()

(p1 | (p2 / p3)) +
    plot_annotation(tag_levels="I", tag_suffix=".")

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:patchwork))

####*--------------------------------------------------------------------------*
#### 15.4 Diagrammelemente hinzufügen ----

####*--------------------------------------------------------------------------*
#### 15.5 Geometrische Grundformen ----

####*--------------------------------------------------------------------------*
#### 15.6 Besondere Diagrammelemente ----
## für Daten des Buches diese Datei laden:
# load("data_ggplot.Rdata")
sex_groupN     <- as.data.frame(xtabs(~ sex + group, data=myDf))
sex_groupM     <- aggregate(mood ~ sex + group, data=myDf, FUN=mean)
sex_groupSD    <- aggregate(mood ~ sex + group, data=myDf, FUN=sd)
sex_groupMSD   <- merge(sex_groupM,   sex_groupSD, by=c("sex", "group"),
                        suffixes=c(".M", ".SD"))

sex_groupMSDN  <- merge(sex_groupMSD, sex_groupN,  by=c("sex", "group"))
(sex_groupMSDN <- transform(sex_groupMSDN,
                            SEMlo=mood.M - mood.SD/sqrt(Freq),
                            SEMup=mood.M + mood.SD/sqrt(Freq)))

ggplot(sex_groupMSDN,
       aes(x=group, y=mood.M, ymin=SEMlo, ymax=SEMup,
           color=sex, shape=sex, group=sex)) +
    geom_point(size=3) +
    geom_linerange() +
    geom_line()

####*--------------------------------------------------------------------------*
#### 15.7 Symbole und mathematische Formeln ----
ggplot(myDf, aes(x=height, y=mood)) +
    geom_point() +
    geom_vline(xintercept=170, color="red", linewidth=2, linetype=2) +
    annotate("text", x=185, y=160, label="frac(pi^2, sigma*sqrt(2))",
             parse=TRUE, size=5) +
    annotate("rect", xmin=180, xmax=195, ymin=50, ymax=125,
             alpha=0.2, fill="blue") +
    annotate("segment", x=150, y=150, xend=155, yend=75,
             arrow=arrow(type="closed"), linewidth=2, color="darkgreen")

lmF <- summary(lm(mood ~ height, data=myDf, subset=(sex == "f")))
lmM <- summary(lm(mood ~ height, data=myDf, subset=(sex == "m")))

lmDf <- data.frame(sex=c("f", "m"),
                   R2=c(lmF$r.squared, lmM$r.squared)) |>
    transform(R2_label=sprintf("R2 = %.3f", R2))

myDf_lm <- merge(myDf, lmDf, by="sex")

title_xpr <- expression(paste("mood ~ height getrennt nach Geschlecht mit ",
                              R^2))

ggplot(myDf_lm, aes(x=height, y=mood)) +
    geom_vline(aes(xintercept=180), linetype=2) +
    geom_point(size=3) +
    geom_smooth(method=lm, se=TRUE, linewidth=1.2, fullrange=TRUE) +
    facet_grid(sex ~ .) +
    labs(title=title_xpr) +
    geom_text(aes(x=200, y=155, label=R2_label),
              size=7, hjust="right", show.legend=FALSE) +
    annotate("text", x=150, y=35, size=7, label="Annotation")

####*--------------------------------------------------------------------------*
#### 15.8 Diagramme formatieren ----
##### 15.8.1 Elemenposition kontrollieren ----
## für Daten des Buches diese Datei laden:
# load("data_ggplot.Rdata")
ggplot(myDf, aes(x=mood, fill=sex)) +
    geom_histogram(aes(y=after_stat(density)),
                   position=position_dodge())

ggplot(myDf, aes(x=rating, group=sex, fill=sex)) +
    geom_bar(stat="count",
             aes(y=after_stat(count / sum(count))),
             position=position_fill())

##### 15.8.2 Achsen anpassen ----
## für Daten des Buches diese Datei laden:
# load("data_ggplot.Rdata")
ggplot(myDf, aes(x=mood, y=height)) +
    geom_point() +
    scale_x_continuous(transform="log10")

ggplot(myDf, aes(x=mood, y=height)) +
    geom_point() +
    coord_trans(x="log10")

ggplot(myDf_crel, aes(x=sex, y=Freq, label=Freq_round,
                      group=rating, fill=rating)) +
    geom_col(position=position_dodge(width=0.9)) +
    geom_text(position=position_dodge(width=0.9),
              vjust=-0.8) +
    scale_y_continuous(limits=c(0, 0.4),
                       labels=scales::label_percent())

levels(myDf$rating)
myDf$rating <- ordered(myDf$rating,
                       levels=rev(levels(myDf$rating)))

ggplot(myDf, aes(x=rating, group=sex, fill=sex)) +
    geom_bar(stat="count",
             aes(y=after_stat(count / sum(count))),
             position=position_fill()) +
    labs(x="Rating category", y="Cumulative relative frequency") +
    guides(x=guide_axis(angle=90))

ggplot(myDf, aes(x=height, y=mood, colour=sex, shape=group)) +
    geom_point(size=3) +
    scale_x_continuous(limits=c(140, 200),
                       expand=c(0, 0),
                       breaks=seq(140, 200, by=5)) +
    scale_y_continuous(n.breaks=8) +
    guides(x=guide_axis(angle=90))

##### 15.8.3 Legende ändern ----
## für Daten des Buches diese Datei laden:
# load("data_ggplot.Rdata")
legend_xpr <- expression(paste("Sex/Group", ~hat(sigma)^2))
ggplot(myDf, aes(x=height, y=mood, colour=sex:group, shape=sex)) +
    geom_vline(aes(xintercept=180), linetype=2) +
    geom_point(size=3) +
    geom_smooth(method="lm", se=TRUE, linewidth=1.2, fullrange=TRUE) +
    facet_grid(sex ~ group) +
    labs(title="mood ~ height getrennt nach Geschlecht + Gruppe") +
    geom_text(aes(x=190, y=70, label=sgComb), size=7,
              hjust="right", show.legend=FALSE) +
    annotate("text", x=165, y=35, size=7, label="Annotation") +
    guides(shape="none",
           colour=guide_legend(title=legend_xpr)) +
    theme(legend.position="bottom")

##### 15.8.4 Farben, Datenpunktsymbole und Linientypen ----
## für Daten des Buches diese Datei laden:
# load("data_ggplot.Rdata")
ggplot(sex_groupM, aes(x=group, y=mood, color=sex, shape=sex, group=sex)) +
    geom_point(size=8, stroke=2) +
    geom_line(linewidth=2, linetype="dashed") +
    scale_shape_discrete(solid=FALSE)

library(colorspace)
ggplot(myDf, aes(x=mood, fill=group)) +
    geom_histogram(aes(y=after_stat(density)), alpha=0.5) +
    geom_density(alpha=0.7) +
    scale_fill_discrete_qualitative()

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:colorspace))

##### 15.8.5 Aussehen im Detail verändern ----
## für Daten des Buches diese Datei laden:
# load("data_ggplot.Rdata")
library(colorspace)
ggplot(myDf, aes(x=mood, fill=group)) +
    geom_histogram(aes(y=after_stat(density)), alpha=0.5) +
    geom_density(alpha=0.7) +
    scale_fill_discrete_qualitative() +
    theme_bw()

ggplot(myDf, aes(y=sex, x=height, fill=sex)) +
    geom_boxplot() +
    theme_minimal() +
    labs(title="Schriftgröße anpassen") +
    theme(axis.title  =element_text(size=rel(1.4)),
          axis.text   =element_text(size=rel(1.4)),
          legend.text =element_text(size=rel(1.4)),
          legend.title=element_text(size=rel(1.4)),
          plot.title  =element_text(size=rel(1.4)))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:ggplot2))
try(detach(package:colorspace))

####*--------------------------------------------------------------------------*
### 16 Numerische Methoden ----
####*--------------------------------------------------------------------------*

####*--------------------------------------------------------------------------*
#### 16.1 Daten interpolieren und glätten ----
##### 16.1.1 Lineare Interpolation ----
xOne     <- 1:9                                       # x-Koordinaten
yOne     <- rnorm(9)                                  # y-Koordinaten
ptsLin   <- approx(xOne, yOne, method="linear")       # linear
ptsConst <- approx(xOne, yOne, method="constant")     # konstant

## für Daten des Buches diese Datei laden:
# load("data_interp.Rdata")
plot(xOne, yOne, xlab=NA, ylab=NA, pch=19, main="Datenpunkte interpolieren",
     cex=1.5)
points(ptsLin,   pch=16, cex=1.5, col="red")
points(ptsConst, pch=22, cex=1.5, col="blue", lwd=2)
legend(x="topright", c("Daten", "linear", "konstant"), pch=c(19, 16, 22),
       col=c("black", "red", "blue"), cex=1.8)

##### 16.1.2 Splines ----
plot(xOne, yOne, xlab=NA, ylab=NA, pch=19, main="Splines", cex=1.5)
ptsSpline <- spline(xOne, yOne, n=201)
smSpline1 <- smooth.spline(xOne, yOne, spar=0.25)
smSpline2 <- smooth.spline(xOne, yOne, spar=0.35)
smSpline3 <- smooth.spline(xOne, yOne, spar=0.45)
ptsX      <- seq(1, 9, length.out=201)
ptsSmSpl1 <- predict(smSpline1, ptsX)
ptsSmSpl2 <- predict(smSpline2, ptsX)
ptsSmSpl3 <- predict(smSpline3, ptsX)
lines(ptsSpline, col="darkgray", lwd=2)
matlines(x=ptsX, y=cbind(ptsSmSpl1$y, ptsSmSpl2$y, ptsSmSpl3$y),
         col=c("blue", "green", "orange"), lty=1, lwd=2.5)
legend(x="topright", c("Daten", "Spline", "spar=0.25", "spar=0.35", "spar=0.45"),
       pch=c(19, NA, NA, NA, NA), lty=c(NA, 1, 1, 1, 1),
       col=c("black", "darkgray", "blue", "green", "orange"),
       lwd=c(NA, 3, 3, 3, 3), cex=1.8)

xTwo <- rnorm(100)                                   # x-Koordinaten
yTwo <- 0.4 * xTwo + rnorm(100, 0, 1)                # y-Koordinaten
ord  <- order(xTwo)
idx  <- seq(8, 88, by=20)
xspline(xTwo[ord][idx], yTwo[ord][idx], c(1, -1, -1, 1, 1), border="darkgreen",
        lwd=2, open=FALSE)

##### 16.1.3 LOESS-Glätter ----
## für Daten des Buches diese Datei laden:
# load("data_interp.Rdata")
ptsL1 <- loess.smooth(xTwo, yTwo, span=1/3)
ptsL2 <- loess.smooth(xTwo, yTwo, span=2/3)

plot(xTwo, yTwo, xlab=NA, ylab=NA, pch=16, main="Geglättetes Streudiagramm")
lines(ptsL1, lwd=2, col="red")
lines(ptsL2, lwd=2, col="blue")
legend(x="topleft", c("Daten", "Loess span 1/3", "Loess span 2/3", "xspline"),
       pch=c(19, NA, NA, NA), lty=c(NA, 1, 1, 1), bg="white",
       col=c("black", "red", "blue", "darkgreen"), lwd=c(NA, 3, 3, 3), cex=1.8)

zTwo      <- -0.5 * xTwo + rnorm(100, 0, 0.5)
d_xyz     <- data.frame(x=xTwo, y=yTwo, z=zTwo)
loess_xyz <- loess(z ~ x + y, data=d_xyz)
x_grid    <- seq(from=min(xTwo), to=max(xTwo), length.out=50)
y_grid    <- seq(from=min(yTwo), to=max(yTwo), length.out=50)
d_grid    <- expand.grid(x=x_grid, y=y_grid)

library(rgl)
d_loess_pred <- predict(loess_xyz, newdata=d_grid, se=TRUE)
plot3d(xTwo, yTwo, zTwo, type="s", size=0.75, lit=FALSE, col="red")
points3d(xTwo, yTwo, zTwo, size=5, col="red")
surface3d(x_grid, y_grid, d_loess_pred[[1]],
          alpha=0.4, front="lines", back="lines")

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:rgl))

##### 16.1.4 Nonparametrische Kerndichteschätzer ----

####*--------------------------------------------------------------------------*
#### 16.2 Nullstellen finden ----
pHoyt <- function(q, qpar, omega) {
    alphaQ <- (sqrt((1 - qpar^4))/(2*qpar)) * sqrt((1 + qpar)/(1 - qpar))
     betaQ <- (sqrt((1 - qpar^4))/(2*qpar)) * sqrt((1 - qpar)/(1 + qpar))

    y <- q / sqrt(omega)
    pchisq((alphaQ*y)^2, df=2, ncp=( betaQ*y)^2) -
    pchisq(( betaQ*y)^2, df=2, ncp=(alphaQ*y)^2)
}

f <- function(x, p, qpar, omega) {
    pHoyt(x, qpar=qpar, omega=omega) - p
}

qHoyt <- function(p, qpar, omega) {
    uniroot(f, interval=c(0, omega), extendInt="upX",
            p=p, qpar=qpar, omega=omega)$root
}

qHoyt(p=0.7, qpar=0.5, omega=10)

U  <- runif(1000)
rh <- sapply(U, function(x) { qHoyt(p=x, qpar=0.5, omega=10) })

plot(ecdf(rh), col="blue")
curve(pHoyt(x, qpar=0.5, omega=10), from=0, to=10, add=TRUE)

####*--------------------------------------------------------------------------*
#### 16.3 Numerisch integrieren und differenzieren ----
##### 16.3.1 Numerisch integrieren ----
integrate(dnorm, lower=-Inf, upper=1, mean=1, sd=2)$value
pnorm(1, mean=1, sd=2)

dGauss <- function(x, mu=0, sigma=1) {
    (1/(sigma*sqrt(2*pi))) * exp(-0.5 * ((x-mu)/sigma)^2)
}

pGauss <- function(x, mu=0, sigma=1) {
    integrate(dGauss, lower=-Inf, upper=x, mu=mu, sigma=sigma)$value
}

f <- function(x, p, mu=0, sigma=1) {
    pGauss(x, mu=mu, sigma=sigma) - p
}

qGauss <- function(u, mu=0, sigma=1) {
    interval <- c(mu-10*sigma, mu+10*sigma)
    uniroot(f, interval=interval, extendInt="yes", p=u, mu=mu, sigma=sigma)$root
}

U <- runif(5)
sapply(U, qGauss, mu=0, sigma=1)
qnorm(U, mean=0, sd=1)

##### 16.3.2 Numerisch differenzieren ----
library(numDeriv)
x <- seq(-2, 2, length.out=5)
grad(pnorm, x, mean=0, sd=1)
dnorm(x, mean=0, sd=1)

grad(exp, x)
exp(x)

N  <- 100
X  <- rnorm(N, 0, 2)
mu <- exp(1 + 0.5*X)
Y  <- rpois(N, mu)
glmFit <- glm(Y ~ X, family=poisson(link="log"))

# Spaltenvektor der maximum-likelihood Schätzer der Parameter
(bML <- cbind(coef(glmFit)))
f    <- function(b) { b[1] / b[2] }
f(bML)

(bGrad    <- grad(f, bML))
(SEdelta <- sqrt(t(bGrad) %*% vcov(glmFit) %*% bGrad))
(CIdelta <- c(lo=f(bML) - c(qnorm(1-(0.05/2))*SEdelta),
              up=f(bML) + c(qnorm(1-(0.05/2))*SEdelta)))

nllPois <- function(b, X, Y) {
    mu <- exp(b[1] + b[2]*X)
    -sum(Y*log(mu) - mu - lgamma(Y+1))
}

h <- hessian(nllPois, x=bML, X=X, Y=Y)    # Hesse-Matrix
solve(h)
vcov(glmFit)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:numDeriv))

####*--------------------------------------------------------------------------*
#### 16.4 Numerisch optimieren ----
##### 16.4.1 Maximum-Likelihood-Parameterschätzung ----
library(MASS)
X <- rweibull(100, shape=1.5, scale=100)
fitdistr(X, densfun="weibull", start=list(shape=1, scale=50))

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:MASS))

##### 16.4.2 Allgemeine Optimierung ----
DV <- rnorm(50, mean=0, sd=1)
mean(DV)
sd(DV)

nCls   <- 4
limits <- qnorm(seq(from=1/nCls, to=(nCls-1)/nCls, length.out=nCls-1),
                mean=0, sd=1)

breaks   <- c(-Inf, limits, Inf)
DVcut    <- cut(DV, breaks=breaks)
observed <- table(DVcut)
minFunMin <- function(param, brks, obs) {
    if(param[2] < 1e-4) { return(NA_real_) }
    probs    <- diff(pnorm(brks, mean=param[1], sd=param[2]))
    expected <- sum(obs) * probs        # erwartete Häufigkeiten
    sum((obs-expected)^2 / expected)    # chi^2 Teststatistik
}

resMinChisq <- optim(c(mean(DV), sd(DV)), minFunMin,
                     brks=breaks, obs=observed, gr=NULL, method="BFGS")
resMinChisq$par                         # min-chi^2-Schätzer

minFunGML <- function(param, brks, obs) {
    if(param[2] < 1e-4) { return(NA_real_) }
    probs <- diff(pnorm(brks, mean=param[1], sd=param[2]))
    -dmultinom(obs, size=sum(obs), prob=probs, log=TRUE)
}

resGrML <- optim(c(mean(DV), sd(DV)), minFunGML,
                 brks=breaks, obs=observed, gr=NULL, method="BFGS")
resGrML$par

####*--------------------------------------------------------------------------*
### 17 R als Programmiersprache ----
####*--------------------------------------------------------------------------*

####*--------------------------------------------------------------------------*
#### 17.1 Kontrollstrukturen ----
##### 17.1.1 Fallunterscheidungen ----
(x <- round(rnorm(1, mean=100, sd=15)))
y <- NA
y <- if(x > 100) { TRUE }
y

x <- round(rnorm(1, mean=100, sd=15))
text_out <- if (x > 100) {
    cat("x is", x, "(greater than 100)\n")
} else {
    cat("x is", x, "(100 or less)\n")
}

text_out

(day <- sample(c("Mon", "Tue", "Wed"), size=1))
if(day == "Mon") {
    print("The day is Monday")
} else {
    if(day == "Tue") {
        print("The day is Tuesday")
    } else {
        print("The day is neither Monday nor Tuesday")
    }
}

text_out <- if(day == "Mon") {
    print("The day is Monday")
} else if(day == "Tue") {
    print("The day is Tuesday")
} else {
    print("The day is neither Monday nor Tuesday")
}

text_out

(val <- sample(1:3, size=1))
switch(val, print("val is 1"), print("val is 2"), print("val is 3"))
myCalc <- function(op, vals) {
    switch(op,
           plus  = vals[1] + vals[2],
           minus = vals[1] - vals[2],
           times = vals[1] * vals[2],
           vals[1] / vals[2])
}

(vals <- round(rnorm(2, 1, 10)))
myCalc("minus", vals)
myCalc("XX", vals)

##### 17.1.2 Schleifen ----
ABC <- c("Alfa", "Bravo", "Charlie", "Delta")
for(i in ABC) { print(i) }

src     <- log(rnorm(100000, 0, 1)^2)
alpha   <- 0.05
nTests  <- 1000
Nj      <- 20
sigVecT <- numeric(nTests)
sigVecV <- numeric(nTests)

for(i in seq(length.out=nTests)) {
    group1     <- sample(src, Nj, replace=FALSE)
    group2     <- sample(src, Nj, replace=FALSE)
    resT       <-   t.test(group1, group2)
    resV       <- var.test(group1, group2)
    sigVecT[i] <- as.logical(resT$p.value < alpha)
    sigVecV[i] <- as.logical(resV$p.value < alpha)
}

cat("Erwartete Anzahl signifikanter Tests:", alpha*nTests, "\n")
cat("Signifikante t-Tests:", sum(sigVecT), "\n")
cat("Signifikante F-Tests auf Varianzhomogenität:", sum(sigVecV), "\n")

x <- 37
y <- 10
while(x >= y) { x <- x-y }
x

for(i in 1:10) {
    if((i %% 2) != 0) { next  }
    if((i %% 8) == 0) { break }
    print(i)
}

i <- 0
repeat {
    i <- i+1
    if((i %% 4) == 0) { break }
    print(i)
}

####*--------------------------------------------------------------------------*
#### 17.2 Funktionsaufrufe ohne Schleifen wiederholen ----
replicate(6, round(rnorm(4), digits=2))

group1 <- replicate(nTests, sample(src, Nj, replace=FALSE))
group2 <- replicate(nTests, sample(src, Nj, replace=FALSE))
groups <- rbind(group1, group2)

estSigDiffs <- apply(groups, 2, function(x) {
    sqrt(1/Nj) * sqrt(var(x[1:Nj]) + var(x[(Nj+1):(2*Nj)])) } )

meanDiffs <- apply(groups, 2, function(x) {
    mean(x[1:Nj]) - mean(x[(Nj+1):(2*Nj)]) } )

tVals <- meanDiffs / estSigDiffs
pVals <- pt(tVals, (2*Nj)-2, lower.tail=FALSE)
sum(pVals < alpha)

####*--------------------------------------------------------------------------*
#### 17.3 Eigene Funktionen erstellen ----
##### 17.3.1 Funktionskopf ----
.First <- function() {
    library(car)
    print("Have a nice day!")
}

##### 17.3.2 Funktionsrumpf ----
##### 17.3.3 Funktionsargumente prüfen ----
modify <- function(x, how=c("standardize", "normalize")) {
    how <- match.arg(how)
    if(how == "standardize") {
        scale(x, center=TRUE, scale=TRUE)
    } else if(how == "normalize") {
        (x - min(x)) / diff(range(x))
    }
}

x <- rnorm(100, mean=5, sd=3)
mean(x)
sd(x)
range(x)

x_mod1 <- modify(x, how="standardize")
mean(x_mod1)
sd(x_mod1)

x_mod2 <- modify(x, how="norm")
range(x_mod2)

x_mod3 <- modify(x, how="boxcox")

myPlot <- function(x, y, xLims=xRange) {
    stopifnot(is.numeric(x), is.numeric(y))
    stopifnot(length(x) > 0, length(y) > 0)
    if(length(x) != length(y)) {
        warning("x und y haben ungleiche Länge -> kürze")
        lenMin <- min(c(length(x), length(y)))
        x <- x[1:lenMin]
        y <- y[1:lenMin]
    }
    
    xRange <- round(range(x), -1) + c(-10, 10)
    plot(x, y, xlim=xLims)
}

myPlot(1:5, 1:10)

##### 17.3.4 Fehler behandeln ----
##### 17.3.5 Rückgabewert und Funktionsende ----
##### 17.3.6 Eigene Funktionen verwenden ----
c(1, 2, 3)[1]
(function(arg1, arg2) { arg1^2 + arg2^2 })(-3, 4)

mat <- matrix(rnorm(16, 100, 15), nrow=4)
apply(mat, 2, \(x) { sqrt(sum(x^2)) } )
sqrt(colSums(mat^2))

##### 17.3.7 Generische Funktionen ----
info            <- function(x) { UseMethod("info") }
info.numeric    <- function(x) { range(x)  }
info.matrix     <- function(x) { dim(x)    }
info.data.frame <- function(x) { names(x)  }
info.default    <- function(x) { length(x) }

vec  <- round(runif(12, 20, 100), 2)
char <- LETTERS[sample(1:26, 5)]
mat  <- matrix(vec, nrow=3)
myDf <- setNames(data.frame(mat),
                 LETTERS[seq_len(ncol(mat))])

info(vec)
info(mat)
info(myDf)
info(char)

####*--------------------------------------------------------------------------*
#### 17.4 Funktionen analysieren ----
##### 17.4.1 Quelltext fremder Funktionen begutachten ----
sdSource <- capture.output(sd)
sdSource
base::mean
boot:::basic.ci

get("fligner.test")
methods("fligner.test")
## get("fligner.test.default")
getS3method("fligner.test", "default")
getAnywhere("fligner.test.default")

##### 17.4.2 Funktionen zur Laufzeit untersuchen ----
myPlot <- function(x, y, xLims=xRange) {
    stopifnot(is.numeric(x), is.numeric(y))
    stopifnot(length(x) > 0, length(y) > 0)
    if(length(x) != length(y)) {
        warning("x und y haben ungleiche Länge -> kürze")
        lenMin <- min(length(x), length(y))
        x <- x[1:lenMin]
        y <- y[1:lenMin]
    }

    xRange <- round(range(x), -1) + c(-10, 10)
    plot(x, y, xlim=xLims)
}

## myPlot(1:2, "ABC")
## traceback()

info         <- function(x) { UseMethod("info") }
info.numeric <- function(x) { range(x)  }
info.default <- function(x) { length(x) }

debug(info)
## info(sample(1:10, 10, replace=TRUE))
undebug(info)

as.list(body(median.default))
trace(median.default, tracer=browser, at=3)
# median.default(rnorm(100))
untrace(median.default)

####*--------------------------------------------------------------------------*
#### 17.5 Effizienz von Auswertungen steigern ----
##### 17.5.1 Grundlegende Empfehlungen ----
system.time(solve(matrix(sample(1:100, 400^2, replace=TRUE), nrow=400)))
xDbl <- rnorm(10000, 0, 10)
object.size(xDbl)
object.size(as.integer(xDbl))

##### 17.5.2 Auswertungen parallelisieren ----
library(parallel)
N      <- 1000
X1     <- rnorm(N, 175, 7)
X2     <- rnorm(N,  30, 8)
Ycont  <- 0.5*X1 - 0.3*X2 + 10 + rnorm(N, 0, 6)
Yord   <- cut(Ycont, breaks=quantile(Ycont), include.lowest=TRUE,
              labels=c("--", "-", "+", "++"), ordered=TRUE)

datOrd <- data.frame(X1, X2, Yord)
k      <- detectCores() - 1
cl     <- makeCluster(k)
stuff  <- clusterEvalQ(cl, library(VGAM))
clusterExport(cl, "datOrd")

getPerf <- function(idx) {
    datTst <- datOrd[ idx, , drop=FALSE]
    datTrn <- datOrd[-idx, , drop=FALSE]
    fit    <- vglm(Yord ~ X1 + X2, family=propodds, data=datTrn)
    Phat   <- predict(fit, newdata=datTst, type="response")
    catHat <- levels(datOrd$Yord)[max.col(Phat)]
    cTab   <- xtabs(~ Yord + catHat, data=datTst)

    cbind(OR_X1=exp(VGAM::coef(fit))["X1"],
          CCR=sum(diag(cTab)) / sum(cTab))
}

tstGrp <- sample(seq_len(N), N, replace=FALSE) %% k
foldsL <- split(seq_along(tstGrp), tstGrp)
outL   <- parLapply(cl, foldsL, getPerf)
stopCluster(cl)
do.call(rbind.data.frame, outL)

## unter Linux / MacOS
# library(VGAM)
# outL <- mclapply(foldsL, getPerf, mc.cores=k)

## entferne die (ggf. automatisch) geladenen Zusatzpakete, sofern möglich
try(detach(package:parallel))
