

## ----libs-modellieren----------------------------------------------------
library(tidyverse)
data(stats_test, package = "pradadata")

## ----libs-hidden-modellieren, echo = FALSE-------------------------------
library(gridExtra)
library(caret)
library(viridis)
library(huxtable)


## ----plot-stats-smooth, echo = FALSE-------------------------------------
stats_test %>%
ggplot(aes(y = score, x = self_eval)) +
geom_jitter() +
ggtitle("A") -> p1

p2 <- p1 + geom_smooth(method = "lm", se = FALSE) + ggtitle("B")

## ----plot-stats, echo = FALSE, out.width = "90%", fig.height = 1, fig.cap = "Ein Beispiel für Modellieren"----
grid.arrange(p1, p2, nrow = 1)


## ----overfitting-prep-4-plots, echo = FALSE, include = FALSE, fig.height = 3, fig.width = 3----
x <- seq(from = 1, to = 10, by = .3)
y <- sin(x) + rnorm(n = length(x), mean = 0, sd = .3)

daten <- data_frame(x, y)

ggplot(daten) +
  aes(x = x, y = y) +
  coord_fixed(ratio = 5/1) +
  labs(y = "") +
  geom_point() +
  ggtitle("A") -> p1

ggplot(daten) +
  aes(x = x, y = y) +
  geom_point() +
  coord_fixed(ratio = 5/1) +
  labs(y = "") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  ggtitle("B")-> p2


ggplot(daten) +
  aes(x = x, y = y) +
  geom_point() +
  coord_fixed(ratio = 5/1) +
  labs(y = "") +
  geom_line(color = "blue") +
  ggtitle("C") -> p3

ggplot(daten) +
  aes(x = x, y = y) +
  geom_point() +
  coord_fixed(ratio = 5/1) +
  labs(y = "") +
  stat_function(n = 99, fun = sin, color = "darkgreen") +
  ggtitle("D") -> p4

## ----overfitting-4-plots, echo = FALSE, fig.cap = "Welches Modell (Teil B-D; rot, grün, blau) passt am besten zu den Daten (Teil A) ?", out.width = "100%"----

grid.arrange(p1, p2, p3, p4, ncol = 4)



## ----plot-bias-variance, echo = FALSE, fig.asp = 0.25, fig.cap = "Der Spagat zwischen Verzerrung und Varianz", out.width = "100%"----
poly_degree = 15
df <- data_frame(x = seq(from = 1, to = 10, by = .3),
                 y = sin(x) + rnorm(n = length(x), mean = 0, sd = .3))

df %>%
  mutate(binned = cut(.$x, breaks = c(-Inf, 5.5, +Inf))) %>%
  group_by(binned) %>%
  mutate(y_group_md = median(y)) -> df


p1 <- ggplot(df) +
  aes(x = x, y = y) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ poly(x, poly_degree), se = FALSE)


p2 <-  ggplot(df) +
  aes(x = x) +
  geom_point(aes(y = y)) +
  geom_line(aes(y = y_group_md, group = binned), color = "firebrick")


grid.arrange(p1, p2, ncol = 2)

## ------------------------------------------------------------------------
train <- slice(stats_test, 1:200)
test <- slice(stats_test, 201:306)

## ------------------------------------------------------------------------
train <- stats_test %>%
  sample_frac(.8, replace = FALSE)  # Stichprobe von 80%, ohne Zurücklegen

test <- stats_test %>%
  anti_join(train)

## ----resids-plot, echo = FALSE, results = "hold", fig.cap = "Geringer (links; A) vs. hoher (rechts, B) Vorhersagefehler"----

set.seed(42)
N      <- 100
beta   <- 0.4
intercept <- 1


sim <- data_frame(
  x = rnorm(N),
  error1 = rnorm(N, mean = 0, sd = .5),
  error2 = rnorm(N, mean = 0, sd = 2),
  y1 = intercept + x*beta + error1,
  y2 = intercept + x*beta + error2,
  pred = 1 + x*beta
)



p1 <- ggplot(sim, aes(x, y1)) +
  geom_abline(intercept = intercept, slope = beta, colour = "red") +
  geom_point(colour = "#00998a") +
  geom_linerange(aes(ymin = y1, ymax = pred), colour = "grey40") +
  ylim(-6,+6) + labs(title = "A - wenig Vorhersagefehler")


p2 <- ggplot(sim, aes(x, y2)) +
  geom_abline(intercept = intercept, slope = beta, colour = "red") +
  geom_point(colour = "#00998a") +
  geom_linerange(aes(ymin = y2, ymax = pred), colour = "grey40") +
  ylim(-6,+6) + labs(title = "B - viel Vorhersagefehler")


grid.arrange(p1, p2, ncol = 2)

## ----fig-accuracy, echo = FALSE, fig.cap = "Sinnbild für die Trefferquote eines Klassifikationsmodells", out.width = "30%"----
knitr::include_graphics("images/modellieren/accuracy-crop.pdf")

## ----p-class-stat-bin-2d, echo = FALSE, fig.cap = "Ein einfaches Klassifikationsmodell", fig.asp = .7----

set.seed(42)
twoClassSim(n = 500, intercept = -5, linearVars = 10, noiseVars = 0,
  corrVars = 0, corrType = "AR1", corrValue = 0, mislabel = 0,
  ordinal = FALSE) -> sim_df

sim_df$Class_num <- ifelse(sim_df$Class ==
                           "Class1", 1, 0)


sim_df %>%
  filter(Linear01 > -2, Linear01 < 2) %>%
    filter(Linear02 > -2, Linear02 < 2) %>%
  ggplot() +
  aes(x = Linear01, y = Linear02, group = 1) +
  stat_summary_2d(aes(z = Class),
                  fun = function(z) names(which.max(table(z))),
                  bins = 3,
                  alpha = .5,
                  drop = TRUE) +
  geom_point(aes(color = Class, shape = Class), size = 3) +
annotate(geom = "point", x = 2, y = -1, shape = 8, size = 5) +
  labs(x = "x1", y = "x2",
       class = "Regen") +
  theme(legend.position = "none") +
  scale_color_manual(values = c("firebrick", "dodgerblue4")) +
  scale_fill_manual(values = c("grey50", "grey90"))



## ----plot-bias-variance2, echo = FALSE, fig.cap = "Bias-Varianz-Abwägung. Links (A): Wenig Bias, viel Varianz. Rechts (B): Viel Bias, wenig Varianz.", eval = FALSE----

poly_degree = 5
df <- data_frame(x = seq(from = 1, to = 10, by = .3),
                 y = cos(x) + rnorm(n = length(x), mean = 0, sd = .5))

df %>%
  mutate(binned = cut(.$x, breaks = c(-Inf, 5.5, +Inf))) %>%
  group_by(binned) %>%
  mutate(y_group_md = mean(y)) -> df


p1 <- ggplot(df) +
  aes(x = x, y = y) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ poly(x, poly_degree), se = FALSE) +
  labs(title = "A")


p2 <-  ggplot(df) +
  aes(x = x) +
  geom_point(aes(y = y)) +
  geom_line(aes(y = y_group_md, group = binned), color = "firebrick")  +
  labs(title = "B")


grid.arrange(p1, p2, ncol = 2)


