

## ----libs-textmining-----------------------------------------------------
library(tidyverse)
library(stringr)
library(tidytext)
library(lsa)
library(SnowballC)
library(wordcloud)
library(skimr)
library(pdftools)

# Load the example data sets used in this script into the workspace:
# `afd` and `sentiws` come from pradadata, `stopwords_de` comes from lsa.
data(list = c("afd", "sentiws"), package = "pradadata")
data(list = "stopwords_de", package = "lsa")

## ----libs-textmining-hidden, echo = FALSE--------------------------------
library(knitr)

## ------------------------------------------------------------------------
# Four lines of a poem, stored as one character string per line.
text <- c(
  "Wir haben die Frauen zu Bett gebracht,",
  "als die Männer in Frankreich standen.",
  "Wir hatten uns das viel schöner gedacht.",
  "Wir waren nur Konfirmanden."
)

## ------------------------------------------------------------------------
# Wrap the lines in a tibble with one row per line of text.
# `Zeile` is the 1-based line number; it is derived from `text` so the two
# columns always have matching lengths, whatever the number of lines.
# `tibble()` replaces the deprecated `data_frame()`.
text_df <- tibble(
  Zeile = seq_along(text),
  text = text
)

## ----brecht, echo = FALSE------------------------------------------------
# Render the poem tibble as a captioned table.
text_df %>%
  knitr::kable(caption = "Gedicht von Brecht", booktabs = TRUE)

## ----eval = FALSE--------------------------------------------------------
## text <- read_lines("Brecht.txt")


## ------------------------------------------------------------------------
# Tokenise the `text` column; the tokens end up in the new column `wort`.
tidytext_df <- unnest_tokens(text_df, output = wort, input = text)

# Show the first rows of the tokenised table.
head(tidytext_df)

## Nimm den Datensatz "text_df" UND DANN

## In einem *Tidytext-Dataframe* steht in jeder Zeile ein Wort (Token) und die Häufigkeit dieses Worts im Dokument.

## ------------------------------------------------------------------------
# Keep only the tokens that contain at least one character in a-z.
tidytext_df_lowercase <- tidytext_df %>%
  filter(str_detect(wort, pattern = "[a-z]"))


## ------------------------------------------------------------------------
# Example tweet used for the string-matching demonstrations in this script.
# The three fragments are joined with single spaces via `paste()`. With
# `paste0()` they were glued together without separators, producing
# "votesat" and "***https://...", which distorts word- and letter-based
# matching on the string.
string <- paste("Correlation of unemployment and #AfD votes",
                "at #btw17: ***r = 0.18***",
                "https://t.co/YHyqTguVWx")

## ------------------------------------------------------------------------
# Does the string contain at least one digit?
str_detect(string, pattern = "[:digit:]")

## ------------------------------------------------------------------------
# Start/end position of the first digit, and the first digit itself.
str_locate(string, pattern = "[:digit:]")
str_extract(string, pattern = "[:digit:]")

## ------------------------------------------------------------------------
# Every single digit in the string.
str_extract_all(string, pattern = "[:digit:]")

## ------------------------------------------------------------------------
# Every run of exactly two consecutive digits.
str_extract_all(string, pattern = "[:digit:]{2}")

## ------------------------------------------------------------------------
# Hashtags: "#" followed by one or more alphanumeric characters.
str_extract_all(string, pattern = "#[:alnum:]+")

## ------------------------------------------------------------------------
# URLs: "http://" or "https://" followed by visible (non-space) characters.
str_extract_all(string, pattern = "https?://[:graph:]+")

## ------------------------------------------------------------------------
# Number of words, using word boundaries.
str_count(string, pattern = boundary("word"))

## ------------------------------------------------------------------------
# Every run of one or more letters.
str_extract_all(string, pattern = "[:alpha:]+")

## ------------------------------------------------------------------------
# Remove every character that is not a letter.
# The `+` quantifier belongs outside the negated character class. Inside the
# brackets it was a literal plus sign, so "+" characters were exempted from
# removal instead of runs of non-letters being matched.
str_replace_all(string, "[^[:alpha:]]+", "")

## ----eval = FALSE--------------------------------------------------------
# Read the party programme from PDF; `pdf_text()` yields one character
# string per page.
afd_pfad <- "data/afd_programm.pdf"
content <- pdf_text(afd_pfad)

# One row per page. The page numbers are derived from the extracted text
# instead of a hard-coded 1:96, so a PDF with a different page count does not
# cause a length mismatch. `tibble()` replaces the deprecated `data_frame()`.
afd <- tibble(Seite = seq_along(content),
              content = content)

## ------------------------------------------------------------------------
# Split the page texts into one token per row (column `token`) and keep only
# tokens containing at least one character in a-z.
afd_long <- afd %>%
  unnest_tokens(output = token, input = content) %>%
  dplyr::filter(str_detect(token, pattern = "[a-z]"))

afd_long %>% head()

## ------------------------------------------------------------------------
# Token frequencies, most frequent first; rows with missing values are
# dropped before counting.
count(na.omit(afd_long), token, sort = TRUE)

## ------------------------------------------------------------------------
# Put the stop words into a one-column tibble (`word`) so they can be used in
# a join. `tibble()` replaces the deprecated `data_frame()`.
stopwords_de <- tibble(word = stopwords_de)

# Drop every token that appears in the stop-word list.
afd_no_stop <- afd_long %>%
  anti_join(stopwords_de, by = c("token" = "word"))

## ------------------------------------------------------------------------
# Frequency of each remaining token, sorted from most to least frequent.
afd_count <- count(afd_no_stop, token, sort = TRUE)

## ------------------------------------------------------------------------
# Reduce each token to its stem (language code "de") and count the stems,
# sorted from most to least frequent.
afd_count_stemmed <- afd_no_stop %>%
  mutate(token_stem = wordStem(token, language = "de")) %>%
  count(token_stem, sort = TRUE)

## ----afd-count-no-stop, echo = FALSE-------------------------------------

# Ten most frequent tokens without stemming.
# `top_n()` keeps ties, so it can return more than ten rows; the two tables
# could then differ in length and `bind_cols()` would fail. Sorting by
# frequency and taking the first ten rows yields at most ten rows on each
# side.
afd_count_top10 <- afd_count %>%
  arrange(desc(n)) %>%
  head(10)

# Same selection for the stemmed counts, then show both tables side by side.
afd_count_stemmed %>%
  arrange(desc(n)) %>%
  head(10) %>%
  bind_cols(afd_count_top10) %>%
  knitr::kable(
    caption = "Die häufigsten Wörter -- mit Stemming (links) und ohne Stemming (rechts)",
    booktabs = TRUE
  )

## ----show-wordcloud, fig.cap = "Eine Wordwolke zum AfD-Parteiprogramm"----
# Draw a word cloud of the stemmed tokens, sized by their frequency and
# limited to 100 words, using six colours from the "Dark2" palette.
with(
  afd_count_stemmed,
  wordcloud(words = token_stem,
            freq = n,
            max.words = 100,
            scale = c(2, 0.5),
            colors = brewer.pal(6, "Dark2"))
)



## ------------------------------------------------------------------------
# Horizontal bar chart of the most frequent stemmed tokens (top 30 by the
# last column, `n`), ordered by frequency.
p1 <- afd_count_stemmed %>%
  top_n(30) %>%
  ggplot(aes(x = reorder(token_stem, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(title = "mit Trunkierung")

# The same chart for the unstemmed tokens.
p2 <- afd_count %>%
  top_n(30) %>%
  ggplot(aes(x = reorder(token, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(title = "ohne Trunkierung")

## ----p-word-freq, echo = FALSE, fig.cap = "Worthäufigkeiten im AfD-Parteiprogramm", out.width = "100%", fig.asp=0.7----
# Place both frequency plots next to each other in two columns.
gridExtra::grid.arrange(grobs = list(p1, p2), ncol = 2)

##
## ----sentiws-head, echo = FALSE------------------------------------------
# Show the first rows of the sentiment lexicon as a captioned table.
sentiws %>%
  head() %>%
  knitr::kable(caption = "Auszug aus SentiwS", booktabs = TRUE)

## ------------------------------------------------------------------------
# Keep only tokens that occur in the sentiment lexicon and attach the
# lexicon's columns to them.
afd_senti <- afd_long %>%
  inner_join(sentiws, by = c("token" = "word")) %>%
  select(-inflections)  # this column is not needed

# Per `neg_pos` group: sum of sentiment values, number of matched tokens, and
# each group's share of all matched tokens (rounded to two decimals).
afd_senti_tab <- afd_senti %>%
  group_by(neg_pos) %>%
  summarise(polarity_sum = sum(value),
            polarity_count = n()) %>%
  mutate(polarity_prop = round(polarity_count / sum(polarity_count), 2))

## ----afd-senti-tab, echo = FALSE-----------------------------------------
# Render the sentiment summary as a captioned table.
afd_senti_tab %>%
  knitr::kable(caption = "Zusammenfassung von SentiWS", booktabs = TRUE)

## ------------------------------------------------------------------------
# The distinct tokens with the largest absolute sentiment value (top 20,
# ties included), returned as a vector.
afd_senti %>%
  distinct(token, .keep_all = TRUE) %>%
  mutate(value_abs = abs(value)) %>%
  top_n(n = 20, wt = value_abs) %>%
  pull(var = token)

## ----skim-no-eval, eval = FALSE------------------------------------------
# Summary statistics for the sentiment value and the polarity label.
select(sentiws, value, neg_pos) %>%
  skim()

## ----skimr-no-histogram, echo = FALSE------------------------------------
# Adjust skimr's summary functions for numeric columns by setting `hist` to
# NULL.
# NOTE(review): presumably this removes the inline histogram from later skim
# output (cf. the chunk label "skimr-no-histogram"). It relies on
# `skim_with()` changing a global setting as a side effect -- confirm against
# the installed skimr version.
skim_with(numeric = list(hist = NULL))

## ----skim-neg-pos--------------------------------------------------------
# Summary statistics of the sentiment value, computed separately for each
# `neg_pos` group and returned in wide format.
select(sentiws, value, neg_pos) %>%
  group_by(neg_pos) %>%
  skim_to_wide()

## ------------------------------------------------------------------------
# Mean sentiment value over all matched tokens, rounded to two decimals.
# NOTE(review): the column is called `senti_sum` although it holds a mean --
# confirm whether a sum was intended or the name should change.
afd_senti %>%
  summarise(senti_sum = round(mean(value), 2))


