Aufbereitung Datensätze

23.05.2019 - Valentin Gold - Reading time ~16 Minutes


Aufbereitung Datensätze

In diesem File wird noch einmal die Aufbereitung der Datensätze demonstriert. Ich habe folgende Daten extrahiert:

  1. Jede Legislaturperiode aus dem GermaParl-Projekt als Tibble.
  2. Den UN Generalversammlungsdatensatz aus dem UNGA-Projekt als Tibble.
  3. Die EuroParl-Daten. Ich habe einen etwas anderen Datensatz als die Ihnen bekannte EuroParl-Daten genutzt. Das hat den Vorteil, dass auch Metadaten zu den Parlamentarier*innen zur Verfügung stehen.

Jeden Datensatz gibt es in zwei Varianten:

  1. inkl. Unterbrechungen
  2. aggregiert auf Reden, d.h. exkl. Unterbrechungen (Datename mit Zusatz _agg)

Wenn Sie einen der Datensätze nutzen, bitte unbedingt die angegebene Literatur zitieren, d.h.

Bitte beachten Sie, dass diese Aufbereitungsschritte noch fehlerbehaftet sein können. Wenn Ihnen Fehler auffallen, können Sie mir gerne Bescheid geben und dann versuche ich entsprechend nachzubessern.

Sie können die Datensätze online herunterladen

GermParl

Die Daten enstammen dem GermaParl Datensatz, zusammengestellt von Andreas Blätte im Rahmen des polMineR-Projekts.

GermaParl inkl. Unterbrechungen

In GermaParl sind Unterbrechungen als separate Beobachtungseinheiten formatiert, d.h. wenn ein*e Parlamentarier*in eine Rede anfängt und sie bekommt Applaus, dann wird der Applaus als separate Beobachtung angegeben. Im Anschluss an die Unterbrechung bzw. den Applaus wird dann die Rede wieder fortgesetzt. Mit einer solchen Struktur kann man beispielsweise analysieren, welche Inhalte aus den Reden Applaus erhalten.

#Pakete laden
library(tidyverse)
library(lubridate)

library(polmineR)
library(GermaParl)

#GermaParl laden
use("GermaParl") # to activate the corpus in the data package
corpus() # to see whether the GERMAPARL corpus is listed

#Loop over all legislative periods
sessions <- s_attributes("GERMAPARL", s_attribute = "lp") %>% 
  as.numeric()

s <- sessions[1]

for (s in sessions){
  #extract session
  session <- partition("GERMAPARL", lp=s)
  
  #convert to tibble
  session <- decode(session)
  session <- as_tibble(session)
  class(session)

  # backup <- session_18
  # session_18 <- backup

  # check interjections
  # session %>%
  # select(word, party, speaker, interjection) %>% 
  # #filter(interjection==TRUE) %>% 
  # head(10)

  # mark first occurrence of interjections and speeches
  session <- session %>% 
    mutate(first_occ = case_when(
      interjection==TRUE & lag(interjection==FALSE) ~ 1,
      interjection==FALSE & lag(interjection==TRUE) ~ 1,
      TRUE ~ 0
    )) %>% 
    mutate(turn_id = cumsum(first_occ)) 

  # # check interjections
  # session %>% 
  #   select(word, party, speaker, interjection, first_occ, turn_id) %>% 
  #   #filter(interjection==TRUE) %>% 
  #   head(10)

  # build lemma_pos and word_pos
  session <- session %>% 
    mutate(word_pos = str_c(word, pos, collapse = "_"), 
           lemma_pos = str_c(lemma, pos, collapse = "_"))

  # aggregation to speeches and interjections
  session <- session %>% 
    # group over all possible id variables
    group_by(turn_id, lp, year, session, date, speaker, interjection, party, parliamentary_group, role, agenda_item, agenda_item_type, url, src) %>% 
    # aggregation for lemma, pos, word, lemma_pos, word_pos
    summarise(lemma = str_c(lemma, collapse = " "), 
              pos = str_c(pos, collapse = " "), 
              word = str_c(word, collapse = " "), 
              #word_pos = str_c(word_pos, collapse = " "), 
              #lemma_pos = str_c(lemma_pos, collapse = " ")
              ) %>% 
    ungroup()

  #head(session)

  # convert to numeric, whenever possible
  session <- session %>% 
    mutate(lp = as.numeric(lp), 
           session = as.numeric(session), 
           date = ymd(date), 
           year = year(date),
           agenda_item = as.numeric(agenda_item))

  # correct spaces between punctuation marks
  session <- session %>% 
    mutate(word = str_replace_all(word, '[:space:]{1,}([.,;:!?\\)\\]])', '\\1')) %>% 
    mutate(word = str_replace_all(word, '(\\()[:space:]', '\\1')) %>% 
    mutate(word = str_replace_all(word, '(\\[)[:space:]', '\\1')) 
  
  #delete speaker if interjection == TRUE
  session <- session %>% 
    mutate(speaker = case_when(
      interjection == TRUE ~ NA_character_, 
      TRUE ~ speaker
    ))
    
  #names(session)
  #dim(session)
  
  #rename object to lp_XX
  assign(str_c("lp_", s), session)
  
  #save objects
  filepath <- str_c("../../../daten/polmineR/lp_", s, ".rda")
  if(s==13){save(lp_13, file=filepath)}
  if(s==14){save(lp_14, file=filepath)}
  if(s==15){save(lp_15, file=filepath)}
  if(s==16){save(lp_16, file=filepath)}
  if(s==17){save(lp_17, file=filepath)}
  if(s==18){save(lp_18, file=filepath)}
  
}

GermaParl exkl. Unterbrechungen

Wenn man nur kohärente Reden ohne Unterbrechungen haben will, dann kann der Datensatz auf diese Analyseeinheit aufaggregiert werden. Oder alternativ könnte man das obige Verfahren anpassen; hier gibt es in den polmineR-Vignetten eine Anleitung dazu.

#List all files
lp_files <- list.files("../../../daten/polmineR/", pattern = "lp_[0-9]{,2}.rda")

#loop over all legislative periods
f <- lp_files[1]

for (f in lp_files){
  #load data
  file_path <- str_c("../../../daten/polmineR/", f)
  load(file_path)
  
  #as new object temp
  if(str_detect(f, "13")){temp <- lp_13}
  if(str_detect(f, "14")){temp <- lp_14}
  if(str_detect(f, "15")){temp <- lp_15}
  if(str_detect(f, "16")){temp <- lp_16}
  if(str_detect(f, "17")){temp <- lp_17}
  if(str_detect(f, "18")){temp <- lp_18}
  
  #delete interjections
  temp <- temp %>% 
    #filter interjections
    filter(interjection != TRUE) %>% 
    #speaker change
    mutate(speaker_change = case_when(
      speaker != lag(speaker) ~ 1, 
      TRUE ~ 0
    )) %>% 
    #cumsum
    mutate(speaker_cumsum = cumsum(speaker_change)) %>% 
    #group
    group_by(speaker_cumsum) %>% 
    #aggregate
    summarise(turn_id = first(turn_id), 
              lp = mean(lp), 
              year = mean(year), 
              speaker = first(speaker),
              session = mean(session), 
              date = first(date), 
              interjection = first(interjection), 
              party = first(party), 
              parliamentary_group = first(parliamentary_group), 
              role = first(role), 
              agenda_item = first(agenda_item), 
              agenda_item_type = first(agenda_item_type), 
              url = first(url), 
              src = first(src), 
              lemma = str_c(lemma, collapse= " "), 
              pos = str_c(pos, collapse = " "), 
              word = str_c(word, collapse = " ")) %>% 
    #sort
    arrange(turn_id)
  
  #Inspection
  # temp %>% 
  #   slice(1:20) %>% 
  #   select(turn_id, speaker, word) %>% 
  #   DT::datatable(options = list(pageLength = 2))
    
  #rename object to lp_XX
  assign(str_c("lp_", str_extract(f, "[0-9]{2}"), "_agg"), temp)
  
  #save objects
  filepath <- str_c("../../../daten/polmineR/lp_", str_extract(f, "[0-9]{2}"), "_agg", ".rda")
  if(str_detect(f, "13")){save(lp_13_agg, file=filepath)}
  if(str_detect(f, "14")){save(lp_14_agg, file=filepath)}
  if(str_detect(f, "15")){save(lp_15_agg, file=filepath)}
  if(str_detect(f, "16")){save(lp_16_agg, file=filepath)}
  if(str_detect(f, "17")){save(lp_17_agg, file=filepath)}
  if(str_detect(f, "18")){save(lp_18_agg, file=filepath)}
}

Datensatz UNGA

Die Daten enstammen dem UNGA Datensatz, zusammengestellt von Andreas Blätte im Rahmen des polMineR-Projekts. In diesem Datensatz gibt es meines Erachtens nach keine Unterbrechungen; von daher gibt es nur eine Version des Datensatzes.

#UNGA laden
use("UNGA") # to activate the corpus in the data package
corpus() # to see whether the UNGA corpus is listed

#convert to tibble
unga <- decode("UNGA")
unga <- as_tibble(unga)
class(unga)

# backup <- unga
# unga <- backup

# check data
names(unga)
unga %>% 
  select(cpos, word, id, who, state, role, lp, session, date) %>% 
  slice(1:50) %>% 
  DT::datatable(options = list(pageLength = 20))

# aggregation to speeches
unga <- unga %>% 
  # group over all possible id variables
  group_by(who, state, role, lp, session, date) %>% 
  # aggregations
  summarise(id = first(id),
            cpos = first(cpos), 
            word = str_c(word, collapse = " ")) %>% 
  ungroup() %>% 
  arrange(cpos)

# convert to numeric, whenever possible
unga <- unga %>% 
  mutate(cpos = as.numeric(cpos), 
         id = as.numeric(id), 
         lp = as.numeric(lp), 
         session = as.numeric(session), 
         date = ymd(date), 
         year = year(date))

# correct spaces between punctuation marks
unga <- unga %>%
  mutate(word = str_replace_all(word, '[:space:]{1,}([.,;:!?\\)\\]])', '\\1')) %>% 
  mutate(word = str_replace_all(word, '(\\()[:space:]', '\\1')) %>% 
  mutate(word = str_replace_all(word, '(\\[)[:space:]', '\\1')) 
  
#names(unga)
#dim(unga)
  
filepath <- "../../../daten/polmineR/unga.rda"
save(unga, file=filepath)

EuroParl-UdS

Der EuroParl-UdS Datensatz ist online verfügbar unter http://fedora.clarin-d.uni-saarland.de/europarl-uds/. Im Prinzip ist das eine Erweiterung des ursprünglichen EuroParl-Datensatzes – zum einen zeitlich und zum anderen mit mehr Metadaten zu den Parlamentarier*innen.

Skript für Deutsche Daten

#EuroParl UdS

#Pakete laden
library(tidyverse)
library(lubridate)
library(XML)

#Liste aller Files im Ordner
files <- list.files("../../../daten/EuroParl_UdS/de/", 
                    pattern=".xml")
length(files)

#Für Testzwecke
#files <- files[1:10]
f <- files[1]

#-----------------------1.Schritt: Ersetzen von des Tags <a text="

#Veränderte Daten in neuem Ordner speichern
dir.create(file.path("../../../daten/EuroParl_UdS", "de_adapted"))

for (f in files){
  #Generiere Pfad zur Datei
  file_path <- str_c("../../../daten/EuroParl_UdS/de/", f)
  
  #Einlesen mit readLines
  source_file <- readLines(file_path, encoding = "UTF-8")
  
  #in Tibble konvertieren
  source_file <- tibble::enframe(source_file, name=NULL, value="original")
  
  #Tag ändern
  source_file <- source_file %>% 
    mutate(original = 
             str_replace_all(original, '\\<a text=\\"(.*?)\\"/>', '<p sl="kommentar">\\1</p>'))
  
  #Speichern in neuen Ordner
  save_path <- str_c("../../../daten/EuroParl_UdS/de_adapted/", f)
  writeLines(source_file %>% pull(original), save_path)
  
}


#-----------------------2.Schritt: adaptierte xml-Dokument ein- und auslesen

#Leeres Objekt
europarl_de <- NULL

#Für Testzwecke
#files <- files[1:10]
f <- files[2]

#Schleife
for (f in files){
  #Generiere Pfad zur Datei
  file_path <- str_c("../../../daten/EuroParl_UdS/de_adapted/", f)
  
  #Einlesen mit xml-Paket
  doc <- xmlParse(file_path, encoding = "UTF-8") 
  
  #Extrahiere Nodes
  #nodes <- getNodeSet(doc, "//*/p")
  #nodes[[1]]
  
  #Extrahiere alle Reden, gekennzeichnet durch den Tag <p>...</p>
  reden <- enframe(xpathSApply(doc, "//p", xmlValue), name=NULL)
  dim(reden)
  names(reden) <- "text"
  head(reden)
  
  #Attribute der Reden
  sl <- enframe(xpathSApply(doc, "//p", function(x) xmlAttrs(x, 'sl')), name=NULL)
  dim(sl)
  names(sl) <- "sl"
  head(sl)
  
  #Attribute der Redner*innen
  id <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(x), "id")), name=NULL)
  id <- lapply(id$value, function(x) if(is.null(x)) data.frame(id = NA) else x)
  id <- suppressMessages( do.call(rbind.data.frame, id) )
  dim(id)
  names(id) <- "id"
  head(id)
  
  speaker_id <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(x), "speaker_id")), name=NULL)
  speaker_id <- lapply(speaker_id$value, function(x) if(is.null(x)) data.frame(speaker_id = NA) else x)
  speaker_id <- suppressMessages( do.call(rbind.data.frame, speaker_id) )
  dim(speaker_id)
  names(speaker_id) <- "speaker_id"
  head(speaker_id)

  is_mep <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(x), "is_mep")), name=NULL)
  is_mep <- lapply(is_mep$value, function(x) if(is.null(x)) data.frame(is_mep = NA) else x)
  is_mep <- suppressMessages( do.call(rbind.data.frame, is_mep) )
  dim(is_mep)
  names(is_mep) <- "is_mep"
  head(is_mep)
  
  mode <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(x), "mode")), name=NULL)
  mode <- lapply(mode$value, function(x) if(is.null(x)) data.frame(mode = NA) else x)
  mode <- suppressMessages( do.call(rbind.data.frame, mode) )
  dim(mode)
  names(mode) <- "mode"
  head(mode)
  
  role <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(x), "role")), name=NULL)
  role <- lapply(role$value, function(x) if(is.null(x)) data.frame(role = NA) else x)
  role <- suppressMessages( do.call(rbind.data.frame, role) )
  dim(role)
  names(role) <- "role"
  head(role)

  name <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(x), "name")), name=NULL)
  name <- lapply(name$value, function(x) if(is.null(x)) data.frame(name = NA) else x)
  name <- suppressMessages( do.call(rbind.data.frame, name) )
  dim(name)
  names(name) <- "name"
  head(name)
  
  nationality <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(x), "nationality")), name=NULL)
  nationality <- lapply(nationality$value, function(x) if(is.null(x)) data.frame(nationality = NA) else x)
  nationality <- suppressMessages( do.call(rbind.data.frame, nationality) )
  dim(nationality)
  names(nationality) <- "nationality"
  head(nationality)

  birth_date <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(x), "birth_date")), name=NULL)
  birth_date <- lapply(birth_date$value, function(x) if(is.null(x)) data.frame(birth_date = NA) else x)
  birth_date <- suppressMessages( do.call(rbind.data.frame, birth_date) )
  dim(birth_date)
  names(birth_date) <- "birth_date"
  head(birth_date)

  birth_place <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(x), "birth_place")), name=NULL)
  birth_place <- lapply(birth_place$value, function(x) if(is.null(x)) data.frame(birth_place = NA) else x)
  birth_place <- suppressMessages( do.call(rbind.data.frame, birth_place) )
  dim(birth_place)
  names(birth_place) <- "birth_place"
  head(birth_place)

  n_party <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(x), "n_party")), name=NULL)
  n_party <- lapply(n_party$value, function(x) if(is.null(x)) data.frame(n_party = NA) else x)
  n_party <- suppressMessages( do.call(rbind.data.frame, n_party) )
  dim(n_party)
  names(n_party) <- "n_party"
  head(n_party)

  p_group <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(x), "p_group")), name=NULL)
  p_group <- lapply(p_group$value, function(x) if(is.null(x)) data.frame(p_group = NA) else x)
  p_group <- suppressMessages( do.call(rbind.data.frame, p_group) )
  dim(p_group)
  names(p_group) <- "p_group"
  head(p_group)

  m_state <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(x), "m_state")), name=NULL)
  m_state <- lapply(m_state$value, function(x) if(is.null(x)) data.frame(m_state = NA) else x)
  m_state <- suppressMessages( do.call(rbind.data.frame, m_state) )
  dim(m_state)
  names(m_state) <- "m_state"
  head(m_state)
  
  #Attribute der Sektion (Tagesordnung)
  sec_id <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(xmlParent(x)), "id")), name=NULL)
  sec_id <- lapply(sec_id$value, function(x) if(is.null(x)) data.frame(sec_id = NA) else x)
  sec_id <- suppressMessages( do.call(rbind.data.frame, sec_id) )
  dim(sec_id)
  names(sec_id) <- "sec_id"
  head(sec_id)
  
  sec_title <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(xmlParent(x)), "title")), name=NULL)
  sec_title <- lapply(sec_title$value, function(x) if(is.null(x)) data.frame(sec_title = NA) else x)
  sec_title <- suppressMessages( do.call(rbind.data.frame, sec_title) )
  dim(sec_title)
  names(sec_title) <- "sec_title"
  head(sec_title)
  
  #Attribute des Dokuments
  doc_id <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(xmlParent(xmlParent(x))), "id")), name=NULL)
  doc_id <- lapply(doc_id$value, function(x) if(is.null(x)) data.frame(doc_id = NA) else x)
  doc_id <- suppressMessages( do.call(rbind.data.frame, doc_id) )
  dim(doc_id)
  names(doc_id) <- "doc_id"
  head(doc_id)
  
  doc_lang <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(xmlParent(xmlParent(x))), "lang")), name=NULL)
  doc_lang <- lapply(doc_lang$value, function(x) if(is.null(x)) data.frame(doc_lang = NA) else x)
  doc_lang <- suppressMessages( do.call(rbind.data.frame, doc_lang) )
  dim(doc_lang)
  names(doc_lang) <- "doc_lang"
  head(doc_lang)

  doc_date <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(xmlParent(xmlParent(x))), "date")), name=NULL)
  doc_date <- lapply(doc_date$value, function(x) if(is.null(x)) data.frame(doc_date = NA) else x)
  doc_date <- suppressMessages( do.call(rbind.data.frame, doc_date) )
  dim(doc_date)
  names(doc_date) <- "doc_date"
  head(doc_date)  

  doc_place <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(xmlParent(xmlParent(x))), "place")), name=NULL)
  doc_place <- lapply(doc_place$value, function(x) if(is.null(x)) data.frame(doc_place = NA) else x)
  doc_place <- suppressMessages( do.call(rbind.data.frame, doc_place) )
  dim(doc_place)
  names(doc_place) <- "doc_place"
  head(doc_place)
  
  doc_edition <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(xmlParent(xmlParent(x))), "edition")), name=NULL)
  doc_edition <- lapply(doc_edition$value, function(x) if(is.null(x)) data.frame(doc_edition = NA) else x)
  doc_edition <- suppressMessages( do.call(rbind.data.frame, doc_edition) )
  dim(doc_edition)
  names(doc_edition) <- "doc_edition"
  head(doc_edition)
  
  #bind data
  reden <- bind_cols(sl, id, speaker_id, name, 
                     reden,
                     is_mep, mode, nationality, birth_date, birth_place, n_party, p_group, m_state, 
                     sec_id, sec_title, 
                     doc_id, doc_lang, doc_date, doc_place, doc_edition) %>% 
    mutate(rowid = row_number())
  #dim(reden)
  #head(reden)
  
  #Zusammenfügen aller Dateien
  europarl_de <- suppressWarnings ( bind_rows(europarl_de, data.frame(f, reden)) )
  
  #print(f)
}

#Inspektion
dim(europarl_de)
save(europarl_de, file="../../../daten/EuroParl_UdS/europarl_de_temp.rda")


#-----------------------3.Schritt: Korrekturen und Aggregation

#Daten laden
load("../../../daten/EuroParl_UdS/europarl_de_temp.rda")

names(europarl_de)
# europarl_de %>% 
#   select(id, sl, speaker_id, name, text) %>% 
#   slice(1:50) %>% 
#   DT::datatable(., options = list(pageLength = 5))

#neue Variable speaker, wenn sl ungleich kommentar
europarl_de <- europarl_de %>% 
  mutate(speaker = case_when(
    sl != "kommentar" ~ speaker_id, 
    TRUE ~ "0"
  ))

europarl_de %>% 
  select(speaker_id, sl, speaker) %>% 
  head(10)

#Sprecher*innenwechsel
europarl_de <- europarl_de %>% 
  mutate(speaker_change = case_when(
    speaker != lag(speaker) ~ 1, 
    TRUE ~ 0
  )) %>% 
  mutate(cumsum_speaker = cumsum(speaker_change)) %>% 
  group_by(f, cumsum_speaker) %>% 
  summarise(sl = first(sl),
            id = first(id), 
            speaker_id = first(speaker_id), 
            name = first(name), 
            text = str_c(text, collapse=" "), 
            is_mep = first(is_mep), 
            mode = first(mode), 
            nationality = first(nationality), 
            birth_date = first(birth_date), 
            birth_place = first(birth_place), 
            n_party = first(n_party), 
            p_group = first(p_group), 
            m_state = first(m_state),
            sec_id = first(sec_id), 
            sec_title = first(sec_title), 
            doc_id = first(doc_id), 
            doc_lang = first(doc_lang), 
            doc_date = first(doc_date), 
            doc_place = first(doc_place), 
            doc_edition = first(doc_edition), 
            rowid = first(rowid), 
            speaker = first(speaker)) %>% 
  arrange(f, rowid) %>% 
  ungroup()

europarl_de %>% 
  select(rowid, speaker_id, sl, speaker, text) %>% 
  head(10)

save(europarl_de, file="../../../daten/EuroParl_UdS/europarl_de_xml.rda")


#-----------------------4.Schritt:Aggregation auf Reden, d.h. ohne Kommentare
dim(europarl_de)
 europarl_de_agg <- europarl_de %>% 
   filter(sl != "kommentar") %>% 
   mutate(speaker_change = case_when(
    speaker != lag(speaker) ~ 1, 
    TRUE ~ 0
  )) %>% 
  mutate(cumsum_speaker = cumsum(speaker_change)) %>% 
  group_by(f, cumsum_speaker) %>% 
  summarise(sl = first(sl),
            id = first(id), 
            speaker_id = first(speaker_id), 
            name = first(name), 
            text = str_c(text, collapse=" "), 
            is_mep = first(is_mep), 
            mode = first(mode), 
            nationality = first(nationality), 
            birth_date = first(birth_date), 
            birth_place = first(birth_place), 
            n_party = first(n_party), 
            p_group = first(p_group), 
            m_state = first(m_state),
            sec_id = first(sec_id), 
            sec_title = first(sec_title), 
            doc_id = first(doc_id), 
            doc_lang = first(doc_lang), 
            doc_date = first(doc_date), 
            doc_place = first(doc_place), 
            doc_edition = first(doc_edition), 
            rowid = first(rowid), 
            speaker = first(speaker)) %>% 
  arrange(f, rowid) %>% 
  ungroup()

europarl_de_agg %>% 
  select(rowid, speaker_id, sl, speaker, text) %>% 
  head(10)

save(europarl_de_agg, file="../../../daten/EuroParl_UdS/europarl_de_agg.rda")

Skript für Englische Daten

#EuroParl UdS

#Pakete laden
library(tidyverse)
library(lubridate)
library(XML)

#Liste aller Files im Ordner
files <- list.files("../../../daten/EuroParl_UdS/en/", 
                    pattern=".xml")
length(files)

#Für Testzwecke
#files <- files[1:10]
f <- files[1]

#-----------------------1.Schritt: Ersetzen von des Tags <a text="

#Veränderte Daten in neuem Ordner speichern
dir.create(file.path("../../../daten/EuroParl_UdS", "en_adapted"))

for (f in files){
  #Generiere Pfad zur Datei
  file_path <- str_c("../../../daten/EuroParl_UdS/en/", f)
  
  #Einlesen mit readLines
  source_file <- readLines(file_path, encoding = "UTF-8")
  
  #in Tibble konvertieren
  source_file <- tibble::enframe(source_file, name=NULL, value="original")
  
  #Tag ändern
  source_file <- source_file %>% 
    mutate(original = 
             str_replace_all(original, '\\<a text=\\"(.*?)\\"/>', '<p sl="kommentar">\\1</p>'))
  
  #Speichern in neuen Ordner
  save_path <- str_c("../../../daten/EuroParl_UdS/en_adapted/", f)
  writeLines(source_file %>% pull(original), save_path)
  
}


#-----------------------2.Schritt: adaptierte xml-Dokument ein- und auslesen

#Leeres Objekt
europarl_en <- NULL

#Für Testzwecke
#files <- files[1:10]
f <- files[2]

#Schleife
for (f in files){
  #Generiere Pfad zur Datei
  file_path <- str_c("../../../daten/EuroParl_UdS/en_adapted/", f)
  
  #Einlesen mit xml-Paket
  doc <- xmlParse(file_path, encoding = "UTF-8") 
  
  #Extrahiere Nodes
  #nodes <- getNodeSet(doc, "//*/p")
  #nodes[[1]]
  
  #Extrahiere alle Reden, gekennzeichnet durch den Tag <p>...</p>
  reden <- enframe(xpathSApply(doc, "//p", xmlValue), name=NULL)
  dim(reden)
  names(reden) <- "text"
  head(reden)
  
  #Attribute der Reden
  sl <- enframe(xpathSApply(doc, "//p", function(x) xmlAttrs(x, 'sl')), name=NULL)
  dim(sl)
  names(sl) <- "sl"
  head(sl)
  
  #Attribute der Redner*innen
  id <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(x), "id")), name=NULL)
  id <- lapply(id$value, function(x) if(is.null(x)) data.frame(id = NA) else x)
  id <- suppressMessages( do.call(rbind.data.frame, id) )
  dim(id)
  names(id) <- "id"
  head(id)
  
  speaker_id <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(x), "speaker_id")), name=NULL)
  speaker_id <- lapply(speaker_id$value, function(x) if(is.null(x)) data.frame(speaker_id = NA) else x)
  speaker_id <- suppressMessages( do.call(rbind.data.frame, speaker_id) )
  dim(speaker_id)
  names(speaker_id) <- "speaker_id"
  head(speaker_id)
  
  is_mep <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(x), "is_mep")), name=NULL)
  is_mep <- lapply(is_mep$value, function(x) if(is.null(x)) data.frame(is_mep = NA) else x)
  is_mep <- suppressMessages( do.call(rbind.data.frame, is_mep) )
  dim(is_mep)
  names(is_mep) <- "is_mep"
  head(is_mep)
  
  mode <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(x), "mode")), name=NULL)
  mode <- lapply(mode$value, function(x) if(is.null(x)) data.frame(mode = NA) else x)
  mode <- suppressMessages( do.call(rbind.data.frame, mode) )
  dim(mode)
  names(mode) <- "mode"
  head(mode)
  
  role <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(x), "role")), name=NULL)
  role <- lapply(role$value, function(x) if(is.null(x)) data.frame(role = NA) else x)
  role <- suppressMessages( do.call(rbind.data.frame, role) )
  dim(role)
  names(role) <- "role"
  head(role)
  
  name <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(x), "name")), name=NULL)
  name <- lapply(name$value, function(x) if(is.null(x)) data.frame(name = NA) else x)
  name <- suppressMessages( do.call(rbind.data.frame, name) )
  dim(name)
  names(name) <- "name"
  head(name)
  
  nationality <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(x), "nationality")), name=NULL)
  nationality <- lapply(nationality$value, function(x) if(is.null(x)) data.frame(nationality = NA) else x)
  nationality <- suppressMessages( do.call(rbind.data.frame, nationality) )
  dim(nationality)
  names(nationality) <- "nationality"
  head(nationality)
  
  birth_date <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(x), "birth_date")), name=NULL)
  birth_date <- lapply(birth_date$value, function(x) if(is.null(x)) data.frame(birth_date = NA) else x)
  birth_date <- suppressMessages( do.call(rbind.data.frame, birth_date) )
  dim(birth_date)
  names(birth_date) <- "birth_date"
  head(birth_date)
  
  birth_place <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(x), "birth_place")), name=NULL)
  birth_place <- lapply(birth_place$value, function(x) if(is.null(x)) data.frame(birth_place = NA) else x)
  birth_place <- suppressMessages( do.call(rbind.data.frame, birth_place) )
  dim(birth_place)
  names(birth_place) <- "birth_place"
  head(birth_place)
  
  n_party <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(x), "n_party")), name=NULL)
  n_party <- lapply(n_party$value, function(x) if(is.null(x)) data.frame(n_party = NA) else x)
  n_party <- suppressMessages( do.call(rbind.data.frame, n_party) )
  dim(n_party)
  names(n_party) <- "n_party"
  head(n_party)
  
  p_group <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(x), "p_group")), name=NULL)
  p_group <- lapply(p_group$value, function(x) if(is.null(x)) data.frame(p_group = NA) else x)
  p_group <- suppressMessages( do.call(rbind.data.frame, p_group) )
  dim(p_group)
  names(p_group) <- "p_group"
  head(p_group)
  
  m_state <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(x), "m_state")), name=NULL)
  m_state <- lapply(m_state$value, function(x) if(is.null(x)) data.frame(m_state = NA) else x)
  m_state <- suppressMessages( do.call(rbind.data.frame, m_state) )
  dim(m_state)
  names(m_state) <- "m_state"
  head(m_state)
  
  #Attribute der Sektion (Tagesordnung)
  sec_id <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(xmlParent(x)), "id")), name=NULL)
  sec_id <- lapply(sec_id$value, function(x) if(is.null(x)) data.frame(sec_id = NA) else x)
  sec_id <- suppressMessages( do.call(rbind.data.frame, sec_id) )
  dim(sec_id)
  names(sec_id) <- "sec_id"
  head(sec_id)
  
  sec_title <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(xmlParent(x)), "title")), name=NULL)
  sec_title <- lapply(sec_title$value, function(x) if(is.null(x)) data.frame(sec_title = NA) else x)
  sec_title <- suppressMessages( do.call(rbind.data.frame, sec_title) )
  dim(sec_title)
  names(sec_title) <- "sec_title"
  head(sec_title)
  
  #Attribute des Dokuments
  doc_id <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(xmlParent(xmlParent(x))), "id")), name=NULL)
  doc_id <- lapply(doc_id$value, function(x) if(is.null(x)) data.frame(doc_id = NA) else x)
  doc_id <- suppressMessages( do.call(rbind.data.frame, doc_id) )
  dim(doc_id)
  names(doc_id) <- "doc_id"
  head(doc_id)
  
  doc_lang <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(xmlParent(xmlParent(x))), "lang")), name=NULL)
  doc_lang <- lapply(doc_lang$value, function(x) if(is.null(x)) data.frame(doc_lang = NA) else x)
  doc_lang <- suppressMessages( do.call(rbind.data.frame, doc_lang) )
  dim(doc_lang)
  names(doc_lang) <- "doc_lang"
  head(doc_lang)
  
  doc_date <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(xmlParent(xmlParent(x))), "date")), name=NULL)
  doc_date <- lapply(doc_date$value, function(x) if(is.null(x)) data.frame(doc_date = NA) else x)
  doc_date <- suppressMessages( do.call(rbind.data.frame, doc_date) )
  dim(doc_date)
  names(doc_date) <- "doc_date"
  head(doc_date)  
  
  doc_place <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(xmlParent(xmlParent(x))), "place")), name=NULL)
  doc_place <- lapply(doc_place$value, function(x) if(is.null(x)) data.frame(doc_place = NA) else x)
  doc_place <- suppressMessages( do.call(rbind.data.frame, doc_place) )
  dim(doc_place)
  names(doc_place) <- "doc_place"
  head(doc_place)
  
  doc_edition <- enframe(xpathSApply(doc, "//p", function(x) xmlGetAttr(xmlParent(xmlParent(xmlParent(x))), "edition")), name=NULL)
  doc_edition <- lapply(doc_edition$value, function(x) if(is.null(x)) data.frame(doc_edition = NA) else x)
  doc_edition <- suppressMessages( do.call(rbind.data.frame, doc_edition) )
  dim(doc_edition)
  names(doc_edition) <- "doc_edition"
  head(doc_edition)
  
  #bind data
  reden <- bind_cols(sl, id, speaker_id, name, 
                     reden,
                     is_mep, mode, nationality, birth_date, birth_place, n_party, p_group, m_state, 
                     sec_id, sec_title, 
                     doc_id, doc_lang, doc_date, doc_place, doc_edition) %>% 
    mutate(rowid = row_number())
  #dim(reden)
  #head(reden)
  
  #Zusammenfügen aller Dateien
  europarl_en <- suppressWarnings ( bind_rows(europarl_en, data.frame(f, reden)) )
  
  #print(f)
}

#Inspektion
dim(europarl_en)
save(europarl_en, file="../../../daten/EuroParl_UdS/europarl_en_temp.rda")


#-----------------------3.Schritt: Korrekturen und Aggregation

#Daten laden
load("../../../daten/EuroParl_UdS/europarl_en_temp.rda")

names(europarl_en)
# europarl_en %>% 
#   select(id, sl, speaker_id, name, text) %>% 
#   slice(1:50) %>% 
#   DT::datatable(., options = list(pageLength = 5))

#neue Variable speaker, wenn sl ungleich kommentar
europarl_en <- europarl_en %>% 
  mutate(speaker = case_when(
    sl != "kommentar" ~ speaker_id, 
    TRUE ~ "0"
  ))

europarl_en %>% 
  select(speaker_id, sl, speaker) %>% 
  head(10)

#Sprecher*innenwechsel
europarl_en <- europarl_en %>% 
  mutate(speaker_change = case_when(
    speaker != lag(speaker) ~ 1, 
    TRUE ~ 0
  )) %>% 
  mutate(cumsum_speaker = cumsum(speaker_change)) %>% 
  group_by(f, cumsum_speaker) %>% 
  summarise(sl = first(sl),
            id = first(id), 
            speaker_id = first(speaker_id), 
            name = first(name), 
            text = str_c(text, collapse=" "), 
            is_mep = first(is_mep), 
            mode = first(mode), 
            nationality = first(nationality), 
            birth_date = first(birth_date), 
            birth_place = first(birth_place), 
            n_party = first(n_party), 
            p_group = first(p_group), 
            m_state = first(m_state),
            sec_id = first(sec_id), 
            sec_title = first(sec_title), 
            doc_id = first(doc_id), 
            doc_lang = first(doc_lang), 
            doc_date = first(doc_date), 
            doc_place = first(doc_place), 
            doc_edition = first(doc_edition), 
            rowid = first(rowid), 
            speaker = first(speaker)) %>% 
  arrange(f, rowid) %>% 
  ungroup()

europarl_en %>% 
  select(rowid, speaker_id, sl, speaker, text) %>% 
  head(10)

save(europarl_en, file="../../../daten/EuroParl_UdS/europarl_en_xml.rda")


#-----------------------4.Schritt:Aggregation auf Reden, d.h. ohne Kommentare
dim(europarl_en)
europarl_en_agg <- europarl_en %>% 
  filter(sl != "kommentar") %>% 
  mutate(speaker_change = case_when(
    speaker != lag(speaker) ~ 1, 
    TRUE ~ 0
  )) %>% 
  mutate(cumsum_speaker = cumsum(speaker_change)) %>% 
  group_by(f, cumsum_speaker) %>% 
  summarise(sl = first(sl),
            id = first(id), 
            speaker_id = first(speaker_id), 
            name = first(name), 
            text = str_c(text, collapse=" "), 
            is_mep = first(is_mep), 
            mode = first(mode), 
            nationality = first(nationality), 
            birth_date = first(birth_date), 
            birth_place = first(birth_place), 
            n_party = first(n_party), 
            p_group = first(p_group), 
            m_state = first(m_state),
            sec_id = first(sec_id), 
            sec_title = first(sec_title), 
            doc_id = first(doc_id), 
            doc_lang = first(doc_lang), 
            doc_date = first(doc_date), 
            doc_place = first(doc_place), 
            doc_edition = first(doc_edition), 
            rowid = first(rowid), 
            speaker = first(speaker)) %>% 
  arrange(f, rowid) %>% 
  ungroup()

europarl_en_agg %>% 
  select(rowid, speaker_id, sl, speaker, text) %>% 
  head(10)

save(europarl_en_agg, file="../../../daten/EuroParl_UdS/europarl_en_agg.rda")