PolMine / GermaParl

GermaParl R Data Package
12 stars 3 forks source link

text recognition error in speech #13

Open KevinGlock opened 5 years ago

KevinGlock commented 5 years ago

Interjections are not recognised in the following speech:

# Subset the corpus to the speech given by Stephan Mayer on 2016-09-23.
# NOTE(review): `encoding` is forced to "UTF-8" here -- confirm that this
# matches the charset the GERMAPARL corpus is registered with.
p <- partition(
  "GERMAPARL",
  speaker = "Stephan Mayer",
  date = "2016-09-23",
  encoding = "UTF-8"
)

# Pass the partition to read() to display it.
read(p)
KevinGlock commented 5 years ago

I noticed that the error occurs in other speeches too. Here is the workflow to find them.

## load libraries

library("polmineR")
library("magrittr")
library("data.table")

use("GermaParl")

## create partitions

# Partition of GERMAPARL restricted to the CDU/CSU parliamentary group, the
# years 2012 to 2016, the roles "mp" and "government", and text whose
# `interjection` attribute is FALSE.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas `FALSE` is a reserved word.
coi_cdu16 <- partition(
  "GERMAPARL",
  parliamentary_group = "CDU/CSU",
  year = 2012:2016,
  interjection = FALSE,
  role = c("mp", "government")
)

## as partition bundles

# Split the partition by the "date" s-attribute.
pb2 <- partition_bundle(coi_cdu16, s_attribute = "date")

# Split every per-date partition again by "agenda_item"; the result is a
# plain list holding one partition bundle per element of `pb2@objects`.
# `verbose = FALSE` (not the reassignable `F`) silences progress messages.
nested2 <- lapply(
  pb2@objects,
  function(x) partition_bundle(x, s_attribute = "agenda_item", verbose = FALSE)
)

## flatten nested data frames

# Flatten the nested list of per-date bundles built in `nested2`.
debates2 <- flatten(nested2)

# Name every element "<date>_<name>": its "date" s-attribute value joined
# with its current name by an underscore.
names(debates2) <- paste(
  blapply(debates2, function(debate) s_attributes(debate, "date")),
  blapply(debates2, function(debate) name(debate)),
  sep = "_"
)
# Query vectors. Every element is a token pattern wrapped in double quotes
# (the form used when counting with `cqp = TRUE`).
#
# Fixes relative to the previous version:
#   * "PaÃY" / "DoppelpaÃY.*" were mis-encoded spellings of "Paß" /
#     "Doppelpaß.*" (old-orthography variants of "Pass" / "Doppelpass") and
#     could never match; they are restored to the intended "ß".
#   * "Aufnahme.*" was listed twice in q3; the duplicate is dropped.

# q1: citizenship vocabulary (Staatsbürger, Einbürgerung, ius soli, ...).
q1 <- c(
  '"[Mm]ehrstaat.*"', '".*[Ss]taatsbürger.*"', '".*[Ss]taatsangeh.*rig.*"',
  '".*[Ss]taatszugeh.*rig.*"', '"[Ss]taatenlos.*"', '"[Aa]us.*bürger.*"',
  '"[Ee]in.*bürger.*"', '"Pass"', '"Paß"',
  '"Blutsrecht.*"', '"Geburtsrecht.*"', '"Geburtsprinzip.*"',
  '"[Ii]us soli"', '"[Ii]us sanguinis"', '"[Jj]us soli"', '"[Jj]us sanguinis"',
  '"[Dd]oppel.* [Ss]taat.*"', '"Abstammungsrecht.*"', '"Abstammungsprinzip.*"'
)

# q2: dual citizenship vocabulary (Doppelpass, Optionspflicht, ...).
q2 <- c(
  '"[Dd]oppelstaat.*"', '"[Mm]ehrstaat.*"',
  '"[Dd]oppel.* [Ss]taat.*"', '"Doppelpass.*"', '"Doppelpaß.*"',
  '"[Oo]ptionspflicht.*"', '"[Oo]ptionszwang.*"', '"Optionsmodell.*"'
)

# q3: migration and asylum vocabulary.
q3 <- c(
  '".*[Aa]syl.*"', '".*[Ff]lucht.*"', '".*[Ff]lücht.*"', '".*[Mm]igra.*"',
  '".*[Ee]in.*wander.*"', '".*[Gg]renz.*"', '"[Ff]amilienzusammen.*"',
  '".*[Aa]us.*bürger.*"',
  '".*[Aa]b.*schie.*"', '".*[Aa]b.*schob.*"', '".*[Ee]in.*bürger.*"',
  '".*[Aa]us.*sied.*"',
  '"Aufnahme.*"', '"[Vv]isa.*"', '"[Vv]isum.*"', '"Loyalitätskonflikt"',
  '"Identitätsfeststellung"',
  '"Rückführung.*"', '".*[Aa]usländ.*"', '".*[Rr]usslanddeutsch.*"',
  '"[Aa]ufenthalt.*"', '"Rückübernahme.*"', '"Ehegattennachzug"', '"Duldung.*"',
  '"Residenzpflicht"', '"Regelanfrage"', '".*Vertreib.*"', '".*Vertrieb.*"',
  '"AZR"'
)

# q4: all queries combined (patterns shared by q1 and q2 occur twice).
q4 <- c(q1, q2, q3)

## erase quotation marks to highlight protocols

# Remove the leading and trailing double quote from every query string so
# that only the bare pattern remains.
unquote_query <- function(queries) {
  gsub('^\\"(.*?)\\"$', '\\1', queries)
}

q1_regex <- unquote_query(q1)
q2_regex <- unquote_query(q2)
q3_regex <- unquote_query(q3)
q4_regex <- unquote_query(q4)

# Count the q2 queries in `debates2` and sort the resulting table by its
# TOTAL column in descending order (`order = -1L`).
# `TRUE` replaces the reassignable shorthand `T`.
dt6 <- count(
  debates2,
  query = q2,
  regex = TRUE,
  fill = TRUE,
  cqp = TRUE
) %>%
  setorderv(cols = "TOTAL", order = -1L)

# Keep the debates whose TOTAL in `dt6` is at least 4, selected by the
# values of the "partition" column.
debates_dual2 <- debates2[[subset(dt6, TOTAL >= 4)[["partition"]]]]

# Display the sixth selected debate and highlight the query patterns:
# orange for q4 (all queries), lightgreen for q1 and red for q2.
# `TRUE` replaces the reassignable shorthand `T`.
debates_dual2[[6]] %>%
  read() %>%
  highlight(
    orange = q4_regex,
    lightgreen = q1_regex,
    red = q2_regex,
    regex = TRUE
  )