PolMine / GermaParl

GermaParl R Data Package
12 stars 3 forks source link

Fix for germaparl_encode_lda_topics() #8

Open Studentenfutter opened 5 years ago

Studentenfutter commented 5 years ago

The germaparl_encode_lda_topics() returns an error because the decode() function of polmineR is deprecated. I found a solution by using the s_attribute_decode() function from RcppCWB:

germaparl_encode_lda_topics() <- 
function (k = 450, n = 5) {
germaparl_data_dir <- registry_file_parse(corpus = "GERMAPARL", 
        registry_dir = germaparl_regdir())[["home"]]
    corpus_charset <- registry_file_parse(corpus = "GERMAPARL")[["properties"]][["charset"]]
    model <- germaparl_load_topicmodel(k = 250)
    message("... getting topic matrix")
    topic_matrix <- topicmodels::topics(model, k = 5)
    topic_dt <- data.table(speech = colnames(topic_matrix), topics = apply(topic_matrix, 
        2, function(x) sprintf("|%s|", paste(x, collapse = "|"))), 
        key = "speech")
    message("... decoding s-attribute speech")
    if (!"speech" %in% s_attributes("GERMAPARL")) {
        stop("The s-attributes 'speech' is not yet present.", 
            "Use the function germaparl_add_s_attribute_speech to generate it.")
    }
    cpos_dt <- as.data.table(RcppCWB::s_attribute_decode("GERMAPARL", data_dir = germaparl_data_dir, s_attribute = "speech")) # Returns a data frame but setkeyv requires a data.table - converting
    names(cpos_dt)[names(cpos_dt) == "value"] <- "speech" # With the new function, speech gets renamed to "value" - changing it back to speech
    setkeyv(cpos_dt, "speech")
    cpos_dt2 <- topic_dt[cpos_dt]
    setorderv(cpos_dt2, cols = "cpos_left", order = 1L)
    cpos_dt2[["speech"]] <- NULL
    cpos_dt2[["id"]] <- NULL
    cpos_dt2[, `:=`(topics, ifelse(is.na(topics), "||", topics))]
    setcolorder(cpos_dt2, c("cpos_left", "cpos_right", "topics"))
    message("... running some sanity checks")
    coverage <- sum(cpos_dt2[["cpos_right"]] - cpos_dt2[["cpos_left"]]) + 
        nrow(cpos_dt2)
    if (coverage != size("GERMAPARL")) 
        stop()
    P <- partition("GERMAPARL", speech = ".*", regex = TRUE)
    if (sum(cpos_dt2[["cpos_left"]] - P@cpos[, 1]) != 0) 
        stop()
    if (sum(cpos_dt2[["cpos_right"]] - P@cpos[, 2]) != 0) 
        stop()
    if (length(sAttributes("GERMAPARL", "speech", unique = FALSE)) != 
        nrow(cpos_dt2)) 
        stop()
    message("... encoding s-attribute 'topics'")
    s_attribute_encode(values = cpos_dt2[["topics"]], data_dir = germaparl_data_dir, 
        s_attribute = "topics", corpus = "GERMAPARL", region_matrix = as.matrix(cpos_dt2[, 
            c("cpos_left", "cpos_right")]), registry_dir = germaparl_regdir(), 
        encoding = corpus_charset, method = "R", verbose = TRUE)
}
KevinGlock commented 5 years ago

I've tried this workflow on Windows. I don't no the problem with the topics, however, this workflow do not work when I run it:

Error in germaparl_encode_lda_topics() <- function(k = 250, n = 5) { : invalid (NULL) left side of assignment

We had tried this, but it does not work neither on my machine:

library(polmineR) use("GermaParl")

GermaParl::germaparl_add_s_attribute_speech()

GermaParl::germaparl_download_lda(250L) model <- GermaParl::germaparl_load_topicmodel(250L)

germaparl_encode_lda_topics <- function(corpus = corpus, model, k = 250, n = 5, package){

data_dir <- cwbtools::registry_file_parse(corpus = corpus, registry_dir = set_regdir(package))[["home"]] corpus_charset <- cwbtools::registry_file_parse(corpus = corpus)[["properties"]][["charset"]]

message("... getting topic matrix") topic_matrix <- topicmodels::topics(model, k = n) topic_dt <- data.table::data.table( speech = colnames(topic_matrix), topics = apply(topic_matrix, 2, function(x) sprintf("|%s|", paste(x, collapse = "|"))), key = "speech" )

message("... decoding s-attribute speech") if (!"speech" %in% s_attributes(corpus)){ stop("The s-attributes 'speech' is not yet present.", "Use the function add_s_attribute_speech to generate it.") } cpos_dt <- data.table::as.data.table(RcppCWB::s_attribute_decode(corpus, s_attribute = "speech", method = "R", data_dir = data_dir)) names(cpos_dt) <- c("cpos_left", "cpos_right", "speech") data.table::setkeyv(cpos_dt, "speech")

Merge tables

cpos_dt2 <- topic_dt[cpos_dt] data.table::setorderv(cpos_dt2, cols = "cpos_left", order = 1L) cpos_dt2[["speech"]] <- NULL cpos_dt2[["id"]] <- NULL cpos_dt2[, topics := ifelse(is.na(topics), "||", topics)] data.table::setcolorder(cpos_dt2, c("cpos_left", "cpos_right", "topics"))

some sanity tests

message("... running some sanity checks") coverage <- sum(cpos_dt2[["cpos_right"]] - cpos_dt2[["cpos_left"]]) + nrow(cpos_dt2) if (coverage != size(corpus)) stop("sizes don't match") P <- partition(corpus, speech = ".*", regex = TRUE) if (sum(cpos_dt2[["cpos_left"]] - P@cpos[,1]) != 0) stop() if (sum(cpos_dt2[["cpos_right"]] - P@cpos[,2]) != 0) stop() if (length(sAttributes(corpus, "speech", unique = FALSE)) != nrow(cpos_dt2)) stop()

message("... encoding s-attribute 'topics'") cwbtools::s_attribute_encode( values = cpos_dt2[["topics"]], # is still UTF-8, recoding done by s_attribute_encode data_dir = data_dir, s_attribute = "topics", corpus = corpus, region_matrix = as.matrix(cpos_dt2[, c("cpos_left", "cpos_right")]), registry_dir = set_regdir(package), encoding = corpus_charset, method = "R", verbose = TRUE, delete = FALSE ) }

set_regdir <- function(package){ system.file(package = package, "extdata", "cwb", "registry") }

germaparl_encode_lda_topics(corpus = "GERMAPARL", model = model, k = 250L, n = 5, package = "GermaParl")

R returns the error message that it cannot open the connection or that no such file or directory exist.

I hope Andreas have a solution that works. The IT team had also checked administration rights. Also to put the file in the repo and to read the file with readRDS do not work.