bnosac / udpipe

R package for Tokenization, Parts of Speech Tagging, Lemmatization and Dependency Parsing Based on the UDPipe Natural Language Processing Toolkit
https://bnosac.github.io/udpipe/en
Mozilla Public License 2.0
209 stars 33 forks source link

Bitten by this: make sure txt_recode_ngram also works with ngram = 1, although one should not call it with terms that consist of only one word #65

Closed jwijffels closed 4 years ago

jwijffels commented 4 years ago

It should be this

#' Recode runs of consecutive tokens into compound multi-word terms
#'
#' Wherever `ngram` consecutive elements of `x`, pasted together with `sep`,
#' equal one of the terms in `compound`, the first element of the run is
#' replaced by the compound term and the remaining `ngram - 1` elements are
#' set to `NA`. With `ngram = 1` nothing is blanked.
#'
#' @param x A character vector of tokens.
#' @param compound A character vector of compound terms to look for.
#'   `NA` entries are ignored.
#' @param ngram Either a single integer giving the number of tokens every
#'   term in `compound` consists of, or an integer vector of the same length
#'   as `compound` giving that number per term. Terms whose `ngram` is `NA`
#'   are ignored.
#' @param sep The separator used to paste consecutive tokens together.
#' @return `x` with matched runs recoded. If there are no usable terms in
#'   `compound`, `x` is returned untouched.
txt_recode_ngram <- function(x, compound, ngram, sep = " ") {
  ngram <- as.integer(ngram)

  if (length(ngram) != 1) {
    stopifnot(length(ngram) == length(compound))
    # Handle the longest compounds first so that a shorter compound can not
    # break up a longer one. sort() drops NA sizes.
    sizes <- sort(unique(ngram), decreasing = TRUE)
    for (size in sizes) {
      # which() discards NA comparisons; plain logical indexing would turn
      # them into NA keywords.
      x <- txt_recode_ngram(x, compound = compound[which(ngram == size)],
                            ngram = size, sep = sep)
    }
    return(x)
  }

  keywords <- as.character(compound)
  # An NA keyword would match the NA placeholders in the ngram vector via
  # %in% and blank out genuine tokens.
  keywords <- keywords[!is.na(keywords)]
  if (length(keywords) == 0) {
    return(x)
  }

  # NOTE(review): relies on txt_nextgram returning a vector aligned with x
  # holding the ngram that starts at each position - confirm against udpipe.
  y <- udpipe::txt_nextgram(x, n = ngram, sep = sep)
  idx <- which(y %in% keywords)

  if (ngram > 1 && length(idx) > 1) {
    # Overlapping matches: keep them greedily from left to right. A match
    # starting inside an already merged run is skipped, otherwise it would be
    # written first and then wiped again by the blanking step below, losing
    # tokens.
    keep <- logical(length(idx))
    covered_until <- 0L
    for (j in seq_along(idx)) {
      if (idx[j] > covered_until) {
        keep[j] <- TRUE
        covered_until <- idx[j] + ngram - 1L
      }
    }
    idx <- idx[keep]
  }

  x[idx] <- y[idx]

  if (ngram > 1) {
    # Positions of the 2nd..n-th token of every merged run.
    loc <- as.vector(outer(idx, seq_len(ngram - 1L), `+`))
    loc <- loc[loc <= length(x)]
    if (length(loc) > 0) {
      x[loc] <- NA_character_
    }
  }
  x
}