bnosac / udpipe

R package for Tokenization, Parts of Speech Tagging, Lemmatization and Dependency Parsing Based on the UDPipe Natural Language Processing Toolkit
https://bnosac.github.io/udpipe/en
Mozilla Public License 2.0
209 stars 33 forks source link

start/end values wrong in case starts with SpacesBefore and also has SpacesAfter #54

Closed jwijffels closed 5 years ago

jwijffels commented 5 years ago

Example of where it fails

> txt <- "\n\n\nI    went to the bank to buy some bread. Oh no.\n\n"
> x   <- udpipe::udpipe(txt, "english")
> mapply(start = x$start, stop = x$end, FUN=function(start, stop) substr(txt, start, stop))
 [1] "b"    ". Oh" "no"   "\n\n" ""     ""     ""     ""     ""     ""     ""     ""     ""  

but these work fine

> txt <- "\n\n\nI    went to the bank to buy some bread. Oh no.\n\n"
> txt <- trimws(txt)
> x   <- udpipe::udpipe(txt, "english")
> mapply(start = x$start, stop = x$end, FUN=function(start, stop) substr(txt, start, stop))
 [1] "I"     "went"  "to"    "the"   "bank"  "to"    "buy"   "some"  "bread" "."     "Oh"    "no"    "."

> txt <- "\n\n\nI went to the bank to buy some bread. Oh  no. \n\n"
> txt <- trimws(txt)
> x   <- udpipe::udpipe(txt, "english")
> mapply(start = x$start, stop = x$end, FUN=function(start, stop) substr(txt, start, stop))

Basically caused by having SpacesBefore and SpacesAfter in the misc field of the same token, which only happens at the start of the text. We should probably advise users to call trimws on the text before annotating.

jwijffels commented 5 years ago

Should update udpipe_reconstruct to the following using strsplit instead of the regular expression with backreferences

## Reconstruct the original text and the character offsets of each token from
## the CoNLL-U FORM and MISC columns.
##
## Arguments
##   sentence_id   vector with the sentence identifier of each row
##   token         character vector with the FORM of each row
##   token_id      character vector with the CoNLL-U id of each row
##                 (multi-word tokens have a range id such as "1-2")
##   misc          character vector with the MISC field of each row (NA allowed)
##   only_from_to  logical, if TRUE the reconstructed text is not returned
##
## Value
##   a list with elements from and to (1-based inclusive character positions of
##   each token in the reconstructed text, NA for the words which are part of a
##   multi-word token) and, unless only_from_to is TRUE, element text with the
##   reconstructed text.
udpipe_reconstruct <- function(sentence_id, token, token_id, misc, only_from_to = FALSE){

  ##
  ## FROM THE UDPIPE DOCS: 
  ##

  # The markup uses the following MISC fields on tokens (not words in multi-word tokens):
  # SpacesBefore=content (by default empty): spaces/other content preceding the token
  # SpacesAfter=content (by default a space if SpaceAfter=No feature is not present, empty otherwise): spaces/other content following the token
  # SpacesInToken=content (by default equal to the FORM of the token): FORM of the token including original spaces (this is needed only if tokens are allowed to contain spaces and a token contains a tab or newline characters)

  # The content of all the three fields must be escaped to allow storing tabs and newlines. The following C-like schema is used:
  # \s: space
  # \t: tab
  # \r: CR character
  # \n: LF character
  # \p: | (pipe character)
  # \\: \ (backslash character)

  ## Decode the C-like escapes in ONE left-to-right pass. Chaining gsub calls is
  ## wrong here: an escaped backslash followed by the letter s would be decoded
  ## as a space, and a replacement consisting of a single backslash makes
  ## gsub drop the backslash altogether.
  unescape <- function(x){
    if(length(x) == 0L){
      return(character(0))
    }
    escapes <- c("\\s" = " ", "\\t" = "\t", "\\r" = "\r", "\\n" = "\n", "\\p" = "|", "\\\\" = "\\")
    hits <- gregexpr("\\\\(s|t|r|n|p|\\\\)", x)
    regmatches(x, hits) <- lapply(regmatches(x, hits), FUN = function(found) unname(escapes[found]))
    x
  }

  ## Extract the (still escaped) value of the field 'key' from each MISC string.
  ## Fields are separated by |, only a field starting with 'key=' is accepted
  ## and exactly one string is returned per element of x ("" if not found).
  misc_value <- function(x, key){
    prefix <- paste0(key, "=")
    vapply(strsplit(x, split = "|", fixed = TRUE), FUN = function(fields){
      hit <- fields[startsWith(fields, prefix)]
      if(length(hit) == 0L){
        ""
      }else{
        substring(hit[1L], nchar(prefix) + 1L)
      }
    }, FUN.VALUE = character(1), USE.NAMES = FALSE)
  }

  ## grepl gives FALSE for a missing misc, so NA rows get the default space after
  has_spacesafter_no <- grepl(pattern = "SpaceAfter=No", misc)
  has_spacesafter    <- grepl(pattern = "SpacesAfter=", misc)
  has_spacesbefore   <- grepl(pattern = "SpacesBefore=", misc)
  has_spacesintoken  <- grepl(pattern = "SpacesInToken=", misc)

  ##
  ## Spaces after
  ##
  ## by default (no SpacesAfter feature or missing misc) there is one space
  after <- rep(" ", length(token))
  ## if contains SpaceAfter=No, there is nothing to add
  after[has_spacesafter_no] <- ""
  ## if contains SpacesAfter=, use the decoded content as the after part
  idx <- which(has_spacesafter)
  after[idx] <- unescape(misc_value(misc[idx], "SpacesAfter"))
  ## Fix for using std::istringstream in udpipe_annotate as it always ends with a newline character
  after[length(after)] <- gsub("\n$", "", after[length(after)])

  ##
  ## Spaces before
  ##
  before <- rep("", length(token))
  ## if contains SpacesBefore=, use the decoded content as the before part
  idx <- which(has_spacesbefore)
  before[idx] <- unescape(misc_value(misc[idx], "SpacesBefore"))

  ##
  ## SpacesInToken - MISC field stores form of the token including original spaces if there is a space in the token which can not be handled by FORM
  ## The content is escaped in the same way as SpacesBefore/SpacesAfter and hence needs decoding as well
  ##
  idx <- which(has_spacesintoken)
  token[idx] <- unescape(misc_value(misc[idx], "SpacesInToken"))

  ##
  ## Construct original text
  ##
  original_txt <- sprintf("%s%s%s", before, token, after)

  ##
  ## Multi-word tokens are not considered
  ##   the row with the range id (e.g. 1-3) carries the surface text,
  ##   ALL the words inside that range (1, 2 and 3) are blanked out
  ##
  is_multi_word <- grepl("-", token_id)
  ids <- sprintf("%s.%s", sentence_id, token_id)
  ids_remove <- unlist(Map(function(sid, tid){
    bounds <- strsplit(tid, split = "-", fixed = TRUE)[[1]]
    span <- suppressWarnings(as.integer(bounds))
    if(length(span) >= 2L && !anyNA(span)){
      covered <- seq.int(span[1L], span[length(span)])
    }else{
      ## not a numeric range: fall back to the literal parts of the id
      covered <- bounds
    }
    sprintf("%s.%s", sid, covered)
  }, sentence_id[is_multi_word], as.character(token_id[is_multi_word])), use.names = FALSE)
  idx <- which(ids %in% ids_remove)
  original_txt[idx] <- ""

  ##
  ## Construct from-to
  ##
  before[idx] <- ""
  after[idx] <- ""

  ## each element of original_txt occupies [original_from, original_to] in the full text,
  ## the token itself starts after its leading and stops before its trailing whitespace
  nchars <- nchar(original_txt)
  original_to <- cumsum(nchars)
  original_from <- original_to - nchars + 1L
  from <- original_from + nchar(before)
  to <- original_to - nchar(after)
  from[idx] <- NA_integer_
  to[idx] <- NA_integer_

  if(only_from_to){
    return(list(from = from, to = to))  
  }else{
    return(list(text = paste(original_txt, collapse = ""),
                from = from,
                to = to))  
  }
}
jwijffels commented 5 years ago

Issue probably only happened for people who have leading/trailing spaces in their text.

jwijffels commented 5 years ago

Fixed in commit https://github.com/bnosac/udpipe/commit/fb76527f3f179f275bc8354e27f4e499d2212695