bnosac / udpipe

R package for Tokenization, Parts of Speech Tagging, Lemmatization and Dependency Parsing Based on the UDPipe Natural Language Processing Toolkit
https://bnosac.github.io/udpipe/en
Mozilla Public License 2.0
209 stars 33 forks source link

Create keywords using RAKE, adapt token and headtoken ids, and merge remaining fields. #102

Open ernestaigner opened 2 years ago

ernestaigner commented 2 years ago

Hi,

This function is an adaptation of the RAKE function. Unlike the original function, the output is an annotation table with corrected IDs and merged fields (in the case of the other fields).

best, Ernest

## Create keywords using RAKE, collapse multi-word keywords into single tokens
## (rewriting token_id / head_token_id accordingly), and merge the remaining
## per-token fields. Adapted from udpipe::keyword_rake(), but the output is a
## full annotation table rather than a keyword summary.
##
## Args:
##   at   annotation table as produced by udpipe (columns doc_id, paragraph_id,
##        sentence_id, token_id, token, lemma, upos, xpos, feats, head_token_id,
##        dep_rel, deps, misc). NOTE: modified by reference when it is already a
##        data.table (data.table `:=` semantics).
##   UPOS upos tags considered relevant keyword material. Default kept
##        byte-identical for backward compatibility; the duplicated "NOUN"
##        looks like a typo for c("NOUN", "ADJ") -- confirm with the author.
##   LRF  minimum keyword frequency for a keyword to replace the lemma.
##   LR   minimum RAKE score for a keyword to replace the lemma.
## Returns: the deduplicated annotation table (one row per merged token).
fun.kw.short <- function(at,UPOS=c("NOUN", "NOUN"),LRF=50,LR=1.5) {
  # Calculate the RAKE scores keeping the full information of the annotation
  # table. Based on udpipe::keyword_rake().
  library(udpipe)
  ## data.table must be attached explicitly: setorder(), setnames(), `:=`,
  ## .N, .SD are all used unqualified below. (The original loaded stringr
  ## instead, which is never used, and relied on data.table being attached
  ## already by the calling session.)
  library(data.table)
  if (!is.data.table(at)) at <- as.data.table(at)

  ## Create one common ID on the level of sentences.
  at[,group:=unique_identifier(at, fields = c("doc_id","paragraph_id","sentence_id"))]
  ## Copy lemma to 'word' so that keywords are built from lemmas.
  at[,word:=lemma]

  # RAKE
  ## Flag relevant terms.
  at[,.relevant:=upos %in% UPOS]
  ## Consecutive runs of relevant terms within a sentence form one keyword.
  at[,keyword_id:=rleid(group,.relevant)]
  ## Degree of each keyword (number of words minus 1).
  at[.relevant != FALSE,degree:=.N-1L,keyword_id]
  ## Build the keyword strings.
  at[.relevant != FALSE,keyword:=paste(word, collapse = " "),keyword_id]
  ## Degree of each word (sum of degrees over its occurrences).
  at[.relevant != FALSE,word_degree:=sum(degree),word]
  ## Frequency of each word.
  at[.relevant != FALSE,word_freq:=.N,word]
  ## RAKE score of each word.
  at[.relevant != FALSE,word_rake:=word_degree/word_freq]
  # Keywords
  ## Number of times the keyword is used.
  at[.relevant != FALSE, keyword_freq:=length(unique(keyword_id)), .(keyword, word)]
  ## Ngram size (number of distinct words in the keyword).
  at[.relevant != FALSE, keyword_ngram:=length(unique(word)), .(keyword)]
  ## RAKE score of each keyword (sum of the scores of its words).
  at[.relevant != FALSE, keyword_rake:=sum(word_rake),keyword_id]

  ## Replace lemma by the keyword when it passes both thresholds,
  ## and record the keyword's ngram size alongside.
  at[,lemma:=ifelse(keyword_rake>LR & keyword_freq >=LRF & !is.na(keyword),keyword,lemma)]
  at[,lemma_ng:=ifelse(keyword_rake>LR & keyword_freq >=LRF & !is.na(keyword),keyword_ngram,1)]

  # Remove helper columns.
  at[,(c("keyword","keyword_ngram","keyword_freq","keyword_rake",".relevant","word_freq","word_rake","word_degree","degree","keyword_id","group","word")):=NULL]

  # Merge head_token_id variables
  ## New token_id: consecutive rows sharing a lemma within a sentence collapse.
  setorder(at,doc_id,paragraph_id,sentence_id,token_id)
  at[,token_id2:=rleid(lemma),.(doc_id,paragraph_id,sentence_id)]
  ## Pad every id column to a fixed digit width (by adding 10^max(nchar)) so
  ## the columns can be concatenated into unambiguous composite ids.
  ## NOTE(review): assumes all id columns are numeric; a character doc_id such
  ## as "doc1" would make x + 10^... fail -- confirm against the callers.
  id_cols <- c("doc_id","paragraph_id","sentence_id","token_id","head_token_id","token_id2")
  fun.ext.int <- function (x) {x+10^max(nchar(x))}
  at[,(id_cols):=lapply(.SD,fun.ext.int),.SDcols=id_cols]
  at[,(id_cols):=lapply(.SD,as.numeric),.SDcols=id_cols]
  ## Modulus that recovers the padded token_id2 digits from a composite id.
  ## The original hardcoded 10000, which is only correct when the padded
  ## token_id2 width happens to be exactly 4 digits; derive it instead.
  tok_mod <- 10^max(nchar(as.character(at$token_id2)))
  ## Composite ids: old token id, new token id, old head id.
  at[,id1:=as.numeric(paste0(doc_id,paragraph_id,sentence_id,token_id))]
  at[,id2:=as.numeric(paste0(doc_id,paragraph_id,sentence_id,token_id2))]
  at[,head_id1:=as.numeric(paste0(doc_id,paragraph_id,sentence_id,head_token_id))]

  # Map every old head id onto the corresponding new composite id.
  at[,head_id2:=at[,.(id1,id2)][,unique(.SD)][match(at$head_id1,id1),id2]]
  ## Keep the dependency relation to root as-is.
  at[dep_rel=="root",head_id2:=head_id1]
  ## Extract the new head_token_id from the composite id.
  at[,head_token_id2:=head_id2 %% tok_mod]
  ## A token cannot head itself: rows merged into their own head get NA.
  at[,head_token_id2:=ifelse(head_token_id2==token_id2,NA,head_token_id2)]
  at[,c("token_id","head_id1","head_id2","id1","id2","head_token_id"):=NULL]
  setnames(at,gsub("id2","id",names(at)))

  # Merge entries in upos, xpos, feats, dep_rel, token and misc for rows that
  # now share the same token id.
  cols <- c("doc_id","paragraph_id","sentence_id","token_id")
  cols1 <- c("upos","xpos","feats","dep_rel","token","misc")
  at[,deps:=NULL] ##column not needed; if kept, as.character might be needed
  my.fun <- function(x) paste0(x[!is.na(x)&!duplicated(x)],collapse="_")
  at[,(cols1):=lapply(.SD,my.fun),by=cols,.SDcols=cols1]

  # Export: merged rows are now exact duplicates, keep one of each.
  at[,unique(.SD)]

}
jwijffels commented 2 years ago

thank you, can you provide an example on how to use the function?