bnosac / udpipe

R package for Tokenization, Parts of Speech Tagging, Lemmatization and Dependency Parsing Based on the UDPipe Natural Language Processing Toolkit
https://bnosac.github.io/udpipe/en
Mozilla Public License 2.0
209 stars 33 forks source link

Create keywords using RAKE, adapt token and headtoken ids, and merge remaining fields. #102

Open ernestaigner opened 2 years ago

ernestaigner commented 2 years ago

Hi,

This function is an adaptation of the RAKE function. Unlike the original function, the output is an annotation table with corrected IDs and merged fields (in the case of the other fields).

best, Ernest

## Create keywords using RAKE, collapse multi-word keywords into single tokens
## (rewriting token_id / head_token_id accordingly), and merge the remaining
## per-token fields. Adapted from udpipe::keyword_rake(), but the output is a
## full annotation table rather than a keyword summary.
##
## Args:
##   at   annotation table as produced by udpipe (columns doc_id, paragraph_id,
##        sentence_id, token_id, token, lemma, upos, xpos, feats, head_token_id,
##        dep_rel, deps, misc). NOTE: modified by reference when it is already a
##        data.table (data.table `:=` semantics).
##   UPOS upos tags considered relevant keyword material. Default kept
##        byte-identical for backward compatibility; the duplicated "NOUN"
##        looks like a typo for c("NOUN", "ADJ") -- confirm with the author.
##   LRF  minimum keyword frequency for a keyword to replace the lemma.
##   LR   minimum RAKE score for a keyword to replace the lemma.
## Returns: the deduplicated annotation table (one row per merged token).
fun.kw.short <- function(at,UPOS=c("NOUN", "NOUN"),LRF=50,LR=1.5) {
  # Calculate the RAKE scores keeping the full information of the annotation
  # table. Based on udpipe::keyword_rake().
  library(udpipe)
  ## data.table must be attached explicitly: setorder(), setnames(), `:=`,
  ## .N, .SD are all used unqualified below. (The original loaded stringr
  ## instead, which is never used, and relied on data.table being attached
  ## already by the calling session.)
  library(data.table)
  if (!is.data.table(at)) at <- as.data.table(at)

  ## Create one common ID on the level of sentences.
  at[,group:=unique_identifier(at, fields = c("doc_id","paragraph_id","sentence_id"))]
  ## Copy lemma to 'word' so that keywords are built from lemmas.
  at[,word:=lemma]

  # RAKE
  ## Flag relevant terms.
  at[,.relevant:=upos %in% UPOS]
  ## Consecutive runs of relevant terms within a sentence form one keyword.
  at[,keyword_id:=rleid(group,.relevant)]
  ## Degree of each keyword (number of words minus 1).
  at[.relevant != FALSE,degree:=.N-1L,keyword_id]
  ## Build the keyword strings.
  at[.relevant != FALSE,keyword:=paste(word, collapse = " "),keyword_id]
  ## Degree of each word (sum of degrees over its occurrences).
  at[.relevant != FALSE,word_degree:=sum(degree),word]
  ## Frequency of each word.
  at[.relevant != FALSE,word_freq:=.N,word]
  ## RAKE score of each word.
  at[.relevant != FALSE,word_rake:=word_degree/word_freq]
  # Keywords
  ## Number of times the keyword is used.
  at[.relevant != FALSE, keyword_freq:=length(unique(keyword_id)), .(keyword, word)]
  ## Ngram size (number of distinct words in the keyword).
  at[.relevant != FALSE, keyword_ngram:=length(unique(word)), .(keyword)]
  ## RAKE score of each keyword (sum of the scores of its words).
  at[.relevant != FALSE, keyword_rake:=sum(word_rake),keyword_id]

  ## Replace lemma by the keyword when it passes both thresholds,
  ## and record the keyword's ngram size alongside.
  at[,lemma:=ifelse(keyword_rake>LR & keyword_freq >=LRF & !is.na(keyword),keyword,lemma)]
  at[,lemma_ng:=ifelse(keyword_rake>LR & keyword_freq >=LRF & !is.na(keyword),keyword_ngram,1)]

  # Remove helper columns.
  at[,(c("keyword","keyword_ngram","keyword_freq","keyword_rake",".relevant","word_freq","word_rake","word_degree","degree","keyword_id","group","word")):=NULL]

  # Merge head_token_id variables
  ## New token_id: consecutive rows sharing a lemma within a sentence collapse.
  setorder(at,doc_id,paragraph_id,sentence_id,token_id)
  at[,token_id2:=rleid(lemma),.(doc_id,paragraph_id,sentence_id)]
  ## Pad every id column to a fixed digit width (by adding 10^max(nchar)) so
  ## the columns can be concatenated into unambiguous composite ids.
  ## NOTE(review): assumes all id columns are numeric; a character doc_id such
  ## as "doc1" would make x + 10^... fail -- confirm against the callers.
  id_cols <- c("doc_id","paragraph_id","sentence_id","token_id","head_token_id","token_id2")
  fun.ext.int <- function (x) {x+10^max(nchar(x))}
  at[,(id_cols):=lapply(.SD,fun.ext.int),.SDcols=id_cols]
  at[,(id_cols):=lapply(.SD,as.numeric),.SDcols=id_cols]
  ## Modulus that recovers the padded token_id2 digits from a composite id.
  ## The original hardcoded 10000, which is only correct when the padded
  ## token_id2 width happens to be exactly 4 digits; derive it instead.
  tok_mod <- 10^max(nchar(as.character(at$token_id2)))
  ## Composite ids: old token id, new token id, old head id.
  at[,id1:=as.numeric(paste0(doc_id,paragraph_id,sentence_id,token_id))]
  at[,id2:=as.numeric(paste0(doc_id,paragraph_id,sentence_id,token_id2))]
  at[,head_id1:=as.numeric(paste0(doc_id,paragraph_id,sentence_id,head_token_id))]

  # Map every old head id onto the corresponding new composite id.
  at[,head_id2:=at[,.(id1,id2)][,unique(.SD)][match(at$head_id1,id1),id2]]
  ## Keep the dependency relation to root as-is.
  at[dep_rel=="root",head_id2:=head_id1]
  ## Extract the new head_token_id from the composite id.
  at[,head_token_id2:=head_id2 %% tok_mod]
  ## A token cannot head itself: rows merged into their own head get NA.
  at[,head_token_id2:=ifelse(head_token_id2==token_id2,NA,head_token_id2)]
  at[,c("token_id","head_id1","head_id2","id1","id2","head_token_id"):=NULL]
  setnames(at,gsub("id2","id",names(at)))

  # Merge entries in upos, xpos, feats, dep_rel, token and misc for rows that
  # now share the same token id.
  cols <- c("doc_id","paragraph_id","sentence_id","token_id")
  cols1 <- c("upos","xpos","feats","dep_rel","token","misc")
  at[,deps:=NULL] ##column not needed; if kept, as.character might be needed
  my.fun <- function(x) paste0(x[!is.na(x)&!duplicated(x)],collapse="_")
  at[,(cols1):=lapply(.SD,my.fun),by=cols,.SDcols=cols1]

  # Export: merged rows are now exact duplicates, keep one of each.
  at[,unique(.SD)]

}
jwijffels commented 2 years ago

thank you, can you provide an example on how to use the function?