databricks / spark-corenlp

Stanford CoreNLP wrapper for Apache Spark
GNU General Public License v3.0
422 stars 120 forks source link

Is it possible to specify the "language" model? #17

Open ghammad opened 8 years ago

ghammad commented 8 years ago

Hi all,

First of all, thank you for making this wrapper available. It is really useful.

Could you let me know if it is possible to specify the underlying CoreNLP language model (English, French, ...)?

From what I understand of your code, it won't be easy, since you use the Simple CoreNLP API, but it should be possible. Do you have any ideas or plans to extend your code with this capability?

Regards,

Grégory

ghost commented 7 years ago

Hi, in my case I created a new function called, for example, "ner2" :)

/**
 * Spark UDF that returns the named-entity tag of every token in the input text.
 *
 * The text is run through the pipeline returned by `getOrCreateSentimentPipeline()`;
 * the tags are then collected sentence by sentence, token by token, in document order.
 * Only the NER tag is returned (no word text or lemma).
 */
def ner2 = udf { text: String =>
  val annotated = getOrCreateSentimentPipeline().process(text)

  for {
    sent <- annotated.get(classOf[SentencesAnnotation]).asScala.toList
    tok  <- sent.get(classOf[TokensAnnotation]).asScala.toList
  } yield tok.get(classOf[NamedEntityTagAnnotation])
}

/**
 * Returns the cached CoreNLP pipeline, building it on the first call.
 *
 * The pipeline is configured for Spanish: tokenization, sentence splitting, POS tagging,
 * lemmatization and NER, each pointed at the Spanish model files on the classpath.
 * The instance is stored in `sentimentPipeline` (declared outside this snippet) and
 * reused by every later call.
 *
 * NOTE(review): despite its name this no longer builds a sentiment pipeline; the name is
 * kept so existing callers continue to work.
 * NOTE(review): the null check is not synchronized here; confirm whether concurrent
 * first calls are a concern for the caller.
 *
 * @return the shared [[StanfordCoreNLP]] pipeline
 */
private def getOrCreateSentimentPipeline(): StanfordCoreNLP = {
  if (sentimentPipeline == null) {
    // Previous English sentiment configuration, kept for reference:
    //   "annotators" -> "tokenize, ssplit, parse, sentiment"
    val settings = Seq(
      "annotators" -> "tokenize, ssplit, pos, lemma, ner",
      "tokenize.language" -> "es",
      "tokenize.verbose" -> "true",
      "pos.model" -> "edu/stanford/nlp/models/pos-tagger/spanish/spanish-distsim.tagger",
      "ner.model" -> "edu/stanford/nlp/models/ner/spanish.ancora.distsim.s512.crf.ser.gz",
      "ner.applyNumericClassifiers" -> "false",
      "ner.useSUTime" -> "false",
      "ner.language" -> "spanish",
      // The parse/depparse/regexner keys below are not in the annotator list above;
      // presumably they only matter if those annotators are added — TODO confirm.
      "parse.model" -> "edu/stanford/nlp/models/lexparser/spanishPCFG.ser.gz",
      "depparse.model" -> "edu/stanford/nlp/models/parser/nndep/UD_Spanish.gz",
      "depparse.language" -> "spanish",
      "regexner.ignoreCase" -> "true",
      "regexner.verbose" -> "true"
    )

    val props = new Properties()
    settings.foreach { case (key, value) => props.setProperty(key, value) }

    sentimentPipeline = new StanfordCoreNLP(props)
  }
  sentimentPipeline
}