Closed frankandrobot closed 8 years ago
package com.basistech.dp.nlp4j.impl;
import com.basistech.rosette.dm.AnnotatedText;
import com.basistech.rosette.dm.Annotator;
import com.basistech.rosette.dm.ListAttribute;
import com.basistech.rosette.dm.MorphoAnalysis;
import com.basistech.rosette.dm.Token;
import com.google.common.io.Resources;
import edu.emory.mathcs.nlp.component.template.node.NLPNode;
import edu.emory.mathcs.nlp.decode.NLPDecoder;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
/**
 * {@link Annotator} that runs an NLP4J {@link NLPDecoder} over the tokens of an
 * {@link AnnotatedText} and rebuilds the token list from the decoded nodes.
 */
public class Nlp4jTaggerAnnotator implements Annotator {
    // the interface here wants one parse-result per sentence.
    // so we respect sentence boundaries in the input, and nothing else.
    // NOTE(review): the code in this class hands every token to the decoder as a single
    // sequence and never reads sentence annotations -- confirm the comment above still holds.
    private final NLPDecoder decoder;

    /**
     * Creates the annotator, loading the decoder configuration from the classpath resource
     * {@code config-decode-tagger-en.xml}.
     *
     * @throws RuntimeException wrapping the {@link IOException} raised while reading the
     *     configuration
     */
    public Nlp4jTaggerAnnotator() {
        URL configUrl = Resources.getResource("config-decode-tagger-en.xml");
        try (InputStream configInputStream = Resources.asByteSource(configUrl).openStream()) {
            decoder = new NLPDecoder(configInputStream);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Wraps {@code input} in a new {@link AnnotatedText} that carries only the raw data and
     * delegates to {@link #annotate(AnnotatedText)}.
     *
     * <p>NOTE(review): no token list is attached here, while {@link #annotate(AnnotatedText)}
     * requires one; presumably this call fails unless the builder supplies tokens -- confirm.
     *
     * @param input the raw text
     * @return the result of {@link #annotate(AnnotatedText)} on the wrapped text
     */
    @Override
    public AnnotatedText annotate(CharSequence input) {
        return annotate(new AnnotatedText.Builder().data(input).build());
    }

    /**
     * Decodes the tokens of {@code input} and returns a copy of {@code input} whose token list
     * is rebuilt from the decoded nodes.
     *
     * @param input text that already carries a token list
     * @return a new {@link AnnotatedText} built from {@code input} with the rebuilt tokens
     * @throws NullPointerException if {@code input} has no token list
     */
    @Override
    public AnnotatedText annotate(AnnotatedText input) {
        // Fetch the token list once and fail with a clear message instead of a bare NPE.
        ListAttribute<Token> tokens = java.util.Objects.requireNonNull(
            input.getTokens(), "input has no tokens; it must be tokenized before tagging");

        // Slot 0 holds the root node; token i becomes the node with ID i + 1.
        List<NLPNode> nodes = new ArrayList<>(tokens.size() + 1);
        NLPNode rootNode = new NLPNode();
        rootNode.toRoot();
        nodes.add(rootNode);
        nodes.addAll(
            IntStream.range(0, tokens.size())
                .mapToObj(i -> createNode(tokens.get(i), i + 1))
                .collect(Collectors.toList()));

        // Nlp4J likes arrays
        NLPNode[] nodesArray = nodes.toArray(new NLPNode[nodes.size()]);
        decoder.decode(nodesArray);

        ListAttribute.Builder<Token> tlBuilder = new ListAttribute.Builder<>(Token.class);
        // Start at 1: the root node does not correspond to an input token.
        for (int i = 1; i < nodesArray.length; i++) {
            tlBuilder.add(makeToken(nodesArray[i]));
        }

        AnnotatedText.Builder builder = new AnnotatedText.Builder(input);
        builder.tokens(tlBuilder.build());
        return builder.build();
    }

    /**
     * Converts a decoded node into a {@link Token} with one {@link MorphoAnalysis} holding the
     * node's part-of-speech tag and, when present, its lemma. Word clusters, when present, are
     * stored under the extended property {@code word_clusters}.
     *
     * @param node a node that has been through the decoder
     * @return the token built from the node's offsets, word form and analysis
     */
    private Token makeToken(NLPNode node) {
        MorphoAnalysis.Builder maBuilder = new MorphoAnalysis.Builder();
        maBuilder.partOfSpeech(node.getPartOfSpeechTag());
        if (node.getLemma() != null) {
            maBuilder.lemma(node.getLemma());
        }

        Token.Builder builder =
            new Token.Builder(node.getStartOffset(), node.getEndOffset(), node.getWordForm());
        builder.addAnalysis(maBuilder.build());
        if (node.getWordClusters() != null) {
            builder.extendedProperty("word_clusters", node.getWordClusters());
        }
        return builder.build();
    }

    /**
     * Creates a decoder node carrying the token's text and offsets.
     *
     * @param token the source token
     * @param index the ID to assign to the node
     * @return the populated node
     */
    private NLPNode createNode(Token token, int index) {
        NLPNode node = decoder.create();
        node.setWordForm(token.getText());
        node.setStartOffset(token.getStartOffset());
        node.setEndOffset(token.getEndOffset());
        node.setID(index);
        return node;
    }
}
Yeah, thanks — I ended up figuring it out by looking at a demo:
import edu.emory.mathcs.nlp.decode.DecodeConfig
import edu.emory.mathcs.nlp.common.util.IOUtils
import edu.emory.mathcs.nlp.component.template.node.NLPNode
import edu.emory.mathcs.nlp.decode.NLPDecoder
/**
 * Runs an NLP4J decoder over [string] and returns the decoded nodes.
 *
 * The decoder configuration is read from `src/main/resources/nlp4j/config.xml`.
 * A new decoder is built on every call.
 *
 * @param string the raw text to decode
 * @return the nodes produced by the decoder
 */
fun posTagger(string : String) : Array<NLPNode> {
    val configUri = "src/main/resources/nlp4j/config.xml"
    // Close the config stream once the configuration object has been built.
    // Assumes DecodeConfig reads the stream fully in its constructor -- TODO confirm.
    val config = IOUtils.createFileInputStream(configUri).use { DecodeConfig(it) }
    val decoder = NLPDecoder(config)
    return decoder.decode(string)
}
No example was given and I don't see the source code here.