bnqtoan / clearnlp

Automatically exported from code.google.com/p/clearnlp
Other
0 stars 0 forks source link

ClearNLP Error: java.lang.NullPointerException at com.googlecode.clearnlp.tokenization.EnglishTokenizer.normalizeNonUTF8 #10

Open GoogleCodeExporter opened 9 years ago

GoogleCodeExporter commented 9 years ago
I'm trying to find a good Semantic Role Labeling tool that I can use in my Java 
code with NetBeans.
I tried ClearNLP, and the installation test from this link worked and produced the 
right output: https://code.google.com/p/clearnlp/wiki/Installation

But when I used the following code:

    /*
     * To change this license header, choose License Headers in Project Properties.
     * To change this template file, choose Tools | Templates
     * and open the template in the editor.
     */
    package stanfordposcode;

    import java.io.BufferedReader;
    import java.io.FileInputStream;
    import java.io.PrintStream;
    import java.util.List;

    import com.googlecode.clearnlp.component.AbstractComponent;
    import com.googlecode.clearnlp.dependency.DEPTree;
    import com.googlecode.clearnlp.engine.EngineGetter;
    import com.googlecode.clearnlp.nlp.NLPDecode;
    import com.googlecode.clearnlp.nlp.NLPLib;
    import com.googlecode.clearnlp.reader.AbstractReader;
    import com.googlecode.clearnlp.segmentation.AbstractSegmenter;
    import com.googlecode.clearnlp.tokenization.AbstractTokenizer;
    import com.googlecode.clearnlp.util.UTInput;
    import com.googlecode.clearnlp.util.UTOutput;

    // Import log4j classes.
     import org.apache.log4j.Logger;
     import org.apache.log4j.BasicConfigurator;

    /**
     * Demo driver for the ClearNLP pipeline: loads a tokenizer plus the POS, morphology,
     * dependency, predicate, role and SRL components, then writes the semantic-role
     * output for one hard-coded sentence (to stdout) and for every sentence of an input
     * file (to an output file).
     */
    public class SRL
    {
            final String language = AbstractReader.LANG_EN;
            static Logger logger = Logger.getLogger(SRL.class);

            /**
             * Loads all models and immediately runs both demo passes.
             *
             * @param dictFile      path of the dictionary archive (used by the tokenizer and the morphological analyzer)
             * @param posModelFile  path of the part-of-speech model
             * @param depModelFile  path of the dependency-parsing model
             * @param predModelFile path of the predicate-identification model
             * @param roleModelFile path of the roleset-classification model
             * @param srlModelFile  path of the semantic-role-labeling model
             * @param inputFile     path of the raw text file to process
             * @param outputFile    path of the file the SRL output is written to
             * @throws Exception if a file cannot be opened or a model fails to load
             */
            public SRL(String dictFile, String posModelFile, String depModelFile, String predModelFile, String roleModelFile, String srlModelFile, String inputFile, String outputFile) throws Exception
            {
                    AbstractTokenizer tokenizer  = loadTokenizer(dictFile);
                    AbstractComponent tagger     = loadComponent(posModelFile , NLPLib.MODE_POS);
                    AbstractComponent analyzer   = loadComponent(dictFile     , NLPLib.MODE_MORPH);
                    AbstractComponent parser     = loadComponent(depModelFile , NLPLib.MODE_DEP);
                    AbstractComponent identifier = loadComponent(predModelFile, NLPLib.MODE_PRED);
                    AbstractComponent classifier = loadComponent(roleModelFile, NLPLib.MODE_ROLE);
                    AbstractComponent labeler    = loadComponent(srlModelFile , NLPLib.MODE_SRL);

                    // Order matters: each component is applied to the tree in this sequence.
                    AbstractComponent[] components = {tagger, analyzer, parser, identifier, classifier, labeler};

                    String sentence = "I'd like to meet Dr. Choi.";
                    process(tokenizer, components, sentence);

                    BufferedReader reader = UTInput.createBufferedFileReader(inputFile);

                    try
                    {
                            process(tokenizer, components, reader, UTOutput.createPrintBufferedFileStream(outputFile));
                    }
                    finally
                    {
                            // The reader is opened here, so it is released here even when processing fails.
                            reader.close();
                    }
            }

            /**
             * Opens the dictionary file, builds the tokenizer from it and closes the file again.
             * NOTE(review): assumes EngineGetter has finished reading the stream when it returns - confirm.
             */
            private AbstractTokenizer loadTokenizer(String dictFile) throws Exception
            {
                    FileInputStream in = new FileInputStream(dictFile);

                    try
                    {
                            return EngineGetter.getTokenizer(language, in);
                    }
                    finally
                    {
                            in.close();
                    }
            }

            /**
             * Opens a model file, builds the component for the given mode and closes the file again.
             * NOTE(review): assumes EngineGetter has finished reading the stream when it returns - confirm.
             */
            private AbstractComponent loadComponent(String modelFile, String mode) throws Exception
            {
                    FileInputStream in = new FileInputStream(modelFile);

                    try
                    {
                            return EngineGetter.getComponent(in, language, mode);
                    }
                    finally
                    {
                            in.close();
                    }
            }

            /**
             * Tokenizes a single sentence, runs every component over the resulting tree
             * and prints the SRL representation to standard output.
             *
             * @param tokenizer  tokenizer used to split the sentence
             * @param components components applied to the tree, in array order
             * @param sentence   raw sentence text
             */
            public void process(AbstractTokenizer tokenizer, AbstractComponent[] components, String sentence)
            {
                    DEPTree tree = NLPDecode.toDEPTree(tokenizer.getTokens(sentence));

                    for (AbstractComponent component : components)
                    {
                            component.process(tree);
                    }

                    System.out.println(tree.toStringSRL()+"\n");
            }

            /**
             * Segments the reader's text into sentences, runs every component over each
             * sentence's tree and prints the SRL representation to {@code fout}.
             * {@code fout} is always closed before this method returns, including on failure.
             *
             * @param tokenizer  tokenizer handed to the sentence segmenter
             * @param components components applied to each tree, in array order
             * @param reader     source of the raw text; not closed by this method
             * @param fout       destination of the SRL output; closed by this method
             */
            public void process(AbstractTokenizer tokenizer, AbstractComponent[] components, BufferedReader reader, PrintStream fout)
            {
                    try
                    {
                            AbstractSegmenter segmenter = EngineGetter.getSegmenter(language, tokenizer);

                            for (List<String> tokens : segmenter.getSentences(reader))
                            {
                                    DEPTree tree = NLPDecode.toDEPTree(tokens);

                                    for (AbstractComponent component : components)
                                    {
                                            component.process(tree);
                                    }

                                    fout.println(tree.toStringSRL()+"\n");
                            }
                    }
                    finally
                    {
                            // Close (and thereby flush) the output even if a component throws mid-file.
                            fout.close();
                    }
            }

            /**
             * Configures log4j, then runs the demo with hard-coded model and data paths.
             *
             * @param args ignored
             */
            public static void main(String[] args)
            {
                    BasicConfigurator.configure();

                    String dictFile      = "/Users/ha/clearnlp/dictionary-1.3.1.jar"; // e.g., dictionary.zip
                    String posModelFile  = "/Users/ha/clearnlp/ontonotes-en-pos-1.3.0.jar"; // e.g., ontonotes-en-pos.tgz
                    String depModelFile  = "/Users/ha/clearnlp/ontonotes-en-dep-1.3.0.jar"; // e.g., ontonotes-en-dep.tgz
                    String predModelFile = "/Users/ha/clearnlp/ontonotes-en-pred-1.3.0.jar"; // e.g., ontonotes-en-pred.tgz
                    String roleModelFile = "/Users/ha/clearnlp/ontonotes-en-role-1.3.0.jar"; // e.g., ontonotes-en-role.tgz
                    String srlModelFile  = "/Users/ha/clearnlp/ontonotes-en-srl-1.3.0.jar"; // e.g., ontonotes-en-srl.tgz
                    String inputFile     = "/Users/ha/NetBeansProjects/StanfordPOSCode/src/stanfordposcode/input.txt";
                    String outputFile    = "/Users/ha/NetBeansProjects/StanfordPOSCode/src/stanfordposcode/output.txt";

                    try
                    {
                            new SRL(dictFile, posModelFile, depModelFile, predModelFile, roleModelFile, srlModelFile, inputFile, outputFile);
                    }
                    catch (Exception e)
                    {
                            // Report through the logger configured above rather than a bare stack trace.
                            logger.error("SRL pipeline failed", e);
                    }
            }

    }

I got the following error:

    ........
    13084 [main] INFO com.googlecode.clearnlp.classification.model.StringModel  - Loading model:

    java.lang.NullPointerException
        at com.googlecode.clearnlp.tokenization.EnglishTokenizer.normalizeNonUTF8(EnglishTokenizer.java:362)
        at com.googlecode.clearnlp.tokenization.EnglishTokenizer.getTokenList(EnglishTokenizer.java:111)
        at com.googlecode.clearnlp.tokenization.AbstractTokenizer.getTokens(AbstractTokenizer.java:61)
        at stanfordposcode.SRL.process(SRL.java:54)
        at stanfordposcode.SRL.<init>(SRL.java:48)
        at stanfordposcode.SRL.main(SRL.java:95)
    BUILD SUCCESSFUL (total time: 18 seconds)

I already added all the jar files:

http://i.stack.imgur.com/cIECT.png

How can I solve this error?
Also, is there a better SRL tool that I can use?

Thanks in advance

Original issue reported on code.google.com by sos...@gmail.com on 15 Jan 2015 at 3:58

GoogleCodeExporter commented 9 years ago
Please HELP!!

Original comment by sos...@gmail.com on 18 Jan 2015 at 12:06