JohnSnowLabs / spark-nlp

State of the Art Natural Language Processing
https://sparknlp.org/
Apache License 2.0
3.87k stars 711 forks source link

Replicate NerDLPipeline in Java #380

Closed jeffisenhart closed 5 years ago

jeffisenhart commented 5 years ago

I am trying to replicate NerDLPipeline in Java (standalone) and getting an exception

Here is the code. I expect to see the output of the call

pm.select("ner","ner_converter").show(false);

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.PipelineStage;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

import com.johnsnowlabs.nlp.DocumentAssembler;
import com.johnsnowlabs.nlp.Finisher;
import com.johnsnowlabs.nlp.annotators.Normalizer;
import com.johnsnowlabs.nlp.annotators.Tokenizer;
import com.johnsnowlabs.nlp.annotators.ner.NerConverter;
import com.johnsnowlabs.nlp.annotators.ner.dl.NerDLModel;

import scala.collection.JavaConverters;
import scala.collection.immutable.Seq;

/**
 * Standalone Java replication of the Spark NLP NerDLPipeline example.
 *
 * <p>Builds a pipeline of DocumentAssembler -> Tokenizer -> Normalizer ->
 * pretrained NerDLModel -> NerConverter -> Finisher, runs it over a few
 * sample sentences, and prints the "ner" and "ner_converter" columns.
 */
public class NerExample {

    /**
     * Converts a Java {@code List} into a Scala {@code Seq} so it can be
     * passed to {@code SparkSession.createDataset}.
     *
     * <p>The cast is needed because {@code toSeq()} is declared as
     * {@code scala.collection.Seq} in Scala 2.11 while this code imports
     * {@code scala.collection.immutable.Seq}; the concrete result of
     * {@code asScala().toSeq()} is immutable, so the cast holds at runtime.
     *
     * @param data Java list to convert
     * @return the same elements as an immutable Scala Seq
     */
    @SuppressWarnings("unchecked")
    private Seq<String> segment(List<String> data) {
        return (Seq<String>) JavaConverters.asScalaIteratorConverter(data.iterator()).asScala().toSeq();
    }

    /**
     * Runs the NER pipeline over the given sentences and shows the raw
     * "ner" annotations alongside the chunked "ner_converter" output.
     *
     * @param spark active SparkSession
     * @param data  sentences to annotate
     */
    public void getEntities(SparkSession spark, List<String> data) {

        // FIX: the input column must be named "text" to match
        // DocumentAssembler.setInputCol("text") below; a bare toDF() would
        // name the column "value" and make the pipeline fail at transform time.
        Dataset<Row> ds = spark.createDataset(segment(data), Encoders.STRING()).toDF("text");

        DocumentAssembler da = new DocumentAssembler();
        da.setInputCol("text").setOutputCol("document");

        Tokenizer tok = new Tokenizer();
        tok.setInputCols(new String[] {"document"});
        tok.setOutputCol("token");

        Normalizer normalizer = new Normalizer();
        normalizer.setInputCols(new String[] {"token"});
        normalizer.setOutputCol("normal");

        // pretrained$default$N() supplies the Scala default arguments
        // (model name / language / remote source) from Java.
        NerDLModel ner = NerDLModel.pretrained(NerDLModel.pretrained$default$1(), NerDLModel.pretrained$default$2(), NerDLModel.pretrained$default$3());
        ner.setInputCols(new String[] {"normal", "document"});
        ner.setOutputCol("ner");

        NerConverter nerConverter = new NerConverter();
        nerConverter.setInputCols(new String[] {"document", "normal", "ner"});
        nerConverter.setOutputCol("ner_converter");

        Finisher finisher = new Finisher();
        finisher.setInputCols(new String[] {"ner", "ner_converter"});
        finisher.setIncludeMetadata(true);
        finisher.setOutputAsArray(false);
        finisher.setCleanAnnotations(false);
        finisher.setAnnotationSplitSymbol("@");
        finisher.setValueSplitSymbol("#");

        Pipeline pipeline = new Pipeline();
        pipeline.setStages(new PipelineStage[] {da, tok, normalizer, ner, nerConverter, finisher});

        // All stages are pretrained or rule-based, so fitting on an empty
        // (but correctly-schemed) dataset is enough to obtain a PipelineModel.
        Dataset<Row> empty = spark.createDataset(new ArrayList<String>(), Encoders.STRING()).toDF("text");
        Dataset<Row> pm = pipeline.fit(empty).transform(ds);
        pm.select("ner", "ner_converter").show(false);
    }

    /**
     * Entry point: starts a local SparkSession, annotates a few sample
     * sentences, and always stops the session afterwards.
     */
    public static void main(String[] args) {
        SparkSession spark = null;
        try {
            spark = SparkSession.builder().appName("Simple Application").master("local[*]")
                    .config("spark.driver.memory", "12G")
                    .config("spark.kryoserializer.buffer.max", "200M")
                    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
                    .getOrCreate();
            List<String> data = new ArrayList<>();
            data.add("Peter Parker is from New Zealand, and he is a wonderful man born in Germany, right in Berlin");
            data.add("Google is a famous company");
            data.add("Peter Parker is a super hero");
            new NerExample().getEntities(spark, data);
        } catch (Throwable t) {
            t.printStackTrace();
        } finally {
            if (spark != null) {
                spark.stop();
            }
        }
    }
}

When this line executes

NerDLModel ner = NerDLModel.pretrained(NerDLModel.pretrained$default$1(), NerDLModel.pretrained$default$2(), NerDLModel.pretrained$default$3());

I get the following exception:

java.lang.IllegalAccessError: tried to access method com.google.common.base.Stopwatch.<init>()V from class org.apache.hadoop.mapred.FileInputFormat
    at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:312)
    at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:204)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
    at scala.Option.getOrElse(Option.scala:121)
    at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
    at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
    at scala.Option.getOrElse(Option.scala:121)
    at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
    at org.apache.spark.rdd.RDD$$anonfun$take$1.apply(RDD.scala:1343)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
    at org.apache.spark.rdd.RDD.take(RDD.scala:1337)
    at org.apache.spark.rdd.RDD$$anonfun$first$1.apply(RDD.scala:1378)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
    at org.apache.spark.rdd.RDD.first(RDD.scala:1377)
    at org.apache.spark.ml.util.DefaultParamsReader$.loadMetadata(ReadWrite.scala:615)
    at org.apache.spark.ml.util.DefaultParamsReader.load(ReadWrite.scala:493)
    at com.johnsnowlabs.nlp.FeaturesReader.load(ParamsAndFeaturesReadable.scala:12)
    at com.johnsnowlabs.nlp.FeaturesReader.load(ParamsAndFeaturesReadable.scala:8)
    at com.johnsnowlabs.nlp.pretrained.ResourceDownloader$.downloadModel(ResourceDownloader.scala:115)
    at com.johnsnowlabs.nlp.pretrained.ResourceDownloader$.downloadModel(ResourceDownloader.scala:109)
    at com.johnsnowlabs.nlp.annotators.ner.dl.PretrainedNerDL$class.pretrained(NerDLModel.scala:117)
    at com.johnsnowlabs.nlp.annotators.ner.dl.NerDLModel$.pretrained(NerDLModel.scala:121)
    at com.johnsnowlabs.nlp.annotators.ner.dl.NerDLModel.pretrained(NerDLModel.scala)
    at hack.spark.NerExample.getEntities(NerExample.java:46)
    at hack.spark.NerExample.main(NerExample.java:174)
maziyarpanahi commented 5 years ago

This is actually related to the version of guava used by Hadoop. Are you using any other packages that might be incompatible with the versions shipped with Spark 2.4.0? A few quick fixes: add hadoop-client in a version that supports a newer guava, or inspect your dependency hierarchy and exclude the artifact that pulls in the conflicting guava.

jeffisenhart commented 5 years ago

OK, I've trimmed down the dependencies to see what the issue might be and now getting a different error. Here is the exhaustive list of my dependencies.

    compile group: 'org.apache.spark', name: 'spark-core_2.11', version: '2.4.0'
    compile group: 'org.apache.spark', name: 'spark-sql_2.11', version: '2.4.0'
    compile group: 'com.johnsnowlabs.nlp', name: 'spark-nlp_2.11', version: '1.8.2'
    compile group: 'com.johnsnowlabs.nlp', name: 'spark-nlp-ocr_2.11', version: '1.8.2'
    compile group: 'org.apache.spark', name: 'spark-mllib_2.11', version: '2.4.0'
    compile group: 'com.amazonaws', name: 'aws-java-sdk-core', version: '1.11.372'
    compile group: 'com.amazonaws', name: 'aws-java-sdk-s3', version: '1.11.372'

And the current issue:

java.lang.NoSuchMethodError: com.google.common.base.Stopwatch.elapsedMillis()J
    at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:245)
    at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:313)
    at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:204)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
    at scala.Option.getOrElse(Option.scala:121)
    at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
    at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
    at scala.Option.getOrElse(Option.scala:121)
    at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
    at org.apache.spark.rdd.RDD$$anonfun$take$1.apply(RDD.scala:1343)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
    at org.apache.spark.rdd.RDD.take(RDD.scala:1337)
    at org.apache.spark.rdd.RDD$$anonfun$first$1.apply(RDD.scala:1378)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
    at org.apache.spark.rdd.RDD.first(RDD.scala:1377)
    at org.apache.spark.ml.util.DefaultParamsReader$.loadMetadata(ReadWrite.scala:615)
    at org.apache.spark.ml.util.DefaultParamsReader.load(ReadWrite.scala:493)
    at com.johnsnowlabs.nlp.FeaturesReader.load(ParamsAndFeaturesReadable.scala:12)
    at com.johnsnowlabs.nlp.FeaturesReader.load(ParamsAndFeaturesReadable.scala:8)
    at com.johnsnowlabs.nlp.pretrained.ResourceDownloader$.downloadModel(ResourceDownloader.scala:115)
    at com.johnsnowlabs.nlp.pretrained.ResourceDownloader$.downloadModel(ResourceDownloader.scala:109)
    at com.johnsnowlabs.nlp.annotators.ner.dl.PretrainedNerDL$class.pretrained(NerDLModel.scala:117)
    at com.johnsnowlabs.nlp.annotators.ner.dl.NerDLModel$.pretrained(NerDLModel.scala:121)
    at com.johnsnowlabs.nlp.annotators.ner.dl.NerDLModel.pretrained(NerDLModel.scala)
    at hack.NerExample.getEntities(NerExample.java:46)
    at hack.NerExample.main(NerExample.java:175)
jeffisenhart commented 5 years ago

I finally got the code to run by pinning guava to version 15

    compile ("com.google.guava:guava:15.0:cdi1.0") {
      force = true
    }
maziyarpanahi commented 5 years ago

Good to hear that :)