weso / sparkwdsub

Spark processing of wikidata subsets
MIT License
0 stars 3 forks source link

2014 dumps contain an empty field #3

Closed thewillyhuman closed 2 years ago

thewillyhuman commented 2 years ago

Apparently the 2014 dumps contain some empty fields that produce the following exception when they are processed:

com.fasterxml.jackson.databind.exc.ValueInstantiationException: Cannot construct instance of `org.wikidata.wdtk.datamodel.implementation.TermImpl`, problem: A text has to be provided to create a MonolingualTextValue
 at [Source: UNKNOWN; line: 1, column: 269] (through reference chain: org.wikidata.wdtk.datamodel.implementation.ItemDocumentImpl["labels"]->java.util.LinkedHashMap["zh-hant"])
 at com.fasterxml.jackson.databind.exc.ValueInstantiationException.from(ValueInstantiationException.java:47)
 at com.fasterxml.jackson.databind.DeserializationContext.instantiationException(DeserializationContext.java:1732)
 at com.fasterxml.jackson.databind.deser.std.StdValueInstantiator.wrapAsJsonMappingException(StdValueInstantiator.java:491)
 at com.fasterxml.jackson.databind.deser.std.StdValueInstantiator.rewrapCtorProblem(StdValueInstantiator.java:514)
 at com.fasterxml.jackson.databind.deser.std.StdValueInstantiator.createFromObjectWith(StdValueInstantiator.java:285)
 at com.fasterxml.jackson.databind.deser.ValueInstantiator.createFromObjectWith(ValueInstantiator.java:229)
 at com.fasterxml.jackson.databind.deser.impl.PropertyBasedCreator.build(PropertyBasedCreator.java:198)
 at com.fasterxml.jackson.databind.deser.BeanDeserializer._deserializeUsingPropertyBased(BeanDeserializer.java:488)
 at com.fasterxml.jackson.databind.deser.BeanDeserializerBase.deserializeFromObjectUsingNonDefault(BeanDeserializerBase.java:1287)
 at com.fasterxml.jackson.databind.deser.BeanDeserializer.deserializeFromObject(BeanDeserializer.java:326)
 at com.fasterxml.jackson.databind.deser.BeanDeserializer.deserialize(BeanDeserializer.java:159)
 at com.fasterxml.jackson.databind.deser.std.MapDeserializer._readAndBindStringKeyMap(MapDeserializer.java:527)
 at com.fasterxml.jackson.databind.deser.std.MapDeserializer.deserialize(MapDeserializer.java:364)
 at com.fasterxml.jackson.databind.deser.std.MapDeserializer.deserialize(MapDeserializer.java:29)
 at com.fasterxml.jackson.databind.deser.SettableBeanProperty.deserialize(SettableBeanProperty.java:530)
 at com.fasterxml.jackson.databind.deser.BeanDeserializer._deserializeWithErrorWrapping(BeanDeserializer.java:528)
 at com.fasterxml.jackson.databind.deser.BeanDeserializer._deserializeUsingPropertyBased(BeanDeserializer.java:417)
 at com.fasterxml.jackson.databind.deser.BeanDeserializerBase.deserializeFromObjectUsingNonDefault(BeanDeserializerBase.java:1287)
 at com.fasterxml.jackson.databind.deser.BeanDeserializer.deserializeFromObject(BeanDeserializer.java:326)
 at com.fasterxml.jackson.databind.deser.BeanDeserializer._deserializeOther(BeanDeserializer.java:194)
 at com.fasterxml.jackson.databind.deser.BeanDeserializer.deserialize(BeanDeserializer.java:161)
 at com.fasterxml.jackson.databind.jsontype.impl.AsPropertyTypeDeserializer._deserializeTypedForId(AsPropertyTypeDeserializer.java:130)
 at com.fasterxml.jackson.databind.jsontype.impl.AsPropertyTypeDeserializer.deserializeTypedFromObject(AsPropertyTypeDeserializer.java:97)
 at com.fasterxml.jackson.databind.deser.AbstractDeserializer.deserializeWithType(AbstractDeserializer.java:254)
 at com.fasterxml.jackson.databind.deser.impl.TypeWrappedDeserializer.deserialize(TypeWrappedDeserializer.java:68)
 at com.fasterxml.jackson.databind.ObjectReader._bindAndClose(ObjectReader.java:1719)
 at com.fasterxml.jackson.databind.ObjectReader.readValue(ObjectReader.java:1261)
 at org.wikidata.wdtk.datamodel.helpers.JsonDeserializer.deserializeEntityDocument(JsonDeserializer.java:124)
 at es.weso.wdsub.spark.wbmodel.LineParser.line2EntityStatements(LineParser.scala:33)
 at es.weso.wdsub.spark.wbmodel.LineParser.$anonfun$dumpRDD2Graph$2(LineParser.scala:66)
 at scala.collection.Iterator$$anon$10.next(Iterator.scala:459)
 at scala.collection.Iterator$$anon$10.next(Iterator.scala:459)
 at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:484)
 at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:490)
 at scala.collection.Iterator.foreach(Iterator.scala:941)
 at scala.collection.Iterator.foreach$(Iterator.scala:941)
 at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
 at org.apache.spark.graphx.EdgeRDD$.$anonfun$fromEdges$1(EdgeRDD.scala:107)
 at org.apache.spark.graphx.EdgeRDD$.$anonfun$fromEdges$1$adapted(EdgeRDD.scala:105)
 at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndex$2(RDD.scala:915)
 at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndex$2$adapted(RDD.scala:915)
 at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
 at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
 at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
 at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
 at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
 at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
 at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
 at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
 at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
 at org.apache.spark.scheduler.Task.run(Task.scala:131)
 at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
 at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
 at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
 at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
 at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
 at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: java.lang.NullPointerException: A text has to be provided to create a MonolingualTextValue
 at java.base/java.util.Objects.requireNonNull(Objects.java:347)
 at org.apache.commons.lang3.Validate.notNull(Validate.java:225)
 at org.wikidata.wdtk.datamodel.implementation.TermImpl.<init>(TermImpl.java:70)
 at jdk.internal.reflect.GeneratedConstructorAccessor38.newInstance(Unknown Source)
 at java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
 at java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:490)
 at com.fasterxml.jackson.databind.introspect.AnnotatedConstructor.call(AnnotatedConstructor.java:124)
 at com.fasterxml.jackson.databind.deser.std.StdValueInstantiator.createFromObjectWith(StdValueInstantiator.java:283)

Line 33 of LineParser.scala is: `val entityDocument = jsonDeserializer.deserializeEntityDocument(line)`

And the line from the dump that produces the exception is:

{"id":"Q1246959","type":"item","aliases":{"zh":[{"language":"zh","value":"\u03f3"}],"it":[{"language":"it","value":"\u05d9"}]},"labels":{"la":{"language":"la","value":"Yot"},"zh-hans":{"language":"zh-hans","value":"\u037f"},"zh-hant":{"language":"zh-hant","removed":""},"zh-hk":{"language":"zh-hk","removed":""},"it":{"language":"it","value":"Jod"},"zh":{"language":"zh","value":"\u037f"},"fr":{"language":"fr","value":"Yot"}},"sitelinks":{"lawiki":{"site":"lawiki","title":"Yot","badges":[]},"zhwiki":{"site":"zhwiki","title":"\u037f","badges":[]},"itwiki":{"site":"itwiki","title":"Jod","badges":[]},"frwiki":{"site":"frwiki","title":"Yot","badges":[]}}},
thewillyhuman commented 2 years ago

This issue has been solved in the latest wdsub versions.