weso / sparkwdsub

Spark processing of wikidata subsets
MIT License
0 stars 3 forks source link

java.io.UTFDataFormatException when processing 2014 dump #5

Closed thewillyhuman closed 2 years ago

thewillyhuman commented 2 years ago

When processing the 2014 dump, the program fails with the following exception:

java.io.UTFDataFormatException
    at java.base/java.io.ObjectInputStream$BlockDataInputStream.readUTFSpan(ObjectInputStream.java:3708)
    at java.base/java.io.ObjectInputStream$BlockDataInputStream.readUTFBody(ObjectInputStream.java:3633)
    at java.base/java.io.ObjectInputStream$BlockDataInputStream.readUTF(ObjectInputStream.java:3437)
    at java.base/java.io.ObjectInputStream.readString(ObjectInputStream.java:2032)
    at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1661)
    at java.base/java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2464)
    at java.base/java.io.ObjectInputStream.defaultReadObject(ObjectInputStream.java:629)
    at java.base/java.net.URI.readObject(URI.java:1778)
    at java.base/jdk.internal.reflect.GeneratedMethodAccessor34.invoke(Unknown Source)
    at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.base/java.lang.reflect.Method.invoke(Method.java:566)
    at java.base/java.io.ObjectStreamClass.invokeReadObject(ObjectStreamClass.java:1175)
    at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2325)
    at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2196)
    at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1679)
    at java.base/java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2464)
    at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2358)
    at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2196)
    at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1679)
    at java.base/java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2464)
    at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2358)
    at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2196)
    at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1679)
    at java.base/java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2464)
    at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2358)
    at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2196)
    at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1679)
    at java.base/java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2464)
    at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2358)
    at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2196)
    at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1679)
    at java.base/java.io.ObjectInputStream.readArray(ObjectInputStream.java:2102)
    at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1667)
    at java.base/java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2464)
    at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2358)
    at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2196)
    at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1679)
    at java.base/java.io.ObjectInputStream.readObject(ObjectInputStream.java:493)
    at java.base/java.io.ObjectInputStream.readObject(ObjectInputStream.java:451)
    at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:76)
    at org.apache.spark.serializer.DeserializationStream$$anon$1.getNext(Serializer.scala:168)
    at org.apache.spark.util.NextIterator.hasNext(NextIterator.scala:73)
    at org.apache.spark.util.CompletionIterator.hasNext(CompletionIterator.scala:31)
    at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
    at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
    at org.apache.spark.storage.memory.MemoryStore.putIterator(MemoryStore.scala:221)
    at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:299)
    at org.apache.spark.storage.BlockManager.$anonfun$doPutIterator$1(BlockManager.scala:1423)
    at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$doPut(BlockManager.scala:1350)
    at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1414)
    at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:1237)
    at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:384)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:335)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
    at org.apache.spark.scheduler.Task.run(Task.scala:131)
    at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
    at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
    at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
    at java.base/java.lang.Thread.run(Thread.java:829)
thewillyhuman commented 2 years ago

This issue has been fixed in the latest versions of wdsub.