AtlasOfLivingAustralia / biocache-store

Occurrence processing, indexing and batch processing
Other
7 stars 24 forks source link

Regression in latest biocache DwCALoader with None.get #298

Closed ansell closed 5 years ago

ansell commented 5 years ago

The following occurred when loading a Darwin Core Archive using the latest biocache.

aws-bstore-2b 2018-11-23 10:13:41,690 INFO : [DataLoader] - 139000, >> last key : dr4766|ALS-YARRAWONGA-2015-08-17-Other Chlorophyceae-, UUID: 42bf91d0-68d8-4b8b-80d5-2c0324345a97, records per sec: 1011.1223
aws-bstore-2b 2018-11-23 10:13:41,870 ERROR: [DataLoader] - None.get
java.util.NoSuchElementException: None.get
    at scala.None$.get(Option.scala:313)
    at scala.None$.get(Option.scala:311)
    at au.org.ala.biocache.load.DwCALoader$$anonfun$loadArchive$1$$anonfun$11.apply(DwCALoader.scala:234)
    at au.org.ala.biocache.load.DwCALoader$$anonfun$loadArchive$1$$anonfun$11.apply(DwCALoader.scala:234)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
    at scala.collection.immutable.List.foreach(List.scala:318)
    at scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
    at scala.collection.AbstractTraversable.map(Traversable.scala:105)
    at au.org.ala.biocache.load.DwCALoader$$anonfun$loadArchive$1.apply(DwCALoader.scala:234)
    at au.org.ala.biocache.load.DwCALoader$$anonfun$loadArchive$1.apply(DwCALoader.scala:228)
    at scala.collection.immutable.List.foreach(List.scala:318)
    at au.org.ala.biocache.load.DwCALoader.loadArchive(DwCALoader.scala:228)
    at au.org.ala.biocache.load.DwCALoader$$anonfun$load$1.apply(DwCALoader.scala:126)
    at au.org.ala.biocache.load.DwCALoader$$anonfun$load$1.apply(DwCALoader.scala:117)
    at scala.collection.immutable.List.foreach(List.scala:318)
    at au.org.ala.biocache.load.DwCALoader.load(DwCALoader.scala:117)
    at au.org.ala.biocache.load.Loader.load(Loader.scala:209)
    at au.org.ala.biocache.load.Loader$$anonfun$main$4.apply(Loader.scala:96)
    at au.org.ala.biocache.load.Loader$$anonfun$main$4.apply(Loader.scala:96)
    at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
    at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:108)
    at au.org.ala.biocache.load.Loader$.main(Loader.scala:95)
    at au.org.ala.biocache.cmd.CMD2$.main(CMD2.scala:133)
    at au.org.ala.biocache.cmd.CMD2.main(CMD2.scala)
Exception in thread "main" java.util.NoSuchElementException: None.get
    at scala.None$.get(Option.scala:313)
    at scala.None$.get(Option.scala:311)
    at au.org.ala.biocache.load.DwCALoader$$anonfun$loadArchive$1$$anonfun$11.apply(DwCALoader.scala:234)
    at au.org.ala.biocache.load.DwCALoader$$anonfun$loadArchive$1$$anonfun$11.apply(DwCALoader.scala:234)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
    at scala.collection.immutable.List.foreach(List.scala:318)
    at scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
    at scala.collection.AbstractTraversable.map(Traversable.scala:105)
    at au.org.ala.biocache.load.DwCALoader$$anonfun$loadArchive$1.apply(DwCALoader.scala:234)
    at au.org.ala.biocache.load.DwCALoader$$anonfun$loadArchive$1.apply(DwCALoader.scala:228)
    at scala.collection.immutable.List.foreach(List.scala:318)
    at au.org.ala.biocache.load.DwCALoader.loadArchive(DwCALoader.scala:228)
    at au.org.ala.biocache.load.DwCALoader$$anonfun$load$1.apply(DwCALoader.scala:126)
    at au.org.ala.biocache.load.DwCALoader$$anonfun$load$1.apply(DwCALoader.scala:117)
    at scala.collection.immutable.List.foreach(List.scala:318)
    at au.org.ala.biocache.load.DwCALoader.load(DwCALoader.scala:117)
    at au.org.ala.biocache.load.Loader.load(Loader.scala:209)
    at au.org.ala.biocache.load.Loader$$anonfun$main$4.apply(Loader.scala:96)
    at au.org.ala.biocache.load.Loader$$anonfun$main$4.apply(Loader.scala:96)
    at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
    at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:108)
    at au.org.ala.biocache.load.Loader$.main(Loader.scala:95)
    at au.org.ala.biocache.cmd.CMD2$.main(CMD2.scala:133)
    at au.org.ala.biocache.cmd.CMD2.main(CMD2.scala)
ansell commented 5 years ago

The regression in this case is a safety check for when the CSV parser that biocache-store uses is too lenient and lets through lines that it hasn't fully parsed. In this case, the id field came after a text field that contained a properly quoted line break character; the dwca-io parser we are using silently corrupts such a record and emits it as a partial line. Added the record number at which the issue occurs to the error message so that we can debug the data file.