sramirez / spark-MDLP-discretization

Spark implementation of Fayyad's discretizer based on Minimum Description Length Principle (MDLP)
Apache License 2.0

DiscretizerModelReader::load fails with Spark 2.1.1 #36

Open MarcKaminski opened 7 years ago

MarcKaminski commented 7 years ago

Steps to reproduce, using the cars dataset (cars.data):

spark-shell --jars "/path/to/spark-MDLP-discretization-1.3.jar"

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.feature._

val carsPath = "/path/to/cars.data"
val mdlPath = "/path/to/save/mdl"

// Read the CSV, taking column names from the header row and inferring types
val df = spark.read.option("header", true)
  .option("inferSchema", true)
  .csv(carsPath)

// Index the string label column
val strId = new StringIndexer().setInputCol("v7").setOutputCol("v7_IDX")

// Assemble the numeric columns into a single feature vector
val assembleFeatures: VectorAssembler = new VectorAssembler()
  .setInputCols(Array("v0", "v1", "v2", "v3", "v4", "v5", "v6"))
  .setOutputCol("featuresRaw")

// Discretize the assembled features with MDLP
val dis: MDLPDiscretizer = new MDLPDiscretizer()
  .setInputCol("featuresRaw")
  .setOutputCol("featuresBucket")
  .setLabelCol("v7_IDX")
  .setMaxBins(10)
  .setMaxByPart(10000)
  .setMinBinPercentage(0.01)

// Fit the full pipeline
val plm = new Pipeline()
  .setStages(Array(strId, assembleFeatures, dis))
  .fit(df)

// Persist the fitted pipeline, then try to load it back
plm.write.overwrite.save(mdlPath)

PipelineModel.load(mdlPath)

Gives:

scala.MatchError: [WrappedArray(WrappedArray(-Infinity, 21.05, Infinity), WrappedArray(-Infinity, 5.5, Infinity), WrappedArray(-Infinity, 120.5, 134.5, Infinity), WrappedArray(-Infinity, 78.5, Infinity), WrappedArray(-Infinity, 2550.5, Infinity), WrappedArray(-Infinity, Infinity), WrappedArray(-Infinity, Infinity))] (of class org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema)
  at org.apache.spark.ml.feature.DiscretizerModel$DiscretizerModelReader.load(MDLPDiscretizer.scala:249)
  at org.apache.spark.ml.feature.DiscretizerModel$DiscretizerModelReader.load(MDLPDiscretizer.scala:232)
  at org.apache.spark.ml.util.DefaultParamsReader$.loadParamsInstance(ReadWrite.scala:435)
  at org.apache.spark.ml.Pipeline$SharedReadWrite$$anonfun$4.apply(Pipeline.scala:273)
  at org.apache.spark.ml.Pipeline$SharedReadWrite$$anonfun$4.apply(Pipeline.scala:271)
  at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
  at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
  at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
  at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
  at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
  at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
  at org.apache.spark.ml.Pipeline$SharedReadWrite$.load(Pipeline.scala:271)
  at org.apache.spark.ml.PipelineModel$PipelineModelReader.load(Pipeline.scala:347)
  at org.apache.spark.ml.PipelineModel$PipelineModelReader.load(Pipeline.scala:341)
  at org.apache.spark.ml.util.MLReadable$class.load(ReadWrite.scala:215)
  at org.apache.spark.ml.PipelineModel$.load(Pipeline.scala:331)
  ... 50 elided
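The MatchError points at the Row read back from Parquet: the reader apparently destructures it with a pattern match that expects nested Arrays, while Spark 2.x hands back WrappedArray values inside a GenericRowWithSchema. A minimal sketch of the failing shape (illustrative only; the names and the exact pattern are assumptions, not the actual source of MDLPDiscretizer.scala):

import org.apache.spark.sql.Row

// Hypothetical illustration: a Row pattern expecting nested Arrays throws
// scala.MatchError, because Spark 2.x deserializes the Parquet array column
// as WrappedArray(WrappedArray(...)) inside a GenericRowWithSchema.
val row = sqlContext.read.parquet(dataPath).select("splits").head()
val splits = row match {
  case Row(s: Array[Array[Float]]) => s // never matches under Spark 2.1.1
}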

Proposed fix in ml.feature.MDLPDiscretizer.scala (line 258ff):

// Read the splits column back explicitly as Seq[Seq[Float]] and convert it
// to Array[Array[Float]], instead of pattern matching on the Row
val splits = sqlContext.read.parquet(dataPath)
  .select("splits")
  .head()
  .getAs[Seq[Seq[Float]]](0)
  .map(_.toArray)
  .toArray
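For reference, a quick way to verify the patch (a sketch, assuming the jar has been rebuilt with the change and the repro script above has been run): the save/load round trip should now complete and the restored pipeline should transform the original frame.

// After rebuilding the jar with the patched reader, the round trip should succeed:
val restored = PipelineModel.load(mdlPath)
restored.transform(df).select("featuresBucket").show(5)
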
sramirez commented 7 years ago

Please, if possible, submit a PR with the change. It will better reflect your contribution to this repo.