Open szilard opened 5 years ago
https://github.com/szilard/GBM-perf/blob/master/wip-testing/sparkling_water/sw-h2o.scala
scala> val dx_train = asH2OFrame(d_train.select("Month","DayofMonth","DayOfWeek","DepTime","UniqueCarrier",
| "Origin","Dest","Distance","dep_delayed_15min"))
dx_train: org.apache.spark.h2o.H2OFrame =
Frame key: frame_rdd_34_b0642c9b519a5b66158f83e818084ae1
cols: 9
rows: 1000000
chunks: 6
size: 57955931
scala> H2OFrameSupport.allStringVecToCategorical(dx_train)
res1: org.apache.spark.h2o.H2OFrame =
Frame key: frame_rdd_34_b0642c9b519a5b66158f83e818084ae1
cols: 9
rows: 1000000
chunks: 6
size: 12188619
scala> val elapsed = ( System.nanoTime - now )/1e9
elapsed: Double = 3.172993114
scala> val gbm_md = gbm.trainModel.get
gbm_md: hex.tree.gbm.GBMModel =
...
scala> elapsed
res3: Double = 28.644599689
scala> evaluator.evaluate(predictions)
res4: Double = 0.7623568809741097
https://github.com/szilard/GBM-perf/blob/master/wip-testing/sparkling_water/sw-mllib-ohe.scala
Using only 10 trees here because this variant (with one-hot encoding) is slow:
scala> val gbm = new H2OGBM().setLabelCol("label").setFeaturesCol("features").
| setNtrees(10).setMaxDepth(10).setLearnRate(0.1) //.setMaxBins(100) not implemented??
scala> val model = pipeline.fit(d_train)
scala> val elapsed = ( System.nanoTime - now )/1e9
elapsed: Double = 132.769667071
Slow with OHE: 10 trees take 136 sec vs. 100 trees in 28 sec without OHE (m5.2xlarge, 8 cores) -- roughly 50x slower per tree.
TODO: fix this (the prediction column needs a type cast):
val predictions = model.transform(d_test)
val evaluator = new BinaryClassificationEvaluator().setLabelCol("label").setRawPredictionCol("prediction_output").setMetricName("areaUnderROC")
evaluator.evaluate(predictions)
// TODO:
//evaluator.evaluate(predictions)
//java.lang.IllegalArgumentException: requirement failed: Column prediction_output must be of type equal to one of the following types: [double, struct<type:tinyint,size:int,indices:array<int>,values:array<double>>] but was actually of type struct<value:double>.
scala> val gbm = new H2OGBM().setLabelCol("dep_delayed_15min").
| setNtrees(100).setMaxDepth(10).setLearnRate(0.1) // .setMaxBins(100) not implemented??
scala> val model = pipeline.fit(d_train)
model: org.apache.spark.ml.PipelineModel = pipeline_679f2c3cfbeb
scala> val elapsed = ( System.nanoTime - now )/1e9
elapsed: Double = 31.183876731
TODO: fix this (the prediction column needs a type cast):
val predictions = model.transform(d_test)
val evaluator = new BinaryClassificationEvaluator().setLabelCol("label").setRawPredictionCol("prediction_output").setMetricName("areaUnderROC")
evaluator.evaluate(predictions)
// TODO:
//evaluator.evaluate(predictions)
//java.lang.IllegalArgumentException: requirement failed: Column prediction_output must be of type equal to one of the following types: [double, struct<type:tinyint,size:int,indices:array<int>,values:array<double>>] but was actually of type struct<value:double>.
Setup: m5.2xlarge (8 cores, 30GB RAM), 1M rows of training data.
for comparison: