google / nitroml

NitroML is a modular, portable, and scalable model-quality benchmarking framework for Machine Learning and Automated Machine Learning (AutoML) pipelines.
Apache License 2.0

Run 3 datasets in a single NitroML DAG #19

Closed cweill closed 4 years ago

cweill commented 4 years ago
# NOTE: Import paths follow the NitroML and TFX package layout at the time of
# this issue; adjust them to match your installed versions.
import os

import nitroml
from nitroml.datasets import tfds_dataset
import tensorflow_datasets as tfds
from tfx import components as tfx
from tfx.components.base import executor_spec
from tfx.components.trainer import executor as trainer_executor
from tfx.proto import trainer_pb2


class TitanicBenchmark(nitroml.Benchmark):
  r"""Demos a NitroML benchmark that runs multiple TFDS datasets in one DAG."""

  def benchmark(self):
    # NOTE: For convenience, we fetch the datasets from TensorFlow Datasets
    # (TFDS).
    datasets = [
        tfds_dataset.TFDSDataset(tfds.builder('titanic')),
        tfds_dataset.TFDSDataset(tfds.builder('adult')),
    ]
    for dataset in datasets:
      with self.sub_benchmark(dataset.name):
        # Compute dataset statistics.
        statistics_gen = tfx.StatisticsGen(examples=dataset.examples)

        # Infer the dataset schema.
        schema_gen = tfx.SchemaGen(
            statistics=statistics_gen.outputs.statistics,
            infer_feature_shape=True)

        # Apply global transformations and compute vocabularies.
        transform = tfx.Transform(
            examples=dataset.examples,
            schema=schema_gen.outputs.schema,
            module_file=os.path.join(
                os.path.dirname(__file__), 'auto_transform.py'))

        # Define a tf.estimator.Estimator-based trainer.
        trainer = tfx.Trainer(
            module_file=os.path.join(
                os.path.dirname(__file__), 'auto_estimator_trainer.py'),
            custom_executor_spec=executor_spec.ExecutorClassSpec(
                trainer_executor.GenericExecutor),
            transformed_examples=transform.outputs.transformed_examples,
            schema=schema_gen.outputs.schema,
            transform_graph=transform.outputs.transform_graph,
            train_args=trainer_pb2.TrainArgs(num_steps=10000),
            eval_args=trainer_pb2.EvalArgs(num_steps=5000))

        # Collect the pipeline components to benchmark.
        pipeline = dataset.components + [
            statistics_gen, schema_gen, transform, trainer
        ]

        # Finally, call evaluate() on the workflow DAG outputs. This will
        # automatically append Evaluators to compute metrics from the given
        # SavedModel and 'eval' TF Examples.
        self.evaluate(
            pipeline, examples=dataset.examples, model=trainer.outputs.model)
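
For context, a minimal sketch of how a benchmark file like this is typically executed, assuming the nitroml.main() entry point used by the repo's example benchmarks (an assumption, not shown in the snippet above): it discovers the Benchmark subclasses defined in the module and runs them, with each sub_benchmark expanding into its own branch of the single DAG.

# Minimal usage sketch; assumes nitroml.main() as in the example benchmarks.
if __name__ == '__main__':
  nitroml.main()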