usc-isi-i2 / dsbox-ta2

The DSBox TA2 component
MIT License
11 stars 6 forks source link

System fails a specific template even though a possible pipeline generated by said template might succeed #70

Closed serbanstan closed 6 years ago

serbanstan commented 6 years ago

Consider the following template

class DefaultClassificationTemplate(DSBoxTemplate):
    """Default template for binary and multiclass classification on tables.

    The template wires the following named steps together:

    * ``denormalize_step`` -> ``to_dataframe_step`` turn the template input
      into a single dataframe.
    * ``extract_attribute_step`` -> ``column_parser_step`` -> ``cast_1_step``
      -> ``corex_step`` -> ``encoder_step`` -> ``impute_step`` build the
      feature branch.
    * ``extract_target_step`` selects the target columns from the same
      dataframe.
    * ``model_step`` consumes both branches and is declared as the template
      output.

    A step that lists more than one primitive (``corex_step`` and
    ``encoder_step``) offers alternatives; each of those two steps includes
    ``DoNothing`` as one of the choices.
    """

    def __init__(self):
        super().__init__()
        self.template = {
            "name": "Default_classification_template",
            "taskSubtype": {TaskSubtype.BINARY.name, TaskSubtype.MULTICLASS.name},
            "taskType": TaskType.CLASSIFICATION.name,
            # See TaskType, range include 'CLASSIFICATION', 'CLUSTERING', 'COLLABORATIVE_FILTERING',
            # 'COMMUNITY_DETECTION', 'GRAPH_CLUSTERING', 'GRAPH_MATCHING', 'LINK_PREDICTION',
            # 'REGRESSION', 'TIME_SERIES_FORECASTING', 'VERTEX_NOMINATION'
            "inputType": "table",  # See SEMANTIC_TYPES.keys() for range of values
            "output": "model_step",  # Name of the final step generating the prediction
            "target": "extract_target_step",  # Name of the step generating the ground truth
            "steps": [
                {
                    "name": "denormalize_step",
                    "primitives": ["d3m.primitives.dsbox.Denormalize"],
                    "inputs": ["template_input"]
                },
                {
                    "name": "to_dataframe_step",
                    "primitives": ["d3m.primitives.datasets.DatasetToDataFrame"],
                    "inputs": ["denormalize_step"]
                },
                {
                    # Feature branch: keep only the attribute columns.
                    "name": "extract_attribute_step",
                    "primitives": [{
                        "primitive": "d3m.primitives.data.ExtractColumnsBySemanticTypes",
                        "hyperparameters":
                            {
                                'semantic_types': (
                                    'https://metadata.datadrivendiscovery.org/types/Attribute',),
                                'use_columns': (),
                                'exclude_columns': ()
                            }
                    }],
                    "inputs": ["to_dataframe_step"]
                },
                {
                    "name": "column_parser_step",
                    "primitives": ["d3m.primitives.data.ColumnParser"],
                    "inputs": ["extract_attribute_step"]
                },
                {
                    "name": "cast_1_step",
                    "primitives": ["d3m.primitives.data.CastToType"],
                    "inputs": ["column_parser_step"]
                },
                {
                    # Two alternatives: CorexText or a pass-through.
                    "name": "corex_step",
                    "primitives": [
                        {
                            "primitive": "d3m.primitives.dsbox.CorexText",
                            "hyperparameters":
                                {
                                    # Single candidate value; `(0)` is just
                                    # the integer 0, so it is written plainly.
                                    'min_df': [0]
                                    # 'n_hidden':[(10)],
                                    # 'threshold':[(0)],
                                    # # 'threshold':[(0), (500)],
                                    # 'n_grams':[(1), (5)],
                                    # 'max_df':[(.9)],
                                    # 'min_df':[(.02)],
                                }
                        },
                        "d3m.primitives.dsbox.DoNothing"
                    ],
                    "inputs": ["cast_1_step"]
                },
                {
                    # Two alternatives: Encoder or a pass-through.
                    "name": "encoder_step",
                    "primitives": [
                        "d3m.primitives.dsbox.Encoder",
                        "d3m.primitives.dsbox.DoNothing"
                    ],
                    "inputs": ["corex_step"]
                },
                {
                    "name": "impute_step",
                    "primitives": ["d3m.primitives.sklearn_wrap.SKImputer"],
                    "inputs": ["encoder_step"]
                },
                {
                    # Target branch: reads from the same dataframe as the
                    # feature branch, not from the processed features.
                    "name": "extract_target_step",
                    "primitives": [{
                        "primitive": "d3m.primitives.data.ExtractColumnsBySemanticTypes",
                        "hyperparameters":
                            {
                                'semantic_types': (
                                    'https://metadata.datadrivendiscovery.org/types/Target',
                                    'https://metadata.datadrivendiscovery.org/types/SuggestedTarget',),
                                'use_columns': (),
                                'exclude_columns': ()
                            }
                    }],
                    "inputs": ["to_dataframe_step"]
                },
                {
                    "name": "model_step",
                    "runtime": {
                        "cross_validation": 10,
                        "stratified": True
                    },
                    "primitives": [
                        {
                            "primitive":
                                "d3m.primitives.sklearn_wrap.SKRandomForestClassifier",
                            "hyperparameters":
                                {
                                    # 'max_depth': [(2),(4),(8)], #(10), #
                                    # 'n_estimators':[(10),(20),(30)]
                                }
                        },
                        # {
                        # "primitive":
                        #     "d3m.primitives.sklearn_wrap.SKLinearSVC",
                        # "hyperparameters":
                        #     {
                        #     'C': [(1), (10), (100)],  # (10), #
                        #     }
                        # },
                        # {
                        #     "primitive":
                        #         "d3m.primitives.sklearn_wrap.SKMultinomialNB",
                        #     "hyperparameters":
                        #         {
                        #             'alpha': [(1)],
                        #         }
                        # },
                    ],
                    "inputs": ["impute_step", "extract_target_step"]
                }
            ]
        }

    # @override
    @staticmethod
    def importance(datset, problem_description):
        """Return the fixed importance value of this template.

        Declared as a static method because the signature takes no ``self``;
        without the decorator, calling it through an instance would bind the
        instance to ``datset``. The parameter names (including the original
        spelling ``datset``) are kept so keyword callers keep working.

        Both arguments are ignored; the result is always ``7``.
        NOTE(review): how the caller uses this value is not visible here.
        """
        return 7

If we run the system on 27_wordLevels with python ta2-search /nas/home/stan/dsbox/runs3/config-seed/27_wordLevels_config.json, then we get the following stack trace

(dsbox-devel-710) [stan@dsbox01 python]$ python ta2-search /nas/home/stan/dsbox/runs3/config-seed/27_wordLevels_config.json 
Namespace(configuration_file='/nas/home/stan/dsbox/runs3/config-seed/27_wordLevels_config.json', cpus=-1, debug=False, output_prefix=None, timeout=-1)
Using configuation:
{'cpus': '10',
 'dataset_schema': '/nfs1/dsbox-repo/data/datasets-v31/seed_datasets_current/27_wordLevels/27_wordLevels_dataset/datasetDoc.json',
 'executables_root': '/nfs1/dsbox-repo/stan/dsbox-ta2/python/output/27_wordLevels/executables',
 'pipeline_logs_root': '/nfs1/dsbox-repo/stan/dsbox-ta2/python/output/27_wordLevels/logs',
 'problem_root': '/nfs1/dsbox-repo/data/datasets-v31/seed_datasets_current/27_wordLevels/27_wordLevels_problem',
 'problem_schema': '/nfs1/dsbox-repo/data/datasets-v31/seed_datasets_current/27_wordLevels/27_wordLevels_problem/problemDoc.json',
 'ram': '10Gi',
 'saved_pipeline_ID': '',
 'saving_folder_loc': '/nfs1/dsbox-repo/stan/dsbox-ta2/python/output/27_wordLevels',
 'temp_storage_root': '/nfs1/dsbox-repo/stan/dsbox-ta2/python/output/27_wordLevels/temp',
 'timeout': 29,
 'training_data_root': '/nfs1/dsbox-repo/data/datasets-v31/seed_datasets_current/27_wordLevels/27_wordLevels_dataset'}
[INFO] No test data config found! Will split the data.
[INFO] - dsbox.controller.controller - Top level output directory: /nfs1/dsbox-repo/stan/dsbox-ta2/python/output/27_wordLevels
[INFO] Succesfully parsed test data
{'structural_type': <class 'd3m.container.pandas.DataFrame'>, 'semantic_types': ('https://metadata.datadrivendiscovery.org/types/Table', 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint'), 'dimension': {'name': 'rows', 'semantic_types': ('https://metadata.datadrivendiscovery.org/types/TabularRow',), 'length': 5600}}
{'dimension': <FrozenOrderedDict OrderedDict([('name', 'rows'), ('semantic_types', ('https://metadata.datadrivendiscovery.org/types/TabularRow',)), ('length', 5600)])>,
 'semantic_types': ('https://metadata.datadrivendiscovery.org/types/Table',
                    'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint'),
 'structural_type': <class 'd3m.container.pandas.DataFrame'>}
{'structural_type': <class 'd3m.container.pandas.DataFrame'>, 'semantic_types': ('https://metadata.datadrivendiscovery.org/types/Table', 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint'), 'dimension': {'name': 'rows', 'semantic_types': ('https://metadata.datadrivendiscovery.org/types/TabularRow',), 'length': 1400}}
{'dimension': <FrozenOrderedDict OrderedDict([('name', 'rows'), ('semantic_types', ('https://metadata.datadrivendiscovery.org/types/TabularRow',)), ('length', 1400)])>,
 'semantic_types': ('https://metadata.datadrivendiscovery.org/types/Table',
                    'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint'),
 'structural_type': <class 'd3m.container.pandas.DataFrame'>}
[INFO] Template choices:
Template ' Default_classification_template ' has been added to template base.
[INFO] Worker started, id: <_MainProcess(MainProcess, started)>
/nfs1/dsbox-repo/stan/miniconda/envs/dsbox-devel-710/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
[INFO] Push@cache: ('d3m.primitives.dsbox.Denormalize', -7767296339518243098)
/nfs1/dsbox-repo/stan/miniconda/envs/dsbox-devel-710/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
[INFO] Push@cache: ('d3m.primitives.datasets.DatasetToDataFrame', -7767296339518243098)
[INFO] Push@cache: ('d3m.primitives.data.ExtractColumnsBySemanticTypes', 6740131222379490814)
[INFO] Push@cache: ('d3m.primitives.data.ExtractColumnsBySemanticTypes', -7723164063588704075)
[INFO] Push@cache: ('d3m.primitives.data.ColumnParser', 6988704037418576994)
[INFO] Push@cache: ('d3m.primitives.data.CastToType', 1488402163441597907)
[INFO] Push@cache: ('d3m.primitives.dsbox.CorexText', 4051310912840668046)
[INFO] Push@cache: ('d3m.primitives.dsbox.Encoder', -2043123197196909841)
Traceback (most recent call last):
  File "pandas/_libs/src/inference.pyx", line 1125, in pandas._libs.lib.maybe_convert_numeric
ValueError: Unable to parse string "nan"

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/nfs1/dsbox-repo/stan/dsbox-ta2/python/dsbox/template/search.py", line 384, in evaluate_pipeline
    evaluation_result = self._evaluate(configuration, cache)
  File "/nfs1/dsbox-repo/stan/dsbox-ta2/python/dsbox/template/search.py", line 401, in _evaluate
    fitted_pipeline.fit(cache=cache, inputs=[self.train_dataset])
  File "/nfs1/dsbox-repo/stan/dsbox-ta2/python/dsbox/pipeline/fitted_pipeline.py", line 92, in fit
    self.runtime.fit(**arguments)
  File "/nfs1/dsbox-repo/stan/dsbox-ta2/python/dsbox/template/runtime.py", line 200, in fit
    primitive_arguments
  File "/nfs1/dsbox-repo/stan/dsbox-ta2/python/dsbox/template/runtime.py", line 285, in _primitive_step_fit
    model.fit()
  File "/nfs1/dsbox-repo/stan/dsbox-cleaning/dsbox/datapreprocessing/cleaner/encoder.py", line 112, in fit
    if pd.isnull(pd.to_numeric(data.iloc[:,element])).sum() == data.shape[0]:
  File "/nfs1/dsbox-repo/stan/miniconda/envs/dsbox-devel-710/lib/python3.6/site-packages/pandas/core/tools/numeric.py", line 133, in to_numeric
    coerce_numeric=coerce_numeric)
  File "pandas/_libs/src/inference.pyx", line 1158, in pandas._libs.lib.maybe_convert_numeric
ValueError: Unable to parse string "nan" at position 2
Traceback (most recent call last):
  File "/nfs1/dsbox-repo/stan/dsbox-ta2/python/dsbox/template/search.py", line 274, in setup_initial_candidate
    candidate.data.update(result)
TypeError: 'NoneType' object is not iterable
--------------------
[ERROR] Initial Pipeline failed, Trying a random pipeline ...

If however we modify the template so that the encoder step is set to

                {
                    "name": "encoder_step",
                    "primitives": [
                        # "d3m.primitives.dsbox.Encoder",
                        "d3m.primitives.dsbox.DoNothing"
                        ],
                    "inputs": ["corex_step"]
                },

the run succeeds. I assume that as long as we have a possibly successful pipeline generated from a given template, we shouldn't crash.

proska commented 6 years ago

@kyao How should we address the initial candidate search failures? I can think of two options:

  1. increase the number of random trials initially in the "setup_initial_candidate".
  2. return failure and let the UCT give the template more chances later
kyao commented 6 years ago

Let's go for option 2. We are already trying three times to find the initial candidate.

For this particular template, it will always fail regardless of the number of trials.

proska commented 6 years ago

fixed in commit b76d4adb3540f76f542f1397f9ec1226dbd657d8