System fails a specific template even though a possible pipeline generated by said template might succeed

Consider the following template

class DefaultClassificationTemplate(DSBoxTemplate):
    """Default table-classification template (binary and multiclass).

    Step wiring, as declared by each step's ``inputs``:

        template_input -> denormalize -> to_dataframe -> extract_attribute
            -> column_parser -> cast_1 -> corex -> encoder -> impute -> model

    ``extract_target_step`` branches off ``to_dataframe_step`` and is the
    second input of ``model_step``.  ``corex_step`` and ``encoder_step`` each
    list two candidate primitives, the second being ``DoNothing``.
    """

    def __init__(self):
        DSBoxTemplate.__init__(self)

        do_nothing = "d3m.primitives.dsbox.DoNothing"

        def extract_columns(*semantic_types):
            # Primitive spec selecting columns by the given semantic types;
            # both column filters are left empty.
            return {
                "primitive": "d3m.primitives.data.ExtractColumnsBySemanticTypes",
                "hyperparameters": {
                    'semantic_types': semantic_types,
                    'use_columns': (),
                    'exclude_columns': (),
                },
            }

        denormalize = {
            "name": "denormalize_step",
            "primitives": ["d3m.primitives.dsbox.Denormalize"],
            "inputs": ["template_input"],
        }
        to_dataframe = {
            "name": "to_dataframe_step",
            "primitives": ["d3m.primitives.datasets.DatasetToDataFrame"],
            "inputs": ["denormalize_step"],
        }
        extract_attribute = {
            "name": "extract_attribute_step",
            "primitives": [
                extract_columns(
                    'https://metadata.datadrivendiscovery.org/types/Attribute',
                ),
            ],
            "inputs": ["to_dataframe_step"],
        }
        column_parser = {
            "name": "column_parser_step",
            "primitives": ["d3m.primitives.data.ColumnParser"],
            "inputs": ["extract_attribute_step"],
        }
        cast_1 = {
            "name": "cast_1_step",
            "primitives": ["d3m.primitives.data.CastToType"],
            "inputs": ["column_parser_step"],
        }
        # Only 'min_df' is set for CorexText.  Values that were previously
        # listed but are disabled: n_hidden [10], threshold [0] (or [0, 500]),
        # n_grams [1, 5], max_df [.9], min_df [.02].
        corex = {
            "name": "corex_step",
            "primitives": [
                {
                    "primitive": "d3m.primitives.dsbox.CorexText",
                    "hyperparameters": {'min_df': [0]},
                },
                do_nothing,
            ],
            "inputs": ["cast_1_step"],
        }
        encoder = {
            "name": "encoder_step",
            "primitives": ["d3m.primitives.dsbox.Encoder", do_nothing],
            "inputs": ["corex_step"],
        }
        impute = {
            "name": "impute_step",
            "primitives": ["d3m.primitives.sklearn_wrap.SKImputer"],
            "inputs": ["encoder_step"],
        }
        extract_target = {
            "name": "extract_target_step",
            "primitives": [
                extract_columns(
                    'https://metadata.datadrivendiscovery.org/types/Target',
                    'https://metadata.datadrivendiscovery.org/types/SuggestedTarget',
                ),
            ],
            "inputs": ["to_dataframe_step"],
        }
        # The random forest runs with no explicit hyperparameters.  Disabled
        # settings: max_depth [2, 4, 8], n_estimators [10, 20, 30].  Disabled
        # alternative models: SKLinearSVC with C [1, 10, 100] and
        # SKMultinomialNB with alpha [1].
        model = {
            "name": "model_step",
            "runtime": {"cross_validation": 10, "stratified": True},
            "primitives": [
                {
                    "primitive":
                        "d3m.primitives.sklearn_wrap.SKRandomForestClassifier",
                    "hyperparameters": {},
                },
            ],
            "inputs": ["impute_step", "extract_target_step"],
        }

        self.template = {
            "name": "Default_classification_template",
            "taskSubtype": {TaskSubtype.BINARY.name, TaskSubtype.MULTICLASS.name},
            # See TaskType; range includes 'CLASSIFICATION', 'CLUSTERING',
            # 'COLLABORATIVE_FILTERING', 'COMMUNITY_DETECTION',
            # 'GRAPH_CLUSTERING', 'GRAPH_MATCHING', 'LINK_PREDICTION',
            # 'REGRESSION', 'TIME_SERIES_FORECASTING', 'VERTEX_NOMINATION'.
            "taskType": TaskType.CLASSIFICATION.name,
            # See SEMANTIC_TYPES.keys() for the range of values.
            "inputType": "table",
            # Name of the final step generating the prediction.
            "output": "model_step",
            # Name of the step generating the ground truth.
            "target": "extract_target_step",
            "steps": [
                denormalize,
                to_dataframe,
                extract_attribute,
                column_parser,
                cast_1,
                corex,
                encoder,
                impute,
                extract_target,
                model,
            ],
        }

    # @override
    def importance(datset, problem_description):
        """Return this template's importance, the constant 7.

        Neither argument is used.
        """
        # NOTE(review): declared without `self`, and the first parameter is
        # spelled `datset`; kept as-is because the call site is not visible
        # here -- confirm how the framework invokes this.
        return 7

If we run the system on 27_wordLevels with `python ta2-search /nas/home/stan/dsbox/runs3/config-seed/27_wordLevels_config.json`, we get the following output and stack trace:

(dsbox-devel-710) [stan@dsbox01 python]$ python ta2-search /nas/home/stan/dsbox/runs3/config-seed/27_wordLevels_config.json 
Namespace(configuration_file='/nas/home/stan/dsbox/runs3/config-seed/27_wordLevels_config.json', cpus=-1, debug=False, output_prefix=None, timeout=-1)
Using configuation:
{'cpus': '10',
 'dataset_schema': '/nfs1/dsbox-repo/data/datasets-v31/seed_datasets_current/27_wordLevels/27_wordLevels_dataset/datasetDoc.json',
 'executables_root': '/nfs1/dsbox-repo/stan/dsbox-ta2/python/output/27_wordLevels/executables',
 'pipeline_logs_root': '/nfs1/dsbox-repo/stan/dsbox-ta2/python/output/27_wordLevels/logs',
 'problem_root': '/nfs1/dsbox-repo/data/datasets-v31/seed_datasets_current/27_wordLevels/27_wordLevels_problem',
 'problem_schema': '/nfs1/dsbox-repo/data/datasets-v31/seed_datasets_current/27_wordLevels/27_wordLevels_problem/problemDoc.json',
 'ram': '10Gi',
 'saved_pipeline_ID': '',
 'saving_folder_loc': '/nfs1/dsbox-repo/stan/dsbox-ta2/python/output/27_wordLevels',
 'temp_storage_root': '/nfs1/dsbox-repo/stan/dsbox-ta2/python/output/27_wordLevels/temp',
 'timeout': 29,
 'training_data_root': '/nfs1/dsbox-repo/data/datasets-v31/seed_datasets_current/27_wordLevels/27_wordLevels_dataset'}
[INFO] No test data config found! Will split the data.
[INFO] - dsbox.controller.controller - Top level output directory: /nfs1/dsbox-repo/stan/dsbox-ta2/python/output/27_wordLevels
[INFO] Succesfully parsed test data
{'structural_type': <class 'd3m.container.pandas.DataFrame'>, 'semantic_types': ('https://metadata.datadrivendiscovery.org/types/Table', 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint'), 'dimension': {'name': 'rows', 'semantic_types': ('https://metadata.datadrivendiscovery.org/types/TabularRow',), 'length': 5600}}
{'dimension': <FrozenOrderedDict OrderedDict([('name', 'rows'), ('semantic_types', ('https://metadata.datadrivendiscovery.org/types/TabularRow',)), ('length', 5600)])>,
 'semantic_types': ('https://metadata.datadrivendiscovery.org/types/Table',
                    'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint'),
 'structural_type': <class 'd3m.container.pandas.DataFrame'>}
{'structural_type': <class 'd3m.container.pandas.DataFrame'>, 'semantic_types': ('https://metadata.datadrivendiscovery.org/types/Table', 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint'), 'dimension': {'name': 'rows', 'semantic_types': ('https://metadata.datadrivendiscovery.org/types/TabularRow',), 'length': 1400}}
{'dimension': <FrozenOrderedDict OrderedDict([('name', 'rows'), ('semantic_types', ('https://metadata.datadrivendiscovery.org/types/TabularRow',)), ('length', 1400)])>,
 'semantic_types': ('https://metadata.datadrivendiscovery.org/types/Table',
                    'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint'),
 'structural_type': <class 'd3m.container.pandas.DataFrame'>}
[INFO] Template choices:
Template ' Default_classification_template ' has been added to template base.
[INFO] Worker started, id: <_MainProcess(MainProcess, started)>
/nfs1/dsbox-repo/stan/miniconda/envs/dsbox-devel-710/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
[INFO] Push@cache: ('d3m.primitives.dsbox.Denormalize', -7767296339518243098)
/nfs1/dsbox-repo/stan/miniconda/envs/dsbox-devel-710/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
[INFO] Push@cache: ('d3m.primitives.datasets.DatasetToDataFrame', -7767296339518243098)
[INFO] Push@cache: ('d3m.primitives.data.ExtractColumnsBySemanticTypes', 6740131222379490814)
[INFO] Push@cache: ('d3m.primitives.data.ExtractColumnsBySemanticTypes', -7723164063588704075)
[INFO] Push@cache: ('d3m.primitives.data.ColumnParser', 6988704037418576994)
[INFO] Push@cache: ('d3m.primitives.data.CastToType', 1488402163441597907)
[INFO] Push@cache: ('d3m.primitives.dsbox.CorexText', 4051310912840668046)
[INFO] Push@cache: ('d3m.primitives.dsbox.Encoder', -2043123197196909841)
Traceback (most recent call last):
  File "pandas/_libs/src/inference.pyx", line 1125, in pandas._libs.lib.maybe_convert_numeric
ValueError: Unable to parse string "nan"

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/nfs1/dsbox-repo/stan/dsbox-ta2/python/dsbox/template/search.py", line 384, in evaluate_pipeline
    evaluation_result = self._evaluate(configuration, cache)
  File "/nfs1/dsbox-repo/stan/dsbox-ta2/python/dsbox/template/search.py", line 401, in _evaluate
    fitted_pipeline.fit(cache=cache, inputs=[self.train_dataset])
  File "/nfs1/dsbox-repo/stan/dsbox-ta2/python/dsbox/pipeline/fitted_pipeline.py", line 92, in fit
    self.runtime.fit(**arguments)
  File "/nfs1/dsbox-repo/stan/dsbox-ta2/python/dsbox/template/runtime.py", line 200, in fit
    primitive_arguments
  File "/nfs1/dsbox-repo/stan/dsbox-ta2/python/dsbox/template/runtime.py", line 285, in _primitive_step_fit
    model.fit()
  File "/nfs1/dsbox-repo/stan/dsbox-cleaning/dsbox/datapreprocessing/cleaner/encoder.py", line 112, in fit
    if pd.isnull(pd.to_numeric(data.iloc[:,element])).sum() == data.shape[0]:
  File "/nfs1/dsbox-repo/stan/miniconda/envs/dsbox-devel-710/lib/python3.6/site-packages/pandas/core/tools/numeric.py", line 133, in to_numeric
    coerce_numeric=coerce_numeric)
  File "pandas/_libs/src/inference.pyx", line 1158, in pandas._libs.lib.maybe_convert_numeric
ValueError: Unable to parse string "nan" at position 2
Traceback (most recent call last):
  File "/nfs1/dsbox-repo/stan/dsbox-ta2/python/dsbox/template/search.py", line 274, in setup_initial_candidate
    candidate.data.update(result)
TypeError: 'NoneType' object is not iterable
--------------------
[ERROR] Initial Pipeline failed, Trying a random pipeline ...

If, however, we modify the template so that the encoder step only offers `DoNothing`:

                {
                    "name": "encoder_step",
                    "primitives": [
                        # "d3m.primitives.dsbox.Encoder",
                        "d3m.primitives.dsbox.DoNothing"
                        ],
                    "inputs": ["corex_step"]
                },

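the run succeeds. I would expect that, as long as at least one pipeline that can be generated from a given template is able to succeed, the system should not fail that template (or crash) just because another pipeline generated from it raised an exception.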

usc-isi-i2 / dsbox-ta2

System fails a specific template even though a possible pipeline generated by said template might succeed #70