probcomp / bayeslite

BayesDB on SQLite. A Bayesian database table for querying the probable implications of data as easily as SQL databases query the data itself.
http://probcomp.csail.mit.edu/software/bayesdb
Apache License 2.0
922 stars 64 forks source link

Invalid category error in `_from_numeric` in `cgpm_metamodel` #506

Open curlette opened 7 years ago

curlette commented 7 years ago
---------------------------------------------------------------------------
BQLError                                  Traceback (most recent call last)
<ipython-input-4-4dfbe68718e5> in <module>()
----> 1 get_ipython().run_cell_magic(u'bql', u'', u'DROP TABLE IF EXISTS predicted_earnings_class_training;\nCREATE TABLE predicted_earnings_class_training AS \n    INFER EXPLICIT\n        PREDICT Median_earnings_class\n        CONFIDENCE confidence USING 10 SAMPLES\n        FROM train_rf_p WHERE train_rf._rowid_ > 83 AND train_rf._rowid_ < 244;')

/scratch/curlette/.pyenv/local/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in run_cell_magic(self, magic_name, line, cell)
   2101             magic_arg_s = self.var_expand(line, stack_depth)
   2102             with self.builtin_trap:
-> 2103                 result = fn(magic_arg_s, cell)
   2104             return result
   2105 

/scratch/curlette/iventure/build/lib.linux-x86_64-2.7/iventure/magics.pyc in logged_cell_wrapper(self, line, cell)
    145             raw = self._retrieve_raw(line, cell)
    146             try:
--> 147                 output = func(self, line, cell)
    148             except:
    149                 exception = traceback.format_exc()

<decorator-gen-128> in bql(self, line, cell)

/scratch/curlette/.pyenv/local/lib/python2.7/site-packages/IPython/core/magic.pyc in <lambda>(f, *a, **k)
    186     # but it's overkill for just that one bit of state.
    187     def magic_deco(arg):
--> 188         call = lambda f, *a, **k: f(*a, **k)
    189 
    190         if callable(arg):

/scratch/curlette/iventure/build/lib.linux-x86_64-2.7/iventure/magics.pyc in bql(self, line, cell)
    306                 result = self._cmd(cmd)
    307             else:
--> 308                 result = self._bql([cmd])
    309         return result
    310 

/scratch/curlette/iventure/build/lib.linux-x86_64-2.7/iventure/magics.pyc in _bql(self, lines)
    320             if out.getvalue() and bql_string_complete_p(out.getvalue()):
    321                 ok = True
--> 322         cursor = self._bdb.execute(out.getvalue())
    323         return bqu.cursor_to_df(cursor)
    324 

/scratch/curlette/bayeslite/build/lib.linux-x86_64-2.7/bayeslite/bayesdb.pyc in execute(self, string, bindings)
    213             bindings = ()
    214         return self._maybe_trace(
--> 215             self.tracer, self._do_execute, string, bindings)
    216 
    217     def _maybe_trace(self, tracer, meth, string, bindings):

/scratch/curlette/bayeslite/build/lib.linux-x86_64-2.7/bayeslite/bayesdb.pyc in _maybe_trace(self, tracer, meth, string, bindings)
    221         if tracer:
    222             tracer(string, bindings)
--> 223         return meth(string, bindings)
    224 
    225     def _qid(self):

/scratch/curlette/bayeslite/build/lib.linux-x86_64-2.7/bayeslite/bayesdb.pyc in _do_execute(self, string, bindings)
    262         else:
    263             raise ValueError('>1 phrase in string')
--> 264         cursor = bql.execute_phrase(self, phrase, bindings)
    265         return self._empty_cursor if cursor is None else cursor
    266 

/scratch/curlette/bayeslite/build/lib.linux-x86_64-2.7/bayeslite/bql.pyc in execute_phrase(bdb, phrase, bindings)
     97             winders, unwinders = out.getwindings()
     98             with compiler.bayesdb_wind(bdb, winders, unwinders):
---> 99                 bdb.sql_execute(out.getvalue(), out.getbindings())
    100         return empty_cursor(bdb)
    101 

/scratch/curlette/bayeslite/build/lib.linux-x86_64-2.7/bayeslite/bayesdb.pyc in sql_execute(self, string, bindings)
    279             bindings = ()
    280         return self._maybe_trace(
--> 281             self.sql_tracer, self._do_sql_execute, string, bindings)
    282 
    283     def _do_sql_execute(self, string, bindings):

/scratch/curlette/bayeslite/build/lib.linux-x86_64-2.7/bayeslite/bayesdb.pyc in _maybe_trace(self, tracer, meth, string, bindings)
    221         if tracer:
    222             tracer(string, bindings)
--> 223         return meth(string, bindings)
    224 
    225     def _qid(self):

/scratch/curlette/bayeslite/build/lib.linux-x86_64-2.7/bayeslite/bayesdb.pyc in _do_sql_execute(self, string, bindings)
    283     def _do_sql_execute(self, string, bindings):
    284         cursor = self._sqlite3.cursor()
--> 285         cursor.execute(string, bindings)
    286         return bql.BayesDBCursor(self, cursor)
    287 

src/connection.c in user-defined-scalar-bql_predict_confidence()

/scratch/curlette/bayeslite/build/lib.linux-x86_64-2.7/bayeslite/bqlfn.pyc in <lambda>(*args)
     33 def bayesdb_install_bql(db, cookie):
     34     def function(name, nargs, fn):
---> 35         db.createscalarfunction(name, (lambda *args: fn(cookie, *args)), nargs)
     36     function("bql_column_correlation", 4, bql_column_correlation)
     37     function("bql_column_correlation_pvalue", 4, bql_column_correlation_pvalue)

/scratch/curlette/bayeslite/build/lib.linux-x86_64-2.7/bayeslite/bqlfn.pyc in bql_predict_confidence(bdb, population_id, generator_id, colno, rowid, numsamples)
    468     metamodel = core.bayesdb_generator_metamodel(bdb, generator_id)
    469     value, confidence = metamodel.predict_confidence(bdb, generator_id,
--> 470         None, colno, rowid, numsamples=numsamples)
    471     # XXX Whattakludge!
    472     return json.dumps({'value': value, 'confidence': confidence})

/scratch/curlette/bayeslite/build/lib.linux-x86_64-2.7/bayeslite/metamodels/cgpm_metamodel.py in predict_confidence(self, bdb, generator_id, modelno, colno, rowid, numsamples)
    407         sample = self.simulate_joint(
    408             bdb, generator_id, [(rowid, colno)], constraints,
--> 409             modelno, numsamples)
    410 
    411         # Determine the imputation strategy (mode or mean).

/scratch/curlette/bayeslite/build/lib.linux-x86_64-2.7/bayeslite/metamodels/cgpm_metamodel.py in simulate_joint(self, bdb, generator_id, targets, constraints, modelno, num_samples, accuracy)
    443         return [
    444             [map_value(colno, row[colno]) for colno in cgpm_query]
--> 445             for row in weighted_samples
    446         ]
    447 

/scratch/curlette/bayeslite/build/lib.linux-x86_64-2.7/bayeslite/metamodels/cgpm_metamodel.py in map_value(colno, value)
    440             samples, cgpm_rowid, cgpm_evidence, multiprocess=self._multiprocess)
    441         def map_value(colno, value):
--> 442             return self._from_numeric(bdb, generator_id, colno, value)
    443         return [
    444             [map_value(colno, row[colno]) for colno in cgpm_query]

/scratch/curlette/bayeslite/build/lib.linux-x86_64-2.7/bayeslite/metamodels/cgpm_metamodel.py in _from_numeric(self, bdb, generator_id, colno, value)
    719             if text is None:
--> 720                 raise BQLError('Invalid category: %r' % (value,))
    721             return text
    722        else:

BQLError: 

Perhaps this should return NaN instead, as in the other cases?

A notebook containing a minimal working example that reproduces this bug can be found here: http://probcomp-3.csail.mit.edu:9999/notebooks/cgpm_invalid_category_bug_reproduced.ipynb

fsaad commented 7 years ago

@curlette Even though you specified k=7 in the MML code defining the number of categories for the random forest, the "training data" in scorecard_training_data only contains values [0,1,2,3,4], which means that BayesDB does not have a mapping for the categories [5, 6, 7] and thus raises a lookup error when the random forest predicts one of those classes.

It follows that this bug is an instance of #437, where what the system really needs is a user-provided codebook for all the possible nominal values that the model may ever encounter (e.g., in newly incorporated rows).