[BUG] Can't print TokenCooccurrenceVectorizer object

cakiki commented 2 years ago

Hello!

Not sure this is actually a bug, but it seemed a bit odd, so I figured I'd report it. I just trained a vectorizer using the following:

%%time
word_vectorizer = vectorizers.TokenCooccurrenceVectorizer(
    min_document_occurrences=2,
    window_radii=20,          
    window_functions='variable',
    kernel_functions='geometric',            
    n_iter = 3,
    normalize_windows=True,
).fit(subset['tokenized'])

When I try to display or print it in a Jupyter Notebook cell, I get the following error (it is actually repeated a few times):

AttributeError: 'TokenCooccurrenceVectorizer' object has no attribute 'coo_initial_memory'

cakiki commented 2 years ago

Full trace in case it's useful:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
File /opt/conda/lib/python3.9/site-packages/IPython/core/formatters.py:973, in MimeBundleFormatter.__call__(self, obj, include, exclude)
    970     method = get_real_method(obj, self.print_method)
    972     if method is not None:
--> 973         return method(include=include, exclude=exclude)
    974     return None
    975 else:

File /opt/conda/lib/python3.9/site-packages/sklearn/base.py:614, in BaseEstimator._repr_mimebundle_(self, **kwargs)
    612 def _repr_mimebundle_(self, **kwargs):
    613     """Mime bundle used by jupyter kernels to display estimator"""
--> 614     output = {"text/plain": repr(self)}
    615     if get_config()["display"] == "diagram":
    616         output["text/html"] = estimator_html_repr(self)

File /opt/conda/lib/python3.9/site-packages/sklearn/base.py:279, in BaseEstimator.__repr__(self, N_CHAR_MAX)
    271 # use ellipsis for sequences with a lot of elements
    272 pp = _EstimatorPrettyPrinter(
    273     compact=True,
    274     indent=1,
    275     indent_at_name=True,
    276     n_max_elements_to_show=N_MAX_ELEMENTS_TO_SHOW,
    277 )
--> 279 repr_ = pp.pformat(self)
    281 # Use bruteforce ellipsis when there are a lot of non-blank characters
    282 n_nonblank = len("".join(repr_.split()))

File /opt/conda/lib/python3.9/pprint.py:153, in PrettyPrinter.pformat(self, object)
    151 def pformat(self, object):
    152     sio = _StringIO()
--> 153     self._format(object, sio, 0, 0, {}, 0)
    154     return sio.getvalue()

File /opt/conda/lib/python3.9/pprint.py:170, in PrettyPrinter._format(self, object, stream, indent, allowance, context, level)
    168     self._readable = False
    169     return
--> 170 rep = self._repr(object, context, level)
    171 max_width = self._width - indent - allowance
    172 if len(rep) > max_width:

File /opt/conda/lib/python3.9/pprint.py:431, in PrettyPrinter._repr(self, object, context, level)
    430 def _repr(self, object, context, level):
--> 431     repr, readable, recursive = self.format(object, context.copy(),
    432                                             self._depth, level)
    433     if not readable:
    434         self._readable = False

File /opt/conda/lib/python3.9/site-packages/sklearn/utils/_pprint.py:189, in _EstimatorPrettyPrinter.format(self, object, context, maxlevels, level)
    188 def format(self, object, context, maxlevels, level):
--> 189     return _safe_repr(
    190         object, context, maxlevels, level, changed_only=self._changed_only
    191     )

File /opt/conda/lib/python3.9/site-packages/sklearn/utils/_pprint.py:440, in _safe_repr(object, context, maxlevels, level, changed_only)
    438 recursive = False
    439 if changed_only:
--> 440     params = _changed_params(object)
    441 else:
    442     params = object.get_params(deep=False)

File /opt/conda/lib/python3.9/site-packages/sklearn/utils/_pprint.py:93, in _changed_params(estimator)
     89 def _changed_params(estimator):
     90     """Return dict (param_name: value) of parameters that were given to
     91     estimator with non-default values."""
---> 93     params = estimator.get_params(deep=False)
     94     init_func = getattr(estimator.__init__, "deprecated_original", estimator.__init__)
     95     init_params = inspect.signature(init_func).parameters

File /opt/conda/lib/python3.9/site-packages/sklearn/base.py:210, in BaseEstimator.get_params(self, deep)
    208 out = dict()
    209 for key in self._get_param_names():
--> 210     value = getattr(self, key)
    211     if deep and hasattr(value, "get_params"):
    212         deep_items = value.get_params().items()

AttributeError: 'TokenCooccurrenceVectorizer' object has no attribute 'coo_initial_memory'

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
File /opt/conda/lib/python3.9/site-packages/IPython/core/formatters.py:707, in PlainTextFormatter.__call__(self, obj)
    700 stream = StringIO()
    701 printer = pretty.RepresentationPrinter(stream, self.verbose,
    702     self.max_width, self.newline,
    703     max_seq_length=self.max_seq_length,
    704     singleton_pprinters=self.singleton_printers,
    705     type_pprinters=self.type_printers,
    706     deferred_pprinters=self.deferred_printers)
--> 707 printer.pretty(obj)
    708 printer.flush()
    709 return stream.getvalue()

File /opt/conda/lib/python3.9/site-packages/IPython/lib/pretty.py:410, in RepresentationPrinter.pretty(self, obj)
    407                         return meth(obj, self, cycle)
    408                 if cls is not object \
    409                         and callable(cls.__dict__.get('__repr__')):
--> 410                     return _repr_pprint(obj, self, cycle)
    412     return _default_pprint(obj, self, cycle)
    413 finally:

File /opt/conda/lib/python3.9/site-packages/IPython/lib/pretty.py:778, in _repr_pprint(obj, p, cycle)
    776 """A pprint that just redirects to the normal repr function."""
    777 # Find newlines and replace them with p.break_()
--> 778 output = repr(obj)
    779 lines = output.splitlines()
    780 with p.group():

File /opt/conda/lib/python3.9/site-packages/sklearn/base.py:279, in BaseEstimator.__repr__(self, N_CHAR_MAX)
    271 # use ellipsis for sequences with a lot of elements
    272 pp = _EstimatorPrettyPrinter(
    273     compact=True,
    274     indent=1,
    275     indent_at_name=True,
    276     n_max_elements_to_show=N_MAX_ELEMENTS_TO_SHOW,
    277 )
--> 279 repr_ = pp.pformat(self)
    281 # Use bruteforce ellipsis when there are a lot of non-blank characters
    282 n_nonblank = len("".join(repr_.split()))

File /opt/conda/lib/python3.9/pprint.py:153, in PrettyPrinter.pformat(self, object)
    151 def pformat(self, object):
    152     sio = _StringIO()
--> 153     self._format(object, sio, 0, 0, {}, 0)
    154     return sio.getvalue()

File /opt/conda/lib/python3.9/pprint.py:170, in PrettyPrinter._format(self, object, stream, indent, allowance, context, level)
    168     self._readable = False
    169     return
--> 170 rep = self._repr(object, context, level)
    171 max_width = self._width - indent - allowance
    172 if len(rep) > max_width:

File /opt/conda/lib/python3.9/pprint.py:431, in PrettyPrinter._repr(self, object, context, level)
    430 def _repr(self, object, context, level):
--> 431     repr, readable, recursive = self.format(object, context.copy(),
    432                                             self._depth, level)
    433     if not readable:
    434         self._readable = False

File /opt/conda/lib/python3.9/site-packages/sklearn/utils/_pprint.py:189, in _EstimatorPrettyPrinter.format(self, object, context, maxlevels, level)
    188 def format(self, object, context, maxlevels, level):
--> 189     return _safe_repr(
    190         object, context, maxlevels, level, changed_only=self._changed_only
    191     )

File /opt/conda/lib/python3.9/site-packages/sklearn/utils/_pprint.py:440, in _safe_repr(object, context, maxlevels, level, changed_only)
    438 recursive = False
    439 if changed_only:
--> 440     params = _changed_params(object)
    441 else:
    442     params = object.get_params(deep=False)

File /opt/conda/lib/python3.9/site-packages/sklearn/utils/_pprint.py:93, in _changed_params(estimator)
     89 def _changed_params(estimator):
     90     """Return dict (param_name: value) of parameters that were given to
     91     estimator with non-default values."""
---> 93     params = estimator.get_params(deep=False)
     94     init_func = getattr(estimator.__init__, "deprecated_original", estimator.__init__)
     95     init_params = inspect.signature(init_func).parameters

File /opt/conda/lib/python3.9/site-packages/sklearn/base.py:210, in BaseEstimator.get_params(self, deep)
    208 out = dict()
    209 for key in self._get_param_names():
--> 210     value = getattr(self, key)
    211     if deep and hasattr(value, "get_params"):
    212         deep_items = value.get_params().items()

AttributeError: 'TokenCooccurrenceVectorizer' object has no attribute 'coo_initial_memory'

lmcinnes commented 2 years ago

That's definitely a bug. I don't think we've ever explored the sklearn pretty printing of models, so this certainly never came up for us, but I think it should be relatively easy to fix.

cakiki commented 2 years ago

I'm going to look into this. I'll submit a PR if I'm able to figure out how to fix it.

TutteInstitute / vectorizers

[BUG] Can't print TokenCooccurrenceVectorizer object #97