aboSamoor / polyglot

Multilingual text (NLP) processing toolkit
http://polyglot-nlp.com
Other
2.31k stars 337 forks source link

method.im_func.__name__ causes error in utils / def _pickle_method with Python3.6 #122

Open alex4321 opened 7 years ago

alex4321 commented 7 years ago

In my case I have the following module:

import numpy as np
from sklearn.preprocessing import FunctionTransformer
from polyglot.text import Text

INVERSE_TOKEN = "не"

def _polarity_vector(polarities):
    """Summarise an array of polarity scores as a 3-component vector.

    The number of strictly positive and strictly negative entries is counted,
    the resulting 2-vector is scaled to unit Euclidean length and each
    component is squared.  The third component is ``1.0`` minus the sum of
    the two squared components.

    Args:
        polarities: numpy array of numeric polarity scores (may be empty).

    Returns:
        ``np.array([positive, negative, neutral])``.  When there are no
        positive and no negative entries the result is ``[0.0, 0.0, 1.0]``.
    """
    counts = np.array([(polarities > 0).sum(), (polarities < 0).sum()])
    norm = np.linalg.norm(counts)

    # Guard clause: nothing positive or negative, so avoid dividing by zero.
    if norm == 0:
        return np.array([0.0, 0.0, 1.0])

    squared = (counts / norm) ** 2
    return np.array([squared[0], squared[1], 1.0 - squared.sum()])

def _text_polarity(text, language):
    """Compute the polarity vector of one text, honouring the inverse token.

    The text is lower-cased and tokenised with polyglot's ``Text``.  Each
    token contributes its ``polarity``; when a token equals
    ``INVERSE_TOKEN`` it contributes the *negated* polarity of the token
    that follows it, and that following token is then skipped.

    If the inverse token is the last token there is nothing to negate, so
    its own polarity is used.  (Previously this case raised
    ``AttributeError`` because the padding element was a plain ``""``.)

    Args:
        text: the raw text to analyse.
        language: language code passed to ``Text`` as ``hint_language_code``.

    Returns:
        The result of ``_polarity_vector`` over the collected polarities.
    """
    tokens = iter(Text(text.lower(), hint_language_code=language).words)
    polarities = []
    for token in tokens:
        polarity = token.polarity
        if str(token) == INVERSE_TOKEN:
            # Pull the next token off the same iterator so the loop will not
            # visit it again; ``None`` means the text ended on the negation.
            following = next(tokens, None)
            if following is not None:
                polarity = -following.polarity
        polarities.append(polarity)
    return _polarity_vector(np.array(polarities))

def _texts_polarities(texts, language):
    """Return a list with the polarity vector of every text in ``texts``.

    Args:
        texts: iterable of raw texts.
        language: language code forwarded to ``_text_polarity``.

    Returns:
        A list holding one ``_text_polarity`` result per input text, in order.
    """
    vectors = []
    for entry in texts:
        vectors.append(_text_polarity(entry, language))
    return vectors

def polarity_vectorizer(language):
    """Build a ``FunctionTransformer`` that applies ``_texts_polarities``.

    Args:
        language: language code handed to ``_texts_polarities`` through the
            transformer's ``kw_args``.

    Returns:
        A ``FunctionTransformer`` created with ``validate=False``.
    """
    extra_kwargs = {"language": language}
    return FunctionTransformer(
        _texts_polarities,
        validate=False,
        kw_args=extra_kwargs,
    )

So now I'm trying to run it inside sklearn cross-validation:

# Run the pipeline through `cv` with the training comments and ratings.
# `cv`, `TransformRidge`, `output_range_transformation`, `comment_train` and
# `rating_train` are defined outside this snippet.
cv(
    Pipeline([
        # Feature step: a union containing only the polarity vectorizer,
        # configured for language code "ru".
        ("features", FeatureUnion([
            ("polarity", polarity_vectorizer("ru")),
        ])),
        ("regression", TransformRidge()),
        # NOTE(review): presumably maps predictions into the 1..5 rating
        # range -- confirm against output_range_transformation.
        ("output", output_range_transformation(1, 5)),
    ]),
    comment_train,
    rating_train
)

And I got the following exception:

C:\ProgramData\Anaconda3\Lib\multiprocessing\pool.py in _handle_tasks(taskqueue, put, outqueue, pool, cache)
    383                         break
    384                     try:
--> 385                         put(task)
    386                     except Exception as e:
    387                         job, ind = task[:2]

C:\Users\user\venv\machinelearning\lib\site-packages\sklearn\externals\joblib\pool.py in send(obj)
    369             def send(obj):
    370                 buffer = BytesIO()
--> 371                 CustomizablePickler(buffer, self._reducers).dump(obj)
    372                 self._writer.send_bytes(buffer.getvalue())
    373             self._send = send

C:\Users\user\venv\machinelearning\lib\site-packages\polyglot\utils.py in _pickle_method(method)
    38 def _pickle_method(method):
    39   """Pickle methods properly, including class methods."""
---> 40   func_name = method.im_func.__name__
    41   obj = method.im_self
    42   cls = method.im_class

As far as I can see, _pickle_method was written for Python 2 (it uses method.im_func / method.im_self / method.im_class) and needs a few changes:

def _pickle_method(method):
    """Pickle methods properly, including class methods.

    Reduces a method object to the ``(callable, args)`` pair expected by
    pickle's reduce protocol.  Only ``__func__`` and ``__self__`` are used;
    both exist on Python 2.6+ and Python 3, so no version switch is needed.

    Args:
        method: a bound method (instance method or classmethod).  Python 2
            unbound methods are accepted as well.

    Returns:
        ``(_unpickle_method, (func_name, obj, cls))`` where ``obj`` is the
        instance the method is bound to (``None`` when the method is bound
        to a class) and ``cls`` is the class used to look the method up.
    """
    func_name = method.__func__.__name__
    obj = method.__self__
    if obj is None:
        # Only Python 2 unbound methods have ``__self__ is None``; they
        # still carry their class in ``im_class``.
        cls = method.im_class
    elif isinstance(obj, type):
        # Bound to a class (e.g. a classmethod): store the class itself and
        # signal "no instance" with ``obj=None``.
        cls, obj = obj, None
    else:
        cls = obj.__class__
    if func_name.startswith('__') and not func_name.endswith('__'):
        # Private methods are stored in the class dict under a mangled name.
        func_name = _mangle_private_name(cls, func_name)
    return _unpickle_method, (func_name, obj, cls)


def _mangle_private_name(cls, func_name):
    """Return the mangled attribute name of private method ``func_name``.

    Name mangling uses the name of the class that *defines* the method, which
    may be a base class of ``cls``, so the MRO is searched for the first
    class whose dict actually holds the mangled name.
    """
    for klass in cls.__mro__:
        candidate = '_%s%s' % (klass.__name__.lstrip('_'), func_name)
        if candidate in klass.__dict__:
            return candidate
    # Nothing matched: fall back to mangling with ``cls`` itself.
    return '_%s%s' % (cls.__name__.lstrip('_'), func_name)


def _unpickle_method(func_name, obj, cls):
    """Unpickle methods properly, including class methods.

    Args:
        func_name: attribute name of the method (already mangled for
            private methods).
        obj: instance to bind to, or ``None`` for a method bound to ``cls``.
        cls: class whose MRO is searched for ``func_name``.

    Returns:
        The method re-bound through the descriptor protocol.

    Raises:
        AttributeError: if ``func_name`` cannot be found.
    """
    for klass in cls.__mro__:
        if func_name in klass.__dict__:
            # Read the raw descriptor from the class dict so classmethod
            # objects bind to ``cls`` and plain functions bind to ``obj``.
            return klass.__dict__[func_name].__get__(obj, cls)
    if obj is None:
        # A method defined on the metaclass and bound to ``cls``: the class
        # itself is the instance to bind to.
        meta = type(cls)
        for klass in meta.__mro__:
            if func_name in klass.__dict__:
                return klass.__dict__[func_name].__get__(cls, meta)
    raise AttributeError(
        "%r has no method %r" % (cls, func_name))
hzlmn commented 4 years ago

+1 Same problem. Polyglot patches pickle behaviour on initialization, which results in continuous problems in other places that use pickle, as in our case with joblib:

   |     raise self._value
worker_1    |   File "/usr/local/lib/python3.7/multiprocessing/pool.py", line 431, in _handle_tasks
worker_1    |     put(task)
worker_1    |   File "/usr/local/lib/python3.7/site-packages/joblib/pool.py", line 158, in send
worker_1    |     CustomizablePickler(buffer, self._reducers).dump(obj)
worker_1    |   File "/usr/local/lib/python3.7/site-packages/polyglot/utils.py", line 43, in _pickle_method
worker_1    |     func_name = method.im_func.__name__
worker_1    |   File "/usr/local/lib/python3.7/site-packages/polyglot/utils.py", line 43, in _pickle_method
AttributeError: ‘function’ object has no attribute ‘im_func’
seizaemon commented 4 years ago

+1 I have the same problem. @alex4321's suggestion works well in our environment.