ray-project / ray

Ray is a unified framework for scaling AI and Python applications. Ray consists of a core distributed runtime and a set of AI Libraries for accelerating ML workloads.
https://ray.io
Apache License 2.0

Actors do not work properly with subclasses that call super. #449

Open robertnishihara opened 7 years ago

robertnishihara commented 7 years ago

Continuing a discussion from #439, and as pointed out by @raopku, the following does not work.

import ray

ray.init()

import cloudpickle

class A(object):
  pass

@ray.remote
class B(A):
  def __init__(self):
    super(B, self).__init__()

B.remote()

At the root of the problem is the fact that cloudpickle doesn't seem to be able to serialize subclasses that call super. For example, consider the following.

import cloudpickle

class A(object):
  pass

class B(A):
  def __init__(self):
    super(B, self).__init__()

cloudpickle.dumps(B)

The last line fails with the following exception.

---------------------------------------------------------------------------
RecursionError                            Traceback (most recent call last)
/Users/rkn/anaconda3/lib/python3.6/site-packages/cloudpickle/cloudpickle.py in dump(self, obj)
    145         try:
--> 146             return Pickler.dump(self, obj)
    147         except RuntimeError as e:

/Users/rkn/anaconda3/lib/python3.6/pickle.py in dump(self, obj)
    408             self.framer.start_framing()
--> 409         self.save(obj)
    410         self.write(STOP)

/Users/rkn/anaconda3/lib/python3.6/pickle.py in save(self, obj, save_persistent_id)
    475         if f is not None:
--> 476             f(self, obj) # Call unbound method with explicit self
    477             return

/Users/rkn/anaconda3/lib/python3.6/site-packages/cloudpickle/cloudpickle.py in save_global(self, obj, name, pack)
    424 
--> 425             self.save_reduce(typ, (obj.__name__, obj.__bases__, d), obj=obj)
    426         else:

/Users/rkn/anaconda3/lib/python3.6/site-packages/cloudpickle/cloudpickle.py in save_reduce(self, func, args, state, listitems, dictitems, obj)
    585             save(func)
--> 586             save(args)
    587             write(pickle.REDUCE)

/Users/rkn/anaconda3/lib/python3.6/pickle.py in save(self, obj, save_persistent_id)
    475         if f is not None:
--> 476             f(self, obj) # Call unbound method with explicit self
    477             return

/Users/rkn/anaconda3/lib/python3.6/pickle.py in save_tuple(self, obj)
    735             for element in obj:
--> 736                 save(element)
    737             # Subtle.  Same as in the big comment below.

/Users/rkn/anaconda3/lib/python3.6/pickle.py in save(self, obj, save_persistent_id)
    475         if f is not None:
--> 476             f(self, obj) # Call unbound method with explicit self
    477             return

/Users/rkn/anaconda3/lib/python3.6/pickle.py in save_dict(self, obj)
    820         self.memoize(obj)
--> 821         self._batch_setitems(obj.items())
    822 

/Users/rkn/anaconda3/lib/python3.6/pickle.py in _batch_setitems(self, items)
    846                     save(k)
--> 847                     save(v)
    848                 write(SETITEMS)

/Users/rkn/anaconda3/lib/python3.6/pickle.py in save(self, obj, save_persistent_id)
    475         if f is not None:
--> 476             f(self, obj) # Call unbound method with explicit self
    477             return

/Users/rkn/anaconda3/lib/python3.6/site-packages/cloudpickle/cloudpickle.py in save_function(self, obj, name)
    263                 or themodule is None):
--> 264             self.save_function_tuple(obj)
    265             return

/Users/rkn/anaconda3/lib/python3.6/site-packages/cloudpickle/cloudpickle.py in save_function_tuple(self, func)
    311         save(_make_skel_func)
--> 312         save((code, closure, base_globals))
    313         write(pickle.REDUCE)

/Users/rkn/anaconda3/lib/python3.6/pickle.py in save(self, obj, save_persistent_id)
    475         if f is not None:
--> 476             f(self, obj) # Call unbound method with explicit self
    477             return

/Users/rkn/anaconda3/lib/python3.6/pickle.py in save_tuple(self, obj)
    735             for element in obj:
--> 736                 save(element)
    737             # Subtle.  Same as in the big comment below.

/Users/rkn/anaconda3/lib/python3.6/pickle.py in save(self, obj, save_persistent_id)
    475         if f is not None:
--> 476             f(self, obj) # Call unbound method with explicit self
    477             return

/Users/rkn/anaconda3/lib/python3.6/pickle.py in save_list(self, obj)
    780         self.memoize(obj)
--> 781         self._batch_appends(obj)
    782 

/Users/rkn/anaconda3/lib/python3.6/pickle.py in _batch_appends(self, items)
    807             elif n:
--> 808                 save(tmp[0])
    809                 write(APPEND)

... last 16 frames repeated, from the frame below ...

/Users/rkn/anaconda3/lib/python3.6/pickle.py in save(self, obj, save_persistent_id)
    475         if f is not None:
--> 476             f(self, obj) # Call unbound method with explicit self
    477             return

RecursionError: maximum recursion depth exceeded in comparison

During handling of the above exception, another exception occurred:

PicklingError                             Traceback (most recent call last)
<ipython-input-2-e105f7705909> in <module>()
----> 1 cloudpickle.dumps(B)

/Users/rkn/anaconda3/lib/python3.6/site-packages/cloudpickle/cloudpickle.py in dumps(obj, protocol)
    704 
    705     cp = CloudPickler(file,protocol)
--> 706     cp.dump(obj)
    707 
    708     return file.getvalue()

/Users/rkn/anaconda3/lib/python3.6/site-packages/cloudpickle/cloudpickle.py in dump(self, obj)
    148             if 'recursion' in e.args[0]:
    149                 msg = """Could not pickle object as excessively deep recursion required."""
--> 150                 raise pickle.PicklingError(msg)
    151 
    152     def save_memoryview(self, obj):

PicklingError: Could not pickle object as excessively deep recursion required.
raopku commented 7 years ago

I think it is urgent to solve this problem of handling calls to the super class. It blocks the development of algorithm libraries based on Ray, and an algorithm library can hardly avoid class inheritance.

robertnishihara commented 7 years ago

As a workaround, you can avoid referring to the class name within the class definition. For example, suppose the base class is

class A(object):
  pass

Instead of defining the actor as

@ray.actor
class B(A):
  def __init__(self):
    super(B, self).__init__()

# This fails.
B()

You can define it as follows.

@ray.actor
class B(A):
  def __init__(self):
    A.__init__(self)

# This works.
B()
robertnishihara commented 7 years ago

@mehrdadn, you've looked a lot at serialization in Python. Is it correct that the problem has to do with referring to the name of B within the definition of B? Are there obvious solutions or workarounds that we're missing?

mehrdadn commented 7 years ago

It does indeed seem that cloudpickle doesn't support a class referring to itself by name at all:

>>> class A:
    def __init__(self): A
>>> import cloudpickle
>>> cloudpickle.dumps(A)
Traceback (most recent call last):
  File "<pyshell#5>", line 1, in <module>
    cloudpickle.dumps(A)
  File "site-packages\cloudpickle\cloudpickle.py", line 706, in dumps
    cp.dump(obj)
  File "site-packages\cloudpickle\cloudpickle.py", line 146, in dump
    return Pickler.dump(self, obj)
  File "pickle.py", line 224, in dump
    self.save(obj)
  File "pickle.py", line 286, in save
    f(self, obj) # Call unbound method with explicit self
  File "site-packages\cloudpickle\cloudpickle.py", line 425, in save_global
    self.save_reduce(typ, (obj.__name__, obj.__bases__, d), obj=obj)
  File "site-packages\cloudpickle\cloudpickle.py", line 590, in save_reduce
    self.memoize(obj)
  File "pickle.py", line 244, in memoize
    assert id(obj) not in self.memo
AssertionError

It also seems to be strictly a cloudpickle bug, since plain pickle doesn't have any problem pickling this.

The workaround for __init__ is what you mention. The workaround for other cases (which generally arise in the context of @staticmethod, since that's where you explicitly need the class name) is to just use @classmethod instead of @staticmethod, so that you can say cls.foo() instead of MyClass.foo(). Not sure what other cases there are, but it's likely this will also happen in at least some situations where two classes cross-reference each other; not sure of an easy workaround in that case though.
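
For illustration, a minimal sketch of that @classmethod workaround (MyClass, foo, and bar are just made-up names):

class MyClass:
  @staticmethod
  def foo():
    return 42

  # A @staticmethod that needed to call foo would have to write MyClass.foo(),
  # embedding the class name in its own body. A @classmethod reaches the class
  # through cls instead, so the name MyClass never appears inside the body.
  @classmethod
  def bar(cls):
    return cls.foo()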

I would file this as a bug with cloudpickle.

robertnishihara commented 7 years ago

Thanks @mehrdadn. In this case, pickle doesn't quite do the right thing either. It serializes

class A:
  def __init__(self):
    A

as b'\x80\x03c__main__\nA\nq\x00.', which doesn't seem to contain enough information to reconstruct the class. Calling pickle.loads on that string raises

AttributeError: Can't get attribute 'A' on <module '__main__'>
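
For context, this is standard pickle behavior rather than something specific to this class: pickle serializes classes by reference (module name plus class name), not by value. A quick sketch, using pickletools only to inspect the opcodes:

import pickle
import pickletools

class A:
  def __init__(self):
    A  # the self-reference in the body doesn't matter to pickle

# With protocol 3 this produces the same bytes shown above, beginning with the
# GLOBAL opcode c__main__\nA\n. Unpickling in a process whose __main__ has no
# attribute A therefore fails with the AttributeError above.
data = pickle.dumps(A, protocol=3)
pickletools.dis(data)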
mehrdadn commented 7 years ago

Oof, I forgot to delete A from the global namespace before loading it again, that makes sense. That said, I still think this is a bug... I don't see why it shouldn't be able to work correctly.

robertnishihara commented 7 years ago

Closing this for now since it seems to be a bug in cloudpickle, and we have a viable workaround, which is to call BaseClass.__init__(self) instead of super().__init__().

pcmoritz commented 7 years ago

The cloudpickle issue is now fixed by https://github.com/cloudpipe/cloudpickle/pull/102; this should be part of the next cloudpickle release.

It still doesn't work with Ray and the cloudpickle master, but there is new hope that we can fix it!

robertnishihara commented 7 years ago

We haven't resolved this yet, but the following workaround seems to work.

import ray

ray.init()

class Foo(object):
    def __init__(self):
        self.x = 1

class Bar(Foo):
    def __init__(self):
        super(Bar, self).__init__()

    def get_x(self):
        return self.x

Bar2 = ray.remote(Bar)

b = Bar2.remote()

ray.get(b.get_x.remote())

cc @mehrdadn @royf

katadh commented 5 years ago

Any idea how I can do this if I am using actors that require a GPU? If I add a GPU argument to the example above, i.e.

Bar2 = ray.remote(Bar, num_gpus=1)

I get the following error:

File "/home/katadh/anaconda3/envs/softlearning/lib/python3.6/site-packages/ray/worker.py", line 2440, in remote
    assert len(args) == 0 and len(kwargs) > 0, error_string
AssertionError: The @ray.remote decorator must be applied either with no arguments 
and no parentheses, for example '@ray.remote', or it must be applied using some of 
the arguments 'num_return_vals', 'num_cpus', 'num_gpus', 'resources', 'max_calls', or
'max_reconstructions', like '@ray.remote(num_return_vals=2, resources={"CustomResource": 1})'
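
Going by the wording of that assertion, one form that should satisfy it (a sketch only, not verified against that exact Ray version) is to call ray.remote with keyword arguments only and then apply the returned decorator to the class:

import ray

ray.init()

class Foo(object):
    def __init__(self):
        self.x = 1

class Bar(Foo):
    def __init__(self):
        Foo.__init__(self)  # base-class workaround from earlier in the thread

# ray.remote(num_gpus=1) returns a decorator; applying it to Bar matches the
# form the error message asks for, instead of passing Bar as a positional arg.
Bar2 = ray.remote(num_gpus=1)(Bar)
b = Bar2.remote()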
amiranas commented 5 years ago

@katadh

If you are using Python 3, there seems to be a simple workaround to the entire problem. Just leave out the super() arguments:

import ray

ray.init()

class Foo(object):
    def __init__(self):
        self.x = 1

@ray.remote
class Bar(Foo):
    def __init__(self):
        super().__init__()

    def get_x(self):
        return self.x

b = Bar.remote()

print(ray.get(b.get_x.remote()))

This did not cause an error for me and also worked with @ray.remote(num_gpus=1).

katadh commented 5 years ago

@amiranas Thanks! I'll try that

robertnishihara commented 3 years ago

The original issue no longer seems completely accurate, but there is still a problem here.

If I run

import ray

ray.init()

import cloudpickle

class A(object):
  pass

@ray.remote
class B(A):
  def __init__(self):
    super(B, self).__init__()

B.remote()

There is no issue with cloudpickle now, but the actor constructor fails with

2020-11-14 13:12:18,793 ERROR worker.py:1018 -- Possible unhandled error from worker: ray::B.__init__() (pid=73022, ip=192.168.1.104)
  File "python/ray/_raylet.pyx", line 484, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 438, in ray._raylet.execute_task.function_executor
  File "<ipython-input-1-aa83cb2b851c>", line 13, in __init__
TypeError: super() argument 1 must be type, not ActorClass(B)
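
A simplified sketch of what appears to be going on (not Ray's actual implementation): the decorator rebinds the module-level name B to a wrapper object, so by the time __init__ evaluates super(B, self), B no longer names the class that self is an instance of.

class ActorClassStub:
    """Stand-in for the wrapper object that the decorator returns."""
    def __init__(self, cls):
        self._cls = cls

def fake_remote(cls):
    # Rebind the decorated name to the wrapper, mimicking the real decorator.
    return ActorClassStub(cls)

class A(object):
    pass

@fake_remote
class B(A):
    def __init__(self):
        # The global name B now refers to the ActorClassStub instance, so this
        # raises "TypeError: super() argument 1 must be type, not ActorClassStub".
        super(B, self).__init__()

try:
    B._cls()  # instantiate the wrapped class to trigger the error
except TypeError as e:
    print(e)

This also explains why the zero-argument super() form mentioned elsewhere in the thread works: it resolves the class through the compiler-provided __class__ cell rather than through the module-level name B.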
bluemonster0808 commented 2 years ago

Has this problem been fixed yet? I get the same bug in Ray 1.12.1:

class A(object):
  def __init__(self):
    print("I'm A")

@ray.remote
class B(A):
  def __init__(self):
    super(B, self).__init__()
    #super().__init__() #this can work
    print("I'm B")
  def test(self):
      pass

b=B.remote()
objref = b.test.remote()
ray.get(objref)
[screenshot of the resulting error attached]