joblib / joblib

Computing with Python functions.
http://joblib.readthedocs.org
BSD 3-Clause "New" or "Revised" License
3.89k stars 418 forks source link

local function failed to pickle after cython build #1297

Open bbbaka opened 2 years ago

bbbaka commented 2 years ago

Hello, I have a question about pickling a local function. Here is an example program to reproduce it.

from joblib import Parallel
from sklearn.utils.fixes import delayed, _joblib_parallel_args
from loky import wrap_non_picklable_objects, set_loky_pickler
import sys
import time
import traceback
from joblib.externals.loky import set_loky_pickler
from joblib import parallel_backend
from joblib import Parallel, delayed
from joblib import wrap_non_picklable_objects

class Test:
    """Minimal reproducer: run a function defined inside a method through joblib."""

    def func(self):
        """Apply the nested ``pow`` to ``range(10)`` via ``Parallel`` and return the results."""
        print("func ")
        set_loky_pickler('cloudpickle')

        # Deliberately a local (nested) function: pickling it for the
        # workers is exactly what this example exercises.
        def pow(value):
            print("pow x", value)
            return value

        backend_kwargs = _joblib_parallel_args(prefer="processes")
        runner = Parallel(n_jobs=5, **backend_kwargs)
        tasks = (delayed(pow)(item) for item in range(10))
        return runner(tasks)

if __name__ == '__main__':
    # Script entry point: build the reproducer and print what func() returns.
    instance = Test()
    outcome = instance.func()
    print(outcome)

Before I compile it with Cython, I get the expected result. But after compiling it with Cython (e.g. with `python setup.py build_ext --inplace`), I get:

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
joblib.externals.loky.process_executor._RemoteTraceback:
"""
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/site-packages/joblib/externals/loky/backend/queues.py", line 153, in _feed
    obj_ = dumps(obj, reducers=reducers)
  File "/usr/local/lib/python3.8/site-packages/joblib/externals/loky/backend/reduction.py", line 271, in dumps
    dump(obj, buf, reducers=reducers, protocol=protocol)
  File "/usr/local/lib/python3.8/site-packages/joblib/externals/loky/backend/reduction.py", line 264, in dump
    _LokyPickler(file, reducers=reducers, protocol=protocol).dump(obj)
  File "/usr/local/lib/python3.8/site-packages/joblib/externals/cloudpickle/cloudpickle_fast.py", line 563, in dump
    return Pickler.dump(self, obj)
AttributeError: Can't pickle local object 'xxx.xxx.<locals>.xxx'
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "xxx.py", line 68, in <module>
    main(sys.argv[1:])
  File "xxx.py", line 63, in main
    explain(curr_config, task)
  File "xxxx/xxxx/xxx/xxxx.py", line 94, in xxxx.xxxx.xxx.xxx.xxx
    xxx.xxx(explain_dir, **pipeline_transform_kwargs)
  File "xxx/xxx/xxx.py", line 313, in xxx.xxx.xxx.xxx.xxx
    **_joblib_parallel_args(prefer='processes'))(
  File "/usr/local/lib/python3.8/site-packages/joblib/parallel.py", line 1054, in __call__
    self.retrieve()
  File "/usr/local/lib/python3.8/site-packages/joblib/parallel.py", line 933, in retrieve
    self._output.extend(job.get(timeout=self.timeout))
  File "/usr/local/lib/python3.8/site-packages/joblib/_parallel_backends.py", line 542, in wrap_future_result
    return future.result(timeout=timeout)
  File "/usr/local/lib/python3.8/concurrent/futures/_base.py", line 444, in result
    return self.__get_result()
  File "/usr/local/lib/python3.8/concurrent/futures/_base.py", line 389, in __get_result
    raise self._exception
_pickle.PicklingError: Could not pickle the task to send it to the workers.

How can I avoid this problem without changing the local function into a non-local (module-level) one?

# setup.py

# encoding=utf-8
from setuptools import setup, find_packages
from setuptools.extension import Extension
from Cython.Build import cythonize
from Cython.Distutils import build_ext
from pathlib import Path
import shutil
import sys, os

# some global variable, don't change them if you don't know what they mean
my_build_dir = "build"

class MyBuildExt(build_ext):
    """``build_ext`` command that also copies non-compiled files into the target tree.

    After the regular extension build, every package's ``__init__.py`` and
    ``__main__.py`` and every non-``.py``/``.pyc`` file in the current
    directory (except names in ``exclude_list``) are copied to the target
    directory: ``build_lib`` for a normal build, or the directory of this
    script for an in-place build.

    Reads the module-level names ``my_packages`` and ``exclude_list``.
    """

    def run(self):
        """Build the extensions, then copy the extra files next to them."""
        build_ext.run(self)

        build_dir = Path(self.build_lib)
        root_dir = Path(__file__).parent

        target_dir = root_dir if self.inplace else build_dir

        for module in my_packages:
            # Dotted package name -> relative directory path.
            module_path = Path(module.replace(".", "/"))
            print(">>find modeule path:" + str(module_path))
            self.copy_file(module_path / '__init__.py', root_dir, target_dir)
            self.copy_file(module_path / '__main__.py', root_dir, target_dir)

        for p in Path('.').iterdir():
            if p.is_file() and p.suffix not in [".py", ".pyc"] and p.name not in exclude_list:
                self.copy_file(p, root_dir, target_dir)
        print("copy end")

    def copy_file(self, path, source_dir, destination_dir):
        """Copy ``source_dir / path`` to ``destination_dir / path`` if the source exists.

        A missing source is silently skipped. A copy whose source and
        destination are the same file is also skipped: for in-place builds
        both directories are identical and ``shutil.copyfile`` would raise
        ``shutil.SameFileError``. The destination's parent directory is
        created when it does not exist yet.

        NOTE(review): this method shadows distutils' ``Command.copy_file``
        (``infile, outfile, ...``) with a different signature; some
        setuptools versions appear to call ``self.copy_file`` themselves
        during in-place builds -- confirm and consider renaming this helper.
        """
        source = source_dir / path
        destination = destination_dir / path

        if not source.exists():
            return

        # Nothing to do when the file would be copied onto itself.
        if destination.exists() and source.samefile(destination):
            return

        destination.parent.mkdir(parents=True, exist_ok=True)
        shutil.copyfile(str(source), str(destination))

def find_scrips():
    """Return the names of the top-level ``.py`` files in the current directory.

    Only regular files are considered; names listed in the module-level
    ``exclude_list`` and names starting with a dot are left out. The
    collected names and the exclusion list are printed before returning.
    """
    scrips_list = [
        entry.name
        for entry in Path('.').iterdir()
        if entry.is_file()
        and entry.name not in exclude_list
        and entry.suffix == ".py"
        and not entry.name.startswith(".")
    ]
    print("find scrips:")
    print(scrips_list)
    print(exclude_list)
    return scrips_list

if __name__ == "__main__":
    # Build driver: clean the old build directory, collect packages and
    # top-level scripts, then hand everything to cythonize()/setup().
    my_project_name = "test"
    my_clang = "gcc "

    if os.path.isdir(my_build_dir):
        print("Exist build dir, auto remove")
        shutil.rmtree(my_build_dir)

    banner = "=" * 10
    print(banner + "start" + banner)

    # Files that must not be packaged; read by find_scrips() and MyBuildExt.
    exclude_list = ["setup.py", ".DS_Store"]
    os.environ["CC"] = my_clang

    # Every package that contains an __init__.py; read by MyBuildExt.run().
    my_packages = find_packages()
    print("packages:")
    print(my_packages)

    # One glob-style extension per package, followed by the top-level scripts.
    package_dirs = [name.replace(".", "/") for name in my_packages]
    ext = [
        Extension(pkg_dir + ".*", [pkg_dir + "/*.py"])
        for pkg_dir in package_dirs
    ]
    ext += find_scrips()

    cython_modules = cythonize(
        ext,
        build_dir=my_build_dir,
        compiler_directives={"always_allow_keywords": True},
    )

    setup(
        name=my_project_name,
        ext_modules=cython_modules,
        cmdclass={"build_ext": MyBuildExt},
        packages=find_packages(where="."),
    )
jjerphan commented 1 year ago

Hi, thank you for reporting this. This might relate to https://github.com/cloudpipe/cloudpickle/issues/502.