oslocyclotronlab / ompy

A python implementation of the Oslo method
https://ompy.readthedocs.io
GNU General Public License v3.0
7 stars 7 forks source link

Bug in `self_if_none()` with python 3.10.1 #197

Closed vetlewi closed 1 year ago

vetlewi commented 2 years ago

There seems to be a bug in the self_if_none() function when ever it is used in subprocesses in python 3.10.1 The issue doesn't occur in 3.9.6

This is the error I'm getting:

RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/Users/vetlewi/.pyenv/versions/3.10.1/lib/python3.10/site-packages/multiprocess/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "/Users/vetlewi/.pyenv/versions/3.10.1/lib/python3.10/site-packages/pathos/helpers/mp_helper.py", line -1, in <lambda>
  File "/Users/vetlewi/Git_Repositories/vetlewi/ompy/ompy/ensemble_normalizer.py", line 150, in step
    return res
  File "/Users/vetlewi/Git_Repositories/vetlewi/ompy/ompy/ensemble_normalizer.py", line 166, in normalizeSimultan
    self.normalizer_simultan.normalize(gsf=gsf, nld=nld, num=num)
  File "/Users/vetlewi/Git_Repositories/vetlewi/ompy/ompy/normalizer_simultan.py", line 126, in normalize
    self.normalizer_nld = copy.deepcopy(self.self_if_none(normalizer_nld))
  File "/Users/vetlewi/Git_Repositories/vetlewi/ompy/ompy/normalizer_simultan.py", line 427, in self_if_none
    return self_if_none(self, *args, **kwargs)
  File "/Users/vetlewi/Git_Repositories/vetlewi/ompy/ompy/library.py", line 405, in self_if_none
    name = _retrieve_name(variable)
  File "/Users/vetlewi/Git_Repositories/vetlewi/ompy/ompy/library.py", line 427, in _retrieve_name
    line = inspect.stack()[3].code_context[0].strip()
  File "/Users/vetlewi/.pyenv/versions/3.10.1/lib/python3.10/inspect.py", line 1678, in stack
    return getouterframes(sys._getframe(1), context)
  File "/Users/vetlewi/.pyenv/versions/3.10.1/lib/python3.10/inspect.py", line 1655, in getouterframes
    frameinfo = (frame,) + getframeinfo(frame, context)
  File "/Users/vetlewi/.pyenv/versions/3.10.1/lib/python3.10/inspect.py", line 1627, in getframeinfo
    start = lineno - 1 - context//2
TypeError: unsupported operand type(s) for -: 'NoneType' and 'int'
"""

The above exception was the direct cause of the following exception:

TypeError                                 Traceback (most recent call last)
/var/folders/cb/ly7f8x1n48q9lkmqt6c6lyv00000gn/T/ipykernel_25288/1567711863.py in <module>
----> 1 ensembleNorm.normalize()

~/Git_Repositories/vetlewi/ompy/ompy/ensemble_normalizer.py in normalize(self)
    118         N = len(nlds)
    119         iterator = pool.imap(self.step, range(N), nlds, gsfs)
--> 120         self.res = list(tqdm(iterator, total=N))
    121         pool.close()
    122         pool.join()

~/.pyenv/versions/3.10.1/lib/python3.10/site-packages/tqdm/notebook.py in __iter__(self)
    252     def __iter__(self):
    253         try:
--> 254             for obj in super(tqdm_notebook, self).__iter__():
    255                 # return super(tqdm...) will not catch exception
    256                 yield obj

~/.pyenv/versions/3.10.1/lib/python3.10/site-packages/tqdm/std.py in __iter__(self)
   1183 
   1184         try:
-> 1185             for obj in iterable:
   1186                 yield obj
   1187                 # Update and possibly print the progressbar.

~/.pyenv/versions/3.10.1/lib/python3.10/site-packages/multiprocess/pool.py in next(self, timeout)
    868         if success:
    869             return value
--> 870         raise value
    871 
    872     __next__ = next                    # XXX

TypeError: unsupported operand type(s) for -: 'NoneType' and 'int'
vetlewi commented 2 years ago

The following block of code will recreate the fault independent of OMpy

from pathos.multiprocessing import ProcessPool
from typing import Optional, Tuple, Iterator, Any, Union
import inspect
import re
import tqdm

def self_if_none(instance: Any, variable: Any, nonable: bool = False) -> Any:
    """ Sets `variable` from instance if variable is None.

    Note: Has to be imported as in the normalizer class due to class stack
          name retrieval

    Args:
        instance: instance
        variable: The variable to check
        nonable: Does not raise ValueError if
            variable and self.<variable_name> is None
            (where <variable_name> is replace by the variable's name).
    Returns:
        The value of variable or instance.variable
    Raises:
        ValueError: Nonable is True and if both variable and
            self.variable are None
    """

    def _retrieve_name(var: Any) -> str:
        """ Finds the source-code name of `var`

            NOTE: Only call from self.reset.

         Args:
            var: The variable to retrieve the name of.
        Returns:
            The variable's name.
        """
        # Retrieve the line of the source code of the third frame.
        # The 0th frame is the current function, the 1st frame is the
        # calling function and the second is the calling function's caller.
        line = inspect.stack()[3].code_context[0].strip()
        match = re.search(r".*\((\w+).*\).*", line)
        assert match is not None, "Retrieving of name failed"
        name = match.group(1)
        return name

    name = _retrieve_name(variable)
    if variable is None:
        self_variable = getattr(instance, name)
        if not nonable and self_variable is None:
            raise ValueError(f"`{name}` must be set")
        return self_variable
    return variable

class test_class:
    def __init__(self, im_a_object = None):
        self.my_object = im_a_object

    def step(self, i, my_object=None):
        # I will copy the n object over self.my_object it it isnt None.

        self.my_object = self.self_if_none(my_object)
        return i + n

    def self_if_none(self, *args, **kwargs):
        """ wrapper for lib.self_if_none """
        return self_if_none(self, *args, **kwargs)

ns = [1, 1, 1, 1, 1]

cls_suff = test_class(3)
res = []
for i, n in enumerate(ns):
    res.append(cls_suff.step(i, ns))
print(res)
pool = ProcessPool(nodes=2)
iterator = pool.imap(cls_suff.step, range(len(ns)), ns)
res = list(tqdm.tqdm_notebook(iterator, total=len(ns)))
pool.close()
pool.join()
pool.clear()

gives output:

[1, 2, 3, 4, 5]
/var/folders/cb/ly7f8x1n48q9lkmqt6c6lyv00000gn/T/ipykernel_25714/2921792000.py:78: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  res = list(tqdm.tqdm_notebook(iterator, total=len(ns)))
0%
0/5 [00:00<?, ?it/s]
---------------------------------------------------------------------------
RemoteTraceback                           Traceback (most recent call last)
RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/Users/vetlewi/.pyenv/versions/3.10.1/lib/python3.10/site-packages/multiprocess/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "/Users/vetlewi/.pyenv/versions/3.10.1/lib/python3.10/site-packages/pathos/helpers/mp_helper.py", line -1, in <lambda>
  File "/var/folders/cb/ly7f8x1n48q9lkmqt6c6lyv00000gn/T/ipykernel_25714/2921792000.py", line 62, in step
    return i + n
  File "/var/folders/cb/ly7f8x1n48q9lkmqt6c6lyv00000gn/T/ipykernel_25714/2921792000.py", line 146, in self_if_none
  File "/var/folders/cb/ly7f8x1n48q9lkmqt6c6lyv00000gn/T/ipykernel_25714/2921792000.py", line 45, in self_if_none
    name = _retrieve_name(variable)
  File "/var/folders/cb/ly7f8x1n48q9lkmqt6c6lyv00000gn/T/ipykernel_25714/2921792000.py", line 39, in _retrieve_name
    line = inspect.stack()[3].code_context[0].strip()
  File "/Users/vetlewi/.pyenv/versions/3.10.1/lib/python3.10/inspect.py", line 1678, in stack
    return getouterframes(sys._getframe(1), context)
  File "/Users/vetlewi/.pyenv/versions/3.10.1/lib/python3.10/inspect.py", line 1655, in getouterframes
    frameinfo = (frame,) + getframeinfo(frame, context)
  File "/Users/vetlewi/.pyenv/versions/3.10.1/lib/python3.10/inspect.py", line 1627, in getframeinfo
    start = lineno - 1 - context//2
TypeError: unsupported operand type(s) for -: 'NoneType' and 'int'
"""

The above exception was the direct cause of the following exception:

TypeError                                 Traceback (most recent call last)
/var/folders/cb/ly7f8x1n48q9lkmqt6c6lyv00000gn/T/ipykernel_25714/2921792000.py in <module>
     76 pool = ProcessPool(nodes=2)
     77 iterator = pool.imap(cls_suff.step, range(len(ns)), ns)
---> 78 res = list(tqdm.tqdm_notebook(iterator, total=len(ns)))
     79 pool.close()
     80 pool.join()

~/.pyenv/versions/3.10.1/lib/python3.10/site-packages/tqdm/notebook.py in __iter__(self)
    255     def __iter__(self):
    256         try:
--> 257             for obj in super(tqdm_notebook, self).__iter__():
    258                 # return super(tqdm...) will not catch exception
    259                 yield obj

~/.pyenv/versions/3.10.1/lib/python3.10/site-packages/tqdm/std.py in __iter__(self)
   1178 
   1179         try:
-> 1180             for obj in iterable:
   1181                 yield obj
   1182                 # Update and possibly print the progressbar.

~/.pyenv/versions/3.10.1/lib/python3.10/site-packages/multiprocess/pool.py in next(self, timeout)
    868         if success:
    869             return value
--> 870         raise value
    871 
    872     __next__ = next                    # XXX

TypeError: unsupported operand type(s) for -: 'NoneType' and 'int'
vetlewi commented 2 years ago

This may be a bug in pathos. Will try to recreate with the default python processing pool class.

ErlendLima commented 2 years ago

I have encountered a similar bug before. OMpy does not play well with multiprocessing, but I have not found the root cause, but Pathos seems to crash too often to be worth the effort. What is worrying is that I am unable to replicate the bug using the code you provided. The function self_if_none is very brittle as it walks up Python's stack frame to find the variable and its value, hence it will fail if the stack does not look as when I wrote it. For future use we should remove self_if_none and find a more robust solution.

vetlewi commented 2 years ago

This is the simplest I've been able to get it:

from pathos.multiprocessing import ProcessPool
import inspect

def do_something_with_inspect(count):
    count *= 2
    return inspect.stack()

pool = ProcessPool(nodes=2)
iterator = pool.imap(do_something_with_inspect, range(4))
for r in iterator:
    print(r)
pool.close()
pool.join()
pool.clear()

results:

multiprocess.pool.RemoteTraceback:
"""
Traceback (most recent call last):
  File "/Users/vetlewi/.pyenv/versions/3.10.1/envs/pathosbug/lib/python3.10/site-packages/multiprocess/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "/Users/vetlewi/Desktop/python_playground/pathos/pathos/helpers/mp_helper.py", line -1, in <lambda>
  File "/Users/vetlewi/Desktop/python_playground/pathos_error/simple.py", line -1, in do_something_with_inspect
  File "/Users/vetlewi/.pyenv/versions/3.10.1/lib/python3.10/inspect.py", line 1678, in stack
    return getouterframes(sys._getframe(1), context)
  File "/Users/vetlewi/.pyenv/versions/3.10.1/lib/python3.10/inspect.py", line 1655, in getouterframes
    frameinfo = (frame,) + getframeinfo(frame, context)
  File "/Users/vetlewi/.pyenv/versions/3.10.1/lib/python3.10/inspect.py", line 1627, in getframeinfo
    start = lineno - 1 - context//2
TypeError: unsupported operand type(s) for -: 'NoneType' and 'int'
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Users/vetlewi/Desktop/python_playground/pathos_error/simple.py", line 11, in <module>
    for r in iterator:
  File "/Users/vetlewi/.pyenv/versions/3.10.1/envs/pathosbug/lib/python3.10/site-packages/multiprocess/pool.py", line 870, in next
    raise value
TypeError: unsupported operand type(s) for -: 'NoneType' and 'int'

This is with python 3.10.1 and pathos HEAD

ErlendLima commented 2 years ago

Interesting! Does the same bug occur with vanilla multiprocessing without Pathos? At least we now know that inspection does not play nice with Pathos.

vetlewi commented 2 years ago

Interesting! Does the same bug occur with vanilla multiprocessing without Pathos? At least we now know that inspection does not play nice with Pathos.

Vanilla works great:

from pathos.multiprocessing import ProcessPool
import inspect

def do_something_with_inspect(count):
    count *= 2
    r = inspect.stack()
    return count

pool = ProcessPool(nodes=2)
iterator = pool.imap(do_something_with_inspect, range(4))
for r in iterator:
    print(r)
pool.close()
pool.join()
pool.clear()
% python simple_python.py
0
2
4
6
ErlendLima commented 2 years ago

Great! For a temporary fix we can switch to Vanilla.

vetlewi commented 2 years ago

Great! For a temporary fix we can switch to Vanilla.

I'm not sure if that is a simple fix unfortunately. Using vanilla I get the error:

2022-01-05 16:03:07,606 - ompy.ensemble_normalizer - INFO - Start normalization with 2 cpus
0%
0/2 [00:00<?, ?it/s]
---------------------------------------------------------------------------
PicklingError                             Traceback (most recent call last)
/var/folders/cb/ly7f8x1n48q9lkmqt6c6lyv00000gn/T/ipykernel_90525/1567711863.py in <module>
----> 1 ensembleNorm.normalize()

~/Git_Repositories/vetlewi/ompy/ompy/ensemble_normalizer.py in normalize(self)
    120         N = len(nlds)
    121         iterator = pool.imap(self.step_wrapper, zip(range(N), zip(nlds, gsfs)))
--> 122         self.res = list(tqdm(iterator, total=N))
    123         pool.close()
    124         pool.join()

~/.pyenv/versions/3.10.1/lib/python3.10/site-packages/tqdm/notebook.py in __iter__(self)
    255     def __iter__(self):
    256         try:
--> 257             for obj in super(tqdm_notebook, self).__iter__():
    258                 # return super(tqdm...) will not catch exception
    259                 yield obj

~/.pyenv/versions/3.10.1/lib/python3.10/site-packages/tqdm/std.py in __iter__(self)
   1178 
   1179         try:
-> 1180             for obj in iterable:
   1181                 yield obj
   1182                 # Update and possibly print the progressbar.

~/.pyenv/versions/3.10.1/lib/python3.10/multiprocessing/pool.py in next(self, timeout)
    868         if success:
    869             return value
--> 870         raise value
    871 
    872     __next__ = next                    # XXX

~/.pyenv/versions/3.10.1/lib/python3.10/multiprocessing/pool.py in _handle_tasks(taskqueue, put, outqueue, pool, cache)
    535                         break
    536                     try:
--> 537                         put(task)
    538                     except Exception as e:
    539                         job, idx = task[:2]

~/.pyenv/versions/3.10.1/lib/python3.10/multiprocessing/connection.py in send(self, obj)
    209         self._check_closed()
    210         self._check_writable()
--> 211         self._send_bytes(_ForkingPickler.dumps(obj))
    212 
    213     def recv_bytes(self, maxlength=None):

~/.pyenv/versions/3.10.1/lib/python3.10/multiprocessing/reduction.py in dumps(cls, obj, protocol)
     49     def dumps(cls, obj, protocol=None):
     50         buf = io.BytesIO()
---> 51         cls(buf, protocol).dump(obj)
     52         return buf.getbuffer()
     53 

PicklingError: Can't pickle <function const_temperature at 0x12c89f370>: attribute lookup const_temperature on ompy.normalizer_nld failed

This seems to be just another rabbit hole:(

vetlewi commented 2 years ago

For now I will wrap everything in an exception block and fall back to single threaded if multiprocessing fails.