aphp / edsnlp

Modular, fast NLP framework, compatible with Pytorch and spaCy, offering tailored support for French clinical notes.
https://aphp.github.io/edsnlp/
BSD 3-Clause "New" or "Revised" License
111 stars 29 forks source link

Pendulum serialization #290

Closed percevalw closed 3 months ago

percevalw commented 3 months ago

In Python 3.7, pendulum (2.1.2) does not support dill (0.3.7) serialization, as the example below shows. The main culprit is dill, but since dill has dropped support for Python 3.7, we cannot expect the fix to come from it. The plan is therefore to drop pendulum.

import edsnlp, edsnlp.pipes as eds

# with cols "note_id", "note_text" and optionally "note_datetime"
my_omop_df = ...
nlp = edsnlp.blank("eds")
nlp.add_pipe(eds.dates(as_ents=True))

docs = edsnlp.data.from_pandas(my_omop_df)
docs = docs.map_pipeline(nlp)
docs = docs.set_processing(backend="multiprocessing")
docs = docs.to_pandas(
    converter="ents",
    span_attributes={"date.datetime": "datetime"},
)
See the stacktrace

```
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
/tmp/ipykernel_2272/1949767627.py in
----> 1 res = docs.to_pandas(converter=pick_results)

.venv/lib/python3.7/site-packages/confit/registry.py in wrapper_function(*args, **kwargs)
    330             e.__cause__ = None
    331             e.__suppress_context__ = True
--> 332             raise e.with_traceback(remove_lib_from_traceback(e.__traceback__))
    333
    334     wrapper_function.vd = vd  # type: ignore

.venv/lib/python3.7/site-packages/pydantic/decorator.cpython-37m-x86_64-linux-gnu.so in pydantic.decorator.ValidatedFunction.execute()

.venv/edsnlp/edsnlp/data/pandas.py in to_pandas(data, converter, dtypes, **kwargs)
    155         data = data.map(converter, kwargs=kwargs)
    156
--> 157     return data.write(PandasWriter(dtypes))

.venv/edsnlp/edsnlp/core/lazy_collection.py in write(self, writer, execute)
    312             config=self.config,
    313         )
--> 314         return lc.execute() if execute else lc
    315
    316     def execute(self):

.venv/edsnlp/edsnlp/core/lazy_collection.py in execute(self)
    343             backend = "simple"
    344         execute = getattr(edsnlp.processing, f"execute_{backend}_backend")
--> 345         return execute(self)
    346
    347     def __iter__(self):

.venv/edsnlp/edsnlp/processing/multiprocessing.py in execute_multiprocessing_backend(lc)
    979
    980     gen = process()
--> 981     return flatten(gen) if lc.writer is None else lc.writer.write_main(gen)

.venv/edsnlp/edsnlp/data/pandas.py in write_main(self, fragments)
    107         import pandas as pd
    108
--> 109         columns = ld_to_dl(flatten(fragments))
    110         res = pd.DataFrame(columns)
    111         return res.astype(self.dtypes) if self.dtypes else res

.venv/edsnlp/edsnlp/utils/collections.py in ld_to_dl(ld)
     33         The dictionary of lists
     34     """
---> 35     ld = list(ld)
     36     return {k: [dic.get(k) for dic in ld] for k in (ld[0] if len(ld) else ())}
     37

.venv/edsnlp/edsnlp/utils/collections.py in flatten(items)
    357
    358 def flatten(items):
--> 359     for item in items:
    360         if isinstance(item, list):
    361             yield from flatten(item)

.venv/edsnlp/edsnlp/processing/multiprocessing.py in process()
    906
    907             while all(sum(wl.values()) >= max_workload for wl in active_chunks):
--> 908                 yield from get_and_process_output()
    909
    910             # Shuffle to ensure the first process does not receive all the

.venv/edsnlp/edsnlp/processing/multiprocessing.py in get_and_process_output()
    879
    880     def get_and_process_output():
--> 881         outputs, count, cpu_idx, output_task_id = next(outputs_iterator)
    882         if output_task_id == "finalize":
    883             non_finalized.discard(cpu_idx)

.venv/edsnlp/edsnlp/processing/multiprocessing.py in iter_results(self)
    311
    312     def iter_results(self):
--> 313         for out in iter(self.outputs_queue.get, None):
    314             yield out
    315

.venv/lib/python3.7/multiprocessing/queues.py in get(self, block, timeout)
    111                 self._rlock.release()
    112         # unserialize the data after having released the lock
--> 113         return _ForkingPickler.loads(res)
    114
    115     def qsize(self):

.venv/lib/python3.7/site-packages/dill/_dill.py in loads(str, ignore, **kwds)
    299     """
    300     file = StringIO(str)
--> 301     return load(file, ignore, **kwds)
    302
    303 # def dumpzs(obj, protocol=None):

.venv/lib/python3.7/site-packages/dill/_dill.py in load(file, ignore, **kwds)
    285     See :func:`loads` for keyword arguments.
    286     """
--> 287     return Unpickler(file, ignore=ignore, **kwds).load()
    288
    289 def loads(str, ignore=None, **kwds):

.venv/lib/python3.7/site-packages/dill/_dill.py in load(self)
    440
    441     def load(self): #NOTE: if settings change, need to update attributes
--> 442         obj = StockUnpickler.load(self)
    443         if type(obj).__module__ == getattr(_main_module, '__name__', '__main__'):
    444             if not self._ignore:

.venv/lib/python3.7/site-packages/pendulum/tz/timezone.py in __init__(self, offset, name)
    310         self._name = name
    311         self._offset = offset
--> 312         self._utcoffset = timedelta(seconds=offset)
    313
    314     @property

NameError: name 'timedelta' is not defined
```

Your Environment