A simple NLP library that allows profiling datasets with one or more text columns. When given a dataset and a column name containing text data, NLP Profiler will return either high-level insights or low-level/granular statistical information about the text in that column.
Other
241
stars
37
forks
source link
Error related to parallelisation process when trying to use NLP Profiler #22
The error below was reported by @carlolepelaars when using the NLP Profiler on a text dataset in a local machine environment with Anaconda (I have also encountered a similar error when running NLP Profiler on Kaggle, also with the Python environment set up by Anaconda).
Usage
df = apply_text_profiling(df, 'Text')
Output
Command:
```df = apply_text_profiling(df, 'Text')```
Full output:
final params: {'high_level': True, 'granular': True, 'grammar_check': False, 'spelling_check': True, 'parallelisation_method': 'default'}
Granular features: 0%
0/3 [00:01<?, ?it/s]
Granular features: Text => sentences_count: 0%
0/13 [00:01<?, ?it/s]
sentences_count: 32%
32/100 [00:20<00:01, 38.40it/s]
---------------------------------------------------------------------------
_RemoteTraceback Traceback (most recent call last)
_RemoteTraceback:
'''
Traceback (most recent call last):
File "/opt/anaconda3/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py", line 391, in _process_worker
call_item = call_queue.get(block=True, timeout=timeout)
File "/opt/anaconda3/lib/python3.7/multiprocessing/queues.py", line 113, in get
return _ForkingPickler.loads(res)
File "/opt/anaconda3/lib/python3.7/site-packages/nlp_profiler/generate_features.py", line 5, in <module>
import swifter # noqa
File "/opt/anaconda3/lib/python3.7/site-packages/swifter/__init__.py", line 5, in <module>
from .swifter import SeriesAccessor, DataFrameAccessor
File "/opt/anaconda3/lib/python3.7/site-packages/swifter/swifter.py", line 14, in <module>
from .base import (
File "/opt/anaconda3/lib/python3.7/site-packages/swifter/base.py", line 4, in <module>
from psutil import cpu_count, virtual_memory
File "/opt/anaconda3/lib/python3.7/site-packages/psutil/__init__.py", line 159, in <module>
from . import _psosx as _psplatform
File "/opt/anaconda3/lib/python3.7/site-packages/psutil/_psosx.py", line 15, in <module>
from . import _psutil_osx as cext
ImportError: dlopen(/opt/anaconda3/lib/python3.7/site-packages/psutil/_psutil_osx.cpython-37m-darwin.so, 2): Symbol not found: ___CFConstantStringClassReference
Referenced from: /opt/anaconda3/lib/python3.7/site-packages/psutil/_psutil_osx.cpython-37m-darwin.so
Expected in: flat namespace
in /opt/anaconda3/lib/python3.7/site-packages/psutil/_psutil_osx.cpython-37m-darwin.so
'''
The above exception was the direct cause of the following exception:
BrokenProcessPool Traceback (most recent call last)
<ipython-input-24-96bf1218f0a1> in <module>
----> 1 df = apply_text_profiling(df, 'Text')
/opt/anaconda3/lib/python3.7/site-packages/nlp_profiler/core.py in apply_text_profiling(dataframe, text_column, params)
64 action_function(
65 action_description, new_dataframe,
---> 66 text_column, default_params[PARALLELISATION_METHOD_OPTION]
67 )
68
/opt/anaconda3/lib/python3.7/site-packages/nlp_profiler/granular_features.py in apply_granular_features(heading, new_dataframe, text_column, parallelisation_method)
45 generate_features(
46 heading, granular_features_steps,
---> 47 new_dataframe, parallelisation_method
48 )
/opt/anaconda3/lib/python3.7/site-packages/nlp_profiler/generate_features.py in generate_features(main_header, high_level_features_steps, new_dataframe, parallelisation_method)
45 new_dataframe[new_column] = parallelisation_method_function(
46 source_field, transformation_function,
---> 47 source_column, new_column
48 )
49
/opt/anaconda3/lib/python3.7/site-packages/nlp_profiler/generate_features.py in using_joblib_parallel(source_field, apply_function, source_column, new_column)
65 delayed(run_task)(
66 apply_function, each_value
---> 67 ) for _, each_value in enumerate(source_values_to_transform)
68 )
69 source_values_to_transform.update()
/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
1015
1016 with self._backend.retrieval_context():
-> 1017 self.retrieve()
1018 # Make sure that we get a last message telling us we are done
1019 elapsed_time = time.time() - self._start_time
/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in retrieve(self)
907 try:
908 if getattr(self._backend, 'supports_timeout', False):
--> 909 self._output.extend(job.get(timeout=self.timeout))
910 else:
911 self._output.extend(job.get())
/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py in wrap_future_result(future, timeout)
560 AsyncResults.get from multiprocessing."""
561 try:
--> 562 return future.result(timeout=timeout)
563 except LokyTimeoutError:
564 raise TimeoutError()
/opt/anaconda3/lib/python3.7/concurrent/futures/_base.py in result(self, timeout)
433 raise CancelledError()
434 elif self._state == FINISHED:
--> 435 return self.__get_result()
436 else:
437 raise TimeoutError()
/opt/anaconda3/lib/python3.7/concurrent/futures/_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.
Suggested workaround
Use NLP Profiler with the following parameters instead:
The error below was reported by @carlolepelaars when using the NLP Profiler on a text dataset in a local machine environment with Anaconda (I have also encountered a similar error when running NLP Profiler on Kaggle, also with the Python environment set up by Anaconda).
Usage
Output
Suggested workaround
Use NLP Profiler with the following parameters instead:
Suggested solution to issue
apply_text_profiling
method) around a try...except
block
Thanks for sharing the issue with us, Carlo.