Using distributed or parallel set-up in script?: NA
Who can help
@mfuntowicz
Information
When using the question-answering pipeline, my script is run multiple times due to the thread pool within squad_convert_examples_to_features. In combination with other large models that take minutes to load, this causes them to reload every time the pipeline is called - or error outright if the code is not encapsulated. I've temporarily patched the transformers\data\processors\squad.py file on my end, forcing it to run in the current thread rather than multi-threading:
# with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
# annotate_ = partial(
# squad_convert_example_to_features,
# max_seq_length=max_seq_length,
# doc_stride=doc_stride,
# max_query_length=max_query_length,
# is_training=is_training,
# )
# features = list(
# tqdm(
# p.imap(annotate_, examples, chunksize=32),
# total=len(examples),
# desc="convert squad examples to features",
# disable=not tqdm_enabled,
# )
# )
squad_convert_example_to_features_init(tokenizer)
for example in examples:
features.append(squad_convert_example_to_features(
example,
max_seq_length=max_seq_length,
doc_stride=doc_stride,
max_query_length=max_query_length,
is_training=is_training
))
I'm wondering if there's a better solution here, though. Is this a bug? Or maybe an option could be added to allow single-threaded processing of the examples? I may be doing something wrong on my end too; not sure.
The problem arises when using:
[ ] the official example scripts: (give details below)
[x] my own modified scripts: (give details below)
The tasks I am working on is:
[ ] an official GLUE/SQUaD task: (give the name)
[x] my own task or dataset: (give details below)
To reproduce
Steps to reproduce the behavior:
For a simplified example, run the following python script from terminal:
#!/usr/bin/env python3
from transformers import pipeline
print('The script has been imported.')
nlp = pipeline('question-answering', framework='pt')
print(nlp(question='Who walked on the moon?', context='Niel Armstrong walked on the moon.'))
(environment) C:\Users\Dennis\Desktop\New folder>python runner.py
The script has been imported.
The script has been imported.
Traceback (most recent call last):
File "", line 1, in
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\spawn.py", line 105, in spawn_main
Traceback (most recent call last):
exitcode = _main(fd)
File "runner.py", line 6, in
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\spawn.py", line 114, in _main
print(nlp(question='Who walked on the moon?', context='Niel Armstrong walked on the moon.'))
prepare(preparation_data)
File "C:\Users\Dennis\Desktop\New folder\environment\lib\site-packages\transformers\pipelines.py", line 1264, in call
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\spawn.py", line 225, in prepare
_fixup_main_from_path(data['init_main_from_path'])
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\spawn.py", line 277, in _fixup_main_from_path
for example in examples
File "C:\Users\Dennis\Desktop\New folder\environment\lib\site-packages\transformers\pipelines.py", line 1264, in
run_name="mp_main__")
File "c:\users\dennis\appdata\local\programs\python\python36\lib\runpy.py", line 263, in run_path
for example in examples
pkg_name=pkg_name, script_name=fname)
File "C:\Users\Dennis\Desktop\New folder\environment\lib\site-packages\transformers\data\processors\squad.py", line 325, in squad_convert_examples_to_features
File "c:\users\dennis\appdata\local\programs\python\python36\lib\runpy.py", line 96, in _run_module_code
with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
mod_name, mod_spec, pkg_name, script_name)
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\context.py", line 119, in Pool
File "c:\users\dennis\appdata\local\programs\python\python36\lib\runpy.py", line 85, in _run_code
context=self.get_context())
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\pool.py", line 174, in init__
exec(code, run_globals)
File "C:\Users\Dennis\Desktop\New folder\runner.py", line 6, in
self._repopulate_pool()
print(nlp(question='Who walked on the moon?', context='Niel Armstrong walked on the moon.'))
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\pool.py", line 239, in _repopulate_pool
File "C:\Users\Dennis\Desktop\New folder\environment\lib\site-packages\transformers\pipelines.py", line 1264, in call
w.start()
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\process.py", line 105, in start
self._popen = self._Popen(self)
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\context.py", line 322, in _Popen
for example in examples
File "C:\Users\Dennis\Desktop\New folder\environment\lib\site-packages\transformers\pipelines.py", line 1264, in
return Popen(process_obj)
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\popen_spawn_win32.py", line 65, in init
reduction.dump(process_obj, to_child)
for example in examples
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\reduction.py", line 60, in dump
File "C:\Users\Dennis\Desktop\New folder\environment\lib\site-packages\transformers\data\processors\squad.py", line 325, in squad_convert_examples_to_features
ForkingPickler(file, protocol).dump(obj)
with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
BrokenPipeError: [Errno 32] Broken pipe
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\context.py", line 119, in Pool
context=self.get_context())
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\pool.py", line 174, in init
self._repopulate_pool()
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\pool.py", line 239, in _repopulate_pool
w.start()
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\process.py", line 105, in start
self._popen = self._Popen(self)
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\context.py", line 322, in _Popen
return Popen(process_obj)
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\popen_spawn_win32.py", line 33, in init
prep_data = spawn.get_preparation_data(process_obj._name)
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\spawn.py", line 143, in get_preparation_data
_check_not_importing_main()
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\spawn.py", line 136, in _check_not_importing_main
is not going to be frozen to produce an executable.''')
RuntimeError:
An attempt has been made to start a new process before the
current process has finished its bootstrapping phase.
This probably means that you are not using fork to start your
child processes and you have forgotten to use the proper idiom
in the main module:
if __name__ == '__main__':
freeze_support()
...
The "freeze_support()" line can be omitted if the program
is not going to be frozen to produce an executable.
## Expected behavior
The script doesn't crash / reload the script every pipeline call.
This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. Thank you for your contributions.
Environment info
transformers
version: 3.0.2
Who can help
@mfuntowicz
Information
When using the
question-answering
pipeline, my script is run multiple times due to the thread pool within squad_convert_examples_to_features
. In combination with other large models that take minutes to load, this causes them to reload every time the pipeline is called - or error outright if the code is not encapsulated. I've temporarily patched the transformers\data\processors\squad.py
file on my end, forcing it to run in the current thread rather than multi-threading. I'm wondering if there's a better solution here, though. Is this a bug? Or maybe an option could be added to allow single-threaded processing of the
examples
? I may be doing something wrong on my end too; not sure. The problem arises when using:
The tasks I am working on is:
To reproduce
Steps to reproduce the behavior:
print('The script has been imported.') nlp = pipeline('question-answering', framework='pt') print(nlp(question='Who walked on the moon?', context='Niel Armstrong walked on the moon.'))
(environment) C:\Users\Dennis\Desktop\New folder>python runner.py The script has been imported. The script has been imported. Traceback (most recent call last): File "", line 1, in
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\spawn.py", line 105, in spawn_main
Traceback (most recent call last):
exitcode = _main(fd)
File "runner.py", line 6, in
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\spawn.py", line 114, in _main
print(nlp(question='Who walked on the moon?', context='Niel Armstrong walked on the moon.'))
prepare(preparation_data)
File "C:\Users\Dennis\Desktop\New folder\environment\lib\site-packages\transformers\pipelines.py", line 1264, in call
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\spawn.py", line 225, in prepare
_fixup_main_from_path(data['init_main_from_path'])
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\spawn.py", line 277, in _fixup_main_from_path
for example in examples
File "C:\Users\Dennis\Desktop\New folder\environment\lib\site-packages\transformers\pipelines.py", line 1264, in
run_name="mp_main__")
File "c:\users\dennis\appdata\local\programs\python\python36\lib\runpy.py", line 263, in run_path
for example in examples
pkg_name=pkg_name, script_name=fname)
File "C:\Users\Dennis\Desktop\New folder\environment\lib\site-packages\transformers\data\processors\squad.py", line 325, in squad_convert_examples_to_features
File "c:\users\dennis\appdata\local\programs\python\python36\lib\runpy.py", line 96, in _run_module_code
with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
mod_name, mod_spec, pkg_name, script_name)
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\context.py", line 119, in Pool
File "c:\users\dennis\appdata\local\programs\python\python36\lib\runpy.py", line 85, in _run_code
context=self.get_context())
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\pool.py", line 174, in init__
exec(code, run_globals)
File "C:\Users\Dennis\Desktop\New folder\runner.py", line 6, in
self._repopulate_pool()
print(nlp(question='Who walked on the moon?', context='Niel Armstrong walked on the moon.'))
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\pool.py", line 239, in _repopulate_pool
File "C:\Users\Dennis\Desktop\New folder\environment\lib\site-packages\transformers\pipelines.py", line 1264, in call
w.start()
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\process.py", line 105, in start
self._popen = self._Popen(self)
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\context.py", line 322, in _Popen
for example in examples
File "C:\Users\Dennis\Desktop\New folder\environment\lib\site-packages\transformers\pipelines.py", line 1264, in
return Popen(process_obj)
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\popen_spawn_win32.py", line 65, in init
reduction.dump(process_obj, to_child)
for example in examples
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\reduction.py", line 60, in dump
File "C:\Users\Dennis\Desktop\New folder\environment\lib\site-packages\transformers\data\processors\squad.py", line 325, in squad_convert_examples_to_features
ForkingPickler(file, protocol).dump(obj)
with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
BrokenPipeError: [Errno 32] Broken pipe
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\context.py", line 119, in Pool
context=self.get_context())
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\pool.py", line 174, in init
self._repopulate_pool()
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\pool.py", line 239, in _repopulate_pool
w.start()
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\process.py", line 105, in start
self._popen = self._Popen(self)
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\context.py", line 322, in _Popen
return Popen(process_obj)
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\popen_spawn_win32.py", line 33, in init
prep_data = spawn.get_preparation_data(process_obj._name)
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\spawn.py", line 143, in get_preparation_data
_check_not_importing_main()
File "c:\users\dennis\appdata\local\programs\python\python36\lib\multiprocessing\spawn.py", line 136, in _check_not_importing_main
is not going to be frozen to produce an executable.''')
RuntimeError:
An attempt has been made to start a new process before the
current process has finished its bootstrapping phase.