I am sharing this error in the hope that you find it useful. Below is the traceback. Let me know if there's anything I can do to make it more verbose, or if you want any particular information about my environment.
INFO:root:Feature extraction w/ Jukebox could take several minutes.
Traceback (most recent call last):
File "/usr/lib/python3.6/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/usr/lib/python3.6/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/sheetsage/sheetsage/infer.py", line 851, in <module>
tqdm=tqdm,
File "/sheetsage/sheetsage/infer.py", line 681, in sheetsage
audio_path_or_bytes, input_feats, tertiaries_times, chunks_tertiaries, tqdm
File "/sheetsage/sheetsage/infer.py", line 352, in _extract_features
extractor = _init_extractor(input_feats)
File "/sheetsage/sheetsage/infer.py", line 87, in _init_extractor
extractor = Jukebox()
File "/sheetsage/sheetsage/representations/__init__.py", line 10, in __init__
super().__init__(num_layers=53, fp16=False, log=False)
File "/sheetsage/sheetsage/representations/jukebox.py", line 89, in __init__
) = init_jukebox_singleton(model="5b", num_layers=num_layers, log=log)
File "/sheetsage/sheetsage/representations/jukebox.py", line 53, in init_jukebox_singleton
device,
File "/usr/local/lib/python3.6/dist-packages/jukebox/make_models.py", line 96, in make_vqvae
restore_model(hps, vqvae, hps.restore_vqvae)
File "/usr/local/lib/python3.6/dist-packages/jukebox/make_models.py", line 56, in restore_model
checkpoint = load_checkpoint(checkpoint_path)
File "/usr/local/lib/python3.6/dist-packages/jukebox/make_models.py", line 37, in load_checkpoint
dist.barrier()
File "/usr/local/lib/python3.6/dist-packages/jukebox/utils/dist_adapter.py", line 35, in barrier
return _barrier()
File "/usr/local/lib/python3.6/dist-packages/jukebox/utils/dist_adapter.py", line 68, in _barrier
return dist.barrier()
File "/usr/local/lib/python3.6/dist-packages/torch/distributed/distributed_c10d.py", line 1489, in barrier
work = _default_pg.barrier()
RuntimeError: NCCL error in: /pytorch/torch/lib/c10d/ProcessGroupNCCL.cpp:410, unhandled system error, NCCL version 2.4.8
I am sharing this error in the hope that you find it useful. Below is the traceback. Let me know if there's anything I can do to make it more verbose, or if you want any particular information about my environment.
~ $ ./sheetsage.sh -j nekadma_fanav.mp3 > error.txt
INFO:root:Feature extraction w/ Jukebox could take several minutes. Traceback (most recent call last): File "/usr/lib/python3.6/runpy.py", line 193, in _run_module_as_main "__main__", mod_spec) File "/usr/lib/python3.6/runpy.py", line 85, in _run_code exec(code, run_globals) File "/sheetsage/sheetsage/infer.py", line 851, in <module>
tqdm=tqdm,
File "/sheetsage/sheetsage/infer.py", line 681, in sheetsage
audio_path_or_bytes, input_feats, tertiaries_times, chunks_tertiaries, tqdm
File "/sheetsage/sheetsage/infer.py", line 352, in _extract_features
extractor = _init_extractor(input_feats)
File "/sheetsage/sheetsage/infer.py", line 87, in _init_extractor
extractor = Jukebox()
File "/sheetsage/sheetsage/representations/__init__.py", line 10, in __init__
super().__init__(num_layers=53, fp16=False, log=False)
File "/sheetsage/sheetsage/representations/jukebox.py", line 89, in __init__
) = init_jukebox_singleton(model="5b", num_layers=num_layers, log=log)
File "/sheetsage/sheetsage/representations/jukebox.py", line 53, in init_jukebox_singleton
device,
File "/usr/local/lib/python3.6/dist-packages/jukebox/make_models.py", line 96, in make_vqvae
restore_model(hps, vqvae, hps.restore_vqvae)
File "/usr/local/lib/python3.6/dist-packages/jukebox/make_models.py", line 56, in restore_model
checkpoint = load_checkpoint(checkpoint_path)
File "/usr/local/lib/python3.6/dist-packages/jukebox/make_models.py", line 37, in load_checkpoint
dist.barrier()
File "/usr/local/lib/python3.6/dist-packages/jukebox/utils/dist_adapter.py", line 35, in barrier
return _barrier()
File "/usr/local/lib/python3.6/dist-packages/jukebox/utils/dist_adapter.py", line 68, in _barrier
return dist.barrier()
File "/usr/local/lib/python3.6/dist-packages/torch/distributed/distributed_c10d.py", line 1489, in barrier
work = _default_pg.barrier()
RuntimeError: NCCL error in: /pytorch/torch/lib/c10d/ProcessGroupNCCL.cpp:410, unhandled system error, NCCL version 2.4.8