Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/torch/serialization.py", line 423, in save
_save(obj, opened_zipfile, pickle_module, pickle_protocol)
File "/usr/local/lib/python3.8/dist-packages/torch/serialization.py", line 650, in _save
zip_file.write_record(name, storage.data_ptr(), num_bytes)
RuntimeError: [enforce fail at inline_container.cc:445] . PytorchStreamWriter failed writing file data/5: file write failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "finetune_moss.py", line 309, in <module>
train(args)
File "finetune_moss.py", line 274, in train
model.save_checkpoint(args.output_dir, global_step)
File "/usr/local/lib/python3.8/dist-packages/deepspeed/runtime/engine.py", line 3137, in save_checkpoint
self._save_zero_checkpoint(save_dir, tag)
File "/usr/local/lib/python3.8/dist-packages/deepspeed/runtime/engine.py", line 3425, in _save_zero_checkpoint
self.checkpoint_engine.save(zero_sd, zero_checkpoint_name)
File "/usr/local/lib/python3.8/dist-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py", line 18, in save
torch.save(state_dict, path)
File "/usr/local/lib/python3.8/dist-packages/torch/serialization.py", line 424, in save
return
File "/usr/local/lib/python3.8/dist-packages/torch/serialization.py", line 290, in __exit__
self.file_like.write_end_of_file()
RuntimeError: [enforce fail at inline_container.cc:325] . unexpected pos 20399218560 vs 20399218456
Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/torch/serialization.py", line 423, in save
_save(obj, opened_zipfile, pickle_module, pickle_protocol)
File "/usr/local/lib/python3.8/dist-packages/torch/serialization.py", line 650, in _save
zip_file.write_record(name, storage.data_ptr(), num_bytes)
RuntimeError: [enforce fail at inline_container.cc:445] . PytorchStreamWriter failed writing file data/5: file write failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "finetune_moss.py", line 309, in <module>
train(args)
File "finetune_moss.py", line 274, in train
model.save_checkpoint(args.output_dir, global_step)
File "/usr/local/lib/python3.8/dist-packages/deepspeed/runtime/engine.py", line 3137, in save_checkpoint
self._save_zero_checkpoint(save_dir, tag)
File "/usr/local/lib/python3.8/dist-packages/deepspeed/runtime/engine.py", line 3425, in _save_zero_checkpoint
self.checkpoint_engine.save(zero_sd, zero_checkpoint_name)
File "/usr/local/lib/python3.8/dist-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py", line 18, in save
torch.save(state_dict, path)
File "/usr/local/lib/python3.8/dist-packages/torch/serialization.py", line 424, in save
return
File "/usr/local/lib/python3.8/dist-packages/torch/serialization.py", line 290, in __exit__
self.file_like.write_end_of_file()
RuntimeError: [enforce fail at inline_container.cc:325] . unexpected pos 20399218560 vs 20399218456
terminate called after throwing an instance of 'c10::Error'
what(): [enforce fail at inline_container.cc:325] . unexpected pos 20399218560 vs 20399218456
frame #0: c10::ThrowEnforceNotMet(char const*, int, char const*, std::string const&, void const*) + 0x55 (0x7fdf2abfa2f5 in /usr/local/lib/python3.8/dist-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x3cbbe2c (0x7fdf599e1e2c in /usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_cpu.so)
frame #2: mz_zip_writer_add_mem_ex_v2 + 0x5c5 (0x7fdf599db775 in /usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_cpu.so)
frame #3: caffe2::serialize::PyTorchStreamWriter::writeRecord(std::string const&, void const*, unsigned long, bool) + 0xb9 (0x7fdf599e3419 in /usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_cpu.so)
frame #4: caffe2::serialize::PyTorchStreamWriter::writeEndOfFile() + 0x2c3 (0x7fdf599e38e3 in /usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_cpu.so)
frame #5: caffe2::serialize::PyTorchStreamWriter::~PyTorchStreamWriter() + 0x125 (0x7fdf599e3b55 in /usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_cpu.so)
frame #6: <unknown function> + 0x805d95 (0x7fdf80e58d95 in /usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so)
frame #7: <unknown function> + 0x3c69a3 (0x7fdf80a199a3 in /usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so)
frame #8: <unknown function> + 0x3c787f (0x7fdf80a1a87f in /usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so)
Traceback (most recent call last): File "/usr/local/lib/python3.8/dist-packages/torch/serialization.py", line 423, in save _save(obj, opened_zipfile, pickle_module, pickle_protocol) File "/usr/local/lib/python3.8/dist-packages/torch/serialization.py", line 650, in _save zip_file.write_record(name, storage.data_ptr(), num_bytes) RuntimeError: [enforce fail at inline_container.cc:445] . PytorchStreamWriter failed writing file data/5: file write failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last): File "finetune_moss.py", line 309, in <module>
train(args)
File "finetune_moss.py", line 274, in train model.save_checkpoint(args.output_dir, global_step) File "/usr/local/lib/python3.8/dist-packages/deepspeed/runtime/engine.py", line 3137, in save_checkpoint self._save_zero_checkpoint(save_dir, tag) File "/usr/local/lib/python3.8/dist-packages/deepspeed/runtime/engine.py", line 3425, in _save_zero_checkpoint self.checkpoint_engine.save(zero_sd, zero_checkpoint_name) File "/usr/local/lib/python3.8/dist-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py", line 18, in save torch.save(state_dict, path) File "/usr/local/lib/python3.8/dist-packages/torch/serialization.py", line 424, in save return File "/usr/local/lib/python3.8/dist-packages/torch/serialization.py", line 290, in exit self.file_like.write_end_of_file() RuntimeError: [enforce fail at inline_container.cc:325] . unexpected pos 20399218560 vs 20399218456 Traceback (most recent call last): File "/usr/local/lib/python3.8/dist-packages/torch/serialization.py", line 423, in save _save(obj, opened_zipfile, pickle_module, pickle_protocol) File "/usr/local/lib/python3.8/dist-packages/torch/serialization.py", line 650, in _save zip_file.write_record(name, storage.data_ptr(), num_bytes) RuntimeError: [enforce fail at inline_container.cc:445] . PytorchStreamWriter failed writing file data/5: file write failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last): File "finetune_moss.py", line 309, in
train(args) + 0x3cbbe2c (0x7fdf599e1e2c in /usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_cpu.so)
frame #2: mz_zip_writer_add_mem_ex_v2 + 0x5c5 (0x7fdf599db775 in /usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_cpu.so)
frame #3: caffe2::serialize::PyTorchStreamWriter::writeRecord(std::string const&, void const*, unsigned long, bool) + 0xb9 (0x7fdf599e3419 in /usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_cpu.so)
frame #4: caffe2::serialize::PyTorchStreamWriter::writeEndOfFile() + 0x2c3 (0x7fdf599e38e3 in /usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_cpu.so)
frame #5: caffe2::serialize::PyTorchStreamWriter::~PyTorchStreamWriter() + 0x125 (0x7fdf599e3b55 in /usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_cpu.so)
frame #6: <unknown function> + 0x805d95 (0x7fdf80e58d95 in /usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so)
frame #7: <unknown function> + 0x3c69a3 (0x7fdf80a199a3 in /usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so)
frame #8: <unknown function> + 0x3c787f (0x7fdf80a1a87f in /usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so)
File "finetune_moss.py", line 274, in train model.save_checkpoint(args.output_dir, global_step) File "/usr/local/lib/python3.8/dist-packages/deepspeed/runtime/engine.py", line 3137, in save_checkpoint self._save_zero_checkpoint(save_dir, tag) File "/usr/local/lib/python3.8/dist-packages/deepspeed/runtime/engine.py", line 3425, in _save_zero_checkpoint self.checkpoint_engine.save(zero_sd, zero_checkpoint_name) File "/usr/local/lib/python3.8/dist-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py", line 18, in save torch.save(state_dict, path) File "/usr/local/lib/python3.8/dist-packages/torch/serialization.py", line 424, in save return File "/usr/local/lib/python3.8/dist-packages/torch/serialization.py", line 290, in exit self.file_like.write_end_of_file() RuntimeError: [enforce fail at inline_container.cc:325] . unexpected pos 20399218560 vs 20399218456 terminate called after throwing an instance of 'c10::Error' what(): [enforce fail at inline_container.cc:325] . unexpected pos 20399218560 vs 20399218456 frame #0: c10::ThrowEnforceNotMet(char const, int, char const, std::string const&, void const) + 0x55 (0x7fdf2abfa2f5 in /usr/local/lib/python3.8/dist-packages/torch/lib/libc10.so) frame #1: