When I try to reproduce the results, I run into a bug. I have posted the relevant information below. If anyone knows how to deal with it, please let me know. Any help would be greatly appreciated.
+++
-Environment:
Ubuntu 22.04.3
Python 3.10.12
CUDA 12.1
PyTorch 2.1.1
GPU: 1x V100
-Command:
python fed_seed_run.py /root/workspace fedavg rte fine-tuning 1000 0,0,0
-Log:
Traceback (most recent call last):
File "/root/workspace/code/FedETuning/main.py", line 20, in
main()
File "/root/workspace/code/FedETuning/main.py", line 16, in main
trainer.train()
File "/root/workspace/code/FedETuning/trainers/FedBaseTrainer.py", line 96, in train
self.client_manager.run()
File "/root/workspace/code/FedETuning/fedlab/core/network_manager.py", line 38, in run
self.main_loop()
File "/root/workspace/code/FedETuning/trainers/BaseClient/base_client.py", line 344, in main_loop
sender_rank, message_code, payload = self._network.recv(src=0)
File "/root/workspace/code/FedETuning/fedlab/core/network.py", line 102, in recv
sender_rank, message_code, content = PackageProcessor.recv_package(
File "/root/workspace/code/FedETuning/fedlab/core/communicator/processor.py", line 118, in recv_package
sender_rank, _, slices_size, message_code, data_type = recv_header(
File "/root/workspace/code/FedETuning/fedlab/core/communicator/processor.py", line 96, in recv_header
dist.recv(buffer, src=src)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 47, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 1640, in recv
pg.recv([tensor], src, tag).wait()
RuntimeError: [../third_party/gloo/gloo/transport/tcp/pair.cc:534] Connection closed by peer [172.17.0.5]:38114
Traceback (most recent call last):
File "/root/workspace/code/FedETuning/main.py", line 20, in
main()
File "/root/workspace/code/FedETuning/main.py", line 16, in main
trainer.train()
File "/root/workspace/code/FedETuning/trainers/FedBaseTrainer.py", line 96, in train
self.client_manager.run()
File "/root/workspace/code/FedETuning/fedlab/core/network_manager.py", line 38, in run
self.main_loop()
File "/root/workspace/code/FedETuning/trainers/BaseClient/base_client.py", line 368, in main_loop
self.synchronize()
File "/root/workspace/code/FedETuning/trainers/BaseClient/base_client.py", line 376, in synchronize
self._network.send(
File "/root/workspace/code/FedETuning/fedlab/core/network.py", line 90, in send
PackageProcessor.send_package(pack, dst=dst)
File "/root/workspace/code/FedETuning/fedlab/core/communicator/processor.py", line 74, in send_package
send_content(content=package.content, dst=dst)
File "/root/workspace/code/FedETuning/fedlab/core/communicator/processor.py", line 61, in send_content
dist.send(content, dst=dst)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 47, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 1597, in send
default_pg.send([tensor], dst, tag).wait()
RuntimeError: [../third_party/gloo/gloo/transport/tcp/pair.cc:534] Connection closed by peer [172.17.0.5]:41358
Exception ignored in: <function Pool.__del__ at 0x7f0097291990>
Traceback (most recent call last):
File "/usr/lib/python3.10/multiprocessing/pool.py", line 271, in del
File "/usr/lib/python3.10/multiprocessing/queues.py", line 371, in put
AttributeError: 'NoneType' object has no attribute 'dumps'
+++
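Judging by the traceback, the error is raised inside the gloo TCP transport while the client is blocked in dist.recv / dist.send, which usually means the peer process (the server at rank 0) has already exited. For reference, the failing calls are plain torch.distributed point-to-point send/recv over the gloo backend, roughly like the minimal sketch below; the address, port, world size, and ranks are hypothetical placeholders for a connectivity check, not the actual FedETuning setup.
+++
import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def run(rank: int, world_size: int):
    # Placeholder rendezvous settings; FedETuning configures these itself.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group(backend="gloo", rank=rank, world_size=world_size)

    if rank == 0:
        # "Server" side: send a small payload to rank 1.
        dist.send(torch.arange(4.0), dst=1)
    else:
        # "Client" side: block until rank 0 sends. If rank 0 exits first,
        # gloo raises "Connection closed by peer", as in the log above.
        buffer = torch.zeros(4)
        dist.recv(buffer, src=0)

    dist.destroy_process_group()


if __name__ == "__main__":
    mp.spawn(run, args=(2,), nprocs=2)
+++
If a sketch like this runs in the same container, the gloo setup itself is probably fine, and the question becomes why the server process (rank 0) goes down before the clients finish; the server-side output would likely show the real error.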