Traceback (most recent call last):
  File "main.py", line 67, in <module>
    p.start(int(os.environ["LOCAL_RANK"]))
  File "/root/autodl-tmp/ActCLR/processor/processor.py", line 238, in start
    self.train(epoch + 1)
  File "/root/autodl-tmp/ActCLR/processor/pretrain_actclr.py", line 83, in train
    loss = self.model(data1, data2, data3, data4, epoch=epoch)
  File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 963, in forward
    output = self.module(*inputs[0], **kwargs[0])
  File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/root/autodl-tmp/ActCLR/net/actclr.py", line 342, in forward
    k = k * 0.5 + ks * 0.5
RuntimeError: The size of tensor a (25) must match the size of tensor b (128) at non-singleton dimension 3
0%| | 0/73 [00:02<?, ?it/s]
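
For reference, the shape mismatch can be reproduced outside the model with a minimal sketch like the one below. The concrete shapes are assumptions chosen only to trigger the same message (25 vs. 128 at dimension 3); they are not taken from actclr.py.

# Minimal reproduction of the RuntimeError above (assumed shapes, not the real ones).
import torch

k  = torch.randn(16, 3, 50, 25)    # assumed (N, C, T, V)-style layout with V = 25 joints
ks = torch.randn(16, 3, 50, 128)   # assumed tensor whose last dimension is 128

try:
    blended = k * 0.5 + ks * 0.5   # same expression as actclr.py line 342
except RuntimeError as err:
    # -> The size of tensor a (25) must match the size of tensor b (128)
    #    at non-singleton dimension 3
    print(err)

# Printing both shapes right before the blend usually shows which branch
# produces the unexpected 128:
print("k:", tuple(k.shape), "ks:", tuple(ks.shape))
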
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 47381) of binary: /root/miniconda3/bin/python
Traceback (most recent call last):
  File "/root/miniconda3/bin/torchrun", line 8, in <module>
    sys.exit(main())
  File "/root/miniconda3/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper
    return f(*args, **kwargs)
  File "/root/miniconda3/lib/python3.8/site-packages/torch/distributed/run.py", line 724, in main
    run(args)
  File "/root/miniconda3/lib/python3.8/site-packages/torch/distributed/run.py", line 715, in run
    elastic_launch(
  File "/root/miniconda3/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/root/miniconda3/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
main.py FAILED
Failures:
[1]:
  time      : 2023-09-15_20:09:17
  host      : autodl-container-ecb611b052-9dd65de2
  rank      : 1 (local_rank: 1)
  exitcode  : 1 (pid: 47382)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
  time      : 2023-09-15_20:09:17
  host      : autodl-container-ecb611b052-9dd65de2
  rank      : 2 (local_rank: 2)
  exitcode  : 1 (pid: 47383)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[3]:
  time      : 2023-09-15_20:09:17
  host      : autodl-container-ecb611b052-9dd65de2
  rank      : 3 (local_rank: 3)
  exitcode  : 1 (pid: 47384)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
Root Cause (first observed failure):
[0]:
  time      : 2023-09-15_20:09:17
  host      : autodl-container-ecb611b052-9dd65de2
  rank      : 0 (local_rank: 0)
  exitcode  : 1 (pid: 47381)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
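
Every failed rank reports error_file: <N/A>, and the summary points to https://pytorch.org/docs/stable/elastic/errors.html. The mechanism documented there is the record decorator from torch.distributed.elastic.multiprocessing.errors; the sketch below shows the general pattern, assuming main.py's entry point can be wrapped this way (the function body is a placeholder, not the real main.py).

# Sketch: let torchrun record each rank's traceback in the failure summary above.
from torch.distributed.elastic.multiprocessing.errors import record

@record                      # on an uncaught exception, writes this rank's traceback to its error file
def main() -> None:
    # placeholder for the real entry point, e.g.:
    # p = Processor(...)
    # p.start(int(os.environ["LOCAL_RANK"]))
    pass

if __name__ == "__main__":
    main()
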