Hello,
Thank you for your amazing work. I ran into an issue while trying to train the model on 4 GPUs; the traceback is shown below. However, I was able to train the model successfully on a single GPU.
Could you please help me out with this?
Traceback (most recent call last):
File "/home/ec2-user/SageMaker/segment-anything/TRACER/main.py", line 55, in <module>
main(args)
File "/home/ec2-user/SageMaker/segment-anything/TRACER/main.py", line 35, in main
Trainer(args, save_path)
File "/home/ec2-user/SageMaker/segment-anything/TRACER/trainer.py", line 56, in __init__
train_loss, train_mae = self.training(args)
File "/home/ec2-user/SageMaker/segment-anything/TRACER/trainer.py", line 105, in training
outputs, edge_mask, ds_map = self.model(images)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/parallel/data_parallel.py", line 185, in forward
outputs = self.parallel_apply(replicas, inputs, module_kwargs)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/parallel/data_parallel.py", line 200, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/parallel/parallel_apply.py", line 110, in parallel_apply
output.reraise()
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/_utils.py", line 694, in reraise
raise exception
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/parallel/parallel_apply.py", line 85, in _worker
output = module(*input, **kwargs)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/ec2-user/SageMaker/segment-anything/TRACER/model/TRACER.py", line 38, in forward
features, edge = self.model.get_blocks(x, H, W)
File "/home/ec2-user/SageMaker/segment-anything/TRACER/model/EfficientNet.py", line 245, in get_blocks
x = block(x, drop_connect_rate=drop_connect_rate)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/ec2-user/SageMaker/segment-anything/TRACER/model/EfficientNet.py", line 122, in forward
x_squeezed = self._se_reduce(x_squeezed)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/ec2-user/SageMaker/segment-anything/TRACER/util/effi_utils.py", line 301, in forward
x = F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
RuntimeError: GET was unable to find an engine to execute this computation
Hello, thank you for your amazing work. I ran into an issue while trying to train the model on 4 GPUs; the traceback is shown below. However, I was able to train the model successfully on a single GPU. Could you please help me out with this?
Traceback (most recent call last):
File "/home/ec2-user/SageMaker/segment-anything/TRACER/main.py", line 55, in <module>
main(args)
File "/home/ec2-user/SageMaker/segment-anything/TRACER/main.py", line 35, in main
Trainer(args, save_path)
File "/home/ec2-user/SageMaker/segment-anything/TRACER/trainer.py", line 56, in __init__
train_loss, train_mae = self.training(args)
File "/home/ec2-user/SageMaker/segment-anything/TRACER/trainer.py", line 105, in training
outputs, edge_mask, ds_map = self.model(images)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/parallel/data_parallel.py", line 185, in forward
outputs = self.parallel_apply(replicas, inputs, module_kwargs)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/parallel/data_parallel.py", line 200, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/parallel/parallel_apply.py", line 110, in parallel_apply
output.reraise()
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/_utils.py", line 694, in reraise
raise exception
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/parallel/parallel_apply.py", line 85, in _worker
output = module(*input, **kwargs)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/ec2-user/SageMaker/segment-anything/TRACER/model/TRACER.py", line 38, in forward
features, edge = self.model.get_blocks(x, H, W)
File "/home/ec2-user/SageMaker/segment-anything/TRACER/model/EfficientNet.py", line 245, in get_blocks
x = block(x, drop_connect_rate=drop_connect_rate)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/ec2-user/SageMaker/segment-anything/TRACER/model/EfficientNet.py", line 122, in forward
x_squeezed = self._se_reduce(x_squeezed)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/ec2-user/anaconda3/envs/new_fast_env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/ec2-user/SageMaker/segment-anything/TRACER/util/effi_utils.py", line 301, in forward
x = F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
RuntimeError: GET was unable to find an engine to execute this computation