Hello. I'm trying to train on an instance with multiple V100 GPUs, but I'm getting the following error:
Traceback (most recent call last):
File "main.py", line 93, in
main()
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/click/core.py", line 722, in call
return self.main(*args, **kwargs)
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/click/core.py", line 697, in main
rv = self.invoke(ctx)
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/click/core.py", line 1066, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/click/core.py", line 895, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/click/core.py", line 535, in invoke
return callback(*args, **kwargs)
File "main.py", line 31, in train
return callback(*args, **kwargs)
File "main.py", line 31, in train
pipeline_manager.train(pipeline_name, dev_mode)
File "/ebs/osmc/src/pipeline_manager.py", line 32, in train
train(pipeline_name, dev_mode, self.logger, self.params, self.seed)
File "/ebs/osmc/src/pipeline_manager.py", line 116, in train
pipeline.fit_transform(data)
File "/ebs/osmc/src/steps/base.py", line 106, in fit_transform
step_inputs[input_step.name] = input_step.fit_transform(data)
File "/ebs/osmc/src/steps/base.py", line 106, in fit_transform
step_inputs[input_step.name] = input_step.fit_transform(data)
File "/ebs/osmc/src/steps/base.py", line 106, in fit_transform
step_inputs[input_step.name] = input_step.fit_transform(data)
[Previous line repeated 3 more times]
File "/ebs/osmc/src/steps/base.py", line 112, in fit_transform
return self._cached_fit_transform(step_inputs)
File "/ebs/osmc/src/steps/base.py", line 123, in _cached_fit_transform
step_output_data = self.transformer.fit_transform(step_inputs)
File "/ebs/osmc/src/steps/base.py", line 262, in fit_transform
self.fit(*args, **kwargs)
File "/ebs/osmc/src/models.py", line 84, in fit
metrics = self._fit_loop(data)
File "/ebs/osmc/src/steps/pytorch/models.py", line 92, in _fit_loop
outputs_batch = self.model(X)
File "/home/ubuntu/.local/lib/python3.6/site-packages/torch/nn/modules/module.py", line 477, in __call__
result = self.forward(*input, **kwargs)
File "/home/ubuntu/.local/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 123, in forward
outputs = self.parallel_apply(replicas, inputs, kwargs)
File "/home/ubuntu/.local/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 133, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/home/ubuntu/.local/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 77, in parallel_apply
raise output
File "/home/ubuntu/.local/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 53, in _worker
output = module(*input, **kwargs)
File "/home/ubuntu/.local/lib/python3.6/site-packages/torch/nn/modules/module.py", line 477, in __call__
result = self.forward(*input, **kwargs)
File "/home/ubuntu/.local/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 122, in forward
replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
File "/home/ubuntu/.local/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 127, in replicate
return replicate(module, device_ids)
File "/home/ubuntu/.local/lib/python3.6/site-packages/torch/nn/parallel/replicate.py", line 12, in replicate
param_copies = Broadcast.apply(devices, *params)
File "/home/ubuntu/.local/lib/python3.6/site-packages/torch/nn/parallel/_functions.py", line 19, in forward
outputs = comm.broadcast_coalesced(inputs, ctx.target_gpus)
File "/home/ubuntu/.local/lib/python3.6/site-packages/torch/cuda/comm.py", line 40, in broadcast_coalesced
return torch._C._broadcast_coalesced(tensors, devices, buffer_size)
RuntimeError: all tensors must be on devices[0]
I am able to train on a single V100, or on multiple Tesla K80s, by prepending the train command with CUDA_VISIBLE_DEVICES=0,1,2,3, but I cannot get multiple V100s to work because of this issue. I have googled the error and found only a few examples of it, but those were different enough from your code that I wasn't able to apply any of their advice. What can I do to get this to work?
Hello. I'm trying to train on an instance with multiple V100 GPUs, but I'm getting the following error:
Traceback (most recent call last): File "main.py", line 93, in
main()
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/click/core.py", line 722, in call
return self.main(*args, **kwargs)
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/click/core.py", line 697, in main
rv = self.invoke(ctx)
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/click/core.py", line 1066, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/click/core.py", line 895, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/click/core.py", line 535, in invoke
return callback(*args, **kwargs)
File "main.py", line 31, in train
return callback(*args, **kwargs)
File "main.py", line 31, in train
pipeline_manager.train(pipeline_name, dev_mode)
File "/ebs/osmc/src/pipeline_manager.py", line 32, in train
train(pipeline_name, dev_mode, self.logger, self.params, self.seed)
File "/ebs/osmc/src/pipeline_manager.py", line 116, in train
pipeline.fit_transform(data)
File "/ebs/osmc/src/steps/base.py", line 106, in fit_transform
step_inputs[input_step.name] = input_step.fit_transform(data)
File "/ebs/osmc/src/steps/base.py", line 106, in fit_transform
step_inputs[input_step.name] = input_step.fit_transform(data)
File "/ebs/osmc/src/steps/base.py", line 106, in fit_transform
step_inputs[input_step.name] = input_step.fit_transform(data)
[Previous line repeated 3 more times]
File "/ebs/osmc/src/steps/base.py", line 112, in fit_transform
return self._cached_fit_transform(step_inputs)
File "/ebs/osmc/src/steps/base.py", line 123, in _cached_fit_transform
step_output_data = self.transformer.fit_transform(step_inputs)
File "/ebs/osmc/src/steps/base.py", line 262, in fit_transform
self.fit(*args, **kwargs)
File "/ebs/osmc/src/models.py", line 84, in fit
metrics = self._fit_loop(data)
File "/ebs/osmc/src/steps/pytorch/models.py", line 92, in _fit_loop
outputs_batch = self.model(X)
File "/home/ubuntu/.local/lib/python3.6/site-packages/torch/nn/modules/module.py", line 477, in __call__
result = self.forward(*input, **kwargs)
File "/home/ubuntu/.local/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 123, in forward
outputs = self.parallel_apply(replicas, inputs, kwargs)
File "/home/ubuntu/.local/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 133, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/home/ubuntu/.local/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 77, in parallel_apply
raise output
File "/home/ubuntu/.local/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 53, in _worker
output = module(*input, **kwargs)
File "/home/ubuntu/.local/lib/python3.6/site-packages/torch/nn/modules/module.py", line 477, in __call__
result = self.forward(*input, **kwargs)
File "/home/ubuntu/.local/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 122, in forward
replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
File "/home/ubuntu/.local/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 127, in replicate
return replicate(module, device_ids)
File "/home/ubuntu/.local/lib/python3.6/site-packages/torch/nn/parallel/replicate.py", line 12, in replicate
param_copies = Broadcast.apply(devices, *params)
File "/home/ubuntu/.local/lib/python3.6/site-packages/torch/nn/parallel/_functions.py", line 19, in forward
outputs = comm.broadcast_coalesced(inputs, ctx.target_gpus)
File "/home/ubuntu/.local/lib/python3.6/site-packages/torch/cuda/comm.py", line 40, in broadcast_coalesced
return torch._C._broadcast_coalesced(tensors, devices, buffer_size)
RuntimeError: all tensors must be on devices[0]
I am able to train on a single V100, or on multiple Tesla K80s, by prepending the train command with CUDA_VISIBLE_DEVICES=0,1,2,3, but I cannot get multiple V100s to work because of this issue. I have googled the error and found only a few examples of it, but those were different enough from your code that I wasn't able to apply any of their advice. What can I do to get this to work?