I am trying to extend Graphormer by adding a GIN model (the OGB implementation) for the HIV task.
The model currently works fine on the CPU. However, when I train on a GPU, a CUDA error is thrown after several epochs:
Traceback (most recent call last):
File "/home/yiming/anaconda3/envs/fairseq/bin/fairseq-train", line 8, in <module>
sys.exit(cli_main())
File "/home/yiming/anaconda3/envs/fairseq/lib/python3.9/site-packages/fairseq_cli/train.py", line 528, in cli_main
distributed_utils.call_main(cfg, main)
File "/home/yiming/anaconda3/envs/fairseq/lib/python3.9/site-packages/fairseq/distributed/utils.py", line 369, in call_main
main(cfg, **kwargs)
File "/home/yiming/anaconda3/envs/fairseq/lib/python3.9/site-packages/fairseq_cli/train.py", line 188, in main
valid_losses, should_stop = train(cfg, trainer, task, epoch_itr)
File "/home/yiming/anaconda3/envs/fairseq/lib/python3.9/contextlib.py", line 79, in inner
return func(*args, **kwds)
File "/home/yiming/anaconda3/envs/fairseq/lib/python3.9/site-packages/fairseq_cli/train.py", line 303, in train
log_output = trainer.train_step(samples)
File "/home/yiming/anaconda3/envs/fairseq/lib/python3.9/contextlib.py", line 79, in inner
return func(*args, **kwds)
File "/home/yiming/anaconda3/envs/fairseq/lib/python3.9/site-packages/fairseq/trainer.py", line 793, in train_step
raise e
File "/home/yiming/anaconda3/envs/fairseq/lib/python3.9/site-packages/fairseq/trainer.py", line 760, in train_step
loss, sample_size_i, logging_output = self.task.train_step(
File "/home/yiming/Graphormer-v2.0/graphormer/tasks/graph_prediction.py", line 337, in train_step
loss, sample_size, logging_output = criterion(
File "/home/yiming/anaconda3/envs/fairseq/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/yiming/Graphormer-v2.0/graphormer/criterions/binary_logloss.py", line 95, in forward
logits = model(**sample["net_input"], perturb=perturb)[:, 0, :]
File "/home/yiming/anaconda3/envs/fairseq/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/yiming/Graphormer-v2.0/graphormer/models/gnn.py", line 64, in forward
return self.encoder(batched_data, **kwargs)
File "/home/yiming/anaconda3/envs/fairseq/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/yiming/Graphormer-v2.0/graphormer/models/gnn.py", line 111, in forward
h_node = self.gnn_node(batched_data, perturb, **unused)
File "/home/yiming/anaconda3/envs/fairseq/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/yiming/Graphormer-v2.0/graphormer/modules/gnn_module.py", line 181, in forward
h = self.convs[layer](h_list[layer], edge_index, edge_attr)
File "/home/yiming/anaconda3/envs/fairseq/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/yiming/Graphormer-v2.0/graphormer/modules/conv_modules.py", line 32, in forward
out = self.mlp((1 + self.eps) * x + self.propagate(edge_index, x=x, edge_attr=edge_embedding))
File "/home/yiming/anaconda3/envs/fairseq/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/yiming/anaconda3/envs/fairseq/lib/python3.9/site-packages/torch/nn/modules/container.py", line 139, in forward
input = module(input)
File "/home/yiming/anaconda3/envs/fairseq/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/yiming/anaconda3/envs/fairseq/lib/python3.9/site-packages/torch/nn/modules/linear.py", line 96, in forward
return F.linear(input, self.weight, self.bias)
File "/home/yiming/anaconda3/envs/fairseq/lib/python3.9/site-packages/torch/nn/functional.py", line 1847, in linear
return torch._C._nn.linear(input, weight, bias)
RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasSgemm( handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)`
If I set the following environment variables:
CUDA_LAUNCH_BLOCKING=1
CUDA_VISIBLE_DEVICES=0
the error becomes:
Traceback (most recent call last):
File "/home/yiming/anaconda3/envs/fairseq/bin/fairseq-train", line 8, in <module>
sys.exit(cli_main())
File "/home/yiming/anaconda3/envs/fairseq/lib/python3.9/site-packages/fairseq_cli/train.py", line 528, in cli_main
distributed_utils.call_main(cfg, main)
File "/home/yiming/anaconda3/envs/fairseq/lib/python3.9/site-packages/fairseq/distributed/utils.py", line 369, in call_main
main(cfg, **kwargs)
File "/home/yiming/anaconda3/envs/fairseq/lib/python3.9/site-packages/fairseq_cli/train.py", line 188, in main
valid_losses, should_stop = train(cfg, trainer, task, epoch_itr)
File "/home/yiming/anaconda3/envs/fairseq/lib/python3.9/contextlib.py", line 79, in inner
return func(*args, **kwds)
File "/home/yiming/anaconda3/envs/fairseq/lib/python3.9/site-packages/fairseq_cli/train.py", line 303, in train
log_output = trainer.train_step(samples)
File "/home/yiming/anaconda3/envs/fairseq/lib/python3.9/contextlib.py", line 79, in inner
return func(*args, **kwds)
File "/home/yiming/anaconda3/envs/fairseq/lib/python3.9/site-packages/fairseq/trainer.py", line 793, in train_step
raise e
File "/home/yiming/anaconda3/envs/fairseq/lib/python3.9/site-packages/fairseq/trainer.py", line 760, in train_step
loss, sample_size_i, logging_output = self.task.train_step(
File "/home/yiming/Graphormer-v2.0/graphormer/tasks/graph_prediction.py", line 368, in train_step
optimizer.backward(loss)
File "/home/yiming/anaconda3/envs/fairseq/lib/python3.9/site-packages/fairseq/optim/fairseq_optimizer.py", line 95, in backward
loss.backward()
File "/home/yiming/anaconda3/envs/fairseq/lib/python3.9/site-packages/torch/_tensor.py", line 255, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/home/yiming/anaconda3/envs/fairseq/lib/python3.9/site-packages/torch/autograd/__init__.py", line 147, in backward
Variable._execution_engine.run_backward(
RuntimeError: CUDA error: an illegal memory access was encountered
Do you have any idea how to solve this?
Hi,
I am trying to extend Graphormer by adding a GIN model (the OGB implementation) for the HIV task.
The model currently works fine on the CPU. However, when I train on a GPU, a CUDA error is thrown after several epochs.
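If I set `CUDA_LAUNCH_BLOCKING=1` and `CUDA_VISIBLE_DEVICES=0`,
the error becomes `RuntimeError: CUDA error: an illegal memory access was encountered`.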
Do you have any idea how to solve this?
Thank you!