leoribeiro opened this issue 3 years ago
Sadly, the error message is not really meaningful. Can you try to run your program with CUDA_LAUNCH_BLOCKING=1
to see if the error message changes? Alternatively, please feel free to send me a minimal example to reproduce so I can look into this.
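For reference, the variable has to be in place before the CUDA context is created. A minimal sketch (assuming the script is launched from a shell) is either to prefix the command, e.g. CUDA_LAUNCH_BLOCKING=1 python rgcn.py --dataset AIFB, or to set it at the very top of the script:

import os
# Sketch: set this before any CUDA call so kernel launches run synchronously
# and errors surface at the actual call site instead of a later line.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'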
Hi @rusty1s, thanks for your reply! I used CUDA_LAUNCH_BLOCKING=1
and the error is the same.
Here is a simple example (rgcn.py) with SparseTensor which raises the same error:
import argparse
import os.path as osp
import torch
import torch.nn.functional as F
from torch_geometric.datasets import Entities
from torch_geometric.utils import k_hop_subgraph
from torch_geometric.nn import RGCNConv, FastRGCNConv
import torch_geometric.transforms as T
parser = argparse.ArgumentParser()
parser.add_argument('--dataset', type=str,
                    choices=['AIFB', 'MUTAG', 'BGS', 'AM'])
args = parser.parse_args()
path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'Entities')
dataset = Entities(path, args.dataset)
data = dataset[0]
# BGS and AM graphs are too big to process them in a full-batch fashion.
# Since our model does only make use of a rather small receptive field, we
# filter the graph to only contain the nodes that are at most 2-hop neighbors
# away from any training/test node.
node_idx = torch.cat([data.train_idx, data.test_idx], dim=0)
node_idx, edge_index, mapping, edge_mask = k_hop_subgraph(
    node_idx, 2, data.edge_index, relabel_nodes=True)
data.num_nodes = node_idx.size(0)
data.edge_index = edge_index
data.edge_type = data.edge_type[edge_mask]
data.train_idx = mapping[:data.train_idx.size(0)]
data.test_idx = mapping[data.train_idx.size(0):]
data = T.ToSparseTensor()(data)
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = RGCNConv(data.num_nodes, 16, dataset.num_relations,
                              num_bases=30)
        self.conv2 = RGCNConv(16, dataset.num_classes, dataset.num_relations,
                              num_bases=30)

    def forward(self, adj_t):
        x = F.relu(self.conv1(None, adj_t))
        x = self.conv2(x, adj_t)
        return F.log_softmax(x, dim=1)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu') if args.dataset == 'AM' else device
model, data = Net().to(device), data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0005)
def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.adj_t)
    loss = F.nll_loss(out[data.train_idx], data.train_y)
    loss.backward()
    optimizer.step()
    return loss.item()
@torch.no_grad()
def test():
    model.eval()
    pred = model(data.adj_t).argmax(dim=-1)
    train_acc = pred[data.train_idx].eq(data.train_y).to(torch.float).mean()
    test_acc = pred[data.test_idx].eq(data.test_y).to(torch.float).mean()
    return train_acc.item(), test_acc.item()
for epoch in range(1, 51):
    loss = train()
    train_acc, test_acc = test()
    print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}, Train: {train_acc:.4f} '
          f'Test: {test_acc:.4f}')
I executed the command python rgcn.py --dataset AIFB and got the following traceback:
Traceback (most recent call last):
File "rgcn.py", line 82, in <module>
loss = train()
File "rgcn.py", line 65, in train
out = model(data.adj_t)
File "lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "rgcn.py", line 51, in forward
x = F.relu(self.conv1(None, adj_t))
File "lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "lib/python3.8/site-packages/torch_geometric/nn/conv/rgcn_conv.py", line 211, in forward
out += self.propagate(tmp, x=weight[i, x_l], size=size)
File "lib/python3.8/site-packages/torch_geometric/nn/conv/message_passing.py", line 226, in propagate
out = self.message_and_aggregate(edge_index, **msg_aggr_kwargs)
File "lib/python3.8/site-packages/torch_geometric/nn/conv/rgcn_conv.py", line 231, in message_and_aggregate
adj_t = adj_t.set_value(None, layout=None)
File "lib/python3.8/site-packages/torch_sparse/tensor.py", line 171, in set_value
return self.from_storage(self.storage.set_value(value, layout))
File "lib/python3.8/site-packages/torch_sparse/storage.py", line 229, in set_value
return SparseStorage(row=self._row, rowptr=self._rowptr, col=self._col,
File "lib/python3.8/site-packages/torch_sparse/storage.py", line 74, in __init__
assert row.max().item() < sparse_sizes[0]
RuntimeError: CUDA error: invalid device function
Package versions:
torch 1.7.0
torch-geometric 1.7.2
torch-scatter 2.0.8
torch-sparse 0.6.12
Weird, it works for me. Any chance you can debug why the adj_t = adj_t.set_value(None, layout=None) call in rgcn_conv.py fails for you? Can you ensure that row has at least one entry?
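In case it is useful, a minimal way to inspect that (a sketch against the rgcn.py script above, run just before model(data.adj_t)) could be:

# Sketch: inspect the SparseTensor produced by T.ToSparseTensor().
row, col, value = data.adj_t.coo()  # COO view of the (transposed) adjacency
print('nnz:', row.numel(), 'sparse_sizes:', data.adj_t.sparse_sizes())
assert row.numel() > 0, 'adjacency has no entries'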
I am having difficulties with the SparseTensor functionality in RGCN: it returns an error at adj_t = adj_t.set_value(None, layout=None).
I convert the graphs to sparse using the following function:
I am sure there is an easy workaround. Any hints or comments would be appreciated.
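(The conversion function itself is not included above. For context only, a hypothetical helper along the lines of what T.ToSparseTensor() does internally, not the reporter's actual code, might look like:)

from torch_sparse import SparseTensor

def to_sparse(data):
    # Hypothetical sketch, not the original function: build the transposed
    # adjacency from edge_index, roughly what T.ToSparseTensor() does.
    row, col = data.edge_index
    data.adj_t = SparseTensor(row=col, col=row,
                              sparse_sizes=(data.num_nodes, data.num_nodes))
    return data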