Process on GPU stuck during training

DuHao10086 commented 4 years ago

❓ Questions & Help

Hi, first of all, thank you very much for building this comprehensive library. I'm currently working on a graph classification problem and have adapted your model from mnist_nn_conv.py for my experiments. However, after a few epochs the code gets stuck and GPU utilization stays at around 38%. The process does not respond to a keyboard interrupt (Ctrl+C) and can only be killed. I have tested PyTorch 1.5.0 and 1.6.0, as well as other GPUs, with the same result. Could you please provide any suggestions? Thank you very much!

`

class MnistNet(nn.Module):
def __init__(self, dataset):
    super(MnistNet, self).__init__()
    nn1 = nn.Sequential(nn.Linear(2, 25), nn.ReLU(), nn.Linear(25, 32))
    self.conv1 = NNConv(dataset.num_features, 32, nn1, aggr='mean')

    nn2 = nn.Sequential(nn.Linear(2, 25), nn.ReLU(), nn.Linear(25, 2048))
    self.conv2 = NNConv(32, 64, nn2, aggr='mean')

    self.fc1 = torch.nn.Linear(64, 128)
    self.fc2 = torch.nn.Linear(128, dataset.num_classes)

    self.transform = T.Cartesian(cat=False)

def forward(self, data):
    data.x = F.elu(self.conv1(data.x, data.edge_index, data.edge_attr))
    weight = normalized_cut_2d(data.edge_index, data.pos)
    cluster = graclus(data.edge_index, weight, data.x.size(0))
    data.edge_attr = None
    data = max_pool(cluster, data, transform=self.transform)

    data.x = F.elu(self.conv2(data.x, data.edge_index, data.edge_attr))
    weight = normalized_cut_2d(data.edge_index, data.pos)
    cluster = graclus(data.edge_index, weight, data.x.size(0))
    x, batch = max_pool_x(cluster, data.x, data.batch)

    x = global_mean_pool(x, batch)
    x = F.elu(self.fc1(x))
    x = F.dropout(x, training=self.training)
    return F.log_softmax(self.fc2(x), dim=1)`

If I instead use a simple GCN-based model as follows, the problem does not occur during training. `

class GCN(nn.Module):
def __init__(self, hidden_channels, dataset):
    super(GCN, self).__init__()
    self.conv1 = GCNConv(dataset.num_node_features, hidden_channels)
    self.conv2 = GCNConv(hidden_channels, hidden_channels)
    self.conv3 = GCNConv(hidden_channels, hidden_channels)
    self.lin = Linear(hidden_channels, dataset.num_classes)

def forward(self, data):
    x, edge_index, batch = data.x, data.edge_index, data.batch
    x = self.conv1(x, edge_index)
    x = x.relu()
    x = self.conv2(x, edge_index)
    x = x.relu()
    x = self.conv3(x, edge_index)

    x = global_mean_pool(x, batch)  

    # x = F.dropout(x, p=0.5, training=self.training)
    x = self.lin(x)

    return x`

rusty1s commented 4 years ago

I think it's due to the graclus call. I will investigate it, sorry for that :)

EvaHeffinck commented 3 years ago

I have the exact same problem (with PyTorch 1.7.0 and CUDA 11.0). If I change the clustering method to voxel_grid, the problem indeed does not occur.

pyg-team / pytorch_geometric

Process on GPU stuck during training #1648

❓ Questions & Help