pyg-team / pytorch_geometric

Graph Neural Network Library for PyTorch
https://pyg.org
MIT License

Trouble with heterogeneous graph learning using GraphConv #7683

Open miso47 opened 1 year ago

miso47 commented 1 year ago

🐛 Describe the bug

I am having an issue with my model, which is defined as follows:

import torch
from torch.nn import Linear
from torch_geometric.nn import SAGEConv, to_hetero, GraphConv
from torch_geometric.utils import to_undirected

data_indicator = dataset[0]
print(data_indicator.metadata())

class GNN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GraphConv((-1, -1), hidden_channels)
        self.conv2 = GraphConv((-1, -1), hidden_channels)
        self.fc1 = Linear(hidden_channels, out_channels)

    def forward(self, x, edge_index, edge_weight):
        x = self.conv1(x, edge_index, edge_weight).relu()
        Vector = self.conv2(x, edge_index, edge_weight).relu()
        x = self.fc1(Vector)
        return x

model = GNN(hidden_channels=64, out_channels=2)
model = to_hetero(model, data_indicator.metadata(), aggr='sum')
print(model)

The printout of data_indicator.metadata() is (['circuit_element', 'junction'], [('junction', 'wire', 'junction'), ('junction', 'wire', 'circuit_element')]).
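
After to_hetero, the converted model no longer takes plain tensors but dictionaries keyed by node type and edge type, which is why the training loop below passes data.x_dict, data.edge_index_dict and data.edge_weight_dict. Roughly, the inputs look like this (a sketch only; the shapes and values are placeholders, not my real data):

x_dict = {
    'circuit_element': torch.rand(5, 8),  # placeholder: 5 nodes with 8 features
    'junction': torch.rand(5, 4),         # placeholder: 5 nodes with 4 features
}
edge_index_dict = {
    ('junction', 'wire', 'junction'): torch.tensor([[0, 1], [1, 2]]),
    ('junction', 'wire', 'circuit_element'): torch.tensor([[0, 1], [1, 3]]),
}
edge_weight_dict = {
    ('junction', 'wire', 'junction'): torch.rand(2),
    ('junction', 'wire', 'circuit_element'): torch.rand(2),
}
out_dict = model(x_dict, edge_index_dict, edge_weight_dict)  # one output tensor per node type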

However, I get an error when I try to train the model using a dataloader. The training code is:

optimizer = torch.optim.Adam(model.parameters(), lr=0.02)  # for the classifier
criterion_classifier = torch.nn.CrossEntropyLoss()
index = torch.tensor([2, 5])

def train():
    model.train()

    for data in train_loader:  # Iterate in batches over the training dataset.
        regress_out, XY = model(data.x_dict, data.edge_index_dict, data.edge_weight_dict)
        True_class, True_CCM_DCM, True_gain, Duty_cycle, D2, Current_rip, Inductance, Freq, Resistance = data.y.T.float()

        True_output = data.y.index_select(1, index)
        regress_loss = criterion_regression(regress_out, True_output)
        writer.add_scalar("Loss/Train", regress_loss, epoch)
        regress_loss.backward()  # Derive gradients.
        optimizer.step()  # Update parameters based on gradients.
        optimizer.zero_grad()  # Clear gradients.
        return regress_out

def test(loader):
    model.eval()
    correct = 0
    actual_output = []
    model_output = []
    for data in loader:  # Iterate in batches over the training/test dataset.
        regress_out, XY = model(data.x_dict, data.edge_index_dict, data.edge_weight_dict)
        True_class, True_CCM_DCM, True_gain, Duty_cycle, D2, Current_rip, Inductance, Freq, Resistance = data.y.T.float()
        True_output = data.y.index_select(1, index)
        MSE = criterion_regression(regress_out, True_output)
    return MSE

trn_acc = []
tst_acc = []
Rsqrd = []
eph = []

Train_save_error_margin = 1
Test_save_error_margin = 1
epochs = range(0, 1000)
for epoch in tqdm(epochs, total=len(epochs)):
    Regress_out = train()
    Train_Regress_MSE = test(train_loader)
    Test_Regress_MSE = test(test_loader)
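
(For clarity, index = torch.tensor([2, 5]) just selects two target columns from data.y, which, given the unpacking order above, correspond to True_gain and Current_rip. A tiny illustration with made-up values:)

y = torch.tensor([[0., 1., 2., 3., 4., 5., 6., 7., 8.]])  # one graph, nine targets (made-up values)
index = torch.tensor([2, 5])
print(y.index_select(1, index))  # tensor([[2., 5.]]) -> columns 2 and 5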

The error message is cumbersome and leaves me clueless. The error printout is:


IndexError                                Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_32372/394858564.py in <module>
     59 epochs=range(0, 1000)
     60 for epoch in tqdm(epochs,total=len(epochs)):
---> 61     Regress_out=train()
     62     #writer.flush()
     63     #print(out)

~\AppData\Local\Temp/ipykernel_32372/394858564.py in train()
     14     for data in train_loader:  # Iterate in batches over the training dataset.
     15          #data=data.to(device)
---> 16          regress_out,XY = model(data.x_dict, data.edge_index_dict,data.edge_weight_dict)
     17          #print(regress_out.size())
     18          True_class,True_CCM_DCM,True_gain,Duty_cycle,D2,Current_rip,Inductance,Freq,Resistance=data.y.T.float() ## Transpose to match the dimension of the prediction output

c:\ProgramData\Anaconda3\lib\site-packages\torch\fx\graph_module.py in wrapped_call(self, *args, **kwargs)
    511                     print(generate_error_message(topmost_framesummary),
    512                           file=sys.stderr)
--> 513                 raise e.with_traceback(None)
    514 
    515         cls.__call__ = wrapped_call

IndexError: index out of range in self 
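
I am not sure where to start debugging this. Would a sanity check along these lines be the right direction, e.g. to rule out edge indices that point past the number of nodes of a given type? (Just a sketch, reusing the loader from above.)

batch = next(iter(train_loader))
for (src, rel, dst), edge_index in batch.edge_index_dict.items():
    if edge_index.numel() == 0:
        continue
    # every source/destination index must refer to an existing node of that type
    assert int(edge_index[0].max()) < batch[src].num_nodes, (src, rel, dst)
    assert int(edge_index[1].max()) < batch[dst].num_nodes, (src, rel, dst)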

Environment

miso47 commented 1 year ago

When I changed the convolution type to SAGEConv, i.e., the model is now:

class GNN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), hidden_channels)
        self.fc1 = Linear(hidden_channels, out_channels)

    def forward(self, x, edge_index, edge_weight):
        x = self.conv1(x, edge_index, edge_weight).relu()
        Vector = self.conv2(x, edge_index, edge_weight).relu()
        x = self.fc1(Vector)
        return x, Vector

model = GNN(hidden_channels=64, out_channels=2)
model = to_hetero(model, data_indicator.metadata(), aggr='sum')
print(model)

the error has changed to: ValueError: Encountered tensor with size 55 in dimension -2, but expected size 0.03999999910593033

SimonPop commented 1 year ago

Hi there!

I wanted to try to reproduce this error, but I don't think you mention the data you are using; I assume it is a custom dataset. It would really help if you could provide a sample of that data, even just the first 5 nodes of each type and some of the edges connecting them.

I still tried with a random dataset that follows your metadata schema, but I cannot guarantee it behaves the same way (the feature and node counts, for example, are chosen arbitrarily).

Indeed, I do not have any trouble running my code, although I am on more recent versions of both PyTorch and PyG.

Your version of PyTorch (1.9.0) seems quite outdated. The PyG installation guide states that at least PyTorch 1.12.0 should be installed to use the current version without additional care. Did you check whether the problem persists with a newer version of PyTorch? If upgrading is an option for you, I would try that first.
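
If you are not sure which versions you have installed, a quick check is:

import torch
import torch_geometric

print(torch.__version__)            # the installation guide asks for >= 1.12.0
print(torch_geometric.__version__)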

Here is the sample I used, if that can help you spot a potential mistake:

import torch
from torch.nn import Linear
from torch_geometric.data import HeteroData
from torch_geometric.loader import DataLoader
from torch_geometric.nn import SAGEConv, to_hetero

metadata = (['circuit_element', 'junction'], [('junction', 'wire', 'junction'), ('junction', 'wire', 'circuit_element')])

class GNN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), hidden_channels)
        self.fc1 = Linear(hidden_channels, out_channels)

    def forward(self, x, edge_index, edge_weight):
        x = self.conv1(x, edge_index, edge_weight).relu()
        Vector = self.conv2(x, edge_index, edge_weight).relu()
        x = self.fc1(Vector)
        return x, Vector

model = GNN(hidden_channels=64, out_channels=2)
model = to_hetero(model, metadata, aggr='sum')

# Create test data respecting the given metadata schema.

data = HeteroData()
data['circuit_element'].x = torch.rand((5, 8))  # 5 nodes, 8 features each
data['junction'].x = torch.rand((5, 4))  # 5 nodes, 4 features each

# edge_index must be an integer (long) tensor of shape [2, num_edges]
test_edges = torch.tensor([[0, 1], [1, 3]], dtype=torch.long)

data[('junction', 'wire', 'junction')].edge_index = test_edges  # some edges between nodes
data[('junction', 'wire', 'circuit_element')].edge_index = test_edges  # some edges between nodes

# Try to run the model with a dataloader.

train_loader = DataLoader([data])

for batch in train_loader:
    model(batch.x_dict, batch.edge_index_dict, batch.edge_weight_dict)