learning the graph classification

Z-Rajaei commented 4 years ago

❓ Questions & Help

Hello, I'm trying to learn pytorch geometric, you have helped me a lot before and I'm thankful. I need your help to know if the way I am following is right or not. I have some graphs, including nodes and edges. There is 8 attributes for the nodes. Each node has some of these attributes with a value for it. I want to do classification of the graphs. I created my dataset like the tu-dataset: there is some files including: MM_A.txt 1, 2 2, 1 1, 3 3, 1 1, 4 4, 1 5, 6 6, 5 5, 7 7, 5 ...

MM_edge_labels.txt 0 0 0 0 1 1 0 ...

MM_graph_indicator.txt 1 1 1 1 2 2 ...

MM_graph_labels.txt 0 1 0 1 0 0 0 0 ...

MM_node_attributes.txt 1, 0, 0, 0, 0, 0, 0, 0
0, 0, 1, 5, 1, 0, 0, 0
0, 0, 1, 8, 3, 0, 0, 0
0, 0, 0, 0, 0, 1, 1, 1
1, 1, 0, 0, 0, 0, 0, 0
0, 0, 1, 10, 3, 0, 0, 0 0, 0, 1, 5, 3, 0, 0, 0
...

MM_node_labels.txt 0 1 1 2 0 1 ...

Here is the code to create the dataset and learning:

``

import os
import os.path as osp
import shutil
from torch_geometric.nn import DynamicEdgeConv, global_max_pool
from pointnet2_classification import MLP

import torch
from torch_geometric.data import InMemoryDataset, download_url, extract_zip
from torch_geometric.io import read_tu_data

from torch_geometric.data import DataLoader
from torch_geometric.nn import PointConv, fps, radius, global_max_pool

import sys
sys.path.append('..')

from torch.nn import Sequential as Seq, Linear as Lin, ReLU, BatchNorm1d as BN

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch_geometric.nn import global_max_pool, TopKPooling, GCNConv
from bijou.learner import Learner
from bijou.datasets import pyg_yoochoose_10k
from bijou.data import DataBunch, PyGDataLoader
from bijou.metrics import accuracy
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class ModelDB(InMemoryDataset):

        def __init__(self, root, transform=None, pre_transform=None,pre_filter=None,
                 use_node_attr=False, use_edge_attr=False):

        super(ModelDB, self).__init__(root, transform, pre_transform, pre_filter)

        self.data, self.slices = torch.load(self.processed_paths[0])

        print ('data', self.data, 'slice=',self.slices)

        if self.data.x is not None and not use_node_attr:
            num_node_attributes = self.num_node_attributes
            self.data.x = self.data.x[:, num_node_attributes:]

    @property
    def num_node_labels(self):
        print('numnodlabel')
        if self.data.x is None:
            return 0
        for i in range(self.data.x.size(1)):
            x = self.data.x[:, i:]
            if ((x == 0) | (x == 1)).all() and (x.sum(dim=1) == 1).all():
                return self.data.x.size(1) - i
        return 0

    @property
    def num_node_attributes(self):
        print('numnodattr')
        if self.data.x is None:
            return 0
        return self.data.x.size(1) - self.num_node_labels

    # @property
    def num_edge_labels(self):
        print('numeglabel')
        if self.data.edge_attr is None:
            return 0
        for i in range(self.data.edge_attr.size(1)):
            if self.data.edge_attr[:, i:].sum() == self.data.edge_attr.size(0):
                return self.data.edge_attr.size(1) - i
        return 0

    @property
    def num_edge_attributes(self):
        print('numegattr')
        if self.data.edge_attr is None:
            return 0
        return self.data.edge_attr.size(1) - self.num_edge_labels

    @property
    def raw_file_names(self):
        print('rawfilename')
        names = ['A', 'graph_indicator', 'edge_labels', 'graph_labels', 'node_attributes', 'node_labels','edge_attributes']
        return ['MM_{}.txt'.format(name) for name in names]

    @property
    def processed_file_names(self):

        return ['data.pt']

    @property
    def raw_dir(self):
        return osp.join(self.root, 'raw')

    @property
    def processed_dir(self):
        name = 'processed'
        return osp.join(self.root, name)

    def process(self):
        print('process start')
        name='MM'
        self.data, self.slices = read_tu_data(self.raw_dir, name)

        if self.pre_filter is not None:

            data_list = [data for data in data_list if self.pre_filter(data)]

        if self.pre_transform is not None:
            data_list = [self.pre_transform(data) for data in data_list]

     torch.save((self.data, self.slices), self.processed_paths[0])

dataset = ModelDB(root='G:/Model', use_node_attr=True, use_edge_attr=False)

class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = GCNConv(dataset.num_node_features, 4)
        self.conv2 = GCNConv(4, dataset.num_classes)

    def forward(self, data):

        x, edge_index, batch = data.x, data.edge_index, data.batch
        print('shape1=',x.shape)
        x = self.conv1(x, edge_index)
        print('shape2=',x.shape)
        # x = x.view(-1, 60)

        x = F.relu(x)
        print('shape3=',x.shape)

        x = F.dropout(x, training=self.training)
        print('shape4=',x.shape)
        x = self.conv2(x, edge_index)
        print('shape5=',x.shape)

        return F.log_softmax(x, dim=1)

n = len(dataset) // 10
test_dataset = dataset[:n]
train_dataset = dataset[n:]
print('dataset0=',dataset[0])
test_loader = DataLoader(test_dataset, batch_size=80)
train_loader = DataLoader(train_dataset, batch_size=80)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Net().to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

def train(epoch):
    model.train()

    loss_all = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, data.y)
        loss.backward()
        loss_all += data.num_graphs * loss.item()
        optimizer.step()
    return loss_all / len(train_dataset)

def test(loader):
    model.eval()

    correct = 0
    for data in loader:
        data = data.to(device)
        pred = model(data).max(dim=1)[1]
        correct += pred.eq(data.y).sum().item()
    return correct / len(loader.dataset)

for epoch in range(1, 201):
    loss = train(epoch)
    train_acc = test(train_loader)
    test_acc = test(test_loader)
    print('Epoch: {:03d}, Loss: {:.5f}, Train Acc: {:.5f}, Test Acc: {:.5f}'.
          format(epoch, loss, train_acc, test_acc))

``

In the first place, I want to know if the way I created my input files and the dataset and laerning process is true or not. And second, I encounter the following error: ValueError: Expected input batch_size (256) to match target batch_size (70).

please help me know that the way I'm following is right or not, and how to overcome the error.

rusty1s commented 4 years ago

You can, of course, make use of TUDataset loading mechanisms, but personally I would always process the data by myself (just for easier debugging and ensuring everything works as intended). Since your task is graph-classification, I do not fully-understand your network architecture. There seeems to be some kind of global aggregation missing that aggregates node features to global graph features.

Z-Rajaei commented 4 years ago

I used TUDataset because it is very similar to my input data.

I have some graphs, each node has some attributes (the attributes are not the same in all nodes, for example node 1 has 2 attributes of the total 8 attributes, node 2 has 4 attributes of the total 8 attributes, ...). nodes and edges have labels. The graph classification is done according to the attributes of nodes and the label of nodes and edges.

I didn't understand your last sentence! Could you tell me what the error is for?

rusty1s commented 4 years ago

class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = GCNConv(dataset.num_node_features, 4)
        self.conv2 = GCNConv(4, 4)
        self.lin = torch.nn.Linear(4, dataset.num_classes)

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = global_mean_pool(x, batch)
        x = self.lin(x)

        return F.log_softmax(x, dim=1)

Z-Rajaei commented 4 years ago

Thank you very much!!!!! It works.

the result is as the following: Epoch: 001, Loss: 0.68808, Train Acc: 0.57143, Test Acc: 0.28571 Epoch: 002, Loss: 0.67881, Train Acc: 0.57143, Test Acc: 0.28571 Epoch: 003, Loss: 0.67722, Train Acc: 0.57143, Test Acc: 0.28571 Epoch: 004, Loss: 0.67096, Train Acc: 0.57143, Test Acc: 0.28571 Epoch: 005, Loss: 0.66708, Train Acc: 0.57143, Test Acc: 0.28571 Epoch: 006, Loss: 0.66639, Train Acc: 0.57143, Test Acc: 0.28571 Epoch: 007, Loss: 0.65875, Train Acc: 0.57143, Test Acc: 0.28571 Epoch: 008, Loss: 0.65206, Train Acc: 0.57143, Test Acc: 0.28571 Epoch: 009, Loss: 0.64866, Train Acc: 0.57143, Test Acc: 0.28571 Epoch: 010, Loss: 0.63805, Train Acc: 0.57143, Test Acc: 0.28571 Epoch: 011, Loss: 0.63464, Train Acc: 0.57143, Test Acc: 0.28571 Epoch: 012, Loss: 0.63463, Train Acc: 0.57143, Test Acc: 0.28571 Epoch: 013, Loss: 0.62172, Train Acc: 0.57143, Test Acc: 0.28571 Epoch: 014, Loss: 0.62425, Train Acc: 0.57143, Test Acc: 0.28571 Epoch: 015, Loss: 0.60330, Train Acc: 0.57143, Test Acc: 0.28571 Epoch: 016, Loss: 0.59983, Train Acc: 0.57143, Test Acc: 0.28571 Epoch: 017, Loss: 0.57610, Train Acc: 0.57143, Test Acc: 0.28571 Epoch: 018, Loss: 0.56826, Train Acc: 0.57143, Test Acc: 0.28571 Epoch: 019, Loss: 0.56400, Train Acc: 0.57143, Test Acc: 0.28571 Epoch: 020, Loss: 0.54949, Train Acc: 0.57143, Test Acc: 0.28571 Epoch: 021, Loss: 0.54285, Train Acc: 0.57143, Test Acc: 0.28571 Epoch: 022, Loss: 0.51800, Train Acc: 0.64286, Test Acc: 0.57143 Epoch: 023, Loss: 0.48245, Train Acc: 0.68571, Test Acc: 0.71429 Epoch: 024, Loss: 0.49753, Train Acc: 0.74286, Test Acc: 0.71429 Epoch: 025, Loss: 0.47759, Train Acc: 0.91429, Test Acc: 0.85714 Epoch: 026, Loss: 0.44741, Train Acc: 0.92857, Test Acc: 0.85714 Epoch: 027, Loss: 0.43629, Train Acc: 0.92857, Test Acc: 0.85714 Epoch: 028, Loss: 0.43012, Train Acc: 0.92857, Test Acc: 0.85714 Epoch: 029, Loss: 0.37574, Train Acc: 0.92857, Test Acc: 0.85714 Epoch: 030, Loss: 0.38201, Train Acc: 0.92857, Test Acc: 0.85714 Epoch: 031, Loss: 0.36425, Train Acc: 0.92857, Test Acc: 0.85714 Epoch: 032, Loss: 0.36529, Train Acc: 0.92857, Test Acc: 0.85714 Epoch: 033, Loss: 0.34476, Train Acc: 0.92857, Test Acc: 0.85714 Epoch: 034, Loss: 0.31921, Train Acc: 0.92857, Test Acc: 0.85714 Epoch: 035, Loss: 0.33876, Train Acc: 0.92857, Test Acc: 0.85714 Epoch: 036, Loss: 0.28582, Train Acc: 0.94286, Test Acc: 1.00000 Epoch: 037, Loss: 0.30076, Train Acc: 0.97143, Test Acc: 1.00000 Epoch: 038, Loss: 0.27581, Train Acc: 0.97143, Test Acc: 1.00000 Epoch: 039, Loss: 0.26290, Train Acc: 0.97143, Test Acc: 1.00000 Epoch: 040, Loss: 0.25185, Train Acc: 0.97143, Test Acc: 1.00000 Epoch: 041, Loss: 0.22169, Train Acc: 0.97143, Test Acc: 1.00000 Epoch: 042, Loss: 0.21283, Train Acc: 0.97143, Test Acc: 1.00000 Epoch: 043, Loss: 0.21101, Train Acc: 1.00000, Test Acc: 1.00000 Epoch: 044, Loss: 0.19844, Train Acc: 1.00000, Test Acc: 1.00000 Epoch: 045, Loss: 0.22333, Train Acc: 1.00000, Test Acc: 1.00000 Epoch: 046, Loss: 0.18416, Train Acc: 1.00000, Test Acc: 1.00000 Epoch: 047, Loss: 0.19532, Train Acc: 1.00000, Test Acc: 1.00000 Epoch: 048, Loss: 0.16506, Train Acc: 1.00000, Test Acc: 1.00000 Epoch: 049, Loss: 0.17910, Train Acc: 1.00000, Test Acc: 1.00000 Epoch: 050, Loss: 0.18781, Train Acc: 1.00000, Test Acc: 1.00000

Does it show that my model works as well? Is it reliable?

pyg-team / pytorch_geometric

learning the graph classification #1058

❓ Questions & Help