a-r-j / graphein

Protein Graph Library
https://graphein.ai/
MIT License

Loading graph into pytorch_geometric #130

Closed johnnytam100 closed 2 years ago

johnnytam100 commented 2 years ago

Hi Arian! I'm sorry for asking you again, but I would like to load a residue graph into pytorch_geometric.

I tried:

import pickle
import networkx as nx

with open("1emb.p", 'rb') as f:  # binary read mode to load the pickled graph
    G_loaded = pickle.load(f)

G_loaded

<networkx.classes.graph.Graph at 0x7fb972473150>

import torch
import torch_geometric

torch_geometric.utils.from_networkx(G_loaded)
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-57-6acdff22372d> in <module>()
----> 1 torch_geometric.utils.from_networkx(G_loaded)

/usr/local/lib/python3.7/dist-packages/torch_geometric/utils/convert.py in from_networkx(G, group_node_attrs, group_edge_attrs)
    176     for key, value in data.items():
    177         try:
--> 178             data[key] = torch.tensor(value)
    179         except ValueError:
    180             pass

RuntimeError: Could not infer dtype of set

Do you have any experience using graphein with pytorch_geometric?

Thanks in advance.

For reference, the docs for torch_geometric.utils.from_networkx:

https://pytorch-geometric.readthedocs.io/en/latest/modules/utils.html?highlight=torch_geometric.utils.from_networkx#torch_geometric.utils.from_networkx
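For what it's worth, the traceback shows from_networkx calling torch.tensor on every graph attribute, and at least one Graphein attribute appears to be a Python set, which torch can't convert. A minimal reproduction of just that call (illustrative only):

import torch

# torch.tensor cannot infer a dtype for a Python set --
# the same error raised inside from_networkx above
torch.tensor({1.0, 2.0})
# RuntimeError: Could not infer dtype of set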

a-r-j commented 2 years ago

Hi @johnnytam100, yep!

You have to use our conversion utility:
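In outline it looks like this (a minimal sketch; the same call appears in full later in this thread):

from graphein.ml.conversion import GraphFormatConvertor

# convert a Graphein NetworkX graph into a PyTorch Geometric Data object
format_convertor = GraphFormatConvertor('nx', 'pyg', verbose='gnn', columns=None)
pyg_graph = format_convertor(G_loaded)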

You can see an example here (although I note it's not rendering correctly in the docs - will try to fix - but the syntax should be enough to go on)

Hope it helps :)

If you're having any trouble let me know and I'll reopen the issue :)

EDIT: just realised the conversion docs aren't rendering properly either! I'll try to look into it this week; in the meantime you can check the source code here

EDIT: Also see #77

johnnytam100 commented 2 years ago

Hi Arian, it worked! Thank you so much for your notebook!

Btw, I am rewriting your multi-class classification model as a regression model. Would you mind pointing me to how to modify the model slightly so that it works?

import pickle
import networkx as nx
import os
import glob
import random
import numpy as np
import torch                    # added: used throughout but missing from the original listing
import torch.nn as nn           # added: nn.Linear is used in the model
import pytorch_lightning as pl  # added: pl.LightningModule / pl.Trainer are used
from torch.nn import functional as F
from torch.nn.functional import mse_loss, softmax, cross_entropy
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, GATConv, SAGEConv, global_add_pool
from torchmetrics.functional import accuracy
from pytorch_lightning.callbacks import ModelCheckpoint
from sklearn.metrics import f1_score  # added: used in test_step
from graphein.ml.conversion import GraphFormatConvertor
from tqdm.notebook import tqdm

# collect graphs
path_list = sorted(glob.glob('./*.p'))

graph_list = []
for path in path_list:
    with open(path, 'rb') as f:  # binary read mode to load the pickled NetworkX graph
        graph_list.append(pickle.load(f))

# nx2pyg
format_convertor = GraphFormatConvertor('nx', 'pyg',
                                        verbose = 'gnn',
                                        columns = None)

pyg_list = [format_convertor(graph) for graph in tqdm(graph_list)]

# assign target (random placeholder; substitute real labels here)
for idx, g in enumerate(pyg_list):
    # g.y = y_list[idx]               # original (classification labels)
    g.y = random.uniform(200, 600)    # dummy scalar regression target
    g.coords = torch.FloatTensor(g.coords[0])

# drop graphs whose coords and node lists disagree in length
# (building a new list avoids the skipped-item bug of removing
# from a list while iterating over it)
cleaned = []
for g in pyg_list:
    if g.coords.shape[0] == len(g.node_id):
        cleaned.append(g)
    else:
        print(g)
pyg_list = cleaned

# train, val, test split
np.random.seed(42)
idx_all = np.arange(len(pyg_list))
np.random.shuffle(idx_all)

train_idx, valid_idx, test_idx = np.split(idx_all, [int(.8*len(pyg_list)), int(.9*len(pyg_list))])
train, valid, test = [pyg_list[i] for i in train_idx], [pyg_list[i] for i in valid_idx], [pyg_list[i] for i in test_idx]

# data loader (NB: `config` and `model_name` come from an earlier notebook
# cell; the complete listing later in this thread defines them)
train_loader = DataLoader(train, batch_size=config.batch_size, shuffle=True, drop_last=True)
valid_loader = DataLoader(valid, batch_size=32)
test_loader = DataLoader(test, batch_size=32)

# compile model
class GraphNets(pl.LightningModule):
    def __init__(self):
        super().__init__()

        if model_name == 'GCN':
            self.layer1 = GCNConv(in_channels=3, out_channels=config.n_hid)
            self.layer2 = GCNConv(in_channels=config.n_hid, out_channels=config.n_out)

        elif model_name == 'GAT':
            self.layer1 = GATConv(3, config.num_att_dim, heads=config.num_heads, dropout=config.dropout)
            self.layer2 = GATConv(config.num_att_dim * config.num_heads, out_channels = config.n_out, heads=1, concat=False,
                                 dropout=config.dropout)

        elif model_name == 'GraphSAGE':
            self.layer1 = SAGEConv(3, config.n_hid)
            self.layer2 = SAGEConv(config.n_hid, config.n_out)  

        self.decoder = nn.Linear(config.n_out, 7)

    def forward(self, g):
        x = g.coords
        x = F.dropout(x, p=config.dropout, training=self.training)
        x = F.elu(self.layer1(x, g.edge_index))
        x = F.dropout(x, p=config.dropout, training=self.training)
        x = self.layer2(x, g.edge_index)
        x = global_add_pool(x, batch=g.batch)
        x = self.decoder(x)
        # return softmax(x)     # original
        return x

    def training_step(self, batch, batch_idx):
        x = batch   
        y = x.y
        y_hat = self(x)
        # loss = cross_entropy(y_hat, y)    # original
        loss = mse_loss(y_hat, y)
        acc = accuracy(y_hat, y)

        self.log("train_loss", loss)
        self.log("train_acc", acc)
        return loss

    def validation_step(self, batch, batch_idx):
        x = batch   
        y = x.y
        y_hat = self(x)
        # loss = cross_entropy(y_hat, y)    # original
        loss = mse_loss(y_hat, y)
        acc = accuracy(y_hat, y)
        self.log("valid_loss", loss)
        self.log("valid_acc", acc)

    def test_step(self, batch, batch_idx):
        x = batch   
        y = x.y
        y_hat = self(x)
        # loss = cross_entropy(y_hat, y)    # original
        loss = mse_loss(y_hat, y)
        acc = accuracy(y_hat, y)

        y_pred_softmax = torch.log_softmax(y_hat, dim = 1)
        y_pred_tags = torch.argmax(y_pred_softmax, dim = 1) 
        f1 = f1_score(y.detach().cpu().numpy(), y_pred_tags.detach().cpu().numpy(), average = 'weighted')

        self.log("test_loss", loss)
        self.log("test_acc", acc)
        self.log("test_f1", f1)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=config.lr)
        return optimizer

GraphNets()  # sanity check: confirm the model builds before training

file_path = './graphein_model'
if not os.path.exists(file_path):
    os.mkdir(file_path)

checkpoint_callback = ModelCheckpoint(
    monitor="valid_loss",
    dirpath=file_path,
    filename="model-{epoch:02d}-{valid_loss:.2f}",  # match the logged metric name ("valid_loss", not "val_loss")
    save_top_k=1,
    mode="min",
)

# train model
model = GraphNets()
trainer = pl.Trainer(max_epochs=200, gpus=-1, callbacks=[checkpoint_callback])
trainer.fit(model, train_loader, valid_loader)

# evaluate the checkpoint with the best validation loss
best_model = GraphNets.load_from_checkpoint(checkpoint_callback.best_model_path)
out_best_test = trainer.test(best_model, test_loader)[0]

Unfortunately, I got this:

100% 29/29 [00:00<00:00, 247.57it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type    | Params
------------------------------------
0 | layer1  | GCNConv | 32    
1 | layer2  | GCNConv | 72    
2 | decoder | Linear  | 63    
------------------------------------
167       Trainable params
0         Non-trainable params
167       Total params
0.001     Total estimated model params size (MB)
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-58-ba861e55045f> in <module>()
    155 model = GraphNets()
    156 trainer = pl.Trainer(max_epochs=200, gpus=-1, callbacks=[checkpoint_callback])
--> 157 trainer.fit(model, train_loader, valid_loader)
    158 
    159 # evaluate on the model with the best validation set

18 frames
/usr/local/lib/python3.7/dist-packages/torch/functional.py in broadcast_tensors(*tensors)
     70     if has_torch_function(tensors):
     71         return handle_torch_function(broadcast_tensors, tensors, *tensors)
---> 72     return _VF.broadcast_tensors(tensors)  # type: ignore[attr-defined]
     73 
     74 

RuntimeError: The size of tensor a (7) must match the size of tensor b (3) at non-singleton dimension 1

a-r-j commented 2 years ago

The problematic line is:

self.decoder = nn.Linear(config.n_out, 7)

You should change the output dimension (7) to match the dimensionality of your labels, and adjust the activation on the final layer accordingly. You'll also want to report regression metrics instead of classification metrics.
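For a scalar target, that would look something like this (a sketch using the names from your listing above):

# one scalar output per graph instead of 7 class logits
self.decoder = nn.Linear(config.n_out, 1)

# in the step functions: align shapes before computing the loss
loss = mse_loss(y_hat.squeeze(-1), y.float())  # [B, 1] -> [B]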

johnnytam100 commented 2 years ago

Thanks Arian! I made it work with a few more modifications, and I'm leaving a record here for anyone who needs to do regression:

! pip install graphein
! pip install pytorch_lightning
! pip install -q torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cu113.html
! pip install -q torch-sparse -f https://pytorch-geometric.com/whl/torch-1.10.0+cu113.html
! pip install -q git+https://github.com/rusty1s/pytorch_geometric.git
! pip install dgl
import pickle
import networkx as nx
import os
import glob
import random
import numpy as np
import torch
import torch.nn as nn
import pytorch_lightning as pl
from torch.nn import functional as F
from torch.nn.functional import mse_loss, softmax, cross_entropy
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, GATConv, SAGEConv, global_add_pool
from torchmetrics.functional import accuracy
from pytorch_lightning.callbacks import ModelCheckpoint
from graphein.ml.conversion import GraphFormatConvertor
from tqdm.notebook import tqdm

# collect graphs
path_list = sorted(glob.glob('./*.p'))

graph_list = []
for path in path_list:
    with open(path, 'rb') as f:  # binary read mode to load the pickled NetworkX graph
        graph_list.append(pickle.load(f))

# nx2pyg
format_convertor = GraphFormatConvertor('nx', 'pyg',
                                        verbose = 'gnn',
                                        columns = None)

pyg_list = [format_convertor(graph) for graph in tqdm(graph_list)]

# assign target (random placeholder; substitute real labels here)
for idx, g in enumerate(pyg_list):
    # g.y = y_list[idx]                    # original (classification labels)
    g.y = int(random.uniform(200, 600))    # dummy scalar regression target
    g.coords = torch.FloatTensor(g.coords[0])

# drop graphs whose coords and node lists disagree in length
# (building a new list avoids the skipped-item bug of removing
# from a list while iterating over it)
cleaned = []
for g in pyg_list:
    if g.coords.shape[0] == len(g.node_id):
        cleaned.append(g)
    else:
        print(g)
pyg_list = cleaned

# train, val, test split
np.random.seed(42)
idx_all = np.arange(len(pyg_list))
np.random.shuffle(idx_all)

train_idx, valid_idx, test_idx = np.split(idx_all, [int(.8*len(pyg_list)), int(.9*len(pyg_list))])
train, valid, test = [pyg_list[i] for i in train_idx], [pyg_list[i] for i in valid_idx], [pyg_list[i] for i in test_idx]

# config
config_default = dict(
    n_hid = 8,
    n_out = 8,
    batch_size = 4,
    dropout = 0.5,
    lr = 0.001,
    num_heads = 32,
    num_att_dim = 64,
    model_name = 'GCN'
)

class Struct:
    """Minimal attribute-style access wrapper around the config dict."""
    def __init__(self, **entries):
        self.__dict__.update(entries)

config = Struct(**config_default)

model_name = config.model_name  # (`global` is redundant at module scope)

# data loader
train_loader = DataLoader(train, batch_size=config.batch_size, shuffle=True, drop_last=True)
valid_loader = DataLoader(valid, batch_size=32)
test_loader = DataLoader(test, batch_size=32)

# compile model
class GraphNets(pl.LightningModule):
    def __init__(self):
        super().__init__()

        if model_name == 'GCN':
            self.layer1 = GCNConv(in_channels=3, out_channels=config.n_hid)
            self.layer2 = GCNConv(in_channels=config.n_hid, out_channels=config.n_out)

        elif model_name == 'GAT':
            self.layer1 = GATConv(3, config.num_att_dim, heads=config.num_heads, dropout=config.dropout)
            self.layer2 = GATConv(config.num_att_dim * config.num_heads, out_channels = config.n_out, heads=1, concat=False,
                                 dropout=config.dropout)

        elif model_name == 'GraphSAGE':
            self.layer1 = SAGEConv(3, config.n_hid)
            self.layer2 = SAGEConv(config.n_hid, config.n_out)  

        self.decoder = nn.Linear(config.n_out, 1)

    def forward(self, g):
        x = g.coords
        x = F.dropout(x, p=config.dropout, training=self.training)
        x = F.elu(self.layer1(x, g.edge_index))
        x = F.dropout(x, p=config.dropout, training=self.training)
        x = self.layer2(x, g.edge_index)
        x = global_add_pool(x, batch=g.batch)
        x = self.decoder(x)
        # return softmax(x)     # original
        return x

    def training_step(self, batch, batch_idx):
        x = batch   
        y = x.y
        y_hat = self(x)
        # loss = cross_entropy(y_hat, y)    # original
        loss = mse_loss(y_hat.squeeze(-1), y.float())  # squeeze [B, 1] -> [B] so shapes match
        # acc = accuracy(y_hat, y)            # original

        self.log("train_loss", loss)
        # self.log("train_acc", acc)          # original
        return loss

    def validation_step(self, batch, batch_idx):
        x = batch   
        y = x.y
        y_hat = self(x)
        # loss = cross_entropy(y_hat, y)    # original
        loss = mse_loss(y_hat.squeeze(-1), y.float())  # squeeze [B, 1] -> [B] so shapes match
        # acc = accuracy(y_hat, y)          # original
        self.log("valid_loss", loss)
        # self.log("valid_acc", acc)        # original

    def test_step(self, batch, batch_idx):
        x = batch   
        y = x.y
        y_hat = self(x)
        # loss = cross_entropy(y_hat, y)    # original
        loss = mse_loss(y_hat.squeeze(-1), y.float())  # squeeze [B, 1] -> [B] so shapes match
        # acc = accuracy(y_hat, y)            # original

        # y_pred_softmax = torch.log_softmax(y_hat, dim = 1)
        # y_pred_tags = torch.argmax(y_pred_softmax, dim = 1) 
        # f1 = f1_score(y.detach().cpu().numpy(), y_pred_tags.detach().cpu().numpy(), average = 'weighted')

        self.log("test_loss", loss)
        # self.log("test_acc", acc)         # original
        # self.log("test_f1", f1)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=config.lr)
        return optimizer

GraphNets()  # sanity check: confirm the model builds before training

file_path = './graphein_model'
if not os.path.exists(file_path):
    os.mkdir(file_path)

checkpoint_callback = ModelCheckpoint(
    monitor="valid_loss",
    dirpath=file_path,
    filename="model-{epoch:02d}-{valid_loss:.2f}",  # match the logged metric name ("valid_loss", not "val_loss")
    save_top_k=1,
    mode="min",
)

# train model
model = GraphNets()
trainer = pl.Trainer(max_epochs=200, gpus=-1, callbacks=[checkpoint_callback])
trainer.fit(model, train_loader, valid_loader)

# evaluate the checkpoint with the best validation loss
best_model = GraphNets.load_from_checkpoint(checkpoint_callback.best_model_path)
out_best_test = trainer.test(best_model, test_loader)[0]

a-r-j commented 2 years ago

Awesome! Glad it works!