HKUDS / OpenGraph

[EMNLP'2024] "OpenGraph: Towards Open Graph Foundation Models"
https://arxiv.org/abs/2403.01121
Apache License 2.0

How do I get this kind of data in /datasets/pubmed/ from the raw PubMed dataset? #4

Closed: Andy3117006664 closed this issue 4 months ago

Andy3117006664 commented 4 months ago

The raw PubMed dataset comes as files like ind.pubmed.allx and ind.pubmed.ally.

How do I convert these raw files into the data found in the /datasets/pubmed/ folder?


akaxlh commented 4 months ago

Thank you for your interest in our work. Note that torch_geometric's Planetoid loader already parses the raw ind.pubmed.* files for you, so there is no need to handle them manually; the script below converts the loaded data into the pickle files used in /datasets/pubmed/. You may refer to the following script:

from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T
import torch as t
import numpy as np
from scipy.sparse import csr_matrix, coo_matrix
import argparse
import os
import pickle

parser = argparse.ArgumentParser(description='Model Parameters')
parser.add_argument('--data', default='', type=str, help='data name')
parser.add_argument('--shot', default=5, type=int, help='number of labeled nodes (shots) per class')
args = parser.parse_args()

def make_fewshot_train_mask(train_mask, labels, shot):
    # Keep at most `shot` labeled nodes per class (all of them if shot == -1).
    node_num = train_mask.shape[0]
    class_to_nodeid = dict()
    for i in range(node_num):
        if not train_mask[i]:
            continue
        if labels[i] not in class_to_nodeid:
            class_to_nodeid[labels[i]] = list()
        class_to_nodeid[labels[i]].append(i)
    new_train_mask = (np.zeros_like(labels) != 0)  # all-False boolean mask
    for label_class in class_to_nodeid:
        nodes = class_to_nodeid[label_class]
        if shot == -1:
            picked_nodes = nodes
        else:
            # Sample `shot` actual node ids from this class (not positions within the list).
            picked_nodes = np.random.permutation(nodes)[:shot]
        new_train_mask[picked_nodes] = True
    print('before', np.sum(train_mask), 'after', np.sum(new_train_mask))
    return new_train_mask

def make_adj_with_class_nodes(mat, labels, mask):
    # Append one virtual node per class and connect every masked (training)
    # node to its class node with edges in both directions.
    rows, cols = list(mat.row), list(mat.col)
    assert np.min(labels) == 0
    class_num = np.max(labels) + 1
    node_num = mat.shape[0]
    for i in range(node_num):
        if mask[i]:
            rows.append(i)
            cols.append(labels[i] + node_num)  # class nodes are indexed after the real nodes
            rows.append(labels[i] + node_num)
            cols.append(i)
    vals = np.ones_like(rows)
    print('class num', class_num)
    return coo_matrix((vals, (rows, cols)), [node_num + class_num, node_num + class_num])

def write_file(data, file):
    with open(file, 'wb') as fs:
        pickle.dump(data, fs)

dataset = Planetoid('data', args.data, transform=T.NormalizeFeatures())
data = dataset[0]
print(data)

# Convert the PyG graph into a scipy sparse adjacency plus numpy arrays.
node_num = data.y.shape[0]
rows = data.edge_index[0].numpy()
cols = data.edge_index[1].numpy()
vals = np.ones_like(rows)
mat = coo_matrix((vals, (rows, cols)), shape=[node_num, node_num])
feats = data.x.numpy()
labels = data.y.numpy()
train_mask, val_mask, test_mask = data.train_mask, data.val_mask, data.test_mask
masks = dict()
train_mask = train_mask | val_mask  # merge train and val into the pool for few-shot sampling

# Alternative (commented out): a random 80/20 train/test split
# instead of the standard Planetoid split.
# test_idxs = np.random.permutation(node_num)[:int(node_num * 0.2)]
# train_mask = np.ones(node_num)
# train_mask[test_idxs] = 0
# test_mask = np.zeros(node_num)
# test_mask[test_idxs] = 1
# train_mask = t.from_numpy(train_mask == 1)
# test_mask = t.from_numpy(test_mask == 1)
# val_mask = test_mask

masks['valid'] = val_mask.numpy()
masks['test'] = test_mask.numpy()
print('train', np.sum(train_mask.numpy()))
print('val', np.sum(val_mask.numpy()))
print('test', np.sum(test_mask.numpy()))
masks['train'] = make_fewshot_train_mask(train_mask.numpy(), labels, args.shot)
class_adj = make_adj_with_class_nodes(mat, labels, masks['train'])

if not os.path.exists(args.data):
    os.mkdir(args.data)
# Dump everything in the pickle format used by /datasets/pubmed/.
write_file(masks, f'{args.data}/mask_{args.shot}.pkl')
write_file(feats, f'{args.data}/feats.pkl')
write_file(mat, f'{args.data}/adj.pkl')
write_file(labels, f'{args.data}/label.pkl')
write_file(class_adj, f'{args.data}/adj_{args.shot}.pkl')
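
For example, assuming you save the script above as make_data.py (the file name is arbitrary), you can run:

python make_data.py --data PubMed --shot 5

and then sanity-check the generated pickle files with a short snippet like this:

import pickle

def load(path):
    with open(path, 'rb') as fs:
        return pickle.load(fs)

adj = load('PubMed/adj.pkl')          # scipy coo_matrix with node-node edges
feats = load('PubMed/feats.pkl')      # (num_nodes, num_feats) feature matrix
labels = load('PubMed/label.pkl')     # (num_nodes,) class labels
masks = load('PubMed/mask_5.pkl')     # dict of 'train' / 'valid' / 'test' boolean masks
class_adj = load('PubMed/adj_5.pkl')  # adjacency augmented with virtual class nodes

print(adj.shape, feats.shape, labels.shape, class_adj.shape)
print({key: int(val.sum()) for key, val in masks.items()})
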
Andy3117006664 commented 4 months ago

Thank you for your prompt response and the helpful script for the PubMed dataset.

Could you please provide scripts or guidance for handling the Amazon-Book and ml1m recommendation (RS) datasets in a similar manner?

Thank you for your assistance!
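
In the meantime, here is my rough attempt for ml1m, in case it helps clarify what I am asking. It assumes the usual UserID::MovieID::Rating::Timestamp format of ratings.dat and guesses that OpenGraph expects a user-item interaction matrix pickled the same way as adj.pkl above; please correct me if the expected format differs:

import os
import pickle
import numpy as np
from scipy.sparse import coo_matrix

rows, cols = [], []
with open('ml-1m/ratings.dat', 'r') as fs:
    for line in fs:
        # each ml1m line looks like: UserID::MovieID::Rating::Timestamp
        user, item, rating, _ = line.strip().split('::')
        if int(rating) >= 4:  # my guess: treat high ratings as positive interactions
            rows.append(int(user) - 1)  # ml1m ids are 1-based
            cols.append(int(item) - 1)
mat = coo_matrix((np.ones(len(rows)), (rows, cols)))
os.makedirs('ml1m', exist_ok=True)
with open('ml1m/adj.pkl', 'wb') as fs:  # guessing the same adj.pkl naming as above
    pickle.dump(mat, fs)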