Closed Heinz217 closed 2 months ago
Hi, here is my processing code. It's an implementation I adapted from another repo (i.e. https://github.com/XiaoxinHe/TAPE/blob/main/core/data_utils/load_cora.py):
import numpy as np
import torch
import torch_geometric.transforms as T
from torch_geometric.datasets import Planetoid


def parse_cora():
    # parse the raw McCallum Cora files (.content / .cites)
    path = 'dataset/cora/cora'
    idx_features_labels = np.genfromtxt(
        "{}.content".format(path), dtype=np.dtype(str))
    data_X = idx_features_labels[:, 1:-1].astype(np.float32)
    labels = idx_features_labels[:, -1]
    class_map = {x: i for i, x in enumerate(['Case_Based', 'Genetic_Algorithms', 'Neural_Networks',
                                             'Probabilistic_Methods', 'Reinforcement_Learning', 'Rule_Learning', 'Theory'])}
    data_Y = np.array([class_map[l] for l in labels])
    data_citeid = idx_features_labels[:, 0]
    idx = np.array(data_citeid, dtype=np.dtype(str))
    idx_map = {j: i for i, j in enumerate(idx)}
    edges_unordered = np.genfromtxt(
        "{}.cites".format(path), dtype=np.dtype(str))
    edges = np.array(list(map(idx_map.get, edges_unordered.flatten()))).reshape(
        edges_unordered.shape)
    # drop edges whose endpoints are not in the .content file, then symmetrize and deduplicate
    data_edges = np.array(edges[~(edges == None).max(1)], dtype='int')
    data_edges = np.vstack((data_edges, np.fliplr(data_edges)))
    data_edges = np.unique(data_edges, axis=0).transpose()
    num_classes = len(class_map)

    # load data and overwrite the PyG tensors with the ones parsed above
    data_name = 'cora'
    dataset = Planetoid('dataset/cora', data_name,
                        transform=T.NormalizeFeatures())
    data = dataset[0]
    data.x = torch.tensor(data_X).float()
    data.edge_index = torch.tensor(data_edges).long()
    data.y = torch.tensor(data_Y).long()
    data.num_nodes = len(data_Y)
    data.num_classes = num_classes

    # split data (random 60/20/20 train/val/test)
    node_id = np.arange(data.num_nodes)
    np.random.shuffle(node_id)
    data.train_id = np.sort(node_id[:int(data.num_nodes * 0.6)])
    data.val_id = np.sort(
        node_id[int(data.num_nodes * 0.6):int(data.num_nodes * 0.8)])
    data.test_id = np.sort(node_id[int(data.num_nodes * 0.8):])
    data.train_mask = torch.tensor(
        [x in data.train_id for x in range(data.num_nodes)])
    data.val_mask = torch.tensor(
        [x in data.val_id for x in range(data.num_nodes)])
    data.test_mask = torch.tensor(
        [x in data.test_id for x in range(data.num_nodes)])

    # text: map each paper id to its extraction file name
    with open('dataset/cora/mccallum/cora/papers') as f:
        lines = f.readlines()
    pid_filename = {}
    for line in lines:
        pid = line.split('\t')[0]
        fn = line.split('\t')[1].replace(':', '_')
        # a few file names listed in the papers index do not match the files on disk
        file_bug_dict = {"http_##www.cs.ucc.ie#~dgb#papers#ICCBR2.ps.Z": "http_##www.cs.ucc.ie#~dgb#papers#iccbr2.ps.Z",
                         "http_##www.cs.ucl.ac.uk#staff#t.yu#pgp.new.ps": "http_##www.cs.ucl.ac.uk#staff#t.yu#pgp.ps",
                         "http_##www.cs.ucl.ac.uk#staff#t.yu#ep97.ps": "http_##www.cs.ucl.ac.uk#staff#T.Yu#ep97.ps"}
        if fn in file_bug_dict:
            fn = file_bug_dict[fn]
        pid_filename[pid] = fn

    # collect the Title / Abstract / Keyword lines for every node, in .content row order
    path = 'dataset/cora/mccallum/cora/extractions/'
    text = []
    titles = []
    abs = []
    keys = []
    for pid in data_citeid:
        fn = pid_filename[pid]
        with open(path + fn) as f:
            lines = f.read().splitlines()
        ti = ""
        ab = ""
        key = ""
        for line in lines:
            if 'Title:' in line:
                ti = line
            if 'Abstract:' in line:
                ab = line
            if 'Keyword:' in line:
                key = line
        text.append(f"{ti}\t{ab}")
        titles.append(ti)
        abs.append(ab)
        keys.append(key)
    data.raw_texts = text
    data.titles = titles
    data.abstracts = abs
    data.keywords = keys
    data.label_texts = ['Case_Based', 'Genetic_Algorithms', 'Neural_Networks',
                        'Probabilistic_Methods', 'Reinforcement_Learning', 'Rule_Learning', 'Theory']
    torch.save(data, "dataset/cora/processed_data.pt")
    return data, text
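For completeness, a minimal usage sketch (assuming the same directory layout as above): the function builds the Data object, attaches the text fields, and caches everything to dataset/cora/processed_data.pt, so a downstream script can simply reload that file instead of re-parsing.

    # minimal usage sketch; parse_cora is the function defined above
    data, raw_texts = parse_cora()
    print(data)           # PyG Data object with x, edge_index, y, masks, and the text fields
    print(raw_texts[0])   # "Title: ...\tAbstract: ..." for node 0 (row 0 of cora.content)

    # reload the cached file later without re-parsing; recent PyTorch versions
    # default torch.load to weights_only=True, so weights_only=False may be
    # needed to restore the full Data object
    data = torch.load("dataset/cora/processed_data.pt", weights_only=False)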
Thank you for your reply! It’s so kind of you. The code you provided has indeed resolved my confusion. It has been very helpful to me.
Hello, this is great work! I have gained a lot of inspiration from your paper. Currently, I am trying to organize the input sequences of nodes in different ways, but I have run into an issue: while working with the Cora dataset, I found that the node IDs in your dataset (Box: https://utexas.box.com/s/i7y03rzm40xt9bjbaj0dfdgxeyjx77gb) differ from the node IDs in the original Cora dataset that I downloaded with PyG. Could you please explain how I can construct a mapping between these two sets of node IDs? What is the logic behind the processing of these IDs? Thank you, and I look forward to your reply.
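For anyone who lands here with the same question, one possible starting point (a hedged sketch, not the authors' mapping): if the processed data follows the row order of cora.content, as in the parse_cora code above, then node i corresponds to paper id data_citeid[i], whereas PyG's Planetoid Cora uses the ordering of its ind.cora.* files. Assuming the raw binary bag-of-words rows are unique (the assert below checks this), the two orderings can be aligned by matching feature rows; the paths are the same hypothetical layout used above.

    # hedged sketch: align PyG Planetoid Cora node indices with the cora.content
    # row order by matching raw binary bag-of-words vectors (assumption: rows are unique)
    import numpy as np
    from torch_geometric.datasets import Planetoid

    content = np.genfromtxt('dataset/cora/cora.content', dtype=str)
    content_ids = content[:, 0]                          # McCallum paper ids; row i == node i above
    content_feats = content[:, 1:-1].astype(np.float32)

    pyg_data = Planetoid('dataset/cora', 'cora')[0]      # no NormalizeFeatures, so rows stay binary
    pyg_feats = pyg_data.x.numpy()

    row_lookup = {row.tobytes(): i for i, row in enumerate(content_feats)}
    assert len(row_lookup) == content_feats.shape[0], "feature rows are not unique; mapping is ambiguous"

    pyg_to_content = np.array([row_lookup[row.tobytes()] for row in pyg_feats])
    # pyg_to_content[j] = index of PyG node j in the .content ordering;
    # content_ids[pyg_to_content[j]] is its McCallum paper id
    print(content_ids[pyg_to_content[:5]])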