VITA-Group / LLaGA

[ICML2024] "LLaGA: Large Language and Graph Assistant", Runjin Chen, Tong Zhao, Ajay Jaiswal, Neil Shah, Zhangyang Wang
Apache License 2.0

About the IDs of the dataset #18

Closed: Heinz217 closed this 1 month ago

Heinz217 commented 1 month ago

Hello, great work! I have gained a lot of inspiration from your paper. Currently, I am trying to organize the input sequences of nodes in different ways, but I have run into an issue: while working with the Cora dataset, I found that the node IDs in your dataset (Box: https://utexas.box.com/s/i7y03rzm40xt9bjbaj0dfdgxeyjx77gb) differ from the node IDs in the original Cora dataset that I downloaded with PyG. Could you please explain how to construct a mapping between these two sets of node IDs? What is the logic behind the processing of these IDs? Thank you, and I look forward to your reply.

ChenRunjin commented 1 month ago

Hi, here is my processing code. It's based on an implementation from another repo (i.e. https://github.com/XiaoxinHe/TAPE/blob/main/core/data_utils/load_cora.py).

import numpy as np
import torch
import torch_geometric.transforms as T
from torch_geometric.datasets import Planetoid


def parse_cora():
    path = 'dataset/cora/cora'
    # Each row of cora.content is: <paper_id> <binary word features...> <class_label>
    idx_features_labels = np.genfromtxt(
        "{}.content".format(path), dtype=np.dtype(str))
    data_X = idx_features_labels[:, 1:-1].astype(np.float32)
    labels = idx_features_labels[:, -1]
    # Map the seven Cora class names to integer labels.
    class_map = {x: i for i, x in enumerate(['Case_Based', 'Genetic_Algorithms', 'Neural_Networks',
                                             'Probabilistic_Methods', 'Reinforcement_Learning',
                                             'Rule_Learning', 'Theory'])}
    data_Y = np.array([class_map[l] for l in labels])
    # Original citation IDs (first column of cora.content); node i in the
    # processed dataset corresponds to paper data_citeid[i].
    data_citeid = idx_features_labels[:, 0]
    idx = np.array(data_citeid, dtype=np.dtype(str))
    idx_map = {j: i for i, j in enumerate(idx)}
    # cora.cites lists directed citation pairs of paper IDs; map them to
    # row indices, drop pairs with endpoints missing from cora.content,
    # then symmetrize and deduplicate to get an undirected edge list.
    edges_unordered = np.genfromtxt(
        "{}.cites".format(path), dtype=np.dtype(str))
    edges = np.array(list(map(idx_map.get, edges_unordered.flatten()))).reshape(
        edges_unordered.shape)
    data_edges = np.array(edges[~(edges == None).max(1)], dtype='int')
    data_edges = np.vstack((data_edges, np.fliplr(data_edges)))
    data_edges = np.unique(data_edges, axis=0).transpose()
    num_classes = len(class_map)
    # load data: the PyG Planetoid copy of Cora is used only as a container;
    # its features, labels, and edges are replaced below with the versions
    # parsed from cora.content, so node IDs follow the cora.content row
    # order rather than PyG's default ordering.
    data_name = 'cora'
    dataset = Planetoid('dataset/cora', data_name,
                        transform=T.NormalizeFeatures())
    data = dataset[0]
    data.x = torch.tensor(data_X).float()
    data.edge_index = torch.tensor(data_edges).long()
    data.y = torch.tensor(data_Y).long()
    data.num_nodes = len(data_Y)
    data.num_classes = num_classes

    # split data: random 60/20/20 train/val/test split
    node_id = np.arange(data.num_nodes)
    np.random.shuffle(node_id)

    data.train_id = np.sort(node_id[:int(data.num_nodes * 0.6)])
    data.val_id = np.sort(
        node_id[int(data.num_nodes * 0.6):int(data.num_nodes * 0.8)])
    data.test_id = np.sort(node_id[int(data.num_nodes * 0.8):])

    data.train_mask = torch.tensor(
        [x in data.train_id for x in range(data.num_nodes)])
    data.val_mask = torch.tensor(
        [x in data.val_id for x in range(data.num_nodes)])
    data.test_mask = torch.tensor(
        [x in data.test_id for x in range(data.num_nodes)])

    # text: the McCallum "papers" file is tab-separated (<paper_id>\t<filename>);
    # build a paper-ID -> extraction-filename map, patching a few filenames
    # that are inconsistent in the dump.
    with open('dataset/cora/mccallum/cora/papers') as f:
        lines = f.readlines()
    file_bug_dict = {"http_##www.cs.ucc.ie#~dgb#papers#ICCBR2.ps.Z": "http_##www.cs.ucc.ie#~dgb#papers#iccbr2.ps.Z",
                     "http_##www.cs.ucl.ac.uk#staff#t.yu#pgp.new.ps": "http_##www.cs.ucl.ac.uk#staff#t.yu#pgp.ps",
                     "http_##www.cs.ucl.ac.uk#staff#t.yu#ep97.ps": "http_##www.cs.ucl.ac.uk#staff#T.Yu#ep97.ps"}
    pid_filename = {}
    for line in lines:
        pid = line.split('\t')[0]
        fn = line.split('\t')[1].replace(':', '_')
        if fn in file_bug_dict:
            fn = file_bug_dict[fn]
        pid_filename[pid] = fn

    # Read each paper's extraction file and keep its Title/Abstract/Keyword
    # lines (empty string when a field is missing).
    path = 'dataset/cora/mccallum/cora/extractions/'
    text = []
    titles = []
    abs = []
    keys = []
    for pid in data_citeid:
        fn = pid_filename[pid]
        with open(path + fn) as f:
            lines = f.read().splitlines()

        ti = ""
        ab = ""
        key = ""
        for line in lines:
            if 'Title:' in line:
                ti = line
            if 'Abstract:' in line:
                ab = line
            if 'Keyword:' in line:
                key = line
        text.append(f"{ti}\t{ab}")
        titles.append(ti)
        abs.append(ab)
        keys.append(key)
    data.raw_texts = text
    data.titles = titles
    data.abstracts = abs
    data.keywords = keys
    data.label_texts = ['Case_Based', 'Genetic_Algorithms', 'Neural_Networks',
                        'Probabilistic_Methods', 'Reinforcement_Learning',
                        'Rule_Learning', 'Theory']
    torch.save(data, "dataset/cora/processed_data.pt")

    return data, text
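
For reference, the node IDs in the processed file are simply the row order of cora.content: node i corresponds to the paper whose original citation ID is data_citeid[i]. Here is a minimal sketch of how one might recover that mapping, assuming the same dataset/cora/cora.content file is on disk (build_id_map is just an illustrative name, not something in this repo):

import numpy as np

def build_id_map(content_path='dataset/cora/cora.content'):
    # The first column of cora.content holds the original citation IDs;
    # the processed node IDs are simply the row indices of this file.
    paper_ids = np.genfromtxt(content_path, dtype=np.dtype(str))[:, 0]
    node_to_paper = dict(enumerate(paper_ids))
    paper_to_node = {pid: i for i, pid in enumerate(paper_ids)}
    return node_to_paper, paper_to_node

node_to_paper, paper_to_node = build_id_map()
print(node_to_paper[0])  # original citation ID of processed node 0

The reverse map (paper_to_node) lets you look up where a given citation ID landed in the processed dataset.
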
Heinz217 commented 1 month ago

Thank you for your reply! That's very kind of you. The code you provided has resolved my confusion and has been very helpful to me.