yao8839836 / text_gcn

Graph Convolutional Networks for Text Classification. AAAI 2019

Test set has no labels #128

Open yysirs opened 3 years ago

yysirs commented 3 years ago

Hi, the test set has no labels, so I can't include it when building the graph. Is it OK to build the graph using only the training data? Would that have a big impact on the results?

yysirs commented 3 years ago

Or should I give the test set pseudo-labels, then build the graph and run prediction?

yao8839836 commented 3 years ago

@yysirs

Hi, yes. Build the graph using a default label for the test documents, then run prediction to get the probabilities + predicted labels.

yysirs commented 3 years ago

For example: with 4 classes in total, [1, 2, 3, 4], set the pseudo-label of every test example to 1, then build the graph and run training and prediction.
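A minimal sketch of that pseudo-labeling step (illustrative only, not code from this repo; it assumes tab-separated test lines of the form id<TAB>content, and the file names are hypothetical):

# pseudo_label_test.py -- illustrative sketch only
PSEUDO_LABEL = '1'  # placeholder; any label that exists in the training label set works

# Append the placeholder label to every test line so train and test share the
# same "id<TAB>content<TAB>label" layout when the graph is built.
with open('test.txt', 'r', encoding='utf-8') as fin, \
        open('test_pseudo.txt', 'w', encoding='utf-8') as fout:
    for line in fin:
        doc_id, content = line.strip().split('\t')
        fout.write(f'{doc_id}\t{content}\t{PSEUDO_LABEL}\n')

Since the training loss is masked to training documents only, the placeholder value should not influence training; it only makes the reported test accuracy meaningless until real labels are available.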

yao8839836 commented 3 years ago

@yysirs Right. In that case you can only look at the val accuracy; the test accuracy can only be verified once you have the true labels.

yysirs commented 3 years ago

OK, I'll give it a try. Thanks for the reply.

yysirs commented 3 years ago

Hi, I tried training on another dataset, but found that the train acc and val acc both reach 100% almost immediately. I compared my build_graph.py against the original and did not find any differences. Where might the problem be? (screenshot attached)

yysirs commented 3 years ago

model.py

#!/usr/bin/env python
import torch
import torch.nn as nn

class GraphConvolution(nn.Module):
    def __init__(self, input_dim, output_dim, support,
                 act_func=None, featureless=False, dropout_rate=0.,
                 bias=False):
        super(GraphConvolution, self).__init__()
        self.support = support
        self.featureless = featureless

        for i in range(len(self.support)):
            setattr(self, 'W{}'.format(i), nn.Parameter(torch.randn(input_dim, output_dim)))

        if bias:
            self.b = nn.Parameter(torch.zeros(1, output_dim))

        self.act_func = act_func
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        x = self.dropout(x)

        for i in range(len(self.support)):
            if self.featureless:
                # featureless: the input is an identity matrix, so x @ W reduces to W itself
                pre_sup = getattr(self, 'W{}'.format(i))
            else:
                pre_sup = x.mm(getattr(self, 'W{}'.format(i)))

            if i == 0:
                out = self.support[i].mm(pre_sup)
            else:
                out += self.support[i].mm(pre_sup)

        if self.act_func is not None:
            out = self.act_func(out)

        self.embedding = out
        return out

class GCN(nn.Module):
    def __init__(self, input_dim, support, dropout_rate=0., num_classes=35):
        super(GCN, self).__init__()

        # GraphConvolution
        self.layer1 = GraphConvolution(input_dim, 200, support, act_func=nn.ReLU(), featureless=True,
                                       dropout_rate=dropout_rate)
        self.layer2 = GraphConvolution(200, num_classes, support, dropout_rate=dropout_rate)

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        return out

yysirs commented 3 years ago

train.py

from __future__ import division
from __future__ import print_function
from sklearn import metrics
import time
import sys
import os
import torch
import torch.nn as nn

import numpy as np

from utils import *
from gcn import GCN

from config import CONFIG
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
cfg = CONFIG()

if len(sys.argv) != 2:
    sys.exit("Use: python train.py <dataset>")

datasets = ['20ng', 'R8', 'R52', 'ohsumed', 'mr', 'r8', 'risk']
dataset = sys.argv[1]
label_list = ['5-24', '6-34', '1-1', '6-8', '10-26', '2-3', '5-22', '6-28', '8-18', '1-4', '2-6', '6-21', '7-16', '6-29', '6-20', 
              '6-15', '6-13', '9-23', '5-35', '2-33', '5-30', '1-9', '8-27', '1-10', '6-19', '3-5', '2-2', '4-7', '2-17', '5-12', 
              '6-32', '6-31', '2-25', '2-11', '2-14']

class_list = [x.strip()
              for x in open('/data1/liushu/risk_data_grand/Text_GCN/data/labels.txt', encoding='utf8').readlines()]

if dataset not in datasets:
    sys.exit("wrong dataset name")
cfg.dataset = dataset

# Set random seed
seed = 1
np.random.seed(seed)
torch.manual_seed(seed)

# Load data
adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, train_size, test_size = load_corpus(cfg.dataset)

features = sp.identity(features.shape[0])  # featureless

# Some preprocessing
features = preprocess_features(features)
if cfg.model == 'gcn':
    support = [preprocess_adj(adj)]
    num_supports = 1
    model_func = GCN
elif cfg.model == 'gcn_cheby':
    support = chebyshev_polynomials(adj, cfg.max_degree)
    num_supports = 1 + cfg.max_degree
    model_func = GCN
else:
    raise ValueError('Invalid argument for model: ' + str(cfg.model))

# Define placeholders
t_features = torch.from_numpy(features)
t_y_train = torch.from_numpy(y_train)
t_y_val = torch.from_numpy(y_val)
t_y_test = torch.from_numpy(y_test)
t_train_mask = torch.from_numpy(train_mask.astype(np.float32))
tm_train_mask = torch.transpose(torch.unsqueeze(
    t_train_mask, 0), 1, 0).repeat(1, y_train.shape[1])
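# tm_train_mask has shape (num_nodes, num_classes); rows for nodes outside the
# training split are all zeros, so their logits are zeroed out in the training loss below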

t_support = []
for i in range(len(support)):
    t_support.append(torch.Tensor(support[i]))

model = model_func(input_dim=features.shape[0], support=t_support, num_classes=y_train.shape[1])

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate': cfg.weight_decay},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.0}
]
# optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=cfg.learning_rate, eps=1e-8)
optimizer = torch.optim.Adam(model.parameters(), lr=cfg.learning_rate)

# Define model evaluation function
def evaluate(features, labels, mask):
    t_test = time.time()
    model.eval()
    with torch.no_grad():
        logits = model(features)
        t_mask = torch.from_numpy(np.array(mask * 1., dtype=np.float32))
        tm_mask = torch.transpose(torch.unsqueeze(
            t_mask, 0), 1, 0).repeat(1, labels.shape[1])
        loss = criterion(logits * tm_mask, torch.max(labels, 1)[1])
        pred = torch.max(logits, 1)[1]
        acc = ((pred == torch.max(labels, 1)[1]).float(
        ) * t_mask).sum().item() / t_mask.sum().item()

    return loss.numpy(), acc, pred.numpy(), labels.numpy(), (time.time() - t_test)

val_losses = []

# Train model
for epoch in range(cfg.epochs):

    t = time.time()

    # Forward pass
    logits = model(t_features)
    loss = criterion(logits * tm_train_mask, torch.max(t_y_train, 1)[1])
    acc = ((torch.max(logits, 1)[1] == torch.max(t_y_train, 1)[
        1]).float() * t_train_mask).sum().item() / t_train_mask.sum().item()

    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Validation
    val_loss, val_acc, pred, labels, duration = evaluate(
        t_features, t_y_val, val_mask)
    val_losses.append(val_loss)

    print(f"Epoch: {epoch+1:.0f}, train_loss: {loss:.4f}, train_acc: {acc:.2%}, val_loss: {val_loss:.4f}, val_acc: {val_acc:.2%}")

    if epoch > cfg.early_stopping and val_losses[-1] > np.mean(val_losses[-(cfg.early_stopping + 1):-1]):
        print("Early stopping...")
        break

test_loss, test_acc, pred, labels, test_duration = evaluate(t_features, t_y_test, test_mask)
# Testing
test_pred = []
test_labels = []
for i in range(len(test_mask)):
    if test_mask[i]:
        test_pred.append(pred[i])
        test_labels.append(np.argmax(labels[i]))

print(metrics.classification_report(
    test_labels, test_pred, digits=4, zero_division=0, target_names=class_list))

yysirs commented 3 years ago

build_graph.py

"""
@file: build the document-word graph
"""
import random
import numpy as np
import pickle as pkl
import scipy.sparse as sp
from math import log
from nltk.corpus import wordnet as wn
import sys
random.seed(531)

word_embedding_dim = 300
word_vector_map = {}
doc_name_list = []     # labels of all documents
doc_train_list = []    # train labels
doc_test_list = []     # test labels
doc_content_list = []  # contents of all documents

with open('/data1/liushu/risk_data_grand/data/train.txt', 'r', encoding='utf-8') as f:
    for line_id, line in enumerate(f):
        id, sent_a, tgt = line.strip().split('\t')
        doc_train_list.append(tgt)
        doc_name_list.append(tgt)
        doc_content_list.append(sent_a)

with open('/data1/liushu/risk_data_grand/data/test.txt', 'r', encoding='utf-8') as f:
    for line_id, line in enumerate(f):
        id, sent_a = line.strip().split('\t')
        doc_test_list.append('0')
        doc_name_list.append('0')
        doc_content_list.append(sent_a)

train_ids = []
for train_name in doc_train_list:
    train_id = doc_name_list.index(train_name)
    train_ids.append(train_id)
print(len(train_ids))
random.shuffle(train_ids)

train_ids_str = '\n'.join(str(index) for index in train_ids)
f = open('/data1/liushu/risk_data_grand/Text_GCN/data/risk.train.index','w',encoding='utf-8')
f.write(train_ids_str)
f.close()

test_ids = []
for test_name in doc_test_list:
    test_id = doc_test_list.index(test_name)
    test_ids.append(test_id)
print(len(test_ids))
random.shuffle(test_ids)

test_ids_str = '\n'.join(str(index) for index in test_ids)
f = open('/data1/liushu/risk_data_grand/Text_GCN/data/test.index','w',encoding='utf-8')
f.write(test_ids_str)
f.close()

ids = train_ids + test_ids
shuffle_doc_name_list = []   # all labels, in shuffled order
shuffle_doc_words_list = []  # all document contents, in shuffled order
for id in ids:
    shuffle_doc_name_list.append(doc_name_list[int(id)])
    shuffle_doc_words_list.append(doc_content_list[int(id)])
shuffle_doc_name_str = '\n'.join(shuffle_doc_name_list)
shuffle_doc_words_str = '\n'.join(shuffle_doc_words_list)

f = open('/data1/liushu/risk_data_grand/Text_GCN/data/shuffle_label.txt','w',encoding='utf-8')
f.write(shuffle_doc_name_str)
f.close()

f = open('/data1/liushu/risk_data_grand/Text_GCN/data/shuffle_item.txt','w',encoding='utf-8')
f.write(shuffle_doc_words_str)
f.close()

# build vocab
word_freq = {}
word_set = set()
for doc_words in shuffle_doc_words_list:
    words = doc_words.split()
    for word in words:
        word_set.add(word)
        if word in word_freq:
            word_freq[word] += 1
        else:
            word_freq[word] = 1

vocab = list(word_set)
vocab_size = len(vocab)

word_doc_list = {}

for i in range(len(shuffle_doc_words_list)):
    doc_words = shuffle_doc_words_list[i]
    words = doc_words.split()
    appeared = set()
    for word in words:
        if word in appeared:
            continue
        if word in word_doc_list:
            doc_list = word_doc_list[word]
            doc_list.append(i)
            word_doc_list[word] = doc_list
        else:
            word_doc_list[word] = [i]
        appeared.add(word)

word_doc_freq = {}
for word, doc_list in word_doc_list.items():
    word_doc_freq[word] = len(doc_list)

word_id_map = {}
for i in range(vocab_size):
    word_id_map[vocab[i]] = i

vocab_str = '\n'.join(vocab)

f = open('/data1/liushu/risk_data_grand/Text_GCN/data/vocab.txt', 'w')
f.write(vocab_str)
f.close()

# label list
label_set = set()
for doc_meta in shuffle_doc_name_list:
    temp = doc_meta.split('\t')
    # label_set.add(temp[2])
    label_set.add(temp[0])
label_list = list(label_set)
label_list_str = '\n'.join(label_list)
f = open('/data1/liushu/risk_data_grand/Text_GCN/data/labels.txt', 'w')
f.write(label_list_str)
f.close()

# x: feature vectors of training docs, no initial features
# select 90% of the training set as the real training split
train_size = len(train_ids)
val_size = int(0.1 * train_size)
real_train_size = train_size - val_size  # - int(0.5 * train_size)
# different training rates

real_train_doc_names = shuffle_doc_name_list[:real_train_size]
real_train_doc_names_str = '\n'.join(real_train_doc_names)

f = open('/data1/liushu/risk_data_grand/Text_GCN/data/real_train.name', 'w')
f.write(real_train_doc_names_str)
f.close()

row_x = []
col_x = []
data_x = []

for i in range(real_train_size):
    doc_vec = np.array([0.0 for k in range(word_embedding_dim)])
    doc_words = shuffle_doc_words_list[i]
    words = doc_words.split()
    doc_len = len(words)
    for word in words:
        if word in word_vector_map:
            word_vector = word_vector_map[word]
            # print(doc_vec)
            # print(np.array(word_vector))
            doc_vec = doc_vec + np.array(word_vector)

    for j in range(word_embedding_dim):
        row_x.append(i)
        col_x.append(j)
        data_x.append(doc_vec[j]/doc_len)

x = sp.csr_matrix((data_x,(row_x,col_x)),shape=(real_train_size,word_embedding_dim))

y = []  # y.shape = [real_train_size, len(label_list)]
for i in range(real_train_size):
    doc_meta = shuffle_doc_name_list[i]
    temp = doc_meta.split('\t')
    one_hot = [0 for l in range(len(label_list))] # label_list 35
    # for label in temp[2:]:
    for label in temp:
        label_index = label_list.index(label)
        one_hot[label_index] =1
    y.append(one_hot)
y = np.array(y)

test_size = len(test_ids)
row_tx = []
col_tx = []
data_tx = []

for i in range(test_size):
    doc_vec = np.array([0.0 for k in range(word_embedding_dim)])
    doc_words = shuffle_doc_words_list[i+train_size]
    words = doc_words.split()
    doc_len = len(words)
    for word in words:
        if word in word_vector_map:
            word_vector = word_vector_map[word]
            # print(doc_vec)
            # print(np.array(word_vector))
            doc_vec = doc_vec + np.array(word_vector)

    for j in range(word_embedding_dim):
        row_tx.append(i)
        col_tx.append(j)
        data_tx.append(doc_vec[j]/doc_len)

tx = sp.csr_matrix((data_tx,(row_tx,col_tx)),shape=(test_size,word_embedding_dim))

ty = []  # ty.shape = [test_size, len(label_list)]
for i in range(test_size):
    doc_meta = shuffle_doc_name_list[i+train_size]
    temp = doc_meta.split('\t')
    one_hot = [0 for l in range(len(label_list))]
    # for label in temp[2:]:
    for label in temp:
        label_index = label_list.index(label)
        one_hot[label_index] =1
    ty.append(one_hot)
ty = np.array(ty)

word_vectors = np.random.uniform(-0.01, 0.01,(vocab_size, word_embedding_dim))

for i in range(len(vocab)):
    word = vocab[i]
    if word in word_vector_map:
        vector = word_vector_map[word]
        word_vectors[i] = vector

# allx: rows for all train documents followed by rows for the word (vocab) nodes
row_allx = []
col_allx = []
data_allx = []

for i in range(train_size):
    doc_vec = np.array([0.0 for k in range(word_embedding_dim)])
    doc_words = shuffle_doc_words_list[i]
    words = doc_words.split()
    doc_len = len(words)
    for word in words:
        if word in word_vector_map:
            word_vector = word_vector_map[word]
            # print(doc_vec)
            # print(np.array(word_vector))
            doc_vec = doc_vec + np.array(word_vector)

    for j in range(word_embedding_dim):
        row_allx.append(i)
        col_allx.append(j)
        data_allx.append(doc_vec[j]/doc_len)

for i in range(vocab_size):
    for j in range(word_embedding_dim):
        row_allx.append(int(i + train_size))  # word node rows are appended after the train doc rows
        col_allx.append(j)
        data_allx.append(word_vectors.item((i, j)))

row_allx = np.array(row_allx)
col_allx = np.array(col_allx)
data_allx = np.array(data_allx)

allx = sp.csr_matrix(
    (data_allx, (row_allx, col_allx)), shape=(train_size + vocab_size, word_embedding_dim))

ally = []
for i in range(train_size):
    doc_meta = shuffle_doc_name_list[i]
    temp = doc_meta.split('\t')
    one_hot = [0 for l in range(len(label_list))]
    # for label in temp[2:]:
    for label in temp:
        label_index = label_list.index(label)
        one_hot[label_index] = 1
    ally.append(one_hot)

for i in range(vocab_size):
    one_hot = [0 for l in range(len(label_list))]
    # to give word nodes different labels, change them here; in experiments this did not affect the results
    ally.append(one_hot)

ally = np.array(ally)
# (18288, 256) (18288, 95) (2257, 256) (2257, 95) (69440, 256) (69440, 95)
print(x.shape, y.shape, tx.shape, ty.shape, allx.shape, ally.shape)
# all feature matrices have been built

'''
Doc word heterogeneous graph
'''
window_size = 20
windows = [] # windows = [[window_1],[window_2],...[window_n]]

for doc_words in shuffle_doc_words_list:
    words = doc_words.split()
    length = len(words)
    if length <= window_size:
        windows.append(words)
    else:
        for j in range(length - window_size + 1):
            window = words[j: j + window_size]
            windows.append(window)

word_window_freq = {}  # word -> number of windows in which the word appears, e.g. {'的': 20, '是': 18}
for window in windows:
    appeared = set()
    for i in range(len(window)):
        if window[i] in appeared:
            continue
        if window[i] in word_window_freq:
            word_window_freq[window[i]] += 1
        else:
            word_window_freq[window[i]] = 1
        appeared.add(window[i])

word_pair_count = {}  # 'word_i_id,word_j_id' -> number of windows in which the pair co-occurs, e.g. {'0,1': 10}
for window in windows:
    for i in range(1, len(window)):
        for j in range(0, i):
            word_i = window[i]
            word_i_id = word_id_map[word_i]
            word_j = window[j]
            word_j_id = word_id_map[word_j]
            if word_i_id == word_j_id:
                continue
            word_pair_str = str(word_i_id) + ',' + str(word_j_id)
            if word_pair_str in word_pair_count:
                word_pair_count[word_pair_str] += 1
            else:
                word_pair_count[word_pair_str] = 1
            # two orders
            word_pair_str = str(word_j_id) + ',' + str(word_i_id)
            if word_pair_str in word_pair_count:
                word_pair_count[word_pair_str] += 1
            else:
                word_pair_count[word_pair_str] = 1

row = []
col = []
weight = []

# pmi as weights
num_window = len(windows)
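# PMI(i, j) = log( p(i, j) / (p(i) * p(j)) ), estimated from sliding windows:
#   p(i, j) = #windows containing both i and j / #windows
#   p(i)    = #windows containing i / #windows
# Only pairs with positive PMI are kept as word-word edge weights below.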

for key in word_pair_count:
    temp = key.split(',')
    i = int(temp[0])
    j = int(temp[1])
    count = word_pair_count[key]
    word_freq_i = word_window_freq[vocab[i]]
    word_freq_j = word_window_freq[vocab[j]]
    pmi = log((1.0 * count / num_window) /
              (1.0 * word_freq_i * word_freq_j/(num_window * num_window)))
    if pmi <= 0:
        continue
    row.append(train_size + i)  # the first train_size indices are reserved for document nodes
    col.append(train_size + j)  # consistent with the feature layout: document nodes first, then word-word adjacency
    weight.append(pmi)

# word vector cosine similarity as weights

'''
for i in range(vocab_size):
    for j in range(vocab_size):
        if vocab[i] in word_vector_map and vocab[j] in word_vector_map:
            vector_i = np.array(word_vector_map[vocab[i]])
            vector_j = np.array(word_vector_map[vocab[j]])
            similarity = 1.0 - cosine(vector_i, vector_j)
            if similarity > 0.9:
                print(vocab[i], vocab[j], similarity)
                row.append(train_size + i)
                col.append(train_size + j)
                weight.append(similarity)
'''

# doc word frequency
doc_word_freq = {}  # 'doc_id,word_id' -> frequency of the word in that document (repeated occurrences are counted)
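# The doc-word edge weight used below is TF * IDF:
#   TF  = number of times the word occurs in the document
#   IDF = log( #documents / #documents containing the word )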

for doc_id in range(len(shuffle_doc_words_list)):
    doc_words = shuffle_doc_words_list[doc_id]
    words = doc_words.split()
    for word in words:
        word_id = word_id_map[word]
        doc_word_str = str(doc_id) + ',' + str(word_id)
        if doc_word_str in doc_word_freq:
            doc_word_freq[doc_word_str] += 1
        else:
            doc_word_freq[doc_word_str] = 1

for i in range(len(shuffle_doc_words_list)):
    doc_words = shuffle_doc_words_list[i]
    words = doc_words.split()
    doc_word_set = set()
    for word in words:
        if word in doc_word_set:
            continue
        j = word_id_map[word]
        key = str(i) + ',' + str(j)
        freq = doc_word_freq[key]
        if i < train_size:
            row.append(i)
        else:
            row.append(i + vocab_size)
        col.append(train_size + j)
        idf = log(1.0 * len(shuffle_doc_words_list) /
                  word_doc_freq[vocab[j]])
        weight.append(freq * idf)
        doc_word_set.add(word)

node_size = train_size + vocab_size + test_size
adj = sp.csr_matrix(
    (weight, (row, col)), shape=(node_size, node_size))

dataset = 'risk'
# dump objects
f = open("data/ind.{}.x".format(dataset), 'wb')
pkl.dump(x, f)
f.close()

f = open("data/ind.{}.y".format(dataset), 'wb')
pkl.dump(y, f)
f.close()

f = open("data/ind.{}.tx".format(dataset), 'wb')
pkl.dump(tx, f)
f.close()

f = open("data/ind.{}.ty".format(dataset), 'wb')
pkl.dump(ty, f)
f.close()

f = open("data/ind.{}.allx".format(dataset), 'wb')
pkl.dump(allx, f)
f.close()

f = open("data/ind.{}.ally".format(dataset), 'wb')
pkl.dump(ally, f)
f.close()

f = open("data/ind.{}.adj".format(dataset), 'wb')
pkl.dump(adj, f)
f.close()

yysirs commented 3 years ago

Hope you can find time to take a look. Many thanks!

yysirs commented 3 years ago

The train data format is:

id,content,label

The test data format is:

id,content
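
For reference, a minimal sketch of reading these two files (hypothetical helpers, matching the tab-separated parsing used in build_graph.py above; paths are placeholders):

# read_corpus.py -- illustrative sketch only
def read_train(path):
    # each train line: id <tab> content <tab> label
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            doc_id, content, label = line.strip().split('\t')
            yield doc_id, content, label

def read_test(path):
    # each test line: id <tab> content (no label yet)
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            doc_id, content = line.strip().split('\t')
            yield doc_id, content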