649453932 / Chinese-Text-Classification-Pytorch

中文文本分类,TextCNN,TextRNN,FastText,TextRCNN,BiLSTM_Attention,DPCNN,Transformer,基于pytorch,开箱即用。
MIT License
5.29k stars 1.23k forks source link

自己写的预测类,输入文本即可得出结果,但是要用tensorflow才行,怎么把tensorflow pad那个删除? #72

Open futureflsl opened 3 years ago

futureflsl commented 3 years ago

`自己写的预测类,输入文本即可得出结果,但是要用tensorflow才行,怎么把tensorflow pad那个删除?

下面写的预测类代码在TextCNN测试通过,但是在 kr.preprocessing.sequence.pad_sequences这个用到了keras代码,怎么把这个换成pytorch的或者numpy的?请大神指教,大家一起交流下 from tensorflow import keras as kr import torch.nn as nn from torch.autograd import Variable import os import torch import numpy as np from importlib import import_module

class InferManager(object):
    """Load a trained text-classification model and hold it ready for inference.

    Reconstructed from a markdown-mangled paste: the original rendered
    `class` and `def __init__` on one line and stripped the dunder
    underscores from `__init__`.
    """

    def __init__(self, weights):
        """Build config, vocab and model, then load `weights` (a .ckpt path)."""
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        dataset = 'dataset' + os.sep + 'mydata'  # dataset root directory
        # Sogou news: embedding_SougouNews.npz, Tencent: embedding_Tencent.npz,
        # random init: 'random'
        args_embedding = 'pre_trained'
        args_word = False
        embedding = 'embedding_SougouNews.npz'
        if args_embedding == 'random':
            embedding = 'random'
        model_name = 'TextCNN'  # TextCNN, TextRNN, FastText, TextRCNN, TextRNN_Att, DPCNN, Transformer
        if model_name == 'FastText':
            from utils_fasttext import build_dataset, build_iterator, get_time_dif
            embedding = 'random'
        else:
            from utils import build_dataset, build_iterator, get_time_dif

        x = import_module('models.' + model_name)
        config = x.Config(dataset, embedding)
        np.random.seed(1)
        torch.manual_seed(1)
        torch.cuda.manual_seed_all(1)
        torch.backends.cudnn.deterministic = True  # make every run reproducible
        print("Loading data...")
        self.vocab, train_data, dev_data, test_data = build_dataset(config, args_word)
        config.n_vocab = len(self.vocab)
        self.model = x.Model(config).to(self.device)
        if model_name != 'Transformer':
            self.init_network(self.model)
        # NOTE(review): prints the bound method object, not the parameters;
        # kept as in the original paste.
        print(self.model.parameters)
        self.model.load_state_dict(torch.load(weights))
        self.model.eval()

# 权重初始化,默认xavier
def init_network(self, model, method='xavier', exclude='embedding', seed=123):
    """Initialize `model`'s parameters (Xavier-normal by default).

    Parameters whose names contain `exclude` are left untouched; biases
    are zeroed; remaining weights use `method` ('xavier', 'kaiming', or
    normal for anything else).  `seed` is accepted but unused, matching
    the original signature.
    """
    weight_fns = {
        'xavier': nn.init.xavier_normal_,
        'kaiming': nn.init.kaiming_normal_,
    }
    init_weight = weight_fns.get(method, nn.init.normal_)
    for name, param in model.named_parameters():
        if exclude in name:
            continue
        if 'weight' in name:
            init_weight(param)
        elif 'bias' in name:
            nn.init.constant_(param, 0)

def load_labels(self, name_file):
    """Read `name_file` and return its lines (one label per line) as a list.

    Trailing newlines at end-of-file are stripped before splitting, so no
    empty final entry is produced.
    """
    with open(name_file, 'r') as handle:
        content = handle.read()
    return content.rstrip('\n').split('\n')

def inference(self, text, labels_list):
    """Classify `text` (char-level) and return the best label from `labels_list`.

    Fix: the original called Keras' `kr.preprocessing.sequence.pad_sequences(
    [data], 32)`, dragging in TensorFlow just for padding.  This version
    reproduces that call's documented defaults — left-truncate to 32, left-pad
    with 0 — in plain Python/NumPy.  The deprecated `torch.autograd.Variable`
    wrapper is also dropped (a no-op since PyTorch 0.4).
    """
    tokenizer = lambda x: [y for y in x]  # char-level tokenization
    content = tokenizer(text)
    # Map known characters to vocab ids; out-of-vocabulary chars are dropped.
    data = [self.vocab[x] for x in content if x in self.vocab]
    # Keras-equivalent padding: keep the LAST 32 ids, left-pad with zeros.
    data = data[-32:]
    data = [0] * (32 - len(data)) + data
    # Preserve the original (1, 1, 32) batch shape that was fed to the model.
    tests = [np.asarray([data])]
    test_batch = torch.LongTensor(np.asarray(tests)).to(self.device)
    with torch.no_grad():
        outputs = self.model(test_batch)
        predict = torch.max(outputs.data, 1)[1].cpu().numpy()
    return labels_list[predict[0]]

if __name__ == '__main__':
    # Reconstructed from the markdown-mangled paste: `__name__`/`'__main__'`
    # lost their double underscores, and the body lost its indentation.
    infer = InferManager('./dataset/mydata/saved_dict/TextCNN.ckpt')
    labels_list = infer.load_labels('./dataset/mydata/data/class.txt')
    text = 'Edmund上周通过Steam社区向玩家们分享了更多有关《以撒的结合:重生》“真·合作模式(True co-op)”的相关信息,一起来了解一下。根据官方介绍,“真·合作模式”目前仅支持本地联机,可供2-4人游玩,不过玩家们可以使用Steam“远程同乐”功能在线实现“本地联机”。E胖介绍的联机玩法:1、如果一名玩家控制的角色在过程中死亡,他们会以弱化的幽灵婴儿形式重生,这些幽灵宝宝只能造成极少的伤害。在当层的boss被击败时,幽灵宝宝才会真正复活(复活时只有半颗心,保有此前拥有的所有道具)。2、当boss被击败时会掉落一个boss道具,一名玩家拾取后还会再生成另一个道具,供下一位玩家拾取。但其他道具都只能单次生成,所以玩家们得分享道具。3、游戏中的资源(硬币、炸弹、钥匙)是共享的,但每个玩家都有自己的HP和道具。4、一旦所有非幽灵状态的玩家死亡,游戏就会结束。E胖表示目前游戏团队正在对合作模式进行调整和打磨,4月1日时该模式将正式发布。敬请期待。更多相关资讯请关注:以撒的结合:重生专区'
    result = infer.inference(text, labels_list)
    print(result)

`

empowerszc commented 3 years ago

我参考作者另一个基于bert分类项目的评论区写了个,分享一下

import torch
import numpy as np
from importlib import import_module
import argparse
import os
import pickle as pkl

# --- CLI arguments -------------------------------------------------------
parser = argparse.ArgumentParser(description="Classification based Transformer")
parser.add_argument("--model", type=str, default="TextCNN")
parser.add_argument("--dataset", type=str, default="THUCNews")
parser.add_argument("--text", type=str)
# NOTE(review): argparse `type=bool` turns ANY non-empty string into True
# (so `--use_word False` still means True); kept as-is for CLI compatibility.
parser.add_argument('--use_word', default=False, type=bool, help='True for word, False for char')
parser.add_argument('--embedding', default='random', type=str, help='random or pre_trained')
args = parser.parse_args()

UNK, PAD = '<UNK>', '<PAD>'  # unknown-token and padding symbols
dataset_name = args.dataset
# THUCNews: class-index -> label-name mapping
if dataset_name == "THUCNews":
    key = {
        0: 'finance',
        1: 'realty',
        2: 'stocks',
        3: 'education',
        4: 'science',
        5: 'society',
        6: 'politics',
        7: 'sports',
        8: 'game',
        9: 'entertainment'
    }

model_name = args.model  # TextCNN, TextRNN, FastText, TextRCNN, TextRNN_Att, DPCNN, Transformer
x = import_module('models.' + model_name)

# Sogou news: embedding_SougouNews.npz, Tencent: embedding_Tencent.npz, random init: random
embedding = args.embedding
if model_name == 'FastText':
    from utils_fasttext import build_dataset, build_iterator, get_time_dif
    embedding = 'random'
config = x.Config(dataset_name, embedding)
if os.path.exists(config.vocab_path):
    vocab = pkl.load(open(config.vocab_path, 'rb'))
    config.n_vocab = len(vocab)

model = x.Model(config).to(config.device)
# Fix: map checkpoint tensors onto the configured device instead of forcing
# 'cuda', which raises on CPU-only machines and is redundant otherwise.
model.load_state_dict(torch.load(config.save_path, map_location=config.device))
model.eval()

def build_predict_text(text, use_word):
    """Turn raw `text` into model inputs: (token-id tensor, seq-length tensor).

    Tokens are padded with PAD / truncated to `config.pad_size`; unknown
    tokens fall back to the UNK id.
    """
    if use_word:
        tokenizer = lambda x: x.split(' ')  # word-level, space separated
    else:
        tokenizer = lambda x: [y for y in x]  # char-level

    token = tokenizer(text)
    seq_len = len(token)
    pad_size = config.pad_size
    if pad_size:
        if len(token) < pad_size:
            token.extend([PAD] * (pad_size - len(token)))
        else:
            token = token[:pad_size]
            seq_len = pad_size

    words_line = []
    for word in token:
        words_line.append(vocab.get(word, vocab.get(UNK)))

    ids = torch.LongTensor([words_line]).to(config.device)
    # Fix: `torch.LongTensor(seq_len)` allocates an UNINITIALIZED tensor of
    # size `seq_len`; wrap the value in a list to actually store it.
    seq_len = torch.LongTensor([seq_len]).to(config.device)

    return ids, seq_len

def predict(text):
    """Classify `text` and return the matching label name from `key`."""
    inputs = build_predict_text(text, args.use_word)
    with torch.no_grad():
        logits = model(inputs)
        best = torch.argmax(logits)
    return key[int(best)]

if __name__ == "__main__":
    # Fix: the pasted version carried a stray trailing backtick from the
    # issue's markdown, which breaks the syntax.
    if args.text is None:
        # Default demo sentence when no --text is supplied.
        print(predict("备考2012高考作文必读美文50篇(一)"))
    else:
        print(predict(args.text))
prozyworld commented 3 years ago

我参考作者另一个基于bert分类项目的评论区写了个,分享一下

import torch
import numpy as np
from importlib import import_module
import argparse
import os
import pickle as pkl

# --- CLI arguments -------------------------------------------------------
parser = argparse.ArgumentParser(description="Classification based Transformer")
parser.add_argument("--model", type=str, default="TextCNN")
parser.add_argument("--dataset", type=str, default="THUCNews")
parser.add_argument("--text", type=str)
# NOTE(review): argparse `type=bool` turns ANY non-empty string into True
# (so `--use_word False` still means True); kept as-is for CLI compatibility.
parser.add_argument('--use_word', default=False, type=bool, help='True for word, False for char')
parser.add_argument('--embedding', default='random', type=str, help='random or pre_trained')
args = parser.parse_args()

UNK, PAD = '<UNK>', '<PAD>'  # unknown-token and padding symbols
dataset_name = args.dataset
# THUCNews: class-index -> label-name mapping
if dataset_name == "THUCNews":
    key = {
        0: 'finance',
        1: 'realty',
        2: 'stocks',
        3: 'education',
        4: 'science',
        5: 'society',
        6: 'politics',
        7: 'sports',
        8: 'game',
        9: 'entertainment'
    }

model_name = args.model  # TextCNN, TextRNN, FastText, TextRCNN, TextRNN_Att, DPCNN, Transformer
x = import_module('models.' + model_name)

# Sogou news: embedding_SougouNews.npz, Tencent: embedding_Tencent.npz, random init: random
embedding = args.embedding
if model_name == 'FastText':
    from utils_fasttext import build_dataset, build_iterator, get_time_dif
    embedding = 'random'
config = x.Config(dataset_name, embedding)
if os.path.exists(config.vocab_path):
    vocab = pkl.load(open(config.vocab_path, 'rb'))
    config.n_vocab = len(vocab)

model = x.Model(config).to(config.device)
# Fix: map checkpoint tensors onto the configured device instead of forcing
# 'cuda', which raises on CPU-only machines and is redundant otherwise.
model.load_state_dict(torch.load(config.save_path, map_location=config.device))
model.eval()

def build_predict_text(text, use_word):
    """Turn raw `text` into model inputs: (token-id tensor, seq-length tensor).

    Tokens are padded with PAD / truncated to `config.pad_size`; unknown
    tokens fall back to the UNK id.
    """
    if use_word:
        tokenizer = lambda x: x.split(' ')  # word-level, space separated
    else:
        tokenizer = lambda x: [y for y in x]  # char-level

    token = tokenizer(text)
    seq_len = len(token)
    pad_size = config.pad_size
    if pad_size:
        if len(token) < pad_size:
            token.extend([PAD] * (pad_size - len(token)))
        else:
            token = token[:pad_size]
            seq_len = pad_size

    words_line = []
    for word in token:
        words_line.append(vocab.get(word, vocab.get(UNK)))

    ids = torch.LongTensor([words_line]).to(config.device)
    # Fix: `torch.LongTensor(seq_len)` allocates an UNINITIALIZED tensor of
    # size `seq_len`; wrap the value in a list to actually store it.
    seq_len = torch.LongTensor([seq_len]).to(config.device)

    return ids, seq_len

def predict(text):
    """Classify `text` and return the matching label name from `key`."""
    inputs = build_predict_text(text, args.use_word)
    with torch.no_grad():
        logits = model(inputs)
        best = torch.argmax(logits)
    return key[int(best)]

if __name__ == "__main__":
    # Fix: the pasted version carried a stray trailing backtick from the
    # issue's markdown, which breaks the syntax.
    if args.text is None:
        # Default demo sentence when no --text is supplied.
        print(predict("备考2012高考作文必读美文50篇(一)"))
    else:
        print(predict(args.text))

怎样输出分数?

empowerszc commented 3 years ago

@prozyworld 不好意思,才注意到。输出的outputs体现了可能性,数值越大,是某个类别的可能性也越大。把它标准化到加和为1,所以用softmax函数处理下就可以。

import torch.nn.functional as F

def predict(text):
    """Classify `text`; return (label name, per-class softmax probabilities)."""
    inputs = build_predict_text(text, args.use_word)
    with torch.no_grad():
        logits = model(inputs)
        best = torch.argmax(logits)
        probs = F.softmax(logits, dim=1)
    return key[int(best)], probs

这里pred输出的就是文本是各个标签的概率了,pred的输出格式是cuda上的列表,你可以转为正常的。

我参考作者另一个基于bert分类项目的评论区写了个,分享一下

import torch
import numpy as np
from importlib import import_module
import argparse
import os
import pickle as pkl

# --- CLI arguments -------------------------------------------------------
parser = argparse.ArgumentParser(description="Classification based Transformer")
parser.add_argument("--model", type=str, default="TextCNN")
parser.add_argument("--dataset", type=str, default="THUCNews")
parser.add_argument("--text", type=str)
# NOTE(review): argparse `type=bool` turns ANY non-empty string into True
# (so `--use_word False` still means True); kept as-is for CLI compatibility.
parser.add_argument('--use_word', default=False, type=bool, help='True for word, False for char')
parser.add_argument('--embedding', default='random', type=str, help='random or pre_trained')
args = parser.parse_args()

UNK, PAD = '<UNK>', '<PAD>'  # unknown-token and padding symbols
dataset_name = args.dataset
# THUCNews: class-index -> label-name mapping
if dataset_name == "THUCNews":
    key = {
        0: 'finance',
        1: 'realty',
        2: 'stocks',
        3: 'education',
        4: 'science',
        5: 'society',
        6: 'politics',
        7: 'sports',
        8: 'game',
        9: 'entertainment'
    }

model_name = args.model  # TextCNN, TextRNN, FastText, TextRCNN, TextRNN_Att, DPCNN, Transformer
x = import_module('models.' + model_name)

# Sogou news: embedding_SougouNews.npz, Tencent: embedding_Tencent.npz, random init: random
embedding = args.embedding
if model_name == 'FastText':
    from utils_fasttext import build_dataset, build_iterator, get_time_dif
    embedding = 'random'
config = x.Config(dataset_name, embedding)
if os.path.exists(config.vocab_path):
    vocab = pkl.load(open(config.vocab_path, 'rb'))
    config.n_vocab = len(vocab)

model = x.Model(config).to(config.device)
# Fix: map checkpoint tensors onto the configured device instead of forcing
# 'cuda', which raises on CPU-only machines and is redundant otherwise.
model.load_state_dict(torch.load(config.save_path, map_location=config.device))
model.eval()

def build_predict_text(text, use_word):
    """Turn raw `text` into model inputs: (token-id tensor, seq-length tensor).

    Tokens are padded with PAD / truncated to `config.pad_size`; unknown
    tokens fall back to the UNK id.
    """
    if use_word:
        tokenizer = lambda x: x.split(' ')  # word-level, space separated
    else:
        tokenizer = lambda x: [y for y in x]  # char-level

    token = tokenizer(text)
    seq_len = len(token)
    pad_size = config.pad_size
    if pad_size:
        if len(token) < pad_size:
            token.extend([PAD] * (pad_size - len(token)))
        else:
            token = token[:pad_size]
            seq_len = pad_size

    words_line = []
    for word in token:
        words_line.append(vocab.get(word, vocab.get(UNK)))

    ids = torch.LongTensor([words_line]).to(config.device)
    # Fix: `torch.LongTensor(seq_len)` allocates an UNINITIALIZED tensor of
    # size `seq_len`; wrap the value in a list to actually store it.
    seq_len = torch.LongTensor([seq_len]).to(config.device)

    return ids, seq_len

def predict(text):
    """Classify `text` and return the matching label name from `key`."""
    inputs = build_predict_text(text, args.use_word)
    with torch.no_grad():
        logits = model(inputs)
        best = torch.argmax(logits)
    return key[int(best)]

if __name__ == "__main__":
    # Fix: the pasted version carried a stray trailing backtick from the
    # issue's markdown, which breaks the syntax.
    if args.text is None:
        # Default demo sentence when no --text is supplied.
        print(predict("备考2012高考作文必读美文50篇(一)"))
    else:
        print(predict(args.text))

怎样输出分数?

dongandi commented 3 years ago

我参考作者另一个基于bert分类项目的评论区写了个,分享一下

import torch
import numpy as np
from importlib import import_module
import argparse
import os
import pickle as pkl

# --- CLI arguments -------------------------------------------------------
parser = argparse.ArgumentParser(description="Classification based Transformer")
parser.add_argument("--model", type=str, default="TextCNN")
parser.add_argument("--dataset", type=str, default="THUCNews")
parser.add_argument("--text", type=str)
# NOTE(review): argparse `type=bool` turns ANY non-empty string into True
# (so `--use_word False` still means True); kept as-is for CLI compatibility.
parser.add_argument('--use_word', default=False, type=bool, help='True for word, False for char')
parser.add_argument('--embedding', default='random', type=str, help='random or pre_trained')
args = parser.parse_args()

UNK, PAD = '<UNK>', '<PAD>'  # unknown-token and padding symbols
dataset_name = args.dataset
# THUCNews: class-index -> label-name mapping
if dataset_name == "THUCNews":
    key = {
        0: 'finance',
        1: 'realty',
        2: 'stocks',
        3: 'education',
        4: 'science',
        5: 'society',
        6: 'politics',
        7: 'sports',
        8: 'game',
        9: 'entertainment'
    }

model_name = args.model  # TextCNN, TextRNN, FastText, TextRCNN, TextRNN_Att, DPCNN, Transformer
x = import_module('models.' + model_name)

# Sogou news: embedding_SougouNews.npz, Tencent: embedding_Tencent.npz, random init: random
embedding = args.embedding
if model_name == 'FastText':
    from utils_fasttext import build_dataset, build_iterator, get_time_dif
    embedding = 'random'
config = x.Config(dataset_name, embedding)
if os.path.exists(config.vocab_path):
    vocab = pkl.load(open(config.vocab_path, 'rb'))
    config.n_vocab = len(vocab)

model = x.Model(config).to(config.device)
# Fix: map checkpoint tensors onto the configured device instead of forcing
# 'cuda', which raises on CPU-only machines and is redundant otherwise.
model.load_state_dict(torch.load(config.save_path, map_location=config.device))
model.eval()

def build_predict_text(text, use_word):
    """Turn raw `text` into model inputs: (token-id tensor, seq-length tensor).

    Tokens are padded with PAD / truncated to `config.pad_size`; unknown
    tokens fall back to the UNK id.
    """
    if use_word:
        tokenizer = lambda x: x.split(' ')  # word-level, space separated
    else:
        tokenizer = lambda x: [y for y in x]  # char-level

    token = tokenizer(text)
    seq_len = len(token)
    pad_size = config.pad_size
    if pad_size:
        if len(token) < pad_size:
            token.extend([PAD] * (pad_size - len(token)))
        else:
            token = token[:pad_size]
            seq_len = pad_size

    words_line = []
    for word in token:
        words_line.append(vocab.get(word, vocab.get(UNK)))

    ids = torch.LongTensor([words_line]).to(config.device)
    # Fix: `torch.LongTensor(seq_len)` allocates an UNINITIALIZED tensor of
    # size `seq_len`; wrap the value in a list to actually store it.
    seq_len = torch.LongTensor([seq_len]).to(config.device)

    return ids, seq_len

def predict(text):
    """Classify `text` and return the matching label name from `key`."""
    inputs = build_predict_text(text, args.use_word)
    with torch.no_grad():
        logits = model(inputs)
        best = torch.argmax(logits)
    return key[int(best)]

if __name__ == "__main__":
    # Fix: the pasted version carried a stray trailing backtick from the
    # issue's markdown, which breaks the syntax.
    if args.text is None:
        # Default demo sentence when no --text is supplied.
        print(predict("备考2012高考作文必读美文50篇(一)"))
    else:
        print(predict(args.text))

您好,很抱歉打扰您,我想问, image 我按您的代码复制过去,出现这种问题是什么原因,还有一个问题,这个代码调用之前训练的模型了吗?

m1109619669 commented 2 years ago

FastText报错 @empowerszc 大佬 用训练好的fasttext预测会报错 怎么解决 别的模型没问题

CRonaldo1997 commented 2 years ago

@m1109619669 同问题,请问解决了吗?感谢!

CRonaldo1997 commented 2 years ago

@m1109619669 已解决:

# Reconstructed from a markdown-collapsed paste: each logical statement had
# been mashed onto a single line, and '<UNK>'/'<PAD>' were eaten by the HTML
# renderer (restored to match the intact pastes above).
import torch
import numpy as np
from importlib import import_module
import argparse
import os
import pickle as pkl

parser = argparse.ArgumentParser(description="Classification based Transformer")
parser.add_argument("--model", type=str, default="TextCNN")
parser.add_argument("--dataset", type=str, default="THUCNews")
parser.add_argument("--text", type=str)
parser.add_argument('--use_word', default=False, type=bool, help='True for word, False for char')
parser.add_argument('--embedding', default='random', type=str, help='random or pre_trained')
args = parser.parse_args()

UNK, PAD = '<UNK>', '<PAD>'  # unknown-token and padding symbols
dataset_name = args.dataset

# class-index -> label-name mapping (this commenter renamed the dataset "YWX")
if dataset_name == "YWX":
    key = {
        0: 'finance',
        1: 'realty',
        2: 'stocks',
        3: 'education',
        4: 'science',
        5: 'society',
        6: 'politics',
        7: 'sports',
        8: 'game',
        9: 'entertainment'
    }

model_name = args.model  # TextCNN, TextRNN, FastText, TextRCNN, TextRNN_Att, DPCNN, Transformer
x = import_module('models.' + model_name)

embedding = args.embedding
if model_name == 'FastText':
    from utils_fasttext import build_dataset, build_iterator, get_time_dif
    embedding = 'random'
config = x.Config(dataset_name, embedding)
if os.path.exists(config.vocab_path):
    vocab = pkl.load(open(config.vocab_path, 'rb'))
    config.n_vocab = len(vocab)

model = x.Model(config).to(config.device)
model.load_state_dict(torch.load(config.save_path, map_location=torch.device('cpu')))
model.eval()

def biGramHash(sequence, t, buckets):
    """Hash the bigram ending at position `t` into `buckets` (FastText n-gram id).

    Reconstructed from a markdown-collapsed one-liner; uses the previous id
    (or 0 past the left edge), matching utils_fasttext.py.
    """
    t1 = sequence[t - 1] if t - 1 >= 0 else 0
    return (t1 * 14918087) % buckets

def triGramHash(sequence, t, buckets):
    """Hash the trigram ending at position `t` into `buckets` (FastText n-gram id).

    Fix: markdown italics ate the `*` operators in the pasted line
    (`t2 14918087 18408749`); restored to `t2 * 14918087 * 18408749`,
    matching utils_fasttext.py.  Positions before the sequence start count
    as id 0.
    """
    t1 = sequence[t - 1] if t - 1 >= 0 else 0
    t2 = sequence[t - 2] if t - 2 >= 0 else 0
    return (t2 * 14918087 * 18408749 + t1 * 14918087) % buckets

def build_predict_text(text, use_word):
    """Build FastText model inputs for `text`.

    Returns (ids, seq_len, bigram_ts, trigram_ts) tensors: token ids padded
    with PAD / truncated to `config.pad_size`, plus hashed bi-/tri-gram
    feature ids for every position.  Reconstructed from a markdown-mangled
    paste whose `def` line had been collapsed and whose body was dedented.
    """
    if use_word:
        tokenizer = lambda x: x.split(' ')  # word-level, space separated
    else:
        tokenizer = lambda x: [y for y in x]  # char-level

    token = tokenizer(text)
    seq_len = len(token)
    pad_size = config.pad_size
    if pad_size:
        if len(token) < pad_size:
            token.extend([PAD] * (pad_size - len(token)))
        else:
            token = token[:pad_size]
            seq_len = pad_size

    words_line = []
    for word in token:
        words_line.append(vocab.get(word, vocab.get(UNK)))

    buckets = config.n_gram_vocab
    bigram = []
    trigram = []
    # ------ n-gram feature ids, one per padded position ------
    for i in range(pad_size):
        bigram.append(biGramHash(words_line, i, buckets))
        trigram.append(triGramHash(words_line, i, buckets))

    ids = torch.LongTensor([words_line]).to(config.device)
    seq_len = torch.LongTensor([seq_len]).to(config.device)
    bigram_ts = torch.LongTensor([bigram]).to(config.device)
    trigram_ts = torch.LongTensor([trigram]).to(config.device)

    return ids, seq_len, bigram_ts, trigram_ts

def predict(text):
    """Classify `text` and return the matching label name from `key`.

    Reconstructed from a markdown-collapsed one-liner.
    """
    data = build_predict_text(text, args.use_word)
    with torch.no_grad():
        outputs = model(data)
        num = torch.argmax(outputs)
    return key[int(num)]

if __name__ == "__main__":
    # Reconstructed from the markdown-mangled paste (`__name__`/`"__main__"`
    # lost their double underscores and the statements were collapsed).
    if args.text is None:
        print(predict("some sentences"))
    else:
        print(predict(args.text))