Open futureflsl opened 3 years ago
我参考作者另一个基于bert分类项目的评论区写了个,分享一下
# Inference script: load a trained text classifier (TextCNN/TextRNN/FastText/...)
# and predict the category of a single input text.
import torch
import numpy as np
from importlib import import_module
import argparse
import os
import pickle as pkl


def _str2bool(value):
    """Parse a boolean CLI value; argparse's type=bool treats ANY non-empty string
    (including "False") as True, so parse the text explicitly."""
    return str(value).lower() in ("1", "true", "yes", "y")


parser = argparse.ArgumentParser(description="Classification based Transformer")
parser.add_argument("--model", type=str, default="TextCNN")
parser.add_argument("--dataset", type=str, default="THUCNews")
parser.add_argument("--text", type=str)
# BUG FIX: was type=bool, which made `--use_word False` evaluate to True.
parser.add_argument('--use_word', default=False, type=_str2bool, help='True for word, False for char')
parser.add_argument('--embedding', default='random', type=str, help='random or pre_trained')
args = parser.parse_args()

UNK, PAD = '<UNK>', '<PAD>'  # unknown-token and padding symbols
dataset_name = args.dataset

# THUCNews: label index -> human-readable category name.
if dataset_name == "THUCNews":
    key = {
        0: 'finance',
        1: 'realty',
        2: 'stocks',
        3: 'education',
        4: 'science',
        5: 'society',
        6: 'politics',
        7: 'sports',
        8: 'game',
        9: 'entertainment',
    }

model_name = args.model  # TextCNN, TextRNN, FastText, TextRCNN, TextRNN_Att, DPCNN, Transformer
x = import_module('models.' + model_name)

# Sogou news: embedding_SougouNews.npz, Tencent: embedding_Tencent.npz, random init: 'random'
embedding = args.embedding
if model_name == 'FastText':
    from utils_fasttext import build_dataset, build_iterator, get_time_dif
    embedding = 'random'

config = x.Config(dataset_name, embedding)
if os.path.exists(config.vocab_path):
    # Context manager closes the vocab file deterministically.
    with open(config.vocab_path, 'rb') as f:
        vocab = pkl.load(f)
    config.n_vocab = len(vocab)
else:
    # BUG FIX: previously fell through silently and crashed later with NameError;
    # fail fast with an actionable message instead.
    raise FileNotFoundError(
        "vocab file not found: %s (train the model first to build the vocab)" % config.vocab_path)

model = x.Model(config).to(config.device)
# BUG FIX: map_location=torch.device('cuda') crashes on CPU-only machines;
# remap the checkpoint onto whatever device the config selected.
model.load_state_dict(torch.load(config.save_path, map_location=config.device))
model.eval()
def build_predict_text(text, use_word):
    """Tokenize *text*, pad/truncate it to ``config.pad_size``, and map tokens to vocab ids.

    Args:
        text: the raw input string to classify.
        use_word: if True, split on spaces (word-level); otherwise split into
            characters (char-level).

    Returns:
        (ids, seq_len): ``ids`` is a (1, pad_size) LongTensor of token ids on
        ``config.device``; ``seq_len`` is a 1-element LongTensor holding the
        unpadded token count.
    """
    if use_word:
        tokenizer = lambda s: s.split(' ')  # word-level: tokens separated by spaces
    else:
        tokenizer = lambda s: [ch for ch in s]  # char-level
    token = tokenizer(text)
    seq_len = len(token)
    pad_size = config.pad_size
    if pad_size:
        if len(token) < pad_size:
            token.extend([PAD] * (pad_size - len(token)))
        else:
            token = token[:pad_size]
            seq_len = pad_size
    # Map each token to its vocab id, falling back to the UNK id.
    words_line = [vocab.get(word, vocab.get(UNK)) for word in token]
    ids = torch.LongTensor([words_line]).to(config.device)
    # BUG FIX: torch.LongTensor(seq_len) allocates an UNINITIALIZED tensor of
    # length seq_len; wrap the int in a list to get a 1-element tensor whose
    # value IS seq_len (matches how training batches carry sequence lengths).
    seq_len = torch.LongTensor([seq_len]).to(config.device)
    return ids, seq_len
def predict(text):
    """Classify a single text and return its predicted label name from ``key``."""
    inputs = build_predict_text(text, args.use_word)
    with torch.no_grad():
        logits = model(inputs)
        best_idx = torch.argmax(logits)
    return key[int(best_idx)]
if __name__ == "__main__":
    # BUG FIX: removed a stray trailing backtick (markdown paste artifact) that
    # made the original line a syntax error.
    if args.text is None:
        # No --text given: classify a demo headline.
        print(predict("备考2012高考作文必读美文50篇(一)"))
    else:
        print(predict(args.text))
我参考作者另一个基于bert分类项目的评论区写了个,分享一下
import torch import numpy as np from importlib import import_module import argparse import os import pickle as pkl parser = argparse.ArgumentParser(description="Classification based Transformer") parser.add_argument("--model",type=str, default="TextCNN") parser.add_argument("--dataset", type=str,default="THUCNews") parser.add_argument("--text",type=str ) parser.add_argument('--use_word', default=False, type=bool, help='True for word, False for char') parser.add_argument('--embedding', default='random', type=str, help='random or pre_trained') args = parser.parse_args() UNK, PAD = '<UNK>', '<PAD>' # 未知字,padding符号 dataset_name = args.dataset #ThuNews if dataset_name == "THUCNews": key = { 0: 'finance', 1: 'realty', 2: 'stocks', 3: 'education', 4: 'science', 5: 'society', 6: 'politics', 7: 'sports', 8: 'game', 9: 'entertainment' } model_name = args.model # 'TextRCNN' # TextCNN, TextRNN, FastText, TextRCNN, TextRNN_Att, DPCNN, Transformer x = import_module('models.' + model_name) # 搜狗新闻:embedding_SougouNews.npz, 腾讯:embedding_Tencent.npz, 随机初始化:random embedding = args.embedding if model_name == 'FastText': from utils_fasttext import build_dataset, build_iterator, get_time_dif embedding = 'random' config = x.Config(dataset_name, embedding) if os.path.exists(config.vocab_path): vocab = pkl.load(open(config.vocab_path, 'rb')) config.n_vocab = len(vocab) model = x.Model(config).to(config.device) model.load_state_dict(torch.load(config.save_path,map_location=torch.device('cuda') ))# model.eval() def build_predict_text(text, use_word): if use_word: tokenizer = lambda x: x.split(' ') # 以空格隔开,word-level else: tokenizer = lambda x: [y for y in x] # char-level token = tokenizer(text) seq_len = len(token) pad_size = config.pad_size if pad_size: if len(token)< pad_size: token.extend([PAD]*(pad_size-len(token))) else: token = token[:pad_size] seq_len = pad_size words_line = [] for word in token: words_line.append(vocab.get(word, vocab.get(UNK))) # ids = 
torch.LongTensor([words_line]).cuda() ids = torch.LongTensor([words_line]).to(config.device) seq_len = torch.LongTensor(seq_len).to(config.device) return ids, seq_len def predict(text): data = build_predict_text(text,args.use_word) with torch.no_grad(): outputs = model(data) num = torch.argmax(outputs) return key[int(num)] if __name__ == "__main__": if args.text is None: print(predict("备考2012高考作文必读美文50篇(一)")) else: print(predict(args.text))`
怎样输出分数?
@prozyworld 不好意思,才注意到。输出的outputs体现了可能性,数值越大,是某个类别的可能性也越大。把它标准化到加和为1,所以用softmax函数处理下就可以。
import torch.nn.functional as F
def predict(text):
data = build_predict_text(text,args.use_word)
with torch.no_grad():
outputs = model(data)
num = torch.argmax(outputs)
pred = F.softmax(outputs, dim=1)
return key[int(num)],pred
这里pred输出的就是文本是各个标签的概率了,pred的输出格式是cuda上的列表,你可以转为正常的。
我参考作者另一个基于bert分类项目的评论区写了个,分享一下
import torch import numpy as np from importlib import import_module import argparse import os import pickle as pkl parser = argparse.ArgumentParser(description="Classification based Transformer") parser.add_argument("--model",type=str, default="TextCNN") parser.add_argument("--dataset", type=str,default="THUCNews") parser.add_argument("--text",type=str ) parser.add_argument('--use_word', default=False, type=bool, help='True for word, False for char') parser.add_argument('--embedding', default='random', type=str, help='random or pre_trained') args = parser.parse_args() UNK, PAD = '<UNK>', '<PAD>' # 未知字,padding符号 dataset_name = args.dataset #ThuNews if dataset_name == "THUCNews": key = { 0: 'finance', 1: 'realty', 2: 'stocks', 3: 'education', 4: 'science', 5: 'society', 6: 'politics', 7: 'sports', 8: 'game', 9: 'entertainment' } model_name = args.model # 'TextRCNN' # TextCNN, TextRNN, FastText, TextRCNN, TextRNN_Att, DPCNN, Transformer x = import_module('models.' + model_name) # 搜狗新闻:embedding_SougouNews.npz, 腾讯:embedding_Tencent.npz, 随机初始化:random embedding = args.embedding if model_name == 'FastText': from utils_fasttext import build_dataset, build_iterator, get_time_dif embedding = 'random' config = x.Config(dataset_name, embedding) if os.path.exists(config.vocab_path): vocab = pkl.load(open(config.vocab_path, 'rb')) config.n_vocab = len(vocab) model = x.Model(config).to(config.device) model.load_state_dict(torch.load(config.save_path,map_location=torch.device('cuda') ))# model.eval() def build_predict_text(text, use_word): if use_word: tokenizer = lambda x: x.split(' ') # 以空格隔开,word-level else: tokenizer = lambda x: [y for y in x] # char-level token = tokenizer(text) seq_len = len(token) pad_size = config.pad_size if pad_size: if len(token)< pad_size: token.extend([PAD]*(pad_size-len(token))) else: token = token[:pad_size] seq_len = pad_size words_line = [] for word in token: words_line.append(vocab.get(word, vocab.get(UNK))) # ids = 
torch.LongTensor([words_line]).cuda() ids = torch.LongTensor([words_line]).to(config.device) seq_len = torch.LongTensor(seq_len).to(config.device) return ids, seq_len def predict(text): data = build_predict_text(text,args.use_word) with torch.no_grad(): outputs = model(data) num = torch.argmax(outputs) return key[int(num)] if __name__ == "__main__": if args.text is None: print(predict("备考2012高考作文必读美文50篇(一)")) else: print(predict(args.text))`
怎样输出分数?
我参考作者另一个基于bert分类项目的评论区写了个,分享一下
import torch import numpy as np from importlib import import_module import argparse import os import pickle as pkl parser = argparse.ArgumentParser(description="Classification based Transformer") parser.add_argument("--model",type=str, default="TextCNN") parser.add_argument("--dataset", type=str,default="THUCNews") parser.add_argument("--text",type=str ) parser.add_argument('--use_word', default=False, type=bool, help='True for word, False for char') parser.add_argument('--embedding', default='random', type=str, help='random or pre_trained') args = parser.parse_args() UNK, PAD = '<UNK>', '<PAD>' # 未知字,padding符号 dataset_name = args.dataset #ThuNews if dataset_name == "THUCNews": key = { 0: 'finance', 1: 'realty', 2: 'stocks', 3: 'education', 4: 'science', 5: 'society', 6: 'politics', 7: 'sports', 8: 'game', 9: 'entertainment' } model_name = args.model # 'TextRCNN' # TextCNN, TextRNN, FastText, TextRCNN, TextRNN_Att, DPCNN, Transformer x = import_module('models.' + model_name) # 搜狗新闻:embedding_SougouNews.npz, 腾讯:embedding_Tencent.npz, 随机初始化:random embedding = args.embedding if model_name == 'FastText': from utils_fasttext import build_dataset, build_iterator, get_time_dif embedding = 'random' config = x.Config(dataset_name, embedding) if os.path.exists(config.vocab_path): vocab = pkl.load(open(config.vocab_path, 'rb')) config.n_vocab = len(vocab) model = x.Model(config).to(config.device) model.load_state_dict(torch.load(config.save_path,map_location=torch.device('cuda') ))# model.eval() def build_predict_text(text, use_word): if use_word: tokenizer = lambda x: x.split(' ') # 以空格隔开,word-level else: tokenizer = lambda x: [y for y in x] # char-level token = tokenizer(text) seq_len = len(token) pad_size = config.pad_size if pad_size: if len(token)< pad_size: token.extend([PAD]*(pad_size-len(token))) else: token = token[:pad_size] seq_len = pad_size words_line = [] for word in token: words_line.append(vocab.get(word, vocab.get(UNK))) # ids = 
torch.LongTensor([words_line]).cuda() ids = torch.LongTensor([words_line]).to(config.device) seq_len = torch.LongTensor(seq_len).to(config.device) return ids, seq_len def predict(text): data = build_predict_text(text,args.use_word) with torch.no_grad(): outputs = model(data) num = torch.argmax(outputs) return key[int(num)] if __name__ == "__main__": if args.text is None: print(predict("备考2012高考作文必读美文50篇(一)")) else: print(predict(args.text))`
您好,很抱歉打扰您,我想问, 我按您的代码复制过去,出现这种问题是什么原因,还有一个问题,这个代码调用之前训练的模型了 吗?
@empowerszc 大佬 用训练好的fasttext预测会报错 怎么解决 别的模型没问题
@m1109619669 同问题,请问解决了吗?感谢!
@m1109619669 已解决:
import torch import numpy as np from importlib import import_module import argparse import os import pickle as pkl
parser = argparse.ArgumentParser(description="Classification based Transformer") parser.add_argument("--model",type=str, default="TextCNN") parser.add_argument("--dataset", type=str,default="THUCNews") parser.add_argument("--text",type=str ) parser.add_argument('--use_word', default=False, type=bool, help='True for word, False for char') parser.add_argument('--embedding', default='random', type=str, help='random or pre_trained') args = parser.parse_args()
UNK, PAD = '&lt;UNK&gt;', '&lt;PAD&gt;'  # 未知字,padding符号 dataset_name = args.dataset
if dataset_name == "YWX": key = { 0: 'finance', 1: 'realty', 2: 'stocks', 3: 'education', 4: 'science', 5: 'society', 6: 'politics', 7: 'sports', 8: 'game', 9: 'entertainment' }
model_name = args.model # 'TextRCNN' # TextCNN, TextRNN, FastText, TextRCNN, TextRNN_Att, DPCNN, Transformer x = import_module('models.' + model_name)
embedding = args.embedding if model_name == 'FastText': from utils_fasttext import build_dataset, build_iterator, get_time_dif embedding = 'random' config = x.Config(dataset_name, embedding) if os.path.exists(config.vocab_path): vocab = pkl.load(open(config.vocab_path, 'rb')) config.n_vocab = len(vocab)
model = x.Model(config).to(config.device) model.load_state_dict(torch.load(config.save_path,map_location=torch.device('cpu') ))# model.eval()
def biGramHash(sequence, t, buckets): t1 = sequence[t - 1] if t - 1 >= 0 else 0 return (t1 * 14918087) % buckets
def triGramHash(sequence, t, buckets): t1 = sequence[t - 1] if t - 1 >= 0 else 0 t2 = sequence[t - 2] if t - 2 >= 0 else 0 return (t2 * 14918087 * 18408749 + t1 * 14918087) % buckets
def build_predict_text(text, use_word): if use_word: tokenizer = lambda x: x.split(' ') # 以空格隔开,word-level else: tokenizer = lambda x: [y for y in x] # char-level
token = tokenizer(text)
seq_len = len(token)
pad_size = config.pad_size
if pad_size:
if len(token)< pad_size:
token.extend([PAD]*(pad_size-len(token)))
else:
token = token[:pad_size]
seq_len = pad_size
words_line = []
for word in token:
words_line.append(vocab.get(word, vocab.get(UNK)))
buckets = config.n_gram_vocab
bigram = []
trigram = []
# ------ngram------
for i in range(pad_size):
bigram.append(biGramHash(words_line, i, buckets))
trigram.append(triGramHash(words_line, i, buckets))
ids = torch.LongTensor([words_line]).to(config.device)
seq_len = torch.LongTensor([seq_len]).to(config.device)
bigram_ts = torch.LongTensor([bigram]).to(config.device)
trigram_ts = torch.LongTensor([trigram]).to(config.device)
return ids, seq_len, bigram_ts, trigram_ts
def predict(text): data = build_predict_text(text,args.use_word) with torch.no_grad(): outputs = model(data) num = torch.argmax(outputs)
return key[int(num)]
if __name__ == "__main__": if args.text is None: print(predict("some sentences")) else: print(predict(args.text))
`自己写的预测类,输入文本即可得出结果,但是要用tensorflow才行,怎么把tensorflow pad那个删除?
下面写的预测类代码在TextCNN测试通过,但是在 kr.preprocessing.sequence.pad_sequences这个用到了keras代码,怎么把这个换成pytorch的或者numpy的?请大神指教,大家一起交流下 from tensorflow import keras as kr import torch.nn as nn from torch.autograd import Variable import os import torch import numpy as np from importlib import import_module
class InferManager(object): def __init__(self, weights): self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") dataset = 'dataset' + os.sep + 'mydata' # 数据集
搜狗新闻:embedding_SougouNews.npz, 腾讯:embedding_Tencent.npz, 随机初始化:random
if __name__ == '__main__':
`