649453932 / Bert-Chinese-Text-Classification-Pytorch

使用Bert,ERNIE,进行中文文本分类
MIT License
4.02k stars 898 forks source link

大公无私给你们贴预测代码 #70

Closed jtyoui closed 2 years ago

jtyoui commented 3 years ago

需要安装 pip install torch pytorch_pretrained_bert

import os

import torch
import torch.nn as nn
from pytorch_pretrained_bert import BertModel, BertTokenizer

# 识别的类型
key = {0: '别名',
       1: '防治农药',
       2: '病原学名',
       3: '病原中文名',
       4: '病原属性',
       5: '为害部位',
       6: '为害作物',
       7: '属目',
       8: '属科',
       9: '学名'
       }

class Config:
    """配置参数"""

    def __init__(self):
        cru = os.path.dirname(__file__)
        self.class_list = [str(i) for i in range(len(key))]  # 类别名单
        self.save_path = os.path.join(cru, 'ernie/ERNIE.ckpt')
        self.device = torch.device('cpu')
        self.require_improvement = 1000  # 若超过1000batch效果还没提升,则提前结束训练
        self.num_classes = len(self.class_list)  # 类别数
        self.num_epochs = 3  # epoch数
        self.batch_size = 128  # mini-batch大小
        self.pad_size = 32  # 每句话处理成的长度(短填长切)
        self.learning_rate = 5e-5  # 学习率
        self.bert_path = os.path.join(cru, 'bert')
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
        self.hidden_size = 768

    def build_dataset(self, text):
        lin = text.strip()
        pad_size = len(lin)
        token = self.tokenizer.tokenize(lin)
        token = ['[CLS]'] + token
        token_ids = self.tokenizer.convert_tokens_to_ids(token)
        mask = [1] * pad_size
        token_ids = token_ids[:pad_size]
        return torch.tensor([token_ids], dtype=torch.long), torch.tensor([mask])

class Model(nn.Module):

    def __init__(self, config):
        super(Model, self).__init__()
        self.bert = BertModel.from_pretrained(config.bert_path)
        for param in self.bert.parameters():
            param.requires_grad = True
        self.fc = nn.Linear(config.hidden_size, config.num_classes)

    def forward(self, x):
        context = x[0]
        mask = x[1]
        _, pooled = self.bert(context, attention_mask=mask, output_all_encoded_layers=False)
        out = self.fc(pooled)
        return out

config = Config()
model = Model(config).to(config.device)
model.load_state_dict(torch.load(config.save_path, map_location='cpu'))

def prediction_model(text):
    """输入一句问话预测"""
    data = config.build_dataset(text)
    with torch.no_grad():
        outputs = model(data)
        num = torch.argmax(outputs)
    return key[int(num)]

if __name__ == '__main__':
    print(prediction_model('水稻恶苗病主要危害哪些部位?'))
geralt-write-code commented 3 years ago

是不是只要加上prediction_model这个函数就可以了

jtyoui commented 3 years ago

是不是只要加上prediction_model这个函数就可以了

当然不是,我稍微修改了一点点代码.你就把我这个完全的粘贴复制就行了,把模型地址改一下就ok

geralt-write-code commented 3 years ago

是不是只要加上prediction_model这个函数就可以了

当然不是,我稍微修改了一点点代码.你就把我这个完全的粘贴复制就行了,把模型地址改一下就ok 感谢,我已经对着你的代码该出来了

shengtaovvv commented 3 years ago

请问一下,如果我想连续加载两个模型,加载完第一个之后应该怎么重置torch,才能让第二个模型加载的时候不受第一个模型的影响

sssssajfsd commented 3 years ago

当出现三个连续的数字的时候就报错,两个不会报错! ![Uploading image.png…]()

shengtaovvv commented 3 years ago

已经解决,谢谢

vencentDebug commented 3 years ago

RuntimeError: Error(s) in loading state_dict for Model: Unexpected key(s) in state_dict: "lstm.weight_ih_l0", "lstm.weight_hh_l0", "lstm.bias_ih_l0", "lstm.bias_hh_l0", "lstm.weight_ih_l0_reverse", "lstm.weight_hh_l0_reverse", "lstm.bias_ih_l0_reverse", "lstm.bias_hh_l0_reverse", "lstm.weight_ih_l1", "lstm.weight_hh_l1", "lstm.bias_ih_l1", "lstm.bias_hh_l1", "lstm.weight_ih_l1_reverse", "lstm.weight_hh_l1_reverse", "lstm.bias_ih_l1_reverse", "lstm.bias_hh_l1_reverse". size mismatch for fc.weight: copying a param with shape torch.Size([10, 1280]) from checkpoint, the shape in current model is torch.Size([10, 768]).

请问一下这个报错是因为什么原因,该如何调整

vencentDebug commented 3 years ago

import os import time

import torch import torch.nn as nn from pytorch_pretrained_bert import BertModel, BertTokenizer import numpy as np from importlib import import_module import argparse

class Config(object): """配置参数"""

def __init__(self, dataset):

    self.class_list = [x.strip() for x in open(
        dataset + '/data/class.txt').readlines()]
    self.save_path = 'THUCNews/saved_dict/bert.ckpt'
    self.device = torch.device('cpu')
    self.require_improvement = 1000  # 若超过1000batch效果还没提升,则提前结束训练
    self.num_classes = len(self.class_list)  # 类别数
    self.num_epochs = 3  # epoch数
    self.batch_size = 128  # mini-batch大小
    self.pad_size = 32  # 每句话处理成的长度(短填长切)
    self.learning_rate = 5e-5  # 学习率
    self.bert_path = './bert_pretrain'
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
    self.hidden_size = 768

def build_dataset(self, text):
    lin = text.strip()
    pad_size = len(lin)
    token = self.tokenizer.tokenize(lin)
    token = ['[CLS]'] + token
    token_ids = self.tokenizer.convert_tokens_to_ids(token)
    mask = [1] * pad_size
    token_ids = token_ids[:pad_size]
    return torch.tensor([token_ids], dtype=torch.long), torch.tensor([mask])

class Model(nn.Module):

def __init__(self, config):
    super(Model, self).__init__()
    self.bert = BertModel.from_pretrained(config.bert_path)
    for param in self.bert.parameters():
        param.requires_grad = True
    self.fc = nn.Linear(config.hidden_size, config.num_classes)

def forward(self, x):
    context = x[0]
    mask = x[1]
    _, pooled = self.bert(context, attention_mask=mask, output_all_encoded_layers=False)
    out = self.fc(pooled)
    return out

def prediction_model(text): """输入一句问话预测""" data = config.build_dataset(text) with torch.no_grad(): outputs = model(data) num = torch.argmax(outputs) return key[int(num)]

parser = argparse.ArgumentParser(description='Chinese Text Classification') parser.add_argument('--model', type=str, required=True, help='choose a model: Bert, ERNIE') args = parser.parse_args()

if name == 'main': dataset = 'THUCNews' # 数据集 key = {0: '别名', 1: '防治农药', 2: '病原学名', 3: '病原中文名', 4: '病原属性', 5: '为害部位', 6: '为害作物', 7: '属目', 8: '属科', 9: '学名' }

model_name = args.model  # bert
x = import_module('models.' + model_name)
config = x.Config(dataset)
np.random.seed(1)
torch.manual_seed(1)
torch.cuda.manual_seed_all(1)
torch.backends.cudnn.deterministic = True  # 保证每次结果一样

start_time = time.time()
config = Config(dataset)
model = Model(config).to(config.device)
model.load_state_dict(torch.load(config.save_path, map_location='cpu'))
print("Loading data...")
print(prediction_model('水稻恶苗病主要危害哪些部位?'))
vencentDebug commented 3 years ago

THUCNews/saved_dict/bert.ckpt 地址如上

jtyoui commented 3 years ago

RuntimeError: Error(s) in loading state_dict for Model: Unexpected key(s) in state_dict: "lstm.weight_ih_l0", "lstm.weight_hh_l0", "lstm.bias_ih_l0", "lstm.bias_hh_l0", "lstm.weight_ih_l0_reverse", "lstm.weight_hh_l0_reverse", "lstm.bias_ih_l0_reverse", "lstm.bias_hh_l0_reverse", "lstm.weight_ih_l1", "lstm.weight_hh_l1", "lstm.bias_ih_l1", "lstm.bias_hh_l1", "lstm.weight_ih_l1_reverse", "lstm.weight_hh_l1_reverse", "lstm.bias_ih_l1_reverse", "lstm.bias_hh_l1_reverse". size mismatch for fc.weight: copying a param with shape torch.Size([10, 1280]) from checkpoint, the shape in current model is torch.Size([10, 768]).

请问一下这个报错是因为什么原因,该如何调整

你把self.hidden_size = 768改成self.hidden_size = 1280试一下1.

michelleweii commented 3 years ago

好人一生平安(^▽^)

vencentDebug commented 3 years ago

谢谢,已经解决

shengyucaihua commented 3 years ago

当有超过2位的连续数字字母时,就会报以下错 RuntimeError: The size of tensor a (25) must match the size of tensor b (26) at non-singleton dimension 3 请问该如何解决

JennieGerhardt commented 3 years ago

当有超过2位的连续数字字母时,就会报以下错 RuntimeError: The size of tensor a (25) must match the size of tensor b (26) at non-singleton dimension 3 请问该如何解决

我也遇到了这个问题,请问你解决了吗?

jtyoui commented 3 years ago

当有超过2位的连续数字字母时,就会报以下错 RuntimeError: The size of tensor a (25) must match the size of tensor b (26) at non-singleton dimension 3 请问该如何解决

self.class_list = [str(i) for i in range(11)] # 类别名单改一下就可以了。

runningman1215 commented 3 years ago

当有超过2位的连续数字字母时,就会报以下错 RuntimeError: The size of tensor a (25) must match the size of tensor b (26) at non-singleton dimension 3 请问该如何解决

self.class_list = [str(i) for i in range(11)] # 类别名单改一下就可以了。

想问下类别名单self.class_list改一下是指什么意思

jtyoui commented 3 years ago

当有超过2位的连续数字字母时,就会报以下错 RuntimeError: The size of tensor a (25) must match the size of tensor b (26) at non-singleton dimension 3 请问该如何解决

self.class_list = [str(i) for i in range(11)] # 类别名单改一下就可以了。

想问下类别名单self.class_list改一下是指什么意思

self.class_list = [str(i) for i in range(len(key))] # 类别名单 把 11改成类型的大小

jtyoui commented 3 years ago

我已经把代码改了

sense1011 commented 3 years ago

需要安装 pip install torch pytorch_pretrained_bert

import os

import torch
import torch.nn as nn
from pytorch_pretrained_bert import BertModel, BertTokenizer

# 识别的类型
key = {0: '别名',
       1: '防治农药',
       2: '病原学名',
       3: '病原中文名',
       4: '病原属性',
       5: '为害部位',
       6: '为害作物',
       7: '属目',
       8: '属科',
       9: '学名'
       }

class Config:
    """配置参数"""

    def __init__(self):
        cru = os.path.dirname(__file__)
        self.class_list = [str(i) for i in range(len(key))]  # 类别名单
        self.save_path = os.path.join(cru, 'ernie/ERNIE.ckpt')
        self.device = torch.device('cpu')
        self.require_improvement = 1000  # 若超过1000batch效果还没提升,则提前结束训练
        self.num_classes = len(self.class_list)  # 类别数
        self.num_epochs = 3  # epoch数
        self.batch_size = 128  # mini-batch大小
        self.pad_size = 32  # 每句话处理成的长度(短填长切)
        self.learning_rate = 5e-5  # 学习率
        self.bert_path = os.path.join(cru, 'bert')
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
        self.hidden_size = 768

    def build_dataset(self, text):
        lin = text.strip()
        pad_size = len(lin)
        token = self.tokenizer.tokenize(lin)
        token = ['[CLS]'] + token
        token_ids = self.tokenizer.convert_tokens_to_ids(token)
        mask = [1] * pad_size
        token_ids = token_ids[:pad_size]
        return torch.tensor([token_ids], dtype=torch.long), torch.tensor([mask])

class Model(nn.Module):

    def __init__(self, config):
        super(Model, self).__init__()
        self.bert = BertModel.from_pretrained(config.bert_path)
        for param in self.bert.parameters():
            param.requires_grad = True
        self.fc = nn.Linear(config.hidden_size, config.num_classes)

    def forward(self, x):
        context = x[0]
        mask = x[1]
        _, pooled = self.bert(context, attention_mask=mask, output_all_encoded_layers=False)
        out = self.fc(pooled)
        return out

config = Config()
model = Model(config).to(config.device)
model.load_state_dict(torch.load(config.save_path, map_location='cpu'))

def prediction_model(text):
    """输入一句问话预测"""
    data = config.build_dataset(text)
    with torch.no_grad():
        outputs = model(data)
        num = torch.argmax(outputs)
    return key[int(num)]

if __name__ == '__main__':
    print(prediction_model('水稻恶苗病主要危害哪些部位?'))

作者的代码里 def forward(self, x): context = x[0] mask = x[2]

mask=x[2], 这里怎么变成了mask=x[1]呢

jtyoui commented 3 years ago

@sense1011 不用管,mask[1]即可,你可以单步调试一下就知道了。

LLLLucensus commented 3 years ago

self.save_path = "THUCNews/saved_dict/ERNIE.ckpt" 报错如下 RuntimeError: THUCNews/saved_dict/ERNIE.ckpt is a zip archive (did you mean to use torch.jit.load()?)

遇到这个报错改成jit.load报了另一个错,求教

TheOnlylight commented 3 years ago

长文本好像不太稳定,求教这个情况是什么情况啊 image

hughjackman111 commented 3 years ago

self.save_path = "THUCNews/saved_dict/ERNIE.ckpt" 报错如下 RuntimeError: THUCNews/saved_dict/ERNIE.ckpt is a zip archive (did you mean to use torch.jit.load()?)

遇到这个报错改成jit.load报了另一个错,求教

你解决了吗?我也遇到了这个问题。

dongandi commented 3 years ago

RuntimeError: Error(s) in loading state_dict for Model: Unexpected key(s) in state_dict: "lstm.weight_ih_l0", "lstm.weight_hh_l0", "lstm.bias_ih_l0", "lstm.bias_hh_l0", "lstm.0weight_ih_l0" "lstm.weight_hh_l0_reverse", "lstm.bias_ih_l0_reverse", "lstm.bias_hh_l0_reverse", "lstm.weight_ih_l1", "lstm.weight_hh_l1", "lstm.bias_ih_l1", "lstm.bias_ih_l0_reverse", "lstm.bias_hh_l0_reverse", "lstm.bias_ih_l1h_lsth_lsth_lsth_lstm" .weight_hh_l1_reverse”、“lstm.bias_ih_l1_reverse”、“lstm.bias_hh_l1_reverse”。 fc.weight 的尺寸不匹配:从检查点复制形状为 torch.Size([10, 1280]) 的参数,当前模型中的形状为 torch.Size([10, 768])。

以前这个报错是因为什么原因,怎么调整

您好我也出现这种情况,您是怎么解决的 image

chichixdf commented 2 years ago

这里没有考虑输入小于pad_size填充的情况吗

s348268281 commented 2 years ago

当有超过2位的连续数字字母时,就会报以下错 RuntimeError: The size of tensor a (25) must match the size of tensor b (26) at non-singleton dimension 3 请问该如何解决

self.class_list = [str(i) for i in range(11)] # 类别名单改一下就可以了。

想问下类别名单self.class_list改一下是指什么意思

self.class_list = [str(i) for i in range(len(key))] # 类别名单 把 11改成类型的大小

大佬,这里按最新代码跑的,类别是key的长度,但是还是报这个错,只要预测输入的文本含有字母和英文就会报错 RuntimeError: The size of tensor a (14) must match the size of tensor b (21) at non-singleton dimension 3,请问是什么原因?

s348268281 commented 2 years ago

当有超过2位的连续数字字母时,就会报以下错 RuntimeError: The size of tensor a (25) must match the size of tensor b (26) at non-singleton dimension 3 请问该如何解决

老哥解决了吗请问?

s348268281 commented 2 years ago

当有超过2位的连续数字字母时,就会报以下错 RuntimeError: The size of tensor a (25) must match the size of tensor b (26) at non-singleton dimension 3 请问该如何解决

self.class_list = [str(i) for i in range(11)] # 类别名单改一下就可以了。

想问下类别名单self.class_list改一下是指什么意思

self.class_list = [str(i) for i in range(len(key))] # 类别名单 把 11改成类型的大小

大佬,这里按最新代码跑的,类别是key的长度,但是还是报这个错,只要预测输入的文本含有字母和英文就会报错 RuntimeError: The size of tensor a (14) must match the size of tensor b (21) at non-singleton dimension 3,请问是什么原因?

抱歉打扰了,解决了,torch.tensor([token_ids], token = self.tokenizer.tokenize(lin)这里要是英文的话会根据单词分词,pad_size = len(lin)算的是纯字符串长度,所以两个维度不一样。

hao-go commented 2 years ago

好人一生平安!

yuanphoenix commented 2 years ago

RuntimeError: The size of tensor a (14) must match the size of tensor b (21) at non-singleton dimension 3 报这种错误的把代码改成下面样子的就行。

 token = self.tokenizer.tokenize(lin)
        token = ['[CLS]'] + token
        token_ids = self.tokenizer.convert_tokens_to_ids(token)
        mask=[]
        if len(token) < pad_size:
            mask = [1] * len(token_ids) + [0] * (pad_size - len(token))
            token_ids += ([0] * (pad_size - len(token)))
        else:
            mask = [1] * pad_size
            token_ids = token_ids[:pad_size]
        return torch.tensor([token_ids], dtype=torch.long), torch.tensor([mask])
BGMYE commented 2 years ago

为什么每次预测的结果都不一样呢

tjctjc-hub commented 2 years ago

请问训练完的模型怎么使用呀?

fannanqi commented 2 years ago

berttokenizer这个模型有地址吗,推荐一下

youngwensi commented 2 years ago

为什么每次预测的结果都不一样呢

请问你这个问题解决了吗

BGMYE commented 2 years ago

没有...

shinian315 commented 1 year ago

为什么每次预测的结果都不一样呢

说明模型训练的有问题

Zzr-rr commented 10 months ago

对于出现连续几个数字会报错的原因,可以把pad_size计算放在tokenize之后.

def build_dataset(self, text): lin = text.strip()

pad_size = len(lin)

    token = self.tokenizer.tokenize(lin)
    pad_size = len(token)
    token = ['[CLS]'] + token
    token_ids = self.tokenizer.convert_tokens_to_ids(token)
    mask = [1] * pad_size
    token_ids = token_ids[:pad_size]
    return torch.tensor([token_ids], dtype=torch.long), torch.tensor([mask])
whyseu commented 2 months ago

如何指定某个GPU运行?

jtyoui commented 2 weeks ago

如何指定某个GPU运行?

import os

os.environ["CUDA_VISIBLE_DEVICES"] = "2"