Tongjilibo / bert4torch

An elegant PyTorch implementation of transformers
https://bert4torch.readthedocs.io/
MIT License

Inference labels are misaligned with the input characters, not sure where the problem is #131

Closed ykallan closed 1 year ago

ykallan commented 1 year ago

When asking a question, please provide as much of the following information as possible:

Basic information

Training code

#! -*- coding:utf-8 -*-
# BERT + CRF for named entity recognition
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1]  token_level: 97.06; entity_level: 95.90

import os
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.callbacks import Callback
from bert4torch.snippets import sequence_padding, ListDataset, seed_everything
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm

maxlen = 64
batch_size = 8
categories = ['O', 'B-BRAND', 'I-BRAND', 'B-MODEL', 'I-MODEL', 'B-SPECS', 'I-SPECS', 'B-COLOR', 'I-COLOR', 'B-NAME',
              'I-NAME']
categories_id2label = {i: k for i, k in enumerate(categories)}
categories_label2id = {k: i for i, k in enumerate(categories)}

# BERT base
config_path = './pertrained_model/bert_base_chinese/config.json'
checkpoint_path = './pertrained_model/bert_base_chinese/pytorch_model.bin'
dict_path = './pertrained_model/bert_base_chinese/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Fix the random seed
seed_everything(42)

# Load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        D = []
        with open(filename, encoding='utf-8') as f:
            f = f.read()
            for l in f.split('\n\n'):
                try:
                    if not l:
                        continue
                    d = ['']
                    for i, c in enumerate(l.split('\n')):
                        char, flag = c.split(' ')
                        d[0] += char
                        if flag[0] == 'B':
                            d.append([i, i, flag[2:]])
                        elif flag[0] == 'I':
                            d[-1][1] = i
                    D.append(d)
                except Exception as e:
                    continue
        return D

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

def collate_fn(batch):
    batch_token_ids, batch_labels = [], []
    for d in batch:
        tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
        mapping = tokenizer.rematch(d[0], tokens)
        start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
        end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
        token_ids = tokenizer.tokens_to_ids(tokens)
        labels = np.zeros(len(token_ids))
        for start, end, label in d[1:]:
            if start in start_mapping and end in end_mapping:
                start = start_mapping[start]
                end = end_mapping[end]
                labels[start] = categories_label2id['B-' + label]
                labels[start + 1:end + 1] = categories_label2id['I-' + label]
        batch_token_ids.append(token_ids)
        batch_labels.append(labels)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
    return batch_token_ids, batch_labels

# Convert the datasets into dataloaders
train_dataloader = DataLoader(MyDataset('./data/train.txt'), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(MyDataset('./data/test.txt'), batch_size=batch_size, collate_fn=collate_fn)

# Define the model structure on top of BERT
class Model(BaseModel):
    def __init__(self):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path,
                                            segment_vocab_size=0)
        self.fc = nn.Linear(768, len(categories))  # includes start/end
        self.crf = CRF(len(categories))

    def forward(self, token_ids):
        sequence_output = self.bert([token_ids])  # [btz, seq_len, hdsz]
        emission_score = self.fc(sequence_output)  # [btz, seq_len, tag_size]
        attention_mask = token_ids.gt(0).long()
        return emission_score, attention_mask

    def predict(self, token_ids):
        self.eval()
        with torch.no_grad():
            emission_score, attention_mask = self.forward(token_ids)
            best_path = self.crf.decode(emission_score, attention_mask)  # [btz, seq_len]
        return best_path

model = Model().to(device)

if os.path.exists("./best_model_crf.pt"):
    model.load_weights("best_model_crf.pt")
    print("加载权重,继续训练")

class Loss(nn.Module):
    def forward(self, outputs, labels):
        return model.crf(*outputs, labels)

def acc(y_pred, y_true):
    y_pred = y_pred[0]
    y_pred = torch.argmax(y_pred, dim=-1)
    acc = torch.sum(y_pred.eq(y_true)).item() / y_true.numel()
    return {'acc': acc}

# Multiple custom metrics are supported: metrics = ['accuracy', acc, {acc: acc}] all work
model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5), metrics=acc)

def evaluate(data):
    X, Y, Z = 1e-10, 1e-10, 1e-10
    X2, Y2, Z2 = 1e-10, 1e-10, 1e-10
    for token_ids, label in tqdm(data):
        scores = model.predict(token_ids)  # [btz, seq_len]
        attention_mask = label.gt(0)

        # token-level
        X += (scores.eq(label) * attention_mask).sum().item()
        Y += scores.gt(0).sum().item()
        Z += label.gt(0).sum().item()

        # entity-level
        entity_pred = trans_entity2tuple(scores)
        entity_true = trans_entity2tuple(label)
        X2 += len(entity_pred.intersection(entity_true))
        Y2 += len(entity_pred)
        Z2 += len(entity_true)
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    f2, precision2, recall2 = 2 * X2 / (Y2 + Z2), X2 / Y2, X2 / Z2
    return f1, precision, recall, f2, precision2, recall2

def trans_entity2tuple(scores):
    '''Convert the tensor into (sample id, start, end, entity type) tuples for metric computation
    '''
    batch_entity_ids = set()
    for i, one_samp in enumerate(scores):
        entity_ids = []
        for j, item in enumerate(one_samp):
            flag_tag = categories_id2label[item.item()]
            if flag_tag.startswith('B-'):  # B
                entity_ids.append([i, j, j, flag_tag[2:]])
            elif len(entity_ids) == 0:
                continue
            elif (len(entity_ids[-1]) > 0) and flag_tag.startswith('I-') and (flag_tag[2:] == entity_ids[-1][-1]):  # I
                entity_ids[-1][-2] = j
            elif len(entity_ids[-1]) > 0:
                entity_ids.append([])

        for i in entity_ids:
            if i:
                batch_entity_ids.add(tuple(i))
    return batch_entity_ids

class Evaluator(Callback):
    """评估与保存
    """

    def __init__(self):
        self.best_val_f1 = 0.

    def on_epoch_end(self, steps, epoch, logs=None):
        f1, precision, recall, f2, precision2, recall2 = evaluate(valid_dataloader)
        if f2 > self.best_val_f1:
            self.best_val_f1 = f2
            model.save_weights('best_model_crf.pt')
        print(f'[val-token  level] f1: {f1:.5f}, p: {precision:.5f} r: {recall:.5f}')
        print(
            f'[val-entity level] f1: {f2:.5f}, p: {precision2:.5f} r: {recall2:.5f} best_f1: {self.best_val_f1:.5f}\n')

if __name__ == '__main__':
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=20, steps_per_epoch=None, callbacks=[evaluator])

Inference code

#! -*- coding:utf-8 -*-
# BERT + CRF for named entity recognition
# Dataset: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# [valid_f1]  token_level: 97.06; entity_level: 95.90

import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from bert4torch.callbacks import Callback
from bert4torch.snippets import sequence_padding, ListDataset, seed_everything
from bert4torch.layers import CRF
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from tqdm import tqdm

maxlen = 64
batch_size = 8
categories = ['O', 'B-BRAND', 'I-BRAND', 'B-MODEL', 'I-MODEL', 'B-SPECS', 'I-SPECS', 'B-COLOR', 'I-COLOR', 'B-NAME',
              'I-NAME']
categories_id2label = {i: k for i, k in enumerate(categories)}
categories_label2id = {k: i for i, k in enumerate(categories)}

# BERT base
config_path = './pertrained_model/bert_base_chinese/config.json'
checkpoint_path = './pertrained_model/bert_base_chinese/pytorch_model.bin'
dict_path = './pertrained_model/bert_base_chinese/vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Fix the random seed
seed_everything(42)

# Load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filename):
        D = []
        with open(filename, encoding='utf-8') as f:
            f = f.read()
            for l in f.split('\n\n'):
                try:
                    if not l:
                        continue
                    d = ['']
                    for i, c in enumerate(l.split('\n')):
                        char, flag = c.split(' ')
                        d[0] += char
                        if flag[0] == 'B':
                            d.append([i, i, flag[2:]])
                        elif flag[0] == 'I':
                            d[-1][1] = i
                    D.append(d)
                except Exception as e:
                    continue
        return D

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

def collate_fn(batch):
    batch_token_ids, batch_labels = [], []
    for d in batch:
        tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
        mapping = tokenizer.rematch(d[0], tokens)
        start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
        end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
        token_ids = tokenizer.tokens_to_ids(tokens)
        labels = np.zeros(len(token_ids))
        for start, end, label in d[1:]:
            if start in start_mapping and end in end_mapping:
                start = start_mapping[start]
                end = end_mapping[end]
                labels[start] = categories_label2id['B-' + label]
                labels[start + 1:end + 1] = categories_label2id['I-' + label]
        batch_token_ids.append(token_ids)
        batch_labels.append(labels)
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(sequence_padding(batch_labels), dtype=torch.long, device=device)
    return batch_token_ids, batch_labels

# Define the model structure on top of BERT
class Model(BaseModel):
    def __init__(self):
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path,
                                            segment_vocab_size=0)
        self.fc = nn.Linear(768, len(categories))  # includes start/end
        self.crf = CRF(len(categories))

    def forward(self, token_ids):
        sequence_output = self.bert([token_ids])  # [btz, seq_len, hdsz]
        emission_score = self.fc(sequence_output)  # [btz, seq_len, tag_size]
        attention_mask = token_ids.gt(0).long()
        return emission_score, attention_mask

    def predict(self, token_ids):
        self.eval()
        with torch.no_grad():
            emission_score, attention_mask = self.forward(token_ids)
            best_path = self.crf.decode(emission_score, attention_mask)  # [btz, seq_len]
        return best_path

model = Model().to(device)

model.load_weights("best_model_crf.pt")

class Loss(nn.Module):
    def forward(self, outputs, labels):
        return model.crf(*outputs, labels)

def acc(y_pred, y_true):
    y_pred = y_pred[0]
    y_pred = torch.argmax(y_pred, dim=-1)
    acc = torch.sum(y_pred.eq(y_true)).item() / y_true.numel()
    return {'acc': acc}

# Multiple custom metrics are supported: metrics = ['accuracy', acc, {acc: acc}] all work
model.compile(loss=Loss(), optimizer=optim.Adam(model.parameters(), lr=2e-5), metrics=acc)

def inference(text, maxlen=64, threshold=0):
    model.eval()
    text = text.replace(" ", "_").strip()
    token_ids, segment_ids = tokenizer.encode(text, maxlen=len(text))
    token_ids = torch.tensor(token_ids, dtype=torch.long, device=device)[None, :]
    segment_ids = torch.tensor(segment_ids, dtype=torch.long, device=device)[None, :]

    scores = model.predict(token_ids)
    scores = scores.cpu().numpy()
    print(scores)
    print(list(scores[0]))
    predict = list(scores[0])
    labels = [categories_id2label.get(x)[2:] for x in predict if categories_id2label.get(x)[2:] != "O"]

    print(labels)

    text = text.replace("_", " ")
    print("len of text=", len(text))
    print("len of labels=", len(labels))
    for char, label in zip(text, labels):
        print(char, label)

if __name__ == '__main__':
    while True:
        text = input("input text:")
        inference(text=text)

# 海斯迪克 HKL-186 食品留样盒包装盒幼儿园学校保险取样盒 六格连体(送30标签)

Output

# Paste your debug output here
input text:宏视道 HSD-AV-HDVI16F 无缝混合矩阵
[[0 1 2 2 0 3 4 4 4 4 4 4 4 4 0 0 0 0 0 0 0 0]]
[0, 1, 2, 2, 0, 3, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0]
['', 'BRAND', 'BRAND', 'BRAND', '', 'MODEL', 'MODEL', 'MODEL', 'MODEL', 'MODEL', 'MODEL', 'MODEL', 'MODEL', 'MODEL', '', '', '', '', '', '', '', '']
len of text= 25
len of labels= 22
宏 
视 BRAND
道 BRAND
  BRAND
H 
S MODEL
D MODEL
- MODEL
A MODEL
V MODEL
- MODEL
H MODEL
D MODEL
V MODEL
I 
1 
6 
F 

无 
缝 
混 

What I tried

The length of the string fed into the model differs from the number of predicted labels, and the two cannot be matched up one to one. I haven't found where exactly the problem is.
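
A minimal diagnostic sketch (not part of the original report, assuming the tokenizer, model and device defined in the inference script above) that narrows down where the lengths diverge:

# Hypothetical check: compare character count vs. token count vs. decoded tag count.
text = "宏视道 HSD-AV-HDVI16F 无缝混合矩阵".replace(" ", "_")
token_ids, _ = tokenizer.encode(text, maxlen=len(text))  # same call as in inference()
print(len(text), len(token_ids))                         # chars vs. token ids: [CLS]/[SEP] are added, Latin/digit runs may merge
scores = model.predict(torch.tensor([token_ids], dtype=torch.long, device=device))
print(scores.shape)                                      # one tag per token id, not per character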

ykallan commented 1 year ago

Below are samples from the training and test datasets

美 B-BRAND
汁 I-BRAND
源 I-BRAND
_ O
果 O
粒 O
橙 O
_ O
1 B-SPECS
. I-SPECS
2 I-SPECS
5 I-SPECS
l I-SPECS
/ I-SPECS
瓶 I-SPECS
_ O
1 O
2 O
瓶 O
/ O
箱 O
_ O
整 O
箱 O
销 O
售 O

万 B-BRAND
宝 I-BRAND
龙 I-BRAND
( O
M O
O O
N O
T O
_ O
B O
L O
A O
N O
C O
) O
大 O
班 O
系 O
列 O
1 B-MODEL
4 I-MODEL
5 I-MODEL
墨 O
水 O
笔 O

Tongjilibo commented 1 year ago

I looked at your code: when you build labels you filter out the 'O' tags, so as I understand it the labels list no longer has the same length as token_ids.

labels = [categories_id2label.get(x)[2:] for x in predict if categories_id2label.get(x)[2:] != "O"]

ykallan commented 1 year ago

I looked at your code: when you build labels you filter out the 'O' tags, so as I understand it the labels list no longer has the same length as token_ids.

labels = [categories_id2label.get(x)[2:] for x in predict if categories_id2label.get(x)[2:] != "O"]

I changed the inference function to:

def inference(text, maxlen=64, threshold=0):
    model.eval()
    text = text.replace(" ", "_").strip()
    token_ids, segment_ids = tokenizer.encode(text, maxlen=len(text))
    token_ids = torch.tensor(token_ids, dtype=torch.long, device=device)[None, :]
    segment_ids = torch.tensor(segment_ids, dtype=torch.long, device=device)[None, :]

    scores = model.predict(token_ids)
    scores = scores.cpu().numpy()
    print(scores)
    print(list(scores[0]))
    predict = list(scores[0])
    # labels = [categories_id2label.get(x)[2:] for x in predict if categories_id2label.get(x)[2:] != "O"]
    labels = []
    for x in predict:
        lab = categories_id2label.get(x)
        if len(lab) > 2:
            lab = lab[2:]
        labels.append(lab)

    print(labels)

    text = text.replace("_", " ")
    print("len of text=", len(text))
    print("len of labels=", len(labels))
    for char, label in zip(text, labels):
        print(char, label)

The output I get:

input text:海斯迪克 HKL-186 食品留样盒包装盒幼儿园学校保险取样盒 六格连体(送30标签)
[[0 1 2 2 2 0 3 4 4 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 5 6 6 6 6 6
  6 6 6 6 0]]
[0, 1, 2, 2, 2, 0, 3, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 0]
['O', 'BRAND', 'BRAND', 'BRAND', 'BRAND', 'O', 'MODEL', 'MODEL', 'MODEL', 'MODEL', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'SPECS', 'SPECS', 'SPECS', 'SPECS', 'SPECS', 'SPECS', 'SPECS', 'SPECS', 'SPECS', 'SPECS', 'O']
len of text= 43
len of labels= 41
海 O
斯 BRAND
迪 BRAND
克 BRAND
  BRAND
H O
K MODEL
L MODEL
- MODEL
1 MODEL
8 O
6 O
  O
食 O
品 O
留 O
样 O
盒 O
包 O
装 O
盒 O
幼 O
儿 O
园 O
学 O
校 O
保 O
险 O
取 O
样 O
盒 SPECS
  SPECS
六 SPECS
格 SPECS
连 SPECS
体 SPECS
( SPECS
送 SPECS
3 SPECS
0 SPECS
标 O

They still don't line up. I'm not sure whether adding the mask is what makes the text and the predicted label sequence have different lengths.

Tongjilibo commented 1 year ago

First compare whether token_ids and labels have the same length. I suspect the problem is in tokenization: don't pass maxlen, because [CLS] and [SEP] are added by default, so your call will probably truncate two characters.

token_ids, segment_ids = tokenizer.encode(text)
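
As a minimal check of that point (not from the thread, using the tokenizer defined above), the two calls can be compared directly; the [CLS]/[SEP] offset is always there, and whether characters actually get cut off depends on how many wordpieces the text maps to:

# Hypothetical comparison of the capped call used in inference() vs. the suggested call.
text = "海斯迪克 HKL-186 食品留样盒包装盒幼儿园学校保险取样盒 六格连体(送30标签)".replace(" ", "_")
ids_capped, _ = tokenizer.encode(text, maxlen=len(text))  # total length capped at len(text), may drop trailing tokens
ids_full, _ = tokenizer.encode(text)                      # [CLS] + all wordpieces + [SEP], nothing dropped
print(len(text), len(ids_capped), len(ids_full))
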
Tongjilibo commented 1 year ago

You'd better step through it in a debugger: by default the tokenized sequence is [CLS] + sentence + [SEP], so you need to account for that when you print the output.
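
In other words, the first and last decoded tags belong to [CLS] and [SEP] and can be dropped before aligning with the text. A minimal sketch (not from the thread; token_ids is assumed to be the [1, seq_len] tensor built in inference()):

# Sketch: strip the [CLS]/[SEP] positions from the decoded path.
best_path = model.predict(token_ids)          # [1, seq_len], first/last positions are [CLS]/[SEP]
tag_ids = best_path[0].cpu().numpy()[1:-1]    # tags for the real wordpieces only
tags = [categories_id2label[int(x)] for x in tag_ids]
# Note: these tags are per wordpiece, so a direct zip with the raw string can still
# drift wherever runs of Latin letters or digits are merged into one wordpiece.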

ykallan commented 1 year ago

You'd better step through it in a debugger: by default the tokenized sequence is [CLS] + sentence + [SEP], so you need to account for that when you print the output.

OK, thanks, I'll debug it and take a look.

feng-1985 commented 1 year ago

Use the mapping to map the predicted start/end positions back onto positions in the original text:

mapping = tokenizer.rematch(d[0], tokens)
start_mapping = {i: j[0] for i, j in enumerate(mapping) if j}
end_mapping = {i: j[-1] for i, j in enumerate(mapping) if j}
for index, start, end, label in trans_entity2tuple(scores):
    if start in start_mapping and end in end_mapping:
        start = start_mapping[start]
        end = end_mapping[end]
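
A fuller sketch of that idea (not from the thread; it assumes the tokenizer, model, trans_entity2tuple and categories_id2label defined earlier) that maps predicted entities back onto the raw input text:

def extract_entities(raw_text):
    text = raw_text.replace(" ", "_")             # same normalization as the training data
    tokens = tokenizer.tokenize(text)             # includes [CLS] and [SEP]
    mapping = tokenizer.rematch(text, tokens)     # token index -> character indices ([] for [CLS]/[SEP])
    start_mapping = {i: j[0] for i, j in enumerate(mapping) if j}
    end_mapping = {i: j[-1] for i, j in enumerate(mapping) if j}

    token_ids, _ = tokenizer.encode(text)
    token_ids = torch.tensor([token_ids], dtype=torch.long, device=device)
    scores = model.predict(token_ids)             # [1, seq_len] tag ids decoded by the CRF

    entities = []
    for _, start, end, label in trans_entity2tuple(scores):
        if start in start_mapping and end in end_mapping:
            char_start, char_end = start_mapping[start], end_mapping[end]
            entities.append((raw_text[char_start:char_end + 1], label))
    return entities

print(extract_entities("宏视道 HSD-AV-HDVI16F 无缝混合矩阵"))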