I suggest first getting familiar with how the tokenizer in transformers is used.
from chinesebert import ChineseBertTokenizerFast
tokenizer = ChineseBertTokenizerFast.from_pretrained("junnyu/ChineseBERT-base")
text_a = "我是谁?"
text_b = "我来自哪里?"
tokenized_data = tokenizer(text_a, text_b, return_tensors="pt")
print(tokenized_data["input_ids"])
print(tokenized_data["pinyin_ids"].reshape(1,-1, 8))
# tensor([[ 101, 2769, 3221, 6443, 8043, 102, 2769, 3341, 5632, 1525, 7027, 8043,
# 102]])
# tensor([[[ 0, 0, 0, 0, 0, 0, 0, 0],
# [28, 20, 3, 0, 0, 0, 0, 0],
# [24, 13, 14, 4, 0, 0, 0, 0],
# [24, 13, 10, 14, 2, 0, 0, 0],
# [ 0, 0, 0, 0, 0, 0, 0, 0],
# [ 0, 0, 0, 0, 0, 0, 0, 0],
# [28, 20, 3, 0, 0, 0, 0, 0],
# [17, 6, 14, 2, 0, 0, 0, 0],
# [31, 14, 4, 0, 0, 0, 0, 0],
# [19, 6, 3, 0, 0, 0, 0, 0],
# [17, 14, 3, 0, 0, 0, 0, 0],
# [ 0, 0, 0, 0, 0, 0, 0, 0],
# [ 0, 0, 0, 0, 0, 0, 0, 0]]])
This is how to tokenize a batch:
from chinesebert import ChineseBertTokenizerFast
tokenizer = ChineseBertTokenizerFast.from_pretrained("junnyu/ChineseBERT-base")
text_a = ["我是谁?", "今天的天气怎么样?"]
text_b = ["我来自哪里?", "今天是个好天气,一起出去玩?"]
tokenized_data = tokenizer(text_a, text_b, return_tensors="pt", padding=True)
print(tokenized_data["input_ids"])
print(tokenized_data["pinyin_ids"].reshape(tokenized_data["input_ids"].shape[0],-1, 8))
# tensor([[ 101, 2769, 3221, 6443, 8043, 102, 2769, 3341, 5632, 1525, 7027, 8043,
# 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
# 0, 0],
# [ 101, 791, 1921, 4638, 1921, 3698, 2582, 720, 3416, 8043, 102, 791,
# 1921, 3221, 702, 1962, 1921, 3698, 8024, 671, 6629, 1139, 1343, 4381,
# 8043, 102]])
# tensor([[[ 0, 0, 0, 0, 0, 0, 0, 0],
# [28, 20, 3, 0, 0, 0, 0, 0],
# [24, 13, 14, 4, 0, 0, 0, 0],
# [24, 13, 10, 14, 2, 0, 0, 0],
# [ 0, 0, 0, 0, 0, 0, 0, 0],
# [ 0, 0, 0, 0, 0, 0, 0, 0],
# [28, 20, 3, 0, 0, 0, 0, 0],
# [17, 6, 14, 2, 0, 0, 0, 0],
# [31, 14, 4, 0, 0, 0, 0, 0],
# [19, 6, 3, 0, 0, 0, 0, 0],
# [17, 14, 3, 0, 0, 0, 0, 0],
# [ 0, 0, 0, 0, 0, 0, 0, 0],
# [ 0, 0, 0, 0, 0, 0, 0, 0],
# [ 0, 0, 0, 0, 0, 0, 0, 0],
# [ 0, 0, 0, 0, 0, 0, 0, 0],
# [ 0, 0, 0, 0, 0, 0, 0, 0],
# [ 0, 0, 0, 0, 0, 0, 0, 0],
# [ 0, 0, 0, 0, 0, 0, 0, 0],
# [ 0, 0, 0, 0, 0, 0, 0, 0],
# [ 0, 0, 0, 0, 0, 0, 0, 0],
# [ 0, 0, 0, 0, 0, 0, 0, 0],
# [ 0, 0, 0, 0, 0, 0, 0, 0],
# [ 0, 0, 0, 0, 0, 0, 0, 0],
# [ 0, 0, 0, 0, 0, 0, 0, 0],
# [ 0, 0, 0, 0, 0, 0, 0, 0],
# [ 0, 0, 0, 0, 0, 0, 0, 0]],
# [[ 0, 0, 0, 0, 0, 0, 0, 0],
# [15, 14, 19, 1, 0, 0, 0, 0],
# [25, 14, 6, 19, 1, 0, 0, 0],
# [ 9, 10, 5, 0, 0, 0, 0, 0],
# [25, 14, 6, 19, 1, 0, 0, 0],
# [22, 14, 4, 0, 0, 0, 0, 0],
# [31, 10, 19, 3, 0, 0, 0, 0],
# [18, 10, 5, 0, 0, 0, 0, 0],
# [30, 6, 19, 12, 4, 0, 0, 0],
# [ 0, 0, 0, 0, 0, 0, 0, 0],
# [ 0, 0, 0, 0, 0, 0, 0, 0],
# [15, 14, 19, 1, 0, 0, 0, 0],
# [25, 14, 6, 19, 1, 0, 0, 0],
# [24, 13, 14, 4, 0, 0, 0, 0],
# [12, 10, 4, 0, 0, 0, 0, 0],
# [13, 6, 20, 3, 0, 0, 0, 0],
# [25, 14, 6, 19, 1, 0, 0, 0],
# [22, 14, 4, 0, 0, 0, 0, 0],
# [ 0, 0, 0, 0, 0, 0, 0, 0],
# [30, 14, 4, 0, 0, 0, 0, 0],
# [22, 14, 3, 0, 0, 0, 0, 0],
# [ 8, 13, 26, 1, 0, 0, 0, 0],
# [22, 26, 4, 0, 0, 0, 0, 0],
# [28, 6, 19, 2, 0, 0, 0, 0],
# [ 0, 0, 0, 0, 0, 0, 0, 0],
# [ 0, 0, 0, 0, 0, 0, 0, 0]]])
I'd like to ask: if a single sequence needs multiple CLS and SEP tokens, for example [CLS]我是谁[SEP]我是谁[SEP]我是谁[SEP], is there a way to handle that?
Tokenize each sentence separately, then combine the pieces manually yourself.
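A minimal sketch of that manual combination (this is my own assumption of how to stitch the pieces together, not the library's official API: it assumes the fast tokenizer honours add_special_tokens=False and returns pinyin_ids as a flat Python list with 8 ids per token and an all-zero row for special tokens, as the outputs above suggest):

import torch
from chinesebert import ChineseBertTokenizerFast

tokenizer = ChineseBertTokenizerFast.from_pretrained("junnyu/ChineseBERT-base")
sentences = ["我是谁", "我是谁", "我是谁"]

cls_id, sep_id = tokenizer.cls_token_id, tokenizer.sep_token_id
zero_pinyin = [0] * 8  # special tokens appear to get 8 zero pinyin ids

# Start with [CLS], then append each sentence followed by [SEP].
input_ids = [cls_id]
pinyin_ids = list(zero_pinyin)
for sent in sentences:
    enc = tokenizer(sent, add_special_tokens=False)
    input_ids += enc["input_ids"] + [sep_id]
    pinyin_ids += enc["pinyin_ids"] + zero_pinyin

input_ids = torch.tensor([input_ids])
pinyin_ids = torch.tensor([pinyin_ids])
print(input_ids)
print(pinyin_ids.reshape(1, -1, 8))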
OK, thank you very much!
I found that the following also produces a result, though I'm not sure whether it is correct; you can verify it yourself. Note that this approach cannot produce correct token_type_ids; it only yields all-zero token_type_ids.
text = "我是谁?[SEP]我叫什么?[SEP]我来自哪里?"
tokenized_data = tokenizer(text, return_tensors="pt", padding=True)
print(tokenized_data["input_ids"])
print(tokenized_data["pinyin_ids"].reshape(tokenized_data["input_ids"].shape[0],-1, 8))
# tensor([[ 101, 2769, 3221, 6443, 8043, 102, 2769, 1373, 784, 720, 8043, 102,
# 2769, 3341, 5632, 1525, 7027, 8043, 102]])
# tensor([[[ 0, 0, 0, 0, 0, 0, 0, 0],
# [28, 20, 3, 0, 0, 0, 0, 0],
# [24, 13, 14, 4, 0, 0, 0, 0],
# [24, 13, 10, 14, 2, 0, 0, 0],
# [ 0, 0, 0, 0, 0, 0, 0, 0],
# [ 0, 0, 0, 0, 0, 0, 0, 0],
# [28, 20, 3, 0, 0, 0, 0, 0],
# [15, 14, 6, 20, 4, 0, 0, 0],
# [24, 13, 10, 19, 2, 0, 0, 0],
# [18, 10, 5, 0, 0, 0, 0, 0],
# [ 0, 0, 0, 0, 0, 0, 0, 0],
# [ 0, 0, 0, 0, 0, 0, 0, 0],
# [28, 20, 3, 0, 0, 0, 0, 0],
# [17, 6, 14, 2, 0, 0, 0, 0],
# [31, 14, 4, 0, 0, 0, 0, 0],
# [19, 6, 3, 0, 0, 0, 0, 0],
# [17, 14, 3, 0, 0, 0, 0, 0],
# [ 0, 0, 0, 0, 0, 0, 0, 0],
# [ 0, 0, 0, 0, 0, 0, 0, 0]]])
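If correct token_type_ids are needed, one possible workaround is to rebuild them from the [SEP] positions. A minimal sketch, assuming the usual two-segment BERT convention (everything up to and including the first [SEP] is segment 0, the rest segment 1); rebuild_token_type_ids is just a hypothetical helper name, and padding positions are not handled specially here:

import torch

def rebuild_token_type_ids(input_ids, sep_id=102):
    # Everything after the first [SEP] is marked as segment 1.
    token_type_ids = torch.zeros_like(input_ids)
    for row, ids in enumerate(input_ids):
        sep_positions = (ids == sep_id).nonzero(as_tuple=True)[0]
        if len(sep_positions) > 0:
            first_sep = sep_positions[0].item()
            token_type_ids[row, first_sep + 1:] = 1
    return token_type_ids

tokenized_data["token_type_ids"] = rebuild_token_type_ids(tokenized_data["input_ids"])
print(tokenized_data["token_type_ids"])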
How should the case where the input is a list be handled? For example, when doing MRC, special tokens need to be inserted.