Closed LiuShixing closed 11 months ago
@LiuShixing Could you share your evaluation code so I can take a look? I have also run several of the LongBench tasks: the summarization datasets look fine, VCSUM (zh) comes in at around 16.6, and the results for NQ and TriviaQA are provided in #61. The current code does not yet include the window attention we tested internally, but the results still shouldn't be "rather poor".
These are my results on LongBench (https://github.com/THUDM/LongBench). They may contain errors and are for reference only; I hope the team can provide official evaluation code and results.
Code change: I commented out lines 76-79 of modeling_qwen.py, because my flash_attn version is 1.0.5 and these two imports fail: from flash_attn.layers.rotary import apply_rotary_emb_func and from flash_attn.ops.rms_norm import rms_norm
Location of the change: https://huggingface.co/Qwen/Qwen-7B-Chat/blob/main/modeling_qwen.py#L76
No other code or files were changed.
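Roughly, the change looks like this (a sketch only; it assumes the rest of modeling_qwen.py falls back to its pure-PyTorch code paths when these names are None, and the exact surrounding code may differ between revisions of the file):

# modeling_qwen.py, around the flash_attn imports -- disabled because
# flash_attn 1.0.5 does not provide these modules:
# from flash_attn.layers.rotary import apply_rotary_emb_func
# from flash_attn.ops.rms_norm import rms_norm
apply_rotary_emb_func = None
rms_norm = None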
The evaluation code is as follows:
import os
from datasets import load_dataset
import torch
import json
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
# The qwen directory contains the .py and .json files from https://huggingface.co/Qwen/Qwen-7B-Chat/tree/main
import sys
sys.path.append("/apdcephfs/private_shixingliu/project/Qwen")
from qwen.modeling_qwen import QWenLMHeadModel
from qwen.tokenization_qwen import QWenTokenizer
from qwen.configuration_qwen import QWenConfig
from transformers.generation import GenerationConfig
from transformers.trainer_utils import set_seed
from qwen.qwen_generation_utils import get_stop_words_ids, decode_tokens, make_context
# from dataset import LongBench
# This is the customized building prompt for chat models, here is an example for ChatGLM2
# def build_chat(tokenizer, prompt):
# return tokenizer.build_prompt(prompt)
def get_pred(model, tokenizer, data, max_length, input_max_length, max_gen, prompt_format, dataset, device):
    model.generation_config.max_context_size = max_length
    model.generation_config.max_generate_size = max_gen
    model.generation_config.max_new_tokens = max_gen
    model.generation_config.do_sample = False
    model.generation_config.num_beams = 1
    model.generation_config.temperature = 1.0
    stop_words_ids = get_stop_words_ids(model.generation_config.chat_format, tokenizer)
    preds = []
    i = 0
    for json_obj in tqdm(data):
        prompt = prompt_format.format(**json_obj)
        # truncate to fit max_length (we suggest truncate in the middle, since the left and right side may contain crucial instructions)
        tokenized_prompt = tokenizer(prompt, truncation=False, return_tensors="pt").input_ids[0]
        if len(tokenized_prompt) > input_max_length:
            half = int(input_max_length / 2)
            prompt = tokenizer.decode(tokenized_prompt[:half], skip_special_tokens=True) + tokenizer.decode(tokenized_prompt[-half:], skip_special_tokens=True)
        # if dataset not in ["lcc", "repobench-p", "trec", "nq", "triviaqa", "lsht"]:  # chat models are better off without build prompt on these tasks
        #     prompt = build_chat(tokenizer, prompt)
        # input = tokenizer(prompt, truncation=False, return_tensors="pt").to(device)
        raw_text, context_tokens = make_context(
            tokenizer,
            prompt,
            history=None,
            system="You are a helpful assistant.",
            max_window_size=input_max_length,
            chat_format=model.generation_config.chat_format,
        )
        input_ids = torch.tensor([context_tokens]).to(model.device)
        output = model.generate(
            input_ids,
            generation_config=model.generation_config,
            stop_words_ids=stop_words_ids,
        )[0]
        # pred = tokenizer.decode(output[context_length:], skip_special_tokens=True)
        pred = decode_tokens(
            output,
            tokenizer,
            raw_text_len=len(raw_text),
            context_length=len(context_tokens),
            chat_format=model.generation_config.chat_format,
            verbose=False,
        )
        preds.append({"pred": pred, "answers": json_obj["answers"], "all_classes": json_obj["all_classes"]})
        if i == 0:
            print(f"prompt:\n{raw_text}\npred:\n{pred}\nanswers:\n{json_obj['answers']}\n\n")
        i += 1
    return preds
def _load_model_tokenizer(model_path, seq_len):
    tokenizer = QWenTokenizer.from_pretrained(model_path)
    tokenizer.model_max_length = seq_len
    device_map = "auto"
    model = QWenLMHeadModel.from_pretrained(
        model_path,
        device_map=device_map,
        fp16=True,
        use_flash_attn=True,
        use_logn_attn=True,
        n_positions=seq_len,
        max_position_embeddings=seq_len,
    ).eval()
    model.generation_config = GenerationConfig.from_pretrained(model_path, trust_remote_code=True)
    tokenizer.eos_token_id = model.generation_config.eos_token_id
    return model, tokenizer
if __name__ == '__main__':
    datasets = ["hotpotqa", "2wikimqa", "musique", "dureader", "narrativeqa", "qasper", "multifieldqa_en", "multifieldqa_zh", "gov_report",
                "qmsum", "vcsum", "trec", "nq", "triviaqa", "lsht", "passage_count", "passage_retrieval_en", "passage_retrieval_zh", "lcc", "repobench-p"]
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    max_length = 20480
    input_max_length = max_length - 512
    # model_path contains the .bin, .py, .json, and qwen.tiktoken files
    model_path = "/dockerdata/Qwen-7B-Chat"
    model, tokenizer = _load_model_tokenizer(model_path, max_length)
    model = model.eval()
    # we design specific prompt format and max generation length for each task, feel free to modify them to optimize model output
    dataset2prompt = json.load(open("config/dataset2prompt.json", "r"))
    dataset2maxlen = json.load(open("config/dataset2maxlen.json", "r"))
    # predict on each dataset
    if not os.path.exists("pred"):
        os.makedirs("pred")
    for dataset in datasets:
        data = load_dataset('THUDM/LongBench', dataset, split='test')
        # # Loaded from a local copy due to network issues; with network access, use the line above.
        # # ----------------
        # data = LongBench(name=dataset)
        # splits = data._split_generators("")
        # data_generator = data._generate_examples(**splits[0].gen_kwargs)
        # data = [d for d in data_generator]
        # # ---------------
        prompt_format = dataset2prompt[dataset]
        max_gen = dataset2maxlen[dataset]
        preds = get_pred(model, tokenizer, data, max_length, input_max_length, max_gen, prompt_format, dataset, device)
        with open(f"pred/{dataset}.jsonl", "w") as f:
            for pred in preds:
                json.dump(pred, f)
                f.write('\n')
The scoring code is as follows:
import os
import json
from typing import List, Any, Dict, Callable
from tqdm import tqdm
from pyecharts.charts import Radar
from collections import OrderedDict, defaultdict
import numpy as np
import pandas as pd
from metrics import (
qa_f1_score,
rouge_zh_score,
qa_f1_zh_score,
rouge_score,
classification_score,
retrieval_score,
retrieval_zh_score,
count_score,
code_sim_score,
)
dataset2metric = {
"hotpotqa": qa_f1_score,
"2wikimqa": qa_f1_score,
"musique": qa_f1_score,
"dureader": rouge_zh_score,
"narrativeqa": qa_f1_score,
"qasper": qa_f1_score,
"multifieldqa_en": qa_f1_score,
"multifieldqa_zh": qa_f1_zh_score,
"gov_report": rouge_score,
"qmsum": rouge_score,
"vcsum": rouge_zh_score,
"trec": classification_score,
"nq": qa_f1_score,
"triviaqa": qa_f1_score,
"lsht": classification_score,
"passage_retrieval_en": retrieval_score,
"passage_retrieval_zh": retrieval_zh_score,
"passage_count": count_score,
"passkey_retrieval_zh": retrieval_zh_score,
"lcc": code_sim_score,
"repobench-p": code_sim_score,
}
categorys = ["单文档QA", "多文档QA", "摘要", "Few-shot学习", "代码补全", "合成任务"]
en_dataset2category = {
"narrativeqa": ["NarrativeQA", "单文档QA"],
"qasper": ["Qasper", "单文档QA"],
"multifieldqa_en": ["MultiFieldQA-en", "单文档QA"],
"hotpotqa": ["HotpotQA", "多文档QA"],
"2wikimqa": ["2WikiMQA", "多文档QA"],
"musique": ["Musique", "多文档QA"],
"gov_report": ["GovReport", "摘要"],
"qmsum": ["QMSum", "摘要"],
"trec": ["TREC", "Few-shot学习"],
"nq": ["NQ", "Few-shot学习"],
"triviaqa": ["TriviaQA", "Few-shot学习"],
"lcc": ["LCC", "代码补全"],
"repobench-p": ["RepoBench-P", "代码补全"],
"passage_retrieval_en": ["PassageRetrieval-en", "合成任务"],
"passage_count": ["Passage Count", "合成任务"],
}
zh_dataset2category = {
"multifieldqa_zh": ["MultiFieldQA-zh", "单文档QA"],
"dureader": ["DuReader (zh)", "多文档QA"],
"vcsum": ["VCSUM (zh)", "摘要"],
"lsht": ["LSHT (zh)", "Few-shot学习"],
"lcc": ["LCC", "代码补全"],
"repobench-p": ["RepoBench-P", "代码补全"],
"passage_retrieval_zh": ["PassageRetrieval-zh", "合成任务"],
}
all_dataset2category = {
"narrativeqa": ["NarrativeQA", "单文档QA"],
"qasper": ["Qasper", "单文档QA"],
"multifieldqa_en": ["MultiFieldQA-en", "单文档QA"],
"multifieldqa_zh": ["MultiFieldQA-zh", "单文档QA"],
"hotpotqa": ["HotpotQA", "多文档QA"],
"2wikimqa": ["2WikiMQA", "多文档QA"],
"musique": ["Musique", "多文档QA"],
"dureader": ["DuReader (zh)", "多文档QA"],
"gov_report": ["GovReport", "摘要"],
"qmsum": ["QMSum", "摘要"],
"vcsum": ["VCSUM (zh)", "摘要"],
"trec": ["TREC", "Few-shot学习"],
"nq": ["NQ", "Few-shot学习"],
"triviaqa": ["TriviaQA", "Few-shot学习"],
"lsht": ["LSHT (zh)", "Few-shot学习"],
"lcc": ["LCC", "代码补全"],
"repobench-p": ["RepoBench-P", "代码补全"],
"passage_retrieval_en": ["PassageRetrieval-en", "合成任务"],
"passage_count": ["Passage Count", "合成任务"],
"passage_retrieval_zh": ["PassageRetrieval-zh", "合成任务"],
}
zh_gpt35_scores = OrderedDict({
"榜单": "中文榜单",
"模型": "GPT-3.5-Turbo-16k",
"Avg": 44.5,
"单文档QA": 61.2,
"多文档QA": 28.7,
"摘要": 16.0,
"Few-shot学习": 29.2,
"代码补全": 54.5,
"合成任务": 77.5,
})
zh_chatglm2_6b_scores = OrderedDict({
"榜单": "中文榜单",
"模型": "ChatGLM2-6B-32k",
"Avg": 41.3,
"单文档QA": 52.0,
"多文档QA": 34.3,
"摘要": 16.3,
"Few-shot学习": 29.9,
"代码补全": 52.7,
"合成任务": 62.5,
})
zh_LongChat_7B_16k_scores = OrderedDict({
"榜单": "中文榜单",
"模型": "LongChat-7B-16k",
"Avg": 23.7,
"单文档QA": 26.6,
"多文档QA": 19.1,
"摘要": 14.0,
"Few-shot学习": 20.8,
"代码补全": 57.0,
"合成任务": 4.8,
})
en_gpt35_scores = OrderedDict({
"榜单": "英文榜单",
"模型": "GPT-3.5-Turbo-16k",
"Avg": 45.5,
"单文档QA": 39.8,
"多文档QA": 38.7,
"摘要": 26.5,
"Few-shot学习": 76.0,
"代码补全": 54.5,
"合成任务": 37.8,
})
en_chatglm2_6b_scores = OrderedDict({
"榜单": "英文榜单",
"模型": "ChatGLM2-6B-32k",
"Avg": 42.7,
"单文档QA": 32.8,
"多文档QA": 34.0,
"摘要": 28.6,
"Few-shot学习": 68.1,
"代码补全": 52.7,
"合成任务": 39.8,
})
en_LongChat_7B_16k_scores = OrderedDict({
"榜单": "英文榜单",
"模型": "LongChat-7B-16k",
"Avg": 33.7,
"单文档QA": 29.3,
"多文档QA": 16.1,
"摘要": 25.8,
"Few-shot学习": 59.9,
"代码补全": 57.0,
"合成任务": 14.2,
})
def scorer(dataset, predictions, answers, all_classes):
    total_score = 0.
    for (prediction, ground_truths) in zip(predictions, answers):
        score = 0.
        for ground_truth in ground_truths:
            score = max(score, dataset2metric[dataset](prediction, ground_truth, all_classes=all_classes))
        total_score += score
    return round(100 * total_score / len(predictions), 2)
def radar(data, labels, categories, save_file):
    from pyecharts.charts import Radar
    from pyecharts import options as opts
    # Data preparation (example inputs)
    # data = [
    #     [90, 80, 70, 60, 50],
    #     [80, 70, 60, 50, 40],
    #     [70, 60, 50, 40, 30],
    #     [60, 50, 40, 30, 20]
    # ]
    # labels = ["模型1", "模型2", "模型3", "模型4"]
    # categories = ["指标1", "指标2", "指标3", "指标4", "指标5"]
    colors = ["red", "blue", "green", "yellow"]
    # Create the radar chart
    radar = Radar()
    radar.add_schema(
        schema=[
            opts.RadarIndicatorItem(name=category, max_=100) for category in categories
        ]
    )
    for i, label in enumerate(labels):
        radar.add(
            series_name=label,
            data=[data[i]],
            linestyle_opts=opts.LineStyleOpts(width=2),
            areastyle_opts=opts.AreaStyleOpts(opacity=0.3, color=colors[i]),
        )
    # Chart options
    radar.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    radar.set_global_opts(
        title_opts=opts.TitleOpts(title="模型评分雷达图"),
        legend_opts=opts.LegendOpts(),
    )
    # Render the chart
    radar.render(save_file)
def plot_radar(save_dir, scores, model_name):
    df_data = []
    for name in ["en", "zh", "all"]:
        dataset2category = eval(f"{name}_dataset2category")
        cate2score = OrderedDict({k: [] for k in ["榜单", "模型", "Avg"] + categorys})
        for dataset, score in scores.items():
            if dataset not in dataset2category:
                # print(f"skip {dataset} for {name}")
                continue
            cate2score[dataset2category[dataset][1]].append(score)
        all_type_scores = []
        for c in cate2score.keys():
            if c in ["榜单", "模型", "Avg"]:
                continue
            cate2score[c] = float(np.mean(cate2score[c]))
            all_type_scores.append(cate2score[c])
        avg_score = float(np.mean(all_type_scores))
        print(f"{name} avg_score {avg_score}")
        cate2score["模型"] = model_name
        cate2score["Avg"] = avg_score
        if name == "en":
            cate2score["榜单"] = "英文榜单"
        elif name == "zh":
            cate2score["榜单"] = "中文榜单"
        else:
            cate2score["榜单"] = "中英榜单"
        radar_data = []
        labels = []
        if name != "all":
            baseline1 = eval(f"{name}_gpt35_scores")
            df_data.append(baseline1)
            baseline2 = eval(f"{name}_chatglm2_6b_scores")
            df_data.append(baseline2)
            baseline3 = eval(f"{name}_LongChat_7B_16k_scores")
            df_data.append(baseline3)
            labels.append(baseline1["模型"])
            labels.append(baseline2["模型"])
            labels.append(baseline3["模型"])
            radar_data.append([v for k, v in baseline1.items() if k not in ["榜单", "模型"]])
            radar_data.append([v for k, v in baseline2.items() if k not in ["榜单", "模型"]])
            radar_data.append([v for k, v in baseline3.items() if k not in ["榜单", "模型"]])
        df_data.append(cate2score)
        labels.append(cate2score["模型"])
        radar_data.append([v for k, v in cate2score.items() if k not in ["榜单", "模型"]])
        # "radar_chart.html"
        radar(radar_data, labels, categorys, f"{save_dir}/{name}_radar_chart.html")
    df = pd.DataFrame(df_data)
    df.to_excel(f'{save_dir}/longbench.xlsx', index=False)
if __name__ == '__main__':
    scores = dict()
    pred_dir = "pred"
    save_dir = "result"
    os.makedirs(save_dir, exist_ok=True)
    all_files = os.listdir(pred_dir)
    for filename in all_files:
        predictions, answers = [], []
        dataset = filename.split('.')[0]
        with open(f"{pred_dir}/{filename}", "r") as f:
            for line in f:
                data = json.loads(line)
                predictions.append(data["pred"])
                answers.append(data["answers"])
                all_classes = data["all_classes"]
        score = scorer(dataset, predictions, answers, all_classes)
        scores[dataset] = score
    with open(f"{save_dir}/result.json", "w") as f:
        json.dump(scores, f, ensure_ascii=False, indent=4)
    plot_radar(save_dir, scores, model_name="Qwen-7B-20k")
@LiuShixing Sorry for the late reply. Your single-document QA, multi-document QA, and summarization results are close to what we measured internally. Because we use window attention internally, our numbers are somewhat higher: 29.9 for single-document QA, 16.0 for multi-document QA, and 23.0 for summarization. The few-shot scores are low because LongBench truncates the middle of the input while keeping the beginning and end, which is unfriendly to models with a context length below 8K and leaves the few-shot examples incomplete; after changing the input to truncate from the front we obtained reasonable scores. As for code completion and the synthetic tasks, our results are also quite low, and we have not yet found the cause.
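For illustration, a minimal sketch of the two truncation strategies (the helper name truncate_prompt is mine, not from the thread; it uses the same tokenizer call as the evaluation script above):

def truncate_prompt(tokenizer, prompt, input_max_length, mode="middle"):
    # Sketch only: "middle" is the LongBench default (keep head and tail, drop the middle,
    # which can cut few-shot examples in half); "front" drops the beginning and keeps the
    # tail, which usually holds the question and the most recent few-shot examples.
    ids = tokenizer(prompt, truncation=False, return_tensors="pt").input_ids[0]
    if len(ids) <= input_max_length:
        return prompt
    if mode == "middle":
        half = input_max_length // 2
        return (tokenizer.decode(ids[:half], skip_special_tokens=True)
                + tokenizer.decode(ids[-half:], skip_special_tokens=True))
    # "front" truncation: keep only the last input_max_length tokens
    return tokenizer.decode(ids[-input_max_length:], skip_special_tokens=True)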
Thanks for the reply.
How much GPU memory does Qwen-7B-20K need at inference time? I get an out-of-memory error right away on a 40GB GPU.
I ran inference on 8x40GB GPUs; I'd estimate a single 40GB card can handle at most about 6k.
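For what it's worth, a minimal sketch of sharding the model across several 40GB cards via the device_map/max_memory mechanism in transformers (the memory caps and the 8-GPU count are illustrative, matching my setup):

model = QWenLMHeadModel.from_pretrained(
    model_path,
    device_map="auto",
    max_memory={i: "38GiB" for i in range(8)},  # leave headroom for KV cache at ~20k tokens
    fp16=True,
    use_flash_attn=True,
    use_logn_attn=True,
).eval()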
A question: how many files should the prediction step generate? I got 12 here, but after running the scoring script many metrics are missing, and I can't tell where it went wrong.
Regarding generation_config.json's max_context_size, config.json's n_positions and seq_length, and tokenizer_config.json's model_max_length: how should these parameters be set?
generation_config.json's max_context_size: change it as needed. config.json's n_positions and seq_length: no change required. tokenizer_config.json's model_max_length: change it as needed; it only affects the length of the id sequences the tokenizer produces when padding=True.
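For example, a minimal sketch of setting these at load time rather than editing the JSON files, assuming model, tokenizer, and model_path were created as in the evaluation script above (the 16k value is just a placeholder):

from transformers.generation import GenerationConfig

model.generation_config = GenerationConfig.from_pretrained(model_path, trust_remote_code=True)
model.generation_config.max_context_size = 16384  # generation_config.json: change as needed
tokenizer.model_max_length = 16384                # tokenizer_config.json: only affects padding=True outputs
# config.json's n_positions / seq_length are left at their defaults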
Is it feasible to change it to 16k?
Once the length exceeds 8192 it doesn't seem to run on a T4 card anymore...
起始日期 | Start Date
No response
实现PR | Implementation PR
No response
相关Issues | Reference Issues
No response
摘要 | Summary
The section at https://huggingface.co/Qwen/Qwen-7B-Chat#%E9%95%BF%E5%BA%8F%E5%88%97%E8%AF%84%E6%B5%8B%EF%BC%88long-context-understanding%EF%BC%89 describes how to use long inputs, but how should the several length-related parameters in the code and config files be set? For example: generation_config.json's max_context_size, config.json's n_positions and seq_length, and tokenizer_config.json's model_max_length.
When I ran Qwen-7B-Chat I set use_dynamic_ntk=true and use_logn_attn=true, and set n_positions and model_max_length to 16k. Inference works, and I ran Tsinghua's LongBench evaluation, but the results look rather poor, so I'm not sure whether I'm using it incorrectly. A detailed guide to long-input configuration would be appreciated.
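For reference, a minimal sketch of the setup described above (flag names follow the Qwen model card; the 16k values are the ones I used and may need adjusting):

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig

model_path = "Qwen/Qwen-7B-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
tokenizer.model_max_length = 16384

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    trust_remote_code=True,
    fp16=True,
    use_dynamic_ntk=True,   # dynamic NTK scaling for long inputs
    use_logn_attn=True,     # logn attention scaling for long inputs
).eval()
model.generation_config = GenerationConfig.from_pretrained(model_path, trust_remote_code=True)
model.generation_config.max_context_size = 16384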
基本示例 | Basic Example
None
缺陷 | Drawbacks
None
未解决问题 | Unresolved questions
No response