Closed: seeyourcell closed this issue 6 months ago
Change

meta_template = dict(
    round=[
        dict(role='HUMAN', begin='<|User|>:', end='\n'),
        dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
    ],
)

to

meta_template = dict(
    round=[
        dict(role="HUMAN", api_role="HUMAN"),
        dict(role="BOT", api_role="BOT", generate=True),
    ],
)
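For intuition, here is a minimal sketch, not OpenCompass's actual implementation, of what the two styles amount to: a begin/end meta_template describes how to stitch the raw prompt string together on the evaluator side, while api_role entries only tag each turn with a role and leave the concrete formatting to the backend, here TurboMind's built-in chat template. All names below are illustrative.

def render(meta_template, turns):
    # Stitch role-tagged turns into a raw prompt using begin/end markers.
    prompt = []
    for turn in turns:
        spec = next(r for r in meta_template['round'] if r['role'] == turn['role'])
        if spec.get('generate') and not turn.get('content'):
            # The generating turn contributes only its begin marker;
            # the model is expected to produce the rest.
            prompt.append(spec.get('begin', ''))
        else:
            prompt.append(spec.get('begin', '') + turn.get('content', '') + spec.get('end', ''))
    return ''.join(prompt)

meta_template = dict(round=[
    dict(role='HUMAN', begin='<|User|>:', end='\n'),
    dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
])
print(render(meta_template, [dict(role='HUMAN', content='Hi'), dict(role='BOT')]))
# -> <|User|>:Hi\n<|Bot|>:

With the api_role form there is no string assembly at this level; the tagged turns are handed to TurboMind, which applies its own llama2 chat template.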
from mmengine.config import read_base
from opencompass.models.turbomind import TurboMindModel
from opencompass.models.llama2 import Llama2, Llama2Chat
with read_base():
    from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
    from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
    from .datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
    from .datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_6dc406 import WSC_datasets
    from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
    from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
    from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
    from .datasets.race.race_gen_69ee4f import race_datasets
    from .datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets
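# Gather every imported *_datasets list above into a single flat list.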
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
meta_template = dict(
    round=[
        dict(role="HUMAN", api_role="HUMAN"),
        dict(role="BOT", api_role="BOT", generate=True),
    ],
)
models = [
    dict(
        type=TurboMindModel,
        abbr='internlm-llama2-7b-w4a16',
        path="/workspaceLlama4w16a_new",
        max_out_len=100,
        max_seq_len=2048,
        batch_size=16,
        concurrency=16,
        meta_template=meta_template,
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
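Assuming the config above is saved as, e.g., configs/eval_llama2_turbomind.py (an illustrative filename), it would be launched from the OpenCompass root with something like:

python run.py configs/eval_llama2_turbomind.py -w outputs/llama2_turbomind

where -w sets the working directory for results.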
@lin65505578 hi, I followed your instructions and tested some datasets with the llama2-7b-chat model in both the Facebook format and lmdeploy's turbomind format, but I couldn't reproduce the results from the opencompass website: https://opencompass.org.cn/model-detail/LLaMA-2-7B-Chat Is this normal? Is there any special setting needed when benchmarking llama2 with opencompass? BR
Here are my results on the Facebook format (the last column shows the results from the opencompass website):
dataset                            version   metric            mode   llama-2-7b-chat   opencompass website
---------------------------------  --------  ----------------  -----  ---------------   -------------------
--------- Exam ---------           -         -                 -      -                 -
ceval                              -         naive_average     gen    27.38             31.9
agieval                            -         naive_average     gen    26.32             28.5
mmlu                               -         naive_average     gen    30.68             46.2
GaokaoBench                        -         -                 -      -                 -
ARC-c                              -         -                 -      -                 -
--------- Language ---------       -         -                 -      -                 -
WiC                                -         -                 -      -                 -
summedits                          -         -                 -      -                 -
chid-dev                           -         -                 -      -                 -
afqmc-dev                          -         -                 -      -                 -
bustm-dev                          -         -                 -      -                 -
cluewsc-dev                        -         -                 -      -                 -
WSC                                -         -                 -      -                 -
winogrande                         -         -                 -      -                 -
flores_100                         -         -                 -      -                 -
--------- Knowledge ---------      -         -                 -      -                 -
BoolQ                              -         -                 -      -                 -
commonsense_qa                     -         -                 -      -                 -
nq                                 -         -                 -      -                 -
triviaqa                           2121ce    score             gen    42.62             46.4
--------- Reasoning ---------      -         -                 -      -                 -
cmnli                              -         -                 -      -                 -
ocnli                              -         -                 -      -                 -
ocnli_fc-dev                       -         -                 -      -                 -
AX_b                               -         -                 -      -                 -
AX_g                               -         -                 -      -                 -
CB                                 -         -                 -      -                 -
RTE                                -         -                 -      -                 -
story_cloze                        -         -                 -      -                 -
COPA                               -         -                 -      -                 -
ReCoRD                             -         -                 -      -                 -
hellaswag                          -         -                 -      -                 -
piqa                               -         -                 -      -                 -
siqa                               -         -                 -      -                 -
strategyqa                         -         -                 -      -                 -
math                               -         -                 -      -                 -
gsm8k                              1d7fe4    accuracy          gen    28.89             26.3
TheoremQA                          -         -                 -      -                 -
openai_humaneval                   8e312c    humaneval_pass@1  gen    5.49              12.2
mbpp                               -         -                 -      -                 -
bbh                                -         -                 -      -                 -
--------- Understanding ---------  -         -                 -      -                 -
C3                                 -         -                 -      -                 -
CMRC_dev                           -         -                 -      -                 -
DRCD_dev                           -         -                 -      -                 -
MultiRC                            -         -                 -      -                 -
race-middle                        -         -                 -      -                 -
race-high                          -         -                 -      -                 -
openbookqa_fact                    -         -                 -      -                 -
csl_dev                            -         -                 -      -                 -
lcsts                              -         -                 -      -                 -
Xsum                               -         -                 -      -                 -
eprstmt-dev                        -         -                 -      -                 -
lambada                            -         -                 -      -                 -
tnews-dev                          -         -                 -      -                 -
Here are my results on lmdeploy's turbomind format:
dataset                            version   metric            mode   llama2-chat-7b-turbomind
---------------------------------  --------  ----------------  -----  ------------------------
--------- Exam ---------           -         -                 -      -
ceval                              -         naive_average     gen    28.24
agieval                            -         naive_average     gen    26.72
mmlu                               -         naive_average     gen    35.41
GaokaoBench                        -         -                 -      -
ARC-c                              -         -                 -      -
--------- Language ---------       -         -                 -      -
WiC                                -         -                 -      -
summedits                          -         -                 -      -
chid-dev                           -         -                 -      -
afqmc-dev                          -         -                 -      -
bustm-dev                          -         -                 -      -
cluewsc-dev                        -         -                 -      -
WSC                                -         -                 -      -
winogrande                         -         -                 -      -
flores_100                         -         -                 -      -
--------- Knowledge ---------      -         -                 -      -
BoolQ                              -         -                 -      -
commonsense_qa                     -         -                 -      -
nq                                 -         -                 -      -
triviaqa                           2121ce    score             gen    42.83
--------- Reasoning ---------      -         -                 -      -
cmnli                              -         -                 -      -
ocnli                              -         -                 -      -
ocnli_fc-dev                       -         -                 -      -
AX_b                               -         -                 -      -
AX_g                               -         -                 -      -
CB                                 -         -                 -      -
RTE                                -         -                 -      -
story_cloze                        -         -                 -      -
COPA                               -         -                 -      -
ReCoRD                             -         -                 -      -
hellaswag                          -         -                 -      -
piqa                               -         -                 -      -
siqa                               -         -                 -      -
strategyqa                         -         -                 -      -
math                               -         -                 -      -
gsm8k                              1d7fe4    accuracy          gen    26.46
TheoremQA                          -         -                 -      -
openai_humaneval                   8e312c    humaneval_pass@1  gen    6.71
mbpp                               -         -                 -      -
bbh                                -         -                 -      -
--------- Understanding ---------  -         -                 -      -
C3                                 -         -                 -      -
CMRC_dev                           -         -                 -      -
DRCD_dev                           -         -                 -      -
MultiRC                            -         -                 -      -
race-middle                        -         -                 -      -
race-high                          -         -                 -      -
openbookqa_fact                    -         -                 -      -
csl_dev                            -         -                 -      -
lcsts                              -         -                 -      -
Xsum                               -         -                 -      -
eprstmt-dev                        -         -                 -      -
lambada                            -         -                 -      -
tnews-dev                          -         -                 -      -
Hi, we now support vLLM and LMDeploy in a simple way: you just need to set

--accelerator lmdeploy
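A full invocation might then look like the following (the model and dataset abbreviations are illustrative; --accelerator also accepts vllm):

python run.py --models hf_llama2_7b_chat --datasets gsm8k_gen --accelerator lmdeploy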
Feel free to reopen it if needed
Prerequisite
Type
I'm evaluating with the officially supported tasks/models/datasets.
Environment
python3 -m lmdeploy.serve.turbomind.deploy llama2 /models/llama-2-7b-chat
This generated the workspace folder. Which config file should I use? Is this supported now?
Reproduces the problem - code/configuration sample
python3 -m lmdeploy.serve.turbomind.deploy llama2 /models/llama-2-7b-chat
Reproduces the problem - command or script
python3 -m lmdeploy.serve.turbomind.deploy llama2 /models/llama-2-7b-chat
Reproduces the problem - error message
python3 -m lmdeploy.serve.turbomind.deploy llama2 /models/llama-2-7b-chat
Other information
No response