NTK长度外推， - Githubissues

提交前必须检查以下项目

[X] 请确保使用的是仓库最新代码（git pull），一些问题已被解决和修复。
[X] 我已阅读项目文档和FAQ章节并且已在Issue中对问题进行了搜索，没有找到相似问题和解决方案。
[X] 第三方插件问题：例如llama.cpp、LangChain、text-generation-webui等，同时建议到对应的项目中查找解决方案。

问题类型

模型推理

基础模型

Others

操作系统

macOS

详细描述问题

from datasets import load_dataset
import torch
import random
import numpy as np
import json
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.utils import GenerationConfig
from tqdm import tqdm
import os
import argparse
import sys

parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
# from attn_and_long_ctx_patches import apply_attention_patch, apply_ntk_scaling_patch

dir_path = os.path.dirname(os.path.realpath(__file__))

# 模型地址
model_path = r'/mnt/data/extend_length/Baichuan2_7B_Chat'

# 数据地址
file_path = r'/mnt/data/extend_length/longbench_data/data/'

# 百川2的prompt模板(百川会自动帮你添加user_token_id和assistant_token_id)
DEFAULT_SYSTEM_PROMPT = """You are a helpful assistant. 你是一个乐于助人的助手。"""

"""
参数说明
--model_path ${model_path}：待评测模型所在目录（完整的Chinese-LLaMA-2或Chinese-Alpaca-2模型，非LoRA）
--predict_on {data_class}: 指定待预测的任务，可以为en，zh，code，或它们的组合，以逗号分隔，如en,zh,code
--output_dir ${output_dir}：评测结果的输出目录
--max_length ${max_length}：指令的最大长度。注意此长度不包括system prompt以及任务相关prompt在内。
--gpus ${gpus}：如需指定特定的GPU，请使用此参数，如0,1。
--alpha ${alpha}： NTK上下文扩展方法系数。一般设为待处理文本长度 / 模型上下文长度 * 2 - 1。或更方便地设为auto即可。
--e：在LongBench-E数据集上进行预测。参考LongBench官方文档以了解LongBench-E的详细说明。
"""
parser = argparse.ArgumentParser()
parser.add_argument('--model_path', type=str)
parser.add_argument('--predict_on',type=str, default='zh')
parser.add_argument('--output_dir',type=str, default='pred')
parser.add_argument('--gpus',type=str, default=None)
parser.add_argument('--e', action='store_true', help="Evaluate on LongBench-E")

args = parser.parse_args()

model_path = args.model_path
predict_on = args.predict_on
output_dir = args.output_dir
gpus=args.gpus

print(f"Model Path: {model_path}")
print(f"Predict On: {predict_on}")
print(f"Output Directory: {output_dir}")
print(f"GPUs: {gpus}")
print(f"Evaluate Flag: {args.e}")

DO_SAMPLE =True
TEMPERATURE = 0.2
REPETITION_PENALTY = 1.1
TOP_P = 0.95
TOP_K = 40

if gpus is not None:
    os.environ["CUDA_VISIBLE_DEVICES"] = gpus
# apply_attention_patch(use_memory_efficient_attention=True)
# apply_ntk_scaling_patch(args.alpha)

def fill_baichuan2_prompt_template(instruction, with_system_prompt = False, system_prompt = DEFAULT_SYSTEM_PROMPT):
    messages = []
    if with_system_prompt is True:
        messages.append({'role': 'system', 'content': system_prompt})
        messages.append({"role": "user", "content": instruction})
        return  messages
    else:
        return messages.append({"role": "user", "content": instruction})

def get_pred(model, tokenizer, data, max_gen, prompt_format, dataset, device):
    preds = []
    for json_obj in tqdm(data):

        prompt = prompt_format.format(**json_obj)
        # truncate to fit max_length (we suggest truncate in the middle, since the left and right side may contain crucial instructions)
        # tokenized_prompt = tokenizer(prompt, truncation=False, return_tensors="pt").input_ids[0]
        # if len(tokenized_prompt) > max_length:
        #     half = int(max_length/2)
        #     prompt = tokenizer.decode(tokenized_prompt[:half], skip_special_tokens=True)+tokenizer.decode(tokenized_prompt[-half:], skip_special_tokens=True)

        if dataset not in ["trec", "triviaqa", "samsum", "lsht", "lcc", "repobench-p"]: # chat models are better off without build prompts on these tasks
            raw_data = fill_baichuan2_prompt_template(instruction=prompt,with_system_prompt=True)
        else:
            raw_data = fill_baichuan2_prompt_template(instruction=prompt)
        input_data = tokenizer(prompt, truncation=False, return_tensors="pt").to(device)
        context_length = input_data.input_ids.shape[-1]
        if dataset == "samsum": # prevent illegal output on samsum (model endlessly repeat "\nDialogue"), might be a prompting issue
            generation_config = {
                "pad_token_id": 0,
                "bos_token_id": 1,
                # eos_token_id=[2,5]
                "eos_token_id": [tokenizer.eos_token_id, tokenizer.encode("\n", add_special_tokens=False)[-1]],
                "user_token_id": 195,
                "assistant_token_id": 196,
                # 不同数据的最大输出tokens不同
                "max_new_tokens": max_gen,
                "min_length":context_length + 1,
                "temperature": 1.0,
                "top_k": 15,
                "top_p": 0.90,
                "repetition_penalty": 1.00,
                "do_sample": False,
                "transformers_version": "4.29.2"
            }
            model.generation_config = GenerationConfig(**generation_config)
        else:
            generation_config = {
                "pad_token_id": 0,
                "bos_token_id": 1,
                "eos_token_id": 2,
                "user_token_id": 195,
                "assistant_token_id": 196,
                "max_new_tokens": max_gen,
                "temperature": 1.0,
                "top_k": 15,
                "top_p": 0.90,
                "repetition_penalty": 1.00,
                "do_sample": False,
                "transformers_version": "4.29.2"
            }
            model.generation_config = GenerationConfig(**generation_config)
        pred = model.chat(tokenizer, input_data)
        preds.append({"pred": pred, "answers": json_obj["answers"], "all_classes": json_obj["all_classes"], "length": json_obj["length"]})
    return preds

def seed_everything(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.cuda.manual_seed_all(seed)

if __name__ == '__main__':
    seed_everything(42)
    load_type = torch.float16
    if torch.cuda.is_available():
        device = torch.device(0)
    else:
        device = torch.device('cpu')

    # 使用longbench-E 或者 longbench
    if args.e:
        en_datasets = [ "hotpotqa","2wikimqa",
                        "qasper", "multifieldqa_en",  "gov_report",
                        "trec", "samsum", "triviaqa",
                        "passage_count", "passage_retrieval_en", "multi_news"]
        zh_datasets = []
        code_datasets = [ "lcc", "repobench-p" ]
        if not os.path.exists(f"{output_dir}/pred_e"):
            os.makedirs(f"{output_dir}/pred_e")
    else:
        en_datasets = [ "hotpotqa","2wikimqa", "musique", "narrativeqa",
                        "qasper", "multifieldqa_en",  "gov_report",
                        "qmsum", "trec", "samsum", "triviaqa",
                        "passage_count", "passage_retrieval_en", "multi_news"]
        zh_datasets = [ "dureader", "multifieldqa_zh",
                        "vcsum","lsht", "passage_retrieval_zh"]
        code_datasets = [ "lcc", "repobench-p" ]

        if not os.path.exists(f"{output_dir}/pred"):
            os.makedirs(f"{output_dir}/pred")

    # 选择需要使用哪些子数据集
    datasets = []
    for data_type in predict_on.split(','):
        if data_type == 'zh':
            datasets += zh_datasets
        elif data_type == 'en':
            datasets += en_datasets
        elif data_type == 'code':
            datasets += code_datasets
    print(datasets)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # 加载tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code = True)

    # 加载model
    model = model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", torch_dtype = torch.float16, trust_remote_code=True)
    model = model.eval()
    model_vocab_size = model.get_input_embeddings().weight.size(0)
    print(f"Vocab of the base model: {model_vocab_size}")
    tokenizer_vocab_size = len(tokenizer)
    print(f"Vocab of the tokenizer: {tokenizer_vocab_size}")

    # we design specific prompt format and max generation length for each task, feel free to modify them to optimize model output
    print(f"加载配置文件dataset2prompt,dataset2maxlen")
    dataset2prompt = json.load(open(dir_path + "/config/dataset2prompt.json", "r"))
    dataset2maxlen = json.load(open(dir_path + "/config/dataset2maxlen.json", "r"))
    print(f"加载成功")
    # predict on each dataset
    for dataset in datasets:
        print(f"Loading dataset {dataset}")
        if args.e:
            # data = load_dataset('THUDM/LongBench', dataset+'_e', split='test')

            data = load_dataset('json',  data_files=file_path+dataset+'_e'+'.jsonl', split='train')
            output_path = f"{output_dir}/pred_e/{dataset}.jsonl"
        else:
            data = load_dataset('json',  data_files=file_path+dataset+'.jsonl', split='train')
            output_path = f"{output_dir}/pred/{dataset}.jsonl"
        prompt_format = dataset2prompt[dataset]
        max_gen = dataset2maxlen[dataset]
        # preds = get_pred(model, tokenizer, data, max_length, max_gen, prompt_format, dataset, device)
        preds = get_pred(model, tokenizer, data, max_gen, prompt_format, dataset, device)
        with open(output_path, "w", encoding="utf-8") as f:
            for pred in preds:
                json.dump(pred, f, ensure_ascii=False)
                f.write('\n')

1.我看源码中，将长度超过的部分进行了截断，保留开始和结尾部分，那这样对于NTK来说，还会涉及到插值吗，他永远不会超过模型的max_input_length

我将截断部分的逻辑去除后跑baichuan2，用的是A800，但是报OOM

依赖情况（代码类问题务必提供）

bitsandbytes                  0.41.1
open-clip-torch               2.20.0
peft                          0.5.0
pytorch-lightning             1.7.7
pytorch-metric-learning       2.3.0
pytorch-wavelets              1.3.0
pytorch-wpe                   0.0.1
pytorch3d                     0.7.4
rotary-embedding-torch        0.3.0
sentencepiece                 0.1.99
taming-transformers-rom1504   0.0.6
torch                         2.0.1+cu118
torch-complex                 0.4.3
torch-scatter                 2.1.1
torchaudio                    2.0.2+cu118
torchmetrics                  0.11.4
torchsummary                  1.5.1
torchvision                   0.15.2+cu118
transformers                  4.34.1
transformers-stream-generator 0.0.4

运行日志或截图

Traceback (most recent call last):
  File "pred_baichuan2.py", line 241, in <module>
    preds = get_pred(model, tokenizer, data, max_gen, prompt_format, dataset, device)
  File "pred_baichuan2.py", line 154, in get_pred
    pred = model.chat(tokenizer, raw_data)
  File "/root/.cache/huggingface/modules/transformers_modules/Baichuan2_7B_Chat/modeling_baichuan.py", line 853, in chat
    outputs = self.generate(input_ids, generation_config=generation_config)
  File "/opt/conda/lib/python3.8/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/transformers/generation/utils.py", line 1652, in generate
    return self.sample(
  File "/opt/conda/lib/python3.8/site-packages/transformers/generation/utils.py", line 2734, in sample
    outputs = self(
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/Baichuan2_7B_Chat/modeling_baichuan.py", line 756, in forward
    outputs = self.model(
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/Baichuan2_7B_Chat/modeling_baichuan.py", line 533, in forward
    layer_outputs = decoder_layer(
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/Baichuan2_7B_Chat/modeling_baichuan.py", line 345, in forward
    hidden_states, self_attn_weights, present_key_value = self.self_attn(
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/Baichuan2_7B_Chat/modeling_baichuan.py", line 306, in forward
    attn_output = F.scaled_dot_product_attention(query_states, key_states, value_states, attn_mask = attention_mask)
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 27.74 GiB (GPU 1; 79.35 GiB total capacity; 32.57 GiB already allocated; 17.55 GiB free; 60.29 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

ymcui / Chinese-LLaMA-Alpaca-2

NTK长度外推， #453

提交前必须检查以下项目

问题类型

基础模型

操作系统

详细描述问题

依赖情况（代码类问题务必提供）

运行日志或截图