提交前必须检查以下项目

[X] 请确保使用的是仓库最新代码（git pull），一些问题已被解决和修复。
[X] 由于相关依赖频繁更新，请确保按照Wiki中的相关步骤执行
[X] 我已阅读FAQ章节并且已在Issue中对问题进行了搜索，没有找到相似问题和解决方案
[X] 第三方插件问题：例如llama.cpp、text-generation-webui、LlamaChat等，同时建议到对应的项目中查找解决方案
[X] 模型正确性检查：务必检查模型的SHA256.md，模型不对的情况下无法保证效果和正常运行

问题类型

模型推理

基础模型

Alpaca-Plus-7B

操作系统

Linux

详细描述问题

import argparse
import json, os
parser = argparse.ArgumentParser()
parser.add_argument('--base_model', default=None, type=str, required=True)
parser.add_argument('--lora_model', default=None, type=str,help="If None, perform inference on the base model")
parser.add_argument('--tokenizer_path',default=None,type=str)
parser.add_argument('--data_file',default=None, type=str,help="A file that contains instructions (one instruction per line)")
parser.add_argument('--with_prompt',action='store_true',help="wrap the input with the prompt automatically")
parser.add_argument('--interactive',action='store_true',help="run in the instruction mode (single-turn)")
parser.add_argument('--predictions_file', default='./predictions.json', type=str)
parser.add_argument('--gpus', default="0", type=str)
parser.add_argument('--only_cpu',action='store_true',help='only use CPU for inference')
parser.add_argument('--alpha',type=str,default="1.0", help="The scaling factor of NTK method, can be a float or 'auto'. ")
parser.add_argument('--load_in_8bit',action='store_true', help="Load the LLM in the 8bit mode")

args = parser.parse_args()
if args.only_cpu is True:
    args.gpus = ""
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer
from peft import  PeftModel

# from patches import apply_attention_patch, apply_ntk_scaling_patch
# apply_attention_patch(use_memory_efficient_attention=True)
# apply_ntk_scaling_patch(args.alpha)

generation_config = dict(
    temperature=0.2,
    top_k=40,
    top_p=0.9,
    do_sample=True,
    num_beams=1,
    repetition_penalty=1.1,
    max_new_tokens=400
    )

 # The prompt template below is taken from llama.cpp
 # and is slightly different from the one used in training.
 # But we find it gives better results
prompt_input = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n\n{instruction}\n\n### Response:\n\n"
)

sample_data = ["为什么要减少污染，保护环境？"]

def generate_prompt(instruction, input=None):
    if input:
        instruction = instruction + '\n' + input
    return prompt_input.format_map({'instruction': instruction})

if __name__ == '__main__':
    load_type = torch.float16
    if torch.cuda.is_available():
        device = torch.device(0)
    else:
        device = torch.device('cpu')
    if args.tokenizer_path is None:
        args.tokenizer_path = args.lora_model
        if args.lora_model is None:
            args.tokenizer_path = args.base_model
    tokenizer = LlamaTokenizer.from_pretrained(args.tokenizer_path)

    base_model = LlamaForCausalLM.from_pretrained(
        args.base_model,
        load_in_8bit=args.load_in_8bit,
        torch_dtype=load_type,
        low_cpu_mem_usage=True,
        device_map='auto',
        )

    model_vocab_size = base_model.get_input_embeddings().weight.size(0)
    tokenzier_vocab_size = len(tokenizer)
    print(f"Vocab of the base model: {model_vocab_size}")
    print(f"Vocab of the tokenizer: {tokenzier_vocab_size}")
    if model_vocab_size!=tokenzier_vocab_size:
        assert tokenzier_vocab_size > model_vocab_size
        print("Resize model embeddings to fit tokenizer")
        base_model.resize_token_embeddings(tokenzier_vocab_size)
    if args.lora_model is not None:
        print("loading peft model")
        model = PeftModel.from_pretrained(base_model, args.lora_model,torch_dtype=load_type,device_map='auto',)
    else:
        model = base_model

    if device==torch.device('cpu'):
        model.float()
    # test data
    if args.data_file is None:
        examples = sample_data
    else:
        with open(args.data_file,'r') as f:
            examples = [l.strip() for l in f.readlines()]
        print("first 10 examples:")
        for example in examples[:10]:
            print(example)
    model.eval()

    with torch.no_grad():
        if args.interactive:
            print("Start inference with instruction mode.")

            print('='*85)
            print("+ 该模式下仅支持单轮问答，无多轮对话能力。\n"
                  "+ 如要进行多轮对话，请使用llama.cpp或llamachat工具。")
            print('-'*85)
            print("+ This mode only supports single-turn QA.\n"
                  "+ If you want to experience multi-turn dialogue, please use llama.cpp or llamachat.")
            print('='*85)

            while True:
                raw_input_text = input("Input:")
                if len(raw_input_text.strip())==0:
                    break
                if args.with_prompt:
                    input_text = generate_prompt(instruction=raw_input_text)
                else:
                    input_text = raw_input_text
                inputs = tokenizer(input_text,return_tensors="pt")  #add_special_tokens=False ?
                generation_output = model.generate(
                    input_ids = inputs["input_ids"].to(device),
                    attention_mask = inputs['attention_mask'].to(device),
                    eos_token_id=tokenizer.eos_token_id,
                    pad_token_id=tokenizer.pad_token_id,
                    **generation_config
                )
                s = generation_output[0]
                output = tokenizer.decode(s,skip_special_tokens=True)
                if args.with_prompt:
                    response = output.split("### Response:")[1].strip()
                else:
                    response = output
                print("Response: ",response)
                print("\n")
        else:
            print("Start inference.")
            results = []
            for index, example in enumerate(examples):
                if args.with_prompt is True:
                    input_text = generate_prompt(instruction=example)
                else:
                    input_text = example
                inputs = tokenizer(input_text,return_tensors="pt")  #add_special_tokens=False ?
                generation_output = model.generate(
                    input_ids = inputs["input_ids"].to(device),
                    attention_mask = inputs['attention_mask'].to(device),
                    eos_token_id=tokenizer.eos_token_id,
                    pad_token_id=tokenizer.pad_token_id,
                    **generation_config
                )
                s = generation_output[0]
                output = tokenizer.decode(s,skip_special_tokens=True)
                if args.with_prompt:
                    response = output.split("### Response:")[1].strip()
                else:
                    response = output
                print(f"======={index}=======")
                print(f"Input: {example}\n")
                print(f"Output: {response}\n")

                results.append({"Input":input_text,"Output":response})

            dirname = os.path.dirname(args.predictions_file)
            os.makedirs(dirname,exist_ok=True)
            with open(args.predictions_file,'w') as f:
                json.dump(results,f,ensure_ascii=False,indent=2)
            with open(dirname+'/generation_config.json','w') as f:
                json.dump(generation_config,f,ensure_ascii=False,indent=2)
）

脚本文件

base_model=/public/home/xx/xx/model/Llama-2-7b-hf
lora_model=/public/home/xx/xx/sft_lora_model/chinese-alpaca-pro-lora-7b
tokenizer_path=/public/home/xx/xx/sft_lora_model/chinese-alpaca-pro-lora-7b

CUDA_VISIBLE_DEVICES=2 python inference_hf.py \
    --base_model ${base_model} \
    --lora_model ${lora_model} \
    --interactive \
    --with_prompt

依赖情况（代码类问题务必提供）

accelerate==0.29.2
aiohttp==3.9.3
aiosignal==1.3.1
annotated-types==0.6.0
anyio==4.3.0
async-timeout==4.0.3
attrs==23.2.0
auto_gptq==0.7.1
bypy==1.8.5
certifi==2024.2.2
charset-normalizer==3.3.2
click==8.1.7
coloredlogs==15.0.1
dataclasses-json==0.6.4
datasets==2.18.0
dill==0.3.8
einops==0.7.0
exceptiongroup==1.2.0
fastapi==0.110.1
filelock==3.9.0
frozenlist==1.4.1
fsspec==2024.2.0
gekko==1.1.0
greenlet==3.0.3
grpcio==1.62.1
grpcio-tools==1.62.1
h11==0.14.0
h2==4.1.0
hpack==4.0.0
httpcore==1.0.5
httpx==0.27.0
huggingface-hub==0.22.2
humanfriendly==10.0
hyperframe==6.0.1
idna==3.6
Jinja2==3.1.2
joblib==1.4.0
jsonpatch==1.33
jsonpointer==2.4
langchain==0.1.11
langchain-community==0.0.32
langchain-core==0.1.41
langchain-text-splitters==0.0.1
langsmith==0.1.42
MarkupSafe==2.1.3
marshmallow==3.21.1
mpmath==1.3.0
multidict==6.0.5
multiprocess==0.70.16
mypy-extensions==1.0.0
networkx==3.2.1
nltk==3.8.1
numpy==1.24.1
nvidia-cublas-cu11==11.11.3.6
nvidia-cuda-cupti-cu11==11.8.87
nvidia-cuda-nvrtc-cu11==11.8.89
nvidia-cuda-runtime-cu11==11.8.89
nvidia-cudnn-cu11==8.7.0.84
nvidia-cufft-cu11==10.9.0.58
nvidia-curand-cu11==10.3.0.86
nvidia-cusolver-cu11==11.4.1.48
nvidia-cusparse-cu11==11.7.5.86
nvidia-nccl-cu11==2.19.3
nvidia-nvtx-cu11==11.8.86
optimum==1.18.1
orjson==3.10.0
packaging==23.2
pandas==2.2.1
peft==0.3.0
pillow==10.2.0
portalocker==2.8.2
protobuf==4.25.3
psutil==5.9.8
pyarrow==15.0.2
pyarrow-hotfix==0.6
pydantic==2.6.4
pydantic_core==2.16.3
pypdf==4.2.0
python-dateutil==2.9.0.post0
python-multipart==0.0.9
pytz==2024.1
PyYAML==6.0.1
qdrant-client==1.8.2
regex==2023.12.25
requests==2.31.0
requests-toolbelt==1.0.0
rouge==1.0.1
safetensors==0.4.2
scikit-learn==1.4.2
scipy==1.13.0
sentence-transformers==2.2.1
sentencepiece==0.2.0
six==1.16.0
sniffio==1.3.1
SQLAlchemy==2.0.29
starlette==0.37.2
sympy==1.12
tenacity==8.2.3
threadpoolctl==3.4.0
tiktoken==0.6.0
tokenizers==0.13.3
torch==2.2.1+cu118
torchaudio==2.2.1+cu118
torchvision==0.17.1+cu118
tqdm==4.66.2
transformers==4.33.1
transformers-stream-generator==0.0.5
triton==2.2.0
typing-inspect==0.9.0
typing_extensions==4.8.0
tzdata==2024.1
urllib3==2.2.1
uvicorn==0.29.0
xxhash==3.4.1
yarl==1.9.4

ymcui / Chinese-LLaMA-Alpaca

结合官方给定的Lora，推理不准确 #894

提交前必须检查以下项目

问题类型

基础模型

操作系统

详细描述问题

脚本文件

依赖情况（代码类问题务必提供）

运行日志或截图