Open Vvegetables opened 11 months ago
可以提供更多信息用于复现问题吗? 例如code、模型以及使用的什么精度?
我尝试了一下,qwen模型也有类似问题,但发生在使用导出模型的情况下,也就是说 qwen模型导出精度int4为本地flm文件,然后再加载进行推理大概率出现不断重复情况 如果再同一个脚本中先进行transformers的加载,再通过接口去直接转换再进行推理没有遇见这种情况 代码的话其实就是用qwen2flm和cli_demo两个
如果再同一个脚本中先进行transformers的加载,再通过接口去直接转换再进行推理没有遇见这种情况 代码的话其实就是用qwen2flm和cli_demo两个
谢谢,可以给出一定的代码示例吗?没太理解您的意思
后面我用了之后,转出来的int4模型好像偶尔都会出现重复的情况
# coding=utf-8 # Implements API for Qwen-7B in OpenAI's format. (https://platform.openai.com/docs/api-reference/chat) # Usage: python openai_api.py # Visit http://localhost:8000/docs for documents.
from argparse import ArgumentParser import time import torch import uvicorn from pydantic import BaseModel, Field from fastapi import FastAPI, HTTPException from fastapi.middleware.cors import CORSMiddleware from contextlib import asynccontextmanager from typing import Any, Dict, List, Literal, Optional, Union from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM from transformers.generation import GenerationConfig from sse_starlette.sse import ServerSentEvent, EventSourceResponse
@asynccontextmanager async def lifespan(app: FastAPI): # collects GPU memory yield if torch.cuda.is_available(): torch.cuda.empty_cache() torch.cuda.ipc_collect()
app = FastAPI(lifespan=lifespan)
app.add_middleware( CORSMiddleware, allow_origins=[""], allow_credentials=True, allow_methods=[""], allow_headers=["*"], )
class ModelCard(BaseModel): id: str object: str = "model" created: int = Field(default_factory=lambda: int(time.time())) owned_by: str = "owner" root: Optional[str] = None parent: Optional[str] = None permission: Optional[list] = None
class ModelList(BaseModel): object: str = "list" data: List[ModelCard] = []
class ChatMessage(BaseModel): role: Literal["user", "assistant", "system"] content: str
class DeltaMessage(BaseModel): role: Optional[Literal["user", "assistant", "system"]] = None content: Optional[str] = None
class ChatCompletionRequest(BaseModel): model: str messages: List[ChatMessage] temperature: Optional[float] = None top_p: Optional[float] = None max_length: Optional[int] = None stream: Optional[bool] = False stop: Optional[List[str]] = []
class ChatCompletionResponseChoice(BaseModel): index: int message: ChatMessage finish_reason: Literal["stop", "length"]
class ChatCompletionResponseStreamChoice(BaseModel): index: int delta: DeltaMessage finish_reason: Optional[Literal["stop", "length"]]
class ChatCompletionResponse(BaseModel): model: str object: Literal["chat.completion", "chat.completion.chunk"] choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]] created: Optional[int] = Field(default_factory=lambda: int(time.time()))
@app.get("/v1/models", response_model=ModelList) async def list_models(): global model_args model_card = ModelCard(id="gpt-3.5-turbo") return ModelList(data=[model_card])
@app.post("/v1/chat/completions", response_model=ChatCompletionResponse) async def create_chat_completion(request: ChatCompletionRequest): global model, tokenizer print('-'5 + "Related documents start" + '-'5) for each in request.messages: if each.role == "user": print(each.content) print() print('-'5 + "Related documents end" + '-'5) if request.messages[-1].role != "user": raise HTTPException(status_code=400, detail="The last message must be from user.") query = request.messages[-1].content stop_words = request.stop stop_words.extend(list(map(lambda x: x[1:], filter(lambda x: x.startswith("\n"), stop_words)))) prev_messages = request.messages[:-1]
# in your query.
# if len(prev_messages) > 0 and prev_messages[0].role == "system":
# query = prev_messages.pop(0).content + query
history = []
if len(prev_messages) % 2 == 0:
for i in range(0, len(prev_messages), 2):
if prev_messages[i].role == "user" and prev_messages[i + 1].role == "assistant":
history.append([prev_messages[i].content, prev_messages[i + 1].content])
else:
raise HTTPException(status_code=400, detail="The message order is invalid.")
else:
raise HTTPException(status_code=400, detail="The message number is invalid.")
if request.stream:
generate = predict(query, history, request.model, stop_words)
return EventSourceResponse(generate, media_type="text/event-stream")
if stop_words:
react_stop_words_tokens = [tokenizer.encode(stop_) for stop_ in stop_words]
response, _ = model.chat(tokenizer, query, history=history, stop_words_ids=react_stop_words_tokens)
for stop_ in stop_words:
if response.endswith(stop_):
response = response[:response.find(stop_)]
else:
response, _ = model.chat(tokenizer, query, history=history)
choice_data = ChatCompletionResponseChoice(
index=0,
message=ChatMessage(role="assistant", content=response),
finish_reason="stop"
)
return ChatCompletionResponse(model=request.model, choices=[choice_data], object="chat.completion")
async def predict(query: str, history: List[List[str]], model_id: str, stop_words: List[str]): global model, tokenizer assert stop_words == [], "in stream format, stop word is output" choice_data = ChatCompletionResponseStreamChoice( index=0, delta=DeltaMessage(role="assistant"), finish_reason=None ) chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk") yield "{}".format(chunk.model_dump_json(exclude_unset=True))
current_length = 0
if stop_words:
react_stop_words_tokens = [tokenizer.encode(stop_) for stop_ in stop_words]
response_generator = model.stream_chat(tokenizer, query, history=history,
stop_words_ids=react_stop_words_tokens)
else:
response_generator = model.stream_chat(tokenizer, query, history=history)
for new_response in response_generator:
new_response = new_response[0]
if len(new_response) == current_length:
continue
new_text = new_response[current_length:]
print(new_text, end='', flush=True)
current_length = len(new_response)
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=DeltaMessage(content=new_text),
finish_reason=None
)
chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=DeltaMessage(),
finish_reason="stop"
)
chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
print('', flush=True)
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
yield '[DONE]'
def _get_args(): parser = ArgumentParser() parser.add_argument("-c", "--checkpoint-path", type=str, default='Qwen/Qwen-7B-Chat', help="Checkpoint name or path, default to %(default)r") parser.add_argument("--cpu-only", action="store_true", help="Run demo with CPU only") parser.add_argument("--server-port", type=int, default=8090, help="Demo server port.") parser.add_argument("--server-name", type=str, default="0.0.0.0", help="Demo server name.")
args = parser.parse_args()
return args
if name == "main": args = _get_args() tokenizer = AutoTokenizer.from_pretrained( args.checkpoint_path, trust_remote_code=True, resume_download=True, )
if args.cpu_only:
model_origin = AutoModelForCausalLM.from_pretrained(
args.checkpoint_path,
device_map="cpu",
trust_remote_code=True,
resume_download=True,
).eval().float()
else:
model_origin = AutoModelForCausalLM.from_pretrained(
args.checkpoint_path,
device_map="auto",
trust_remote_code=True,
resume_download=True,
fp16=True
).eval()
model_origin.generation_config = GenerationConfig.from_pretrained(
args.checkpoint_path, trust_remote_code=True, resume_download=True, max_new_tokens=1024, min_new_tokens=128
)
from fastllm_pytools import llm
model = llm.from_hf(
model_origin, tokenizer, dtype="int4"
)
del model_origin
torch.cuda.empty_cache()
uvicorn.run(app, host=args.server_name, port=args.server_port, workers=1)
就算没有Lora,直接Qwen 7B导出成int4,问“你是谁”,也是这种效果。
./webui -p tools/qwen-7b-int4.flm --port 1234