Open zuzuou opened 11 months ago
Has this been resolved?
Look at the path your GenerationConfig.from_pretrained call loads the model from; you can set temperature in the generation_config.json file under that directory.
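For example, a minimal sketch of that approach done programmatically (the path below is a placeholder for your local model directory):

```python
from transformers.generation.utils import GenerationConfig

# "/path/to/baichuan-model" is a placeholder; point it at the directory
# that contains the model's generation_config.json
gen_config = GenerationConfig.from_pretrained("/path/to/baichuan-model")
gen_config.temperature = 0.3
gen_config.save_pretrained("/path/to/baichuan-model")  # rewrites generation_config.json
```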
Has this been resolved?
I launched the Baichuan model successfully with api-for-open-llm, and the wrapped OpenAI-style API endpoints can also be called correctly. I found the handling of the temperature parameter in that project, but I can't make sense of it. The key temperature-related code is below; if you can follow it, I'd greatly appreciate an explanation!
```python
import torch
from transformers import (
    LogitsProcessorList,
    RepetitionPenaltyLogitsProcessor,
    TemperatureLogitsWarper,
    TopKLogitsWarper,
    TopPLogitsWarper,
)


def prepare_logits_processor(
    temperature: float, repetition_penalty: float, top_p: float, top_k: int
) -> LogitsProcessorList:
    processor_list = LogitsProcessorList()
    # TemperatureLogitsWarper doesn't accept 0.0, 1.0 makes it a no-op, so we skip two cases.
    if temperature >= 1e-5 and temperature != 1.0:
        processor_list.append(TemperatureLogitsWarper(temperature))
    if repetition_penalty > 1.0:
        processor_list.append(RepetitionPenaltyLogitsProcessor(repetition_penalty))
    if 1e-8 <= top_p < 1.0:
        processor_list.append(TopPLogitsWarper(top_p))
    if top_k > 0:
        processor_list.append(TopKLogitsWarper(top_k))
    return processor_list
```
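For what it's worth, the temperature branch above boils down to TemperatureLogitsWarper dividing the logits by the temperature, so values below 1.0 sharpen the next-token distribution and values above 1.0 flatten it. A quick standalone check with toy numbers (not code from the project):

```python
import torch
from transformers import TemperatureLogitsWarper

scores = torch.tensor([[2.0, 1.0, 0.5]])           # fake last-token logits
warper = TemperatureLogitsWarper(temperature=0.5)
# this warper ignores input_ids, so an empty tensor is enough for a demo
warped = warper(torch.empty((1, 0), dtype=torch.long), scores)
print(warped)  # tensor([[4., 2., 1.]]) -- the logits divided by the temperature
```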
```python
@torch.inference_mode()
def generate_stream(
    model,
    tokenizer,
    params,
    device: str,
    context_len: int,
    stream_interval: int = 2,
):
    prompt = params["prompt"]
    temperature = float(params.get("temperature", 1.0))
    repetition_penalty = float(params.get("repetition_penalty", 1.0))
    top_p = float(params.get("top_p", 1.0))
    top_k = int(params.get("top_k", -1))

    logits_processor = prepare_logits_processor(
        temperature, repetition_penalty, top_p, top_k
    )

    # ... prompt encoding and the per-step model forward pass (which produce
    # `logits`, `output_ids`, `i`, `max_new_tokens`, `stopped`, etc.) are
    # omitted in this excerpt; the code below runs inside the generation loop ...

        if logits_processor:
            if repetition_penalty > 1.0:
                tmp_output_ids = torch.as_tensor([output_ids], device=logits.device)
            else:
                tmp_output_ids = None
            last_token_logits = logits_processor(tmp_output_ids, logits[:, -1, :])[0]
        else:
            last_token_logits = logits[0, -1, :]

        if device == "mps":
            # Switch to CPU by avoiding some bugs in mps backend.
            last_token_logits = last_token_logits.float().to("cpu")

        if temperature < 1e-5 or top_p < 1e-8:  # greedy
            _, indices = torch.topk(last_token_logits, 2)
            tokens = [int(index) for index in indices.tolist()]
        else:
            probs = torch.softmax(last_token_logits, dim=-1)
            indices = torch.multinomial(probs, num_samples=2)
            tokens = [int(token) for token in indices.tolist()]
        token = tokens[0]
        output_ids.append(token)

        # Yield the output tokens
        if i % stream_interval == 0 or i == max_new_tokens - 1 or stopped:
            if echo:
                tmp_output_ids = output_ids
                rfind_start = len(prompt) if isinstance(prompt, str) else 0
            else:
                tmp_output_ids = output_ids[input_echo_len:]
                rfind_start = 0

            output = tokenizer.decode(
                tmp_output_ids,
                skip_special_tokens=False if check_is_qwen(model) else True,  # fix for qwen react
                spaces_between_special_tokens=False,
                clean_up_tokenization_spaces=True,
            )
            # ... rest of the streaming loop omitted ...
```
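To spell out the sampling step at the end of that excerpt: when temperature is effectively zero (< 1e-5) or top_p is effectively zero, the code skips sampling and just takes the highest-scoring token (greedy decoding); otherwise the temperature-scaled logits go through softmax and torch.multinomial. A toy sketch of the same two branches (values here are made up for illustration):

```python
import torch

last_token_logits = torch.tensor([3.0, 1.0, 0.5])  # fake logits for a 3-token vocab
temperature = 0.7

if temperature < 1e-5:
    # effectively greedy decoding: pick the argmax token
    token = int(torch.topk(last_token_logits, 1).indices[0])
else:
    # in the real code the division happens inside TemperatureLogitsWarper;
    # it is shown inline here to make the effect visible
    probs = torch.softmax(last_token_logits / temperature, dim=-1)
    token = int(torch.multinomial(probs, num_samples=1)[0])
print(token)
```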
This is how I did it; you can give it a try:

```python
model.generation_config = GenerationConfig.from_pretrained(CHAT_MODEL_PATH)
model.generation_config.temperature = temperature
```
The model.chat method is essentially the model's own `chat(self, tokenizer, messages: List[dict], stream=False, generation_config: Optional[GenerationConfig] = None)` method, so you can pass in a custom generation_config, which can carry the usual LLM generation parameters.
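For example, a minimal sketch of passing a custom generation_config into chat, assuming model and tokenizer are already loaded and CHAT_MODEL_PATH points at your local Baichuan model directory:

```python
from transformers.generation.utils import GenerationConfig

gen_config = GenerationConfig.from_pretrained(CHAT_MODEL_PATH)  # CHAT_MODEL_PATH: your local model dir
gen_config.temperature = 0.3
gen_config.do_sample = True  # temperature only has an effect when sampling is enabled

messages = [{"role": "user", "content": "你好"}]
response = model.chat(tokenizer, messages, generation_config=gen_config)
```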
I loaded the Baichuan model following the official example. When I try to pass a temperature parameter to the chat method, I get the following error:

The key code is as follows:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.utils import GenerationConfig


class TransformersModel:
    def __init__(self, **kwargs):
        self.model_name = kwargs.get("model_name")
        # Initialize the model path from the model short name that was passed in
        ...
```
```python
init_model = model_loaders.TransformersModel(model_name=args.model_name)
tokenizer = init_model.tokenizer
model = init_model.model
response = model.chat(tokenizer, messages, temperature=0.9)
```