Closed genius0182 closed 9 months ago
我们使用的是 README 中 Fill in the Middle 的例子。未使用 TGI 方式部署。
` @app.post("/generate", response_model=FillInAllResponse) async def fill_in_all(request: FillInRequest): global model, tokenizer
# max_new_tokens = model.generation_config.max_new_tokens max_new_tokens = request.parameters.max_new_tokens temperature = request.parameters.temperature repetition_penalty = request.parameters.repetition_penalty top_p = request.parameters.top_p do_sample = request.parameters.do_sample stop_words = request.parameters.stop query = request.inputs # print(f"query={query}") device = "cuda" if torch.cuda.is_available() else "cpu" inputs = tokenizer(query, return_tensors="pt").to(device) inputs = tokenizer.encode(query) max_input_tokens = model.config.n_positions - max_new_tokens inputs = inputs[-max_input_tokens:] inputs = torch.LongTensor([inputs]).to(device) stopping_criteria = StoppingCriteriaList( [ EndOfFunctionCriteria( [len(inputs[0])], ["|<end>|", "|end|", "<|endoftext|>"], tokenizer ) ] ) outputs = model.generate( inputs, temperature=temperature, repetition_penalty=repetition_penalty, top_p=top_p, do_sample=False, stopping_criteria=stopping_criteria, ) model.generation_config.max_new_tokens = max_new_tokens response = tokenizer.decode(outputs[0][len(inputs[0]) :], skip_special_tokens=True) return FillInAllResponse(generated_text=response)
`
我们使用的是 README 中 Fill in the Middle 的例子。未使用 TGI 方式部署。
` @app.post("/generate", response_model=FillInAllResponse) async def fill_in_all(request: FillInRequest): global model, tokenizer
`