LeeQuan1 opened 1 month ago
```python
import sys

import gradio as gr
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

# Prompter is the project's prompt-building helper (path follows the
# alpaca-lora layout this script is based on).
from utils.prompter import Prompter

# base_model, device, load_8bit, lora_weights, prompt_template, server_name,
# and share_gradio are set from the CLI arguments earlier in the full script.
prompter = Prompter(prompt_template)
tokenizer = AutoTokenizer.from_pretrained(base_model)
if device == "cuda":
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        load_in_8bit=load_8bit,
        torch_dtype=torch.float16,
        device_map={"": 0},
        # offload_folder="offload_folder"
    )
    model = PeftModel.from_pretrained(
        model,
        lora_weights,
        torch_dtype=torch.float16,
    )
elif device == "mps":
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        device_map={"": device},
        torch_dtype=torch.float16,
    )
    model = PeftModel.from_pretrained(
        model,
        lora_weights,
        device_map={"": device},
        torch_dtype=torch.float16,
    )
else:
    model = AutoModelForCausalLM.from_pretrained(
        base_model, device_map={"": device}, low_cpu_mem_usage=True
    )
    model = PeftModel.from_pretrained(
        model,
        lora_weights,
        device_map={"": device},
    )

# unwind broken decapoda-research config
model.config.pad_token_id = tokenizer.pad_token_id = 0  # unk
model.config.bos_token_id = 1
model.config.eos_token_id = 2

if not load_8bit:
    model.half()  # seems to fix bugs for some users.

model.eval()
if torch.__version__ >= "2" and sys.platform != "win32":
    model = torch.compile(model)


def evaluate(
    instruction,
    input=None,
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    num_beams=4,
    max_new_tokens=128,
    **kwargs,
):
    # Build the templated prompt, generate, then strip the template wrapper
    # from the decoded output.
    prompt = prompter.generate_prompt(instruction, input)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        **kwargs,
    )
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )
    s = generation_output.sequences[0]
    output = tokenizer.decode(s)
    return prompter.get_response(output)


gr.Interface(
    fn=evaluate,
    inputs=[
        gr.components.Textbox(
            lines=2,
            label="Instruction",
            placeholder="Tell me about alpacas.",
        ),
        gr.components.Textbox(lines=2, label="Input", placeholder="none"),
        gr.components.Slider(
            minimum=0, maximum=1, value=0.1, label="Temperature"
        ),
        gr.components.Slider(
            minimum=0, maximum=1, value=0.75, label="Top p"
        ),
        gr.components.Slider(
            minimum=0, maximum=100, step=1, value=40, label="Top k"
        ),
        gr.components.Slider(
            minimum=1, maximum=4, step=1, value=4, label="Beams"
        ),
        gr.components.Slider(
            minimum=1, maximum=2000, step=1, value=128, label="Max tokens"
        ),
    ],
    outputs=[
        # gr.inputs.Textbox(
        gr.components.Textbox(
            lines=5,
            label="Output",
        )
    ],
    title="BenTsao",
    description="",  # noqa: E501
).launch(server_name=server_name, share=share_gradio)
```

This is part of the code. Hoping someone can shed light on what's going wrong.
This is probably a pydantic version problem; see https://github.com/gradio-app/gradio/issues/4974. Downgrading pydantic to 1.10.7 should resolve it.
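Not from the original reply, just a quick way to confirm which pydantic your environment actually has before and after the downgrade (the 1.10.7 pin is the one suggested above):

```python
# Confirm which pydantic version gradio will import.
# If this prints 2.x, downgrade with:  pip install "pydantic==1.10.7"
import pydantic

print(pydantic.VERSION)
```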
```python
prompter = Prompter(prompt_template)
tokenizer = AutoTokenizer.from_pretrained(base_model)
if device == "cuda":
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        load_in_8bit=load_8bit,
        torch_dtype=torch.float16,
        device_map="auto",
```