josStorer / RWKV-Runner

A RWKV management and startup tool, full automation, only 8MB. And provides an interface compatible with the OpenAI API. RWKV is a large language model that is fully open source and available for commercial use.
https://www.rwkv.com
MIT License
5.31k stars 502 forks source link

调用 API 接口感觉速度有点慢，是否和流式输出有关系？我看接口记录中流式输出参数是 false，这个 "stream": false 参数在哪里可以设置呢？ #329

Open zhuifengzl opened 7 months ago

zhuifengzl commented 7 months ago

2024-04-19 16:22:02,475 - INFO Client: Address(host='192.168.31.39', port=63902) Url: http://192.168.31.39:8000/chat/completions Body: {"max_tokens": 1000, "temperature": 1.2, "top_p": 0.5, "presence_penalty": 0.4, "frequency_penalty": 0.4, "penalty_decay": null, "top_k": null, "global_penalty": null, "messages": [{"role": "user", "content": "喂", "raw": false}], "model": "rwkv", "stream": false, "stop": ["\n\nUser", "\n\nQuestion", "\n\nQ", "\n\nHuman", "\n\nBob", "\n\nAssistant", "\n\nAnswer", "\n\nA", "\n\nBot", "\n\nAlice", "\n\nUser", "\n\nAss"], "user_name": null, "assistant_name": null, "system_name": null, "presystem": true} Data: Hello! How can I assist you today? Finished. RequestsNum: 0

zhuifengzl commented 7 months ago

我用的模型是 RWKV-4-World-CHNtuned-3B-v1-20230625-ctx4096.pth，也转换过 RWKV-4-World-CHNtuned-3B-v1-20230625-ctx4096-fp16.bin，两个都测试过。即使修改 backend-python\routes\completion.py 中的设置，重启程序后也不生效，是这个模型不支持吗？

class ChatCompletionBody(ModelConfigBody): messages: Union[List[Message], None] model: Union[str, None] = "rwkv" stream: bool = False stop: Union[str, List[str], None] = default_stop user_name: Union[str, None] = Field( None, description="Internal user name", min_length=1 ) assistant_name: Union[str, None] = Field( None, description="Internal assistant name", min_length=1 ) system_name: Union[str, None] = Field( None, description="Internal system name", min_length=1 ) presystem: bool = Field( True, description="Whether to insert default system prompt at the beginning" )

model_config = {
    "json_schema_extra": {
        "example": {
            "messages": [
                {"role": Role.User.value, "content": "hello", "raw": False}
            ],
            "model": "rwkv",
            "stream": False,
            "stop": None,
            "user_name": None,
            "assistant_name": None,
            "system_name": None,
            "presystem": True,
            "max_tokens": 1000,
            "temperature": 1,
            "top_p": 0.3,
            "presence_penalty": 0,
            "frequency_penalty": 1,
        }
    }
}

class CompletionBody(ModelConfigBody): prompt: Union[str, List[str], None] model: Union[str, None] = "rwkv" stream: bool = False stop: Union[str, List[str], None] = None

model_config = {
    "json_schema_extra": {
        "example": {
            "prompt": "The following is an epic science fiction masterpiece that is immortalized, "
            + "with delicate descriptions and grand depictions of interstellar civilization wars.\nChapter 1.\n",
            "model": "rwkv",
            "stream": False,
            "stop": None,
            "max_tokens": 100,
            "temperature": 1,
            "top_p": 0.3,
            "presence_penalty": 0,
            "frequency_penalty": 1,
        }
    }
}
zhuifengzl commented 7 months ago

好像程序每次都会重置修改过的参数，那个流式参数是不能修改吗？

BlinkDL commented 7 months ago

模型用 https://huggingface.co/BlinkDL/rwkv-6-world/blob/main/RWKV-x060-World-3B-v2.1-20240417-ctx4096.pth

另外，显存有多少？如果显存够，解码参数就选 cuda fp16。

zhuifengzl commented 7 months ago

12g的显存

josStorer commented 7 months ago

@zhuifengzl 参数是调用 API 时随请求体传递的（例如在请求 JSON 中写 "stream": true），可以改，不用去改源码。另外，载入模型的时候，把"载入显存层数"拉满。

zhuifengzl commented 7 months ago

好的,我试试,感谢哈

zhuifengzl commented 7 months ago

public struct LocalSendData { public string model; public bool stream; public bool presystem; public int max_tokens; public double temperature; public double top_p; public double presence_penalty; public double frequency_penalty; public List messages; } public struct LocalSendDataMes { public string role; public string content; public bool raw; } public static IEnumerator RequestGPTSegmentation_local(string content, Action<string, bool> callback) { string postData = ""; //Debug.Log("postData");

        LocalSendData local = new LocalSendData();
        local.model = "rwkv";
        local.stream = false;
        local.presystem = true;
        local.max_tokens = 10000;
        local.temperature = 1.2;
        local.top_p = 0.5;
        local.presence_penalty = 0.4;
        local.frequency_penalty = 0.4;
        local.messages = new List<LocalSendDataMes>();
        local.messages.Add(new LocalSendDataMes() { role = "user", content = content, raw = false });
        postData = JsonMapper.ToJson(local);

        Debug.Log(postData);

        //Debug.Log(ConfigExcelMgr.instance.excelData.str_localPath);

        using (var request = new UnityWebRequest(ConfigExcelMgr.instance.excelData.str_localPath + "/chat/completions", "POST"))
        //using (var request = new UnityWebRequest("http://192.168.0.105:8000/chat/completions", "POST"))
        {
            request.SetRequestHeader("Accept", "application/json, text/plain, */*");
            request.SetRequestHeader("Content-Type", "application/json");                

            request.uploadHandler = new UploadHandlerRaw(Encoding.UTF8.GetBytes(postData));

            request.downloadHandler = new DownloadHandlerBuffer();

            UnityWebRequestAsyncOperation asyncOp = request.SendWebRequest();

            int dataIndex = 0;

            string text = "";

            while (!asyncOp.isDone)
            {
                //Disponse(false);
                yield return wait_internal;
            }

            if (request.result == UnityWebRequest.Result.ConnectionError || request.result == UnityWebRequest.Result.ProtocolError)
            {
                Debug.LogError("Error: " + request.error);
            }

            //Debug.Log(request.downloadHandler.text);
            if (request.downloadHandler.text != null)
            {

                string jsondata = request.downloadHandler.text;
                JsonData jd = JsonMapper.ToObject(jsondata);
                //string js_finish_reason = (string)jd["choices"]["finish_reason"];

                string js_text = (string)jd["choices"][0]["message"]["content"];

                callback?.Invoke(js_text, true);
            }

这是我写的代码，我不太懂，请帮忙看看应该怎么修改，才能提高接口的回答速度？