LLaVA-VL / LLaVA-NeXT


poor quality output for qwen 72b #37


pseudotensor commented 5 months ago

server:

export CUDA_VISIBLE_DEVICES="3,4,5,6"
python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --tokenizer-path lmms-lab/llavanext-qwen-tokenizer --port=30010 --host="0.0.0.0" --tp-size=4 --random-seed=1234 --context-length=32768 &> 72b.log &

client:

"""
Usage:
# Install the latest llava-next: pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git
# Install the latest sglang.

# Endpoint Service CLI:
# python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --tokenizer-path lmms-lab/llavanext-qwen-tokenizer --port=30000 --host="127.0.0.1" --tp-size=4

python3 http_qwen_llava_test.py

Output:
"Two children pose with a large teddy bear, one holding a smaller stuffed bear, in a room with an American flag and potted plants."
"""

import argparse
import asyncio
import copy
import json

import aiohttp
import requests

# Only the Qwen conversation template is needed for this test.
from llava.conversation import conv_qwen

async def send_request(url, data, delay=0):
    # POST a /generate payload after an optional delay and return the JSON body.
    await asyncio.sleep(delay)
    async with aiohttp.ClientSession() as session:
        async with session.post(url, json=data) as resp:
            output = await resp.json()
    return output

async def test_concurrent(args):
    url = f"{args.host}:{args.port}"

    # Build the Qwen chat prompt around the <image> placeholder.
    prompt = "<image>\nPlease generate caption towards this image."
    conv_template = copy.deepcopy(conv_qwen)
    conv_template.append_message(role="user", message=prompt)
    prompt_with_template = conv_template.get_prompt()
    response = []
    for i in range(1):
        response.append(
            send_request(
                url + "/generate",
                {
                    "text": prompt_with_template,
                    "image_data": "https://farm4.staticflickr.com/3175/2653711032_804ff86d81_z.jpg",
                    "sampling_params": {
                        "max_new_tokens": 1024,
                        "temperature": 0,
                        "top_p": 1.0,
                        "presence_penalty": 2,
                        "frequency_penalty": 2,
                        "stop": "<|im_end|>",
                    },
                },
            )
        )

    rets = await asyncio.gather(*response)
    for ret in rets:
        print(ret["text"])

def test_streaming(args):
    url = f"{args.host}:{args.port}"
    prompt = "<image>\nGive detailed information."
    conv_template = copy.deepcopy(conv_qwen)
    conv_template.append_message(role="user", message=prompt)
    prompt_with_template = conv_template.get_prompt()
    pload = {
        "text": prompt_with_template,
        "sampling_params": {
            "max_new_tokens": 1024,
            "temperature": 0,
            "top_p": 1.0,
            "presence_penalty": 2,
            "frequency_penalty": 2,
            "stop": "<|im_end|>",
        },
        #"image_data": "https://farm4.staticflickr.com/3175/2653711032_804ff86d81_z.jpg",
        "image_data": "https://h2o-release.s3.amazonaws.com/h2ogpt/bigben.png",
        "stream": True,
    }
    response = requests.post(
        url + "/generate",
        json=pload,
        stream=True,
    )

    prev = 0
    for chunk in response.iter_lines(decode_unicode=False):
        chunk = chunk.decode("utf-8")
        # The server streams SSE lines of the form "data: {...}".
        if chunk and chunk.startswith("data:"):
            if chunk == "data: [DONE]":
                break
            data = json.loads(chunk[5:].strip("\n"))
            output = data["text"].strip()
            # Each event carries the full text so far; print only the new suffix.
            print(output[prev:], end="", flush=True)
            prev = len(output)
    print("")

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="http://0.0.0.0")
    parser.add_argument("--port", type=int, default=80)
    args = parser.parse_args()
    # asyncio.run(test_concurrent(args))
    test_streaming(args)

just gives:

Big Ben

No matter how I prompt, the output is extremely terse, even when it is accurate.

I changed the image, but otherwise this is the default script from sglang: https://github.com/sgl-project/sglang/blob/main/examples/usage/llava/http_qwen_llava_test.py
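
For reference, the script defaults to --port 80, while the server above was launched on port 30010, so the client here is presumably invoked with explicit arguments:

python3 http_qwen_llava_test.py --host http://0.0.0.0 --port 30010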

If I try increasing temperature to 0.5, I get no response at all and it just fails:

INFO:     172.16.0.42:27134 - "POST /generate HTTP/1.1" 200 OK
ERROR:    Exception in ASGI application
Traceback (most recent call last):
  File "/home/ubuntu/miniconda3/envs/sglang/lib/python3.10/site-packages/starlette/responses.py", line 265, in __call__
    await wrap(partial(self.listen_for_disconnect, receive))
  File "/home/ubuntu/miniconda3/envs/sglang/lib/python3.10/site-packages/starlette/responses.py", line 261, in wrap
    await func()
  File "/home/ubuntu/miniconda3/envs/sglang/lib/python3.10/site-packages/starlette/responses.py", line 238, in listen_for_disconnect
    message = await receive()
  File "/home/ubuntu/miniconda3/envs/sglang/lib/python3.10/site-packages/uvicorn/protocols/http/httptools_impl.py", line 568, in receive
    await self.message_event.wait()
  File "/home/ubuntu/miniconda3/envs/sglang/lib/python3.10/asyncio/locks.py", line 214, in wait
    await fut
asyncio.exceptions.CancelledError: Cancelled by cancel scope 7dcc7e79ada0

During handling of the above exception, another exception occurred:

  + Exception Group Traceback (most recent call last):
  |   File "/home/ubuntu/miniconda3/envs/sglang/lib/python3.10/site-packages/uvicorn/protocols/http/httptools_impl.py", line 411, in run_asgi
  |     result = await app(  # type: ignore[func-returns-value]
  |   File "/home/ubuntu/miniconda3/envs/sglang/lib/python3.10/site-packages/uvicorn/middleware/proxy_headers.py", line 69, in __call__
  |     return await self.app(scope, receive, send)
  |   File "/home/ubuntu/miniconda3/envs/sglang/lib/python3.10/site-packages/fastapi/applications.py", line 1054, in __call__
  |     await super().__call__(scope, receive, send)
  |   File "/home/ubuntu/miniconda3/envs/sglang/lib/python3.10/site-packages/starlette/applications.py", line 123, in __call__
  |     await self.middleware_stack(scope, receive, send)
  |   File "/home/ubuntu/miniconda3/envs/sglang/lib/python3.10/site-packages/starlette/middleware/errors.py", line 186, in __call__
  |     raise exc
  |   File "/home/ubuntu/miniconda3/envs/sglang/lib/python3.10/site-packages/starlette/middleware/errors.py", line 164, in __call__
  |     await self.app(scope, receive, _send)
  |   File "/home/ubuntu/miniconda3/envs/sglang/lib/python3.10/site-packages/starlette/middleware/exceptions.py", line 65, in __call__
  |     await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send)
  |   File "/home/ubuntu/miniconda3/envs/sglang/lib/python3.10/site-packages/starlette/_exception_handler.py", line 64, in wrapped_app
  |     raise exc
  |   File "/home/ubuntu/miniconda3/envs/sglang/lib/python3.10/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
  |     await app(scope, receive, sender)
  |   File "/home/ubuntu/miniconda3/envs/sglang/lib/python3.10/site-packages/starlette/routing.py", line 756, in __call__
  |     await self.middleware_stack(scope, receive, send)
  |   File "/home/ubuntu/miniconda3/envs/sglang/lib/python3.10/site-packages/starlette/routing.py", line 776, in app
  |     await route.handle(scope, receive, send)
  |   File "/home/ubuntu/miniconda3/envs/sglang/lib/python3.10/site-packages/starlette/routing.py", line 297, in handle
  |     await self.app(scope, receive, send)
  |   File "/home/ubuntu/miniconda3/envs/sglang/lib/python3.10/site-packages/starlette/routing.py", line 77, in app
  |     await wrap_app_handling_exceptions(app, request)(scope, receive, send)
  |   File "/home/ubuntu/miniconda3/envs/sglang/lib/python3.10/site-packages/starlette/_exception_handler.py", line 64, in wrapped_app
  |     raise exc
  |   File "/home/ubuntu/miniconda3/envs/sglang/lib/python3.10/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
  |     await app(scope, receive, sender)
  |   File "/home/ubuntu/miniconda3/envs/sglang/lib/python3.10/site-packages/starlette/routing.py", line 75, in app
  |     await response(scope, receive, send)
  |   File "/home/ubuntu/miniconda3/envs/sglang/lib/python3.10/site-packages/starlette/responses.py", line 258, in __call__
  |     async with anyio.create_task_group() as task_group:
  |   File "/home/ubuntu/miniconda3/envs/sglang/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 678, in __aexit__
  |     raise BaseExceptionGroup(
  | exceptiongroup.ExceptionGroup: unhandled errors in a TaskGroup (1 sub-exception)
  +-+---------------- 1 ----------------
    | Traceback (most recent call last):
    |   File "/home/ubuntu/miniconda3/envs/sglang/lib/python3.10/site-packages/starlette/responses.py", line 261, in wrap
    |     await func()
    |   File "/home/ubuntu/miniconda3/envs/sglang/lib/python3.10/site-packages/starlette/responses.py", line 250, in stream_response
    |     async for chunk in self.body_iterator:
    |   File "/home/ubuntu/sglang/python/sglang/srt/server.py", line 89, in stream_results
    |     async for out in tokenizer_manager.generate_request(obj, request):
    |   File "/home/ubuntu/sglang/python/sglang/srt/managers/tokenizer_manager.py", line 143, in generate_request
    |     pixel_values, image_hash, image_size = await self.get_pixel_values(
    | TypeError: cannot unpack non-iterable NoneType object
    +------------------------------------

I don't understand why merely setting temperature=0.5 leads to that TypeError; very odd. Is that a bug in sglang?
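
(The TypeError itself is ordinary Python behavior: the traceback shows get_pixel_values returning None, and unpacking None into three names raises exactly this error. A standalone sketch of the failure mode, using a hypothetical stand-in for sglang's image preprocessing, not its actual code:)

import asyncio

async def get_pixel_values(image_data):
    # Hypothetical stand-in for sglang's image preprocessing; returns None
    # when the image cannot be fetched or decoded.
    return None

async def main():
    result = await get_pixel_values("https://example.com/img.png")
    try:
        pixel_values, image_hash, image_size = result
    except TypeError as e:
        # Prints: cannot unpack non-iterable NoneType object
        print(e)

asyncio.run(main())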

Luodian commented 5 months ago

@pseudotensor

I pushed a fix for the template issue in sglang. You can check the new examples folder with the updated code:

https://github.com/sgl-project/sglang/blob/main/examples/usage/llava/

pseudotensor commented 5 months ago

Thanks. Trying the latest sglang, the latest llava-next, and what I believe your change to the script was, i.e. appending an assistant turn with message=None as the last message.
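
A minimal sketch of that change, assuming the same conv_qwen template used in the script above:

import copy
from llava.conversation import conv_qwen

prompt = "<image>\nPlease generate caption towards this image."
conv_template = copy.deepcopy(conv_qwen)
conv_template.append_message(role="user", message=prompt)
# The change in question: close with an empty assistant turn so the rendered
# prompt ends at the assistant header and the model produces a full reply.
conv_template.append_message(role="assistant", message=None)
prompt_with_template = conv_template.get_prompt()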

pseudotensor commented 5 months ago

Yes, that may have fixed the problem. Any idea about the TypeError or the size error?

https://github.com/sgl-project/sglang/issues/474
https://github.com/sgl-project/sglang/issues/473