Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.

I've been trying to deploy the new LLaVA-NeXT with Sglang on Modal but not sure why I'm getting "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained." message for a very long time and nothing is happing.

How can I serve the model directly so it doesn't have to load every time? I just want to load the model one time in my start_engine and use the generate function to get the output. I think this will be fast.


from pathlib import Path
from modal import Mount, asgi_app
import modal

GPU_CONFIG = modal.gpu.A100(memory=80, count=1)

vllm_image = (
    modal.Image.from_registry(
        "nvidia/cuda:11.8.0-devel-ubuntu22.04", add_python="3.10")
    .apt_install("git", "wget", "cmake")
    .pip_install(
        "wheel==0.43.0",
        "torch==2.2.1",
        "torchvision==0.17.1",
        "transformers==4.40.0",
        "timm==0.9.12",
        "Pillow==10.3.0",
        "peft==0.8.2",
        "hf-transfer==0.1.6",
        "huggingface_hub==0.22.2",
        "nvidia-nccl-cu11==2.21.5"
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
    .run_commands("pip install flash-attn==2.5.2 --no-build-isolation")
    .run_commands("pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git")
    .run_commands('pip install openai')
    .run_commands("git clone https://github.com/sgl-project/sglang.git && cd sglang && pip install -e 'python[all]'")
)

app = modal.App("tgi")

@app.cls(
    gpu=GPU_CONFIG,
    timeout=120,
    container_idle_timeout=120,
    allow_concurrent_inputs=10,
    image=vllm_image,
)
class Model:
    @modal.enter()
    async def start_engine(self):
        import sglang as sgl
        import requests
        import subprocess

        command = 'python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --tokenizer-path lmms-lab/llama3-llava-next-8b-tokenizer --port=30000 --host="127.0.0.1" --tp-size=4'
        subprocess.run(command, shell=True)

    @modal.method()
    async def generate(self):
        print("Generating")

modal-labs / modal-client

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. #1816