sgl-project / sglang

SGLang is a structured generation language designed for large language models (LLMs). It makes your interaction with models faster and more controllable.

llava-next-video inference result is empty #498

Open · AmazDeng opened this issue 1 month ago

AmazDeng commented 1 month ago

I tested the script srt_example_llava_v.py and found that it is unable to produce inference results (the results are empty). The code is as follows:

srt_example_llava_v.py

"""
Usage: python3 srt_example_llava_v.py
"""

import sglang as sgl
import os
import csv
import time
import argparse

@sgl.function
def video_qa(s, num_frames, video_path, question):
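    # sgl.video(video_path, num_frames) samples num_frames frames from the video
    # and sends them to the model together with the question; sgl.gen("answer")
    # captures the generation under the name "answer".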
    s += sgl.user(sgl.video(video_path, num_frames) + question)
    s += sgl.assistant(sgl.gen("answer"))

def single(path, num_frames=16):
    print(f"single path={path}")
    state = video_qa.run(
        num_frames=num_frames,
        video_path=path,
        question="Please provide a detailed description of the video, focusing on the main subjects, their actions, the background scenes",
        # temperature=0.0,
        max_new_tokens=1024,
    )
    print(f"single state={state}\n")

def split_into_chunks(lst, num_chunks):
    """Split a list into a specified number of chunks."""
    # Calculate the chunk size using integer division. Note that this may drop some items if not evenly divisible.
    chunk_size = len(lst) // num_chunks

    if chunk_size == 0:
        chunk_size = len(lst)
    # Use a list comprehension to generate chunks; any remainder past an even
    # split forms additional chunks.
    chunks = [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
    # Pad with empty chunks if fewer than num_chunks were produced
    chunks.extend([[] for _ in range(num_chunks - len(chunks))])
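    # Caveat: when len(lst) is not evenly divisible by num_chunks, the slicing
    # above can yield more than num_chunks chunks, e.g.
    # split_into_chunks([1, 2, 3, 4, 5], 2) -> [[1, 2], [3, 4], [5]],
    # so indexing the result with cur_chunk assumes an even split.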
    return chunks

def save_batch_results(batch_video_files, states, cur_chunk, batch_idx, save_dir):
    csv_filename = f"{save_dir}/chunk_{cur_chunk}_batch_{batch_idx}.csv"
    with open(csv_filename, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['video_name', 'answer'])
        for video_path, state in zip(batch_video_files, states):
            video_name = os.path.basename(video_path)
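            # Note: writing the whole `state` stores the ProgramState repr in the
            # CSV; state["answer"] (the commented line below) would store only the
            # generated answer text.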
            # writer.writerow([video_name, state["answer"]])
            writer.writerow([video_name, state])

def compile_and_cleanup_final_results(cur_chunk, num_batches, save_dir):
    final_csv_filename = f"{save_dir}/final_results_chunk_{cur_chunk}.csv"
    with open(final_csv_filename, 'w', newline='') as final_csvfile:
        writer = csv.writer(final_csvfile)
        writer.writerow(['video_name', 'answer'])
        for batch_idx in range(num_batches):
            batch_csv_filename = f"{save_dir}/chunk_{cur_chunk}_batch_{batch_idx}.csv"
            with open(batch_csv_filename, 'r') as batch_csvfile:
                reader = csv.reader(batch_csvfile)
                next(reader)  # Skip header row
                for row in reader:
                    writer.writerow(row)
            os.remove(batch_csv_filename)

def find_video_files(video_dir):
    # Check if the video_dir is actually a file
    if os.path.isfile(video_dir):
        # If it's a file, return it as a single-element list
        return [video_dir]

    # Original logic to find video files in a directory
    video_files = []
    for root, dirs, files in os.walk(video_dir):
        for file in files:
            if file.endswith(('.mp4', '.avi', '.mov')):
                video_files.append(os.path.join(root, file))
    return video_files

def batch(video_dir, save_dir, cur_chunk, num_chunks, num_frames=16, batch_size=64):
    print(f"batch,video_dir={video_dir}")
    video_files = find_video_files(video_dir)
    print(f"batch,video_files={video_files}")
    chunked_video_files = split_into_chunks(video_files, num_chunks)[cur_chunk]
    num_batches = 0

    for i in range(0, len(chunked_video_files), batch_size):
        batch_video_files = chunked_video_files[i:i + batch_size]
        print(f"Processing batch of {len(batch_video_files)} video(s)...")

        if not batch_video_files:
            print("No video files found in the specified directory.")
            return

        batch_input = [
            {   
                "num_frames": num_frames,
                "video_path": video_path,
                "question": "Please provide a detailed description of the video, focusing on the main subjects, their actions, the background scenes.",
            } for video_path in batch_video_files
        ]

        start_time = time.time()
        states = video_qa.run_batch(batch_input, max_new_tokens=512, temperature=0.2)
        total_time = time.time() - start_time
        average_time = total_time / len(batch_video_files)
        print(f"Number of videos in batch: {len(batch_video_files)}. Average processing time per video: {average_time:.2f} seconds. Total time for this batch: {total_time:.2f} seconds")
        print(f"batch,states={states}")
        save_batch_results(batch_video_files, states, cur_chunk, num_batches, save_dir)
        num_batches += 1

    compile_and_cleanup_final_results(cur_chunk, num_batches, save_dir)

if __name__ == "__main__":

    # Create the parser
    parser = argparse.ArgumentParser(description='Run video processing with specified port.')

    # Add an argument for the port
    parser.add_argument('--port', type=int, default=30000, help='The master port for distributed serving.')
    parser.add_argument('--chunk-idx', type=int, default=0, help='The index of the chunk to process.')
    parser.add_argument('--num-chunks', type=int, default=8, help='The number of chunks to process.')
    parser.add_argument('--save-dir', type=str, default="/media/star/8T/tmp/llava_video", help='The directory to save the processed video files.')
    parser.add_argument('--video-dir', type=str, default="/media/star/8T/tmp/gpt4v/video/tmp", help='The directory or path for the processed video files.')
    parser.add_argument('--model-path', type=str, default="/media/star/8T/model/gpt/llava/llava-next/lmms-lab/LLaVA-NeXT-Video-7B-DPO", help='The model path for the video processing.')
    parser.add_argument('--num-frames', type=int, default=16, help='The number of frames to process in each video.' )
    parser.add_argument("--mm_spatial_pool_stride", type=int, default=2)

    # Parse the arguments
    args = parser.parse_args()

    cur_port = args.port
    cur_chunk = args.chunk_idx
    num_chunks = args.num_chunks
    num_frames = args.num_frames

    if "34b" in args.model_path.lower():
        tokenizer_path = "liuhaotian/llava-v1.6-34b-tokenizer"
    elif "7b" in args.model_path.lower():
        # tokenizer_path = "llava-hf/llava-1.5-7b-hf"
        tokenizer_path="/media/star/8T/model/gpt/llava/llava-hf/llava-1.5-7b-hf"
    else:
        print("Invalid model path. Please specify a valid model path.")
        exit()

    model_overide_args = {}

    model_overide_args["mm_spatial_pool_stride"] = args.mm_spatial_pool_stride
    model_overide_args["architectures"] = ["LlavaVidForCausalLM"]
    model_overide_args["num_frames"] = args.num_frames
    model_overide_args["model_type"] = "llava"
    model_overide_args["mm_vision_tower"] = "/media/star/8T/model/clip/openai_clip/clip-vit-large-patch14-336"

    if "34b" in args.model_path.lower():
        model_overide_args["image_token_index"] = 64002

    if args.num_frames == 32:
        # 32 frames produce more vision tokens than the default 4096-token context,
        # so linear RoPE scaling (factor 2.0) doubles the usable sequence length.
        model_overide_args["rope_scaling"] = {"factor": 2.0, "type": "linear"}
        model_overide_args["max_sequence_length"] = 4096 * 2
        model_overide_args["tokenizer_model_max_length"] = 4096 * 2
    elif args.num_frames < 32:
        pass
    else:
        print("The maximum number of frames to process is 32. Please specify a valid number of frames.")
        exit()

    runtime = sgl.Runtime(
        model_path=args.model_path, #"liuhaotian/llava-v1.6-vicuna-7b",
        tokenizer_path=tokenizer_path,
        port=cur_port,
        additional_ports=[cur_port + 1, cur_port + 2, cur_port + 3, cur_port + 4],
        model_overide_args=model_overide_args,
        tp_size=1
    )
    sgl.set_default_backend(runtime)
    print(f"chat template: {runtime.endpoint.chat_template.name}")

    # Run a single request
    # try:
    print("\n========== single ==========\n")
    root = args.video_dir
    if os.path.isfile(root):
        video_files = [root]
    else:
        video_files = [os.path.join(root, f) for f in os.listdir(root) if f.endswith(('.mp4', '.avi', '.mov'))]  # Add more extensions if needed
    start_time = time.time()  # Start time for processing a single video
    for cur_video in video_files[:1]:
        print(cur_video)
        single(cur_video, num_frames)
    end_time = time.time()  # End time for processing a single video
    total_time = end_time - start_time
    average_time = total_time / len(video_files[:1])  # Average over the videos actually processed
    print(f"Average processing time per video: {average_time:.2f} seconds")
    runtime.shutdown()
    # except Exception as e:
    #     print(e)

    # # # Run a batch of requests
    # print("\n========== batch ==========\n")
    # if not os.path.exists(args.save_dir):
    #     os.makedirs(args.save_dir)
    # batch(args.video_dir, args.save_dir, cur_chunk, num_chunks, num_frames)
    # runtime.shutdown()
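
For reference, the generated text can be read from the returned ProgramState instead of printing the whole object. A minimal check, assuming the same video_qa function and a running backend as above (the video path is a placeholder):

state = video_qa.run(
    num_frames=16,
    video_path="/path/to/video.mp4",  # placeholder path
    question="Describe the video.",
    max_new_tokens=128,
)
print(repr(state["answer"]))  # an empty string here reproduces the issue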

srt_example_llava_v.sh

#!/bin/bash

##### USAGE #####
#    - First node:
#      ```sh
#      bash examples/usage/llava_video/srt_example_llava_v.sh K 0 YOUR_VIDEO_PATH YOUR_MODEL_PATH FRAMES_PER_VIDEO
#      ```
#    - Second node:
#      ```sh
#      bash examples/usage/llava_video/srt_example_llava_v.sh K 1 YOUR_VIDEO_PATH YOUR_MODEL_PATH FRAMES_PER_VIDEO
#      ```
#    - The K node:
#      ```sh
#      bash examples/usage/llava_video/srt_example_llava_v.sh K K-1 YOUR_VIDEO_PATH YOUR_MODEL_PATH FRAMES_PER_VIDEO
#      ```

# Replace `K`, `YOUR_VIDEO_PATH`, `YOUR_MODEL_PATH`, and `FRAMES_PER_VIDEO` with your specific details.
# CURRENT_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
CURRENT_ROOT=$(dirname "$0")

echo ${CURRENT_ROOT}

cd ${CURRENT_ROOT}

export PYTHONWARNINGS=ignore

START_TIME=$(date +%s)  # Capture start time

NUM_NODES=$1

CUR_NODES_IDX=$2

VIDEO_DIR=$3

MODEL_PATH=$4   

NUM_FRAMES=$5

# FRAME_FORMAT=$6

# FRAME_FORMAT=$(echo $FRAME_FORMAT | tr '[:lower:]' '[:upper:]')

# # Check if FRAME_FORMAT is either JPEG or PNG
# if [[ "$FRAME_FORMAT" != "JPEG" && "$FRAME_FORMAT" != "PNG" ]]; then
#     echo "Error: FRAME_FORMAT must be either JPEG or PNG."
#     exit 1
# fi

# export TARGET_FRAMES=$TARGET_FRAMES

echo "Each video you will sample $NUM_FRAMES frames"

# export FRAME_FORMAT=$FRAME_FORMAT

# echo "The frame format is $FRAME_FORMAT"

# Assuming GPULIST is a bash array containing your GPUs
GPULIST=(0)
LOCAL_CHUNKS=${#GPULIST[@]}

echo "Number of GPUs in GPULIST: $LOCAL_CHUNKS"

ALL_CHUNKS=$((NUM_NODES * LOCAL_CHUNKS))

# Calculate GPUs per chunk
GPUS_PER_CHUNK=1

echo $GPUS_PER_CHUNK

for IDX in $(seq 1 $LOCAL_CHUNKS); do
    (
        START=$(((IDX-1) * GPUS_PER_CHUNK))
        LENGTH=$GPUS_PER_CHUNK # Length for slicing, not the end index

        CHUNK_GPUS=(${GPULIST[@]:$START:$LENGTH})

        # Convert the chunk GPUs array to a comma-separated string
        CHUNK_GPUS_STR=$(IFS=,; echo "${CHUNK_GPUS[*]}")

        LOCAL_IDX=$((CUR_NODES_IDX * LOCAL_CHUNKS + IDX))

        echo "Chunk $(($LOCAL_IDX - 1)) will run on GPUs $CHUNK_GPUS_STR"

        # Pick a pseudo-random port for this chunk to avoid collisions between chunks.
        PORT=$((10000 + RANDOM % 55536))

        MAX_RETRIES=10
        RETRY_COUNT=0
        COMMAND_STATUS=1  # Initialize as failed

        while [ $RETRY_COUNT -lt $MAX_RETRIES ] && [ $COMMAND_STATUS -ne 0 ]; do
            echo "Running chunk $(($LOCAL_IDX - 1)) on GPUs $CHUNK_GPUS_STR with port $PORT. Attempt $(($RETRY_COUNT + 1))"

            CUDA_VISIBLE_DEVICES=$CHUNK_GPUS_STR python3 srt_example_llava_v.py \
            --port $PORT \
            --num-chunks $ALL_CHUNKS \
            --chunk-idx $(($LOCAL_IDX - 1)) \
            --save-dir /media/star/8T/tmp/llava_video \
            --video-dir $VIDEO_DIR \
            --model-path $MODEL_PATH \
            --num-frames $NUM_FRAMES #&

            wait $!  # Wait for the process to finish and capture its exit status
            COMMAND_STATUS=$?

            if [ $COMMAND_STATUS -ne 0 ]; then
                echo "Execution failed for chunk $(($LOCAL_IDX - 1)), attempt $(($RETRY_COUNT + 1)). Retrying..."
                RETRY_COUNT=$(($RETRY_COUNT + 1))
                sleep 180  # Wait a bit before retrying
            else
                echo "Execution succeeded for chunk $(($LOCAL_IDX - 1))."
            fi
        done

        if [ $COMMAND_STATUS -ne 0 ]; then
            echo "Execution failed for chunk $(($LOCAL_IDX - 1)) after $MAX_RETRIES attempts."
        fi
    ) #&
    sleep 2  # Slight delay to stagger the start times
done

wait

cat /media/star/8T/tmp/llava_video/final_results_chunk_*.csv > /media/star/8T/tmp/llava_video/final_results_node_${CUR_NODES_IDX}.csv
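# Note: each chunk CSV begins with its own header row, so the concatenated
# node-level file contains one header line per chunk.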

END_TIME=$(date +%s)  # Capture end time
ELAPSED_TIME=$(($END_TIME - $START_TIME))
echo "Total execution time: $ELAPSED_TIME seconds."

Running the bash script

bash examples/usage/llava_video/srt_example_llava_v.sh 1 0 "/media/star/8T/tmp/gpt4v/video/tmp/1.mp4" "/media/star/8T/model/gpt/llava/llava-next/lmms-lab/LLaVA-NeXT-Video-7B-DPO" 32

Terminal output

(sglang) star@star-SYS-7049GP-TRT:/media/star/8T/PycharmProjects/github/train_inference_accelerate/sglang$ bash examples/usage/llava_video/srt_example_llava_v.sh 1 0 "/media/star/8T/tmp/gpt4v/video/tmp/1.mp4" "/media/star/8T/model/gpt/llava/llava-next/lmms-lab/LLaVA-NeXT-Video-7B-DPO" 16
examples/usage/llava_video
Each video you will sample 16 frames
Number of GPUs in GPULIST: 1
1
Chunk 0 will run on GPUs 0
Running chunk 0 on GPUs 0 with port 30256. Attempt 1
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
target_frames: 16
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
chat template: vicuna_v1.1

========== single ==========

/media/star/8T/tmp/gpt4v/video/tmp/1.mp4
single path=/media/star/8T/tmp/gpt4v/video/tmp/1.mp4
single state=ProgramState(A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:)

Average processing time per video: 0.01 seconds
Execution succeeded for chunk 0.
Total execution time: 32 seconds.

Why is the inference result empty?

ZhangYuanhan-AI commented 1 month ago

Hi, thanks for your interest!

It works well on my side. What GPU do you use?

AmazDeng commented 1 month ago

> Hi, thanks for your interest!
> It works well on my side. What GPU do you use?

A100 80G, only one GPU card.

AmazDeng commented 1 month ago

> Hi, thanks for your interest!
> It works well on my side. What GPU do you use?

I have also tested the llava-next-image demo code, and its output is correct, not empty. Which version of sglang do you use?

AmazDeng commented 4 weeks ago

Could you please take a look at this issue? @merrymercy