deepseek-ai / DeepSeek-VL

DeepSeek-VL: Towards Real-World Vision-Language Understanding
https://huggingface.co/spaces/deepseek-ai/DeepSeek-VL-7B
MIT License

Batching #48

Open SinanAkkoyun opened 2 months ago

SinanAkkoyun commented 2 months ago

Is this code "optimal" for batched inference and preprocessing?

SinanAkkoyun commented 2 months ago
import time
import torch
from transformers import AutoModelForCausalLM
from PIL import Image
from deepseek_vl.models import VLChatProcessor, MultiModalityCausalLM
from deepseek_vl.utils.io import load_pil_images
import concurrent.futures

# Initialize the model and processor
model_path = "deepseek-ai/deepseek-vl-1.3b-chat"
vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer
vl_gpt = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Function to load and process images and text per thread
def process_conversation(conversation_piece):
    # Load images directly using the entire conversation piece
    pil_images = load_pil_images(conversation_piece)
    prepare_inputs = vl_chat_processor(
        conversations=conversation_piece,
        images=pil_images,
        force_batchify=True
    ).to(vl_gpt.device)
    return prepare_inputs

n_threads = 8

conversation = [
    [
        {"role": "User", "content": "Thoroughly describe <image_placeholder>.", "images": ["../../man_wave.png"]},
        {"role": "Assistant", "content": ""}
    ] for _ in range(n_threads)
]
start = time.time()

# Using ThreadPoolExecutor to parallelize image loading and input preparation
with concurrent.futures.ThreadPoolExecutor(max_workers=n_threads) as executor:
    futures = [executor.submit(process_conversation, conv) for conv in conversation]
    results = [f.result() for f in futures]  # keep submission order (as_completed would return results out of order)

print("Time for preprocessing: ", time.time() - start)

# Aggregate results from threads (torch.cat works here because every prompt is
# identical, so all input_ids tensors share the same sequence length; otherwise pad first)
input_ids = torch.cat([res.input_ids for res in results], dim=0)
pixel_values = torch.cat([res.pixel_values for res in results], dim=0)
attention_mask = torch.cat([res.attention_mask for res in results], dim=0)
images_seq_mask = torch.cat([res.images_seq_mask for res in results], dim=0)
images_emb_mask = torch.cat([res.images_emb_mask for res in results], dim=0)
sft_format = [res.sft_format for res in results]

# Run model to get the response
inputs_embeds = vl_gpt.prepare_inputs_embeds(
    input_ids=input_ids,
    pixel_values=pixel_values,
    images_seq_mask=images_seq_mask,
    images_emb_mask=images_emb_mask
)

outputs = vl_gpt.language_model.generate(
    inputs_embeds=inputs_embeds,
    attention_mask=attention_mask,
    pad_token_id=tokenizer.eos_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    max_new_tokens=1,
    do_sample=False,
    use_cache=True
)

# Decode every generated sequence in the batch
answers = [tokenizer.decode(out.cpu().tolist(), skip_special_tokens=True) for out in outputs]
print(sft_format[0], answers[0])  # all conversations are identical, so printing the first is enough

end = time.time()
print("Time taken to process: ", end - start)
SinanAkkoyun commented 2 months ago

The vl_chat_processor seems to take up the most time, more than the image embedding model itself. If possible, it would be awesome to get help on optimizing the preprocessing or parallelizing it more effectively, which would make this great for serving.
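For reference, here is a rough sketch of what I mean by better parallelization (not tested against this repo; the names preprocess_batch and _init_worker are just illustrative). If the bottleneck is GIL-bound Python work inside VLChatProcessor, worker processes instead of threads might help: each worker loads its own processor and does the CPU-side image loading and tokenization, and the main process concatenates the results and moves them to the GPU once. This assumes the processor's output can be pickled back to the parent process.

import multiprocessing as mp

import torch

from deepseek_vl.models import VLChatProcessor
from deepseek_vl.utils.io import load_pil_images

MODEL_PATH = "deepseek-ai/deepseek-vl-1.3b-chat"

_worker_processor = None  # one VLChatProcessor instance per worker process


def _init_worker():
    # Runs once per worker: load the processor locally so the parent
    # process never has to pickle it.
    global _worker_processor
    _worker_processor = VLChatProcessor.from_pretrained(MODEL_PATH)


def _preprocess(conversation_piece):
    # CPU-only work in the worker: image loading + tokenization.
    pil_images = load_pil_images(conversation_piece)
    return _worker_processor(
        conversations=conversation_piece,
        images=pil_images,
        force_batchify=True,
    )


def preprocess_batch(conversations, num_workers=8):
    # Wrap the call site in `if __name__ == "__main__":` when the spawn start method is used.
    with mp.Pool(processes=num_workers, initializer=_init_worker) as pool:
        results = pool.map(_preprocess, conversations)  # order is preserved

    # Concatenate on CPU and move everything to the GPU once in the main process.
    # Note: torch.cat only works here because all prompts are identical and every
    # input_ids tensor has the same length; otherwise pad to a common length first.
    batch = {
        "input_ids": torch.cat([r.input_ids for r in results], dim=0),
        "pixel_values": torch.cat([r.pixel_values for r in results], dim=0),
        "attention_mask": torch.cat([r.attention_mask for r in results], dim=0),
        "images_seq_mask": torch.cat([r.images_seq_mask for r in results], dim=0),
        "images_emb_mask": torch.cat([r.images_emb_mask for r in results], dim=0),
    }
    return {k: v.cuda() for k, v in batch.items()}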

soloice commented 2 months ago

Is this code "optimal" for batched inference and preprocessing?

Nope. It's just a toy demo, not intended for production use.

SinanAkkoyun commented 2 months ago

I know, that's why I at least tried to quickly "parallelize" the processor. Any help in actually optimizing it for batching is greatly appreciated.

gullalc commented 2 months ago

You could use a custom dataset class and a DataLoader to do the batching. This is how I run it, and it is quite fast.

import os

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForCausalLM

from deepseek_vl.models import VLChatProcessor, MultiModalityCausalLM
from deepseek_vl.utils.io import load_pil_images


class CustomDataset(Dataset):
    def __init__(self, image_folder, prompt, processor):
        self.image_ids = [os.path.join(image_folder, f) for f in os.listdir(image_folder)]
        self.prompt = prompt
        self.processor = processor

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, idx):
        image_path = self.image_ids[idx]

        prompt = [
                    {
                        "role": "User",
                        "content": "<image_placeholder>%s"%(self.prompt),
                        "images": [image_path]
                    },
                    {
                        "role": "Assistant",
                        "content": ""
                    }
                ]

        pil_images = load_pil_images(prompt)
        res = self.processor(conversations=prompt, images=pil_images, force_batchify=True)

        return image_path, res.input_ids.squeeze(0), res.pixel_values.squeeze(0), \
                    res.attention_mask.squeeze(0), res.images_seq_mask.squeeze(0), res.images_emb_mask.squeeze(0)

def process_images(model, tokenizer, loader):
    image_descriptions = {}
    for i, batch in enumerate(loader):

        print(f'Processing batch {i + 1}/{len(loader)}')

        image_ids, input_ids, pixel_values, attention_mask, images_seq_mask, images_emb_mask = batch

        input_ids, pixel_values, attention_mask, images_seq_mask, images_emb_mask = input_ids.to(model.device), \
                                            pixel_values.to(model.device), attention_mask.to(model.device), \
                                            images_seq_mask.to(model.device), images_emb_mask.to(model.device)

        print(input_ids.shape, pixel_values.shape, attention_mask.shape, images_seq_mask.shape, images_emb_mask.shape)

        with torch.no_grad():
            inputs_embeds = model.prepare_inputs_embeds(
                input_ids=input_ids,
                pixel_values=pixel_values.to(torch.bfloat16),
                images_seq_mask=images_seq_mask,
                images_emb_mask=images_emb_mask
            )

            outputs = model.language_model.generate(
                    inputs_embeds=inputs_embeds,
                    attention_mask=attention_mask,
                    pad_token_id=tokenizer.eos_token_id,
                    bos_token_id=tokenizer.bos_token_id,
                    eos_token_id=tokenizer.eos_token_id,
                    max_new_tokens=256,
                    do_sample=False,
                    use_cache=True
                )

        for image_id, output in zip(image_ids, outputs):
            answer = tokenizer.decode(output.cpu().tolist(), skip_special_tokens=True)
            answer = answer.split("Assistant:")[-1].strip()
            image_descriptions[image_id] = answer

    return image_descriptions

## Model, Processor and Tokenizer
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained("deepseek-ai/deepseek-vl-7b-chat")
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained("deepseek-ai/deepseek-vl-7b-chat", trust_remote_code=True)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

prompt = "Describe this image in detail."
image_folder = "path/to/images"  # placeholder: point this at the directory of images to caption

## Dataloader
dataloader = DataLoader(CustomDataset(image_folder, prompt, vl_chat_processor), batch_size=32, shuffle=False, pin_memory=True, num_workers=8)

image_descriptions = process_images(vl_gpt, tokenizer, dataloader)
SinanAkkoyun commented 2 months ago

Oh wow, that's a very cool approach, thank you so much for sharing it! I will try it out ASAP.

Youho99 commented 1 month ago

I'm also interested in making inference faster. I only use the model for inference.