vikhyat / moondream

tiny vision language model
https://moondream.ai
Apache License 2.0

IndexError: index 185 is out of bounds for dimension 0 with size 1 #116

Closed pagalscientist closed 3 months ago

pagalscientist commented 3 months ago

I took the exact same code from the fine-tuning notebook but tweaked the dataset part a little. The model trains fine, exports fine, and even loads fine, but at inference I get IndexError: index 185 is out of bounds for dimension 0 with size 1 (the index number changes between runs). What could be the issue? When I pass the same image to the pretrained model it works perfectly, but when I pass it to the fine-tuned one it throws the index error. Here's the code:

import json
import os
from torch.utils.data import Dataset
from PIL import Image

class CustomDataset(Dataset):
    def __init__(self, json_file, images_dir):
        with open(json_file, 'r') as f:
            self.data = json.load(f)
        self.images_dir = images_dir

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        image_id = item["image_id"]
        questions = item["questions"]

        image_path = os.path.join(self.images_dir, image_id)
        image = Image.open(image_path).convert("RGB")

        # Format questions and answers
        qa_pairs = [{"question": q["question"], "answer": q["answer"]} for q in questions]

        return {
            "image": image,
            "qa": qa_pairs
        }

dataset = CustomDataset("data.json", "images/")
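For reference, data.json is expected to follow this structure (a made-up example, not my real data; it just mirrors the fields __getitem__ reads):

# Hypothetical example of the structure data.json is assumed to have.
example_data = [
    {
        "image_id": "0001.jpg",
        "questions": [
            {"question": "What is in the image?", "answer": "A cat on a chair."},
            {"question": "What color is the cat?", "answer": "Black and white."},
        ],
    },
]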

import math

import torch  # needed below for torch.stack / torch.tensor / torch.no_grad
from torch.utils.data import DataLoader
from bitsandbytes.optim import Adam8bit
from einops import rearrange
from tqdm import tqdm

ANSWER_EOS = "<|endoftext|>"

# Number of tokens used to represent each image.

IMG_TOKENS = 729

def collate_fn(batch):
    images = [sample['image'] for sample in batch]
    images = torch.stack(moondream.vision_encoder.preprocess(images))
    images = rearrange(images, "b c (h p1) (w p2) -> b (h w) (c p1 p2)", p1=14, p2=14)

    labels_acc = []
    tokens_acc = []

    for sample in batch:
        toks = [tokenizer.bos_token_id]
        labs = [-100] * (IMG_TOKENS + 1)

        for qa in sample['qa']:
            q_t = tokenizer(
                f"\n\nQuestion: {qa['question']}\n\nAnswer:",
                add_special_tokens=False
            ).input_ids
            toks.extend(q_t)
            labs.extend([-100] * len(q_t))

            a_t = tokenizer(
                f" {qa['answer']}{ANSWER_EOS}",
                add_special_tokens=False
            ).input_ids
            toks.extend(a_t)
            labs.extend(a_t)

        tokens_acc.append(toks)
        labels_acc.append(labs)

    max_len = -1
    for labels in labels_acc:
        max_len = max(max_len, len(labels))

    attn_mask_acc = []

    for i in range(len(batch)):
        len_i = len(labels_acc[i])
        pad_i = max_len - len_i

        labels_acc[i].extend([-100] * pad_i)
        tokens_acc[i].extend([tokenizer.eos_token_id] * pad_i)
        attn_mask_acc.append([1] * len_i + [0] * pad_i)

    return (
        images.to(dtype=DTYPE),
        torch.stack([torch.tensor(t, dtype=torch.long) for t in tokens_acc]),
        torch.stack([torch.tensor(l, dtype=torch.long) for l in labels_acc]),
        torch.stack([torch.tensor(a, dtype=torch.bool) for a in attn_mask_acc]),
    )

def compute_loss(batch):
    images, tokens, labels, attn_mask = batch

    images = images.to(DEVICE)
    tokens = tokens.to(DEVICE)
    labels = labels.to(DEVICE)
    attn_mask = attn_mask.to(DEVICE)

    with torch.no_grad():
        img_embs = moondream.vision_encoder.encoder(images)
        img_embs = moondream.vision_encoder.projection(img_embs)

    tok_embs = moondream.text_model.get_input_embeddings()(tokens)
    inputs_embeds = torch.cat((tok_embs[:, 0:1, :], img_embs, tok_embs[:, 1:, :]), dim=1)

    outputs = moondream.text_model(
        inputs_embeds=inputs_embeds,
        labels=labels,
        attention_mask=attn_mask,
    )

    return outputs.loss

def lr_schedule(step, max_steps):
    x = step / max_steps
    if x < 0.1:
        return 0.1 * LR + 0.9 * LR * x / 0.1
    else:
        return 0.1 * LR + 0.9 * LR * (1 + math.cos(math.pi * (x - 0.1))) / 2

dataloaders = {
    "train": DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        collate_fn=collate_fn,
    ),
    "val": DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        collate_fn=collate_fn,
    ),
}

moondream.text_model.train()
moondream.text_model.transformer.gradient_checkpointing_enable()

total_steps = EPOCHS * len(dataloaders["train"]) // GRAD_ACCUM_STEPS
optimizer = Adam8bit(
    [
        {"params": moondream.text_model.parameters()},
    ],
    lr=LR * 0.1,
    betas=(0.9, 0.95),
    eps=1e-6
)

if USE_WANDB:
    import wandb
    wandb.init(
        project="moondream-ft",
        config={
            "EPOCHS": EPOCHS,
            "BATCH_SIZE": BATCH_SIZE,
            "GRAD_ACCUM_STEPS": GRAD_ACCUM_STEPS,
            "LR": LR,
        }
    )

i = 0
for epoch in range(EPOCHS):
    for batch in tqdm(dataloaders["train"], desc=f"Epoch {epoch + 1}/{EPOCHS}"):
        i += 1

        loss = compute_loss(batch)
        loss.backward()

        if i % GRAD_ACCUM_STEPS == 0:
            optimizer.step()
            optimizer.zero_grad()

            lr = lr_schedule(i / GRAD_ACCUM_STEPS, total_steps)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

        if i % 100 == 0 and USE_WANDB:
            # Calculate validation loss
            val_loss = 0
            for val_batch in tqdm(dataloaders["val"], desc="Validation"):
                with torch.no_grad():
                    val_loss += compute_loss(val_batch).item()
            val_loss /= len(dataloaders["val"])

        if USE_WANDB:
            wandb.log({
                "loss/train": loss.item(),
                "lr": optimizer.param_groups[0]['lr']
            } | ({"loss/val": val_loss} if i % 100 == 0 else {}))

if USE_WANDB: wandb.finish()

moondream.save_pretrained("checkpoints/moondream2-fine_tuned")

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

moondream = AutoModelForCausalLM.from_pretrained(
    "checkpoints/moondream2-fine_tuned",
    trust_remote_code=True,
    ignore_mismatched_sizes=True,  # it throws an error if I do not set ignore_mismatched_sizes. This could be the issue.
)
moondream.eval()

MD_REVISION = "2024-05-20"
tokenizer = AutoTokenizer.from_pretrained("vikhyatk/moondream2", revision=MD_REVISION)

from PIL import Image
image = Image.open("sample_test.jpeg")

md_answer = moondream.answer_question(
    moondream.encode_image(image),
    "describe the image",
    tokenizer=tokenizer,
)
md_answer

But it always throws the error.
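One thing I noticed but have not verified: ignore_mismatched_sizes=True silently re-initializes any weight whose checkpoint shape does not match the config, so a resized text embedding or output head could plausibly explain an out-of-bounds token index. A quick shape comparison (just a sketch, reusing the same loading calls as above) would be:

# Sketch of a sanity check, not from the notebook: compare the text
# embedding shapes of the pretrained and fine-tuned checkpoints. If the
# fine-tuned one was re-initialized because of a size mismatch, looking
# up a token id like 185 in it could go out of bounds.
pretrained = AutoModelForCausalLM.from_pretrained(
    "vikhyatk/moondream2", revision=MD_REVISION, trust_remote_code=True
)
print(pretrained.text_model.get_input_embeddings().weight.shape)
print(moondream.text_model.get_input_embeddings().weight.shape)  # fine-tuned model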

pagalscientist commented 3 months ago

Issue resolved, I had messed up the code.