UKPLab / sentence-transformers

State-of-the-Art Text Embeddings
https://www.sbert.net
Apache License 2.0
14.78k stars 2.43k forks

CLIP training example is broken #2723

Open ffletcherr opened 3 months ago

ffletcherr commented 3 months ago

The CLIP training example is broken.

It raises this error:

TypeError: 'JpegImageFile' object is not subscriptable

in this section, at line 151:

https://github.com/UKPLab/sentence-transformers/blob/fc1b7d0f308585e8caafbc9ca96af099b0931f73/sentence_transformers/models/Transformer.py#L135-L153
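
For reference, a simplified sketch of the branch those lines implement (paraphrased, not the verbatim source): when the first input is neither a str nor a dict, tokenize assumes (text, text) tuples and indexes into each item, which fails on a PIL image.

# Simplified sketch of Transformer.tokenize, paraphrased from the linked lines.
def tokenize_sketch(texts):
    if isinstance(texts[0], str):
        to_tokenize = [texts]
    elif isinstance(texts[0], dict):
        to_tokenize = [[next(iter(d.values())) for d in texts]]
    else:
        # Fallback: assumes pairs like ("anchor", "positive").
        batch1, batch2 = [], []
        for text_tuple in texts:
            # A PIL JpegImageFile lands in this branch; indexing it raises
            # TypeError: 'JpegImageFile' object is not subscriptable.
            batch1.append(text_tuple[0])
            batch2.append(text_tuple[1])
        to_tokenize = [batch1, batch2]
    return to_tokenize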

tomaarsen commented 3 months ago

Hello!

The CLIP training script: https://github.com/UKPLab/sentence-transformers/blob/fc1b7d0f308585e8caafbc9ca96af099b0931f73/examples/training/clip/train_clip.ipynb still seems to work.

I'm curious to what extent your training setup differs from this one, as I'd like to get this fixed for better CLIP training support.

ffletcherr commented 3 months ago

Hello Tom, thank you for the quick response.

Try this to reproduce the same error:

import requests
from PIL import Image
from sentence_transformers import InputExample, SentenceTransformer, losses
from torch.utils.data import DataLoader

model = SentenceTransformer("sentence-transformers/clip-ViT-B-32-multilingual-v1")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image_1 = Image.open(requests.get(url, stream=True).raw).convert("RGB")
image_2 = Image.open(requests.get(url, stream=True).raw).convert("RGB")
images = [image_1, image_2]
texts = ["a cat", "two dogs"]

# Pair each PIL image with a caption, as in the CLIP training example.
train_dataset = []
for pil_image, text in zip(images, texts):
    train_dataset.append(InputExample(texts=[pil_image, text]))

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=4)
train_loss = losses.MultipleNegativesSymmetricRankingLoss(model=model)

# Raises: TypeError: 'JpegImageFile' object is not subscriptable
model.fit([(train_dataloader, train_loss)], epochs=5, show_progress_bar=True)

Versions:

sentence-transformers==2.7.0
transformers==4.41.2
Pillow==10.3.0
torch==2.2.1+cu118
ffletcherr commented 2 months ago

That was my mistake. sentence-transformers/clip-ViT-B-32-multilingual-v1 is a BERT-style Sentence Transformers model, not a CLIP model!

Also, other CLIP models on Hugging Face can't be loaded with SentenceTransformer(model_path); for example, openai/clip-vit-base-patch32 raises AttributeError: 'CLIPConfig' object has no attribute 'hidden_size'.

Only models that have a 0_CLIPModel folder in their repo can be loaded, like sentence-transformers/clip-ViT-B-32 or sentence-transformers/clip-ViT-L-14.
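
For reference, loading one of those repos works directly; a minimal sketch (the image filename is a placeholder):

from PIL import Image
from sentence_transformers import SentenceTransformer

# clip-ViT-B-32 ships a 0_CLIPModel module folder, so it loads directly.
model = SentenceTransformer("sentence-transformers/clip-ViT-B-32")

# CLIP modules in sentence-transformers accept PIL images and plain
# strings through the same encode() call.
img_emb = model.encode(Image.open("two_dogs_in_snow.jpg"))
text_emb = model.encode(["Two dogs in the snow", "A cat on a table"])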

But I needed a CLIP model with a different tokenizer that handles multilingual text better. I tried the code below to customize the CLIP processor (tokenizer) and the text model (token embedding layer):

import json
from pathlib import Path

from sentence_transformers import SentenceTransformer
from sentence_transformers.models import CLIPModel as SBERTCLIPModel
from transformers import AutoTokenizer, CLIPImageProcessor
from transformers import CLIPModel as HFCLIPModel
from transformers import CLIPProcessor
from transformers.models.clip.modeling_clip import CLIPTextEmbeddings

class CustomCLIPProcessor(CLIPProcessor):
    # Let the processor wrap any tokenizer, not just CLIPTokenizer(Fast).
    tokenizer_class = "AutoTokenizer"

class CustomCLIPModel(SBERTCLIPModel):
    def __init__(self, clip_model=None, clip_processor=None, model_name=None):
        # Note: the parent __init__ loads its default checkpoint, which is
        # then replaced below.
        super(CustomCLIPModel, self).__init__()
        if model_name is None:
            self.model = clip_model
            self.processor = clip_processor
        else:
            self.model = HFCLIPModel.from_pretrained(model_name)
            self.processor = CustomCLIPProcessor.from_pretrained(model_name)

    @staticmethod
    def load(input_path: str) -> "CustomCLIPModel":
        return CustomCLIPModel(model_name=input_path)

models_folder = Path("./")
custom_model_path = models_folder / "custom-clip-vit-b-32"
openai_clip_path = "openai/clip-vit-base-patch32"
multilingual_path = "sentence-transformers/clip-ViT-B-32-multilingual-v1"

# Pair the multilingual tokenizer with the OpenAI CLIP image processor.
tokenizer = AutoTokenizer.from_pretrained(multilingual_path)
image_processor = CLIPImageProcessor.from_pretrained(openai_clip_path)
clip_processor = CustomCLIPProcessor(image_processor=image_processor, tokenizer=tokenizer)

# Swap in a token embedding layer sized for the new vocabulary.
# The new embeddings are randomly initialized, so the text tower has to
# be (re)trained before the model is useful.
clip_model = HFCLIPModel.from_pretrained(openai_clip_path)
clip_model.text_model.config.update({"vocab_size": tokenizer.vocab_size})
clip_model.text_model.embeddings = CLIPTextEmbeddings(clip_model.text_model.config)

sbert_clip_model = CustomCLIPModel(clip_model, clip_processor)
model = SentenceTransformer(modules=[sbert_clip_model])
model.save_pretrained(custom_model_path)

# Point modules.json at the custom class so loading resolves it.
modules_config = json.loads((custom_model_path / "modules.json").read_text())
modules_config[0].update({"type": "__main__.CustomCLIPModel"})
(custom_model_path / "modules.json").write_text(json.dumps(modules_config, indent=2))

Saved files and folders:

[Screenshot: contents of the saved custom-clip-vit-b-32 folder]

And for loading the saved model:

from pathlib import Path

from sentence_transformers import SentenceTransformer
from sentence_transformers.models import CLIPModel as SBERTCLIPModel
from transformers import AutoTokenizer
from transformers import CLIPModel as HFCLIPModel
from transformers import CLIPProcessor

class CustomCLIPProcessor(CLIPProcessor):
    tokenizer_class = "AutoTokenizer"

class CustomCLIPModel(SBERTCLIPModel):
    def __init__(self, clip_model=None, clip_processor=None, model_name=None):
        super(CustomCLIPModel, self).__init__()
        if model_name is None:
            self.model = clip_model
            self.processor = clip_processor
        else:
            self.model = HFCLIPModel.from_pretrained(model_name)
            self.processor = CustomCLIPProcessor.from_pretrained(model_name)

    @staticmethod
    def load(input_path: str) -> "CustomCLIPModel":
        return CustomCLIPModel(model_name=input_path)

models_folder = Path("./")
custom_model_path = models_folder / "custom-clip-vit-b-32"

# modules.json points at __main__.CustomCLIPModel, so the class must be
# defined here and its load() is used to restore the module.
model = SentenceTransformer(custom_model_path.as_posix())
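
As a quick smoke test after loading (image path and captions are placeholders; the swapped-in text embeddings are still untrained, so similarities are meaningless until after fine-tuning):

from PIL import Image

img_emb = model.encode(Image.open("example.jpg"))
text_emb = model.encode(["a cat", "deux chiens dans la neige"])
print(img_emb.shape, text_emb.shape)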
Oscarjia commented 2 months ago

I have also experienced this error; I think this issue is related to #1684:

y", line 1885, in train
    return inner_training_loop(
  File "/lib/python3.10/site-packages/transformers/trainer.p
y", line 2291, in _inner_training_loop
    self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval)
  File "/lib/python3.10/site-packages/transformers/trainer.p
y", line 2721, in _maybe_log_save_evaluate
    metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
  File "/lib/python3.10/site-packages/sentence_transformers/
trainer.py", line 382, in evaluate
    return super().evaluate(eval_dataset, ignore_keys, metric_key_prefix)
  File "/lib/python3.10/site-packages/transformers/trainer.py", line 3572, in evaluate
    output = eval_loop(
  File "/lib/python3.10/site-packages/sentence_transformers/trainer.py", line 392, in evaluation_loop
    output = super().evaluation_loop(
  File "/lib/python3.10/site-packages/transformers/trainer.py", line 3747, in evaluation_loop
    for step, inputs in enumerate(dataloader):
  File "/lib/python3.10/site-packages/accelerate/data_loader.py", line 464, in __iter__
    next_batch = next(dataloader_iter)
  File "/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 630, in __next__
    data = self._next_data()
  File "/lib/python3.10/site-packages/torch/utils/data/datal:
data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
  File "/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
    return self.collate_fn(data)
  File "/lib/python3.10/site-packages/sentence_transformers/data_collator.py", line 37, in __call__
    tokenized = self.tokenize_fn([row[column] for row in features])
  File "/lib/python3.10/site-packages/sentence_transformers/SentenceTransformer.py", line 922, in tokenize
    return self._first_module().tokenize(texts)
  File "/lib/python3.10/site-packages/sentence_transformers/models/Transformer.py", line 154, in tokenize
    batch1.append(text_tuple[0])
TypeError: 'NoneType' object is not subscriptable
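
Reading the traceback, the same fallback branch of Transformer.tokenize is reached, but with None instead of a PIL image, which suggests a column of the eval dataset contains None values. A hedged workaround sketch (the dataset and column names are hypothetical) that drops such rows before training:

from datasets import Dataset

# Hypothetical pair dataset where some rows contain None.
data = {
    "anchor": ["a cat", None, "a dog"],
    "positive": ["photo of a cat", "photo of a bird", None],
}
dataset = Dataset.from_dict(data)

# Drop rows where any column is None before handing the dataset to the
# trainer; the data collator does not skip them on its own.
clean = dataset.filter(lambda row: all(v is not None for v in row.values()))
print(len(dataset), "->", len(clean))  # 3 -> 1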