UKPLab / sentence-transformers

State-of-the-Art Text Embeddings
https://www.sbert.net
Apache License 2.0

[CLIP] 'clip-ViT-B-32' can we not change the max_seq_lenght? #1269

Closed gaceladri closed 2 years ago

gaceladri commented 2 years ago

Hi,

I am playing with the CLIP model. Can we not change the maximum sequence length of the CLIP model? I was trying to encode quora_duplicate_questions (which contains some "long" sentences > 77 word pieces). I borrowed the code from here

The code to reproduce the issue:


```
from sentence_transformers import SentenceTransformer, util
import os
import csv
import pickle
import time
import hnswlib

model_name = 'clip-ViT-B-32'
model = SentenceTransformer(model_name)
model.max_seq_length = 512

url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
dataset_path = "quora_duplicate_questions.tsv"
max_corpus_size = 100000

embedding_cache_path = 'quora-embeddings-{}-size-{}.pkl'.format(
    model_name.replace('/', '_'), max_corpus_size)

embedding_size = 512  # Size of embeddings
top_k_hits = 10       # Output k hits

# Check if embedding cache path exists
if not os.path.exists(embedding_cache_path):
    # Check if the dataset exists. If not, download it
    if not os.path.exists(dataset_path):
        print("Download dataset")
        util.http_get(url, dataset_path)

    # Get all unique sentences from the file
    corpus_sentences = set()
    with open(dataset_path, encoding='utf8') as fIn:
        reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
        for row in reader:
            if len(row['question1']) > 128:
                continue
            corpus_sentences.add(row['question1'])
            if len(corpus_sentences) >= max_corpus_size:
                break

            if len(row['question2']) > 128:
                continue
            corpus_sentences.add(row['question2'])
            if len(corpus_sentences) >= max_corpus_size:
                break

    corpus_sentences = list(corpus_sentences)
    print("Encode the corpus. This might take a while")
    corpus_embeddings = model.encode(
        corpus_sentences, show_progress_bar=True, convert_to_numpy=True)

    print("Store file on disc")
    with open(embedding_cache_path, "wb") as fOut:
        pickle.dump({'sentences': corpus_sentences,
                     'embeddings': corpus_embeddings}, fOut)
else:
    print("Load pre-computed embeddings from disc")
    with open(embedding_cache_path, "rb") as fIn:
        cache_data = pickle.load(fIn)
        corpus_sentences = cache_data['sentences']
        corpus_embeddings = cache_data['embeddings']

# Defining our hnswlib index
index_path = "./hnswlib.index"

# We use Inner Product (dot-product) as index. We will normalize our vectors
# to unit length, then Inner Product is equal to cosine similarity
index = hnswlib.Index(space='cosine', dim=embedding_size)

if os.path.exists(index_path):
    print("Loading index...")
    index.load_index(index_path)
else:
    # Create the HNSWLIB index
    print("Start creating HNSWLIB index")
    index.init_index(max_elements=len(corpus_embeddings), ef_construction=400, M=64)

    # Then we train the index to find a suitable clustering
    index.add_items(corpus_embeddings, list(range(len(corpus_embeddings))))

    print("Saving index to:", index_path)
    index.save_index(index_path)

# Controlling the recall by setting ef:
index.set_ef(50)  # ef should always be > top_k_hits

######### Search in the index ###########

print("Corpus loaded with {} sentences / embeddings".format(len(corpus_sentences)))

# while True:
inp_question = input("Please enter a question: ")

start_time = time.time()
question_embedding = model.encode(inp_question)

# We use hnswlib knn_query method to find the top_k_hits
corpus_ids, distances = index.knn_query(question_embedding, k=top_k_hits)

# We extract corpus ids and scores for the first query
hits = [{'corpus_id': id, 'score': 1 - score}
        for id, score in zip(corpus_ids[0], distances[0])]
hits = sorted(hits, key=lambda x: x['score'], reverse=True)
end_time = time.time()

print("Input question:", inp_question)
print("Results (after {:.3f} seconds):".format(end_time - start_time))
for hit in hits[0:top_k_hits]:
    print("\t{:.3f}\t{}".format(hit['score'], corpus_sentences[hit['corpus_id']]))

# Approximate Nearest Neighbor (ANN) is not exact, it might miss entries with high cosine similarity
# Here, we compute the recall of ANN compared to the exact results
correct_hits = util.semantic_search(
    question_embedding, corpus_embeddings, top_k=top_k_hits)[0]
correct_hits_ids = set([hit['corpus_id'] for hit in correct_hits])

ann_corpus_ids = set([hit['corpus_id'] for hit in hits])
if len(ann_corpus_ids) != len(correct_hits_ids):
    print("Approximate Nearest Neighbor returned a different number of results than expected")

recall = len(ann_corpus_ids.intersection(correct_hits_ids)) / len(correct_hits_ids)
print("\nApproximate Nearest Neighbor Recall@{}: {:.2f}".format(top_k_hits, recall * 100))

if recall < 1:
    print("Missing results:")
    for hit in correct_hits[0:top_k_hits]:
        if hit['corpus_id'] not in ann_corpus_ids:
            print("\t{:.3f}\t{}".format(hit['score'], corpus_sentences[hit['corpus_id']]))
print("\n\n========\n")
```

The issue is that despite setting model.max_seq_length = 512, the dimension of the position_embeddings remains 77.

The error:

```
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
/tmp/ipykernel_66931/2906114839.py in <module>
     49 corpus_sentences = list(corpus_sentences)
     50 print("Encode the corpus. This might take a while")
---> 51 corpus_embeddings = model.encode(
     52     corpus_sentences, show_progress_bar=True, convert_to_numpy=True)
     53

~/anaconda3/envs/nvt/lib/python3.8/site-packages/sentence_transformers/SentenceTransformer.py in encode(self, sentences, batch_size, show_progress_bar, output_value, convert_to_numpy, convert_to_tensor, device, normalize_embeddings)
    155
    156         with torch.no_grad():
--> 157             out_features = self.forward(features)
    158
    159         if output_value == 'token_embeddings':

~/anaconda3/envs/nvt/lib/python3.8/site-packages/torch/nn/modules/container.py in forward(self, input)
    115     def forward(self, input):
    116         for module in self:
--> 117             input = module(input)
    118         return input
    119

~/anaconda3/envs/nvt/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    725             result = self._slow_forward(*input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(
    729                 _global_forward_hooks.values(),

~/anaconda3/envs/nvt/lib/python3.8/site-packages/sentence_transformers/models/CLIPModel.py in forward(self, features)
     27
     28         if 'input_ids' in features:
---> 29             text_outputs = self.model.text_model(
     30                 input_ids=features.get('input_ids'),
     31                 attention_mask=features.get('attention_mask', None),

~/anaconda3/envs/nvt/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    725             result = self._slow_forward(*input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(
    729                 _global_forward_hooks.values(),

~/anaconda3/envs/nvt/lib/python3.8/site-packages/transformers/models/clip/modeling_clip.py in forward(self, input_ids, attention_mask, position_ids, output_attentions, output_hidden_states, return_dict)
    631         input_ids = input_ids.view(-1, input_shape[-1])
    632
--> 633         hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
    634
    635         bsz, seq_len = input_shape

~/anaconda3/envs/nvt/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    725             result = self._slow_forward(*input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(
    729                 _global_forward_hooks.values(),

~/anaconda3/envs/nvt/lib/python3.8/site-packages/transformers/models/clip/modeling_clip.py in forward(self, input_ids, position_ids, inputs_embeds)
    163
    164         position_embeddings = self.position_embedding(position_ids)
--> 165         embeddings = inputs_embeds + position_embeddings
    166
    167         return embeddings

RuntimeError: The size of tensor a (85) must match the size of tensor b (77) at non-singleton dimension 1
```

Thank you!

nreimers commented 2 years ago

CLIP can only encode text of up to 77 word pieces. A further increase is not possible, as it is not supported by the model.
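
This limit can be checked on the loaded model; a minimal sketch, assuming the wrapped Hugging Face CLIP model exposes its text config as usual:

```
# Sketch: the 77 limit comes from CLIP's learned position embeddings.
clip_module = model._first_module()  # the sentence-transformers CLIPModel wrapper
print(clip_module.model.config.text_config.max_position_embeddings)  # -> 77
```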

Encoding only text with CLIP does not make much sense; there are much better text-only encoders available, like the all-* models.
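
A minimal sketch of that swap, assuming one of the pretrained all-* models such as all-MiniLM-L6-v2 (the model name and sample text here are only examples):

```
from sentence_transformers import SentenceTransformer

# Text-only model; handles longer inputs than CLIP's 77 word pieces.
text_model = SentenceTransformer('all-MiniLM-L6-v2')
print(text_model.max_seq_length)  # 256 by default for this model

embeddings = text_model.encode(
    ["A question that is comfortably longer than 77 word pieces ..."],
    convert_to_numpy=True,
)
print(embeddings.shape)
```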

gaceladri commented 2 years ago

Yes, thanks. I wanted to clarify whether it was a bug or a limitation of the model itself. It makes sense that it only encodes up to 77 word pieces, as it was trained on captions. Thanks for the advice! This was just a speed test with ANN similarity search.

Tortoise17 commented 2 years ago

> CLIP can only encode text of up to 77 word pieces. A further increase is not possible, as it is not supported by the model.
>
> Encoding only text with CLIP does not make much sense; there are much better text-only encoders available, like the all-* models.

I just want to clarify: do 77 word pieces mean 77 characters including spaces, or 77 space-separated words, no matter how many characters each?

nreimers commented 2 years ago

It uses a fixed-size vocabulary of common words and character n-grams. Short, common words are a single word piece, while longer and less common words are broken down into multiple character chunks.
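
To make that concrete, a small sketch using the Hugging Face CLIP tokenizer directly (the exact splits printed will depend on the vocabulary and are only illustrative):

```
from transformers import CLIPTokenizer

tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")

# A short, common word is a single word piece ('</w>' marks the end of a word) ...
print(tokenizer.tokenize("cat"))
# ... while a long or rare word is broken into several character chunks.
print(tokenizer.tokenize("electroencephalography"))

# The 77 limit counts word pieces (plus the start/end tokens), not characters or words.
print(len(tokenizer.encode("How do I learn machine learning?")))
```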

xiaohk commented 2 years ago

To fix this error, I used the CLIP tokenizer to truncate my input sentences:

```
tokenizer = model._first_module().processor.tokenizer

def truncate_sentence(sentence, tokenizer):
    """
    Truncate a sentence to fit the CLIP max token limit (77 tokens including the
    starting and ending tokens).

    Args:
        sentence(string): The sentence to truncate.
        tokenizer(CLIPTokenizer): Pretrained CLIP tokenizer.
    """

    cur_sentence = sentence
    tokens = tokenizer.encode(cur_sentence)

    if len(tokens) > 77:
        # Skip the starting token, only include 75 tokens
        truncated_tokens = tokens[1:76]
        cur_sentence = tokenizer.decode(truncated_tokens)

        # Recursive call here, because encode(decode()) can give a different result
        return truncate_sentence(cur_sentence, tokenizer)

    else:
        return cur_sentence
```
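
A possible way to apply the helper before encoding (the variable names below are only illustrative):

```
# Hypothetical usage: truncate every corpus sentence, then encode with the CLIP model.
truncated_corpus = [truncate_sentence(s, tokenizer) for s in corpus_sentences]
corpus_embeddings = model.encode(
    truncated_corpus, show_progress_bar=True, convert_to_numpy=True)
```
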
MrLoh commented 1 month ago

@xiaohk's solution is not great, as it requires tokenizing every input twice. All other transformer-based models handle truncation automatically. I opened a PR that adds truncation to CLIP models: https://github.com/UKPLab/sentence-transformers/pull/2969
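
For reference, a single-pass sketch of what tokenizer-level truncation looks like, using the underlying Hugging Face tokenizer directly (this is not the PR's implementation, and the input text is illustrative):

```
# Sketch: the HF CLIP tokenizer can truncate in one pass via its standard arguments.
tokenizer = model._first_module().processor.tokenizer
batch = tokenizer(
    ["a very long caption ..."],  # illustrative input
    padding=True,
    truncation=True,
    max_length=77,                # CLIP's position-embedding limit
    return_tensors="pt",
)
print(batch["input_ids"].shape)   # at most (batch_size, 77)
```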