Open hamza13-12 opened 8 months ago
If I reduce the context length to anything under 77, I get an error:
def generate_clip_embeddings(captions, clip_model, batch_size=32):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    clip_model.to(device)
    embeddings = []

    for batch_start in range(0, len(captions), batch_size):
        batch_captions = captions[batch_start:batch_start + batch_size]
        batch_inputs = clip.tokenize(batch_captions, context_length=56, truncate=True).to(device)
        with torch.no_grad():
            batch_outputs = clip_model.encode_text(batch_inputs)
        embeddings.append(batch_outputs.cpu().numpy())

    return np.vstack(embeddings)
RuntimeError                              Traceback (most recent call last)
Cell In[37], line 1
----> 1 clip_embeddings_train = generate_clip_embeddings(train_captions, clip_model, batch_size=128)
      2 clip_embeddings_val = generate_clip_embeddings(val_captions, clip_model, batch_size=128)

Cell In[36], line 10, in generate_clip_embeddings(captions, clip_model, batch_size)
      8     batch_inputs = clip.tokenize(batch_captions, context_length=56, truncate=True).to(device)
      9     with torch.no_grad():
---> 10         batch_outputs = clip_model.encode_text(batch_inputs)
     11     embeddings.append(batch_outputs.cpu().numpy())
     13     return np.vstack(embeddings)

File /opt/conda/lib/python3.10/site-packages/clip/model.py:346, in CLIP.encode_text(self, text)
    343 def encode_text(self, text):
    344     x = self.token_embedding(text).type(self.dtype)  # [batch_size, n_ctx, d_model]
--> 346     x = x + self.positional_embedding.type(self.dtype)
    347     x = x.permute(1, 0, 2)  # NLD -> LND
    348     x = self.transformer(x)

RuntimeError: The size of tensor a (56) must match the size of tensor b (77) at non-singleton dimension 1
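For reference, the mismatch comes from CLIP's text encoder: its positional_embedding has a fixed length of 77, so the tokenized batch has to be [batch, 77] to be added to it. A minimal sketch that avoids the error (assuming the openai/clip package and a ViT-B/32 checkpoint, which are just examples here) is to keep the default context_length of 77 and let truncate=True cut captions that are too long:

import clip
import torch

# Sketch only; ViT-B/32 is an assumption, any CLIP checkpoint behaves the same way.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
clip_model, _ = clip.load("ViT-B/32", device=device)

captions = ["a dog playing in the park", "a red car parked on the street"]

# Default context_length=77 matches the positional embedding; truncate=True shortens long captions.
tokens = clip.tokenize(captions, truncate=True).to(device)   # shape [batch, 77]

with torch.no_grad():
    text_features = clip_model.encode_text(tokens)           # shape [batch, 512] for ViT-B/32

print(text_features.shape)

With the full 77-token context the addition broadcasts cleanly. If a genuinely shorter context is required, the model's positional_embedding (and its context_length attribute) would presumably have to be sliced to match, which I have not tried.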
Any ideas on how to resolve this?