chenchun0629 opened 9 months ago
Here is my code:
```python
# builder.py
import os

from .clip_encoder import CLIPVisionTower
from .chinese_clip_encoder import ChineseCLIPVisionTower


def build_vision_tower(vision_tower_cfg, **kwargs):
    vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None))
    is_absolute_path_exists = os.path.exists(vision_tower)
    if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion"):
        return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
    elif 'chinese-clip' in vision_tower:
        return ChineseCLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)

    raise ValueError(f'Unknown vision tower: {vision_tower}')
```
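For context, the dispatch above routes purely on the configured tower name: any name containing `chinese-clip` takes the new branch. A minimal sketch of how it is driven (the model name and config fields below are only examples, not taken from my actual setup):

```python
from types import SimpleNamespace

# Hypothetical config: a name containing 'chinese-clip' routes to
# ChineseCLIPVisionTower; openai/laion prefixes route to CLIPVisionTower.
cfg = SimpleNamespace(
    mm_vision_tower="OFA-Sys/chinese-clip-vit-large-patch14-336px",  # example name
    mm_vision_select_layer=-2,
)
tower = build_vision_tower(cfg)  # -> ChineseCLIPVisionTower (downloads weights)
```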
```python
# chinese_clip_encoder.py
import torch
import torch.nn as nn

from transformers import ChineseCLIPVisionModel, ChineseCLIPImageProcessor, ChineseCLIPVisionConfig


class ChineseCLIPVisionTower(nn.Module):
    def __init__(self, vision_tower, args, delay_load=False):
        super().__init__()

        self.is_loaded = False
        self.vision_tower_name = vision_tower
        self.select_layer = args.mm_vision_select_layer
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')

        if not delay_load:
            self.load_model()
        else:
            self.cfg_only = ChineseCLIPVisionConfig.from_pretrained(self.vision_tower_name)

    def load_model(self):
        self.image_processor = ChineseCLIPImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = ChineseCLIPVisionModel.from_pretrained(self.vision_tower_name)
        self.vision_tower.requires_grad_(False)
        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        image_features = image_forward_outs.hidden_states[self.select_layer]
        if self.select_feature == 'patch':
            image_features = image_features[:, 1:]
        elif self.select_feature == 'cls_patch':
            image_features = image_features
        else:
            raise ValueError(f'Unexpected select feature: {self.select_feature}')
        return image_features

    @torch.no_grad()
    def forward(self, images):
        if type(images) is list:
            image_features = []
            for image in images:
                image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
                image_feature = self.feature_select(image_forward_out).to(image.dtype)
                image_features.append(image_feature)
        else:
            image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
            image_features = self.feature_select(image_forward_outs).to(images.dtype)

        return image_features

    @property
    def dummy_feature(self):
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        if self.is_loaded:
            return self.vision_tower.config
        else:
            return self.cfg_only

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches(self):
        return (self.config.image_size // self.config.patch_size) ** 2
```
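As a side note, `num_patches` ties the input resolution to the image-token count, which matters later in this thread. A quick arithmetic check, assuming a patch size of 14 (as in the large CLIP/Chinese-CLIP towers):

```python
# Image-token counts per resolution for patch size 14:
for image_size in (224, 336):
    patches = (image_size // 14) ** 2
    print(f"{image_size}px -> {patches} patches, {patches + 1} with cls token")
# 224px -> 256 patches, 257 with cls token
# 336px -> 576 patches, 577 with cls token
```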
After debugging, I found that the image size drops from the original 336 to 224 partway through training:
```
LlavaLlamaForCausalLM.images.shape: torch.Size([1, 3, 336, 336])
LlavaMetaForCausalLM0.images.shape: torch.Size([1, 3, 336, 336])
LlavaMetaForCausalLM.images.shape: torch.Size([1, 3, 336, 336])
ChineseCLIPVisionTower.images.shape: torch.Size([1, 3, 336, 336])
LlavaLlamaForCausalLM.images.shape: torch.Size([1, 3, 336, 336])
LlavaMetaForCausalLM0.images.shape: torch.Size([1, 3, 336, 336])
LlavaMetaForCausalLM.images.shape: torch.Size([1, 3, 336, 336])
ChineseCLIPVisionTower.images.shape: torch.Size([1, 3, 336, 336])
{'loss': 2.6496, 'learning_rate': 3.1826973105233184e-06, 'epoch': 0.0}
0%| | 4/202394
LlavaLlamaForCausalLM.images.shape: torch.Size([1, 3, 224, 224])
LlavaMetaForCausalLM0.images.shape: torch.Size([1, 3, 224, 224])
LlavaMetaForCausalLM.images.shape: torch.Size([1, 3, 224, 224])
ChineseCLIPVisionTower.images.shape: torch.Size([1, 3, 224, 224])
LlavaLlamaForCausalLM.images.shape: torch.Size([1, 3, 224, 224])
LlavaMetaForCausalLM0.images.shape: torch.Size([1, 3, 224, 224])
LlavaMetaForCausalLM.images.shape: torch.Size([1, 3, 224, 224])
ChineseCLIPVisionTower.images.shape: torch.Size([1, 3, 224, 224])
```
```
Traceback (most recent call last):
  File "/data/jupyter/user/cc/LLaVA-cc/llava/train/train_mem.py", line 17, in <module>
    train()
  File "/data/jupyter/user/cc/LLaVA/llava/train/train.py", line 965, in train
    trainer.train()
  File "/data/miniconda3/envs/llava/lib/python3.10/site-packages/transformers/trainer.py", line 1539, in train
    ...
...
```
```
# 336*336 in LazySupervisedDataset and DataCollatorForSupervisedDataset
LazySupervisedDataset.images: torch.Size([3, 336, 336])
DataCollatorForSupervisedDataset.images: torch.Size([3, 336, 336])

# 224*224 in transformers.trainer
trainer0.images: torch.Size([1, 3, 224, 224])
trainer1.images: torch.Size([1, 3, 224, 224])
```
```python
# transformers.trainer
for step, inputs in enumerate(epoch_iterator):
    print("trainer0.images:", inputs["images"].shape, flush=True)
    total_batched_samples += 1

    if rng_to_sync:
        self._load_rng_state(resume_from_checkpoint)
        rng_to_sync = False

    # Skip past any already trained steps if resuming training
    if steps_trained_in_current_epoch > 0:
        steps_trained_in_current_epoch -= 1
        if steps_trained_progress_bar is not None:
            steps_trained_progress_bar.update(1)
        if steps_trained_in_current_epoch == 0:
            self._load_rng_state(resume_from_checkpoint)
        continue
    elif steps_trained_progress_bar is not None:
        steps_trained_progress_bar.close()
        steps_trained_progress_bar = None

    if step % args.gradient_accumulation_steps == 0:
        self.control = self.callback_handler.on_step_begin(args, self.state, self.control)

    with self.accelerator.accumulate(model):
        print("trainer1.images:", inputs["images"].shape, flush=True)
        tr_loss_step = self.training_step(model, inputs)
```
Fine-tuning with the blip_laion_cc_sbu_558k.json dataset works. When I use the llava_v1_5_mix665k.json dataset, I encounter this issue.
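One plausible explanation (an assumption on my part, not verified against this exact checkpoint): llava_v1_5_mix665k.json contains text-only conversations, while blip_laion_cc_sbu_558k.json does not. In LLaVA's LazySupervisedDataset, a text-only sample gets a dummy zero image whose size comes from `image_processor.crop_size`. If ChineseCLIPImageProcessor resizes real images to 336×336 but still reports its default `crop_size` of 224, then image samples arrive as 336×336 (577 tokens) while text-only samples arrive as 224×224 (257 tokens), matching the mixed shapes above. (With `num_workers > 0`, the dataset/collator prints and the trainer prints need not refer to the same batch, so the 224×224 tensors can still originate in the dataset path even though the printed dataset shapes show 336.) A minimal sketch of a workaround in `load_model`, under that assumption:

```python
def load_model(self):
    self.image_processor = ChineseCLIPImageProcessor.from_pretrained(self.vision_tower_name)
    # Workaround sketch (assumption): force crop_size to match the real input
    # resolution so LLaVA's dummy images for text-only samples, built as
    # torch.zeros(3, crop_size['height'], crop_size['width']), agree with the
    # preprocessed image size. 336 assumes a 336px checkpoint.
    self.image_processor.crop_size = {'height': 336, 'width': 336}
    self.vision_tower = ChineseCLIPVisionModel.from_pretrained(self.vision_tower_name)
    self.vision_tower.requires_grad_(False)
    self.is_loaded = True
```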
Your dimension problem looks like the feature dimensions from the 224 and 336 CLIP towers not matching up. Aren't 257 and 577 exactly the image-token counts for 224 and 336 CLIP with patch size 14? The +1 is the cls token.
Did you ever solve this problem?
Question
Motivation:
I am trying to replace CLIP with Chinese-CLIP.
Environment
Step 1: Pretrain
The pretraining step runs successfully.
Step 2: Visual Instruction Tuning
I hit the problem described above at this step.
Do you have any suggestions on how I can solve it?
Thanks!