modelscope / ms-swift

Use PEFT or full-parameter training to fine-tune 400+ LLMs and 100+ MLLMs. (LLM: Qwen2.5, Llama3.2, GLM4, Internlm2.5, Yi1.5, Mistral, Baichuan2, DeepSeek, Gemma2, ...; MLLM: Qwen2-VL, Qwen2-Audio, Llama3.2-Vision, Llava, InternVL2, MiniCPM-V-2.6, GLM4v, Xcomposer2.5, Yi-VL, DeepSeek-VL, Phi3.5-Vision, ...)
https://swift.readthedocs.io/zh-cn/latest/Instruction/index.html
Apache License 2.0

Training ovis1_6-gemma2-9b on a multimodal dataset fails with: RuntimeError: self and mat2 must have the same dtype, but got BFloat16 and Char #2514

Open c-x-l-w opened 4 days ago

c-x-l-w commented 4 days ago

Loading checkpoint shards: 100%|██████████| 5/5 [00:24<00:00, 4.90s/it]
[INFO:swift] model.max_model_len: 8192
[INFO:swift] model.hf_device_map: {'': device(type='cuda', index=0)}
[INFO:swift] model_config: OvisConfig {
  "_attn_implementation_autoset": true, "_name_or_path": "/home/tom/fssd/WWW2025/Ovis1.6-Gemma2-9B", "architectures": [ "Ovis" ], "auto_map": { "AutoConfig": "configuration_ovis.OvisConfig", "AutoModelForCausalLM": "modeling_ovis.Ovis" }, "conversation_formatter_class": "GemmaConversationFormatter", "disable_tie_weight": false, "hidden_size": 3584, "keys_to_ignore_at_inference": [ "past_key_values" ], "llm_attn_implementation": "eager",
  "llm_config": { "_attn_implementation_autoset": false, "_name_or_path": "google/gemma-2-9b-it", "add_cross_attention": false, "architectures": [ "Gemma2ForCausalLM" ], "attention_bias": false, "attention_dropout": 0.0, "attn_logit_softcapping": 50.0, "bad_words_ids": null, "begin_suppress_tokens": null, "bos_token_id": 2, "cache_implementation": "hybrid", "chunk_size_feed_forward": 0, "cross_attention_hidden_size": null, "decoder_start_token_id": null, "diversity_penalty": 0.0, "do_sample": false, "early_stopping": false, "encoder_no_repeat_ngram_size": 0, "eos_token_id": 1, "exponential_decay_length_penalty": null, "final_logit_softcapping": 30.0, "finetuning_task": null, "forced_bos_token_id": null, "forced_eos_token_id": null, "head_dim": 256, "hidden_act": "gelu_pytorch_tanh", "hidden_activation": "gelu_pytorch_tanh", "hidden_size": 3584, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "initializer_range": 0.02, "intermediate_size": 14336, "is_decoder": false, "is_encoder_decoder": false, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "length_penalty": 1.0, "max_length": 20, "max_position_embeddings": 8192, "min_length": 0, "model_type": "gemma2", "no_repeat_ngram_size": 0, "num_attention_heads": 16, "num_beam_groups": 1, "num_beams": 1, "num_hidden_layers": 42, "num_key_value_heads": 8, "num_return_sequences": 1, "output_attentions": false, "output_hidden_states": false, "output_scores": false, "pad_token_id": 0, "prefix": null, "problem_type": null, "pruned_heads": {}, "query_pre_attn_scalar": 256, "remove_invalid_values": false, "repetition_penalty": 1.0, "return_dict": true, "return_dict_in_generate": false, "rms_norm_eps": 1e-06, "rope_theta": 10000.0, "sep_token_id": null, "sliding_window": 4096, "sliding_window_size": 4096, "suppress_tokens": null, "task_specific_params": null, "temperature": 1.0, "tf_legacy_loss": false, "tie_encoder_decoder": false, "tie_word_embeddings": true, "tokenizer_class": null, "top_k": 50, "top_p": 1.0, "torch_dtype": "bfloat16", "torchscript": false, "typical_p": 1.0, "use_bfloat16": false, "use_cache": true, "vocab_size": 256000 },
  "model_type": "ovis", "multimodal_max_length": 8192,
  "quantization_config": { "_load_in_4bit": false, "_load_in_8bit": true, "bnb_4bit_compute_dtype": "bfloat16", "bnb_4bit_quant_storage": "uint8", "bnb_4bit_quant_type": "nf4", "bnb_4bit_use_double_quant": true, "llm_int8_enable_fp32_cpu_offload": false, "llm_int8_has_fp16_weight": false, "llm_int8_skip_modules": null, "llm_int8_threshold": 6.0, "load_in_4bit": false, "load_in_8bit": true, "quant_method": "bitsandbytes" },
  "torch_dtype": "bfloat16", "transformers_version": "4.46.1", "use_cache": true,
  "visual_tokenizer_config": {
    "_attn_implementation_autoset": false, "_name_or_path": "", "add_cross_attention": false, "architectures": null,
    "backbone_config": { "_attn_implementation_autoset": false, "_name_or_path": "google/siglip-so400m-patch14-384", "add_cross_attention": false, "architectures": null, "attention_dropout": 0.0, "bad_words_ids": null, "begin_suppress_tokens": null, "bos_token_id": null, "chunk_size_feed_forward": 0, "cross_attention_hidden_size": null, "decoder_start_token_id": null, "diversity_penalty": 0.0, "do_sample": false, "early_stopping": false, "encoder_no_repeat_ngram_size": 0, "eos_token_id": null, "exponential_decay_length_penalty": null, "finetuning_task": null, "forced_bos_token_id": null, "forced_eos_token_id": null, "hidden_act": "gelu_pytorch_tanh", "hidden_size": 1152, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "image_size": 384, "intermediate_size": 4304, "is_decoder": false, "is_encoder_decoder": false, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "layer_norm_eps": 1e-06, "length_penalty": 1.0, "max_length": 20, "min_length": 0, "model_type": "siglip_vision_model", "no_repeat_ngram_size": 0, "num_attention_heads": 16, "num_beam_groups": 1, "num_beams": 1, "num_channels": 3, "num_hidden_layers": 27, "num_return_sequences": 1, "output_attentions": false, "output_hidden_states": false, "output_scores": false, "pad_token_id": null, "patch_size": 14, "prefix": null, "problem_type": null, "pruned_heads": {}, "remove_invalid_values": false, "repetition_penalty": 1.0, "return_dict": true, "return_dict_in_generate": false, "sep_token_id": null, "suppress_tokens": null, "task_specific_params": null, "temperature": 1.0, "tf_legacy_loss": false, "tie_encoder_decoder": false, "tie_word_embeddings": true, "tokenizer_class": null, "top_k": 50, "top_p": 1.0, "torch_dtype": null, "torchscript": false, "typical_p": 1.0, "use_bfloat16": false },
    "backbone_kwargs": {}, "bad_words_ids": null, "begin_suppress_tokens": null, "bos_token_id": null, "chunk_size_feed_forward": 0, "cross_attention_hidden_size": null, "decoder_start_token_id": null, "depths": null, "diversity_penalty": 0.0, "do_sample": false, "drop_cls_token": false, "early_stopping": false, "encoder_no_repeat_ngram_size": 0, "eos_token_id": null, "exponential_decay_length_penalty": null, "finetuning_task": null, "forced_bos_token_id": null, "forced_eos_token_id": null, "hidden_stride": 2, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "is_decoder": false, "is_encoder_decoder": false, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "length_penalty": 1.0, "max_length": 20, "min_length": 0, "model_type": "siglip_visual_tokenizer", "no_repeat_ngram_size": 0, "num_beam_groups": 1, "num_beams": 1, "num_return_sequences": 1, "output_attentions": false, "output_hidden_states": false, "output_scores": false, "pad_token_id": null, "prefix": null, "problem_type": null, "pruned_heads": {}, "remove_invalid_values": false, "repetition_penalty": 1.0, "return_dict": true, "return_dict_in_generate": false, "sep_token_id": null, "suppress_tokens": null, "task_specific_params": null, "tau": 1.0, "temperature": 1.0, "tf_legacy_loss": false, "tie_encoder_decoder": false, "tie_word_embeddings": true, "tokenize_function": "softmax", "tokenizer_class": null, "top_k": 50, "top_p": 1.0, "torch_dtype": null, "torchscript": false, "typical_p": 1.0, "use_bfloat16": false, "vocab_size": 65536
  }
}

[INFO:swift] model.generation_config: GenerationConfig { "bos_token_id": 2, "eos_token_id": 1, "max_new_tokens": 2048, "pad_token_id": 0 }

[INFO:swift] Setting model.config.use_cache: False
[INFO:swift] target_modules: ^(llm)(?!.*(lm_head|output|emb|wte|shared)).*
[INFO:swift] modules_to_save: []
[INFO:swift] lora_config: get_wrapped_class.<locals>.PeftWrapper(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='/home/tom/fssd/WWW2025/Ovis1.6-Gemma2-9B', revision=None, task_type='CAUSAL_LM', inference_mode=False, r=8, target_modules='^(llm)(?!.*(lm_head|output|emb|wte|shared)).*', lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=[], init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_dtype=None, lorap_lr_ratio=None, lorap_emb_lr=1e-06)
[INFO:swift] [base_model.model.llm.model.embed_tokens.weight]: requires_grad=False, dtype=torch.bfloat16, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.q_proj.base_layer.weight]: requires_grad=False, dtype=torch.int8, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.q_proj.lora_A.default.weight]: requires_grad=True, dtype=torch.float32, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.q_proj.lora_B.default.weight]: requires_grad=True, dtype=torch.float32, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.k_proj.base_layer.weight]: requires_grad=False, dtype=torch.int8, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.k_proj.lora_A.default.weight]: requires_grad=True, dtype=torch.float32, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.k_proj.lora_B.default.weight]: requires_grad=True, dtype=torch.float32, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.v_proj.base_layer.weight]: requires_grad=False, dtype=torch.int8, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.v_proj.lora_A.default.weight]: requires_grad=True, dtype=torch.float32, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.v_proj.lora_B.default.weight]: requires_grad=True, dtype=torch.float32, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.o_proj.base_layer.weight]: requires_grad=False, dtype=torch.int8, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.o_proj.lora_A.default.weight]: requires_grad=True, dtype=torch.float32, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.o_proj.lora_B.default.weight]: requires_grad=True, dtype=torch.float32, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.mlp.gate_proj.base_layer.weight]: requires_grad=False, dtype=torch.int8, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.mlp.gate_proj.lora_A.default.weight]: requires_grad=True, dtype=torch.float32, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.mlp.gate_proj.lora_B.default.weight]: requires_grad=True, dtype=torch.float32, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.mlp.up_proj.base_layer.weight]: requires_grad=False, dtype=torch.int8, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.mlp.up_proj.lora_A.default.weight]: requires_grad=True, dtype=torch.float32, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.mlp.up_proj.lora_B.default.weight]:
requires_grad=True, dtype=torch.float32, device=cuda:0 [INFO:swift] [base_model.model.llm.model.layers.0.mlp.down_proj.base_layer.weight]: requires_grad=False, dtype=torch.int8, device=cuda:0 [INFO:swift] ... [INFO:swift] PeftModelForCausalLM( (base_model): LoraModel( (model): Ovis( (llm): Gemma2ForCausalLM( (model): Gemma2Model( (embed_tokens): Embedding(256000, 3584, padding_idx=0) (layers): ModuleList( (0-41): 42 x Gemma2DecoderLayer( (self_attn): Gemma2Attention( (q_proj): lora.Linear8bitLt( (base_layer): Linear8bitLt(in_features=3584, out_features=4096, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=3584, out_features=8, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=8, out_features=4096, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() (lora_magnitude_vector): ModuleDict() ) (k_proj): lora.Linear8bitLt( (base_layer): Linear8bitLt(in_features=3584, out_features=2048, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=3584, out_features=8, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=8, out_features=2048, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() (lora_magnitude_vector): ModuleDict() ) (v_proj): lora.Linear8bitLt( (base_layer): Linear8bitLt(in_features=3584, out_features=2048, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=3584, out_features=8, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=8, out_features=2048, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() (lora_magnitude_vector): ModuleDict() ) (o_proj): lora.Linear8bitLt( (base_layer): Linear8bitLt(in_features=4096, out_features=3584, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=4096, out_features=8, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=8, out_features=3584, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() (lora_magnitude_vector): ModuleDict() ) (rotary_emb): Gemma2RotaryEmbedding() ) (mlp): Gemma2MLP( (gate_proj): lora.Linear8bitLt( (base_layer): Linear8bitLt(in_features=3584, out_features=14336, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=3584, out_features=8, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=8, out_features=14336, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() (lora_magnitude_vector): ModuleDict() ) (up_proj): lora.Linear8bitLt( (base_layer): Linear8bitLt(in_features=3584, out_features=14336, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=3584, out_features=8, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=8, out_features=14336, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() (lora_magnitude_vector): ModuleDict() ) (down_proj): lora.Linear8bitLt( (base_layer): Linear8bitLt(in_features=14336, out_features=3584, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): 
Linear(in_features=14336, out_features=8, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=8, out_features=3584, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() (lora_magnitude_vector): ModuleDict() ) (act_fn): PytorchGELUTanh() ) (input_layernorm): Gemma2RMSNorm((3584,), eps=1e-06) (pre_feedforward_layernorm): Gemma2RMSNorm((3584,), eps=1e-06) (post_feedforward_layernorm): Gemma2RMSNorm((3584,), eps=1e-06) (post_attention_layernorm): Gemma2RMSNorm((3584,), eps=1e-06) ) ) (norm): Gemma2RMSNorm((3584,), eps=1e-06) ) (lm_head): Linear(in_features=3584, out_features=256000, bias=False) ) (visual_tokenizer): SiglipVisualTokenizer( (backbone): SiglipVisionModel( (vision_model): SiglipVisionTransformer( (embeddings): SiglipVisionEmbeddings( (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid) (position_embedding): Embedding(729, 1152) ) (encoder): SiglipEncoder( (layers): ModuleList( (0-26): 27 x SiglipEncoderLayer( (self_attn): SiglipSdpaAttention( (k_proj): Linear8bitLt(in_features=1152, out_features=1152, bias=True) (v_proj): Linear8bitLt(in_features=1152, out_features=1152, bias=True) (q_proj): Linear8bitLt(in_features=1152, out_features=1152, bias=True) (out_proj): Linear8bitLt(in_features=1152, out_features=1152, bias=True) ) (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True) (mlp): SiglipMLP( (activation_fn): PytorchGELUTanh() (fc1): Linear8bitLt(in_features=1152, out_features=4304, bias=True) (fc2): Linear8bitLt(in_features=4304, out_features=1152, bias=True) ) (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True) ) ) ) (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True) (head): SiglipMultiheadAttentionPoolingHead( (attention): MultiheadAttention( (out_proj): Linear8bitLt(in_features=1152, out_features=1152, bias=True) ) (layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True) (mlp): SiglipMLP( (activation_fn): PytorchGELUTanh() (fc1): Linear8bitLt(in_features=1152, out_features=4304, bias=True) (fc2): Linear8bitLt(in_features=4304, out_features=1152, bias=True) ) ) ) ) (head): Sequential( (0): Linear8bitLt(in_features=4608, out_features=65531, bias=False) (1): LayerNorm((65531,), eps=1e-05, elementwise_affine=True) ) ) (vte): VisualEmbedding(65536, 3584) ) ) ) [INFO:swift] PeftModelForCausalLM: 10233.9195M Params (27.0090M Trainable [0.2639%]), 0.0061M Buffers. [INFO:swift] system: None [INFO:swift] args.lazy_tokenize: True
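Worth noting in the module tree above: the SigLIP pooling head (SiglipMultiheadAttentionPoolingHead) wraps a stock nn.MultiheadAttention whose out_proj has been replaced by a bitsandbytes Linear8bitLt. nn.MultiheadAttention never calls out_proj.forward(); F.multi_head_attention_forward passes out_proj.weight straight into a plain linear(), so the int8 weight meets bf16 activations without bitsandbytes' dequantizing matmul in between. A minimal sketch of that failure mode (no bitsandbytes or GPU required; the int8 Parameter here merely stands in for what Linear8bitLt stores):

```python
import torch
import torch.nn as nn

# Mirror the pooling-head attention from the printout above.
mha = nn.MultiheadAttention(embed_dim=1152, num_heads=16, batch_first=True).to(torch.bfloat16)

# Stand-in for 8-bit quantization: store the output-projection weight as int8,
# the way bitsandbytes' Linear8bitLt keeps it.
mha.out_proj.weight = nn.Parameter(
    mha.out_proj.weight.detach().to(torch.int8), requires_grad=False
)

probe = torch.randn(1, 1, 1152, dtype=torch.bfloat16)
hidden_state = torch.randn(1, 729, 1152, dtype=torch.bfloat16)

# Training mode keeps us on the slow path through F.multi_head_attention_forward,
# which ends in linear(attn_output, out_proj.weight, out_proj.bias) on the raw
# int8 tensor. Expected (wording may vary slightly by device/PyTorch version):
# RuntimeError: self and mat2 must have the same dtype, but got BFloat16 and Char
out, _ = mha(probe, hidden_state, hidden_state)
```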

Generating train split: 1000 examples [00:00, 53817.98 examples/s]
[INFO:swift] train_dataset: Dataset({ features: ['query', 'response', 'images'], num_rows: 990 })
[INFO:swift] val_dataset: Dataset({ features: ['query', 'response', 'images'], num_rows: 10 })
[INFO:swift] Setting max_partition: 9. You can adjust this hyperparameter through the environment variable: MAX_PARTITION.
[INFO:swift] [LABELS_IDS] [-100 * 214, 23515, 30582, 107]
[INFO:swift] [LABELS] [-100 * 214]活动页面
[INFO:swift] training_args: Seq2SeqTrainingArguments(_n_gpu=1, acc_strategy=token, accelerator_config={'split_batches': False, 'dispatch_batches': False, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, adafactor=False, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, additional_saved_files=[], auto_find_batch_size=False, batch_eval_metrics=False, bf16=True, bf16_full_eval=False, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=1, dataloader_persistent_workers=False, dataloader_pin_memory=True, dataloader_prefetch_factor=None, ddp_backend=None, ddp_broadcast_buffers=None, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=None, disable_tqdm=False, dispatch_batches=None, do_eval=True, do_predict=False, do_train=False, eval_accumulation_steps=None, eval_delay=0, eval_do_concat_batches=True, eval_on_start=False, eval_steps=1000, eval_strategy=IntervalStrategy.STEPS, eval_use_gather_object=False, evaluation_strategy=None, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, generation_config=GenerationConfig { "bos_token_id": 2, "eos_token_id": 1, "max_new_tokens": 2048, "pad_token_id": 0 }, generation_max_length=None, generation_num_beams=None, gradient_accumulation_steps=1, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, greater_is_better=False, group_by_length=False, half_precision_backend=auto, hub_always_push=False, hub_model_id=None, hub_private_repo=False, hub_strategy=HubStrategy.EVERY_SAVE, hub_token=, ignore_data_skip=False, include_for_metrics=[], include_inputs_for_metrics=False, include_num_input_tokens_seen=False, include_tokens_per_second=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=0.0001, length_column_name=length, load_best_model_at_end=False, local_rank=0, log_level=passive, log_level_replica=warning, log_on_each_node=True, logging_dir=/home/tom/fssd/WWW2025/output/ovis1_6-gemma2-9b/v14-20241126-214907/ovis1_6-gemma2-9b/v0-20241126-215001/runs, logging_first_step=True, logging_nan_inf_filter=True, logging_steps=1000, logging_strategy=IntervalStrategy.STEPS, loss_name=None, lr_scheduler_kwargs={}, lr_scheduler_type=SchedulerType.COSINE, max_grad_norm=1, max_steps=-1, metric_for_best_model=loss, metric_warmup_step=0, mp_parameters=, neftune_noise_alpha=None, no_cuda=False, num_train_epochs=2, optim=OptimizerNames.ADAMW_TORCH, optim_args=None, optim_target_modules=None, output_dir=/home/tom/fssd/WWW2025/output/ovis1_6-gemma2-9b/v14-20241126-214907/ovis1_6-gemma2-9b/v0-20241126-215001, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=1, per_device_train_batch_size=1, predict_with_generate=False, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, ray_scope=last, remove_unused_columns=False, report_to=['tensorboard'], restore_callback_states_from_checkpoint=False, resume_from_checkpoint=None, run_name=/home/tom/fssd/WWW2025/output/ovis1_6-gemma2-9b/v14-20241126-214907/ovis1_6-gemma2-9b/v0-20241126-215001, save_on_each_node=False, save_only_model=False, save_safetensors=True, save_steps=1000, save_strategy=IntervalStrategy.STEPS, save_total_limit=2, seed=42, skip_memory_metrics=True, sortish_sampler=False, split_batches=None, tf32=None, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torch_empty_cache_steps=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, train_dataset_sample=-1, train_sampler_random=True, use_cpu=False, use_ipex=False, use_legacy_prediction_loop=False, use_liger_kernel=False, use_mps_device=False, warmup_ratio=0.05, warmup_steps=0, weight_decay=0.1,
)
[ERROR:swift] There are error run git command.
/home/tom/fssd/WWW2025/swift/swift/trainers/mixin.py:93: FutureWarning: tokenizer is deprecated and will be removed in version 5.0.0 for Seq2SeqTrainer.__init__. Use processing_class instead.
  super().__init__(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
[2024-11-26 21:50:40,589] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[INFO:swift] The SftArguments will be saved in: /home/tom/fssd/WWW2025/output/ovis1_6-gemma2-9b/v14-20241126-214907/ovis1_6-gemma2-9b/v0-20241126-215001/sft_args.json
[INFO:swift] The Seq2SeqTrainingArguments will be saved in: /home/tom/fssd/WWW2025/output/ovis1_6-gemma2-9b/v14-20241126-214907/ovis1_6-gemma2-9b/v0-20241126-215001/training_args.json
[INFO:swift] The logging file will be saved in: /home/tom/fssd/WWW2025/output/ovis1_6-gemma2-9b/v14-20241126-214907/ovis1_6-gemma2-9b/v0-20241126-215001/logging.jsonl
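For context (and relevant to the question at the end of the thread): a run producing logs like these goes through swift's sft entry point, visible in the traceback below as sft_main(). A hedged sketch of such a launch via the Python API, with values inferred from the logs above (model_type from the output path, 8-bit from the quantization_config, epochs and learning rate from the training args); the dataset path is a placeholder, not the reporter's actual file, and exact SftArguments field names may differ across swift versions:

```python
from swift.llm import SftArguments, sft_main

# Sketch only: argument values are inferred from the logs, not confirmed.
args = SftArguments(
    model_type='ovis1_6-gemma2-9b',
    model_id_or_path='/home/tom/fssd/WWW2025/Ovis1.6-Gemma2-9B',
    sft_type='lora',
    quantization_bit=8,                      # matches load_in_8bit=true in the dump
    dataset=['path/to/multimodal.jsonl'],    # placeholder
    num_train_epochs=2,
    learning_rate=1e-4,
)
sft_main(args)
```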

Train: 0%| | 0/1980 [00:00<?, ?it/s]
/opt/conda/envs/www2025/lib/python3.12/site-packages/bitsandbytes/autograd/_functions.py:316: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization
  warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")
Traceback (most recent call last):
  File "/home/tom/fssd/WWW2025/swift/swift/cli/sft.py", line 5, in <module>
    sft_main()
  File "/home/tom/fssd/WWW2025/swift/swift/utils/run_utils.py", line 32, in x_main
    result = llm_x(args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^
  File "/home/tom/fssd/WWW2025/swift/swift/llm/sft.py", line 546, in llm_sft
    return trainer_train(args, model, template, train_dataset, val_dataset, callbacks=callbacks, msg=msg)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tom/fssd/WWW2025/swift/swift/llm/sft.py", line 496, in trainer_train
    trainer.train(training_args.resume_from_checkpoint)
  File "/home/tom/fssd/WWW2025/swift/swift/trainers/mixin.py", line 493, in train
    res = super().train(resume_from_checkpoint, *args, **kwargs)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/www2025/lib/python3.12/site-packages/transformers/trainer.py", line 2122, in train
    return inner_training_loop(
           ^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/www2025/lib/python3.12/site-packages/transformers/trainer.py", line 2474, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/www2025/lib/python3.12/site-packages/transformers/trainer.py", line 3572, in training_step
    loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tom/fssd/WWW2025/swift/swift/trainers/trainers.py", line 161, in compute_loss
    outputs = model(**inputs)
              ^^^^^^^^^^^^^^^
  File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1844, in _call_impl
    return inner()
           ^^^^^^^
  File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1769, in inner
    args_kwargs_result = hook(self, args, kwargs)  # type: ignore[misc]
                         ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tom/fssd/WWW2025/swift/swift/llm/utils/template.py", line 350, in _pre_forward_hook
    res_extra.append(self._post_encode(module, d))
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tom/fssd/WWW2025/swift/swift/llm/utils/template.py", line 1355, in _post_encode
    _, inputs_embeds, labels, _ = self.model.merge_multimodal(
                                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tom/.cache/huggingface/modules/transformers_modules/Ovis1.6-Gemma2-9B/modeling_ovis.py", line 376, in merge_multimodal
    visual_tokens = self.visual_tokenizer(torch.cat([x for x in pixel_values], dim=0))
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/www2025/lib/python3.12/site-packages/accelerate/hooks.py", line 170, in new_forward
    output = module._old_forward(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tom/.cache/huggingface/modules/transformers_modules/Ovis1.6-Gemma2-9B/modeling_ovis.py", line 223, in forward
    features = self.encode(pixel_values)
               ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tom/.cache/huggingface/modules/transformers_modules/Ovis1.6-Gemma2-9B/modeling_ovis.py", line 198, in encode
    output = self.backbone(pixel_values, output_hidden_states=True, return_dict=True)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/www2025/lib/python3.12/site-packages/accelerate/hooks.py", line 170, in new_forward
    output = module._old_forward(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/www2025/lib/python3.12/site-packages/transformers/models/siglip/modeling_siglip.py", line 1190, in forward
    return self.vision_model(
           ^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/www2025/lib/python3.12/site-packages/accelerate/hooks.py", line 170, in new_forward
    output = module._old_forward(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/www2025/lib/python3.12/site-packages/transformers/models/siglip/modeling_siglip.py", line 1101, in forward
    pooler_output = self.head(last_hidden_state) if self.use_head else None
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/www2025/lib/python3.12/site-packages/accelerate/hooks.py", line 170, in new_forward
    output = module._old_forward(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/www2025/lib/python3.12/site-packages/transformers/models/siglip/modeling_siglip.py", line 1128, in forward
    hidden_state = self.attention(probe, hidden_state, hidden_state)[0]
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/www2025/lib/python3.12/site-packages/accelerate/hooks.py", line 170, in new_forward
    output = module._old_forward(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/activation.py", line 1368, in forward
    attn_output, attn_output_weights = F.multi_head_attention_forward(
                                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/functional.py", line 6251, in multi_head_attention_forward
    attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: self and mat2 must have the same dtype, but got BFloat16 and Char

Train: 0%| | 0/1980 [00:03<?, ?it/s]
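Since the crash comes from 8-bit-quantizing the visual tokenizer, one plausible workaround (an assumption, not a confirmed fix from the maintainers) is to exclude the vision modules from bitsandbytes quantization via llm_int8_skip_modules. The module names below ("visual_tokenizer", "vte") are taken from the model printout above; whether swift provides a way to pass this config through is also an assumption:

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Hypothetical workaround: quantize only the Gemma2 LLM and keep the SigLIP
# visual tokenizer (and visual embedding table) in bf16.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_skip_modules=["visual_tokenizer", "vte"],  # names from the printout; assumption
)
model = AutoModelForCausalLM.from_pretrained(
    "/home/tom/fssd/WWW2025/Ovis1.6-Gemma2-9B",
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
```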

himasai9712 commented 2 days ago

How did you start the training?