Closed xiaolvtongxue-zt closed 6 months ago
[ERROR:swift]这个不需要管的
你先试试这行代码能不能跑通, web-ui训练我没法帮你查看原因
# pip install ms-swift[llm] -U
# Experimental environment: A10, 3090, V100, ...
# 8GB GPU memory
import os

# Restrict the process to a single GPU; must be set before torch is imported.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import torch
from swift.llm import (
    DatasetName, InferArguments, ModelType, SftArguments,
    infer_main, sft_main, app_ui_main, merge_lora
)

# --- Stage 1: supervised fine-tuning on the JD sentiment dataset ---
train_args = SftArguments(
    model_type=ModelType.qwen1half_0_5b,
    train_dataset_sample=2000,
    dataset=[DatasetName.jd_sentiment_zh],
    output_dir='output')
train_result = sft_main(train_args)
best_model_checkpoint = train_result['best_model_checkpoint']
print(f'best_model_checkpoint: {best_model_checkpoint}')
torch.cuda.empty_cache()

# --- Stage 2: merge the LoRA adapter and run inference on a validation slice ---
infer_args = InferArguments(
    ckpt_dir=best_model_checkpoint,
    load_dataset_config=True,
    val_dataset_sample=10)
merge_lora(infer_args, device_map='cpu')
infer_result = infer_main(infer_args)
torch.cuda.empty_cache()

# --- Stage 3: launch the web UI against the fine-tuned checkpoint ---
app_ui_main(infer_args)
UnicodeDecodeError 这个原因通常是数据集下载了一半中断了,你可以先将数据集缓存删除,再运行就可以了
你先试试这行代码能不能跑通, web-ui训练我没法帮你查看原因
# pip install ms-swift[llm] -U
# Experimental environment: A10, 3090, V100, ...
# 8GB GPU memory
# NOTE: this snippet was collapsed onto a single line by markdown formatting,
# which is not valid Python; reformatted below into runnable form.
import os
# Restrict the process to a single GPU; must be set before torch is imported.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import torch
from swift.llm import (
    DatasetName, InferArguments, ModelType, SftArguments,
    infer_main, sft_main, app_ui_main, merge_lora
)
# Supervised fine-tuning on the JD sentiment dataset.
model_type = ModelType.qwen1half_0_5b
sft_args = SftArguments(
    model_type=model_type,
    train_dataset_sample=2000,
    dataset=[DatasetName.jd_sentiment_zh],
    output_dir='output')
result = sft_main(sft_args)
best_model_checkpoint = result['best_model_checkpoint']
print(f'best_model_checkpoint: {best_model_checkpoint}')
torch.cuda.empty_cache()
# Merge the LoRA adapter, then run inference on a small validation slice.
infer_args = InferArguments(
    ckpt_dir=best_model_checkpoint,
    load_dataset_config=True,
    val_dataset_sample=10)
merge_lora(infer_args, device_map='cpu')
result = infer_main(infer_args)
torch.cuda.empty_cache()
# Launch the web UI against the fine-tuned checkpoint.
app_ui_main(infer_args)
你好,我重新使用python代码进行运行,结果如下:
./aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [108,0,0], thread: [96,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [108,0,0], thread: [97,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [108,0,0], thread: [98,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [108,0,0], thread: [99,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [108,0,0], thread: [100,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [108,0,0], thread: [101,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [108,0,0], thread: [102,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [108,0,0], thread: [103,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [108,0,0], thread: [104,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [108,0,0], thread: [105,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [108,0,0], thread: [106,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [108,0,0], thread: [107,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [108,0,0], thread: [108,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [108,0,0], thread: [109,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [108,0,0], thread: [110,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [108,0,0], thread: [111,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [108,0,0], thread: [112,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [108,0,0], thread: [113,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [108,0,0], thread: [114,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [108,0,0], thread: [115,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [108,0,0], thread: [116,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [108,0,0], thread: [117,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [108,0,0], thread: [118,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [108,0,0], thread: [119,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [108,0,0], thread: [120,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [108,0,0], thread: [121,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [108,0,0], thread: [122,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [108,0,0], thread: [123,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [108,0,0], thread: [124,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [108,0,0], thread: [125,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [108,0,0], thread: [126,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [108,0,0], thread: [127,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
Traceback (most recent call last):
File "/home/centos/xiaolv/太安模型微调/swift_qwen/swift_qwen_1.5.py", line 165, in <module>
output = sft_main(sft_args)
File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/swift/utils/run_utils.py", line 31, in x_main
result = llm_x(args, **kwargs)
File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/swift/llm/sft.py", line 236, in llm_sft
trainer.train(training_args.resume_from_checkpoint)
File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/swift/trainers/trainers.py", line 50, in train
res = super().train(*args, **kwargs)
File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/transformers/trainer.py", line 1624, in train
return inner_training_loop(
File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/transformers/trainer.py", line 1961, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs)
File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/transformers/trainer.py", line 2902, in training_step
loss = self.compute_loss(model, inputs)
File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/swift/trainers/trainers.py", line 216, in compute_loss
outputs = model(**inputs)
File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/accelerate/utils/operations.py", line 822, in forward
return model_forward(*args, **kwargs)
File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/accelerate/utils/operations.py", line 810, in __call__
return convert_to_fp32(self.model_forward(*args, **kwargs))
File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/torch/amp/autocast_mode.py", line 16, in decorate_autocast
return func(*args, **kwargs)
File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/swift/tuners/base.py", line 84, in forward
return self.base_model(*args, **kwargs)
File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/accelerate/hooks.py", line 166, in new_forward
output = module._old_forward(*args, **kwargs)
File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 1173, in forward
outputs = self.model(
File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 1048, in forward
layer_outputs = self._gradient_checkpointing_func(
File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/swift/llm/utils/model.py", line 2629, in <lambda>
_old_checkpoint(*args, use_reentrant=use_reentrant, **kwargs))
File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/torch/_compile.py", line 24, in inner
return torch._dynamo.disable(fn, recursive)(*args, **kwargs)
File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/torch/_dynamo/eval_frame.py", line 328, in _fn
return fn(*args, **kwargs)
File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/torch/_dynamo/external_utils.py", line 17, in inner
return fn(*args, **kwargs)
File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/torch/utils/checkpoint.py", line 458, in checkpoint
ret = function(*args, **kwargs)
File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/accelerate/hooks.py", line 166, in new_forward
output = module._old_forward(*args, **kwargs)
File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 773, in forward
hidden_states, self_attn_weights, present_key_value = self.self_attn(
File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/accelerate/hooks.py", line 166, in new_forward
output = module._old_forward(*args, **kwargs)
File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 676, in forward
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 167, in apply_rotary_pos_emb
q_embed = (q * cos) + (rotate_half(q) * sin)
File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 140, in rotate_half
return torch.cat((-x2, x1), dim=-1)
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
我的 torch 版本是 12.1。
我的运行参数是:
import os
## Limit which GPUs are visible to this process (GPUs 0 and 1).
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
os.environ['WEBUI_SHARE'] = "1"
os.environ['WEBUI_SERVER'] = "192.168.0.74"
os.environ['WEBUI_PORT'] = "7860"
## Cap the CUDA caching allocator's split size ----> 32MB
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"
from swift.llm import DatasetName, ModelType, SftArguments, sft_main
## 1. Path of the base model
MODEL="../models/models/qwen/Qwen1___5-7B-Chat"
## 2. Path of the training data for the model
DATA="../train_data/2024_03_24/train_data__2024_03_24_11_06_36_debug.json"
## Logging / output directory
output_dir_ = "output"
## Fraction of the training set split off as the validation set
dataset_test_ratio_ = 0.05
##
output_qwen="../train_data/2024_03_23"  # NOTE(review): appears unused below -- confirm
model_max_length_=8192
lora_rank_ = 32
lora_alpha_ = lora_rank_*4  # alpha kept at 4x rank
batch_size_ = 16
eval_batch_size_ = 32
num_train_epochs_=20
max_new_tokens_ = 8192  # NOTE(review): equal to max_length -- confirm intended
# LoRA supervised fine-tuning configuration for Qwen1.5-7B-Chat on a custom dataset.
sft_args = SftArguments(
model_type='qwen1half-7b-chat',
model_id_or_path=MODEL,
model_revision='master',
sft_type='lora',
freeze_parameters=0.0,
additional_trainable_parameters=[],
tuner_backend='swift',
template_type='qwen',
output_dir=output_dir_,
add_output_dir_suffix=False,
ddp_backend='nccl',
ddp_find_unused_parameters=None,
ddp_broadcast_buffers=None,
seed=42,
resume_from_checkpoint=None,
dtype='bf16',
dataset=['_custom_dataset'],  # placeholder name; real data comes from custom_train_dataset_path
dataset_seed=42,
dataset_test_ratio=dataset_test_ratio_,
train_dataset_sample=-1,  # -1 -> use the full training set
train_dataset_mix_ratio=None,
train_dataset_mix_ds=['ms-bench'],
val_dataset_sample=-1,
use_loss_scale=False,
system='You are a helpful assistant.',
max_length=model_max_length_,
truncation_strategy='delete',  # samples longer than max_length are dropped, not truncated
check_dataset_strategy='none',
custom_train_dataset_path=[DATA],
custom_val_dataset_path=[],
self_cognition_sample=0,
quantization_bit=0,
bnb_4bit_comp_dtype='bf16',
bnb_4bit_quant_type='nf4',
bnb_4bit_use_double_quant=True,
lora_target_modules=['q_proj', 'k_proj', 'v_proj'],
lora_rank=lora_rank_,
lora_alpha=lora_alpha_,
lora_dropout_p=0.05,
lora_bias_trainable='none',
lora_modules_to_save=[],
lora_dtype='fp32',
use_rslora=False,
lora_layers_to_transform=None,
lora_layers_pattern=None,
lora_rank_pattern={},
lora_alpha_pattern={},
lora_loftq_config={},
use_dora=False,
use_galore=False,
galore_rank=128,
galore_target_modules=None,
galore_update_proj_gap=50,
galore_scale=1.0,
galore_proj_type='std',
galore_optim_per_parameter=False,
galore_with_embedding=False,
adalora_target_r=8,
adalora_init_r=12,
adalora_tinit=0,
adalora_tfinal=0,
adalora_deltaT=1,
adalora_beta1=0.85,
adalora_beta2=0.85,
adalora_orth_reg_weight=0.5,
ia3_target_modules=['DEFAULT'],
ia3_feedforward_modules=[],
ia3_modules_to_save=[],
llamapro_num_new_blocks=4,
llamapro_num_groups=None,
neftune_noise_alpha=None,
neftune_backend='transformers',
gradient_checkpointing=True,
deepspeed=None,
batch_size=batch_size_,
eval_batch_size=eval_batch_size_,
num_train_epochs=num_train_epochs_,
max_steps=-1,
optim='adamw_torch',
adam_beta1=0.9,
adam_beta2=0.999,
learning_rate=0.0001,
weight_decay=0.01,
gradient_accumulation_steps=2,
max_grad_norm=0.5,
predict_with_generate=False,
lr_scheduler_type='linear',
warmup_ratio=0.05,
eval_steps=50,
save_steps=1,  # NOTE(review): saves a checkpoint every step -- very heavy on disk; confirm
save_only_model=False,
save_total_limit=10,
logging_steps=1,
dataloader_num_workers=1,
dataloader_pin_memory=True,
push_to_hub=False,
hub_model_id=None,
hub_token=None,
hub_private_repo=False,
push_hub_strategy='push_best',
test_oom_error=False,
disable_tqdm=False,
lazy_tokenize=False,
preprocess_num_proc=1,
use_flash_attn=None,
ignore_args_error=False,
check_model_is_latest=True,
acc_strategy='token',
save_on_each_node=True,
evaluation_strategy='steps',
save_strategy='steps',
max_new_tokens=max_new_tokens_,
do_sample=True,
temperature=0.3,
top_k=20,
top_p=0.7,
repetition_penalty=1.0,
num_beams=1,
per_device_train_batch_size=None,
per_device_eval_batch_size=None,
only_save_model=None,
neftune_alpha=None,
deepspeed_config_path=None,
model_cache_dir=None)
# Run training and report where the best checkpoint was written.
output = sft_main(sft_args)
best_model_checkpoint = output['best_model_checkpoint']
print(f'best_model_checkpoint: {best_model_checkpoint}')
你好,经过我们内部的不断试错后,似乎已经解决了以上出现的问题。 最终的结论是:系统的内核版本过低导致。 主要的灵感来源于一句警告提示:
dataloader_config DataLoaderConfiguration(dispatch_batches=None,split_batches=False,even_batches=True,use_seedable_sampler=True)
warnings.warn(
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or
higher.
我们最后将服务器的内核升级到6.8后,似乎不会再出现以上的问题。
在微调 qwen/Qwen15-7B-Chat 时,使用本地模型 models/models/qwen/Qwen15-7B-Chat,数据也处理正确,但在开始训练时报错了。怎么回事?
我使用的是 swift_ui 进行的微调。
或者是出现以下信息:
之后,再读这个文件的时候,就会报以下错误:
这是什么操作呢?