The LoRA fine-tuning script is as follows, with the maximum token length set to 8192:

```bash
export NCCL_P2P_DISABLE=1
export NCCL_IB_DISABLE=1
export NCCL_DEBUG=info
export NCCL_SOCKET_IFNAME=eth0
nproc_per_node=6

PYTHONPATH=../../.. \
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 \
torchrun \
    --nproc_per_node=$nproc_per_node \
    --master_port 29500 \
    /swift/examples/pytorch/llm/llm_sft.py \
    --model_type qwen1half-72b-chat \
    --model_id_or_path /ai/qwenhalf72bchat/Qwen1.5-72B-Chat/ \
    --model_revision master \
    --sft_type lora \
    --tuner_backend swift \
    --template_type AUTO \
    --dtype 'bf16' \
    --output_dir /ai/outfile/ \
    --ddp_backend nccl \
    --custom_train_dataset_path /ai/123/训练集4.1knn_41.jsonl \
    --train_dataset_sample -1 \
    --num_train_epochs 2 \
    --max_length 8192 \
    --truncation_strategy delete \
    --check_dataset_strategy warning \
    --lora_rank 8 \
    --lora_alpha 32 \
    --lora_dropout_p 0.05 \
    --lora_target_modules DEFAULT \
    --gradient_checkpointing true \
    --batch_size 1 \
    --weight_decay 0.1 \
    --learning_rate 1e-4 \
    --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
    --max_grad_norm 0.5 \
    --warmup_ratio 0.03 \
    --eval_steps 50 \
    --save_total_limit 2 \
    --logging_steps 1 \
    --use_flash_attn true \
    --deepspeed default-zero3 \
    --save_only_model true
```
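One arithmetic note on this config: `$(expr 16 / $nproc_per_node)` is integer division, so with `nproc_per_node=6` it evaluates to 2, and the effective global batch size is 6 GPUs × batch_size 1 × 2 accumulation steps = 12, not 16.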
Here is the full error output:
```
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [37,0,0], thread: [125,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [37,0,0], thread: [126,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [37,0,0], thread: [127,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed.
Traceback (most recent call last):
  File "myinfer_merged.py", line 37, in <module>
    ...
RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
```
This message also showed up during training, but judging from its wording it only affects output quality, with no other impact. I also tried cutting the test-set inputs down to under 1000 tokens and hit exactly the same error. Could this be a hardware problem?
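For context on the assertion itself: the `IndexKernel.cu ... index out of bounds` assert generally fires when an index passed to an embedding or gather lookup is out of range, most often a token id that is ≥ the number of rows in the embedding table (e.g. a tokenizer/checkpoint mismatch after merging), rather than a hardware fault. A minimal sketch to rule that out, using plain transformers APIs (not swift-specific) against the merged checkpoint:

```python
import os
# Surface CUDA errors at the exact failing call instead of asynchronously.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

from transformers import AutoModelForCausalLM, AutoTokenizer

ckpt = '/ai/outfile/qwen1half-72b-chat/v3-20240331-163441/checkpoint-50-merged/'
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = AutoModelForCausalLM.from_pretrained(ckpt, device_map='auto')

# If any token id >= emb_rows, the IndexKernel assertion above will fire.
emb_rows = model.get_input_embeddings().weight.shape[0]
ids = tokenizer('测试 query', return_tensors='pt').input_ids
print(f'max token id: {ids.max().item()}, embedding rows: {emb_rows}, '
      f'tokenizer vocab: {len(tokenizer)}')
assert ids.max().item() < emb_rows, 'token id out of embedding range'
```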
Inference script using vLLM:

```python
import os
import json

os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'
os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
os.environ['NCCL_DEBUG'] = 'info'
os.environ['NCCL_SOCKET_IFNAME'] = 'eth0'

from swift.llm import (
    ModelType, get_vllm_engine, get_default_template_type,
    get_template, inference_vllm
)
from swift.tuners import Swift

ckpt_dir = '/ai/outfile/qwen1half-72b-chat/v3-20240331-163441/checkpoint-50-merged/'
model_type = ModelType.qwen1half_72b_chat
template_type = get_default_template_type(model_type)

llm_engine = get_vllm_engine(model_type, model_id_or_path=ckpt_dir,
                             gpu_memory_utilization=0.95, tensor_parallel_size=4)
tokenizer = llm_engine.hf_tokenizer
template = get_template(template_type, tokenizer)

with open('/ai/123/测试集4.1knn.jsonl', 'r', encoding='utf-8') as f:
    data = [json.loads(item) for item in f]

output_data = []
flag = 0
for line in data:
    query = line['query']
    # inference_vllm returns a list of result dicts; take the 'response' field.
    response = inference_vllm(llm_engine, template, [{'query': query}])[0]['response']
    print(f'response: {response}')
    output_data.append(response)
    flag += 1
    # Flush results to disk every 10 samples.
    if flag == 10:
        with open('/ai/outfile/qwen1half_72b_chat_2.jsonl', 'a', encoding='utf-8') as f:
            for resp in output_data:
                f.write(json.dumps({'response': resp}, ensure_ascii=False) + '\n')
        output_data = []
        flag = 0

# Write any remaining results (fewer than 10) after the loop.
if output_data:
    with open('/ai/outfile/qwen1half_72b_chat_2.jsonl', 'a', encoding='utf-8') as f:
        for resp in output_data:
            f.write(json.dumps({'response': resp}, ensure_ascii=False) + '\n')
```
Worked around it by switching to vLLM-accelerated inference... though I still don't know why plain transformers inference fails.
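As an aside, `inference_vllm` accepts a whole request list in one call, so batching the queries should be noticeably faster than looping one by one. A sketch that continues from the script above (the output handling is unchanged):

```python
# Sketch: send all queries to vLLM in a single batched call.
request_list = [{'query': line['query']} for line in data]
resp_list = inference_vllm(llm_engine, template, request_list)
responses = [resp['response'] for resp in resp_list]
```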
Inference with both the incremental weights (LoRA adapter) and the merged LoRA weights raises `RuntimeError: CUDA error: device-side assert triggered`, but no problems occur during training. Here is the incremental-weight inference code:

```python
import json
import os
from transformers.debug_utils import DebugUnderflowOverflow
from modelscope import GenerationConfig

os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'

from swift.llm import (
    get_model_tokenizer, get_template, inference,
    ModelType, get_default_template_type
)
from swift.tuners import Swift

ckpt_dir = '/ai/outfile/qwen1half-72b-chat/v6-20240402-111812/checkpoint-106'
model_type = ModelType.qwen1half_72b_chat
template_type = get_default_template_type(model_type)
model_id_or_path = '/ai/qwenhalf72bchat/Qwen1.5-72B-Chat/'

model, tokenizer = get_model_tokenizer(model_type, model_id_or_path=model_id_or_path,
                                       model_kwargs={'device_map': 'auto'})
model = Swift.from_pretrained(model, ckpt_dir, inference_mode=True)

debug_overflow = DebugUnderflowOverflow(model)
template = get_template(template_type, tokenizer)

model.generation_config = GenerationConfig(
    max_new_tokens=8192,
    temperature=0.9,
    repetition_penalty=1.05,
    do_sample=True,
    max_length=512  # note: this caps total length and conflicts with max_new_tokens=8192
)

with open('/ai/123/测试集4.2knn.jsonl', 'r', encoding='utf-8') as f:
    data = [json.loads(item) for item in f]

output_data = []
flag = 0
for line in data:
    query = prompt + '题目:' + line['query']  # 'prompt' is defined elsewhere (not shown)
```
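The snippet ends here in the original post. For reference, a minimal sketch of how the loop presumably continues using swift's `inference` helper (already imported above); the output path is hypothetical:

```python
# Minimal sketch (assumption): complete the loop with swift.llm.inference().
output_data = []
for line in data:
    query = prompt + '题目:' + line['query']
    response, history = inference(model, template, query)
    output_data.append(response)

# Hypothetical output path, mirroring the vLLM script above.
with open('/ai/outfile/qwen1half_72b_chat_adapter.jsonl', 'a', encoding='utf-8') as f:
    for resp in output_data:
        f.write(json.dumps({'response': resp}, ensure_ascii=False) + '\n')
```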
Here is the merged-weight inference code:

```python
import json
import os
from transformers.debug_utils import DebugUnderflowOverflow
from modelscope import GenerationConfig

os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'

from swift.llm import (
    get_model_tokenizer, get_template, inference,
    ModelType, get_default_template_type
)

ckpt_dir = '/ai/outfile/qwen1half-72b-chat/v3-20240331-163441/checkpoint-50-merged/'
model_type = ModelType.qwen1half_72b_chat
template_type = get_default_template_type(model_type)

model, tokenizer = get_model_tokenizer(model_type, model_kwargs={'device_map': 'auto'},
                                       model_id_or_path=ckpt_dir)
debug_overflow = DebugUnderflowOverflow(model)
template = get_template(template_type, tokenizer)

with open('/ai/123/测试集4.1knn.jsonl', 'r', encoding='utf-8') as f:
    data = [json.loads(item) for item in f]

output_data = []
flag = 0
for line in data:
    query = prompt + '题目:' + line['query']  # 'prompt' is defined elsewhere (not shown)
```
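Since the merged checkpoint is the one that trips the assert, a hedged sanity check (assumption: the failure is an out-of-range token id, not hardware) is to compare the merged model's embedding table against the tokenizer before generating. This continues from the script above:

```python
# Sanity check (assumption): the device-side assert is an out-of-range token id.
emb_rows = model.get_input_embeddings().weight.shape[0]
print(f'embedding rows: {emb_rows}, tokenizer vocab: {len(tokenizer)}, '
      f'eos: {tokenizer.eos_token_id}, pad: {tokenizer.pad_token_id}')

sample_ids = tokenizer(data[0]['query']).input_ids
print(f'max encoded id: {max(sample_ids)}')  # must be < emb_rows
```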