import os
from datasets import load_dataset
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
HfArgumentParser,
TrainingArguments,
pipeline,
logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
import torch
import torch_npu
from accelerate import Accelerator
# Ask accelerate which device this process should use; the same device object
# is reused below as the `device_map` when loading the model.
accelerator = Accelerator()
device_map = accelerator.device

# Smoke test: multiply two random 2x2 matrices on the NPU and print the result
# together with the selected device before any heavy work starts.
# Presumably the Ascend toolkit environment has to be sourced beforehand, e.g.:
#   source '/home/HwHiAiUser/Ascend/ascend-toolkit/set_env.sh'
x, y = torch.randn(2, 2).npu(), torch.randn(2, 2).npu()
z = torch.mm(x, y)
print(z, device_map, sep="\n")
# ---------------------------------------------------------------------------
# Model / dataset locations
# ---------------------------------------------------------------------------
# Base checkpoint to fine-tune (a local directory on this machine)
model_name = "/home/HwHiAiUser/Code/model/llama-3b"
# Location handed to load_dataset() for the instruction data
dataset_name = "/home/HwHiAiUser/Code"
# Name under which the fine-tuned model is saved
new_model = "llama-3b-NPU"

# ---------------------------------------------------------------------------
# LoRA hyper-parameters
# ---------------------------------------------------------------------------
# r: rank of the LoRA update matrices
# alpha: scaling factor applied to the LoRA update
# dropout: dropout probability inside the LoRA layers
lora_r, lora_alpha, lora_dropout = 64, 16, 0.1

# ---------------------------------------------------------------------------
# bitsandbytes (4-bit quantization) settings
# ---------------------------------------------------------------------------
# Whether the base model should be loaded in 4-bit precision
use_4bit = True
# Name of the torch dtype used for computation with 4-bit weights
bnb_4bit_compute_dtype = "float16"
# Quantization scheme: "fp4" or "nf4"
bnb_4bit_quant_type = "nf4"
# Whether to apply nested (double) quantization
use_nested_quant = False
################################################################################
# TrainingArguments parameters
################################################################################
# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"
# Number of training epochs
num_train_epochs = 10
# Mixed-precision switches (both off: train in the model's loaded precision)
fp16 = False
bf16 = False
# Batch size per device for training
per_device_train_batch_size = 5
# Batch size per device for evaluation
per_device_eval_batch_size = 5
# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 5
# Enable gradient checkpointing
gradient_checkpointing = True
# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3
# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4
# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001
# Optimizer to use.
# The paged_* optimizers are implemented by bitsandbytes, which on this
# machine is a CPU-only build (it warns that 8-bit optimizers are unavailable
# and fails to resolve `cadam32bit_grad_fp32`), so use the native PyTorch
# AdamW implementation instead.
optim = "adamw_torch"
# Learning rate schedule (constant a bit better than cosine)
lr_scheduler_type = "constant"
# Number of training steps (overrides num_train_epochs; -1 disables the override)
max_steps = -1
# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03
# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True
# Save checkpoint every X update steps
save_steps = 50
# Log every X update steps
logging_steps = 50
################################################################################
# SFT parameters
################################################################################
# Maximum sequence length to use (None lets SFTTrainer apply its own default)
max_seq_length = None
# Pack multiple short examples in the same input sequence to increase efficiency
packing = False
# Training split of the instruction dataset (any preprocessing would go here)
dataset = load_dataset(dataset_name, split="train")

# Turn the dtype name (e.g. "float16") into the corresponding torch dtype
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Quantization settings for QLoRA-style loading.
# NOTE: the `quantization_config` argument of the model load below is
# commented out, so this object is built but not currently used.
_bnb_kwargs = {
    "load_in_4bit": use_4bit,
    "bnb_4bit_quant_type": bnb_4bit_quant_type,
    "bnb_4bit_compute_dtype": compute_dtype,
    "bnb_4bit_use_double_quant": use_nested_quant,
}
bnb_config = BitsAndBytesConfig(**_bnb_kwargs)
# Load the base model directly onto the device chosen by accelerate.
# Half precision and 4-bit quantization are intentionally left disabled:
#   torch_dtype=torch.float16,
#   quantization_config=bnb_config,
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map=device_map,
)
# The generation cache is switched off for training.
model.config.use_cache = False
# NOTE(review): presumably disables the tensor-parallel emulation some LLaMA
# checkpoints were pretrained with -- confirm against the model config.
model.config.pretraining_tp = 1

# Load the LLaMA tokenizer.
# Tokenizers hold no tensors and take no device placement, so `device_map` is
# not passed here: unknown keyword arguments are forwarded to the tokenizer
# constructor, where a torch.device value is meaningless and may break
# serialization of the tokenizer config when checkpoints are saved.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=False,
    trust_remote_code=True,
)
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training
# LoRA adapter configuration for causal language modelling (no bias training)
_lora_kwargs = {
    "r": lora_r,
    "lora_alpha": lora_alpha,
    "lora_dropout": lora_dropout,
    "bias": "none",
    "task_type": "CAUSAL_LM",
}
peft_config = LoraConfig(**_lora_kwargs)

# Trainer hyper-parameters, grouped by concern and assembled from the
# module-level settings; metrics are reported to TensorBoard.
_training_kwargs = {
    # where / how long
    "output_dir": output_dir,
    "num_train_epochs": num_train_epochs,
    "max_steps": max_steps,
    # batching
    "per_device_train_batch_size": per_device_train_batch_size,
    "gradient_accumulation_steps": gradient_accumulation_steps,
    "group_by_length": group_by_length,
    # optimisation
    "optim": optim,
    "learning_rate": learning_rate,
    "weight_decay": weight_decay,
    "max_grad_norm": max_grad_norm,
    "warmup_ratio": warmup_ratio,
    "lr_scheduler_type": lr_scheduler_type,
    # precision
    "fp16": fp16,
    "bf16": bf16,
    # checkpointing / logging
    "save_steps": save_steps,
    "logging_steps": logging_steps,
    "report_to": "tensorboard",
}
training_arguments = TrainingArguments(**_training_kwargs)
# Build the supervised fine-tuning trainer: the base model is wrapped with the
# LoRA config and trained on the "text" column of the dataset.
_sft_kwargs = {
    "model": model,
    "tokenizer": tokenizer,
    "peft_config": peft_config,
    "train_dataset": dataset,
    "dataset_text_field": "text",
    "max_seq_length": max_seq_length,
    "packing": packing,
    "args": training_arguments,
}
trainer = SFTTrainer(**_sft_kwargs)

# Run the training loop.
# NOTE(review): the reported "Expected NPU tensor" error is raised from this
# call, inside the embedding lookup. One plausible cause is that the installed
# transformers release does not detect the NPU, so batches stay on the CPU
# while the model sits on the NPU -- verify `training_arguments.device`.
trainer.train()

# Persist the trained model under the configured name.
trainer.model.save_pretrained(new_model)
报错信息如下:
发生异常: RuntimeError
Expected all tensors to be on the same device. Expected NPU tensor, please check whether the input tensor device is correct.
File "/home/HwHiAiUser/Code/main.py", line 212, in <module>
trainer.train()
RuntimeError: Expected all tensors to be on the same device. Expected NPU tensor, please check whether the input tensor device is correct.
(NPU) [HwHiAiUser@localhost Code]$ cd /home/HwHiAiUser/Code ; /usr/bin/env /home/HwHiAiUser/下载/yes/envs/NPU/bin/python /home/HwHiAiUser/.vscode/extensions/ms-python.python-2023.18.0/pythonFiles/lib/python/debugpy/adapter/../../debugpy/launcher 41483 -- /home/HwHiAiUser/Code/main.py
/home/HwHiAiUser/下载/yes/envs/NPU/lib/python3.9/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
warn("The installed version of bitsandbytes was compiled without GPU support. "
/home/HwHiAiUser/下载/yes/envs/NPU/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32
Warning: Device do not support double dtype now, dtype cast repalce with float.
tensor([[-1.1986, 0.8204],
[-1.6992, 1.1416]], device='npu:0')
npu
You are using the legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565
/home/HwHiAiUser/下载/yes/envs/NPU/lib/python3.9/site-packages/trl/trainer/sft_trainer.py:159: UserWarning: You didn't pass a `max_seq_length` argument to the SFTTrainer, this will default to 1024
warnings.warn(
Map: 100%|███████████| 122606/122606 [03:51<00:00, 529.37 examples/s]
0%| | 0/49040 [00:00<?, ?it/s]Traceback (most recent call last):
File "/home/HwHiAiUser/下载/yes/envs/NPU/lib/python3.9/runpy.py", line 197, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/home/HwHiAiUser/下载/yes/envs/NPU/lib/python3.9/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/home/HwHiAiUser/.vscode/extensions/ms-python.python-2023.18.0/pythonFiles/lib/python/debugpy/adapter/../../debugpy/launcher/../../debugpy/__main__.py", line 39, in <module>
cli.main()
File "/home/HwHiAiUser/.vscode/extensions/ms-python.python-2023.18.0/pythonFiles/lib/python/debugpy/adapter/../../debugpy/launcher/../../debugpy/../debugpy/server/cli.py", line 430, in main
run()
File "/home/HwHiAiUser/.vscode/extensions/ms-python.python-2023.18.0/pythonFiles/lib/python/debugpy/adapter/../../debugpy/launcher/../../debugpy/../debugpy/server/cli.py", line 284, in run_file
runpy.run_path(target, run_name="__main__")
File "/home/HwHiAiUser/.vscode/extensions/ms-python.python-2023.18.0/pythonFiles/lib/python/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py", line 321, in run_path
return _run_module_code(code, init_globals, run_name,
File "/home/HwHiAiUser/.vscode/extensions/ms-python.python-2023.18.0/pythonFiles/lib/python/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py", line 135, in _run_module_code
_run_code(code, mod_globals, init_globals,
File "/home/HwHiAiUser/.vscode/extensions/ms-python.python-2023.18.0/pythonFiles/lib/python/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py", line 124, in _run_code
exec(code, run_globals)
File "/home/HwHiAiUser/Code/main.py", line 212, in <module>
trainer.train()
File "/home/HwHiAiUser/下载/yes/envs/NPU/lib/python3.9/site-packages/transformers/trainer.py", line 1539, in train
return inner_training_loop(
File "/home/HwHiAiUser/下载/yes/envs/NPU/lib/python3.9/site-packages/transformers/trainer.py", line 1809, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs)
File "/home/HwHiAiUser/下载/yes/envs/NPU/lib/python3.9/site-packages/transformers/trainer.py", line 2654, in training_step
loss = self.compute_loss(model, inputs)
File "/home/HwHiAiUser/下载/yes/envs/NPU/lib/python3.9/site-packages/transformers/trainer.py", line 2679, in compute_loss
outputs = model(**inputs)
File "/home/HwHiAiUser/下载/yes/envs/NPU/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/HwHiAiUser/下载/yes/envs/NPU/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/HwHiAiUser/下载/yes/envs/NPU/lib/python3.9/site-packages/peft/peft_model.py", line 922, in forward
return self.base_model(
File "/home/HwHiAiUser/下载/yes/envs/NPU/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/HwHiAiUser/下载/yes/envs/NPU/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/HwHiAiUser/下载/yes/envs/NPU/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 806, in forward
outputs = self.model(
File "/home/HwHiAiUser/下载/yes/envs/NPU/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/HwHiAiUser/下载/yes/envs/NPU/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/HwHiAiUser/下载/yes/envs/NPU/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 646, in forward
inputs_embeds = self.embed_tokens(input_ids)
File "/home/HwHiAiUser/下载/yes/envs/NPU/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/HwHiAiUser/下载/yes/envs/NPU/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/HwHiAiUser/下载/yes/envs/NPU/lib/python3.9/site-packages/torch/nn/modules/sparse.py", line 162, in forward
return F.embedding(
File "/home/HwHiAiUser/下载/yes/envs/NPU/lib/python3.9/site-packages/torch/nn/functional.py", line 2233, in embedding
return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
RuntimeError: Expected all tensors to be on the same device. Expected NPU tensor, please check whether the input tensor device is correct.
/home/HwHiAiUser/下载/yes/envs/NPU/lib/python3.9/tempfile.py:821: ResourceWarning: Implicitly cleaning up <TemporaryDirectory '/tmp/tmpi3_cfqya'>
_warnings.warn(warn_message, ResourceWarning)
0%| | 0/49040 [1:41:42<?, ?it/s]
(NPU) [HwHiAiUser@localhost Code]$
环境信息
固件版本检查
(NPU) [HwHiAiUser@localhost ~]$ sudo /usr/local/Ascend/driver/tools/upgrade-tool --device_index -1 --component -1 --version
{
Get component version(6.4.12.1.241) succeed for deviceId(0), componentType(11).
{"device_id":0, "component":hboot1a, "version":6.4.12.1.241}
Get component version(6.4.12.1.241) succeed for deviceId(0), componentType(12).
{"device_id":0, "component":hboot1b, "version":6.4.12.1.241}
Get component version(6.4.12.1.241) succeed for deviceId(0), componentType(18).
{"device_id":0, "component":hlink, "version":6.4.12.1.241}
}
npu-smi info
(NPU) [HwHiAiUser@localhost ~]$ npu-smi info
+--------------------------------------------------------------------------------------------------------+
| npu-smi 23.0.rc2 Version: 23.0.rc2 |
+-------------------------------+-----------------+------------------------------------------------------+
| NPU Name | Health | Power(W) Temp(C) Hugepages-Usage(page) |
| Chip Device | Bus-Id | AICore(%) Memory-Usage(MB) |
+===============================+=================+======================================================+
| 8 310P3 | OK | NA 37 0 / 0 |
| 0 0 | 0000:01:00.0 | 0 1700 / 21527 |
+===============================+=================+======================================================+
+-------------------------------+-----------------+------------------------------------------------------+
| NPU Chip | Process id | Process name | Process memory(MB) |
+===============================+=================+======================================================+
| No running processes found in NPU 8 |
+===============================+=================+======================================================+
使用如下代码在NPU上训练LLaMA模型时,执行trainer.train()报错。 代码如下:
报错信息如下:
环境信息
固件版本检查
npu-smi info
CANN安装 已安装适配 PyTorch 2.1.0 的 CANN 7.0.RC1.alpha003,并且环境配置正确,如下代码运行正常:
运行结果: