nikhil-tensorwave opened this issue 3 weeks ago
finetune_deepseek_ds.py
import pandas as pd
import json
import os
import argparse
import yaml
from pprint import pprint
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset, Dataset
from huggingface_hub import notebook_login
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, PeftConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
os.environ["WANDB_DISABLED"] = "true"
# Argument parser for command-line inputs
parser = argparse.ArgumentParser(description='Fine-tune DeepSeek-Coder-V2 model')
parser.add_argument('--config', type=str, help='Path to config YAML file (optional)', default="config.yaml")
parser.add_argument('--jsonl_path', type=str, help='Path to input JSONL file (optional)', required=False)
parser.add_argument('--exp_id', type=str, help='Unique experiment ID for the run', required=True)
parser.add_argument('--deepspeed_config', type=str, help='Path to DeepSpeed config JSON file', required=False)
parser.add_argument('--local_rank', type=int, help='Local rank for distributed training', default=-1)
args = parser.parse_args()
# Load YAML config
config_path = args.config if os.path.isabs(args.config) else os.path.join(os.getcwd(), args.config)
if not os.path.exists(config_path):
    raise FileNotFoundError(f"Configuration file not found at {config_path}")
with open(config_path, 'r') as f:
    config = yaml.safe_load(f)
# Extract base configurations
base_dir = config['run']['base_dir']
run_name_prefix = config['run']['run_name_prefix']
run_name = f"{run_name_prefix}-{args.exp_id}"
run_dir = os.path.join(base_dir, args.exp_id)
# Determine JSONL file path
jsonl_path = args.jsonl_path if args.jsonl_path else os.path.join(run_dir, "train_prompts.jsonl")
# Check if the JSONL file exists
if not os.path.exists(jsonl_path):
    raise FileNotFoundError(f"JSONL file not found at {jsonl_path}. Please provide a valid file path.")
# Configuration
MODEL_NAME = config['model']['model_name']
"""
bnb_config = BitsAndBytesConfig(
load_in_4bit=config['quantization']['load_in_4bit'],
bnb_4bit_use_double_quant=config['quantization']['use_double_quant'],
bnb_4bit_quant_type=config['quantization']['quant_type'],
bnb_4bit_compute_dtype=torch.bfloat16 if config['quantization']['compute_dtype'] == "bfloat16" else torch.float16
)
"""
# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
MODEL_NAME,
trust_remote_code=True,
#quantization_config=bnb_config
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Load data
data_list = []
# Read JSONL and extract the 'text' field
with open(jsonl_path, 'r') as file:
    for line in file:
        if not line.strip():
            continue  # Skip empty lines
        try:
            data_point = json.loads(line)
            text = data_point.get('text', '')
            if text:
                data_list.append({'text': text})
        except json.JSONDecodeError as e:
            print(f"Skipping invalid JSON line: {line} with error: {e}")
# Convert to pandas DataFrame
df = pd.DataFrame(data_list)
print("Total number of training samples:", len(df))
# Create Hugging Face Dataset
train_data = Dataset.from_pandas(df[['text']])
# Shuffle dataset
train_data = train_data.shuffle()
# Set max_seq_length
max_seq_length = config['training'].get('max_seq_length', 512)
# Helpers to find the linear modules in the model's last decoder layer (used as LoRA targets)
def get_num_layers(model):
    import re
    numbers = set()
    for name, _ in model.named_parameters():
        for number in re.findall(r'\d+', name):
            numbers.add(int(number))
    return max(numbers)

def get_last_layer_linears(model):
    names = []
    num_layers = get_num_layers(model)
    for name, module in model.named_modules():
        if str(num_layers) in name and "encoder" not in name:
            if isinstance(module, torch.nn.Linear):
                names.append(name)
    return names
# Configure LoRA
config_lora = LoraConfig(
    r=config['model']['lora_r'],
    lora_alpha=config['model']['lora_alpha'],
    target_modules=get_last_layer_linears(model),
    lora_dropout=config['model']['lora_dropout'],
    bias="none",
    task_type=config['model']['task_type']
)
# Get DeepSpeed config path
if args.deepspeed_config:
    deepspeed_config_path = args.deepspeed_config
else:
    deepspeed_config_path = config['training'].get('deepspeed_config', None)
# Define training arguments
training_args = transformers.TrainingArguments(
    per_device_train_batch_size=config['training']['batch_size'],
    gradient_accumulation_steps=config['training']['grad_accumulation_steps'],
    num_train_epochs=config['training']['num_epochs'],
    learning_rate=float(config['training']['learning_rate']),
    gradient_checkpointing=True,
    fp16=config['training']['fp16'],
    bf16=config['training'].get('bf16', False),
    output_dir=os.path.join(run_dir, config['training']['output_subdir']),
    # Remove optim and lr_scheduler_type when using DeepSpeed
    # optim="paged_adamw_8bit",
    # lr_scheduler_type=config['training']['scheduler'],
    warmup_ratio=config['training']['warmup_ratio'],
    logging_steps=config['training']['logging_steps'],
    report_to="none",  # the string "none" disables reporting integrations such as wandb
    logging_dir=run_dir,
    deepspeed=deepspeed_config_path
)
# Initialize SFTTrainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    peft_config=config_lora,
    dataset_text_field="text",
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    packing=False,
)
# Disable cache for training and start training
model.config.use_cache = False
trainer.train()
# Save the fine-tuned model in the run directory
model_output_dir = os.path.join(run_dir, "trained-model")
trainer.save_model(model_output_dir)
tokenizer.save_pretrained(model_output_dir)
print(f"Model fine-tuned and saved to '{model_output_dir}'.")
dp.yaml
run:
  project_name: "text2sql-train"
  base_dir: ""
  run_name_prefix: "fine-tune-run"
model:
  model_name: "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
  lora_r: 8
  lora_alpha: 16
  lora_dropout: 0.05
  task_type: "CAUSAL_LM"
  # Added target_modules as a list
  target_modules:
    - "q_proj"
    - "k_proj"
    - "v_proj"
    - "o_proj"
    - "gate_proj"
    - "up_proj"
    - "down_proj"
training:
  batch_size: 1
  grad_accumulation_steps: 4
  num_epochs: 1
  learning_rate: 1e-4
  bf16: true
  fp16: false
  scheduler: "cosine"
  warmup_ratio: 0.01
  logging_steps: 10
  output_subdir: "checkpoints"
  max_seq_length: 8192  # Added max_seq_length
quantization:
  load_in_4bit: true
  use_double_quant: true
  quant_type: "nf4"
  compute_dtype: "bfloat16"
fsdp:
  fsdp: "full_shard auto_wrap offload"
  fsdp_config:
    backward_prefetch: "backward_pre"
    forward_prefetch: "false"
    use_orig_params: "false"
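Note that the target_modules list in dp.yaml is not read by finetune_deepseek_ds.py as posted (the script derives its targets with get_last_layer_linears). A possible one-line tweak inside that script, sketched here as an assumption rather than part of the report, would prefer the YAML list when present:

# Hypothetical change inside finetune_deepseek_ds.py, just before building LoraConfig:
# use the target_modules list from dp.yaml if given, else fall back to the layer scan.
target_modules = config['model'].get('target_modules') or get_last_layer_linears(model)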
ds_config2_zero3.json
{
  "bf16": {
    "enabled": true
  },
  "optimizer": {
    "type": "AdamW",
    "params": {
      "lr": "auto",
      "betas": "auto",
      "eps": "auto",
      "weight_decay": "auto"
    }
  },
  "scheduler": {
    "type": "WarmupLR",
    "params": {
      "warmup_min_lr": "auto",
      "warmup_max_lr": "auto",
      "warmup_num_steps": "auto"
    }
  },
  "zero_optimization": {
    "stage": 3,
    "offload_optimizer": {
      "device": "cpu",
      "pin_memory": true
    },
    "offload_param": {
      "device": "cpu",
      "pin_memory": true
    },
    "overlap_comm": true,
    "contiguous_gradients": true,
    "sub_group_size": 1e7,
    "reduce_bucket_size": 2e7,
    "stage3_prefetch_bucket_size": 3774874,
    "stage3_param_persistence_threshold": "auto",
    "stage3_max_live_parameters": 5e8,
    "stage3_max_reuse_distance": 5e8,
    "stage3_gather_16bit_weights_on_model_save": true
  },
  "gradient_accumulation_steps": 4,
  "gradient_clipping": "auto",
  "steps_per_print": 20,
  "train_micro_batch_size_per_gpu": 1,
  "wall_clock_breakdown": false,
  "activation_checkpointing": {
    "partition_activations": true,
    "contiguous_memory_optimization": true
  }
}
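As a quick consistency check between the two config files (the HF DeepSpeed integration expects batch-size and accumulation values to either match TrainingArguments or be set to "auto"), here is a small sketch, assuming both files sit in the working directory under the names shown above:

import json
import yaml

with open("ds_config2_zero3.json") as f:
    ds_cfg = json.load(f)
with open("dp.yaml") as f:
    cfg = yaml.safe_load(f)

# These must agree with the TrainingArguments built from dp.yaml (or be "auto").
assert ds_cfg["train_micro_batch_size_per_gpu"] in ("auto", cfg["training"]["batch_size"])
assert ds_cfg["gradient_accumulation_steps"] in ("auto", cfg["training"]["grad_accumulation_steps"])
assert ds_cfg["zero_optimization"]["stage"] == 3
print("DeepSpeed config is consistent with dp.yaml training settings.")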
Thanks @nikhil-tensorwave. Tagging @rraminen and @jithunnair-amd as well for help on the AMD side.
@loadams @rraminen @jithunnair-amd Are there any updates on resolving this issue?
Describe the bug
I am trying to fine-tune DeepSeek-Coder-V2-Lite-Instruct (16B) on a system with 8 MI300X GPUs. Running on any number of GPUs less than 8 works as expected and runs to completion. When running on 8 GPUs, the training starts, hangs, and then outputs one of two errors. One error is:

where the GPU node is different from run to run. The second error (truncated) is:
To Reproduce
Run command:
Training script and config files will be in the first comment.
ds_report output
System info:
Launcher context
Launching with deepspeed
Additional context
Running the same fine-tuning with smaller DeepSeek models (1B and 7B) runs to completion on 8 GPUs. I am currently trying the largest DeepSeek model (200B).
@rraminen @jithunnair-amd