@pchiang5 - are you using --master_port 12344? Just to make sure that the setting of master_port inside DeepSpeed is working properly.
@loadams I tried that at the beginning. Setting --master_port to different numbers did not work.
The issue could be worked around by generating a random port number before each tuning run inside the .py file. However, comparing the VRAM usage with and without DeepSpeed, it did not offload to CPU.
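For context, that workaround amounts to picking a fresh rendezvous port before torch.distributed initializes. A minimal sketch of the environment-variable variant (the port range here is arbitrary and only needs to avoid ports already in use):

import os
import random

# choose a new rendezvous port for every tuning run so repeated runs
# do not fail with "RuntimeError: Address already in use"
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = str(random.randint(30000, 40000))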
@pchiang5 - could you share your script so we can repro on our side? I don't have a GTX 1650, but I will try with a few other nodes we have.
@loadams - Please see below. This is a minor modification of the Geneformer example (https://huggingface.co/ctheodoris/Geneformer/tree/main/examples). Thank you.
import os
# GPU_NUMBER = [0]
# os.environ["CUDA_VISIBLE_DEVICES"] = "0" #"" for cpu
# os.environ["NCCL_DEBUG"] = "INFO"
# os.environ["MASTER_ADDR"] = "localhost"
# os.environ["MASTER_PORT"] = "8888" # modify if RuntimeError: Address already in use
# os.environ["RANK"] = "0"
# os.environ["LOCAL_RANK"] = "0"
# os.environ["WORLD_SIZE"] = "1"
# Workaround applied in the file rendezvous.py: add "import random" before
# def _create_c10d_store(hostname, port, rank, world_size, timeout) -> Store:
# and add "port = random.randint(33567, 43386)" before "return TCPStore(".
# imports
from collections import Counter
import datetime
import pickle
import subprocess
import seaborn as sns; sns.set()
from datasets import load_from_disk
from sklearn.metrics import accuracy_score, f1_score
from transformers import BertForSequenceClassification
from transformers import Trainer
from transformers.training_args import TrainingArguments
from geneformer import DataCollatorForCellClassification
# ## Prepare training and evaluation datasets
# In[ ]:
# load train dataset (includes all tissues)
train_dataset=load_from_disk("/mnt/c/Users/pc/Downloads/celltype_train")
# load evaluation dataset (includes all tissues)
eval_dataset=load_from_disk("/mnt/c/Users/pc/Downloads/celltype_test")
dataset_list = []
evalset_list = []
organ_list = []
target_dict_list = []
for organ in Counter(train_dataset["organ_major"]).keys():
    # collect list of tissues for fine-tuning (immune and bone marrow are included together)
    if organ in ["bone_marrow"]:
        continue
    elif organ=="immune":
        organ_ids = ["immune","bone_marrow"]
        organ_list += ["immune"]
    else:
        organ_ids = [organ]
        organ_list += [organ]
    print(organ)
    # filter datasets for given organ
    def if_organ(example):
        return example["organ_major"] in organ_ids
    trainset_organ = train_dataset.filter(if_organ, num_proc=16)
    # per scDeepsort published method, drop cell types representing <0.5% of cells
    celltype_counter = Counter(trainset_organ["cell_type"])
    total_cells = sum(celltype_counter.values())
    cells_to_keep = [k for k,v in celltype_counter.items() if v>(0.005*total_cells)]
    def if_not_rare_celltype(example):
        return example["cell_type"] in cells_to_keep
    trainset_organ_subset = trainset_organ.filter(if_not_rare_celltype, num_proc=16)
    # shuffle datasets and rename columns
    trainset_organ_shuffled = trainset_organ_subset.shuffle(seed=42)
    trainset_organ_shuffled = trainset_organ_shuffled.rename_column("cell_type","label")
    trainset_organ_shuffled = trainset_organ_shuffled.remove_columns("organ_major")
    # create dictionary of cell types : label ids
    target_names = list(Counter(trainset_organ_shuffled["label"]).keys())
    target_name_id_dict = dict(zip(target_names,[i for i in range(len(target_names))]))
    target_dict_list += [target_name_id_dict]
    # change labels to numerical ids
    def classes_to_ids(example):
        example["label"] = target_name_id_dict[example["label"]]
        return example
    labeled_trainset = trainset_organ_shuffled.map(classes_to_ids, num_proc=16)
    # create 80/20 train/eval splits
    labeled_train_split = labeled_trainset.select([i for i in range(0,round(len(labeled_trainset)*0.8))])
    labeled_eval_split = labeled_trainset.select([i for i in range(round(len(labeled_trainset)*0.8),len(labeled_trainset))])
    # filter dataset for cell types in corresponding training set
    trained_labels = list(Counter(labeled_train_split["label"]).keys())
    def if_trained_label(example):
        return example["label"] in trained_labels
    labeled_eval_split_subset = labeled_eval_split.filter(if_trained_label, num_proc=16)
    dataset_list += [labeled_train_split]
    evalset_list += [labeled_eval_split_subset]
# In[20]:
trainset_dict = dict(zip(organ_list,dataset_list))
traintargetdict_dict = dict(zip(organ_list,target_dict_list))
evalset_dict = dict(zip(organ_list,evalset_list))
# ## Fine-Tune With Cell Classification Learning Objective and Quantify Predictive Performance
# In[18]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # calculate accuracy and macro f1 using sklearn's function
    acc = accuracy_score(labels, preds)
    macro_f1 = f1_score(labels, preds, average='macro')
    return {
        'accuracy': acc,
        'macro_f1': macro_f1
    }
# for hyperpara opti
# In[ ]:
# initiate runtime environment for raytune
import pyarrow # must occur prior to ray import
import ray
from ray import tune
from ray.tune import ExperimentAnalysis
# from ray.tune.suggest.hyperopt import HyperOptSearch
from ray.tune.search.hyperopt import HyperOptSearch
ray.init(num_gpus=1)
organ_trainset = trainset_dict[organ]
organ_evalset = evalset_dict[organ]
# define output directory path
current_date = datetime.datetime.now()
datestamp = f"{str(current_date.year)[-2:]}{current_date.month:02d}{current_date.day:02d}"
output_dir = f"/mnt/c/Users/pc/Downloads/GF/{datestamp}_geneformer_DiseaseClassifier/"
# ensure not overwriting previously saved model
saved_model_test = os.path.join(output_dir, f"pytorch_model.bin")
if os.path.isfile(saved_model_test) == True:
    raise Exception("Model already saved to this directory.")
# make output directory
subprocess.call(f'mkdir {output_dir}', shell=True)
# set training parameters
# how many pretrained layers to freeze
freeze_layers = 2
# batch size for training and eval
geneformer_batch_size = 2 #original 12
# number of epochs
epochs = 1
# logging steps
logging_steps = round(len(organ_trainset)/geneformer_batch_size/10)
# define function to initiate model
def model_init():
    model = BertForSequenceClassification.from_pretrained(
        "/mnt/c/Users/pc/Downloads/Geneformer/geneformer-12L-30M",  # resolved with gradient checkpointing
        num_labels=len(target_names),
        # low_cpu_mem_usage=True,
        output_attentions=False,
        gradient_checkpointing=True,
        # device_map="auto",  # _is_split_module missing in the pretrained model?
        output_hidden_states=False)
    if freeze_layers is not None:
        modules_to_freeze = model.bert.encoder.layer[:freeze_layers]
        for module in modules_to_freeze:
            for param in module.parameters():
                param.requires_grad = False
    # model = model.to("cuda:0")
    return model
# define metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # calculate accuracy and macro f1 using sklearn's function
    acc = accuracy_score(labels, preds)
    macro_f1 = f1_score(labels, preds, average='macro')
    return {
        'accuracy': acc,
        'macro_f1': macro_f1
    }
# set training arguments
training_args = {
    "do_train": True,
    "do_eval": True,
    "evaluation_strategy": "steps",  # originally steps
    "eval_steps": logging_steps,
    "logging_steps": logging_steps,
    "group_by_length": True,
    "save_steps": 7248,
    # "save_strategy": "epoch",  # not in the original example
    "length_column_name": "length",
    "disable_tqdm": True,
    "eval_steps": 453,  # note: duplicate key; this overrides the eval_steps entry above
    "skip_memory_metrics": True,  # memory tracker causes errors in raytune
    "per_device_train_batch_size": geneformer_batch_size,
    "per_device_eval_batch_size": geneformer_batch_size,
    "num_train_epochs": epochs,
    "load_best_model_at_end": True,  # originally True
    "output_dir": output_dir,
}
from transformers import EarlyStoppingCallback
training_args_init = TrainingArguments(**training_args)
# create the trainer
trainer = Trainer(
    model_init=model_init,
    args=training_args_init,
    data_collator=DataCollatorForCellClassification(),
    train_dataset=organ_trainset,
    eval_dataset=organ_evalset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)
# specify raytune hyperparameter search space
ray_config = {
    "num_train_epochs": tune.choice([epochs]),
    "learning_rate": tune.loguniform(1e-6, 1e-3),
    "weight_decay": tune.uniform(0.0, 0.3),
    "lr_scheduler_type": tune.choice(["linear","cosine","polynomial"]),
    "warmup_steps": tune.uniform(100, 2000),
    "seed": tune.uniform(0, 100),
    "per_device_train_batch_size": tune.choice([geneformer_batch_size])
}
hyperopt_search = HyperOptSearch(
    metric="eval_macro_f1", mode="max")
early_stop = {
    "training_iteration": 10
}
# optimize hyperparameters
trainer.hyperparameter_search(
    direction="maximize",
    backend="ray",
    resources_per_trial={"cpu": 18, "gpu": 1},
    hp_space=lambda _: ray_config,
    stop=early_stop,
    search_alg=hyperopt_search,
    n_trials=100,  # number of trials
    progress_reporter=tune.CLIReporter(max_report_frequency=600,
                                       sort_by_metric=True,
                                       max_progress_rows=100,
                                       mode="max",
                                       metric="eval_macro_f1",
                                       metric_columns=["loss", "eval_loss", "eval_accuracy", "eval_macro_f1"])
    # Add the early stopping callback
)
Hi @pchiang5 - I wasn't able to repro with your test case. Out of curiosity, are you able to run a smaller example, perhaps something from DeepSpeedExamples like the CIFAR10 example, and add --master_port xxxxx to the command there? This should print out that the master_port setting is being applied, as shown below:
[2023-08-09 10:47:31,370] [INFO] [runner.py:567:main] cmd = /opt/conda/envs/ptca/bin/python -u -m deepspeed.launcher.launch --world_info=eyJub2RlLTAiOiBbMCwgMV19 --master_addr=10.17.32.217 --master_port=12345 --enable_each_rank_log=None cifar10_deepspeed.py --deepspeed --deepspeed_config ds_config.json
It's also possible I was invoking your example wrong - do you have the corresponding deepspeed ... --master_port xxx ... command?
@pchiang5 - I know I was late to reply on this, but if you're still hitting this, please re-open. For now, closing since I cannot repro.
Describe the bug: A hyperparameter training.py ran well without DeepSpeed. However, when I tried
deepspeed --master_port XXXXX hyperparameter training.py --deepspeed --deepspeed_config ds_config_zero3.json
the following error showed up. The JSON file worked fine with other Transformers test codes.
To Reproduce
deepspeed --master_port XXXXX hyperparameter training.py --deepspeed --deepspeed_config ds_config_zero3.json
Expected behavior: the same result with or without DeepSpeed.
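(Note: ds_config_zero3.json is not reproduced in this issue. A minimal ZeRO stage 3 configuration with CPU offload typically looks roughly like the following; the values are illustrative, not the reporter's actual file:)

{
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": { "device": "cpu", "pin_memory": true },
        "offload_param": { "device": "cpu", "pin_memory": true }
    },
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "fp16": { "enabled": "auto" }
}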
ds_report output
DeepSpeed C++/CUDA extension op report
NOTE: Ops not installed will be just-in-time (JIT) compiled at runtime if needed. Op compatibility means that your system meet the required dependencies to JIT install the op.
JIT compiled ops requires ninja
ninja .................. [OKAY]
op name ................ installed .. compatible
async_io ............... [YES] ...... [OKAY]
cpu_adagrad ............ [YES] ...... [OKAY]
cpu_adam ............... [YES] ...... [OKAY]
fused_adam ............. [YES] ...... [OKAY]
fused_lamb ............. [YES] ...... [OKAY]
quantizer .............. [YES] ...... [OKAY]
random_ltd ............. [YES] ...... [OKAY]
[WARNING] sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.0
[WARNING] please install triton==1.0.0 if you want to use sparse attention
sparse_attn ............ [NO] ....... [NO]
spatial_inference ...... [YES] ...... [OKAY]
transformer ............ [YES] ...... [OKAY]
stochastic_transformer . [YES] ...... [OKAY]
transformer_inference .. [YES] ...... [OKAY]
DeepSpeed general environment info:
torch install path ............... ['/home/pc/miniconda3/envs/Transformers/lib/python3.11/site-packages/torch']
torch version .................... 2.0.1+cu117
deepspeed install path ........... ['/home/pc/miniconda3/envs/Transformers/lib/python3.11/site-packages/deepspeed']
deepspeed info ................... 0.9.5, unknown, unknown
torch cuda version ............... 11.7
torch hip version ................ None
nvcc version ..................... 11.7
deepspeed wheel compiled w. ...... torch 2.0, cuda 11.7
Screenshots
from ray.air import session
def train(config):
    ...
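(For context, session.report is how a Ray AIR trainable hands metrics back to Tune; a minimal sketch of the elided body above, with an illustrative metric name:)

from ray.air import session

def train(config):
    # ... run one training/evaluation pass with the sampled config ...
    eval_macro_f1 = 0.0  # placeholder for the real metric value
    # report the metric back to Ray Tune for this trial iteration
    session.report({"eval_macro_f1": eval_macro_f1})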
System info (please complete the following information):
OS: [Ubuntu 20.04]
GPU count and types: [1 machine with GTX 1650]
Python version 3.11
Any other relevant info about your setup
Launcher context: launching the experiment with the deepspeed launcher
Docker context: No
Additional context WSL2