Open kaustavbecs opened 3 months ago
# Hyperparameters passed to the training job (consumed by the training script).
hyperparameters = {
    "model_id": model_id,                            # pre-trained model
    "dataset_path": "/opt/ml/input/data/training",   # path where SageMaker will save the training dataset
    "num_train_epochs": 1,                           # number of training epochs
    "per_device_train_batch_size": 6,                # batch size for training
    "gradient_accumulation_steps": 2,                # number of update steps to accumulate
    "gradient_checkpointing": True,                  # saves memory at the cost of a slower backward pass
    "bf16": True,                                    # use bfloat16 precision
    "tf32": True,                                    # use tf32 precision
    "learning_rate": 2e-4,                           # learning rate
    "max_grad_norm": 0.3,                            # maximum norm (for gradient clipping)
    "warmup_ratio": 0.03,                            # warmup ratio
    "lr_scheduler_type": "constant",                 # learning rate scheduler
    "save_strategy": "epoch",                        # save strategy for checkpoints
    "logging_steps": 10,                             # log every x steps
    "merge_adapters": True,                          # whether to merge LoRA into the model (needs more memory)
    "trust_remote_code": True,                       # whether to use trust_remote_code
    "use_flash_attn": True,                          # whether to use Flash Attention
    "output_dir": "/tmp/run",                        # output directory, where to save assets during training
}
Models that require trust_remote_code=True are not supported by the script. E.g., I get the following error when trying to train Phi-1.5:
Error:
ErrorMessage "EOFError EOF when reading a line
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/ml/code/run_qlora.py", line 194, in <module>
main()
File "/opt/ml/code/run_qlora.py", line 190, in main
training_function(script_args, training_args)
File "/opt/ml/code/run_qlora.py", line 97, in training_function
model = AutoModelForCausalLM.from_pretrained(
File "/opt/conda/lib/python3.10/site-packages/transformers/models/auto/auto_factory.py", line 525, in from_pretrained
config, kwargs = AutoConfig.from_pretrained(
File "/opt/conda/lib/python3.10/site-packages/transformers/models/auto/configuration_auto.py", line 1037, in from_pretrained
trust_remote_code = resolve_trust_remote_code(
File "/opt/conda/lib/python3.10/site-packages/transformers/dynamic_module_utils.py", line 608, in resolve_trust_remote_code
raise ValueError(
ValueError: The repository for microsoft/phi-1_5 contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/microsoft/phi-1_5.
Please pass the argument `trust_remote_code=True` to allow custom code to be run."
======
The following is the estimator used:
# create the Estimator
# Reformatted from the single collapsed line, where the first inline `#`
# commented out every following argument and the closing parenthesis.
huggingface_estimator = HuggingFace(
    entry_point='run_qlora.py',            # train script
    source_dir='../scripts',               # directory which includes all the files needed for training
    instance_type='ml.g5.4xlarge',         # instance type used for the training job
    instance_count=1,                      # the number of instances used for training
    # Restored from the garbled `2246060`: the asterisks were presumably
    # stripped by markdown rendering, as the original comment spells out the factors.
    max_run=2 * 24 * 60 * 60,              # maximum runtime in seconds (days * hours * minutes * seconds)
    base_job_name=job_name,                # the name of the training job
    role=role,                             # IAM role used in training job to access AWS resources, e.g. S3
    volume_size=300,                       # the size of the EBS volume in GB
    transformers_version='4.28',           # the transformers version used in the training job
    pytorch_version='2.0',                 # the pytorch version used in the training job
    py_version='py310',                    # the python version used in the training job
    hyperparameters=hyperparameters,       # the hyperparameters passed to the training job
    environment={"HUGGINGFACE_HUB_CACHE": "/tmp/.cache1"},  # set env variable to cache models in /tmp
    disable_output_compression=True,       # do not compress output, to save training time and cost
    # NOTE(review): kept as reported. This does not appear to be a documented
    # HuggingFace estimator argument, so it presumably does not reach
    # `from_pretrained` in the training script — confirm against the SDK.
    trust_remote_code=True,
)