yongzx / multilingual-t0

Multilingual extension of T0

Integrating DeepSpeed for training mT5 with RTX 3090 #1

Open yongzx opened 2 years ago

yongzx commented 2 years ago

Specs

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Mon_Oct_12_20:09:46_PDT_2020
Cuda compilation tools, release 11.1, V11.1.105
Build cuda_11.1.TC455_06.29190527_0
--------------------------------------------------
DeepSpeed C++/CUDA extension op report
--------------------------------------------------
NOTE: Ops not installed will be just-in-time (JIT) compiled at
      runtime if needed. Op compatibility means that your system
      meet the required dependencies to JIT install the op.
--------------------------------------------------
JIT compiled ops requires ninja
ninja .................. [OKAY]
--------------------------------------------------
op name ................ installed .. compatible
--------------------------------------------------
cpu_adam ............... [NO] ....... [OKAY]
cpu_adagrad ............ [NO] ....... [OKAY]
fused_adam ............. [NO] ....... [OKAY]
fused_lamb ............. [NO] ....... [OKAY]
sparse_attn ............ [NO] ....... [OKAY]
transformer ............ [NO] ....... [OKAY]
stochastic_transformer . [NO] ....... [OKAY]
async_io ............... [NO] ....... [OKAY]
transformer_inference .. [NO] ....... [OKAY]
utils .................. [NO] ....... [OKAY]
quantizer .............. [NO] ....... [OKAY]
--------------------------------------------------
DeepSpeed general environment info:
torch install path ............... ['/gpfs/data/sbach/zyong2/mt0/env_mT0/lib/python3.7/site-packages/torch']
torch version .................... 1.10.1+cu113
torch cuda version ............... 11.3
nvcc version ..................... 11.1
deepspeed install path ........... ['/gpfs/data/sbach/zyong2/mt0/env_mT0/lib/python3.7/site-packages/deepspeed']
deepspeed info ................... 0.5.10, unknown, unknown
deepspeed wheel compiled w. ...... torch 1.10, cuda 11.3
Sat Jan 22 12:54:19 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  NVIDIA GeForce ...  Off  | 00000000:C1:00.0 Off |                  N/A |
| 30%   27C    P8    16W / 350W |      7MiB / 24268MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:E1:00.0 Off |                  N/A |
| 30%   25C    P8    23W / 350W |      7MiB / 24268MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|    0   N/A  N/A     50304      G   X                                   5MiB |
|    1   N/A  N/A     50304      G   X                                   5MiB |
+-----------------------------------------------------------------------------+
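
For reference, the diagnostics above come from the standard command-line tools and can be regenerated on the same node with:

nvcc --version    # CUDA compiler / toolkit version
ds_report         # DeepSpeed op-compatibility report and general environment info
nvidia-smi        # driver version, per-GPU memory and utilization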
yongzx commented 2 years ago

Script 1: How to increase max_input_length and max_target_length? How to use mt5-xl?

Working script for mt5-large using ds_config_zero3.json.

CUR_TIME=`date +%H%M_%m%d%Y`

deepspeed \
    /users/zyong2/data/zyong2/mt0/data/external/mt0/multilingual_t0/main.py \
    --deepspeed "/users/zyong2/data/zyong2/mt0/data/external/mt0/multilingual_t0/ds_config_zero3.json" \
    --model_name_or_path "google/mt5-large" \
    --cache_dir "/users/zyong2/data/zyong2/huggingface/mt5_xl" \
    --dataset_name "mc4" \
    --max_input_length 512 \
    --max_target_length 512 \
    --do_train \
    --preprocessing_num_workers 4 \
    --per_device_train_batch_size 1 \
    --gradient_accumulation 8 \
    --overwrite_output_dir \
    --output_dir "/users/zyong2/data/zyong2/mt0/data/processed/001/mt5_xxl" \
    --max_steps 100 \
    --per_device_eval_batch_size 1 \
    --do_eval \
    --evaluation_strategy "steps" \
    --eval_steps 10 \
    --logging_dir "/users/zyong2/data/zyong2/mt0/data/processed/001/runs/mt5_xxl_${CUR_TIME}"\
    --logging_strategy "steps" \
    --logging_steps 10 \
    --report_to "tensorboard"
ds_config_zero3.json:
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },

    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },

    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },

    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "offload_param": {
            "device": "cpu",
            "pin_memory": true
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e14,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_fp16_weights_on_model_save": true
    },

    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
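
On the mt5-xl question: before committing to a config, it can help to run DeepSpeed's ZeRO-3 memory estimator, which prints the expected per-GPU and CPU footprint of the parameters, gradients and optimizer states under the different offload options. A minimal sketch (the model name and GPU count are assumptions chosen to match the setup above; the estimator does not account for activation memory, which is what actually grows with max_input_length/max_target_length):

python -c 'from transformers import AutoModelForSeq2SeqLM; \
from deepspeed.runtime.zero.stage3 import estimate_zero3_model_states_mem_needs_all_live; \
model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-xl"); \
estimate_zero3_model_states_mem_needs_all_live(model, num_gpus_per_node=2, num_nodes=1)'

Note that this loads the full checkpoint on CPU just to count parameters, so it needs enough host RAM but no GPU.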
yongzx commented 2 years ago

2 V100s, mT5-XL with max_input_length/max_target_length 512/512 and ZeRO Stage 3

CUR_TIME=`date +%H%M_%m%d%Y`

deepspeed \
    /users/zyong2/data/zyong2/mt0/data/external/mt0/multilingual_t0/main.py \
    --model_name_or_path "google/mt5-xl" \
    --cache_dir "/users/zyong2/data/zyong2/huggingface/mt5_xl" \
    --dataset_name "mc4" \
    --max_input_length 512 \
    --max_target_length 512 \
    --do_train \
    --preprocessing_num_workers 4 \
    --per_device_train_batch_size 1 \
    --gradient_accumulation 8 \
    --overwrite_output_dir \
    --output_dir "/users/zyong2/data/zyong2/mt0/data/processed/001/mt5_xl" \
    --max_steps 100 \
    --per_device_eval_batch_size 1 \
    --do_eval \
    --evaluation_strategy "steps" \
    --eval_steps 10 \
    --logging_dir "/users/zyong2/data/zyong2/mt0/data/processed/001/runs/mt5_xl_${CUR_TIME}"\
    --logging_strategy "steps" \
    --logging_steps 10 \
    --report_to "tensorboard" \
    --deepspeed "/users/zyong2/data/zyong2/mt0/data/external/mt0/multilingual_t0/ds_config_zero3.json"
ds_config_zero3.json for this run (identical to the config above except sub_group_size is 1e9):
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },

    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },

    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },

    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "offload_param": {
            "device": "cpu",
            "pin_memory": true
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_fp16_weights_on_model_save": true
    },

    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
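
One note on checkpoints with this config: stage3_gather_fp16_weights_on_model_save set to true makes the Trainer write consolidated fp16 weights on save, and DeepSpeed also drops a zero_to_fp32.py helper into each checkpoint directory that can rebuild the full fp32 state dict from the ZeRO-3 shards offline. A sketch, with a hypothetical checkpoint folder (the actual step/path depends on the save settings in main.py):

cd /users/zyong2/data/zyong2/mt0/data/processed/001/mt5_xl/checkpoint-100   # hypothetical checkpoint dir
python zero_to_fp32.py . pytorch_model.bin   # consolidate the ZeRO-3 shards into a single fp32 state dict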