The process is stuck at this step:compiling and loading fused kernels ...

I‘m runing examples/run_deepspeed_example.sh modified as follow:

#!/bin/bash
set -ex

CUDA_VISBLE_DEVICES=0,2,3,4
BASE_PATH=./
DATA_PATH=./
DS_CONFIG=ds_config.json

TP=1
PP=1
NLAYERS=24
HIDDEN=512

GLOBAL_BATCH=32
MICRO_BATCH=4

ZERO_STAGE=1

OUTPUT_DIR=ds_z${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_gb${GLOBAL_BATCH}_mb${MICRO_BATCH}
#OUTPUT_DIR=baseline_nl${NLAYERS}_hs${HIDDEN}_gb${GLOBAL_BATCH}_mb${MICRO_BATCH}
mkdir -p $OUTPUT_DIR

cat <<EOT > $DS_CONFIG
{
  "train_batch_size" : $GLOBAL_BATCH,
  "train_micro_batch_size_per_gpu": $MICRO_BATCH,
  "steps_per_print": 1,

  "zero_optimization": {
    "stage": $ZERO_STAGE
  },

  "fp16": {
    "enabled": true,
    "initial_scale_power": 12
  },

  "wall_clock_breakdown" : true
}
EOT

export NCCL_DEBUG=warn

ds_args=""
ds_args=" --deepspeed ${ds_args}"
ds_args=" --no-pipeline-parallel ${ds_args}"
ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}"
ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}"
ds_args=" --deepspeed-activation-checkpointing ${ds_args}"

deepspeed pretrain_gpt.py \
    --tensor-model-parallel-size $TP \
    --pipeline-model-parallel-size $PP \
    --num-layers $NLAYERS \
    --hidden-size $HIDDEN \
    --num-attention-heads 16 \
    --seq-length 1024 \
    --loss-scale 12 \
    --max-position-embeddings 1024 \
    --micro-batch-size 4 \
    --global-batch-size 32 \
    --train-iters 1000 \
    --lr 6.0e-5 \
    --min-lr 6.0e-6 \
    --lr-decay-style cosine \
    --log-interval 1 \
    --eval-iters 40 \
    --eval-interval 1000 \
    --data-path $DATA_PATH \
    --vocab-file $BASE_PATH/gpt2-vocab.json \
    --merge-file $BASE_PATH/gpt2-merges.txt \
    --save-interval 1000 \
    --split 98,2,0 \
    --clip-grad 1.0 \
    --weight-decay 0.1 \

The output of the program is as follows, which seems to be stuck, and there is no new output for a long time：

+ CUDA_VISBLE_DEVICES=0,2,3,4
+ BASE_PATH=./
+ DATA_PATH=./
+ DS_CONFIG=ds_config.json
+ TP=1
+ PP=1
+ NLAYERS=24
+ HIDDEN=512
+ GLOBAL_BATCH=32
+ MICRO_BATCH=4
+ ZERO_STAGE=1
+ OUTPUT_DIR=ds_z1_nl24_hs512_gb32_mb4
+ mkdir -p ds_z1_nl24_hs512_gb32_mb4
+ cat
+ export NCCL_DEBUG=warn
+ NCCL_DEBUG=warn
+ ds_args=
+ ds_args=' --deepspeed '
+ ds_args=' --no-pipeline-parallel  --deepspeed '
+ ds_args=' --deepspeed_config=ds_config.json  --no-pipeline-parallel  --deepspeed '
+ ds_args=' --zero-stage=1  --deepspeed_config=ds_config.json  --no-pipeline-parallel  --deepspeed '
+ ds_args=' --deepspeed-activation-checkpointing  --zero-stage=1  --deepspeed_config=ds_config.json  --no-pipeline-parallel  --deepspeed '
+ deepspeed pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 24 --hidden-size 512 --num-attention-heads 16 --seq-length 1024 --loss-scale 12 --max-position-embeddings 1024 --micro-batch-size 4 --global-batch-size 32 --train-iters 1000 --lr 6.0e-5 --min-lr 6.0e-6 --lr-decay-style cosine --log-interval 1 --eval-iters 40 --eval-interval 1000 --data-path ./ --vocab-file .//gpt2-vocab.json --merge-file .//gpt2-merges.txt --save-interval 1000 --split 98,2,0 --clip-grad 1.0 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --fp16 --checkpoint-activations --tensorboard-dir ds_z1_nl24_hs512_gb32_mb4 --deepspeed-activation-checkpointing --zero-stage=1 --deepspeed_config=ds_config.json --no-pipeline-parallel --deepspeed --exit-interval 1000
+ tee ds_z1_nl24_hs512_gb32_mb4/output.log
[2022-11-10 09:45:41,839] [WARNING] [runner.py:159:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
[2022-11-10 09:45:42,016] [INFO] [runner.py:457:main] cmd = /opt/conda/bin/python3.8 -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMSwgMiwgMywgNCwgNSwgNiwgN119 --master_addr=127.0.0.1 --master_port=29500 pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 24 --hidden-size 512 --num-attention-heads 16 --seq-length 1024 --loss-scale 12 --max-position-embeddings 1024 --micro-batch-size 4 --global-batch-size 32 --train-iters 1000 --lr 6.0e-5 --min-lr 6.0e-6 --lr-decay-style cosine --log-interval 1 --eval-iters 40 --eval-interval 1000 --data-path ./ --vocab-file .//gpt2-vocab.json --merge-file .//gpt2-merges.txt --save-interval 1000 --split 98,2,0 --clip-grad 1.0 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --fp16 --checkpoint-activations --tensorboard-dir ds_z1_nl24_hs512_gb32_mb4 --deepspeed-activation-checkpointing --zero-stage=1 --deepspeed_config=ds_config.json --no-pipeline-parallel --deepspeed --exit-interval 1000
[2022-11-10 09:45:43,730] [INFO] [launch.py:96:main] 0 NCCL_VERSION=2.11.4
[2022-11-10 09:45:43,730] [INFO] [launch.py:96:main] 0 NCCL_DEBUG=warn
[2022-11-10 09:45:43,730] [INFO] [launch.py:103:main] WORLD INFO DICT: {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]}
[2022-11-10 09:45:43,730] [INFO] [launch.py:109:main] nnodes=1, num_local_procs=8, node_rank=0
[2022-11-10 09:45:43,730] [INFO] [launch.py:122:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]})
[2022-11-10 09:45:43,730] [INFO] [launch.py:123:main] dist_world_size=8
[2022-11-10 09:45:43,730] [INFO] [launch.py:125:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
--------------------------------------------------
DeepSpeed C++/CUDA extension op report
--------------------------------------------------
NOTE: Ops not installed will be just-in-time (JIT) compiled at
      runtime if needed. Op compatibility means that your system
      meet the required dependencies to JIT install the op.
--------------------------------------------------
JIT compiled ops requires ninja
ninja .................. [OKAY]
--------------------------------------------------
op name ................ installed .. compatible
--------------------------------------------------
cpu_adam ............... [NO] ....... [OKAY]
cpu_adagrad ............ [NO] ....... [OKAY]
fused_adam ............. [NO] ....... [OKAY]
fused_lamb ............. [NO] ....... [OKAY]
 [WARNING]  please install triton==1.0.0 if you want to use sparse attention
sparse_attn ............ [NO] ....... [NO]
transformer ............ [NO] ....... [OKAY]
stochastic_transformer . [NO] ....... [OKAY]
--------------------------------------------------
DeepSpeed C++/CUDA extension op report
--------------------------------------------------
NOTE: Ops not installed will be just-in-time (JIT) compiled at
      runtime if needed. Op compatibility means that your system
      meet the required dependencies to JIT install the op.
--------------------------------------------------
JIT compiled ops requires ninja
ninja .................. [OKAY]
--------------------------------------------------
op name ................ installed .. compatible
--------------------------------------------------
cpu_adam ............... [NO] ....... [OKAY]
cpu_adagrad ............ [NO] ....... [OKAY]
fused_adam ............. [NO] ....... [OKAY]
fused_lamb ............. [NO] ....... [OKAY]
 [WARNING]  please install triton==1.0.0 if you want to use sparse attention
sparse_attn ............ [NO] ....... [NO]
transformer ............ [NO] ....... [OKAY]
stochastic_transformer . [NO] ....... [OKAY]
async_io ............... [NO] ....... [OKAY]
utils .................. [NO] ....... [OKAY]
quantizer .............. [NO] ....... [OKAY]
transformer_inference .. [NO] ....... [OKAY]
--------------------------------------------------
DeepSpeed general environment info:
torch install path ............... ['/opt/conda/lib/python3.8/site-packages/torch']
torch version .................... 1.11.0a0+b6df043
torch cuda version ............... 11.5
torch hip version ................ None
nvcc version ..................... 11.5
deepspeed install path ........... ['/opt/conda/lib/python3.8/site-packages/deepspeed']
deepspeed info ................... 0.6.5, unknown, unknown
deepspeed wheel compiled w. ...... torch 1.11, cuda 11.5
async_io ............... [NO] ....... [OKAY]
utils .................. [NO] ....... [OKAY]
quantizer .............. [NO] ....... [OKAY]
transformer_inference .. [NO] ....... [OKAY]
--------------------------------------------------
fatal: not a git repository (or any of the parent directories): .git
**** Git info for Megatron: git_hash=unknown git_branch=unknown ****
DeepSpeed general environment info:
torch install path ............... ['/opt/conda/lib/python3.8/site-packages/torch']
torch version .................... 1.11.0a0+b6df043
torch cuda version ............... 11.5
torch hip version ................ None
nvcc version ..................... 11.5
deepspeed install path ........... ['/opt/conda/lib/python3.8/site-packages/deepspeed']
deepspeed info ................... 0.6.5, unknown, unknown
deepspeed wheel compiled w. ...... torch 1.11, cuda 11.5
using world size: 8, data-parallel-size: 8, tensor-model-parallel size: 1, pipeline-model-parallel size: 1 
using torch.float16 for parameters ...
------------------------ arguments ------------------------
  accumulate_allreduce_grads_in_fp32 .............. False
  adam_beta1 ...................................... 0.9
  adam_beta2 ...................................... 0.95
  adam_eps ........................................ 1e-08
  adlr_autoresume ................................. False
  adlr_autoresume_interval ........................ 1000
  aml_data_download_path .......................... None
  apply_query_key_layer_scaling ................... True
  apply_residual_connection_post_layernorm ........ False
  attention_dropout ............................... 0.1
  attention_softmax_in_fp32 ....................... False
  bert_binary_head ................................ True
  bert_load ....................................... None
  bf16 ............................................ False
  bias_dropout_fusion ............................. True
  bias_gelu_fusion ................................ True
  biencoder_projection_dim ........................ 0
  biencoder_shared_query_context_model ............ False
  block_data_path ................................. None
  checkpoint_activations .......................... True
  checkpoint_in_cpu ............................... False
  checkpoint_num_layers ........................... 1
  clip_grad ....................................... 1.0
  compression_training ............................ False
  consumed_train_samples .......................... 0
  consumed_train_tokens ........................... 0
  consumed_valid_samples .......................... 0
  contigious_checkpointing ........................ False
  cpu_optimizer ................................... False
  cpu_torch_adam .................................. False
  create_moe_param_group .......................... False
  curriculum_learning ............................. False
  custom_token_counting ........................... False
  data_impl ....................................... infer
  data_parallel_size .............................. 8
  data_path ....................................... ['./']
  dataloader_type ................................. single
  DDP_impl ........................................ local
  decoder_seq_length .............................. None
  deepscale ....................................... False
  deepscale_config ................................ None
  deepspeed ....................................... True
  deepspeed_activation_checkpointing .............. True
  deepspeed_config ................................ ds_config.json
  deepspeed_mpi ................................... False
  distribute_checkpointed_activations ............. False
  distributed_backend ............................. nccl
  ds_inference .................................... False
  ds_pipeline_enabled ............................. False
  embedding_path .................................. None
  enable_expert_tensor_parallelism ................ False
  encoder_seq_length .............................. 1024
  eod_mask_loss ................................... False
  eval_interval ................................... 1000
  eval_iters ...................................... 40
  evidence_data_path .............................. None
  exit_duration_in_mins ........................... None
  exit_interval ................................... 1000
  expert_interval ................................. 2
  ffn_hidden_size ................................. 2048
  finetune ........................................ False
  fp16 ............................................ True
  fp16_lm_cross_entropy ........................... False
  fp32_residual_connection ........................ False
  global_batch_size ............................... 32
  hidden_dropout .................................. 0.1
  hidden_size ..................................... 512
  hidden_size_teacher ............................. None
  hysteresis ...................................... 2
  ict_head_size ................................... None
  ict_load ........................................ None
  img_dim ......................................... 224
  indexer_batch_size .............................. 128
  indexer_log_interval ............................ 1000
  inference ....................................... False
  init_method_std ................................. 0.006
  init_method_xavier_uniform ...................... False
  initial_loss_scale .............................. 4294967296
  kd .............................................. False
  kd_alpha_ce ..................................... 1
  kd_beta_ce ...................................... 1
  kd_temp ......................................... 1.0
  kv_channels ..................................... 32
  layernorm_epsilon ............................... 1e-05
  lazy_mpu_init ................................... None
  load ............................................ None
  load_teacher .................................... None
  local_rank ...................................... 0
  log_batch_size_to_tensorboard ................... False
  log_interval .................................... 1
  log_learning_rate_to_tensorboard ................ True
  log_loss_scale_to_tensorboard ................... True
  log_num_zeros_in_grad ........................... False
  log_optimizer_states_to_tensorboard ............. False
  log_params_norm ................................. False
  log_timers_to_tensorboard ....................... False
  log_validation_ppl_to_tensorboard ............... False
  loss_scale ...................................... 12.0
  loss_scale_window ............................... 1000
  lr .............................................. 6e-05
  lr_decay_iters .................................. None
  lr_decay_samples ................................ None
  lr_decay_style .................................. cosine
  lr_decay_tokens ................................. None
  lr_warmup_fraction .............................. None
  lr_warmup_iters ................................. 0
  lr_warmup_samples ............................... 0
  lr_warmup_tokens ................................ None
  make_vocab_size_divisible_by .................... 128
  mask_prob ....................................... 0.15
  masked_softmax_fusion ........................... True
  max_position_embeddings ......................... 1024
  memory_centric_tiled_linear ..................... False
  merge_file ...................................... .//gpt2-merges.txt
  micro_batch_size ................................ 4
  min_loss_scale .................................. 1.0
  min_lr .......................................... 6e-06
  mlp_type ........................................ standard
  mmap_warmup ..................................... False
  moe_eval_capacity_factor ........................ 1.0
  moe_expert_parallel_size ........................ 1
  moe_loss_coeff .................................. 0.1
  moe_min_capacity ................................ 4
  moe_token_dropping .............................. True
  moe_train_capacity_factor ....................... 1.0
  mos ............................................. False
  no_load_lr_state ................................ False
  no_load_optim ................................... None
  no_load_rng ..................................... None
  no_pipeline_parallel ............................ True
  no_save_optim ................................... None
  no_save_rng ..................................... None
  num_attention_heads ............................. 16
  num_attention_heads_teacher ..................... None
  num_channels .................................... 3
  num_classes ..................................... 1000
  num_experts ..................................... [1]
  num_experts_teacher ............................. [1]
  num_layers ...................................... 24
  num_layers_per_virtual_pipeline_stage ........... None
  num_layers_teacher .............................. None
  num_workers ..................................... 2
  onnx_safe ....................................... None
  openai_gelu ..................................... False
  optimizer ....................................... adam
  override_lr_scheduler ........................... False
  params_dtype .................................... torch.float16
  partition_activations ........................... False
  patch_dim ....................................... 16
  pipeline_model_parallel_size .................... 1
  profile_backward ................................ False
  query_in_block_prob ............................. 0.1
  rampup_batch_size ............................... None
  rank ............................................ 0
  remote_device ................................... none
  reset_attention_mask ............................ False
  reset_iteration ................................. False
  reset_position_ids .............................. False
  retriever_report_topk_accuracies ................ []
  retriever_score_scaling ......................... False
  retriever_seq_length ............................ 256
  sample_rate ..................................... 1.0
  save ............................................ None
  save_interval ................................... 1000
  scatter_gather_tensors_in_pipeline .............. True
  scattered_embeddings ............................ False
  seed ............................................ 1234
  seq_length ...................................... 1024
  sgd_momentum .................................... 0.9
  short_seq_prob .................................. 0.1
  split ........................................... 98,2,0
  split_transformers .............................. False
  synchronize_each_layer .......................... False
  tensor_model_parallel_size ...................... 1
  tensorboard_dir ................................. ds_z1_nl24_hs512_gb32_mb4
  tensorboard_log_interval ........................ 1
  tensorboard_queue_size .......................... 1000
  tile_factor ..................................... 1
  titles_data_path ................................ None
  tokenizer_type .................................. GPT2BPETokenizer
  topk ............................................ 1
  train_iters ..................................... 1000
  train_samples ................................... None
  train_tokens .................................... None
  use_checkpoint_lr_scheduler ..................... False
  use_contiguous_buffers_in_ddp ................... False
  use_cpu_initialization .......................... None
  use_one_sent_docs ............................... False
  use_pin_memory .................................. False
  use_tutel ....................................... False
  virtual_pipeline_model_parallel_size ............ None
  vocab_extra_ids ................................. 0
  vocab_file ...................................... .//gpt2-vocab.json
  weight_decay .................................... 0.1
  world_size ...................................... 8
  zero_allgather_bucket_size ...................... 0.0
  zero_contigious_gradients ....................... False
  zero_reduce_bucket_size ......................... 0.0
  zero_reduce_scatter ............................. False
  zero_stage ...................................... 1
-------------------- end of arguments ---------------------
setting number of micro-batches to constant 1
> building GPT2BPETokenizer tokenizer ...
fatal: not a git repository (or any of the parent directories): .git
**** Git info for Megatron: git_hash=unknown git_branch=unknown ****
 > padded vocab (size: 50257) with 47 dummy tokens (new size: 50304)
> initializing torch distributed ...
[2022-11-10 09:45:46,590] [INFO] [distributed.py:48:init_distributed] Initializing torch distributed with backend: nccl
--------------------------------------------------
DeepSpeed C++/CUDA extension op report
--------------------------------------------------
NOTE: Ops not installed will be just-in-time (JIT) compiled at
      runtime if needed. Op compatibility means that your system
      meet the required dependencies to JIT install the op.
--------------------------------------------------
JIT compiled ops requires ninja
ninja .................. [OKAY]
--------------------------------------------------
op name ................ installed .. compatible
--------------------------------------------------
cpu_adam ............... [NO] ....... [OKAY]
cpu_adagrad ............ [NO] ....... [OKAY]
fused_adam ............. [NO] ....... [OKAY]
fused_lamb ............. [NO] ....... [OKAY]
 [WARNING]  please install triton==1.0.0 if you want to use sparse attention
sparse_attn ............ [NO] ....... [NO]
transformer ............ [NO] ....... [OKAY]
stochastic_transformer . [NO] ....... [OKAY]
async_io ............... [NO] ....... [OKAY]
utils .................. [NO] ....... [OKAY]
quantizer .............. [NO] ....... [OKAY]
transformer_inference .. [NO] ....... [OKAY]
--------------------------------------------------
DeepSpeed general environment info:
torch install path ............... ['/opt/conda/lib/python3.8/site-packages/torch']
torch version .................... 1.11.0a0+b6df043
torch cuda version ............... 11.5
torch hip version ................ None
nvcc version ..................... 11.5
deepspeed install path ........... ['/opt/conda/lib/python3.8/site-packages/deepspeed']
deepspeed info ................... 0.6.5, unknown, unknown
deepspeed wheel compiled w. ...... torch 1.11, cuda 11.5
fatal: not a git repository (or any of the parent directories): .git
**** Git info for Megatron: git_hash=unknown git_branch=unknown ****
--------------------------------------------------
DeepSpeed C++/CUDA extension op report
--------------------------------------------------
NOTE: Ops not installed will be just-in-time (JIT) compiled at
      runtime if needed. Op compatibility means that your system
      meet the required dependencies to JIT install the op.
--------------------------------------------------
JIT compiled ops requires ninja
ninja .................. [OKAY]
--------------------------------------------------
op name ................ installed .. compatible
--------------------------------------------------
cpu_adam ............... [NO] ....... [OKAY]
cpu_adagrad ............ [NO] ....... [OKAY]
fused_adam ............. [NO] ....... [OKAY]
fused_lamb ............. [NO] ....... [OKAY]
 [WARNING]  please install triton==1.0.0 if you want to use sparse attention
sparse_attn ............ [NO] ....... [NO]
transformer ............ [NO] ....... [OKAY]
stochastic_transformer . [NO] ....... [OKAY]
async_io ............... [NO] ....... [OKAY]
utils .................. [NO] ....... [OKAY]
quantizer .............. [NO] ....... [OKAY]
transformer_inference .. [NO] ....... [OKAY]
--------------------------------------------------
DeepSpeed general environment info:
torch install path ............... ['/opt/conda/lib/python3.8/site-packages/torch']
torch version .................... 1.11.0a0+b6df043
torch cuda version ............... 11.5
torch hip version ................ None
nvcc version ..................... 11.5
deepspeed install path ........... ['/opt/conda/lib/python3.8/site-packages/deepspeed']
deepspeed info ................... 0.6.5, unknown, unknown
deepspeed wheel compiled w. ...... torch 1.11, cuda 11.5
fatal: not a git repository (or any of the parent directories): .git
**** Git info for Megatron: git_hash=unknown git_branch=unknown ****
--------------------------------------------------
DeepSpeed C++/CUDA extension op report
--------------------------------------------------
NOTE: Ops not installed will be just-in-time (JIT) compiled at
      runtime if needed. Op compatibility means that your system
      meet the required dependencies to JIT install the op.
--------------------------------------------------
JIT compiled ops requires ninja
ninja .................. [OKAY]
--------------------------------------------------
op name ................ installed .. compatible
--------------------------------------------------
cpu_adam ............... [NO] ....... [OKAY]
cpu_adagrad ............ [NO] ....... [OKAY]
fused_adam ............. [NO] ....... [OKAY]
fused_lamb ............. [NO] ....... [OKAY]
 [WARNING]  please install triton==1.0.0 if you want to use sparse attention
sparse_attn ............ [NO] ....... [NO]
transformer ............ [NO] ....... [OKAY]
stochastic_transformer . [NO] ....... [OKAY]
async_io ............... [NO] ....... [OKAY]
utils .................. [NO] ....... [OKAY]
quantizer .............. [NO] ....... [OKAY]
transformer_inference .. [NO] ....... [OKAY]
--------------------------------------------------
--------------------------------------------------
DeepSpeed C++/CUDA extension op report
--------------------------------------------------
NOTE: Ops not installed will be just-in-time (JIT) compiled at
      runtime if needed. Op compatibility means that your system
      meet the required dependencies to JIT install the op.
--------------------------------------------------
JIT compiled ops requires ninja
ninja .................. [OKAY]
--------------------------------------------------
op name ................ installed .. compatible
--------------------------------------------------
cpu_adam ............... [NO] ....... [OKAY]
cpu_adagrad ............ [NO] ....... [OKAY]
fused_adam ............. [NO] ....... [OKAY]
fused_lamb ............. [NO] ....... [OKAY]
 [WARNING]  please install triton==1.0.0 if you want to use sparse attention
sparse_attn ............ [NO] ....... [NO]
transformer ............ [NO] ....... [OKAY]
stochastic_transformer . [NO] ....... [OKAY]
--------------------------------------------------DeepSpeed general environment info:

DeepSpeed C++/CUDA extension op report
--------------------------------------------------
NOTE: Ops not installed will be just-in-time (JIT) compiled at
      runtime if needed. Op compatibility means that your system
      meet the required dependencies to JIT install the op.
torch install path--------------------------------------------------
JIT compiled ops requires ninja
 ............... ['/opt/conda/lib/python3.8/site-packages/torch']
torch version .................... 1.11.0a0+b6df043
torch cuda version ............... 11.5
torch hip version ................ None
nvcc version ..................... 11.5
deepspeed install path ........... ['/opt/conda/lib/python3.8/site-packages/deepspeed']
deepspeed info ................... 0.6.5, unknown, unknown
deepspeed wheel compiled w. ...... torch 1.11, cuda 11.5
ninja .................. [OKAY]
--------------------------------------------------
op name ................ installed .. compatible
--------------------------------------------------
cpu_adam ............... [NO] ....... [OKAY]
cpu_adagrad ............ [NO] ....... [OKAY]
fused_adam ............. [NO] ....... [OKAY]
fused_lamb ............. [NO] ....... [OKAY]
 [WARNING]  please install triton==1.0.0 if you want to use sparse attention
sparse_attn ............ [NO] ....... [NO]
transformer ............ [NO] ....... [OKAY]
stochastic_transformer . [NO] ....... [OKAY]
--------------------------------------------------
DeepSpeed C++/CUDA extension op report
--------------------------------------------------
NOTE: Ops not installed will be just-in-time (JIT) compiled at
      runtime if needed. Op compatibility means that your system
      meet the required dependencies to JIT install the op.
--------------------------------------------------
JIT compiled ops requires ninja
ninja .................. [OKAY]
--------------------------------------------------
op name ................ installed .. compatible
--------------------------------------------------
cpu_adam ............... [NO] ....... [OKAY]
cpu_adagrad ............ [NO] ....... [OKAY]
fused_adam ............. [NO] ....... [OKAY]
fused_lamb ............. [NO] ....... [OKAY]
 [WARNING]  please install triton==1.0.0 if you want to use sparse attention
sparse_attn ............ [NO] ....... [NO]
transformer ............ [NO] ....... [OKAY]
stochastic_transformer . [NO] ....... [OKAY]
fatal: not a git repository (or any of the parent directories): .git
**** Git info for Megatron: git_hash=unknown git_branch=unknown ****
async_io ............... [NO] ....... [OKAY]
utils .................. [NO] ....... [OKAY]
quantizer .............. [NO] ....... [OKAY]
transformer_inference .. [NO] ....... [OKAY]
--------------------------------------------------
async_io ............... [NO] ....... [OKAY]
utils .................. [NO] ....... [OKAY]
quantizer .............. [NO] ....... [OKAY]
transformer_inference .. [NO] ....... [OKAY]
--------------------------------------------------
DeepSpeed general environment info:
torch install path ............... ['/opt/conda/lib/python3.8/site-packages/torch']
torch version .................... 1.11.0a0+b6df043
torch cuda version ............... 11.5
torch hip version ................ None
nvcc version ..................... 11.5
deepspeed install path ........... ['/opt/conda/lib/python3.8/site-packages/deepspeed']
deepspeed info ................... 0.6.5, unknown, unknown
deepspeed wheel compiled w. ...... torch 1.11, cuda 11.5
DeepSpeed general environment info:
torch install path ............... ['/opt/conda/lib/python3.8/site-packages/torch']
torch version .................... 1.11.0a0+b6df043
torch cuda version ............... 11.5
torch hip version ................ None
nvcc version ..................... 11.5
deepspeed install path ........... ['/opt/conda/lib/python3.8/site-packages/deepspeed']
deepspeed info ................... 0.6.5, unknown, unknown
deepspeed wheel compiled w. ...... torch 1.11, cuda 11.5
async_io ............... [NO] ....... [OKAY]
utils .................. [NO] ....... [OKAY]
quantizer .............. [NO] ....... [OKAY]
transformer_inference .. [NO] ....... [OKAY]
--------------------------------------------------
DeepSpeed general environment info:
torch install path ............... ['/opt/conda/lib/python3.8/site-packages/torch']
torch version .................... 1.11.0a0+b6df043
torch cuda version ............... 11.5
torch hip version ................ None
nvcc version ..................... 11.5
deepspeed install path ........... ['/opt/conda/lib/python3.8/site-packages/deepspeed']
deepspeed info ................... 0.6.5, unknown, unknown
deepspeed wheel compiled w. ...... torch 1.11, cuda 11.5
fatal: not a git repository (or any of the parent directories): .git
**** Git info for Megatron: git_hash=unknown git_branch=unknown ****
fatal: not a git repository (or any of the parent directories): .git
**** Git info for Megatron: git_hash=unknown git_branch=unknown ****
fatal: not a git repository (or any of the parent directories): .git
**** Git info for Megatron: git_hash=unknown git_branch=unknown ****
> setting tensorboard ...
> initializing tensor model parallel with size 1
> initializing pipeline model parallel with size 1
> setting random seeds to 1234 ...
[2022-11-10 09:45:47,963] [INFO] [checkpointing.py:226:model_parallel_cuda_manual_seed] > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234
> compiling dataset index builder ...
make: Entering directory '/workspace/Megatron-DeepSpeed/megatron/data'
make: Nothing to be done for 'default'.
make: Leaving directory '/workspace/Megatron-DeepSpeed/megatron/data'
>>> done with dataset index builder. Compilation time: 0.091 seconds
> compiling and loading fused kernels ...

So I terminated the process. The error message when terminating is as follows. It seems that the program is stuck in the initialization of C++extension:

^CTraceback (most recent call last):
  File "pretrain_gpt.py", line 294, in <module>
--- Logging error ---
    pretrain(train_valid_test_datasets_provider, model_provider, forward_step,
  File "/workspace/Megatron-DeepSpeed/megatron/training.py", line 98, in pretrain
    initialize_megatron(extra_args_provider=extra_args_provider,
  File "/workspace/Megatron-DeepSpeed/megatron/initialize.py", line 89, in initialize_megatron
    _compile_dependencies()
  File "/workspace/Megatron-DeepSpeed/megatron/initialize.py", line 137, in _compile_dependencies
    fused_kernels.load(args)
  File "/workspace/Megatron-DeepSpeed/megatron/fused_kernels/__init__.py", line 88, in load
    scaled_upper_triang_masked_softmax_cuda = _cpp_extention_load_helper(
  File "/workspace/Megatron-DeepSpeed/megatron/fused_kernels/__init__.py", line 56, in _cpp_extention_load_helper
    return cpp_extension.load(
  File "/opt/conda/lib/python3.8/site-packages/torch/utils/cpp_extension.py", line 1125, in load
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/logging/__init__.py", line 1088, in emit
    stream.write(msg + self.terminator)
BrokenPipeError: [Errno 32] Broken pipe
Call stack:
  File "/opt/conda/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/opt/conda/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/launcher/launch.py", line 218, in <module>
    main()
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/launcher/launch.py", line 214, in main
    time.sleep(1)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/launcher/launch.py", line 178, in sigkill_handler
    logger.info(f"Killing subprocess {process.pid}")
Message: 'Killing subprocess 15854'
Arguments: ()
    --- Logging error ---
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/logging/__init__.py", line 1088, in emit
    stream.write(msg + self.terminator)
BrokenPipeError: [Errno 32] Broken pipe
Call stack:
  File "/opt/conda/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/opt/conda/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/launcher/launch.py", line 218, in <module>
    main()
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/launcher/launch.py", line 214, in main
    time.sleep(1)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/launcher/launch.py", line 178, in sigkill_handler
    logger.info(f"Killing subprocess {process.pid}")
Message: 'Killing subprocess 15855'
Arguments: ()
--- Logging error ---
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/logging/__init__.py", line 1088, in emit
    stream.write(msg + self.terminator)
BrokenPipeError: [Errno 32] Broken pipe
Call stack:
  File "/opt/conda/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/opt/conda/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/launcher/launch.py", line 218, in <module>
    main()
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/launcher/launch.py", line 214, in main
    time.sleep(1)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/launcher/launch.py", line 178, in sigkill_handler
    logger.info(f"Killing subprocess {process.pid}")
Message: 'Killing subprocess 15856'
Arguments: ()
--- Logging error ---
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/logging/__init__.py", line 1088, in emit
    stream.write(msg + self.terminator)
BrokenPipeError: [Errno 32] Broken pipe
Call stack:
  File "/opt/conda/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/opt/conda/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/launcher/launch.py", line 218, in <module>
    main()
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/launcher/launch.py", line 214, in main
    time.sleep(1)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/launcher/launch.py", line 178, in sigkill_handler
    logger.info(f"Killing subprocess {process.pid}")
Message: 'Killing subprocess 15857'
Arguments: ()
--- Logging error ---
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/logging/__init__.py", line 1088, in emit
    stream.write(msg + self.terminator)
BrokenPipeError: [Errno 32] Broken pipe
Call stack:
  File "/opt/conda/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/opt/conda/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/launcher/launch.py", line 218, in <module>
    main()
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/launcher/launch.py", line 214, in main
    time.sleep(1)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/launcher/launch.py", line 178, in sigkill_handler
    logger.info(f"Killing subprocess {process.pid}")
Message: 'Killing subprocess 15858'
Arguments: ()
--- Logging error ---
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/logging/__init__.py", line 1088, in emit
    stream.write(msg + self.terminator)
BrokenPipeError: [Errno 32] Broken pipe
Call stack:
  File "/opt/conda/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/opt/conda/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/launcher/launch.py", line 218, in <module>
    main()
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/launcher/launch.py", line 214, in main
    time.sleep(1)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/launcher/launch.py", line 178, in sigkill_handler
    logger.info(f"Killing subprocess {process.pid}")
Message: 'Killing subprocess 15859'
Arguments: ()
--- Logging error ---
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/logging/__init__.py", line 1088, in emit
    stream.write(msg + self.terminator)
BrokenPipeError: [Errno 32] Broken pipe
Call stack:
  File "/opt/conda/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/opt/conda/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/launcher/launch.py", line 218, in <module>
    main()
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/launcher/launch.py", line 214, in main
    time.sleep(1)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/launcher/launch.py", line 178, in sigkill_handler
    logger.info(f"Killing subprocess {process.pid}")
Message: 'Killing subprocess 15860'
Arguments: ()
--- Logging error ---
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/logging/__init__.py", line 1088, in emit
    stream.write(msg + self.terminator)
BrokenPipeError: [Errno 32] Broken pipe
Call stack:
  File "/opt/conda/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/opt/conda/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/launcher/launch.py", line 218, in <module>
    main()
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/launcher/launch.py", line 214, in main
    time.sleep(1)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/launcher/launch.py", line 178, in sigkill_handler
    logger.info(f"Killing subprocess {process.pid}")
Message: 'Killing subprocess 15861'
Arguments: ()
--- Logging error ---
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/logging/__init__.py", line 1088, in emit
    stream.write(msg + self.terminator)
BrokenPipeError: [Errno 32] Broken pipe
Call stack:
  File "/opt/conda/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/opt/conda/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/launcher/launch.py", line 218, in <module>
    main()
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/launcher/launch.py", line 214, in main
    time.sleep(1)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/launcher/launch.py", line 187, in sigkill_handler
    logger.info(f"Main process received {sig_names[signum]}, exiting")
Message: 'Main process received SIGINT, exiting'
Arguments: ()
Traceback (most recent call last):
  File "/opt/conda/bin/deepspeed", line 6, in <module>
    main()
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/launcher/runner.py", line 460, in main
    result.wait()
  File "/opt/conda/lib/python3.8/subprocess.py", line 1083, in wait
    return self._wait(timeout=timeout)
  File "/opt/conda/lib/python3.8/subprocess.py", line 1806, in _wait
    (pid, sts) = self._try_wait(0)
  File "/opt/conda/lib/python3.8/subprocess.py", line 1764, in _try_wait
    (pid, sts) = os.waitpid(self.pid, wait_flags)
KeyboardInterrupt

microsoft / Megatron-DeepSpeed

The process is stuck at this step:compiling and loading fused kernels ... #93