I use the following command to create the dataset:
```bash
python tools/preprocess_data.py \
    --input /cobol/gpt2/data \
    --output-prefix /cobol/data_preprocess \
    --vocab-file /cobol/gpt2/vocab.json \
    --dataset-impl mmap \
    --tokenizer-type GPT2BPETokenizer \
    --merge-file /cobol/gpt2/merges.txt \
    --json-key content \
    --workers 32 \
    --chunk-size 25 \
    --append-eod
```
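For context, preprocessing with `--dataset-impl mmap` should leave an indexed dataset as a `.bin`/`.idx` pair named from the output prefix and the JSON key (a sketch of what I expect on disk, assuming Megatron-LM's usual `<output-prefix>_<json-key>_document` naming; the paths below are derived from the flags above, and the `--data-path` used later points at such a prefix without any extension):

```bash
# Expected preprocessing output (assumed naming, derived from --output-prefix and --json-key):
ls -lh /cobol/data_preprocess_content_document.bin \
       /cobol/data_preprocess_content_document.idx
```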
Then I use the following script to pretrain:
```bash
#!/bin/bash

# Runs the "345M" parameter model
export CUDA_DEVICE_MAX_CONNECTIONS=1

CHECKPOINT_PATH=/cobol/gpt2/checkpoint
VOCAB_FILE=/cobol/vocab_file/vocab.json
MERGE_FILE=/cobol/gpt2/merges.txt
DATA_PATH=/cobol/gpt2/data_document/data_preprocess_content_document

GPT_ARGS="
    --num-layers 24 \
    --hidden-size 1024 \
    --num-attention-heads 16 \
    --attention-head-type multihead \
    --seq-length 1024 \
    --max-position-embeddings 1024 \
    --micro-batch-size 4 \
    --global-batch-size 8 \
    --lr 0.00015 \
    --train-iters 500000 \
    --lr-decay-iters 320000 \
    --lr-decay-style cosine \
    --min-lr 1.0e-5 \
    --weight-decay 1e-2 \
    --lr-warmup-fraction .01 \
    --clip-grad 1.0 \
    --fp16
"

DATA_ARGS="
    --data-path $DATA_PATH \
    --vocab-file $VOCAB_FILE \
    --merge-file $MERGE_FILE \
    --data-impl mmap \
    --split 949,50,1
"

OUTPUT_ARGS="
    --log-interval 100 \
    --save-interval 10000 \
    --eval-interval 1000 \
    --eval-iters 10
"

torchrun pretrain_gpt.py \
    $GPT_ARGS \
    $DATA_ARGS \
    $OUTPUT_ARGS \
    --save $CHECKPOINT_PATH \
    --load $CHECKPOINT_PATH
```
But I get this KeyError:

```
using world size: 1, data-parallel-size: 1, tensor-model-parallel size: 1, pipeline-model-parallel size: 1 using torch.float16 for parameters ... ------------------------ arguments ------------------------ accumulate_allreduce_grads_in_fp32 .............. False adam_beta1 ...................................... 0.9 adam_beta2 ...................................... 0.999 adam_eps ........................................ 1e-08 add_bias_linear ................................. True add_position_embedding .......................... True adlr_autoresume ................................. False adlr_autoresume_interval ........................ 1000 apply_layernorm_1p .............................. False apply_query_key_layer_scaling ................... True apply_residual_connection_post_layernorm ........ False async_tensor_model_parallel_allreduce ........... True attention_dropout ............................... 0.1 attention_head_type ............................. multihead attention_softmax_in_fp32 ....................... False barrier_with_L1_time ............................ True bert_binary_head ................................ True bert_embedder_type .............................. megatron bert_load ....................................... None bf16 ............................................ False bias_dropout_fusion ............................. True bias_gelu_fusion ................................ True biencoder_projection_dim ........................ 0 biencoder_shared_query_context_model ............ False block_data_path ................................. None classes_fraction ................................ 1.0 clip_grad ....................................... 1.0 consumed_train_samples .......................... 0 consumed_valid_samples .......................... 0 data_impl ....................................... mmap data_parallel_random_init ....................... False data_parallel_size .............................. 1 data_path ....................................... ['/cobol/gpt2/data_document/data_preprocess_content_document'] data_per_class_fraction ......................... 1.0 data_sharding ................................... True dataloader_type ................................. single DDP_impl ........................................ local decoder_num_layers .............................. None decoder_seq_length .............................. None dino_bottleneck_size ............................ 256 dino_freeze_last_layer .......................... 1 dino_head_hidden_size ........................... 2048 dino_local_crops_number ......................... 10 dino_local_img_size ............................. 96 dino_norm_last_layer ............................ False dino_teacher_temp ............................... 0.07 dino_warmup_teacher_temp ........................ 0.04 dino_warmup_teacher_temp_epochs ................. 30 distribute_saved_activations .................... False distributed_backend ............................. nccl distributed_timeout ............................. 600 distributed_timeout_minutes ..................... 10 embedding_path .................................. None empty_unused_memory_level ....................... 0 encoder_num_layers .............................. 24 encoder_seq_length .............................. 1024 end_weight_decay ................................ 0.01 eod_mask_loss ................................... False eval_interval ................................... 1000 eval_iters ...................................... 
10 evidence_data_path .............................. None exit_duration_in_mins ........................... None exit_interval ................................... None exit_on_missing_checkpoint ...................... False exit_signal_handler ............................. False ffn_hidden_size ................................. 4096 fim_rate ........................................ 0.0 fim_split_sample ................................ None fim_spm_rate .................................... 0.5 finetune ........................................ False fp16 ............................................ True fp16_lm_cross_entropy ........................... False fp32_residual_connection ........................ False fp8_amax_compute_algo ........................... most_recent fp8_amax_history_len ............................ 1 fp8_e4m3 ........................................ False fp8_hybrid ...................................... False fp8_interval .................................... 1 fp8_margin ...................................... 0 fp8_wgrad ....................................... True fragment_fim_rate ............................... 0.5 global_batch_size ............................... 8 glu_activation .................................. None gradient_accumulation_fusion .................... True head_lr_mult .................................... 1.0 hidden_dropout .................................. 0.1 hidden_size ..................................... 1024 hysteresis ...................................... 2 ict_head_size ................................... None ict_load ........................................ None img_h ........................................... 224 img_w ........................................... 224 indexer_batch_size .............................. 128 indexer_log_interval ............................ 1000 inference_batch_times_seqlen_threshold .......... 512 init_method_std ................................. 0.02 init_method_xavier_uniform ...................... False initial_loss_scale .............................. 4294967296 iter_per_epoch .................................. 1250 kv_channels ..................................... 64 layernorm_epsilon ............................... 1e-05 lazy_mpu_init ................................... None load ............................................ /cobol/gpt2/checkpoint local_rank ...................................... 0 log_batch_size_to_tensorboard ................... False log_interval .................................... 100 log_learning_rate_to_tensorboard ................ True log_loss_scale_to_tensorboard ................... True log_memory_to_tensorboard ....................... False log_num_zeros_in_grad ........................... False log_params_norm ................................. False log_timers_to_tensorboard ....................... False log_validation_ppl_to_tensorboard ............... False log_world_size_to_tensorboard ................... False loss_scale ...................................... None loss_scale_window ............................... 1000 lr .............................................. 0.00015 lr_decay_iters .................................. 320000 lr_decay_samples ................................ None lr_decay_style .................................. cosine lr_warmup_fraction .............................. 0.01 lr_warmup_iters ................................. 0 lr_warmup_samples ............................... 0 make_vocab_size_divisible_by .................... 
128 mask_factor ..................................... 1.0 mask_prob ....................................... 0.15 mask_type ....................................... random masked_softmax_fusion ........................... True max_position_embeddings ......................... 1024 max_tokens_to_oom ............................... 12000 merge_file ...................................... /cobol/gpt2/merges.txt micro_batch_size ................................ 4 min_loss_scale .................................. 1.0 min_lr .......................................... 1e-05 mmap_warmup ..................................... False no_load_optim ................................... None no_load_rng ..................................... None no_persist_layer_norm ........................... False no_save_optim ................................... None no_save_rng ..................................... None num_attention_heads ............................. 16 num_channels .................................... 3 num_classes ..................................... 1000 num_experts ..................................... None num_layers ...................................... 24 num_layers_per_virtual_pipeline_stage ........... None num_workers ..................................... 2 onnx_safe ....................................... None openai_gelu ..................................... False optimizer ....................................... adam output_bert_embeddings .......................... False override_opt_param_scheduler .................... False params_dtype .................................... torch.float16 patch_dim ....................................... 16 perform_initialization .......................... True pipeline_model_parallel_size .................... 1 pipeline_model_parallel_split_rank .............. None position_embedding_type ......................... PositionEmbeddingType.absolute query_in_block_prob ............................. 0.1 rampup_batch_size ............................... None rank ............................................ 0 recompute_granularity ........................... None recompute_method ................................ None recompute_num_layers ............................ 1 reset_attention_mask ............................ False reset_position_ids .............................. False retriever_report_topk_accuracies ................ [] retriever_score_scaling ......................... False retriever_seq_length ............................ 256 retro_add_retriever ............................. False retro_cyclic_train_iters ........................ None retro_encoder_attention_dropout ................. 0.1 retro_encoder_hidden_dropout .................... 0.1 retro_encoder_layers ............................ 2 retro_num_neighbors ............................. 2 retro_num_retrieved_chunks ...................... 2 retro_return_doc_ids ............................ False retro_workdir ................................... None rotary_percent .................................. 1.0 rotary_theta .................................... 10000 sample_rate ..................................... 1.0 sanity_check_dataloader_interval ................ None save ............................................ /cobol/gpt2/checkpoint save_interval ................................... 10000 scatter_gather_tensors_in_pipeline .............. True seed ............................................ 1234 seq_length ...................................... 
1024 sequence_parallel ............................... False sgd_momentum .................................... 0.9 short_seq_prob .................................. 0.1 split ........................................... 949,50,1 squared_relu .................................... False standalone_embedding_stage ...................... False start_weight_decay .............................. 0.01 structured_logs ................................. False structured_logs_dir ............................. None swiglu .......................................... False swin_backbone_type .............................. tiny tensor_model_parallel_size ...................... 1 tensorboard_dir ................................. None tensorboard_log_interval ........................ 1 tensorboard_queue_size .......................... 1000 test_data_path .................................. None test_weighted_split_names ....................... None test_weighted_split_paths ....................... None test_weighted_split_paths_path .................. None test_weighted_split_splits ...................... None test_weighted_split_weights ..................... None timing_log_level ................................ 0 timing_log_option ............................... minmax titles_data_path ................................ None tokenizer_file .................................. None tokenizer_model ................................. None tokenizer_type .................................. GPT2BPETokenizer train_data_path ................................. None train_iters ..................................... 500000 train_samples ................................... None train_weighted_split_paths ...................... None train_weighted_split_paths_path ................. None transformer_impl ................................ local transformer_pipeline_model_parallel_size ........ 1 transformer_timers .............................. False untie_embeddings_and_output_weights ............. False use_checkpoint_args ............................. False use_checkpoint_opt_param_scheduler .............. False use_contiguous_buffers_in_local_ddp ............. True use_cpu_initialization .......................... None use_distributed_optimizer ....................... False use_flash_attn .................................. False use_one_sent_docs ............................... False use_ring_exchange_p2p ........................... False use_rotary_position_embeddings .................. False valid_data_path ................................. None valid_num_workers ............................... 2 valid_weighted_split_names ...................... None valid_weighted_split_paths ...................... None valid_weighted_split_paths_path ................. None valid_weighted_split_splits ..................... None valid_weighted_split_weights .................... None variable_seq_lengths ............................ False virtual_pipeline_model_parallel_size ............ None vision_backbone_type ............................ vit vision_pretraining .............................. False vision_pretraining_type ......................... classify vocab_extra_ids ................................. 0 vocab_file ...................................... /cobol/vocab_file/vocab.json vocab_size ...................................... None wandb_entity_name ............................... None wandb_project_name .............................. None weight_decay .................................... 
0.01 weight_decay_incr_style ......................... constant world_size ...................................... 1 -------------------- end of arguments --------------------- setting number of micro-batches to constant 2 > building GPT2BPETokenizer tokenizer ... > padded vocab (size: 50257) with 47 dummy tokens (new size: 50304) > initializing torch distributed ... > initialized tensor model parallel with size 1 > initialized pipeline model parallel with size 1 > setting random seeds to 1234 ... > compiling dataset index builder ... make: Entering directory '/Megatron-LM/megatron/data' make: Nothing to be done for 'default'. make: Leaving directory '/Megatron-LM/megatron/data' >>> done with dataset index builder. Compilation time: 0.055 seconds [rank0]:[W init.cpp:767] Warning: nvfuser is no longer supported in torch script, use _jit_set_nvfuser_enabled is deprecated and a no-op (function operator()) /Megatron-LM/megatron/training.py:104: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /opt/pytorch/pytorch/torch/csrc/tensor/python_tensor.cpp:83.) start_time_tensor = torch.cuda.DoubleTensor([_TRAIN_START_TIME]) time to initialize megatron (seconds): 1.204 [after megatron is initialized] datetime: 2024-01-02 17:35:35 building GPT model ... > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 354871296 > learning rate decay style: cosine loading release checkpoint from /cobol/gpt2/checkpoint could not find arguments in the checkpoint ... checkpoint version 0 succesfully fixed query-key-values ordering for checkpoint version 0 successfully loaded checkpoint from /cobol/gpt2/checkpoint at iteration 0 (min, max) time across ranks (ms): load-checkpoint ................................: (540.41, 540.41) [after model, optimizer, and learning rate scheduler are built] datetime: 2024-01-02 17:35:36 > building train, validation, and test datasets ... > datasets target sizes (minimum size): train: 4000000 validation: 40080 test: 80 > building train, validation, and test datasets for GPT ... Single data path provided for train, valid & test > building dataset index ... reading sizes... reading pointers... reading document index... creating numpy buffer of mmap... creating memory view of numpy buffer... 
 > finished creating indexed dataset in 0.000152 seconds
    number of documents: 4047
 > dataset split:
    train:
     document indices in [0, 3841) total of 3841 documents
    validation:
     document indices in [3841, 4043) total of 202 documents
    test:
     document indices in [4043, 4047) total of 4 documents
 > Tokens per epoch: 14533042
 > loading doc-idx mapping from /cobol/gpt2/data_document/data_preprocess_content_document_train_indexmap_4000000ns_1024sl_1234s_doc_idx.npy
 > loading sample-idx mapping from /cobol/gpt2/data_document/data_preprocess_content_document_train_indexmap_4000000ns_1024sl_1234s_sample_idx.npy
 > loading shuffle-idx mapping from /cobol/gpt2/data_document/data_preprocess_content_document_train_indexmap_4000000ns_1024sl_1234s_shuffle_idx.npy
    loaded indexed file in 0.001 seconds
    total number of samples: 4002264
    total number of epochs: 282
Traceback (most recent call last):
  File "/Megatron-LM/megatron/data/gpt_dataset.py", line 346, in __init__
    self.suffix_tok_id, self.prefix_tok_id, self.middle_tok_id, self.pad_tok_id = (self.tokenizer.special_tokens[tok] for tok in [FIM_SUFFIX, FIM_PREFIX, FIM_MIDDLE, FIM_PAD])
  File "/Megatron-LM/megatron/data/gpt_dataset.py", line 346, in <genexpr>
    self.suffix_tok_id, self.prefix_tok_id, self.middle_tok_id, self.pad_tok_id = (self.tokenizer.special_tokens[tok] for tok in [FIM_SUFFIX, FIM_PREFIX, FIM_MIDDLE, FIM_PAD])
KeyError: '<fim_suffix>'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Megatron-LM/pretrain_gpt.py", line 148, in <module>
    pretrain(train_valid_test_datasets_provider,
  File "/Megatron-LM/megatron/training.py", line 140, in pretrain
    = build_train_valid_test_data_iterators(
  File "/Megatron-LM/megatron/training.py", line 1047, in build_train_valid_test_data_iterators
    build_train_valid_test_data_loaders(
  File "/Megatron-LM/megatron/training.py", line 979, in build_train_valid_test_data_loaders
    train_ds, valid_ds, test_ds = build_train_valid_test_datasets_provider(
  File "/Megatron-LM/pretrain_gpt.py", line 100, in train_valid_test_datasets_provider
    train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
  File "/Megatron-LM/megatron/data/gpt_dataset.py", line 33, in build_train_valid_test_datasets
    all_train_datasets, all_valid_datasets, all_test_datasets = _build_train_valid_test_datasets(data_prefix[0],
  File "/Megatron-LM/megatron/data/gpt_dataset.py", line 234, in _build_train_valid_test_datasets
    train_dataset = build_dataset(0, 'train')
  File "/Megatron-LM/megatron/data/gpt_dataset.py", line 227, in build_dataset
    dataset = GPTDataset(name, data_prefix,
  File "/Megatron-LM/megatron/data/gpt_dataset.py", line 348, in __init__
    self.suffix_tok_id, self.prefix_tok_id, self.middle_tok_id, self.pad_tok_id = (self.tokenizer.vocab[tok] for tok in [FIM_SUFFIX, FIM_PREFIX, FIM_MIDDLE, FIM_PAD])
  File "/Megatron-LM/megatron/data/gpt_dataset.py", line 348, in <genexpr>
    self.suffix_tok_id, self.prefix_tok_id, self.middle_tok_id, self.pad_tok_id = (self.tokenizer.vocab[tok] for tok in [FIM_SUFFIX, FIM_PREFIX, FIM_MIDDLE, FIM_PAD])
KeyError: '<fim_suffix>'
```
It looks like the tokenizer doesn't add the `<fim_suffix>` special token. What should I do?
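As a quick check (a minimal sketch, assuming the vocab file path from the launch script above and the usual `<fim_*>` spellings for the FIM tokens), the GPT-2 vocabulary can be grepped for the tokens the dataset code is looking up:

```bash
# Count occurrences of each FIM special token in the GPT-2 vocab.
# With the stock GPT-2 vocab.json these all print 0, which matches the
# KeyError: '<fim_suffix>' in the traceback above.
for tok in '<fim_suffix>' '<fim_prefix>' '<fim_middle>' '<fim_pad>'; do
    printf '%s: ' "$tok"
    grep -o -- "$tok" /cobol/vocab_file/vocab.json | wc -l
done
```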
I added `--tokenizer-type GPT2BPETokenizerWithFIM` to the torchrun command and it works!
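For anyone who hits the same KeyError, this is the shape of the fix in the launch script above (a minimal sketch: only the `--tokenizer-type` flag is added, everything else is unchanged from the script earlier in this issue):

```bash
# Same launch as before, but selecting the FIM-aware tokenizer so the
# <fim_*> special tokens are available when the GPT dataset is built.
torchrun pretrain_gpt.py \
    $GPT_ARGS \
    $DATA_ARGS \
    $OUTPUT_ARGS \
    --tokenizer-type GPT2BPETokenizerWithFIM \
    --save $CHECKPOINT_PATH \
    --load $CHECKPOINT_PATH
```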