boostcampaitech2 / klue-level2-nlp-14

klue-level2-nlp-p-14 created by GitHub Classroom

[Dev] Hyper-parameter search code & search space of baseline #7

Closed jinmang2 closed 2 years ago

jinmang2 commented 2 years ago

Misc.

taeukkkim commented 2 years ago

Here is the code that applies Optuna to training! I used optuna==2.9.1. Please leave any further discussion points or questions!

import json
import os

import optuna

if __name__ == '__main__':
    # `args`, `data_dir`, `model_dir`, `train`, and the `optuna_*` search-space
    # bounds are defined elsewhere in the baseline training script.
    if args.optuna:
        def train_optuna(trial):
            # Optuna suggests a value for each hyperparameter per trial
            args.epochs = trial.suggest_int('n_epochs', optuna_epoch_min, optuna_epoch_max)
            args.lr = trial.suggest_loguniform('lr', optuna_lr_min, optuna_lr_max)
            args.optimizer = trial.suggest_categorical('optimizer', optuna_optimizer)
            print(args)
            # train() must return the metric to maximize (validation F1)
            return train(data_dir, model_dir, args)

        study = optuna.create_study(direction='maximize')
        study.optimize(train_optuna, n_trials=optuna_ntrials)
        print(f"Best F1 Val: {study.best_trial.value}\n Params\n {study.best_trial.params}\n Save at {model_dir}/optuna.json")
        with open(os.path.join(model_dir, f'optuna_{args.name}_{study.best_trial.value}.json'), 'w', encoding='utf-8') as f:
            json.dump(study.best_trial.params, f, ensure_ascii=False, indent=4)
    else:
        print(args)
        train(data_dir, model_dir, args)
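For completeness, the `optuna_*` search-space bounds and `optuna_ntrials` referenced above are not shown in the snippet. A minimal sketch of how they might be defined follows; the values below are illustrative assumptions, not the actual baseline configuration:

```python
# Hypothetical search-space settings for the snippet above; the real values
# live elsewhere in the baseline and may differ.
optuna_epoch_min, optuna_epoch_max = 1, 5     # range for trial.suggest_int('n_epochs', ...)
optuna_lr_min, optuna_lr_max = 1e-6, 1e-3     # log-uniform range for trial.suggest_loguniform('lr', ...)
optuna_optimizer = ["Adam", "AdamW", "SGD"]   # choices for trial.suggest_categorical('optimizer', ...)
optuna_ntrials = 10                           # number of Optuna trials to run
```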
KimDaeUng commented 2 years ago

Report on issues encountered and the final code for hyperparameter search with Ray & the Hugging Face Trainer

Issues

  1. Token embedding size: the baseline vocab size is 32000, but adding the entity special tokens increases it to 32004. After the first trial, the second trial loads the pretrained weights again and therefore gets a model whose embedding layer was never resized, so the run fails.

    ...
    (pid=22448)   File "/opt/conda/envs/basic/lib/python3.8/site-packages/transformers/modeling_utils.py", line 1576, in _load_state_dict_into_model
    (pid=22448)     raise RuntimeError(f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}")
    (pid=22448) RuntimeError: Error(s) in loading state_dict for RobertaForSequenceClassification:
    (pid=22448)     size mismatch for roberta.embeddings.word_embeddings.weight: copying a param with shape torch.Size([32000, 768]) from checkpoint, the shape in current model is torch.Size([32004, 768]).
    ...
    • Tried saving the model to a specific path and reloading it from there -> failed.
    • Tried changing the cache dir -> failed.
    • Checked whether each worker process runs from a different working directory when loading the model (get_model) -> switched to saving the model under an absolute path and loading it from there, which succeeded in loading the resized token embedding layer (see the sketch after this list).
  2. An error that trial is missing assignments, raised inside the Hugging Face trainer.py.

    ...
    (pid=23933)   File "/opt/conda/envs/basic/lib/python3.8/site-packages/transformers/integrations.py", line 183, in _objective
    (pid=23933)     local_trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
    (pid=23933)   File "/opt/conda/envs/basic/lib/python3.8/site-packages/transformers/trainer.py", line 1241, in train
    (pid=23933)     self.state.trial_params = hp_params(trial.assignments) if trial is not None else None
    (pid=23933) AttributeError: 'dict' object has no attribute 'assignments'
    ...
    • Fixed by patching the code as below: trial here is a dictionary of hyperparameter settings, so accessing the nonexistent assignments attribute raised the error.
      # self.state.trial_params = hp_params(trial.assignments) if trial is not None else None
      self.state.trial_params = hp_params(trial) if trial is not None else None
  3. The final runnable code was obtained, but population-based training requires saving checkpoints, so it consumes server storage very quickly.

    • With roberta-large, a run with 2 trial configurations on 1/10 of the dataset produced about 40GB of checkpoints while still in progress.
    • Either switch to a search method (e.g., grid search) that saves no or fewer checkpoints, or use PBT with care (see the note after the final code below).
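Regarding issue 1, the pattern that ended up working is sketched below in isolation: resize the token embeddings once, save the resized model under an absolute path, and have every Ray worker's model_init load from that path. This is a minimal sketch distilled from the final code further down; the marker tokens here are placeholders for the project's MARKERS.

```python
import os
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer

model_name = "klue/roberta-large"
cache_dir = os.path.abspath("./cache")           # absolute path, shared by all Ray workers
model_path = os.path.join(cache_dir, "models")
os.makedirs(cache_dir, exist_ok=True)

tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
# Placeholder entity markers; the project defines the real ones in MARKERS.
tokenizer.add_special_tokens({"additional_special_tokens": ["<subj>", "</subj>", "<obj>", "</obj>"]})

config = AutoConfig.from_pretrained(model_name, num_labels=30)
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config, cache_dir=cache_dir)
model.resize_token_embeddings(len(tokenizer))    # 32000 -> 32004
model.save_pretrained(model_path)                # persist the resized checkpoint once, up front
config.vocab_size = len(tokenizer)
del model

def get_model():
    # Passed to Trainer(model_init=get_model); every trial reloads the resized model.
    return AutoModelForSequenceClassification.from_pretrained(model_path, config=config)
```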

Final code

```python
import os
from functools import partial

import ray
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import PopulationBasedTraining, ASHAScheduler

from transformers import AutoConfig, \
    AutoModelForSequenceClassification, AutoTokenizer, Trainer, GlueDataset, \
    GlueDataTrainingArguments, TrainingArguments, TrainerCallback

from constant import *
import entity_prep
from entity_prep import *
from metrics import *

import datasets
import torch
import torch.nn as nn
from datasets import Dataset, DatasetDict, Features, Value, ClassLabel


class DataCollator:
    def __init__(self, tokenizer, max_length='single'):
        self.tokenizer = tokenizer
        if max_length == 'single':
            self.max_length = tokenizer.max_len_single_sentence
        elif max_length == 'pair':
            self.max_length = tokenizer.max_len_sentences_pair
        elif type(max_length) == int:
            self.max_length = max_length

    def __call__(self, batch):
        input_ids = [x["input_ids"] for x in batch]
        labels = [x["label"] for x in batch]
        batch_encoding = self.tokenizer.pad(
            {"input_ids": input_ids},
            max_length=self.max_length,
            return_tensors="pt",
        )
        batch_encoding.update({"labels": torch.LongTensor(labels)})
        return batch_encoding


# class MemorySaverCallback(TrainerCallback):
#     "A callback that deletes the folder in which checkpoints are saved, to save memory"
#     def __init__(self, run_name):
#         super(MemorySaverCallback, self).__init__()
#         self.run_name = run_name
#
#     def on_train_begin(self, args, state, control, **kwargs):
#         print("Removing dirs...")
#         if os.path.isdir(f'./{self.run_name}'):
#             import shutil
#             shutil.rmtree(f'./{self.run_name}')
#         else:
#             print("\n\nDirectory does not exist")


def tune_transformer(num_samples=8, gpus_per_trial=0, smoke_test=False):
    data_dir_name = "./hp_search" if not smoke_test else "./hp_search_test"
    data_dir = os.path.abspath(os.path.join(os.getcwd(), data_dir_name))
    if not os.path.exists(data_dir):
        os.mkdir(data_dir, 0o755)

    cache_dir_name = './cache'
    cache_dir = os.path.abspath(os.path.join(os.getcwd(), cache_dir_name))
    if not os.path.exists(cache_dir):
        os.mkdir(cache_dir, 0o755)

    # Change these as needed.
    model_name = "klue/roberta-large" if not smoke_test \
        else "klue/roberta-small"
    task_name = "re"
    task_data_dir = os.path.join(data_dir, task_name.upper())

    num_labels = len(CLASS_NAMES)

    # config = AutoConfig.from_pretrained(
    #     model_name)

    # setting model hyperparameter
    config = AutoConfig.from_pretrained(model_name, num_labels=num_labels)
    config.num_labels = 30
    # config.cache_dir = cache_dir
    config.id2label = IDX2LABEL
    config.label2id = LABEL2IDX

    # Download and cache tokenizer, model, and features
    print("Downloading and caching Tokenizer")
    tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
    tokenizer.add_special_tokens(
        {"additional_special_tokens": list(MARKERS.values())}
    )

    # Triggers model download to cache
    print("Downloading and caching pre-trained model")
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        config=config,
        cache_dir=cache_dir
    )
    if model.config.vocab_size < len(tokenizer):
        print("resize...")
        model.resize_token_embeddings(len(tokenizer))

    # Save the resized model once under an absolute path so every Ray worker
    # loads the checkpoint with the enlarged embedding layer (see issue 1).
    model_path = os.path.join(cache_dir, 'models')
    print("###", os.getcwd(), model_path)
    model.save_pretrained(model_path)
    config.vocab_size = len(tokenizer)
    del model

    def get_model():
        print("### inner path", os.getcwd())
        model = AutoModelForSequenceClassification.from_pretrained(
            model_path,
            config=config,
            cache_dir=cache_dir
        )
        if model.config.vocab_size < len(tokenizer):
            print("resize...")
            model.resize_token_embeddings(len(tokenizer))

        device = "cpu"
        if torch.cuda.is_available():
            device = "cuda:0"
            if torch.cuda.device_count() > 1:
                model = nn.DataParallel(model)
        model.to(device)
        return model

    # Download data.
    klue_re = datasets.load_dataset("jinmang2/load_klue_re",
                                    script_version="v1.0.1b",
                                    cache_dir=cache_dir)
    convert_example_to_features = partial(
        entity_prep.convert_example_to_features,
        tokenizer=tokenizer,
        **MARKERS,
    )
    examples = klue_re.map(entity_prep.mark_entity_spans)
    tokenized_datasets = examples.map(convert_example_to_features)

    max_length = 128
    data_collator = DataCollator(tokenizer, max_length=max_length)

    training_args = TrainingArguments(
        # Checkpoint
        output_dir=".",
        save_strategy="epoch",
        # Run
        do_train=True,
        do_eval=True,
        # Training
        num_train_epochs=1,
        max_steps=-1,
        learning_rate=5e-5,              # config
        per_device_train_batch_size=32,  # config
        per_device_eval_batch_size=32,   # config
        ## Learning rate scheduling
        warmup_steps=0,
        ## Regularization
        weight_decay=0.01,               # config
        # Logging
        logging_dir='./logs',
        report_to="none",
        # Evaluation
        metric_for_best_model='auprc',
        evaluation_strategy='epoch',
        eval_steps=500,
        # ETC
        load_best_model_at_end=True,
        seed=42,
        skip_memory_metrics=True,
        # GPU
        fp16=True,
        no_cuda=gpus_per_trial <= 0,
        # dataloader_num_workers=4,
    )

    trainer = Trainer(
        model_init=get_model,
        args=training_args,
        train_dataset=tokenized_datasets['train'].shard(index=1, num_shards=100),
        eval_dataset=tokenized_datasets['valid'].shard(index=1, num_shards=100),
        compute_metrics=build_compute_metrics_fn(),
        data_collator=data_collator,
        # callbacks=[MemorySaverCallback(".")]
    )

    def tung_config_fn(*args, **kwargs):
        return {
            "per_device_train_batch_size": 32,
            "per_device_eval_batch_size": 32,
            "num_train_epochs": tune.choice([2, 3]),
            "max_steps": 1 if smoke_test else -1,  # Used for smoke test.
        }

    scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        metric="eval_micro_f1",
        mode="max",
        perturbation_interval=1,
        hyperparam_mutations={
            "weight_decay": tune.uniform(0.0, 0.3),
            "learning_rate": tune.uniform(1e-5, 5e-5),
            "per_device_train_batch_size": [16, 32, 64],
        }
    )
    # scheduler = ASHAScheduler(
    #     time_attr="training_iteration",
    #     metric="eval_micro_f1",
    #     mode="max",
    #     max_t=100,
    #     reduction_factor=3,
    #     brackets=1,
    # )

    reporter = CLIReporter(
        parameter_columns={
            "weight_decay": "w_decay",
            "learning_rate": "lr",
            "per_device_train_batch_size": "train_bs/gpu",
            "num_train_epochs": "num_epochs"
        },
        metric_columns=[
            "eval_micro_f1", "eval_loss", "epoch", "training_iteration"
        ])

    trainer.hyperparameter_search(
        hp_space=tung_config_fn,
        backend="ray",
        n_trials=num_samples,
        resources_per_trial={
            "cpu": 4,
            "gpu": gpus_per_trial
        },
        scheduler=scheduler,
        keep_checkpoints_num=1,
        checkpoint_score_attr="training_iteration",
        stop={"training_iteration": 1} if smoke_test else None,
        progress_reporter=reporter,
        local_dir="~/ray_results/",
        name="tune_transformer_pbt",
        log_to_file=True)


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--smoke-test", action="store_true", help="Finish quickly for testing")
    parser.add_argument(
        "--ray-address",
        type=str,
        default=None,
        help="Address to use for Ray. "
        "Use \"auto\" for cluster. "
        "Defaults to None for local.")
    parser.add_argument(
        "--server-address",
        type=str,
        default=None,
        required=False,
        help="The address of server to connect to if using "
        "Ray Client.")
    args, _ = parser.parse_known_args()

    if args.smoke_test:
        ray.init(local_mode=True)
    elif args.server_address:
        ray.init(f"ray://{args.server_address}")
    else:
        ray.init(args.ray_address)

    if args.smoke_test:
        tune_transformer(num_samples=1, gpus_per_trial=1, smoke_test=True)
    else:
        # You can change the number of GPUs here:
        tune_transformer(num_samples=2, gpus_per_trial=1)
```
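A follow-up note (not part of the committed code): `trainer.hyperparameter_search` returns a `BestRun`, so the winning configuration can be pulled out and reused for a full training run. A minimal sketch, assuming the `trainer`, `tung_config_fn`, `scheduler`, and `reporter` objects from the code above; the `n_trials` value and the `save_total_limit` setting are illustrative assumptions (the latter is one way to curb the checkpoint disk usage described in issue 3):

```python
# Sketch only: reuse the best hyperparameters found by the search above.
best_run = trainer.hyperparameter_search(
    hp_space=tung_config_fn,
    backend="ray",
    n_trials=2,
    scheduler=scheduler,
    progress_reporter=reporter,
)
print(best_run.run_id, best_run.objective, best_run.hyperparameters)

# Copy the winning values back into TrainingArguments and retrain on the full data.
for name, value in best_run.hyperparameters.items():
    setattr(trainer.args, name, value)
trainer.args.save_total_limit = 2  # optional: keep only a few checkpoints (mitigates issue 3)
trainer.train()
```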

TODO

Reference

jinmang2 commented 2 years ago

@KimDaeUng Could you link the code with a commit hash?