A-zhudong opened this issue 2 weeks ago
Hi @A-zhudong, thanks for opening an issue!
Is it intended that a lot of this issue is crossed out?
Thank you for the reminder. The strikethrough was caused by the characters in "DEFAULT_TOKEN"; for better readability, I have removed that part.
cc @muellerzr @SunMarc
System Info

`transformers` version: 4.42.0

Who can help?

No response

Information

Tasks

An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)

Reproduction
step 1: train without a checkpoint and load LLaMA-2:

```python
"""
Copyright (c) Facebook, Inc. and its affiliates.

This source code is licensed under the MIT license found in the
LICENSE file in the root directory of this source tree.
"""
import os
import glob
import argparse
import random
import warnings
from pathlib import Path
from dataclasses import dataclass

import torch
import numpy as np
import pandas as pd
from pymatgen.core.structure import Structure
from torch.utils.data import Dataset

import transformers
from transformers import (
    LlamaForCausalLM,
    LlamaTokenizer,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig,
    AutoModelForCausalLM,
)
from trl import SFTTrainer
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
)

IGNORE_INDEX = -100
MAX_LENGTH = 2048


def get_crystal_string(cif_str):
    structure = Structure.from_str(cif_str, fmt="cif")
    structure.translate_sites(
        indices=range(len(structure.sites)),
        vector=np.random.uniform(size=(3,)),
    )
    ...  # remainder truncated in the report


class CifDataset(Dataset):
    def __init__(
        self,
        csv_fn,
        format_options={},
        llama_tokenizer=None,
        w_attributes=False,
    ):
        super().__init__()
        ...  # remainder truncated in the report


@dataclass
class DataCollatorForSupervisedDataset(object):
    """Collate examples for supervised fine-tuning."""
    ...  # body truncated in the report


def setup_datasets(args, llama_tokenizer, transform_args={}):
    format_options = {
        "permute_composition": args.format_permute_composition,
        "permute_structure": args.format_permute_structure,
    }
    datasets = {
        "train": CifDataset(
            str(args.data_path / "train_10.csv"),
            format_options,
            llama_tokenizer=llama_tokenizer,
            w_attributes=args.w_attributes,
        ),
        "val": CifDataset(
            str(args.data_path / "val_10.csv"),
            format_options,
            llama_tokenizer=llama_tokenizer,
            w_attributes=args.w_attributes,
        ),
    }
    return datasets


def setup_training_args(args):
    output_dir = args.expdir / args.run_name
    output_dir.mkdir(parents=True, exist_ok=True)
    ...  # remainder truncated in the report


def smart_tokenizer_and_embedding_resize(
    special_tokens_dict,
    llama_tokenizer,
    model,
):
    """Resize tokenizer and embedding."""
    ...  # remainder truncated in the report


def setup_model(args, rank):
    llama_options = args.model_name.split("-")
    is_chat = len(llama_options) == 2
    model_size = llama_options[0]
    ...  # remainder truncated in the report


def setup_trainer(args):
    training_args = setup_training_args(args)
    model, llama_tokenizer = setup_model(args, training_args.local_rank)
    ...  # remainder truncated in the report


def main(args):
    ...  # body truncated in the report


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--run-name", type=str, required=True)
    parser.add_argument("--expdir", type=Path, default="exp")
    parser.add_argument("--model_name", default="7b")
    parser.add_argument("--fp8", action="store_true", default=True)
    parser.add_argument("--lora-rank", type=int, default=8)
    parser.add_argument("--lora-alpha", type=int, default=32)
    parser.add_argument("--lora-dropout", type=float, default=0.05)
    parser.add_argument("--data-path", type=Path, default="data/basic")
    parser.add_argument("--num-epochs", type=int, default=10)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--grad-accum", type=int, default=1)
    parser.add_argument("--lr", type=float, default=1e-4)
    parser.add_argument("--lr-scheduler", type=str, default="cosine")
    parser.add_argument("--num-warmup-steps", type=int, default=100)
    parser.add_argument("--weight-decay", type=float, default=0.0)
    parser.add_argument("--eval-freq", default=1000, type=int)
    parser.add_argument("--save-freq", default=500, type=int)
    parser.add_argument("--format-permute-composition", action="store_true", default=False)
    parser.add_argument("--format-permute-structure", action="store_true", default=False)
    parser.add_argument("--w-attributes", type=int, default=1)
    parser.add_argument("--resume-dir", type=Path, default=None)
    parser.add_argument("--finetune-dir", type=Path, default=None)
    parser.add_argument("--debug", action="store_true", default=False)
    args = parser.parse_args()
    print(args.batch_size, args.w_attributes)
    print(args.expdir)
    main(args)
```
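The body of `smart_tokenizer_and_embedding_resize` is truncated above. For context, here is a minimal sketch of the resize pattern this kind of helper usually implements (the well-known Stanford Alpaca recipe); this is an assumption about the truncated body, not the reporter's exact code:

```python
def smart_tokenizer_and_embedding_resize(special_tokens_dict, llama_tokenizer, model):
    """Resize tokenizer and embedding (sketch; assumed, not the reporter's exact code)."""
    num_new_tokens = llama_tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(llama_tokenizer))
    if num_new_tokens > 0:
        input_embeddings = model.get_input_embeddings().weight.data
        output_embeddings = model.get_output_embeddings().weight.data
        # initialize the new embedding rows with the mean of the pre-existing rows
        input_embeddings[-num_new_tokens:] = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
        output_embeddings[-num_new_tokens:] = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
```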
step 2: set the resume checkpoint path (saved by the Trainer) and run training again.
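For reference, a minimal sketch of how step 2 flows into the Trainer (assuming `setup_trainer(args)` returns the `transformers.Trainer` built in the script above; a checkpoint path like `exp/<run-name>/checkpoint-500` is hypothetical, produced by the `--save-freq` setting):

```python
# sketch: resuming from a Trainer-saved checkpoint folder
trainer = setup_trainer(args)
if args.resume_dir is not None:
    # Trainer restores model, optimizer, scheduler, and RNG state
    # from the checkpoint folder before continuing training
    train_result = trainer.train(resume_from_checkpoint=args.resume_dir)
else:
    train_result = trainer.train()
```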
error:

```
  File "/home/wuzh/zd/GIT-Mol/crystal-text-llm-main/llama_finetune.py", line 522, in <module>
    main(args)
  File "/home/wuzh/zd/GIT-Mol/crystal-text-llm-main/llama_finetune.py", line 481, in main
    train_result = trainer.train(resume_from_checkpoint=args.resume_dir)
  File "/work/zd/anaconda/fsdp/lib/python3.9/site-packages/transformers/trainer.py", line 1932, in train
    return inner_training_loop(
  File "/work/zd/anaconda/fsdp/lib/python3.9/site-packages/transformers/trainer.py", line 2268, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "/work/zd/anaconda/fsdp/lib/python3.9/site-packages/transformers/trainer.py", line 3307, in training_step
    loss = self.compute_loss(model, inputs)
  File "/work/zd/anaconda/fsdp/lib/python3.9/site-packages/transformers/trainer.py", line 3338, in compute_loss
    outputs = model(**inputs)
  File "/work/zd/anaconda/fsdp/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/work/zd/anaconda/fsdp/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/work/zd/anaconda/fsdp/lib/python3.9/site-packages/accelerate/utils/operations.py", line 819, in forward
    return model_forward(*args, **kwargs)
  File "/work/zd/anaconda/fsdp/lib/python3.9/site-packages/accelerate/utils/operations.py", line 807, in __call__
    return convert_to_fp32(self.model_forward(*args, **kwargs))
  File "/work/zd/anaconda/fsdp/lib/python3.9/site-packages/torch/amp/autocast_mode.py", line 16, in decorate_autocast
    return func(*args, **kwargs)
  File "/work/zd/anaconda/fsdp/lib/python3.9/site-packages/torch/_dynamo/eval_frame.py", line 328, in _fn
    return fn(*args, **kwargs)
  File "/work/zd/anaconda/fsdp/lib/python3.9/site-packages/torch/_dynamo/external_utils.py", line 17, in inner
    return fn(*args, **kwargs)
  File "/work/zd/anaconda/fsdp/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/work/zd/anaconda/fsdp/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/work/zd/anaconda/fsdp/lib/python3.9/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 823, in forward
    args, kwargs = _root_pre_forward(self, self, args, kwargs)
  File "/work/zd/anaconda/fsdp/lib/python3.9/site-packages/torch/distributed/fsdp/_runtime_utils.py", line 558, in _root_pre_forward
    _lazy_init(state, module)
  File "/work/zd/anaconda/fsdp/lib/python3.9/site-packages/torch/distributed/fsdp/_runtime_utils.py", line 173, in _lazy_init
    _share_state_and_init_handle_attrs(state, root_module)
  File "/work/zd/anaconda/fsdp/lib/python3.9/site-packages/torch/distributed/fsdp/_runtime_utils.py", line 261, in _share_state_and_init_handle_attrs
    _p_assert(
  File "/work/zd/anaconda/fsdp/lib/python3.9/site-packages/torch/distributed/utils.py", line 145, in _p_assert
    traceback.print_stack()
Non-root FSDP instance's `_is_root` should not have been set yet or should have been set to `False`
```
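For what it's worth, this assertion generally fires when an FSDP instance that has already run a forward pass as its own root is later invoked under another root FSDP wrapper. A minimal standalone sketch that triggers the same assertion (assuming one GPU and a single-process NCCL group; this is not the exact code path taken by Trainer/Accelerate here):

```python
import torch
import torch.distributed as dist
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

# single-process group, just to make FSDP constructible for illustration
dist.init_process_group("nccl", init_method="tcp://127.0.0.1:29500", rank=0, world_size=1)
torch.cuda.set_device(0)

inner = FSDP(torch.nn.Linear(8, 8).cuda())
inner(torch.randn(2, 8).cuda())  # lazy init marks `inner` as a root: inner._is_root == True

outer = FSDP(torch.nn.Sequential(inner))
outer(torch.randn(2, 8).cuda())  # root pre-forward finds inner._is_root already set
# -> Non-root FSDP instance's `_is_root` should not have been set yet
#    or should have been set to `False`
```

If that is what is happening here, it would suggest the model (or a submodule) is effectively being FSDP-initialized twice when `resume_from_checkpoint` is set.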
Expected behavior

The checkpoint should load and training should resume without error.