hpcaitech / ColossalAI

Making large AI models cheaper, faster and more accessible
https://www.colossalai.org
Apache License 2.0

[BUG]: use huggingface automodel load chatglm error: AttributeError: _old_init is not found in the PrefixEncoder #3687

Open zhangyuanscall opened 1 year ago

zhangyuanscall commented 1 year ago

๐Ÿ› Describe the bug

I used the example at https://github.com/hpcaitech/ColossalAI/blob/main/examples/language/gpt/gemini/train_gpt_demo.py to fine-tune ChatGLM, but it reports the following error.

Code:

from colossalai.zero import ColoInitContext, zero_model_wrapper, zero_optim_wrapper
from transformers import AutoModelForCausalLM,  AutoTokenizer, AutoModel

# build  model
with ColoInitContext(device=get_current_device(),
                     dtype=torch.half,
                     default_dist_spec=default_dist_spec,
                     default_pg=shard_pg):
    model = AutoModel.from_pretrained(chatglm_model_path, trust_remote_code=True, device_map="auto")

Error:

Traceback (most recent call last):
  File "/export/App/training_platform/PinoModel/gpt2_advance/gemini/./train_chatglm.py", line 310, in main
    model = AutoModel.from_pretrained(chatglm_model_path, trust_remote_code=True, device_map="auto")
  File "/usr/local/anaconda3/lib/python3.8/site-packages/transformers/models/auto/auto_factory.py", line 466, in from_pretrained
    return model_class.from_pretrained(
  File "/usr/local/anaconda3/lib/python3.8/site-packages/transformers/modeling_utils.py", line 2629, in from_pretrained
    model = cls(config, *model_args, **model_kwargs)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/colossalai/utils/model/utils.py", line 59, in wrapper
    f(module, *args, **kwargs)
  File "/media/cfs/zhanglezhong/cache/chatglm/modules/transformers_modules/chatglm-6b/modeling_chatglm.py", line 1047, in __init__
    self.transformer = ChatGLMModel(config, empty_init=empty_init)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/colossalai/utils/model/utils.py", line 59, in wrapper
    f(module, *args, **kwargs)
  File "/media/cfs/zhanglezhong/cache/chatglm/modules/transformers_modules/chatglm-6b/modeling_chatglm.py", line 821, in __init__
    self.word_embeddings = init_method(
  File "/usr/local/anaconda3/lib/python3.8/site-packages/torch/nn/utils/init.py", line 51, in skip_init
    return module_cls(*args, **kwargs).to_empty(device=final_device)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/colossalai/utils/model/utils.py", line 60, in wrapper
    self._post_init_method(module, *args, **kwargs)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/colossalai/zero/gemini/colo_init_context.py", line 128, in _post_init_method
    setattr(submodule, param_name, colo_param)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1203, in __setattr__
    self.register_parameter(name, value)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/accelerate/big_modeling.py", line 108, in register_empty_parameter
    module._parameters[name] = param_cls(module._parameters[name].to(device), **kwargs)
TypeError: __new__() got an unexpected keyword argument 'has_initialized'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/export/App/training_platform/PinoModel/gpt2_advance/gemini/./train_chatglm.py", line 433, in <module>
    main()
  File "/export/App/training_platform/PinoModel/gpt2_advance/gemini/./train_chatglm.py", line 310, in main
    model = AutoModel.from_pretrained(chatglm_model_path, trust_remote_code=True, device_map="auto")
  File "/usr/local/anaconda3/lib/python3.8/site-packages/colossalai/utils/model/utils.py", line 97, in __exit__
    substitute_init_recursively(torch.nn.modules.module.Module, _disable_class, set())
  File "/usr/local/anaconda3/lib/python3.8/site-packages/colossalai/utils/model/utils.py", line 15, in substitute_init_recursively
    func(subcls)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/colossalai/utils/model/utils.py", line 91, in _disable_class
    raise AttributeError(
AttributeError: _old_init is not found in the PrefixEncoder, please make sure that you have imported PrefixEncoder before entering the context.

My full fine-tuning training script:

%%writefile train_chatglm.py

import os
os.environ['HF_HOME'] = '/media/cfs/zhanglezhong/cache/chatglm'

from transformers import AutoTokenizer
from tqdm import tqdm
from datasets import load_dataset, concatenate_datasets,load_from_disk
from functools import partial
from time import time

import psutil
import torch
import torch.nn as nn
from tqdm import tqdm
# from commons.model_zoo import model_builder
from commons.model_zoo_gpt import model_builder
from commons.utils import get_data, get_profile_context, get_tflops, get_time_stamp
from packaging import version
from torch.nn.parallel import DistributedDataParallel as DDP

import colossalai
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.nn.optimizer import HybridAdam
# from colossalai.nn.parallel import zero_model_wrapper, zero_optim_wrapper
from colossalai.tensor import ColoParameter, ComputePattern, ComputeSpec, ProcessGroup, ReplicaSpec, ShardSpec
from colossalai.utils import get_current_device
# from colossalai.utils.model.colo_init_context import ColoInitContext

from transformers import AutoModelForCausalLM,  AutoTokenizer, AutoModel
from datasets import load_dataset, concatenate_datasets,load_from_disk
from ChatGLM_GetLoader import  GetLoader_Train, data_collator

os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

from colossalai.zero import ColoInitContext, zero_model_wrapper, zero_optim_wrapper

CAI_VERSION = colossalai.__version__

def parse_args():
    parser = colossalai.get_default_parser()
    parser.add_argument(
        "--distplan",
        type=str,
        default='CAI_Gemini',
        help="The distributed plan [colossalai, zero1, zero2, torch_ddp, torch_zero].",
    )
    parser.add_argument(
        "--tp_degree",
        type=int,
        default=1,
        help="Tensor Parallelism Degree. Valid when using colossalai as dist plan.",
    )
    parser.add_argument(
        "--placement",
        type=str,
        default='cpu',
        help="Placement Policy for Gemini. Valid when using colossalai as dist plan.",
    )
    parser.add_argument(
        "--shardinit",
        action='store_true',
#         default=True,
        help=
        "Shard the tensors when init the model to shrink peak memory size on the assigned device. Valid when using colossalai as dist plan.",
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        default=8,
        help="batch size per DP group of training.",
    )
    parser.add_argument(
        "--model_type",
        type=str,
        default="gpt2_medium",
        help="model model scale",
    )
    parser.add_argument(
        "--train_step",
        type=int,
        default=10,
        help="training iterations for test",
    )

    args = parser.parse_args()
    return args

# Parameter Sharding Strategies for Tensor Parallelism
def split_param_single_dim_tp1d(dim: int, param: ColoParameter, pg: ProcessGroup):
    spec = (ShardSpec([dim], [pg.tp_world_size()]), ComputeSpec(ComputePattern.TP1D))
    param.set_tensor_spec(*spec)

def split_param_row_tp1d(param: ColoParameter, pg: ProcessGroup):
    split_param_single_dim_tp1d(0, param, pg)

def split_param_col_tp1d(param: ColoParameter, pg: ProcessGroup):
    split_param_single_dim_tp1d(-1, param, pg)

class GPTLMLoss(nn.Module):

    def __init__(self):
        super().__init__()
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, logits, labels):
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        # Flatten the tokens
        return self.loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

def get_cpu_mem():
    return psutil.Process().memory_info().rss / 1024**2

def get_gpu_mem():
    return torch.cuda.memory_allocated() / 1024**2

def get_mem_info(prefix=''):
    return f'{prefix}GPU memory usage: {get_gpu_mem():.2f} MB, CPU memory usage: {get_cpu_mem():.2f} MB'

def get_model_size(model: nn.Module):
    total_numel = 0
    for module in model.modules():
        for p in module.parameters(recurse=False):
            total_numel += p.numel()
    return total_numel

def model_size_formatter(numel: int) -> str:
    GB_SIZE = 10**9
    MB_SIZE = 10**6
    KB_SIZE = 10**3
    if numel >= GB_SIZE:
        return f'{numel / GB_SIZE:.1f}B'
    elif numel >= MB_SIZE:
        return f'{numel / MB_SIZE:.1f}M'
    elif numel >= KB_SIZE:
        return f'{numel / KB_SIZE:.1f}K'
    else:
        return str(numel)

def set_cpu_maximum_parallelism():
    conf_str = torch.__config__.parallel_info()
    inter_str = conf_str.split("hardware_concurrency() : ")[1]
    max_concurrency = inter_str.split('\n')[0]
    os.environ["OMP_NUM_THREADS"] = max_concurrency
    print(f"environmental variable OMP_NUM_THREADS is set to {max_concurrency}.")

# Tensor Parallel
def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup):
    """tensor_parallelize
    Sharding the Model Parameters.

    Args:
        model (torch.nn.Module): a torch module to be sharded
    """
    for mn, module in model.named_modules():
        for pn, param in module.named_parameters(recurse=False):
            # NOTE() a param maybe shared by two modules
            if hasattr(param, 'visited'):
                continue

            # if shard init, then convert param to replica and use the dp-only ProcessGroup
            param: ColoParameter = param
            param.set_dist_spec(ReplicaSpec())
            param.set_process_group(pg)

            # shard it w.r.t tp pattern
            if 'mlp.c_fc' in mn:
                if 'weight' in pn or 'bias' in pn:
                    split_param_col_tp1d(param, pg)    # column slice
                    # keep the shape of the output from c_fc
                    param.compute_spec.set_output_replicate(False)
                else:
                    param.set_dist_spec(ReplicaSpec())
            elif 'mlp.c_proj' in mn:
                if 'weight' in pn:
                    split_param_row_tp1d(param, pg)    # row slice
                else:
                    param.set_dist_spec(ReplicaSpec())
            elif 'wte' in mn or 'wpe' in mn:
                split_param_col_tp1d(param, pg)    # column slice
            elif 'c_attn' in mn or 'c_proj' in mn:
                split_param_col_tp1d(param, pg)    # column slice
            else:
                param.set_dist_spec(ReplicaSpec())
            param.visited = True

def main():
    # version check
    # this example is supposed to work for versions greater than 0.2.0
    assert version.parse(CAI_VERSION) >= version.parse("0.2.0")

    set_cpu_maximum_parallelism()

    ## zlz: parse the arguments
    args = parse_args()
    for key,value in vars(args).items():
        print(f"-------> key: {key}, value: {value}")

    # if args.distplan not in ["colossalai", "torch_ddp", "torch_zero", "zero1", "zero2"]:
    if args.distplan not in ["CAI_ZeRO1", "CAI_ZeRO2", "CAI_Gemini", "Pytorch_DDP", "Pytorch_ZeRO"]:
        raise TypeError(f"{args.distplan} is error")

    NUM_STEPS = args.train_step

    WARMUP_STEPS = 1
    assert WARMUP_STEPS < NUM_STEPS, "warmup steps should be smaller than the total steps"
    assert (NUM_STEPS - WARMUP_STEPS) % 2 == 1, "the number of valid steps should be odd to take the median"
    PROF_FLAG = False    # The flag of profiling, False by default

    disable_existing_loggers()
    colossalai.launch_from_torch(config={})

    logger = get_dist_logger()

    """
    ่‡ชๅฎšไน‰๏ผš
    ๆ•ฐๆฎๅค„็†dataloader
    """
    # batch size per DP degree
    BATCH_SIZE = args.batch_size
    SEQ_LEN = 2048

    logger.info(f"{args.model_type}, {args.distplan}, batch size {BATCH_SIZE}", ranks=[0])
    chatglm_model_path = '/media/cfs/techdata-sjgx-nryf/wangli/Pretrained_model/chatglm-6b'
    tokenizer = AutoTokenizer.from_pretrained(chatglm_model_path, trust_remote_code=True)

    root_path='/media/cfs/zhanglezhong/LLMS/all_data2'
    chatglm_datasets = load_from_disk(f"{root_path}/save_disk/instruct_merge_datasets2_sample_10w")

    def preprocess_function_train(examples, max_seq_length=1024):
        model_inputs = {"input_ids": [], "seq_len": [],}
        for index in range(len(examples["instruction"])):
            prompt = examples["instruction"][index] + "\n" + examples['input'][index]
            target = examples["output"][index]
            prompt_ids = tokenizer.encode(prompt, max_length=max_seq_length, truncation=True)  ## appends [130001: gmask, 130004: sop] at the end
            target_ids = tokenizer.encode(target, max_length=max_seq_length, truncation=True, add_special_tokens=False) 
            input_ids = tokenizer.build_inputs_with_special_tokens(prompt_ids, target_ids)   ## appends the eop token at the end

            model_inputs["input_ids"].append(input_ids)
            model_inputs["seq_len"].append(len(prompt_ids))

        return model_inputs

    train_datasets = chatglm_datasets['train']
    train_dataset_map = train_datasets.map(
                preprocess_function_train,
                batched=True,
                num_proc=1,
                remove_columns=["instruction","input","output","channel"],
                load_from_cache_file=False,
                desc="Running tokenizer on train dataset",
            )
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset_map)
    train_data_loader = torch.utils.data.DataLoader(train_dataset_map, sampler=train_sampler, batch_size=8, collate_fn=data_collator)

#     train_file_path = ''
#     train_data = GetLoader_Train(train_file_path, tokenizer, max_seq_length=SEQ_LEN,  skip_overlength=True)
#     train_sampler = torch.utils.data.distributed.DistributedSampler(train_data)
#     train_data_loader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE, collate_fn=collate_func)

#     test_file_path = ''
#     test_data = GetLoader_Train(test_file_path, tokenizer, max_seq_length=SEQ_LEN,  skip_overlength=True)
#     test_sampler = torch.utils.data.distributed.DistributedSampler(test_data)
#     test_data_loader = DataLoader(test_data, sampler=test_sampler, batch_size=BATCH_SIZE, collate_fn=collate_func)

    torch.manual_seed(123)
    if args.distplan.startswith("CAI"):
        # all param must use the same process group.
        world_size = torch.distributed.get_world_size()
        shard_pg = ProcessGroup(tp_degree=world_size) if args.shardinit else None
        default_dist_spec = ShardSpec([-1], [world_size]) if args.shardinit else None

        print(f"args.distplan: {args.distplan}, world_size: {world_size}, shard_pg๏ผš {shard_pg}, default_dist_spec: {default_dist_spec}")

        if args.shardinit and args.distplan != "CAI_Gemini":
            raise RuntimeError("You can only use shardinit with CAI_Gemini")

        # build GPT model
        with ColoInitContext(device=get_current_device(),
                             dtype=torch.half,
                             default_dist_spec=default_dist_spec,
                             default_pg=shard_pg):
            ## zlz: define model
            model = AutoModel.from_pretrained(chatglm_model_path,  trust_remote_code=True, device_map="auto")

        tp_pg = ProcessGroup(tp_degree=args.tp_degree)
        # Tensor Parallelism (TP)
        # You should notice that v0.1.10 is not compatible with TP degree > 1
        if args.tp_degree > 1:
            tensor_parallelize(model, tp_pg)

        # assign running configurations
        gemini_config = None
        if args.distplan.startswith("CAI_ZeRO"):
            optim_config = dict(reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True, verbose=True)
        elif args.distplan == "CAI_Gemini":
            gemini_config = dict(strict_ddp_mode=args.tp_degree == 1,
                                 device=get_current_device(),
                                 placement_policy=args.placement,
                                 pin_memory=True,
                                 hidden_dim=model.config.n_embd,
                                 search_range_mb=128,
                                 min_chunk_size_mb=128 )
            optim_config = dict(gpu_margin_mem_ratio=0.)
        else:
            raise RuntimeError

        # build a highly optimized gpu/cpu optimizer
        optimizer = HybridAdam(model.parameters(), lr=1e-5,  betas = (0.9, 0.95), weight_decay = 0.1)

        if args.distplan == "CAI_ZeRO1":
            zero_stage = 1
        elif args.distplan == "CAI_ZeRO2":
            zero_stage = 2
        elif args.distplan == "CAI_Gemini":
            zero_stage = 3
        else:
            raise RuntimeError

        # wrap your model and optimizer
        print(f"\n\n\n------------------>>> gemini_config: {gemini_config}")
        model = zero_model_wrapper(model, zero_stage, gemini_config)
        optimizer = zero_optim_wrapper(model, optimizer, optim_config=optim_config)

        logger.info(get_mem_info(prefix='After init optim, '), ranks=[0])
    elif args.distplan.startswith("Pytorch"):
        assert args.tp_degree == 1, "The degree of TP should be 1 for DDP examples."
        model = model_builder(args.model_type)(checkpoint=True).cuda()
        model = DDP(model)
        if args.distplan.endswith("DDP"):
            optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
        elif args.distplan.endswith("ZeRO"):
            from torch.distributed.optim import ZeroRedundancyOptimizer
            optimizer = ZeroRedundancyOptimizer(model.parameters(), optimizer_class=torch.optim.Adam, lr=1e-3)
    else:
        raise RuntimeError

    # model is sharded after TP
    numel = get_model_size(model)
    logger.info(f"the size of testing model size is {model_size_formatter(numel)}.")
    logger.info(get_mem_info(prefix='After init model, '), ranks=[0])

    # Tflops_per_GPU = global_batch * global_numel * seq_len * 8 / #gpu
    # = (batch_per_DP_group * dp_degree) * (numel * tp_degree) * seq_len * 8 / (tp_degree * dp_degree)
    # = batch_per_DP_group * numel * seq_len * 8
    get_tflops_func = partial(get_tflops, numel, BATCH_SIZE, SEQ_LEN)

    torch.cuda.synchronize()
    model.train()
#     title_id = train_data.title_id
    tflops_list = []

    demo_profiler = get_profile_context(PROF_FLAG,WARMUP_STEPS,NUM_STEPS - WARMUP_STEPS,save_dir=f"profile/{get_time_stamp()}-demo")
    with demo_profiler as prof:
        iter_bar = tqdm(train_data_loader, desc="Iter (loss=X.XXX)", disable=False)
        tr_loss, logging_loss, min_loss = 0.0, 0.0, 0.0
        for step, batch in enumerate(iter_bar, start=0):
            if step>=NUM_STEPS: continue
            start = time()
            input_ids = batch["input_ids"].to(torch.cuda.current_device())
            labels = batch["labels"].to(torch.cuda.current_device())
            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, labels=labels)
            loss = outputs.loss
            tr_loss += loss.item()

            if step%10==0:
                iter_bar.set_description("Iter (loss=%5.3f)" % loss.item())
            torch.cuda.synchronize()
            fwd_end = time()
            fwd_time = fwd_end - start
            logger.info(get_mem_info(prefix=f'[{step + 1}/{NUM_STEPS}] Forward '), ranks=[0])
            if args.distplan.startswith("CAI"): optimizer.backward(loss)
            elif args.distplan.startswith("Pytorch"): loss.backward()
            else: raise RuntimeError

            torch.cuda.synchronize()
            bwd_end = time()
            bwd_time = bwd_end - fwd_end
            logger.info(get_mem_info(prefix=f'[{step + 1}/{NUM_STEPS}] Backward '), ranks=[0])

            optimizer.step()
            torch.cuda.synchronize()
            optim_time = time() - bwd_end
            step_time = time() - start
            logger.info(get_mem_info(prefix=f'[{step + 1}/{NUM_STEPS}] Optimizer step '), ranks=[0])

            step_tflops = get_tflops_func(step_time)
            logger.info(
                f"[{step + 1}/{NUM_STEPS}] Loss:{loss.item():.3f}, Step time: {step_time:.3f}s, TFLOPS: {get_tflops_func(step_time):.3f}, FWD time: {fwd_time:.3f}s, BWD time: {bwd_time:.3f}s, OPTIM time: {optim_time:.3f}s",
            ranks=[0],)
            if step >= WARMUP_STEPS: tflops_list.append(step_tflops)

        prof.step()

    tflops_list.sort()
    median_index = ((NUM_STEPS - WARMUP_STEPS) >> 1) + WARMUP_STEPS
    logger.info(f"Median TFLOPS is {tflops_list[median_index]:.3f}")
    torch.cuda.synchronize()

if __name__ == '__main__':
    main()

Environment

Package          Version
accelerate       0.17.1
bitsandbytes     0.37.1
datasets         2.12.0
decorator        4.4.2
deepspeed        0.9.1
huggingface-hub  0.13.3
numpy            1.23.0
torch            1.11.0+cu113
torchaudio       0.11.0+rocm4.5.2
torchvision      0.12.0+cu113
colossalai       0.2.8

Issues-translate-bot commented 1 year ago

Bot detected the issue body's language is not English, translate it automatically.


The same question

Snowdar commented 1 year ago

ColossalAI replaces the __init__ function of every class registered under torch.nn.modules.module.Module, which acts as a registry of all nn.Module subclasses. If a custom nn.Module has not been imported before you enter ColoInitContext, some subclasses only appear while your model is being initialized (its custom components) and were never patched, so an error is raised when the context exits and checks for the _old_init attribute. Therefore, you should import any custom components at the start of your training script, before entering ColoInitContext.

Here is how the number of registered nn.Module subclasses changes as modules are imported:

>>> import torch
>>> len(torch.nn.modules.module.Module.__subclasses__())
110
>>> import transformers.activations # some custom activations
>>> len(torch.nn.modules.module.Module.__subclasses__())
122
>>> import transformers.pytorch_utils # Conv1d
>>> len(torch.nn.modules.module.Module.__subclasses__())
123
>>>

If you get an error for a component and do not know which package it comes from, you can print torch.nn.modules.module.Module.__subclasses__() to find it.
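For example, a minimal sketch of that check (the from_pretrained call and the 'path/to/chatglm-6b' path below are placeholders, substitute your own model directory):

>>> import torch
>>> before = set(torch.nn.modules.module.Module.__subclasses__())
>>> from transformers import AutoModel
>>> _ = AutoModel.from_pretrained('path/to/chatglm-6b', trust_remote_code=True)  # triggers the remote-code imports
>>> after = set(torch.nn.modules.module.Module.__subclasses__())
>>> print([cls.__name__ for cls in after - before])  # classes registered too late for ColoInitContext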

The _old_init error usually happens with dynamically imported models, especially Hugging Face models loaded with remote code (trust_remote_code=True). So the best solution is to import the modeling .py file in advance: if your model is defined in path/model.py, import it before entering the context, and torch.nn.modules.module.Module.__subclasses__() will then contain the required classes.

>>> import importlib
>>> x = importlib.import_module('path.model')
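
Applied to this issue, a rough sketch (reusing chatglm_model_path, default_dist_spec, and shard_pg from the training script above) would pre-import the cached ChatGLM module before entering ColoInitContext. The cache path and the module name 'transformers_modules.chatglm-6b.modeling_chatglm' are assumptions read off the traceback above, so check your own HF_HOME layout first:

import sys
import importlib

import torch
from transformers import AutoModel
from colossalai.utils import get_current_device
from colossalai.zero import ColoInitContext

# Assumption: transformers has already cached the remote ChatGLM code under
# <HF_HOME>/modules/transformers_modules/chatglm-6b/modeling_chatglm.py
sys.path.append('/media/cfs/zhanglezhong/cache/chatglm/modules')
importlib.import_module('transformers_modules.chatglm-6b.modeling_chatglm')

# PrefixEncoder and the other ChatGLM classes are now registered nn.Module
# subclasses, so ColoInitContext can patch and later restore their __init__.
with ColoInitContext(device=get_current_device(),
                     dtype=torch.half,
                     default_dist_spec=default_dist_spec,
                     default_pg=shard_pg):
    model = AutoModel.from_pretrained(chatglm_model_path, trust_remote_code=True)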