Closed: huashiyiqike closed this issue 1 year ago.
Training did not use 8-bit:
import sys
sys.path.append("../")
from transformers import AutoTokenizer, AutoModel, TrainingArguments, AutoConfig, BitsAndBytesConfig
import torch
import torch.nn as nn
from peft import get_peft_model, LoraConfig, TaskType

quantization_config = BitsAndBytesConfig(load_in_8bit=False, llm_int8_enable_fp32_cpu_offload=True)
device_map = {
    "lm_head": 0,
    "transformer.word_embeddings": 0,
    "transformer.layers": 0,
    "transformer.final_layernorm": 0,
}

class CastOutputToFloat(nn.Sequential):
    def forward(self, x):
        return super().forward(x).to(torch.float32)

model = AutoModel.from_pretrained(
    "THUDM/chatglm-6b",
    load_in_8bit=False,
    trust_remote_code=True,
    device_map=device_map,
    quantization_config=quantization_config,
)
model.supports_gradient_checkpointing = True
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
model.lm_head = CastOutputToFloat(model.lm_head)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
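The snippet above does not show the step that attaches the LoRA adapters, although the model dump further down shows they were applied. Based on the LoraConfig printed in that dump, the missing step presumably looked roughly like this (a sketch reconstructed from the dump, not the original code):

from peft import LoraConfig, TaskType, get_peft_model

# Values taken from the peft_config printed below; the original call is not shown in the snippet.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["query_key_value"],
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()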
Then I get an error when running inference:
with torch.no_grad():
    for idx, item in enumerate(instructions):
        feature = format_example(item)
        input_text = feature['context']
        ids = tokenizer.encode(input_text)
        input_ids = torch.LongTensor([ids])
        input_ids = input_ids.to('cuda')
        # print(input_ids)
        out = model.generate(
            input_ids=input_ids,
            max_length=4000,
            do_sample=False,
            temperature=0
        )
        out_text = tokenizer.decode(out[0])
        answer = out_text.replace(input_text, "").replace("\nEND", "").strip()
        item['infer_answer'] = answer
        print(out_text)
        print(f"### {idx+1}.Answer:\n", item.get('output'), '\n\n')
        answers.append({'index': idx, **item})
File ~/miniconda3/envs/textgen/lib/python3.10/site-packages/torch/nn/functional.py:2515, in layer_norm(input, normalized_shape, weight, bias, eps)
   2511 if has_torch_function_variadic(input, weight, bias):
   2512     return handle_torch_function(
   2513         layer_norm, (input, weight, bias), input, normalized_shape, weight=weight, bias=bias, eps=eps
   2514     )
-> 2515 return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled)
RuntimeError: expected scalar type Half but found Float
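For context, the same Half/Float clash can be reproduced with a bare LayerNorm whose weights are fp16 while its input is fp32 (a standalone sketch, independent of ChatGLM):

import torch
import torch.nn as nn

# fp16 LayerNorm weights receiving an fp32 hidden state: torch.layer_norm refuses to mix dtypes.
ln = nn.LayerNorm(4096).half().cuda()
x = torch.randn(1, 4, 4096, dtype=torch.float32, device="cuda")
ln(x)  # raises a RuntimeError about mismatched Half/Float scalar types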
print(model.__dict__)
{'training': True,
 '_parameters': OrderedDict(),
 '_buffers': OrderedDict(),
 '_non_persistent_buffers_set': set(),
 '_backward_pre_hooks': OrderedDict(),
 '_backward_hooks': OrderedDict(),
 '_is_full_backward_hook': None,
 '_forward_hooks': OrderedDict(),
 '_forward_hooks_with_kwargs': OrderedDict(),
 '_forward_pre_hooks': OrderedDict(),
 '_forward_pre_hooks_with_kwargs': OrderedDict(),
 '_state_dict_hooks': OrderedDict(),
 '_state_dict_pre_hooks': OrderedDict(),
 '_load_state_dict_pre_hooks': OrderedDict(),
 '_load_state_dict_post_hooks': OrderedDict(),
 '_modules': OrderedDict([('base_model', LoraModel(
   (model): ChatGLMForConditionalGeneration(
     (transformer): ChatGLMModel(
       (word_embeddings): Embedding(130528, 4096)
       (layers): ModuleList(
         (0-27): 28 x GLMBlock(
           (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
           (attention): SelfAttention(
             (rotary_emb): RotaryEmbedding()
             (query_key_value): Linear(
               in_features=4096, out_features=12288, bias=True
               (lora_dropout): ModuleDict(
                 (default): Dropout(p=0.1, inplace=False)
               )
               (lora_A): ModuleDict(
                 (default): Linear(in_features=4096, out_features=16, bias=False)
               )
               (lora_B): ModuleDict(
                 (default): Linear(in_features=16, out_features=12288, bias=False)
               )
             )
             (dense): Linear(in_features=4096, out_features=4096, bias=True)
           )
           (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
           (mlp): GLU(
             (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)
             (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)
           )
         )
       )
       (final_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
     )
     (lm_head): CastOutputToFloat(
       (0): Linear(in_features=4096, out_features=130528, bias=False)
     )
   )
 ))]),
 'config': ChatGLMConfig {
   "_name_or_path": "THUDM/chatglm-6b",
   "architectures": ["ChatGLMModel"],
   "auto_map": {
     "AutoConfig": "configuration_chatglm.ChatGLMConfig",
     "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration",
     "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration"
   },
   "bos_token_id": 130004,
   "eos_token_id": 130005,
   "gmask_token_id": 130001,
   "hidden_size": 4096,
   "inner_hidden_size": 16384,
   "layernorm_epsilon": 1e-05,
   "mask_token_id": 130000,
   "max_sequence_length": 2048,
   "model_type": "chatglm",
   "num_attention_heads": 32,
   "num_layers": 28,
   "pad_token_id": 3,
   "position_encoding_2d": true,
   "pre_seq_len": null,
   "prefix_projection": false,
   "quantization_bit": 0,
   "torch_dtype": "float16",
   "transformers_version": "4.27.1",
   "use_cache": false,
   "vocab_size": 130528
 },
 'modules_to_save': None,
 'peft_config': {'default': LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, base_model_name_or_path='THUDM/chatglm-6b', task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, inference_mode=False, r=16, target_modules=['query_key_value'], lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True)},
 'active_adapter': 'default',
 'peft_type': <PeftType.LORA: 'LORA'>,
 'base_model_torch_dtype': torch.float16,
 'base_model_prepare_inputs_for_generation': <bound method ChatGLMForConditionalGeneration.prepare_inputs_for_generation of ChatGLMForConditionalGeneration(... same module tree as above ...)>,
 'is_parallelizable': True,
 'model_parallel': True}
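The dump shows base_model_torch_dtype=torch.float16, so one quick way to narrow down the mismatch is to list every parameter whose dtype is not fp16 (a diagnostic sketch; with this setup the freshly created lora_A/lora_B weights would typically show up as torch.float32):

# List parameters whose dtype differs from the fp16 base model.
for name, param in model.named_parameters():
    if param.dtype != torch.float16:
        print(name, param.dtype)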
Add .half() when loading the model.
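A minimal sketch of that change, assuming the same device_map and quantization_config as in the original snippet (cast the whole model to fp16 so every submodule matches the checkpoint's Half weights):

# The suggested fix: append .half() when loading the base model.
model = AutoModel.from_pretrained(
    "THUDM/chatglm-6b",
    trust_remote_code=True,
    device_map=device_map,
    quantization_config=quantization_config,
).half()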
That solved it, thanks.
@huashiyiqike Could you share how you solved it?