artidoro / qlora

QLoRA: Efficient Finetuning of Quantized LLMs
https://arxiv.org/abs/2305.14314
MIT License
9.72k stars 798 forks

Trying to finetune guanaco-33b-merged with default params: some problems #54

Open apachemycat opened 1 year ago

apachemycat commented 1 year ago

loading base model /models/guanaco-33b-merged...
Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████| 7/7 [01:12<00:00, 10.30s/it]
adding LoRA modules...

/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1614 in __getattr__

    modules = self.__dict__['_modules']
    if name in modules:
        return modules[name]
❱   raise AttributeError("'{}' object has no attribute '{}'".format(
        type(self).__name__, name))

AttributeError: 'CastOutputToFloat' object has no attribute 'weight'

The command used was:

root@5e0ba28fefc9:/wzh/qlora# CUDA_VISIBLE_DEVICES=0 PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:24 sh scripts/finetune.sh

apachemycat commented 1 year ago

When I remove the following code the error goes away, but I don't know whether that is correct:

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    args.model_name_or_path,
    cache_dir=args.cache_dir,
    padding_side="right",
    use_fast=True,
)

if tokenizer.pad_token is None:
    smart_tokenizer_and_embedding_resize(
        special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
        tokenizer=tokenizer,
        model=model,
    )

apachemycat commented 1 year ago

TypeError: pad_sequence(): argument 'padding_value' (position 3) must be float, not NoneType

/wzh/qlora/qlora.py:417 in __call__

        else:
            input_ids.append(torch.tensor(tokenized_source))
    # Apply padding
❱   input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.tokeniz…
    labels = pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX) if n…
    data_dict = {
        'input_ids': input_ids,

/usr/local/lib/python3.8/dist-packages/torch/nn/utils/rnn.py:399 in pad_sequence

    # assuming trailing dimensions and type of all the Tensors
    # in sequences are same and fetching those from sequences[0]
❱   return torch._C._nn.pad_sequence(sequences, batch_first, padding_value)
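For reference, this TypeError comes straight from torch's pad_sequence whenever padding_value is None, which is what tokenizer.pad_token_id ends up being once the pad-token setup is removed. A minimal standalone sketch (illustration only, not the qlora.py collator):

import torch
from torch.nn.utils.rnn import pad_sequence

# Two variable-length sequences, like the ones the collator builds.
seqs = [torch.tensor([1, 2, 3]), torch.tensor([4, 5])]

# Passing padding_value=None (i.e. a missing pad_token_id) raises exactly the
# TypeError above; a numeric id such as a real pad token id works fine.
padded = pad_sequence(seqs, batch_first=True, padding_value=0)
print(padded)  # tensor([[1, 2, 3], [4, 5, 0]])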

apachemycat commented 1 year ago

If I change the code to the following:

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.unk_token
    smart_tokenizer_and_embedding_resize(
        special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
        tokenizer=tokenizer,
        model=model,
    )

then I get "maximum recursion depth exceeded while getting the str of an object":

/usr/local/lib/python3.8/dist-packages/transformers/tokenization_utils_fast.py:257 in _convert_token_to_id_with_added_voc
❱   return self.unk_token_id

/usr/local/lib/python3.8/dist-packages/transformers/tokenization_utils_base.py:1142 in unk_token_id
❱   return self.convert_tokens_to_ids(self.unk_token)

/usr/local/lib/python3.8/dist-packages/transformers/tokenization_utils_fast.py:250 in convert_tokens_to_ids
❱   return self._convert_token_to_id_with_added_voc(tokens)

(these three frames repeat until the recursion limit is reached)

/usr/local/lib/python3.8/dist-packages/transformers/tokenization_utils_base.py:1022 in unk_token
❱   return str(self._unk_token)

RecursionError: maximum recursion depth exceeded while getting the str of an object
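The traceback is a cycle: _convert_token_to_id_with_added_voc falls back to unk_token_id, which calls convert_tokens_to_ids, which calls _convert_token_to_id_with_added_voc again. A stripped-down mock of that cycle (hypothetical illustration, not the transformers source):

class MockFastTokenizer:
    # Mock only: when token_to_id() cannot resolve the unk token either,
    # each lookup falls back to the other and the pair recurses forever.
    unk_token = "<unk>"

    def token_to_id(self, token):
        return None  # neither the pad token nor the unk token resolves

    def convert_tokens_to_ids(self, token):
        index = self.token_to_id(token)
        if index is None:
            return self.unk_token_id   # fall back to the unk id ...
        return index

    @property
    def unk_token_id(self):
        # ... which looks the unk token up again via convert_tokens_to_ids
        return self.convert_tokens_to_ids(self.unk_token)


MockFastTokenizer().convert_tokens_to_ids("[PAD]")  # RecursionError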

apachemycat commented 1 year ago

If I change the code to the following:

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.unk_token
    tokenizer.add_special_tokens(dict(pad_token=DEFAULT_PAD_TOKEN))  # newly added line
    # smart_tokenizer_and_embedding_resize(
    #     special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
    #     tokenizer=tokenizer,
    #     model=model,
    # )

then the error is:

../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [68,0,0], thread: [26,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
(the same assertion fires for threads [27,0,0] through [31,0,0])

/wzh/qlora/qlora.py:724 in train
❱   train_result = trainer.train(resume_from_checkpoint=checkpoint_dir)

/usr/local/lib/python3.8/dist-packages/transformers/trainer.py:1696 in train
/usr/local/lib/python3.8/dist-packages/transformers/trainer.py:1973 in _inner_training_loop
/usr/local/lib/python3.8/dist-packages/transformers/trainer.py:2787 in training_step
/usr/local/lib/python3.8/dist-packages/transformers/trainer.py:2819 in compute_loss
❱   outputs = model(**inputs)

/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1501 in _call_impl
❱   return forward_call(*args, **kwargs)

/usr/local/lib/python3.8/dist-packages/peft/peft_model.py:575 in forward
❱   return self.base_model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        inputs_embeds=inputs_embeds,
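This indexSelectLargeIndex assertion generally means some input id is out of range for the embedding matrix, e.g. the newly added [PAD] id is known to the tokenizer but the model's embeddings were never resized. A hedged sanity check one could run on a batch before training; check_batch_ids is a hypothetical helper, not a name from qlora.py:

def check_batch_ids(model, batch):
    # Sketch only, assuming a standard transformers causal LM: every token id
    # in the batch must be smaller than the embedding table size.
    vocab_size = model.get_input_embeddings().weight.shape[0]
    max_id = int(batch["input_ids"].max())
    assert max_id < vocab_size, (
        f"token id {max_id} is out of range for embedding size {vocab_size}; "
        "the tokenizer likely gained a token the model was never resized for"
    )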

hemangjoshi37a commented 1 year ago

Dear @apachemycat,

Thank you for bringing this issue to our attention. We understand that you are encountering some problems while trying to finetune guanaco-33b-merged with default parameters. We apologize for the inconvenience caused, and we would be happy to assist you with a solution.

Based on the error messages you shared, it seems that there are some attribute errors and recursion depth exceeded errors related to the tokenizer and padding in your code. Here is a possible solution to address these issues:

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    args.model_name_or_path,
    cache_dir=args.cache_dir,
    padding_side="right",
    use_fast=True,
)

if tokenizer.pad_token_id is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Rest of your code...

By explicitly adding the [PAD] token as a special token and assigning it to the pad_token, we ensure that the tokenizer has a proper padding token. This should resolve the attribute errors and recursion depth exceeded errors related to padding.
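One caveat, offered as an assumption rather than a verified fix: if add_special_tokens actually grows the vocabulary, the model's embedding matrix needs to grow with it, roughly:

# Sketch only: resize the embeddings when a genuinely new token was added.
num_added = tokenizer.add_special_tokens({'pad_token': '[PAD]'})
if num_added > 0:
    model.resize_token_embeddings(len(tokenizer))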

Please give this solution a try and let us know if it resolves the issue for you. If you encounter any further problems or have any additional questions, please don't hesitate to ask. We are here to help!

Best regards, @hemangjoshi37a

apachemycat commented 1 year ago

The code already has the [PAD] logic, as follows:

if tokenizer.pad_token is None:
    smart_tokenizer_and_embedding_resize(
        special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
        tokenizer=tokenizer,
        model=model,
    )

and smart_tokenizer_and_embedding_resize does:

print("token ..." + str(special_tokens_dict))
num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

Runtime output:

loaded model
Using pad_token, but it is not set yet.
token ...{'pad_token': '[PAD]'}


But then another error, AttributeError: 'CastOutputToFloat' object has no attribute 'weight':

/usr/local/lib/python3.8/dist-packages/transformers/modeling_utils.py:1384 in resize_token_embeddings
❱   model_embeds = self._resize_token_embeddings(new_num_tokens)

/usr/local/lib/python3.8/dist-packages/transformers/modeling_utils.py:1405 in _resize_token_embeddings

    # if word embeddings are not tied, make sure that lm head is resized as well
    if self.get_output_embeddings() is not None and not self.config.tie_word_embeddi…
        old_lm_head = self.get_output_embeddings()
❱       new_lm_head = self._get_resized_lm_head(old_lm_head, new_num_tokens)

/usr/local/lib/python3.8/dist-packages/transformers/modeling_utils.py:1509 in _get_resized_lm_head

    old_num_tokens, old_lm_head_dim = (
❱       old_lm_head.weight.size() if not transposed else old_lm_head.weight.t().…
    )

/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1614 in __getattr__

❱   raise AttributeError("'{}' object has no attribute '{}'".format(
        type(self).__name__, name))

AttributeError: 'CastOutputToFloat' object has no attribute 'weight'
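For what it's worth, the AttributeError can be reproduced outside the script under one assumption about the setup code: that the LM head has been wrapped in a CastOutputToFloat(nn.Sequential) wrapper (as in the common int8/LoRA examples), so get_output_embeddings() returns the wrapper, and the wrapper itself has no .weight:

import torch.nn as nn

class CastOutputToFloat(nn.Sequential):
    # Assumed definition, matching the usual int8-training examples:
    # wrap lm_head and upcast its output to float32.
    def forward(self, x):
        return super().forward(x).float()

lm_head = nn.Linear(16, 32, bias=False)
wrapped = CastOutputToFloat(lm_head)

print(hasattr(wrapped, "weight"))  # False -> the AttributeError above
print(wrapped[0].weight.shape)     # the real weight lives on the inner module

If that assumption holds, resizing the token embeddings before wrapping the output head (or temporarily unwrapping it around the resize) would sidestep the error.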

apachemycat commented 1 year ago

I can't resolve this problem... help.

bestfleer commented 1 year ago

I also ran into the same issue, "AttributeError: 'CastOutputToFloat' object has no attribute 'weight'". The pretrained LLM I used is llama-7b-hf.