infinitylogesh / mutate

A library to synthesize text datasets using Large Language Models (LLM)

How to train the model by prompting? #2

Open liyang619 opened 2 years ago

liyang619 commented 2 years ago

Hi, can I train the model and update its parameters with Mutate's prompting code?

infinitylogesh commented 2 years ago

Hi, unfortunately Mutate currently doesn't support training. However, if you wish to fine-tune your LLM, following the steps in this repo would probably help: https://github.com/Xirider/finetune-gpt2xl
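
For anyone landing here looking for a starting point, below is a minimal causal-LM fine-tuning sketch using the Hugging Face Trainer. It is only an illustration, not the exact finetune-gpt2xl script; the model name, data file and hyperparameters are placeholders.

    # Generic causal-LM fine-tuning sketch (illustrative; not the linked repo's script).
    from datasets import load_dataset
    from transformers import (AutoModelForCausalLM, AutoTokenizer,
                              DataCollatorForLanguageModeling, Trainer,
                              TrainingArguments)

    model_name = "gpt2"  # placeholder; swap in the LLM you want to fine-tune
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token  # GPT-2-style models have no pad token
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # "train.txt" is a placeholder path: one training document per line.
    dataset = load_dataset("text", data_files={"train": "train.txt"})["train"]
    dataset = dataset.map(
        lambda batch: tokenizer(batch["text"], truncation=True, max_length=512),
        batched=True,
        remove_columns=["text"],
    )

    trainer = Trainer(
        model=model,
        args=TrainingArguments(output_dir="finetuned",
                               num_train_epochs=1,
                               per_device_train_batch_size=2),
        train_dataset=dataset,
        # mlm=False -> standard next-token (causal) language-modeling objective
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )
    trainer.train()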

EricPeter commented 7 months ago
From this code:

    from mutate import pipeline

    pipe = pipeline("text-classification-synthesis",
                    model="EleutherAI/gpt-neo-2.7B")

    task_desc = "Each item in the following contains Swahili news texts that are classified as Kitaifa (National), Kimataifa (International), Biashara (Business), Michezo (Sports) and Burudani (Entertainment)"

    # returns a python generator
    text_synth_gen = pipe("csv",
                          data_files=["Train.csv"],
                          task_desc=task_desc,
                          text_column="content",
                          label_column="category",
                          text_column_alias="News",
                          label_column_alias="Category",
                          shot_count=5,
                          class_names=['Kimataifa', 'Burudani'])

    # Loop through the generator to synthesize examples by class
    for synthesized_examples in text_synth_gen:
        print(synthesized_examples)

I get this error:

Token indices sequence length is longer than the specified maximum sequence length for this model (3341 > 2048). Running this sequence through the model will result in indexing errors
/home/ubuntu/transformers/src/transformers/generation/configuration_utils.py:422: UserWarning: num_beams is set to 1. However, early_stopping is set to True -- this flag is only used in beam-based generation modes. You should set num_beams>1 or unset early_stopping.
  warnings.warn(
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set padding_side='left' when initializing the tokenizer.
/home/ubuntu/transformers/src/transformers/generation/utils.py:1362: UserWarning: Input length of input_ids is 3341, but max_length is set to 300. This can lead to unexpected behavior. You should consider increasing max_new_tokens.
  warnings.warn(
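
The first warning already points at the cause: EleutherAI/gpt-neo-2.7B has a 2048-token context window (max_position_embeddings), while the assembled few-shot prompt is 3341 tokens. The sketch below only approximates the prompt that mutate builds internally from Train.csv, so treat it as a rough check rather than the library's own logic.

    # Rough, illustrative check of why the prompt overflows GPT-Neo's context window.
    import pandas as pd
    from transformers import AutoConfig, AutoTokenizer

    model_name = "EleutherAI/gpt-neo-2.7B"
    max_ctx = AutoConfig.from_pretrained(model_name).max_position_embeddings  # 2048
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    df = pd.read_csv("Train.csv")
    shot_count = 5
    sample_prompt = "\n".join(df["content"].astype(str).head(shot_count))
    n_tokens = len(tokenizer(sample_prompt)["input_ids"])
    print(f"~{n_tokens} tokens from {shot_count} example texts vs. a {max_ctx}-token limit")
    # If n_tokens is anywhere near max_ctx, lower shot_count or shorten the texts
    # before handing the CSV to the pipeline.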

IndexError                                Traceback (most recent call last)
Cell In[85], line 21
     10 text_synth_gen = pipe("csv",
     11                     data_files=["Train.csv"],
     12                     task_desc=task_desc,
   (...)
     17                     shot_count=5,
     18                     class_names=['Kimataifa', 'Burudani'])
     20 #Loop through the generator to synthesize examples by class
---> 21 for synthesized_examples in text_synth_gen:
     22     print(synthesized_examples)

File ~/.local/lib/python3.8/site-packages/mutate/pipelines/text_classification.py:200, in TextClassificationSynthesize.__call__(self, dataset_path, text_column, label_column, task_desc, split, data_files, data_dir, text_column_alias, label_column_alias, class_names, dataset_args, dataset_kwargs, batch_size, shot_count, infinite_loop, **kwargs)
    198 batch_parsed_examples = []
    199 batch_class_names = []
--> 200 batch_generated_texts = self.infer.run_single_batch(
    201     batch, generate_args=self.generate_kwargs
    202 )
    204 num_return_sequences = len(batch_generated_texts) // len(batch["input_ids"])
    206 for idx, generated_text in enumerate(batch_generated_texts):

File ~/.local/lib/python3.8/site-packages/mutate/infer.py:52, in TextGeneration.run_single_batch(self, batch, is_include_prompt_in_generation, ignore_prompt_last_line, generate_args)
     44 def run_single_batch(
     45     self,
     46     batch,
   (...)
     49     generate_args: Optional[Dict[str, str]] = None,
     50 ):
     51     batch_size = len(batch["input_ids"])
---> 52     generated_sequences = self.model.generate(
     53         batch["input_ids"], attention_mask=batch["attention_mask"], **generate_args
     54     )
     55     generated_sequences = generated_sequences.cpu()
     56     num_return_sequences = len(generated_sequences) // batch_size

File /usr/local/lib/python3.8/dist-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
    112 @functools.wraps(func)
    113 def decorate_context(*args, **kwargs):
    114     with ctx_factory():
--> 115         return func(*args, **kwargs)

File ~/transformers/src/transformers/generation/utils.py:1763, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)
   1755 input_ids, model_kwargs = self._expand_inputs_for_generation(
   1756     input_ids=input_ids,
   1757     expand_size=generation_config.num_return_sequences,
   1758     is_encoder_decoder=self.config.is_encoder_decoder,
   1759     **model_kwargs,
   1760 )
   1762 # 13. run sample
-> 1763 return self.sample(
   1764     input_ids,
   1765     logits_processor=logits_processor,
   1766     logits_warper=logits_warper,
   1767     stopping_criteria=stopping_criteria,
   1768     pad_token_id=generation_config.pad_token_id,
   1769     eos_token_id=generation_config.eos_token_id,
   1770     output_scores=generation_config.output_scores,
   1771     return_dict_in_generate=generation_config.return_dict_in_generate,
   1772     synced_gpus=synced_gpus,
   1773     streamer=streamer,
   1774     **model_kwargs,
   1775 )
   1777 elif generation_mode == GenerationMode.BEAM_SEARCH:
   1778     # 11. prepare beam search scorer
   1779     beam_scorer = BeamSearchScorer(
   1780         batch_size=batch_size,
   1781         num_beams=generation_config.num_beams,
   (...)
   1786         max_length=generation_config.max_length,
   1787     )

File ~/transformers/src/transformers/generation/utils.py:2860, in GenerationMixin.sample(self, input_ids, logits_processor, stopping_criteria, logits_warper, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, synced_gpus, streamer, **model_kwargs)
   2857 model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
   2859 # forward pass to get next token
-> 2860 outputs = self(
   2861     **model_inputs,
   2862     return_dict=True,
   2863     output_attentions=output_attentions,
   2864     output_hidden_states=output_hidden_states,
   2865 )
   2867 if synced_gpus and this_peer_finished:
   2868     continue  # don't waste resources running the code we don't need

File /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
   1516     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1517 else:
-> 1518     return self._call_impl(*args, **kwargs)

File /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
   1522 # If we don't have any hooks, we want to skip the rest of the logic in
   1523 # this function, and just call forward.
   1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1525         or _global_backward_pre_hooks or _global_backward_hooks
   1526         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527     return forward_call(*args, **kwargs)
   1529 try:
   1530     result = None

File ~/transformers/src/transformers/models/gpt_neo/modeling_gpt_neo.py:954, in GPTNeoForCausalLM.forward(self, input_ids, past_key_values, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict)
    946 r"""
    947 labels (torch.LongTensor of shape (batch_size, sequence_length), optional):
    948     Labels for language modeling. Note that the labels are shifted inside the model, i.e. you can set
    949     labels = input_ids Indices are selected in [-100, 0, ..., config.vocab_size] All labels set to -100
    950     are ignored (masked), the loss is only computed for labels in [0, ..., config.vocab_size]
    951 """
    952 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
--> 954 transformer_outputs = self.transformer(
    955     input_ids,
    956     past_key_values=past_key_values,
    957     attention_mask=attention_mask,
    958     token_type_ids=token_type_ids,
    959     position_ids=position_ids,
    960     head_mask=head_mask,
    961     inputs_embeds=inputs_embeds,
    962     use_cache=use_cache,
    963     output_attentions=output_attentions,
    964     output_hidden_states=output_hidden_states,
    965     return_dict=return_dict,
    966 )
    967 hidden_states = transformer_outputs[0]
    969 lm_logits = self.lm_head(hidden_states)

File /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
   1516     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1517 else:
-> 1518     return self._call_impl(*args, **kwargs)

File /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
   1522 # If we don't have any hooks, we want to skip the rest of the logic in
   1523 # this function, and just call forward.
   1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1525         or _global_backward_pre_hooks or _global_backward_hooks
   1526         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527     return forward_call(*args, **kwargs)
   1529 try:
   1530     result = None

File ~/transformers/src/transformers/models/gpt_neo/modeling_gpt_neo.py:778, in GPTNeoModel.forward(self, input_ids, past_key_values, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict)
    776 if inputs_embeds is None:
    777     inputs_embeds = self.wte(input_ids)
--> 778 position_embeds = self.wpe(position_ids)
    779 hidden_states = inputs_embeds + position_embeds
    781 # Attention mask.

File /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
   1516     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1517 else:
-> 1518     return self._call_impl(*args, **kwargs)

File /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
   1522 # If we don't have any hooks, we want to skip the rest of the logic in
   1523 # this function, and just call forward.
   1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1525         or _global_backward_pre_hooks or _global_backward_hooks
   1526         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527     return forward_call(*args, **kwargs)
   1529 try:
   1530     result = None

File /usr/local/lib/python3.8/dist-packages/torch/nn/modules/sparse.py:162, in Embedding.forward(self, input)
    161 def forward(self, input: Tensor) -> Tensor:
--> 162     return F.embedding(
    163         input, self.weight, self.padding_idx, self.max_norm,
    164         self.norm_type, self.scale_grad_by_freq, self.sparse)

File /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:2233, in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
   2227     # Note [embedding_renorm set_grad_enabled]
   2228     # XXX: equivalent to
   2229     # with torch.no_grad():
   2230     #   torch.embedding_renorm_
   2231     # remove once script supports set_grad_enabled
   2232     _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2233 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)

IndexError: index out of range in self
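
The last frame is the position-embedding lookup: GPT-Neo's wpe table has max_position_embeddings = 2048 rows, so once the prompt reaches 3341 tokens the position ids run past the end of the table and the lookup fails with exactly this error. A tiny standalone reproduction (nothing mutate-specific):

    # Minimal reproduction of the failing position-embedding lookup (illustrative).
    import torch
    import torch.nn as nn

    max_positions = 2048                   # GPT-Neo's max_position_embeddings
    wpe = nn.Embedding(max_positions, 64)  # stand-in for the model's wpe table

    position_ids = torch.arange(3341)      # prompt length from the warning above
    _ = wpe(position_ids[:max_positions])  # fine: positions 0..2047
    _ = wpe(position_ids)                  # IndexError: index out of range in self

So the fix is on the prompt side: reduce shot_count, or shorten/truncate the news texts, so that the assembled few-shot prompt plus the generation budget stays under 2048 tokens.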