RuolinZheng08 / twewy-discord-chatbot

Discord AI Chatbot using DialoGPT, trained on the game transcript of The World Ends With You
https://www.freecodecamp.org/news/discord-ai-chatbot/
MIT License

Main throws "TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]" #10

Closed: winterClover closed this issue 2 years ago

winterClover commented 2 years ago

When running the main function, I get the error above. Full log and traceback:

01/27/2022 19:54:30 - WARNING - __main__ -   Process rank: -1, device: cuda, n_gpu: 1, distributed training: False, 16-bits training: False
/usr/local/lib/python3.7/dist-packages/transformers/models/auto/modeling_auto.py:807: FutureWarning: The class `AutoModelWithLMHead` is deprecated and will be removed in a future version. Please use `AutoModelForCausalLM` for causal language models, `AutoModelForMaskedLM` for masked language models and `AutoModelForSeq2SeqLM` for encoder-decoder models.
  FutureWarning,
01/27/2022 19:54:37 - INFO - __main__ -   Training/evaluation parameters <__main__.Args object at 0x7f71d694fad0>
01/27/2022 19:54:37 - INFO - __main__ -   Creating features from dataset file at cached

---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-19-523c0d2a27d3> in <module>()
----> 1 main(trn_df, val_df)

10 frames

<ipython-input-18-aa20b6fc78bc> in main(df_trn, df_val)
     61     # Training
     62     if args.do_train:
---> 63         train_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False)
     64 
     65         global_step, tr_loss = train(args, train_dataset, model, tokenizer)

<ipython-input-13-67f62bb60333> in load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate)
      2 
      3 def load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False):
----> 4     return ConversationDataset(tokenizer, args, df_val if evaluate else df_trn)
      5 
      6 

<ipython-input-12-a654172287f5> in __init__(self, tokenizer, args, df, block_size)
     25             self.examples = []
     26             for _, row in df.iterrows():
---> 27                 conv = construct_conv(row, tokenizer)
     28                 self.examples.append(conv)
     29 

<ipython-input-12-a654172287f5> in construct_conv(row, tokenizer, eos)
      2 def construct_conv(row, tokenizer, eos = True):
      3     flatten = lambda l: [item for sublist in l for item in sublist]
----> 4     conv = list(reversed([tokenizer.encode(x) + [tokenizer.eos_token_id] for x in row]))
      5     conv = flatten(conv)
      6     return conv

<ipython-input-12-a654172287f5> in <listcomp>(.0)
      2 def construct_conv(row, tokenizer, eos = True):
      3     flatten = lambda l: [item for sublist in l for item in sublist]
----> 4     conv = list(reversed([tokenizer.encode(x) + [tokenizer.eos_token_id] for x in row]))
      5     conv = flatten(conv)
      6     return conv

/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_base.py in encode(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, return_tensors, **kwargs)
   2213             stride=stride,
   2214             return_tensors=return_tensors,
-> 2215             **kwargs,
   2216         )
   2217 

/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_base.py in encode_plus(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)
   2552             return_length=return_length,
   2553             verbose=verbose,
-> 2554             **kwargs,
   2555         )
   2556 

/usr/local/lib/python3.7/dist-packages/transformers/models/gpt2/tokenization_gpt2_fast.py in _encode_plus(self, *args, **kwargs)
    172         )
    173 
--> 174         return super()._encode_plus(*args, **kwargs)
    175 
    176     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:

/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_fast.py in _encode_plus(self, text, text_pair, add_special_tokens, padding_strategy, truncation_strategy, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)
    512             return_length=return_length,
    513             verbose=verbose,
--> 514             **kwargs,
    515         )
    516 

/usr/local/lib/python3.7/dist-packages/transformers/models/gpt2/tokenization_gpt2_fast.py in _batch_encode_plus(self, *args, **kwargs)
    162         )
    163 
--> 164         return super()._batch_encode_plus(*args, **kwargs)
    165 
    166     def _encode_plus(self, *args, **kwargs) -> BatchEncoding:

/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_fast.py in _batch_encode_plus(self, batch_text_or_text_pairs, add_special_tokens, padding_strategy, truncation_strategy, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose)
    425             batch_text_or_text_pairs,
    426             add_special_tokens=add_special_tokens,
--> 427             is_pretokenized=is_split_into_words,
    428         )
    429 

TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]
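This TypeError is raised by the Rust tokenizers backend: a fast tokenizer's encode only accepts a string (or a pair of strings), so the call fails if one of the DataFrame cells fed into construct_conv is not a string, most commonly a NaN, which pandas stores as a float. A minimal defensive sketch, assuming the row values may contain NaN (the isinstance filter is added here for illustration and is not part of the original notebook):

def construct_conv(row, tokenizer, eos=True):
    flatten = lambda l: [item for sublist in l for item in sublist]
    # Fast tokenizers reject non-string input, so keep only real strings;
    # NaN cells (floats) are the usual trigger for this TypeError.
    turns = [x for x in row if isinstance(x, str)]
    conv = list(reversed([tokenizer.encode(x) + [tokenizer.eos_token_id] for x in turns]))
    return flatten(conv)

Alternatively, dropping incomplete rows before training (for example, calling dropna() on trn_df and val_df) avoids the problem at the data-preparation step.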