PaddlePaddle / PaddleSpeech

Easy-to-use Speech Toolkit including Self-Supervised Learning model, SOTA/Streaming ASR with punctuation, Streaming TTS with text frontend, Speaker Verification System, End-to-End Speech Translation and Keyword Spotting. Won NAACL2022 Best Demo Award.
https://paddlespeech.readthedocs.io
Apache License 2.0

[S2T] Whisper ASR Model execution got TypeError #2819

Closed cxumol closed 1 year ago

cxumol commented 1 year ago

Describe the bug

Running the PaddleSpeech Whisper ASR executor on a (long) audio file raises a `TypeError` during token decoding:

Traceback (most recent call last)
/tmp/ipykernel_98/3684188953.py in <module>
     10     audio_file=audio_file,
     11     language='ja',
---> 12     device=paddle.get_device())
~/external-libraries/paddlespeech/cli/utils.py in _warpper(self, *args, **kwargs)
    326         except Exception:
    327             pass
--> 328         return executor_func(self, *args, **kwargs)
    329 
    330     return _warpper
~/external-libraries/paddlespeech/cli/whisper/infer.py in __call__(self, audio_file, model, lang, task, size, language, sample_rate, config, ckpt_path, decode_method, num_decoding_left_chunks, force_yes, rtf, device)
    482 
    483         self.preprocess(model, audio_file)
--> 484         self.infer(model)
    485         res = self.postprocess()  # Retrieve result of asr.
    486 
<decorator-gen-695> in infer(self, model_type)
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/dygraph/base.py in _decorate_function(func, *args, **kwargs)
    373         def _decorate_function(func, *args, **kwargs):
    374             with self:
--> 375                 return func(*args, **kwargs)
    376 
    377         @decorator.decorator
~/external-libraries/paddlespeech/cli/whisper/infer.py in infer(self, model_type)
    293             initial_prompt=cfg.initial_prompt,
    294             condition_on_previous_text=cfg.condition_on_previous_text,
--> 295             no_speech_threshold=cfg.no_speech_threshold)
    296 
    297     def postprocess(self) -> Union[str, os.PathLike]:
~/external-libraries/paddlespeech/s2t/models/whisper/whipser.py in transcribe(model, mel, resource_path, verbose, temperature, compression_ratio_threshold, logprob_threshold, no_speech_threshold, condition_on_previous_text, **decode_options)
    623                         time_precision,
    624                         text_tokens=sliced_tokens[1:-1],
--> 625                         result=result, )
    626                     last_slice = current_slice
    627                 last_timestamp_position = (
~/external-libraries/paddlespeech/s2t/models/whisper/whipser.py in add_segment(start, end, text_tokens, result)
    552                     result: DecodingResult):
    553         text = tokenizer.decode(
--> 554             [token for token in text_tokens if token < tokenizer.eot])
    555         if len(text.strip()) == 0:  # skip empty text output
    556             return
~/external-libraries/paddlespeech/s2t/models/whisper/tokenizer.py in decode(self, token_ids, **kwargs)
    157             token_ids = ids_list
    158 
--> 159         return self.tokenizer.decode(token_ids, **kwargs)
    160 
    161     def decode_with_timestamps(self, tokens) -> str:
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddlenlp/transformers/tokenizer_utils_base.py in decode(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, **kwargs)
   3156             skip_special_tokens=skip_special_tokens,
   3157             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-> 3158             **kwargs,
   3159         )
   3160 
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddlenlp/transformers/tokenizer_utils.py in _decode(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, spaces_between_special_tokens, **kwargs)
   1404 
   1405         filtered_tokens = self.convert_ids_to_tokens(
-> 1406             token_ids, skip_special_tokens=skip_special_tokens)
   1407 
   1408         # To avoid mixing byte-level and unicode for byte-level BPT
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddlenlp/transformers/tokenizer_utils.py in convert_ids_to_tokens(self, ids, skip_special_tokens)
    837         tokens = []
    838         for index in ids:
--> 839             index = int(index)
    840             if skip_special_tokens and index in self.all_special_ids:
    841                 continue
TypeError: int() argument must be a string, a bytes-like object or a number, not 'list'
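The final frame shows `int(index)` failing because one element of the token-id sequence passed down to `convert_ids_to_tokens` is itself a list rather than an int. A minimal standalone sketch (hypothetical helper names, not PaddleSpeech code) reproduces the failure mode and shows a defensive flatten that would sidestep it:

```python
def convert_ids_to_tokens(ids):
    # Mirrors the paddlenlp loop: every element must be coercible with int().
    # A nested list element raises the TypeError seen in the traceback.
    return [int(i) for i in ids]

def flatten_ids(ids):
    """Hypothetical workaround: recursively flatten nested token-id lists
    into a flat list of ints before handing them to the tokenizer."""
    out = []
    for i in ids:
        if isinstance(i, (list, tuple)):
            out.extend(flatten_ids(i))
        else:
            out.append(int(i))
    return out

convert_ids_to_tokens([50364, 1029, 50464])      # fine: flat list of ints
try:
    convert_ids_to_tokens([50364, [1029, 291]])  # nested list -> TypeError
except TypeError as e:
    print(e)

flatten_ids([50364, [1029, 291], 50464])         # flat again after flattening
```

This only illustrates the failure shape; the real fix belongs wherever `text_tokens` picks up the nested element.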

To Reproduce

import paddle
from paddlespeech.cli.whisper import WhisperExecutor

audio_file = 'audio.wav'
whisper_executor = WhisperExecutor()
result = whisper_executor(
    model='whisper',
    task='transcribe',
    size='medium',
    sample_rate=16000,
    config=None,  # Set `config` and `ckpt_path` to None to use the pretrained model.
    ckpt_path=None,
    audio_file=audio_file,
    language='ja',
    device=paddle.get_device())

Whenever I hit this problem, the audio being transcribed was fairly long (e.g. 100 s; I had manually raised the 50-second `self.max_len` limit). My blind guess is that it is related to audio length or GPU memory, but when GPU memory really runs out the CUDA runtime reports an out-of-memory error directly, so that explanation doesn't quite fit either.


@zxcd

zxcd commented 1 year ago

The model currently cannot handle very long audio, and manually modifying the `self.max_len` parameter is not recommended. If the audio is too long, we suggest splitting it with a VAD (voice activity detection) tool first.
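The suggestion above can be sketched with a simple energy-based splitter. This is a stand-in for a real VAD library (e.g. webrtcvad), not PaddleSpeech code; the function name, threshold, and frame size are all assumptions. Each returned `(start, end)` sample span stays under the 50-second limit and can be written to its own WAV file and transcribed separately:

```python
import numpy as np

def split_on_silence(samples, sr, frame_ms=30, energy_thresh=1e-4, max_len_s=50):
    """Split a mono float waveform into voiced segments by per-frame RMS energy.

    Returns a list of (start, end) sample indices. Segments longer than
    max_len_s seconds are further chopped to respect the model's length limit.
    """
    frame = int(sr * frame_ms / 1000)
    n = len(samples) // frame
    energies = np.array(
        [np.mean(samples[i * frame:(i + 1) * frame] ** 2) for i in range(n)])
    voiced = energies > energy_thresh

    segments, start = [], None
    for i, v in enumerate(voiced):
        if v and start is None:
            start = i                       # segment opens on first voiced frame
        elif not v and start is not None:
            segments.append((start * frame, i * frame))
            start = None
    if start is not None:                   # audio ends while still voiced
        segments.append((start * frame, n * frame))

    capped = []
    for s, e in segments:                   # enforce the per-segment length cap
        while (e - s) > max_len_s * sr:
            capped.append((s, s + max_len_s * sr))
            s += max_len_s * sr
        capped.append((s, e))
    return capped
```

Each chunk can then be saved (e.g. with `soundfile.write`) and passed to the executor one at a time, which keeps every inference call within the supported length.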