PaddlePaddle / PaddleSpeech

Easy-to-use Speech Toolkit including Self-Supervised Learning model, SOTA/Streaming ASR with punctuation, Streaming TTS with text frontend, Speaker Verification System, End-to-End Speech Translation and Keyword Spotting. Won NAACL2022 Best Demo Award.
https://paddlespeech.readthedocs.io
Apache License 2.0

[S2T] [ASRExecutor] list index out of range || The size of SequenceLength has to equal the batch_size #3416

Open Scisaga opened 11 months ago

Scisaga commented 11 months ago

python==3.10 paddlepaddle==2.5.0 paddlespeech==1.4.1
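For reference, here is the notebook cell from the tracebacks below, consolidated into one runnable script (assuming `1.wav` is a 16 kHz, 16-bit, mono recording shorter than 200 s):

```python
# Minimal reproduction, reconstructed from the notebook cells in the
# tracebacks below. Assumes 1.wav is a 16 kHz, 16-bit, mono recording
# shorter than 200 s.
from paddlespeech.cli.asr.infer import ASRExecutor

asr = ASRExecutor()

# Error 1: default model (conformer_u2pp_online_wenetspeech)
result = asr(audio_file='1.wav', force_yes=True)
print(asr._outputs)

# Error 2: deepspeech2online_wenetspeech with rtf=True
result = asr(
    audio_file='1.wav',
    force_yes=True,
    rtf=True,
    model='deepspeech2online_wenetspeech',
)
print(asr._outputs)
```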

Error message 1:

```
KeyError                                  Traceback (most recent call last)
Cell In[4], line 11
      4 # Read wav
      5 # Format requirement: 16 kHz, 16-bit, 1 channel
      6 # Audio duration < 200 s
      7 # default_model = conformer_u2pp_online_wenetspeech
      8 # better_model = deepspeech2online_wenetspeech
      9 asr = ASRExecutor()
---> 11 result = asr(
     12     audio_file='1.wav', 
     13     force_yes=True
     14 )
     16 print(asr._outputs)
     18 # Add punctuation to the output text
     19 # Use ernie_linear_p3_wudao for better results

File /opt/conda/lib/python3.10/site-packages/paddlespeech/cli/utils.py:328, in stats_wrapper.<locals>._warpper(self, *args, **kwargs)
    326 except Exception:
    327     pass
--> 328 return executor_func(self, *args, **kwargs)

File /opt/conda/lib/python3.10/site-packages/paddlespeech/cli/asr/infer.py:512, in ASRExecutor.__call__(self, audio_file, model, lang, codeswitch, sample_rate, config, ckpt_path, decode_method, num_decoding_left_chunks, force_yes, rtf, device)
    510 self.preprocess(model, audio_file)
    511 self.infer(model)
--> 512 res = self.postprocess()  # Retrieve result of asr.
    514 if rtf:
    515     CLI_TIMER[k]['end'].append(time.time())

File /opt/conda/lib/python3.10/site-packages/paddlespeech/cli/asr/infer.py:335, in ASRExecutor.postprocess(self)
    331 def postprocess(self) -> Union[str, os.PathLike]:
    332     """
    333         Output postprocess and return human-readable results such as texts and audio files.
    334     """
--> 335     return self._outputs["result"]

KeyError: 'result'
```
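The `KeyError` comes from `postprocess` reading `self._outputs["result"]`, which `infer` only sets once decoding succeeds (see line 308 in the second traceback). A defensive wrapper around the failing call, purely as a sketch and not the library's own error handling:

```python
# Hypothetical guard around the failing call: surface the missing-result
# case instead of letting the bare KeyError propagate.
from paddlespeech.cli.asr.infer import ASRExecutor

asr = ASRExecutor()
try:
    text = asr(audio_file='1.wav', force_yes=True)
except KeyError:
    # infer() returned without writing self._outputs["result"],
    # so postprocess() failed with KeyError: 'result'.
    print("ASR produced no result; partial outputs:", asr._outputs)
else:
    print(text)
```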

Error message 2:

```
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[5], line 11
      4 # Read wav
      5 # Format requirement: 16 kHz, 16-bit, 1 channel
      6 # Audio duration < 200 s
      7 # default_model = conformer_u2pp_online_wenetspeech
      8 # better_model = deepspeech2online_wenetspeech
      9 asr = ASRExecutor()
---> 11 result = asr(
     12     audio_file='1.wav', 
     13     force_yes=True, 
     14     rtf=True,
     15     model='deepspeech2online_wenetspeech'
     16 )
     18 print(asr._outputs)
     20 # Add punctuation to the output text
     21 # Use ernie_linear_p3_wudao for better results

File /opt/conda/lib/python3.10/site-packages/paddlespeech/cli/utils.py:328, in stats_wrapper.<locals>._warpper(self, *args, **kwargs)
    326 except Exception:
    327     pass
--> 328 return executor_func(self, *args, **kwargs)

File /opt/conda/lib/python3.10/site-packages/paddlespeech/cli/asr/infer.py:511, in ASRExecutor.__call__(self, audio_file, model, lang, codeswitch, sample_rate, config, ckpt_path, decode_method, num_decoding_left_chunks, force_yes, rtf, device)
    508     CLI_TIMER[k]['start'].append(time.time())
    510 self.preprocess(model, audio_file)
--> 511 self.infer(model)
    512 res = self.postprocess()  # Retrieve result of asr.
    514 if rtf:

File /opt/conda/lib/python3.10/site-packages/decorator.py:232, in decorate.<locals>.fun(*args, **kw)
    230 if not kwsyntax:
    231     args, kw = fix(args, kw, sig)
--> 232 return caller(func, *(extras + args), **kw)

File /opt/conda/lib/python3.10/site-packages/paddle/fluid/dygraph/base.py:347, in _DecoratorContextManager.__call__.<locals>._decorate_function(func, *args, **kwargs)
    344 @decorator.decorator
    345 def _decorate_function(func, *args, **kwargs):
    346     with self:
--> 347         return func(*args, **kwargs)

File /opt/conda/lib/python3.10/site-packages/paddlespeech/cli/asr/infer.py:306, in ASRExecutor.infer(self, model_type)
    299 decode_batch_size = audio.shape[0]
    300 self.model.decoder.init_decoder(
    301     decode_batch_size, self.text_feature.vocab_list,
    302     cfg.decoding_method, cfg.lang_model_path, cfg.alpha, cfg.beta,
    303     cfg.beam_size, cfg.cutoff_prob, cfg.cutoff_top_n,
    304     cfg.num_proc_bsearch)
--> 306 result_transcripts = self.model.decode(audio, audio_len)
    307 self.model.decoder.del_decoder()
    308 self._outputs["result"] = result_transcripts[0]

File /opt/conda/lib/python3.10/site-packages/decorator.py:232, in decorate.<locals>.fun(*args, **kw)
    230 if not kwsyntax:
    231     args, kw = fix(args, kw, sig)
--> 232 return caller(func, *(extras + args), **kw)

File /opt/conda/lib/python3.10/site-packages/paddle/fluid/dygraph/base.py:347, in _DecoratorContextManager.__call__.<locals>._decorate_function(func, *args, **kwargs)
    344 @decorator.decorator
    345 def _decorate_function(func, *args, **kwargs):
    346     with self:
--> 347         return func(*args, **kwargs)

File /opt/conda/lib/python3.10/site-packages/paddlespeech/s2t/models/ds2/deepspeech2.py:299, in DeepSpeech2Model.decode(self, audio, audio_len)
    295 @paddle.no_grad()
    296 def decode(self, audio, audio_len):
    297     # decoders only accept string encoded in utf-8
    298     # Make sure the decoder has been initialized
--> 299     eouts, eouts_len, final_state_h_box, final_state_c_box = self.encoder(
    300         audio, audio_len, None, None)
    301     probs = self.decoder.softmax(eouts)
    302     batch_size = probs.shape[0]

File /opt/conda/lib/python3.10/site-packages/paddle/nn/layer/layers.py:1254, in Layer.__call__(self, *inputs, **kwargs)
   1245 if (
   1246     (not in_declarative_mode())
   1247     and (not self._forward_pre_hooks)
   (...)
   1251     and (not in_profiler_mode())
   1252 ):
   1253     self._build_once(*inputs, **kwargs)
-> 1254     return self.forward(*inputs, **kwargs)
   1255 else:
   1256     return self._dygraph_call_func(*inputs, **kwargs)

File /opt/conda/lib/python3.10/site-packages/paddlespeech/s2t/models/ds2/deepspeech2.py:130, in CRNNEncoder.forward(self, x, x_lens, init_state_h_box, init_state_c_box)
    128 final_chunk_state_list = []
    129 for i in range(0, self.num_rnn_layers):
--> 130     x, final_state = self.rnn[i](x, init_state_list[i],
    131                                  x_lens)  #[B, T, D]
    132     final_chunk_state_list.append(final_state)
    133     x = self.layernorm_list[i](x)

File /opt/conda/lib/python3.10/site-packages/paddle/nn/layer/layers.py:1254, in Layer.__call__(self, *inputs, **kwargs)
   1245 if (
   1246     (not in_declarative_mode())
   1247     and (not self._forward_pre_hooks)
   (...)
   1251     and (not in_profiler_mode())
   1252 ):
   1253     self._build_once(*inputs, **kwargs)
-> 1254     return self.forward(*inputs, **kwargs)
   1255 else:
   1256     return self._dygraph_call_func(*inputs, **kwargs)

File /opt/conda/lib/python3.10/site-packages/paddle/nn/layer/rnn.py:1580, in RNNBase.forward(self, inputs, initial_states, sequence_length)
   1570     initial_states = (
   1571         [initial_states]
   1572         if isinstance(initial_states, paddle.static.Variable)
   1573         else initial_states
   1574     )
   1576 if self.could_use_cudnn and (
   1577     not paddle.device.is_compiled_with_rocm() or sequence_length is None
   1578 ):
   1579     # Add CPU kernel and dispatch in backend later
-> 1580     return self._cudnn_impl(inputs, initial_states, sequence_length)
   1582 states = split_states(
   1583     initial_states, self.num_directions == 2, self.state_components
   1584 )
   1585 final_states = []

File /opt/conda/lib/python3.10/site-packages/paddle/nn/layer/rnn.py:1470, in RNNBase._cudnn_impl(self, inputs, initial_states, sequence_length)
   1467     inputs = paddle.tensor.transpose(inputs, [1, 0, 2])
   1469 if in_dygraph_mode():
-> 1470     out, _, state = _C_ops.rnn(
   1471         inputs,
   1472         initial_states,
   1473         self._all_weights,
   1474         sequence_length,
   1475         self._dropout_state,
   1476         self.dropout,
   1477         self.num_directions == 2,
   1478         self.input_size,
   1479         self.hidden_size,
   1480         self.num_layers,
   1481         self.mode,
   1482         0,
   1483         not self.training,
   1484     )
   1485 elif in_dynamic_mode():
   1486     _, _, out, state = _legacy_C_ops.rnn(
   1487         inputs,
   1488         initial_states,
   (...)
   1506         not self.training,
   1507     )

ValueError: (InvalidArgument) The size of SequenceLength has to equal the batch_size. But received batch_size is 1 and the size of SequenceLength is 0.
  [Hint: Expected in_dims[1] == seq_dims[0], but received in_dims[1]:1 != seq_dims[0]:0.] (at ../paddle/phi/infermeta/multiary.cc:2690)
```
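The `ValueError` is Paddle's RNN shape check: the `sequence_length` tensor passed to the RNN must have exactly one entry per batch element, and here it arrives empty (size 0) for a batch of 1. A standalone illustration of that contract with a plain `paddle.nn.LSTM` (not the PaddleSpeech model itself, so the exact failure path may differ):

```python
# Illustrates the shape contract behind the error: sequence_length must
# have one entry per batch element.
import paddle

rnn = paddle.nn.LSTM(input_size=8, hidden_size=16)
x = paddle.randn([1, 20, 8])  # [batch=1, time=20, features=8]

good_lens = paddle.to_tensor([20], dtype='int32')  # size 1 == batch_size
out, _ = rnn(x, sequence_length=good_lens)         # works

bad_lens = paddle.to_tensor([], dtype='int32')     # size 0 != batch_size
out, _ = rnn(x, sequence_length=bad_lens)          # raises the ValueError above
```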

Building directly from the dev branch does not have this problem. Please release a new version as soon as possible.

zxcd commented 11 months ago

Thank you for using PaddleSpeech, and we are sorry for the inconvenience. We currently have no plan for a new release. If you have located the problem, code contributions are welcome. Thank you for your support and understanding.

zhanghzong commented 11 months ago

Try building from the develop branch:

  1. python setup.py build
  2. python setup.py install

I have run into a similar problem too.
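If you go this route, it is worth confirming that the develop build actually shadows the pip-installed 1.4.1; a quick check (assuming a standard setuptools install):

```python
# Confirm which PaddleSpeech distribution and import path are active
# after installing from the develop branch.
import importlib.metadata as md
import paddlespeech

print(md.version("paddlespeech"))  # 1.4.1 would mean the pip install won
print(paddlespeech.__file__)       # shows which install is being imported
```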

danyow-cheung commented 10 months ago

Running the CLI command `paddlespeech asr --lang zh --input demo.wav` also raises the KeyError.

qingjiaozyn commented 7 months ago

I ran into this problem as well: after training an ASR model, this error is raised at test time. Is there any solution?