Open FucK5t4r opened 2 months ago
def audio2feat(self, audio_path):
    """Transcribe an audio file and return its stacked encoder embeddings.

    Calls ``self.model.transcribe`` on ``audio_path`` and, for every entry
    of the result's ``'segments'`` list, reorders that segment's
    ``'encoder_embeddings'`` array, trims it, and finally joins all trimmed
    pieces along the first axis.

    Args:
        audio_path: Audio input, forwarded unchanged to
            ``self.model.transcribe``.

    Returns:
        One array holding every segment's trimmed embeddings concatenated
        along axis 0.

    Raises:
        ValueError: If the transcription result contains no segments, so
            there is nothing to concatenate.
    """
    result = self.model.transcribe(audio_path)

    embed_list = []
    for emb in result['segments']:
        # The embeddings are indexed with four axes: swap axes 1 and 2,
        # then drop the leading axis (it must have length 1 for squeeze(0)
        # to succeed -- presumably a batch axis, TODO confirm).
        encoder_embeddings = emb['encoder_embeddings']
        encoder_embeddings = encoder_embeddings.transpose(0, 2, 1, 3)
        encoder_embeddings = encoder_embeddings.squeeze(0)

        start_idx = int(emb['start'])
        end_idx = int(emb['end'])

        # NOTE(review): open question from the issue reporter -- why is the
        # embedding truncated to half of (end - start)? Presumably the
        # encoder emits one frame for every two input frames, so halving
        # converts the segment length into encoder frames; confirm with the
        # model authors and check that training uses the same rule.
        emb_end_idx = int((end_idx - start_idx) / 2)
        embed_list.append(encoder_embeddings[:emb_end_idx])

    if not embed_list:
        # np.concatenate would raise an opaque ValueError on an empty list;
        # raise the same exception type with a message naming the input.
        raise ValueError(
            f"no segments were produced for {audio_path!r}; "
            "cannot build audio features"
        )

    concatenated_array = np.concatenate(embed_list, axis=0)
    return concatenated_array
问题在上面的注释中,在训练时,您对音频的处理也是这样的吗?
问题在上面的注释中,在训练时,您对音频的处理也是这样的吗?