Azure-Samples / cognitive-services-speech-sdk

Sample code for the Microsoft Cognitive Services Speech SDK
MIT License

Pronunciation assessment IPA phoneme alphabet doesn't work for en-GB #2200

Closed · peterolson closed this issue 8 months ago

peterolson commented 9 months ago

I'm using the microsoft-cognitiveservices-speech-sdk in Node.js.

I'm trying to get the pronunciations of words in US English and UK English. To do that, I first run text-to-speech (TTS). Unfortunately, TTS doesn't return phoneme information directly, so I feed the synthesized audio into speech recognition and then run pronunciation assessment on the recognition result.
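
The key configuration is the phoneme alphabet on the assessment config; this is excerpted from the full repro code below:

const pronunciationAssessmentConfig =
  sdk.PronunciationAssessmentConfig.fromJSON(
    JSON.stringify({ referenceText: word, granularity: "Phoneme" })
  );
// "IPA" requests IPA symbols; the default alphabet is SAPI
pronunciationAssessmentConfig.phonemeAlphabet = "IPA";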

For the en-US locale, I get the following correct result:

comma   k ɑ m ə    
bug     b ʌ ɡ     
beard   b i r d    
bottle  b ɑ t ə l
hear    h i r     
ernest  ɝ r n ɪ s t
tour    t ɔ r     
pillow  p ɪ l oʊ
hair    h ɛ r     
driver  d r aɪ v ə r

For en-GB, there are no IPA phonemes returned at all.

These are the results I would expect based on the information here: https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-ssml-phonetic-sets#en-gben-ieen-au

comma   k ɒ m ə    
bug     b ʌ ɡ     
beard   b ɪə d    
bottle  b ɒ t ə l
hear    h ɪə     
ernest  ɜː n ɪ s t
tour    t ʊə   
pillow  p ɪ l əʊ
hair    h ɛə     
driver  d r aɪ v ə

Here's my code:

const sdk = require("microsoft-cognitiveservices-speech-sdk");

const locales = {
  "en-US": {
    name: "English (United States)",
    voiceName: "en-US-JennyNeural",
    locale: "en-US",
  },
  "en-GB": {
    name: "English (United Kingdom)",
    voiceName: "en-GB-SoniaNeural",
    locale: "en-GB",
  },
};

async function getPhonemes(word, locale) {
  const speechConfig = sdk.SpeechConfig.fromSubscription(
    "SECRET_KEY",
    "eastus"
  );
  const { voiceName } = locales[locale];
  speechConfig.speechSynthesisVoiceName = voiceName;
  speechConfig.speechRecognitionLanguage = locale;
  speechConfig.speechSynthesisLanguage = locale;

  // create audio stream
  const audioStream = sdk.AudioOutputStream.createPullStream();
  const synthesizerAudioConfig = sdk.AudioConfig.fromStreamOutput(audioStream);
  // Create the speech synthesizer.
  const synthesizer = new sdk.SpeechSynthesizer(
    speechConfig,
    synthesizerAudioConfig
  );

  return new Promise((resolve, reject) => {
    synthesizer.speakTextAsync(
      word,
      function (result) {
        if (result.reason !== sdk.ResultReason.SynthesizingAudioCompleted) {
          console.error(
            "Speech synthesis canceled, " +
              result.errorDetails +
              "\nDid you set the speech resource key and region values?"
          );
          synthesizer.close();
          reject(result.errorDetails);
          return;
        }
        const audioData = result.audioData;
        // convert to buffer
        const buffer = Buffer.from(audioData);
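        // feed the synthesized audio back into recognition as WAV input
        // (in Node, fromWavFileInput accepts a Buffer as well as a File)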
        const recognizerAudioConfig = sdk.AudioConfig.fromWavFileInput(buffer);
        synthesizer.close();
        const pronunciationAssessmentConfig =
          sdk.PronunciationAssessmentConfig.fromJSON(
            JSON.stringify({
              referenceText: word,
              granularity: "Phoneme",
            })
          );
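        // request IPA phonemes (the default alphabet is SAPI)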
        pronunciationAssessmentConfig.phonemeAlphabet = "IPA";
        const speechRecognizer = new sdk.SpeechRecognizer(
          speechConfig,
          recognizerAudioConfig
        );
        speechRecognizer.speechRecognitionLanguage = locale;
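        // guard against recognizeOnceAsync never calling back
        // (e.g. if synthesis produced no usable audio)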
        const timeout = setTimeout(() => {
          console.log("Timeout!");
          speechRecognizer.close();
          resolve(["TIMEOUT!"]);
        }, 10000);
        pronunciationAssessmentConfig.applyTo(speechRecognizer);
        speechRecognizer.recognizeOnceAsync(
          (speechRecognitionResult) => {
            clearTimeout(timeout);
            // The pronunciation assessment result as a Speech SDK object
            const pronunciationAssessmentResult =
              sdk.PronunciationAssessmentResult.fromResult(
                speechRecognitionResult
              );
            const phonemes = [];
            // detailResult is the public accessor for the parsed assessment
            // JSON (avoids reaching into the private privPronJson field)
            for (const word of pronunciationAssessmentResult.detailResult
              .Words) {
              for (const phoneme of word.Phonemes) {
                phonemes.push(phoneme.Phoneme);
              }
            }
            speechRecognizer.close();
            resolve(phonemes);
          },
          (err) => {
            clearTimeout(timeout);
            console.error(err);
            speechRecognizer.close();
            reject(err);
          }
        );
      },
      function (err) {
        console.trace("err - " + err);
        // synthesizer is declared const, so it must not be reassigned to null
        synthesizer.close();
        reject(err);
      }
    );
  });
}

(async () => {

  let currently_processing = 0;
  const max_concurrent = 16;

  async function processWord(word) {
    try {
      currently_processing++;
      const phonemes_us = await getPhonemes(word, "en-US");
      const phonemes_gb = await getPhonemes(word, "en-GB");
      console.log(
        `${word}\t${phonemes_us.join(" ")}\t${phonemes_gb.join(" ")}\n`
      );
      currently_processing--;
    } catch (e) {
      console.error(e);
      currently_processing--; // keep the counter balanced before retrying
      await processWord(word);
    }
  }

  const wordlist = [
    "beard",
    "tour",
    "pillow",
    "hear",
    "ernest",
    "driver",
    "hair",
    "bug",
    "comma",
    "bottle",
  ];

  for (let i = 0; i < wordlist.length; i += max_concurrent) {
    const end = Math.min(i + max_concurrent, wordlist.length);
    console.log(`Processing ${i}-${end} of ${wordlist.length}`);
    let words = wordlist.slice(i, i + max_concurrent);
    let promises = [];
    for (const word of words) {
      promises.push(processWord(word));
    }
    await Promise.all(promises);
  }
})();
wangkenpu commented 9 months ago

For English, only en-US currently supports IPA phonemes; this is by design.

We could support exposing IPA phonemes for en-GB with some effort. Can you estimate your usage of the pronunciation assessment service, to help us prioritize the work? Thanks!
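
In the meantime, SAPI phonemes might be usable for en-GB. A minimal, unverified sketch (assuming en-GB returns phonemes with the default SAPI alphabet, which I have not confirmed), changing only the assessment config from the repro above:

const pronunciationAssessmentConfig =
  sdk.PronunciationAssessmentConfig.fromJSON(
    JSON.stringify({ referenceText: word, granularity: "Phoneme" })
  );
// omit phonemeAlphabet (SAPI is the default) or set it explicitly:
pronunciationAssessmentConfig.phonemeAlphabet = "SAPI";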

peterolson commented 9 months ago

Is it documented anywhere that only en-US supports IPA phonemes? If not, that limitation should definitely be added to the documentation.

"Can you estimate your usage of the pronunciation assessment service, to help us prioritize the work?"

I'm not using it at all right now because it doesn't work. Once it starts working properly I plan on using it heavily.

jpalvarezl commented 8 months ago

To keep our open issues list up to date, this item will be closed since it's been inactive and needs more information to proceed. Please file a new issue (and feel free to reference this one) if there's new information we can follow up on. Thank you!

peterolson commented 8 months ago

@jpalvarezl I don't understand what you mean by "needs more information to proceed". What other information do you need?

jpalvarezl commented 8 months ago

That is a template message our team uses when closing an issue due to inactivity.

It seemed to me that @wangkenpu may have needed more information from you to proceed. Is that correct, @wangkenpu? How can we unblock this so work on the issue can continue?

@peterolson, please feel free to re-open this issue if necessary.