Azure-Samples / cognitive-services-speech-sdk

Sample code for the Microsoft Cognitive Services Speech SDK

Pronunciation assessment IPA phoneme alphabet doesn't work for en-GB #2223

Closed: peterolson closed this issue 7 months ago

peterolson commented 9 months ago

I am reopening this issue because the other one was closed, even though the issue has not been resolved yet.


I'm using the microsoft-cognitiveservices-speech-sdk in Node.js.

I'm trying to get the pronunciation of words in US English and UK English. To do that, I first run text-to-speech. Unfortunately, TTS doesn't return phoneme information directly, so I feed the result of TTS into speech recognition, and then run pronunciation assessment on that.
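In condensed form, the pipeline looks like this (the full script is further down; the key, region, voice, and word here are placeholders):

const sdk = require("microsoft-cognitiveservices-speech-sdk");

// Placeholder key/region/voice; the full script below parameterizes these per locale.
const speechConfig = sdk.SpeechConfig.fromSubscription("SECRET_KEY", "eastus");
speechConfig.speechSynthesisVoiceName = "en-GB-SoniaNeural";
speechConfig.speechRecognitionLanguage = "en-GB";

// 1. Synthesize the word and keep the audio in memory.
const pullStream = sdk.AudioOutputStream.createPullStream();
const synthesizer = new sdk.SpeechSynthesizer(
  speechConfig,
  sdk.AudioConfig.fromStreamOutput(pullStream)
);
synthesizer.speakTextAsync("beard", (ttsResult) => {
  synthesizer.close();
  // 2. Feed the synthesized audio back into speech recognition.
  const audioConfig = sdk.AudioConfig.fromWavFileInput(Buffer.from(ttsResult.audioData));
  const recognizer = new sdk.SpeechRecognizer(speechConfig, audioConfig);
  // 3. Apply pronunciation assessment with the IPA phoneme alphabet.
  const paConfig = new sdk.PronunciationAssessmentConfig("beard");
  paConfig.phonemeAlphabet = "IPA";
  paConfig.applyTo(recognizer);
  recognizer.recognizeOnceAsync((result) => {
    console.log(sdk.PronunciationAssessmentResult.fromResult(result));
    recognizer.close();
  });
});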

For the en-US locale, I get the following correct result:

comma   k ɑ m ə    
bug     b ʌ ɡ     
beard   b i r d    
bottle  b ɑ t ə l
hear    h i r     
ernest  ɝ r n ɪ s t
tour    t ɔ r     
pillow  p ɪ l oʊ
hair    h ɛ r     
driver  d r aɪ v ə r

For en-GB, there are no IPA phonemes returned at all.

These are the results I would expect based on the information here: https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-ssml-phonetic-sets#en-gben-ieen-au

comma   k ɒ m ə    
bug     b ʌ ɡ     
beard   b ɪə d    
bottle  b ɒ t ə l
hear    h ɪə     
ernest  ɜː n ɪ s t
tour    t ʊə   
pillow  p ɪ l əʊ
hair    h ɛə     
driver  d r aɪ v ə

Here's my code:

const sdk = require("microsoft-cognitiveservices-speech-sdk");

const locales = {
  "en-US": {
    name: "English (United States)",
    voiceName: "en-US-JennyNeural",
    locale: "en-US",
  },
  "en-GB": {
    name: "English (United Kingdom)",
    voiceName: "en-GB-SoniaNeural",
    locale: "en-GB",
  },
};

async function getPhonemes(word, locale) {
  const speechConfig = sdk.SpeechConfig.fromSubscription(
    "SECRET_KEY",
    "eastus"
  );
  const { voiceName } = locales[locale];
  speechConfig.speechSynthesisVoiceName = voiceName;
  speechConfig.speechRecognitionLanguage = locale;
  speechConfig.speechSynthesisLanguage = locale;

  // create audio stream
  const audioStream = sdk.AudioOutputStream.createPullStream();
  const synthesizerAudioConfig = sdk.AudioConfig.fromStreamOutput(audioStream);
  // Create the speech synthesizer.
  const synthesizer = new sdk.SpeechSynthesizer(
    speechConfig,
    synthesizerAudioConfig
  );

  return new Promise((resolve, reject) => {
    synthesizer.speakTextAsync(
      word,
      function (result) {
        if (result.reason === sdk.ResultReason.SynthesizingAudioCompleted) {
          //console.log("synthesis finished.");
        } else {
          console.error(
            "Speech synthesis canceled, " +
              result.errorDetails +
              "\nDid you set the speech resource key and region values?"
          );
        }
        const audioData = result.audioData;
        // convert to buffer
        const buffer = Buffer.from(audioData);
        const recognizerAudioConfig = sdk.AudioConfig.fromWavFileInput(buffer);
        synthesizer.close();
        var pronunciationAssessmentConfig =
          sdk.PronunciationAssessmentConfig.fromJSON(
            JSON.stringify({
              referenceText: word,
              granularity: "Phoneme",
            })
          );
        pronunciationAssessmentConfig.phonemeAlphabet = "IPA";
        const speechRecognizer = new sdk.SpeechRecognizer(
          speechConfig,
          recognizerAudioConfig
        );
        speechRecognizer.speechRecognitionLanguage = locale;
        const timeout = setTimeout(() => {
          console.log("Timeout!");
          speechRecognizer.close();
          resolve(["TIMEOUT!"]);
        }, 10000);
        pronunciationAssessmentConfig.applyTo(speechRecognizer);
        speechRecognizer.recognizeOnceAsync(
          (speechRecognitionResult) => {
            clearTimeout(timeout);
            // The pronunciation assessment result as a Speech SDK object
            const pronunciationAssessmentResult =
              sdk.PronunciationAssessmentResult.fromResult(
                speechRecognitionResult
              );
            const phonemes = [];
            for (const word of pronunciationAssessmentResult.privPronJson
              .Words) {
              for (const phoneme of word.Phonemes) {
                phonemes.push(phoneme.Phoneme);
              }
            }
            speechRecognizer.close();
            resolve(phonemes);
          },
          (err) => {
            console.error(err);
            speechRecognizer.close();
            reject(err);
          }
        );
      },
      function (err) {
        console.trace("err - " + err);
        synthesizer.close();
        reject(err);
      }
    );
  });
}

(async () => {

  let currently_processing = 0;
  const max_concurrent = 16;

  async function processWord(word) {
    try {
      currently_processing++;
      const phonemes_us = await getPhonemes(word, "en-US");
      const phonemes_gb = await getPhonemes(word, "en-GB");
      console.log(
        `${word}\t${phonemes_us.join(" ")}\t${phonemes_gb.join(" ")}\n`
      );
      currently_processing--;
    } catch (e) {
      console.error(e);
      await processWord(word);
    }
  }

  const wordlist = [
    "beard",
    "tour",
    "pillow",
    "hear",
    "ernest",
    "driver",
    "hair",
    "bug",
    "comma",
    "bottle",
  ];

  for (let i = 0; i < wordlist.length; i += max_concurrent) {
    console.log(`Processing ${i}-${i + max_concurrent} of ${wordlist.length}`);
    let words = wordlist.slice(i, i + max_concurrent);
    let promises = [];
    for (const word of words) {
      promises.push(processWord(word));
    }
    await Promise.all(promises);
  }
})();
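(Side note on the script above: the per-phoneme details can also be read from the raw JSON attached to the recognition result, rather than through the private privPronJson field. A minimal sketch, using the speechRecognitionResult object from the recognizeOnceAsync callback; the response shape is assumed to match the documented pronunciation assessment JSON:)

// Inside the recognizeOnceAsync success callback:
const rawJson = speechRecognitionResult.properties.getProperty(
  sdk.PropertyId.SpeechServiceResponse_JsonResult
);
const parsed = JSON.parse(rawJson);
// NBest[0].Words[*].Phonemes[*].Phoneme mirrors what privPronJson exposes.
const phonemes = [];
for (const word of parsed.NBest[0].Words) {
  for (const phoneme of word.Phonemes) {
    phonemes.push(phoneme.Phoneme);
  }
}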
jpalvarezl commented 9 months ago

Hi @peterolson, after investigating further I reached the same conclusion: IPA phonemes are not supported for en-GB. We appreciate your feedback on the documentation not reflecting this clearly; the appropriate team has been notified. Regarding @wangkenpu's answer:

https://github.com/Azure-Samples/cognitive-services-speech-sdk/issues/2200#issuecomment-1878054535

I believe they wanted an estimate from you in order to have a sense of how to plan and allocate resources, since this would be a feature request.

peterolson commented 9 months ago

I don't understand what kind of estimate they are looking for. As I said in the previous issue, I have zero usage now because it doesn't function at all for my purposes. Once it starts functioning correctly, I plan to use the feature heavily.

jpalvarezl commented 9 months ago

I don't understand what kind of estimate they are looking for

I cannot speak for @wangkenpu, but my understanding of their original question is that the estimate is not about your current usage, but rather your expected usage if the feature were to be implemented.

jpalvarezl commented 9 months ago

I also wanted to provide more clarity on your question. The documentation has a section that lists which locales are supported for which phoneme alphabets. You can read more about that here.

We appreciate your feedback on our documentation; we've taken action to update it to make this clearer (more specifically, here).

jpalvarezl commented 9 months ago

I modified your script and was able to produce the following output (where SAPI is used for en-GB instead):

hair    h ɛ r            -      h eh r
driver  d r aɪ v ə r     -      d r ay v ax r
ernest  ɝ r n ɪ s t      -      er r n ih s t
beard   b i r d          -      b iy r d
hear    h i r            -      h iy r
tour    t ɔ r            -      t ao r
comma   k ɑ m ə          -      k aa m ax
bug     b ʌ ɡ            -      b ah g
bottle  b ɑ t ə l        -      b aa t ax l
pillow  p ɪ l oʊ         -      p ih l ow

The columns are word, en-US IPA, and en-GB SAPI.

Here is the modified script:

const sdk = require("microsoft-cognitiveservices-speech-sdk");

const locales = {
  "en-US": {
    name: "English (United States)",
    voiceName: "en-US-JennyNeural",
    locale: "en-US",
    phonemeAlphabet: "IPA",
  },
  "en-GB": {
    name: "English (United Kingdom)",
    voiceName: "en-GB-SoniaNeural",
    locale: "en-GB",
    phonemeAlphabet: "SAPI",
  },
};

async function getPhonemes(word, locale) {
  const speechConfig = sdk.SpeechConfig.fromSubscription(
    "YOUR_SUBSCRIPTION_KEY",
    "YOUR_SUBSCRIPTION_REGION"
  );
  const { voiceName, phonemeAlphabet } = locales[locale];
  speechConfig.speechSynthesisVoiceName = voiceName;
  speechConfig.speechRecognitionLanguage = locale;
//   speechConfig.speechSynthesisLanguage = locale;

  // create audio stream
  const audioStream = sdk.AudioOutputStream.createPullStream();
  const synthesizerAudioConfig = sdk.AudioConfig.fromStreamOutput(audioStream);
  // Create the speech synthesizer.
  const synthesizer = new sdk.SpeechSynthesizer(
    speechConfig,
    synthesizerAudioConfig
  );

  return new Promise((resolve, reject) => {
    synthesizer.speakTextAsync(
      word,
      function (result) {
        if (result.reason === sdk.ResultReason.SynthesizingAudioCompleted) {
          //console.log("synthesis finished.");
        } else {
          console.error(
            "Speech synthesis canceled, " +
              result.errorDetails +
              "\nDid you set the speech resource key and region values?"
          );
        }
        const audioData = result.audioData;
        // convert to buffer
        const buffer = Buffer.from(audioData);
        const recognizerAudioConfig = sdk.AudioConfig.fromWavFileInput(buffer);
        synthesizer.close();
        var pronunciationAssessmentConfig =
          sdk.PronunciationAssessmentConfig.fromJSON(
            JSON.stringify({
              granularity: "Phoneme",
            })
          );
        pronunciationAssessmentConfig.referenceText = word;
        pronunciationAssessmentConfig.phonemeAlphabet = phonemeAlphabet;

        const speechRecognizer = new sdk.SpeechRecognizer(
          speechConfig,
          recognizerAudioConfig
        );
        speechRecognizer.speechRecognitionLanguage = locale;
        const timeout = setTimeout(() => {
          console.log("Timeout!");
          speechRecognizer.close();
          resolve(["TIMEOUT!"]);
        }, 10000);

        pronunciationAssessmentConfig.applyTo(speechRecognizer);
        speechRecognizer.recognizeOnceAsync(
          (speechRecognitionResult) => {
            clearTimeout(timeout);
            // The pronunciation assessment result as a Speech SDK object
            const pronunciationAssessmentResult =
              sdk.PronunciationAssessmentResult.fromResult(
                speechRecognitionResult
              );

            // console.log(pronunciationAssessmentResult.privPronJson.Words);
            // console.log(pronunciationAssessmentResult.privPronJson.Words[0].Phonemes);
            const phonemes = [];
            for (const word of pronunciationAssessmentResult.privPronJson
              .Words) {
              for (const phoneme of word.Phonemes) {
                phonemes.push(phoneme.Phoneme);
              }
            }
            speechRecognizer.close();
            resolve(phonemes);
          },
          (err) => {
            console.error(err);
            speechRecognizer.close();
            reject(err);
          }
        );
      },
      function (err) {
        console.trace("err - " + err);
        synthesizer.close();
        reject(err);
      }
    );
  });
}

(async () => {

  let currently_processing = 0;
  const max_concurrent = 16;

  async function processWord(word) {
    try {
      currently_processing++;
      const phonemes_us = await getPhonemes(word, "en-US");
      const phonemes_gb = await getPhonemes(word, "en-GB");
      console.log(
        `${word}\t${phonemes_us.join(" ")}\t - \t${phonemes_gb.join(" ")}\n`
      );
      currently_processing--;
    } catch (e) {
      console.error(e);
      await processWord(word);
    }
  }

  const wordlist = [
    "beard",
    "tour",
    "pillow",
    "hear",
    "ernest",
    "driver",
    "hair",
    "bug",
    "comma",
    "bottle",
  ];

  for (let i = 0; i < wordlist.length; i += max_concurrent) {
    console.log(`Processing ${i}-${i + max_concurrent} of ${wordlist.length}`);
    let words = wordlist.slice(i, i + max_concurrent);
    let promises = [];
    for (const word of words) {
      promises.push(processWord(word));
    }
    await Promise.all(promises);
  }
})();
peterolson commented 9 months ago

@jpalvarezl It appears that the SAPI output for en-GB is incorrect, since it is identical to en-US, even though many of these words are pronounced differently in the two regions.

Here is the SAPI output that I currently receive. The columns are word, en-US SAPI and en-GB SAPI.

tour    t ao r   -      t ao r
beard   b iy r d         -      b iy r d
pillow  p ih l ow        -      p ih l ow
ernest  er r n ih s t    -      er r n ih s t
comma   k aa m ax        -      k aa m ax
hear    h iy r   -      h iy r
bug     b ah g   -      b ah g
bottle  b aa t ax l      -      b aa t ax l
hair    h eh r   -      h eh r
driver  d r ay v ax r    -      d r ay v ax r

As you can see, the output is identical for the two regions.

The documentation here describing the phonetic alphabet for en-GB does not use SAPI, but based on the information there I would expect the output to differ as follows for en-GB:

jpalvarezl commented 9 months ago

Thank you for pointing that out. I have contacted someone close to the feature work to provide guidance on this. I will keep you updated with what I find out.

chschrae commented 8 months ago

@peterolson The service team is asking for more information about existing en-US usage and potential en-GB usage. Could you please contact mspafeedback@microsoft.com with that information? When you do, include your Azure subscription ID so they can get exact usage numbers.

github-actions[bot] commented 8 months ago

This item has been open without activity for 19 days. Provide a comment on status and remove the "update needed" label.

pankopon commented 7 months ago

Closed as there has been no update since Feb 2; presumed handled offline (as a new service feature request).

peterolson commented 7 months ago

@pankopon Note that the issue pointed out in this comment is a bug, not a feature request. Anyway, I have reopened it as a separate issue here: https://github.com/Azure-Samples/cognitive-services-speech-sdk/issues/2284