Description

In order to find a stable and fast STT (speech-to-text) vendor, I tried IBM's and Google's solutions. Here are some results.

IBM

Service Portal: https://console.ng.bluemix.net — after logging in, navigate to Watson Cloud, provision a service, then get the credentials.

#! /bin/bash
###########################################
# Recognize text from a voice file with IBM Watson Speech to Text.
# http://www.ibm.com/smarterplanet/us/en/ibmwatson/developercloud/doc/speech-to-text/tutorial.shtml
#
# Sources ../watson_rc (relative to this script) for sttUserName and
# sttPassword, then POSTs the test audio file to the recognize endpoint.
###########################################

# constants
# Resolve the script's own directory; `&&` keeps pwd from running when the
# cd fails, and the quoting keeps paths containing spaces intact.
baseDir=$(cd "$(dirname "$0")" && pwd)
. "$baseDir/../watson_rc"
# testFile=$baseDir/0001.flac
testFile=$baseDir/icomefromchina.wav
# testFileType=flac
testFileType=wav

# functions

# main
# Stop here when the file is sourced rather than executed.
[ -z "${BASH_SOURCE[0]}" -o "${BASH_SOURCE[0]}" = "$0" ] || return

cd "$baseDir" || exit 1

# testFileType must match the audio format of testFile (it feeds Content-Type).
curl -u "$sttUserName:$sttPassword" -X POST \
--header "Content-Type: audio/$testFileType" \
--header "Transfer-Encoding: chunked" \
--data-binary "@$testFile" \
"https://stream.watsonplatform.net/speech-to-text/api/v1/recognize?continuous=true"

watson_rc

# Credentials for the Watson Speech to Text service.
# Replace both placeholders with the values from your provisioned service.
export sttUserName=YOUR_USERNAME
export sttPassword=YOUR_PASSWD

TTS

IBM also provides a TTS API; in my personal opinion, it is state of the art. http://www.ibm.com/watson/developercloud/doc/text-to-speech/index.shtml

<voice-transformation type="Custom" glottal_tension="-80%" rate="x-slow">body </voice-transformation>

demo

docs

// IBM Watson SDK entry point.
const watson = require('watson-developer-cloud');

// Text to Speech (v1) client. The credentials and endpoint are read from
// `config.watson_tts`, which must be defined before this runs.
const textToSpeech = watson.text_to_speech({
    version: 'v1',
    username: config.watson_tts.username,
    password: config.watson_tts.password,
    url: config.watson_tts.url,
});

/**
 * Synthesize `txt` to speech with the Watson TTS client and write the
 * resulting WAV audio to `output`.
 *
 * @param {string} txt - Text to synthesize.
 * @param {string} output - Path of the audio file to create.
 * @returns {Promise} Q promise that resolves (with no value) once the audio
 *     has been completely written to `output`, and rejects with the
 *     underlying error when synthesis or the file write fails.
 */
function transcript(txt, output) {
    const deferred = Q.defer();
    const params = {
        text: txt,
        voice: 'en-US_AllisonVoice', // Optional voice
        accept: 'audio/wav'
    };
    const tran = textToSpeech.synthesize(params);
    const writer = fs.createWriteStream(output);

    tran.on('error', function (error) {
        logger.error(error);
        // pipe() leaves the destination open when the source fails, so
        // release the file handle explicitly.
        writer.destroy();
        deferred.reject(error);
    });

    writer.on('error', function (error) {
        logger.error(error);
        deferred.reject(error);
    });

    // Wait for the writer's 'finish' rather than the source's 'end': only
    // then has everything been flushed to the file.
    writer.on('finish', function () {
        logger.debug('done.');
        deferred.resolve();
    });

    // Pipe the synthesized audio to the file
    tran.pipe(writer);

    return deferred.promise;
}

Google

/**
 * Transcribe a FLAC audio file with the Google Cloud Speech
 * `speech:syncrecognize` endpoint.
 *
 * The file is deleted once the HTTP request has ended, whether it succeeded
 * or failed. It is NOT deleted when fetching the access token fails, because
 * no request is made in that case.
 *
 * @param {string} file - Path of the audio file to send.
 * @param {?string} [text=null] - Optional phrase passed as a speech-context
 *     hint.
 * @returns {Promise<string|undefined>} Resolves with the first alternative of
 *     the first result, or `undefined` when the response has no results.
 *     Rejects when the token fetch or the request fails, or when the
 *     response body is not valid JSON.
 */
let googleSpeech = (file, text = null) => {
  const api = 'https://speech.googleapis.com/v1beta1/speech:syncrecognize'
  // readFileSync already returns a Buffer, so it can be encoded directly.
  const speech = fs.readFileSync(file).toString('base64')
  const postData = {
    'config': {
      'encoding': 'FLAC',
      'sampleRate': 24000,
      'languageCode': 'en-US'
    },
    'audio': {
      'content': speech
    }
  }

  if (text) {
    postData.config.speechContext = {
      'phrases': [text]
    }
  }

  return new Promise((resolve, reject) => {
    // Get google token first.
    get_google_access_token()
      .then((access_token) => {
        superagent
          .post(api)
          .proxy(https_proxy)
          .set('Content-Type', 'application/json')
          .set('Authorization', `Bearer ${access_token}`)
          .send(JSON.stringify(postData))
          .end((err, res) => {
            logger.debug('[speech google]', 'end')
            // fs.unlink requires a callback; a failed cleanup is only logged
            // so that it never masks the recognition result.
            fs.unlink(file, (unlinkErr) => {
              if (unlinkErr) {
                logger.debug('[speech google]', 'unlink failed', unlinkErr)
              }
            })
            if (err) {
              reject(err)
              return
            }

            let resObj
            try {
              resObj = JSON.parse(res.text)
            } catch (parseErr) {
              reject(parseErr)
              return
            }

            // Guard each level: `results` may be missing or empty.
            const first = resObj && resObj.results && resObj.results[0]
            const best = first && first.alternatives && first.alternatives[0]
            resolve(best ? best.transcript : undefined)
          })
      })
      // Without this a failed token fetch would leave the promise pending.
      .catch(reject)
  })
}

云知声

文档：https://github.com/oraleval/http_api_doc/blob/master/eval.md

/**
 * Send an audio file to the Unisound ASR endpoint using the `asr.en_US`
 * engine.
 *
 * @param {string} file - Audio file handed to superagent's `attach` as the
 *     `voice` form field.
 * @returns {Promise<Object>} Resolves with the raw superagent response;
 *     rejects with the request error.
 */
let unisoundASR = (file) => {
  return new Promise((resolve, reject) => {
    superagent
      .post('http://enasr.edu.hivoice.cn:5858/eval/pcm')
      .set('X-EngineType', 'asr.en_US')
      .set('appkey', 'xxx')
      .attach('voice', file)
      .end((err, res) => {
        if (err) {
          logger.debug('[error]', err)
          reject(err)
          // Stop here so the failure path never reaches resolve().
          return
        }
        resolve(res)
      })
  })
}

Chatie / stt

Some investigations in STT Services #11

Description

IBM

TTS

Google

云知声