Open hailiang-wang opened 7 years ago
To find a stable and fast STT (speech-to-text) vendor, I tried IBM's and Google's solutions. Here are some results.
Service Portal - https://console.ng.bluemix.net After login, navigate to watson cloud, provision a service, then get the credentials.
#! /bin/bash
###########################################
# Recognize text from a voice file using the IBM Watson Speech to Text API.
# http://www.ibm.com/smarterplanet/us/en/ibmwatson/developercloud/doc/speech-to-text/tutorial.shtml
###########################################

# constants
# Absolute directory that contains this script.
baseDir=$(cd "$(dirname "$0")" && pwd)
# Source the rc file one level up; it is expected to define the
# sttUserName / sttPassword variables that curl uses below.
. "$baseDir/../watson_rc"
# Alternative sample: testFile=$baseDir/0001.flac
testFile=$baseDir/icomefromchina.wav
# Alternative sample type: testFileType=flac
testFileType=wav

# functions

# main
# Stop here when the file is being sourced instead of executed.
[ -z "${BASH_SOURCE[0]}" ] || [ "${BASH_SOURCE[0]}" = "$0" ] || return
cd "$baseDir" || exit 1
# All expansions are quoted so paths or credentials containing whitespace
# are passed to curl as single arguments.
curl -u "$sttUserName:$sttPassword" -X POST \
  --header "Content-Type: audio/$testFileType" \
  --header "Transfer-Encoding: chunked" \
  --data-binary "@$testFile" \
  "https://stream.watsonplatform.net/speech-to-text/api/v1/recognize?continuous=true"
# Credentials read by the speech-to-text script.
# Replace both placeholders with the credentials of the provisioned service.
export sttUserName=YOUR_USERNAME
export sttPassword=YOUR_PASSWD
IBM also provides a TTS API; in my personal opinion, it is state of the art. http://www.ibm.com/watson/developercloud/doc/text-to-speech/index.shtml
<voice-transformation type="Custom" glottal_tension="-80%" rate="x-slow">body </voice-transformation>
demo
docs
const watson = require('watson-developer-cloud');

// Watson Text to Speech client, configured from config.watson_tts.
const textToSpeech = watson.text_to_speech({
  version: 'v1',
  username: config.watson_tts.username,
  password: config.watson_tts.password,
  url: config.watson_tts.url,
});

/**
 * Synthesize `txt` to speech and write the WAV audio to `output`.
 *
 * @param {string} txt - Text to synthesize.
 * @param {string} output - Path of the audio file to create.
 * @returns {Promise<void>} Q promise that resolves once the file has been
 *   completely written, and rejects with the underlying error when either
 *   the synthesis stream or the file stream fails.
 */
function transcript(txt, output) {
  const deferred = Q.defer();
  const params = {
    text: txt,
    voice: 'en-US_AllisonVoice', // Optional voice
    accept: 'audio/wav',
  };

  // Shared failure path: log and hand the error to the caller.
  const fail = (error) => {
    logger.error(error);
    deferred.reject(error);
  };

  // Pipe the synthesized audio to a file.
  const tran = textToSpeech.synthesize(params);
  const out = fs.createWriteStream(output);

  tran.on('error', fail);
  out.on('error', fail);
  // Resolve on the write stream's 'finish' (not the source's 'end') so the
  // data has been flushed before the promise resolves.
  out.on('finish', () => {
    logger.debug('done.');
    deferred.resolve();
  });

  tran.pipe(out);
  return deferred.promise;
}
/**
 * Transcribe a FLAC audio file with the Google Cloud Speech
 * `speech:syncrecognize` endpoint.
 *
 * The file is read synchronously, base64-encoded and posted as JSON. The
 * audio file is deleted once the HTTP request has ended, whether or not the
 * request succeeded.
 *
 * @param {string} file - Path of the audio file to send.
 * @param {?string} [text=null] - Optional phrase passed as a speech-context
 *   hint.
 * @returns {Promise<string|undefined>} Resolves with the first alternative's
 *   transcript of the first result, or `undefined` when the response has no
 *   usable result. Rejects when the access token cannot be obtained, the
 *   request fails, or the response body is not valid JSON.
 */
let googleSpeech = (file, text = null) => {
  const api = `https://speech.googleapis.com/v1beta1/speech:syncrecognize`;
  // readFileSync already returns a Buffer, so no extra Buffer wrapper is needed.
  const speech = fs.readFileSync(file).toString('base64');
  const postData = {
    'config': {
      'encoding': 'FLAC',
      'sampleRate': 24000,
      'languageCode': 'en-US'
    },
    'audio': {
      'content': speech
    }
  };

  if (text) {
    postData.config.speechContext = {
      "phrases": [text]
    };
  }

  return new Promise((resolve, reject) => {
    // Get google token first.
    get_google_access_token()
      .then((access_token) => {
        superagent
          .post(api)
          .proxy(https_proxy)
          .set('Content-Type', 'application/json')
          .set('Authorization', `Bearer ${access_token}`)
          .send(JSON.stringify(postData))
          .end((err, res) => {
            logger.debug('[speech google]', 'end');
            // Best-effort cleanup; fs.unlink requires a callback on current Node.
            fs.unlink(file, (unlinkErr) => {
              if (unlinkErr) {
                logger.debug('[speech google]', 'unlink failed', unlinkErr);
              }
            });

            if (err) {
              reject(err);
              return;
            }

            try {
              const resObj = JSON.parse(res.text);
              const first = resObj.results && resObj.results[0];
              const best = first && first.alternatives && first.alternatives[0];
              resolve(best ? best.transcript : undefined);
            } catch (parseErr) {
              // A malformed body must reject instead of throwing inside the callback.
              reject(parseErr);
            }
          });
      })
      // Without this, a failed token request would leave the promise pending forever.
      .catch(reject);
  });
};
文档:https://github.com/oraleval/http_api_doc/blob/master/eval.md
/**
 * Upload an audio file to the Unisound English ASR endpoint.
 *
 * @param {string} file - Audio file attached to the request as the `voice`
 *   field.
 * @returns {Promise<Object>} Resolves with the superagent response object;
 *   rejects with the request error.
 */
let unisoundASR = (file) => {
  return new Promise((resolve, reject) => {
    superagent
      .post('http://enasr.edu.hivoice.cn:5858/eval/pcm')
      .set('X-EngineType', 'asr.en_US')
      // NOTE(review): 'xxx' looks like a placeholder app key — confirm before use.
      .set('appkey', 'xxx')
      .attach('voice', file)
      .end((err, res) => {
        if (err) {
          logger.debug('[error]', err);
          reject(err);
          // Stop here so the failure path does not also call resolve().
          return;
        }
        resolve(res);
      });
  });
};
Thanks buddy!
Description
To find a stable and fast STT (speech-to-text) vendor, I tried IBM's and Google's solutions. Here are some results.
IBM
Service Portal - https://console.ng.bluemix.net After login, navigate to watson cloud, provision a service, then get the credentials.
TTS
IBM also provides a TTS API; in my personal opinion, it is state of the art. http://www.ibm.com/watson/developercloud/doc/text-to-speech/index.shtml
demo
docs
Google
云知声
文档:https://github.com/oraleval/http_api_doc/blob/master/eval.md