alphacep / vosk-api

Offline speech recognition API for Android, iOS, Raspberry Pi and servers with Python, Java, C# and Node
Apache License 2.0
7.35k stars 1.04k forks source link

Vosk empty results from RTP packet with gstreamer #1582

Open Brexard opened 1 month ago

Brexard commented 1 month ago

Hi, I am currently working with Vosk and mediasoup to build live speech-to-text recognition. My web video conferencing app is working really well. For the speech recognition, I currently receive RTP packets one by one from mediasoup. I use GStreamer to create a pipeline that converts the RTP packets into audio for Vosk. Vosk is receiving some data from my GStreamer pipeline, but all results and partial results are empty. I was wondering whether I am doing something wrong when formatting my RTP packets, or even whether what I am doing is possible.

I have already tried multiple GStreamer configurations to make this work, without success. Maybe it is preferable to create the GStreamer process manually instead of using "gstreamer-superficial" in Node?

Thanks for your help

My code from my mediasoup server :

// Create the direct transport and keep it on the instance.
this._directTransport = await this._mediasoupRouter.createDirectTransport();

// Advertise only the router codec whose kind matches the producer's kind.
const routerCodec = this._mediasoupRouter.rtpCapabilities.codecs.find(
      (candidate) => candidate.kind === producer.kind
);
const codecs = [routerCodec];
const rtpCapabilities = { codecs, rtcpFeedback: [] };

// Consume the producer on the direct transport, unpaused from the start.
const directConsumer = await this._directTransport.consume({
       producerId: producer.id,
       rtpCapabilities,
       paused: false
});
console.log(`DirectConsumer (${directConsumer.id}) is created on transport ${this._directTransport.id}`);

// Load the Vosk model and create a recognizer configured for a 16000 Hz sample rate.
const model = await rtpConverter.createVoskModel();
const recognizer = new Recognizer({ model, sampleRate: 16000 });

// Build the GStreamer pipeline from the first codec's payload type and clock rate, then start it.
const firstCodec = rtpParameters.codecs[0];
const { payloadType, clockRate } = firstCodec;
const pipelineInfo = rtpConverter.createGstreamerPipeline(payloadType, clockRate);
rtpConverter.startPipeline(pipelineInfo);

// Forward every RTP packet received by the direct consumer to the GStreamer pipeline.
directConsumer.on('rtp', (rtpPacket) => {
       // A Node.js Buffer may be a view into a larger (pooled or sliced) ArrayBuffer,
       // so the view must be bounded by byteOffset/byteLength; wrapping
       // `rtpPacket.buffer` alone could push unrelated bytes into the pipeline.
       const buffer = new Uint8Array(rtpPacket.buffer, rtpPacket.byteOffset, rtpPacket.byteLength);
       // receiveRtp is async: handle its rejection here instead of leaving the promise floating.
       rtpConverter.receiveRtp(buffer, pipelineInfo).catch((error) => {
              console.error('Error pushing RTP packet to the pipeline:', error);
       });
});

My rtpConverter.js file :


const { v4: uuidv4 } = require("uuid");
const gstreamer = require("gstreamer-superficial");
const { Model } = require("vosk");

// Base directory holding the Vosk model folder. createVoskModel() concatenates the
// model folder name directly onto this string, so the trailing slash is required.
// NOTE(review): absolute, machine-specific path — consider making it configurable.
const modelPath = '/home/busseau/temma/dev/auxamed-media-server/lib/';

/**
 * Builds (without starting) a GStreamer pipeline that takes Opus RTP packets on
 * an appsrc, depayloads/decodes/resamples them, and exposes mono 16000 Hz S16LE
 * raw audio on an appsink.
 *
 * The appsrc and appsink get freshly generated UUID names so they can be looked
 * up on the created pipeline.
 *
 * @param {number} [payloadType=96] - RTP payload type announced in the appsrc caps.
 * @param {number} [clockRate=48000] - RTP clock rate announced in the appsrc caps.
 * @returns {{pipeline: object, appsrc: object, appsink: object}} The pipeline
 *   together with its appsrc and appsink elements.
 */
function createGstreamerPipeline(payloadType = 96, clockRate = 48000) {
       console.log("Creating Gstreamer Pipeline...");
       console.log("recevied payloadType :", payloadType, "and clockRate :", clockRate);
       const sourceId = uuidv4();
       const sinkId = uuidv4();
       const pipelineElements = [
              // The caps use the caller's payload type; it was previously hard-coded
              // to 96 and the payloadType argument was ignored.
              `appsrc name=${sourceId} format=time is-live=true do-timestamp=true caps="application/x-rtp,media=audio,clock-rate=${clockRate},encoding-name=OPUS,channels=2,payload=${payloadType}"`,
              `rtpopusdepay`,
              `opusparse`,
              `opusdec`,
              `audioconvert`,
              `audioresample`,
              `audio/x-raw,format=S16LE,channels=1,rate=16000`,
              `appsink name=${sinkId}`
       ];
       const elements = pipelineElements.join(` ! `);
       const pipeline = new gstreamer.Pipeline(elements);
       const gstreamerInfo = {
              pipeline: pipeline,
              appsrc: pipeline.findChild(sourceId),
              appsink: pipeline.findChild(sinkId),
       };
       return gstreamerInfo;
}

/**
 * Calls play() on the pipeline stored in the given pipeline information.
 *
 * @param {{pipeline: object}} gstreamerInfo - Object whose `pipeline` is started.
 */
function startPipeline(gstreamerInfo) {
       gstreamerInfo.pipeline.play();
       console.log("pipeline started");
}

/**
 * Calls stop() on the pipeline stored in the given pipeline information.
 *
 * @param {{pipeline: object}} gstreamerInfo - Object whose `pipeline` is stopped.
 */
function stopPipeline(gstreamerInfo) {
       gstreamerInfo.pipeline.stop();
       console.log("pipeline Stoped");
}

/**
 * Pushes one RTP packet into the pipeline's appsrc and waits for the push to settle.
 *
 * @param {Uint8Array} buffer - Bytes of a single RTP packet.
 * @param {{appsrc: object}} gstreamerInfo - Pipeline information holding the appsrc.
 * @returns {Promise<void>}
 */
async function receiveRtp(buffer, gstreamerInfo) {
       const { appsrc } = gstreamerInfo;
       await appsrc.push(buffer);
}

/**
 * Loads the French small Vosk model located under `modelPath`.
 *
 * @returns {Promise<object|null>} The loaded model, or null when loading threw
 *   (the error is logged, not rethrown).
 */
async function createVoskModel() {
       console.log("Loading Vosk model...");
       let loadedModel;
       try {
              loadedModel = new Model(`${modelPath}vosk-model-small-fr-0.22/`);
              console.log("Vosk model loaded successfully");
       } catch (loadError) {
              console.error("Error loading Vosk model:", loadError);
              loadedModel = null;
       }
       return loadedModel;
}

/**
 * Converts a sample pulled from the appsink into a Node.js Buffer holding exactly
 * the sample's bytes.
 *
 * @param {object} sample - Sample handed to the appsink pull callback.
 * @returns {Buffer|null} The audio bytes, or null when the sample carries no data.
 */
function sampleToBuffer(sample) {
       // A Buffer/TypedArray may be a window into a larger ArrayBuffer: honour its
       // byteOffset/byteLength instead of wrapping the whole backing store.
       if (ArrayBuffer.isView(sample)) {
              if (sample.byteLength === 0) {
                     return null;
              }
              return Buffer.from(sample.buffer, sample.byteOffset, sample.byteLength);
       }

       const arrayBuffer = sample.buffer;
       if (!arrayBuffer || arrayBuffer.byteLength === 0) {
              return null;
       }
       return Buffer.from(new Uint8Array(arrayBuffer));
}

/**
 * Continuously pulls decoded audio samples from the pipeline's appsink and feeds
 * them to the Vosk recognizer, logging final and partial results.
 *
 * When no sample (or an empty one) is available, polling is retried after a short
 * idle delay; otherwise the next sample is pulled immediately.
 * NOTE(review): the polling loop has no stop condition of its own.
 *
 * @param {{appsink: object}} gstreamerInfo - Pipeline information holding the appsink.
 * @param {object} recognizer - Vosk recognizer exposing acceptWaveform(),
 *   resultString() and partialResult().
 * @returns {Promise<void>} Resolves once polling has been started.
 */
async function processAudioWithVosk(gstreamerInfo, recognizer) {
       const appsink = gstreamerInfo.appsink;
       const idleTimeInMs = 30;

       const poll = function() {
              appsink.pull((sample) => {
                     if (!sample) {
                            setTimeout(poll, idleTimeInMs);
                            return;
                     }

                     const buffer = sampleToBuffer(sample);
                     if (!buffer) {
                            console.error('ArrayBuffer is undefined or empty:', sample.buffer);
                            setTimeout(poll, idleTimeInMs);
                            return;
                     }

                     try {
                            // Per the Vosk API, acceptWaveform() returns true only when an
                            // utterance has ended. Requesting the final result after every
                            // chunk finalizes the utterance each time and yields empty text,
                            // so only ask for it at that point and use partial results otherwise.
                            if (recognizer.acceptWaveform(buffer)) {
                                   console.log('Result =', recognizer.resultString());
                            } else {
                                   console.log('Partial result =', recognizer.partialResult());
                            }
                     } catch (error) {
                            console.error('Error during recognition:', error);
                     }

                     poll();
              });
       };

       console.log("Starting audio processing with Vosk");
       poll();
}

// Public API of this module: pipeline creation and lifecycle, RTP input,
// Vosk model loading, and the appsink-to-recognizer processing loop.
module.exports = {
      createGstreamerPipeline,
      startPipeline,
      stopPipeline,
      receiveRtp,
      createVoskModel,
      processAudioWithVosk,
}
nshmyrev commented 1 month ago

You can dump audio you send to recognizer and listen / share here. Likely sample rate or byte endian is wrong.

Brexard commented 1 month ago

My producer has payloadType 111 and clockRate 48000, taken from my media codec:

mediaCodecs :
[
    {
        kind      : 'audio',
        mimeType  : 'audio/opus',
        clockRate : 48000,
        channels  : 2
    },
]

So my consumer has these RTP parameters: {"codecs":[{"mimeType":"audio/opus","payloadType":100,"clockRate":48000,"channels":2,"parameters":{"minptime":10,"useinbandfec":1,"sprop-stereo":1,"usedtx":1},"rtcpFeedback":[]}],"headerExtensions":[],"encodings":[{"ssrc":997556025}],"rtcp":{"cname":"XvC+gStmsWSnOtt8","reducedSize":true},"mid":"0"}

And after starting GStreamer and retrieving audio samples, I log them with these console.log calls:

console.log("Raw sample: ", sample);
console.log("Json sample received from pipeline: ", JSON.stringify(sample));
console.log("raw buffer: ", buffer);    
console.log("Json buffer sent to Vosk (full): ", JSON.stringify(buffer));
console.log("rtp packet infos: ", parseRtpPacket(buffer))
recognizer.acceptWaveform(buffer);

Some conversion happens before the logs:

const arrayBuffer = sample.buffer;
if (!arrayBuffer || arrayBuffer.byteLength === 0) {
       console.error('ArrayBuffer is undefined or empty:', arrayBuffer);
       setTimeout(poll, idleTimeInMs);
       return;
}

const buffer = Buffer.from(new Uint8Array(arrayBuffer));
if (!Buffer.isBuffer(buffer)) {
       console.error('Error: Converted buffer is not a Buffer instance');
       setTimeout(poll, idleTimeInMs);
       return;
}

I am getting something like this from my pipeline (for some reason, some of the samples are not recognized by 'simple-rtp-parser', which gives me an "Unsupported RTP version" error):

caps {
  name: 'audio/x-raw',
  format: 'S16LE',
  layout: 'interleaved',
  rate: 16000,
  channels: 1
}
Raw sample:  <Buffer@0x74e090c770b0 ae d7 7d 13 16 33 80 35 3e 22 92 04 fe eb d5 e2 06 e9 ff f8 98 0a 31 17 d6 18 25 10 b8 00 af f0 14 e5 bd e6 5a f4 5a 03 23 0d 74 0e 9b 07 7b fe 4c fa ... 1390 more bytes>
Json sample received from pipeline:  {"type":"Buffer","data":[174,215,125,19,22,51,128,53,62,34,146,4,254,235,213,226,6,233,255,248,152,10,49,23,214,24,37,16,184,0,175,240,20,229,189,230,90,244,90,3,35,13,116,14,155,7,123,254,76,250,157,246,216,245,215,246,246,251,116,2,2,6,84,1,129,251,234,246,196,238,251,237,57,245,40,1,80,5,177,8,184,7,194,0,134,250,84,248,12,249,147,251,12,1,234,6,68,8,125,5,165,0,21,251,223,247,149,249,86,254,171,0,109,255,160,1,109,4,65,1,126,255,166,254,230,0,20,2,223,1,41,3,254,2,65,1,209,1,141,2,37,2,148,1,127,3,144,6,138,4,47,4,59,2,155,255,187,252,61,252,110,0,186,0,199,0,153,0,10,255,64,252,60,250,112,250,23,251,225,253,178,0,89,0,232,254,220,252,233,253,204,254,184,1,50,3,254,2,111,1,20,2,221,0,158,0,22,2,177,2,12,2,58,1,164,0,205,0,237,1,183,2,231,2,83,2,243,1,147,1,187,1,13,2,43,2,168,2,8,3,96,3,142,3,211,3,32,4,25,4,123,3,173,2,193,1,22,1,235,0,249,0,156,1,165,1,3,1,63,0,230,255,170,255,222,254,62,254,35,254,186,253,25,253,63,253,31,254,87,254,182,253,48,253,26,253,116,253,79,254,205,254,35,255,100,255,91,255,69,255,104,255,181,255,215,255,60,0,199,0,78,1,236,1,155,2,149,2,209,1,13,1,164,0,136,0,188,0,32,1,243,0,77,0,217,255,117,255,9,255,43,255,196,255,16,0,225,255,62,255,139,254,115,254,233,254,149,255,69,0,205,0,255,0,89,1,184,1,243,1,56,2,137,2,171,2,120,2,23,2,134,1,253,0,171,0,153,0,171,0,174,0,128,0,64,0,11,0,217,255,145,255,61,255,234,254,161,254,115,254,113,254,177,254,22,255,128,255,180,255,192,255,195,255,244,255,64,0,102,0,133,0,160,0,186,0,211,0,24,1,96,1,174,1,4,2,63,2,105,2,132,2,172,2,179,2,152,2,96,2,48,2,46,2,76,2,107,2,149,2,185,2,185,2,146,2,78,2,242,1,134,1,35,1,162,0,20,0,172,255,134,255,111,255,73,255,4,255,202,254,170,254,120,254,31,254,198,253,152,253,141,253,162,253,200,253,3,254,85,254,158,254,215,254,16,255,80,255,124,255,158,255,191,255,224,255,245,255,7,0,14,0,17,0,22,0,39,0,52,0,55,0,47,0,29,0,15,0,16,0,35,0,60,0,72,0,75,0,70,0,67,0,66,0,71,0,88,0,74,0,55,0,7,0,6,0,17,0,91,0,175,0,195,0,121,0,94,0,51,0,
223,255,40,0,99,0,72,0,248,255,196,255,177,255,228,255,25,0,59,0,37,0,255,255,230,255,240,255,35,0,68,0,114,0,161,0,206,0,251,0,41,1,101,1,148,1,158,1,149,1,141,1,123,1,124,1,149,1,240,1,54,2,26,2,218,1,160,1,106,1,15,1,192,0,183,0,155,0,72,0,246,255,199,255,130,255,22,255,198,254,188,254,223,254,35,255,52,255,13,255,229,254,159,254,72,254,22,254,50,254,100,254,174,254,13,255,94,255,142,255,171,255,173,255,104,255,36,255,9,255,42,255,105,255,195,255,250,255,230,255,183,255,149,255,142,255,177,255,233,255,35,0,76,0,91,0,61,0,21,0,23,0,79,0,129,0,140,0,139,0,144,0,123,0,89,0,71,0,82,0,92,0,79,0,64,0,54,0,37,0,36,0,59,0,51,0,19,0,219,255,180,255,172,255,185,255,214,255,236,255,253,255,22,0,39,0,61,0,87,0,99,0,46,0,249,255,211,255,190,255,126,255,25,255,178,254,62,254,234,253,134,253,35,253,185,252,101,252,26,252,225,251,173,251,102,251,48,251,22,251,62,251,143,251,242,251,65,252,115,252,178,252,36,253,173,253,59,254,214,254,107,255,227,255,66,0,176,0,17,1,99,1,178,1,2,2,58,2,102,2,138,2,127,2,87,2,49,2,13,2,175,1,36,1,154,0,9,0,111,255,184,254,3,254,64,253,148,252,244,251,89,251,157,250,245,249,123,249,37,249,8,249,26,249,64,249,112,249,172,249,8,250,86,250,85,250,201,251,140,254,2,252,135,245,132,247,211,240,217,240,63,254,201,6,108,47,210,46,167,69,51,82,155,15,15,227,92,179,250,190,145,220,136,240,62,65,1,100,255,127,90,47,43,186,3,185,0,128,202,134,95,159,2,195,11,40,142,39,174,100,255,127,216,36,184,240,74,48,24,117,20,44,98,223,12,8,50,18,49,249,81,136,133,167,229,209,163,158,126,209,36,231,131,30,6,48,237,32,25,75,207,207,221,135,221,181,22,202,240,15,152,58,52,85,121,117,120,106,84,52,88,223,26,9,56,3,78,248,54,5,208,27,102,33,199,49,189,61,186,7,153,205,114,193,28,184,72,201,91,33,176,31,238,243,69,41,90,245,9,138,0,128,1,143,153,16,255,127,184,115,199,107,150,118,209,121,253,60,96,19,192,204,0,128,85,168,47,78,255,127,94,88,184,226,154,183,141,158,243,204,47,220,139,218,185,224,36,250,116,220,244,157,0,128,136,223,242,2,203,26,133,126,56,115,102,68,29,67,207,
36,24,19,62,242,103,6,45,88,87,107,163,110,84,51,125,210,2,156,159,187,70,159,0,128,13,164,224,14,176,123,220,33,234,228,55,185,247,58,255,127,137,73,240,52,217,16,229,0,155,8,48,240,106,43,157,15,30,242,70,89,154,24,52,143,0,128,169,150,61,183,185,168,64,82,228,109,6,209,207,178,82,72,96,104,210,6,193,30,44,46,68,255,219,5,5,38,170,239,65,205,73,237,229,209,249,25,200,97,58,98,177,213,208,181,181,11,162,59,47,225,188,244,51,179,89,234,231,29,134,6,119,189,39,191,211,244,66,18,67,10,121,29,18,39,106,8,67,241,46,230,26,224,204,219,121,217,78,232,120,32,9,76,155,71,117,34,152,19,98,255,239,252,194,207,31,192,55,205,206,197,86,217,164,231,3,253,31,25,231,7,155,249,188,236,130,216,107,233,159,40,176,73,42,53,255,28,80,9,231,245,239,239,178,238,180,246,93,241,58,239,39,6,21,222,147,158,2,171,7,222,214,1,189,40,21,89,120,74,76,23]}
raw buffer:  <Buffer@0x74e090c05270 ae d7 7d 13 16 33 80 35 3e 22 92 04 fe eb d5 e2 06 e9 ff f8 98 0a 31 17 d6 18 25 10 b8 00 af f0 14 e5 bd e6 5a f4 5a 03 23 0d 74 0e 9b 07 7b fe 4c fa ... 1390 more bytes>

Json buffer sent to Vosk (full):  {"type":"Buffer","data":[174,215,125,19,22,51,128,53,62,34,146,4,254,235,213,226,6,233,255,248,152,10,49,23,214,24,37,16,184,0,175,240,20,229,189,230,90,244,90,3,35,13,116,14,155,7,123,254,76,250,157,246,216,245,215,246,246,251,116,2,2,6,84,1,129,251,234,246,196,238,251,237,57,245,40,1,80,5,177,8,184,7,194,0,134,250,84,248,12,249,147,251,12,1,234,6,68,8,125,5,165,0,21,251,223,247,149,249,86,254,171,0,109,255,160,1,109,4,65,1,126,255,166,254,230,0,20,2,223,1,41,3,254,2,65,1,209,1,141,2,37,2,148,1,127,3,144,6,138,4,47,4,59,2,155,255,187,252,61,252,110,0,186,0,199,0,153,0,10,255,64,252,60,250,112,250,23,251,225,253,178,0,89,0,232,254,220,252,233,253,204,254,184,1,50,3,254,2,111,1with those console.log ,20,2,221,0,158,0,22,2,177,2,12,2,58,1,164,0,205,0,237,1,183,2,231,2,83,2,243,1,147,1,187,1,13,2,43,2,168,2,8,3,96,3,142,3,211,3,32,4,25,4,123,3,173,2,193,1,22,1,235,0,249,0,156,1,165,1,3,1,63,0,230,255,170,255,222,254,62,254,35,254,186,253,25,253,63,253,31,254,87,254,182,253,48,253,26,253,116,253,79,254,205,254,35,255,100,255,91,255,69,255,104,255,181,255,215,255,60,0,199,0,78,1,236,1,155,2,149,2,209,1,13,1,164,0,136,0,188,0,32,1,243,0,77,0,217,255,117,255,9,255,43,255,196,255,16,0,225,255,62,255,139,254,115,254,233,254,149,255,69,0,205,0,255,0,89,1,184,1,243,1,56,2,137,2,171,2,120,2,23,2,134,1,253,0,171,0,153,0,171,0,174,0,128,0,64,0,11,0,217,255,145,255,61,255,234,254,161,254,115,254,113,254,177,254,22,255,128,255,180,255,192,255,195,255,244,255,64,0,102,0,133,0,160,0,186,0,211,0,24,1,96,1,174,1,4,2,63,2,105,2,132,2,172,2,179,2,152,2,96,2,48,2,46,2,76,2,107,2,149,2,185,2,185,2,146,2,78,2,242,1,134,1,35,1,162,0,20,0,172,255,134,255,111,255,73,255,4,255,202,254,170,254,120,254,31,254,198,253,152,253,141,253,162,253,200,253,3,254,85,254,158,254,215,254,16,255,80,255,124,255,158,255,191,255,224,255,245,255,7,0,14,0,17,0,22,0,39,0,52,0,55,0,47,0,29,0,15,0,16,0,35,0,60,0,72,0,75,0,70,0,67,0,66,0,71,0,88,0,74,0,55,0,7,0,6,0,17,0,91,0,175,0,19
5,0,121,0,94,0,51,0,223,255,40,0,99,0,72,0,248,255,196,255,177,255,228,255,25,0,59,0,37,0,255,255,230,255,240,255,35,0,68,0,114,0,161,0,206,0,251,0,41,1,101,1,148,1,158,1,149,1,141,1,123,1,124,1,149,1,240,1,54,2,26,2,218,1,160,1,106,1,15,1,192,0,183,0,155,0,72,0,246,255,199,255,130,255,22,255,198,254,188,254,223,254,35,255,52,255,13,255,229,254,159,254,72,254,22,254,50,254,100,254,174,254,13,255,94,255,142,255,171,255,173,255,104,255,36,255,9,255,42,255,105,255,195,255,250,255,230,255,183,255,149,255,142,255,177,255,233,255,35,0,76,0,91,0,61,0,21,0,23,0,79,0,129,0,140,0,139,0,144,0,123,0,89,0,71,0,82,0,92,0,79,0,64,0,54,0,37,0,36,0,59,0,51,0,19,0,219,255,180,255,172,255,185,255,214,255,236,255,253,255,22,0,39,0,61,0,87,0,99,0,46,0,249,255,211,255,190,255,126,255,25,255,178,254,62,254,234,253,134,253,35,253,185,252,101,252,26,252,225,251,173,251,102,251,48,251,22,251,62,251,143,251,242,251,65,252,115,252,178,252,36,253,173,253,59,254,214,254,107,255,227,255,66,0,176,0,17,1,99,1,178,1,2,2,58,2,102,2,138,2,127,2,87,2,49,2,13,2,175,1,36,1,154,0,9,0,111,255,184,254,3,254,64,253,148,252,244,251,89,251,157,250,245,249,123,249,37,249,8,249,26,249,64,249,112,249,172,249,8,250,86,250,85,250,201,251,140,254,2,252,135,245,132,247,211,240,217,240,63,254,201,6,108,47,210,46,167,69,51,82,155,15,15,227,92,179,250,190,145,220,136,240,62,65,1,100,255,127,90,47,43,186,3,185,0,128,202,134,95,159,2,195,11,40,142,39,174,100,255,127,216,36,184,240,74,48,24,117,20,44,98,223,12,8,50,18,49,249,81,136,133,167,229,209,163,158,126,209,36,231,131,30,6,48,237,32,25,75,207,207,221,135,221,181,22,202,240,15,152,58,52,85,121,117,120,106,84,52,88,223,26,9,56,3,78,248,54,5,208,27,102,33,199,49,189,61,186,7,153,205,114,193,28,184,72,201,91,33,176,31,238,243,69,41,90,245,9,138,0,128,1,143,153,16,255,127,184,115,199,107,150,118,209,121,253,60,96,19,192,204,0,128,85,168,47,78,255,127,94,88,184,226,154,183,141,158,243,204,47,220,139,218,185,224,36,250,116,220,244,157,0,128,136,223,242,2,203,26,133,126,56,1
15,102,68,29,67,207,36,24,19,62,242,103,6,45,88,87,107,163,110,84,51,125,210,2,156,159,187,70,159,0,128,13,164,224,14,176,123,220,33,234,228,55,185,247,58,255,127,137,73,240,52,217,16,229,0,155,8,48,240,106,43,157,15,30,242,70,89,154,24,52,143,0,128,169,150,61,183,185,168,64,82,228,109,6,209,207,178,82,72,96,104,210,6,193,30,44,46,68,255,219,5,5,38,170,239,65,205,73,237,229,209,249,25,200,97,58,98,177,213,208,181,181,11,162,59,47,225,188,244,51,179,89,234,231,29,134,6,119,189,39,191,211,244,66,18,67,10,121,29,18,39,106,8,67,241,46,230,26,224,204,219,121,217,78,232,120,32,9,76,155,71,117,34,152,19,98,255,239,252,194,207,31,192,55,205,206,197,86,217,164,231,3,253,31,25,231,7,155,249,188,236,130,216,107,233,159,40,176,73,42,53,255,28,80,9,231,245,239,239,178,238,180,246,93,241,58,239,39,6,21,222,147,158,2,171,7,222,214,1,189,40,21,89,120,74,76,23]}
rtp packet infos:  {
  version: 2,
  padding: 1,
  extension: 0,
  csrcCount: 14,
  marker: 1,
  payloadType: 87,
  sequenceNumber: 32019,
  timestamp: 372473909,
  ssrc: 1042452996,
  payload: <Buffer@0x74e090c770f4 c4 ee fb ed 39 f5 28 01 50 05 b1 08 b8 07 c2 00 86 fa 54 f8 0c f9 93 fb 0c 01 ea 06 44 08 7d 05 a5 00 15 fb df f7 95 f9 56 fe ab 00 6d ff a0 01 6d 04 ... 1322 more bytes>
}
nshmyrev commented 4 weeks ago

You need to dump audio to a file as bytes, not as hex values and share

Brexard commented 3 weeks ago

Hi, I fixed my problem, everything is working fine now. The problem was caused by gstreamer, I decided to create it as a separate process from node. Using "udpsink" instead of "appsink" is working.