/* eslint-disable no-restricted-syntax */
/* eslint-disable no-unused-vars */
/* eslint-disable no-console */
import mic from 'mic';
import { NonRealTimeVAD } from '@ricky0123/vad-node';
async function main() {
const sampleRate = '16000';
const audioConfig = {
rate: sampleRate,
channels: '1',
bitwidth: '8',
};
const micInstance = mic({
rate: audioConfig.rate,
channels: audioConfig.channels,
});
const vad = await NonRealTimeVAD.new({});
micInstance.getAudioStream().on('data', async (chunk) => {
// Try the VAD algo
for await (const vadResult of vad.run(chunk, parseFloat(sampleRate))) {
console.log(vadResult);
}
// Brute-force heuristic to detect speech blocks
let speechSample;
for (let i = 0; i < chunk.length; i += 2) {
if (chunk[i + 1] > 128) {
speechSample = (chunk[i + 1] - 256) * 256;
} else {
speechSample = chunk[i + 1] * 256;
}
speechSample += chunk[i];
if (Math.abs(speechSample) > 2000) {
console.log('Found POSSIBLE speech block', speechSample);
break;
}
}
});
micInstance.start();
await new Promise((resolve) => {
setTimeout(() => {
micInstance.stop();
resolve();
}, 60_000);
});
}
main()
.catch((ex) => console.error(ex))
.finally(() => process.exit(0));
Running this script produces a false positive on every single data chunk with { start: 192, end: 480 } - even when the mic is muted and dead silent in the room. Speaking into the mic changes nothing.
I know the mic instance is working (picking up voice vs picking up silence) - because when I AM speaking into the mic, I do properly see the brute-force heuristic triggering.
Full example output from running the script above (snipped some repeated Float32Array blocks for brevity):
Worth noting:
onnxruntime complains - important? No clue
audio object is output even when not speaking (silence in mic)
Found POSSIBLE speech block XXXX correctly triggers when I AM speaking in the mic
[VAD] initializing vad
2024-06-09 21:31:45.414 node[57351:5879341] 2024-06-09 21:31:45.414909 [W:onnxruntime:, graph.cc:3490 CleanUnusedInitializersAndNodeArgs] Removing initializer '628'. It is not used by any node and should be removed from the model.
2024-06-09 21:31:45.414 node[57351:5879341] 2024-06-09 21:31:45.414969 [W:onnxruntime:, graph.cc:3490 CleanUnusedInitializersAndNodeArgs] Removing initializer '629'. It is not used by any node and should be removed from the model.
2024-06-09 21:31:45.414 node[57351:5879341] 2024-06-09 21:31:45.414983 [W:onnxruntime:, graph.cc:3490 CleanUnusedInitializersAndNodeArgs] Removing initializer '623'. It is not used by any node and should be removed from the model.
2024-06-09 21:31:45.414 node[57351:5879341] 2024-06-09 21:31:45.414992 [W:onnxruntime:, graph.cc:3490 CleanUnusedInitializersAndNodeArgs] Removing initializer '625'. It is not used by any node and should be removed from the model.
2024-06-09 21:31:45.415 node[57351:5879341] 2024-06-09 21:31:45.414999 [W:onnxruntime:, graph.cc:3490 CleanUnusedInitializersAndNodeArgs] Removing initializer '620'. It is not used by any node and should be removed from the model.
2024-06-09 21:31:45.415 node[57351:5879341] 2024-06-09 21:31:45.415037 [W:onnxruntime:, graph.cc:3490 CleanUnusedInitializersAndNodeArgs] Removing initializer '139'. It is not used by any node and should be removed from the model.
2024-06-09 21:31:45.415 node[57351:5879341] 2024-06-09 21:31:45.415049 [W:onnxruntime:, graph.cc:3490 CleanUnusedInitializersAndNodeArgs] Removing initializer '131'. It is not used by any node and should be removed from the model.
2024-06-09 21:31:45.415 node[57351:5879341] 2024-06-09 21:31:45.415057 [W:onnxruntime:, graph.cc:3490 CleanUnusedInitializersAndNodeArgs] Removing initializer '140'. It is not used by any node and should be removed from the model.
2024-06-09 21:31:45.415 node[57351:5879341] 2024-06-09 21:31:45.415065 [W:onnxruntime:, graph.cc:3490 CleanUnusedInitializersAndNodeArgs] Removing initializer '134'. It is not used by any node and should be removed from the model.
2024-06-09 21:31:45.415 node[57351:5879341] 2024-06-09 21:31:45.415073 [W:onnxruntime:, graph.cc:3490 CleanUnusedInitializersAndNodeArgs] Removing initializer '136'. It is not used by any node and should be removed from the model.
[VAD] vad is initialized
{
audio: Float32Array(6144) [
2, 0, 255, 255, 255, 255, 2, 0, 8, 0, 4, 0,
255, 255, 255, 255, 0, 0, 255, 255, 4, 0, 8, 0,
254, 255, 246, 255, 247, 255, 251, 255, 252, 255, 247, 255,
244, 255, 245, 255, 240, 255, 245, 255, 248, 255, 248, 255,
247, 255, 246, 255, 251, 255, 255, 255, 255, 255, 2, 0,
1, 0, 0, 0, 0, 0, 3, 0, 16, 0, 16, 0,
8, 0, 3, 0, 8, 0, 14, 0, 16, 0, 16, 0,
9, 0, 6, 0, 1, 0, 255, 255, 3, 0, 255, 255,
248, 255, 247, 255,
... 6044 more items
],
start: 192,
end: 480
}
{
audio: Float32Array(6144) [
4, 0, 6, 0, 3, 0, 0, 0, 6, 0, 12, 0,
10, 0, 9, 0, 10, 0, 7, 0, 11, 0, 5, 0,
253, 255, 251, 255, 247, 255, 248, 255, 250, 255, 247, 255,
249, 255, 252, 255, 0, 0, 0, 0, 0, 0, 1, 0,
2, 0, 5, 0, 255, 255, 0, 0, 6, 0, 7, 0,
1, 0, 0, 0, 1, 0, 254, 255, 3, 0, 8, 0,
2, 0, 3, 0, 4, 0, 0, 0, 3, 0, 2, 0,
0, 0, 1, 0, 255, 255, 251, 255, 255, 255, 255, 255,
247, 255, 246, 255,
... 6044 more items
],
start: 192,
end: 480
}
Found POSSIBLE speech block 2294
Found POSSIBLE speech block -2202
Found POSSIBLE speech block -2153
Found POSSIBLE speech block 2528
Found POSSIBLE speech block 2125
Found POSSIBLE speech block 2026
Found POSSIBLE speech block 2246
Found POSSIBLE speech block 2393
{
audio: Float32Array(6144) [
3, 0, 4, 0, 4, 0, 2, 0, 2, 0, 4, 0,
5, 0, 5, 0, 255, 255, 252, 255, 252, 255, 0, 0,
5, 0, 2, 0, 255, 255, 250, 255, 250, 255, 252, 255,
251, 255, 254, 255, 1, 0, 1, 0, 3, 0, 2, 0,
249, 255, 243, 255, 245, 255, 247, 255, 253, 255, 3, 0,
9, 0, 9, 0, 4, 0, 255, 255, 252, 255, 252, 255,
250, 255, 248, 255, 248, 255, 254, 255, 255, 255, 4, 0,
7, 0, 7, 0, 5, 0, 0, 0, 253, 255, 2, 0,
3, 0, 5, 0,
... 6044 more items
],
start: 192,
end: 480
}
Thoughts/suggestions?
For now, this is useless and the brute-force heuristic works - but it feels so dirty. How can I get VAD working on node?
Ideally, I'd like to use it as well with twilio's audio stream sockets, etc - but if I can't get it working with a simple mic, not going to bother with trying with twilio (and spending real $$ as opposed to just my time haha)
Example:
Running this script produces a false positive on every single data chunk with
{ start: 192, end: 480 }
- even when the mic is muted and dead silent in the room. Speaking into the mic changes nothing.I know the mic instance is working (picking up voice vs picking up silence) - because when I AM speaking into the mic, I do properly see the brute-force heuristic triggering.
Full example output from running the script above (snipped some repeated
Float32Array
blocks for brevity):Worth noting:
onnxruntime
complains - important? No clueaudio
object is output even when not speaking (silence in mic)Found POSSIBLE speech block XXXX
correctly triggers when I AM speaking in the micThoughts/suggestions?
For now, this is useless and the brute-force heuristic works - but it feels so dirty. How can I get VAD working on node?
Ideally, I'd like to use it as well with twilio's audio stream sockets, etc - but if I can't get it working with a simple mic, not going to bother with trying with twilio (and spending real $$ as opposed to just my time haha)