alphacep / vosk-asterisk

Speech Recognition in Asterisk with Vosk Server
GNU General Public License v2.0
104 stars 41 forks source link

Added multiple server, each for separate language model support #8

Open Lirein opened 4 years ago

Lirein commented 4 years ago

I have added support for loading and selecting different language models per server. Of course this will be one server with multiple models, but in a real deployment each model may run on a different server or in a different docker container.

nshmyrev commented 4 years ago

Thank you for the patch. Looks great, I would design it a bit differently though:

  1. Implement set_grammar to configure the URL probably instead of config.
  2. Pass language uniformly as an URL parameter.
  3. Configure nginx proxy to map url parameters to decoder servers

There are minor things like spaces in the code and removed last commit change to reduce wait time, but design is the major one.

Lirein commented 4 years ago

I have applied the patch to my code tree; the ASR server needs to be modified to accept and process grammars. Added grammar upload, activation, deactivation, and language selection via the config file. Activation by URL may be supported in the next patch — the ws_url option would change to a "server" parameter, and "language" would append ?language=%s to the websocket URL.

It also needs a patch to app_speech_utils.c and the speech.h header file to support all real-world recognition cases. This patch will be included later.

Lirein commented 4 years ago

diff -u app_speech_utils.c ./asterisk-certified-16.8-cert3/apps/app_speech_utils.c

--- app_speech_utils.c  2020-07-08 14:55:34.208726295 +0500
+++ ./asterisk-certified-16.8-cert3/apps/app_speech_utils.c 2020-07-08 14:31:55.650695937 +0500
@@ -772,6 +772,7 @@
    }
    ast_channel_unlock(chan);

+   ast_clear_flag(speech, AST_SPEECH_STREAM);
    /* Before we go into waiting for stuff... make sure the structure is ready, if not - start it again */
    if (speech->state == AST_SPEECH_STATE_NOT_READY || speech->state == AST_SPEECH_STATE_DONE) {
        ast_speech_change_state(speech, AST_SPEECH_STATE_NOT_READY);
@@ -789,6 +790,7 @@
            ast_stopstream(chan);
            /* Start new stream */
            speech_streamfile(chan, filename, ast_channel_language(chan));
+           ast_set_flag(speech, AST_SPEECH_STREAM);
        }

        /* Run scheduled stuff */
@@ -835,6 +837,7 @@
            if (ast_channel_streamid(chan) == -1 && ast_channel_timingfunc(chan) == NULL)
                ast_stopstream(chan);
            if (!quieted && ast_channel_stream(chan) == NULL && timeout && started == 0 && !filename_tmp) {
+               ast_clear_flag(speech, AST_SPEECH_STREAM);
                if (timeout == -1) {
                    done = 1;
                    if (f)
@@ -851,6 +854,7 @@
            break;
        case AST_SPEECH_STATE_WAIT:
            /* Cue up waiting sound if not already playing */
+           ast_clear_flag(speech, AST_SPEECH_STREAM);
            if (!strlen(dtmf)) {
                if (ast_channel_stream(chan) == NULL) {
                    if (speech->processing_sound != NULL) {
@@ -879,6 +883,7 @@
                /* Stop audio playback */
                if (ast_channel_stream(chan) != NULL) {
                    ast_stopstream(chan);
+                   ast_clear_flag(speech, AST_SPEECH_STREAM);
                }
            }
            break;
@@ -930,12 +935,14 @@

    if (!ast_strlen_zero(dtmf)) {
        /* We sort of make a results entry */
-       speech->results = ast_calloc(1, sizeof(*speech->results));
-       if (speech->results != NULL) {
+       struct ast_speech_result *current_result = ast_calloc(sizeof(struct ast_speech_result), 1);
+       if (current_result != NULL) {
            ast_speech_dtmf(speech, dtmf);
-           speech->results->score = 1000;
-           speech->results->text = ast_strdup(dtmf);
-           speech->results->grammar = ast_strdup("dtmf");
+           current_result->score = 1000;
+           current_result->text = ast_strdup(dtmf);
+           current_result->grammar = ast_strdup("dtmf");
+           current_result->list.next = speech->results;
+           speech->results = current_result;
        }
        ast_speech_change_state(speech, AST_SPEECH_STATE_NOT_READY);
    }

diff -u speech.h ./asterisk-certified-16.8-cert3/include/asterisk/speech.h

--- speech.h    2020-06-19 22:47:29.000000000 +0500
+++ ./asterisk-certified-16.8-cert3/include/asterisk/speech.h   2020-07-08 14:03:04.739300555 +0500
@@ -32,7 +32,9 @@
    AST_SPEECH_QUIET = (1 << 0),        /* Quiet down output... they are talking */
    AST_SPEECH_SPOKE = (1 << 1),        /* Speaker spoke! */
    AST_SPEECH_HAVE_RESULTS = (1 << 2), /* Results are present */
+   AST_SPEECH_HAS_STREAM = (1 << 3),       /* Stream is playing to output */
 };
+#define AST_SPEECH_STREAM AST_SPEECH_HAS_STREAM

 /* Speech structure states - in order of expected change */
 enum ast_speech_states {