I2S multi stream to HTTP and VAD (AUD-5076)

Environment

Audio development kit: ESP32-LyraT
Audio kit version ESP32-LyraT v3.4
Module or chip used: ESP32-WROVER-E
IDF version 4.4.6 & ADF v2.4.1-26-gceca7b3d
IDF version 5.1.2 & ADF v2.6-23-gef058dac
Build system: idf.py

My goal is to run VAD in parallel with sending the audio over HTTP.

To do so, I looked at the HTTP save-and-play advanced example and modified the VAD example.

I made a multi-output pipeline from the I2S stream pipeline: one output feeds the HTTP pipeline and the other feeds the raw pipeline. The while loop then uses VAD to analyze whether someone is speaking.

The code:

/* Example of Voice Activity Detection (VAD)

   This example code is in the Public Domain (or CC0 licensed, at your option.)

   Unless required by applicable law or agreed to in writing, this
   software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
   CONDITIONS OF ANY KIND, either express or implied.
*/

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "freertos/FreeRTOS.h"
#include "esp_log.h"
#include "board.h"
#include "audio_common.h"
#include "audio_pipeline.h"
#include "esp_wifi.h"
#include "nvs_flash.h"
#include "i2s_stream.h"
#include "raw_stream.h"
#include "http_stream.h"
#include "periph_wifi.h"
#include "esp_http_client.h"
#include "filter_resample.h"
#include "esp_vad.h"

#include "audio_idf_version.h"

/* Log tag used by the ESP_LOGx calls in this file. */
static const char *TAG = "HTTP-VAD";

/* Sample rate configured on the I2S reader and passed to vad_process(). */
#define VAD_SAMPLE_RATE_HZ 16000
/* Duration of one VAD frame, in milliseconds. */
#define VAD_FRAME_LENGTH_MS 30
/* Number of 16-bit samples in one VAD frame (30 ms at 16 kHz = 480 samples). */
#define VAD_BUFFER_LENGTH (VAD_FRAME_LENGTH_MS * VAD_SAMPLE_RATE_HZ / 1000)

/* pipeline_http carries i2s -> http; pipeline_vad holds the raw stream read by the VAD loop. */
audio_pipeline_handle_t pipeline_http, pipeline_vad;
/* Audio elements created and registered to the pipelines in app_main(). */
audio_element_handle_t i2s_stream_reader, raw_read, http_stream_writer;

/**
 * HTTP stream event callback: uploads the recorded audio as a chunked POST.
 *
 * Events handled:
 *  - HTTP_STREAM_PRE_REQUEST:    select POST and set the x-audio-* headers.
 *  - HTTP_STREAM_ON_REQUEST:     write msg->buffer as one HTTP chunk
 *                                ("<hex length>\r\n" + payload + "\r\n").
 *  - HTTP_STREAM_POST_REQUEST:   write the terminating zero-length chunk.
 *  - HTTP_STREAM_FINISH_REQUEST: read and log the start of the server response.
 *
 * @param msg  Event message; provides the event id, the HTTP client handle
 *             and, for ON_REQUEST, the data buffer and its length.
 *
 * @return msg->buffer_len for a successful ON_REQUEST, ESP_OK for any other
 *         successfully handled (or unhandled) event, ESP_FAIL on a
 *         write/read error.
 */
esp_err_t _http_stream_event_handle(http_stream_event_msg_t *msg)
{
    esp_http_client_handle_t http = (esp_http_client_handle_t)msg->http_client;
    static int total_write = 0;

    if (msg->event_id == HTTP_STREAM_PRE_REQUEST) {
        // set header
        ESP_LOGI(TAG, "[ + ] HTTP client HTTP_STREAM_PRE_REQUEST, lenght=%d", msg->buffer_len);
        esp_http_client_set_method(http, HTTP_METHOD_POST);
        char dat[10] = {0};
        snprintf(dat, sizeof(dat), "%d", VAD_SAMPLE_RATE_HZ);
        esp_http_client_set_header(http, "x-audio-sample-rates", dat);
        memset(dat, 0, sizeof(dat));
        snprintf(dat, sizeof(dat), "%d", 16);
        esp_http_client_set_header(http, "x-audio-bits", dat);
        memset(dat, 0, sizeof(dat));
        snprintf(dat, sizeof(dat), "%d", 1);
        esp_http_client_set_header(http, "x-audio-channel", dat);
        total_write = 0;
        return ESP_OK;
    }

    if (msg->event_id == HTTP_STREAM_ON_REQUEST) {
        // write data as a single chunk: size line, payload, trailing CRLF
        char len_buf[16];
        int wlen = snprintf(len_buf, sizeof(len_buf), "%x\r\n", msg->buffer_len);
        if (wlen < 0 || wlen >= (int)sizeof(len_buf)) {
            return ESP_FAIL;
        }
        if (esp_http_client_write(http, len_buf, wlen) <= 0) {
            return ESP_FAIL;
        }
        if (esp_http_client_write(http, msg->buffer, msg->buffer_len) <= 0) {
            return ESP_FAIL;
        }
        if (esp_http_client_write(http, "\r\n", 2) <= 0) {
            return ESP_FAIL;
        }
        total_write += msg->buffer_len;
        ESP_LOGD(TAG, "Total bytes written: %d", total_write);
        return msg->buffer_len;
    }

    if (msg->event_id == HTTP_STREAM_POST_REQUEST) {
        ESP_LOGI(TAG, "[ + ] HTTP client HTTP_STREAM_POST_REQUEST, write end chunked marker");
        if (esp_http_client_write(http, "0\r\n\r\n", 5) <= 0) {
            return ESP_FAIL;
        }
        return ESP_OK;
    }

    if (msg->event_id == HTTP_STREAM_FINISH_REQUEST) {
        ESP_LOGI(TAG, "[ + ] HTTP client HTTP_STREAM_FINISH_REQUEST");
        char buf[64] = {0};
        // Read at most sizeof(buf) - 1 bytes so the terminator below always
        // lands inside the buffer, even when the response fills it.
        int read_len = esp_http_client_read(http, buf, sizeof(buf) - 1);
        if (read_len <= 0) {
            return ESP_FAIL;
        }
        buf[read_len] = 0;
        ESP_LOGI(TAG, "Got HTTP Response = %s", buf);
        return ESP_OK;
    }
    return ESP_OK;
}

/**
 * Application entry point.
 *
 * Sequence:
 *  1. Initialize NVS and the network interface, then connect to Wi-Fi.
 *  2. Start the codec and build pipeline_http: i2s_stream -> http_stream.
 *  3. Build pipeline_vad with a single raw stream, and attach that raw
 *     stream's output ring buffer as multi-output 0 of the I2S reader.
 *  4. Read 30 ms frames from the raw stream and run VAD on each one until
 *     more than 70 consecutive non-speech frames have been seen.
 *  5. Stop both pipelines and release all resources.
 */
void app_main(void)
{
    esp_log_level_set("*", ESP_LOG_WARN);
    esp_log_level_set(TAG, ESP_LOG_INFO);

    esp_err_t err = nvs_flash_init();
    if (err == ESP_ERR_NVS_NO_FREE_PAGES) {
        // NVS partition was truncated and needs to be erased
        // Retry nvs_flash_init
        ESP_ERROR_CHECK(nvs_flash_erase());
        err = nvs_flash_init();
    }
    // Do not continue with an unusable NVS partition.
    ESP_ERROR_CHECK(err);
    #if (ESP_IDF_VERSION >= ESP_IDF_VERSION_VAL(4, 1, 0))
        ESP_ERROR_CHECK(esp_netif_init());
    #else
        tcpip_adapter_init();
    #endif

    ESP_LOGI(TAG, "[ 1 ] Initialize Button Peripheral & Connect to wifi network");
    // Initialize peripherals management
    esp_periph_config_t periph_cfg = DEFAULT_ESP_PERIPH_SET_CONFIG();
    esp_periph_set_handle_t set = esp_periph_set_init(&periph_cfg);

    // NOTE(review): Wi-Fi credentials are hard-coded in the source; move them
    // to project configuration before sharing or committing this file.
    periph_wifi_cfg_t wifi_cfg = {
        .ssid = "NETGEAR",
        .password = "vanillacream239",
    };
    esp_periph_handle_t wifi_handle = periph_wifi_init(&wifi_cfg);

    // Start wifi & button peripheral
    esp_periph_start(set, wifi_handle);
    periph_wifi_wait_for_connected(wifi_handle, portMAX_DELAY);

    ESP_LOGI(TAG, "[ 2 ] Start codec chip");
    audio_board_handle_t board_handle = audio_board_init();
    audio_hal_ctrl_codec(board_handle->audio_hal, AUDIO_HAL_CODEC_MODE_BOTH, AUDIO_HAL_CTRL_START);

    ESP_LOGI(TAG, "[ 3.0 ] Create audio pipeline for recording");
    audio_pipeline_cfg_t pipeline_cfg = DEFAULT_AUDIO_PIPELINE_CONFIG();
    pipeline_http = audio_pipeline_init(&pipeline_cfg);
    mem_assert(pipeline_http);

    ESP_LOGI(TAG, "[3.1] Create i2s stream to read audio data from codec chip");
    i2s_stream_cfg_t i2s_cfg = I2S_STREAM_CFG_DEFAULT();
    i2s_cfg.i2s_config.sample_rate = VAD_SAMPLE_RATE_HZ;
    i2s_cfg.i2s_config.channel_format = I2S_CHANNEL_FMT_ONLY_RIGHT; // Only use right channel to avoid add a filter on raw to vad
    i2s_cfg.out_rb_size = 32 * 1024; // Increase buffer to avoid missing data in bad network conditions
    i2s_cfg.task_core = 0;
    i2s_cfg.type = AUDIO_STREAM_READER;
    i2s_cfg.multi_out_num = 1; // one extra output, used below for the VAD branch
#if defined CONFIG_ESP_LYRAT_MINI_V1_1_BOARD
    i2s_cfg.i2s_port = 1;
#if (ESP_IDF_VERSION <= ESP_IDF_VERSION_VAL(4, 0, 0))
    i2s_cfg.i2s_config.channel_format = I2S_CHANNEL_FMT_ONLY_RIGHT;
#endif
#endif
    i2s_stream_reader = i2s_stream_init(&i2s_cfg);

    ESP_LOGI(TAG, "[3.2] Create http stream to post data to server");

    http_stream_cfg_t http_cfg = HTTP_STREAM_CFG_DEFAULT();
    http_cfg.type = AUDIO_STREAM_WRITER;
    http_cfg.out_rb_size = 32 * 1024;
    http_cfg.task_core = 1;
    http_cfg.event_handle = _http_stream_event_handle;
    http_stream_writer = http_stream_init(&http_cfg);
    audio_element_set_uri(http_stream_writer, "http://10.0.0.4:8000/upload");

    ESP_LOGI(TAG, "[ 3.3 ] Register all elements to HTTP pipeline");
    audio_pipeline_register(pipeline_http, i2s_stream_reader, "i2s");
    audio_pipeline_register(pipeline_http, http_stream_writer, "http");

    ESP_LOGI(TAG, "[ 3.4 ] Link elements together [codec_chip]-->i2s_stream-->http-->[http_server]");
    const char *link_tag[2] = {"i2s", "http"};
    audio_pipeline_link(pipeline_http, &link_tag[0], 2);

    ESP_LOGI(TAG, "[4.0] Create raw to receive data");
    raw_stream_cfg_t raw_cfg = {
        .out_rb_size = 32 * 1024,
        .type = AUDIO_STREAM_READER,
    };
    raw_read = raw_stream_init(&raw_cfg);

    ESP_LOGI(TAG, "[4.1] Create pipeline to vad file");
    audio_pipeline_cfg_t pipeline_vad_cfg = DEFAULT_AUDIO_PIPELINE_CONFIG();
    // Assign the file-scope handle instead of shadowing it with a local.
    pipeline_vad = audio_pipeline_init(&pipeline_vad_cfg);
    mem_assert(pipeline_vad);

    ESP_LOGI(TAG, "[4.1] Register pipeline to vad");
    audio_pipeline_register(pipeline_vad, raw_read, "raw");

    const char *link_save[1] = {"raw"};
    audio_pipeline_link(pipeline_vad, &link_save[0], 1);

    ESP_LOGI(TAG, "[4.2] Connect input ringbuffer of pipeline_http to pipeline_vad");
    ringbuf_handle_t rb = audio_element_get_output_ringbuf(raw_read);
    if (rb == NULL) {
        // NOTE(review): confirm that linking a single-element pipeline gives
        // raw_read an output ring buffer, and that raw_stream_read() consumes
        // from this same buffer.
        ESP_LOGE(TAG, "raw stream has no output ring buffer");
    }
    audio_element_set_multi_output_ringbuf(i2s_stream_reader, rb, 0);

    ESP_LOGI(TAG, "[ 5 ] Start audio_pipeline");
    audio_pipeline_run(pipeline_http);
    audio_pipeline_run(pipeline_vad);

    ESP_LOGI(TAG, "[ 6 ] Initialize VAD handle");
    vad_handle_t vad_inst = vad_create(VAD_MODE_3);

    const int frame_bytes = (int)(VAD_BUFFER_LENGTH * sizeof(int16_t));
    int16_t *vad_buff = malloc(frame_bytes);

    if (vad_inst == NULL || vad_buff == NULL) {
        // Skip the VAD loop entirely rather than reading into a NULL buffer;
        // fall through to the normal teardown below.
        ESP_LOGE(TAG, "Memory allocation failed!");
    } else {
        int silent_frames = 0;
        while (1) {
            int bytes_read = raw_stream_read(raw_read, (char *)vad_buff, frame_bytes);
            if (bytes_read < frame_bytes) {
                // Error, abort or end of stream: never run VAD on a partial or stale frame.
                ESP_LOGW(TAG, "Raw stream read returned %d, stop VAD loop", bytes_read);
                break;
            }

            // Feed samples to the VAD process and get the result
            vad_state_t vad_state = vad_process(vad_inst, vad_buff, VAD_SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS);
            if (vad_state == VAD_SPEECH) {
                ESP_LOGI(TAG, "Speech detected");
                // Count consecutive silence only, so that short pauses
                // between words do not add up to an early stop.
                silent_frames = 0;
            } else if (++silent_frames > 70) {
                ESP_LOGI(TAG, "No more speech");
                break;
            }
        }
    }
    audio_element_set_ringbuf_done(i2s_stream_reader);
    free(vad_buff);
    vad_buff = NULL;

    ESP_LOGI(TAG, "[ 7 ] Destroy VAD");
    if (vad_inst != NULL) {
        vad_destroy(vad_inst);
    }

    ESP_LOGI(TAG, "[ 8 ] Stop audio_pipeline and release all resources");
    audio_pipeline_stop(pipeline_http);
    audio_pipeline_wait_for_stop(pipeline_http);
    audio_pipeline_terminate(pipeline_http);
    audio_pipeline_stop(pipeline_vad);
    audio_pipeline_wait_for_stop(pipeline_vad);
    audio_pipeline_terminate(pipeline_vad);

    audio_pipeline_unregister(pipeline_http, http_stream_writer);
    audio_pipeline_unregister(pipeline_http, i2s_stream_reader);
    audio_pipeline_unregister(pipeline_vad, raw_read);

    /* Terminate the pipeline before removing the listener */
    audio_pipeline_remove_listener(pipeline_http);
    audio_pipeline_remove_listener(pipeline_vad);

    /* Release all resources */
    audio_pipeline_deinit(pipeline_http);
    audio_pipeline_deinit(pipeline_vad);
    audio_element_deinit(http_stream_writer);
    audio_element_deinit(i2s_stream_reader);
    audio_element_deinit(raw_read);

}

The Log I get running on IDF 4.4:

I (29) boot: ESP-IDF v4.4.6-dirty 2nd stage bootloader
I (29) boot: compile time 11:24:51
I (29) boot: Multicore bootloader
I (33) boot: chip revision: v3.1
I (37) boot.esp32: SPI Speed      : 40MHz
I (41) boot.esp32: SPI Mode       : DIO
I (46) boot.esp32: SPI Flash Size : 8MB
I (50) boot: Enabling RNG early entropy source...
I (56) boot: Partition Table:
I (59) boot: ## Label            Usage          Type ST Offset   Length
I (67) boot:  0 nvs              WiFi data        01 02 00009000 00006000
I (74) boot:  1 phy_init         RF data          01 01 0000f000 00001000
I (82) boot:  2 factory          factory app      00 00 00010000 00177000
I (89) boot: End of partition table
I (93) esp_image: segment 0: paddr=00010020 vaddr=3f400020 size=368e0h (223456) map
I (183) esp_image: segment 1: paddr=00046908 vaddr=3ffb0000 size=037bch ( 14268) load
I (189) esp_image: segment 2: paddr=0004a0cc vaddr=40080000 size=05f4ch ( 24396) load
I (199) esp_image: segment 3: paddr=00050020 vaddr=400d0020 size=dea98h (912024) map
I (529) esp_image: segment 4: paddr=0012eac0 vaddr=40085f4c size=16ea0h ( 93856) load
I (581) boot: Loaded app from partition at offset 0x10000
I (582) boot: Disabling RNG early entropy source...
I (593) cpu_start: Multicore app
I (593) psram: This chip is ESP32-D0WD
I (595) spiram: Found 64MBit SPI RAM device
I (595) spiram: SPI RAM mode: flash 40m sram 40m
I (600) spiram: PSRAM initialized, cache is in low/high (2-core) mode.
I (608) cpu_start: Pro cpu up.
I (611) cpu_start: Starting app cpu, entry point is 0x40081750
0x40081750: call_start_cpu1 at /Users/edouardmallecourt/esp/v4.4/esp-idf/components/esp_system/port/cpu_start.c:151

I (0) cpu_start: App cpu up.
I (1505) spiram: SPI SRAM memory test OK
I (1513) cpu_start: Pro cpu start user code
I (1513) cpu_start: cpu freq: 240000000
I (1513) cpu_start: Application information:
I (1516) cpu_start: Project name:     http_and_vad
I (1521) cpu_start: App version:      1443a96-dirty
I (1527) cpu_start: Compile time:     Dec 13 2023 11:24:34
I (1533) cpu_start: ELF file SHA256:  9296dd32aa2d3579...
I (1539) cpu_start: ESP-IDF:          v4.4.6-dirty
I (1545) cpu_start: Min chip rev:     v0.0
I (1549) cpu_start: Max chip rev:     v3.99 
I (1554) cpu_start: Chip rev:         v3.1
I (1559) heap_init: Initializing. RAM available for dynamic allocation:
I (1567) heap_init: At 3FFAE6E0 len 00001920 (6 KiB): DRAM
I (1573) heap_init: At 3FFB8D88 len 00027278 (156 KiB): DRAM
I (1579) heap_init: At 3FFE0440 len 00003AE0 (14 KiB): D/IRAM
I (1585) heap_init: At 3FFE4350 len 0001BCB0 (111 KiB): D/IRAM
I (1592) heap_init: At 4009CDEC len 00003214 (12 KiB): IRAM
I (1598) spiram: Adding pool of 4095K of external SPI memory to heap allocator
I (1607) spi_flash: detected chip: gd
I (1610) spi_flash: flash io: dio
I (1616) cpu_start: Starting scheduler on PRO CPU.
I (0) cpu_start: Starting scheduler on APP CPU.
I (1625) spiram: Reserving pool of 32K of internal memory for DMA/internal allocations
I (1665) HTTP-VAD: [ 1 ] Initialize Button Peripheral & Connect to wifi network
W (1685) phy_init: failed to load RF calibration data (0xffffffff), falling back to full calibration
W (2875) PERIPH_WIFI: WiFi Event cb, Unhandle event_base:WIFI_EVENT, event_id:4
I (4875) HTTP-VAD: [ 2 ] Start codec chip
E (4875) gpio: gpio_install_isr_service(450): GPIO isr service already installed
I (4895) HTTP-VAD: [ 3.0 ] Create audio pipeline for recording
I (4895) HTTP-VAD: [3.1] Create i2s stream to read audio data from codec chip
I (4905) HTTP-VAD: [3.2] Create http stream to post data to server
I (4905) HTTP-VAD: [ 3.3 ] Register all elements to HTTP pipeline
I (4915) HTTP-VAD: [ 3.4 ] Link elements together [codec_chip]-->i2s_stream-->http-->[http_server]
I (4925) HTTP-VAD: [4.0] Create raw to receive data
I (4935) HTTP-VAD: [4.1] Create pipeline to vad file
I (4935) HTTP-VAD: [4.1] Register pipeline to vad
I (4945) HTTP-VAD: [4.2] Connect input ringbuffer of pipeline_http to pipeline_vad
I (4945) HTTP-VAD: [ 5 ] Start audio_pipeline
I (4955) HTTP-VAD: [ 6 ] Initialize VAD handle
I (4955) HTTP-VAD: [ + ] HTTP client HTTP_STREAM_PRE_REQUEST, lenght=0
I (4985) HTTP-VAD: Speech detected
I (4985) HTTP-VAD: Speech detected
I (5035) HTTP-VAD: Speech detected
Total bytes written: 4096
Total bytes written: 8192
Total bytes written: 12288
Total bytes written: 16384
Total bytes written: 20480
Total bytes written: 24576
I (5935) HTTP-VAD: Speech detected
Total bytes written: 28672
Total bytes written: 32768
Total bytes written: 36864
Total bytes written: 40960
I (6645) HTTP-VAD: Speech detected
I (6645) HTTP-VAD: Speech detected
I (6655) HTTP-VAD: Speech detected
Total bytes written: 45056
I (6835) HTTP-VAD: Speech detected
Total bytes written: 49152
Total bytes written: 53248
Total bytes written: 57344
I (7345) HTTP-VAD: Speech detected
I (7345) HTTP-VAD: Speech detected
I (7345) HTTP-VAD: Speech detected
I (7415) HTTP-VAD: Speech detected
I (7415) HTTP-VAD: Speech detected
Total bytes written: 61440
Total bytes written: 65536
Total bytes written: 69632
Total bytes written: 73728
Total bytes written: 77824
Total bytes written: 81920
Total bytes written: 86016
Total bytes written: 90112
I (8505) HTTP-VAD: Speech detected
I (8505) HTTP-VAD: Speech detected
Total bytes written: 94208
I (8565) HTTP-VAD: Speech detected
Total bytes written: 98304
Total bytes written: 102400
I (9015) HTTP-VAD: Speech detected
I (9015) HTTP-VAD: Speech detected
I (9015) HTTP-VAD: Speech detected
Total bytes written: 106496
Total bytes written: 110592
Total bytes written: 114688
Total bytes written: 118784
Total bytes written: 122880
Total bytes written: 126976
I (9835) HTTP-VAD: Speech detected
I (9835) HTTP-VAD: Speech detected
Total bytes written: 131072
I (10045) HTTP-VAD: Speech detected
I (10045) HTTP-VAD: Speech detected
Total bytes written: 135168
Total bytes written: 139264
Total bytes written: 143360
Total bytes written: 147456
Total bytes written: 151552
Total bytes written: 155648
Total bytes written: 159744
Total bytes written: 163840
Total bytes written: 167936
Total bytes written: 172032
I (11765) HTTP-VAD: Speech detected
I (11765) HTTP-VAD: Speech detected
I (11825) HTTP-VAD: Speech detected
I (11825) HTTP-VAD: Speech detected
I (11895) HTTP-VAD: Speech detected
I (11895) HTTP-VAD: Speech detected
I (11895) HTTP-VAD: Speech detected
I (11955) HTTP-VAD: Speech detected
I (11955) HTTP-VAD: Speech detected
I (12015) HTTP-VAD: Speech detected
I (12015) HTTP-VAD: Speech detected
I (12145) HTTP-VAD: Speech detected
I (12215) HTTP-VAD: Speech detected
I (12215) HTTP-VAD: Speech detected
I (12275) HTTP-VAD: Speech detected
I (12275) HTTP-VAD: Speech detected
I (12325) HTTP-VAD: Speech detected
I (12405) HTTP-VAD: Speech detected
I (12465) HTTP-VAD: Speech detected
I (12465) HTTP-VAD: Speech detected
Total bytes written: 176128
Total bytes written: 180224
Total bytes written: 184320
Total bytes written: 188416
I (14395) HTTP-VAD: No more speech
I (14395) HTTP-VAD: [ 7 ] Destroy VAD
I (14395) HTTP-VAD: [ 8 ] Stop audio_pipeline and release all resources
Total bytes written: 192512
I (14395) HTTP-VAD: [ + ] HTTP client HTTP_STREAM_POST_REQUEST, write end chunked marker
I (14415) HTTP-VAD: [ + ] HTTP client HTTP_STREAM_FINISH_REQUEST
W (14425) HTTP_CLIENT: esp_transport_read returned:-1 and errno:128 
I (14425) HTTP-VAD: Got HTTP Response = File 20231213T102637Z_16000_16_1.wav was written, size 192512
W (14445) AUDIO_PIPELINE: There are no listener registered
W (14445) AUDIO_PIPELINE: There are no listener registered
W (14455) AUDIO_PIPELINE: There are no listener registered
W (14455) AUDIO_PIPELINE: There are no listener registered
W (14465) AUDIO_ELEMENT: [http] Element has not create when AUDIO_ELEMENT_TERMINATE
W (14475) AUDIO_ELEMENT: [i2s] Element has not create when AUDIO_ELEMENT_TERMINATE
W (14485) AUDIO_ELEMENT: [raw] Element has not create when AUDIO_ELEMENT_TERMINATE

So the code runs well and I get the sound on the server, but there are two major issues: the VAD isn't very good, and there are many glitches in the audio received by the server. (The audio is recorded on the server by the server.py script from the pipeline_raw_http example.)

My impression is that the glitches on the server occur when the VAD process gets to run and the raw pipeline consumes part of the buffer.

Is this a configuration problem in my multi-output pipeline, or is it something else?

Thank you for your help

Edouard

espressif / esp-adf

I2S multi stream to HTTP and VAD (AUD-5076) #1120

Environment