espressif / esp-adf

Espressif Audio Development Framework
Other
1.56k stars 689 forks source link

M4A parsing fails for files recorded on Android (Google Pixel 8) devices (AUD-5792) #1298

Closed nathan-swidget closed 3 weeks ago

nathan-swidget commented 1 month ago

Custom Board using ESP-S3-PICO-1 ESP-IDF: release/v5.3 (707d097b01756687cca18be855a2675d150247ae)
ESP-ADF: v2.7 (9cf556de500019bb79f3bb84c821fda37668c052)

Hey, I'm running into issues when trying to play back M4A recordings generated on Android devices. The sample M4A file, as well as recordings from iOS, works as expected.

E (980) M4A_PARSER: Sample number miss match, line:566
E (990) M4A_PARSER: Error opening audio
E (995) AAC_DECODER: Error mp4 stream, release
E (1000) AUDIO_ELEMENT: [dec] AEL_STATUS_ERROR_OPEN,-1

The full log output can be found below.

Example of failing file: https://spades-test-public-access.s3.us-east-1.amazonaws.com/android.m4a

#include "audio_element.h"
#include "audio_pipeline.h"
#include "audio_event_iface.h"
#include "audio_mem.h"
#include "audio_common.h"
#include "i2s_stream.h"
#include "mp3_decoder.h"
#include "aac_decoder.h"
#include "http_stream.h"
#include "board.h"
#include <string.h>

#include <string>

static const char *TAG = "audio test app";

extern const uint8_t audio_start_asm[] asm("_binary_android_m4a_start");
extern const uint8_t audio_end_asm[] asm("_binary_android_m4a_end");

/**
 * Per-element state of the in-memory source stream.
 *
 * An instance is allocated in buf_stream_init(), attached to the element with
 * audio_element_setdata() and released in buf_stream_destroy().
 */
typedef struct {
    const uint8_t *data; /* first byte of the buffer that is streamed out */
    int size;            /* number of bytes available at `data` */
} buf_stream_t;

/**
 * Configuration handed to buf_stream_init().
 *
 * The fields are copied into the element's private state; the buffer itself
 * is not copied, only the pointer to it.
 */
typedef struct {
    const uint8_t *data; /* first byte of the buffer to stream */
    int size;            /* number of bytes available at `data` */
} buf_stream_cfg_t;

/**
 * Open callback of the in-memory source element.
 *
 * Reports the size of the backing buffer as the element's total byte count.
 *
 * @param self  Element whose user data is a buf_stream_t.
 * @return Whatever audio_element_set_total_bytes() returns.
 */
static esp_err_t buf_stream_open(audio_element_handle_t self)
{
    const buf_stream_t *stream = (const buf_stream_t *)audio_element_getdata(self);
    return audio_element_set_total_bytes(self, stream->size);
}

/**
 * Read callback of the in-memory source element.
 *
 * Copies up to `len` bytes from the backing buffer, starting at the element's
 * current byte position, into `buffer` and advances the byte position by the
 * number of bytes copied.
 *
 * The read window is clamped to the buffer: a byte position outside
 * [0, size) or a non-positive `len` yields 0 bytes instead of handing a
 * negative length to memcpy().
 *
 * @param self           Element whose user data is a buf_stream_t.
 * @param buffer         Destination for the copied bytes.
 * @param len            Maximum number of bytes to copy.
 * @param ticks_to_wait  Unused; the data is already in memory.
 * @param context        Unused.
 * @return Number of bytes copied, or 0 when no data is left.
 */
static audio_element_err_t buf_stream_read(audio_element_handle_t self, char *buffer, int len, TickType_t ticks_to_wait, void *context)
{
    (void)ticks_to_wait;
    (void)context;

    buf_stream_t *buf_stream = (buf_stream_t *)audio_element_getdata(self);
    audio_element_info_t info;
    audio_element_getinfo(self, &info);
    int pos = (int)info.byte_pos;

    // Bytes still available from `pos`; zero if the position is not inside the buffer.
    int remaining = 0;
    if (pos >= 0 && pos < buf_stream->size) {
        remaining = buf_stream->size - pos;
    }

    int rlen = (len < remaining) ? len : remaining;
    if (rlen < 0) {
        rlen = 0; // negative request length: nothing to copy
    }

    if (rlen > 0) {
        memcpy(buffer, buf_stream->data + pos, rlen);
    }
    printf("Start to read %d/%d\n", pos, buf_stream->size);
    if (rlen == 0) {
        printf("No more data, ret: %d\r\n", rlen);
    } else {
        audio_element_update_byte_pos(self, rlen);
    }
    return (audio_element_err_t)rlen;
}

/**
 * Process callback of the in-memory source element.
 *
 * Pulls one chunk through audio_element_input() and, if any bytes arrived,
 * forwards them unchanged through audio_element_output().
 *
 * @param self       Element being processed.
 * @param in_buffer  Scratch buffer used for the chunk.
 * @param in_len     Capacity of `in_buffer` in bytes.
 * @return The result of audio_element_output() when data was read, otherwise
 *         the non-positive result of audio_element_input().
 */
static audio_element_err_t buf_stream_process(audio_element_handle_t self, char *in_buffer, int in_len)
{
    const int bytes_in = audio_element_input(self, in_buffer, in_len);
    if (bytes_in <= 0) {
        // Nothing to forward: hand the input status straight back.
        return (audio_element_err_t)bytes_in;
    }

    const int bytes_out = audio_element_output(self, in_buffer, bytes_in);
    return (audio_element_err_t)bytes_out;
}

/**
 * Close callback of the in-memory source element.
 *
 * There is nothing to release on close, so this always succeeds.
 *
 * @param self  Unused.
 * @return ESP_OK.
 */
static esp_err_t buf_stream_close(audio_element_handle_t self)
{
    (void)self;
    return ESP_OK;
}

/**
 * Destroy callback of the in-memory source element.
 *
 * Frees the buf_stream_t that was attached to the element as user data.
 *
 * @param self  Element whose user data is released.
 * @return ESP_OK.
 */
static esp_err_t buf_stream_destroy(audio_element_handle_t self)
{
    void *state = audio_element_getdata(self);
    audio_free(state);
    return ESP_OK;
}

/**
 * Create an audio element that streams a memory buffer.
 *
 * The element keeps a pointer to the caller's buffer (it is not copied), so
 * the buffer must stay valid for as long as the element is in use.
 *
 * @param buf_cfg  Buffer pointer and size to stream.
 * @return The new element, or NULL if `buf_cfg` is invalid, the private state
 *         cannot be allocated, or audio_element_init() fails.
 */
audio_element_handle_t buf_stream_init(buf_stream_cfg_t * buf_cfg)
{
    if (buf_cfg == NULL || buf_cfg->size < 0 || (buf_cfg->data == NULL && buf_cfg->size > 0)) {
        return NULL;
    }

    buf_stream_t *buf_stream = (buf_stream_t *)audio_calloc(1, sizeof(*buf_stream));
    if (buf_stream == NULL) {
        return NULL;
    }
    buf_stream->data = buf_cfg->data;
    buf_stream->size = buf_cfg->size;

    audio_element_cfg_t cfg = DEFAULT_AUDIO_ELEMENT_CONFIG();
    cfg.open = buf_stream_open;
    cfg.close = buf_stream_close;
    cfg.process = buf_stream_process;
    cfg.destroy = buf_stream_destroy;
    cfg.read = buf_stream_read;
    cfg.task_stack = 3 * 1024;
    cfg.task_prio = 5;
    cfg.out_rb_size = 4 * 1024;
    cfg.buffer_len = 1024;
    cfg.tag = "buf";

    audio_element_handle_t el = audio_element_init(&cfg);
    if (el == NULL) {
        // The destroy callback will never run for this state, so free it here.
        audio_free(buf_stream);
        return NULL;
    }

    audio_element_setdata(el, buf_stream);
    return el;
}
/**
 * Build the I2S stream configuration used for the output element.
 *
 * The returned configuration describes a writer stream in standard (STD)
 * mode on I2S_NUM_0 as master, with a 16 kHz default clock configuration and
 * 16-bit mono slots. No GPIO pin numbers are assigned here.
 *
 * NOTE(review): the name says "I2C" but the returned type is an I2S stream
 * configuration; the name looks like a typo.
 *
 * @return A fully initialised i2s_stream_cfg_t, returned by value.
 */
static i2s_stream_cfg_t getDefaultI2CConfig() {
    i2s_stream_cfg_t cfg{
        .type = AUDIO_STREAM_WRITER,
        .transmit_mode = I2S_COMM_MODE_STD,
        // Channel (DMA / port) settings.
        .chan_cfg = {
            .id = I2S_NUM_0,
            .role = I2S_ROLE_MASTER,
            .dma_desc_num = 3,
            .dma_frame_num = 312,
            .auto_clear = true
        },
        // Standard-mode clock, slot and GPIO settings.
        .std_cfg = {
            .clk_cfg = I2S_STD_CLK_DEFAULT_CONFIG(16000),
            .slot_cfg = {
                .data_bit_width = I2S_DATA_BIT_WIDTH_16BIT,
                .slot_bit_width = I2S_SLOT_BIT_WIDTH_AUTO,
                .slot_mode = I2S_SLOT_MODE_MONO,
                .slot_mask = I2S_STD_SLOT_RIGHT,
                .ws_width = I2S_DATA_BIT_WIDTH_16BIT,
                .ws_pol = false,
                .bit_shift = true,
                // The remaining slot members depend on the I2S hardware version.
                #if SOC_I2S_HW_VERSION_1
                .msb_right = true,
                #else
                .left_align = true,
                .big_endian = false,
                .bit_order_lsb = false
                #endif
            },
            .gpio_cfg = { // this is loaded from custom board info
                .invert_flags = {
                    .mclk_inv = false,
                    .bclk_inv = false,
                }
            }
        },
        // Stream-element settings; the I2S_STREAM_* values are macros defined outside this file.
        .use_alc = true,
        .volume = 100,
        .out_rb_size = I2S_STREAM_RINGBUFFER_SIZE,                                 
        .task_stack = I2S_STREAM_TASK_STACK,                                       
        .task_core = I2S_STREAM_TASK_CORE,                                         
        .task_prio = I2S_STREAM_TASK_PRIO,                                         
        .stack_in_ext = false,                                                     
        .multi_out_num = 0,                                                        
        .uninstall_drv = true,                                                     
        .need_expand = false,                                                      
        .buffer_len = I2S_STREAM_BUF_SIZE                                     
    };

    return cfg;
}

extern "C" void play_audio() {
    audio_board_handle_t board_handle = audio_board_init();
    audio_hal_ctrl_codec(board_handle->audio_hal, AUDIO_HAL_CODEC_MODE_DECODE, AUDIO_HAL_CTRL_START);

    audio_pipeline_cfg_t pipeline_cfg = DEFAULT_AUDIO_PIPELINE_CONFIG();
    audio_pipeline_handle_t pipeline = audio_pipeline_init(&pipeline_cfg);

    audio_hal_set_volume(board_handle->audio_hal, 1);
    i2s_stream_cfg_t i2s_cfg = getDefaultI2CConfig();
    audio_element_handle_t i2sWriter = i2s_stream_init(&i2s_cfg);
    i2s_stream_set_clk(i2sWriter, 16000, 16, 1);

    audio_element_handle_t source;

    buf_stream_cfg_t buf_stream_cfg = {
        .data = audio_start_asm,
        .size = (int)(audio_end_asm - audio_start_asm),
    };

    source = buf_stream_init(&buf_stream_cfg);

    audio_element_handle_t decoder;
    aac_decoder_cfg_t aac_dec_cfg  = DEFAULT_AAC_DECODER_CONFIG();
    decoder = aac_decoder_init(&aac_dec_cfg);

    const char *link_tag[3] = {"source", "dec",  "i2s"};
    audio_pipeline_register(pipeline, source, "source");
    audio_pipeline_register(pipeline, decoder, "dec");
    audio_pipeline_register(pipeline, i2sWriter, "i2s");

    audio_pipeline_link(pipeline, &link_tag[0], 3);
    audio_event_iface_cfg_t evt_cfg = AUDIO_EVENT_IFACE_DEFAULT_CFG();
    audio_event_iface_handle_t eventHandle = audio_event_iface_init(&evt_cfg);
    audio_pipeline_set_listener(pipeline, eventHandle);
    audio_pipeline_run(pipeline);
    while(1) {
        audio_event_iface_msg_t msg;
        esp_err_t ret = audio_event_iface_listen(eventHandle, &msg, portMAX_DELAY);
        if (ret != ESP_OK) continue;
        if (msg.cmd == AEL_MSG_CMD_STOP) {
            break;
        }
        if (msg.source_type == AUDIO_ELEMENT_TYPE_ELEMENT && msg.cmd == AEL_MSG_CMD_REPORT_MUSIC_INFO) {
            audio_element_info_t music_info = {0};
            audio_element_getinfo((audio_element_handle_t)msg.source, &music_info);
            i2s_stream_set_clk(i2sWriter, music_info.sample_rates, music_info.bits, music_info.channels);
            continue;
        }

        if (msg.source_type == AUDIO_ELEMENT_TYPE_ELEMENT && msg.source == (void *) i2sWriter
            && msg.cmd == AEL_MSG_CMD_REPORT_STATUS) {
            if ((int)msg.data == AEL_STATUS_STATE_FINISHED || (int)msg.data == AEL_STATUS_STATE_STOPPED) {
                break;
            }
        }
    }

    audio_pipeline_stop(pipeline);
    audio_pipeline_wait_for_stop(pipeline);
    audio_pipeline_unlink(pipeline);
    audio_pipeline_unregister(pipeline, i2sWriter);
    audio_element_deinit(i2sWriter);
    audio_pipeline_unregister(pipeline, decoder);
    audio_element_deinit(decoder);
    audio_pipeline_remove_listener(pipeline);
}

/**
 * Application entry point: runs the one-shot playback test and returns.
 */
extern "C" void app_main(void)
{
    play_audio();
}

Log:

ESP-ROM:esp32s3-20210327
Build:Mar 27 2021
rst:0x1 (POWERON),boot:0x2b (SPI_FAST_FLASH_BOOT)
SPIWP:0xee
mode:DIO, clock div:1
load:0x3fce2810,len:0x178c
load:0x403c8700,len:0x4
load:0x403c8704,len:0xcb8
load:0x403cb700,len:0x2db0
entry 0x403c8914
I (27) boot: ESP-IDF v5.3-384-g1216499a98-dirty 2nd stage bootloader
I (27) boot: compile time Oct 23 2024 15:20:03
I (28) boot: Multicore bootloader
I (32) boot: chip revision: v0.2
I (35) boot.esp32s3: Boot SPI Speed : 80MHz
I (40) boot.esp32s3: SPI Mode       : DIO
I (45) boot.esp32s3: SPI Flash Size : 8MB
I (50) boot: Enabling RNG early entropy source...
I (55) boot: Partition Table:
I (59) boot: ## Label            Usage          Type ST Offset   Length
I (66) boot:  0 nvs              WiFi data        01 02 00009000 00004000
I (73) boot:  1 otadata          OTA data         01 00 0000d000 00002000
I (81) boot:  2 phy_init         RF data          01 01 0000f000 00001000
I (88) boot:  3 ota_0            OTA app          00 10 00010000 00300000
I (96) boot:  4 ota_1            OTA app          00 11 00310000 00300000
I (103) boot:  5 factory_nvs      WiFi data        01 02 00610000 00004000
I (111) boot:  6 nvs_keys         NVS keys         01 04 00614000 00001000
I (118) boot:  7 storage          Unknown data     01 82 00615000 00060000
I (126) boot:  8 audio_clips      Unknown data     01 83 00675000 0018b000
I (134) boot: End of partition table
I (138) boot: No factory image, trying OTA 0
I (143) esp_image: segment 0: paddr=00010020 vaddr=3c040020 size=2e260h (189024) map
I (185) esp_image: segment 1: paddr=0003e288 vaddr=3fc94f00 size=01d90h (  7568) load
I (187) esp_image: segment 2: paddr=00040020 vaddr=42000020 size=3d6fch (251644) map
I (236) esp_image: segment 3: paddr=0007d724 vaddr=3fc96c90 size=00ebch (  3772) load
I (237) esp_image: segment 4: paddr=0007e5e8 vaddr=40374000 size=10e24h ( 69156) load
I (264) boot: Loaded app from partition at offset 0x10000
I (290) boot: Set actual ota_seq=1 in otadata[0]
I (290) boot: Disabling RNG early entropy source...
I (301) esp_psram: Found 2MB PSRAM device
I (301) esp_psram: Speed: 40MHz
I (301) cpu_start: Multicore app
I (721) esp_psram: SPI SRAM memory test OK
I (730) cpu_start: Pro cpu start user code
I (730) cpu_start: cpu freq: 160000000 Hz
I (730) app_init: Application information:
I (733) app_init: Project name:     audio_tst
I (738) app_init: App version:      e7c7a95-dirty
I (743) app_init: Compile time:     Oct 23 2024 15:18:41
I (749) app_init: ELF file SHA256:  bb46422c7...
I (755) app_init: ESP-IDF:          v5.3-384-g1216499a98-dirty
I (761) efuse_init: Min chip rev:     v0.0
I (766) efuse_init: Max chip rev:     v0.99 
I (771) efuse_init: Chip rev:         v0.2
I (776) heap_init: Initializing. RAM available for dynamic allocation:
I (783) heap_init: At 3FC985F8 len 00051118 (324 KiB): RAM
I (789) heap_init: At 3FCE9710 len 00005724 (21 KiB): RAM
I (795) heap_init: At 3FCF0000 len 00008000 (32 KiB): DRAM
I (801) heap_init: At 600FE100 len 00001EE8 (7 KiB): RTCRAM
I (808) esp_psram: Adding pool of 2048K of PSRAM memory to heap allocator
I (816) spi_flash: detected chip: gd
I (819) spi_flash: flash io: dio
I (823) sleep: Configure to isolate all GPIO pins in sleep state
I (830) sleep: Enable automatic switching of GPIO sleep configuration
I (837) main_task: Started on CPU0
I (858) esp_psram: Reserving pool of 32K of internal memory for DMA/internal allocations
I (859) main_task: Calling app_main()
I (860) new_codec: new_codec init
I (864) AUDIO_HAL: Codec mode is 2, Ctrl:1
I (871) AUDIO_PIPELINE: link el->rb, el:0x3c070f4c, tag:source, rb:0x3c071260
I (877) AUDIO_PIPELINE: link el->rb, el:0x3c0710f0, tag:dec, rb:0x3c0722a8
I (885) AUDIO_THREAD: The source task allocate stack on internal memory
I (892) AUDIO_ELEMENT: [source-0x3c070f4c] Element task created
I (898) AUDIO_THREAD: The dec task allocate stack on external memory
I (906) AUDIO_ELEMENT: [dec-0x3c0710f0] Element task created
I (912) AUDIO_THREAD: The i2s task allocate stack on internal memory
I (919) AUDIO_ELEMENT: [i2s-0x3c070c40] Element task created
I (925) AUDIO_PIPELINE: Func:audio_pipeline_run, Line:359, MEM Total:2408484 Bytes, Inter:364199 Bytes, Dram:364199 Bytes, Dram largest free:270336Bytes

I (939) AUDIO_ELEMENT: [source] AEL_MSG_CMD_RESUME,state:1
Start to read 0/84596
Start to read 1024/84596
Start to read 2048/84596
Start to read 3072/84596
Start to read 4096/84596
I (957) AUDIO_ELEMENT: [dec] AEL_MSG_CMD_RESUME,state:1
I (963) CODEC_ELEMENT_HELPER: The element is 0x3c0710f0. The reserve data 2 is 0x0.
I (971) AAC_DECODER: A new song playing
Start to read 5120/84596
Start to read 6144/84596
Start to read 7168/84596
Start to read 8192/84596
E (980) M4A_PARSER: Sample number miss match, line:566
E (990) M4A_PARSER: Error opening audio
E (995) AAC_DECODER: Error mp4 stream, release
E (1000) AUDIO_ELEMENT: [dec] AEL_STATUS_ERROR_OPEN,-1
W (1006) AUDIO_ELEMENT: [dec] audio_element_on_cmd_error,7
I (1012) AAC_DECODER: Closed by [7]
Start to read 9216/84596
W (1018) AUDIO_ELEMENT: OUT-[source] AEL_IO_ABORT
I (1024) AUDIO_ELEMENT: [i2s] AEL_MSG_CMD_RESUME,state:1
W (1057) AUDIO_ELEMENT: IN-[i2s] AEL_IO_ABORT
I (1057) AUDIO_PIPELINE: Pipeline started
W (1057) AUDIO_ELEMENT: [source] Element already stopped
W (1062) AUDIO_ELEMENT: [dec] Element already stopped
W (1067) AUDIO_ELEMENT: [i2s] Element already stopped
I (1073) AUDIO_PIPELINE: audio_pipeline_unlinked
W (1078) AUDIO_ELEMENT: [i2s] Element already stopped
W (1085) AUDIO_ELEMENT: [dec] Element already stopped
I (1090) CODEC_ELEMENT_HELPER: The element is 0x3c0710f0. The reserve data 2 is 0x0.
W (1098) AUDIO_PIPELINE: There are no listener registered
I (1104) main_task: Returned from app_main()
TempoTian commented 3 weeks ago

After checking, we found that the M4A file uses a co64 box instead of an stco box, which is not yet supported by the parser. image

We will check and try to enhance the parsing logic. Does your use scenario include playing back M4A files recorded on all kinds of mobile devices? Using a co64 box actually wastes some storage for small files (< 4 GB), so it is not quite suitable for IoT devices (which commonly use FATFS).

TempoTian commented 3 weeks ago

I have tested co64 support. You can download the following lib, place it in the folder esp-adf-libs/esp_codec/lib/esp32s3 (replacing the existing one), and test whether it fixes your issue. libesp_processing.zip

nathan-swidget commented 3 weeks ago

This lib does fix the issue, thanks!