espressif / ESP8266_NONOS_SDK

ESP8266 nonOS SDK
Other
916 stars 533 forks source link

crashes in ieee80211_setup_ratetable #320

Open flameboss opened 4 years ago

flameboss commented 4 years ago

In order to understand why customers were having connection problems, I integrated code from Arduino and other sources to report crashes back to our server. And I've seen over 1000 crash reports in just a few weeks. I'm attaching the code I use to save the exception info to flash.

The most common crash happens when ieee80211_setup_ratetable is calling memcpy.

How can I troubleshoot this?

Here is the info from printstack.sh

Fatal exception 29 STORE_PROHIBITED pc=0x4000df64 sp=0x3ffffbe0 excvaddr=0x00000000 ps=0x00000030 sar=0x00000004 vpri=0x4022426b r00: 0x400018bc=1073748156 r01: 0x3ffffbe0=1073740768 r02: 0x00000000= 0 r03: 0x4024f304=1076163332 r04: 0x000000d4= 212 r05: 0x00000000= 0 r06: 0x0000000c= 12 r07: 0x00000000= 0 r08: 0x4024f3d4=1076163540 r09: 0x00000000= 0 r10: 0x00000000= 0 r11: 0x00000000= 0 r12: 0x00000000= 0 r13: 0x3fff4720=1073694496 r14: 0x00000016= 22 r15: 0x3fff4074=1073692788 section 1 type 1 len 976

Stack dump x3ffffbe0 x3d0: 3ffffbe0: 4000df64 00000030 00000004 4022426b
3ffffbf0: 3fff4720 00000016 3fff4074 0000001d
3ffffc00: 00000000 3fff6660 00000000 4024b7b8
3ffffc10: 00000000 3fffbff0 3fff96c8 401077a8
3ffffc20: 4020769b 00000000 00000020 4010522e
3ffffc30: 3fff4720 3ffffdb0 3ffffdb0 00000000
3ffffc40: 3fff1958 3fff40c0 ffffffcf 3fff1968
3ffffc50: 40000f58 00000000 000088c0 00000000
3ffffc60: 00000000 00000000 3fff1a1d 3fff19e3
3ffffc70: 00000000 3fff1958 3fff37f0 3fff1958
3ffffc80: 00000080 00000080 00000000 3fffdcb0
3ffffc90: 3ffffed8 4010027d 000000b4 00000002
3ffffca0: 3ffe8f90 00006274 3ffef1fc 00000001
3ffffcb0: 00006274 5ffffe00 00000001 401000a0
3ffffcc0: 4a4ef8f8 383c4046 01010000 05040302
3ffffcd0: 00000000 00000000 00000000 00000000
3ffffce0: 00000000 00000000 00000000 00000000
3ffffcf0: 00000000 00000000 00000000 00000000
3ffffd00: 00000000 00000000 00000000 00000000
3ffffd10: 00000000 00000000 00000000 00000000

0x4000df64: ?? ??:0 0x00000000: ?? ??:0 0x4020769b: ieee80211_setup_ratetable at ??:? 0x40000f58: ?? ??:0 0x3ffffed8: ?? ??:0 0x00006274: ?? ??:0

flameboss commented 4 years ago

crash.c

/*
 * Handle crashes
 * Code adapted from Arduino ESP8266, esp-gdbstub and Espruino
 * https://gitlab.fai.utb.cz/jurenat/Espruino/commit/30aae54c80b6958d22447fc76f6d15d4c9da81c0
 */

#include <crash.h>
#include <bsp.h>
#include <xtensa/corebits.h>

struct XTensa_exception_frame_s {
    uint32_t pc;
    uint32_t ps;
    uint32_t sar;
    uint32_t vpri;
    uint32_t a0;
    uint32_t a[14]; //a2..a15
//These are added manually by the exception code; the HAL doesn't set these on an exception.
    uint32_t litbase;
    uint32_t sr176;
    uint32_t sr208;
    uint32_t a1;
    uint32_t reason;
    uint32_t excvaddr;
};

struct XTensa_exception_frame_s gdbstub_savedRegs;

void _xtos_set_exception_handler(int cause, void (exhandler)(struct XTensa_exception_frame_s *frame));

void gdbstub_save_extra_sfrs_for_exception();

uint8_t crash_in_exception;

static uint32_t crash_dump_len(uint32_t start, uint32_t end)
{
    uint32_t stack_len = end - start;
    uint32_t image_len =    sizeof(crash_header_st) +
                            sizeof(crash_sec_header_st) * 2 +
                            sizeof(gdbstub_savedRegs) +
                            stack_len;
    return image_len;
}

static void crash_dump_write(int fd, uint32_t in_faddr, uint32_t start, uint32_t end)
{
    uint32_t faddr = in_faddr;
    crash_header_st hdr;
    crash_sec_header_st sec_hdr;
    int32_t stack_len = end - start;

    hdr.version = CRASH_IMAGE_VERSION;
    hdr.num_sections = 2;
    hdr.hw_id = hw_id;
    hdr.pos = system_upgrade_userbin_check();
    hdr.fw_ver = VERSION32;
    ffs_program(fd, faddr, &hdr, sizeof(hdr));
    faddr += sizeof(hdr);

    sec_hdr.sec_type = CRASH_SEC_REGS;
    memset(sec_hdr.reserved, 0, sizeof(sec_hdr.reserved));
    sec_hdr.addr = 0;
    sec_hdr.len = sizeof(gdbstub_savedRegs);
    ffs_program(fd, faddr, &sec_hdr, sizeof(sec_hdr));
    faddr += sizeof(sec_hdr);

    ffs_program(fd, faddr, &gdbstub_savedRegs, sizeof(gdbstub_savedRegs));
    faddr += sizeof(gdbstub_savedRegs);

    sec_hdr.sec_type = CRASH_SEC_MEM;
    memset(sec_hdr.reserved, 0, sizeof(sec_hdr.reserved));
    sec_hdr.addr = start;
    sec_hdr.len = end - start;
    ffs_program(fd, faddr, &sec_hdr, sizeof(sec_hdr));
    faddr += sizeof(sec_hdr);

    ffs_program(fd, faddr, (void *)start, sec_hdr.len);
    faddr += sec_hdr.len;
    printf("core dumped slot %d, len %d\n", EepromCopyApp.Error_Cnt, faddr - in_faddr);
}

static void crash_dump(uint32_t start, uint32_t end)
{
    ffs_device_t fd;
    ffs_file_t file;
    ffs_file_id_et fids[] = {FLASH_DATA_LOG, FLASH_CORE_DUMP_0, FLASH_CORE_DUMP_1};

    fd = FLASH_SPI;
    ffs_remove_files(fd, fids, 3);

    file.id = FLASH_CORE_DUMP_0 + EepromCopyApp.Error_Cnt - 1;
    file.len = crash_dump_len(start, end);
    file.load_addr = 0;
    if (ffs_add_file(fd, &file)) {
        return;
    }

    ffs_erase(fd, file.faddr, file.len);

    crash_dump_write(fd, file.faddr, start, end);

    EepromCopyApp.Ram_Dumped = EepromCopyApp.Error_Cnt;
    EepromCopyApp.Ram_Dumped_fd = fd;
    config_save();
}

//Get the value of one of the A registers
static unsigned int getaregval(int reg) {
    if (reg==0) return gdbstub_savedRegs.a0;
    if (reg==1) return gdbstub_savedRegs.a1;
    return gdbstub_savedRegs.a[reg-2];
}

static void print_stack(uint32_t start, uint32_t end)
{
    uint32_t pos;
    printf("\nStack dump x%x x%x:\n", start, end);
    for (pos = start; pos < end; pos += 0x10) {
        uint32_t *values = (uint32_t*) (pos);
        // rough indicator: stack frames usually have SP saved as the second word
        bool looksLikeStackFrame = (values[2] == pos + 0x10);

        printf("%08lx:  %08lx %08lx %08lx %08lx %c\n", pos, values[0],
                values[1], values[2], values[3],
                (looksLikeStackFrame) ? '<' : ' ');
    }
    printf("\n");
}

static void print_reason()
{
    int i;
    struct XTensa_exception_frame_s *reg = &gdbstub_savedRegs;
    printf("Fatal exception %ld\n", reg->reason);
    printf("pc=0x%08lx sp=0x%08lx excvaddr=0x%08lx\n", reg->pc, reg->a1,
            reg->excvaddr);
    printf("ps=0x%08lx sar=0x%08lx vpri=0x%08lx\n", reg->ps, reg->sar,
            reg->vpri);
    for (i = 0; i < 16; i++) {
        unsigned int r = getaregval(i);
        printf("r%02d: 0x%08x=%10d ", i, r, r);
        if (i % 3 == 2)
            printf("\n");
    }
    printf("\n");
    print_stack(getaregval(1), 0x3fffffb0);
}

//Non-OS exception handler. Gets called by the Xtensa HAL.
static void exception_handler(struct XTensa_exception_frame_s *frame)
{
    gdbstub_save_extra_sfrs_for_exception();

    os_memcpy(&gdbstub_savedRegs, frame, 19 * 4);

    //Credits go to Cesanta for this trick. A1 seems to be destroyed, but because it
    //has a fixed offset from the address of the passed frame, we can recover it.
    //gdbstub_savedRegs.a1=(uint32_t)frame+EXCEPTION_GDB_SP_OFFSET;
    gdbstub_savedRegs.a1 = (uint32_t) frame;

    ets_wdt_disable();

    print_reason();

    if (crash_in_exception++ == 0 &&
            EepromCopyApp.Error_Cnt < CONFIG_ERROR_LOG_CNT &&
            !config_in_assertion_failure) {
        config_log_error("exception", __LINE__, gdbstub_savedRegs.pc, gdbstub_savedRegs.excvaddr);
        crash_dump(getaregval(1), 0x3fffffb0);
    }
    BSP_RESET();
}

static void ICACHE_FLASH_ATTR install_exceptions() {
    int i;
    int exno[]={EXCCAUSE_ILLEGAL, EXCCAUSE_SYSCALL, EXCCAUSE_INSTR_ERROR, EXCCAUSE_LOAD_STORE_ERROR,
            EXCCAUSE_DIVIDE_BY_ZERO, EXCCAUSE_UNALIGNED, EXCCAUSE_INSTR_DATA_ERROR, EXCCAUSE_LOAD_STORE_DATA_ERROR,
            EXCCAUSE_INSTR_ADDR_ERROR, EXCCAUSE_LOAD_STORE_ADDR_ERROR, EXCCAUSE_INSTR_PROHIBITED,
            EXCCAUSE_LOAD_PROHIBITED, EXCCAUSE_STORE_PROHIBITED};
    for (i=0; i<(sizeof(exno)/sizeof(exno[0])); i++) {
        _xtos_set_exception_handler(exno[i], exception_handler);
    }
}
void ICACHE_FLASH_ATTR crash_init()
{
    install_exceptions();
}

/** called after an assertion failure up to twice per firmware release */
void wifi_hw_dump_core()
{
    /* cause an exception */
    __asm__ __volatile__ ("syscall");
}
flameboss commented 4 years ago

[Edit] Sorry I don't know how to insert code without it getting messed up by the markdown... crash.h

#ifndef CRASH_H_
#define CRASH_H_

#include <stdint.h>

#define CRASH_IMAGE_VERSION   0

typedef enum __attribute__((__packed__)) {
    CRASH_SEC_REGS = 0,
    CRASH_SEC_MEM = 1
} crash_sec_type_et;

typedef struct {
    uint8_t version;
    uint8_t num_sections;
    uint8_t hw_id;
    uint8_t pos;
    uint32_t fw_ver;
} crash_header_st;

typedef struct {
    crash_sec_type_et sec_type;
    uint8_t reserved[3];
    uint32_t addr;
    uint32_t len;
} crash_sec_header_st;

void crash_init(void);

#endif /* CRASH_H_ */
kriegste commented 4 years ago

First mark the code, then hit the "code" button.

flameboss commented 4 years ago

This is in a closed source part of the SDK, but I need to solve this. I'm in the middle of porting our whole app to the RTOS SDK, but I'd like some feedback on this crash. Will RTOS SDK fix it? Is anyone available to provide support or consulting?

FayeY commented 3 years ago

Could you have a try with the latest master? It should have solved this issue. If this issue persists, please feel free to let us know. Thanks.

flameboss commented 3 years ago

Thank you! We will give it a try now.

flameboss commented 3 years ago

I'm sorry but that got worse for me. I do not know how to reproduce the crash - I only know that many of our customers are getting it - but now I got exceptions as soon as it started associating with an AP:

Fatal exception 3 pc=0x402078d7 sp=0x3ffffc10 excvaddr=0x4024f5f4 ps=0x00000030 sar=0x0000001e vpri=0x3fff07ac r00: 0x40207883=1075869827 r01: 0x3ffffc10=1073740816 r02: 0x00000002= 2 r03: 0x0000000b= 11 r04: 0x3fff4174=1073693044 r05: 0x4024f5f4=1076164084 r06: 0x3fff3ff4=1073692660 r07: 0x3ffffd10=1073741072 r08: 0x00000001= 1 r09: 0x3fff1ac0=1073683136 r10: 0x00000016= 22 r11: 0x01205355= 18895701 r12: 0x00000002= 2 r13: 0x3fff4860=1073694816 r14: 0x00000000= 0 r15: 0x3fff4860=1073694816

0x402078d7: ieee80211_setup_ratetable at ??:? 0x40207883: ieee80211_phy_init at ??:? 0x4024f5f4: chip_v6_unset_chanfreq at ??:? 0x4000dd3b: ?? ??:0 0x40002ef1: ?? ??:0 0x00000005: ?? ??:0 0x3ffefef5: ?? ??:0 0x40204d1c: ieee80211_setup_rates at ??:? 0x4020b4f4: cnx_update_bss_more at ??:? 0x4020b511: cnx_update_bss_more at ??:? 0x40208f87: scan_parse_beacon at ??:? 0x40204e09: ieee80211_parse_beacon at ??:? 0x40000f58: ?? ??:0 0x40100a81: wdt_feed at ??:? 0x40238f9b: ppPeocessRxPktHdr at ??:? 0x40000e19: ?? ??:0 0x40100283: call_user_start_local at ??:? 0x4010000d: call_user_start at ??:?

kriegste commented 3 years ago

https://github.com/espressif/ESP8266_NONOS_SDK/pull/345

FayeY commented 2 years ago

May I ask if this issue still persists with the latest SDK?

kriegste commented 2 years ago

According to https://github.com/espressif/ESP8266_NONOS_SDK/pull/345 the cause is still present.