PikoRT / pikoRT

A tiny Linux-like real-time kernel optimized for ARM Cortex-M chips
Other
304 stars 61 forks source link

Thumb2 optimized string routines for performance and/or size #58

Open jserv opened 6 years ago

jserv commented 6 years ago

Piko/RT depends on several string routines, such as memcpy and memset. These could be rewritten in Thumb2 assembly to improve performance and/or reduce code size.

Sample implementation for memcpy:

/*
 * Copy l bytes from src to dst using hand-written Thumb2 assembly.
 *
 * Unlike the standard C memcpy(), this version returns nothing.
 *
 * Register usage inside the asm template (all four are listed as clobbers,
 * so the compiler never places the input operands in them):
 *   r1 - number of bytes still to copy (initialised from l)
 *   r2 - scratch: alignment test result, then the data being moved
 *   r3 - current source address (initialised from src)
 *   r4 - current destination address (initialised from dst)
 *
 * Flow:
 *   - src and dst are OR-ed together and masked with 3. If any of the low
 *     two bits is set in either pointer, the word loop is skipped and the
 *     whole copy is done one byte at a time (label 2).
 *   - Label 1 (word loop): while at least 4 bytes remain, load a word,
 *     store it, and subtract 4 from the count. Both accesses use
 *     post-indexed addressing, so r3/r4 advance by 4 each iteration. The
 *     flags left by the conditional subs (or by the cmp when the IT block
 *     is skipped) decide whether "bhs 1b" loops again.
 *   - Label 2 (byte loop): copies the remaining bytes one at a time until
 *     the count reaches zero; a zero count falls straight through.
 *
 * The template is a single string literal: every line ends in a
 * backslash-newline splice, so the assembler sees one line whose statements
 * are separated by ';'. Do not insert comments inside the string.
 */
static inline void memcpy(void *restrict dst, const void *restrict src, size_t l)
{
    __asm__ volatile(" \
        mov r1, %2; \
        mov r3, %1; \
        mov r4, %0; \
        orr r2, r3, r4; \
        ands r2, #3; \
        bne 2f; \
1: \
    cmp r1, #4; \
    ittt hs; \
    ldrhs r2, [r3], #4; \
    strhs r2, [r4], #4; \
    subshs r1, #4; \
    bhs 1b; \
2: \
    cmp r1, #0; \
    ittt ne; \
    ldrbne r2, [r3], #1; \
    strbne r2, [r4], #1; \
    subsne r1, #1; \
    bne 2b"
    : /* no output operands; the effect is visible only through memory */
    : "r" (dst), "r" (src), "r" (l) /* %0 = dst, %1 = src, %2 = l */
    : "r1", "r2", "r3", "r4", "memory", "cc"); /* scratch regs, written memory, condition flags */
}

Sample implementation for memset:

/*
 * Fill l bytes starting at dst with the low 8 bits of v, using hand-written
 * Thumb2 assembly.
 *
 * Unlike the standard C memset(), this version returns nothing.
 *
 * Register usage inside the asm template (all four are listed as clobbers,
 * so the compiler never places the input operands in them):
 *   r1 - number of bytes still to write (initialised from l)
 *   r2 - scratch: receives dst & 3; only the resulting flags are used
 *   r3 - fill pattern: starts as (v & 0xff), then the two orr-with-shift
 *        instructions replicate that byte into all four byte lanes
 *   r4 - current destination address (initialised from dst)
 *
 * Flow:
 *   - If either of the low two bits of dst is set, the word loop is skipped
 *     and the whole fill is done one byte at a time (label 2).
 *   - Label 1 (word loop): while at least 4 bytes remain, store the 32-bit
 *     pattern with post-indexed addressing (r4 advances by 4) and subtract
 *     4 from the count. The flags left by the conditional subs (or by the
 *     cmp when the IT block is skipped) decide whether "bhs 1b" loops again.
 *   - Label 2 (byte loop): stores the low byte of r3 until the count
 *     reaches zero; a zero count falls straight through.
 *
 * The template is a single string literal: every line ends in a
 * backslash-newline splice, so the assembler sees one line whose statements
 * are separated by ';'. Do not insert comments inside the string.
 */
static inline void memset(void *dst, int v, size_t l)
{
    __asm__ volatile(" \
        mov r1, %2; \
        mov r3, %1; \
        orr r3, r3, r3, lsl #8; \
        orr r3, r3, r3, lsl #16; \
        mov r4, %0; \
        ands r2, r4, #3; \
        bne 2f; \
1: \
        cmp r1, #4; \
        itt hs; \
        strhs r3, [r4], #4; \
        subshs r1, #4; \
        bhs 1b; \
2: \
        cmp r1, #0; \
        itt ne; \
        strbne r3, [r4], #1; \
        subsne r1, #1; \
        bne 2b"
    : /* no output operands; the effect is visible only through memory */
    : "r" (dst), "r" (v & 0xff), "r" (l) /* %0 = dst, %1 = fill byte, %2 = l */
    : "r1", "r2", "r3", "r4", "memory", "cc"); /* scratch regs, written memory, condition flags */
}