earlephilhower / newlib-xtensa

newlib-xtensa fork intended for esp8266
GNU General Public License v2.0
5 stars 7 forks source link

Q: slightly more compact memmove_P #23

Open mcspr opened 2 years ago

mcspr commented 2 years ago

Looking at the current memmove_P implementation https://github.com/earlephilhower/newlib-xtensa/blob/ebc967552ce827f21fc579fd8c437037c1b472ab/newlib/libc/sys/xtensa/string_pgmspace.c#L184-L190

Since it compares against a constant that has only one bit set (0x40000000), I wondered whether testing just that bit changes the generated code; to do that, we would simply ignore the possibility of it ever being used with any 'higher' addresses (0x80000000 and above). I am not sure how to benchmark it, though, so I don't know whether this does anything useful at all, besides making the function 5 bytes smaller :)

// > cat memmove.c
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>
#include <sys/pgmspace.h>

// Reports whether bit 30 (value 0x40000000) of the address held in `ptr`
// is set.
//
// For every address below 0x80000000 this gives the same answer as the
// conventional range check `ptr >= 0x40000000`. Addresses at or above
// 0x80000000 are deliberately not considered: those with bit 30 clear are
// reported as false. Testing a single bit avoids having to materialise
// the 0x40000000 constant for a comparison.
inline static bool inFlash(const void* ptr) {
    const uintptr_t addr = (uintptr_t)ptr;

    // Move bit 30 down to bit 0 and test it in isolation.
    return ((addr >> 30) & 1u) != 0;
}

// Moves `n` bytes from `src` to `dest`.
//
// When inFlash() reports the source as flash and the destination as not
// flash, the copy is delegated to memcpy_P(); every other combination is
// handed to the regular memmove(). The return value is whatever the
// selected routine returns.
void* memmove_P2 (void* dest, const void* src, size_t n) {
    const bool flashToRam = inFlash(src) && !inFlash(dest);

    if (!flashToRam) {
        return memmove(dest, src, n);
    }
    return memcpy_P(dest, src, n);
}

// Moves `n` bytes from `src` to `dest`, choosing the copy routine with a
// range comparison against address 0x40000000.
//
// memcpy_P() is used only when `src` is at or above 0x40000000 and `dest`
// is below it; otherwise the regular memmove() is used. The return value
// is whatever the selected routine returns.
void* memmove_P1 (void* dest, const void* src, size_t n)
{
    const char *const boundary = (const char *)0x40000000;
    const char *const from = (const char *)src;
    const char *const to = (const char *)dest;

    // Negation of "src >= boundary && dest < boundary".
    if (from < boundary || to >= boundary) {
        return memmove(dest, src, n);
    }
    return memcpy_P(dest, src, n);
}
> xtensa-lx106-elf-gcc -c -Os memmove.c
> xtensa-lx106-elf-nm --radix=d -S memmove.o | grep memmove
         U memmove
00000020 00000023 T memmove_P1
00000000 00000018 T memmove_P2
> xtensa-lx106-elf-gcc -S -Os memmove.c
    .file   "memmove.c"
    .text
    .literal_position
    .align  4
    .global memmove_P2
    .type   memmove_P2, @function
    ; Bit-test variant: two branch-on-bit instructions, no constant needed.
    ; NOTE(review): a2/a3 appear to carry dest/src respectively (the test
    ; order matches `inFlash(src) && !inFlash(dest)`) - confirm against the ABI.
memmove_P2:
    bbci    a3, 30, .L2    ; bit 30 of a3 clear (src not in flash) -> memmove
    bbsi    a2, 30, .L2    ; bit 30 of a2 set (dest in flash) -> memmove
    j.l memcpy_P, a9       ; otherwise jump straight to memcpy_P
.L2:
    j.l memmove, a9        ; fallback: jump straight to memmove
    .size   memmove_P2, .-memmove_P2
    .literal_position
    .align  4
    .global memmove_P1
    .type   memmove_P1, @function
    ; Range-compare variant: builds 0x3FFFFFFF in a5, then does two
    ; unsigned compares against it.
    ; NOTE(review): a2/a3 appear to carry dest/src respectively (matches the
    ; order of the C condition) - confirm against the ABI.
memmove_P1:
    movi.n  a5, -1        ; btw this only happens on Os, O2 and O3 use l32r const of 0x40000000
    srli    a5, a5, 2     ; a5 = 0xFFFFFFFF >> 2 = 0x3FFFFFFF
    bgeu    a5, a3, .L7   ; 0x3FFFFFFF >= a3 (src < 0x40000000) -> memmove
    bltu    a5, a2, .L7   ; 0x3FFFFFFF < a2 (dest >= 0x40000000) -> memmove
    j.l memcpy_P, a9      ; otherwise jump straight to memcpy_P
.L7:
    j.l memmove, a9       ; fallback: jump straight to memmove
    .size   memmove_P1, .-memmove_P1
    .ident  "GCC: (GNU) 10.3.0"