sys/pgmspace.h: Optimize a bit

jjsuwa-sys3175 commented 3 years ago

prefix two underscores to private symbol names in order to avoid collision
make addr of pgm_read_with_offset() to be preserved in order to improve register utilization
shrink part of inline asm in order to improve insn scheduling

// fact case #1. `pgm_read_byte_inlined(p)` as un-inlined form:

[before]
   0:   143020      extui   a3, a2, 0, 2
   3:   c02230      sub a2, a2, a3
   6:   0228        l32i.n  a2, a2, 0
   8:   402300      ssa8l   a3
   b:   813220      src a3, a2, a2
   e:   742030      extui   a2, a3, 0, 8
  11:   f00d        ret.n

[after]
   0:   c47c        movi.n  a4, -4
   2:   104240      and a4, a2, a4
   5:   0438        l32i.n  a3, a4, 0
   7:   402200      ssa8l   a2
   a:   912030      srl a2, a3
   d:   742020      extui   a2, a2, 0, 8
  10:   f00d        ret.n

// fact case #2. `strnlen_P()`:

[before]
   0:   323a        add.n   a3, a2, a3
   2:   024d        mov.n   a4, a2
;loop head
   4:   191347      beq a3, a4, 21
   7:   046d        mov.n   a6, a4
   9:   145060      extui   a5, a6, 0, 2
   c:   c06650      sub a6, a6, a5
   f:   0668        l32i.n  a6, a6, 0
  11:   402500      ssa8l   a5
  14:   815660      src a5, a6, a6
  17:   745050      extui   a5, a5, 0, 8
  1a:   358c        beqz.n  a5, 21
  1c:   441b        addi.n  a4, a4, 1
  1e:   fff886      j   4
;loop tail
  21:   c02420      sub a2, a4, a2
  24:   f00d        ret.n

[after]
   0:   323a        add.n   a3, a2, a3
   2:   024d        mov.n   a4, a2
   4:   c67c        movi.n  a6, -4
;loop head
   6:   161347      beq a3, a4, 20
   9:   105460      and a5, a4, a6
   c:   0558        l32i.n  a5, a5, 0
   e:   402400      ssa8l   a4
  11:   915050      srl a5, a5
  14:   745050      extui   a5, a5, 0, 8
  17:   558c        beqz.n  a5, 20
  19:   441b        addi.n  a4, a4, 1
  1b:   fff9c6      j   6
;loop tail
  1e:   00      .byte 00
  1f:   00      .byte 00
  20:   c02420      sub a2, a4, a2
  23:   f00d        ret.n

earlephilhower commented 3 years ago

Nice, one byte shorter than the current macro. Not sure that it will change runtime due to load/use stalls, but that's easily tested.

I'll need to run some testing on this first, as well as finish the newlib 4.0 merge, so it may take a week or so to merge. Thx!

jjsuwa-sys3175 commented 3 years ago

quite artificial example:

int compare(PGM_P a, PGM_P b) {
  return pgm_read_byte_inlined(a) == pgm_read_byte_inlined(b);
}

the results:

[before]
00000000 <_Z7comparePKcS0_>:
   0:   144020                  extui   a4, a2, 0, 2
   3:   c02240                  sub a2, a2, a4
   6:   0228                    l32i.n  a2, a2, 0
   8:   402400                  ssa8l   a4
   b:   814220                  src a4, a2, a2
   e:   142030                  extui   a2, a3, 0, 2
  11:   c03320                  sub a3, a3, a2
  14:   0338                    l32i.n  a3, a3, 0
  16:   402200                  ssa8l   a2
  19:   812330                  src a2, a3, a3
  1c:   743040                  extui   a3, a4, 0, 8
  1f:   742020                  extui   a2, a2, 0, 8
  22:   c03230                  sub a3, a2, a3
  25:   040c                    movi.n  a4, 0
  27:   120c                    movi.n  a2, 1
  29:   932430                  movnez  a2, a4, a3
  2c:   f00d                    ret.n

[after]
00000000 <_Z7comparePKcS0_>:
   0:   c67c                    movi.n  a6, -4
   2:   104260                  and a4, a2, a6
   5:   106360                  and a6, a3, a6
   8:   0458                    l32i.n  a5, a4, 0
   a:   0648                    l32i.n  a4, a6, 0
   c:   402200                  ssa8l   a2
   f:   915050                  srl a5, a5
  12:   402300                  ssa8l   a3
  15:   913040                  srl a3, a4
  18:   745050                  extui   a5, a5, 0, 8
  1b:   743030                  extui   a3, a3, 0, 8
  1e:   c03350                  sub a3, a3, a5
  21:   040c                    movi.n  a4, 0
  23:   120c                    movi.n  a2, 1
  25:   932430                  movnez  a2, a4, a3
  28:   f00d                    ret.n

earlephilhower commented 3 years ago

I did a quick test and got some very nice numbers with your patch:

const char pmem[60000] PROGMEM = {'H',0,'e',0,0,'l',0,0,0,'l',0,0,0,0,'o'};

void sum(const void *start, int cnt, int delta) {
  uint32_t s = 0;
  uint8_t *ptr = (uint8_t*)start;
  for (int i=0; i<cnt; i++) {
    s += pgm_read_byte(ptr) + (cnt & 255);
    ptr += delta;
  }
  Serial.printf("sum: %u\n", s);
}

void setup() {
  Serial.begin(115200);
  delay(1000);
  Serial.printf("\n\nStarting 60,000 byte sum...\n");
  uint32_t start = ESP.getCycleCount();
  sum(pmem, 60000, 1);
  sum(pmem, 30000, 2);
  sum(pmem+1, 30000, 2);
  uint32_t end = ESP.getCycleCount();
  Serial.printf("\n\nDone...\n%u cycles\n", end - start);
}

void loop() {
}

Head:

Starting 60,000 byte sum...
sum: 5760500
sum: 1440284
sum: 1440216
Done...
6245985 cycles

This PR:

Starting 60,000 byte sum...
sum: 5760500
sum: 1440284
sum: 1440216
Done...
5885489 cycles

About 6% improvement in runtime on a very heavy pgm_read_byte workload.

However, the IROM usage actually has gone up microscopically to achieve that:

Head:

ICACHE : 32768           - flash instruction cache 
IROM   : 294392          - code in flash         (default or ICACHE_FLASH_ATTR) 
IRAM   : 26560   / 32768 - code in IRAM          (ICACHE_RAM_ATTR, ISRs...) 
DATA   : 1252  )         - initialized variables (global, static) in RAM/HEAP 
RODATA : 860   ) / 81920 - constants             (global, static) in RAM/HEAP 
BSS    : 25560 )         - zeroed variables      (global, static) in RAM/HEAP

This PR:

ICACHE : 32768           - flash instruction cache 
IROM   : 294408          - code in flash         (default or ICACHE_FLASH_ATTR) 
IRAM   : 26560   / 32768 - code in IRAM          (ICACHE_RAM_ATTR, ISRs...) 
DATA   : 1252  )         - initialized variables (global, static) in RAM/HEAP 
RODATA : 872   ) / 81920 - constants             (global, static) in RAM/HEAP 
BSS    : 25568 )         - zeroed variables      (global, static) in RAM/HEAP

Overall, seems like a good win for heavy-use cases and minimal impact on code size. Thx!

earlephilhower / newlib-xtensa

sys/pgmspace.h: Optimize a bit #16