Perl / perl5

🐪 The Perl programming language
https://dev.perl.org/perl5/
Other
1.95k stars 554 forks source link

Fix huge inline bloat msvc sv inline h #22667

Open bulk88 opened 1 week ago

bulk88 commented 1 week ago

success!!!, new rule, "struct body_details bodies_by_type[]" only in sv.c

-svfix.pl is quick throwaway garbage done in 20 mins and probably doesnt regen sv_inline.h properly and copy pasting/replace regexps were was used anyway to fix up the code, it can be rewritten correctly tho and put in as a official regen.pl script

how to finish this fix, options

set and unset sv_type as a CPP macro then

-#include a header 17 times that contains ONLY only Perl_newSV_type() and its mortal() sister creating 17*2 static inline fns (basically what I did here), code is stepable, extra ms'es of I/O build times perf degrade debate may or may not come up, I hope the CC has a sane in memory cache for .h files and doesn't go back to the kernel

or put the entire Perl_newSV_type() fnc in a #define

#define PETS blah(foo(myarg)) +  \
  cat(dog(fur)) + \
  laser(ball(toy))

then execute that macro 17 times

or write 17 Perl_newSV_type() copies into sv_inline.h with /regen.pl infrastructure (fastest for build speed core and build speed CPAN and code is c dbg stepable) OR is a dedicated "sv_newg.h" for regen.pl needed? does the master Perl_newSV_type() template live in a .pl or a .h? i dont have an opinion

or against concept of sv_inline.h just have 5-10 hand written versions sv type specific of Perl_newSV_type(), its a cheap gimick fix to keep all 17 types together mashed with if/else/switch in 1 func and expecting bug free perfection from LTO engines of various C compilers, and expecting perfection from an single vendor LTO engine is very against the spirit of portable code

-todo ideas, turn those super long #define ==?:==?:==?: into char array/struct initializers, stored in macros, one faux-string per each column of struct body_details, use that macro as c auto stk rw array initializer, then do the

U32 len = "\x01\x02\x03"[sv_type];

or

U8 sizes [3] = {1,2,3};
U32 len = sizes[sv_type];

which in perl core would look

U32 arena_size = SVBD_AR_SZ_DECL_INIT;
U32 len = arena_size[sv_type];

maybe VC will optimize those since no global memory is used. Only Perl_newSV_typeX() needs this.

in this commit static inline Perl_newSVtypeX(pTHX const svtype type) which is the ONLY Perl_newSV_type*() variant that take an arbitrary svtype arg, this is the fallback for gv_pvn_add_by() since I couldn't "const" that call to newSV_type() cuz gv_pvn_add_by() is only place in the whole core that takes a random SV type number.

Internals of Perl_newSV_typeX() are trashy, here is an example, MSVC DID not turn this into a jump table but instead 17 test/cond_jump ops.

      v5 = (char *)S_new_body(v2);
      v6 = 40i64;
      if ( v2 == 15 )
        v6 = 136i64;
      if ( v2 == 14 )
        v6 = 104i64;
      if ( v2 == 13 )
        v6 = 104i64;
      if ( v2 == 12 )
        v6 = 32i64;
      if ( v2 == 11 )
        v6 = 40i64;
      if ( v2 == 10 )
        v6 = 80i64;
      if ( v2 == 9 )
        v6 = 48i64;
      if ( v2 == 8 )
        v6 = 224i64;
      if ( v2 == 7 )
        v6 = 48i64;
      if ( v2 == 6 )
        v6 = 32i64;
      if ( v2 == 5 )
        v6 = 24i64;
      if ( v2 == 4 )
        v6 = 40i64;
      if ( v2 == 3 )
        v6 = 16i64;
      if ( v2 == 2 )
        v6 = 0i64;
      if ( v2 == 1 )
        v6 = 0i64;
      memset(v5, 0, v6 & -(signed __int64)(v2 != 0));

Solution is move Perl_newSV_typeX() to sv.c, and let it be struct body_details driven. Cuz it only purpose is when newSV_type() absolutly CAN NOT be constant folded (random number input). it only has 1 caller in core.

S_new_body() properly const folded away in 99% of cases except for TWO callers Perl_newSV_typeX() and Perl_make_trie(). Perl_make_trie() failure to inline is bizzare, since Perl_make_trie() internally does

v9 = S_new_body(SVt_PVAV);

and DID inline Perl_newSV_typeSVt_PVAV() !!! and therefore Perl_make_trie() has the AV field initing/nulling code.

Here is the "optimized" contents of S_new_body(), its junk performance/design wise (but runtime correct/no bugs)

void **__fastcall S_new_body(svtype sv_type)
{
  svtype v1; // er9
  __int64 v2; // rbx
  void **result; // rax
  signed int v4; // ecx
  signed __int64 v5; // rax

  v1 = sv_type;
  v2 = sv_type;
  result = (void **)PL_body_roots[sv_type];
  if ( !result )
  {
    v4 = 4080;
    if ( v1 == 15 )
      v4 = 3264;
    if ( v1 == 14 )
      v4 = 2080;
    if ( v1 == 13 )
      v4 = 4056;
    if ( v1 == 12 )
      v4 = 4064;
    if ( v1 == 11 )
      v4 = 4080;
    if ( v1 == 10 )
      v4 = 4080;
    if ( v1 == 9 )
      v4 = 4080;
    if ( v1 == 8 )
      v4 = 4032;
    if ( v1 == 7 )
      v4 = 4080;
    if ( v1 == 6 )
      v4 = 3296;
    if ( v1 == 5 )
      v4 = 3424;
    if ( v1 == 4 )
      v4 = 3120;
    if ( v1 == 3 )
      v4 = 3536;
    if ( v1 == 2 )
      v4 = 0;
    if ( v1 == 1 )
      v4 = 0;
    v5 = 40i64;
    if ( v1 == 15 )
      v5 = 136i64;
    if ( v1 == 14 )
      v5 = 104i64;
    if ( v1 == 13 )
      v5 = 104i64;
    if ( v1 == 12 )
      v5 = 32i64;
    if ( v1 == 11 )
      v5 = 40i64;
    if ( v1 == 10 )
      v5 = 80i64;
    if ( v1 == 9 )
      v5 = 48i64;
    if ( v1 == 8 )
      v5 = 224i64;
    if ( v1 == 7 )
      v5 = 48i64;
    if ( v1 == 6 )
      v5 = 32i64;
    if ( v1 == 5 )
      v5 = 24i64;
    if ( v1 == 4 )
      v5 = 40i64;
    if ( v1 == 3 )
      v5 = 16i64;
    if ( v1 == 2 )
      v5 = 0i64;
    if ( v1 == 1 )
      v5 = 0i64;
    result = (void **)Perl_more_bodies(v1, v5 & -(signed __int64)(v1 != 0), v4 & (unsigned int)-(v1 != 0));
  }
  PL_body_roots[v2] = *result;
  return result;
}

disassembly view of S_new_body()

cmp     r9d, 0Fh
lea     edi, [rbp+28h]
mov     r8d, 0FF0h
lea     r11d, [rbp+20h]
mov     edx, 0CC0h
lea     r10d, [rbp+30h]
mov     ecx, r8d
mov     eax, r9d
cmovz   ecx, edx
cmp     r9d, 0Eh
mov     edx, 820h
cmovz   ecx, edx
cmp     r9d, 0Dh
lea     edx, [r8-18h]
cmovz   ecx, edx
cmp     r9d, 0Ch
lea     edx, [r8-10h]
cmovz   ecx, edx
cmp     r9d, 0Bh
lea     edx, [r8-30h]
cmovz   ecx, r8d
cmp     r9d, 0Ah
cmovz   ecx, r8d
cmp     r9d, 9
cmovz   ecx, r8d
cmp     r9d, 8
cmovz   ecx, edx
cmp     r9d, 7
mov     edx, 0CE0h
cmovz   ecx, r8d
cmp     r9d, 6
cmovz   ecx, edx
cmp     r9d, 5
mov     edx, 0D60h
cmovz   ecx, edx
cmp     r9d, 4
mov     edx, 0C30h
cmovz   ecx, edx
cmp     r9d, 3
mov     edx, 0DD0h
cmovz   ecx, edx
cmp     r9d, 2
lea     edx, [rdi+60h]
cmovz   ecx, ebp
cmp     r9d, 1
cmovz   ecx, ebp
neg     eax
sbb     eax, eax
and     eax, ecx
mov     ecx, r9d
mov     r8d, eax
cmp     r9d, 0Fh
mov     eax, edi
cmovz   eax, edx
cmp     r9d, 0Eh
lea     edx, [rbp+68h]
cmovz   eax, edx
cmp     r9d, 0Dh
cmovz   eax, edx
cmp     r9d, 0Ch
lea     edx, [rbp+50h]
cmovz   eax, r11d
cmp     r9d, 0Bh
cmovz   eax, edi
cmp     r9d, 0Ah
cmovz   eax, edx
cmp     r9d, 9
mov     edx, 0E0h
cmovz   eax, r10d
cmp     r9d, 8
cmovz   eax, edx
cmp     r9d, 7
lea     edx, [rbp+18h]
cmovz   eax, r10d
cmp     r9d, 6
cmovz   eax, r11d
cmp     r9d, 5
cmovz   eax, edx
cmp     r9d, 4
lea     edx, [rbp+10h]
cmovz   eax, edi
cmp     r9d, 3
cmovz   eax, edx
cmp     r9d, 2
cmovz   eax, ebp
cmp     r9d, 1
cmovz   eax, ebp
neg     ecx
mov     ecx, r9d
sbb     rdx, rdx
and     rdx, rax
call    Perl_more_bodies

17 test ops and 17 conditional_move_constant_8_bits ops

solution, turn S_new_body() back into a macro so no CC ever tries to ref-inline it. It was a macro before sv_inline.h branch was merged

TODO add XSApitest.xs that worlds longest macros are identical to the master correct copy (struct body_details).

byte size drops from before these 3 commits to this "success commit"

mp.exe 0x1241AC-0x1224EC=7360 0x19D3D8-0x19B8E8=6896

p541.dll 0x154886-0x1532A6=5600 0x1AA19E-0x1A862E=7024

BEFORE

Dump of file ..\miniperl.exe SECTION HEADER #1 .text name 1241AC virtual size SECTION HEADER #2 .rdata name 19D3D8 virtual size

Dump of file ..\perl541.dll SECTION HEADER #1 .text name 154886 virtual size SECTION HEADER #2 .rdata name 1AA19E virtual size

AFTER

Dump of file ..\perl541.dll SECTION HEADER #1 .text name 1532A6 virtual size SECTION HEADER #2 .rdata name 1A862E virtual size

Dump of file ..\miniperl.exe SECTION HEADER #1 .text name 1224EC virtual size SECTION HEADER #2 .rdata name 19B8E8 virtual size

richardleach commented 4 days ago

If the consensus is that this is the way to go, then I'm happy to go with the flow. However, it does seem like a lot of added complexity basically for the benefit of MSVC, which (as I understand it from comments elsewhere) isn't used to compile any of the major perl distributions.

Additionally, there doesn't seem to have been any discussion of whether MSVC produces better code with different OPTIMISE flags. The current flags have been in place for a touch over a decade, I think in a commit by OP:

# Enable Whole Program Optimizations (WPO) and Link Time Code Generation (LTCG).
# -O1 yields smaller code, which turns out to be faster than -O2 on x86 and x64
OPTIMIZE    = -O1 -Zi -GL

I wonder if -O1 being better is still true on today's MSVC and average hardware, compared with that of the time.

@bulk88 - do you have any data points on the current vs other sets of OPTIMIZE flags at all?

richardleach commented 4 days ago

I wonder if -O1 being better is still true on today's MSVC and average hardware, compared with that of the time.

Never mind, I've just seen https://github.com/Perl/perl5/pull/22664

jkeenan commented 4 days ago

This pull request is failing during make on our most basic "sanity-check" test rig in our GH CI. It appears the submitter did not run make to completion when updating this ticket.

bulk88 commented 3 days ago

This pull request is failing during make on our most basic "sanity-check" test rig in our GH CI. It appears the submitter did not run make to completion when updating this ticket.

This commit is a rough concept of the final fix. Only code style/formatting/P5P identifier naming/what is more or less maintainable, and what is master source of truth, questions are left.

Perl C style question, 12 ways to do it, main question is what software or human (me 1x ever) generates 17 copies?

Where is the master copy of newSV_type()? or split it by hand once and forever?

what disk file do 17 copies live in?

does struct body_details stay in the codebase, or refactored away forever, or stays only for -DDEBUGGING only builds?

#undef DBSVTYPE
#define DBSVTYPE SVt_IV
#include "newSV_type.h"
#undef DBSVTYPE

#define DBSVTYPE SVt_PV
#include "newSV_type.h"
#undef DBSVTYPE

17 times for 17 SV types? setting macros different each time

regen.pl a huge by C text, chunk of C code, with 17 copies of newSV_type() from a master version in a .h, auto-inject code block into current sv_inline.h with a dont touch this warning and t/porting/regen.t test to make sure it stays up to date?

regen.pl a huge by C text, chunk of C code, with 17 copies of newSV_type() from a master version in a new committed .h file called svnew.h which is only codegen? extra I/O forever, more perl core disk files

keep master version of C of newSV_type() in the actual regen.pl in a heredoc or in a .h? or in sv.c?

move newSV_type() back to sv.c, from there regen.pl finds it and uses it as master copy?

don't regen.pl anything, just break up struct, body_details_by_type, into ONE large no-() macro, where each column in that struct, becomes a CPP macro with BODY_FIELD_NAME() then later on, all the BODY_FIELD_NAME(), get #define then write ONE no-() macro/interpolate the

`#undef BODY_FIELD_NAME_1
#define BODY_FIELD_NAME_1(_val)  sizeof(_val)
`#undef BODY_FIELD_NAME_2
#define BODY_FIELD_NAME_2(_val)  /*drop out*/
BODY_DETAILS
`#undef BODY_FIELD_NAME_1
`#undef BODY_FIELD_NAME_2

CPP interpolation will drop columns, prefix them, wrap them, whatever is necessary, each time the root matrix/grid macro is invoked

Here actual example from day job, split what used to be an array of structs, into individual arrays, all with CPP macros and 1 source of truth

#define BIZFORMS \
BIZFORMS2(hf, _T("Red Sheet"), _T("Red Office.pdf"), "B352DEBC543A814F91A519D5AD3C0F78",  _T("rd"),0) \
BIZFORMS2(ar, _T("Open As Orange"), _T("Orange.pdf"), "48A607D8B5834E4FB8CF9099049E45BC", _T("og"),0) \
BIZFORMS2(br, _T("Open As Maroon"), _T("Maroon.pdf"), "12af62a2cdaa4357b6c9b142e7b19ee2", _T("mo"),0) \
BIZFORMS2(hd, _T("Open As Dog 1"), _T("Dog 1.pdf"), "E0355C21BB64E548A7D261AC480C6DE8", _T(""d1),0) \
BIZFORMS2(hi, _T("Open As Dog 2"), _T("Dog 2.pdf"), "4F31CF9CFE2729418B89A157556EEFB9", _T("d2"),0) \
BIZFORMS2(lb, _T("Open As Customer"), _T("Customer.pdf"), "B3E72C019D1C824C8E7A0A7A43B257F0", _T("cb"),1) \
BIZFORMS2(ld, _T("Open As Customer Ext"), _T("Customer Ext.pdf"), "B3E72C818D1C824C8E7A0A7A43B257F0", _T("ce"),1)\
BIZFORMS2(qu, _T("Open As Green"), _T("Green.pdf"), "C332F078EF14236F0B82CE82E00E5ECD", _T("gr"),0) \
BIZFORMS2(gm, _T("Open As Bankrupt"), _T("Bankrupt.pdf"), "17C8E36BC11C924C9F8E66762C6307F3", _T("bk"),0) \
BIZFORMS2(sr, _T("Open As AUS"), _T("AUS.pdf"), "254A8561F774440D8BAAD34779047AA7", _T("au"),0)

typedef struct {
#define BIZFORMS2(_m, _l,_f,_g,_i,_p) char _m [ARRAYSIZE(_f)-4-1];
   BIZFORMS
#undef BIZFORMS2
} formPaths_t;

typedef union {
  struct {
#define BIZFORMS2(_m, _l,_f,_g,_i,_p) formPathsLen_t _m;
   BIZFORMS
#undef BIZFORMS2
  } sct ;
  formPathsLen_t arr [
#define BIZFORMS2(_m, _l,_f,_g,_i,_p) + 1
0   BIZFORMS
#undef BIZFORMS2
  ];
} formPathsLens_t;

typedef union {
  char arr [
#define BIZFORMS2(_m, _l,_f,_g,_i,_p) + 1
0   BIZFORMS
#undef BIZFORMS2
  ][32];
  struct {
#define BIZFORMS2(_m, _l,_f,_g,_i,_p) char _m [32];
   BIZFORMS
#undef BIZFORMS2
  } sct;
} formGUIDs_t;

LBMII gmii = {
  {
      {
#define BIZFORMS2(_m, _l,_f,_g,_i,_p) MIIFORM(_m,_l,_p),
      BIZFORMS
#undef BIZFORMS2
      }
  },

or manually break up bodies_by_details into CPP macros? maybe leaving the struct in sv.c for -DDEBUGGING builds as a assert sanity test (yes!!!! i do want 1 copy of body_by_details, the grid/matrix look, for maintainability and comprehension but the above code also has a matrix data layout, except that code syntax high lighting of the root macro. the matrix or array of structs stays, and is more readable, since each cell has a CPP macro around it with the field name like above

-abandon the static inline concept, just keep 17 fast ONE SV TYPE copies, manually split and maintained with alot of asserts (really 12 copies after de-duping) in sv.c as extern exported symbols???

-Keep the static inline concept and .h concept, have 17 fast ONE SV TYPE copies, manually split ONCE and forever, and maintained with alot of asserts (probably against root bodies_by_type but only -DDEBUGGING) and no regen.pl stuff involved?

-add struct bodies_bydetails as RO data extern symbol, ```PL*type name, its been proven that the const static decl, all CC's/OS'es ignoreconst staticfor optimization LTO purposes b/c ofmprotect``` can change the data at any time???? I think yes at this point

Alot of style questions are left.

The technical fix is a #define FOO(_arg) arg==?val:arg==?val:arg==?: sequence which will always constant fold, on all CC ever released, since K&R.