Closed tersec closed 1 year ago
0e5bf5953e2ab6c846ba4095adca16734ae240f8 is the first bad commit https://github.com/nim-lang/Nim/pull/19712
I had noticed that applying the reverse diff seemed to fix it in currentl devel
, but hadn't looked deeper.
reverse_culpa_culpa_culpa.txt
In theory an overly aggressive optimizer can break Nim's stack root tracing that the old GCs use.
Tried to add GC_disable
at the beginning of the file, and issue still happens, so doesn't look like eager collection
gdb backtrace:
insert__test_13 (x=0x555555565268 <g.test_1>, item=0, i=0) at /home/tavon/.cache/nim/test_r/@m..@shome@stavon@s.choosenim@stoolchains@snim-1.6.8@slib@ssystem.nim.c:4674
4674 unsureAsgnRef((void**) (&(*x)), (tySequence__qwqHTkRvwhrRyENtudHQ7g*) setLengthSeqV2(&((*x))->Sup, (&NTIseqLintT__qwqHTkRvwhrRyENtudHQ7g_), ((NI) ((NI)(TM__Q5wkpxktOdTGvlSRo9bzt9aw_27)))));
(gdb) bt
#0 insert__test_13 (x=0x555555565268 <g.test_1>, item=0, i=0)
at /home/tavon/.cache/nim/test_r/@m..@shome@stavon@s.choosenim@stoolchains@snim-1.6.8@slib@ssystem.nim.c:4674
#1 j__test_11 (b=3) at /home/tavon/.cache/nim/test_r/@mtest.nim.c:89
#2 NimMainModule () at /home/tavon/.cache/nim/test_r/@mtest.nim.c:166
#3 NimMainInner () at /home/tavon/.cache/nim/test_r/@mtest.nim.c:127
#4 0x00005555555558b2 in NimMain () at /home/tavon/.cache/nim/test_r/@mtest.nim.c:135
#5 main (argc=<optimized out>, args=<optimized out>, env=<optimized out>) at /home/tavon/.cache/nim/test_r/@mtest.nim.c:142
(gdb) print *x
$1 = (tySequence__qwqHTkRvwhrRyENtudHQ7g *) 0x0
var g: seq[int]
proc j(b: int) =
g.insert(0, 0)
for i in 0 ..< 1:
discard
j(0)
suffices.
Also, Nim/bin/nim c -r --boundChecks:off --mm:refc -d:release --passC=-flto --passL=-flto --passC=-finline-limit=200 --passL=-finline-limit=200 repro.nim
avoids this, because it's the fallout of said bounds check's C code.
A short, standalone (except for stdlib.h
to access malloc()
) reproducible test case has, for me, the same failure modes as the Nim code shown:
#include <stdlib.h>
typedef struct Rhd Rhd;
struct Rhd {
int len;
};
void dpb(void** dest, void* src);
typedef struct Dkr Dkr;
typedef struct tbs tbs;
typedef struct wvg wvg;
typedef struct wyj wyj;
struct Dkr {
int size;
Dkr* base;
};
struct wvg {
int line;
};
struct tbs {
Rhd Sup;
wvg hkt[];
};
struct wyj {
Rhd Sup;
int hkt[];
};
Dkr Ekc;
void* newSeq(Dkr* typ, int len);
int* dollar___system_2721(tbs* stackTraceEntries);
int* rawNewStringNoInit();
void add__system_2306(int** x, char* y);
int __attribute__((__noinline__)) isOnStack__system_5289(void* p);
static void sysFatal__system_3037(int i, int n);
Dkr NTIint__rR5Bzr1D5krxoo1NcNyeMA_;
void* newSeq(Dkr* typ, int len) {
void* result = (void*)malloc((*(*typ).base).size * len + 32);
(*((Rhd*) (result))).len = len;
return result;
}
int* rawNewStringNoInit() {
return malloc(sizeof(int));
}
int __attribute__((__noinline__)) isOnStack__system_5289(void* p) {
return 1;
}
int refcount;
void dpb(void** dest, void* src) {
{
int T3_ = isOnStack__system_5289(0);
if (!!(T3_)) goto LA4_;
{
if (!src) goto LA8_;
refcount += 8;
}
LA8_: ;
{
refcount -= 8;
}
}
LA4_: ;
(*dest) = src;
}
void add__system_2306(int** x, char* y) {
int i;
i = ((int) 0);
{
if (!y) goto LA3_;
{
while (1) {
if (!!(y[i] == 0)) goto LA6;
dpb((void**) x, rawNewStringNoInit(2000));
i += ((int) 1);
} LA6: ;
}
}
LA3_: ;
}
static void sysFatal__system_3037(int i, int n) {
tbs T1_ = { 0 };
dollar___system_2721(&T1_);
exit(0);
}
int* dollar___system_2721(tbs* stackTraceEntries) {
int* result;
tbs* s;
s = stackTraceEntries;
result = rawNewStringNoInit(((int) 2000));
{
int i = 0;
int colontmp_;
int T2_ = (s ? s->Sup.len : 0);
int res = 0;
colontmp_ = (int)(T2_ - ((int) 1));
{
while (1) {
if (!(res <= colontmp_)) goto LA4;
i = res;
{
if (!(s->hkt[i].line == -10)) goto LA7_;
}
goto LA5_;
LA7_: ;
{
if (!(s->hkt[i].line == -100)) goto LA10_;
}
goto LA5_;
LA10_: ;
add__system_2306(&result, "ABCDEF");
LA5_: ;
res += ((int) 1);
} LA4: ;
}
}
return result;
}
void insert__repro_13(wyj** x, int item, int i) {
int TM__Q5wkpxktOdTGvlSRo9bzt9aw_23;
TM__Q5wkpxktOdTGvlSRo9bzt9aw_23 = ((*x) ? (*x)->Sup.len : 0) + 1;
dpb((void**)x, (wyj*) newSeq(&Ekc, TM__Q5wkpxktOdTGvlSRo9bzt9aw_23));
if (i < 0 || i >= ((*x) ? (*x)->Sup.len : 0)){ sysFatal__system_3037(i,((*x) ? (*x)->Sup.len : 0)-1); }
(*x)->hkt[i] = item;
}
void insert__repro_13(wyj** x, int item, int i);
wyj* g__repro_1;
int main(void) {
Ekc.size = sizeof(wyj*);
Ekc.base = (&NTIint__rR5Bzr1D5krxoo1NcNyeMA_);
insert__repro_13(&g__repro_1, 0, 0);
{
int i_2 = 0;
{
while (1) {
if (!(i_2 < ((int) 100))) goto LA3;
i_2 += 1;
} LA3: ;
}
}
return 0;
}
It can be compiled using
gcc -g -flto -Wall -Wextra -pedantic -Wno-unused-parameter -finline-limit=200 -O2 -fno-strict-aliasing -fno-ident repro.c
-O3
also works, but it needs -flto
to trigger the failure. When it does, one sees
==1383499== Memcheck, a memory error detector
==1383499== Copyright (C) 2002-2022, and GNU GPL'd, by Julian Seward et al.
==1383499== Using Valgrind-3.19.0 and LibVEX; rerun with -h for copyright info
==1383499== Command: ./a.out
==1383499==
==1383499==
==1383499== Process terminating with default action of signal 11 (SIGSEGV)
==1383499== Bad permissions for mapped region at address 0x10A008
==1383499== at 0x109095: UnknownInlinedFun (repro.c:64)
==1383499== by 0x109095: UnknownInlinedFun (repro.c:122)
==1383499== by 0x109095: main (repro.c:133)
==1383499==
==1383499== HEAP SUMMARY:
==1383499== in use at exit: 32 bytes in 1 blocks
==1383499== total heap usage: 1 allocs, 0 frees, 32 bytes allocated
==1383499==
==1383499== LEAK SUMMARY:
==1383499== definitely lost: 32 bytes in 1 blocks
==1383499== indirectly lost: 0 bytes in 0 blocks
==1383499== possibly lost: 0 bytes in 0 blocks
==1383499== still reachable: 0 bytes in 0 blocks
==1383499== suppressed: 0 bytes in 0 blocks
==1383499== Rerun with --leak-check=full to see details of leaked memory
==1383499==
==1383499== For lists of detected and suppressed errors, rerun with: -s
==1383499== ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 0 from 0)
Segmentation fault
where the exact cause of the segfault is that the variable in question gets put jin the .rodata
section, which is triggers a segfault upon attempt to modify it; in gdb
:
Program received signal SIGSEGV, Segmentation fault.
insert__repro_13 (x=0x555555556008 <g.repro_1>, item=0, i=0) at repro.c:122
(gdb) info files
...
0x0000555555556000 - 0x0000555555556010 is .rodata
...
Without LTO, but with otherwise identical build configuration:
gcc -Wall -Wextra -pedantic -Wno-unused-parameter -finline-limit=200 -O2 -fno-strict-aliasing -fno-ident repro.c
it places this in the mutable .bss
section instead:
(gdb) p &g__repro_1
$1 = (wyj **) 0x555555558030 <g.repro_1>
(gdb) info file
...
0x0000555555558020 - 0x0000555555558060 is .bss
...
One can adjust the test case to use non-void**
, and this helps:
#include <stdlib.h>
struct Rhd {
int len;
};
struct Dkr {
int size;
};
struct wvg {
int line;
};
struct tbs {
struct Rhd Sup;
struct wvg hkt[];
};
struct wyj {
struct Rhd Sup;
int hkt[];
};
struct Dkr Ekc;
void* newSeq(struct Dkr* typ, int len);
int __attribute__((__noinline__)) isOnStack__system_5289(void* p);
//static void adt(int i, int n);
struct Dkr NTIint__rR5Bzr1D5krxoo1NcNyeMA_;
void* newSeq(struct Dkr* typ, int len) {
void* result = (void*)malloc(16384);
(*((struct Rhd*) (result))).len = len;
return result;
}
int __attribute__((__noinline__)) isOnStack__system_5289(void* p) {
return 1;
}
int refcount;
void dpb(struct wyj** dest, void* src) {
{
int T3_ = isOnStack__system_5289(0);
if (!!(T3_)) goto LA4_;
{
if (!src) goto LA8_;
refcount += 8;
}
LA8_:
;
{
refcount -= 8;
}
}
LA4_:
;
*dest = src;
}
void add__system_2306(struct wyj** x, char* y) {
int i;
i = ((int) 0);
{
if (!y) goto LA3_;
{
while (1) {
if (y[i] != 0) goto LA6;
dpb(x, malloc(sizeof(struct wyj)));
i += 1;
}
LA6:
;
}
}
LA3_:
;
}
int* jby(struct wyj** hmm, struct tbs* stackTraceEntries) {
int* result = malloc(sizeof(int));
struct tbs* s;
s = stackTraceEntries;
{
int i = 0;
int colontmp_;
int T2_ = (s ? s->Sup.len : 0);
int res = 0;
colontmp_ = (int)(T2_ - ((int) 1));
{
while (1) {
if (!(res <= colontmp_)) goto LA4;
i = res;
{
if (!(s->hkt[i].line == -10)) goto LA7_;
}
goto LA5_;
LA7_:
;
{
if (!(s->hkt[i].line == -100)) goto LA10_;
}
goto LA5_;
LA10_:
;
add__system_2306(hmm, "ABCDEF");
LA5_:
;
res += ((int) 1);
}
LA4:
;
}
}
return result;
}
static void adt(struct wyj** hmm, int i, int n) {
struct tbs T1_ = { 0 };
jby(hmm, &T1_);
exit(0);
}
void insert__repro_13(struct wyj** x, int* maybe, int item, int i) {
int TM__Q5wkpxktOdTGvlSRo9bzt9aw_23;
TM__Q5wkpxktOdTGvlSRo9bzt9aw_23 = ((*x) ? (*x)->Sup.len : 0) + 1;
dpb(x, (struct wyj*) newSeq(&Ekc, TM__Q5wkpxktOdTGvlSRo9bzt9aw_23));
if (1 || i < 0 || i >= (1000 + ((*x) ? (*x)->Sup.len : 0))) {
adt(x, i, ((*x) ? (*x)->Sup.len : 0)-1);
}
*maybe = i;
(*x)->hkt[0] = i;
}
struct wyj* g__repro_1;
int* hmmm;
int main(void) {
Ekc.size = sizeof(struct wyj*);
hmmm = malloc(sizeof(int));
insert__repro_13(&g__repro_1, hmmm, 0, 0);
{
int i_2 = 0;
{
while (1) {
if (!(i_2 < 1)) goto LA3;
i_2 += 1;
}
LA3:
;
}
}
return 0;
}
It runs fine with or without -flto
.
Indeed, looking at
var g: seq[int]
proc j(b: int) =
g.insert(0, 0)
for i in 0 ..< 1:
discard
j(0)
more directly, using Nim devel
via
NimDevel/bin/nim c -r --mm:refc -d:release --passC=-flto --passL=-flto --passC=-finline-limit=200 --passL=-finline-limit=200 repro.nim
it crashes at
(gdb) r
Starting program: repro
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
Program received signal SIGSEGV, Segmentation fault.
insert__repro_13 (x=0x555555565268 <g.repro_1>, item=0, i=0) at .cache/nim/repro_r/@mNimDevel@slib@ssystem.nim.c:4672
4672 unsureAsgnRef((void**) (&(*x)), (tySequence__qwqHTkRvwhrRyENtudHQ7g*) setLengthSeqV2(&((*x))->Sup, (&NTIseqLintT__qwqHTkRvwhrRyENtudHQ7g_), ((NI) ((NI)(TM__Q5wkpxktOdTGvlSRo9bzt9aw_27)))));
where
N_LIB_PRIVATE N_NIMCALL(void, unsureAsgnRef)(void** dest, void* src) {
{
NIM_BOOL T3_;
T3_ = (NIM_BOOL)0;
T3_ = isOnStack__system_5289(((void*) (dest)));
if (!!(T3_)) goto LA4_;
{
tyObject_Cell__1zcF9cV8XIAtbN8h5HRUB8g* T10_;
if (!!((src == NIM_NIL))) goto LA8_;
T10_ = (tyObject_Cell__1zcF9cV8XIAtbN8h5HRUB8g*)0;
T10_ = usrToCell__system_5263(src);
incRef__system_5301(T10_);
}
LA8_: ;
{
NIM_BOOL T13_;
tyObject_Cell__1zcF9cV8XIAtbN8h5HRUB8g* T16_;
T13_ = (NIM_BOOL)0;
T13_ = lteqpercent___system_985(((NI) 4096), ((NI) (ptrdiff_t) ((*dest))));
if (!T13_) goto LA14_;
T16_ = (tyObject_Cell__1zcF9cV8XIAtbN8h5HRUB8g*)0;
T16_ = usrToCell__system_5263((*dest));
decRef__system_5308(T16_);
}
LA14_: ;
}
goto LA1_;
LA4_: ;
{
}
LA1_: ;
(*dest) = src;
}
This (void **, void*)
signature is fine in isolation, but it's not actually called on void*
. Instead, it's called
by
N_LIB_PRIVATE N_NIMCALL(void, add__system_2306)(NimStringDesc** x, NCSTRING y) {
NI i;
i = ((NI) 0);
{
if (!!((((void*) (y)) == NIM_NIL))) goto LA3_;
{
while (1) {
if (!!(((NU8)(y[i]) == (NU8)(0)))) goto LA6;
unsureAsgnRef((void**) (&(*x)), addChar((*x), y[i]));
i += ((NI) 1);
} LA6: ;
}
}
LA3_: ;
}
and
N_LIB_PRIVATE N_NIMCALL(void, insert__repro_13)(tySequence__qwqHTkRvwhrRyENtudHQ7g** x, NI item, NI i) {
NI xlX60gensym0_;
NI T1_;
NI TM__Q5wkpxktOdTGvlSRo9bzt9aw_27;
T1_ = ((*x) ? (*x)->Sup.len : 0);
xlX60gensym0_ = T1_;
if (nimAddInt(xlX60gensym0_, ((NI) 1), &TM__Q5wkpxktOdTGvlSRo9bzt9aw_27)) { raiseOverflow(); };
if (((NI)(TM__Q5wkpxktOdTGvlSRo9bzt9aw_27)) < ((NI) 0) || ((NI)(TM__Q5wkpxktOdTGvlSRo9bzt9aw_27)) > ((NI) IL64(9223372036854775807))){ raiseRangeErrorI((NI)(TM__Q5wkpxktOdTGvlSRo9bzt9aw_27), ((NI) 0), ((NI) IL64(9223372036854775807))); }
unsureAsgnRef((void**) (&(*x)), (tySequence__qwqHTkRvwhrRyENtudHQ7g*) setLengthSeqV2(&((*x))->Sup, (&NTIseqLintT__qwqHTkRvwhrRyENtudHQ7g_), ((NI) ((NI)(TM__Q5wkpxktOdTGvlSRo9bzt9aw_27)))));
if (i < 0 || i >= ((*x) ? (*x)->Sup.len : 0)){ raiseIndexError2(i,((*x) ? (*x)->Sup.len : 0)-1); }
(*x)->data[i] = item;
}
the latter of which can be recognized from the gdb
backtrace:
(gdb) bt full
#0 insert__repro_13 (x=0x555555565268 <g.repro_1>, item=0, i=0)
at .cache/nim/repro_r/@mNimSansFix@slib@ssystem.nim.c:4672
xlX60gensym0_ = 0
T1_ = 0
TM__Q5wkpxktOdTGvlSRo9bzt9aw_27 = 1
xlX60gensym0_ = <optimized out>
T1_ = <optimized out>
TM__Q5wkpxktOdTGvlSRo9bzt9aw_27 = <optimized out>
#1 j__repro_11 (b=0) at .cache/nim/repro_r/@mrepro.nim.c:86
LA3 = <optimized out>
LA3 = <optimized out>
i = <optimized out>
i_2 = <optimized out>
TM__ckn2SibXCdoGNfJOD0JenA_3 = <optimized out>
#2 NimMainModule () at .cache/nim/repro_r/@mrepro.nim.c:151
No locals.
This is undefined behavior according to C11, because NimStringDesc** x
and tySequence__qwqHTkRvwhrRyENtudHQ7g** x
are not interchangeable with void**
.
The most recent freely accessible C11 draft I was able to find addresses this point, because 6.5p7 states:
An object shall have its stored value accessed only by an lvalue expression that has one of the following types: type compatible with the effective type of the object, [... and some other not-relevant-to-void-pointer possibilities].
Meanwhile, 6.7.6.1p2 notes that:
For two pointer types to be compatible, both shall be identically qualified and both shall be pointers to compatible types.
void **
and NimStringDesc** x
are pointers to, respectively, void *
and NimStringDesc *
, which are not compatible types. There are special-case rules for, specifically, dealing with void *
, which the C FAQ summarizes, but they don't apply here.
Thus, while this is a bit of an odd behavior from gcc, it's consistently responding to UB code generated by Nim for this test program
var g: seq[int]
proc j(b: int) =
g.insert(0, 0)
for i in 0 ..< 1:
discard
j(0)
with no references, pointers, macros, templates, imports, or anything similarly exotic.
In theory an overly aggressive optimizer can break Nim's stack root tracing that the old GCs use.
This wouldn't help the observed failure mode -- once a mutable global variable is placed in .rodata
it's not going to work regardless of what the GC does/doesn't do.
The most recent freely accessible C11 draft I was able to find addresses this point, because 6.5p7 states
FWIW I'm quite familiar with the C standard and C's aliasing rules. That doesn't help avoiding these bugs though. The problem is even if unsureAsgnRef
is changed to use void*
instead of void**
inside it we have to do dest[] = src
so the cast would move into the body of the proc where it's just as undefined behavior.
Closing as a confirmed gcc
issue: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107769
What happened?
When running
using
It results in
Nim Version
Tested with
and
using from https://packages.debian.org/sid/gcc
Current Standard Output Logs
Possible Solution
No response
Additional Information
Currently it appears to be a function of the memory management approach chosen, and post-
devel
-switch to ORC by default, if one doesn't specify--mm:refc
, it magically disappears due to its not appearing under ORC in the same way.It seems to depend on exactly how many times
c()
is called.