nodejs / node

Node.js JavaScript runtime ✨🐢🚀✨
https://nodejs.org
Other
107.67k stars 29.64k forks source link

Investigate flaky test-worker-prof #26401

Closed Trott closed 2 months ago

Trott commented 5 years ago

https://ci.nodejs.org/job/node-test-binary-arm/6472/RUN_SUBSET=0,label=pi1-docker/console

test-requireio_ceejbot-debian9-armv6l_pi1p-1

00:34:22 not ok 365 parallel/test-worker-prof
00:34:22   ---
00:34:22   duration_ms: 19.977
00:34:22   severity: fail
00:34:22   exitcode: 1
00:34:22   stack: |-
00:34:22     assert.js:85
00:34:22       throw new AssertionError(obj);
00:34:22       ^
00:34:22     
00:34:22     AssertionError [ERR_ASSERTION]: Expected values to be strictly equal:
00:34:22     
00:34:22     1 !== 2
00:34:22     
00:34:22         at Object.<anonymous> (/home/iojs/build/workspace/node-test-binary-arm/test/parallel/test-worker-prof.js:29:8)
00:34:22         at Module._compile (internal/modules/cjs/loader.js:807:30)
00:34:22         at Object.Module._extensions..js (internal/modules/cjs/loader.js:818:10)
00:34:22         at Module.load (internal/modules/cjs/loader.js:674:32)
00:34:22         at tryModuleLoad (internal/modules/cjs/loader.js:606:12)
00:34:22         at Function.Module._load (internal/modules/cjs/loader.js:598:3)
00:34:22         at Function.Module.runMain (internal/modules/cjs/loader.js:870:12)
00:34:22         at internal/main/run_main_module.js:21:11
00:34:22   ...
refack commented 5 years ago

https://ci.nodejs.org/job/node-test-binary-arm/6731/RUN_SUBSET=2,label=pi1-docker/testReport/junit/(root)/test/parallel_test_worker_prof/ Worker: https://ci.nodejs.org/computer/test-requireio_davglass-debian9-armv6l_pi1p-1/

mark-as-flaky PR: https://github.com/nodejs/node/pull/26557

gireeshpunathil commented 5 years ago

the assertion failure indicates that either the parent or the child - one those did not produce a trace output! (something beyond what we thought could potentially cause flake in this test - i.e., the process timings and the number of ticks etc.)

refack commented 5 years ago

Now it's flaking on Windows Worker: https://ci.nodejs.org/computer/test-azure_msft-win10-x64-1/

assert.js:340
    throw err;
    ^

AssertionError [ERR_ASSERTION]: 3 >= 15
    at Object.<anonymous> (c:\workspace\node-test-binary-windows\test\parallel\test-worker-prof.js:39:3)
    at Module._compile (internal/modules/cjs/loader.js:813:30)
    at Object.Module._extensions..js (internal/modules/cjs/loader.js:824:10)
    at Module.load (internal/modules/cjs/loader.js:680:32)
    at tryModuleLoad (internal/modules/cjs/loader.js:612:12)
    at Function.Module._load (internal/modules/cjs/loader.js:604:3)
    at Function.Module.runMain (internal/modules/cjs/loader.js:876:12)
    at internal/main/run_main_module.js:21:11
targos commented 5 years ago

Also on Linux without intl:

https://ci.nodejs.org/job/node-test-commit-linux-containered/11274/nodes=ubuntu1604_sharedlibs_withoutintl_x64/testReport/junit/(root)/test/parallel_test_worker_prof/

assert.js:340
    throw err;
    ^

AssertionError [ERR_ASSERTION]: 10 >= 15
    at Object.<anonymous> (/home/iojs/build/workspace/node-test-commit-linux-containered/test/parallel/test-worker-prof.js:39:3)
    at Module._compile (internal/modules/cjs/loader.js:813:30)
    at Object.Module._extensions..js (internal/modules/cjs/loader.js:824:10)
    at Module.load (internal/modules/cjs/loader.js:680:32)
    at tryModuleLoad (internal/modules/cjs/loader.js:612:12)
    at Function.Module._load (internal/modules/cjs/loader.js:604:3)
    at Function.Module.runMain (internal/modules/cjs/loader.js:876:12)
    at internal/main/run_main_module.js:21:11
Trott commented 5 years ago

This was on node-daily-master today:

https://ci.nodejs.org/job/node-test-commit-linux-containered/11398/nodes=ubuntu1604_sharedlibs_withoutintl_x64/console

test-digitalocean-ubuntu1604_sharedlibs_container-x64-10

00:09:19 not ok 2195 parallel/test-worker-prof # TODO : Fix flaky test
00:09:19   ---
00:09:19   duration_ms: 0.528
00:09:19   severity: flaky
00:09:19   exitcode: 1
00:09:19   stack: |-
00:09:19     assert.js:340
00:09:19         throw err;
00:09:19         ^
00:09:19     
00:09:19     AssertionError [ERR_ASSERTION]: 14 >= 15
00:09:19         at Object.<anonymous> (/home/iojs/build/workspace/node-test-commit-linux-containered/test/parallel/test-worker-prof.js:39:3)
00:09:19         at Module._compile (internal/modules/cjs/loader.js:813:30)
00:09:19         at Object.Module._extensions..js (internal/modules/cjs/loader.js:824:10)
00:09:19         at Module.load (internal/modules/cjs/loader.js:680:32)
00:09:19         at tryModuleLoad (internal/modules/cjs/loader.js:612:12)
00:09:19         at Function.Module._load (internal/modules/cjs/loader.js:604:3)
00:09:19         at Function.Module.runMain (internal/modules/cjs/loader.js:876:12)
00:09:19         at internal/main/run_main_module.js:21:11
00:09:19   ...
Trott commented 5 years ago

And for comparison, here's what the failure looks like with the changes in https://github.com/nodejs/node/pull/26608/files/840d31383703fdbc1fe7deb1074c306ee2415aea which is the current change proposed in https://github.com/nodejs/node/pull/26608:

https://ci.nodejs.org/job/node-test-commit-linux-containered/11402/nodes=ubuntu1604_sharedlibs_withoutintl_x64/console

test-digitalocean-ubuntu1604_sharedlibs_container-x64-10

03:27:44 not ok 2198 parallel/test-worker-prof # TODO : Fix flaky test
03:27:44   ---
03:27:44   duration_ms: 0.519
03:27:44   severity: flaky
03:27:44   exitcode: 1
03:27:44   stack: |-
03:27:44     assert.js:85
03:27:44       throw new AssertionError(obj);
03:27:44       ^
03:27:44     
03:27:44     AssertionError [ERR_ASSERTION]: Expected values to be strictly equal:
03:27:44     
03:27:44     null !== 0
03:27:44     
03:27:44         at Object.<anonymous> (/home/iojs/build/workspace/node-test-commit-linux-containered/test/parallel/test-worker-prof.js:54:10)
03:27:44         at Module._compile (internal/modules/cjs/loader.js:813:30)
03:27:44         at Object.Module._extensions..js (internal/modules/cjs/loader.js:824:10)
03:27:44         at Module.load (internal/modules/cjs/loader.js:680:32)
03:27:44         at tryModuleLoad (internal/modules/cjs/loader.js:612:12)
03:27:44         at Function.Module._load (internal/modules/cjs/loader.js:604:3)
03:27:44         at Function.Module.runMain (internal/modules/cjs/loader.js:876:12)
03:27:44         at internal/main/run_main_module.js:21:11
03:27:44   ...
gireeshpunathil commented 5 years ago

thanks @Trott . I have no idea what null exit code means - exited with exception? shouldn't it set a valid exit code then? also, shouldn't we get something on the stderr too? weird. Will try to recreate in a --without-intl configuration.

Trott commented 5 years ago

I have no idea what null exit code means - exited with exception?

Maybe this?

https://github.com/nodejs/node/blob/169b7f1f3b3751289f24678930e6a5731464ebc9/src/spawn_sync.cc#L683-L686

If I'm understanding correctly, maybe it means that the process received a signal?

Trott commented 5 years ago

Looking at test/parallel/test-child-process-spawnsync-kill-signal.js also seems to indicate that status is null when a signal is received.

gireeshpunathil commented 5 years ago

yes, looks like it is! but who could have sent it! probably an oom-killer from the container? I will see if I can replicate in a container; but is there a way to check what the upper memory limit these containers are configured with?

Trott commented 5 years ago

but is there a way to check what the upper memory limit these containers are configured with?

^^^^ @nodejs/build

refack commented 5 years ago
> free
              total        used        free      shared  buff/cache   available
Mem:       32946876     2048256     4308300       17532    26590320    30328076
Swap:             0           0           0
refack commented 5 years ago

dmesg (error happened around 10:27:44 UTC)

[Fri Mar 15 10:27:45 2019] do_general_protection: 14 callbacks suppressed

Might hint at a SIGSEGV

``` [Fri Mar 15 07:10:14 2019] traps: node[19847] trap invalid opcode ip:1622d49 sp:7ffe7ab1aa28 error:0 in node[400000+1dd7000] [Fri Mar 15 07:10:14 2019] traps: node[19849] trap invalid opcode ip:1622d49 sp:7ffef2278e78 error:0 in node[400000+1dd7000] [Fri Mar 15 07:10:16 2019] traps: node[19985] trap invalid opcode ip:15e92d9 sp:7f7ad841be48 error:0 in node[400000+19b9000] [Fri Mar 15 07:10:16 2019] traps: node[20007] trap invalid opcode ip:1622d49 sp:7f08310a4e38 error:0 in node[400000+1dd7000] [Fri Mar 15 09:48:55 2019] traps: node[31881] trap invalid opcode ip:15e92d9 sp:7ffcd7250008 error:0 in node[400000+19b9000] [Fri Mar 15 09:48:55 2019] traps: node[31907] trap invalid opcode ip:15e92d9 sp:7ffdda0d4448 error:0 in node[400000+19b9000] [Fri Mar 15 09:48:55 2019] traps: node[31949] trap invalid opcode ip:15e92d9 sp:7fffc2d46c98 error:0 in node[400000+19b9000] [Fri Mar 15 09:48:56 2019] traps: node[32058] trap invalid opcode ip:15e92d9 sp:7ffd9662f348 error:0 in node[400000+19b9000] [Fri Mar 15 09:48:56 2019] traps: node[32086] trap invalid opcode ip:15e92d9 sp:7ffcee647c98 error:0 in node[400000+19b9000] [Fri Mar 15 09:48:56 2019] traps: node[32088] trap invalid opcode ip:15e92d9 sp:7fff077d8ec8 error:0 in node[400000+19b9000] [Fri Mar 15 09:48:56 2019] traps: node[32141] trap invalid opcode ip:15e92d9 sp:7ffd8365d438 error:0 in node[400000+19b9000] [Fri Mar 15 09:48:56 2019] traps: node[32215] trap invalid opcode ip:15e92d9 sp:7ffe30d4b918 error:0 in node[400000+19b9000] [Fri Mar 15 09:48:56 2019] traps: node[32251] trap invalid opcode ip:15e92d9 sp:7ffe4d7f9838 error:0 in node[400000+19b9000] [Fri Mar 15 09:48:56 2019] traps: node[32258] trap invalid opcode ip:15e92d9 sp:7ffc7d4d79d8 error:0 in node[400000+19b9000] [Fri Mar 15 09:49:05 2019] do_trap: 2 callbacks suppressed [Fri Mar 15 09:49:05 2019] traps: node[2594] trap invalid opcode ip:1622cf9 sp:7ffe30970b58 error:0 in node[400000+1dd7000] [Fri Mar 15 09:49:05 2019] traps: node[2635] trap invalid opcode ip:1622cf9 sp:7fff1c56e128 error:0 in node[400000+1dd7000] [Fri Mar 15 09:49:05 2019] traps: node[2701] trap invalid opcode ip:1622cf9 sp:7ffc232b9f18 error:0 in node[400000+1dd7000] [Fri Mar 15 09:49:05 2019] traps: node[2790] trap invalid opcode ip:1622cf9 sp:7ffeb84c47e8 error:0 in node[400000+1dd7000] [Fri Mar 15 09:49:05 2019] traps: node[2813] trap invalid opcode ip:1622cf9 sp:7ffd0ae53348 error:0 in node[400000+1dd7000] [Fri Mar 15 09:49:05 2019] traps: node[2840] trap invalid opcode ip:1622cf9 sp:7ffdb3300df8 error:0 in node[400000+1dd7000] [Fri Mar 15 09:49:06 2019] traps: node[2904] trap invalid opcode ip:1622cf9 sp:7ffdef97dbc8 error:0 in node[400000+1dd7000] [Fri Mar 15 09:49:06 2019] traps: node[2969] trap invalid opcode ip:1622cf9 sp:7ffd427fa868 error:0 in node[400000+1dd7000] [Fri Mar 15 09:49:06 2019] traps: node[2977] trap invalid opcode ip:1622cf9 sp:7fffd67743b8 error:0 in node[400000+1dd7000] [Fri Mar 15 09:49:06 2019] traps: node[3002] trap invalid opcode ip:1622cf9 sp:7ffc84f22da8 error:0 in node[400000+1dd7000] [Fri Mar 15 09:50:34 2019] do_general_protection: 2 callbacks suppressed [Fri Mar 15 09:50:34 2019] traps: node[32317] general protection ip:f67050 sp:7fa64fffd180 error:0 in node[400000+19b9000] [Fri Mar 15 09:51:29 2019] traps: node[5946] trap invalid opcode ip:15e92d9 sp:7ffddc3e2008 error:0 in node[400000+19b9000] [Fri Mar 15 09:51:29 2019] traps: node[5947] trap invalid opcode ip:15e92d9 sp:7ffff5791708 error:0 in node[400000+19b9000] [Fri Mar 15 09:51:31 2019] traps: node[6058] trap invalid opcode ip:15e92d9 sp:7fde286f6e48 error:0 in node[400000+19b9000] [Fri Mar 15 09:51:42 2019] traps: node[7015] trap invalid opcode ip:1622cf9 sp:7ffde02634f8 error:0 in node[400000+1dd7000] [Fri Mar 15 09:51:42 2019] traps: node[7021] trap invalid opcode ip:1622cf9 sp:7ffe7d06afc8 error:0 in node[400000+1dd7000] [Fri Mar 15 09:51:44 2019] traps: node[7173] trap invalid opcode ip:1622cf9 sp:7f58dfff9e38 error:0 in node[400000+1dd7000] [Fri Mar 15 10:26:05 2019] traps: node[18744] trap invalid opcode ip:15e92d9 sp:7ffc389cfc38 error:0 in node[400000+19b9000] [Fri Mar 15 10:26:06 2019] traps: node[18801] trap invalid opcode ip:15e92d9 sp:7fffe40df9b8 error:0 in node[400000+19b9000] [Fri Mar 15 10:26:06 2019] traps: node[18867] trap invalid opcode ip:15e92d9 sp:7ffe320126b8 error:0 in node[400000+19b9000] [Fri Mar 15 10:26:06 2019] traps: node[18944] trap invalid opcode ip:15e92d9 sp:7ffdb1ea20f8 error:0 in node[400000+19b9000] [Fri Mar 15 10:26:06 2019] traps: node[18985] trap invalid opcode ip:15e92d9 sp:7ffee0ef7ad8 error:0 in node[400000+19b9000] [Fri Mar 15 10:26:06 2019] traps: node[19000] trap invalid opcode ip:1622d49 sp:7ffc4d1fd9f8 error:0 in node[400000+1dd7000] [Fri Mar 15 10:26:06 2019] traps: node[19003] trap invalid opcode ip:15e92d9 sp:7ffc94030618 error:0 in node[400000+19b9000] [Fri Mar 15 10:26:06 2019] traps: node[19016] trap invalid opcode ip:1622d49 sp:7fffb6cba348 error:0 in node[400000+1dd7000] [Fri Mar 15 10:26:06 2019] traps: node[19051] trap invalid opcode ip:15e92d9 sp:7ffe4899d318 error:0 in node[400000+19b9000] [Fri Mar 15 10:26:06 2019] traps: node[19067] trap invalid opcode ip:1622d49 sp:7ffc6ceeaed8 error:0 in node[400000+1dd7000] [Fri Mar 15 10:27:45 2019] do_general_protection: 14 callbacks suppressed [Fri Mar 15 10:27:45 2019] traps: node[19419] general protection ip:f67050 sp:7fc5e0864180 error:0 in node[400000+19b9000] [Fri Mar 15 10:28:40 2019] traps: node[23737] trap invalid opcode ip:15e92d9 sp:7ffd0e8b0458 error:0 in node[400000+19b9000] [Fri Mar 15 10:28:40 2019] traps: node[23738] trap invalid opcode ip:15e92d9 sp:7ffd8a741718 error:0 in node[400000+19b9000] [Fri Mar 15 10:28:43 2019] traps: node[23866] trap invalid opcode ip:15e92d9 sp:7fc6d37f8e48 error:0 in node[400000+19b9000] [Fri Mar 15 10:28:46 2019] traps: node[24299] trap invalid opcode ip:1622d49 sp:7fffa0d9ffa8 error:0 in node[400000+1dd7000] [Fri Mar 15 10:28:46 2019] traps: node[24303] trap invalid opcode ip:1622d49 sp:7fff88ae03e8 error:0 in node[400000+1dd7000] [Fri Mar 15 10:28:48 2019] traps: node[24502] trap invalid opcode ip:1622d49 sp:7f0cec7a7e38 error:0 in node[400000+1dd7000] [Fri Mar 15 11:21:22 2019] traps: node[3697] trap invalid opcode ip:15e92d9 sp:7ffe38f00168 error:0 in node[400000+19b9000] [Fri Mar 15 11:21:22 2019] traps: node[3746] trap invalid opcode ip:15e92d9 sp:7ffde0081ef8 error:0 in node[400000+19b9000] [Fri Mar 15 11:21:22 2019] traps: node[3793] trap invalid opcode ip:15e92d9 sp:7ffcac81e8c8 error:0 in node[400000+19b9000] [Fri Mar 15 11:21:22 2019] traps: node[3864] trap invalid opcode ip:15e92d9 sp:7ffc2ad260c8 error:0 in node[400000+19b9000] [Fri Mar 15 11:21:23 2019] traps: node[3934] trap invalid opcode ip:15e92d9 sp:7ffdf5e8c828 error:0 in node[400000+19b9000] [Fri Mar 15 11:21:23 2019] traps: node[3931] trap invalid opcode ip:15e92d9 sp:7ffd7697fce8 error:0 in node[400000+19b9000] [Fri Mar 15 11:21:23 2019] traps: node[3965] trap invalid opcode ip:15e92d9 sp:7ffd9668c578 error:0 in node[400000+19b9000] [Fri Mar 15 11:21:23 2019] traps: node[4041] trap invalid opcode ip:15e92d9 sp:7ffd919dc5f8 error:0 in node[400000+19b9000] [Fri Mar 15 11:21:23 2019] traps: node[4087] trap invalid opcode ip:15e92d9 sp:7ffef0300a38 error:0 in node[400000+19b9000] [Fri Mar 15 11:21:23 2019] traps: node[4107] trap invalid opcode ip:15e92d9 sp:7ffd31430458 error:0 in node[400000+19b9000] [Fri Mar 15 11:23:01 2019] do_general_protection: 14 callbacks suppressed [Fri Mar 15 11:23:01 2019] traps: node[5141] general protection ip:f66fa0 sp:7f5ff44c3180 error:0 in node[400000+19b9000] [Fri Mar 15 11:23:56 2019] traps: node[9284] trap invalid opcode ip:15e92d9 sp:7ffe2fee1258 error:0 in node[400000+19b9000] [Fri Mar 15 11:23:56 2019] traps: node[9285] trap invalid opcode ip:15e92d9 sp:7ffe99d0ccd8 error:0 in node[400000+19b9000] [Fri Mar 15 11:23:58 2019] traps: node[9588] trap invalid opcode ip:15e92d9 sp:7f37a4b48e48 error:0 in node[400000+19b9000] [Fri Mar 15 11:23:59 2019] traps: node[9702] trap invalid opcode ip:1622bd9 sp:7fff01347178 error:0 in node[400000+1dd7000] [Fri Mar 15 11:23:59 2019] traps: node[9704] trap invalid opcode ip:1622bd9 sp:7ffe03d0d848 error:0 in node[400000+1dd7000] [Fri Mar 15 11:24:01 2019] traps: node[9934] trap invalid opcode ip:1622bd9 sp:7fce70567e38 error:0 in node[400000+1dd7000] root@test-digitalocean-ubuntu1604-docker-x64-2:~# date Fri Mar 15 13:54:38 UTC 2019 ```
gireeshpunathil commented 5 years ago

thanks @refack ! the 32 GB total seem to be ample, but the actual available memory to the process when it ran will depend how many processes were running parallel. So unfortunately that data do not throw any clues.

However, the system log shows it all! invalid opcode is indeed OOM-KILLER from the container!

I am having hard time matching the timezones: ci shows 15:57:44 not ok 2198 parallel/test-worker-prof # TODO : Fix flaky test the container current time shows Fri Mar 15 13:54:38 UTC 2019 - so how do we map?

Outside of this, we seem to have terminated several processes frequently, through container memory constraints - what do you think of it? I mean, shouldn't we fix that?

gireeshpunathil commented 5 years ago

and, do_general_protection looks to be a kernel crash, as opposed to user space one? Are there evidence (in any other logs) of the container being restarted around do_general_protection ? there is always minutes of gap between the timestamps around these crash, but that may not be conclusive.

refack commented 5 years ago

I am having hard time matching the timezones:

The system times are UTC. Jenkins is based on your browser's locale.

and, do_general_protection looks to be a kernel crash, as opposed to user space one? Are there evidence (in any other logs) of the container being restarted around do_general_protection ?

No, all logs seems to show the container run just fine.

I'll look around for an occurrence of this error on non-containerized platform.

refack commented 5 years ago

So building without intl this repros easily:

$ date
Fri Mar 15 15:53:59 UTC 2019
$ python tools/test.py --repeat 100 -j 1 parallel/test-worker-prof
=== release test-worker-prof ===
Path: parallel/test-worker-prof
assert.js:340
    throw err;
    ^

AssertionError [ERR_ASSERTION]: 8 >= 15
    at Object.<anonymous> (/home/iojs/build/workspace/node-test-commit-linux/nodes/ubuntu1604-64/test/parallel/test-worker-prof.js:39:3)
    at Module._compile (internal/modules/cjs/loader.js:813:30)
    at Object.Module._extensions..js (internal/modules/cjs/loader.js:824:10)
    at Module.load (internal/modules/cjs/loader.js:680:32)
    at tryModuleLoad (internal/modules/cjs/loader.js:612:12)
    at Function.Module._load (internal/modules/cjs/loader.js:604:3)
    at Function.Module.runMain (internal/modules/cjs/loader.js:876:12)
    at internal/main/run_main_module.js:21:11
Command: out/Release/node /home/iojs/build/workspace/node-test-commit-linux/nodes/ubuntu1604-64/test/parallel/test-worker-prof.js
[00:04|%   6|+   5|-   1]: Done                    ^C
$ dmesg -T | tail -20
[Fri Mar 15 15:52:11 2019] traps: node[10234] general protection ip:f67050 sp:7f02fc41f180 error:0 in node[400000+19b9000]
[Fri Mar 15 15:52:12 2019] traps: node[10252] general protection ip:f67050 sp:7ff855ff9180 error:0 in node[400000+19b9000]
[Fri Mar 15 15:52:13 2019] traps: node[10270] general protection ip:f67050 sp:7f15ec4d0180 error:0 in node[400000+19b9000]
[Fri Mar 15 15:52:13 2019] traps: node[10288] general protection ip:f67050 sp:7f8f00c67180 error:0 in node[400000+19b9000]
[Fri Mar 15 15:52:14 2019] traps: node[10306] general protection ip:f67050 sp:7f83616b2180 error:0 in node[400000+19b9000]
[Fri Mar 15 15:52:14 2019] traps: node[10326] general protection ip:f67050 sp:7f94f453e180 error:0 in node[400000+19b9000]
[Fri Mar 15 15:52:15 2019] traps: node[10344] general protection ip:f67050 sp:7f94547be180 error:0 in node[400000+19b9000]
[Fri Mar 15 15:52:16 2019] traps: node[10362] general protection ip:f67050 sp:7fe0f8a49180 error:0 in node[400000+19b9000]
[Fri Mar 15 15:52:16 2019] traps: node[10380] general protection ip:f67050 sp:7fc17bffd180 error:0 in node[400000+19b9000]
[Fri Mar 15 15:52:17 2019] traps: node[10398] general protection ip:f67050 sp:7f8b0bffd180 error:0 in node[400000+19b9000]
[Fri Mar 15 15:52:18 2019] traps: node[10418] general protection ip:f67050 sp:7f799affb180 error:0 in node[400000+19b9000]
[Fri Mar 15 15:52:18 2019] traps: node[10436] general protection ip:f67050 sp:7f1bf15d7180 error:0 in node[400000+19b9000]
[Fri Mar 15 15:52:19 2019] traps: node[10454] general protection ip:f67050 sp:7f0e68679180 error:0 in node[400000+19b9000]
[Fri Mar 15 15:54:05 2019] traps: node[10654] general protection ip:f67050 sp:7f9a10711180 error:0 in node[400000+19b9000]
[Fri Mar 15 15:54:06 2019] traps: node[10672] general protection ip:f67050 sp:7f520c78c180 error:0 in node[400000+19b9000]
[Fri Mar 15 15:54:06 2019] traps: node[10690] general protection ip:f67050 sp:7fbf207ad180 error:0 in node[400000+19b9000]
[Fri Mar 15 15:54:07 2019] traps: node[10710] general protection ip:f67050 sp:7f9a14e7d180 error:0 in node[400000+19b9000]
[Fri Mar 15 15:54:08 2019] traps: node[10728] general protection ip:f67050 sp:7f9a30996180 error:0 in node[400000+19b9000]
[Fri Mar 15 15:54:08 2019] traps: node[10746] general protection ip:f67050 sp:7f99f879f180 error:0 in node[400000+19b9000]
[Fri Mar 15 15:54:09 2019] traps: node[10764] general protection ip:f67050 sp:7f5589ff9180 error:0 in node[400000+19b9000]

But it seems like every run causes a GP.

refack commented 5 years ago

node.0.dump.tar.zip Here's a core (it's .xz renamed to .zip for GitHub)

gireeshpunathil commented 5 years ago

thanks @refack, this is really useful.

(gdb) where
#0  0x0000000000f67050 in v8::internal::JsonParser<false>::ParseJsonArray() ()
#1  0x0000000000f674bc in v8::internal::JsonParser<false>::ParseJsonValue() ()
#2  0x0000000000f71983 in v8::internal::JsonStringifier::SerializeJSValue(v8::internal::Handle<v8::internal::JSValue>) ()
#3  0x00000000011578a6 in void v8::internal::JoinSparseArrayWithSeparator<unsigned char>(v8::internal::FixedArray*, int, unsigned int, v8::internal::String*, v8::internal::Vector<unsigned char>) [clone .isra.29] ()
#4  0x0000000000b643b8 in std::_Function_handler<v8::internal::compiler::Node* (), v8::internal::compiler::TNode<v8::internal::BoolT> v8::internal::CodeStubAssembler::Select<v8::internal::BoolT, v8::internal::CodeStubAssembler::IsNumeric(v8::internal::compiler::SloppyTNode<v8::internal::Object>)::{lambda()#1}, v8::internal::CodeStubAssembler::IsNumeric(v8::internal::compiler::SloppyTNode<v8::internal::Object>)::{lambda()#2}>(v8::internal::compiler::SloppyTNode<v8::internal::BoolT>, v8::internal::CodeStubAssembler::IsNumeric(v8::internal::compiler::SloppyTNode<v8::internal::Object>)::{lambda()#1} const&, v8::internal::CodeStubAssembler::IsNumeric(v8::internal::compiler::SloppyTNode<v8::internal::Object>)::{lambda()#2} const&)::{lambda()#2}>::_M_invoke(std::_Any_data const&) ()
#5  0x0000000000a64f83 in v8::internal::ApiNatives::CreateApiFunction(v8::internal::Isolate*, v8::internal::Handle<v8::internal::FunctionTemplateInfo>, v8::internal::Handle<v8::internal::Object>, v8::internal::InstanceType, v8::internal::MaybeHandle<v8::internal::Name>) ()
#6  0x00000000008fb53f in node::worker::Message::Deserialize(node::Environment*, v8::Local<v8::Context>)
    ()
#7  0x00000000008fbcdc in node::worker::MessagePort::OnMessage() ()
#8  0x00000000008fbde4 in node::worker::MessagePort::OnMessage() ()
#9  0x0000000000ae9465 in v8::internal::BignumDtoa(double, v8::internal::BignumDtoaMode, int, v8::internal::Vector<char>, int*, int*) ()
#10 0x0000000000aeb4b6 in v8::internal::Bignum::Square() ()
#11 0x0000000001694b2b in Builtins_CallWithSpreadHandler ()
#12 0x00003865d1594ea1 in ?? ()

(gdb) x/5i 0xf6704d
   0xf6704d <_ZN2v88internal10JsonParserILb0EE14ParseJsonArrayEv+1165>: mov    rdi,QWORD PTR [rbp-0x68]
   0xf67051 <_ZN2v88internal10JsonParserILb0EE14ParseJsonArrayEv+1169>: sub    edx,0x1
   0xf67054 <_ZN2v88internal10JsonParserILb0EE14ParseJsonArrayEv+1172>: cmp    r12,rcx
   0xf67057 <_ZN2v88internal10JsonParserILb0EE14ParseJsonArrayEv+1175>: 
    mov    QWORD PTR [rbx+0x7c00],rdi
   0xf6705e <_ZN2v88internal10JsonParserILb0EE14ParseJsonArrayEv+1182>: 
    mov    DWORD PTR [rbx+0x7c10],edx

(gdb) x/w ($rbp-0x68)
0x7f61ca7fa148: 0xb8045310
  1. the failing pc reported in the core (0xf67050) is not properly aligned with the actual instruction boundaries (0xf6704d and then 0xf67051)
  2. all memory that is referred by the code is accesible

I don't see anything suspecious in the proc status data, but I dont have full understanding of its each field:

Threads:    11
SigQ:   0/15710
SigPnd: 0000000000000000
ShdPnd: 0000000000000000
SigBlk: 0000000000000000
SigIgn: 0000000000001000
SigCgt: 0000000184004002
CapInh: 0000000000000000
CapPrm: 0000003fffffffff
CapEff: 0000003fffffffff
CapBnd: 0000003fffffffff
CapAmb: 0000000000000000
Seccomp:    0
Speculation_Store_Bypass:   vulnerable
Cpus_allowed:   7fff
Cpus_allowed_list:  0-14
Mems_allowed:   00000000,00000001
Mems_allowed_list:  0
voluntary_ctxt_switches:    11
nonvoluntary_ctxt_switches: 22

So in summary, at this point I am suspecting at an external reason for the crash.

Trott commented 5 years ago

https://ci.nodejs.org/job/node-test-commit-linux-containered/11616/nodes=ubuntu1604_sharedlibs_withoutintl_x64/console

test-softlayer-ubuntu1604_sharedlibs_container-x64-2

15:04:00 not ok 2218 parallel/test-worker-prof # TODO : Fix flaky test
15:04:00   ---
15:04:00   duration_ms: 0.531
15:04:00   severity: flaky
15:04:00   exitcode: 1
15:04:00   stack: |-
15:04:00     assert.js:342
15:04:00         throw err;
15:04:00         ^
15:04:00     
15:04:00     AssertionError [ERR_ASSERTION]: 12 >= 15
15:04:00         at Object.<anonymous> (/home/iojs/build/workspace/node-test-commit-linux-containered/test/parallel/test-worker-prof.js:39:3)
15:04:00         at Module._compile (internal/modules/cjs/loader.js:814:30)
15:04:00         at Object.Module._extensions..js (internal/modules/cjs/loader.js:825:10)
15:04:00         at Module.load (internal/modules/cjs/loader.js:681:32)
15:04:00         at tryModuleLoad (internal/modules/cjs/loader.js:613:12)
15:04:00         at Function.Module._load (internal/modules/cjs/loader.js:605:3)
15:04:00         at Function.Module.runMain (internal/modules/cjs/loader.js:877:12)
15:04:00         at internal/main/run_main_module.js:21:11
15:04:00   ...
Trott commented 5 years ago

Still happening but now (thanks to Gireesh's changes) with better debug info:

https://ci.nodejs.org/job/node-test-binary-arm/7110/RUN_SUBSET=2,label=pi2-docker/console test-requireio_rvagg-debian9-armv7l_pi2-1

00:19:09 not ok 373 parallel/test-worker-prof # TODO : Fix flaky test
00:19:09   ---
00:19:09   duration_ms: 11.32
00:19:09   severity: flaky
00:19:09   exitcode: 1
00:19:09   stack: |-
00:19:09     assert.js:87
00:19:09       throw new AssertionError(obj);
00:19:09       ^
00:19:09     
00:19:09     AssertionError [ERR_ASSERTION]: child exited with signal: { status: null,
00:19:09       signal: 'SIGSEGV',
00:19:09       output: [ null, '', '' ],
00:19:09       pid: 6532,
00:19:09       stdout: '',
00:19:09       stderr: '' }
00:19:09         at Object.<anonymous> (/home/iojs/build/workspace/node-test-binary-arm/test/parallel/test-worker-prof.js:57:10)
00:19:09         at Module._compile (internal/modules/cjs/loader.js:814:30)
00:19:09         at Object.Module._extensions..js (internal/modules/cjs/loader.js:825:10)
00:19:09         at Module.load (internal/modules/cjs/loader.js:681:32)
00:19:09         at tryModuleLoad (internal/modules/cjs/loader.js:613:12)
00:19:09         at Function.Module._load (internal/modules/cjs/loader.js:605:3)
00:19:09         at Function.Module.runMain (internal/modules/cjs/loader.js:877:12)
00:19:09         at internal/main/run_main_module.js:21:11
00:19:09   ...

https://ci.nodejs.org/job/node-test-binary-arm/7110/RUN_SUBSET=2,label=pi3-docker/console test-requireio_kahwee-debian9-arm64_pi3-1

00:17:23 not ok 376 parallel/test-worker-prof # TODO : Fix flaky test
00:17:23   ---
00:17:23   duration_ms: 7.236
00:17:23   severity: flaky
00:17:23   exitcode: 1
00:17:23   stack: |-
00:17:23     assert.js:87
00:17:23       throw new AssertionError(obj);
00:17:23       ^
00:17:23     
00:17:23     AssertionError [ERR_ASSERTION]: child exited with signal: { status: null,
00:17:23       signal: 'SIGSEGV',
00:17:23       output: [ null, '', '' ],
00:17:23       pid: 2080,
00:17:23       stdout: '',
00:17:23       stderr: '' }
00:17:23         at Object.<anonymous> (/home/iojs/build/workspace/node-test-binary-arm/test/parallel/test-worker-prof.js:57:10)
00:17:23         at Module._compile (internal/modules/cjs/loader.js:814:30)
00:17:23         at Object.Module._extensions..js (internal/modules/cjs/loader.js:825:10)
00:17:23         at Module.load (internal/modules/cjs/loader.js:681:32)
00:17:23         at tryModuleLoad (internal/modules/cjs/loader.js:613:12)
00:17:23         at Function.Module._load (internal/modules/cjs/loader.js:605:3)
00:17:23         at Function.Module.runMain (internal/modules/cjs/loader.js:877:12)
00:17:23         at internal/main/run_main_module.js:21:11
00:17:23   ...
Trott commented 5 years ago

https://ci.nodejs.org/job/node-test-commit-linux-containered/11643/nodes=ubuntu1604_sharedlibs_withoutintl_x64/console

test-digitalocean-ubuntu1604_sharedlibs_container-x64-10

00:05:24 not ok 2214 parallel/test-worker-prof # TODO : Fix flaky test
00:05:24   ---
00:05:24   duration_ms: 0.516
00:05:24   severity: flaky
00:05:24   exitcode: 1
00:05:24   stack: |-
00:05:24     assert.js:87
00:05:24       throw new AssertionError(obj);
00:05:24       ^
00:05:24     
00:05:24     AssertionError [ERR_ASSERTION]: child exited with signal: { status: null,
00:05:24       signal: 'SIGSEGV',
00:05:24       output: [ null, '', '' ],
00:05:24       pid: 1562,
00:05:24       stdout: '',
00:05:24       stderr: '' }
00:05:24         at Object.<anonymous> (/home/iojs/build/workspace/node-test-commit-linux-containered/test/parallel/test-worker-prof.js:57:10)
00:05:24         at Module._compile (internal/modules/cjs/loader.js:814:30)
00:05:24         at Object.Module._extensions..js (internal/modules/cjs/loader.js:825:10)
00:05:24         at Module.load (internal/modules/cjs/loader.js:681:32)
00:05:24         at tryModuleLoad (internal/modules/cjs/loader.js:613:12)
00:05:24         at Function.Module._load (internal/modules/cjs/loader.js:605:3)
00:05:24         at Function.Module.runMain (internal/modules/cjs/loader.js:877:12)
00:05:24         at internal/main/run_main_module.js:21:11
00:05:24   ...
gireeshpunathil commented 5 years ago

is it possible to get core files of any of these failures?

Trott commented 5 years ago

is it possible to get core files of any of these failures?

@nodejs/build ^^^^^

joyeecheung commented 5 years ago

Or maybe we can turn on report in the worker and see if it can report anything useful before crashing

richardlau commented 5 years ago

This is also failing in Travis, e.g. https://travis-ci.com/nodejs/node/jobs/188072187#L4408-L4428

=== release test-worker-prof ===
Path: parallel/test-worker-prof
assert.js:87
  throw new AssertionError(obj);
  ^
AssertionError [ERR_ASSERTION]: child exited with signal: { status: null,
  signal: 'SIGSEGV',
  output: [ null, '', '' ],
  pid: 23207,
  stdout: '',
  stderr: '' }
    at Object.<anonymous> (/home/travis/build/nodejs/node/test/parallel/test-worker-prof.js:57:10)
    at Module._compile (internal/modules/cjs/loader.js:837:30)
    at Object.Module._extensions..js (internal/modules/cjs/loader.js:848:10)
    at Module.load (internal/modules/cjs/loader.js:704:32)
    at tryModuleLoad (internal/modules/cjs/loader.js:636:12)
    at Function.Module._load (internal/modules/cjs/loader.js:628:3)
    at Function.Module.runMain (internal/modules/cjs/loader.js:900:12)
    at internal/main/run_main_module.js:21:11
Command: out/Release/node /home/travis/build/nodejs/node/test/parallel/test-worker-prof.js
gireeshpunathil commented 5 years ago

ran 2K times on a container with docker version 18.09.2 but could not reproduce. the visible difference is the failing system has 18.06.0-ce, don't know this is decisive!

Hakerh400 commented 5 years ago

Also crashed on:

richardlau commented 5 years ago

is it possible to get core files of any of these failures?

@gireeshpunathil I've recreated the failure on an internal IBM system with the current head (https://github.com/nodejs/node/commit/c35acc0260b48dc4c656014ef665d811bdd02aa6) and debug build.

-bash-4.2$ ./node test/parallel/test-worker-prof.js
assert.js:87
  throw new AssertionError(obj);
  ^

AssertionError [ERR_ASSERTION]: child exited with signal: { status: null,
  signal: 'SIGSEGV',
  output: [ null, '', '' ],
  pid: 11627,
  stdout: '',
  stderr: '' }
    at Object.<anonymous> (/home/users/riclau/sandbox/github/nodejs/test/parallel/test-worker-prof.js:57:10)
    at Module._compile (internal/modules/cjs/loader.js:837:30)
    at Object.Module._extensions..js (internal/modules/cjs/loader.js:848:10)
    at Module.load (internal/modules/cjs/loader.js:704:32)
    at tryModuleLoad (internal/modules/cjs/loader.js:636:12)
    at Function.Module._load (internal/modules/cjs/loader.js:628:3)
    at Function.Module.runMain (internal/modules/cjs/loader.js:903:10)
    at internal/main/run_main_module.js:21:11
-bash-4.2$

Recreating the command line for the child process that is crashing:

-bash-4.2$ ./node --prof test/parallel/test-worker-prof.js child
Segmentation fault (core dumped)
-bash-4.2$

Give the following backtrace:

(gdb) bt
#0  0x00000000010613a0 in v8::internal::Log::MessageBuilder::AppendSymbolNameDetails(v8::internal::String, bool) ()
#1  0x000000000106151f in v8::internal::Log::MessageBuilder::AppendSymbolName(v8::internal::Symbol) ()
#2  0x0000000001061e3c in v8::internal::Log::MessageBuilder& v8::internal::Log::MessageBuilder::operator<< <v8::internal::Name>(v8::internal::Name) ()
#3  0x000000000106c77c in v8::internal::Logger::CodeCreateEvent(v8::internal::CodeEventListener::LogEventsAndTags, v8::internal::AbstractCode, v8::internal::SharedFunctionInfo, v8::internal::Name, int, int) ()
#4  0x000000000126f36e in v8::internal::CodeSerializer::Deserialize(v8::internal::Isolate*, v8::internal::ScriptData*, v8::internal::Handle<v8::internal::String>, v8::ScriptOriginOptions) ()
#5  0x0000000000c54c85 in v8::internal::Compiler::GetWrappedFunction(v8::internal::Handle<v8::internal::String>, v8::internal::Handle<v8::internal::FixedArray>, v8::internal::Handle<v8::internal::Context>, v8::internal::Compiler::ScriptDetails const&, v8::ScriptOriginOptions, v8::internal::ScriptData*, v8::ScriptCompiler::CompileOptions, v8::ScriptCompiler::NoCacheReason) ()
#6  0x0000000000b51716 in v8::ScriptCompiler::CompileFunctionInContext(v8::Local<v8::Context>, v8::ScriptCompiler::Source*, unsigned long, v8::Local<v8::String>*, unsigned long, v8::Local<v8::Object>*, v8::ScriptCompiler::CompileOptions, v8::ScriptCompiler::NoCacheReason) ()
#7  0x000000000099b514 in node::native_module::NativeModuleLoader::LookupAndCompile(v8::Local<v8::Context>, char const*, std::vector<v8::Local<v8::String>, std::allocator<v8::Local<v8::String> > >*, node::Environment*) ()
#8  0x000000000099be10 in node::native_module::NativeModuleLoader::CompileFunction(v8::FunctionCallbackInfo<v8::Value> const&) ()
#9  0x0000000000bd26d3 in v8::internal::MaybeHandle<v8::internal::Object> v8::internal::(anonymous namespace)::HandleApiCallHelper<false>(v8::internal::Isolate*, v8::internal::Handle<v8::internal::HeapObject>, v8::internal::Handle<v8::internal::HeapObject>, v8::internal::Handle<v8::internal::FunctionTemplateInfo>, v8::internal::Handle<v8::internal::Object>, v8::internal::BuiltinArguments) ()
#10 0x0000000000bd3dff in v8::internal::Builtin_HandleApiCall(int, unsigned long*, v8::internal::Isolate*) ()
#11 0x0000000001a5046b in Builtins_CEntry_Return1_DontSaveFPRegs_ArgvOnStack_NoBuiltinExit ()
#12 0x00000000019b0517 in Builtins_InterpreterEntryTrampoline ()
#13 0x00000b244ce804d1 in ?? ()
#14 0x00000d8c165a5459 in ?? ()
richardlau commented 5 years ago

Sorry, ./node is symlinked to out/Release/node. With the actual debug build (out/Debug/node) this is the backtrace:

(gdb) bt
#0  v8::internal::Log::MessageBuilder::operator<< <v8::internal::Name> (this=0x7efc64529910, name=name@entry=...) at ../deps/v8/src/log-utils.cc:249
#1  0x0000000001739df4 in v8::internal::Logger::CodeCreateEvent (this=0x7efc4800e340, tag=v8::internal::CodeEventListener::SCRIPT_TAG, code=..., shared=..., source=...,
    line=<optimized out>, column=1) at ../deps/v8/src/log.cc:1260
#2  0x0000000001a50be1 in CodeCreateEvent (column=1, line=1, source=..., shared=..., code=..., tag=v8::internal::CodeEventListener::SCRIPT_TAG, this=0x7efc4800df00)
    at ../deps/v8/src/code-events.h:142
#3  v8::internal::CodeSerializer::Deserialize (isolate=0x7efc48000a30, cached_data=<optimized out>, source=..., origin_options=...)
    at ../deps/v8/src/snapshot/code-serializer.cc:299
#4  0x00000000011db324 in v8::internal::Compiler::GetWrappedFunction (source=..., arguments=..., arguments@entry=..., context=context@entry=..., script_details=...,
    origin_options=..., cached_data=cached_data@entry=0x7efc480e6fc0, compile_options=<optimized out>, no_cache_reason=<optimized out>) at ../deps/v8/src/compiler.cc:1840
#5  0x000000000106b615 in v8::ScriptCompiler::CompileFunctionInContext (v8_context=..., source=<optimized out>, arguments_count=<optimized out>, arguments=<optimized out>,
    context_extension_count=<optimized out>, context_extensions=<optimized out>, options=<optimized out>, no_cache_reason=<optimized out>) at ../deps/v8/src/api.cc:2507
#6  0x0000000000e07c8e in node::native_module::NativeModuleLoader::LookupAndCompile (this=0x3847940 <node::per_process::native_module_loader>, context=...,
    id=0x7efc6452a208 "path", parameters=0x7efc6452a140, optional_env=0x7efc480a2790) at ../src/node_native_module.cc:231
#7  0x0000000000e077e6 in node::native_module::NativeModuleLoader::CompileAsModule (env=0x7efc480a2790, id=0x7efc6452a208 "path") at ../src/node_native_module.cc:181
#8  0x0000000000e07675 in node::native_module::NativeModuleLoader::CompileFunction (args=...) at ../src/node_native_module.cc:166
#9  0x00000000010fee18 in v8::internal::FunctionCallbackArguments::Call (this=this@entry=0x7efc6452a770, handler=..., handler@entry=...) at ../deps/v8/src/api-arguments-inl.h:146
#10 0x0000000001101675 in v8::internal::(anonymous namespace)::HandleApiCallHelper<false> (isolate=isolate@entry=0x7efc48000a30, function=..., function@entry=..., new_target=...,
    new_target@entry=..., fun_data=..., receiver=..., receiver@entry=..., args=...) at ../deps/v8/src/builtins/builtins-api.cc:109
#11 0x0000000001102ab9 in v8::internal::Builtin_Impl_HandleApiCall (args=..., isolate=isolate@entry=0x7efc48000a30) at ../deps/v8/src/builtins/builtins-api.cc:139
#12 0x00000000011032cf in v8::internal::Builtin_HandleApiCall (args_length=6, args_object=0x7efc6452a990, isolate=0x7efc48000a30) at ../deps/v8/src/builtins/builtins-api.cc:127
#13 0x00000000024f6a72 in Builtins_CEntry_Return1_DontSaveFPRegs_ArgvOnStack_NoBuiltinExit () at ../deps/v8/src/code-stub-assembler.cc:128
#14 0x00000000022028a8 in Builtins_InterpreterEntryTrampoline () at ../deps/v8/src/builtins/builtins-collections-gen.cc:1308
#15 0x00000ac7043004d1 in ?? ()
...
richardlau commented 5 years ago

This is also failing in Travis, e.g. https://travis-ci.com/nodejs/node/jobs/188072187#L4408-L4428

Also crashed on:

* [188385292](https://travis-ci.com/nodejs/node/jobs/188385292)

* [188461493](https://travis-ci.com/nodejs/node/jobs/188461493)

* [188540977](https://travis-ci.com/nodejs/node/jobs/188540977)

* [188606170](https://travis-ci.com/nodejs/node/jobs/188606170)

This test is currently marked flaky: https://github.com/nodejs/node/blob/2c73868b0471fbd4038f500d076df056cbf697fe/test/parallel/parallel.status#L12-L13

Travis is supposed to be skipping flaky tests (but obviously isn't): https://github.com/nodejs/node/blob/805e614ba7401a04330d8f2878d61213268d6e8b/.travis.yml#L34

I've spotted why (the Makefile resets PARALLEL_ARGS because we export JOBS: https://github.com/nodejs/node/blob/2c73868b0471fbd4038f500d076df056cbf697fe/Makefile#L20-L24

Fix in https://github.com/nodejs/node/pull/26968.

gireeshpunathil commented 5 years ago

thanks @richardlau for the help. Now I am able to reproduce it easily. key change is the gcc compiler (gcc 7.3) one of the child process is crashing with this stack:

(gdb) where
#0  0x00000000010613a0 in v8::internal::Log::MessageBuilder::AppendSymbolNameDetails(v8::internal::String, bool) ()
#1  0x000000000106151f in v8::internal::Log::MessageBuilder::AppendSymbolName(v8::internal::Symbol) ()
#2  0x0000000001061e3c in v8::internal::Log::MessageBuilder& v8::internal::Log::MessageBuilder::operator<< <v8::internal::Name>(v8::internal::Name) ()
#3  0x000000000106c77c in v8::internal::Logger::CodeCreateEvent(v8::internal::CodeEventListener::LogEventsAndTags, v8::internal::AbstractCode, v8::internal::SharedFunctionInfo, v8::internal::Name, int, int) ()
#4  0x000000000126f36e in v8::internal::CodeSerializer::Deserialize(v8::internal::Isolate*, v8::internal::ScriptData*, v8::internal::Handle<v8::internal::String>, v8::ScriptOriginOptions) ()
#5  0x0000000000c54c85 in v8::internal::Compiler::GetWrappedFunction(v8::internal::Handle<v8::internal::String>, v8::internal::Handle<v8::internal::FixedArray>, v8::internal::Handle<v8::internal::Context>, v8::internal::Compiler::ScriptDetails const&, v8::ScriptOriginOptions, v8::internal::ScriptData*, v8::ScriptCompiler::CompileOptions, v8::ScriptCompiler::NoCacheReason) ()
...

which is https://github.com/nodejs/node/blob/01b112a031a2f0c6390e541bbaaeb8e0d85c9e0b/deps/v8/src/log-utils.cc#L177

(gdb) x/20i AppendSymbolNameDetails
   0x1061380 <AppendSymbolNameDetails>:    test   rsi,rsi
   0x1061383 <AppendSymbolNameDetails+3>:  je     0x10613d8 <AppendSymbolNameDetails+88> // if (str.is_null()) return;
   0x1061385 <AppendSymbolNameDetails+5>:  push   rbp
   0x1061386 <AppendSymbolNameDetails+6>:  mov    rbp,rsp
   0x1061389 <AppendSymbolNameDetails+9>:  push   r14
   0x106138b <AppendSymbolNameDetails+11>: push   r13
   0x106138d <AppendSymbolNameDetails+13>: push   r12
   0x106138f <AppendSymbolNameDetails+15>: push   rbx  // prologue
   0x1061390 <AppendSymbolNameDetails+16>: mov    r12d,0x1000
   0x1061396 <AppendSymbolNameDetails+22>: mov    rbx,rsi
   0x1061399 <AppendSymbolNameDetails+25>: mov    r13,rdi
   0x106139c <AppendSymbolNameDetails+28>: sub    rsp,0x10
=> 0x10613a0 <AppendSymbolNameDetails+32>: cmp    DWORD PTR [rsi+0xb],0x1000 // if (str->length() > 0x1000)
(gdb) i r rsi
rsi            0x736a2e68746170 32486369875222896
(gdb) 

str is bad and we need to inspect the callers list to see who passed bad value. I will do that.

btw, wery interesting to note the new optimization by the compiler - business logic is inserted before the method prologue!

gireeshpunathil commented 5 years ago

the problem is originated in this loop, though I cannot pinpoint to the issue: https://github.com/nodejs/node/blob/afad3b443e5aa90a771031ea9045182d1ecbf11a/deps/v8/src/snapshot/code-serializer.cc#L271-L280

the SharedFunctionInfo is potentially corrupt, from where the bad name field was extracted, that led to the crash.

the current state of info field in this literator loop is:

(gdb) p info
$9 = {<v8::internal::HeapObject> = {<v8::internal::Object> = {static kHeaderSize = 0, 
      ptr_ = 62469974952545}, <No data fields>}, static kNoSharedNameSentinel = {
    static kHeaderSize = 0, ptr_ = 0}, static kEntriesStart = 0, static kContextOffset = 0, 
  static kCachedCodeOffset = 1, static kEntryLength = 2, static kInitialLength = 2, 
  static kNotFound = -1, static kInvalidLength = 65535, static kDontAdaptArgumentsSentinel = 65535, 
  static kMaximumFunctionTokenOffset = 65534, static kFunctionTokenOutOfRange = 65535, 
  static kUniqueIdFieldSize = 4, static kAlignedSize = 56}

Any clues? @nodejs/v8 @addaleax

gireeshpunathil commented 5 years ago

smallest repro:

const { Worker, parentPort  } = require('worker_threads');

const ping = `
const { parentPort  } = require('worker_threads');
parentPort.on('message', (m) => {});
`;

const w = new Worker(ping, { eval: true });
$ node --prof p.js
Segmentation fault (core dumped)

crash is consistent. For release build, in v8::internal::Log::MessageBuilder::AppendSymbolNameDetails and for debug builds in v8::internal::Log::MessageBuilder::operator<<.

const v8 = require('v8')
let d  = 'x'.repeat(1024)
let s = ''
setInterval(() => {
  s = v8.serialize(d);
  d = v8.deserialize(s);
}, 100);

the above does not crash, which leads to suspect --prof's interaction with worker?

addaleax commented 5 years ago

@gireeshpunathil What platform is that on? I can’t reproduce it myself, unfortunately :/

Trott commented 5 years ago

https://ci.nodejs.org/job/node-test-binary-windows/24984/COMPILED_BY=vs2017-x86,RUNNER=win2012r2,RUN_SUBSET=3/console

test-azure_msft-win2012r2-x64-3

00:26:46 not ok 552 parallel/test-worker-prof # TODO : Fix flaky test
00:26:46   ---
00:26:46   duration_ms: 1.139
00:26:46   severity: flaky
00:26:46   exitcode: 1
00:26:46   stack: |-
00:26:46     assert.js:343
00:26:46         throw err;
00:26:46         ^
00:26:46     
00:26:46     AssertionError [ERR_ASSERTION]: 10 >= 15
00:26:46         at Object.<anonymous> (c:\workspace\node-test-binary-windows\test\parallel\test-worker-prof.js:76:5)
00:26:46         at Module._compile (internal/modules/cjs/loader.js:837:30)
00:26:46         at Object.Module._extensions..js (internal/modules/cjs/loader.js:848:10)
00:26:46         at Module.load (internal/modules/cjs/loader.js:704:32)
00:26:46         at tryModuleLoad (internal/modules/cjs/loader.js:636:12)
00:26:46         at Function.Module._load (internal/modules/cjs/loader.js:628:3)
00:26:46         at Function.Module.runMain (internal/modules/cjs/loader.js:903:10)
00:26:46         at internal/main/run_main_module.js:21:11
00:26:46   ...
gireeshpunathil commented 5 years ago

@addaleax - I can recreate in linux x64, very easily. With gcc 7.3.0:

Using built-in specs.
COLLECT_GCC=/local/gcc-7.3.0/bin/gcc
COLLECT_LTO_WRAPPER=/local/gcc-7.3.0/libexec/gcc/x86_64-pc-linux-gnu/7.3.0/lto-wrapper
Target: x86_64-pc-linux-gnu
Configured with: gcc-7.3.0/configure --prefix=/local/gcc-7.3.0 --enable-shared --with-system-zlib --enable-threads=posix --enable-__cxa_atexit --enable-checking --enable-gnu-indirect-function --enable-languages=c,c++ --disable-bootstrap
Thread model: posix
gcc version 7.3.0 (GCC) 

Earlier I was using 4.9.x with which this crash was never observed.

If you cannot reproduce locally, but if there is anything you want me to try out pls let me know, as my reproduce frequency is pretty strong - even I can get it with debug build / launch it through gdb! the part I am unable to connect is the desierailizer's interaction with profiler, worker thread ...

EDIT: edited to remove some of the custom fs references from the gcc -v output which are not relevant.

addaleax commented 5 years ago

@gireeshpunathil That’s interesting, because that’s the exact same compiler version that I use :smile:

gireeshpunathil commented 5 years ago

ok! so my next attempt would be to containerize this crash so that anyone can debug.

gireeshpunathil commented 5 years ago

unfortunately I am unable to crash this in my container. I will look alternatives.

addaleax commented 5 years ago

@gireeshpunathil I’m not sure if that would help, because it looks like it’s hard to figure out the cause of the crash, but something you can always do is to share your Node.js binary and possibly a core dump from a reproduction? That could give us clues as to what possible relevant variables could be (and enable other people to inspect the core dump as well)

gireeshpunathil commented 5 years ago

@addaleax - agree, will upload soon, thnx.

gireeshpunathil commented 5 years ago

gist does not allow more than 100mb data. any guidance where else can I upload? [ my other alternative was git bisect to see if it brings anything conclusively ]

gireeshpunathil commented 5 years ago

git bisect threw a good and bad news:

115f0f5a57f50f6b039f28a56910207f92df116d

ideas?

BridgeAR commented 5 years ago

@gireeshpunathil that sounds like a similar issue to why https://github.com/nodejs/node/commit/ed5e69d7e6eaf5e887ff48aa3fe941d394c44846 was reverted.

addaleax commented 5 years ago
  • in my system, this commit is conclusively proving to be the source
  • unable to correlate the changes with the issue, unable to recreate in any other system

ideas?

I agree, it doesn’t seem like any of this should be possible :smile: I don’t know if it’s feasible for you to tinker with the changes in 115f0f5a57f50f6b039f28a56910207f92df116d to narrow it down a bit (as far as I can tell it’s only one modified source file)?

Also, this reminds me a bit about a bug we’ve collectively debugged last year in Berlin, where changing the length of a script would alter the memory layout of the process sufficiently for it to either crash or not crash…

Trott commented 5 years ago

https://ci.nodejs.org/job/node-test-commit-linux-containered/11931/nodes=ubuntu1604_sharedlibs_withoutintl_x64/console

test-digitalocean-ubuntu1604_sharedlibs_container-x64-10

00:06:39 not ok 2232 parallel/test-worker-prof # TODO : Fix flaky test
00:06:39   ---
00:06:39   duration_ms: 0.511
00:06:39   severity: flaky
00:06:39   exitcode: 1
00:06:39   stack: |-
00:06:39     assert.js:87
00:06:39       throw new AssertionError(obj);
00:06:39       ^
00:06:39     
00:06:39     AssertionError [ERR_ASSERTION]: child exited with signal: { status: null,
00:06:39       signal: 'SIGSEGV',
00:06:39       output: [ null, '', '' ],
00:06:39       pid: 16422,
00:06:39       stdout: '',
00:06:39       stderr: '' }
00:06:39         at Object.<anonymous> (/home/iojs/build/workspace/node-test-commit-linux-containered/test/parallel/test-worker-prof.js:57:10)
00:06:39         at Module._compile (internal/modules/cjs/loader.js:801:30)
00:06:39         at Object.Module._extensions..js (internal/modules/cjs/loader.js:812:10)
00:06:39         at Module.load (internal/modules/cjs/loader.js:671:32)
00:06:39         at tryModuleLoad (internal/modules/cjs/loader.js:603:12)
00:06:39         at Function.Module._load (internal/modules/cjs/loader.js:595:3)
00:06:39         at Function.Module.runMain (internal/modules/cjs/loader.js:865:10)
00:06:39         at internal/main/run_main_module.js:21:11
00:06:39   ...
gireeshpunathil commented 5 years ago

whatever funny this may look, here is a patch which I believe is going to recreate in any environment (derived from the findings in git bisect and subsequent isolation)

--- a/lib/internal/validators.js
+++ b/lib/internal/validators.js
@@ -121,6 +121,12 @@ const validateBuffer = hideStackFrames((buffer, name = 'buffer') => {
   }
 });

+const foo1 = function(){}
+const foo2 = function(){}
+const foo3 = function(){}
+const foo4 = function(){}
+const foo5 = function(){}
+
 module.exports = {
   isInt32,
   isUint32,
@@ -130,5 +136,6 @@ module.exports = {
   validateInt32,
   validateUint32,
   validateString,
-  validateNumber
+  validateNumber,
+  foo1, foo2, foo3, foo4, foo5
 };
Trott commented 5 years ago

https://ci.nodejs.org/job/node-test-commit-linux-containered/12008/nodes=ubuntu1604_sharedlibs_withoutintl_x64/console

test-softlayer-ubuntu1604_sharedlibs_container-x64-2

00:06:51 not ok 2238 parallel/test-worker-prof # TODO : Fix flaky test
00:06:51   ---
00:06:51   duration_ms: 0.669
00:06:51   severity: flaky
00:06:51   exitcode: 1
00:06:51   stack: |-
00:06:51     assert.js:87
00:06:51       throw new AssertionError(obj);
00:06:51       ^
00:06:51     
00:06:51     AssertionError [ERR_ASSERTION]: child exited with signal: { status: null,
00:06:51       signal: 'SIGSEGV',
00:06:51       output: [ null, '', '' ],
00:06:51       pid: 4544,
00:06:51       stdout: '',
00:06:51       stderr: '' }
00:06:51         at Object.<anonymous> (/home/iojs/build/workspace/node-test-commit-linux-containered/test/parallel/test-worker-prof.js:57:10)
00:06:51         at Module._compile (internal/modules/cjs/loader.js:824:30)
00:06:51         at Object.Module._extensions..js (internal/modules/cjs/loader.js:835:10)
00:06:51         at Module.load (internal/modules/cjs/loader.js:693:32)
00:06:51         at Function.Module._load (internal/modules/cjs/loader.js:620:12)
00:06:51         at Function.Module.runMain (internal/modules/cjs/loader.js:888:10)
00:06:51         at internal/main/run_main_module.js:17:11
00:06:51   ...
Trott commented 5 years ago

Times out after 30 seconds as coded into the test, but when it runs successfully on the same platform, it takes just over 1 second (so the problem is more significant that "oh the test just needs more time" just in case that's what someone might surmise).

https://ci.nodejs.org/job/node-test-commit-plinux/26285/nodes=centos7-ppcle/console

test-osuosl-centos7-ppc64_le-2

E00:22:40 not ok 2453 parallel/test-worker-prof # TODO : Fix flaky test
00:22:40   ---
00:22:40   duration_ms: 30.234
00:22:40   severity: flaky
00:22:40   exitcode: 1
00:22:40   stack: |-
00:22:40     assert.js:89
00:22:40       throw new AssertionError(obj);
00:22:40       ^
00:22:40     
00:22:40     AssertionError [ERR_ASSERTION]: child exited with signal: {
00:22:40       error: Error: spawnSync /home/iojs/build/workspace/node-test-commit-plinux/nodes/centos7-ppcle/out/Release/node ETIMEDOUT
00:22:40           at Object.spawnSync (internal/child_process.js:1047:20)
00:22:40           at spawnSync (child_process.js:584:24)
00:22:40           at Object.<anonymous> (/home/iojs/build/workspace/node-test-commit-plinux/nodes/centos7-ppcle/test/parallel/test-worker-prof.js:51:23)
00:22:40           at Module._compile (internal/modules/cjs/loader.js:777:30)
00:22:40           at Object.Module._extensions..js (internal/modules/cjs/loader.js:788:10)
00:22:40           at Module.load (internal/modules/cjs/loader.js:640:32)
00:22:40           at Function.Module._load (internal/modules/cjs/loader.js:555:12)
00:22:40           at Function.Module.runMain (internal/modules/cjs/loader.js:840:10)
00:22:40           at internal/main/run_main_module.js:17:11 {
00:22:40         errno: 'ETIMEDOUT',
00:22:40         code: 'ETIMEDOUT',
00:22:40         syscall: 'spawnSync ' +
00:22:40           '/home/iojs/build/workspace/node-test-commit-plinux/nodes/centos7-ppcle/out/Release/node',
00:22:40         path: '/home/iojs/build/workspace/node-test-commit-plinux/nodes/centos7-ppcle/out/Release/node',
00:22:40         spawnargs: [
00:22:40           '--prof',
00:22:40           '/home/iojs/build/workspace/node-test-commit-plinux/nodes/centos7-ppcle/test/parallel/test-worker-prof.js',
00:22:40           'child'
00:22:40         ]
00:22:40       },
00:22:40       status: null,
00:22:40       signal: 'SIGTERM',
00:22:40       output: [ null, '', '' ],
00:22:40       pid: 18589,
00:22:40       stdout: '',
00:22:40       stderr: ''
00:22:40     }
00:22:40         at Object.<anonymous> (/home/iojs/build/workspace/node-test-commit-plinux/nodes/centos7-ppcle/test/parallel/test-worker-prof.js:57:10)
00:22:40         at Module._compile (internal/modules/cjs/loader.js:777:30)
00:22:40         at Object.Module._extensions..js (internal/modules/cjs/loader.js:788:10)
00:22:40         at Module.load (internal/modules/cjs/loader.js:640:32)
00:22:40         at Function.Module._load (internal/modules/cjs/loader.js:555:12)
00:22:40         at Function.Module.runMain (internal/modules/cjs/loader.js:840:10)
00:22:40         at internal/main/run_main_module.js:17:11 {
00:22:40       generatedMessage: false,
00:22:40       code: 'ERR_ASSERTION',
00:22:40       actual: 'SIGTERM',
00:22:40       expected: null,
00:22:40       operator: 'strictEqual'
00:22:40     }
00:22:40   ...