kriszyp / lmdb-js

Simple, efficient, ultra-fast, scalable data store wrapper for LMDB

Segmentation fault after "Commit failed" and "Operation not permitted" errors #294

Open falcon027 opened 2 weeks ago

falcon027 commented 2 weeks ago

During a lengthy test run, I encountered a segmentation fault in my application. I believe this may be related to the preceding errors, specifically "Commit failed" and "Operation not permitted." For further details, please refer to the attached logs.

2024-06-13T00:01:36.779505881Z Error: Commit failed (see commitError for details)
2024-06-13T00:01:36.779507971Z at rejectCommit (file:///usr/src/app/node_modules/lmdb/write.js:604:17)
2024-06-13T00:01:36.779510381Z at resolveWrites (file:///usr/src/app/node_modules/lmdb/write.js:566:40)
2024-06-13T00:01:36.779513051Z at Function.<anonymous> (file:///usr/src/app/node_modules/lmdb/write.js:492:4)
2024-06-13T00:01:36.779515571Z This error originated either by throwing inside of an async function without a catch block, or by rejecting a promise which was not handled with .catch(). The promise rejected with the reason:
2024-06-13T00:01:36.779517711Z Error: Commit failed (see commitError for details)
2024-06-13T00:01:36.779520021Z at rejectCommit (file:///usr/src/app/node_modules/lmdb/write.js:604:17)
2024-06-13T00:01:36.779522721Z at resolveWrites (file:///usr/src/app/node_modules/lmdb/write.js:566:40)
2024-06-13T00:01:36.779524391Z at Function.<anonymous> (file:///usr/src/app/node_modules/lmdb/write.js:492:4)
2024-06-13T00:01:40.686225166Z overlapping duplicate entry 120647
2024-06-13T00:01:40.686363136Z mdb_page_alloc error
2024-06-13T00:01:40.686386388Z mdb_page_touch error
2024-06-13T00:01:40.686390118Z do_write error 0 4294967295
2024-06-13T00:01:40.692081140Z Error: Operation not permitted
2024-06-13T00:01:40.692092320Z at Function.<anonymous> (file:///usr/src/app/node_modules/lmdb/write.js:508:7) {
2024-06-13T00:01:40.692095760Z code: 1
2024-06-13T00:01:40.692097890Z }
2024-06-13T00:01:40.696061342Z This error originated either by throwing inside of an async function without a catch block, or by rejecting a promise which was not handled with .catch(). The promise rejected with the reason:
2024-06-13T00:01:40.696079692Z Error: Commit failed (see commitError for details)
2024-06-13T00:01:40.696082872Z at rejectCommit (file:///usr/src/app/node_modules/lmdb/write.js:604:17)
2024-06-13T00:01:40.696085433Z at resolveWrites (file:///usr/src/app/node_modules/lmdb/write.js:566:40)
2024-06-13T00:01:40.696087993Z at Function.<anonymous> (file:///usr/src/app/node_modules/lmdb/write.js:492:4)
2024-06-13T00:01:40.696090633Z This error originated either by throwing inside of an async function without a catch block, or by rejecting a promise which was not handled with .catch(). The promise rejected with the reason:
2024-06-13T00:01:40.696093373Z Error: Operation not permitted
2024-06-13T00:01:40.696096083Z at Function.<anonymous> (file:///usr/src/app/node_modules/lmdb/write.js:508:7)
2024-06-13T00:01:40.696109253Z This error originated either by throwing inside of an async function without a catch block, or by rejecting a promise which was not handled with .catch(). The promise rejected with the reason:
2024-06-13T00:01:40.696112153Z Error: Commit failed (see commitError for details)
2024-06-13T00:01:40.696114343Z at rejectCommit (file:///usr/src/app/node_modules/lmdb/write.js:604:17)
2024-06-13T00:01:40.696116873Z at resolveWrites (file:///usr/src/app/node_modules/lmdb/write.js:566:40)
2024-06-13T00:01:40.696119343Z at Function.<anonymous> (file:///usr/src/app/node_modules/lmdb/write.js:492:4)
2024-06-13T00:01:40.737963330Z ../dependencies/lmdb/libraries/liblmdb/mdb.c:3206: Assertion 'mp->mp_pgno != pgno' failed in mdb_page_touch()
2024-06-13T00:01:40.742039184Z PID 1 received SIGSEGV for address: 0x0
2024-06-13T00:01:40.743371289Z /usr/src/app/node_modules/segfault-handler/build/Release/segfault-handler.node(+0x31e8)[0x7f102e38e1e8]
2024-06-13T00:01:40.743386039Z /lib/x86_64-linux-gnu/libc.so.6(+0x3bfd0)[0x7f102dec0fd0]
2024-06-13T00:01:40.743389249Z /lib/x86_64-linux-gnu/libc.so.6(abort+0x170)[0x7f102deab50f]
2024-06-13T00:01:40.743391810Z /usr/src/app/node_modules/@lmdb/lmdb-linux-x64/node.abi115.glibc.node(+0x9b766)[0x7f102c4c6766]
2024-06-13T00:01:40.743394650Z /usr/src/app/node_modules/@lmdb/lmdb-linux-x64/node.abi115.glibc.node(+0x83e9f)[0x7f102c4aee9f]
2024-06-13T00:01:40.743397330Z /usr/src/app/node_modules/@lmdb/lmdb-linux-x64/node.abi115.glibc.node(+0x8f0e3)[0x7f102c4ba0e3]
2024-06-13T00:01:40.743399480Z /usr/src/app/node_modules/@lmdb/lmdb-linux-x64/node.abi115.glibc.node(+0x8f667)[0x7f102c4ba667]
2024-06-13T00:01:40.743401650Z /usr/src/app/node_modules/@lmdb/lmdb-linux-x64/node.abi115.glibc.node(+0x983df)[0x7f102c4c33df]
2024-06-13T00:01:40.743404030Z /usr/src/app/node_modules/@lmdb/lmdb-linux-x64/node.abi115.glibc.node(+0x543dd)[0x7f102c47f3dd]
2024-06-13T00:01:40.743406550Z /usr/src/app/node_modules/@lmdb/lmdb-linux-x64/node.abi115.glibc.node(+0x548fe)[0x7f102c47f8fe]
2024-06-13T00:01:40.743408780Z /usr/src/app/node_modules/@lmdb/lmdb-linux-x64/node.abi115.glibc.node(+0x54793)[0x7f102c47f793]
2024-06-13T00:01:40.743833754Z  main process(_ZZN4node14ThreadPoolWork12ScheduleWorkEvENUlP9uv_work_sE_4_FUNES2_+0x59)[0xc612d9]
2024-06-13T00:01:40.744563082Z  main process[0x187e7b0]
2024-06-13T00:01:40.744572382Z /lib/x86_64-linux-gnu/libc.so.6(+0x89044)[0x7f102df0e044]
2024-06-13T00:01:40.744585142Z /lib/x86_64-linux-gnu/libc.so.6(__clone+0x40)[0x7f102df8d880]
2024-06-13T00:01:40.749126462Z 11 0 [
2024-06-13T00:01:40.749137312Z '/usr/src/app/node_modules/segfault-handler/build/Release/segfault-handler.node(+0x31e8) [0x7f102e38e1e8]',
2024-06-13T00:01:40.749140852Z '/lib/x86_64-linux-gnu/libc.so.6(+0x3bfd0) [0x7f102dec0fd0]',
2024-06-13T00:01:40.749143602Z '/lib/x86_64-linux-gnu/libc.so.6(abort+0x170) [0x7f102deab50f]',
2024-06-13T00:01:40.749146592Z '/usr/src/app/node_modules/@lmdb/lmdb-linux-x64/node.abi115.glibc.node(+0x9b766) [0x7f102c4c6766]',
2024-06-13T00:01:40.749183173Z '/usr/src/app/node_modules/@lmdb/lmdb-linux-x64/node.abi115.glibc.node(+0x83e9f) [0x7f102c4aee9f]',
2024-06-13T00:01:40.749186083Z '/usr/src/app/node_modules/@lmdb/lmdb-linux-x64/node.abi115.glibc.node(+0x8f0e3) [0x7f102c4ba0e3]',
2024-06-13T00:01:40.749188483Z '/usr/src/app/node_modules/@lmdb/lmdb-linux-x64/node.abi115.glibc.node(+0x8f667) [0x7f102c4ba667]',
2024-06-13T00:01:40.749191133Z '/usr/src/app/node_modules/@lmdb/lmdb-linux-x64/node.abi115.glibc.node(+0x983df) [0x7f102c4c33df]',
2024-06-13T00:01:40.749193653Z '/usr/src/app/node_modules/@lmdb/lmdb-linux-x64/node.abi115.glibc.node(+0x543dd) [0x7f102c47f3dd]',
2024-06-13T00:01:40.749196043Z '/usr/src/app/node_modules/@lmdb/lmdb-linux-x64/node.abi115.glibc.node(+0x548fe) [0x7f102c47f8fe]',
2024-06-13T00:01:40.749199043Z '/usr/src/app/node_modules/@lmdb/lmdb-linux-x64/node.abi115.glibc.node(+0x54793) [0x7f102c47f793]',
2024-06-13T00:01:40.749202233Z ' main process(_ZZN4node14ThreadPoolWork12ScheduleWorkEvENUlP9uv_work_sE_4_FUNES2_+0x59) [0xc612d9]',
2024-06-13T00:01:40.749204713Z ' main process() [0x187e7b0]',
2024-06-13T00:01:40.749207113Z '/lib/x86_64-linux-gnu/libc.so.6(+0x89044) [0x7f102df0e044]',
2024-06-13T00:01:40.749209343Z '/lib/x86_64-linux-gnu/libc.so.6(__clone+0x40) [0x7f102df8d880]'
2024-06-13T00:01:40.749211743Z ]
2024-06-13T00:01:40.749680598Z [main] The main process has stopped.

If anyone has any insights into the cause of this issue, I would appreciate hearing from you. Thank you in advance for your assistance.

lmdb-js v3.0.12

kriszyp commented 2 weeks ago

Do you know if this error occurs in 3.0.11? (sounds like it may be difficult to reproduce?)

falcon027 commented 2 weeks ago

I have checked the logs, and 3.0.11 has it as well. Could this be related to low free space or memory?

kriszyp commented 2 weeks ago

I wouldn't expect this to be due to low free space or memory, but I can't be certain. If you have any suggestions for how to reproduce it, I will gladly give it a try. I can also try to add some more debugging information for this case.

falcon027 commented 2 weeks ago

This may be unrelated, but I got another error message tonight:

```
SyntaxError: Unexpected token '', "�"... is not valid JSON
  File "", in JSON.parse
  File "/usr/src/app/node_modules/lmdb/read.js", line 771, col 23, in Object.next
    value = JSON.parse(value);
  File "/usr/src/app/node_modules/lmdb/util/RangeIterable.js", line 38, col 35, in Object.next
    iteratorResult = iterator.next();
  File "/usr/src/app/node_modules/lmdb/util/RangeIterable.js", line 38, col 35, in Object.next
    iteratorResult = iterator.next();
  File "/usr/src/app/node_modules/lmdb/util/RangeIterable.js", line 324, col 24, in next
    result = iterator.next();
```

falcon027 commented 1 week ago

I have identified the source of the problem as a corrupt database entry. I have included the code to reproduce both errors in the Bug.js file. When performing a range read of the file, it returns some correct objects until it encounters an invalid one. Writing is not functional. Unfortunately, I am unaware of how the incorrect record entered the database. However, there should be a way to recover from such a corrupted state. Please find attached a link to the corrupted database file, called "relationGraph", as well as the JavaScript code to reproduce the errors.

https://drive.google.com/drive/folders/1hYe_GtScpX5SFI95qNbP8NRlfpMAXJ3u?usp=sharing
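
For reference, a minimal sketch (an assumption about the setup, not the attached Bug.js) of the read behavior described above: with encoding: "json", iterating a range yields valid entries until the corrupt one, where decoding throws from the iterator and the rest of the range becomes unreachable.

```js
import { open } from "lmdb";

// Assumes the corrupted "relationGraph" database from the link above.
const db = open({ path: "./relationGraph", encoding: "json" });

// Drive the range iterator manually so the decode error can be caught
// per entry instead of aborting a for..of loop.
const iterator = db.getRange()[Symbol.iterator]();
while (true) {
    let result;
    try {
        result = iterator.next();
    } catch (err) {
        // JSON.parse throws here for the corrupt value; the iterator may not
        // be able to advance past it, so iteration stops.
        console.error("undecodable entry:", err.message);
        break;
    }
    if (result.done) break;
    console.log("ok:", result.value.key);
}
```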

falcon027 commented 1 week ago

I am still looking for the reason for the broken record and have found a segfault that seems to occur under heavy load. I have attached a script to reproduce the error.

import {open} from "lmdb";
import SegfaultHandler from "segfault-handler";
import fs from "fs";

SegfaultHandler.registerHandler("crash.log", function (signal, address, stack) {
    console.log(signal, address, stack);
});

fs.rmSync("./corruptionTest", {recursive: true, force: true});

async function main() {

    let newObject = open({
        path: "./corruptionTest2",
        compression: false,
        dupSort: false,
        cache: false,
        maxDbs: 1,
        commitDelay: 50,
        noMemInit: true,
        keyEncoding: "ordered-binary",
        remapChunks: false,
        pageSize: 16384,
        useWritemap: false,
        safeRestore: true,
        encoding: "json",

    });

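    // Issue puts in a tight synchronous loop; the promises returned by
    // put() are never awaited, so control never returns to the event loop.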
    while (true) {
        newObject.put("S:c1edfab0-ca28-4124-822b-bdc5a78cf527", ["bb5ed3951fc9"])
    }

    // output =>
    // PID 63939 received SIGSEGV for address: 0x0
    // 0   segfault-handler.node               0x0000000107ea2538 _ZL16segfault_handleriP9__siginfoPv + 252
    // 1   libsystem_platform.dylib            0x000000019c317584 _sigtramp + 56
    // 2   node.abi115.node                    0x000000010865858c mdb_cursor_put + 4608
    // 3   node.abi115.node                    0x00000001086647e4 mdb_put + 364
    // 4   node.abi115.node                    0x0000000108612318 _ZN11WriteWorker8DoWritesEP7MDB_txnP7EnvWrapPjPS_ + 2352
    // 5   node.abi115.node                    0x0000000108612968 _ZN11WriteWorker5WriteEv + 344
    // 6   node.abi115.node                    0x00000001086127f4 _Z8do_writeP10napi_env__Pv + 36
    // 7   node                                0x00000001051958d8 _ZZN4node14ThreadPoolWork12ScheduleWorkEvENKUlP9uv_work_sE_clES2_ + 236
    // 8   node                                0x0000000105195638 _ZZN4node14ThreadPoolWork12ScheduleWorkEvENUlP9uv_work_sE_8__invokeES2_ + 24
    // 9   libuv.1.dylib                       0x0000000108277178 worker + 224
    // 10  libsystem_pthread.dylib             0x000000019c2e6f94 _pthread_start + 136
    // 11  libsystem_pthread.dylib             0x000000019c2e1d34 thread_start + 8
}
main()

kriszyp commented 1 week ago

When I attempt this test, it just runs out of memory, because the loop never yields to the event turn and so the promises from all the puts that have been made can never resolve. It is necessary to yield to the event turn to allow commits to complete.
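
For illustration, a minimal sketch (not code from this thread) of the same loop rewritten so each put is awaited, yielding to the event turn so the batched commits can resolve:

```js
while (true) {
    // Awaiting the promise returned by put() hands control back to the
    // event loop, allowing lmdb-js to complete the pending batch commit.
    await newObject.put("S:c1edfab0-ca28-4124-822b-bdc5a78cf527", ["bb5ed3951fc9"]);
}
```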

Hopefully in the next week or so I can take a look at possible mechanisms for tolerating a corrupted entry, but generally such corruption is too deep in the structure to recover from.

falcon027 commented 1 week ago

Thanks for your input. I did some more debugging in my test environment and found that roughly 14,000 operations were performed in the second before the crash occurred. This is what I was trying to simulate with the script in my earlier comment, but you are right that a while loop without an await is probably not the right way to simulate a high load.

So I tried another approach to simulating the load, as seen in the code below. I am not sure whether it is correct this time either, but I have found that if I turn off eventTurnBatching, the program does not crash.

I am not sure what to make of this; if you have any thoughts, let me know.

import {open} from "lmdb";
import SegfaultHandler from "segfault-handler";
import fs from "fs";

SegfaultHandler.registerHandler("crash.log", function (signal, address, stack) {
    console.log(signal, address, stack);
});

fs.rmSync("./corruptionTest", {recursive: true, force: true});

async function main() {

    let newObject = open({
        path: "./corruptionTest",
        compression: false,
        dupSort: false,
        cache: false,
        maxDbs: 1,
        commitDelay: 0,
        eventTurnBatching: true, // <= with true it crashes; with false it does not
        noMemInit: true,
        keyEncoding: "ordered-binary",
        remapChunks: false,
        pageSize: 16384,
        useWritemap: false,
        safeRestore: true,
        encoding: "json",

    });

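    // Start 15,000 concurrent 10 ms intervals, each issuing an awaited put;
    // in aggregate this attempts on the order of 1.5 million puts per second.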
    for (let i = 0; i < 15000; i++) {
        setInterval(async () => {
            await newObject.put("S:c1edfab0-ca28-4124-822b-bdc5a78cf527", ["bb5ed3951fc9"])
        }, 10)
    }

    // output =>
    // PID 63939 received SIGSEGV for address: 0x0
    // 0   segfault-handler.node               0x0000000107ea2538 _ZL16segfault_handleriP9__siginfoPv + 252
    // 1   libsystem_platform.dylib            0x000000019c317584 _sigtramp + 56
    // 2   node.abi115.node                    0x000000010865858c mdb_cursor_put + 4608
    // 3   node.abi115.node                    0x00000001086647e4 mdb_put + 364
    // 4   node.abi115.node                    0x0000000108612318 _ZN11WriteWorker8DoWritesEP7MDB_txnP7EnvWrapPjPS_ + 2352
    // 5   node.abi115.node                    0x0000000108612968 _ZN11WriteWorker5WriteEv + 344
    // 6   node.abi115.node                    0x00000001086127f4 _Z8do_writeP10napi_env__Pv + 36
    // 7   node                                0x00000001051958d8 _ZZN4node14ThreadPoolWork12ScheduleWorkEvENKUlP9uv_work_sE_clES2_ + 236
    // 8   node                                0x0000000105195638 _ZZN4node14ThreadPoolWork12ScheduleWorkEvENUlP9uv_work_sE_8__invokeES2_ + 24
    // 9   libuv.1.dylib                       0x0000000108277178 worker + 224
    // 10  libsystem_pthread.dylib             0x000000019c2e6f94 _pthread_start + 136
    // 11  libsystem_pthread.dylib             0x000000019c2e1d34 thread_start + 8
}

main()

kriszyp commented 1 week ago

How long does this script take to trigger a segfault? I have tried running it for about 24 hours without any crash.

falcon027 commented 1 week ago

I just ran it again and on my system it only takes a few seconds before it crashes.