koverstreet / bcachefs-tools

http://bcachefs.org
GNU General Public License v2.0
124 stars 91 forks source link

[5ef62f] btree_node_read_work: rewriting node deadlock #144

Open DarkKirb opened 1 year ago

DarkKirb commented 1 year ago

I’m currently in the process of migrating an older bcachefs pool (version 16). It fscks without issues on 3c39b422acd3346321185be0ce263809e2a9a23f, corresponding to the kernel it was last mounted on. Now I am running another fsck on 5ef62f56ab50c5799f713e3a42f5c7ad7e8283d3 and it reports a lot of fixable errors (such as bad val size (48 < 56)), which are automatically applied without -p flag, which I assume are due to the FS upgrade.

However at some point it stops execution entirely, with the most recent message being:

btree_node_read_work: rewriting btree node at btree=alloc level=0 3:2107798:0 due to error

More of the log can be seen here: https://thermalpaste.technogothic.net/p/Mpcy02.txt

checking in GDB, it appears that https://github.com/koverstreet/bcachefs-tools/blob/5ef62f56ab50c5799f713e3a42f5c7ad7e8283d3/linux/sched.c#L41-L42 is trying to wait for the current task to no longer be running, which never occurs (likely because the task is waiting for itself to stop running?

output of bt full:

#0  0x00007fea2fff4d8d in syscall () from /nix/store/yaz7pyf0ah88g2v505l38n0f3wg2vzdj-glibc-2.37-8/lib/libc.so.6
No symbol table info available.
#1  0x0000000000426641 in futex (val3=0, uaddr2=0x0, timeout=0x0, val=<optimized out>, op=128, uaddr=<optimized out>) at /nix/store/jsn69rwvff82p8mkhznsc4c4c74s2ild-liburcu-0.14.0-dev/include/urcu/futex.h:81
No locals.
#2  schedule () at linux/sched.c:41
        v = <optimized out>
#3  0x0000000000426535 in __closure_sync (cl=cl@entry=0x7fff1f3f7eb0) at linux/closure.c:129
        s = {task = 0x179c2a0, done = 0}
#4  0x00000000004e4a06 in closure_sync (cl=0x7fff1f3f7eb0) at include/linux/closure.h:189
No locals.
#5  bch2_btree_update_start (trans=trans@entry=0x7fff1f3f80e0, path=path@entry=0x152f2b000, level=<optimized out>, split=<optimized out>, split@entry=true, flags=flags@entry=86) at libbcachefs/btree_update_interior.c:1170
        cl = {{{wq = 0x0, s = 0x7fff1f3f7e30, list = {next = 0x0}, fn = 0x4260c0 <closure_sync_fn>}, work = {data = {counter = 0}, entry = {next = 0x7fff1f3f7e30, prev = 0x0}, func = 0x4260c0 <closure_sync_fn>}}, parent = 0x0, remaining = {counter = 268435457}}
        c = 0x7fea2fe17000
        as = 0x15c06b000
        start_time = <optimized out>
        disk_res_flags = 1
        nr_nodes = {2, 0}
        update_level = <optimized out>
        journal_flags = <optimized out>
        ret = -2215
        restart_count = <optimized out>
        __PRETTY_FUNCTION__ = "bch2_btree_update_start"
#6  0x00000000004e89f7 in bch2_btree_split_leaf (trans=trans@entry=0x7fff1f3f80e0, path=0x152f2b000, flags=flags@entry=86) at libbcachefs/btree_update_interior.c:1739
        b = 0x316f01200
        as = <optimized out>
        l = <optimized out>
        ret = 0
#7  0x0000000000474798 in bch2_trans_commit_error (trace_ip=<optimized out>, ret=<optimized out>, i=0x152f2d000, flags=86, trans=0x7fff1f3f80e0) at libbcachefs/btree_update_leaf.c:959
        c = 0x7fea2fe17000
        c = <optimized out>
        __PRETTY_FUNCTION__ = "bch2_trans_commit_error"
        __func__ = "bch2_trans_commit_error"
        __wait = <optimized out>
        __ret = <optimized out>
        wb = <optimized out>
        __here = <optimized out>
        _ret = <optimized out>
#8  __bch2_trans_commit (trans=trans@entry=0x7fff1f3f80e0, flags=<optimized out>) at libbcachefs/btree_update_leaf.c:1208
        c = <optimized out>
        i = 0x152f2d000
        wb = <optimized out>
        u64s = <optimized out>
        ret = <optimized out>
#9  0x000000000047d4ff in bch2_trans_commit (flags=<optimized out>, journal_seq=0x0, disk_res=0x0, trans=0x7fff1f3f80e0) at libbcachefs/btree_update.h:121
No locals.
#10 bch2_journal_replay (c=c@entry=0x7fea2fe17000, start_seq=<optimized out>, end_seq=<optimized out>) at libbcachefs/recovery.c:663
        _restart_count = 1
        _ret = <optimized out>
        trans = {c = 0x7fea2fe17000, fn = 0x6e33c0 <__func__.5> "bch2_journal_replay", ref = {{{wq = 0x0, s = 0x0, list = {next = 0x0}, fn = 0x0}, work = {data = {counter = 0}, entry = {next = 0x0, prev = 0x0}, func = 0x0}}, parent = 0x0, remaining = {
              counter = 1073741825}}, list = {next = 0x0, prev = 0x0}, last_begin_time = 103840815472494, lock_may_not_fail = 0 '\000', lock_must_abort = 0 '\000', locking = 0x0, locking_wait = {list = {next = 0x0, prev = 0x0}, task = 0x179c2a0, 
            lock_want = SIX_LOCK_read, lock_acquired = false, start_time = 0}, srcu_idx = 0, fn_idx = 17 '\021', nr_sorted = 8 '\b', nr_updates = 1 '\001', nr_wb_updates = 0 '\000', wb_updates_size = 0 '\000', used_mempool = false, in_traverse_all = false, 
          paths_sorted = false, memory_allocation_failure = false, journal_transaction_names = false, journal_replay_not_finished = true, is_initial_gc = false, notrace_relock_fail = false, restarted = 0, restart_count = 1, last_begin_ip = 4707439, 
          last_restarted_ip = 0, srcu_lock_time = 103840815, extra_journal_res = 0, nr_max_paths = 27, paths_allocated = 255, mem_top = 0, mem_max = 0, mem_bytes = 0, mem = 0x0, sorted = "\000\005\003\001\a\002\006\004\004\004", '\000' <repeats 61 times>, 
          paths = 0x152f2b000, updates = 0x152f2d000, wb_updates = 0x0, hooks = 0x0, extra_journal_entries = {nr = 0, size = 0, data = 0x0}, journal_pin = 0x0, journal_res = {ref = false, idx = 0 '\000', u64s = 0, offset = 0, seq = 0}, journal_preres = {u64s = 0}, 
          journal_seq = 0x0, disk_res = 0x0, journal_u64s = 11, journal_preres_u64s = 0, fs_usage_deltas = 0x0}
        _ret = <optimized out>
        keys = 0x7fea2fe56d90
        keys_sorted = 0x7fea0959d000
        k = 0x7fe7bdec26e0
        j = 0x7fea2fe56440
        i = 297330
        ret = <optimized out>
        __func__ = "bch2_journal_replay"
#11 0x000000000047fd70 in bch2_fs_recovery (c=<optimized out>) at libbcachefs/recovery.c:1338
        metadata_only = false
        err = 0x6e34ed "journal replay failed"
        clean = 0x0
        last_journal_entry = <optimized out>
        last_seq = 63786339
        blacklist_seq = 63786424
        journal_seq = 63786432
        write_sb = false
        ret = 0
        __func__ = "bch2_fs_recovery"
#12 0x00000000004590be in bch2_fs_start (c=c@entry=0x7fea2fe17000) at libbcachefs/super.c:934
        mi = 0x17b32f0
        ca = <optimized out>
        now = <optimized out>
        i = <optimized out>
        ret = <optimized out>
        __PRETTY_FUNCTION__ = "bch2_fs_start"
#13 0x000000000045bdce in bch2_fs_open (devices=devices@entry=0x7fff1f3f8bc8, nr_devices=<optimized out>, nr_devices@entry=4, opts=...) at libbcachefs/super.c:1924
        sb = <optimized out>
        c = 0x7fea2fe17000
        mi = <optimized out>
        i = <optimized out>
        best_sb = <optimized out>
        errbuf = {buf = 0x0, size = 0, pos = 0, last_newline = 0, last_field = 0, indent = 0, atomic = 0 '\000', allocation_failure = false, heap_allocated = true, si_units = PRINTBUF_UNITS_2, human_readable_units = false, has_indent_or_tabstops = false, 
          suppress_indent_tabstop_handling = false, nr_tabstops = 0 '\000', cur_tabstop = 0 '\000', _tabstops = "\000\000\000\000\000"}
        ret = <optimized out>
        __func__ = "bch2_fs_open"
#14 0x00000000004349a9 in cmd_fsck (argc=4, argv=0x7fff1f3f8bc8, argv@entry=0x7fff1f3f8bb8) at cmd_fsck.c:96
        longopts = {{name = 0x6d9138 "ratelimit_errors", has_arg = 0, flag = 0x0, val = 114}, {name = 0x6d9149 "reconstruct_alloc", has_arg = 0, flag = 0x0, val = 82}, {name = 0x6f23e4 "help", has_arg = 0, flag = 0x0, val = 104}, {name = 0x0, has_arg = 0, flag = 0x0, 
            val = 0}}
        opts = {block_size_defined = 0, btree_node_size_defined = 0, errors_defined = 0, metadata_replicas_defined = 0, data_replicas_defined = 0, metadata_replicas_required_defined = 0, data_replicas_required_defined = 0, encoded_extent_max_defined = 0, 
          metadata_checksum_defined = 0, data_checksum_defined = 0, compression_defined = 0, background_compression_defined = 0, str_hash_defined = 0, metadata_target_defined = 0, foreground_target_defined = 0, background_target_defined = 0, 
          promote_target_defined = 0, erasure_code_defined = 0, inodes_32bit_defined = 0, shard_inode_numbers_defined = 0, inodes_use_key_cache_defined = 0, btree_node_mem_ptr_optimization_defined = 0, btree_write_buffer_size_defined = 0, 
          gc_reserve_percent_defined = 0, gc_reserve_bytes_defined = 0, root_reserve_percent_defined = 0, wide_macs_defined = 0, inline_data_defined = 0, acl_defined = 0, usrquota_defined = 0, grpquota_defined = 0, prjquota_defined = 0, degraded_defined = 1, 
          very_degraded_defined = 0, discard_defined = 0, verbose_defined = 1, journal_flush_delay_defined = 0, journal_flush_disabled_defined = 0, journal_reclaim_delay_defined = 0, move_bytes_in_flight_defined = 0, move_ios_in_flight_defined = 0, fsck_defined = 1, 
          fix_errors_defined = 1, ratelimit_errors_defined = 0, nochanges_defined = 0, norecovery_defined = 0, keep_journal_defined = 0, read_entire_journal_defined = 0, read_journal_only_defined = 0, journal_transaction_names_defined = 0, noexcl_defined = 0, 
          direct_io_defined = 0, sb_defined = 0, read_only_defined = 0, nostart_defined = 0, reconstruct_alloc_defined = 0, version_upgrade_defined = 0, buckets_nouse_defined = 0, project_defined = 0, nocow_defined = 0, nocow_enabled_defined = 0, 
          no_data_io_defined = 0, fs_size_defined = 0, bucket_defined = 0, durability_defined = 0, block_size = 0, btree_node_size = 0, errors = 0 '\000', metadata_replicas = 0 '\000', data_replicas = 0 '\000', metadata_replicas_required = 0 '\000', 
          data_replicas_required = 0 '\000', encoded_extent_max = 0, metadata_checksum = 0 '\000', data_checksum = 0 '\000', compression = 0 '\000', background_compression = 0 '\000', str_hash = 0 '\000', metadata_target = 0, foreground_target = 0, 
          background_target = 0, promote_target = 0, erasure_code = 0, inodes_32bit = 0 '\000', shard_inode_numbers = 0 '\000', inodes_use_key_cache = 0 '\000', btree_node_mem_ptr_optimization = 0 '\000', btree_write_buffer_size = 0, gc_reserve_percent = 0 '\000', 
          gc_reserve_bytes = 0, root_reserve_percent = 0 '\000', wide_macs = 0 '\000', inline_data = 0 '\000', acl = 0 '\000', usrquota = 0 '\000', grpquota = 0 '\000', prjquota = 0 '\000', degraded = 1 '\001', very_degraded = 0 '\000', discard = 0 '\000', 
          verbose = 1 '\001', journal_flush_delay = 0, journal_flush_disabled = 0 '\000', journal_reclaim_delay = 0, move_bytes_in_flight = 0, move_ios_in_flight = 0, fsck = 1 '\001', fix_errors = 3 '\003', ratelimit_errors = 0 '\000', nochanges = 0 '\000', 
          norecovery = 0 '\000', keep_journal = 0 '\000', read_entire_journal = 0 '\000', read_journal_only = 0 '\000', journal_transaction_names = 0 '\000', noexcl = 0 '\000', direct_io = 0 '\000', sb = 0, read_only = 0 '\000', nostart = 0 '\000', 
          reconstruct_alloc = 0 '\000', version_upgrade = 0 '\000', buckets_nouse = 0 '\000', project = 0 '\000', nocow = 0 '\000', nocow_enabled = 0 '\000', no_data_io = 0 '\000', fs_size = 0, bucket = 0, durability = 0 '\000'}
        i = 4
        opt = <optimized out>
        ret = 0
        c = <optimized out>
#15 0x000000000041122b in main (argc=<optimized out>, argv=0x7fff1f3f8bb8) at bcachefs.c:205
        cmd = 0x7fff1f3fa727 "fsck"
koverstreet commented 1 year ago

The "bad val size" was due to a fix that hadn't been merged into -tools - I just did so.

Will look at the rest when I have time.