openzfsonosx / openzfs

OpenZFS on Linux and FreeBSD, and, macOS. This is where development for macOS happens.
https://github.com/openzfs/zfs/wiki
Other
144 stars 11 forks source link

sa.sa_magic panic in zfs_space_delta_cb() #5

Open lundman opened 4 years ago

lundman commented 4 years ago

System information

Type Version/Name
Distribution Name macOS
Distribution Version I've not given it a version yet
Linux Kernel heh funny
Architecture x64
ZFS Version 0.8.4
SPL Version

Describe the problem you're observing

panic(cpu 1 caller 0xffffff7fadbeaa0e): VERIFY3(sa.sa_magic == SA_MAGIC) failed (1591864135 == 3100762)

Backtrace (CPU 1), Frame : Return Address
0xffffff80321338c0 : 0xffffff802b29052d mach_kernel : _handle_debugger_trap + 0x46d
0xffffff8032133910 : 0xffffff802b424826 mach_kernel : _kdp_i386_trap + 0x196
0xffffff8032133950 : 0xffffff802b413e42 mach_kernel : _kernel_trap + 0x3c2
0xffffff80321339c0 : 0xffffff802b22bb40 mach_kernel : trap_from_kernel + 0x26
0xffffff80321339e0 : 0xffffff802b28fc25 mach_kernel : _panic_trap_to_debugger + 0x1c5
0xffffff8032133b10 : 0xffffff802b28fa43 mach_kernel : _panic + 0x63
0xffffff8032133b80 : 0xffffff7fadbeaa0e net.lundman.zfs : _spl_panic + 0xce
0xffffff8032133d30 : 0xffffff7fadb00ce5 net.lundman.zfs : _zfs_space_delta_cb + 0x185
0xffffff8032133dc0 : 0xffffff7fad9d0817 net.lundman.zfs : _dmu_objset_userquota_get_ids + 0x367
0xffffff8032133e30 : 0xffffff7fad9f5773 net.lundman.zfs : _dnode_sync + 0x183
0xffffff8032133eb0 : 0xffffff7fad9d24ad net.lundman.zfs : _dmu_objset_sync_dnodes + 0xfd
0xffffff8032133ef0 : 0xffffff7fad9cfcbc net.lundman.zfs : _sync_dnodes_task + 0x3c
0xffffff8032133f20 : 0xffffff7fadc00ac1 net.lundman.zfs : _taskq_thread + 0x3a1
0xffffff8032133fa0 : 0xffffff802b22b0ce mach_kernel : _call_continuation + 0x2e

Describe how to reproduce the problem

zfs_upgrade/zfs_upgrade_004_pos triggers it easily.

Include any warning/errors/backtraces from the system logs

Possible explanation:

https://github.com/openzfs/zfs/issues/2025#issuecomment-40459546

Since it is relatively easy to trigger on macOS, it may help to debug it, should upstream people be into it.

lundman commented 4 years ago

OH ok, so @ahrens added commit https://github.com/openzfs/zfs/commit/7bcb7f0840d1857370dd1f9ee0ad48f9b7939dfd which changes the layout a little. The panic now looks like:

panic(cpu 1 caller): VERIFY3(sa.sa_magic == SA_MAGIC) failed (1591938303 == 3100762)

Backtrace (CPU 1), Frame : Return Address
0xffffff90b81f38b0 : mach_kernel : _handle_debugger_trap + 0x46d
0xffffff90b81f3900 : mach_kernel : _kdp_i386_trap + 0x196
0xffffff90b81f3940 : mach_kernel : _kernel_trap + 0x3c2
0xffffff90b81f39b0 : mach_kernel : trap_from_kernel + 0x26
0xffffff90b81f39d0 : mach_kernel : _panic_trap_to_debugger + 0x1c5
0xffffff90b81f3b00 : mach_kernel : _panic + 0x63
0xffffff90b81f3b70 : net.lundman.zfs : _spl_panic + 0xce
0xffffff90b81f3d20 : net.lundman.zfs : _zpl_get_file_info + 0x18c
0xffffff90b81f3db0 : net.lundman.zfs : _dmu_objset_userquota_get_ids + 0x2d4
0xffffff90b81f3e30 : net.lundman.zfs : _dnode_sync + 0x183
0xffffff90b81f3eb0 : net.lundman.zfs : _dmu_objset_sync_dnodes + 0xfd
0xffffff90b81f3ef0 : net.lundman.zfs : _sync_dnodes_task + 0x3c
0xffffff90b81f3f20 : net.lundman.zfs : _taskq_thread + 0x3a1
0xffffff90b81f3fa0 : mach_kernel : _call_continuation + 0x2e
-> 2160     error = file_cbs[os->os_phys->os_type](dn->dn_bonustype, data, &zfi);
(lldb) p before
(boolean_t) $2 = 0
(lldb) p data
(void *) $3 = 0xffffff90abc54800
(lldb) p os->os_phys->os_type
(uint64_t) $4 = 2
(lldb) p *dn
(dnode_t) $1 = {
  dn_struct_rwlock = {
    rw_lock = ([0] = 2701131776, [1] = 0, [2] = 3135097598, [3] = 3135097598)
    rw_owner = 0x0000000000000000
    rw_readers = 0
    rw_pad = 305419896
  }
  dn_link = {
    list_next = 0xffffff90ab445680
    list_prev = 0xffffff90ab17e358
  }
  dn_objset = 0xffffff90aada6b80
  dn_object = 20
  dn_dbuf = 0xffffff90a956e4d8
  dn_handle = 0xffffff90ab588b60
  dn_phys = 0xffffff90a9026800
  dn_type = DMU_OT_PLAIN_FILE_CONTENTS
  dn_bonuslen = 176
  dn_bonustype = ','
  dn_nblkptr = '\x01'
  dn_checksum = '\0'
  dn_compress = '\0'
  dn_nlevels = '\x01'
  dn_indblkshift = '\x11'
  dn_datablkshift = '\f'
  dn_moved = '\0'
  dn_datablkszsec = 8
  dn_datablksz = 4096
  dn_maxblkid = 0
  dn_next_type = ([0] = '\0', [1] = '\0', [2] = '\0', [3] = '\0')
  dn_num_slots = '\x01'
  dn_next_nblkptr = ([0] = '\0', [1] = '\0', [2] = '\0', [3] = '\0')
  dn_next_nlevels = ([0] = '\0', [1] = '\0', [2] = '\0', [3] = '\0')
  dn_next_indblkshift = ([0] = '\0', [1] = '\0', [2] = '\0', [3] = '\0')
  dn_next_bonustype = ([0] = '\0', [1] = '\0', [2] = ',', [3] = '\0')
  dn_rm_spillblk = ([0] = '\0', [1] = '\0', [2] = '\0', [3] = '\0')
  dn_next_bonuslen = ([0] = 0, [1] = 0, [2] = 176, [3] = 0)
  dn_next_blksz = ([0] = 0, [1] = 0, [2] = 0, [3] = 0)
  dn_next_maxblkid = ([0] = 0, [1] = 0, [2] = 0, [3] = 0)
  dn_dbufs_count = 1
  dn_dirty_link = {
    [0] = {
      list_next = 0x0000000000000000
      list_prev = 0x0000000000000000
    }
    [1] = {
      list_next = 0xffffff90aa8baa88
      list_prev = 0xffffff90aa483d38
    }
    [2] = {
      list_next = 0xffffff90acf59db8
      list_prev = 0xffffff90a862ab38
    }
    [3] = {
      list_next = 0x0000000000000000
      list_prev = 0x0000000000000000
    }
  }
  dn_mtx = {
    m_owner = 0x0000000000000000
    m_lock = {
      opaque = ([0] = 0, [1] = 0, [2] = 0, [3] = 4294967295)
    }
    leak = 0xffffff8024a27a00
    m_initialised = 1311768467463790320
  }
  dn_dirty_records = {
    [0] = {
      list_size = 392
      list_offset = 0
      list_head = {
        list_next = 0xffffff90aa7f07b8
        list_prev = 0xffffff90aa7f07b8
      }
    }
    [1] = {
      list_size = 392
      list_offset = 0
      list_head = {
        list_next = 0xffffff90a8708700
        list_prev = 0xffffff90acc4d640
      }
    }
    [2] = {
      list_size = 392
      list_offset = 0
      list_head = {
        list_next = 0xffffff90abfd6d00
        list_prev = 0xffffff90abfd6d00
      }
    }
    [3] = {
      list_size = 392
      list_offset = 0
      list_head = {
        list_next = 0xffffff90aa7f0818
        list_prev = 0xffffff90aa7f0818
      }
    }
  }
  dn_free_ranges = {
    [0] = 0x0000000000000000
    [1] = 0x0000000000000000
    [2] = 0x0000000000000000
    [3] = 0x0000000000000000
  }
  dn_allocated_txg = 208
  dn_free_txg = 0
  dn_assigned_txg = 0
  dn_dirty_txg = 230
  dn_notxholds = (pad = 13465141656313252606)
  dn_nodnholds = (pad = 13465141656313252606)
  dn_dirtyctx = DN_DIRTY_OPEN
  dn_dirtyctx_firstset = 0xffffff90ab46c518
  dn_tx_holds = (rc_count = 0)
  dn_holds = (rc_count = 7)
  dn_dbufs_mtx = {
    m_owner = 0x0000000000000000
    m_lock = {
      opaque = ([0] = 0, [1] = 0, [2] = 0, [3] = 4294967295)
    }
    leak = 0xffffff8024a28540
    m_initialised = 1311768467463790320
  }
  dn_dbufs = {
    avl_root = 0xffffff90aaa91d50
    avl_compar = 0xffffff7f8d5f5c80 (zfs`dbuf_compare at dnode.c:88)
    avl_offset = 64
    avl_numnodes = 1
    avl_size = 296
  }
  dn_bonus = 0xffffff90ab46c518
  dn_have_spill = 0
  dn_zio = 0xffffff90a8cf7500
  dn_oldused = 4096
  dn_oldflags = 11
  dn_olduid = 0
  dn_oldgid = 0
  dn_oldprojid = 0
  dn_newuid = 0
  dn_newgid = 0
  dn_newprojid = 0
  dn_id_flags = 5
  dn_zfetch = {
    zf_lock = {
      m_owner = 0x0000000000000000
      m_lock = {
        opaque = ([0] = 0, [1] = 0, [2] = 0, [3] = 4294967295)
      }
      leak = 0xffffff801edc8860
      m_initialised = 1311768467463790320
    }
    zf_stream = {
      list_size = 88
      list_offset = 72
      list_head = {
        list_next = 0xffffff90aa7f0980
        list_prev = 0xffffff90aa7f0980
      }
    }
    zf_dnode = 0xffffff90aa7f0668
  }
}
lundman commented 4 years ago

Inviting @avg-I should they have time to assist

bu7cher commented 3 years ago

I have core dump from similar panic on FreeBSD 12.0:

Unread portion of the kernel message buffer:
panic: solaris assert: sa.sa_magic == 0x2F505A (0xb656f0e0 == 0x2f505a), file: /usr/src/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c, line: 609
cpuid = 4
time = 1623060642
KDB: stack backtrace:
db_trace_self_wrapper() at db_trace_self_wrapper+0x2b/frame 0xfffffe3fd9321780
vpanic() at vpanic+0x194/frame 0xfffffe3fd93217e0
panic() at panic+0x43/frame 0xfffffe3fd9321840
assfail3() at assfail3+0x2c/frame 0xfffffe3fd9321860
zfs_space_delta_cb() at zfs_space_delta_cb+0x100/frame 0xfffffe3fd93218a0
dmu_objset_userquota_get_ids() at dmu_objset_userquota_get_ids+0x1b7/frame 0xfffffe3fd93218f0
dnode_sync() at dnode_sync+0xa6/frame 0xfffffe3fd9321980
sync_dnodes_task() at sync_dnodes_task+0x92/frame 0xfffffe3fd93219c0
taskq_run() at taskq_run+0x10/frame 0xfffffe3fd93219e0
taskqueue_run_locked() at taskqueue_run_locked+0x147/frame 0xfffffe3fd9321a40
taskqueue_thread_loop() at taskqueue_thread_loop+0xb8/frame 0xfffffe3fd9321a70
fork_exit() at fork_exit+0x86/frame 0xfffffe3fd9321ab0
fork_trampoline() at fork_trampoline+0xe/frame 0xfffffe3fd9321ab0
--- trap 0, rip = 0, rsp = 0, rbp = 0 ---
Uptime: 14d8h32m53s
Dumping 14912 out of 261996 MB:..1%..11%..21%..31%..41%..51%..61%..71%..81%..91%

__curthread () at ./machine/pcpu.h:232
232     ./machine/pcpu.h: No such file or directory.
(kgdb) #0  __curthread () at ./machine/pcpu.h:232
#1  doadump (textdump=1) at /usr/src/sys/kern/kern_shutdown.c:318
#2  0xffffffff80aa31c3 in kern_reboot (howto=260)
    at /usr/src/sys/kern/kern_shutdown.c:386
#3  0xffffffff80aa36ae in vpanic (fmt=<optimized out>, ap=0xfffffe3fd9321820)
    at /usr/src/sys/kern/kern_shutdown.c:779
#4  0xffffffff80aa34d3 in panic (fmt=<unavailable>)
    at /usr/src/sys/kern/kern_shutdown.c:710
#5  0xffffffff822df23c in assfail3 (a=<unavailable>, lv=<unavailable>, 
    op=<unavailable>, rv=<unavailable>, f=<unavailable>, l=<optimized out>)
    at /usr/src/sys/cddl/compat/opensolaris/kern/opensolaris_cmn_err.c:91
#6  0xffffffff8209e9c0 in zfs_space_delta_cb (bonustype=<optimized out>, 
    data=0xfffff803e64b2f40, userp=0xfffff8052a3b2278, 
    groupp=0xfffff8052a3b2280)
    at /usr/src/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c:609
#7  0xffffffff8200d917 in dmu_objset_userquota_get_ids (
    dn=0xfffff8052a3b2000, before=0, tx=<optimized out>)
    at /usr/src/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c:1592
#8  0xffffffff82015396 in dnode_sync (dn=0xfffff8052a3b2000, 
    tx=0xfffff803ae49e800)
    at /usr/src/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c:570
#9  0xffffffff8200d192 in dmu_objset_sync_dnodes (list=0xfffff801763bb420, 
   tx=0xfffff803ae49e800)
    at /usr/src/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c:1093
#10 sync_dnodes_task (arg=0xfffff82419cdd260)
    at /usr/src/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c:1160
#11 0xffffffff81fd3750 in taskq_run (arg=0xfffff801c66d08d0, 
    pending=<unavailable>)
    at /usr/src/sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c:109
#12 0xffffffff80afc0e7 in taskqueue_run_locked (queue=0xfffff8016b01cd00)
    at /usr/src/sys/kern/subr_taskqueue.c:463
#13 0xffffffff80afd2c8 in taskqueue_thread_loop (arg=<optimized out>)
    at /usr/src/sys/kern/subr_taskqueue.c:755
#14 0xffffffff80a65796 in fork_exit (
    callout=0xffffffff80afd210 <taskqueue_thread_loop>, 
    arg=0xfffff801281c3f20, frame=0xfffffe3fd9321ac0)
    at /usr/src/sys/kern/kern_fork.c:1038
smalukav commented 3 years ago

We're encountering a similar panic. @lundman Can you also print dn.dn_bonus and dn.dn_bonus.db_last_dirty if it's still reproducible?