Open lundman opened 4 years ago
OH ok, so @ahrens added commit https://github.com/openzfs/zfs/commit/7bcb7f0840d1857370dd1f9ee0ad48f9b7939dfd which changes the layout a little. The panic now looks like:
panic(cpu 1 caller): VERIFY3(sa.sa_magic == SA_MAGIC) failed (1591938303 == 3100762)
Backtrace (CPU 1), Frame : Return Address
0xffffff90b81f38b0 : mach_kernel : _handle_debugger_trap + 0x46d
0xffffff90b81f3900 : mach_kernel : _kdp_i386_trap + 0x196
0xffffff90b81f3940 : mach_kernel : _kernel_trap + 0x3c2
0xffffff90b81f39b0 : mach_kernel : trap_from_kernel + 0x26
0xffffff90b81f39d0 : mach_kernel : _panic_trap_to_debugger + 0x1c5
0xffffff90b81f3b00 : mach_kernel : _panic + 0x63
0xffffff90b81f3b70 : net.lundman.zfs : _spl_panic + 0xce
0xffffff90b81f3d20 : net.lundman.zfs : _zpl_get_file_info + 0x18c
0xffffff90b81f3db0 : net.lundman.zfs : _dmu_objset_userquota_get_ids + 0x2d4
0xffffff90b81f3e30 : net.lundman.zfs : _dnode_sync + 0x183
0xffffff90b81f3eb0 : net.lundman.zfs : _dmu_objset_sync_dnodes + 0xfd
0xffffff90b81f3ef0 : net.lundman.zfs : _sync_dnodes_task + 0x3c
0xffffff90b81f3f20 : net.lundman.zfs : _taskq_thread + 0x3a1
0xffffff90b81f3fa0 : mach_kernel : _call_continuation + 0x2e
-> 2160 error = file_cbs[os->os_phys->os_type](dn->dn_bonustype, data, &zfi);
(lldb) p before
(boolean_t) $2 = 0
(lldb) p data
(void *) $3 = 0xffffff90abc54800
(lldb) p os->os_phys->os_type
(uint64_t) $4 = 2
(lldb) p *dn
(dnode_t) $1 = {
dn_struct_rwlock = {
rw_lock = ([0] = 2701131776, [1] = 0, [2] = 3135097598, [3] = 3135097598)
rw_owner = 0x0000000000000000
rw_readers = 0
rw_pad = 305419896
}
dn_link = {
list_next = 0xffffff90ab445680
list_prev = 0xffffff90ab17e358
}
dn_objset = 0xffffff90aada6b80
dn_object = 20
dn_dbuf = 0xffffff90a956e4d8
dn_handle = 0xffffff90ab588b60
dn_phys = 0xffffff90a9026800
dn_type = DMU_OT_PLAIN_FILE_CONTENTS
dn_bonuslen = 176
dn_bonustype = ','
dn_nblkptr = '\x01'
dn_checksum = '\0'
dn_compress = '\0'
dn_nlevels = '\x01'
dn_indblkshift = '\x11'
dn_datablkshift = '\f'
dn_moved = '\0'
dn_datablkszsec = 8
dn_datablksz = 4096
dn_maxblkid = 0
dn_next_type = ([0] = '\0', [1] = '\0', [2] = '\0', [3] = '\0')
dn_num_slots = '\x01'
dn_next_nblkptr = ([0] = '\0', [1] = '\0', [2] = '\0', [3] = '\0')
dn_next_nlevels = ([0] = '\0', [1] = '\0', [2] = '\0', [3] = '\0')
dn_next_indblkshift = ([0] = '\0', [1] = '\0', [2] = '\0', [3] = '\0')
dn_next_bonustype = ([0] = '\0', [1] = '\0', [2] = ',', [3] = '\0')
dn_rm_spillblk = ([0] = '\0', [1] = '\0', [2] = '\0', [3] = '\0')
dn_next_bonuslen = ([0] = 0, [1] = 0, [2] = 176, [3] = 0)
dn_next_blksz = ([0] = 0, [1] = 0, [2] = 0, [3] = 0)
dn_next_maxblkid = ([0] = 0, [1] = 0, [2] = 0, [3] = 0)
dn_dbufs_count = 1
dn_dirty_link = {
[0] = {
list_next = 0x0000000000000000
list_prev = 0x0000000000000000
}
[1] = {
list_next = 0xffffff90aa8baa88
list_prev = 0xffffff90aa483d38
}
[2] = {
list_next = 0xffffff90acf59db8
list_prev = 0xffffff90a862ab38
}
[3] = {
list_next = 0x0000000000000000
list_prev = 0x0000000000000000
}
}
dn_mtx = {
m_owner = 0x0000000000000000
m_lock = {
opaque = ([0] = 0, [1] = 0, [2] = 0, [3] = 4294967295)
}
leak = 0xffffff8024a27a00
m_initialised = 1311768467463790320
}
dn_dirty_records = {
[0] = {
list_size = 392
list_offset = 0
list_head = {
list_next = 0xffffff90aa7f07b8
list_prev = 0xffffff90aa7f07b8
}
}
[1] = {
list_size = 392
list_offset = 0
list_head = {
list_next = 0xffffff90a8708700
list_prev = 0xffffff90acc4d640
}
}
[2] = {
list_size = 392
list_offset = 0
list_head = {
list_next = 0xffffff90abfd6d00
list_prev = 0xffffff90abfd6d00
}
}
[3] = {
list_size = 392
list_offset = 0
list_head = {
list_next = 0xffffff90aa7f0818
list_prev = 0xffffff90aa7f0818
}
}
}
dn_free_ranges = {
[0] = 0x0000000000000000
[1] = 0x0000000000000000
[2] = 0x0000000000000000
[3] = 0x0000000000000000
}
dn_allocated_txg = 208
dn_free_txg = 0
dn_assigned_txg = 0
dn_dirty_txg = 230
dn_notxholds = (pad = 13465141656313252606)
dn_nodnholds = (pad = 13465141656313252606)
dn_dirtyctx = DN_DIRTY_OPEN
dn_dirtyctx_firstset = 0xffffff90ab46c518
dn_tx_holds = (rc_count = 0)
dn_holds = (rc_count = 7)
dn_dbufs_mtx = {
m_owner = 0x0000000000000000
m_lock = {
opaque = ([0] = 0, [1] = 0, [2] = 0, [3] = 4294967295)
}
leak = 0xffffff8024a28540
m_initialised = 1311768467463790320
}
dn_dbufs = {
avl_root = 0xffffff90aaa91d50
avl_compar = 0xffffff7f8d5f5c80 (zfs`dbuf_compare at dnode.c:88)
avl_offset = 64
avl_numnodes = 1
avl_size = 296
}
dn_bonus = 0xffffff90ab46c518
dn_have_spill = 0
dn_zio = 0xffffff90a8cf7500
dn_oldused = 4096
dn_oldflags = 11
dn_olduid = 0
dn_oldgid = 0
dn_oldprojid = 0
dn_newuid = 0
dn_newgid = 0
dn_newprojid = 0
dn_id_flags = 5
dn_zfetch = {
zf_lock = {
m_owner = 0x0000000000000000
m_lock = {
opaque = ([0] = 0, [1] = 0, [2] = 0, [3] = 4294967295)
}
leak = 0xffffff801edc8860
m_initialised = 1311768467463790320
}
zf_stream = {
list_size = 88
list_offset = 72
list_head = {
list_next = 0xffffff90aa7f0980
list_prev = 0xffffff90aa7f0980
}
}
zf_dnode = 0xffffff90aa7f0668
}
}
Inviting @avg-I, should they have time to assist.
I have core dump from similar panic on FreeBSD 12.0:
Unread portion of the kernel message buffer:
panic: solaris assert: sa.sa_magic == 0x2F505A (0xb656f0e0 == 0x2f505a), file: /usr/src/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c, line: 609
cpuid = 4
time = 1623060642
KDB: stack backtrace:
db_trace_self_wrapper() at db_trace_self_wrapper+0x2b/frame 0xfffffe3fd9321780
vpanic() at vpanic+0x194/frame 0xfffffe3fd93217e0
panic() at panic+0x43/frame 0xfffffe3fd9321840
assfail3() at assfail3+0x2c/frame 0xfffffe3fd9321860
zfs_space_delta_cb() at zfs_space_delta_cb+0x100/frame 0xfffffe3fd93218a0
dmu_objset_userquota_get_ids() at dmu_objset_userquota_get_ids+0x1b7/frame 0xfffffe3fd93218f0
dnode_sync() at dnode_sync+0xa6/frame 0xfffffe3fd9321980
sync_dnodes_task() at sync_dnodes_task+0x92/frame 0xfffffe3fd93219c0
taskq_run() at taskq_run+0x10/frame 0xfffffe3fd93219e0
taskqueue_run_locked() at taskqueue_run_locked+0x147/frame 0xfffffe3fd9321a40
taskqueue_thread_loop() at taskqueue_thread_loop+0xb8/frame 0xfffffe3fd9321a70
fork_exit() at fork_exit+0x86/frame 0xfffffe3fd9321ab0
fork_trampoline() at fork_trampoline+0xe/frame 0xfffffe3fd9321ab0
--- trap 0, rip = 0, rsp = 0, rbp = 0 ---
Uptime: 14d8h32m53s
Dumping 14912 out of 261996 MB:..1%..11%..21%..31%..41%..51%..61%..71%..81%..91%
__curthread () at ./machine/pcpu.h:232
232 ./machine/pcpu.h: No such file or directory.
(kgdb) #0 __curthread () at ./machine/pcpu.h:232
#1 doadump (textdump=1) at /usr/src/sys/kern/kern_shutdown.c:318
#2 0xffffffff80aa31c3 in kern_reboot (howto=260)
at /usr/src/sys/kern/kern_shutdown.c:386
#3 0xffffffff80aa36ae in vpanic (fmt=<optimized out>, ap=0xfffffe3fd9321820)
at /usr/src/sys/kern/kern_shutdown.c:779
#4 0xffffffff80aa34d3 in panic (fmt=<unavailable>)
at /usr/src/sys/kern/kern_shutdown.c:710
#5 0xffffffff822df23c in assfail3 (a=<unavailable>, lv=<unavailable>,
op=<unavailable>, rv=<unavailable>, f=<unavailable>, l=<optimized out>)
at /usr/src/sys/cddl/compat/opensolaris/kern/opensolaris_cmn_err.c:91
#6 0xffffffff8209e9c0 in zfs_space_delta_cb (bonustype=<optimized out>,
data=0xfffff803e64b2f40, userp=0xfffff8052a3b2278,
groupp=0xfffff8052a3b2280)
at /usr/src/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c:609
#7 0xffffffff8200d917 in dmu_objset_userquota_get_ids (
dn=0xfffff8052a3b2000, before=0, tx=<optimized out>)
at /usr/src/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c:1592
#8 0xffffffff82015396 in dnode_sync (dn=0xfffff8052a3b2000,
tx=0xfffff803ae49e800)
at /usr/src/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c:570
#9 0xffffffff8200d192 in dmu_objset_sync_dnodes (list=0xfffff801763bb420,
tx=0xfffff803ae49e800)
at /usr/src/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c:1093
#10 sync_dnodes_task (arg=0xfffff82419cdd260)
at /usr/src/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c:1160
#11 0xffffffff81fd3750 in taskq_run (arg=0xfffff801c66d08d0,
pending=<unavailable>)
at /usr/src/sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c:109
#12 0xffffffff80afc0e7 in taskqueue_run_locked (queue=0xfffff8016b01cd00)
at /usr/src/sys/kern/subr_taskqueue.c:463
#13 0xffffffff80afd2c8 in taskqueue_thread_loop (arg=<optimized out>)
at /usr/src/sys/kern/subr_taskqueue.c:755
#14 0xffffffff80a65796 in fork_exit (
callout=0xffffffff80afd210 <taskqueue_thread_loop>,
arg=0xfffff801281c3f20, frame=0xfffffe3fd9321ac0)
at /usr/src/sys/kern/kern_fork.c:1038
We're encountering a similar panic. @lundman Can you also print dn.dn_bonus and dn.dn_bonus.db_last_dirty if it's still reproducible?
System information
Describe the problem you're observing
Describe how to reproduce the problem
zfs_upgrade/zfs_upgrade_004_pos triggers it easily.
Include any warning/errors/backtraces from the system logs
Possible explanation:
https://github.com/openzfs/zfs/issues/2025#issuecomment-40459546
Since it is relatively easy to trigger on macOS, it may help to debug it, should upstream people be into it.