389ds / 389-ds-base

The enterprise-class Open Source LDAP server for Linux
https://www.port389.org/
Other
213 stars 94 forks source link

Abort in Rust NDN cache #5522

Open mreynolds389 opened 2 years ago

mreynolds389 commented 2 years ago

Issue Description

Server crashes during automember rebuild task using debug build:

#0  0x00007f2a732a156c in __pthread_kill_implementation () from target:/lib64/libc.so.6
#1  0x00007f2a73254d16 in raise () from target:/lib64/libc.so.6
#2  0x00007f2a732287f3 in abort () from target:/lib64/libc.so.6
#3  0x00007f2a73a49207 in std::sys::unix::abort_internal () from target:/usr/lib64/dirsrv/libslapd.so.0
#4  0x00007f2a73a3e17e in rust_panic () from target:/usr/lib64/dirsrv/libslapd.so.0
#5  0x00007f2a73a3de77 in std::panicking::rust_panic_with_hook () from target:/usr/lib64/dirsrv/libslapd.so.0
#6  0x00007f2a73a3dc29 in std::panicking::begin_panic_handler::{{closure}} () from target:/usr/lib64/dirsrv/libslapd.so.0
#7  0x00007f2a73a3af44 in std::sys_common::backtrace::__rust_end_short_backtrace () from target:/usr/lib64/dirsrv/libslapd.so.0
#8  0x00007f2a73a3d989 in rust_begin_unwind () from target:/usr/lib64/dirsrv/libslapd.so.0
#9  0x00007f2a73825493 in core::panicking::panic_fmt () from target:/usr/lib64/dirsrv/libslapd.so.0
#10 0x00007f2a7382535d in core::panicking::panic () from target:/usr/lib64/dirsrv/libslapd.so.0
#11 0x00007f2a73ac558d in concread::arcache::ARCache<alloc::ffi::c_str::CString, alloc::ffi::c_str::CString>::evict<alloc::ffi::c_str::CString, alloc::ffi::c_str::CString> (self=0x7f2a72043e00, 
    cache=0x7f299e5fcc70, inner=0x7f2a72043e98, shared=0x7f2a72043e58, stats=0x7f299e5fc888, commit_txid=3270280)
    at /root/.cargo/registry/src/github.com-1ecc6299db9ec823/concread-0.2.21/src/arcache/mod.rs:1353
#12 0x00007f2a73ac6a16 in concread::arcache::ARCache<alloc::ffi::c_str::CString, alloc::ffi::c_str::CString>::commit<alloc::ffi::c_str::CString, alloc::ffi::c_str::CString> (self=0x7f2a72043e00, 
    cache=..., tlocal=..., hit=..., clear=false, init_above_watermark=true) at /root/.cargo/registry/src/github.com-1ecc6299db9ec823/concread-0.2.21/src/arcache/mod.rs:1457
#13 0x00007f2a73ac7f21 in concread::arcache::ARCacheWriteTxn<alloc::ffi::c_str::CString, alloc::ffi::c_str::CString>::commit<alloc::ffi::c_str::CString, alloc::ffi::c_str::CString> (self=...)
    at /root/.cargo/registry/src/github.com-1ecc6299db9ec823/concread-0.2.21/src/arcache/mod.rs:1506
#14 0x00007f2a73ac0e02 in concread::arcache::ARCache<alloc::ffi::c_str::CString, alloc::ffi::c_str::CString>::try_quiesce<alloc::ffi::c_str::CString, alloc::ffi::c_str::CString> (self=0x7f2a72043e00)
    at /root/.cargo/registry/src/github.com-1ecc6299db9ec823/concread-0.2.21/src/arcache/mod.rs:796
#15 0x00007f2a7390607f in concread::arcache::{impl#13}::drop<alloc::ffi::c_str::CString, alloc::ffi::c_str::CString> (self=0x7f2996243700)
    at /root/.cargo/registry/src/github.com-1ecc6299db9ec823/concread-0.2.21/src/arcache/mod.rs:1929
#16 0x00007f2a73902a17 in core::ptr::drop_in_place<concread::arcache::ARCacheReadTxn<alloc::ffi::c_str::CString, alloc::ffi::c_str::CString>> ()
    at /builddir/build/BUILD/rustc-1.64.0-src/library/core/src/ptr/mod.rs:487
#17 0x00007f2a73903efb in core::ptr::drop_in_place<rslapd::cache::ARCacheCharRead> () at /builddir/build/BUILD/rustc-1.64.0-src/library/core/src/ptr/mod.rs:487
#18 0x00007f2a7390448a in core::ptr::drop_in_place<alloc::boxed::Box<rslapd::cache::ARCacheCharRead, alloc::alloc::Global>> () at /builddir/build/BUILD/rustc-1.64.0-src/library/core/src/ptr/mod.rs:487
#19 0x00007f2a739001ef in rslapd::cache::cache_char_read_complete (read_txn=0x7f2996243700) at librslapd/src/cache.rs:94
#20 0x00007f2a73842287 in ndn_cache_add (dn=0x7f29ea3c8200 "uid=user16783client000.hv2.lab.eng.bos.redhat.com,cn=users,cn=accounts,dc=hv2,dc=lab,dc=eng,dc=bos,dc=redhat,dc=com", dn_len=115, 
    ndn=0x7f297ebe3680 "uid=user16783client000.hv2.lab.eng.bos.redhat.com,cn=users,cn=accounts,dc=hv2,dc=lab,dc=eng,dc=bos,dc=redhat,dc=com", ndn_len=115) at ../389-ds-base/ldap/servers/slapd/dn.c:2982
#21 0x00007f2a7383f4af in slapi_dn_normalize_ext (src=0x7f297ebe3680 "uid=user16783client000.hv2.lab.eng.bos.redhat.com,cn=users,cn=accounts,dc=hv2,dc=lab,dc=eng,dc=bos,dc=redhat,dc=com", src_len=115, 
    dest=0x7f299e5fd408, dest_len=0x7f299e5fd400) at ../389-ds-base/ldap/servers/slapd/dn.c:1143
#22 0x00007f2a73840003 in slapi_dn_normalize_case_ext (src=0x7f297ebe3680 "uid=user16783client000.hv2.lab.eng.bos.redhat.com,cn=users,cn=accounts,dc=hv2,dc=lab,dc=eng,dc=bos,dc=redhat,dc=com", 
    src_len=0, dest=0x7f299e5fd408, dest_len=0x7f299e5fd400) at ../389-ds-base/ldap/servers/slapd/dn.c:1484
#23 0x00007f2a7253a4b7 in value_normalize_ext (s=0x7f297ebe3680 "uid=user16783client000.hv2.lab.eng.bos.redhat.com,cn=users,cn=accounts,dc=hv2,dc=lab,dc=eng,dc=bos,dc=redhat,dc=com", syntax=9, 
    trim_spaces=1, alt=0x7f299e5fd478) at ../389-ds-base/ldap/servers/plugins/syntaxes/value.c:93
#24 0x00007f2a725366eb in string_values2keys (pb=0x7f2996301540, bvals=0x7f297dc383c0, ivals=0x7f299e5fd608, syntax=9, ftype=164) at ../389-ds-base/ldap/servers/plugins/syntaxes/string.c:564
#25 0x00007f2a72530130 in dn_values2keys (pb=0x7f2996301540, vals=0x7f297dc383c0, ivals=0x7f299e5fd608, ftype=164) at ../389-ds-base/ldap/servers/plugins/syntaxes/dn.c:161
#26 0x00007f2a738b0625 in slapi_attr_values2keys_sv_pb (sattr=0x7f2a721cc490, vals=0x7f297dc383c0, ivals=0x7f299e5fd608, ftype=164, pb=0x7f2996301540)
    at ../389-ds-base/ldap/servers/slapd/plugin_syntax.c:628
#27 0x00007f2a6eec9cba in index_addordel_values_ext_sv (be=0x7f2a6e97d6c0, type=0x7f299e5fd850 "member", vals=0x7f2a29da1130, evals=0x7f297dc383c0, id=77, flags=34, txn=0x7f299e5fda60, 
    idl_disposition=0x0, buffer_handle=0x0) at ../389-ds-base/ldap/servers/slapd/back-ldbm/index.c:2138
#28 0x00007f2a6eec9720 in index_addordel_values_sv (be=0x7f2a6e97d6c0, type=0x7f299e5fd850 "member", vals=0x7f2a29da1130, evals=0x7f297dc383c0, id=77, flags=34, txn=0x7f299e5fda60)
    at ../389-ds-base/ldap/servers/slapd/back-ldbm/index.c:2015
--Type <RET> for more, q to quit, c to continue without paging--
#29 0x00007f2a6eec5ce8 in index_add_mods (be=0x7f2a6e97d6c0, mods=0x7f2a2c7e1d40, olde=0x7f297a2b6980, newe=0x7f2a244360e0, txn=0x7f299e5fda60)
    at ../389-ds-base/ldap/servers/slapd/back-ldbm/index.c:770
#30 0x00007f2a6eeef59e in ldbm_back_modify (pb=0x7f297a30f7c0) at ../389-ds-base/ldap/servers/slapd/back-ldbm/ldbm_modify.c:874
#31 0x00007f2a7388bb8b in op_shared_modify (pb=0x7f297a30f7c0, pw_change=0, old_pw=0x0) at ../389-ds-base/ldap/servers/slapd/modify.c:1022
#32 0x00007f2a7388a670 in modify_internal_pb (pb=0x7f297a30f7c0) at ../389-ds-base/ldap/servers/slapd/modify.c:572
#33 0x00007f2a7388a1c0 in slapi_modify_internal_pb (pb=0x7f297a30f7c0) at ../389-ds-base/ldap/servers/slapd/modify.c:442
#34 0x00007f2a7241ddf6 in automember_update_member_value (member_e=0x7f2996243340, 
    group_dn=0x7f2a6e2ceb40 "cn=Host Group Administrators,cn=privileges,cn=pbac,dc=hv2,dc=lab,dc=eng,dc=bos,dc=redhat,dc=com", grouping_attr=0x7f2a6e2d5718 "member", 
    grouping_value=0x7f2a6e2d5710 "dn", ldif_fd=0x0, add=0) at ../389-ds-base/ldap/servers/plugins/automember/automember.c:1726
#35 0x00007f2a7241f620 in automember_rebuild_task_thread (arg=0x7f2a3359ee00) at ../389-ds-base/ldap/servers/plugins/automember/automember.c:2481
#36 0x00007f2a72f337b4 in _pt_root () from target:/lib64/libnspr4.so
#37 0x00007f2a7329f822 in start_thread () from target:/lib64/libc.so.6
#38 0x00007f2a7323f450 in clone3 () from target:/lib64/libc.so.6
Firstyear commented 2 years ago

The most likely cause of this is a bug in the calling automember code causing a double free in the cstring code. This is because it is invalid for C to free memory owned by the rust side (and vice versa).

We can see that this is likely because the fault occurs in "evict" where we are dropping (freeing) the cstring, which is occuring because the item is evicted. The drop is here https://github.com/kanidm/concread/blob/a51422b1cdfa09df2646d34c9a6ffcc56d72ca34/src/arcache/mod.rs#L488 where ci (Cache Item) is swapped for the new state, meaning that ci will drop here. Since the CString was already freed, that causes the panic.

Note it's not that we are hitting the impossible state (unreachable) because then the backtrace would be different.

Most likely what we are hitting is:

https://doc.rust-lang.org/src/alloc/ffi/c_str.rs.html#699

Which then calls

https://doc.rust-lang.org/src/core/slice/index.rs.html#232

finally triggering the unsafe precondition check

https://stdrs.dev/nightly/x86_64-unknown-linux-gnu/core/intrinsics/macro.assert_unsafe_precondition.html

I think this means you probably need to track where automember is "freeing" things returned from the ndn cache instead, and they should be removed or checked.

Firstyear commented 2 years ago

PS: This is a really over version of concread, I seem to remember we couldn't upgrade because RH/fedora rust were too old, so it could be worth you trying to upgrade this as there have been significant performance and api improvements.

mreynolds389 commented 2 years ago

The only issue with your theory is that we always a return a copy from the ndn cache, and never return a reference. So it's probably not automember doing something wrong. I will run an ASAN test tomorrow though...

Firstyear commented 2 years ago

Yeah, I think that asan would be the best option because that will report double frees properly.

mreynolds389 commented 2 years ago

ASAN shows no issues

mreynolds389 commented 3 weeks ago

Also seeing this in replication:

#7  0x00007f598cc6968b in std::panicking::begin_panic_handler::{closure#0} () at library/std/src/panicking.rs:656
#8  0x00007f598cc66ed9 in std::sys_common::backtrace::__rust_end_short_backtrace<std::panicking::begin_panic_handler::{closure_env#0}, !> (
    f=<error reading variable: Cannot access memory at address 0x2c542b>) at library/std/src/sys_common/backtrace.rs:171
#9  0x00007f598cc69417 in std::panicking::begin_panic_handler (info=<optimized out>) at library/std/src/panicking.rs:652
#10 0x00007f598c9bc533 in core::panicking::panic_fmt (fmt=...) at library/core/src/panicking.rs:72
#11 0x00007f598c9bc5dc in core::panicking::panic (expr=...) at library/core/src/panicking.rs:146
#12 0x00007f598cac00f3 in concread::arcache::ARCache<alloc::ffi::c_str::CString, alloc::ffi::c_str::CString>::evict<alloc::ffi::c_str::CString, alloc::ffi::c_str::CString> (
    self=0x7f598aab9380, cache=0x7f58f65fe670, inner=0x7f598aab9428, shared=0x7f598aab9388, stats=0x7f58f65fe288, commit_txid=2095678)
    at /root/.cargo/registry/src/github.com-1ecc6299db9ec823/concread-0.2.21/src/arcache/mod.rs:474
#13 0x00007f598cac1399 in concread::arcache::ARCache<alloc::ffi::c_str::CString, alloc::ffi::c_str::CString>::commit<alloc::ffi::c_str::CString, alloc::ffi::c_str::CString> (
    self=0x7f598aab9380, cache=..., tlocal=..., hit=..., clear=false, init_above_watermark=true)
    at /root/.cargo/registry/src/github.com-1ecc6299db9ec823/concread-0.2.21/src/arcache/mod.rs:1457
#14 0x00007f598cac2ce5 in concread::arcache::ARCacheWriteTxn<alloc::ffi::c_str::CString, alloc::ffi::c_str::CString>::commit<alloc::ffi::c_str::CString, alloc::ffi::c_str::CString> (
    self=...) at /root/.cargo/registry/src/github.com-1ecc6299db9ec823/concread-0.2.21/src/arcache/mod.rs:1506
#15 0x00007f598cabc3e6 in concread::arcache::ARCache<alloc::ffi::c_str::CString, alloc::ffi::c_str::CString>::try_quiesce<alloc::ffi::c_str::CString, alloc::ffi::c_str::CString> (
    self=0x7f598aab9380) at /root/.cargo/registry/src/github.com-1ecc6299db9ec823/concread-0.2.21/src/arcache/mod.rs:796
#16 0x00007f598caa5a28 in concread::arcache::{impl#13}::drop<alloc::ffi::c_str::CString, alloc::ffi::c_str::CString> (self=0x7f58f5a15480)
    at /root/.cargo/registry/src/github.com-1ecc6299db9ec823/concread-0.2.21/src/arcache/mod.rs:1929
#17 0x00007f598caa1dd7 in core::ptr::drop_in_place<concread::arcache::ARCacheReadTxn<alloc::ffi::c_str::CString, alloc::ffi::c_str::CString>> ()
    at /builddir/build/BUILD/rustc-1.80.1-src/library/core/src/ptr/mod.rs:542
#18 0x00007f598caa2f7b in core::ptr::drop_in_place<rslapd::cache::ARCacheCharRead> () at /builddir/build/BUILD/rustc-1.80.1-src/library/core/src/ptr/mod.rs:542
#19 0x00007f598caa342a in core::ptr::drop_in_place<alloc::boxed::Box<rslapd::cache::ARCacheCharRead, alloc::alloc::Global>> ()
    at /builddir/build/BUILD/rustc-1.80.1-src/library/core/src/ptr/mod.rs:542
#20 0x00007f598caa0695 in rslapd::cache::cache_char_read_complete (read_txn=0x7f58f5a15480) at librslapd/src/cache.rs:92
#21 0x00007f598c9daffc in ndn_cache_add (dn=0x7f58c2468aa0 "uid=test_entry09842,dc=example,dc=com", dn_len=37, ndn=0x7f58d08a69f0 "uid=test_entry09842,dc=example,dc=com", ndn_len=37)
    at ../389-ds-base/ldap/servers/slapd/dn.c:2982
#22 0x00007f598c9d822f in slapi_dn_normalize_ext (src=0x7f58d08a69f0 "uid=test_entry09842,dc=example,dc=com", src_len=37, dest=0x7f58f65fecd8, dest_len=0x7f58f65fecd0)
    at ../389-ds-base/ldap/servers/slapd/dn.c:1143
#23 0x00007f598c9da1ef in slapi_sdn_get_dn (sdn=0x7f58c507d000) at ../389-ds-base/ldap/servers/slapd/dn.c:2335
#24 0x00007f598ad0ba3d in replay_update (prp=0x7f598762b100, op=0x7f58f65fee50, message_id=0x7f58f65fedc8) at ../389-ds-base/ldap/servers/plugins/replication/repl5_inc_protocol.c:1377
#25 0x00007f598ad0c90f in send_updates (prp=0x7f598762b100, remote_update_vector=0x7f58b0344900, num_changes_sent=0x7f58f65fef00)
    at ../389-ds-base/ldap/servers/plugins/replication/repl5_inc_protocol.c:1708
#26 0x00007f598ad0aeab in repl5_inc_run (prp=0x7f598762b100) at ../389-ds-base/ldap/servers/plugins/replication/repl5_inc_protocol.c:1045
#27 0x00007f598ad13618 in prot_thread_main (arg=0x7f59876035e0) at ../389-ds-base/ldap/servers/plugins/replication/repl5_protocol.c:252
#28 0x00007f598c243749 in _pt_root (arg=0x7f59875fd140) at ../../../../nspr/pr/src/pthreads/ptthread.c:201
#29 0x00007f598c6ac897 in start_thread (arg=<optimized out>) at pthread_create.c:444
#30 0x00007f598c733a5c in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:78