apache / incubator-pegasus

Apache Pegasus - A horizontally scalable, strongly consistent and high-performance key-value store
https://pegasus.apache.org/
Apache License 2.0
1.99k stars 312 forks source link

Meta server and Replica server process could not exit normally after backing up or restoring on HDFS #1913

Closed acelyc111 closed 9 months ago

acelyc111 commented 9 months ago

Bug Report

Please answer these questions before submitting your issue. Thanks!

  1. What did you do? Used HDFS as the remote storage to back up or restore data.

  2. What did you expect to see? The server processes should exit normally. (They did not; the stack traces below show threads blocked inside the exit handlers.)

    Thread 24 (Thread 0x7f3af56aa700 (LWP 726)):
    #0  0x00007f3b57507017 in pthread_join () from /lib64/libpthread.so.0
    #1  0x00007f3b562880f7 in std::thread::join() () from /lib64/libstdc++.so.6
    #2  0x00007f3b5ab2eb1e in dsn::task_worker::stop (this=<optimized out>) at /home/laiyingchun/dev/skv_240/src/rdsn/src/runtime/task/task_worker.cpp:89
    #3  0x00007f3b5ab1f82e in dsn::task_worker_pool::stop (this=0x1cb2240) at /home/laiyingchun/dev/skv_240/src/rdsn/src/runtime/task/task_engine.cpp:116
    #4  0x00007f3b5ab1f975 in dsn::task_engine::stop (this=0x23b4cf0) at /home/laiyingchun/dev/skv_240/src/rdsn/src/runtime/task/task_engine.cpp:254
    #5  0x00007f3b5aae0075 in dsn::service_node::~service_node (this=0x1b3a370, __in_chrg=<optimized out>) at /home/laiyingchun/dev/skv_240/src/rdsn/src/runtime/service_engine.cpp:173
    #6  0x00007f3b5aae08c2 in _M_release (this=0x1b3a360) at /opt/rh/devtoolset-7/root/usr/include/c++/7/bits/shared_ptr_base.h:154
    #7  ~__shared_count (this=<optimized out>, __in_chrg=<optimized out>) at /opt/rh/devtoolset-7/root/usr/include/c++/7/bits/shared_ptr_base.h:684
    #8  ~__shared_ptr (this=<optimized out>, __in_chrg=<optimized out>) at /opt/rh/devtoolset-7/root/usr/include/c++/7/bits/shared_ptr_base.h:1123
    #9  ~shared_ptr (this=<optimized out>, __in_chrg=<optimized out>) at /opt/rh/devtoolset-7/root/usr/include/c++/7/bits/shared_ptr.h:93
    #10 ~pair (this=<optimized out>, __in_chrg=<optimized out>) at /opt/rh/devtoolset-7/root/usr/include/c++/7/bits/stl_pair.h:198
    #11 destroy<std::pair<int const, std::shared_ptr<dsn::service_node> > > (this=<optimized out>, __p=<optimized out>) at /opt/rh/devtoolset-7/root/usr/include/c++/7/ext/new_allocator.h:140
    #12 destroy<std::pair<int const, std::shared_ptr<dsn::service_node> > > (__a=..., __p=<optimized out>) at /opt/rh/devtoolset-7/root/usr/include/c++/7/bits/alloc_traits.h:487
    #13 _M_destroy_node (this=0x7f3b5aef8d38 <dsn::utils::singleton<dsn::service_engine>::instance()::_instance+280>, __p=0x2438ec0) at /opt/rh/devtoolset-7/root/usr/include/c++/7/bits/stl_tree.h:650
    #14 _M_drop_node (this=0x7f3b5aef8d38 <dsn::utils::singleton<dsn::service_engine>::instance()::_instance+280>, __p=0x2438ec0) at /opt/rh/devtoolset-7/root/usr/include/c++/7/bits/stl_tree.h:658
    #15 std::_Rb_tree<int, std::pair<int const, std::shared_ptr<dsn::service_node> >, std::_Select1st<std::pair<int const, std::shared_ptr<dsn::service_node> > >, std::less<int>, std::allocator<std::pair<int const, std::shared_ptr<dsn::service_node> > > >::_M_erase (this=this
    #16 0x00007f3b5aadeca0 in clear (this=0x7f3b5aef8d38 <dsn::utils::singleton<dsn::service_engine>::instance()::_instance+280>) at /opt/rh/devtoolset-7/root/usr/include/c++/7/bits/stl_tree.h:1171
    #17 clear (this=0x7f3b5aef8d38 <dsn::utils::singleton<dsn::service_engine>::instance()::_instance+280>) at /opt/rh/devtoolset-7/root/usr/include/c++/7/bits/stl_map.h:1128
    #18 dsn::service_engine::~service_engine (this=0x7f3b5aef8c20 <dsn::utils::singleton<dsn::service_engine>::instance()::_instance>, __in_chrg=<optimized out>) at /home/laiyingchun/dev/skv_240/src/rdsn/src/runtime/service_engine.cpp:197
    #19 0x00007f3b55926ce9 in __run_exit_handlers () from /lib64/libc.so.6
    #20 0x00007f3b55926d37 in exit () from /lib64/libc.so.6
    #21 0x00007f3b5b5b155c in vm_direct_exit(int) () from /usr/lib/jvm/java-1.8.0-openjdk-1.8.0.382.b05-1.el7_9.x86_64/jre/lib/amd64/server/libjvm.so
    #22 0x00007f3b5ba82c05 in VM_Operation::evaluate() () from /usr/lib/jvm/java-1.8.0-openjdk-1.8.0.382.b05-1.el7_9.x86_64/jre/lib/amd64/server/libjvm.so
    #23 0x00007f3b5ba80c2a in VMThread::evaluate_operation(VM_Operation*) () from /usr/lib/jvm/java-1.8.0-openjdk-1.8.0.382.b05-1.el7_9.x86_64/jre/lib/amd64/server/libjvm.so
    #24 0x00007f3b5ba81099 in VMThread::loop() () from /usr/lib/jvm/java-1.8.0-openjdk-1.8.0.382.b05-1.el7_9.x86_64/jre/lib/amd64/server/libjvm.so
    #25 0x00007f3b5ba81549 in VMThread::run() () from /usr/lib/jvm/java-1.8.0-openjdk-1.8.0.382.b05-1.el7_9.x86_64/jre/lib/amd64/server/libjvm.so
    #26 0x00007f3b5b853cd2 in java_start(Thread*) () from /usr/lib/jvm/java-1.8.0-openjdk-1.8.0.382.b05-1.el7_9.x86_64/jre/lib/amd64/server/libjvm.so
    #27 0x00007f3b57505ea5 in start_thread () from /lib64/libpthread.so.0
    #28 0x00007f3b559ebb0d in clone () from /lib64/libc.so.6
Thread 22 (Thread 0x7f30a9132700 (LWP 9032)):
#0  0x00007f315f0fb54d in __lll_lock_wait () from /lib64/libpthread.so.0
#1  0x00007f315f0f914d in pthread_cond_signal@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#2  0x00007f315de73b09 in std::condition_variable::notify_one() () from /lib64/libstdc++.so.6
#3  0x000000000087a46d in rocksdb::ThreadPoolImpl::Schedule(void (*)(void*), void*, void*, void (*)(void*)) ()
#4  0x00000000006a7c18 in rocksdb::DBImpl::MaybeScheduleFlushOrCompaction() ()
#5  0x00000000006a9aa1 in rocksdb::DBImpl::AtomicFlushMemTables(rocksdb::autovector<rocksdb::ColumnFamilyData*, 8ul> const&, rocksdb::FlushOptions const&, rocksdb::FlushReason, bool) ()
#6  0x00000000006aa21d in rocksdb::DBImpl::Flush(rocksdb::FlushOptions const&, std::vector<rocksdb::ColumnFamilyHandle*, std::allocator<rocksdb::ColumnFamilyHandle*> > const&) ()
#7  0x00000000005bb2a7 in pegasus::server::pegasus_server_impl::flush_all_family_columns (this=0x4b20400, wait=wait@entry=true) at /home/laiyingchun/dev/skv_240/src/server/pegasus_server_impl.cpp:3235
#8  0x00000000005bc05c in pegasus::server::pegasus_server_impl::stop (this=0x4b20400, clear_state=<optimized out>) at /home/laiyingchun/dev/skv_240/src/server/pegasus_server_impl.cpp:1790
#9  0x00007f316257c69c in dsn::replication::replication_app_base::close (this=this@entry=0x4b20400, clear_state=clear_state@entry=false) at /home/laiyingchun/dev/skv_240/src/rdsn/src/replica/replication_app_base.cpp:345
#10 0x00007f31624dafff in dsn::replication::replica::close (this=0x4f20800) at /home/laiyingchun/dev/skv_240/src/rdsn/src/replica/replica.cpp:502
#11 0x00007f316253d585 in dsn::replication::replica_stub::close (this=0x2cd2e00) at /home/laiyingchun/dev/skv_240/src/rdsn/src/replica/replica_stub.cpp:2834
#12 0x00007f316258433d in dsn::replication::replication_service_app::stop (this=this@entry=0x3571b30, cleanup=cleanup@entry=false) at /home/laiyingchun/dev/skv_240/src/rdsn/src/replica/replication_service_app.cpp:73
#13 0x0000000000599d61 in pegasus::server::pegasus_replication_service_app::stop (this=0x3571b30, cleanup=<optimized out>) at /home/laiyingchun/dev/skv_240/src/server/pegasus_service_app.h:57
#14 0x00007f31626cc6ee in dsn::service_node::stop_app (this=this@entry=0x2cb8370, cleanup=cleanup@entry=false) at /home/laiyingchun/dev/skv_240/src/rdsn/src/runtime/service_engine.cpp:84
#15 0x00007f31626cf069 in dsn::service_node::~service_node (this=0x2cb8370, __in_chrg=<optimized out>) at /home/laiyingchun/dev/skv_240/src/rdsn/src/runtime/service_engine.cpp:172
#16 0x00007f31626cf8c2 in _M_release (this=0x2cb8360) at /opt/rh/devtoolset-7/root/usr/include/c++/7/bits/shared_ptr_base.h:154
#17 ~__shared_count (this=<optimized out>, __in_chrg=<optimized out>) at /opt/rh/devtoolset-7/root/usr/include/c++/7/bits/shared_ptr_base.h:684
#18 ~__shared_ptr (this=<optimized out>, __in_chrg=<optimized out>) at /opt/rh/devtoolset-7/root/usr/include/c++/7/bits/shared_ptr_base.h:1123
#19 ~shared_ptr (this=<optimized out>, __in_chrg=<optimized out>) at /opt/rh/devtoolset-7/root/usr/include/c++/7/bits/shared_ptr.h:93
#20 ~pair (this=<optimized out>, __in_chrg=<optimized out>) at /opt/rh/devtoolset-7/root/usr/include/c++/7/bits/stl_pair.h:198
#21 destroy<std::pair<int const, std::shared_ptr<dsn::service_node> > > (this=<optimized out>, __p=<optimized out>) at /opt/rh/devtoolset-7/root/usr/include/c++/7/ext/new_allocator.h:140
#22 destroy<std::pair<int const, std::shared_ptr<dsn::service_node> > > (__a=..., __p=<optimized out>) at /opt/rh/devtoolset-7/root/usr/include/c++/7/bits/alloc_traits.h:487
#23 _M_destroy_node (this=0x7f3162ae7d38 <dsn::utils::singleton<dsn::service_engine>::instance()::_instance+280>, __p=0x359fd80) at /opt/rh/devtoolset-7/root/usr/include/c++/7/bits/stl_tree.h:650
#24 _M_drop_node (this=0x7f3162ae7d38 <dsn::utils::singleton<dsn::service_engine>::instance()::_instance+280>, __p=0x359fd80) at /opt/rh/devtoolset-7/root/usr/include/c++/7/bits/stl_tree.h:658
#25 std::_Rb_tree<int, std::pair<int const, std::shared_ptr<dsn::service_node> >, std::_Select1st<std::pair<int const, std::shared_ptr<dsn::service_node> > >, std::less<int>, std::allocator<std::pair<int const, std::shared_ptr<dsn::service_node> > > >::_M_erase (this=this
#26 0x00007f31626cdca0 in clear (this=0x7f3162ae7d38 <dsn::utils::singleton<dsn::service_engine>::instance()::_instance+280>) at /opt/rh/devtoolset-7/root/usr/include/c++/7/bits/stl_tree.h:1171
#27 clear (this=0x7f3162ae7d38 <dsn::utils::singleton<dsn::service_engine>::instance()::_instance+280>) at /opt/rh/devtoolset-7/root/usr/include/c++/7/bits/stl_map.h:1128
#28 dsn::service_engine::~service_engine (this=0x7f3162ae7c20 <dsn::utils::singleton<dsn::service_engine>::instance()::_instance>, __in_chrg=<optimized out>) at /home/laiyingchun/dev/skv_240/src/rdsn/src/runtime/service_engine.cpp:197
#29 0x00007f315d515ce9 in __run_exit_handlers () from /lib64/libc.so.6
#30 0x00007f315d515d37 in exit () from /lib64/libc.so.6
#31 0x00007f31631a055c in vm_direct_exit(int) () from /usr/lib/jvm/java-1.8.0-openjdk-1.8.0.382.b05-1.el7_9.x86_64/jre/lib/amd64/server/libjvm.so
#32 0x00007f3163671c05 in VM_Operation::evaluate() () from /usr/lib/jvm/java-1.8.0-openjdk-1.8.0.382.b05-1.el7_9.x86_64/jre/lib/amd64/server/libjvm.so
#33 0x00007f316366fc2a in VMThread::evaluate_operation(VM_Operation*) () from /usr/lib/jvm/java-1.8.0-openjdk-1.8.0.382.b05-1.el7_9.x86_64/jre/lib/amd64/server/libjvm.so
#34 0x00007f3163670099 in VMThread::loop() () from /usr/lib/jvm/java-1.8.0-openjdk-1.8.0.382.b05-1.el7_9.x86_64/jre/lib/amd64/server/libjvm.so
#35 0x00007f3163670549 in VMThread::run() () from /usr/lib/jvm/java-1.8.0-openjdk-1.8.0.382.b05-1.el7_9.x86_64/jre/lib/amd64/server/libjvm.so
#36 0x00007f3163442cd2 in java_start(Thread*) () from /usr/lib/jvm/java-1.8.0-openjdk-1.8.0.382.b05-1.el7_9.x86_64/jre/lib/amd64/server/libjvm.so
#37 0x00007f315f0f4ea5 in start_thread () from /lib64/libpthread.so.0
#38 0x00007f315d5dab0d in clone () from /lib64/libc.so.6
  3. What did you see instead? The servers could not exit normally; threads were blocked inside the exit handlers (see the stack traces above).

  4. What version of Pegasus are you using? 2.4