yugabyte / yugabyte-db

YugabyteDB - the cloud native distributed SQL database for mission-critical applications.
https://www.yugabyte.com
Other
8.99k stars 1.07k forks source link

[CDCSDK] Tserver crashes in CDC LRU master 0 case (Regression) #16679

Closed shamanthchandra-yb closed 1 year ago

shamanthchandra-yb commented 1 year ago

Jira Link: DB-6053

Description

http://stress.dev.yugabyte.com/stress_test/3c884bbf-459e-4ade-98f8-3ffa6fab6566

thread #1, name = 'yb-tserver', stop reason = signal SIGSEGV
  * frame #0: 0x0000560b7a6ea50a yb-tserver`yb::cdc::GetChangesForCDCSDK(stream_id="", tablet_id="0e26b151efeb4e9b9af5e802092bd2db", from_op_id=0x00007ffa0ad61bf8, stream_metadata=0x00007ffa0ad61d30, tablet_peer=std::__1::shared_ptr<yb::tablet::TabletPeer>::element_type @ 0x0000560b92aba600, mem_tracker=std::__1::shared_ptr<yb::MemTracker>::element_type @ 0x0000560bc63f0fe0, enum_oid_label_map=0x00007ffa0ad61c68, composite_atts_map=0x00007ffa0ad61c98, client=0x0000560b926b4e68, msgs_holder=0x00007ffa0ad61ae0, resp=0x0000560c5139e678, commit_timestamp=0x00007ffa0ad61bc0, cached_schema_details=0x00007ffa0ad61dc0, last_streamed_op_id=0x00007ffa0ad61a40, last_readable_opid_index=0x00007ffa0ad61bb8, colocated_table_id="", deadline=yb::CoarseTimePoint @ 0x00007ffa0ad61a08) at cdc_service.pb.h:0:10
    frame #1: 0x0000560b7a69b837 yb-tserver`yb::cdc::CDCServiceImpl::GetChanges(this=0x0000560b929a0020, req=0x0000560c5139e620, resp=0x0000560c5139e678, context=RpcContext @ 0x00007ffa0ad61fe0) at cdc_service.cc:1703:14
    frame #2: 0x0000560b7a74acc2 yb-tserver`std::__1::__function::__func<yb::cdc::CDCServiceIf::InitMethods(scoped_refptr<yb::MetricEntity> const&)::$_3, std::__1::allocator<yb::cdc::CDCServiceIf::InitMethods(scoped_refptr<yb::MetricEntity> const&)::$_3>, void (std::__1::shared_ptr<yb::rpc::InboundCall>)>::operator()(std::__1::shared_ptr<yb::rpc::InboundCall>&&) [inlined] yb::cdc::CDCServiceIf::InitMethods(this=<unavailable>, req=<unavailable>, resp=<unavailable>, rpc_context=RpcContext @ 0x00007ffa0ad61fa0)::$_3::operator()(std::__1::shared_ptr<yb::rpc::InboundCall>) const::'lambda'(yb::cdc::GetChangesRequestPB const*, yb::cdc::GetChangesResponsePB*, yb::rpc::RpcContext)::operator()(yb::cdc::GetChangesRequestPB const*, yb::cdc::GetChangesResponsePB*, yb::rpc::RpcContext) const at cdc_service.service.cc:355:9
    frame #3: 0x0000560b7a74ac84 yb-tserver`std::__1::__function::__func<yb::cdc::CDCServiceIf::InitMethods(scoped_refptr<yb::MetricEntity> const&)::$_3, std::__1::allocator<yb::cdc::CDCServiceIf::InitMethods(scoped_refptr<yb::MetricEntity> const&)::$_3>, void (std::__1::shared_ptr<yb::rpc::InboundCall>)>::operator()(std::__1::shared_ptr<yb::rpc::InboundCall>&&) at local_call.h:118:7
    frame #4: 0x0000560b7a74a972 yb-tserver`std::__1::__function::__func<yb::cdc::CDCServiceIf::InitMethods(scoped_refptr<yb::MetricEntity> const&)::$_3, std::__1::allocator<yb::cdc::CDCServiceIf::InitMethods(scoped_refptr<yb::MetricEntity> const&)::$_3>, void (std::__1::shared_ptr<yb::rpc::InboundCall>)>::operator()(std::__1::shared_ptr<yb::rpc::InboundCall>&&) [inlined] yb::cdc::CDCServiceIf::InitMethods(this=<unavailable>, call=nullptr)::$_3::operator()(std::__1::shared_ptr<yb::rpc::InboundCall>) const at cdc_service.service.cc:353:7
    frame #5: 0x0000560b7a74a8f5 yb-tserver`std::__1::__function::__func<yb::cdc::CDCServiceIf::InitMethods(scoped_refptr<yb::MetricEntity> const&)::$_3, std::__1::allocator<yb::cdc::CDCServiceIf::InitMethods(scoped_refptr<yb::MetricEntity> const&)::$_3>, void (std::__1::shared_ptr<yb::rpc::InboundCall>)>::operator()(std::__1::shared_ptr<yb::rpc::InboundCall>&&) [inlined] decltype(__f=<unavailable>, __args=<unavailable>)::$_3&>()(std::declval<std::__1::shared_ptr<yb::rpc::InboundCall>>())) std::__1::__invoke[abi:v15003]<yb::cdc::CDCServiceIf::InitMethods(scoped_refptr<yb::MetricEntity> const&)::$_3&, std::__1::shared_ptr<yb::rpc::InboundCall>>(yb::cdc::CDCServiceIf::InitMethods(scoped_refptr<yb::MetricEntity> const&)::$_3&, std::__1::shared_ptr<yb::rpc::InboundCall>&&) at invoke.h:394:23
    frame #6: 0x0000560b7a74a8d4 yb-tserver`std::__1::__function::__func<yb::cdc::CDCServiceIf::InitMethods(scoped_refptr<yb::MetricEntity> const&)::$_3, std::__1::allocator<yb::cdc::CDCServiceIf::InitMethods(scoped_refptr<yb::MetricEntity> const&)::$_3>, void (std::__1::shared_ptr<yb::rpc::InboundCall>)>::operator()(std::__1::shared_ptr<yb::rpc::InboundCall>&&) [inlined] void std::__1::__invoke_void_return_wrapper<void, true>::__call<yb::cdc::CDCServiceIf::InitMethods(__args=<unavailable>, __args=<unavailable>)::$_3&, std::__1::shared_ptr<yb::rpc::InboundCall>>(yb::cdc::CDCServiceIf::InitMethods(scoped_refptr<yb::MetricEntity> const&)::$_3&, std::__1::shared_ptr<yb::rpc::InboundCall>&&) at invoke.h:479:9
    frame #7: 0x0000560b7a74a8d4 yb-tserver`std::__1::__function::__func<yb::cdc::CDCServiceIf::InitMethods(scoped_refptr<yb::MetricEntity> const&)::$_3, std::__1::allocator<yb::cdc::CDCServiceIf::InitMethods(scoped_refptr<yb::MetricEntity> const&)::$_3>, void (std::__1::shared_ptr<yb::rpc::InboundCall>)>::operator()(std::__1::shared_ptr<yb::rpc::InboundCall>&&) [inlined] std::__1::__function::__alloc_func<yb::cdc::CDCServiceIf::InitMethods(scoped_refptr<yb::MetricEntity> const&)::$_3, std::__1::allocator<yb::cdc::CDCServiceIf::InitMethods(scoped_refptr<yb::MetricEntity> const&)::$_3>, void (std::__1::shared_ptr<yb::rpc::InboundCall>)>::operator(this=<unavailable>, __arg=<unavailable>)[abi:v15003](std::__1::shared_ptr<yb::rpc::InboundCall>&&) at function.h:185:16
    frame #8: 0x0000560b7a74a8d4 yb-tserver`std::__1::__function::__func<yb::cdc::CDCServiceIf::InitMethods(scoped_refptr<yb::MetricEntity> const&)::$_3, std::__1::allocator<yb::cdc::CDCServiceIf::InitMethods(scoped_refptr<yb::MetricEntity> const&)::$_3>, void (std::__1::shared_ptr<yb::rpc::InboundCall>)>::operator(this=<unavailable>, __arg=<unavailable>)(std::__1::shared_ptr<yb::rpc::InboundCall>&&) at function.h:359:12
    frame #9: 0x0000560b7a74d12f yb-tserver`yb::cdc::CDCServiceIf::Handle(std::__1::shared_ptr<yb::rpc::InboundCall>) [inlined] std::__1::__function::__value_func<void (std::__1::shared_ptr<yb::rpc::InboundCall>)>::operator(this=<unavailable>, __args=nullptr)[abi:v15003](std::__1::shared_ptr<yb::rpc::InboundCall>&&) const at function.h:512:16
    frame #10: 0x0000560b7a74d110 yb-tserver`yb::cdc::CDCServiceIf::Handle(std::__1::shared_ptr<yb::rpc::InboundCall>) [inlined] std::__1::function<void (std::__1::shared_ptr<yb::rpc::InboundCall>)>::operator(this=<unavailable>, __arg=nullptr)(std::__1::shared_ptr<yb::rpc::InboundCall>) const at function.h:1197:12
    frame #11: 0x0000560b7a74d110 yb-tserver`yb::cdc::CDCServiceIf::Handle(this=<unavailable>, call=<unavailable>) at cdc_service.service.cc:293:3
    frame #12: 0x0000560b7b4e965a yb-tserver`yb::rpc::ServicePoolImpl::Handle(this=0x0000560b9270cd80, incoming=nullptr) at service_pool.cc:263:19
    frame #13: 0x0000560b7b42a02f yb-tserver`yb::rpc::InboundCall::InboundCallTask::Run(this=<unavailable>) at inbound_call.cc:245:13
    frame #14: 0x0000560b7b4f81f3 yb-tserver`yb::rpc::(anonymous namespace)::Worker::Execute(this=0x0000560bd895af50) at thread_pool.cc:104:15
    frame #15: 0x0000560b7bbb9d32 yb-tserver`yb::Thread::SuperviseThread(void*) [inlined] std::__1::__function::__value_func<void ()>::operator(this=0x0000560bdb449ec0)[abi:v15003]() const at function.h:512:16
    frame #16: 0x0000560b7bbb9d1c yb-tserver`yb::Thread::SuperviseThread(void*) [inlined] std::__1::function<void ()>::operator(this=0x0000560bdb449ec0)() const at function.h:1197:12
    frame #17: 0x0000560b7bbb9d1c yb-tserver`yb::Thread::SuperviseThread(arg=0x0000560bdb449e60) at thread.cc:842:3
    frame #18: 0x00007ffa8ddea694 libpthread.so.0`start_thread(arg=0x00007ffa0ad6a700) at pthread_create.c:333
    frame #19: 0x00007ffa8e2ec41d libc.so.6`__clone at clone.S:109

Source connector version

1.9.5.y.18

Connector configuration

add connector connector_name='ybconnector_cdc_3f369f_test_cdc_030e90_test_cdc_276c3f' stream_id='bf29c867452a468aac48e8c6aeef2c69' db_name='cdc_3f369f' connector_host='172.151.21.65' table_list=['test_cdc_030e90', 'test_cdc_276c3f'] {'name': 'ybconnector_cdc_3f369f_test_cdc_030e90_test_cdc_276c3f', 'config': {'connector.class': 'io.debezium.connector.yugabytedb.YugabyteDBConnector', 'database.hostname': '172.151.25.55', 'database.master.addresses': '172.151.27.64:7100,172.151.25.55:7100,172.151.21.32:7100', 'database.port': 5433, 'database.masterhost': '172.151.25.55', 'database.masterport': '7100', 'database.user': 'yugabyte', 'database.password': 'yugabyte', 'database.dbname': 'cdc_3f369f', 'database.server.name': 'db_cdc', 'database.streamid': 'bf29c867452a468aac48e8c6aeef2c69', 'snapshot.mode': 'never', 'admin.operation.timeout.ms': 600000, 'socket.read.timeout.ms': 600000, 'max.connector.retries': '10', 'operation.timeout.ms': 600000, 'topic.creation.default.compression.type': 'lz4', 'topic.creation.default.cleanup.policy': 'delete', 'topic.creation.default.partitions': 2, 'topic.creation.default.replication.factor': '1', 'tasks.max': '5', 'table.include.list': 'public.test_cdc_030e90,public.test_cdc_276c3f'}}

YugabyteDB version

2.17.4.0-b50

Warning: Please confirm that this issue does not contain any sensitive information

shamanthchandra-yb commented 1 year ago

Also seen here: http://stress.dev.yugabyte.com/stress_test/334b4b66-305b-4ca7-8401-a39354a1919a

Apart from the crash and core dumps, the connector log shows the following error:

2023-04-04 04:40:01,389 ERROR  ||  WorkerSourceTask{id=ybconnector_cdc_ad02d2_test_cdc_7e06d9_test_cdc_07701c-4} Task threw an uncaught and unrecoverable exception. Task is being killed and will not recover until manually restarted   [org.apache.kafka.connect.runtime.WorkerTask]
org.apache.kafka.connect.errors.ConnectException: An exception occurred in the change event producer. This connector will be stopped.
    at io.debezium.pipeline.ErrorHandler.setProducerThrowable(ErrorHandler.java:50)
    at io.debezium.connector.yugabytedb.YugabyteDBStreamingChangeEventSource.execute(YugabyteDBStreamingChangeEventSource.java:144)
    at io.debezium.connector.yugabytedb.YugabyteDBStreamingChangeEventSource.execute(YugabyteDBStreamingChangeEventSource.java:47)
    at io.debezium.pipeline.ChangeEventSourceCoordinator.streamEvents(ChangeEventSourceCoordinator.java:174)
    at io.debezium.connector.yugabytedb.YugabyteDBChangeEventSourceCoordinator.executeChangeEventSources(YugabyteDBChangeEventSourceCoordinator.java:138)
    at io.debezium.pipeline.ChangeEventSourceCoordinator.lambda$start$0(ChangeEventSourceCoordinator.java:109)
    at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:515)
    at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264)
    at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
    at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
    at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.yb.client.NonRecoverableException: Time out: YRpc(method=GetChanges, service=yb.cdc.CDCService, tablet=eed57e66d46d495daebffb765de9e9d1, attempt=23, maxAttempts=100, maxTimeoutMs=600000, elapsedTimeMs=600001). Server[23f50084c7d54e80870df4f461cf6093] TIMED_OUT[code 14]: GetChanges RPC (request call id 16556) to 172.151.19.160:9100 timed out after 0.252s
    at org.yb.client.AsyncYBClient.tooManyAttemptsOrTimeout(AsyncYBClient.java:2127)
    at org.yb.client.AsyncYBClient.delayedSendRpcToTablet(AsyncYBClient.java:2335)
    at org.yb.client.AsyncYBClient.handleRetryableError(AsyncYBClient.java:2315)
    at org.yb.client.TabletClient.dispatchCDCErrorOrReturnException(TabletClient.java:518)
    at org.yb.client.TabletClient.decode(TabletClient.java:437)
    at io.netty.handler.codec.ByteToMessageDecoder.decodeRemovalReentryProtection(ByteToMessageDecoder.java:510)
    at io.netty.handler.codec.ReplayingDecoder.callDecode(ReplayingDecoder.java:366)
    at io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:279)
    at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
    at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
    at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
    at io.netty.handler.timeout.IdleStateHandler.channelRead(IdleStateHandler.java:286)
    at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
    at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
    at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
    at io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1410)
    at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
    at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
    at io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:919)
    at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:166)
    at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:722)
    at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:658)
    at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:584)
    at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:496)
    at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:986)
    at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
    ... 3 more