ISSUE-13892: Error writing broker data on metadata store KeeperErrorCode = BadVersion for /loadbalance/brokers/pulsar-broker-1.pulsar-broker.default.svc.cluster.local:8080 #3610
One of zookeepers died and was replaced in kubernetes.
Now we constantly see following error in broker logs:
{
logLevel: 'WARN',
logThread: 'pulsar-load-manager-1-1',
logger: 'org.apache.pulsar.broker.loadbalance.impl.ModularLoadManagerImpl',
message: 'Error writing broker data on metadata store',
stack_trace: 'java.util.concurrent.CompletionException: org.apache.pulsar.metadata.api.MetadataStoreException$BadVersionException: org.apache.zookeeper.KeeperException$BadVersionException: KeeperErrorCode = BadVersion for /loadbalance/brokers/pulsar-broker-1.pulsar-broker.default.svc.cluster.local:8080
at java.base/java.util.concurrent.CompletableFuture.encodeThrowable(CompletableFuture.java:331)
at java.base/java.util.concurrent.CompletableFuture.completeThrowable(CompletableFuture.java:346)
at java.base/java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:632)
at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:506)
at java.base/java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:2088)
at org.apache.pulsar.metadata.impl.ZKMetadataStore.lambda$storePut$19(ZKMetadataStore.java:252)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.pulsar.metadata.api.MetadataStoreException$BadVersionException: org.apache.zookeeper.KeeperException$BadVersionException: KeeperErrorCode = BadVersion for /loadbalance/brokers/pulsar-broker-1.pulsar-broker.default.svc.cluster.local:8080
at org.apache.pulsar.metadata.impl.ZKMetadataStore.getException(ZKMetadataStore.java:308)
... 5 more
Caused by: org.apache.zookeeper.KeeperException$BadVersionException: KeeperErrorCode = BadVersion for /loadbalance/brokers/pulsar-broker-1.pulsar-broker.default.svc.cluster.local:8080
at org.apache.zookeeper.KeeperException.create(KeeperException.java:122)
at org.apache.zookeeper.KeeperException.create(KeeperException.java:54)
at org.apache.pulsar.metadata.impl.ZKMetadataStore.getException(ZKMetadataStore.java:304)
... 5 more',
}
Before that, when one of zookeeper servers was dying I saw lots of errors like below, but that stopped showing after zookeeper was replaced.
{
logLevel: 'WARN',
logThread: 'Thread-4383',
logger: 'org.apache.pulsar.broker.service.ServerCnx',
message: '[/10.240.0.193:35596][persistent://public/batch/batch12_pulsar5_SEARCH2_5ad8f2b7feebd125a22f05219306fefcb3623ef9][pulsar5] Failed to create consumer: consumerId=147, No such ledger exists on Metadata Server - ledger=19028 - operation=Failed to open ledger',
}
{
logLevel: 'WARN',
logThread: 'metadata-store-6-1',
logger: 'org.apache.pulsar.broker.service.ServerCnx',
message: '[/10.240.0.193:35596][persistent://public/batch/batch12_pulsar5_SEARCH2_45866a999bad07a4c754bf2e9813656578a12e98][pulsar5] Failed to create consumer: consumerId=152, Topic is temporarily unavailable',
}
{
logLevel: 'WARN',
logThread: 'BookieHighPriorityThread-3181-OrderedExecutor-4-0',
logger: 'org.apache.bookkeeper.proto.ReadEntryProcessor',
message: 'Ledger: 25316 fenced by: /10.240.0.177:54732',
}
{
logLevel: 'WARN',
logThread: 'bookkeeper-ml-scheduler-OrderedScheduler-1-0',
logger: 'org.apache.pulsar.broker.service.ServerCnx',
message: '[/10.240.0.193:35596][persistent://public/batch/batch12_pulsar5_SEARCH2_45866a999bad07a4c754bf2e9813656578a12e98][pulsar5] Failed to create consumer: consumerId=152, Topic persistent://public/batch/batch12_pulsar5_SEARCH2_45866a999bad07a4c754bf2e9813656578a12e98 does not exist',
}
{
logLevel: 'WARN',
logThread: 'Thread-4412',
logger: 'org.apache.bookkeeper.mledger.impl.ManagedCursorImpl',
message: '[aks48we/batch_frontnotifier/persistent/pulsar5_batch_completed] [pulsar5fbe8d7e4-44ee-4805-ad94-2a22829ef970] Current position 23868:-1 is ahead of last position 23454:288',
}
{
logLevel: 'WARN',
logThread: 'pulsar-msg-expiry-monitor-27-1',
logger: 'org.apache.pulsar.broker.service.persistent.PersistentTopic',
message: '[persistent://aks48we/batch_frontnotifier/pulsar5_batch_completed] Error while getting the oldest message',
stack_trace: 'java.lang.NullPointerException',
}
{
logLevel: 'WARN',
logThread: 'bookkeeper-ml-scheduler-OrderedScheduler-5-0',
logger: 'org.apache.bookkeeper.mledger.impl.ManagedLedgerImpl',
message: 'Cursor: ManagedCursorImpl{ledger=aks48we/batch/persistent/batch12_pulsar5_SEARCH2_19b5c04005f5aedc48fba25ffe1eeeed34f56ee5, name=pulsar.dedup, ackPos=23546:95, readPos=23546:96} does not exist in the managed-ledger.',
}
{
logLevel: 'WARN',
logThread: 'metadata-store-zk-session-watcher-7-1',
logger: 'org.apache.pulsar.metadata.impl.ZKSessionWatcher',
message: 'ZooKeeper client is disconnected. Waiting to reconnect, time remaining = 25.0 seconds',
}
{
logLevel: 'WARN',
logThread: 'main-SendThread(pulsar-zookeeper-zone2-0.pulsar-zookeeper.default.svc.cluster.local:2181)',
logger: 'org.apache.zookeeper.ClientCnxn',
message: 'Session 0x1000003fee00014 for sever pulsar-zookeeper-zone2-0.pulsar-zookeeper.default.svc.cluster.local/10.240.0.140:2181, Closing socket connection. Attempting reconnect except it is a SessionExpiredException.',
stack_trace: 'org.apache.zookeeper.ClientCnxn$SessionTimeoutException: Client session timed out, have not heard from server in 20000ms for session id 0x1000003fee00014
at org.apache.zookeeper.ClientCnxn$SendThread.run(ClientCnxn.java:1258)',
}
{
logLevel: 'WARN',
logThread: 'metadata-store-zk-session-watcher-7-1',
logger: 'org.apache.pulsar.metadata.impl.ZKSessionWatcher',
message: 'ZooKeeper client is disconnected. Waiting to reconnect, time remaining = 12.999 seconds',
}
...
A lot of closed zookeeper connections / sessions timeouts
...
{
logLevel: 'WARN',
logThread: 'pulsar-load-manager-1-1',
logger: 'org.apache.pulsar.broker.loadbalance.impl.ModularLoadManagerImpl',
message: 'Error writing broker data on metadata store',
stack_trace: 'java.util.concurrent.CompletionException: org.apache.pulsar.metadata.api.MetadataStoreException$BadVersionException: org.apache.zookeeper.KeeperException$BadVersionException: KeeperErrorCode = BadVersion for /loadbalance/brokers/pulsar-broker-1.pulsar-broker.default.svc.cluster.local:8080
at java.base/java.util.concurrent.CompletableFuture.encodeThrowable(CompletableFuture.java:331)
at java.base/java.util.concurrent.CompletableFuture.completeThrowable(CompletableFuture.java:346)
at java.base/java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:632)
at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:506)
at java.base/java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:2088)
at org.apache.pulsar.metadata.impl.ZKMetadataStore.lambda$storePut$19(ZKMetadataStore.java:252)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.pulsar.metadata.api.MetadataStoreException$BadVersionException: org.apache.zookeeper.KeeperException$BadVersionException: KeeperErrorCode = BadVersion for /loadbalance/brokers/pulsar-broker-1.pulsar-broker.default.svc.cluster.local:8080
at org.apache.pulsar.metadata.impl.ZKMetadataStore.getException(ZKMetadataStore.java:308)
... 5 more
Caused by: org.apache.zookeeper.KeeperException$BadVersionException: KeeperErrorCode = BadVersion for /loadbalance/brokers/pulsar-broker-1.pulsar-broker.default.svc.cluster.local:8080
at org.apache.zookeeper.KeeperException.create(KeeperException.java:122)
at org.apache.zookeeper.KeeperException.create(KeeperException.java:54)
at org.apache.pulsar.metadata.impl.ZKMetadataStore.getException(ZKMetadataStore.java:304)
... 5 more',
}
{
logLevel: 'ERROR',
logThread: 'metadata-store-zk-session-watcher-7-1',
logger: 'org.apache.pulsar.metadata.impl.ZKSessionWatcher',
message: 'ZooKeeper session reconnection timeout. Notifying session is lost.',
}
{
logLevel: 'WARN',
logThread: 'pulsar-load-manager-1-1',
logger: 'org.apache.pulsar.broker.loadbalance.impl.ModularLoadManagerImpl',
message: 'Error writing broker data on metadata store',
stack_trace: 'java.util.concurrent.CompletionException: org.apache.pulsar.metadata.api.MetadataStoreException$BadVersionException: org.apache.zookeeper.KeeperException$BadVersionException: KeeperErrorCode = BadVersion for /loadbalance/brokers/pulsar-broker-1.pulsar-broker.default.svc.cluster.local:8080
at java.base/java.util.concurrent.CompletableFuture.encodeThrowable(CompletableFuture.java:331)
at java.base/java.util.concurrent.CompletableFuture.completeThrowable(CompletableFuture.java:346)
at java.base/java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:632)
at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:506)
at java.base/java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:2088)
at org.apache.pulsar.metadata.impl.ZKMetadataStore.lambda$storePut$19(ZKMetadataStore.java:252)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.pulsar.metadata.api.MetadataStoreException$BadVersionException: org.apache.zookeeper.KeeperException$BadVersionException: KeeperErrorCode = BadVersion for /loadbalance/brokers/pulsar-broker-1.pulsar-broker.default.svc.cluster.local:8080
at org.apache.pulsar.metadata.impl.ZKMetadataStore.getException(ZKMetadataStore.java:308)
... 5 more
Caused by: org.apache.zookeeper.KeeperException$BadVersionException: KeeperErrorCode = BadVersion for /loadbalance/brokers/pulsar-broker-1.pulsar-broker.default.svc.cluster.local:8080
at org.apache.zookeeper.KeeperException.create(KeeperException.java:122)
at org.apache.zookeeper.KeeperException.create(KeeperException.java:54)
at org.apache.pulsar.metadata.impl.ZKMetadataStore.getException(ZKMetadataStore.java:304)
... 5 more',
}
Original Issue: apache/pulsar#13892
Describe the bug Pulsar version: 2.8.2 + branch-2.8 up to https://github.com/apache/pulsar/tree/36443112c86afc296780c180154f922d611ebf25
One of zookeepers died and was replaced in kubernetes. Now we constantly see following error in broker logs:
Before that, when one of zookeeper servers was dying I saw lots of errors like below, but that stopped showing after zookeeper was replaced.