streamnative / pulsar-archived

Apache Pulsar - distributed pub-sub messaging system
https://pulsar.apache.org
Apache License 2.0
73 stars 25 forks source link

ISSUE-13892: Error writing broker data on metadata store KeeperErrorCode = BadVersion for /loadbalance/brokers/pulsar-broker-1.pulsar-broker.default.svc.cluster.local:8080 #3610

Open sijie opened 2 years ago

sijie commented 2 years ago

Original Issue: apache/pulsar#13892


Describe the bug Pulsar version: 2.8.2 + branch-2.8 up to https://github.com/apache/pulsar/tree/36443112c86afc296780c180154f922d611ebf25

One of zookeepers died and was replaced in kubernetes. Now we constantly see following error in broker logs:

{
  logLevel:    'WARN',
  logThread:   'pulsar-load-manager-1-1',
  logger:      'org.apache.pulsar.broker.loadbalance.impl.ModularLoadManagerImpl',
  message:     'Error writing broker data on metadata store',
  stack_trace: 'java.util.concurrent.CompletionException: org.apache.pulsar.metadata.api.MetadataStoreException$BadVersionException: org.apache.zookeeper.KeeperException$BadVersionException: KeeperErrorCode = BadVersion for /loadbalance/brokers/pulsar-broker-1.pulsar-broker.default.svc.cluster.local:8080
    at java.base/java.util.concurrent.CompletableFuture.encodeThrowable(CompletableFuture.java:331)
    at java.base/java.util.concurrent.CompletableFuture.completeThrowable(CompletableFuture.java:346)
    at java.base/java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:632)
    at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:506)
    at java.base/java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:2088)
    at org.apache.pulsar.metadata.impl.ZKMetadataStore.lambda$storePut$19(ZKMetadataStore.java:252)
    at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
    at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
    at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
    at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.pulsar.metadata.api.MetadataStoreException$BadVersionException: org.apache.zookeeper.KeeperException$BadVersionException: KeeperErrorCode = BadVersion for /loadbalance/brokers/pulsar-broker-1.pulsar-broker.default.svc.cluster.local:8080
    at org.apache.pulsar.metadata.impl.ZKMetadataStore.getException(ZKMetadataStore.java:308)
    ... 5 more
Caused by: org.apache.zookeeper.KeeperException$BadVersionException: KeeperErrorCode = BadVersion for /loadbalance/brokers/pulsar-broker-1.pulsar-broker.default.svc.cluster.local:8080
    at org.apache.zookeeper.KeeperException.create(KeeperException.java:122)
    at org.apache.zookeeper.KeeperException.create(KeeperException.java:54)
    at org.apache.pulsar.metadata.impl.ZKMetadataStore.getException(ZKMetadataStore.java:304)
    ... 5 more',
}

Before that, when one of zookeeper servers was dying I saw lots of errors like below, but that stopped showing after zookeeper was replaced.

{
  logLevel:  'WARN',
  logThread: 'Thread-4383',
  logger:    'org.apache.pulsar.broker.service.ServerCnx',
  message:   '[/10.240.0.193:35596][persistent://public/batch/batch12_pulsar5_SEARCH2_5ad8f2b7feebd125a22f05219306fefcb3623ef9][pulsar5] Failed to create consumer: consumerId=147, No such ledger exists on Metadata Server -  ledger=19028 - operation=Failed to open ledger',
}
{
  logLevel:  'WARN',
  logThread: 'metadata-store-6-1',
  logger:    'org.apache.pulsar.broker.service.ServerCnx',
  message:   '[/10.240.0.193:35596][persistent://public/batch/batch12_pulsar5_SEARCH2_45866a999bad07a4c754bf2e9813656578a12e98][pulsar5] Failed to create consumer: consumerId=152, Topic is temporarily unavailable',
}
{
  logLevel:  'WARN',
  logThread: 'BookieHighPriorityThread-3181-OrderedExecutor-4-0',
  logger:    'org.apache.bookkeeper.proto.ReadEntryProcessor',
  message:   'Ledger: 25316  fenced by: /10.240.0.177:54732',
}
{
  logLevel:  'WARN',
  logThread: 'bookkeeper-ml-scheduler-OrderedScheduler-1-0',
  logger:    'org.apache.pulsar.broker.service.ServerCnx',
  message:   '[/10.240.0.193:35596][persistent://public/batch/batch12_pulsar5_SEARCH2_45866a999bad07a4c754bf2e9813656578a12e98][pulsar5] Failed to create consumer: consumerId=152, Topic persistent://public/batch/batch12_pulsar5_SEARCH2_45866a999bad07a4c754bf2e9813656578a12e98 does not exist',
}
{
  logLevel:  'WARN',
  logThread: 'Thread-4412',
  logger:    'org.apache.bookkeeper.mledger.impl.ManagedCursorImpl',
  message:   '[aks48we/batch_frontnotifier/persistent/pulsar5_batch_completed] [pulsar5fbe8d7e4-44ee-4805-ad94-2a22829ef970] Current position 23868:-1 is ahead of last position 23454:288',
}
{
  logLevel:    'WARN',
  logThread:   'pulsar-msg-expiry-monitor-27-1',
  logger:      'org.apache.pulsar.broker.service.persistent.PersistentTopic',
  message:     '[persistent://aks48we/batch_frontnotifier/pulsar5_batch_completed] Error while getting the oldest message',
  stack_trace: 'java.lang.NullPointerException',
}
{
  logLevel:  'WARN',
  logThread: 'bookkeeper-ml-scheduler-OrderedScheduler-5-0',
  logger:    'org.apache.bookkeeper.mledger.impl.ManagedLedgerImpl',
  message:   'Cursor: ManagedCursorImpl{ledger=aks48we/batch/persistent/batch12_pulsar5_SEARCH2_19b5c04005f5aedc48fba25ffe1eeeed34f56ee5, name=pulsar.dedup, ackPos=23546:95, readPos=23546:96} does not exist in the managed-ledger.',
}
{
  logLevel:  'WARN',
  logThread: 'metadata-store-zk-session-watcher-7-1',
  logger:    'org.apache.pulsar.metadata.impl.ZKSessionWatcher',
  message:   'ZooKeeper client is disconnected. Waiting to reconnect, time remaining = 25.0 seconds',
}

{
  logLevel:    'WARN',
  logThread:   'main-SendThread(pulsar-zookeeper-zone2-0.pulsar-zookeeper.default.svc.cluster.local:2181)',
  logger:      'org.apache.zookeeper.ClientCnxn',
  message:     'Session 0x1000003fee00014 for sever pulsar-zookeeper-zone2-0.pulsar-zookeeper.default.svc.cluster.local/10.240.0.140:2181, Closing socket connection. Attempting reconnect except it is a SessionExpiredException.',
  stack_trace: 'org.apache.zookeeper.ClientCnxn$SessionTimeoutException: Client session timed out, have not heard from server in 20000ms for session id 0x1000003fee00014
    at org.apache.zookeeper.ClientCnxn$SendThread.run(ClientCnxn.java:1258)',
}

{
  logLevel:  'WARN',
  logThread: 'metadata-store-zk-session-watcher-7-1',
  logger:    'org.apache.pulsar.metadata.impl.ZKSessionWatcher',
  message:   'ZooKeeper client is disconnected. Waiting to reconnect, time remaining = 12.999 seconds',
}
...
A lot of closed zookeeper connections / sessions timeouts
...
{
  logLevel:    'WARN',
  logThread:   'pulsar-load-manager-1-1',
  logger:      'org.apache.pulsar.broker.loadbalance.impl.ModularLoadManagerImpl',
  message:     'Error writing broker data on metadata store',
  stack_trace: 'java.util.concurrent.CompletionException: org.apache.pulsar.metadata.api.MetadataStoreException$BadVersionException: org.apache.zookeeper.KeeperException$BadVersionException: KeeperErrorCode = BadVersion for /loadbalance/brokers/pulsar-broker-1.pulsar-broker.default.svc.cluster.local:8080
    at java.base/java.util.concurrent.CompletableFuture.encodeThrowable(CompletableFuture.java:331)
    at java.base/java.util.concurrent.CompletableFuture.completeThrowable(CompletableFuture.java:346)
    at java.base/java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:632)
    at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:506)
    at java.base/java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:2088)
    at org.apache.pulsar.metadata.impl.ZKMetadataStore.lambda$storePut$19(ZKMetadataStore.java:252)
    at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
    at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
    at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
    at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.pulsar.metadata.api.MetadataStoreException$BadVersionException: org.apache.zookeeper.KeeperException$BadVersionException: KeeperErrorCode = BadVersion for /loadbalance/brokers/pulsar-broker-1.pulsar-broker.default.svc.cluster.local:8080
    at org.apache.pulsar.metadata.impl.ZKMetadataStore.getException(ZKMetadataStore.java:308)
    ... 5 more
Caused by: org.apache.zookeeper.KeeperException$BadVersionException: KeeperErrorCode = BadVersion for /loadbalance/brokers/pulsar-broker-1.pulsar-broker.default.svc.cluster.local:8080
    at org.apache.zookeeper.KeeperException.create(KeeperException.java:122)
    at org.apache.zookeeper.KeeperException.create(KeeperException.java:54)
    at org.apache.pulsar.metadata.impl.ZKMetadataStore.getException(ZKMetadataStore.java:304)
    ... 5 more',
}
{
  logLevel:  'ERROR',
  logThread: 'metadata-store-zk-session-watcher-7-1',
  logger:    'org.apache.pulsar.metadata.impl.ZKSessionWatcher',
  message:   'ZooKeeper session reconnection timeout. Notifying session is lost.',
}
{
  logLevel:    'WARN',
  logThread:   'pulsar-load-manager-1-1',
  logger:      'org.apache.pulsar.broker.loadbalance.impl.ModularLoadManagerImpl',
  message:     'Error writing broker data on metadata store',
  stack_trace: 'java.util.concurrent.CompletionException: org.apache.pulsar.metadata.api.MetadataStoreException$BadVersionException: org.apache.zookeeper.KeeperException$BadVersionException: KeeperErrorCode = BadVersion for /loadbalance/brokers/pulsar-broker-1.pulsar-broker.default.svc.cluster.local:8080
    at java.base/java.util.concurrent.CompletableFuture.encodeThrowable(CompletableFuture.java:331)
    at java.base/java.util.concurrent.CompletableFuture.completeThrowable(CompletableFuture.java:346)
    at java.base/java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:632)
    at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:506)
    at java.base/java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:2088)
    at org.apache.pulsar.metadata.impl.ZKMetadataStore.lambda$storePut$19(ZKMetadataStore.java:252)
    at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
    at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
    at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
    at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.pulsar.metadata.api.MetadataStoreException$BadVersionException: org.apache.zookeeper.KeeperException$BadVersionException: KeeperErrorCode = BadVersion for /loadbalance/brokers/pulsar-broker-1.pulsar-broker.default.svc.cluster.local:8080
    at org.apache.pulsar.metadata.impl.ZKMetadataStore.getException(ZKMetadataStore.java:308)
    ... 5 more
Caused by: org.apache.zookeeper.KeeperException$BadVersionException: KeeperErrorCode = BadVersion for /loadbalance/brokers/pulsar-broker-1.pulsar-broker.default.svc.cluster.local:8080
    at org.apache.zookeeper.KeeperException.create(KeeperException.java:122)
    at org.apache.zookeeper.KeeperException.create(KeeperException.java:54)
    at org.apache.pulsar.metadata.impl.ZKMetadataStore.getException(ZKMetadataStore.java:304)
    ... 5 more',
}
github-actions[bot] commented 2 years ago

The issue had no activity for 30 days, mark with Stale label.