OpenTSDB / opentsdb

A scalable, distributed Time Series Database.
http://opentsdb.net
GNU Lesser General Public License v2.1

Caused by: org.hbase.async.NonRecoverableException: Too many attempts #2268

Open mindflying opened 1 year ago

mindflying commented 1 year ago

description

Our company uses OpenTSDB to write data into HBase like below:

 deferred = tsdb.addPoint(tmetric.getMetric(), tmetric.getTimestamp(), value.longValue(), tags);

Sometimes it throws: Caused by: org.hbase.async.NonRecoverableException: Too many attempts: PutRequest(table="tsdb-caz", key=[3, 0, 0, 74, 47, 99, -24, 29, -16, 0, 0, 1, 0, 2, 68, 0, 0, 3, 0, 0, 37], family="t", qualifiers=[[-16, 0, -6, 1]], values=["\x01,"], timestamp=9223372036854775807, lockid=-1, durable=true, bufferable=true, attempt=11, region=null)

From the error content, it seems to be caused by the region being null, but I checked hbase:meta (using the hbase hbck command) and every region's status is OK. I can't locate the root cause. Has anyone come across the same issue, or can anyone give me a suggestion?
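For reference, a minimal sketch of how the failure can be surfaced per data point, assuming the standard net.opentsdb.core.TSDB API and the stumbleupon Deferred/Callback classes that OpenTSDB already ships with (the PointWriter wrapper and its names are just illustrative, not part of OpenTSDB):

import java.util.Map;

import com.stumbleupon.async.Callback;
import com.stumbleupon.async.Deferred;

import net.opentsdb.core.TSDB;

// Illustrative wrapper; the class and method names here are made up.
final class PointWriter {
  private final TSDB tsdb;

  PointWriter(final TSDB tsdb) {
    this.tsdb = tsdb;
  }

  Deferred<Object> write(final String metric, final long timestamp,
                         final long value, final Map<String, String> tags) {
    return tsdb.addPoint(metric, timestamp, value, tags)
        .addErrback(new Callback<Object, Exception>() {
          public Object call(final Exception e) {
            // NonRecoverableException ("Too many attempts") and
            // DeferredGroupException end up here; log the failing point so it
            // can be re-queued instead of being silently dropped.
            System.err.println("Failed to write " + metric + " @" + timestamp
                               + ": " + e.getMessage());
            return e;  // keep the Deferred in the error state for the caller
          }
          public String toString() {
            return "log failed addPoint";
          }
        });
  }
}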

error detail

Exception: com.stumbleupon.async.DeferredGroupException: At least one of the Deferreds failed, first exception:
    at com.stumbleupon.async.DeferredGroup.done(DeferredGroup.java:169)
    at com.stumbleupon.async.DeferredGroup.recordCompletion(DeferredGroup.java:158)
    at com.stumbleupon.async.DeferredGroup.access$200(DeferredGroup.java:36)
    at com.stumbleupon.async.DeferredGroup$1NotifyOrdered.call(DeferredGroup.java:97)
    at com.stumbleupon.async.Deferred.doCall(Deferred.java:1278)
    at com.stumbleupon.async.Deferred.runCallbacks(Deferred.java:1257)
    at com.stumbleupon.async.Deferred.access$300(Deferred.java:430)
    at com.stumbleupon.async.Deferred$Continue.call(Deferred.java:1366)
    at com.stumbleupon.async.Deferred.doCall(Deferred.java:1278)
    at com.stumbleupon.async.Deferred.runCallbacks(Deferred.java:1257)
    at com.stumbleupon.async.Deferred.access$300(Deferred.java:430)
    at com.stumbleupon.async.Deferred$Continue.call(Deferred.java:1366)
    at com.stumbleupon.async.Deferred.doCall(Deferred.java:1278)
    at com.stumbleupon.async.Deferred.runCallbacks(Deferred.java:1257)
    at com.stumbleupon.async.Deferred.callback(Deferred.java:1005)
    at org.hbase.async.HBaseRpc.callback(HBaseRpc.java:720)
    at org.hbase.async.RegionClient$1MultiActionCallback.call(RegionClient.java:892)
    at com.stumbleupon.async.Deferred.doCall(Deferred.java:1278)
    at com.stumbleupon.async.Deferred.runCallbacks(Deferred.java:1257)
    at com.stumbleupon.async.Deferred.callback(Deferred.java:1005)
    at org.hbase.async.HBaseRpc.callback(HBaseRpc.java:720)
    at org.hbase.async.RegionClient.decode(RegionClient.java:1575)
    at org.hbase.async.RegionClient.decode(RegionClient.java:88)
    at org.jboss.netty.handler.codec.replay.ReplayingDecoder.callDecode(ReplayingDecoder.java:500)
    at org.jboss.netty.handler.codec.replay.ReplayingDecoder.messageReceived(ReplayingDecoder.java:435)
    at org.jboss.netty.channel.SimpleChannelUpstreamHandler.handleUpstream(SimpleChannelUpstreamHandler.java:70)
    at org.hbase.async.RegionClient.handleUpstream(RegionClient.java:1230)
    at org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564)
    at org.jboss.netty.channel.DefaultChannelPipeline$DefaultChannelHandlerContext.sendUpstream(DefaultChannelPipeline.java:791)
    at org.jboss.netty.channel.SimpleChannelHandler.messageReceived(SimpleChannelHandler.java:142)
    at org.jboss.netty.channel.SimpleChannelHandler.handleUpstream(SimpleChannelHandler.java:88)
    at org.jboss.netty.handler.timeout.IdleStateAwareChannelHandler.handleUpstream(IdleStateAwareChannelHandler.java:36)
    at org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564)
    at org.jboss.netty.channel.DefaultChannelPipeline$DefaultChannelHandlerContext.sendUpstream(DefaultChannelPipeline.java:791)
    at org.jboss.netty.handler.timeout.IdleStateHandler.messageReceived(IdleStateHandler.java:294)
    at org.jboss.netty.channel.SimpleChannelUpstreamHandler.handleUpstream(SimpleChannelUpstreamHandler.java:70)
    at org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564)
    at org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:559)
    at org.hbase.async.HBaseClient$RegionClientPipeline.sendUpstream(HBaseClient.java:3857)
    at org.jboss.netty.channel.Channels.fireMessageReceived(Channels.java:268)
    at org.jboss.netty.channel.Channels.fireMessageReceived(Channels.java:255)
    at org.jboss.netty.channel.socket.nio.NioWorker.read(NioWorker.java:88)
    at org.jboss.netty.channel.socket.nio.AbstractNioWorker.process(AbstractNioWorker.java:108)
    at org.jboss.netty.channel.socket.nio.AbstractNioSelector.run(AbstractNioSelector.java:337)
    at org.jboss.netty.channel.socket.nio.AbstractNioWorker.run(AbstractNioWorker.java:89)
    at org.jboss.netty.channel.socket.nio.NioWorker.run(NioWorker.java:178)
    at org.jboss.netty.util.ThreadRenamingRunnable.run(ThreadRenamingRunnable.java:108)
    at org.jboss.netty.util.internal.DeadLockProofWorker$1.run(DeadLockProofWorker.java:42)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
Caused by: org.hbase.async.NonRecoverableException: Too many attempts: PutRequest(table="tsdb-caz", key=[3, 0, 0, 74, 47, 99, -24, 29, -16, 0, 0, 1, 0, 2, 68, 0, 0, 3, 0, 0, 37], family="t", qualifiers=[[-16, 0, -6, 1]], values=["\x01,"], timestamp=9223372036854775807, lockid=-1, durable=true, bufferable=true, attempt=11, region=null)
    at org.hbase.async.HBaseClient.tooManyAttempts(HBaseClient.java:2556)
    at org.hbase.async.HBaseClient.sendRpcToRegion(HBaseClient.java:2420)
    at org.hbase.async.HBaseClient$1RetryRpc.call(HBaseClient.java:2444)
    at org.hbase.async.HBaseClient$1RetryRpc.call(HBaseClient.java:2427)
    at com.stumbleupon.async.Deferred.doCall(Deferred.java:1278)
    at com.stumbleupon.async.Deferred.runCallbacks(Deferred.java:1257)
    at com.stumbleupon.async.Deferred.callback(Deferred.java:1005)
    at org.hbase.async.HBaseClient$1MetaScanCB.call(HBaseClient.java:2937)
    at org.hbase.async.HBaseClient$1MetaScanCB.call(HBaseClient.java:2930)
    at com.stumbleupon.async.Deferred.doCall(Deferred.java:1278)
    at com.stumbleupon.async.Deferred.runCallbacks(Deferred.java:1257)
    at com.stumbleupon.async.Deferred.handleContinuation(Deferred.java:1313)
    at com.stumbleupon.async.Deferred.doCall(Deferred.java:1284)
    at com.stumbleupon.async.Deferred.runCallbacks(Deferred.java:1257)
    at com.stumbleupon.async.Deferred.callback(Deferred.java:1005)
    at org.hbase.async.HBaseRpc.callback(HBaseRpc.java:720)
    at org.hbase.async.RegionClient.decode(RegionClient.java:1575)
    at org.hbase.async.RegionClient.decode(RegionClient.java:88)
    at org.jboss.netty.handler.codec.replay.ReplayingDecoder.callDecode(ReplayingDecoder.java:500)
    at org.jboss.netty.handler.codec.replay.ReplayingDecoder.messageReceived(ReplayingDecoder.java:485)
    ... 26 common frames omitted

analysis

It seems to be caused by this: the put request tries to look up the region for the given row key, but the region is still null even after many retries. The relevant asynchbase code is:

 Deferred<Object> sendRpcToRegion(final HBaseRpc request) {
    if (cannotRetryRequest(request)) {
      return tooManyAttempts(request, null);
    }
    request.attempt++;
    final byte[] table = request.table;
    final byte[] key = request.key;
    final RegionInfo region = getRegion(table, key);

    final class RetryRpc implements Callback<Deferred<Object>, Object> {
      public Deferred<Object> call(final Object arg) {
        if (arg instanceof NonRecoverableException) {
          // No point in retrying here, so fail the RPC.
          HBaseException e = (NonRecoverableException) arg;
          if (e instanceof HasFailedRpcException
              && ((HasFailedRpcException) e).getFailedRpc() != request) {
            // If we get here it's because a dependent RPC (such as a META
            // lookup) has failed.  Therefore the exception we're getting
            // indicates that the META lookup failed, but we need to return
            // to our caller here that it's their RPC that failed.  Here we
            // re-create the exception but with the correct RPC in argument.
            e = e.make(e, request);  // e is likely a PleaseThrottleException.
          }
          request.callback(e);
          return Deferred.fromError(e);
        }
        return sendRpcToRegion(request);  // Retry the RPC.
      }
      public String toString() {
        return "retry RPC";
      }
    }

    if (region != null) {
      if (knownToBeNSREd(region)) {
        final NotServingRegionException nsre =
          new NotServingRegionException("Region known to be unavailable",
                                        request);
        final Deferred<Object> d = request.getDeferred();
        handleNSRE(request, region.name(), nsre);
        return d;
      }
      final RegionClient client = clientFor(region);
      if (client != null && client.isAlive()) {
        request.setRegion(region);
        final Deferred<Object> d = request.getDeferred();
        client.sendRpc(request);
        return d;
      }
    }
    return locateRegion(request, table, key).addBothDeferring(new RetryRpc());
  }
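For reference, the cutoff that produces the "Too many attempts" message is a hard-coded attempt counter inside asynchbase's HBaseClient; paraphrased from the 1.8.x sources (the exact code may differ between versions), it looks roughly like the sketch below. Also note that in sendRpcToRegion() above, request.setRegion(region) is only called once a live RegionClient has been found, which would explain why the dumped PutRequest still shows region=null even though hbase:meta looks healthy: the client simply gave up while the region was still being re-located.

  // Paraphrased from asynchbase's HBaseClient (1.8.x line); not a verbatim copy.
  static boolean cannotRetryRequest(final HBaseRpc rpc) {
    return rpc.attempt > 10;  // hard-coded retry ceiling, hence "attempt=11" in the dump
  }

  static Deferred<Object> tooManyAttempts(final HBaseRpc request,
                                          final HBaseException cause) {
    final Exception e =
      new NonRecoverableException("Too many attempts: " + request, cause);
    request.callback(e);
    return Deferred.fromError(e);
  }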
deamerfire commented 1 year ago

I have received your email and will reply to you promptly. Thanks a lot. Wu Qiong

manolama commented 1 year ago

Yes, this can happen if the region is splitting and it takes more than a second or two for the new regions to update meta.
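If the same keys keep failing long after a split should have settled, one way to double-check what meta currently says is to ask the cluster which region serves the failing row key. The sketch below assumes the standard hbase-client library is available and goes through the regular HBase client, not the asynchbase client that OpenTSDB uses; the class name is made up.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.RegionLocator;

public final class LocateFailingKey {
  public static void main(final String[] args) throws Exception {
    final Configuration conf = HBaseConfiguration.create();  // reads hbase-site.xml
    // Row key copied from the failing PutRequest above.
    final byte[] key = {3, 0, 0, 74, 47, 99, -24, 29, -16, 0, 0, 1,
                        0, 2, 68, 0, 0, 3, 0, 0, 37};
    try (Connection conn = ConnectionFactory.createConnection(conf);
         RegionLocator locator =
             conn.getRegionLocator(TableName.valueOf("tsdb-caz"))) {
      // reload=true forces a fresh hbase:meta lookup instead of using a cached location.
      final HRegionLocation loc = locator.getRegionLocation(key, true);
      System.out.println("Key is currently served by " + loc.getServerName()
          + " in region " + loc.getRegionInfo().getRegionNameAsString());
    }
  }
}

If this returns an online region while the TSD is still failing, the meta table itself is probably fine and the failures are most likely the client racing the split, as described above.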