quarkiverse / quarkus-artemis

Quarkus Artemis extensions
Apache License 2.0
13 stars 12 forks source link

RA health check is always UP, even if target broker is down #591

Open vsevel opened 1 month ago

vsevel commented 1 month ago

I am using quarkus artemis 3.4.1 with the new io.quarkus.artemis.jms.ra.runtime.ConnectionFactoryHealthCheck. I start my app with the hornetq cluster running, and check I can send and receive messages. I check that the Artemis JMS Resource Adaptor health check is UP I shut down all hornetq nodes, and start seeing exceptions such as:

IJ000604: Throwable while attempting to get a new connection: null

jakarta.resource.ResourceException: Failed to create session factory
    at org.apache.activemq.artemis.ra.ActiveMQRAManagedConnection.setup(ActiveMQRAManagedConnection.java:725)
    at org.apache.activemq.artemis.ra.ActiveMQRAManagedConnection.<init>(ActiveMQRAManagedConnection.java:161)
    at org.apache.activemq.artemis.ra.ActiveMQRAManagedConnectionFactory.createManagedConnection(ActiveMQRAManagedConnectionFactory.java:144)
    at org.jboss.jca.core.connectionmanager.pool.mcp.SemaphoreArrayListManagedConnectionPool.createConnectionEventListener(SemaphoreArrayListManagedConnectionPool.java:1267)
    at org.jboss.jca.core.connectionmanager.pool.mcp.SemaphoreArrayListManagedConnectionPool.getConnection(SemaphoreArrayListManagedConnectionPool.java:495)
    at org.jboss.jca.core.connectionmanager.pool.AbstractPool.getTransactionNewConnection(AbstractPool.java:770)
    at org.jboss.jca.core.connectionmanager.pool.AbstractPool.getConnection(AbstractPool.java:666)
    at org.jboss.jca.core.connectionmanager.AbstractConnectionManager.getManagedConnection(AbstractConnectionManager.java:624)
    at org.jboss.jca.core.connectionmanager.tx.TxConnectionManagerImpl.getManagedConnection(TxConnectionManagerImpl.java:440)
    at org.jboss.jca.core.connectionmanager.AbstractConnectionManager.allocateConnection(AbstractConnectionManager.java:789)
    at org.apache.activemq.artemis.ra.ActiveMQRASessionFactoryImpl.allocateConnection(ActiveMQRASessionFactoryImpl.java:792)
    at org.apache.activemq.artemis.ra.ActiveMQRASessionFactoryImpl.createSession(ActiveMQRASessionFactoryImpl.java:482)
    at org.apache.activemq.artemis.ra.ActiveMQRASessionFactoryImpl.createSession(ActiveMQRASessionFactoryImpl.java:673)
    at org.apache.activemq.artemis.ra.ActiveMQRASessionFactoryImpl.createSession(ActiveMQRASessionFactoryImpl.java:678)
    at org.apache.activemq.artemis.ra.ActiveMQRAConnectionFactoryImpl.validateUser(ActiveMQRAConnectionFactoryImpl.java:422)
    at org.apache.activemq.artemis.ra.ActiveMQRAConnectionFactoryImpl.createContext(ActiveMQRAConnectionFactoryImpl.java:379)
    at org.apache.activemq.artemis.ra.ActiveMQRAConnectionFactoryImpl.createContext(ActiveMQRAConnectionFactoryImpl.java:369)
    at org.apache.activemq.artemis.ra.ActiveMQRAConnectionFactoryImpl.createContext(ActiveMQRAConnectionFactoryImpl.java:364)
    at com.x.quarkus.MyJMSProducer.send(MyJMSProducer.java:52)
    at com.x.quarkus.MyJMSProducer_Subclass.send$$superforward(Unknown Source)
    at com.x.quarkus.MyJMSProducer_Subclass$$function$$2.apply(Unknown Source)
    at io.quarkus.arc.impl.AroundInvokeInvocationContext.proceed(AroundInvokeInvocationContext.java:73)
    at io.quarkus.arc.impl.AroundInvokeInvocationContext.proceed(AroundInvokeInvocationContext.java:62)
    at io.quarkus.narayana.jta.runtime.interceptor.TransactionalInterceptorBase.invokeInOurTx(TransactionalInterceptorBase.java:136)
    at io.quarkus.narayana.jta.runtime.interceptor.TransactionalInterceptorBase.invokeInOurTx(TransactionalInterceptorBase.java:107)
    at io.quarkus.narayana.jta.runtime.interceptor.TransactionalInterceptorRequired.doIntercept(TransactionalInterceptorRequired.java:38)
    at io.quarkus.narayana.jta.runtime.interceptor.TransactionalInterceptorBase.intercept(TransactionalInterceptorBase.java:61)
    at io.quarkus.narayana.jta.runtime.interceptor.TransactionalInterceptorRequired.intercept(TransactionalInterceptorRequired.java:32)
    at io.quarkus.narayana.jta.runtime.interceptor.TransactionalInterceptorRequired_Bean.intercept(Unknown Source)
    at io.quarkus.arc.impl.InterceptorInvocation.invoke(InterceptorInvocation.java:42)
    at io.quarkus.arc.impl.AroundInvokeInvocationContext.perform(AroundInvokeInvocationContext.java:30)
    at io.quarkus.arc.impl.InvocationContexts.performAroundInvoke(InvocationContexts.java:27)
    at com.x.quarkus.MyJMSProducer_Subclass.send(Unknown Source)
    at com.x.quarkus.MyJMSProducer.produceOneMessage(MyJMSProducer.java:45)
    at com.x.quarkus.MyJMSProducer_Subclass.produceOneMessage$$superforward(Unknown Source)
    at com.x.quarkus.MyJMSProducer_Subclass$$function$$1.apply(Unknown Source)
    at io.quarkus.arc.impl.AroundInvokeInvocationContext.proceed(AroundInvokeInvocationContext.java:73)
    at io.quarkus.arc.impl.AroundInvokeInvocationContext.proceed(AroundInvokeInvocationContext.java:62)
    at io.quarkus.arc.impl.InterceptorInvocation.invoke(InterceptorInvocation.java:42)
    at io.quarkus.arc.impl.AroundInvokeInvocationContext.perform(AroundInvokeInvocationContext.java:30)
    at io.quarkus.arc.impl.InvocationContexts.performAroundInvoke(InvocationContexts.java:27)
    at com.x.quarkus.MyJMSProducer_Subclass.produceOneMessage(Unknown Source)
    at com.x.quarkus.MyJMSProducer_ClientProxy.produceOneMessage(Unknown Source)
    at com.x.quarkus.MyJMSProducer_ScheduledInvoker_produceOneMessage_32bc737e9e482a38cc2b6a3ccc5169371526db3c.invokeBean(Unknown Source)
    at io.quarkus.scheduler.common.runtime.DefaultInvoker.invoke(DefaultInvoker.java:24)
    at io.quarkus.scheduler.common.runtime.StatusEmitterInvoker.invoke(StatusEmitterInvoker.java:35)
    at io.quarkus.scheduler.common.runtime.SkipPredicateInvoker.invoke(SkipPredicateInvoker.java:43)
    at io.quarkus.scheduler.runtime.SimpleScheduler$ScheduledTask.doInvoke(SimpleScheduler.java:463)
    at io.quarkus.scheduler.runtime.SimpleScheduler$ScheduledTask$2.call(SimpleScheduler.java:445)
    at io.quarkus.scheduler.runtime.SimpleScheduler$ScheduledTask$2.call(SimpleScheduler.java:442)
    at io.vertx.core.impl.ContextImpl.lambda$executeBlocking$0(ContextImpl.java:178)
    at io.vertx.core.impl.ContextInternal.dispatch(ContextInternal.java:279)
    at io.vertx.core.impl.ContextImpl.lambda$internalExecuteBlocking$2(ContextImpl.java:210)
    at io.quarkus.vertx.core.runtime.VertxCoreRecorder$14.runWith(VertxCoreRecorder.java:640)
    at org.jboss.threads.EnhancedQueueExecutor$Task.doRunWith(EnhancedQueueExecutor.java:2516)
    at org.jboss.threads.EnhancedQueueExecutor$Task.run(EnhancedQueueExecutor.java:2495)
    at org.jboss.threads.EnhancedQueueExecutor$ThreadBody.run(EnhancedQueueExecutor.java:1495)
    at org.jboss.threads.DelegatingRunnable.run(DelegatingRunnable.java:11)
    at org.jboss.threads.ThreadLocalResettingRunnable.run(ThreadLocalResettingRunnable.java:11)
    at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
    at java.base/java.lang.Thread.run(Thread.java:1583)
Caused by: jakarta.jms.JMSException: Failed to create session factory
    at org.apache.activemq.artemis.jms.client.ActiveMQConnectionFactory.createConnectionInternal(ActiveMQConnectionFactory.java:912)
    at org.apache.activemq.artemis.jms.client.ActiveMQConnectionFactory.createXAConnection(ActiveMQConnectionFactory.java:384)
    at org.apache.activemq.artemis.ra.ActiveMQRAManagedConnection.setup(ActiveMQRAManagedConnection.java:713)
    ... 62 more
Caused by: ActiveMQNotConnectedException[errorType=NOT_CONNECTED message=AMQ219007: Cannot connect to server(s). Tried with all available servers.]
    at org.apache.activemq.artemis.core.client.impl.ServerLocatorImpl.createSessionFactory(ServerLocatorImpl.java:729)
    at org.apache.activemq.artemis.jms.client.ActiveMQConnectionFactory.createConnectionInternal(ActiveMQConnectionFactory.java:910)
    ... 64 more

and yet, the readiness check is still UP:

$ curl -s localhost:9000/q/health/ready | grep -A 5 -B 5 -i artemis
{
    "status": "UP",
    "checks": [
        {
            "name": "Artemis JMS Resource Adaptor health check",
            "status": "UP",
            "data": {
                "<default>": "UP"
            }
        },

the issue is that createConnection() is always returning a connection even if no broker node is up:

closed = false
reference = null
userName = null
password = null
clientID = null
type = 0
started = false
mcf = {ActiveMQRAManagedConnectionFactory@15476} 
tsr = {NarayanaJtaProducers_ProducerMethod_transactionSynchronizationRegistry_bBrh9sABV2TvKl0YN2UrqpBUvbU_ClientProxy@15477} "io.quarkus.narayana.jta.runtime.internal.tsr.TransactionSynchronizationRegistryWrapper@47e01c97"
cm = {TxConnectionManagerImpl@15478} 
sessions = {HashSet@15479}  size = 0
tempQueues = {HashSet@15480}  size = 0
tempTopics = {HashSet@15481}  size = 0
allowLocalTransaction = false
refCounter = {ReferenceCounterUtil@15482} 
threadAwareContext = {ThreadAwareContext@15483} 

I tried using Session session = ignored.createSession() in the health check, but the call never returns (might be missing a read timeout).

expectation: the health check should always return a relevant status immediately (no blocking waiting for a response from the check).

cc @zhfeng

zhfeng commented 1 month ago

Thanks @vsevel - Does this still happen if there is only one backend node ? (No cluster setting)

vsevel commented 1 month ago

I do not know. and I do not have that particular setup available to test easily.

zhfeng commented 1 month ago

Is it possible to set userName and password ? It seems that it can validate user when creating ConnectionFactory. see here

vsevel commented 1 month ago

where? in the health check?

I was wondering if an other approach could be to check how many connections are in the pool. if greater than 0, and min connection is greater than 0 as well, this could be a minimal relevant check. in SR reactive messaging there are different checks, depending if you configure an admin client to connect on the server.

zhfeng commented 1 month ago

where? in the health check?

Yeah.

I was wondering if an other approach could be to check how many connections are in the pool. if greater than 0, and min connection is greater than 0 as well, this could be a minimal relevant check.

I'm not a expert of JCA, maybe @gastaldi can answer it.

vsevel commented 1 month ago

Is it possible to set userName and password ? It seems that it can validate user when creating ConnectionFactory

I tried. it does attempt to create a session, which results in the call being blocked forever, if broker is down when I start the app. does not seem encouraging. if I start the broker, start the app, then kill the broker, the health check continues to report UP for a while, then blocks for 90 seconds, but still report UP??

vsevel commented 1 month ago

something like this for instance:

public class MyJMSHealthCheck implements HealthCheck {

    private final Set<String> connectionFactoryNames;

    private final Instance<IronJacamarContainer> containers;

    public MyJMSHealthCheck(IronJacamarBuildtimeConfig buildTimeConfigs, @Any Instance<IronJacamarContainer> containers) {
        this.connectionFactoryNames = buildTimeConfigs.resourceAdapters().keySet();
        this.containers = containers;
    }

    public HealthCheckResponse call() {

        HealthCheckResponseBuilder builder = HealthCheckResponse.named("Artemis JMS Resource Adaptor health check").up();

        for (String name : connectionFactoryNames) {
            Annotation identifier = ArtemisUtil.toIdentifier(name);
            IronJacamarContainer container = containers.select(identifier).get();
            PoolStatistics stats = container.getConnectionManager().getPool().getStatistics();
            int activeCount = stats.getActiveCount();
            String s = String.format(
                    "avail=%s active=%s created=%s idel=%s destroyed=%s in-use=%s wait=%s, blocking-failure=%s",
                    stats.getAvailableCount(), activeCount, stats.getCreatedCount(), stats.getIdleCount(),
                    stats.getDestroyedCount(), stats.getInUseCount(), stats.getWaitCount(), stats.getBlockingFailureCount());
            builder.withData(name, s);
            if (activeCount == 0) {
                builder.down();
            }
        }

        return builder.build();
    }
}

what do you think @gastaldi ?

gastaldi commented 1 month ago

@vsevel that's an interesting approach, but bear in mind that inbound communication doesn't use the pool (at least in Artemis, but the spec doesn't mandate that - https://issues.apache.org/jira/browse/ARTEMIS-4429), so the active count will always return zero when using only MessageListeners in your application

FYI we also publish the same metrics in https://github.com/quarkiverse/quarkus-ironjacamar/blob/main/runtime/src/main/java/io/quarkiverse/ironjacamar/runtime/metrics/IronJacamarMetricsRecorder.java

vsevel commented 1 month ago

bear in mind that inbound communication doesn't use the pool

ah yes right. it reminds me of a conversation.

we also publish the same metrics

yes. that is where I got my inspiration from.

the active count will always return zero when using only MessageListeners in your application

so I do not see a viable solution at this point @zhfeng . getting a connection and a session may be a valid approach but only if we can exercise it, and as long as we have a short read and connect timeout.

zhfeng commented 1 month ago

@vsevel can you try to add such connect-timeout-millis=3000 setting with the quarkus.ironjacamar.ra.config.connection-parameters to see if it works?

vsevel commented 1 month ago

I tried connection-parameters: host=x;port=x;protocols=CORE;connect-timeout-millis=10000,host=x;port=x;x=CORE;connect-timeout-millis=10000

and

    public HealthCheckResponse call() {
        HealthCheckResponseBuilder builder = HealthCheckResponse.named("Artemis JMS Resource Adaptor health check").up();
        for (String name : connectionFactoryNames) {
            Annotation identifier = ArtemisUtil.toIdentifier(name);
            try (
                    Connection conn = connectionFactories.select(identifier).get().createConnection();
                    Session session = conn.createSession()) {
                builder.withData(name, "UP");
            } catch (Exception e) {
                builder.withData(name, "DOWN").down();
            }
        }
        return builder.build();
    }

it blocked for a long time (I did not wait).

I tried:

            try (Connection conn = connectionFactories.select(identifier).get().createConnection("x", "x")) {

and it blocked for 2 mins 20 secs.

vsevel commented 1 month ago

bear in mind that inbound communication doesn't use the pool

@gastaldi I tried setting a min-size for the pool, but even with the following config, the active connection count stays at 0 in my app without an active producer. I was expecting a connection to be created,

...
  ironjacamar:
    ra:
      kind: artemis
      config:
        connection-parameters: host=x;port=x;protocols=CORE;connect-timeout-millis=10000,host=x;port=x;protocols=CORE;connect-timeout-millis=10000
        protocol-manager-factory: org.apache.activemq.artemis.core.protocol.hornetq.client.HornetQClientProtocolManagerFactory
        user: x
        password: x
      cm:
        pool:
          config:
            min-size: 1
            initial-size: 1
            prefill: true
            strict-min: true
...
turing85 commented 1 month ago

All of this gets me thinking... is this only a symptom of a deeper issue? If the connection blocks indefinitely for the health check... does it also block indefinitely for a "regular" connection?

vsevel commented 1 month ago

this is the stack when creating a connection from an active producer:

createManagedConnection:144, ActiveMQRAManagedConnectionFactory (org.apache.activemq.artemis.ra)
createConnectionEventListener:1267, SemaphoreArrayListManagedConnectionPool (org.jboss.jca.core.connectionmanager.pool.mcp)
getConnection:495, SemaphoreArrayListManagedConnectionPool (org.jboss.jca.core.connectionmanager.pool.mcp)
getTransactionNewConnection:770, AbstractPool (org.jboss.jca.core.connectionmanager.pool)
getConnection:666, AbstractPool (org.jboss.jca.core.connectionmanager.pool)
getManagedConnection:624, AbstractConnectionManager (org.jboss.jca.core.connectionmanager)
getManagedConnection:440, TxConnectionManagerImpl (org.jboss.jca.core.connectionmanager.tx)
allocateConnection:789, AbstractConnectionManager (org.jboss.jca.core.connectionmanager)
allocateConnection:792, ActiveMQRASessionFactoryImpl (org.apache.activemq.artemis.ra)
createSession:482, ActiveMQRASessionFactoryImpl (org.apache.activemq.artemis.ra)
createSession:673, ActiveMQRASessionFactoryImpl (org.apache.activemq.artemis.ra)
createSession:678, ActiveMQRASessionFactoryImpl (org.apache.activemq.artemis.ra)
validateUser:422, ActiveMQRAConnectionFactoryImpl (org.apache.activemq.artemis.ra)
createContext:379, ActiveMQRAConnectionFactoryImpl (org.apache.activemq.artemis.ra)
createContext:369, ActiveMQRAConnectionFactoryImpl (org.apache.activemq.artemis.ra)
createContext:364, ActiveMQRAConnectionFactoryImpl (org.apache.activemq.artemis.ra)
send:52, MyJMSProducer

if there is no reachable broker, the creation fails in less than 10 secs.

if in the health check I execute try (JMSContext ctx = connectionFactories.select(identifier).get().createContext()) {, this fails in approx 2 minutes with stack trace:

health get context error

jakarta.jms.JMSRuntimeException: Failed to create session factory
    at org.apache.activemq.artemis.ra.ActiveMQRAConnectionFactoryImpl.createContext(ActiveMQRAConnectionFactoryImpl.java:385)
    at org.apache.activemq.artemis.ra.ActiveMQRAConnectionFactoryImpl.createContext(ActiveMQRAConnectionFactoryImpl.java:369)
    at org.apache.activemq.artemis.ra.ActiveMQRAConnectionFactoryImpl.createContext(ActiveMQRAConnectionFactoryImpl.java:364)
    at com.lodh.arte.bank.quarkus.MyJMSHealthCheck.call(MyJMSHealthCheck.java:49)
    at com.lodh.arte.bank.quarkus.MyJMSHealthCheck_ClientProxy.call(Unknown Source)
    at io.smallrye.context.impl.wrappers.SlowContextualSupplier.get(SlowContextualSupplier.java:21)
    at io.smallrye.mutiny.operators.uni.builders.UniCreateFromItemSupplier.subscribe(UniCreateFromItemSupplier.java:28)
    at io.smallrye.mutiny.operators.AbstractUni.subscribe(AbstractUni.java:36)
    at io.smallrye.mutiny.operators.uni.UniOnFailureFlatMap.subscribe(UniOnFailureFlatMap.java:31)
    at io.smallrye.mutiny.operators.AbstractUni.subscribe(AbstractUni.java:36)
    at io.smallrye.mutiny.operators.uni.UniOnItemTransform.subscribe(UniOnItemTransform.java:22)
    at io.smallrye.mutiny.operators.AbstractUni.subscribe(AbstractUni.java:36)
    at io.smallrye.mutiny.operators.uni.UniRunSubscribeOn.lambda$subscribe$0(UniRunSubscribeOn.java:27)
    at io.quarkus.smallrye.health.runtime.QuarkusAsyncHealthCheckFactory$1$1.call(QuarkusAsyncHealthCheckFactory.java:42)
    at io.quarkus.smallrye.health.runtime.QuarkusAsyncHealthCheckFactory$1$1.call(QuarkusAsyncHealthCheckFactory.java:39)
    at io.vertx.core.impl.ContextImpl.lambda$executeBlocking$0(ContextImpl.java:178)
    at io.vertx.core.impl.ContextInternal.dispatch(ContextInternal.java:279)
    at io.vertx.core.impl.ContextImpl.lambda$internalExecuteBlocking$2(ContextImpl.java:210)
    at io.quarkus.vertx.core.runtime.VertxCoreRecorder$14.runWith(VertxCoreRecorder.java:635)
    at org.jboss.threads.EnhancedQueueExecutor$Task.doRunWith(EnhancedQueueExecutor.java:2516)
    at org.jboss.threads.EnhancedQueueExecutor$Task.run(EnhancedQueueExecutor.java:2495)
    at org.jboss.threads.EnhancedQueueExecutor$ThreadBody.run(EnhancedQueueExecutor.java:1495)
    at org.jboss.threads.DelegatingRunnable.run(DelegatingRunnable.java:11)
    at org.jboss.threads.ThreadLocalResettingRunnable.run(ThreadLocalResettingRunnable.java:11)
    at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
    at java.base/java.lang.Thread.run(Thread.java:1583)
Caused by: jakarta.jms.JMSException: Failed to create session factory
    at org.apache.activemq.artemis.jms.client.ActiveMQConnectionFactory.createConnectionInternal(ActiveMQConnectionFactory.java:912)
    at org.apache.activemq.artemis.jms.client.ActiveMQConnectionFactory.createXAConnection(ActiveMQConnectionFactory.java:384)
    at org.apache.activemq.artemis.ra.ActiveMQRAManagedConnection.setup(ActiveMQRAManagedConnection.java:713)
    at org.apache.activemq.artemis.ra.ActiveMQRAManagedConnection.<init>(ActiveMQRAManagedConnection.java:161)
    at org.apache.activemq.artemis.ra.ActiveMQRAManagedConnectionFactory.createManagedConnection(ActiveMQRAManagedConnectionFactory.java:144)
    at org.jboss.jca.core.connectionmanager.pool.mcp.SemaphoreArrayListManagedConnectionPool.createConnectionEventListener(SemaphoreArrayListManagedConnectionPool.java:1267)
    at org.jboss.jca.core.connectionmanager.pool.mcp.SemaphoreArrayListManagedConnectionPool.getConnection(SemaphoreArrayListManagedConnectionPool.java:495)
    at org.jboss.jca.core.connectionmanager.pool.AbstractPool.getSimpleConnection(AbstractPool.java:688)
    at org.jboss.jca.core.connectionmanager.pool.AbstractPool.getConnection(AbstractPool.java:653)
    at org.jboss.jca.core.connectionmanager.AbstractConnectionManager.getManagedConnection(AbstractConnectionManager.java:660)
    at org.jboss.jca.core.connectionmanager.tx.TxConnectionManagerImpl.getManagedConnection(TxConnectionManagerImpl.java:440)
    at org.jboss.jca.core.connectionmanager.AbstractConnectionManager.allocateConnection(AbstractConnectionManager.java:789)
    at org.apache.activemq.artemis.ra.ActiveMQRASessionFactoryImpl.allocateConnection(ActiveMQRASessionFactoryImpl.java:792)
    at org.apache.activemq.artemis.ra.ActiveMQRASessionFactoryImpl.createSession(ActiveMQRASessionFactoryImpl.java:482)
    at org.apache.activemq.artemis.ra.ActiveMQRASessionFactoryImpl.createSession(ActiveMQRASessionFactoryImpl.java:673)
    at org.apache.activemq.artemis.ra.ActiveMQRASessionFactoryImpl.createSession(ActiveMQRASessionFactoryImpl.java:678)
    at org.apache.activemq.artemis.ra.ActiveMQRAConnectionFactoryImpl.validateUser(ActiveMQRAConnectionFactoryImpl.java:422)
    at org.apache.activemq.artemis.ra.ActiveMQRAConnectionFactoryImpl.createContext(ActiveMQRAConnectionFactoryImpl.java:379)
    ... 25 more
Caused by: ActiveMQNotConnectedException[errorType=NOT_CONNECTED message=AMQ219007: Cannot connect to server(s). Tried with all available servers.]
    at org.apache.activemq.artemis.core.client.impl.ServerLocatorImpl.createSessionFactory(ServerLocatorImpl.java:729)
    at org.apache.activemq.artemis.jms.client.ActiveMQConnectionFactory.createConnectionInternal(ActiveMQConnectionFactory.java:910)
    ... 42 more

the difference of behavior is really awkward.

vsevel commented 1 month ago

hmm. I get consistent results now. I compare a rest end point sending a jms message and the health check. the hosts exist, but nothing is listening on the ports. both code fail in 40 secs. if I put hosts that do not exist, I get a failure with both codes in 20 secs. I think a health check should fail in a shorter time, but that is a different issue, and at least we have consistent behavior now.

zhfeng commented 1 month ago

Hmm, how about the network? is there any firewall between the application and the hosts? are you using the hostname or the ip address?

vsevel commented 1 month ago

I can't get a reliable check. if I start with the broker up, then I shut it down, even with Connection conn = connectionFactories.select(identifier).get().createConnection(); Session session = conn.createSession(); I get no exception.

vsevel commented 1 month ago

with code to create a temporary queue, I can get it to fail reliably in 60 secs with exception:

            try (JMSContext ctx = connectionFactories.select(identifier).get().createContext()) {
                TemporaryQueue temporaryQueue = ctx.createTemporaryQueue();
                temporaryQueue.delete();
                status.put(name, UP);
            } catch (Exception e) {
                status.put(name, DOWN);
            }
health get context error

jakarta.jms.JMSRuntimeException: AMQ219014: Timed out after waiting 30000 ms for response when sending packet 34
    at org.apache.activemq.artemis.jms.client.JmsExceptionUtils.convertToRuntimeException(JmsExceptionUtils.java:88)
    at org.apache.activemq.artemis.jms.client.ActiveMQJMSContext.createTemporaryQueue(ActiveMQJMSContext.java:526)
    at com.lodh.arte.bank.quarkus.MyJMSHealthCheck.check(MyJMSHealthCheck.java:59)
    at com.lodh.arte.bank.quarkus.MyJMSHealthCheck_Subclass.check$$superforward(Unknown Source)
    at com.lodh.arte.bank.quarkus.MyJMSHealthCheck_Subclass$$function$$1.apply(Unknown Source)
    at io.quarkus.arc.impl.AroundInvokeInvocationContext.proceed(AroundInvokeInvocationContext.java:73)
    at io.quarkus.arc.impl.AroundInvokeInvocationContext.proceed(AroundInvokeInvocationContext.java:62)
    at com.lodh.util.timer.TimerCorrelationIdInterceptor.around(TimerCorrelationIdInterceptor.java:34)
    at com.lodh.util.timer.TimerCorrelationIdInterceptor_Bean.intercept(Unknown Source)
    at io.quarkus.arc.impl.InterceptorInvocation.invoke(InterceptorInvocation.java:42)
    at io.quarkus.arc.impl.AroundInvokeInvocationContext.perform(AroundInvokeInvocationContext.java:30)
    at io.quarkus.arc.impl.InvocationContexts.performAroundInvoke(InvocationContexts.java:27)
    at com.lodh.arte.bank.quarkus.MyJMSHealthCheck_Subclass.check(Unknown Source)
    at com.lodh.arte.bank.quarkus.MyJMSHealthCheck_ClientProxy.check(Unknown Source)
    at com.lodh.arte.bank.quarkus.MyJMSHealthCheck_ScheduledInvoker_check_0aaf5d0517498a7a6fb26e6330e8ea0e220dc9bd.invokeBean(Unknown Source)
    at io.quarkus.scheduler.common.runtime.DefaultInvoker.invoke(DefaultInvoker.java:24)
    at io.quarkus.scheduler.common.runtime.StatusEmitterInvoker.invoke(StatusEmitterInvoker.java:35)
    at io.quarkus.scheduler.common.runtime.SkipConcurrentExecutionInvoker.invoke(SkipConcurrentExecutionInvoker.java:37)
    at io.quarkus.scheduler.runtime.SimpleScheduler$ScheduledTask.doInvoke(SimpleScheduler.java:463)
    at io.quarkus.scheduler.runtime.SimpleScheduler$ScheduledTask$2.call(SimpleScheduler.java:445)
    at io.quarkus.scheduler.runtime.SimpleScheduler$ScheduledTask$2.call(SimpleScheduler.java:442)
    at io.vertx.core.impl.ContextImpl.lambda$executeBlocking$0(ContextImpl.java:178)
    at io.vertx.core.impl.ContextInternal.dispatch(ContextInternal.java:279)
    at io.vertx.core.impl.ContextImpl.lambda$internalExecuteBlocking$2(ContextImpl.java:210)
    at io.quarkus.vertx.core.runtime.VertxCoreRecorder$14.runWith(VertxCoreRecorder.java:640)
    at org.jboss.threads.EnhancedQueueExecutor$Task.doRunWith(EnhancedQueueExecutor.java:2516)
    at org.jboss.threads.EnhancedQueueExecutor$Task.run(EnhancedQueueExecutor.java:2495)
    at org.jboss.threads.EnhancedQueueExecutor$ThreadBody.run(EnhancedQueueExecutor.java:1521)
    at org.jboss.threads.DelegatingRunnable.run(DelegatingRunnable.java:11)
    at org.jboss.threads.ThreadLocalResettingRunnable.run(ThreadLocalResettingRunnable.java:11)
    at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
    at java.base/java.lang.Thread.run(Thread.java:1583)
Caused by: jakarta.jms.JMSException: AMQ219014: Timed out after waiting 30000 ms for response when sending packet 34
    at org.apache.activemq.artemis.core.protocol.core.impl.ChannelImpl.sendBlocking(ChannelImpl.java:550)
    at org.apache.activemq.artemis.core.protocol.core.impl.ChannelImpl.sendBlocking(ChannelImpl.java:452)
    at org.apache.activemq.artemis.core.protocol.core.impl.ActiveMQSessionContext.createQueue(ActiveMQSessionContext.java:856)
    at org.apache.activemq.artemis.core.client.impl.ClientSessionImpl.internalCreateQueue(ClientSessionImpl.java:2070)
    at org.apache.activemq.artemis.core.client.impl.ClientSessionImpl.createQueue(ClientSessionImpl.java:321)
    at org.apache.activemq.artemis.jms.client.ActiveMQSession.createTemporaryQueue(ActiveMQSession.java:1042)
    at org.apache.activemq.artemis.ra.ActiveMQRASession.createTemporaryQueue(ActiveMQRASession.java:844)
    at org.apache.activemq.artemis.jms.client.ActiveMQJMSContext.createTemporaryQueue(ActiveMQJMSContext.java:524)
    ... 30 more
Caused by: ActiveMQConnectionTimedOutException[errorType=CONNECTION_TIMEDOUT message=AMQ219014: Timed out after waiting 30000 ms for response when sending packet 34]
    ... 38 more

my scenario:

vsevel commented 1 month ago

there is still a reliability issue when I restart just one broker node in the cluster. the quarkus managed listener reconnects successfully, but the health check still fails, until I restart the other node.

zhfeng commented 1 month ago

Well, I think we need some inputs from the artemis team. It looks like there is a cli tool which can be used to check health of the brokers. like https://activemq.apache.org/components/artemis/documentation/latest/using-cli.html

Then we might do the similar operations in our health check method?

vsevel commented 1 month ago

that is a good idea. I tried with the one pool. I do get an active count greater than 0, but it stays the same even after I shut down the broker.

vsevel commented 1 month ago

that is the best I could get (and still I am not getting a reliable behavior when just one node is down):

public class JMSSanityHealthCheck implements HealthCheck {

    private static final Logger log = LoggerFactory.getLogger(JMSSanityHealthCheck.class);

    public static final String NON_EXISTENT = "non-existent";
    public static final String ERROR_MESSAGE = "There is no queue with name " + NON_EXISTENT;

    private final Set<String> connectionFactoryNames;

    private final Instance<ConnectionFactory> connectionFactories;

    private final Map<String, HealthCheckResponse.Status> status = new ConcurrentHashMap<>();

    public JMSSanityHealthCheck(IronJacamarBuildtimeConfig buildTimeConfigs,
            @Any Instance<ConnectionFactory> connectionFactories) {
        this.connectionFactoryNames = buildTimeConfigs.resourceAdapters().keySet();
        this.connectionFactories = connectionFactories;
    }

    @Scheduled(every = "60S", concurrentExecution = SKIP)
    void check() {
        for (String name : connectionFactoryNames) {
            Annotation identifier = ArtemisUtil.toIdentifier(name);
            ConnectionFactory cf = connectionFactories.select(identifier).get();
            checkBroker(name, cf);
        }
    }

    private void checkBroker(String name, ConnectionFactory connectionFactory) {
        try (JMSContext ctx = connectionFactory.createContext()) {
            ctx.createQueue(NON_EXISTENT);
        } catch (JMSRuntimeException e) {
            if (ERROR_MESSAGE.equals(e.getMessage())) {
                status.put(name, UP);
            } else {
                status.put(name, DOWN);
            }
        } catch (Exception e) {
            status.put(name, DOWN);
        }
    }

    public HealthCheckResponse call() {
        HealthCheckResponseBuilder builder = HealthCheckResponse
                .named("Artemis JMS Resource Adaptor health check")
                .up();
        status.forEach((k, v) -> {
            builder.withData(k, v.toString());
            if (v == DOWN) {
                builder.down();
            }
        });
        return builder.build();
    }
}