eclipse-vertx / vert.x

Vert.x is a tool-kit for building reactive applications on the JVM
http://vertx.io
Other
14.32k stars 2.08k forks source link

[DNS] Issues with GraalVM and Alpine Linux #3121

Open smoell opened 5 years ago

smoell commented 5 years ago

Hi all,

I have a DNS issue with Vert.x in combination with GraalVM and Alpine Linux (works with Debian-9-slim)

The project branch can be found here:

https://github.com/aws-samples/reactive-refarch-cloudformation/tree/alpine, I get the following error message:

Failed to mark a promise as success because it has failed already: DefaultChannelPromise@fec987d(failure: io.netty.handler.codec.EncoderException: io.netty.util.IllegalReferenceCountException: refCnt: 0, decrement: 1), unnotified cause: io.netty.handler.codec.EncoderException: io.netty.util.IllegalReferenceCountException: refCnt: 0, decrement: 1 at io.netty.handler.codec.MessageToMessageEncoder.write(MessageToMessageEncoder.java:107) at io.netty.channel.AbstractChannelHandlerContext.invokeWrite0(AbstractChannelHandlerContext.java:716) at io.netty.channel.AbstractChannelHandlerContext.invokeWrite(AbstractChannelHandlerContext.java:708) at io.netty.channel.AbstractChannelHandlerContext.write(AbstractChannelHandlerContext.java:791) at io.netty.channel.AbstractChannelHandlerContext.write(AbstractChannelHandlerContext.java:701) at io.netty.channel.DefaultChannelPipeline.write(DefaultChannelPipeline.java:1026) at io.netty.channel.AbstractChannel.write(AbstractChannel.java:288) at io.netty.resolver.dns.DnsQueryContext.writeQuery(DnsQueryContext.java:147) at io.netty.resolver.dns.DnsQueryContext.sendQuery(DnsQueryContext.java:125) at io.netty.resolver.dns.DnsQueryContext.query(DnsQueryContext.java:120) at io.netty.resolver.dns.DnsNameResolver.query0(DnsNameResolver.java:1152) at io.netty.resolver.dns.DnsResolveContext.query(DnsResolveContext.java:374) at io.netty.resolver.dns.DnsResolveContext.query(DnsResolveContext.java:999) at io.netty.resolver.dns.DnsResolveContext.internalResolve(DnsResolveContext.java:296) at io.netty.resolver.dns.DnsResolveContext.doSearchDomainQuery(DnsResolveContext.java:267) at io.netty.resolver.dns.DnsAddressResolveContext.doSearchDomainQuery(DnsAddressResolveContext.java:93) at io.netty.resolver.dns.DnsResolveContext.resolve(DnsResolveContext.java:234) at io.netty.resolver.dns.DnsNameResolver.doResolveAllUncached0(DnsNameResolver.java:1038) at io.netty.resolver.dns.DnsNameResolver.doResolveAllUncached(DnsNameResolver.java:1016) at 
io.netty.resolver.dns.DnsNameResolver.doResolveUncached(DnsNameResolver.java:920) at io.netty.resolver.dns.DnsNameResolver.doResolve(DnsNameResolver.java:864) at io.netty.resolver.dns.DnsNameResolver.doResolve(DnsNameResolver.java:707) at io.netty.resolver.SimpleNameResolver.resolve(SimpleNameResolver.java:63) at io.netty.resolver.dns.InflightNameResolver.resolve(InflightNameResolver.java:100) at io.netty.resolver.dns.InflightNameResolver.resolve(InflightNameResolver.java:66) at io.netty.resolver.dns.InflightNameResolver.resolve(InflightNameResolver.java:51) at io.netty.resolver.InetSocketAddressResolver.doResolve(InetSocketAddressResolver.java:57) at io.netty.resolver.InetSocketAddressResolver.doResolve(InetSocketAddressResolver.java:32) at io.netty.resolver.AbstractAddressResolver.resolve(AbstractAddressResolver.java:108) at io.netty.bootstrap.Bootstrap.doResolveAndConnect0(Bootstrap.java:204) at io.netty.bootstrap.Bootstrap.doResolveAndConnect(Bootstrap.java:166) at io.netty.bootstrap.Bootstrap.connect(Bootstrap.java:143) at io.vertx.core.net.impl.ChannelProvider.handleConnect(ChannelProvider.java:134) at io.vertx.core.net.impl.ChannelProvider.connect(ChannelProvider.java:87) at io.vertx.core.net.impl.NetClientImpl.doConnect(NetClientImpl.java:212) at io.vertx.core.net.impl.NetClientImpl.doConnect(NetClientImpl.java:168) at io.vertx.core.net.impl.NetClientImpl.connect(NetClientImpl.java:163) at io.vertx.redis.client.impl.RedisClient.connect(RedisClient.java:114) at io.vertx.redis.impl.RedisClientImpl.send(RedisClientImpl.java:123) at io.vertx.redis.impl.RedisClientImpl.sendJsonArray(RedisClientImpl.java:203) at io.vertx.redis.impl.RedisClientImpl.subscribe(RedisClientImpl.java:1410) at com.amazon.verticles.RedisVerticle.registerToEventBusForPubSub(RedisVerticle.java:108) at com.amazon.verticles.RedisVerticle.start(RedisVerticle.java:139) at io.vertx.core.AbstractVerticle.start(AbstractVerticle.java:106) at io.vertx.core.Verticle.start(Verticle.java:66) at 
io.vertx.core.impl.DeploymentManager.lambda$doDeploy$8(DeploymentManager.java:556) at io.vertx.core.impl.ContextImpl.executeTask(ContextImpl.java:369) at io.vertx.core.impl.EventLoopContext.lambda$executeAsync$0(EventLoopContext.java:38) at io.netty.util.concurrent.AbstractEventExecutor.safeExecute(AbstractEventExecutor.java:163) at io.netty.util.concurrent.SingleThreadEventExecutor.runAllTasks(SingleThreadEventExecutor.java:416) at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:515) at io.netty.util.concurrent.SingleThreadEventExecutor$5.run(SingleThreadEventExecutor.java:918) at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30) at java.lang.Thread.run(Thread.java:748) at com.oracle.svm.core.thread.JavaThreads.threadStartRoutine(JavaThreads.java:460) at com.oracle.svm.core.posix.thread.PosixJavaThreads.pthreadStartRoutine(PosixJavaThreads.java:193) Caused by: io.netty.util.IllegalReferenceCountException: refCnt: 0, decrement: 1 at io.netty.util.internal.ReferenceCountUpdater.toLiveRealRefCnt(ReferenceCountUpdater.java:74) at io.netty.util.internal.ReferenceCountUpdater.release(ReferenceCountUpdater.java:138) at io.netty.util.AbstractReferenceCounted.release(AbstractReferenceCounted.java:76) at io.netty.util.ReferenceCountUtil.release(ReferenceCountUtil.java:88) at io.netty.handler.codec.MessageToMessageEncoder.write(MessageToMessageEncoder.java:91) ... 56 more io.netty.resolver.dns.DnsResolveContext$SearchDomainUnknownHostException: Search domain query failed. Original hostname: 'rer9zy24vuxypvw.6issos.ng.0001.use1.cache.amazonaws.com' failed to resolve 'rer9zy24vuxypvw.6issos.ng.0001.use1.cache.amazonaws.com'

vietj commented 5 years ago

I think when running in GraalVM we should fall back on the JVM's resolver.

@pmlopes do you know how we can detect running in a native image ?

pmlopes commented 5 years ago

We can, but with the latest graal 19.2.0.1 and vertx 3.8.1 the async resolver works (at least if I run it on my fedora box) however under Alpine the async resolver fails.

I believe this is an Alpine-specific issue. @vietj do you have tips / hints on where to debug?

pmlopes commented 5 years ago

Also, there are a few system properties that tell you if you're on a native image (either runtime or build). I need to dig them up...

pmlopes commented 5 years ago

org.graalvm.nativeimage.imagecode.runtime

https://github.com/oracle/graal/blob/master/sdk/src/org.graalvm.nativeimage/src/org/graalvm/nativeimage/ImageInfo.java

vietj commented 5 years ago

it might be a bug in Netty ?

vietj commented 5 years ago

I think we need to determine if that's a native specific issue.

have you tried to run the same without native ?

On 30 Sep 2019, at 21:04, Paulo Lopes notifications@github.com wrote:

org.graalvm.nativeimage.imagecode.runtime

https://github.com/oracle/graal/blob/master/sdk/src/org.graalvm.nativeimage/src/org/graalvm/nativeimage/ImageInfo.java https://github.com/oracle/graal/blob/master/sdk/src/org.graalvm.nativeimage/src/org/graalvm/nativeimage/ImageInfo.java — You are receiving this because you were mentioned. Reply to this email directly, view it on GitHub https://github.com/eclipse-vertx/vert.x/issues/3121?email_source=notifications&email_token=AABXDCW3LFRIYOU3Z2BSSPDQMJETXA5CNFSM4I3Z3GQ2YY3PNVWWK3TUL52HS4DFVREXG43VMVBW63LNMVXHJKTDN5WW2ZLOORPWSZGOD76XTEA#issuecomment-536705424, or mute the thread https://github.com/notifications/unsubscribe-auth/AABXDCXBHJYZCODM5I4K7FLQMJETXANCNFSM4I3Z3GQQ.

smoell commented 5 years ago

You mean just GraalVM without building a static image? Not yet, but I can try.

pmlopes commented 5 years ago

Yes, but the trick will be running it on Alpine. Alpine is not a glibc runtime and jdk requires it. The Alpine images usually hack glibc into Alpine and that might hide the issue...

smoell commented 5 years ago

Hm ok, how should we proceed here?

pmlopes commented 5 years ago

I think we can try to isolate the issue to the OS. Perhaps build a test application both as jar and native image.

Then run on a glibc environment, say debian-slim container. If both work then the issue is alpine specific.

Then run on an Alpine that has glibc included, say the openjdk:alpine images. If both applications work, then the issue is glibc-specific, I guess...

smoell commented 5 years ago

debian-slim works like a charm, I'll try the openjdk:alpine Docker image

pmlopes commented 5 years ago

That's good news for us, it shows my machine isn't biased 😅 and a glibc system seems to work.

Now the question is, is it glibc specific or a difference due to Alpine ?

If the same app binary and jar work on the openjdk Alpine image, then we need to see what changes the image has compared to the plain Alpine.

smoell commented 5 years ago

We're one step closer ... target base image is now openjdk:8-alpine, the Uber-JAR works perfectly fine, however, not the native image.

pmlopes commented 5 years ago

So it's definitely the combo graal + alpine

pmlopes commented 5 years ago

BTW are you building a full static image or a regular one? --static flag?

smoell commented 5 years ago

I use the --static flag in native-image.properties

pmlopes commented 5 years ago

one last test :) what if you use the openjdk:8-alpine image with an image built without --static? Just trying to isolate the problem...

smoell commented 5 years ago

In this case, the binary won't start at all: "standard_init_linux.go:190: exec user process caused "no such file or directory""

Trumeet commented 5 years ago

I had the same issue when using Docker frolvlad/alpine-glibc + GraalVM Native Image (No fallback) + Vert.x, and I'm resolving another Docker container (name: wordpress) in the same network, but stacktrace or exception is not the same:

pls_1        | java.net.UnknownHostException: failed to resolve 'wordpress' after 2 queries 
pls_1        |  at io.netty.resolver.dns.DnsResolveContext.finishResolve(DnsResolveContext.java:925)
pls_1        |  at io.netty.resolver.dns.DnsResolveContext.tryToFinishResolve(DnsResolveContext.java:884)
pls_1        |  at io.netty.resolver.dns.DnsResolveContext.query(DnsResolveContext.java:356)
pls_1        |  at io.netty.resolver.dns.DnsResolveContext.onResponse(DnsResolveContext.java:543)
pls_1        |  at io.netty.resolver.dns.DnsResolveContext.access$400(DnsResolveContext.java:64)
pls_1        |  at io.netty.resolver.dns.DnsResolveContext$2.operationComplete(DnsResolveContext.java:400)
pls_1        |  at io.netty.util.concurrent.DefaultPromise.notifyListener0(DefaultPromise.java:577)
pls_1        |  at io.netty.util.concurrent.DefaultPromise.notifyListeners0(DefaultPromise.java:570)
pls_1        |  at io.netty.util.concurrent.DefaultPromise.notifyListenersNow(DefaultPromise.java:549)
pls_1        |  at io.netty.util.concurrent.DefaultPromise.notifyListeners(DefaultPromise.java:490)
pls_1        |  at io.netty.util.concurrent.DefaultPromise.setValue0(DefaultPromise.java:615)
pls_1        |  at io.netty.util.concurrent.DefaultPromise.setSuccess0(DefaultPromise.java:604)
pls_1        |  at io.netty.util.concurrent.DefaultPromise.trySuccess(DefaultPromise.java:104)
pls_1        |  at io.netty.resolver.dns.DnsQueryContext.setSuccess(DnsQueryContext.java:204)
pls_1        |  at io.netty.resolver.dns.DnsQueryContext.finish(DnsQueryContext.java:196)
pls_1        |  at io.netty.resolver.dns.DnsNameResolver$DnsResponseHandler.channelRead(DnsNameResolver.java:1320)
pls_1        |  at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:374)
pls_1        |  at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:360)
pls_1        |  at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:352)
pls_1        |  at io.netty.handler.codec.MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:102)
pls_1        |  at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:374)
pls_1        |  at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:360)
pls_1        |  at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:352)
pls_1        |  at io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1422)
pls_1        |  at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:374)
pls_1        |  at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:360)
pls_1        |  at io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:931)
pls_1        |  at io.netty.channel.nio.AbstractNioMessageChannel$NioMessageUnsafe.read(AbstractNioMessageChannel.java:93)
pls_1        |  at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:700)
pls_1        |  at io.netty.channel.nio.NioEventLoop.processSelectedKeysPlain(NioEventLoop.java:600)
pls_1        |  at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:554)
pls_1        |  at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:514)
pls_1        |  at io.netty.util.concurrent.SingleThreadEventExecutor$6.run(SingleThreadEventExecutor.java:1044)
pls_1        |  at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
pls_1        |  at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
pls_1        |  at java.lang.Thread.run(Thread.java:748)
pls_1        |  at com.oracle.svm.core.thread.JavaThreads.threadStartRoutine(JavaThreads.java:460)
pls_1        |  at com.oracle.svm.core.posix.thread.PosixJavaThreads.pthreadStartRoutine(PosixJavaThreads.java:193)

Using ping to resolve it works well. I have no idea which part goes wrong.

UPDATE: It seems that resolving public domains works.