real-logic / aeron

Efficient reliable UDP unicast, UDP multicast, and IPC message transport
Apache License 2.0
7.37k stars 888 forks source link

ConsensusModule segfault if Aeron is closed due to a timeout #693

Closed zyulyaev closed 5 years ago

zyulyaev commented 5 years ago

If ConsensusModuleAgent throws AgentTerminationException after realizing that Aeron is closed, AgentRunner gets a segfault by incrementing an error counter in handleError method. Here is sample test:

package io.aeron.cluster;

import io.aeron.Aeron;
import io.aeron.archive.Archive;
import io.aeron.archive.ArchiveThreadingMode;
import io.aeron.archive.ArchivingMediaDriver;
import io.aeron.cluster.service.ClientSession;
import io.aeron.cluster.service.ClusteredService;
import io.aeron.cluster.service.ClusteredServiceContainer;
import io.aeron.driver.MediaDriver;
import io.aeron.driver.ThreadingMode;
import io.aeron.logbuffer.Header;
import org.agrona.CloseHelper;
import org.agrona.DirectBuffer;
import org.agrona.concurrent.NanoClock;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import java.util.concurrent.TimeUnit;

import static io.aeron.driver.status.SystemCounterDescriptor.SYSTEM_COUNTER_TYPE_ID;

public final class SegfaultTest
{
    private ArchivingMediaDriver archivingMediaDriver;
    private ConsensusModule consensusModule;
    private ClusteredServiceContainer container;
    private ControlledClock controlledClock;

    @Before
    public void before()
    {
        controlledClock = new ControlledClock();

        archivingMediaDriver = ArchivingMediaDriver.launch(
            new MediaDriver.Context()
                .threadingMode(ThreadingMode.SHARED)
                .termBufferSparseFile(true)
                .errorHandler(TestUtil.errorHandler(0))
                .dirDeleteOnStart(true)
                .spiesSimulateConnection(true),
            new Archive.Context()
                .threadingMode(ArchiveThreadingMode.SHARED)
                .deleteArchiveOnStart(true));

        Aeron aeron = Aeron.connect(new Aeron.Context()
            .nanoClock(controlledClock)
            .useConductorAgentInvoker(true)
        );
        consensusModule = ConsensusModule.launch(
            new ConsensusModule.Context()
                .aeron(aeron)
                .errorCounter(aeron.addCounter(SYSTEM_COUNTER_TYPE_ID, "Cluster errors"))
                .errorHandler(TestUtil.errorHandler(0))
                .terminationHook(TestUtil.TERMINATION_HOOK)
                .deleteDirOnStart(true));

        container = launchEchoService();
    }

    @After
    public void after()
    {
        CloseHelper.close(container);
        CloseHelper.close(consensusModule);
        CloseHelper.close(archivingMediaDriver);

        if (null != consensusModule)
        {
            consensusModule.context().deleteDirectory();
        }
        if (null != archivingMediaDriver)
        {
            archivingMediaDriver.archive().context().deleteArchiveDirectory();
            archivingMediaDriver.mediaDriver().context().deleteAeronDirectory();
        }
    }

    @Test
    public void segfaultOnTimeout() throws InterruptedException
    {
        Thread.sleep(1000);
        controlledClock.offset = TimeUnit.SECONDS.toNanos(10);
        Thread.sleep(2000);
    }

    private ClusteredServiceContainer launchEchoService()
    {
        final ClusteredService clusteredService = new StubClusteredService()
        {
            public void onSessionMessage(
                final ClientSession session,
                final long timestamp,
                final DirectBuffer buffer,
                final int offset,
                final int length,
                final Header header)
            {
                while (session.offer(buffer, offset, length) < 0)
                {
                    cluster.idle();
                }
            }
        };

        return ClusteredServiceContainer.launch(
            new ClusteredServiceContainer.Context()
                .clusteredService(clusteredService)
                .errorHandler(Throwable::printStackTrace));
    }

    private static final class ControlledClock implements NanoClock {
        private volatile long offset;

        @Override
        public long nanoTime()
        {
            return System.nanoTime() + offset;
        }
    }
}
zyulyaev commented 5 years ago

Here is the interesting part of a dump file:

#
# A fatal error has been detected by the Java Runtime Environment:
#
#  SIGSEGV (0xb) at pc=0x000000010ab8563f, pid=76894, tid=0x0000000000005903
#
# JRE version: Java(TM) SE Runtime Environment (8.0_202-b08) (build 1.8.0_202-b08)
# Java VM: Java HotSpot(TM) 64-Bit Server VM (25.202-b08 mixed mode bsd-amd64 compressed oops)
# Problematic frame:
# V  [libjvm.dylib+0x58563f]  Unsafe_GetLongVolatile+0x3e
#
# Failed to write core dump. Core dumps have been disabled. To enable core dumping, try "ulimit -c unlimited" before starting Java again
#
# If you would like to submit a bug report, please visit:
#   http://bugreport.java.com/bugreport/crash.jsp
#

---------------  T H R E A D  ---------------

Current thread (0x00007fd83aa4d800):  JavaThread "consensus-module" [_thread_in_vm, id=22787, stack(0x0000700006a00000,0x0000700006b00000)]

siginfo: si_signo: 11 (SIGSEGV), si_code: 1 (SEGV_MAPERR), si_addr: 0x000000012ef85500

Registers:
RAX=0x000000010aefcd70, RBX=0x000000012ef85500, RCX=0x000000010ae2c5e0, RDX=0x0000000000000000
RSP=0x0000700006aff650, RBP=0x0000700006aff680, RSI=0x0000000000000006, RDI=0x0000700006aff660
R8 =0x0000000000000000, R9 =0x0000000000000000, R10=0x000000011941a50c, R11=0x000000011941a4d8
R12=0x0000000000000000, R13=0x0000700006aff6f0, R14=0x0000000000000000, R15=0x00007fd83aa4d800
RIP=0x000000010ab8563f, EFLAGS=0x0000000000010246, ERR=0x0000000000000004
  TRAPNO=0x000000000000000e

Top of Stack: (sp=0x0000700006aff650)
0x0000700006aff650:   00007fd83aa4d800 00007fd83aa4d800
0x0000700006aff660:   00007fd83aa4d800 0000000114f51360
0x0000700006aff670:   0000700006aff798 00007fd83aa4d800
0x0000700006aff680:   0000700006aff6d0 000000011941a57e
0x0000700006aff690:   000000076ab089a8 0000000000000000
0x0000700006aff6a0:   00000001152f1a40 00007fd83aa4d800
0x0000700006aff6b0:   00007fd83aa4d800 0000000600000001
0x0000700006aff6c0:   0000000000000000 0000000000000000
0x0000700006aff6d0:   0000700006aff750 0000000119188dd0
0x0000700006aff6e0:   00000001155a3608 0000000119188dd0
0x0000700006aff6f0:   000000012ef85500 00000001191a5523
0x0000700006aff700:   0000000000000000 000000076ab089a8
0x0000700006aff710:   0000700006aff710 0000000114f51b33
0x0000700006aff720:   0000700006aff798 0000000114f7d5d8
0x0000700006aff730:   0000000000000000 0000000114f51b58
0x0000700006aff740:   0000700006aff6f0 0000700006aff770
0x0000700006aff750:   0000700006aff7e0 0000000119188dd0
0x0000700006aff760:   0000000000000000 0000000000000000
0x0000700006aff770:   0000000000000001 00007fd837d4c3b8
0x0000700006aff780:   000000012ef85500 000000076ce30258
0x0000700006aff790:   0000000000000000 000000076ab089a8
0x0000700006aff7a0:   0000700006aff7a0 00000001152f2454
0x0000700006aff7b0:   0000700006aff7f0 0000000115543258
0x0000700006aff7c0:   00000001156aa9f0 00000001152f2470
0x0000700006aff7d0:   0000700006aff770 0000700006aff7f0
0x0000700006aff7e0:   0000700006aff838 0000000119188dd0
0x0000700006aff7f0:   000000076ce30258 0000700006aff7f8
0x0000700006aff800:   00000001155a35d3 0000700006aff850
0x0000700006aff810:   00000001155a3740 000000012f33d828
0x0000700006aff820:   00000001155a3608 0000700006aff7f0
0x0000700006aff830:   0000700006aff860 000000076d3fd240
0x0000700006aff840:   00000001194b1224 000000076d784170 

Instructions: (pc=0x000000010ab8563f)
0x000000010ab8561f:   e0 48 8d 7d e0 be 06 00 00 00 e8 8a 54 be ff 4c
0x000000010ab8562f:   89 7d d8 4d 85 f6 75 05 45 31 f6 eb 03 4d 8b 36
0x000000010ab8563f:   49 8b 04 1e 48 89 45 d0 48 8b 5d d0 48 8d 7d d8
0x000000010ab8564f:   e8 a8 c8 be ff 48 8d 7d e0 be 06 00 00 00 ba 04 

Register to memory mapping:

RAX=0x000000010aefcd70: _ZN20SafepointSynchronize6_stateE+0 in /Library/Java/JavaVirtualMachines/jdk1.8.0_202.jdk/Contents/Home/jre/lib/server/libjvm.dylib at 0x000000010a600000
RBX=0x000000012ef85500 is an unknown value
RCX=0x000000010ae2c5e0: UseMembar+0 in /Library/Java/JavaVirtualMachines/jdk1.8.0_202.jdk/Contents/Home/jre/lib/server/libjvm.dylib at 0x000000010a600000
RDX=0x0000000000000000 is an unknown value
RSP=0x0000700006aff650 is pointing into the stack for thread: 0x00007fd83aa4d800
RBP=0x0000700006aff680 is pointing into the stack for thread: 0x00007fd83aa4d800
RSI=0x0000000000000006 is an unknown value
RDI=0x0000700006aff660 is pointing into the stack for thread: 0x00007fd83aa4d800
R8 =0x0000000000000000 is an unknown value
R9 =0x0000000000000000 is an unknown value
R10=0x000000011941a50c is at entry_point+76 in (nmethod*)0x000000011941a350
R11=0x000000011941a4d8 is at entry_point+24 in (nmethod*)0x000000011941a350
R12=0x0000000000000000 is an unknown value
R13=0x0000700006aff6f0 is pointing into the stack for thread: 0x00007fd83aa4d800
R14=0x0000000000000000 is an unknown value
R15=0x00007fd83aa4d800 is a thread

Stack: [0x0000700006a00000,0x0000700006b00000],  sp=0x0000700006aff650,  free space=1021k
Native frames: (J=compiled Java code, j=interpreted, Vv=VM code, C=native code)
V  [libjvm.dylib+0x58563f]  Unsafe_GetLongVolatile+0x3e
J 621  sun.misc.Unsafe.getLongVolatile(Ljava/lang/Object;J)J (0 bytes) @ 0x000000011941a57e [0x000000011941a4c0+0xbe]
j  sun.misc.Unsafe.getAndAddLong(Ljava/lang/Object;JJ)J+3
j  org.agrona.concurrent.status.AtomicCounter.increment()J+12
j  org.agrona.concurrent.AgentRunner.handleError(Ljava/lang/Throwable;)V+19
J 846 C2 org.agrona.concurrent.AgentRunner.doDutyCycle(Lorg/agrona/concurrent/IdleStrategy;Lorg/agrona/concurrent/Agent;)Z (40 bytes) @ 0x00000001194b1224 [0x00000001194b1000+0x224]
j  org.agrona.concurrent.AgentRunner.run()V+65
j  java.lang.Thread.run()V+11
v  ~StubRoutines::call_stub
V  [libjvm.dylib+0x2f0d1e]  JavaCalls::call_helper(JavaValue*, methodHandle*, JavaCallArguments*, Thread*)+0x6ae
V  [libjvm.dylib+0x2f14c2]  JavaCalls::call_virtual(JavaValue*, KlassHandle, Symbol*, Symbol*, JavaCallArguments*, Thread*)+0x164
V  [libjvm.dylib+0x2f166e]  JavaCalls::call_virtual(JavaValue*, Handle, KlassHandle, Symbol*, Symbol*, Thread*)+0x4a
V  [libjvm.dylib+0x34bf5d]  thread_entry(JavaThread*, Thread*)+0x7c
V  [libjvm.dylib+0x570b93]  JavaThread::thread_main_inner()+0x9b
V  [libjvm.dylib+0x57228e]  JavaThread::run()+0x1c2
V  [libjvm.dylib+0x48f30a]  java_start(Thread*)+0xf6
C  [libsystem_pthread.dylib+0x3339]  _pthread_body+0x7e
C  [libsystem_pthread.dylib+0x62a7]  _pthread_start+0x46
C  [libsystem_pthread.dylib+0x2445]  thread_start+0xd
C  0x0000000000000000

Java frames: (J=compiled Java code, j=interpreted, Vv=VM code)
J 621  sun.misc.Unsafe.getLongVolatile(Ljava/lang/Object;J)J (0 bytes) @ 0x000000011941a50c [0x000000011941a4c0+0x4c]
j  sun.misc.Unsafe.getAndAddLong(Ljava/lang/Object;JJ)J+3
j  org.agrona.concurrent.status.AtomicCounter.increment()J+12
j  org.agrona.concurrent.AgentRunner.handleError(Ljava/lang/Throwable;)V+19
J 846 C2 org.agrona.concurrent.AgentRunner.doDutyCycle(Lorg/agrona/concurrent/IdleStrategy;Lorg/agrona/concurrent/Agent;)Z (40 bytes) @ 0x00000001194b1224 [0x00000001194b1000+0x224]
j  org.agrona.concurrent.AgentRunner.run()V+65
j  java.lang.Thread.run()V+11
v  ~StubRoutines::call_stub
zyulyaev commented 5 years ago

Also I guess there is similar problem with Archive

mjpt777 commented 5 years ago

Fixed with this commit. https://github.com/real-logic/agrona/commit/c037d32ffa4ff64188b92e358033eb4fdc4dd457