vapor / postgres-nio

🐘 Non-blocking, event-driven Swift client for PostgreSQL.
https://api.vapor.codes/postgresnio/documentation/postgresnio/
MIT License
321 stars 75 forks source link

ConnectionPool crash when minimum connections > 0 #479

Closed lovetodream closed 4 months ago

lovetodream commented 4 months ago

Describe the issue

The connection pool crashes after all connections are dropped (due to e.g. temporary networking issue) and the minimum amount of connections get reestablished.

Vapor version

n/a

Operating system and version

macOS 14.5

Swift version

Swift Package Manager - Swift 5.10.0-dev

Steps to reproduce

Run the following code:

var config = PostgresClient.Configuration(...)
config.options.minimumConnections = 1
config.options.maximumConnections = 10

let client = PostgresClient(configuration: config)

try await withThrowingDiscardingTaskGroup { group in
    group.addTask {  // do not cancel me!
        await client.run()
    }

    group.addTask { // run one query
        try await client.withConnection { conn in
            let rows = try await conn.query("SELECT 'count(*)'", logger: logger)
            for try await row in rows {
                print(row)
            }
        }
        print("query done")
    }
}

After query done is logged, use a tool like network conditioner on macOS to simulate 100% packet loss. Keepalive fails and new connection attempts are made, after the socket timeout logs, reenable networking, and you'll see the crash.

_ConnectionPoolModule/PoolStateMachine+ConnectionGroup.swift:194: Fatal error: There is a new connection that we didn't request!

  thread #1, queue = 'com.apple.main-thread'
    frame #0: 0x000000018563a1f4 libsystem_kernel.dylib`mach_msg2_trap + 8
    frame #1: 0x000000018564cb24 libsystem_kernel.dylib`mach_msg2_internal + 80
    frame #2: 0x0000000185642e34 libsystem_kernel.dylib`mach_msg_overwrite + 476
    frame #3: 0x000000018563a578 libsystem_kernel.dylib`mach_msg + 24
    frame #4: 0x000000018575a680 CoreFoundation`__CFRunLoopServiceMachPort + 160
    frame #5: 0x0000000185758f44 CoreFoundation`__CFRunLoopRun + 1208
    frame #6: 0x0000000185758434 CoreFoundation`CFRunLoopRunSpecific + 608
    frame #7: 0x00000001857d645c CoreFoundation`CFRunLoopRun + 64
    frame #8: 0x000000024d7ee4e0 libswift_Concurrency.dylib`swift_task_asyncMainDrainQueueImpl() + 40
    frame #9: 0x000000024d7ee4a0 libswift_Concurrency.dylib`swift_task_asyncMainDrainQueue + 92
    frame #10: 0x0000000100003c04 Issue21`main at main.swift:0
    frame #11: 0x00000001852f20e0 dyld`start + 2360
* thread #6, queue = 'com.apple.root.default-qos.cooperative', stop reason = Fatal error: There is a new connection that we didn't request!
    frame #0: 0x00000001956ed890 libswiftCore.dylib`_swift_runtime_on_report
    frame #1: 0x00000001957acb10 libswiftCore.dylib`_swift_stdlib_reportFatalErrorInFile + 208
    frame #2: 0x000000019537ed94 libswiftCore.dylib`closure #1 (Swift.UnsafeBufferPointer<Swift.UInt8>) -> () in closure #1 (Swift.UnsafeBufferPointer<Swift.UInt8>) -> () in Swift._assertionFailure(_: Swift.StaticString, _: Swift.String, file: Swift.StaticString, line: Swift.UInt, flags: Swift.UInt32) -> Swift.Never + 104
    frame #3: 0x000000019537ded8 libswiftCore.dylib`Swift._assertionFailure(_: Swift.StaticString, _: Swift.String, file: Swift.StaticString, line: Swift.UInt, flags: Swift.UInt32) -> Swift.Never + 260
  * frame #4: 0x00000001001da20c Issue21`PoolStateMachine.ConnectionGroup.newConnectionEstablished(connection=0x0000600001408380, maxStreams=1, self=_ConnectionPoolModule.PoolStateMachine<PostgresNIO.PostgresConnection, _ConnectionPoolModule.ConnectionIDGenerator, Swift.Int, _ConnectionPoolModule.ConnectionRequest<PostgresNIO.PostgresConnection>, Swift.Int, Swift.CheckedContinuation<(), Swift.Never>>.ConnectionGroup @ 0x000060000031c858) at PoolStateMachine+ConnectionGroup.swift:194:17
    frame #5: 0x0000000100225670 Issue21`PoolStateMachine.connectionEstablished(connection=0x0000600001408380, maxStreams=1, self=_ConnectionPoolModule.PoolStateMachine<PostgresNIO.PostgresConnection, _ConnectionPoolModule.ConnectionIDGenerator, Swift.Int, _ConnectionPoolModule.ConnectionRequest<PostgresNIO.PostgresConnection>, Swift.Int, Swift.CheckedContinuation<(), Swift.Never>> @ 0x000060000031c810) at PoolStateMachine.swift:266:53
    frame #6: 0x00000001001bed74 Issue21`closure #1 in ConnectionPool.connectionEstablished(state=_ConnectionPoolModule.ConnectionPool<PostgresNIO.PostgresConnection, Swift.Int, _ConnectionPoolModule.ConnectionIDGenerator, _ConnectionPoolModule.ConnectionRequest<PostgresNIO.PostgresConnection>, Swift.Int, PostgresNIO.PostgresKeepAliveBehavor, PostgresNIO.PostgresClientMetrics, Swift.ContinuousClock>.State @ 0x000060000031c810, connectionBundle=_ConnectionPoolModule.ConnectionAndMetadata<PostgresNIO.PostgresConnection> @ 0x000000016ff122b8) at ConnectionPool.swift:450:39
    frame #7: 0x00000001001c0b14 Issue21`partial apply for closure #1 in ConnectionPool.connectionEstablished(_:) at <compiler-generated>:0
    frame #8: 0x00000001001b7c6c Issue21`closure #1 in ConnectionPool.modifyStateAndRunActions(state=_ConnectionPoolModule.ConnectionPool<PostgresNIO.PostgresConnection, Swift.Int, _ConnectionPoolModule.ConnectionIDGenerator, _ConnectionPoolModule.ConnectionRequest<PostgresNIO.PostgresConnection>, Swift.Int, PostgresNIO.PostgresKeepAliveBehavor, PostgresNIO.PostgresClientMetrics, Swift.ContinuousClock>.State @ 0x000060000031c810, closure=0x1001c0a84) at ConnectionPool.swift:356:13
    frame #9: 0x00000001001b7d18 Issue21`partial apply for closure #1 in ConnectionPool.modifyStateAndRunActions(_:) at <compiler-generated>:0
    frame #10: 0x00000001001d42dc Issue21`closure #1 in LockStorage.withLockedValue<τ_0_0>(valuePtr=0x60000031c810, lockPtr=0x60000031c8c0, mutate=0x1001b7c80) at NIOLock.swift:176:24
    frame #11: 0x00000001001d4904 Issue21`partial apply for closure #1 in LockStorage.withLockedValue<τ_0_0>(_:) at <compiler-generated>:0
    frame #12: 0x00000001954b65a8 libswiftCore.dylib`Swift.ManagedBuffer.withUnsafeMutablePointers<τ_0_0>((Swift.UnsafeMutablePointer<τ_0_0>, Swift.UnsafeMutablePointer<τ_0_1>) throws -> τ_1_0) throws -> τ_1_0 + 180
    frame #13: 0x00000001001d422c Issue21`LockStorage.withLockedValue<τ_0_0>(mutate=0x1001b7c80, self=0x000060000031c800) at NIOLock.swift:173:18
    frame #14: 0x00000001001d5050 Issue21`NIOLockedValueBox.withLockedValue<τ_0_0>(mutate=0x1001b7c80, self=_ConnectionPoolModule.NIOLockedValueBox<_ConnectionPoolModule.ConnectionPool<PostgresNIO.PostgresConnection, Swift.Int, _ConnectionPoolModule.ConnectionIDGenerator, _ConnectionPoolModule.ConnectionRequest<PostgresNIO.PostgresConnection>, Swift.Int, PostgresNIO.PostgresKeepAliveBehavor, PostgresNIO.PostgresClientMetrics, Swift.ContinuousClock>.State> @ 0x000000016ff12688) at NIOLockedValueBox.swift:42:34
    frame #15: 0x00000001001b0540 Issue21`ConnectionPool.modifyStateAndRunActions(closure=0x1001c0a84, self=0x000060000001c000) at ConnectionPool.swift:355:37
    frame #16: 0x00000001001be4d0 Issue21`ConnectionPool.connectionEstablished(connectionBundle=_ConnectionPoolModule.ConnectionAndMetadata<PostgresNIO.PostgresConnection> @ 0x000000016ff12d70, self=0x000060000001c000) at ConnectionPool.swift:448:14
    frame #17: 0x00000001001bda5c Issue21`closure #1 in ConnectionPool.makeConnection<τ_0_0>(self=0x000060000001c000, request=_ConnectionPoolModule.PoolStateMachine<τ_0_0, τ_0_2, τ_0_1, τ_0_3, τ_0_4, Swift.CheckedContinuation<(), Swift.Never>>.ConnectionRequest @ 0x0000600000e1c8b0) at ConnectionPool.swift:427:22
    frame #18: 0x00000001001be138 Issue21`partial apply for closure #1 in ConnectionPool.makeConnection<τ_0_0>(for:in:) at <compiler-generated>:0
    frame #19: 0x00000001001c7c2c Issue21`thunk for @escaping @callee_guaranteed @Sendable @async () -> () at <compiler-generated>:0
    frame #20: 0x00000001001c7d7c Issue21`partial apply for thunk for @escaping @callee_guaranteed @Sendable @async () -> () at <compiler-generated>:0
  thread #7
    frame #0: 0x000000018563bea4 libsystem_kernel.dylib`__workq_kernreturn + 8
  thread #8
    frame #0: 0x0000000000000000

Outcome

The connection pool crashes due to receiving a connection it didn't request, but it should have requested the connection as it is the one that is fulfilling the minimal connection amount.

Additional notes

I've originally discovered the issue while resolving another issue in oracle-nio, therefore some credit goes to @iliasaz (https://github.com/lovetodream/oracle-nio/issues/21).

I've started investigating, it seems like PoolStateMachine.connectionClosed(_:) requests a new connection as expected, but PoolStateMachine.createNewConnection() is never called.

Here's a test case for PoolStateMachineTests to reproduce it:

func testConnectionRefillAfterClosed() {
    var configuration = PoolConfiguration()
    configuration.minimumConnectionCount = 1
    configuration.maximumConnectionSoftLimit = 10
    configuration.maximumConnectionHardLimit = 10
    configuration.keepAliveDuration = .seconds(2)
    configuration.idleTimeoutDuration = .seconds(4)

    var stateMachine = TestPoolStateMachine(
        configuration: configuration,
        generator: .init(),
        timerCancellationTokenType: MockTimerCancellationToken.self
    )

    let connection0 = MockConnection(id: 0)
    let connection1 = MockConnection(id: 1)

    stateMachine.refillConnections()
    let request0 = MockRequest()
    stateMachine.leaseConnection(request0)
    stateMachine.connectionEstablished(connection0, maxStreams: 1)

    stateMachine.releaseConnection(connection0, streams: 1)
    let connection0KeepAliveTimer0 = TestPoolStateMachine.Timer(.init(timerID: 0, connectionID: 0, usecase: .keepAlive), duration: .seconds(30))
    let connection0KeepAliveTimer0CancellationToken = MockTimerCancellationToken(connection0KeepAliveTimer0)
    stateMachine.timerScheduled(connection0KeepAliveTimer0, cancelContinuation: connection0KeepAliveTimer0CancellationToken)
    stateMachine.timerTriggered(connection0KeepAliveTimer0)

    connection0.close() // force close
    connection0.closeIfClosing()

    stateMachine.connectionKeepAliveFailed(0)
    let closed = stateMachine.connectionClosed(connection0)
    XCTAssertEqual(closed.connection, .makeConnection(.init(connectionID: 1), []))
    XCTAssertEqual(closed.request, .none)
    stateMachine.connectionEstablished(connection1, maxStreams: 1)
}

There's an existing test which can be extended to cover the crash. You can add stateMachine.connectionEstablished(.init(id: 1), maxStreams: 1) to the end of PoolStateMachineTests.testConnectionIsEstablishedAfterFailedKeepAliveIfNotEnoughConnectionsLeft() and the same crash is caused.