tediousjs / node-mssql

Microsoft SQL Server client for Node.js
https://tediousjs.github.io/node-mssql
MIT License

Erratic behaviour in connecting to on-prem Sql-server with mssql from NodeJS Lambda. Randomly issues Connection Errors. #1375

Closed: ANTGOMEZ closed this issue 2 years ago

ANTGOMEZ commented 2 years ago

Running from a Node.js v14 AWS Lambda, with mssql module v8.0.2...

The Lambda is triggered by an SQS queue: we feed rapid-fire events into the queue, each event triggers the Lambda, and the Lambda in turn calls the code below to post the event to an on-prem DB.
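
For context, the handler wiring is roughly as follows (a hypothetical sketch; the paths, the Events table, and the buildInsert helper are illustrative names, not our exact code):

const { execSQL } = require('./common/db'); // path assumed; the module shown below
const config = require('./config.json');    // connection config; location assumed

// Placeholder only: real code should use parameterised requests, not string building
const buildInsert = (body) =>
    `INSERT INTO Events (Payload) VALUES ('${body.replace(/'/g, "''")}')`;

exports.handler = async (event) => {
    // SQS can deliver several records per invocation
    for (const record of event.Records) {
        await execSQL(config, buildInsert(record.body));
    }
};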

The result is a mixed bag. Sometimes all data gets posted to the DB with no errors; other times we get the connection errors below. There is no correlation with the type of event.

2022-03-15T20:45:26.740Z e8dc17d8-cee7-5f84-b59d-4d7233c5807e ERROR Invoke Error
{
    "errorType": "ConnectionError",
    "errorMessage": "Failed to connect to xx.xx.xx.xx:1433 - Could not connect (sequence)",
    "code": "ESOCKET",
    "originalError": {
        "errorType": "Error",
        "errorMessage": "Failed to connect to xx.xx.xx.xx:1433 - Could not connect (sequence)",
        "code": "ESOCKET",
        "stack": [
            "Error: Failed to connect to xx.xx.xx.xx:1433 - Could not connect (sequence)",
            "    at Connection.socketError (/var/task/common/node_modules/tedious/lib/connection.js:1412:28)",
            "    at /var/task/common/node_modules/tedious/lib/connection.js:1172:21",
            "    at SequentialConnectionStrategy.connect (/var/task/common/node_modules/tedious/lib/connector.js:129:14)",
            "    at Socket.onError (/var/task/common/node_modules/tedious/lib/connector.js:149:12)",
            "    at Socket.emit (events.js:400:28)",
            "    at Socket.emit (domain.js:475:12)",
            "    at emitErrorNT (internal/streams/destroy.js:106:8)",
            "    at emitErrorCloseNT (internal/streams/destroy.js:74:3)",
            "    at processTicksAndRejections (internal/process/task_queues.js:82:21)"
        ]
    },
    "name": "ConnectionError",
    "stack": [
        "ConnectionError: Failed to connect to xx.xx.xx.xx:1433 - Could not connect (sequence)",
        "    at /var/task/common/node_modules/mssql/lib/tedious/connection-pool.js:70:17",
        "    at Connection.onConnect (/var/task/common/node_modules/tedious/lib/connection.js:1038:9)",
        "    at Object.onceWrapper (events.js:520:26)",
        "    at Connection.emit (events.js:400:28)",
        "    at Connection.emit (domain.js:475:12)",
        "    at Connection.emit (/var/task/common/node_modules/tedious/lib/connection.js:1066:18)",
        "    at Connection.socketError (/var/task/common/node_modules/tedious/lib/connection.js:1412:12)",
        "    at /var/task/common/node_modules/tedious/lib/connection.js:1172:21",
        "    at SequentialConnectionStrategy.connect (/var/task/common/node_modules/tedious/lib/connector.js:129:14)",
        "    at Socket.onError (/var/task/common/node_modules/tedious/lib/connector.js:149:12)"
    ]
}

Is there a problem with this code? Doesn't seem like it.

const sql = require('mssql')

let poolRequest;

function getConnection(config) {
    if (!poolRequest) {
        const pool = new sql.ConnectionPool(config);
        poolRequest = pool.connect();
    }
    return poolRequest;
}

exports.execSQL = async function (conn_cfg, cmd) {
    let data
    try {
        console.log('BEFORE CONNECT')
        let conn = await getConnection(conn_cfg)
        console.log('AFTER CONNECT')

        console.log('BEFORE QUERY EXEC')
        data = await conn.request().query(cmd)
            .then(result => {console.log('QUERY EXEC COMPLETE')})
            .catch(err => {console.log("REQUEST ERROR: " + err)})

        sql.close()
    }
    catch (err) {
        sql?.close()
        console.log(err)
        throw err
    }
}

Expected behaviour:

Consistent connections.

Actual behaviour:

The query sometimes throws a connection error: could not connect to the IP address.

Configuration:

I've set all relevant timeouts to 15 minutes (the Lambda timeout value):

{"user": "user", "password": "password", "server": "xx.xx.xx.xx", "database": "TestDB", "port": 1433,  "trustServerCertificate": true, "connectionTimeout": 900000, "requestTimeout": 900000, "pool": {"max": 10, "min": 0, "idleTimeoutMillis": 900000}}

Software versions

NodeJS: v14 (AWS Lambda runtime)
node-mssql: v8.0.2

dhensby commented 2 years ago

As a caveat, I'm not an expert with AWS Lambda; my experience is with Azure Function Apps as a serverless environment, and no doubt there are differences. The main unknown for me is whether any resources persist between Lambda invocations (they do in Azure Functions), i.e. whether a pool created the way you are doing it is available to many function executions, as it is in Azure; I'm not sure if that's the case in Lambdas.


You're calling sql.close() despite not using the global connection, so the pool you're creating won't actually be closed. However, that's a good thing, because you have no way to reconnect the pool you're caching and returning in getConnection(). You also don't want to be opening and closing the pool for each request; pools are designed to be long-running, but you will have problems if your conn_cfg changes over time.
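
To illustrate the distinction with a minimal sketch (sql.close() manages only the library's global pool; a ConnectionPool you construct yourself has its own close()):

const sql = require('mssql')

async function demo(config) {
    // a pool you construct yourself must be closed through its own handle
    const pool = new sql.ConnectionPool(config)
    await pool.connect()
    // ... run queries via pool.request() ...
    await pool.close()   // closes the pool created above
    await sql.close()    // closes only the global pool (from sql.connect()); effectively a no-op here
}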

As for what is going on, it's hard to know exactly, as you're the one with access to the infrastructure and the knowledge of how many connections are likely to be created, or Lambda instances spawned, at any given time.

You're running a pool with a max of 10 connections, so each spawned instance of the Lambda could hold up to 10 connections; for example, 50 concurrent instances each filling their pool would mean 500 simultaneous connections to the server. Depending on what config your on-prem SQL Server has, you could be hitting connection limits, etc. You'll need to consult with your database administrator to look into and debug what is going on.
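
If it helps, you can get a rough view of the live connection count from Node itself; treat this as a diagnostic sketch and let your DBA refine the query:

// Rough diagnostic (inside an async function, with `pool` connected as above).
// sys.dm_exec_connections is a standard SQL Server DMV; reading it needs VIEW SERVER STATE.
const result = await pool.request()
    .query('SELECT COUNT(*) AS connections FROM sys.dm_exec_connections')
console.log('active connections:', result.recordset[0].connections)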

In terms of the code, there are a few issues:

  1. You're caching the pool connection promise (fine in theory), but you're caching it even if it rejects, which means that if you fail to connect once because of some intermittent error, every subsequent call to getConnection() will reject (one way to avoid this is sketched after the diff below).
  2. When the pool is closed (though I note you're not closing the pool you make, but the non-existent global pool), you're not unsetting the poolRequest variable, so getConnection() will begin to return a disconnected pool, leading to rejections.
  3. You're passing a config parameter to getConnection() and then to the ConnectionPool, but you're caching the pool regardless of the config, so any changed configs will return the original pool (this may be fine, but it's worth noting).
  const sql = require('mssql')

  let poolRequest;

  function getConnection(config) {
      if (!poolRequest) {
          const pool = new sql.ConnectionPool(config);
+         // if the pool connection errors, you're actually caching that error so you'll see that
          poolRequest = pool.connect();
      }
      return poolRequest;
  }

  exports.execSQL = async function (conn_cfg, cmd) {
      let data
      try {
          console.log('BEFORE CONNECT')
-         let conn = await getConnection(conn_cfg)
+         const conn = await getConnection(conn_cfg)
          console.log('AFTER CONNECT')

          console.log('BEFORE QUERY EXEC')
-         data = await conn.request().query(cmd)
-             .then(result => {console.log('QUERY EXEC COMPLETE')})
+         await conn.request().query(cmd)
+             .then(() => {console.log('QUERY EXEC COMPLETE')})
              .catch(err => {console.log("REQUEST ERROR: " + err)})
-         sql.close()
      }
      catch (err) {
-         sql?.close()
          console.log(err)
          throw err
+     } finally {
+         // personally I would not do this, you want to keep the pool open
+         sql?.close()
+         poolRequest = null
      }
  }
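
One way to address point 1, sketched roughly: clear the cached promise when the connect attempt rejects, so a transient failure doesn't poison every later call.

const sql = require('mssql')

let poolRequest;

function getConnection(config) {
    if (!poolRequest) {
        const pool = new sql.ConnectionPool(config);
        poolRequest = pool.connect().catch((err) => {
            // un-cache the failed attempt so the next caller can retry
            poolRequest = null;
            throw err;
        });
    }
    return poolRequest;
}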
ANTGOMEZ commented 2 years ago

Thank you for your quick response. We will take these into account and post back the results.

ANTGOMEZ commented 2 years ago

I think the key was resetting poolRequest to null in the finally clause. We also set the Lambda concurrency to 100. We don't receive the connection errors any more.

Here's the code we ended up with.

const sql = require("mssql");

let poolRequest;

function getConnection(config) {
    if (!poolRequest) {
        console.log("creating new connection");
        const pool = new sql.ConnectionPool(config);
        poolRequest = pool.connect();
    }
    return poolRequest;
}

exports.execSQL = async function(conn_cfg, cmd) {
    try {
        const pool = await getConnection(conn_cfg);

        console.log("executing query");
        await pool
            .request()
            .query(cmd)
            .then(() => {
                console.log("query completed successfully");
            })
            .catch(err => {
                console.log(`query error: ${err}`);
                throw err;
            });
    } catch (err) {
        console.log(`execSQL error: ${err}`);
        throw err;
    } finally {
        poolRequest = null;
    }
};

Thanks for your help!

dhensby commented 2 years ago

You're not actually closing your pools here, so you're going to be keeping a lot of connections active until they time out or the Node process is killed.

If you're going to unset the poolRequest variable, then you also need to close the pool, but that may be a problem for any concurrent requests that are using the pool whilst another tries to close it. Pools should only be created at start-up (which you're effectively doing by caching it) and closed when the app shuts down.
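
Roughly, closing and unsetting together would look something like this (a sketch only; it still doesn't protect queries already running on the pool):

async function closeConnection() {
    if (poolRequest) {
        const pending = poolRequest;
        poolRequest = null;          // stop handing the old pool out
        const pool = await pending;  // wait for any in-flight connect
        await pool.close();          // then release its connections
    }
}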

One last note: if you're running a concurrency of 100 on the Lambda, you may want to bump your max connection pool size up too.
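
For example, raising max in the pool section of the config (the max value is illustrative, not a recommendation; the other values are from your config above):

{
    "pool": { "max": 100, "min": 0, "idleTimeoutMillis": 900000 }
}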