aws / aws-cdk

The AWS Cloud Development Kit is a framework for defining cloud infrastructure in code
https://aws.amazon.com/cdk
Apache License 2.0
11.52k stars 3.86k forks source link

@aws-cdk/aws-redshift-alpha: lambda timeout on create table with many columns #31329

Open matteo-pirola opened 1 week ago

matteo-pirola commented 1 week ago

Describe the bug

The problem arises while trying to create a redshift table with many columns (in the order of the hundreds) using the Table construct.

During the deployment of the resource (by the means of a CodeBuild process issuing cdk deploy -c env=test), I obtain the following error during the cloudformation CREATE action:

DpSlsMarketFrForecastAndMonitoringRedshiftTables | 3/6 | 7:40:50 AM | CREATE_FAILED        | Custom::RedshiftDatabaseQuery | DpSlsMarketFrForecastMonitoringTable_example_table/Resource/Resource/Default (DpSlsMarketFrForecastMonitoringTableexampletable0E5B78B0) Received response status [FAILED] from custom resource. Message returned: Error: 2024-08-12T07:40:48.077Z df13b1ab-f383-46d6-8d17-079f0f966428 Task timed out after 60.07 seconds
Logs: /aws/lambda/DpSlsMarketFrForecastAndM-QueryRedshiftDatabase3de-k79HUXaOlRym
    at invokeUserFunction (/var/task/framework.js:2:6)
    at process.processTicksAndRejections (node:internal/process/task_queues:95:5)
    at async onEvent (/var/task/framework.js:1:369)
    at async Runtime.handler (/var/task/cfn-response.js:1:1826) (RequestId: ced8d7fe-26fa-4189-a931-b92c26d08b66)

The timeout error is related to a lambda generated automatically by the framework. Such timeout is by default set to 1 minute and I did not find a way to change it programmatically at runtime. Here's the code of the lambda in framework.js which raises the Exception:

"use strict";const cfnResponse=require("./cfn-response"),consts=require("./consts"),outbound_1=require("./outbound"),util_1=require("./util");async function onEvent(cfnRequest){const sanitizedRequest={...cfnRequest,ResponseURL:"..."};(0,util_1.log)("onEventHandler",sanitizedRequest),cfnRequest.ResourceProperties=cfnRequest.ResourceProperties||{};const onEventResult=await invokeUserFunction(consts.USER_ON_EVENT_FUNCTION_ARN_ENV,sanitizedRequest,cfnRequest.ResponseURL);onEventResult?.NoEcho?(0,util_1.log)("redacted onEvent returned:",cfnResponse.redactDataFromPayload(onEventResult)):(0,util_1.log)("onEvent returned:",onEventResult);const resourceEvent=createResponseEvent(cfnRequest,onEventResult);if(onEventResult?.NoEcho?(0,util_1.log)("readacted event:",cfnResponse.redactDataFromPayload(resourceEvent)):(0,util_1.log)("event:",resourceEvent),!process.env[consts.USER_IS_COMPLETE_FUNCTION_ARN_ENV])return cfnResponse.submitResponse("SUCCESS",resourceEvent,{noEcho:resourceEvent.NoEcho});const waiter={stateMachineArn:(0,util_1.getEnv)(consts.WAITER_STATE_MACHINE_ARN_ENV),name:resourceEvent.RequestId,input:JSON.stringify(resourceEvent)};(0,util_1.log)("starting waiter",{stateMachineArn:(0,util_1.getEnv)(consts.WAITER_STATE_MACHINE_ARN_ENV),name:resourceEvent.RequestId}),await(0,outbound_1.startExecution)(waiter)}async function isComplete(event){const sanitizedRequest={...event,ResponseURL:"..."};event?.NoEcho?(0,util_1.log)("redacted isComplete request",cfnResponse.redactDataFromPayload(sanitizedRequest)):(0,util_1.log)("isComplete",sanitizedRequest);const isCompleteResult=await invokeUserFunction(consts.USER_IS_COMPLETE_FUNCTION_ARN_ENV,sanitizedRequest,event.ResponseURL);if(event?.NoEcho?(0,util_1.log)("redacted user isComplete returned:",cfnResponse.redactDataFromPayload(isCompleteResult)):(0,util_1.log)("user isComplete returned:",isCompleteResult),!isCompleteResult.IsComplete)throw isCompleteResult.Data&&Object.keys(isCompleteResult.Data).length>0?new Error('"Data" is not allowed if "IsComplete" is "False"'):new cfnResponse.Retry(JSON.stringify(event));const response={...event,...isCompleteResult,Data:{...event.Data,...isCompleteResult.Data}};await cfnResponse.submitResponse("SUCCESS",response,{noEcho:event.NoEcho})}async function onTimeout(timeoutEvent){(0,util_1.log)("timeoutHandler",timeoutEvent);const isCompleteRequest=JSON.parse(JSON.parse(timeoutEvent.Cause).errorMessage);await cfnResponse.submitResponse("FAILED",isCompleteRequest,{reason:"Operation timed out"})}async function invokeUserFunction(functionArnEnv,sanitizedPayload,responseUrl){const functionArn=(0,util_1.getEnv)(functionArnEnv);(0,util_1.log)(`executing user function ${functionArn} with payload`,sanitizedPayload);const resp=await(0,outbound_1.invokeFunction)({FunctionName:functionArn,Payload:JSON.stringify({...sanitizedPayload,ResponseURL:responseUrl})});(0,util_1.log)("user function response:",resp,typeof resp);const jsonPayload=(0,util_1.parseJsonPayload)(resp.Payload);if(resp.FunctionError){(0,util_1.log)("user function threw an error:",resp.FunctionError);const errorMessage=jsonPayload.errorMessage||"error",arn=functionArn.split(":"),functionName=arn[arn.length-1],message=[errorMessage,"",`Logs: /aws/lambda/${functionName}`,""].join(`
`),e=new Error(message);throw jsonPayload.trace&&(e.stack=[message,...jsonPayload.trace.slice(1)].join(`
`)),e}return jsonPayload}function createResponseEvent(cfnRequest,onEventResult){onEventResult=onEventResult||{};const physicalResourceId=onEventResult.PhysicalResourceId||defaultPhysicalResourceId(cfnRequest);if(cfnRequest.RequestType==="Delete"&&physicalResourceId!==cfnRequest.PhysicalResourceId)throw new Error(`DELETE: cannot change the physical resource ID from "${cfnRequest.PhysicalResourceId}" to "${onEventResult.PhysicalResourceId}" during deletion`);return cfnRequest.RequestType==="Update"&&physicalResourceId!==cfnRequest.PhysicalResourceId&&(0,util_1.log)(`UPDATE: changing physical resource ID from "${cfnRequest.PhysicalResourceId}" to "${onEventResult.PhysicalResourceId}"`),{...cfnRequest,...onEventResult,PhysicalResourceId:physicalResourceId}}function defaultPhysicalResourceId(req){switch(req.RequestType){case"Create":return req.RequestId;case"Update":case"Delete":return req.PhysicalResourceId;default:throw new Error(`Invalid "RequestType" in request "${JSON.stringify(req)}"`)}}module.exports={[consts.FRAMEWORK_ON_EVENT_HANDLER_NAME]:cfnResponse.safeHandler(onEvent),[consts.FRAMEWORK_IS_COMPLETE_HANDLER_NAME]:cfnResponse.safeHandler(isComplete),[consts.FRAMEWORK_ON_TIMEOUT_HANDLER_NAME]:onTimeout};

Moreover, the table creation in redshfit is completed correctly (it looks like the CREATE TABLE instruction is issued and not aborted when the lambda fails), leaving the table untied to the cloudformation stack.

Here's an example of the code I am using:

import aws_cdk as cdk
from aws_cdk import Stack
from aws_cdk.aws_redshift_alpha import Cluster, Column, Table, TableDistStyle, TableSortStyle
from constructs import Construct

class RedshiftCdkStack(Stack):

    def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
        ## ...
        ## some stuff here and there
        ## ...

        cluster = Cluster.from_cluster_attributes(
            self,
            "ImportedCluster",
            cluster_name=env_config["clusterName"],
            cluster_endpoint_address=env_config["clusterEndpointAddress"],
            cluster_endpoint_port=env_config["clusterPort"],
        )
        table_columns = [
            Column(
                name="column_1",
                data_type="integer",
                dist_key=False,
                comment="My first column"
            )
        ]

        dist_style = TableDistStyle["KEY"]
        removal_policy = cdk.RemovalPolicy["DESTROY"]
        sort_style = TableSortStyle["AUTO"]

        Table(
            self,
            "MyTableName",
            cluster=cluster,
            database_name="my_db",
            admin_user=my_secret,
            table_columns=table_columns,
            dist_style=dist_style,
            removal_policy=removal_policy,
            sort_style=sort_style,
            table_name="my_schema.my_table",
            table_comment="My first table"
        )

Regression Issue

Last Known Working CDK Version

No response

Expected Behavior

I expect the table to be created successfully in redshift and the table to be part of the cloudformation stack.

Current Behavior

The table is created successfully in redshift but the table is not part of the cloudformation stack.

Reproduction Steps

import aws_cdk as cdk
from aws_cdk import Stack
from aws_cdk.aws_redshift_alpha import Cluster, Column, Table, TableDistStyle, TableSortStyle
from constructs import Construct

class RedshiftCdkStack(Stack):

    def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:

        cluster = Cluster.from_cluster_attributes(
            self,
            "ImportedCluster",
            cluster_name=env_config["clusterName"],
            cluster_endpoint_address=env_config["clusterEndpointAddress"],
            cluster_endpoint_port=env_config["clusterPort"],
        )
        table_columns = [
            Column(
                name=f"column_{i}",
                data_type="integer",
                dist_key=False,
                comment=f"My #{i} column"
            )
            for i in range(1000)
        ]

        dist_style = TableDistStyle["KEY"]
        removal_policy = cdk.RemovalPolicy["DESTROY"]
        sort_style = TableSortStyle["AUTO"]

        Table(
            self,
            "MyTableName",
            cluster=cluster,
            database_name="my_db",
            admin_user=my_secret,
            table_columns=table_columns,
            dist_style=dist_style,
            removal_policy=removal_policy,
            sort_style=sort_style,
            table_name="my_schema.my_table",
            table_comment="My first table"
        )

########
# main #
########

app = cdk.App()
RedshiftCdkStack(app, "MyRedshiftTablesStack",
    env=cdk.Environment(account=os.getenv('AWS_DEFAULT_ACCOUNT'), region='eu-west-1'),
    )
app.synth()

Possible Solution

I think it is sufficient to increase the timeout for the lambda function

Additional Information/Context

No response

CDK CLI Version

2.150.0

Framework Version

No response

Node.js Version

18

OS

Ubuntu

Language

Python

Language Version

No response

Other information

I am using an EC2 instance with the following runtime provided in codebuild

aws/codebuild/standard:7.0 (Ubuntu standard:7.0)

ashishdhingra commented 2 days ago

Reproducible using below code (TypeScript):

import * as cdk from 'aws-cdk-lib';
import { Construct } from 'constructs';
import * as redshift from '@aws-cdk/aws-redshift-alpha';
import * as ec2 from 'aws-cdk-lib/aws-ec2';

export class RedshiftCdkStack extends cdk.Stack {
  constructor(
    scope: Construct,
    id: string,
    props?: cdk.StackProps
  ) {
    super(scope, id, props);

    const vpc = new ec2.Vpc(this, 'RedshiftVpc');

    const cluster = new redshift.Cluster(this, 'RedshiftCluster', {
      vpc: vpc,
      masterUser:  { 
        masterUsername: 'admin'
      },
      removalPolicy: cdk.RemovalPolicy.DESTROY,
      defaultDatabaseName: 'redshiftdb'
    });

    cluster.connections.allowDefaultPortFromAnyIpv4('Open to the world');

    new cdk.CfnOutput(this, 'RedShiftClusterEndpoint', {
      exportName: 'RedShiftClusterEndpoint',
      value: cluster.clusterEndpoint.socketAddress
    });

    const user = new redshift.User(this, 'RedshiftUser', {
      username: 'testredshiftuser',
      cluster: cluster,
      databaseName: 'redshiftdb',
    });

    const tableColumns: redshift.Column[] = [];
    tableColumns.push(
      {
        name: 'column_0',
        dataType: 'integer',
        distKey: true,
        comment: 'My #0 column'
      }
    );

    for (var i = 1; i<= 1000; i++) {
      tableColumns.push({
        name: `column_${i}`,
        dataType: 'integer',
        distKey: false,
        comment: `My #${i} column`
      });
    }

    const table = new redshift.Table(this, 'RedshiftTable', {
      cluster: cluster,
      databaseName: 'redshiftdb',
      tableColumns: tableColumns,
      distStyle: redshift.TableDistStyle.KEY,
      removalPolicy: cdk.RemovalPolicy.DESTROY,
      sortStyle: redshift.TableSortStyle.AUTO,
      tableName: 'my_redshift_table',
      tableComment: 'My first table'
    });

    table.grant(user, redshift.TableAction.ALL);
  }
}

Gives the below error during deployment:

11:10:41 AM | CREATE_FAILED        | Custom::RedshiftDatabaseQuery               | RedshiftTable3DF2B3B1
Received response status [FAILED] from custom resource. Message returned: Error: RequestId: 1d0b3fd7-f29f-4b8b-905b-946d33155884 Error: Task timed out after 60.00 seconds

Logs: /aws/lambda/RedshiftCdkStack-QueryRedshiftDatabase3de5bea727da-7EJlN1dOMyCq

at invokeUserFunction (/var/task/framework.js:2:6)
at process.processTicksAndRejections (node:internal/process/task_queues:95:5)
at async onEvent (/var/task/framework.js:1:369)
at async Runtime.handler (/var/task/cfn-response.js:1:1826) (RequestId: fbd6ca02-8a10-4f80-ae16-706f0241125a)

 ❌  RedshiftCdkStack failed: Error: The stack named RedshiftCdkStack failed creation, it may need to be manually deleted from the AWS console: ROLLBACK_COMPLETE: Received response status [FAILED] from custom resource. Message returned: Error: RequestId: 1d0b3fd7-f29f-4b8b-905b-946d33155884 Error: Task timed out after 60.00 seconds

Looking at the synthesized template, the below custom resource has timeout set to 60 (whereas other custom resources have timout of 900):

...
"QueryRedshiftDatabase3de5bea727da479686625efb56431b5f3DF81997": {
   "Type": "AWS::Lambda::Function",
   "Properties": {
    "Code": {
     "S3Bucket": "cdk-hnb659fds-assets-139480602983-us-east-2",
     "S3Key": "6bdd909f81c84ffe7d00cf4d6a2dbac8606429bcc05b0db3da842c1941a532f2.zip"
    },
    "Handler": "index.handler",
    "Role": {
     "Fn::GetAtt": [
      "QueryRedshiftDatabase3de5bea727da479686625efb56431b5fServiceRole0A90D717",
      "Arn"
     ]
    },
    "Runtime": "nodejs20.x",
    "Timeout": 60
   },
   "DependsOn": [
    "QueryRedshiftDatabase3de5bea727da479686625efb56431b5fServiceRoleDefaultPolicyDDD1388D",
    "QueryRedshiftDatabase3de5bea727da479686625efb56431b5fServiceRole0A90D717"
   ],
   "Metadata": {
    "aws:cdk:path": "RedshiftCdkStack/Query Redshift Database3de5bea727da479686625efb56431b5f/Resource",
    "aws:asset:path": "asset.6bdd909f81c84ffe7d00cf4d6a2dbac8606429bcc05b0db3da842c1941a532f2",
    "aws:asset:is-bundled": false,
    "aws:asset:property": "Code"
   }
  },
...

Looks like the SingletonFunction used here in DatabaseQuery has hardcoded timeout to 1 minute.

Perhaps the fix could be to expose handlerTimout property in DatabaseQueryHandlerProps which corresponds to Lambda function timeout property and use it's value to customize timeout. Unsure if this should be exposed at table level since end user should not be aware/concerned of any custom resource handlers used behind the scenes.