aws-samples / aws-batch-runtime-monitoring

Serverless application to monitor an AWS Batch architecture through dashboards.
MIT No Attribution
56 stars 16 forks source link

Job placement dashboard no longer shows data for executed jobs #20

Closed devendra-d-chavan closed 1 year ago

devendra-d-chavan commented 1 year ago

Problem

A recent change to ECS RunTask CloudTrail events breaks the ability to parse AWS Batch job metadata (such as job ID, compute environment name, etc.) from the environment variables in the event, because those environment variables are now redacted.

Specifically, the ECS RunTask state machine relies on parsing the following metadata from the ECS RunTask CloudTrail event:

| Metadata | ContainerOverride environment variable |
| --- | --- |
| JobId | AWS_BATCH_JOB_ID |
| CEName | AWS_BATCH_CE_NAME |
| JQName | AWS_BATCH_JQ_NAME |
| JobAttempt | AWS_BATCH_JOB_ATTEMPT |

Redaction of the environment variables breaks this state machine, resulting in an error when the incoming CloudTrail event is processed by the step function.

Sample ECS RunTask event shows the redacted environment variables

{
    "eventVersion": "1.08",
    "userIdentity": {
        "type": "AssumedRole",
        ...
        "invokedBy": "batch.amazonaws.com"
    },
    "eventTime": "2023-06-20T04:34:38Z",
    "eventSource": "ecs.amazonaws.com",
    "eventName": "RunTask",
    "awsRegion": "us-east-1",
    "sourceIPAddress": "batch.amazonaws.com",
    "userAgent": "batch.amazonaws.com",
    "requestParameters": {
        ...
        "overrides": {
            "containerOverrides": [
                {
                    "name": "default",
                    "environment": "HIDDEN_DUE_TO_SECURITY_REASONS",
                    "cpu": 1024,
                    "memory": 128,
                    "resourceRequirements": []
                }
            ]
        },
        "count": 1,
        "launchType": "EC2",
        "tags": [
            {
                "key": "aws:batch:compute-environment",
                "value": "..."
            },
            {
                "key": "aws:batch:job-definition",
                "value": "..."
            },
            {
                "key": "aws:batch:job-queue",
                "value": "..."
            }
        ],
        "cluster": "...",
        "enableExecuteCommand": false,
        "taskDefinition": "...",
        ...
    },
    "responseElements": {
        "failures": [],
        "tasks": [
            {
                "attachments": [],
                "attributes": [
                    {
                        "name": "ecs.cpu-architecture",
                        "value": "x86_64"
                    }
                ],
                "availabilityZone": "us-east-1f",
                "clusterArn": "...",
                "containerInstanceArn": "...",
                "containers": [
                    {
                        "containerArn": "...",
                        "taskArn": "...",
                        "name": "default",
                        "image": "...",
                        "lastStatus": "PENDING",
                        "networkInterfaces": [],
                        "cpu": "0",
                        "memory": "1"
                    }
                ],
                "cpu": "1024",
                "createdAt": "Jun 20, 2023, 4:34:38 AM",
                "desiredStatus": "RUNNING",
                "enableExecuteCommand": false,
                "group": "family:...",
                "lastStatus": "PENDING",
                "launchType": "EC2",
                "memory": "128",
                "overrides": {
                    "containerOverrides": [
                        {
                            "name": "default",
                            "environment": "HIDDEN_DUE_TO_SECURITY_REASONS",
                            "cpu": 1024,
                            "memory": 128,
                            "resourceRequirements": []
                        }
                    ],
                    "inferenceAcceleratorOverrides": []
                },
                "tags": [
                    {
                        "key": "aws:batch:job-queue",
                        "value": "..."
                    },
                    {
                        "key": "aws:batch:compute-environment",
                        "value": "..."
                    },
                    {
                        "key": "aws:batch:job-definition",
                        "value": "..."
                    }
                ],
                "taskArn": "...",
                "taskDefinitionArn": "...",
                "version": 1
            }
        ]
    },
    "requestID": "...",
    "eventID": "...",
    "readOnly": false,
    "eventType": "AwsApiCall",
    "managementEvent": true,
    "recipientAccountId": "...",
    "eventCategory": "Management"
}

This results in the following failure in the "Select common fields" step of the ECS RunTask state machine, impairing its ability to post the metrics used in the job placement dashboard:

{
  "cause": "An error occurred while executing the state 'Select common fields' (entered at the event id #4). The JSONPath '$.detail.requestParameters.overrides.containerOverrides[0].environment[?(@.name=='AWS_BATCH_JOB_ID')].value' specified for the field 'JobId.$' could not be found in the input '...'",
  "error": "States.Runtime"
}

Proposed solution