Closed: @achave11-ucsc closed this issue 8 months ago.
@nadove-ucsc: "Spike to correlate the failed request with an invocation of the service lambda in CloudWatch Log Insights. Since the lambda timed out, the log entry may be missing from ApiGateway."
This behavior is similar to what's observed in #5472 (Lambda azul-service-dev times out and causes an alarm).
The service lambda has an execution timeout of 31 seconds, and it may take longer than 10 seconds to obtain responses to the HEAD requests against the API endpoints; 10 seconds is the execution timeout of the servicecachehealth lambda.
CloudWatch Logs Insights
region: us-east-1
log-group-names: /aws/lambda/azul-service-anvilprod-servicecachehealth, /aws/lambda/azul-service-anvilprod
start-time: 2023-08-11T15:28:29.000Z
end-time: 2023-08-11T15:29:09.000Z
query-string:
fields @timestamp, @message
| filter @message like /START|HEAD|REPORT/
| sort @timestamp asc
| limit 50
@timestamp | @message |
---|---|
2023-08-11 15:28:37.986 | START RequestId: f3feb75c-6d88-4a66-9534-d82309b5ffd1 Version: $LATEST |
2023-08-11 15:28:38.623 | START RequestId: ccfc09f7-47d6-4057-94ca-6a4093fe5b0e Version: $LATEST |
2023-08-11 15:28:38.626 | [INFO] 2023-08-11T15:28:38.624Z ccfc09f7-47d6-4057-94ca-6a4093fe5b0e azul.chalice Received HEAD request for '/index/datasets', with {"query": {"size": "1"}, "headers": {"accept": "*/*", "accept-encoding": "gzip, deflate", "cloudfront-forwarded-proto": "https", "cloudfront-is-desktop-viewer": "true", "cloudfront-is-mobile-viewer": "false", "cloudfront-is-smarttv-viewer": "false", "cloudfront-is-tablet-viewer": "false", "cloudfront-viewer-asn": "14618", "cloudfront-viewer-country": "US", "host": "service.prod.anvil.gi.ucsc.edu", "user-agent": "python-requests/2.31.0", "via": "1.1 2f66aa06710fece8ed203ab0ea81eb56.cloudfront.net (CloudFront)", "x-amz-cf-id": "TTN-ca7DEX86W3d1YwWZMvhzggTdRbNy-OUP5XPR8JjrgDfLiwEI2w==", "x-amzn-trace-id": "Root=1-64d653a6-3af69e3067c2d14c0b13975b", "x-forwarded-for": "35.168.152.160, 130.176.98.154", "x-forwarded-port": "443", "x-forwarded-proto": "https"}}. |
2023-08-11 15:28:38.729 | START RequestId: dca268db-84ff-4549-9bec-47aa109b0d39 Version: $LATEST |
2023-08-11 15:28:38.729 | [INFO] 2023-08-11T15:28:38.729Z dca268db-84ff-4549-9bec-47aa109b0d39 azul.chalice Received HEAD request for '/index/activities', with {"query": {"size": "1"}, "headers": {"accept": "*/*", "accept-encoding": "gzip, deflate", "cloudfront-forwarded-proto": "https", "cloudfront-is-desktop-viewer": "true", "cloudfront-is-mobile-viewer": "false", "cloudfront-is-smarttv-viewer": "false", "cloudfront-is-tablet-viewer": "false", "cloudfront-viewer-asn": "14618", "cloudfront-viewer-country": "US", "host": "service.prod.anvil.gi.ucsc.edu", "user-agent": "python-requests/2.31.0", "via": "1.1 e3e94284a800d30d02bd662be67e1bf2.cloudfront.net (CloudFront)", "x-amz-cf-id": "ocERjZmXR9yUTD6zZraMTFRzHrOelHU3BeTAZvwUAXwl15eqbI1SuA==", "x-amzn-trace-id": "Root=1-64d653a6-455f98b1610609da18e54a3e", "x-forwarded-for": "35.168.152.160, 130.176.98.85", "x-forwarded-port": "443", "x-forwarded-proto": "https"}}. |
2023-08-11 15:28:38.811 | START RequestId: a8ba3c0f-65fa-4bb3-948d-da2052e9b33a Version: $LATEST |
2023-08-11 15:28:38.812 | [INFO] 2023-08-11T15:28:38.812Z a8ba3c0f-65fa-4bb3-948d-da2052e9b33a azul.chalice Received HEAD request for '/index/donors', with {"query": {"size": "1"}, "headers": {"accept": "*/*", "accept-encoding": "gzip, deflate", "cloudfront-forwarded-proto": "https", "cloudfront-is-desktop-viewer": "true", "cloudfront-is-mobile-viewer": "false", "cloudfront-is-smarttv-viewer": "false", "cloudfront-is-tablet-viewer": "false", "cloudfront-viewer-asn": "14618", "cloudfront-viewer-country": "US", "host": "service.prod.anvil.gi.ucsc.edu", "user-agent": "python-requests/2.31.0", "via": "1.1 157ebd6865840045fc8b5ed1cce7e466.cloudfront.net (CloudFront)", "x-amz-cf-id": "BZMFJdu21Pi_Lm9oT4JqVwtB7rv8xkEupTvPu0uSogJelHR-Z4JuSg==", "x-amzn-trace-id": "Root=1-64d653a6-58743e3108b286402d7630d4", "x-forwarded-for": "35.168.152.160, 130.176.98.142", "x-forwarded-port": "443", "x-forwarded-proto": "https"}}. |
2023-08-11 15:28:38.831 | START RequestId: bb742e51-9730-4635-b5f5-52133b4d239c Version: $LATEST |
2023-08-11 15:28:38.835 | [INFO] 2023-08-11T15:28:38.832Z bb742e51-9730-4635-b5f5-52133b4d239c azul.chalice Received HEAD request for '/index/biosamples', with {"query": {"size": "1"}, "headers": {"accept": "*/*", "accept-encoding": "gzip, deflate", "cloudfront-forwarded-proto": "https", "cloudfront-is-desktop-viewer": "true", "cloudfront-is-mobile-viewer": "false", "cloudfront-is-smarttv-viewer": "false", "cloudfront-is-tablet-viewer": "false", "cloudfront-viewer-asn": "14618", "cloudfront-viewer-country": "US", "host": "service.prod.anvil.gi.ucsc.edu", "user-agent": "python-requests/2.31.0", "via": "1.1 ffa4b37ccdc94a8c62bf6b6414725210.cloudfront.net (CloudFront)", "x-amz-cf-id": "SkHSMFRSJY0En2S_jdgGAYrIweN-uRSTeoJ68UJz3sJEmeOxE7WM3g==", "x-amzn-trace-id": "Root=1-64d653a6-14ecea643dc679562802a771", "x-forwarded-for": "35.168.152.160, 130.176.98.139", "x-forwarded-port": "443", "x-forwarded-proto": "https"}}. |
2023-08-11 15:28:38.915 | START RequestId: 59cfe9e5-d3de-48db-87a5-1786b1bab6cc Version: $LATEST |
2023-08-11 15:28:38.918 | [INFO] 2023-08-11T15:28:38.916Z 59cfe9e5-d3de-48db-87a5-1786b1bab6cc azul.chalice Received HEAD request for '/index/files', with {"query": {"size": "1"}, "headers": {"accept": "*/*", "accept-encoding": "gzip, deflate", "cloudfront-forwarded-proto": "https", "cloudfront-is-desktop-viewer": "true", "cloudfront-is-mobile-viewer": "false", "cloudfront-is-smarttv-viewer": "false", "cloudfront-is-tablet-viewer": "false", "cloudfront-viewer-asn": "14618", "cloudfront-viewer-country": "US", "host": "service.prod.anvil.gi.ucsc.edu", "user-agent": "python-requests/2.31.0", "via": "1.1 b471d3775e81a9be536b52b99f39452a.cloudfront.net (CloudFront)", "x-amz-cf-id": "RP0UFv01s7G8NsgoEqxLm-MD-VJEKD4_n6cB0DleM41Ws9KeU10rlw==", "x-amzn-trace-id": "Root=1-64d653a6-6899384a697c2ea654b68edb", "x-forwarded-for": "35.168.152.160, 130.176.98.76", "x-forwarded-port": "443", "x-forwarded-proto": "https"}}. |
2023-08-11 15:28:48.006 | REPORT RequestId: f3feb75c-6d88-4a66-9534-d82309b5ffd1 Duration: 10020.03 ms Billed Duration: 10000 ms Memory Size: 128 MB Max Memory Used: 122 MB |
2023-08-11 15:28:48.159 | INIT_START Runtime Version: python:3.9.v26 Runtime Version ARN: arn:aws:lambda:us-east-1::runtime:130681a0855afedf31b2b3fbcc2fbf1ca62875e0500edb56fd16cad65045b05b |
2023-08-11 15:28:49.747 | REPORT RequestId: bb742e51-9730-4635-b5f5-52133b4d239c Duration: 10915.00 ms Billed Duration: 10916 ms Memory Size: 2048 MB Max Memory Used: 149 MB |
2023-08-11 15:28:49.832 | REPORT RequestId: 59cfe9e5-d3de-48db-87a5-1786b1bab6cc Duration: 10916.96 ms Billed Duration: 10917 ms Memory Size: 2048 MB Max Memory Used: 158 MB |
2023-08-11 15:28:50.519 | REPORT RequestId: a8ba3c0f-65fa-4bb3-948d-da2052e9b33a Duration: 11707.90 ms Billed Duration: 11708 ms Memory Size: 2048 MB Max Memory Used: 165 MB |
2023-08-11 15:28:53.711 | REPORT RequestId: ccfc09f7-47d6-4057-94ca-6a4093fe5b0e Duration: 15088.10 ms Billed Duration: 15089 ms Memory Size: 2048 MB Max Memory Used: 160 MB |
2023-08-11 15:28:53.809 | REPORT RequestId: dca268db-84ff-4549-9bec-47aa109b0d39 Duration: 15080.19 ms Billed Duration: 15081 ms Memory Size: 2048 MB Max Memory Used: 215 MB |
2023-08-11 15:28:55.416 | START RequestId: 466ba2a9-0366-47ac-adb2-2c6c3f39a257 Version: $LATEST |
2023-08-11 15:28:55.417 | START RequestId: 0792fe28-c7d5-4435-b154-4c0339fe9089 Version: $LATEST |
2023-08-11 15:28:55.601 | REPORT RequestId: 466ba2a9-0366-47ac-adb2-2c6c3f39a257 Duration: 184.29 ms Billed Duration: 185 ms Memory Size: 2048 MB Max Memory Used: 215 MB |
2023-08-11 15:28:55.804 | REPORT RequestId: 0792fe28-c7d5-4435-b154-4c0339fe9089 Duration: 387.19 ms Billed Duration: 388 ms Memory Size: 2048 MB Max Memory Used: 170 MB |
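For reference, the query above can also be run programmatically. A minimal boto3 sketch, with the log group names, time range (converted to epoch seconds), and query string taken from the spec above:

```python
import time

import boto3

logs = boto3.client('logs', region_name='us-east-1')
query = logs.start_query(
    logGroupNames=[
        '/aws/lambda/azul-service-anvilprod-servicecachehealth',
        '/aws/lambda/azul-service-anvilprod',
    ],
    startTime=1691767709,  # 2023-08-11T15:28:29Z
    endTime=1691767749,  # 2023-08-11T15:29:09Z
    queryString='fields @timestamp, @message'
                ' | filter @message like /START|HEAD|REPORT/'
                ' | sort @timestamp asc'
                ' | limit 50',
)
# start_query is asynchronous; poll until the query finishes
while (r := logs.get_query_results(queryId=query['queryId']))['status'] in ('Scheduled', 'Running'):
    time.sleep(1)
for row in r['results']:
    print({f['field']: f['value'] for f in row})
```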
This might be the same underlying problem as in https://github.com/DataBiosphere/azul/issues/5472#issuecomment-1681106348.
The multiple requests to the various /index/{entity_type} endpoints took longer than expected because the /api/repository/v1/snapshots/roleMap requests that the service makes to Terra timed out. Each such request timed out after 5 seconds and was retried at least twice, each retry also with a 5-second timeout; two of those retries on top of the initial attempt (3 × 5 s = 15 s) already put the servicecachehealth lambda beyond its 10-second execution time limit.
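To illustrate that arithmetic, here is a hypothetical sketch of client settings matching the behavior described above; the actual TDR client configuration lives in the azul codebase and may differ:

```python
from urllib3 import Retry, Timeout

# Hypothetical values mirroring the observed behavior: each attempt is
# capped at 5 seconds, and up to two retries follow the initial
# attempt, so the worst case is 3 x 5 s = 15 s per request, which
# exceeds the 10 s execution timeout of the servicecachehealth lambda.
timeout = Timeout(connect=5, read=5)
retries = Retry(connect=2, read=2)
```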
This may also be affecting azul-indexer-anvildev-indexercachehealth, which recently failed due to a Lambda execution timeout.
Around this time, the GitLab anvildev instance was being rebooted as part of a scheduled volume backup, but I'm unsure whether that's related.
@nadove-ucsc: "Logs show that during the past weeks, when there is a delayed response from TDR the delay doesn't exceed 12 seconds. Because the TDR client is configured to attempt a maximum of two connect retries each with a five second timeout, the maximum latency we can observe from TDR is 15 seconds. The service cache health lambda has a timeout of 10 seconds, meaning that it may timeout even when the service lambda is within the bounds of its expected latency. We either need to increase the timeout of the service cache Lambda or reduce the timeout of the TDR retries."
Degraded performance from TDR (503 responses) also causes the servicecachehealth lambda to time out.
@hannes-ucsc: "Assignee to determine how often the service lambda times out in response to requests made by the servicecachehealth lambda, in terms of a ratio between number of total requests and timed out requests"
From 2023-08-01 to 2023-09-07, the following numbers of azul-service-anvilprod-servicecachehealth executions failed…
[
{
"bin(24h)": "2023-09-05 00:00:00.000",
"count(@requestId)": "121"
},
{
"bin(24h)": "2023-08-30 00:00:00.000",
"count(@requestId)": "2"
},
{
"bin(24h)": "2023-08-29 00:00:00.000",
"count(@requestId)": "1"
},
{
"bin(24h)": "2023-08-24 00:00:00.000",
"count(@requestId)": "114"
},
{
"bin(24h)": "2023-08-11 00:00:00.000",
"count(@requestId)": "2"
},
{
"bin(24h)": "2023-08-08 00:00:00.000",
"count(@requestId)": "1"
}
]
… note that approximately 1440 service/indexer cachehealth executions run daily (one per minute), with each execution making five service requests (one per entity endpoint). For the same timeframe, the following numbers of service executions timed out…
[
{
"bin(24h)": "2023-09-05 00:00:00.000",
"count(@requestId)": "2"
},
{
"bin(24h)": "2023-08-30 00:00:00.000",
"count(@requestId)": "2"
},
{
"bin(24h)": "2023-08-10 00:00:00.000",
"count(@requestId)": "1"
},
{
"bin(24h)": "2023-08-09 00:00:00.000",
"count(@requestId)": "3"
},
{
"bin(24h)": "2023-08-08 00:00:00.000",
"count(@requestId)": "3"
}
]
… and these are the counts of service executions that took longer than 10 seconds…
[
{
"bin(24h)": "2023-09-05 00:00:00.000",
"count(@requestId)": "590"
},
{
"bin(24h)": "2023-08-30 00:00:00.000",
"count(@requestId)": "2"
},
{
"bin(24h)": "2023-08-24 00:00:00.000",
"count(@requestId)": "572"
},
{
"bin(24h)": "2023-08-18 00:00:00.000",
"count(@requestId)": "1"
},
{
"bin(24h)": "2023-08-14 00:00:00.000",
"count(@requestId)": "1"
},
{
"bin(24h)": "2023-08-11 00:00:00.000",
"count(@requestId)": "9"
},
{
"bin(24h)": "2023-08-10 00:00:00.000",
"count(@requestId)": "1"
},
{
"bin(24h)": "2023-08-09 00:00:00.000",
"count(@requestId)": "4"
},
{
"bin(24h)": "2023-08-08 00:00:00.000",
"count(@requestId)": "6"
}
]
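The queries that produced these counts weren't captured verbatim in this thread. The following boto3 sketch shows plausible reconstructions, relying on the auto-discovered Lambda log fields @type, @duration, and @requestId; the exact filters used at the time may have differed:

```python
import boto3

logs = boto3.client('logs', region_name='us-east-1')

# Executions that timed out (the runtime logs 'Task timed out after …'):
timed_out = ('filter @message like /Task timed out/'
             ' | stats count(@requestId) by bin(24h)')
# Executions that took longer than 10 s (@duration is in milliseconds):
slow = ('filter @type = "REPORT" and @duration > 10000'
        ' | stats count(@requestId) by bin(24h)')
query = logs.start_query(
    logGroupName='/aws/lambda/azul-service-anvilprod',
    startTime=1690848000,  # 2023-08-01T00:00:00Z
    endTime=1694044800,  # 2023-09-07T00:00:00Z
    queryString=slow,
)
```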
@hannes-ucsc: "I'd like to tackle this by optimizing the cachehealth Lambda and our retry policy with the goal of reducing the number of false alarms to an acceptably low level: I don't think we need to make a service request for every entity type, (which also eliminates the need for a thread pool) and we shouldn't retry on read when the timing is restricted. The use of a thread pool for making service requests in parallel also subverts our caching of accessible sources."
For demo, show that there was a reduction in the number of occurrences of this issue using CW, for a period of two weeks before and after this lands in a lower main deployment.
@hannes-ucsc: "The changes from partial PR #5564 significantly reduced the number of occurrences of this issue. We just observed one timeout in dev
and one in anvildev
, but we don't know exactly what the lambda was doing when it timed out. It could have been interacting with SQS or writing the Health object back to S3. We won't know until we enable boto3 logging for the S3 and SQS clients, similar to how we enable it for DynamoDb in PR #5479 for #5472. Additionally, we observed that the order in which Health properties are evaluated is non-deterministic. We should fix that too. We also observed that the servicecachealth
lambda is retried by default. We should disable the retry because the lambda is scheduled to run every minute anyways. Disabling retries cannot be done with Chalice, but through Terraform (aws_lambda_function_event_invoke_config
) like we do for the log forwarder lambdas."
Also, update the timeout for the servicecachehealth lambda from 10 to 30 seconds.
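For illustration, both changes map onto the following Lambda APIs. This is a sketch with the anvilprod function name; the actual changes are made through Terraform (aws_lambda_function_event_invoke_config, which manages the same configuration as the first call) and the deployment config, not ad hoc API calls:

```python
import boto3

lambda_ = boto3.client('lambda')
# Disable the default retries for asynchronous (scheduled) invocations
lambda_.put_function_event_invoke_config(
    FunctionName='azul-service-anvilprod-servicecachehealth',
    MaximumRetryAttempts=0,
)
# Raise the execution timeout from 10 to 30 seconds
lambda_.update_function_configuration(
    FunctionName='azul-service-anvilprod-servicecachehealth',
    Timeout=30,
)
```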
@hannes-ucsc: "We've already observed a reduction in number of occurrences, so no demo will be necessary."
For demo, two weeks after this lands in anvilprod or prod, show the reduction in the incident rate of the corresponding alarm.
Assignee to consider next steps in light of updated description.
Looking at the invocation that timed out, the one from the updated description: memory is consistently high and close to the maximum allowed memory. The execution context was fairly old when the invocation timed out ("Init Duration" in the REPORT message marks the first invocation in an execution context). The invocation immediately following the timeout also takes quite a long time (10 s). It is possible that the Lambda was just GCing (and, as part of that, calling destructors with side effects, potentially for a large number of objects) when the timeout occurred. That may be an indication of some sort of resource leak in the Lambda.
Now that the timing is more relaxed, the prevalence of these timeouts may be much reduced. If this occurs again, and frequently enough, we should consider bumping the maximum allowed memory to, say, 196 MiB. We could also try initiating GC at the beginning of every invocation, as sketched below.
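A minimal sketch of that mitigation (hypothetical handler name; the real entry point is the Chalice app):

```python
import gc

def handler(event, context):
    # Run a full collection up front so that garbage accumulated by
    # previous invocations in this execution context is reclaimed at a
    # predictable point, instead of a GC pause (and any destructor
    # side effects) landing near the end of the execution window.
    gc.collect()
    ...  # actual health check work goes here
```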
@hannes-ucsc: "@achave11-ucsc and I research another occurrence. We looked at the S3 data events in the trail and could not find a data event for the boto3 PUT request that timed out. We also looked at the VPC flow logs between the Lambda and the NAT Gateway as well as the NAT Gateway and the external IPs but could not find anything out of the ordinary, specifically no dropped or rejected connection. While it's still possible that the network is flaky, the garbage collection hypothesis is more likely."
Actually, googling around, there are reports of intermittent connection errors between a Lambda function in a VPC and S3, and reports that enabling a VPC endpoint for S3 fixes them. Currently, all traffic between our Lambda functions and AWS services is routed via a NAT gateway, and that seems to be causing the flakiness.
There are two types of endpoints to choose from for S3: gateway endpoints and interface endpoints. The former operate at the routing level and, unlike the latter, don't require configuring clients to use a dedicated hostname for the endpoint. Interface endpoints appear to function like load balancers or proxies, and they do require special configuration of clients.
Gateway endpoints are free; interface endpoints are not, but they are still four times cheaper than NAT gateways. On the other hand, gateway endpoints are currently only available for S3 and DynamoDB. We should enable gateway endpoints for both to resolve this issue, and follow up by enabling interface endpoints for other AWS services heavily used by our Lambda functions; SQS comes to mind. We should also double-check that traffic between the Lambda functions and the ES domains isn't routed through the NAT gateway.
> On the other hand, gateway endpoints are currently only available for S3 and DynamoDB. We should enable gateway endpoints for both to resolve this issue,
Assignee to do that.
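For illustration, creating a gateway endpoint for S3 amounts to the following. This is a boto3 sketch using the azul-gitlab VPC and route table IDs that appear in the analyses below; the actual change would be made in Terraform:

```python
import boto3

ec2 = boto3.client('ec2', region_name='us-east-1')
# A gateway endpoint adds a route-table entry for the S3 prefix list,
# so traffic to S3 bypasses the NAT gateway; no client changes needed.
ec2.create_vpc_endpoint(
    VpcId='vpc-03aeb4a3542c7a5d3',  # azul-gitlab
    ServiceName='com.amazonaws.us-east-1.s3',
    VpcEndpointType='Gateway',
    RouteTableIds=['rtb-03d5e85affa440cbe'],  # azul-gitlab_1
)
```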
> We should also double-check that traffic between the Lambda functions and the ES domains isn't routed through the NAT gateway.
Assignee to run the VPC Reachability Analyzer for that. They may need to coordinate with me for assistance.
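A sketch of setting up such an analysis programmatically, using the Lambda's ENI and the ES endpoint IP that appear in the findings below:

```python
import boto3

ec2 = boto3.client('ec2', region_name='us-east-1')
# Define a path from the Lambda's network interface to the ES domain
# endpoint on port 443, then start the analysis
path = ec2.create_network_insights_path(
    Source='eni-00ea1974cae084df4',  # the Lambda's ENI
    DestinationIp='172.71.2.113',  # ES domain endpoint (from `host`)
    Protocol='tcp',
    DestinationPort=443,
)['NetworkInsightsPath']
analysis = ec2.start_network_insights_analysis(
    NetworkInsightsPathId=path['NetworkInsightsPathId'],
)
```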
VPC Reachability Analyzer findings:
The analyzer was run from the Lambda (in a personal deployment) directly to the IP of the sandbox ES instance. The IP for the ES instance was obtained by running the host command on the VPC domain endpoint:
$ host vpc-azul-index-sandbox-mcwjphhhdivigzrsrdmxm2uude.us-east-1.es.amazonaws.com
vpc-azul-index-sandbox-mcwjphhhdivigzrsrdmxm2uude.us-east-1.es.amazonaws.com has address 172.71.0.63
vpc-azul-index-sandbox-mcwjphhhdivigzrsrdmxm2uude.us-east-1.es.amazonaws.com has address 172.71.2.113
The results exhibit the desired behavior: traffic between the Lambda and the ES domain is not routed through the NAT gateway:
aws ec2 describe-network-insights-analyses --network-insights-analysis-id nia-0579c6e281a3d9644
{
"NetworkInsightsAnalyses": [
{
"NetworkInsightsAnalysisId": "nia-0579c6e281a3d9644",
"NetworkInsightsAnalysisArn": "arn:aws:ec2:us-east-1:122796619775:network-insights-analysis/nia-0579c6e281a3d9644",
"NetworkInsightsPathId": "nip-0cea7cb0c1d9a3398",
"StartDate": "2024-01-30T20:44:11.674Z",
"Status": "succeeded",
"NetworkPathFound": true,
"ForwardPathComponents": [
{
"SequenceNumber": 1,
"Component": {
"Id": "eni-00ea1974cae084df4",
"Arn": "arn:aws:ec2:us-east-1:122796619775:network-interface/eni-00ea1974cae084df4"
},
"OutboundHeader": {
"DestinationAddresses": [
"172.71.2.113/32"
],
"DestinationPortRanges": [
{
"From": 443,
"To": 443
}
],
"Protocol": "6",
"SourceAddresses": [
"172.71.2.109/32"
],
"SourcePortRanges": [
{
"From": 0,
"To": 65535
}
]
},
"Subnet": {
"Id": "subnet-03e66f953199f819b",
"Arn": "arn:aws:ec2:us-east-1:122796619775:subnet/subnet-03e66f953199f819b",
"Name": "azul-gitlab_private_1"
},
"Vpc": {
"Id": "vpc-03aeb4a3542c7a5d3",
"Arn": "arn:aws:ec2:us-east-1:122796619775:vpc/vpc-03aeb4a3542c7a5d3",
"Name": "azul-gitlab"
}
},
{
"SequenceNumber": 2,
"Component": {
"Id": "sg-016f9bece0ab8f7c7",
"Arn": "arn:aws:ec2:us-east-1:122796619775:security-group/sg-016f9bece0ab8f7c7",
"Name": "azul-service-abrahamsc"
},
"SecurityGroupRule": {
"Cidr": "0.0.0.0/0",
"Direction": "egress",
"Protocol": "all"
}
},
{
"SequenceNumber": 3,
"Component": {
"Id": "sg-074d6489649695b48",
"Arn": "arn:aws:ec2:us-east-1:122796619775:security-group/sg-074d6489649695b48",
"Name": "azul-elasticsearch-sandbox"
},
"SecurityGroupRule": {
"Cidr": "172.71.0.0/16",
"Direction": "ingress",
"PortRange": {
"From": 443,
"To": 443
},
"Protocol": "tcp"
}
},
{
"SequenceNumber": 4,
"Component": {
"Id": "eni-0607e9a8015369a0e",
"Arn": "arn:aws:ec2:us-east-1:122796619775:network-interface/eni-0607e9a8015369a0e"
},
"InboundHeader": {
"DestinationAddresses": [
"172.71.2.113/32"
],
"DestinationPortRanges": [
{
"From": 443,
"To": 443
}
],
"Protocol": "6",
"SourceAddresses": [
"172.71.2.109/32"
],
"SourcePortRanges": [
{
"From": 0,
"To": 65535
}
]
},
"Subnet": {
"Id": "subnet-03e66f953199f819b",
"Arn": "arn:aws:ec2:us-east-1:122796619775:subnet/subnet-03e66f953199f819b",
"Name": "azul-gitlab_private_1"
},
"Vpc": {
"Id": "vpc-03aeb4a3542c7a5d3",
"Arn": "arn:aws:ec2:us-east-1:122796619775:vpc/vpc-03aeb4a3542c7a5d3",
"Name": "azul-gitlab"
}
}
],
"ReturnPathComponents": [
{
"SequenceNumber": 1,
"Component": {
"Id": "eni-0607e9a8015369a0e",
"Arn": "arn:aws:ec2:us-east-1:122796619775:network-interface/eni-0607e9a8015369a0e"
},
"OutboundHeader": {
"DestinationAddresses": [
"172.71.2.109/32"
],
"DestinationPortRanges": [
{
"From": 0,
"To": 65535
}
],
"Protocol": "6",
"SourceAddresses": [
"172.71.2.113/32"
],
"SourcePortRanges": [
{
"From": 443,
"To": 443
}
]
},
"Subnet": {
"Id": "subnet-03e66f953199f819b",
"Arn": "arn:aws:ec2:us-east-1:122796619775:subnet/subnet-03e66f953199f819b",
"Name": "azul-gitlab_private_1"
},
"Vpc": {
"Id": "vpc-03aeb4a3542c7a5d3",
"Arn": "arn:aws:ec2:us-east-1:122796619775:vpc/vpc-03aeb4a3542c7a5d3",
"Name": "azul-gitlab"
}
},
{
"SequenceNumber": 2,
"Component": {
"Id": "sg-074d6489649695b48",
"Arn": "arn:aws:ec2:us-east-1:122796619775:security-group/sg-074d6489649695b48",
"Name": "azul-elasticsearch-sandbox"
}
},
{
"SequenceNumber": 3,
"Component": {
"Id": "sg-016f9bece0ab8f7c7",
"Arn": "arn:aws:ec2:us-east-1:122796619775:security-group/sg-016f9bece0ab8f7c7",
"Name": "azul-service-abrahamsc"
}
},
{
"SequenceNumber": 4,
"Component": {
"Id": "eni-00ea1974cae084df4",
"Arn": "arn:aws:ec2:us-east-1:122796619775:network-interface/eni-00ea1974cae084df4"
},
"InboundHeader": {
"DestinationAddresses": [
"172.71.2.109/32"
],
"DestinationPortRanges": [
{
"From": 0,
"To": 65535
}
],
"Protocol": "6",
"SourceAddresses": [
"172.71.2.113/32"
],
"SourcePortRanges": [
{
"From": 443,
"To": 443
}
]
},
"Subnet": {
"Id": "subnet-03e66f953199f819b",
"Arn": "arn:aws:ec2:us-east-1:122796619775:subnet/subnet-03e66f953199f819b",
"Name": "azul-gitlab_private_1"
},
"Vpc": {
"Id": "vpc-03aeb4a3542c7a5d3",
"Arn": "arn:aws:ec2:us-east-1:122796619775:vpc/vpc-03aeb4a3542c7a5d3",
"Name": "azul-gitlab"
}
}
],
"Tags": []
}
]
}
For good measure, we also analyzed the connection between the Lambda and a known public IP (8.8.8.8), which, as expected, is routed through the NAT gateway:
aws ec2 describe-network-insights-analyses --network-insights-analysis-id nia-004e5d470a71d66d3
{
"NetworkInsightsAnalyses": [
{
"NetworkInsightsAnalysisId": "nia-004e5d470a71d66d3",
"NetworkInsightsAnalysisArn": "arn:aws:ec2:us-east-1:122796619775:network-insights-analysis/nia-004e5d470a71d66d3",
"NetworkInsightsPathId": "nip-0da3b654edc465468",
"StartDate": "2024-01-30T20:47:13.330Z",
"Status": "succeeded",
"NetworkPathFound": true,
"ForwardPathComponents": [
{
"SequenceNumber": 1,
"Component": {
"Id": "eni-00ea1974cae084df4",
"Arn": "arn:aws:ec2:us-east-1:122796619775:network-interface/eni-00ea1974cae084df4"
},
"OutboundHeader": {
"DestinationAddresses": [
"8.8.8.8/32"
],
"DestinationPortRanges": [
{
"From": 0,
"To": 65535
}
],
"Protocol": "6",
"SourceAddresses": [
"172.71.2.109/32"
],
"SourcePortRanges": [
{
"From": 0,
"To": 65535
}
]
},
"Subnet": {
"Id": "subnet-03e66f953199f819b",
"Arn": "arn:aws:ec2:us-east-1:122796619775:subnet/subnet-03e66f953199f819b",
"Name": "azul-gitlab_private_1"
},
"Vpc": {
"Id": "vpc-03aeb4a3542c7a5d3",
"Arn": "arn:aws:ec2:us-east-1:122796619775:vpc/vpc-03aeb4a3542c7a5d3",
"Name": "azul-gitlab"
}
},
{
"SequenceNumber": 2,
"Component": {
"Id": "sg-016f9bece0ab8f7c7",
"Arn": "arn:aws:ec2:us-east-1:122796619775:security-group/sg-016f9bece0ab8f7c7",
"Name": "azul-service-abrahamsc"
},
"SecurityGroupRule": {
"Cidr": "0.0.0.0/0",
"Direction": "egress",
"Protocol": "all"
}
},
{
"SequenceNumber": 3,
"AclRule": {
"Cidr": "0.0.0.0/0",
"Egress": true,
"Protocol": "all",
"RuleAction": "allow",
"RuleNumber": 100
},
"Component": {
"Id": "acl-05c982d914728ee6a",
"Arn": "arn:aws:ec2:us-east-1:122796619775:network-acl/acl-05c982d914728ee6a"
}
},
{
"SequenceNumber": 4,
"Component": {
"Id": "rtb-03d5e85affa440cbe",
"Arn": "arn:aws:ec2:us-east-1:122796619775:route-table/rtb-03d5e85affa440cbe",
"Name": "azul-gitlab_1"
},
"RouteTableRoute": {
"DestinationCidr": "0.0.0.0/0",
"NatGatewayId": "nat-065371190014e1037",
"Origin": "createroute",
"State": "active"
}
},
{
"SequenceNumber": 5,
"AclRule": {
"Cidr": "0.0.0.0/0",
"Egress": false,
"Protocol": "all",
"RuleAction": "allow",
"RuleNumber": 100
},
"Component": {
"Id": "acl-05c982d914728ee6a",
"Arn": "arn:aws:ec2:us-east-1:122796619775:network-acl/acl-05c982d914728ee6a"
}
},
{
"SequenceNumber": 6,
"AttachedTo": {
"Id": "nat-065371190014e1037",
"Arn": "arn:aws:ec2:us-east-1:122796619775:natgateway/nat-065371190014e1037"
},
"Component": {
"Id": "eni-0becd601af14ac6eb",
"Arn": "arn:aws:ec2:us-east-1:122796619775:network-interface/eni-0becd601af14ac6eb"
},
"Subnet": {
"Id": "subnet-06a1e10523948ab2a",
"Arn": "arn:aws:ec2:us-east-1:122796619775:subnet/subnet-06a1e10523948ab2a",
"Name": "azul-gitlab_public_1"
},
"Vpc": {
"Id": "vpc-03aeb4a3542c7a5d3",
"Arn": "arn:aws:ec2:us-east-1:122796619775:vpc/vpc-03aeb4a3542c7a5d3",
"Name": "azul-gitlab"
}
},
{
"SequenceNumber": 7,
"Component": {
"Id": "nat-065371190014e1037",
"Arn": "arn:aws:ec2:us-east-1:122796619775:natgateway/nat-065371190014e1037"
},
"OutboundHeader": {
"SourceAddresses": [
"172.71.3.245/32"
],
"SourcePortRanges": [
{
"From": 1024,
"To": 65535
}
]
}
},
{
"SequenceNumber": 8,
"AttachedTo": {
"Id": "nat-065371190014e1037",
"Arn": "arn:aws:ec2:us-east-1:122796619775:natgateway/nat-065371190014e1037"
},
"Component": {
"Id": "eni-0becd601af14ac6eb",
"Arn": "arn:aws:ec2:us-east-1:122796619775:network-interface/eni-0becd601af14ac6eb"
},
"Subnet": {
"Id": "subnet-06a1e10523948ab2a",
"Arn": "arn:aws:ec2:us-east-1:122796619775:subnet/subnet-06a1e10523948ab2a",
"Name": "azul-gitlab_public_1"
},
"Vpc": {
"Id": "vpc-03aeb4a3542c7a5d3",
"Arn": "arn:aws:ec2:us-east-1:122796619775:vpc/vpc-03aeb4a3542c7a5d3",
"Name": "azul-gitlab"
}
},
{
"SequenceNumber": 9,
"AclRule": {
"Cidr": "0.0.0.0/0",
"Egress": true,
"Protocol": "all",
"RuleAction": "allow",
"RuleNumber": 100
},
"Component": {
"Id": "acl-05c982d914728ee6a",
"Arn": "arn:aws:ec2:us-east-1:122796619775:network-acl/acl-05c982d914728ee6a"
}
},
{
"SequenceNumber": 10,
"Component": {
"Id": "rtb-0bf39b4e467da13db",
"Arn": "arn:aws:ec2:us-east-1:122796619775:route-table/rtb-0bf39b4e467da13db"
},
"RouteTableRoute": {
"DestinationCidr": "0.0.0.0/0",
"GatewayId": "igw-0f4231a8570673a3d",
"Origin": "createroute",
"State": "active"
}
},
{
"SequenceNumber": 11,
"Component": {
"Id": "igw-0f4231a8570673a3d",
"Arn": "arn:aws:ec2:us-east-1:122796619775:internet-gateway/igw-0f4231a8570673a3d",
"Name": "azul-gitlab"
},
"OutboundHeader": {
"DestinationAddresses": [
"8.8.8.8/32"
],
"DestinationPortRanges": [
{
"From": 0,
"To": 65535
}
],
"Protocol": "6",
"SourceAddresses": [
"3.214.122.182/32"
],
"SourcePortRanges": [
{
"From": 1024,
"To": 65535
}
]
},
"Vpc": {
"Id": "vpc-03aeb4a3542c7a5d3",
"Arn": "arn:aws:ec2:us-east-1:122796619775:vpc/vpc-03aeb4a3542c7a5d3",
"Name": "azul-gitlab"
}
}
],
"ReturnPathComponents": [
{
"SequenceNumber": 1,
"Component": {
"Id": "igw-0f4231a8570673a3d",
"Arn": "arn:aws:ec2:us-east-1:122796619775:internet-gateway/igw-0f4231a8570673a3d",
"Name": "azul-gitlab"
},
"OutboundHeader": {
"DestinationAddresses": [
"172.71.3.245/32"
]
},
"InboundHeader": {
"DestinationAddresses": [
"3.214.122.182/32"
],
"DestinationPortRanges": [
{
"From": 1024,
"To": 65535
}
],
"Protocol": "6",
"SourceAddresses": [
"8.8.8.8/32"
],
"SourcePortRanges": [
{
"From": 0,
"To": 65535
}
]
},
"Vpc": {
"Id": "vpc-03aeb4a3542c7a5d3",
"Arn": "arn:aws:ec2:us-east-1:122796619775:vpc/vpc-03aeb4a3542c7a5d3",
"Name": "azul-gitlab"
}
},
{
"SequenceNumber": 2,
"AclRule": {
"Cidr": "0.0.0.0/0",
"Egress": false,
"Protocol": "all",
"RuleAction": "allow",
"RuleNumber": 100
},
"Component": {
"Id": "acl-05c982d914728ee6a",
"Arn": "arn:aws:ec2:us-east-1:122796619775:network-acl/acl-05c982d914728ee6a"
}
},
{
"SequenceNumber": 3,
"AttachedTo": {
"Id": "nat-065371190014e1037",
"Arn": "arn:aws:ec2:us-east-1:122796619775:natgateway/nat-065371190014e1037"
},
"Component": {
"Id": "eni-0becd601af14ac6eb",
"Arn": "arn:aws:ec2:us-east-1:122796619775:network-interface/eni-0becd601af14ac6eb"
},
"Subnet": {
"Id": "subnet-06a1e10523948ab2a",
"Arn": "arn:aws:ec2:us-east-1:122796619775:subnet/subnet-06a1e10523948ab2a",
"Name": "azul-gitlab_public_1"
},
"Vpc": {
"Id": "vpc-03aeb4a3542c7a5d3",
"Arn": "arn:aws:ec2:us-east-1:122796619775:vpc/vpc-03aeb4a3542c7a5d3",
"Name": "azul-gitlab"
}
},
{
"SequenceNumber": 4,
"Component": {
"Id": "nat-065371190014e1037",
"Arn": "arn:aws:ec2:us-east-1:122796619775:natgateway/nat-065371190014e1037"
},
"OutboundHeader": {
"DestinationAddresses": [
"172.71.2.109/32"
],
"DestinationPortRanges": [
{
"From": 0,
"To": 65535
}
]
}
},
{
"SequenceNumber": 5,
"AttachedTo": {
"Id": "nat-065371190014e1037",
"Arn": "arn:aws:ec2:us-east-1:122796619775:natgateway/nat-065371190014e1037"
},
"Component": {
"Id": "eni-0becd601af14ac6eb",
"Arn": "arn:aws:ec2:us-east-1:122796619775:network-interface/eni-0becd601af14ac6eb"
},
"Subnet": {
"Id": "subnet-06a1e10523948ab2a",
"Arn": "arn:aws:ec2:us-east-1:122796619775:subnet/subnet-06a1e10523948ab2a",
"Name": "azul-gitlab_public_1"
},
"Vpc": {
"Id": "vpc-03aeb4a3542c7a5d3",
"Arn": "arn:aws:ec2:us-east-1:122796619775:vpc/vpc-03aeb4a3542c7a5d3",
"Name": "azul-gitlab"
}
},
{
"SequenceNumber": 6,
"AclRule": {
"Cidr": "0.0.0.0/0",
"Egress": true,
"Protocol": "all",
"RuleAction": "allow",
"RuleNumber": 100
},
"Component": {
"Id": "acl-05c982d914728ee6a",
"Arn": "arn:aws:ec2:us-east-1:122796619775:network-acl/acl-05c982d914728ee6a"
}
},
{
"SequenceNumber": 7,
"Component": {
"Id": "rtb-0bf39b4e467da13db",
"Arn": "arn:aws:ec2:us-east-1:122796619775:route-table/rtb-0bf39b4e467da13db"
},
"RouteTableRoute": {
"DestinationCidr": "172.71.0.0/16",
"GatewayId": "local",
"Origin": "createroutetable",
"State": "active"
}
},
{
"SequenceNumber": 8,
"AclRule": {
"Cidr": "0.0.0.0/0",
"Egress": false,
"Protocol": "all",
"RuleAction": "allow",
"RuleNumber": 100
},
"Component": {
"Id": "acl-05c982d914728ee6a",
"Arn": "arn:aws:ec2:us-east-1:122796619775:network-acl/acl-05c982d914728ee6a"
}
},
{
"SequenceNumber": 9,
"Component": {
"Id": "sg-016f9bece0ab8f7c7",
"Arn": "arn:aws:ec2:us-east-1:122796619775:security-group/sg-016f9bece0ab8f7c7",
"Name": "azul-service-abrahamsc"
}
},
{
"SequenceNumber": 10,
"Component": {
"Id": "eni-00ea1974cae084df4",
"Arn": "arn:aws:ec2:us-east-1:122796619775:network-interface/eni-00ea1974cae084df4"
},
"InboundHeader": {
"DestinationAddresses": [
"172.71.2.109/32"
],
"DestinationPortRanges": [
{
"From": 0,
"To": 65535
}
],
"Protocol": "6",
"SourceAddresses": [
"8.8.8.8/32"
],
"SourcePortRanges": [
{
"From": 0,
"To": 65535
}
]
},
"Subnet": {
"Id": "subnet-03e66f953199f819b",
"Arn": "arn:aws:ec2:us-east-1:122796619775:subnet/subnet-03e66f953199f819b",
"Name": "azul-gitlab_private_1"
},
"Vpc": {
"Id": "vpc-03aeb4a3542c7a5d3",
"Arn": "arn:aws:ec2:us-east-1:122796619775:vpc/vpc-03aeb4a3542c7a5d3",
"Name": "azul-gitlab"
}
}
],
"Tags": []
}
]
}
… when requesting https://service.prod.anvil.gi.ucsc.edu/index/files?size=1 to build the health object.
EDIT: (@achave11-ucsc)
Better logging has now made it possible to detect an S3 PUT operation timing out. That is somewhat surprising, since the object isn't of significant size.
CloudWatch Logs Insights
region: us-east-1
log-group-names: /aws/lambda/azul-service-prod-servicecachehealth
start-time: 2024-01-04T18:34:57.841Z
end-time: 2024-01-05T04:01:43.597Z
query-string: