Hi,
Thanks for the issue and narrowing it down!
Any chance you would be willing to provide a full copy of the metadata endpoint that I could add for testing? I can update just the cpu_stats, but I would love to know that all the other functions are working as expected as well.
Thanks again!
Here is a PR to test as well: https://github.com/influxdata/telegraf/pull/14197
Hello.
Thanks for the really fast response.
Here is the data from the metadata endpoints.
From the ${ECS_CONTAINER_METADATA_URI}/task endpoint:
{
"Cluster": "ecs-cluster-prod",
"TaskARN": "arn:aws:ecs:eu-west-2:<account_id>:task/ecs-cluster-prod/a22cfc58cdb04b27ada2caa3fd526463",
"Family": "nginx-telegraf",
"Revision": "3",
"DesiredStatus": "RUNNING",
"KnownStatus": "RUNNING",
"Containers": [
{
"DockerId": "c69461b2c836cc3f0e3e5deb07b1f16e25f6009da2a48bb0adc7dd580befaf55",
"Name": "~internal~ecs~pause",
"DockerName": "ecs-nginx-telegraf-3-internalecspause-f0cc8fe3bfc9a8efd601",
"Image": "amazon/amazon-ecs-pause:0.1.0",
"ImageID": "",
"Labels": {
"com.amazonaws.ecs.cluster": "ecs-cluster-prod",
"com.amazonaws.ecs.container-name": "~internal~ecs~pause",
"com.amazonaws.ecs.task-arn": "arn:aws:ecs:eu-west-2:<account_id>:task/ecs-cluster-prod/a22cfc58cdb04b27ada2caa3fd526463",
"com.amazonaws.ecs.task-definition-family": "nginx-telegraf",
"com.amazonaws.ecs.task-definition-version": "3"
},
"DesiredStatus": "RESOURCES_PROVISIONED",
"KnownStatus": "RESOURCES_PROVISIONED",
"Limits": {
"CPU": 2,
"Memory": 0
},
"CreatedAt": "2023-10-26T15:57:02.520698803Z",
"StartedAt": "2023-10-26T15:57:02.806357646Z",
"Type": "CNI_PAUSE",
"Networks": [
{
"NetworkMode": "awsvpc",
"IPv4Addresses": [
"10.1.4.120"
]
}
]
},
{
"DockerId": "f1101546d0a33be09214bf220b0e39a2551c1557751770d545095a86e9cd87b5",
"Name": "nginx",
"DockerName": "ecs-nginx-telegraf-3-nginx-eed4b29dad8cb3caec01",
"Image": "nginx:latest",
"ImageID": "sha256:593aee2afb642798b83a85306d2625fd7f089c0a1242c7e75a237846d80aa2a0",
"Labels": {
"com.amazonaws.ecs.cluster": "ecs-cluster-prod",
"com.amazonaws.ecs.container-name": "nginx",
"com.amazonaws.ecs.task-arn": "arn:aws:ecs:eu-west-2:<account_id>:task/ecs-cluster-prod/a22cfc58cdb04b27ada2caa3fd526463",
"com.amazonaws.ecs.task-definition-family": "nginx-telegraf",
"com.amazonaws.ecs.task-definition-version": "3"
},
"DesiredStatus": "RUNNING",
"KnownStatus": "RUNNING",
"Limits": {
"CPU": 256,
"Memory": 0
},
"CreatedAt": "2023-10-26T15:57:12.429615405Z",
"StartedAt": "2023-10-26T15:57:13.688670241Z",
"Type": "NORMAL",
"Networks": [
{
"NetworkMode": "awsvpc",
"IPv4Addresses": [
"10.1.4.120"
]
}
]
},
{
"DockerId": "6c6b1a390db5743dde7a912e362d94a042705b655599707ca3a6792f9a1b0d2d",
"Name": "telegraf",
"DockerName": "ecs-nginx-telegraf-3-telegraf-e288e09cb68be1fa2b00",
"Image": "telegraf:latest",
"ImageID": "sha256:f0e412589f82499fa2f37df76113b83db2f4b5ad0d0f0a53a6b38a037fc307d8",
"Labels": {
"com.amazonaws.ecs.cluster": "ecs-cluster-prod",
"com.amazonaws.ecs.container-name": "telegraf",
"com.amazonaws.ecs.task-arn": "arn:aws:ecs:eu-west-2:<account_id>:task/ecs-cluster-prod/a22cfc58cdb04b27ada2caa3fd526463",
"com.amazonaws.ecs.task-definition-family": "nginx-telegraf",
"com.amazonaws.ecs.task-definition-version": "3"
},
"DesiredStatus": "STOPPED",
"KnownStatus": "STOPPED",
"Limits": {
"CPU": 256,
"Memory": 0
},
"CreatedAt": "2023-10-26T15:57:18.334772649Z",
"Type": "NORMAL",
"Networks": [
{
"NetworkMode": "awsvpc",
"IPv4Addresses": [
"10.1.4.120"
]
}
]
}
],
"Limits": {
"CPU": 0.5,
"Memory": 1024
},
"PullStartedAt": "2023-10-26T15:57:03.528549391Z",
"PullStoppedAt": "2023-10-26T15:57:18.310985676Z",
"AvailabilityZone": "eu-west-2b"
}
From the ${ECS_CONTAINER_METADATA_URI}/task/stats endpoint:
{
"6c6b1a390db5743dde7a912e362d94a042705b655599707ca3a6792f9a1b0d2d": null,
"c69461b2c836cc3f0e3e5deb07b1f16e25f6009da2a48bb0adc7dd580befaf55": {
"read": "2023-10-26T15:58:32.690109934Z",
"preread": "2023-10-26T15:58:31.642796089Z",
"pids_stats": {
"current": 1,
"limit": 4618
},
"blkio_stats": {
"io_service_bytes_recursive": null,
"io_serviced_recursive": null,
"io_queue_recursive": null,
"io_service_time_recursive": null,
"io_wait_time_recursive": null,
"io_merged_recursive": null,
"io_time_recursive": null,
"sectors_recursive": null
},
"num_procs": 0,
"storage_stats": {},
"cpu_stats": {
"cpu_usage": {
"total_usage": 34818000,
"usage_in_kernelmode": 5397000,
"usage_in_usermode": 29420000
},
"system_cpu_usage": 16970500000000,
"online_cpus": 2,
"throttling_data": {
"periods": 0,
"throttled_periods": 0,
"throttled_time": 0
}
},
"precpu_stats": {
"cpu_usage": {
"total_usage": 34818000,
"usage_in_kernelmode": 5397000,
"usage_in_usermode": 29420000
},
"system_cpu_usage": 16968420000000,
"online_cpus": 2,
"throttling_data": {
"periods": 0,
"throttled_periods": 0,
"throttled_time": 0
}
},
"memory_stats": {
"usage": 303104,
"stats": {
"active_anon": 4096,
"active_file": 0,
"anon": 45056,
"anon_thp": 0,
"file": 0,
"file_dirty": 0,
"file_mapped": 0,
"file_writeback": 0,
"inactive_anon": 40960,
"inactive_file": 0,
"kernel_stack": 16384,
"pgactivate": 1,
"pgdeactivate": 0,
"pgfault": 1113,
"pglazyfree": 0,
"pglazyfreed": 0,
"pgmajfault": 0,
"pgrefill": 0,
"pgscan": 0,
"pgsteal": 0,
"shmem": 0,
"slab": 190768,
"slab_reclaimable": 66320,
"slab_unreclaimable": 124448,
"sock": 0,
"thp_collapse_alloc": 0,
"thp_fault_alloc": 0,
"unevictable": 0,
"workingset_activate": 0,
"workingset_nodereclaim": 0,
"workingset_refault": 0
},
"limit": 4051226624
},
"name": "/ecs-nginx-telegraf-3-internalecspause-f0cc8fe3bfc9a8efd601",
"id": "c69461b2c836cc3f0e3e5deb07b1f16e25f6009da2a48bb0adc7dd580befaf55",
"networks": {
"eth0": {
"rx_bytes": 14,
"rx_packets": 0,
"rx_errors": 0,
"rx_dropped": 0,
"tx_bytes": 21,
"tx_packets": 0,
"tx_errors": 0,
"tx_dropped": 0
}
}
},
"f1101546d0a33be09214bf220b0e39a2551c1557751770d545095a86e9cd87b5": {
"read": "2023-10-26T15:58:32.682138243Z",
"preread": "2023-10-26T15:58:31.645707286Z",
"pids_stats": {
"current": 3,
"limit": 4618
},
"blkio_stats": {
"io_service_bytes_recursive": [
{
"major": 259,
"minor": 0,
"op": "read",
"value": 0
},
{
"major": 259,
"minor": 0,
"op": "write",
"value": 12288
}
],
"io_serviced_recursive": null,
"io_queue_recursive": null,
"io_service_time_recursive": null,
"io_wait_time_recursive": null,
"io_merged_recursive": null,
"io_time_recursive": null,
"sectors_recursive": null
},
"num_procs": 0,
"storage_stats": {},
"cpu_stats": {
"cpu_usage": {
"total_usage": 86145000,
"usage_in_kernelmode": 35079000,
"usage_in_usermode": 51066000
},
"system_cpu_usage": 16970480000000,
"online_cpus": 2,
"throttling_data": {
"periods": 0,
"throttled_periods": 0,
"throttled_time": 0
}
},
"precpu_stats": {
"cpu_usage": {
"total_usage": 86145000,
"usage_in_kernelmode": 35079000,
"usage_in_usermode": 51066000
},
"system_cpu_usage": 16968440000000,
"online_cpus": 2,
"throttling_data": {
"periods": 0,
"throttled_periods": 0,
"throttled_time": 0
}
},
"memory_stats": {
"usage": 3170304,
"stats": {
"active_anon": 12288,
"active_file": 61440,
"anon": 2465792,
"anon_thp": 0,
"file": 81920,
"file_dirty": 0,
"file_mapped": 4096,
"file_writeback": 0,
"inactive_anon": 2457600,
"inactive_file": 16384,
"kernel_stack": 49152,
"pgactivate": 5,
"pgdeactivate": 0,
"pgfault": 4041,
"pglazyfree": 0,
"pglazyfreed": 0,
"pgmajfault": 0,
"pgrefill": 0,
"pgscan": 0,
"pgsteal": 0,
"shmem": 4096,
"slab": 354160,
"slab_reclaimable": 212088,
"slab_unreclaimable": 142072,
"sock": 0,
"thp_collapse_alloc": 0,
"thp_fault_alloc": 0,
"unevictable": 0,
"workingset_activate": 0,
"workingset_nodereclaim": 0,
"workingset_refault": 0
},
"limit": 4051226624
},
"name": "/ecs-nginx-telegraf-3-nginx-eed4b29dad8cb3caec01",
"id": "f1101546d0a33be09214bf220b0e39a2551c1557751770d545095a86e9cd87b5",
"networks": {
"eth0": {
"rx_bytes": 14,
"rx_packets": 0,
"rx_errors": 0,
"rx_dropped": 0,
"tx_bytes": 21,
"tx_packets": 0,
"tx_errors": 0,
"tx_dropped": 0
}
}
}
}
Thank you for your help!
Thank you very much! I've updated the PR with your test cases. Can you try the new artifacts in https://github.com/influxdata/telegraf/pull/14197 and let me know if it resolves your issue?
I checked the new artifacts and can confirm that the problem is solved. Here is an example of the gathered metrics:
{
"results": {
"A": {
"frames": [
{
"schema": {
"name": "ecs_container_cpu",
"refId": "A",
"meta": {
"executedQueryString": "from(bucket: \"bucket\")\n |> range(start: 2023-10-27T08:33:51.52Z, stop:2023-10-27T08:38:51.52Z)\n |> filter(fn: (r) => \n r._field == \"usage_total\" and\n r.host == \"fa455c76d97c\")\n |> aggregateWindow(every: 60s, fn: mean)\n |> yield(name: \"mean\")"
},
"fields": [
{
"name": "Time",
"type": "time",
"typeInfo": {
"frame": "time.Time",
"nullable": true
}
},
{
"name": "usage_total",
"type": "number",
"typeInfo": {
"frame": "float64",
"nullable": true
},
"labels": {
"cluster": "ecs-cluster-prod",
"com.amazonaws.ecs.cluster": "ecs-cluster-prod",
"com.amazonaws.ecs.container-name": "nginx",
"com.amazonaws.ecs.task-arn": "arn:aws:ecs:eu-west-2:<account_id>:task/ecs-cluster-prod/a22cfc58cdb04b27ada2caa3fd526463",
"com.amazonaws.ecs.task-definition-family": "nginx-telegraf",
"com.amazonaws.ecs.task-definition-version": "3",
"cpu": "cpu-total",
"family": "nginx-telegraf",
"host": "fa455c76d97c",
"id": "f1101546d0a33be09214bf220b0e39a2551c1557751770d545095a86e9cd87b5",
"name": "nginx",
"revision": "3",
"task_arn": "arn:aws:ecs:eu-west-2:<account_id>:task/ecs-cluster-prod/a22cfc58cdb04b27ada2caa3fd526463"
}
}
]
},
"data": {
"values": [
[
1698395640000,
1698395700000,
1698395760000,
1698395820000,
1698395880000,
1698395931520
],
[
86145000,
86145000,
86145000,
86145000,
86145000,
86145000
]
]
}
}
]
}
}
}
Thank you very much.
Relevant telegraf.conf
Logs from Telegraf
System info
Telegraf 1.28.3, Ubuntu 22.04, Docker 24.0.6
Docker
The container is started by ECS, but you can run this command; it produces the same error:
docker run --rm -h ip-10-1-3-163.eu-west-2.compute.internal -v /tmp/telegraf.conf:/etc/telegraf/telegraf.conf -e ECS_CONTAINER_METADATA_URI=<metadata_uri> telegraf:latest
Steps to reproduce
Expected behavior
Telegraf runs, collects metrics from the Docker containers, and sends them to InfluxDB.
Actual behavior
The Telegraf container exits with the following error:
panic: runtime error: slice bounds out of range [:2] with capacity 0
Additional info
Following this code, the ecs plugin tries to slice the cpu_stats.cpu_usage.percpu_usage array:
https://github.com/influxdata/telegraf/blob/321c5a4070cd46d699826432ab4858224f25001d/plugins/inputs/ecs/stats.go#L129
However, according to the Docker docs, the Docker API does not return this field when cgroups v2 is in use:
https://docs.docker.com/engine/api/v1.43/#tag/Container/operation/ContainerStats
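For illustration only (this is not the actual change in PR #14197): a minimal, self-contained Go sketch of how the percpu_usage slice could be guarded before it is sliced by online_cpus. The types below only mirror the JSON fields needed for the example and are not Telegraf's real types.

package main

import "fmt"

// cpuUsage and cpuStats mirror only the JSON fields needed for this sketch.
type cpuUsage struct {
	TotalUsage  uint64   `json:"total_usage"`
	PercpuUsage []uint64 `json:"percpu_usage"`
}

type cpuStats struct {
	CPUUsage   cpuUsage `json:"cpu_usage"`
	OnlineCPUs uint32   `json:"online_cpus"`
}

// perCPUUsage returns at most one value per online CPU, and an empty slice
// when the Docker API omits percpu_usage (as it does under cgroups v2).
func perCPUUsage(s cpuStats) []uint64 {
	n := int(s.OnlineCPUs)
	if n > len(s.CPUUsage.PercpuUsage) {
		// Prevents "slice bounds out of range [:2] with capacity 0".
		n = len(s.CPUUsage.PercpuUsage)
	}
	return s.CPUUsage.PercpuUsage[:n]
}

func main() {
	// cgroups v2 style response: online_cpus is 2, percpu_usage is absent.
	v2 := cpuStats{CPUUsage: cpuUsage{TotalUsage: 86145000}, OnlineCPUs: 2}
	fmt.Println(perCPUUsage(v2)) // prints [] instead of panicking
}

With an empty percpu_usage, the guarded version simply yields no per-CPU values instead of panicking.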
In other words, the ecs plugin tries to parse an empty array of metrics. Here is part of the data that was used in the ecs plugin tests:
And here is the actual response from the metadata endpoint: