influxdata / telegraf

Agent for collecting, processing, aggregating, and writing metrics, logs, and other arbitrary data.
https://influxdata.com/telegraf
MIT License

Excessive memory usage caused by json_v2 #11068

Open · turboproc opened this issue 2 years ago

turboproc commented 2 years ago

Relevant telegraf.conf

# # Generic HTTP write listener
[[inputs.http_listener_v2]]

name_override = "dsl"

## Address and port to host HTTP listener on
service_address = ":8080"
path = "/telegraf"
methods = ["POST"]

data_source = "body"
data_format = "json_v2"
#
[inputs.http_listener_v2.tags]
    influxdb_database = "httplistener"

[[inputs.http_listener_v2.json_v2]]
    measurement_name = "dsl"
#    timestamp_path = "Timestamp"
#    timestamp_format = "2006-01-02T15:04:05 0200"

#[[inputs.http_listener_v2.json_v2.object]]
#    path = "upstream"
#    disable_prepend_keys = false

[[inputs.http_listener_v2.json_v2.object]]
    path = "@this"
    disable_prepend_keys = false

#[[inputs.http_listener_v2.json_v2.field]]
#   path = "state_num"

#[[inputs.http_listener_v2.json_v2.field]]
#   path = "chipset"

#[[inputs.http_listener_v2.json_v2.field]]
#   path = "firmware_version"

Logs from Telegraf

Nothing shows up in the logs that points to a potential issue.

System info

Telegraf 1.22.3 (git: HEAD ff950615) / PRETTY_NAME="Debian GNU/Linux 9 (stretch)"

Docker

No response

Steps to reproduce

  1. Start Telegraf; no input sent yet.
  2. Send the JSON payload (shown under Additional info) using Postman.
  3. ...

Expected behavior

I would expect no significant change in memory usage. When idle, the process stats look like this:

F   UID   PID  PPID PRI  NI    VSZ   RSS WCHAN  STAT TTY        TIME COMMAND
4   999  2578     1  20   0 5149068 89452 -     Ssl  ?          0:01 /usr/bin/telegraf -config /etc/telegraf/telegraf.conf -config-directory /etc/telegraf/telegraf.d

Actual behavior

As soon as a JSON message is sent to Telegraf, memory usage increases excessively. Waiting does not resolve the situation; only a restart helps, and then only until the next message arrives.

F   UID   PID  PPID PRI  NI    VSZ   RSS WCHAN  STAT TTY        TIME COMMAND
4   999  2578     1  20   0 6685008 1378852 -   Ssl  ?          0:19 /usr/bin/telegraf -config /etc/telegraf/telegraf.conf -config-directory /etc/telegraf/telegraf.d
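To see where the memory is going, a heap profile can be captured while usage is this high, assuming Telegraf is started with its pprof endpoint enabled (e.g. --pprof-addr localhost:6060). A minimal Go sketch that downloads the profile for inspection with go tool pprof:

// heapdump.go - a minimal sketch, assuming Telegraf was started with
// `--pprof-addr localhost:6060` so the standard net/http/pprof endpoints
// are exposed. It saves a heap profile that can be opened with
// `go tool pprof telegraf-heap.pb.gz`.
package main

import (
    "io"
    "log"
    "net/http"
    "os"
)

func main() {
    // The heap endpoint is the stock net/http/pprof handler.
    resp, err := http.Get("http://localhost:6060/debug/pprof/heap")
    if err != nil {
        log.Fatal(err)
    }
    defer resp.Body.Close()

    out, err := os.Create("telegraf-heap.pb.gz")
    if err != nil {
        log.Fatal(err)
    }
    defer out.Close()

    if _, err := io.Copy(out, resp.Body); err != nil {
        log.Fatal(err)
    }
    log.Println("wrote telegraf-heap.pb.gz")
}

Opening the resulting file with go tool pprof shows which functions hold the allocations.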

Additional info

The message sent to Telegraf using Postman is as follows:

{
    "api_version": "4.17.18.6",
    "firmware_version": "5.9.1.4.0.7",
    "chipset": "Lantiq-VRX200",
    "driver_version": "1.5.17.6",
    "state": "Showtime with TC-Layer sync",
    "state_num": 7,
    "up": true,
    "uptime": 16706,
    "atu_c": {
        "vendor_id": [
            181,
            0,
            66,
            68,
            67,
            77,
            178,
            30
        ],
        "vendor": "Broadcom 178.30",
        "system_vendor_id": [
            181,
            0,
            66,
            68,
            67,
            77,
            0,
            0
        ],
        "system_vendor": "Broadcom",
        "version": [
            49,
            57,
            46,
            48,
            46,
            51,
            57,
            46,
            50,
            32,
            86,
            69,
            95,
            49,
            49,
            95
        ],
        "serial": [
            65,
            65,
            49,
            53,
            52,
            56,
            70,
            83,
            49,
            76,
            81,
            45,
            52,
            51,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0,
            0
        ]
    },
    "power_state": "L0 - Synchronized",
    "power_state_num": 0,
    "xtse": [
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        2
    ],
    "annex": "B",
    "standard": "G.993.2",
    "profile": "17a",
    "mode": "G.993.2 (VDSL2, Profile 17a, with down- and upstream vectoring)                                                                                                                                                             ",
    "upstream": {
        "vector": true,
        "trellis": true,
        "bitswap": true,
        "retx": true,
        "virtual_noise": false,
        "interleave_delay": 0,
        "data_rate": 33023000,
        "latn": 14.800000,
        "satn": 14.600000,
        "snr": 7.000000,
        "actps": -90.100000,
        "actatp": 11.300000,
        "attndr": 43144000
    },
    "downstream": {
        "vector": true,
        "trellis": true,
        "bitswap": true,
        "retx": true,
        "virtual_noise": true,
        "interleave_delay": 130,
        "data_rate": 104871000,
        "latn": 14.100000,
        "satn": 14.300000,
        "snr": 8.00000,
        "actps": -90.100000,
        "actatp": 6.100000,
        "attndr": 108445696
    },
    "errors": {
        "near": {
            "es": 2494,
            "ses": 1,
            "loss": 0,
            "uas": 188,
            "lofs": 0,
            "fecs": 0,
            "hec": 0,
            "ibe": 0,
            "crc_p": 412608,
            "crcp_p": 0,
            "cv_p": 22234373,
            "cvp_p": 0,
            "rx_corrupted": 32177978,
            "rx_uncorrected_protected": 47012,
            "rx_retransmitted": 0,
            "rx_corrected": 32130966,
            "tx_retransmitted": 7035666
        },
        "far": {
            "es": 42,
            "ses": 21,
            "loss": 0,
            "uas": 188,
            "lofs": 0,
            "fecs": 5193391,
            "hec": 0,
            "ibe": 0,
            "crc_p": 0,
            "crcp_p": 0,
            "cv_p": 0,
            "cvp_p": 0,
            "rx_corrupted": 6561417,
            "rx_uncorrected_protected": 340065,
            "rx_retransmitted": 0,
            "rx_corrected": 6221352,
            "tx_retransmitted": 1548056665
        }
    },
    "erb": {
        "sent": 133435,
        "discarded": 0
    }
}
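For a scriptable reproduction without Postman, a minimal Go client along these lines can send the same payload, assuming it is saved locally as dsl.json (a file name chosen here for illustration) and the http_listener_v2 config shown above:

// post_payload.go - a minimal sketch; dsl.json is assumed to contain the
// JSON body above, and Telegraf's http_listener_v2 is assumed to listen
// on localhost:8080 with path /telegraf as configured above.
package main

import (
    "bytes"
    "log"
    "net/http"
    "os"
)

func main() {
    payload, err := os.ReadFile("dsl.json")
    if err != nil {
        log.Fatal(err)
    }

    resp, err := http.Post("http://localhost:8080/telegraf", "application/json", bytes.NewReader(payload))
    if err != nil {
        log.Fatal(err)
    }
    defer resp.Body.Close()

    // Print the response status reported by the listener.
    log.Println("status:", resp.Status)
}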

I also tried this on a different Linux distribution (Ubuntu 20.04), which shows the same behaviour.

reimda commented 2 years ago

@sspaink Have you seen this behavior before? I'm not sure if gjson is using the memory or if there is something telegraf needs to do differently.

turboproc commented 2 years ago

Hi @sspaink, no, this started to occur when I included json_v2. Before that I had been using Telegraf for a long time without any issues.

powersj commented 1 year ago

@sspaink what is the purpose of the cartesianProduct function in the JSONv2 parser? If I parse a simple JSON file with a single value, e.g. {"state_num": 7}, memory usage remains low after parsing a message. But as soon as the number of fields grows, as with this bug, memory usage in that function increases greatly once it and the mergeMetric function are called:

[screenshot: memory profile highlighting allocations in cartesianProduct and mergeMetric]
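For intuition on why the field count matters, here is a standalone sketch of a cartesian product, not Telegraf's actual implementation: the number of combinations it allocates is the product of the input list lengths, so lists sized like the serial (32), version (16), and vendor_id (8) arrays in the payload above already yield 32 * 16 * 8 = 4096 rows.

// cartesian_sketch.go - a standalone illustration, not Telegraf's code:
// a cartesian product over k lists of length n allocates on the order of
// n^k result rows, which is the kind of multiplicative growth a memory
// profile like the one above would surface as field counts increase.
package main

import "fmt"

// cartesianProduct builds every combination that takes one element from
// each input slice.
func cartesianProduct(lists [][]int) [][]int {
    result := [][]int{{}}
    for _, list := range lists {
        var next [][]int
        for _, prefix := range result {
            for _, v := range list {
                row := append(append([]int(nil), prefix...), v)
                next = append(next, row)
            }
        }
        result = next
    }
    return result
}

func main() {
    serial := make([]int, 32)  // mirrors the 32-element "serial" array
    version := make([]int, 16) // mirrors the 16-element "version" array
    vendorID := make([]int, 8) // mirrors the 8-element "vendor_id" array

    rows := cartesianProduct([][]int{serial, version, vendorID})
    fmt.Println("combinations:", len(rows)) // 32 * 16 * 8 = 4096
}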

SudoNova commented 3 weeks ago

This is a real problem and it still persists. One scenario is pulling metrics from '${ELASTIC_SEARCH_URL}/_stats/_all'; it eats gigabytes of RAM until an out-of-memory error is thrown.

powersj commented 3 weeks ago

@SudoNova I would highly recommend looking at the xpath parser in the meantime, or look at our elasticsearch plugin, which already pulls from that endpoint!