Kong / kong

🦍 The Cloud-Native API Gateway and AI Gateway.
https://konghq.com/install/#kong-community
Apache License 2.0
38.89k stars 4.78k forks source link

Could not write to shm after 6 tries (no memory), it is either fragmented or cannot allocate more memory #5203

Closed goober closed 4 years ago

goober commented 4 years ago

Summary

We have recently started to notice error events in our kong-proxy (see logs below). The first occurrence happens a couple of minutes after startup.

Additional Details & Logs

Output from curl https://localhost:8444/status

{
    "database": {
        "reachable": true
    },
    "memory": {
        "workers_lua_vms": [{
            "http_allocated_gc": "0.03 MiB",
            "pid": 32
        }, {
            "http_allocated_gc": "0.03 MiB",
            "pid": 33
        }, {
            "http_allocated_gc": "0.07 MiB",
            "pid": 34
        }, {
            "http_allocated_gc": "0.03 MiB",
            "pid": 35
        }, {
            "http_allocated_gc": "0.03 MiB",
            "pid": 36
        }, {
            "http_allocated_gc": "0.03 MiB",
            "pid": 37
        }, {
            "http_allocated_gc": "0.03 MiB",
            "pid": 38
        }, {
            "http_allocated_gc": "0.03 MiB",
            "pid": 39
        }],
        "lua_shared_dicts": {
            "kong_locks": {
                "allocated_slabs": "0.06 MiB",
                "capacity": "8.00 MiB"
            },
            "kong_db_cache_2": {
                "allocated_slabs": "2.12 MiB",
                "capacity": "128.00 MiB"
            },
            "kong": {
                "allocated_slabs": "0.04 MiB",
                "capacity": "5.00 MiB"
            },
            "kong_db_cache_miss_2": {
                "allocated_slabs": "0.09 MiB",
                "capacity": "12.00 MiB"
            },
            "kong_db_cache": {
                "allocated_slabs": "0.76 MiB",
                "capacity": "128.00 MiB"
            },
            "kong_process_events": {
                "allocated_slabs": "5.00 MiB",
                "capacity": "5.00 MiB"
            },
            "kong_db_cache_miss": {
                "allocated_slabs": "0.08 MiB",
                "capacity": "12.00 MiB"
            },
            "kong_cluster_events": {
                "allocated_slabs": "0.04 MiB",
                "capacity": "5.00 MiB"
            },
            "prometheus_metrics": {
                "allocated_slabs": "0.04 MiB",
                "capacity": "5.00 MiB"
            },
            "kong_healthchecks": {
                "allocated_slabs": "0.08 MiB",
                "capacity": "5.00 MiB"
            },
            "kong_rate_limiting_counters": {
                "allocated_slabs": "0.08 MiB",
                "capacity": "12.00 MiB"
            }
        }
    },
    "server": {
        "connections_writing": 1,
        "total_requests": 108,
        "connections_handled": 108,
        "connections_accepted": 108,
        "connections_reading": 0,
        "connections_active": 1,
        "connections_waiting": 0
    }
}

Kong configuration:

{
    "plugins": {
        "enabled_in_cluster": ["prometheus", "zipkin", "request-transformer", "post-function"],
        "available_on_server": {
            "correlation-id": true,
            "pre-function": true,
            "cors": true,
            "ldap-auth": true,
            "loggly": true,
            "hmac-auth": true,
            "zipkin": true,
            "request-size-limiting": true,
            "azure-functions": true,
            "request-transformer": true,
            "oauth2": true,
            "response-transformer": true,
            "ip-restriction": true,
            "statsd": true,
            "jwt": true,
            "proxy-cache": true,
            "basic-auth": true,
            "key-auth": true,
            "http-log": true,
            "datadog": true,
            "tcp-log": true,
            "rate-limiting": true,
            "post-function": true,
            "prometheus": true,
            "acl": true,
            "kubernetes-sidecar-injector": true,
            "syslog": true,
            "file-log": true,
            "udp-log": true,
            "response-ratelimiting": true,
            "aws-lambda": true,
            "session": true,
            "bot-detection": true,
            "request-termination": true
        }
    },
    "tagline": "Welcome to kong",
    "configuration": {
        "plugins": ["bundled"],
        "admin_ssl_enabled": true,
        "lua_ssl_verify_depth": 1,
        "trusted_ips": {},
        "prefix": "\/usr\/local\/kong",
        "loaded_plugins": {
            "session": true,
            "pre-function": true,
            "cors": true,
            "ldap-auth": true,
            "loggly": true,
            "hmac-auth": true,
            "zipkin": true,
            "request-size-limiting": true,
            "azure-functions": true,
            "request-transformer": true,
            "oauth2": true,
            "response-transformer": true,
            "syslog": true,
            "statsd": true,
            "jwt": true,
            "proxy-cache": true,
            "basic-auth": true,
            "key-auth": true,
            "http-log": true,
            "datadog": true,
            "tcp-log": true,
            "correlation-id": true,
            "post-function": true,
            "bot-detection": true,
            "acl": true,
            "kubernetes-sidecar-injector": true,
            "ip-restriction": true,
            "file-log": true,
            "udp-log": true,
            "response-ratelimiting": true,
            "aws-lambda": true,
            "rate-limiting": true,
            "prometheus": true,
            "request-termination": true
        },
        "cassandra_username": "kong",
        "ssl_cert_key": "\/usr\/local\/kong\/ssl\/kong-default.key",
        "admin_ssl_cert_key": "\/usr\/local\/kong\/ssl\/admin-kong-default.key",
        "dns_resolver": {},
        "pg_user": "kong",
        "mem_cache_size": "128m",
        "nginx_admin_directives": {},
        "nginx_http_upstream_directives": [{
            "value": "60s",
            "name": "keepalive_timeout"
        }, {
            "value": "100",
            "name": "keepalive_requests"
        }, {
            "value": "60",
            "name": "keepalive"
        }],
        "nginx_http_directives": [{
            "value": "TLSv1.1 TLSv1.2 TLSv1.3",
            "name": "ssl_protocols"
        }, {
            "value": "\/kong\/servers.conf",
            "name": "include"
        }, {
            "value": "prometheus_metrics 5m",
            "name": "lua_shared_dict"
        }],
        "pg_host": "127.0.0.1",
        "nginx_acc_logs": "\/usr\/local\/kong\/logs\/access.log",
        "pg_semaphore_timeout": 60000,
        "proxy_listen": ["0.0.0.0:8000", "0.0.0.0:8443 ssl"],
        "client_ssl_cert_default": "\/usr\/local\/kong\/ssl\/kong-default.crt",
        "cassandra_ssl": false,
        "db_update_frequency": 5,
        "db_update_propagation": 0,
        "stream_listen": ["off"],
        "nginx_err_logs": "\/usr\/local\/kong\/logs\/error.log",
        "cassandra_port": 9042,
        "dns_order": ["LAST", "SRV", "A", "CNAME"],
        "dns_error_ttl": 1,
        "headers": ["server_tokens", "latency_tokens"],
        "cassandra_lb_policy": "RequestRoundRobin",
        "nginx_optimizations": true,
        "nginx_http_upstream_keepalive_timeout": "60s",
        "pg_timeout": 5000,
        "nginx_http_upstream_keepalive_requests": "100",
        "database": "off",
        "proxy_access_log": "logs\/access.log",
        "pg_database": "kong",
        "nginx_worker_processes": "auto",
        "client_ssl": false,
        "lua_package_cpath": "",
        "ssl_cert_key_default": "\/usr\/local\/kong\/ssl\/kong-default.key",
        "admin_acc_logs": "\/usr\/local\/kong\/logs\/admin_access.log",
        "cassandra_contact_points": ["127.0.0.1"],
        "cassandra_repl_factor": 1,
        "lua_package_path": ".\/?.lua;.\/?\/init.lua;",
        "nginx_pid": "\/usr\/local\/kong\/pids\/nginx.pid",
        "upstream_keepalive": 60,
        "dns_stale_ttl": 4,
        "origins": {},
        "nginx_kong_stream_conf": "\/usr\/local\/kong\/nginx-kong-stream.conf",
        "error_default_type": "text\/plain",
        "admin_access_log": "\/dev\/stdout",
        "stream_listeners": {},
        "nginx_daemon": "off",
        "proxy_listeners": [{
            "listener": "0.0.0.0:8000",
            "proxy_protocol": false,
            "reuseport": false,
            "transparent": false,
            "ssl": false,
            "ip": "0.0.0.0",
            "deferred": false,
            "http2": false,
            "port": 8000,
            "bind": false
        }, {
            "listener": "0.0.0.0:8443 ssl",
            "proxy_protocol": false,
            "reuseport": false,
            "transparent": false,
            "ssl": true,
            "ip": "0.0.0.0",
            "deferred": false,
            "http2": false,
            "port": 8443,
            "bind": false
        }],
        "proxy_ssl_enabled": true,
        "nginx_http_upstream_keepalive": "60",
        "db_cache_warmup_entities": ["services", "plugins"],
        "lua_socket_pool_size": 30,
        "nginx_http_ssl_protocols": "TLSv1.1 TLSv1.2 TLSv1.3",
        "router_consistency": "strict",
        "db_resurrect_ttl": 30,
        "nginx_stream_directives": {},
        "cassandra_consistency": "ONE",
        "db_cache_ttl": 0,
        "admin_error_log": "\/dev\/stderr",
        "admin_ssl_cert_default": "\/usr\/local\/kong\/ssl\/admin-kong-default.crt",
        "dns_not_found_ttl": 30,
        "pg_ssl": false,
        "nginx_http_include": "\/kong\/servers.conf",
        "ssl_cipher_suite": "modern",
        "cassandra_repl_strategy": "SimpleStrategy",
        "kong_env": "\/usr\/local\/kong\/.kong_env",
        "cassandra_schema_consensus_timeout": 10000,
        "pg_max_concurrent_queries": 0,
        "client_max_body_size": "0",
        "nginx_kong_conf": "\/usr\/local\/kong\/nginx-kong.conf",
        "real_ip_header": "X-Forwarded-For",
        "dns_hostsfile": "\/etc\/hosts",
        "admin_listeners": [{
            "listener": "127.0.0.1:8444 ssl",
            "proxy_protocol": false,
            "reuseport": false,
            "transparent": false,
            "ssl": true,
            "ip": "127.0.0.1",
            "deferred": false,
            "http2": false,
            "port": 8444,
            "bind": false
        }],
        "dns_no_sync": false,
        "ssl_cert": "\/usr\/local\/kong\/ssl\/kong-default.crt",
        "cassandra_timeout": 5000,
        "admin_ssl_cert_key_default": "\/usr\/local\/kong\/ssl\/admin-kong-default.key",
        "cassandra_ssl_verify": false,
        "cassandra_data_centers": ["dc1:2", "dc2:3"],
        "log_level": "notice",
        "real_ip_recursive": "on",
        "proxy_error_log": "logs\/error.log",
        "client_ssl_cert_key_default": "\/usr\/local\/kong\/ssl\/kong-default.key",
        "admin_ssl_cert": "\/usr\/local\/kong\/ssl\/admin-kong-default.crt",
        "anonymous_reports": true,
        "nginx_proxy_directives": {},
        "nginx_sproxy_directives": {},
        "pg_port": 5432,
        "pg_ssl_verify": false,
        "client_body_buffer_size": "8k",
        "ssl_preread_enabled": true,
        "ssl_cert_csr_default": "\/usr\/local\/kong\/ssl\/kong-default.csr",
        "nginx_conf": "\/usr\/local\/kong\/nginx.conf",
        "cassandra_keyspace": "kong",
        "ssl_cert_default": "\/usr\/local\/kong\/ssl\/kong-default.crt",
        "enabled_headers": {
            "latency_tokens": true,
            "X-Kong-Proxy-Latency": true,
            "Via": true,
            "server_tokens": true,
            "Server": true,
            "X-Kong-Upstream-Latency": true,
            "X-Kong-Upstream-Status": false
        },
        "admin_listen": ["127.0.0.1:8444 ssl"]
    },
    "version": "1.3.0",
    "node_id": "19a09d1a-f414-4f19-9604-f12c2ea2bb0a",
    "lua_version": "LuaJIT 2.1.0-beta3",
    "prng_seeds": {
        "pid: 32": 182465973931,
        "pid: 35": 911142381632,
        "pid: 39": 205124189184,
        "pid: 1": 219190232120,
        "pid: 34": 120102220132,
        "pid: 38": 405410720719,
        "pid: 37": 932922421325,
        "pid: 36": 541265322358,
        "pid: 33": 111174441921
    },
    "timers": {
        "pending": 144,
        "running": 0
    },
    "hostname": "ingress-kong-998974759-gtxvn"
}

You can also see that the used memory keeps going upwards:

image

goober commented 4 years ago

After upgrading to: kong: 1.4, kong-ingress-controller: 0.6.1

And set KONG_NGINX_WORKER_PROCESSES to 1

The above error messages go away. ~However, it seems that I hit another issue where I constantly get the below timeout message~:

UPDATE: The below error message is due to our firewall, and after setting KONG_ANONYMOUS_REPORTS: off the message goes away. However, the increasing memory consumption is still there.

2019/11/05 11:22:29 [notice] 27#0: *4456557 [lua] cache.lua:321: purge(): [DB cache] purging (local) cache, client: 127.0.0.1, server: kong_admin, request: "POST /config?check_hash=1 HTTP/1.1", host: "localhost:8444"
--
  | 2019/11/05 11:22:59 [warn] 27#0: *4456557 [lua] reports.lua:70: log(): [reports] could not connect to TCP socket: timeout, client: 127.0.0.1, server: kong_admin, request: "POST /config?check_hash=1 HTTP/1.1", host: "localhost:8444"

And the memory consumption constantly grows as well:

image

bungle commented 4 years ago

That reports warning is possibly because Kong is blocked from reaching the internet over UDP. On the most recent version that would be TCP. You can turn it off: https://docs.konghq.com/1.4.x/configuration/#anonymous_reports

Edit: oh, you figured it out already! :-)

chenjinxuan commented 4 years ago

I also have this problem. How can the growing memory usage be solved? Version: 1.4.0

chenjinxuan commented 4 years ago

After upgrading to: kong: 1.4, kong-ingress-controller: 0.6.1

And set KONG_NGINX_WORKER_PROCESSES to 1

The above error messages go away. ~However, it seems that I hit another issue where I constantly get the below timeout message~:

UPDATE The below error message is due to our firewall and after setting KONG_ANONYMOUS_REPORTS: off the message goes away. However, the increasing memory consumption is still there.

2019/11/05 11:22:29 [notice] 27#0: *4456557 [lua] cache.lua:321: purge(): [DB cache] purging (local) cache, client: 127.0.0.1, server: kong_admin, request: "POST /config?check_hash=1 HTTP/1.1", host: "localhost:8444"
--
  | 2019/11/05 11:22:59 [warn] 27#0: *4456557 [lua] reports.lua:70: log(): [reports] could not connect to TCP socket: timeout, client: 127.0.0.1, server: kong_admin, request: "POST /config?check_hash=1 HTTP/1.1", host: "localhost:8444"

And the memory consumption constantly grows as well:

image

Excuse me, do you know how to solve this problem?

goober commented 4 years ago

@chenjinxuan The maintainers are currently investigating this issue. https://discuss.konghq.com/t/possible-memory-leak-kong-1-4-kic-0-6-1/4800/6

TwoToneBytes commented 4 years ago

Any update on this? This is preventing us from using the latest version of Kong in production.

hishamhm commented 4 years ago

Some testing is still needed but PR #5229 might be the solution to this!

abenitovsc commented 4 years ago

Hi all, I got the same issue with Kong 1.3 and ingress 0.6.0, installed with helm (stable/kong, 0.19.1), in DB-less mode. I have upgraded Kong to version 1.4 with the same chart version, and when memory limits are defined in K8s the memory becomes stable and the errors disappear; without limits the memory increases without control. With a limit of 5GB and 4 workers, usage is stable at around 3.5-4GB.

edgarcolque commented 4 years ago

I tested the fix in our INT environment and it resolved the memory leak.

hishamhm commented 4 years ago

Thank you for your feedback, @edgarcolque !! We have now merged #5229 by @zeeshen !

hishamhm commented 4 years ago

@goober Thank you for the report! If this problem persists in the next release, feel free to reopen!

goober commented 4 years ago

Thank you for looking into this. I will wait for the next release and verify it. Great job!

mhaziq commented 9 months ago

@goober @edgarcolque We are running Kong Gateway v3.3.1 in DB-less mode and see the same memory issue: memory usage keeps growing and does not go down. Please suggest how to resolve this.