Kong / kong

🦍 The Cloud-Native API Gateway and AI Gateway.
https://konghq.com/install/#kong-community
Apache License 2.0
39.27k stars 4.82k forks source link

Kong Server Node CPU usage imbalance #8863

Closed vinniemo closed 1 year ago

vinniemo commented 2 years ago

Is there an existing issue for this?

Kong version ($ kong version)

Kong 2.7.1 pg 9.6

Current Behavior

Deployed on K8s: a Kong server cluster of 30 pods (4C4G each), 125 Kong services, and 2500+ Kong routes (95% PCRE regex routes, 5% short path prefix routes), with a total QPS of 45000+.

Expected Behavior

CPU usage across the Kong server nodes is imbalanced. It gets worse as traffic grows, until the CPU is saturated and the node fails to respond. image image

Steps To Reproduce

nginx.conf

# Main (process-level) context of the reported nginx.conf.
pid pids/nginx.pid;

# injected nginx_main_* directives
# Run in the foreground (no daemonizing).
daemon off;
# Four worker processes per Kong node.
worker_processes 4;
# Per-worker open file descriptor limit; matches worker_connections below.
worker_rlimit_nofile 65535;

# Connection-processing settings shared by all workers.
events {
    # injected nginx_events_* directives
    use epoll;
    worker_connections 65535;
    # Workers do not take turns accepting; each one accepts on its own.
    accept_mutex off;
    # A worker accepts all pending connections at once instead of one per
    # event-loop iteration.
    # NOTE(review): together with long-lived keep-alive connections this can
    # skew connection distribution between workers -- confirm whether it is
    # related to the reported CPU imbalance.
    multi_accept on;

}

http {
    default_type  text/html;

    log_format  main  '$host - $remote_addr - $remote_user [$time_local] "$request" '
                          '$status $body_bytes_sent "$http_referer" "$http_origin" '
                          '"$http_user_agent" "$http_x_forwarded_for" $request_time $upstream_response_time';

    log_format  proxy '$host - $http_x_real_ip - $remote_user [$time_local] "$request" '
                      '$status $body_bytes_sent "$http_referer" "$http_origin" '
                      '"$http_user_agent" "$http_x_forwarded_for" $request_time $upstream_response_time';

    log_format json '{"host":"$host",'
            '"http_x_real_ip":"$http_x_real_ip",'
            '"remote_addr":"$remote_addr",'
            '"remote_user":"$remote_user",'
            '"time_local":"[$time_local]",'
            '"request":"$request",'
            '"status_code":"$status",'
            '"body_bytes_sent":"$body_bytes_sent",'
            '"http_referer":"$http_referer",'
            '"http_origin":"$http_origin",'
            '"http_user_agent":"$http_user_agent",'
            '"http_x_forwarded_for":"$http_x_forwarded_for",'
            '"request_time":"$request_time",'
            '"upstream_addr":"$upstream_addr",'
            '"upstream_response_time":"$upstream_response_time"}';

    # Derives $u_r_t from $upstream_response_time: 1 when the value matches
    # the regex below, 0 otherwise. $u_r_t is used as the if= condition of an
    # access_log directive in the proxy server block.
    # NOTE(review): the pattern looks intended to flag upstream times whose
    # fractional part is >= .100 (i.e. slow responses) -- confirm. The "." is
    # unescaped, so it matches any character, not only a decimal point.
    map $upstream_response_time $u_r_t {
            default 0;
            ~(\d+).[1-9][0-9][0-9] 1;
        }

    # Derives $loggable from the response status: 1 when the status starts
    # with 3, 4 or 5, 0 otherwise. $loggable is used as the if= condition of
    # an access_log directive in the proxy server block.
    map $status $loggable{
        default 0;
        ~^[345] 1;
    }

    sendfile        on;
    tcp_nopush      on;
    tcp_nodelay     on;

    keepalive_timeout  60;

    gzip  on;
    gzip_http_version 1.0;
    gzip_disable "MSIE [1-6]\.";
    gzip_vary on;
    gzip_comp_level 5;
    gzip_min_length 1024;
    gzip_buffers 4 32k;
    gzip_types text/plain application/json application/x-javascript application/javascript text/css;

    server_names_hash_bucket_size 64;
    large_client_header_buffers 4 128k;
    client_header_buffer_size 32k;
    client_header_timeout 12;
    send_timeout 10;
    client_body_timeout 12;

    proxy_cache_path /dev/shm/proxy_cache levels=1:2 keys_zone=proxy_cache_one:1024m;

    include 'nginx-kong.conf';

}

nginx-kong.conf

charset UTF-8;
server_tokens off;

error_log /data/logs/kong-server/error.log error;

lua_package_path       './?.lua;./?/init.lua;;;;';
lua_package_cpath      ';;;';
lua_socket_pool_size   30;
lua_socket_log_errors  off;
lua_max_running_timers 4096;
lua_max_pending_timers 16384;
lua_ssl_verify_depth   1;

lua_shared_dict kong                        5m;
lua_shared_dict kong_locks                  8m;
lua_shared_dict kong_healthchecks           5m;
lua_shared_dict kong_process_events         5m;
lua_shared_dict kong_cluster_events         5m;
lua_shared_dict kong_rate_limiting_counters 12m;
lua_shared_dict kong_core_db_cache          1024m;
lua_shared_dict kong_core_db_cache_miss     12m;
lua_shared_dict kong_db_cache               1024m;
lua_shared_dict kong_db_cache_miss          12m;

underscores_in_headers on;
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;

# injected nginx_http_* directives
client_body_buffer_size 256K;
client_max_body_size 200M;
lua_regex_cache_max_entries 8000;
lua_regex_match_limit 100000;
lua_shared_dict prometheus_metrics 10m;
lua_ssl_protocols TLSv1.1 TLSv1.2 TLSv1.3;
ssl_dhparam /usr/local/kong/ssl/ffdhe2048.pem;
ssl_prefer_server_ciphers off;
ssl_protocols TLSv1.2 TLSv1.3;
ssl_session_tickets on;
ssl_session_timeout 1d;

# Loads the 'kong' Lua module into the global Kong and calls Kong.init().
init_by_lua_block {
    Kong = require 'kong'
    Kong.init()
}

# Calls Kong.init_worker() in the worker-initialization phase.
init_worker_by_lua_block {
    Kong.init_worker()
}

# Load variable indexes
lua_kong_load_var_index $args;
lua_kong_load_var_index $bytes_sent;
lua_kong_load_var_index $content_type;
lua_kong_load_var_index $ctx_ref;
lua_kong_load_var_index $host;
lua_kong_load_var_index $http_authorization;
lua_kong_load_var_index $http_connection;
lua_kong_load_var_index $http_host;
lua_kong_load_var_index $http_kong_debug;
lua_kong_load_var_index $http_proxy;
lua_kong_load_var_index $http_proxy_connection;
lua_kong_load_var_index $http_te;
lua_kong_load_var_index $http_upgrade;
lua_kong_load_var_index $http_x_forwarded_for;
lua_kong_load_var_index $http_x_forwarded_host;
lua_kong_load_var_index $http_x_forwarded_path;
lua_kong_load_var_index $http_x_forwarded_port;
lua_kong_load_var_index $http_x_forwarded_prefix;
lua_kong_load_var_index $http_x_forwarded_proto;
lua_kong_load_var_index $https;
lua_kong_load_var_index $http2;
lua_kong_load_var_index $is_args;
lua_kong_load_var_index $kong_proxy_mode;
lua_kong_load_var_index $realip_remote_addr;
lua_kong_load_var_index $realip_remote_port;
lua_kong_load_var_index $remote_addr;
lua_kong_load_var_index $remote_port;
lua_kong_load_var_index $request;
lua_kong_load_var_index $request_length;
lua_kong_load_var_index $request_method;
lua_kong_load_var_index $request_time;
lua_kong_load_var_index $request_uri;
lua_kong_load_var_index $scheme;
lua_kong_load_var_index $server_addr;
lua_kong_load_var_index $server_port;
lua_kong_load_var_index $ssl_cipher;
lua_kong_load_var_index $ssl_client_raw_cert;
lua_kong_load_var_index $ssl_client_verify;
lua_kong_load_var_index $ssl_protocol;
lua_kong_load_var_index $ssl_server_name;
lua_kong_load_var_index $upstream_connection;
lua_kong_load_var_index $upstream_host;
lua_kong_load_var_index $upstream_http_connection;
lua_kong_load_var_index $upstream_http_trailer;
lua_kong_load_var_index $upstream_http_upgrade;
lua_kong_load_var_index $upstream_scheme;
lua_kong_load_var_index $upstream_status;
lua_kong_load_var_index $upstream_te;
lua_kong_load_var_index $upstream_uri;
lua_kong_load_var_index $upstream_upgrade;
lua_kong_load_var_index $proxy_add_x_forwarded_for;
lua_kong_load_var_index $upstream_x_forwarded_host;
lua_kong_load_var_index $upstream_x_forwarded_path;
lua_kong_load_var_index $upstream_x_forwarded_port;
lua_kong_load_var_index $upstream_x_forwarded_prefix;
lua_kong_load_var_index $upstream_x_forwarded_proto;

# Upstream group referenced by every proxy_pass / grpc_pass in the proxy
# server block. Peer selection is delegated to Lua via Kong.balancer().
upstream kong_upstream {
    # presumably a placeholder address, since the balancer block below picks
    # the real peer -- TODO confirm
    server 0.0.0.1;

    # injected nginx_upstream_* directives

    balancer_by_lua_block {
        Kong.balancer()
    }
}

server {
    server_name kong;
    listen 0.0.0.0:80 reuseport backlog=16384;
    listen 0.0.0.0:443 ssl http2 reuseport backlog=16384;

    error_page 400 404 405 408 411 412 413 414 417 494 /kong_error_handler;
    error_page 500 502 503 504                     /kong_error_handler;

    access_log  /data/logs/kong-server/access.log json buffer=10K if=$u_r_t;
    access_log  /data/logs/kong-server/access.log json buffer=10K if=$loggable;
    error_log  /data/logs/kong-server/error.log error;

    ssl_certificate     /usr/local/kong/ssl/kong-default.crt;
    ssl_certificate_key /usr/local/kong/ssl/kong-default.key;
    ssl_certificate     /usr/local/kong/ssl/kong-default-ecdsa.crt;
    ssl_certificate_key /usr/local/kong/ssl/kong-default-ecdsa.key;
    ssl_session_cache   shared:SSL:10m;
    ssl_certificate_by_lua_block {
        Kong.ssl_certificate()
    }

    # injected nginx_proxy_* directives
    real_ip_header X-Real-IP;
    real_ip_recursive off;

    rewrite_by_lua_block {
        Kong.rewrite()
    }

    access_by_lua_block {
        Kong.access()
    }

    header_filter_by_lua_block {
        Kong.header_filter()
    }

    body_filter_by_lua_block {
        Kong.body_filter()
    }

    log_by_lua_block {
        Kong.log()
    }

    location / {
        default_type                     '';

        set $ctx_ref                     '';
        set $upstream_te                 '';
        set $upstream_host               '';
        set $upstream_upgrade            '';
        set $upstream_connection         '';
        set $upstream_scheme             '';
        set $upstream_uri                '';
        set $upstream_x_forwarded_for    '';
        set $upstream_x_forwarded_proto  '';
        set $upstream_x_forwarded_host   '';
        set $upstream_x_forwarded_port   '';
        set $upstream_x_forwarded_path   '';
        set $upstream_x_forwarded_prefix '';
        set $kong_proxy_mode             'http';

        proxy_http_version      1.1;
        proxy_buffering          on;
        proxy_request_buffering  on;

        proxy_set_header      TE                 $upstream_te;
        proxy_set_header      Host               $host;
        proxy_set_header      Upgrade            $upstream_upgrade;
        proxy_set_header      Connection         "";
        proxy_set_header      X-Forwarded-For    $proxy_add_x_forwarded_for;
        #proxy_set_header      X-Forwarded-Proto  $upstream_x_forwarded_proto;
        #proxy_set_header      X-Forwarded-Host   $upstream_x_forwarded_host;
        #proxy_set_header      X-Forwarded-Port   $upstream_x_forwarded_port;
        #proxy_set_header      X-Forwarded-Path   $upstream_x_forwarded_path;
        #proxy_set_header      X-Forwarded-Prefix $upstream_x_forwarded_prefix;
        proxy_set_header      X-Real-IP          $remote_addr;
        proxy_pass_header     Server;
        proxy_pass_header     Date;
        proxy_ssl_name        $upstream_host;
        proxy_ssl_server_name on;
        proxy_pass            $upstream_scheme://kong_upstream$upstream_uri;
    }

    location @unbuffered {
        internal;
        default_type         '';
        set $kong_proxy_mode 'unbuffered';

        proxy_http_version      1.1;
        proxy_buffering         off;
        proxy_request_buffering off;

        proxy_set_header      TE                 $upstream_te;
        proxy_set_header      Host               $host;
        proxy_set_header      Upgrade            $upstream_upgrade;
        proxy_set_header      Connection         "";
        proxy_set_header      X-Forwarded-For    $proxy_add_x_forwarded_for;
        #proxy_set_header      X-Forwarded-Proto  $upstream_x_forwarded_proto;
        #proxy_set_header      X-Forwarded-Host   $upstream_x_forwarded_host;
        #proxy_set_header      X-Forwarded-Port   $upstream_x_forwarded_port;
        #proxy_set_header      X-Forwarded-Path   $upstream_x_forwarded_path;
        #proxy_set_header      X-Forwarded-Prefix $upstream_x_forwarded_prefix;
        proxy_set_header      X-Real-IP          $remote_addr;
        proxy_pass_header     Server;
        proxy_pass_header     Date;
        proxy_ssl_name        $upstream_host;
        proxy_ssl_server_name on;
        proxy_pass            $upstream_scheme://kong_upstream$upstream_uri;
    }

    location @unbuffered_request {
        internal;
        default_type         '';
        set $kong_proxy_mode 'unbuffered';

        proxy_http_version      1.1;
        proxy_buffering          on;
        proxy_request_buffering off;

        proxy_set_header      TE                 $upstream_te;
        proxy_set_header      Host               $host;
        proxy_set_header      Upgrade            $upstream_upgrade;
        proxy_set_header      Connection         "";
        proxy_set_header      X-Forwarded-For    $proxy_add_x_forwarded_for;
        #proxy_set_header      X-Forwarded-Proto  $upstream_x_forwarded_proto;
        #proxy_set_header      X-Forwarded-Host   $upstream_x_forwarded_host;
        #proxy_set_header      X-Forwarded-Port   $upstream_x_forwarded_port;
        #proxy_set_header      X-Forwarded-Path   $upstream_x_forwarded_path;
        #proxy_set_header      X-Forwarded-Prefix $upstream_x_forwarded_prefix;
        proxy_set_header      X-Real-IP          $remote_addr;
        proxy_pass_header     Server;
        proxy_pass_header     Date;
        proxy_ssl_name        $upstream_host;
        proxy_ssl_server_name on;
        proxy_pass            $upstream_scheme://kong_upstream$upstream_uri;
    }

    location @unbuffered_response {
        internal;
        default_type         '';
        set $kong_proxy_mode 'unbuffered';

        proxy_http_version      1.1;
        proxy_buffering         off;
        proxy_request_buffering  on;

        proxy_set_header      TE                 $upstream_te;
        proxy_set_header      Host               $host;
        proxy_set_header      Upgrade            $upstream_upgrade;
        proxy_set_header      Connection         "";
        proxy_set_header      X-Forwarded-For    $proxy_add_x_forwarded_for;
        #proxy_set_header      X-Forwarded-Proto  $upstream_x_forwarded_proto;
        #proxy_set_header      X-Forwarded-Host   $upstream_x_forwarded_host;
        #proxy_set_header      X-Forwarded-Port   $upstream_x_forwarded_port;
        #proxy_set_header      X-Forwarded-Path   $upstream_x_forwarded_path;
        #proxy_set_header      X-Forwarded-Prefix $upstream_x_forwarded_prefix;
        proxy_set_header      X-Real-IP          $remote_addr;
        proxy_pass_header     Server;
        proxy_pass_header     Date;
        proxy_ssl_name        $upstream_host;
        proxy_ssl_server_name on;
        proxy_pass            $upstream_scheme://kong_upstream$upstream_uri;
    }

    location @grpc {
        internal;
        default_type         '';
        set $kong_proxy_mode 'grpc';

        grpc_set_header      TE                 $upstream_te;
        grpc_set_header      X-Forwarded-For    $proxy_add_x_forwarded_for;
        #grpc_set_header      X-Forwarded-Proto  $upstream_x_forwarded_proto;
        #grpc_set_header      X-Forwarded-Host   $upstream_x_forwarded_host;
        #grpc_set_header      X-Forwarded-Port   $upstream_x_forwarded_port;
        #grpc_set_header      X-Forwarded-Path   $upstream_x_forwarded_path;
        #grpc_set_header      X-Forwarded-Prefix $upstream_x_forwarded_prefix;
        grpc_set_header      X-Real-IP          $remote_addr;
        grpc_pass_header     Server;
        grpc_pass_header     Date;
        grpc_ssl_name        $upstream_host;
        grpc_ssl_server_name on;
        grpc_pass            $upstream_scheme://kong_upstream;
    }

    location = /kong_buffered_http {
        internal;
        default_type         '';
        set $kong_proxy_mode 'http';

        rewrite_by_lua_block       {;}
        access_by_lua_block        {;}
        header_filter_by_lua_block {;}
        body_filter_by_lua_block   {;}
        log_by_lua_block           {;}

        proxy_http_version 1.1;
        proxy_set_header      TE                 $upstream_te;
        proxy_set_header      Host               $host;
        proxy_set_header      Upgrade            $upstream_upgrade;
        proxy_set_header      Connection         "";
        proxy_set_header      X-Forwarded-For    $proxy_add_x_forwarded_for;
        #proxy_set_header      X-Forwarded-Proto  $upstream_x_forwarded_proto;
        #proxy_set_header      X-Forwarded-Host   $upstream_x_forwarded_host;
        #proxy_set_header      X-Forwarded-Port   $upstream_x_forwarded_port;
        #proxy_set_header      X-Forwarded-Path   $upstream_x_forwarded_path;
        #proxy_set_header      X-Forwarded-Prefix $upstream_x_forwarded_prefix;
        proxy_set_header      X-Real-IP          $remote_addr;
        proxy_pass_header     Server;
        proxy_pass_header     Date;
        proxy_ssl_name        $upstream_host;
        proxy_ssl_server_name on;
        proxy_pass            $upstream_scheme://kong_upstream$upstream_uri;
    }

    # Internal-only target of the error_page directives in this server block;
    # the response body is produced by Kong.handle_error().
    location = /kong_error_handler {
        internal;
        default_type                 '';

        uninitialized_variable_warn  off;

        # Empty Lua blocks override the server-level rewrite/access handlers
        # so Kong.rewrite() and Kong.access() do not run for this location.
        rewrite_by_lua_block {;}
        access_by_lua_block  {;}

        content_by_lua_block {
            Kong.handle_error()
        }
    }
}

# Admin API server: listens on port 8001 on all interfaces (plain HTTP) and
# serves requests through Kong.admin_content().
server {
    server_name kong_admin;
    listen 0.0.0.0:8001;

    access_log /dev/stdout;
    # NOTE(review): errors are written to a file named "access-admin.log";
    # confirm the file name is intentional.
    error_log  /data/logs/kong-server/access-admin.log error;

    # injected nginx_admin_* directives
    client_body_buffer_size 10m;
    client_max_body_size 10m;

    # All Admin API requests are handled in Lua.
    location / {
        default_type application/json;
        content_by_lua_block {
            Kong.admin_content()
        }
        header_filter_by_lua_block {
            Kong.admin_header_filter()
        }
    }

    # stub_status endpoint; marked internal, so it is not reachable by
    # external requests, and not written to the access log.
    location /nginx_status {
        internal;
        access_log off;
        stub_status;
    }

    # Static robots.txt that disallows all paths for all user agents.
    location /robots.txt {
        return 200 'User-agent: *\nDisallow: /';
    }
}

Anything else?

No response

mayocream commented 2 years ago

Hi @vinniemo, thanks for reporting this, I think we might lack some information about your environment.

I am not surprised, since the connections along the traffic path between the load balancer, Kong, and the upstream may be kept alive. Please provide more information so that we can investigate.

ms2008 commented 2 years ago

I would suggest turning off the HTTP/2 listener and checking whether you still see this.

HTTP/2 is designed to have a single long-lived TCP connection, across which all requests are multiplexed—meaning multiple requests can be active on the same connection at any point in time. Normally, this is great, as it reduces the overhead of connection management. However, it also means that (as you might imagine) connection-level balancing isn't very useful. Once the connection is established, there's no more balancing to be done.

See also:

  1. gRPC Load Balancing on Kubernetes without Tears

LoremipsumSharp commented 2 years ago

@mayocream lj_BC_TGETS takes up a large part of the flame graph, and we don't have much information about it. Is that normal?

ms2008 commented 2 years ago

Possibly related to the information mentioned in #8838

ADD-SP commented 2 years ago

@mayocream lj_BC_TGETS takes up a large part of the flame graph, and we don't have much information about it. Is that normal?

Hi @LoremipsumSharp , you provided a C-land flamegraph. Kong is written in Lua, so we should use the Lua-land flamegraph to analyze the performance of Kong. Now, I don't think we need the flamegraph to analyze this issue.

I don't think the imbalance is a Kong issue though, it's most likely related to Kubernetes.

LoremipsumSharp commented 2 years ago

@mayocream lj_BC_TGETS takes up a large part of the flame graph, and we don't have much information about it. Is that normal?

Hi @LoremipsumSharp , you provided a C-land flamegraph. Kong is written in Lua, so we should use the Lua-land flamegraph to analyze the performance of Kong. Now, I don't think we need the flamegraph to analyze this issue.

I don't think the imbalance is a Kong issue though, it's most likely related to Kubernetes.

@ADD-SP Is there any tool for generating the Lua-land flamegraph?

fffonion commented 2 years ago

@LoremipsumSharp you can use this one https://github.com/kong/stapxx#lj-lua-bt

ADD-SP commented 2 years ago

I don't think we need to use the flamegraph to analyze this issue, I think this is a Kubernetes-related issue.

ADD-SP commented 2 years ago

@LoremipsumSharp Would you mind telling us the Kubernetes config mentioned by @mayocream ?

vinniemo commented 2 years ago

@LoremipsumSharp you can use this one https://github.com/kong/stapxx#lj-lua-bt

Thank you for your advice. As you can see in this lj-lua-bt SVG, the CPU is mostly spent on regex route matching in the router. Do you have any suggestions for this?

image

vinniemo commented 2 years ago

The problem has been resolved. The root cause is that, when the routes are sorted, the short path prefix routes are placed at the end of the table, while the PCRE regex routes are sorted to the front.

For every request that targets a short path prefix route, the find_route function therefore runs in O(N) time with N = 2500+ routes, so CPU usage is particularly high and imbalanced.

As a temporary workaround, the short path prefix route can be converted into a PCRE regex route and given a higher regex priority, for example by changing /api/ to /api/(\S+). The best fix, however, would be for the find_route function to run in O(1) time.

image image image image

ms2008 commented 2 years ago

@vinniemo Thanks for the investigation. Based on the current implementation of Router, I think it is hard to reduce the time complexity of find_route to O(1).

stale[bot] commented 1 year ago

This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. Thank you for your contributions.