envoyproxy / envoy

Cloud-native high-performance edge/middle/service proxy
https://www.envoyproxy.io
Apache License 2.0
25k stars 4.81k forks source link

Suspected memory leak when handling HTTPS traffic. #35786

Closed johnlanni closed 2 months ago

johnlanni commented 2 months ago

Description:

After upgrading Envoy from version 1.23 to 1.27, we observed a memory leak in the gateway handling HTTPS traffic. We were unable to reproduce the issue consistently in an offline environment, so we upgraded Envoy further to version 1.31.0, but the memory leak persisted.

image image image

Heap profiler (1.27.2): leak-1.27.2.pdf

Heap profiler (1.31.0): leak-1.31.0.pdf

Envoy Config

LDS config snippet:

    {
     "name": "0.0.0.0_443",
     "active_state": {
      "version_info": "2024-08-21T15:26:29+08:00/3",
      "listener": {
       "@type": "type.googleapis.com/envoy.config.listener.v3.Listener",
       "name": "0.0.0.0_443",
       "address": {
        "socket_address": {
         "address": "0.0.0.0",
         "port_value": 443
        }
       },
       "filter_chains": [
        {
         "filter_chain_match": {
          "server_names": [
           "api.xxxxx.com"
          ]
         },
         "filters": [
          {
           "name": "envoy.filters.network.http_connection_manager",
           "typed_config": {
            "@type": "type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager",
            "stat_prefix": "outbound_0.0.0.0_443",
            "rds": {
             "config_source": {
              "ads": {},
              "initial_fetch_timeout": "0s",
              "resource_api_version": "V3"
             },
             "route_config_name": "xxxxxxxx"
            },
            "http_filters": [
             {
              "name": "envoy.filters.http.cors",
              "typed_config": {
               "@type": "type.googleapis.com/envoy.extensions.filters.http.cors.v3.Cors"
              }
             },
             {
              "name": "envoy.filters.http.rbac",
              "typed_config": {
               "@type": "type.googleapis.com/envoy.extensions.filters.http.rbac.v3.RBAC"
              }
             },
             {
              "name": "envoy.filters.http.local_ratelimit",
              "typed_config": {
               "@type": "type.googleapis.com/envoy.extensions.filters.http.local_ratelimit.v3.LocalRateLimit",
               "stat_prefix": "http_local_rate_limiter"
              }
             },
             {
              "name": "envoy.filters.http.fault",
              "typed_config": {
               "@type": "type.googleapis.com/envoy.extensions.filters.http.fault.v3.HTTPFault"
              }
             },
             {
              "name": "envoy.filters.http.router",
              "typed_config": {
               "@type": "type.googleapis.com/envoy.extensions.filters.http.router.v3.Router"
              }
             }
            ],
            "http_protocol_options": {
             "accept_http_10": true
            },
            "http2_protocol_options": {
             "max_concurrent_streams": 100,
             "initial_stream_window_size": 65535,
             "initial_connection_window_size": 1048576
            },
            "server_name": "istio-envoy",
            "access_log": [
             {
              "name": "envoy.access_loggers.file",
              "filter": {
               "not_health_check_filter": {}
              },
              "typed_config": {
               "@type": "type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog",
               "path": "/dev/stdout",
               "log_format": {
                "text_format_source": {
                 "inline_string": "xxxxxx"
                }
               }
              }
             }
            ],
            "use_remote_address": true,
            "generate_request_id": true,
            "forward_client_cert_details": "SANITIZE_SET",
            "set_current_client_cert_details": {
             "subject": true,
             "cert": true,
             "dns": true,
             "uri": true
            },
            "upgrade_configs": [
             {
              "upgrade_type": "websocket"
             }
            ],
            "stream_idle_timeout": "30s",
            "normalize_path": true,
            "common_http_protocol_options": {
             "idle_timeout": "30s"
            },
            "request_id_extension": {
             "typed_config": {
              "@type": "type.googleapis.com/envoy.extensions.request_id.uuid.v3.UuidRequestIdConfig",
              "use_request_id_for_trace_sampling": true
             }
            },
            "path_with_escaped_slashes_action": "KEEP_UNCHANGED"
           }
          }
         ],
         "transport_socket": {
          "name": "envoy.transport_sockets.tls",
          "typed_config": {
           "@type": "type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.DownstreamTlsContext",
           "common_tls_context": {
            "tls_params": {
             "tls_minimum_protocol_version": "TLSv1_0",
             "tls_maximum_protocol_version": "TLSv1_3",
             "cipher_suites": [
              "ECDHE-ECDSA-AES128-GCM-SHA256",
              "ECDHE-ECDSA-CHACHA20-POLY1305",
              "ECDHE-RSA-AES128-GCM-SHA256",
              "ECDHE-RSA-CHACHA20-POLY1305",
              "ECDHE-ECDSA-AES128-SHA",
              "ECDHE-RSA-AES128-SHA",
              "AES128-GCM-SHA256",
              "AES128-SHA",
              "ECDHE-ECDSA-AES256-GCM-SHA384",
              "ECDHE-RSA-AES256-GCM-SHA384",
              "ECDHE-ECDSA-AES256-SHA",
              "ECDHE-RSA-AES256-SHA",
              "AES256-GCM-SHA384",
              "AES256-SHA"
             ]
            },
            "alpn_protocols": [
             "http/1.1"
            ],
            "tls_certificate_sds_secret_configs": [
             {
              "name": "kubernetes://xxxxxxxx",
              "sds_config": {
               "ads": {},
               "resource_api_version": "V3"
              }
             }
            ]
           },
           "require_client_certificate": false
          }
         }
        }
       ],
       "per_connection_buffer_limit_bytes": 32768,
       "listener_filters": [
        {
         "name": "envoy.filters.listener.tls_inspector",
         "typed_config": {
          "@type": "type.googleapis.com/envoy.extensions.filters.listener.tls_inspector.v3.TlsInspector"
         }
        }
       ],
       "listener_filters_timeout": "0s",
       "traffic_direction": "OUTBOUND",
       "access_log": [
        {
         "name": "envoy.access_loggers.file",
         "filter": {
          "response_flag_filter": {
           "flags": [
            "NR"
           ]
          }
         },
         "typed_config": {
          "@type": "type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog",
          "path": "/dev/stdout",
          "log_format": {
           "text_format_source": {
            "inline_string": "xxxxxxxx"
           }
          }
         }
        }

RDS config snippet:

   {
     "version_info": "2024-08-21T15:26:29+08:00/3",
     "route_config": {
      "@type": "type.googleapis.com/envoy.config.route.v3.RouteConfiguration",
      "name": "xxxxxxxxx",
      "virtual_hosts": [
       {
        "name": "api.xxx.com:443",
        "domains": [
         "api.xxx.com"
        ],
        "routes": [
         {
          "match": {
           "prefix": "/xxx",
           "case_sensitive": true
          },
          "route": {
           "weighted_clusters": {
            "clusters": [
             {
              "name": "outbound|8089||xxxxxx",
              "weight": 50
             },
             {
              "name": "outbound|8089||xxx",
              "weight": 50
             }
            ],
            "total_weight": 100
           },
           "timeout": "0s",
           "retry_policy": {
            "retry_on": "connect-failure,refused-stream,unavailable,cancelled,retriable-status-codes",
            "num_retries": 2,
            "retry_host_predicate": [
             {
              "name": "envoy.retry_host_predicates.previous_hosts",
              "typed_config": {
               "@type": "type.googleapis.com/envoy.extensions.retry.host.previous_hosts.v3.PreviousHostsPredicate"
              }
             }
            ],
            "host_selection_retry_max_attempts": "5",
            "retriable_status_codes": [
             503
            ],
            "retriable_request_headers": [
             {
              "name": ":method",
              "invert_match": true,
              "string_match": {
               "safe_regex": {
                "google_re2": {},
                "regex": "POST|PATCH|LOCK"
               }
              }
             }
            ]
           },
           "max_grpc_timeout": "0s"
          },
          "metadata": {
           "filter_metadata": {
            "istio": {
             "config": "xxxxxxx"
            }
           }
          },
          "decorator": {
           "operation": "xxxxxx"
          },
          "name": "xxx"
         },
         {
          "match": {
           "prefix": "/xxx",
           "case_sensitive": true
          },
          "route": {
           "weighted_clusters": {
            "clusters": [
             {
              "name": "outbound|8090||xxxxxx",
              "weight": 50
             },
             {
              "name": "outbound|8090||xxxxx",
              "weight": 50
             }
            ],
            "total_weight": 100
           },
           "timeout": "0s",
           "retry_policy": {
            "retry_on": "connect-failure,refused-stream,unavailable,cancelled,retriable-status-codes",
            "num_retries": 2,
            "retry_host_predicate": [
             {
              "name": "envoy.retry_host_predicates.previous_hosts",
              "typed_config": {
               "@type": "type.googleapis.com/envoy.extensions.retry.host.previous_hosts.v3.PreviousHostsPredicate"
              }
             }
            ],
            "host_selection_retry_max_attempts": "5",
            "retriable_status_codes": [
             503
            ],
            "retriable_request_headers": [
             {
              "name": ":method",
              "invert_match": true,
              "string_match": {
               "safe_regex": {
                "google_re2": {},
                "regex": "POST|PATCH|LOCK"
               }
              }
             }
            ]
           },
           "cors": {
            "allow_methods": "GET,POST,PUT,DELETE,HEAD,OPTIONS,PATCH",
            "allow_headers": "*",
            "expose_headers": "*",
            "max_age": "86400",
            "allow_credentials": true,
            "filter_enabled": {
             "default_value": {
              "numerator": 100
             }
            },
            "allow_origin_string_match": [
             {
              "safe_regex": {
               "regex": ".*"
              }
             }
            ]
           },
           "max_grpc_timeout": "0s"
          },
          "metadata": {
           "filter_metadata": {
            "istio": {
             "config": "xxxxx"
            }
           }
          },
          "decorator": {
           "operation": "xxxxx"
          },
          "name": "xxxxx"
         }
        ],
        "include_request_attempt_count": true
       }
      ],
      "validate_clusters": false,
      "max_direct_response_body_size_bytes": 1048576,
      "ignore_port_in_host_matching": true
     },
     "last_updated": "2024-08-21T13:48:15.519Z"
    },

CDS config snippet:

    {
     "version_info": "2024-08-21T15:26:29+08:00/3",
     "cluster": {
      "@type": "type.googleapis.com/envoy.config.cluster.v3.Cluster",
      "name": "outbound|8081||xxxxx",
      "type": "EDS",
      "eds_cluster_config": {
       "eds_config": {
        "ads": {},
        "initial_fetch_timeout": "0s",
        "resource_api_version": "V3"
       },
       "service_name": "outbound|8081||xxxxx"
      },
      "connect_timeout": "10s",
      "circuit_breakers": {
       "thresholds": [
        {
         "max_connections": 4294967295,
         "max_pending_requests": 4294967295,
         "max_requests": 4294967295,
         "max_retries": 4294967295,
         "track_remaining": true
        }
       ]
      },
      "metadata": {
       "filter_metadata": {
        "istio": {
         "services": [
          {
           "name": "xxxxxx",
           "host": "xxxxxx",
           "namespace": "xxxxxx"
          }
         ],
         "config": "xxxxxxx",
         "external": true
        }
       }
      },
      "common_lb_config": {
       "locality_weighted_lb_config": {}
      },
      "typed_extension_protocol_options": {
       "envoy.extensions.upstreams.http.v3.HttpProtocolOptions": {
        "@type": "type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions",
        "common_http_protocol_options": {
         "idle_timeout": "30s"
        },
        "explicit_http_config": {
         "http_protocol_options": {}
        }
       }
      }
     },
     "last_updated": "2024-08-21T13:48:15.390Z"
    }

SDS config snippet:

    {
     "name": "kubernetes://xxxxxxxxxx",
     "version_info": "2024-08-21T15:26:29+08:00/3",
     "last_updated": "2024-08-21T13:48:15.521Z",
     "secret": {
      "@type": "type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.Secret",
      "name": "kubernetes://xxxxxxxx",
      "tls_certificate": {
       "certificate_chain": {
        "inline_bytes": "xxxxxxxxxx"
       },
       "private_key": {
        "inline_bytes": "xxxxxxxxx"
       }
      }
     }
    }

Connection tracing log

proxy.log

johnlanni commented 2 months ago

We suspect that HTTPS is causing the leak because, of the multiple Envoy instances we have deployed, only those handling HTTPS traffic exhibit memory leaks. We tried reverting BoringSSL to the version used in 1.23, but the issue persists.

johnlanni commented 2 months ago

Identified the issue as being related to Istio: https://github.com/istio/istio/issues/52850