Open bearchess opened 1 year ago
@bearchess Can you provide some relevant configuration that you used here?
@Revolyssup
About 20% of the requests are still going to the old IP address.
route config
{
"uri": "/micro-user/*",
"name": "micro-user",
"desc": "用户服务",
"methods": [
"GET",
"POST",
"HEAD",
"PUT",
"PATCH",
"DELETE",
"OPTIONS",
"TRACE",
"CONNECT"
],
"plugins": {
"request-validation": {
"header_schema": {
"properties": {},
"type": "object"
}
}
},
"upstream": {
"timeout": {
"connect": 60,
"send": 60,
"read": 60
},
"type": "roundrobin",
"scheme": "http",
"discovery_type": "nacos",
"discovery_args": {
"group_name": "DEFAULT_GROUP",
"namespace_id": "a2f5c588-a038-444f-b220-162d40172886"
},
"pass_host": "pass",
"service_name": "micro-user",
"keepalive_pool": {
"idle_timeout": 60,
"requests": 1000,
"size": 320
}
},
"status": 1
}
apisix config
apiVersion: v1
data:
config.yaml: |-
apisix:
node_listen: 9080 # APISIX listening port
enable_heartbeat: true
enable_admin: true
enable_admin_cors: true
enable_debug: false
enable_dev_mode: false # Sets nginx worker_processes to 1 if set to true
enable_reuseport: true # Enable nginx SO_REUSEPORT switch if set to true.
enable_ipv6: true # Enable nginx IPv6 resolver
config_center: etcd # etcd: use etcd to store the config value
# yaml: fetch the config value from local yaml file `/your_path/conf/apisix.yaml`
proxy_cache: # Proxy Caching configuration
cache_ttl: 10s # The default caching time if the upstream does not specify the cache time
zones: # The parameters of a cache
- name: disk_cache_one # The name of the cache, administrator can be specify
# which cache to use by name in the admin api
memory_size: 50m # The size of shared memory, it's used to store the cache index
disk_size: 1G # The size of disk, it's used to store the cache data
disk_path: "/tmp/disk_cache_one" # The path to store the cache data
cache_levels: "1:2" # The hierarchy levels of a cache
allow_admin: # http://nginx.org/en/docs/http/ngx_http_access_module.html#allow
- 0.0.0.0/0
port_admin: 9180
# Default token when use API to call for Admin API.
# *NOTE*: Highly recommended to modify this value to protect APISIX's Admin API.
# Disabling this configuration item means that the Admin API does not
# require any authentication.
admin_key:
# admin: can everything for configuration data
- name: "admin"
key:
role: admin
# viewer: only can view configuration data
- name: "viewer"
key:
role: viewer
router:
http: 'radixtree_uri' # radixtree_uri: match route by uri(base on radixtree)
# radixtree_host_uri: match route by host + uri(base on radixtree)
ssl: 'radixtree_sni' # radixtree_sni: match route by SNI(base on radixtree)
dns_resolver_valid: 30
resolver_timeout: 5
ssl:
enable: false
enable_http2: true
listen_port: 9443
ssl_protocols: "TLSv1 TLSv1.1 TLSv1.2 TLSv1.3"
ssl_ciphers: "ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384:DHE-RSA-CHACHA20-POLY1305:ECDHE-ECDSA-AES128-SHA256:ECDHE-RSA-AES128-SHA256:ECDHE-ECDSA-AES128-SHA:ECDHE-RSA-AES128-SHA:ECDHE-ECDSA-AES256-SHA384:ECDHE-RSA-AES256-SHA384:ECDHE-ECDSA-AES256-SHA:ECDHE-RSA-AES256-SHA:DHE-RSA-AES128-SHA256:DHE-RSA-AES256-SHA256:AES128-GCM-SHA256:AES256-GCM-SHA384:AES128-SHA256:AES256-SHA256:AES128-SHA:AES256-SHA:DES-CBC3-SHA"
nginx_config: # config for render the template to genarate nginx.conf
error_log: "/dev/stderr"
error_log_level: "warn" # warn,error
worker_rlimit_nofile: 20480 # the number of files a worker process can open, should be larger than worker_connections
event:
worker_connections: 10620
http:
enable_access_log: true
access_log: "/dev/stdout"
access_log_format: "$remote_addr - $remote_user [$time_local] $http_host \"$request\" $status $body_bytes_sent $request_time \"$http_referer\" \"$http_user_agent\" $upstream_addr $upstream_status $upstream_response_time \"$upstream_scheme://$upstream_host$upstream_uri\""
access_log_format_escape: default
keepalive_timeout: 60s # timeout during which a keep-alive client connection will stay open on the server side.
client_header_timeout: 60s # timeout for reading client request header, then 408 (Request Time-out) error is returned to the client
client_body_timeout: 60s # timeout for reading client request body, then 408 (Request Time-out) error is returned to the client
send_timeout: 10s # timeout for transmitting a response to the client.then the connection is closed
underscores_in_headers: "on" # default enables the use of underscores in client request header fields
real_ip_header: "X-Real-IP" # http://nginx.org/en/docs/http/ngx_http_realip_module.html#real_ip_header
real_ip_from: # http://nginx.org/en/docs/http/ngx_http_realip_module.html#set_real_ip_from
- 127.0.0.1
- 'unix:'
etcd:
host: # it's possible to define multiple etcd hosts addresses of the same etcd cluster.
- "http://apisix-etcd.sre-production.svc.cluster.local:2379"
prefix: "/apisix" # apisix configurations prefix
timeout: 30 # 30 seconds
discovery:
nacos:
host:
- "http://172.17.98.204:8848"
prefix: "/nacos/v1/"
fetch_interval: 10 # default 30 sec
weight: 100 # default 100
timeout:
connect: 2000 # default 2000 ms
send: 2000 # default 2000 ms
read: 5000 # default 5000 msx
plugins: # plugin list
- api-breaker
- authz-keycloak
- basic-auth
- batch-requests
- consumer-restriction
- cors
- echo
- fault-injection
- grpc-transcode
- hmac-auth
- http-logger
- ip-restriction
- ua-restriction
- jwt-auth
- kafka-logger
- key-auth
- limit-conn
- limit-count
- limit-req
- node-status
- openid-connect
- authz-casbin
- prometheus
- proxy-cache
- proxy-mirror
- proxy-rewrite
- redirect
- referer-restriction
- request-id
- request-validation
- response-rewrite
- serverless-post-function
- serverless-pre-function
- sls-logger
- syslog
- tcp-logger
- udp-logger
- uri-blocker
- wolf-rbac
- zipkin
- traffic-split
- gzip
- real-ip
- ext-plugin-pre-req
- ext-plugin-post-req
stream_plugins:
- mqtt-proxy
- ip-restriction
- limit-conn
kind: ConfigMap
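One way to narrow down where the stale addresses come from is to compare the instance list that nacos itself returns with the nodes APISIX actually proxies to. A minimal sketch using the nacos v1 instance-list API, with the nacos address, group and namespace taken from the configuration above (adjust for your environment):

```bash
# Ask nacos directly which instances it currently reports for the service.
# Host, group and namespace are copied from the configs above; adjust as needed.
curl -s 'http://172.17.98.204:8848/nacos/v1/ns/instance/list?serviceName=micro-user&groupName=DEFAULT_GROUP&namespaceId=a2f5c588-a038-444f-b220-162d40172886'
```

If nacos already reports only the new pod IPs while APISIX keeps sending traffic to the old ones, the problem is on the APISIX side (its discovery data) rather than in the registry.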
@bearchess have you solved your problem?
We are seeing this as well with APISIX 3.5.0 (production) and 3.7.0 (staging). We can reproduce by nuking the etcd cluster and recreating it. Propagating changes with the APISIX ingress controller to etcd seems to work, but APISIX itself does not reconnect with etcd until restarted.
@Revolyssup PTAL
> @bearchess have you solved your problem?
I recovered from the problem by restarting the APISIX pod, but I didn't find out what the specific cause was.
We also have similar issues. Currently, we have found that the problem seems to be triggered, with a very small probability, when updating services in large quantities. The apisix-ingress-controller correctly updates the pod IPs in etcd, and checking the upstream with `curl -X GET http://127.0.0.1:80/apisix/admin/upstream/xxx` shows the correct IP list, but a small number of requests are still sent to offline IPs by APISIX until it is restarted, after which everything returns to normal. Does APISIX have any in-memory caching mechanism? This problem has a serious impact.
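For reference, a sketch of that Admin API check; the upstream ID, admin port and API key below are placeholders (the default admin port is 9180):

```bash
# Dump one upstream object as APISIX stores it in etcd; compare its nodes
# with the addresses seen in the access logs. ID, port and key are placeholders.
curl -s http://127.0.0.1:9180/apisix/admin/upstreams/xxx \
  -H 'X-API-KEY: edd1c9f034335f136f87ad84b625c8f1' -X GET
```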
I got the same issue, do we have any workaround for this issue?
I have the same issue. Reproduced in both 3.3.0 and 3.9.1. We have kubernetes discovery set up following the minimal setup and the instructions in the values.yaml:
discovery:
kubernetes: { }
# The prerequisites for the above minimal Kubernetes example:
# 1. [Optional] Set `.serviceAccount.create` to `true` to create a dedicated ServiceAccount.
# It is recommended to do so, otherwise the default ServiceAccount "default" will be used.
# 2. [Required] Set `.rbac.create` to `true` to create and bind the necessary RBAC resources.
# This grants the ServiceAccount in use to List-Watch Kubernetes Endpoints resources.
# 3. [Required] Include the following environment variables in `.nginx.envs` to pass them into
# nginx worker processes (https://nginx.org/en/docs/ngx_core_module.html#env):
# - KUBERNETES_SERVICE_HOST
# - KUBERNETES_SERVICE_PORT
# This is for allowing the default `host` and `port` of `.discovery.registry.kubernetes.service`.
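If the kubernetes discovery side is suspected, one quick check is whether the ServiceAccount the APISIX pods run under can actually list and watch Endpoints, as the prerequisites above require. A sketch, with the namespace and ServiceAccount names as placeholders:

```bash
# Confirm the RBAC prerequisite: the ServiceAccount used by APISIX must be
# able to list and watch Endpoints. Namespace/ServiceAccount are placeholders.
kubectl auth can-i list endpoints --as=system:serviceaccount:ingress-apisix:apisix
kubectl auth can-i watch endpoints --as=system:serviceaccount:ingress-apisix:apisix
```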
I'll take over this issue and try to fix it.
If someone could help me with a (standalone preferred) reproduction example, it would be much easier for me. cc: @start1943, @bstasz, @jbergstroem
I tried reproducing this issue but it worked as expected.
curl http://127.0.0.1:9180/apisix/admin/routes/1 -H 'X-API-KEY: edd1c9f034335f136f87ad84b625c8f1' -X PUT -i -d '
{
"uri": "/nacos/*",
"upstream": {
"service_name": "APISIX-NACOS",
"type": "roundrobin",
"discovery_type": "nacos"
}
}'
Use the nacos API to register a service instance:
curl -X POST 'http://127.0.0.1:8848/nacos/v1/ns/instance?serviceName=APISIX-NACOS&ip=127.0.0.1&port=1980&ephemeral=false'
curl http://127.0.0.1:9080/nacos/get -i
HTTP/1.1 200 OK
Content-Type: text/plain; charset=utf-8
Content-Length: 19
Connection: keep-alive
Date: Tue, 30 Jul 2024 08:29:49 GMT
Server: APISIX/3.9.0
Hello 1980
- update service registry data:
```bash
curl -X POST 'http://127.0.0.1:8848/nacos/v1/ns/instance?serviceName=APISIX-NACOS&ip=127.0.0.1&port=1981&ephemeral=false'
curl http://127.0.0.1:9080/nacos/get -i
```
HTTP/1.1 200 OK
Content-Type: text/plain; charset=utf-8
Content-Length: 19
Connection: keep-alive
Date: Tue, 30 Jul 2024 08:29:49 GMT
Server: APISIX/3.9.0
Hello 1981
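One difference from the reported scenario may be that the old instance (port 1980) is never removed in the steps above, so nacos keeps reporting both instances. To mimic a pod going away, the old instance could also be deregistered before re-checking which node APISIX hits; a sketch using the nacos deregister API:

```bash
# Deregister the old instance so only 127.0.0.1:1981 should remain in nacos,
# then check which backend APISIX actually routes to.
curl -X DELETE 'http://127.0.0.1:8848/nacos/v1/ns/instance?serviceName=APISIX-NACOS&ip=127.0.0.1&port=1980&ephemeral=false'
curl http://127.0.0.1:9080/nacos/get -i
```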
Hi @start1943,
While the problem is occurring:

- What about the CPU/memory usage and connection stats? Any obvious exceptions?
- How long does it usually last before you restart APISIX?
- What about the `retries` and `retry_timeout` config of the upstream? Are they the defaults?
- And how many nodes do the upstreams have?

Ping @start1943 :D
- Both CPU and memory usage and QPS are fine. The problem does not occur when the APISIX node is under high CPU or memory load; it occurs when a large number of deployment updates in a short period of time trigger changes in pod IPs.
- Reloading APISIX is very fast; a reload triggers re-synchronization of the upstream node IP information from etcd, so the 504 errors caused by hitting the abnormal offline nodes recover.
- The `retries` and `retry_timeout` config is the default.
- There are about 100 upstreams in total, and each upstream has 2-30 nodes.
- The problem appears to be that the APISIX node is not updating an in-memory cache properly. It is an occasional issue that may only be triggered when a large number of pods are updated.
Hi @start1943 , is it possible to explicitly set the upstream `retry_timeout` and `timeout` parameters in your environment, based on the connection characteristics of your services? Then see whether, with this setting, the problem lasts for a shorter period of time and recovers without a reload.
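For reference, a sketch of what tightening those parameters could look like through the Admin API. The upstream ID, admin port, key and the concrete values are placeholders, and upstreams managed by the ingress controller would normally be changed through its CRDs instead:

```bash
# Reduce the connect timeout and allow retries so a request that hits a
# dead node fails over quickly. All IDs and values below are placeholders.
curl http://127.0.0.1:9180/apisix/admin/upstreams/xxx \
  -H 'X-API-KEY: edd1c9f034335f136f87ad84b625c8f1' -X PATCH -d '
{
  "retries": 2,
  "retry_timeout": 5,
  "timeout": {
    "connect": 3,
    "send": 60,
    "read": 60
  }
}'
```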
Is it possible this issue is related to the use of the built-in etcd? We have the following configuration in our apisix-ingress-controller Helm chart values.yaml:
config:
...
etcdserver:
enabled: true
image:
repository: docker.io/apache/apisix
tag: 3.9.1-debian
We are curious whether it would be worth switching from the built-in etcd to a standalone, full-fledged etcd cluster for better isolation, in case this turns out to be an etcd-related issue.
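Whichever etcd is used, it may be worth confirming, at the moment stale routing is observed, that the affected APISIX pod can still reach the configured etcd endpoint at all. A rough sketch; the namespace, pod name and etcd address are placeholders, and curl is assumed to be available in the image:

```bash
# From the affected pod, hit the configured etcd endpoint's /version route,
# which APISIX itself calls at startup. All names here are placeholders.
kubectl -n ingress-apisix exec apisix-7c9f6d5b8-abcde -- \
  curl -s http://apisix-etcd:2379/version
```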
Regarding the number of routes used for testing: on our testing cluster, where we hit this issue roughly once a week, we currently have 11 apisixroutes.apisix.apache.org CRDs (production clusters have a much higher number) with a total of 270 backends, in a form like the following (some of them have more complex match stanzas, some also use regex_uri, etc.):
- backends:
- serviceName: xxx-service
servicePort: 9040
match:
exprs:
- op: RegexMatch
subject:
scope: Path
value: /api/v\d+/schemas/result
hosts:
- xxx.yyy.com
paths:
- /api/*
name: asdasdasdasd-some-random-name
plugins:
- config:
http_to_https: true
enable: true
name: redirect
priority: 999
timeout:
read: 182s
...
- backends:
- serviceName: xxx-yyy
servicePort: 9092
match:
hosts:
- xxx.yyy.com
paths:
- /analyze
- /components
- /dashboards
- /modeler
- /metrics
- /
- /*
name: some-random-name
plugins:
- config:
http_to_https: true
enable: true
name: redirect
- config:
regex_uri:
- ^/(analyze|components|dashboards|metrics|modeler)$
- /$1/
enable: true
name: proxy-rewrite
priority: 101
timeout:
read: 182s
Regarding whether this is related to use of the built-in etcd: the prerequisite for this issue is that APISIX has been running for a relatively long time (I tested for 5+ days). After updating the backend, the issue is consistently reproducible, and after reloading APISIX the IPs are up to date again.
By looking at the info logs, you can clearly see the call path when a healthy APISIX pod picks up a change:

2024/10/15 02:56:15 [info] 53#53: 92955852 [lua] config_etcd.lua:202: res_func: {
2024/10/15 02:56:15 [info] 53#53: 92955852 [lua] config_etcd.lua:120: produce_res(): append res: {
2024/10/15 02:56:15 [info] 53#53: 182428634 [lua] config_etcd.lua:414: http_waitdir(): http_waitdir: {
2024/10/15 02:56:15 [info] 53#53: 182428634 [lua] config_etcd.lua:626: sync_data(): waitdir key: /apisix/upstreams prev_index: 31721, context: ngx.timer
2024/10/15 02:56:15 [info] 53#53: 182428634 [lua] config_etcd.lua:627: sync_data(): res: {"headers":{"X-Etcd-Index":"31721"},"body":{"node":[{"modifiedIndex":31721,"value":{"labels":{"managed-by":"apisix-ingress-controller"},"desc":"Created by apisix-ingress-controller, DO NOT modify it manually","scheme":"https","timeout":{"connect":120,"send":1200,"read":1200},"type":"roundrobin","name":"cls-180l5ec4_apiserver-token_443","pass_host":"pass","nodes":[{"host":"11.128.57.47","port":28438,"weight":100}],"id":"fc7a9e3b"},"key":"/apisix/upstreams/fc7a9e3b","createdIndex":8238}]}}, err: nil, context: ngx.timer
2024/10/15 02:56:15 [info] 53#53: 182428634 [lua] config_etcd.lua:711: sync_data(): update data by key: fc7a9e3b, context: ngx.timer

An abnormal pod looks like this:

2024/10/15 02:56:05 [info] 54#54: 183282442 [lua] config_etcd.lua:202: res_func: {
2024/10/15 02:56:05 [info] 54#54: 183282442 [lua] config_etcd.lua:120: produce_res(): append res: {
2024/10/15 02:56:11 [info] 54#54: 183282442 [lua] config_etcd.lua:202: res_func: {
2024/10/15 02:56:11 [info] 54#54: 183282442 [lua] config_etcd.lua:120: produce_res(): append res: {
2024/10/15 02:56:11 [info] 54#54: 183282442 [lua] config_etcd.lua:202: res_func: {
2024/10/15 02:56:11 [info] 54#54: 183282442 [lua] config_etcd.lua:120: produce_res(): append res: {
2024/10/15 02:56:12 [info] 54#54: 183282442 [lua] config_etcd.lua:202: res_func: {
2024/10/15 02:56:12 [info] 54#54: 183282442 [lua] config_etcd.lua:120: produce_res(): append res: {

You can see that sync_data never responds.
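These info-level lines come from running with `nginx_config.error_log_level: "info"`. A rough sketch for spotting a worker stuck in this state, by counting the `config_etcd.lua` activity per worker PID (the pod name is a placeholder): healthy workers keep emitting `sync_data(): waitdir key` lines, while a stuck one only shows `res_func`/`produce_res`:

```bash
# Group recent etcd-watch log lines by worker PID and function name; a worker
# with no sync_data() entries is likely stuck. Pod name is a placeholder.
kubectl logs apisix-7c9f6d5b8-abcde --since=10m \
  | grep 'config_etcd.lua' \
  | awk '{print $4, $8}' | sort | uniq -c
```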
Current Behavior
I am currently using APISIX and Nacos in Kubernetes, with APISIX service discovery configured to use Nacos. However, after a pod is updated and restarted in K8s, APISIX still retrieves the old pod IP, resulting in a 503 error on access. The issue is resolved by restarting APISIX, and it is currently not reproducible on demand.
Expected Behavior
No response
Error Logs
[error] 45#45: *59314180 upstream timed out (110: Operation timed out) while connecting to upstream, client: xx.xx.xx.xx, server: _, request: "GET /micro-user/system HTTP/1.1", upstream: "http://172.17.97.37:18081/micro-user/system/", host: "https://www.test.com", referrer: "https://www.test.com/"
Steps to Reproduce
Only one step: update an image in a Kubernetes deployment.
Environment
- APISIX version (run `apisix version`): 2.14.2
- OS (run `uname -a`): Linux apisix-5f5bc75b47-dp2cb 5.10.134-15.1.2.lifsea8.x86_64 #1 SMP Tue Aug 29 07:26:14 UTC 2023 x86_64 Linux
- OpenResty / Nginx version (run `openresty -V` or `nginx -V`): openresty/1.19.9.1
- etcd version, if relevant (run `curl http://127.0.0.1:9090/v1/server_info`):
- LuaRocks version (run `luarocks --version`):