Going into a little high-level analysis, we know the error comes from here:
https://github.com/Kong/kong/blob/master/kong/plugins/oauth2/access.lua, lines 56-71:
  local token, err = singletons.dao.oauth2_tokens:insert({
    api_id = api_id,
    credential_id = credential.id,
    authenticated_userid = authenticated_userid,
    expires_in = token_expiration,
    refresh_token = refresh_token,
    scope = scope
  }, {ttl = token_expiration > 0 and refresh_token_ttl or nil}) -- Access tokens (and their associated refresh token) are being
                                                                -- permanently deleted after 'refresh_token_ttl' seconds

  if err then
    return responses.send_HTTP_INTERNAL_SERVER_ERROR(err)
  end

  return {
    access_token = token.access_token,
    token_type = "bearer",
    expires_in = token_expiration > 0 and token.expires_in or nil,
    refresh_token = refresh_token,
    state = state -- If state is nil, this value won't be added
  }
end
The singletons.dao.oauth2_tokens:insert method is somehow leaving us with a nil token without even setting the err value: we reach the lower return block, and indexing nil (token.access_token) gives us the "attempt to index a nil value" error shown in Ross's original report.
I assume this is due to stress between Cassandra and Kong, and Kong is not handling errors as gracefully as it could. As a patch job, running at 50 TPS against the OAuth endpoint, I do plan on trying this band-aid, because I am not sure right now how to fix the root cause in the lower-level Cassandra/Lua land in Kong (the below is pseudocode of my idea):
local function token_db_retry(api_id, credential_id, authenticated_userid, scope, state, token_expiration, refresh_token, refresh_token_ttl)
  ngx.sleep(1) -- Sleep 1 second to give C*/Kong time to reprocess the few tokens that errored out.
  -- Code here will retry the singletons.dao.oauth2_tokens:insert method with newer values
  return singletons.dao.oauth2_tokens:insert(...)
end
local token, err = singletons.dao.oauth2_tokens:insert({
  api_id = api_id,
  credential_id = credential.id,
  authenticated_userid = authenticated_userid,
  expires_in = token_expiration,
  refresh_token = refresh_token,
  scope = scope
}, {ttl = token_expiration > 0 and refresh_token_ttl or nil})

if token == nil then
  -- Counter here to prevent an infinite retry loop in the event of problems, maybe 5 attempts or so.
  -- We don't want to be forced to return an HTTP_INTERNAL_SERVER_ERROR if it can be helped, because
  -- forcing your consumers to add OAuth2 retry logic is pretty bad implementation IMO.
  token, err = token_db_retry()...
  -- if token ~= nil then break, because success
end
if err then
  return responses.send_HTTP_INTERNAL_SERVER_ERROR(err)
end

return {
  access_token = token.access_token,
  token_type = "bearer",
  expires_in = token_expiration > 0 and token.expires_in or nil,
  refresh_token = refresh_token,
  state = state -- If state is nil, this value won't be added
}
Note that at 50 TPS we were seeing 2-5 failures over a span of 3 seconds or so. Ultimately we would like to see this handle OAuth token creation at around 1000+ TPS, honestly. And yes, my patch-job idea is terrible implementation in general, but it may be a temporary patch for us until the root cause can be addressed.
-Jeremy
Update: will be testing it with this additional retry logic today!
local url = require "socket.url"
local utils = require "kong.tools.utils"
local responses = require "kong.tools.responses"
local constants = require "kong.constants"
local timestamp = require "kong.tools.timestamp"
local singletons = require "kong.singletons"
local public_utils = require "kong.tools.public"
local string_find = string.find
local req_get_headers = ngx.req.get_headers
local ngx_set_header = ngx.req.set_header
local ngx_req_get_method = ngx.req.get_method
local ngx_req_get_uri_args = ngx.req.get_uri_args
local check_https = utils.check_https
local _M = {}
local CONTENT_LENGTH = "content-length"
local CONTENT_TYPE = "content-type"
local RESPONSE_TYPE = "response_type"
local STATE = "state"
local CODE = "code"
local TOKEN = "token"
local REFRESH_TOKEN = "refresh_token"
local SCOPE = "scope"
local CLIENT_ID = "client_id"
local CLIENT_SECRET = "client_secret"
local REDIRECT_URI = "redirect_uri"
local ACCESS_TOKEN = "access_token"
local GRANT_TYPE = "grant_type"
local GRANT_AUTHORIZATION_CODE = "authorization_code"
local GRANT_CLIENT_CREDENTIALS = "client_credentials"
local GRANT_REFRESH_TOKEN = "refresh_token"
local GRANT_PASSWORD = "password"
local ERROR = "error"
local AUTHENTICATED_USERID = "authenticated_userid"
local function token_db_retry(api_id, credential_id, authenticated_userid, token_expiration, refresh_token, scope, refresh_token_ttl)
  ngx.sleep(0.500) -- Sleep 500 milliseconds before retrying the insert
  return singletons.dao.oauth2_tokens:insert({
    api_id = api_id,
    credential_id = credential_id,
    authenticated_userid = authenticated_userid,
    expires_in = token_expiration,
    refresh_token = refresh_token,
    scope = scope
  }, {ttl = token_expiration > 0 and refresh_token_ttl or nil})
end
local function generate_token(conf, api, credential, authenticated_userid, scope, state, expiration, disable_refresh)
  local token_expiration = expiration or conf.token_expiration

  local refresh_token
  if not disable_refresh and token_expiration > 0 then
    refresh_token = utils.random_string()
  end

  local refresh_token_ttl
  if conf.refresh_token_ttl and conf.refresh_token_ttl > 0 then
    refresh_token_ttl = conf.refresh_token_ttl
  end

  local api_id
  if not conf.global_credentials then
    api_id = api.id
  end

  local token, err = singletons.dao.oauth2_tokens:insert({
    api_id = api_id,
    credential_id = credential.id,
    authenticated_userid = authenticated_userid,
    expires_in = token_expiration,
    refresh_token = refresh_token,
    scope = scope
  }, {ttl = token_expiration > 0 and refresh_token_ttl or nil}) -- Access tokens (and their associated refresh token) are being
                                                                -- permanently deleted after 'refresh_token_ttl' seconds

  -- New retry logic in the event of failure
  if token == nil then
    -- Temporary logs for debugging purposes
    ngx.log(ngx.ERR, "Token DAO Insert Failed First time!: ", err)
    local i = 0
    while i < 3 do
      token, err = token_db_retry(api_id, credential.id, authenticated_userid, token_expiration, refresh_token, scope, refresh_token_ttl)
      if token ~= nil then break end
      -- Temporary logs for debugging purposes
      ngx.log(ngx.ERR, "Token DAO Insert Failed Again!: ", err)
      i = i + 1
    end
  end
  -- end of additional retry logic

  if err then
    return responses.send_HTTP_INTERNAL_SERVER_ERROR(err)
  end

  return {
    access_token = token.access_token,
    token_type = "bearer",
    expires_in = token_expiration > 0 and token.expires_in or nil,
    refresh_token = refresh_token,
    state = state -- If state is nil, this value won't be added
  }
end
Will come back and post our results for this patch-job band-aid attempt.
Following up on the above, since I said I would release initial results after band-aid testing:
Load-test result screenshots (omitted here) were posted for: no patch at 5 users; with the patch at 5, 10, 20, 40, and 80 users.
Overall I am very pleased with my band-aid's performance, and now consumers will not be impacted by this issue! I am still holding out hope you Kong pros will figure out the root cause and salvage the original source code for resiliency though :p .
Anyone else hitting this in testing, feel free to use the above source code in your access.lua file as a workaround for now!
P.S. - The err value is nil when this problem is happening too, so oauth2_tokens:insert() must not be responding as expected. Thought I would throw that in for reference.
Thanks, -Jeremy
@thibaultcha ever get a chance to attempt some load testing on the OAuth 2.0 plugin, with a single OAuth 2.0 proxy given global permissions for client-credential generation and a Cassandra cluster as the db? I would expect the HTTP 500 response errors to be reproducible. We are still using my retry patchwork logic, which I see as only a band-aid over the underlying root cause, today on the latest Kong 0.13.x too.
@jeremyjpj0916 No, never. I do not focus on the OAuth2 plugin. It seems like it could use some hardening.
@thibaultcha Gotcha, thanks for the honesty lol. I will dig a bit further and see if I can find the real root cause shortly.
We were running this fix in our Kong instance, as we had issues similar to this bug report. When we ran under load, we still noticed the Token DAO Insert Failed First time! error followed by a couple of Token DAO Insert Failed Again! errors. The time for each of these failures was 1.5 seconds total, which equates to the 0.5-second wait executing 3 times.
What seemed to be happening on our system was that the singletons.dao.oauth2_tokens:insert call successfully did the INSERT of the token, but then immediately tried to read it back out of the database. With Cassandra's eventual consistency this error makes sense. The retry logic was then trying to insert the token again, which failed before it ever tried to read it out of the database. I don't have the exact error messages, but it was something like a duplicate key, since we are reusing the same ids for the retry logic.
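To illustrate the pattern being described, here is a simplified, hypothetical sketch against the lua-cassandra driver that Kong uses (this is not Kong's actual DAO code): an insert followed by an immediate read-back, which at consistency ONE can miss the row that was just written and yield the (nil, nil) result seen in generate_token above.
local cassandra = require "cassandra"

local function insert_then_read(session, token_id, access_token)
  -- the INSERT is acknowledged as soon as a single replica accepts it (consistency ONE)
  local res, err = session:execute(
    "INSERT INTO oauth2_tokens(id, access_token) VALUES(?, ?)",
    { cassandra.uuid(token_id), access_token })
  if not res then
    return nil, err
  end

  -- the immediate SELECT may be routed to a replica that has not received the
  -- row yet, so rows[1] can be nil while the error is also nil
  local rows, select_err = session:execute(
    "SELECT * FROM oauth2_tokens WHERE id = ?",
    { cassandra.uuid(token_id) })
  if not rows then
    return nil, select_err
  end
  return rows[1]
end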
Our solution was to change the consistency level using cassandra_consistency from the default of ONE to LOCAL_QUORUM. We did this because we have multiple data centers and 3 nodes per data center. Once this change was made, we went from a 6% error rate under load to 0% errors when generating tokens.
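For reference, that is the same knob that appears in the environment dump at the bottom of this issue; a minimal sketch of the change in kong.conf or environment-variable form:
# kong.conf
cassandra_consistency = LOCAL_QUORUM

# or as an environment variable (the form used elsewhere in this thread)
KONG_CASSANDRA_CONSISTENCY: LOCAL_QUORUM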
@charlie-retzler thanks for the input; what you are saying is starting to make sense to me, because I too run a 6-node C* cluster across 2 DCs with a consistency of ONE and a replication factor of TWO. The highest load I ever reached with my patch test logic was only 363 TPS, so I am guessing your load tests were at an even larger scale! I may give this a shot. LOCAL_QUORUM essentially enforces that a quorum of the local DC's C* nodes acknowledge the transaction prior to completion, right? Then it makes sense that an immediate read-back would work at that point, because it forces local consistency. Did you notice any increase in overall latency switching from ONE to LOCAL_QUORUM? I would think that when it tries to "read" the token out of the database it would use the same node the transaction just placed the token onto, but I guess Kong jumps to the next local node quickly and that causes the problem. Ultimately it sounds like Kong could get more specific in its implementation depending on C* settings (which I am not really a fan of, because that adds a lot of complication), but the fix with Kong's client cassandra_consistency seems easiest so Kong's current code plays nice with it 👍 . Thanks again!
Confirmed LOCAL_QUORUM was the fix. I also noticed my Admin API calls would sometimes randomly HTTP 500 on resource creations (where Kong reports a failure but the underlying resource was actually created). @rsbrisci I think we can close this git issue; I'm not sure how Kong intends to compensate for consistency ONE vs., say, LOCAL_QUORUM. I think Kong should probably document that LOCAL_QUORUM will yield the most stable form of Kong, otherwise you will hit things like the OAuth2 problems plus the Admin API performing at about a 90-95% success rate.
Thanks for the reports @charlie-retzler and @jeremyjpj0916! This probably does warrant a few steps:
@thibaultcha Another possible, but much more global, change would be to alter the implementation of _M:insert in cassandra.lua so that it does not do an insert and then a find against a database that uses eventual consistency. If all the needed data were present in the insert, it could return the object it inserted instead of doing the find.
I didn't look into this a ton, but we would need to change the models to generate their own IDs instead of having them added by _M.dao_insert_values. This would still probably require LOCAL_QUORUM to handle subsequent Admin API calls if they happen very close to the time of insertion, but it would reduce overhead for inserts.
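A rough sketch of that idea (hypothetical helper and column list, not Kong's real cassandra.lua), assuming kong.tools.utils and the lua-cassandra driver:
local utils = require "kong.tools.utils"
local cassandra = require "cassandra"

local function insert_and_echo(session, row)
  -- the model generates its own id client-side instead of relying on _M.dao_insert_values
  row.id = row.id or utils.uuid()

  local res, err = session:execute(
    "INSERT INTO oauth2_tokens(id, access_token, expires_in) VALUES(?, ?, ?)",
    { cassandra.uuid(row.id), row.access_token, row.expires_in })
  if not res then
    return nil, err
  end

  -- return the values we already hold; no follow-up SELECT against an
  -- eventually-consistent store
  return row
end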
@charlie-retzler We ran into an issue today where 1 of our nodes in a 3-node local DC (of 2 data centers, so 6 nodes total make up the full C* cluster) went down in Stage. You will find that running LOCAL_QUORUM will cause that cluster to start throwing unable-to-achieve-consistency errors. I am most likely reverting back to my patch logic with ONE, simply because we can't have one C* node going down impact our gateway; just wanted to throw it out as an FYI in case you ever hit that situation. I found the error strange too, though, because I would have thought that with 2/3 nodes available in a DC, Kong would have been able to sustain no impact.
We were running these settings at the time of impact:
KONG_CASSANDRA_CONSISTENCY: LOCAL_QUORUM
KONG_CASSANDRA_LB_POLICY: RequestDCAwareRoundRobin
Update: we worked out a plan in the forum; we are changing the RF to 3, so all nodes share the data and we can then continue using LOCAL_QUORUM safely. Before, we were running a replication factor of 2.
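For reference, the quorum arithmetic behind that change (standard Cassandra semantics):
LOCAL_QUORUM per DC = floor(RF_local / 2) + 1
RF_local = 2  ->  quorum = 2  (both local replicas must answer; a single node down can fail requests)
RF_local = 3  ->  quorum = 2  (one local replica can be down and LOCAL_QUORUM still succeeds)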
@jeremyjpj0916 Out of curiosity, what tool do you use for the performance tests in this comment? https://github.com/Kong/kong/issues/3182#issuecomment-362890351
@zyxep The tool from the screenshots that was sending the load and has a table of info was SoapUI; not a tool I generally recommend (bigger fan of Postman/Insomnia), but for quick-and-dirty load tests that also want a GUI, SoapUI is kinda nice. The charting of 200s vs. 500s etc. was because we log Kong transactions to Splunk, so analytics engines can see real-time performance and HTTP status over time with a time-chart, etc.
Sorry for the late reply, just now seeing your comment haha.
Summary
When we generate tokens with the same credentials at high TPS (>50), we see intermittent HTTP 500 responses with
{"message":"An unexpected error occurred"}
in the payload. In the logs, we see the stack trace included below.
Steps To Reproduce
Additional Details & Logs
2018/01/26 21:12:30 [debug] 40#0: *2120 [lua] base_plugin.lua:20: rewrite(): executing plugin "rate-limiting": rewrite
2018/01/26 21:12:30 [debug] 46#0: *2114 [lua] base_plugin.lua:20: rewrite(): executing plugin "rate-limiting": rewrite
2018/01/26 21:12:30 [debug] 46#0: *2114 [lua] base_plugin.lua:20: rewrite(): executing plugin "kong-upstream-jwt": rewrite
2018/01/26 21:12:30 [debug] 46#0: *2114 [lua] base_plugin.lua:20: rewrite(): executing plugin "custom-kong-http-log-plugin": rewrite
2018/01/26 21:12:30 [debug] 40#0: *2120 [lua] base_plugin.lua:20: rewrite(): executing plugin "kong-upstream-jwt": rewrite
2018/01/26 21:12:30 [debug] 46#0: *2114 [lua] base_plugin.lua:20: rewrite(): executing plugin "statsd": rewrite
2018/01/26 21:12:30 [debug] 46#0: *2114 [lua] base_plugin.lua:20: rewrite(): executing plugin "correlation-id": rewrite
2018/01/26 21:12:30 [debug] 40#0: *2120 [lua] base_plugin.lua:20: rewrite(): executing plugin "custom-kong-http-log-plugin": rewrite
2018/01/26 21:12:30 [debug] 40#0: *2120 [lua] base_plugin.lua:20: rewrite(): executing plugin "statsd": rewrite
2018/01/26 21:12:30 [debug] 40#0: *2120 [lua] base_plugin.lua:20: rewrite(): executing plugin "correlation-id": rewrite
2018/01/26 21:12:30 [debug] 46#0: *2114 [lua] base_plugin.lua:24: access(): executing plugin "oauth2": access
2018/01/26 21:12:30 [debug] 40#0: *2120 [lua] base_plugin.lua:24: access(): executing plugin "oauth2": access
2018/01/26 21:12:30 [debug] 46#0: *2114 [lua] cluster.lua:428: next_coordinator(): [lua-cassandra] load balancing policy chose host at 10.86.36.142
2018/01/26 21:12:30 [info] 46#0: *2096 client 10.1.49.1 closed keepalive connection (104: Connection reset by peer)
2018/01/26 21:12:30 [error] 46#0: *2113 [lua] handler.lua:104: [events] missing entity in crud subscriber, client: 10.1.51.1, server: kong, request: "POST /auth/oauth2/token HTTP/1.1", host: "gateway-dev-core-elr.mydomain.com"
2018/01/26 21:12:30 [error] 46#0: *2113 lua coroutine: runtime error: /usr/local/share/lua/5.1/kong/plugins/oauth2/access.lua:71: attempt to index local 'token' (a nil value)
stack traceback:
coroutine 0:
/usr/local/share/lua/5.1/kong/plugins/oauth2/access.lua: in function 'generate_token'
/usr/local/share/lua/5.1/kong/plugins/oauth2/access.lua:338: in function 'execute'
/usr/local/share/lua/5.1/kong/plugins/oauth2/handler.lua:12: in function </usr/local/share/lua/5.1/kong/plugins/oauth2/handler.lua:10>
coroutine 1:
[C]: in function 'resume'
coroutine.wrap:21: in function
/usr/local/share/lua/5.1/kong/init.lua:374: in function 'access'
access_by_lua(nginx-kong.conf:92):2: in function <access_by_lua(nginx-kong.conf:92):1>, client: 10.1.51.1, server: kong, request: "POST /auth/oauth2/token HTTP/1.1", host: "gateway-dev-core-elr.mydomain.com"
2018/01/26 21:12:30 [error] 46#0: *2113 [lua] responses.lua:121: access(): /usr/local/share/lua/5.1/kong/plugins/oauth2/access.lua:71: attempt to index local 'token' (a nil value), client: 10.1.51.1, server: kong, request: "POST /auth/oauth2/token HTTP/1.1", host: "gateway-dev-core-elr.custom.com"
{ "created_at": 1511816567681, "strip_uri": true, "id": "e39d742e-3cec-4bdf-b1a1-c8c8cc53cbf8", "name": "Ross_Test_Proxy", "upstream_url": "https://google.com", "http_if_terminated": false, "https_only": false, "retries": 5, "uris": [ "/ross/test" ], "upstream_send_timeout": 6000, "upstream_connect_timeout": 6000, "upstream_read_timeout": 6000, "methods": {}, "preserve_host": false, "health_checks": { "createdUser": null, "updatedUser": null, "id": 3, "api_id": "e39d742e-3cec-4bdf-b1a1-c8c8cc53cbf8", "api": { "created_at": 1511816567681, "strip_uri": true, "id": "e39d742e-3cec-4bdf-b1a1-c8c8cc53cbf8", "name": "Ross_Test_Proxy", "upstream_url": "http://secrethost.com:8000", "http_if_terminated": false, "https_only": false, "retries": 5, "uris": [ "/ross/test" ], "upstream_send_timeout": 6000, "upstream_connect_timeout": 6000, "upstream_read_timeout": 6000, "methods": "", "preserve_host": false }, "health_check_endpoint": "", "notification_endpoint": "", "active": false, "data": null, "createdAt": "2017-11-30T05:25:14.000Z", "updatedAt": "2017-11-30T05:25:14.000Z" } }
{ "total": 3, "data": [ { "created_at": 1516723644052, "config": { "refresh_token_ttl": 120 9600, "token_expiration": 7200, "mandatory_scope": false, "hide_credentials": false, "a ccept_http_if_already_terminated": false, "enable_implicit_grant": false, "global_cr edentials": true, "provision_key": "function", "enable_password_grant": false, "enable _authorization_code": false, "anonymous": "", "enable_client_credentials": true, "auth _header_name": "authorization" }, "id": "98e1ac1a-9c34-4c51-b3e9-31c95f1a8c86", "enab led": false, "name": "oauth2", "api_id": "e39d742e-3cec-4bdf-b1a1-c8c8cc53cbf8" }, { "cr eated_at": 1516824832470, "config": {}, "id": "f0b76aff-d7f7-4cbd-9df4-199e8b8b2870", "enabled": true, "name": "stdout-log", "api_id": "e39d742e-3cec-4bdf-b1a1-c8c8cc53cbf 8" }, { "created_at": 1516723790418, "config": { "whitelist": [ "ross_oauth_group" ] }, "id": "5d12721f-46d8-49f8-882c-094cee9ac935", "enabled": false, "name": "acl", "api_id": "e 39d742e-3cec-4bdf-b1a1-c8c8cc53cbf8" } ] }
KONG_LOG_LEVEL | debug
KONG_PROXY_ACCESS_LOG | off
KONG_ADMIN_ACCESS_LOG | off
KONG_PROXY_ERROR_LOG | /dev/stderr
KONG_ADMIN_ERROR_LOG | /dev/stderr
KONG_ANONYMOUS_REPORTS | off
KONG_PROXY_LISTEN | 0.0.0.0:8000
KONG_PROXY_LISTEN_SSL | 0.0.0.0:8443
KONG_ADMIN_LISTEN | 0.0.0.0:8001
KONG_ADMIN_LISTEN_SSL | 0.0.0.0:8444
KONG_MEM_CACHE_SIZE | 256m
KONG_SSL_CERT | /usr/local/kong/ssl/kongcert.crt
KONG_SSL_CERT_KEY | /usr/local/kong/ssl/kongprivatekey.key
KONG_HTTP2 | on
KONG_UPSTREAM_KEEPALIVE | 60
KONG_SERVER_TOKENS | off
KONG_LATENCY_TOKENS | on
KONG_REAL_IP_HEADER | X-Real-IP
KONG_REAL_IP_RECURSIVE | off
KONG_CLIENT_MAX_BODY_SIZE | 0
KONG_CLIENT_BODY_BUFFER_SIZE | 5m
KONG_ERROR_DEFAULT_TYPE | text/plain
KONG_DATABASE | cassandra
KONG_CASSANDRA_PORT | 9042
KONG_CASSANDRA_KEYSPACE | kong
KONG_CASSANDRA_TIMEOUT | 5000
KONG_CASSANDRA_SSL | on
KONG_CASSANDRA_SSL_VERIFY | on
KONG_CASSANDRA_USERNAME | ...
KONG_CASSANDRA_PASSWORD | ...
KONG_CASSANDRA_CONSISTENCY | ONE
KONG_CASSANDRA_LB_POLICY | RoundRobin
KONG_DB_UPDATE_FREQUENCY | 5
KONG_DB_UPDATE_PROPAGATION | 5
KONG_DB_CACHE_TTL | 10800
KONG_DNS_STALE_TTL | 4
KONG_DNS_NOT_FOUND_TTL | 30
KONG_DNS_ERROR_TTL | 1
KONG_DNS_NO_SYNC | off
KONG_LUA_SSL_TRUSTED_CERTIFICATE | /usr/local/kong/ssl/kongcert.pem
KONG_LUA_SSL_VERIFY_DEPTH | 2
KONG_LUA_SOCKET_POOL_SIZE | 30
KONG_CASSANDRA_CONTACT_POINTS | 10.86.36.142,10.204.96.124,10.204.96.125,10.204.94.124,10.204.90.125
KONG_DNS_ORDER | LAST,SRV,A,CNAME
KONG_CUSTOM_PLUGINS | kong-upstream-jwt,stdout-log,custom-kong-http-log-plugin