We had a DNS blip overnight during which the pdp service couldn't resolve the address of (and thus couldn't connect to) its required database. Unfortunately, this crashed the service. We should add some exception handling so that the service returns an HTTP 503 when the database is unavailable. Logs to follow.
2020-07-15 23:46:55 [7] [CRITICAL] WORKER TIMEOUT (pid:11)
2020-07-15 23:46:55 [7] [CRITICAL] WORKER TIMEOUT (pid:11)
2020-07-15 23:46:55 [7] [CRITICAL] WORKER TIMEOUT (pid:23)
2020-07-15 23:46:55 [7] [CRITICAL] WORKER TIMEOUT (pid:23)
2020-07-15 23:46:56 [28] [INFO] Booting worker with pid: 28
2020-07-15 23:46:56 [28] [INFO] Booting worker with pid: 28
2020-07-15 23:46:56 [29] [INFO] Booting worker with pid: 29
2020-07-15 23:46:56 [29] [INFO] Booting worker with pid: 29
2020-07-15 23:47:11 [28] [ERROR] Exception in worker process
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/gunicorn/arbiter.py", line 583, in spawn_worker
worker.init_process()
File "/usr/local/lib/python2.7/dist-packages/gunicorn/workers/base.py", line 129, in init_process
self.load_wsgi()
File "/usr/local/lib/python2.7/dist-packages/gunicorn/workers/base.py", line 138, in load_wsgi
self.wsgi = self.app.wsgi()
File "/usr/local/lib/python2.7/dist-packages/gunicorn/app/base.py", line 67, in wsgi
self.callable = self.load()
File "/usr/local/lib/python2.7/dist-packages/gunicorn/app/wsgiapp.py", line 52, in load
return self.load_wsgiapp()
File "/usr/local/lib/python2.7/dist-packages/gunicorn/app/wsgiapp.py", line 41, in load_wsgiapp
return util.import_app(self.app_uri)
File "/usr/local/lib/python2.7/dist-packages/gunicorn/util.py", line 350, in import_app
__import__(module)
File "/root/pdp/pdp/wsgi.py", line 12, in <module>
use_analytics=global_config['use_analytics']
File "/root/pdp/pdp/main.py", line 40, in initialize_frontend
for app in apps
File "/root/pdp/pdp/main.py", line 40, in <dictcomp>
for app in apps
File "/root/pdp/pdp/portals/bc_prism.py", line 42, in mk_frontend
'js/prism_demo_app.js'])
File "/root/pdp/pdp/portals/__init__.py", line 57, in make_raster_frontend
conf = raster_conf(dsn, config, ensemble_name, url_base)
File "/root/pdp/pdp/portals/__init__.py", line 30, in raster_conf
root_url=root_url
File "/usr/local/lib/python2.7/dist-packages/pdp_util/raster.py", line 224, in db_raster_configurator
files = ensemble_files(session, ensemble)
File "/usr/local/lib/python2.7/dist-packages/pdp_util/raster.py", line 236, in ensemble_files
return { row.unique_id: row.filename for row in q }
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/query.py", line 2855, in __iter__
return self._execute_and_instances(context)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/query.py", line 2876, in _execute_and_instances
close_with_result=True)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/query.py", line 2885, in _get_bind_args
**kw
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/query.py", line 2867, in _connection_from_session
conn = self.session.connection(**kw)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/session.py", line 1013, in connection
execution_options=execution_options)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/session.py", line 1018, in _connection_for_bind
engine, execution_options)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/session.py", line 403, in _connection_for_bind
conn = bind.contextual_connect()
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/engine/base.py", line 2112, in contextual_connect
self._wrap_pool_connect(self.pool.connect, None),
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/engine/base.py", line 2151, in _wrap_pool_connect
e, dialect, self)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/engine/base.py", line 1465, in _handle_dbapi_exception_noconnection
exc_info
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/util/compat.py", line 203, in raise_from_cause
reraise(type(exception), exception, tb=exc_tb, cause=cause)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/engine/base.py", line 2147, in _wrap_pool_connect
return fn()
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/pool.py", line 387, in connect
return _ConnectionFairy._checkout(self)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/pool.py", line 766, in _checkout
fairy = _ConnectionRecord.checkout(pool)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/pool.py", line 516, in checkout
rec = pool._do_get()
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/pool.py", line 1138, in _do_get
self._dec_overflow()
File "/usr/local/lib/python2.7/dist-packages/sqlal… [traceback truncated in the captured log]
2020-07-15 23:47:11 [29] [ERROR] Exception in worker process
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/gunicorn/arbiter.py", line 583, in spawn_worker
worker.init_process()
File "/usr/local/lib/python2.7/dist-packages/gunicorn/workers/base.py", line 129, in init_process
self.load_wsgi()
File "/usr/local/lib/python2.7/dist-packages/gunicorn/workers/base.py", line 138, in load_wsgi
self.wsgi = self.app.wsgi()
File "/usr/local/lib/python2.7/dist-packages/gunicorn/app/base.py", line 67, in wsgi
self.callable = self.load()
File "/usr/local/lib/python2.7/dist-packages/gunicorn/app/wsgiapp.py", line 52, in load
return self.load_wsgiapp()
File "/usr/local/lib/python2.7/dist-packages/gunicorn/app/wsgiapp.py", line 41, in load_wsgiapp
return util.import_app(self.app_uri)
File "/usr/local/lib/python2.7/dist-packages/gunicorn/util.py", line 350, in import_app
__import__(module)
File "/root/pdp/pdp/wsgi.py", line 12, in <module>
use_analytics=global_config['use_analytics']
File "/root/pdp/pdp/main.py", line 40, in initialize_frontend
for app in apps
File "/root/pdp/pdp/main.py", line 40, in <dictcomp>
for app in apps
File "/root/pdp/pdp/portals/bc_prism.py", line 42, in mk_frontend
'js/prism_demo_app.js'])
File "/root/pdp/pdp/portals/__init__.py", line 57, in make_raster_frontend
conf = raster_conf(dsn, config, ensemble_name, url_base)
File "/root/pdp/pdp/portals/__init__.py", line 30, in raster_conf
root_url=root_url
File "/usr/local/lib/python2.7/dist-packages/pdp_util/raster.py", line 224, in db_raster_configurator
files = ensemble_files(session, ensemble)
File "/usr/local/lib/python2.7/dist-packages/pdp_util/raster.py", line 236, in ensemble_files
return { row.unique_id: row.filename for row in q }
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/query.py", line 2855, in __iter__
return self._execute_and_instances(context)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/query.py", line 2876, in _execute_and_instances
close_with_result=True)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/query.py", line 2885, in _get_bind_args
**kw
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/query.py", line 2867, in _connection_from_session
conn = self.session.connection(**kw)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/session.py", line 1013, in connection
execution_options=execution_options)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/session.py", line 1018, in _connection_for_bind
engine, execution_options)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/orm/session.py", line 403, in _connection_for_bind
conn = bind.contextual_connect()
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/engine/base.py", line 2112, in contextual_connect
self._wrap_pool_connect(self.pool.connect, None),
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/engine/base.py", line 2151, in _wrap_pool_connect
e, dialect, self)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/engine/base.py", line 1465, in _handle_dbapi_exception_noconnection
exc_info
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/util/compat.py", line 203, in raise_from_cause
reraise(type(exception), exception, tb=exc_tb, cause=cause)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/engine/base.py", line 2147, in _wrap_pool_connect
return fn()
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/pool.py", line 387, in connect
return _ConnectionFairy._checkout(self)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/pool.py", line 766, in _checkout
fairy = _ConnectionRecord.checkout(pool)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/pool.py", line 516, in checkout
rec = pool._do_get()
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/pool.py", line 1138, in _do_get
self._dec_overflow()
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/util/langhelpers.py", line 66, in __exit__
compat.reraise(exc_type, exc_value, exc_tb)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/pool.py", line 1135, in _do_get
return self._create_connection()
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/pool.py", line 333, in _create_connection
return _ConnectionRecord(self)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/pool.py", line 461, in __init__
self.__connect(first_connect_check=True)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/pool.py", line 651, in __connect
connection = pool._invoke_creator(self)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/engine/strategies.py", line 105, in connect
return dialect.connect(*cargs, **cparams)
File "/usr/local/lib/python2.7/dist-packages/sqlalchemy/engine/default.py", line 393, in connect
return self.dbapi.connect(*cargs, **cparams)
File "/usr/local/lib/python2.7/dist-packages/psycopg2/__init__.py", line 130, in connect
conn = _connect(dsn, connection_factory=connection_factory, **kwasync)
OperationalError: (psycopg2.OperationalError) ERROR: pgbouncer cannot connect to server
...
This happens a bunch of times and then:
OperationalError: (psycopg2.OperationalError) ERROR: pgbouncer cannot connect to server
2020-07-15 23:47:11 [28] [INFO] Worker exiting (pid: 28)
2020-07-15 23:47:11 [29] [INFO] Worker exiting (pid: 29)
2020-07-15 23:47:11 [28] [INFO] Worker exiting (pid: 28)
2020-07-15 23:47:11 [29] [INFO] Worker exiting (pid: 29)
Traceback (most recent call last):
File "/usr/local/bin/gunicorn", line 8, in <module>
sys.exit(run())
File "/usr/local/lib/python2.7/dist-packages/gunicorn/app/wsgiapp.py", line 61, in run
WSGIApplication("%(prog)s [OPTIONS] [APP_MODULE]").run()
File "/usr/local/lib/python2.7/dist-packages/gunicorn/app/base.py", line 223, in run
super(Application, self).run()
File "/usr/local/lib/python2.7/dist-packages/gunicorn/app/base.py", line 72, in run
Arbiter(self).run()
File "/usr/local/lib/python2.7/dist-packages/gunicorn/arbiter.py", line 232, in run
self.halt(reason=inst.reason, exit_status=inst.exit_status)
File "/usr/local/lib/python2.7/dist-packages/gunicorn/arbiter.py", line 345, in halt
self.stop()
File "/usr/local/lib/python2.7/dist-packages/gunicorn/arbiter.py", line 390, in stop
self.kill_workers(sig)
File "/usr/local/lib/python2.7/dist-packages/gunicorn/arbiter.py", line 626, in kill_workers
self.kill_worker(pid, sig)
File "/usr/local/lib/python2.7/dist-packages/gunicorn/arbiter.py", line 641, in kill_worker
worker.tmp.close()
File "/usr/local/lib/python2.7/dist-packages/gunicorn/workers/workertmp.py", line 56, in close
return self._tmp.close()
File "/usr/local/lib/python2.7/dist-packages/gunicorn/arbiter.py", line 245, in handle_chld
self.reap_workers()
File "/usr/local/lib/python2.7/dist-packages/gunicorn/arbiter.py", line 525, in reap_workers
raise HaltServer(reason, self.WORKER_BOOT_ERROR)
gunicorn.errors.HaltServer: <HaltServer 'Worker failed to boot.' 3>
We had a DNS blip overnight during which the pdp service couldn't resolve the address of (and thus couldn't connect to) its required database. Unfortunately, this crashed the service. We should add some exception handling so that the service returns an HTTP 503 when the database is unavailable. Logs to follow. This happens a bunch of times and then: