jd / tenacity

Retrying library for Python
http://tenacity.readthedocs.io
Apache License 2.0
6.6k stars 281 forks source link

Retry not triggered when database query timeout #246

Open YujiShen opened 4 years ago

YujiShen commented 4 years ago

I add retry decorator on my database execute method, but when the database lost connection during the query, retry is not triggered.

Here is my DB class:

class DB(object):
    def __init__(self) -> None:
        """
        Initiate a connection pool
        """
        database_url = f'mysql+pymysql://{user}:{password}@localhost/{db}'

        self.database = sqlalchemy.create_engine(
            database_url,
            connect_args={
                'connect_timeout': 3,
                'read_timeout': 5,
                'write_timeout': 5
            },
            pool_size=10,
            max_overflow=10,
            pool_timeout=3,  # 3 seconds
            pool_recycle=600,  # 10 minutes
        )

    @retry(reraise=True, wait=wait_fixed(1), stop=stop_after_delay(30), after=after_log(logger, logging.WARNING))
    def fetch(self, query: str, values: Dict = None) -> List:
        """
        Fetch the result
        """
        with self.database.connect() as conn:
            values = values or {}
            result = conn.execute(text(query), **values).fetchall()
            if result:
                result = [dict(row) for row in result]
            return result

    @retry(reraise=True, wait=wait_fixed(1), stop=stop_after_delay(30), after=after_log(logger, logging.WARNING))
    def execute(self, query: str, values: Dict = None, cloud_logger: CloudLogger = None,
                log_struct: Dict = None, error_message: str = None) -> None:
        """
        Execute query
        """
        try:
            with self.database.connect() as conn:
                values = values or {}
                conn.execute(text(query), **values)
        except Exception as e:
            if cloud_logger and log_struct:
                cloud_logger.log_error(log_struct, error_message, e)
            else:
                logging.error(e, exc_info=True)
            raise e

Here is the log:

[2020-08-10 19:23:23 +0000] [16] [ERROR] (pymysql.err.OperationalError) (2013, 'Lost connection to MySQL server during query (timed out)')
[SQL: update .....]
(Background on this error at: http://sqlalche.me/e/e3q8)
Traceback (most recent call last):
  File "/env/lib/python3.7/site-packages/pymysql/connections.py", line 690, in _read_bytes
    data = self._rfile.read(num_bytes)
  File "/opt/python3.7/lib/python3.7/socket.py", line 589, in readinto
    return self._sock.recv_into(b)
socket.timeout: timed out

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/env/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1248, in _execute_context
    cursor, statement, parameters, context
  File "/env/lib/python3.7/site-packages/sqlalchemy/engine/default.py", line 590, in do_execute
    cursor.execute(statement, parameters)
  File "/env/lib/python3.7/site-packages/pymysql/cursors.py", line 170, in execute
    result = self._query(query)
  File "/env/lib/python3.7/site-packages/pymysql/cursors.py", line 328, in _query
    conn.query(q)
  File "/env/lib/python3.7/site-packages/pymysql/connections.py", line 516, in query
    self._affected_rows = self._read_query_result(unbuffered=unbuffered)
  File "/env/lib/python3.7/site-packages/pymysql/connections.py", line 727, in _read_query_result
    result.read()
  File "/env/lib/python3.7/site-packages/pymysql/connections.py", line 1066, in read
    first_packet = self.connection._read_packet()
  File "/env/lib/python3.7/site-packages/pymysql/connections.py", line 656, in _read_packet
    packet_header = self._read_bytes(4)
  File "/env/lib/python3.7/site-packages/pymysql/connections.py", line 698, in _read_bytes
    "Lost connection to MySQL server during query (%s)" % (e,))
pymysql.err.OperationalError: (2013, 'Lost connection to MySQL server during query (timed out)')

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/srv/utils/cloudsql.py", line 78, in execute
    conn.execute(text(query), **values)
  File "/env/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 984, in execute
    return meth(self, multiparams, params)
  File "/env/lib/python3.7/site-packages/sqlalchemy/sql/elements.py", line 293, in _execute_on_connection
    return connection._execute_clauseelement(self, multiparams, params)
  File "/env/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1103, in _execute_clauseelement
    distilled_params,
  File "/env/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1288, in _execute_context
    e, statement, parameters, cursor, context
  File "/env/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1482, in _handle_dbapi_exception
    sqlalchemy_exception, with_traceback=exc_info[2], from_=e
  File "/env/lib/python3.7/site-packages/sqlalchemy/util/compat.py", line 178, in raise_
    raise exception
  File "/env/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1248, in _execute_context
    cursor, statement, parameters, context
  File "/env/lib/python3.7/site-packages/sqlalchemy/engine/default.py", line 590, in do_execute
    cursor.execute(statement, parameters)
  File "/env/lib/python3.7/site-packages/pymysql/cursors.py", line 170, in execute
    result = self._query(query)
  File "/env/lib/python3.7/site-packages/pymysql/cursors.py", line 328, in _query
    conn.query(q)
  File "/env/lib/python3.7/site-packages/pymysql/connections.py", line 516, in query
    self._affected_rows = self._read_query_result(unbuffered=unbuffered)
  File "/env/lib/python3.7/site-packages/pymysql/connections.py", line 727, in _read_query_result
    result.read()
  File "/env/lib/python3.7/site-packages/pymysql/connections.py", line 1066, in read
    first_packet = self.connection._read_packet()
  File "/env/lib/python3.7/site-packages/pymysql/connections.py", line 656, in _read_packet
    packet_header = self._read_bytes(4)
  File "/env/lib/python3.7/site-packages/pymysql/connections.py", line 698, in _read_bytes
    "Lost connection to MySQL server during query (%s)" % (e,))
sqlalchemy.exc.OperationalError: (pymysql.err.OperationalError) (2013, 'Lost connection to MySQL server during query (timed out)')
[SQL: update .....]
(Background on this error at: http://sqlalche.me/e/e3q8)

[2020-08-10 19:23:23 +0000] [16] [WARNING] Finished call to 'utils.cloudsql.DB.execute' after 5.023(s), this was the 1st time calling it.

The request happen on 2020-08-10 15:23:17.997 EDT, I set write_timeout to 5s, so on 2020-08-10 19:23:23 +0000 the pymysql throw timeout error. Is it because the error is not initiated by execute function directly? From log you can see the exception is thrown by pymysql/connections.py. If this is the reason, how should I use tenacity to retry in this situation?

YujiShen commented 4 years ago

Similar issue also happened in my async version of DB class.

@retry(reraise=True, wait=wait_fixed(1), stop=stop_after_delay(30), after=after_log(logger, logging.WARNING)
async def execute(self, query: str, values: Dict = None, cloud_logger: CloudLogger = None, log_struct: Dict = None,
                  error_message: str = None) -> None:
    try:
        await asyncio.wait_for(self.database.execute(query=query, values=values), timeout=5)
    except Exception as e:
        if cloud_logger and log_struct:
            cloud_logger.log_error(log_struct, error_message, e)
        else:
            logging.error(e, exc_info=True)
        raise e

It raised error and did not retry:

[2020-08-11 12:16:08 +0000] [18] [ERROR]
Traceback (most recent call last):
  File "/srv/utils/async_sql.py", line 70, in execute
    await asyncio.wait_for(self.database.execute(query=query, values=values), timeout=5)
  File "/opt/python3.7/lib/python3.7/asyncio/tasks.py", line 449, in wait_for
    raise futures.TimeoutError()
concurrent.futures._base.TimeoutError

[2020-08-11 12:16:08 +0000] [18] [WARNING] Finished call to 'utils.async_sql.AsyncDB.execute' after 5.265(s), this was the 1st time calling it.

Did I use retry correctly? Why this one did not work either?

YujiShen commented 4 years ago

I added tenacity for all these kind of timeout issues in my app: database queries, HTTP requests, BigQuery queries. But all of them have the similar problem I report above. It only tried once and throw the error. It did retried sometimes, but I can't find out why these failed. One common I can find is these errors are not in trace stack. Is this the reason? If so, am I able to use tenacity to retry when this happen?

chenlujjj commented 4 years ago

I encountered the similar issue: not retry on pymysql.err.OperationalError