DIRACGrid / DIRAC

DIRAC Grid
http://diracgrid.org
GNU General Public License v3.0
113 stars 174 forks source link

FTS3Agent deadlocks when running multiple agents #7798

Open chaen opened 1 week ago

chaen commented 1 week ago

Obviously very bad, as it seems to have resulted in multiple FTS transfers stepping on each other's toes

2024-09-18 12:30:48 UTC DataManagement/FTS3Agent/FTS3DB ERROR: persistOperation: unexpected exception
Traceback (most recent call last):
  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/sqlalchemy/engine/base.py", line 1967, in _exec_single_context
    self.dialect.do_execute(
  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/sqlalchemy/engine/default.py", line 924, in do_execute
    cursor.execute(statement, parameters)
  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/MySQLdb/cursors.py", line 206, in execute
    res = self._query(query)
          ^^^^^^^^^^^^^^^^^^
  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/MySQLdb/cursors.py", line 319, in _query
    db.query(q)
  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/MySQLdb/connections.py", line 259, in query
    _mysql.connection.query(self, query)
MySQLdb._exceptions.OperationalError: (1213, 'Deadlock found when trying to get lock; try restarting transaction')

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/DIRAC/DataManagementSystem/DB/FTS3DB.py", line 247, in persistOperation
    session.commit()
  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/sqlalchemy/orm/session.py", line 2017, in commit
    trans.commit(_to_root=True)
  File "<string>", line 2, in commit
  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/sqlalchemy/orm/state_changes.py", line 139, in _go
    ret_value = fn(self, *arg, **kw)
                ^^^^^^^^^^^^^^^^^^^^
  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/sqlalchemy/orm/session.py", line 1302, in commit
    self._prepare_impl()
  File "<string>", line 2, in _prepare_impl
  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/sqlalchemy/orm/state_changes.py", line 139, in _go
    ret_value = fn(self, *arg, **kw)
                ^^^^^^^^^^^^^^^^^^^^
  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/sqlalchemy/orm/session.py", line 1277, in _prepare_impl
    self.session.flush()
  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/sqlalchemy/orm/session.py", line 4341, in flush
    self._flush(objects)
  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/sqlalchemy/orm/session.py", line 4476, in _flush
    with util.safe_reraise():
  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/sqlalchemy/util/langhelpers.py", line 146, in __exit__
    raise exc_value.with_traceback(exc_tb)
  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/sqlalchemy/orm/session.py", line 4437, in _flush
    flush_context.execute()
  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/sqlalchemy/orm/unitofwork.py", line 466, in execute
    rec.execute(self)
  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/sqlalchemy/orm/unitofwork.py", line 642, in execute
    util.preloaded.orm_persistence.save_obj(
  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/sqlalchemy/orm/persistence.py", line 93, in save_obj
    _emit_insert_statements(
  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/sqlalchemy/orm/persistence.py", line 1233, in _emit_insert_statements
    result = connection.execute(
             ^^^^^^^^^^^^^^^^^^^
  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/sqlalchemy/engine/base.py", line 1418, in execute
    return meth(
           ^^^^^
  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/sqlalchemy/sql/elements.py", line 515, in _execute_on_connection
    return connection._execute_clauseelement(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/sqlalchemy/engine/base.py", line 1640, in _execute_clauseelement
    ret = self._execute_context(
          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/sqlalchemy/engine/base.py", line 1846, in _execute_context
    return self._exec_single_context(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/sqlalchemy/engine/base.py", line 1986, in _exec_single_context
    self._handle_dbapi_exception(
  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/sqlalchemy/engine/base.py", line 2353, in _handle_dbapi_exception
    raise sqlalchemy_exception.with_traceback(exc_info[2]) from e
  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/sqlalchemy/engine/base.py", line 1967, in _exec_single_context
    self.dialect.do_execute(
  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/sqlalchemy/engine/default.py", line 924, in do_execute
    cursor.execute(statement, parameters)
  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/MySQLdb/cursors.py", line 206, in execute
    res = self._query(query)
          ^^^^^^^^^^^^^^^^^^
  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/MySQLdb/cursors.py", line 319, in _query
    db.query(q)
  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/MySQLdb/connections.py", line 259, in query
    _mysql.connection.query(self, query)
sqlalchemy.exc.OperationalError: (MySQLdb._exceptions.OperationalError) (1213, 'Deadlock found when trying to get lock; try restarting transaction')
[SQL: INSERT INTO `Jobs` (`operationID`, `submitTime`, `lastUpdate`, `lastMonitor`, completeness, username, `userGroup`, `ftsGUID`, `ftsServer`, error, status, assignment) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)]
[parameters: (8840397, datetime.datetime(2024, 9, 18, 12, 30, 46), datetime.datetime(2024, 9, 18, 12, 30, 46), datetime.datetime(2024, 9, 18, 12, 30, 46), None, 'lbprods', 'lhcb_prmgr', 'd48f1c5a-75b9-11ef-bbfb-fa163e8e0048', 'https://fts3-lhcb.cern.ch:8446', None, 'Submitted', None)]
(Background on this error at: https://sqlalche.me/e/20/e3q8)
2024-09-18 12:30:48 UTC DataManagement/FTS3Agent/submit/8840413/RAL-BUFFER_RAL-RAW INFO: Got GUID d5c1de82-75b9-11ef-ac94-fa163e5a69c7
2024-09-18 12:30:48 UTC DataManagement/FTS3Agent/treatOperation/8840397 ERROR: Could not persist operation {'OK': False, 'Errno': 0, 'Message': "persistOperation: unexpected exception (MySQLdb._exceptions.OperationalError) (1213, 'Deadlock found when trying to get lock; try restarting transaction')\n[SQL: INSERT INTO `Jobs` (`operationID`, `submitTime`, `lastUpdate`, `lastMonitor`, completeness, username, `userGroup`, `ftsGUID`, `ftsServer`, error, status, assignment) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)]\n[parameters: (8840397, datetime.datetime(2024, 9, 18, 12, 30, 46), datetime.datetime(2024, 9, 18, 12, 30, 46), datetime.datetime(2024, 9, 18, 12, 30, 46), None, 'lbprods', 'lhcb_prmgr', 'd48f1c5a-75b9-11ef-bbfb-fa163e8e0048', 'https://fts3-lhcb.cern.ch:8446', None, 'Submitted', None)]\n(Background on this error at: https://sqlalche.me/e/20/e3q8)", 'CallStack': ['  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/threading.py", line 1002, in _bootstrap\n    self._bootstrap_inner()\n', '  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/threading.py", line 1045, in _bootstrap_inner\n    self.run()\n', '  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/threading.py", line 982, in run\n    self._target(*self._args, **self._kwargs)\n', '  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/multiprocessing/pool.py", line 125, in worker\n    result = (True, func(*args, **kwds))\n', '  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/DIRAC/DataManagementSystem/Agent/FTS3Agent.py", line 513, in _treatOperation\n    res = self.fts3db.persistOperation(operation)\n', '  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/DIRAC/DataManagementSystem/DB/FTS3DB.py", line 255, in persistOperation\n    return S_ERROR(f"persistOperation: unexpected exception {e}")\n']}
2024-09-18 14:39:56 UTC DataManagement/FTS3Agent/monitorJobs ERROR: Could not retrieve ftsJobs from the DB {'OK': False, 'Errno': 0, 'Message': "getAllActiveJobs: unexpected exception : (MySQLdb._exceptions.OperationalError) (1213, 'Deadlock found when trying to get lock; try restarting transaction')\n[SQL: SELECT `Jobs`.`jobID` AS `Jobs_jobID`, `Jobs`.`operationID` AS `Jobs_operationID`, `Jobs`.`submitTime` AS `Jobs_submitTime`, `Jobs`.`lastUpdate` AS `Jobs_lastUpdate`, `Jobs`.`lastMonitor` AS `Jobs_lastMonitor`, `Jobs`.completeness AS `Jobs_completeness`, `Jobs`.username AS `Jobs_username`, `Jobs`.`userGroup` AS `Jobs_userGroup`, `Jobs`.`ftsGUID` AS `Jobs_ftsGUID`, `Jobs`.`ftsServer` AS `Jobs_ftsServer`, `Jobs`.error AS `Jobs_error`, `Jobs`.status AS `Jobs_status`, `Jobs`.assignment AS `Jobs_assignment` \nFROM `Jobs` INNER JOIN `Operations` ON `Operations`.`operationID` = `Jobs`.`operationID` \nWHERE `Jobs`.status IN (%s, %s, %s, %s, %s) AND `Jobs`.assignment IS NULL AND `Operations`.assignment IS NULL AND `Jobs`.`lastMonitor` < %s ORDER BY `Jobs`.`lastMonitor` ASC \n LIMIT %s FOR UPDATE]\n[parameters: ('Submitted', 'Staging', 'Archiving', 'Ready', 'Active', datetime.datetime(2024, 9, 18, 14, 28, 22, 250137), 20)]\n(Background on this error at: https://sqlalche.me/e/20/e3q8)", 'CallStack': ['  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/bin/dirac-agent", line 8, in <module>\n    sys.exit(main())\n', '  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/DIRAC/Core/Base/Script.py", line 74, in __call__\n    return entrypointFunc._func()\n', '  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/DIRAC/Core/scripts/dirac_agent.py", line 39, in main\n    agentReactor.go()\n', '  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/DIRAC/Core/Base/AgentReactor.py", line 144, in go\n    timeToNext = self.__scheduler.executeNextTask()\n', '  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/DIRAC/Core/Utilities/ThreadScheduler.py", line 127, in executeNextTask\n    self.__executeTask(taskId)\n', '  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/DIRAC/Core/Utilities/ThreadScheduler.py", line 180, in __executeTask\n    task["func"](*task["args"])\n', '  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/DIRAC/Core/Base/AgentModule.py", line 350, in am_go\n    cycleResult = self.__executeModuleCycle()\n', '  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/DIRAC/Core/Base/AgentModule.py", line 420, in __executeModuleCycle\n    result = self.am_secureCall(executors[0][0], executors[0][1])\n', '  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/DIRAC/Core/Base/AgentModule.py", line 310, in am_secureCall\n    result = functor(*args)\n', '  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/DIRAC/DataManagementSystem/Agent/FTS3Agent.py", line 656, in execute\n    res = self.monitorJobsLoop()\n', '  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/DIRAC/DataManagementSystem/Agent/FTS3Agent.py", line 320, in monitorJobsLoop\n    res = self.fts3db.getActiveJobs(\n', '  File "/opt/dirac/versions/v11.0.46-1723623328/Linux-x86_64/lib/python3.11/site-packages/DIRAC/DataManagementSystem/DB/FTS3DB.py", line 349, in getActiveJobs\n    return S_ERROR(f"getAllActiveJobs: unexpected exception : {e}")\n']}