Open sambles opened 1 month ago
Another example: the worker monitor-v2 task handle_task_failure
needs to update the other sub-tasks' statuses on workflow error —
mark all other subtasks whose status is QUEUED or PENDING as CANCELLED
or ERROR.
[2024-06-05 12:15:55,973: INFO/ForkPoolWorker-3] handle_task_failure[a5b2f784-3363-4c90-9d5d-80ea505dab04]: analysis_pk: 3, initiator_pk: 3, traceback: Traceback (most recent call last):
File "/home/worker/.local/lib/python3.10/site-packages/celery/app/trace.py", line 477, in trace_task
R = retval = fun(*args, **kwargs)
File "/home/worker/.local/lib/python3.10/site-packages/celery/app/trace.py", line 760, in __protected_call__
return self.run(*args, **kwargs)
File "/home/worker/.local/lib/python3.10/site-packages/celery/app/autoretry.py", line 60, in run
ret = task.retry(exc=exc, **retry_kwargs)
File "/home/worker/.local/lib/python3.10/site-packages/celery/app/task.py", line 736, in retry
raise_with_context(exc)
File "/home/worker/.local/lib/python3.10/site-packages/celery/app/autoretry.py", line 38, in run
return task._orig_run(*args, **kwargs)
File "/home/worker/src/model_execution_worker/distributed_tasks.py", line 447, in run
return fn(self, params, *args, analysis_id=analysis_id, run_data_uuid=run_data_uuid, **kwargs)
File "/home/worker/src/model_execution_worker/distributed_tasks.py", line 508, in pre_analysis_hook
params['pre_loc_file'] = filestore.put(
File "/home/worker/.local/lib/python3.10/site-packages/oasis_data_manager/filestore/backends/base.py", line 318, in put
self.fs.put(reference, storage_location)
File "/home/worker/.local/lib/python3.10/site-packages/fsspec/implementations/dirfs.py", line 184, in put
return self.fs.put(
File "/home/worker/.local/lib/python3.10/site-packages/fsspec/spec.py", line 1055, in put
self.put_file(lpath, rpath, callback=child, **kwargs)
File "/home/worker/.local/lib/python3.10/site-packages/fsspec/implementations/local.py", line 138, in put_file
return self.cp_file(path1, path2, **kwargs)
File "/home/worker/.local/lib/python3.10/site-packages/fsspec/implementations/local.py", line 124, in cp_file
shutil.copyfile(path1, path2)
File "/usr/lib/python3.10/shutil.py", line 256, in copyfile
with open(dst, 'wb') as fdst:
FileNotFoundError: [Errno 2] No such file or directory: '/shared-fs/analysis-3_files-6dcb6b968c774f59b53522d4bd81b07b/location.csv'
[
{
"id": 15,
"task_id": "c7f94103-0d34-48cc-b62e-8095cd356766",
"status": "COMPLETED",
"queue_name": "GEM-CHEQ-2-v2",
"name": "Prepare input generation params",
"slug": "prepare-input-generation-params",
"pending_time": "2024-06-05T12:15:38.826860Z",
"queue_time": null,
"start_time": "2024-06-05T12:15:39.100452Z",
"end_time": "2024-06-05T12:15:40.359411Z",
"output_log": "https://oasis-enterprise-sam-load-testing.northcentralus.cloudapp.azure.com/api/v2/analysis-task-statuses/15/output_log/",
"error_log": null
},
{
"id": 16,
"task_id": "b6934042-8424-4c08-933b-197afe5963f0",
"status": "ERROR",
"queue_name": "GEM-CHEQ-2-v2",
"name": "Pre analysis hook",
"slug": "pre-analysis-hook",
"pending_time": "2024-06-05T12:15:38.826986Z",
"queue_time": null,
"start_time": "2024-06-05T12:15:39.427706Z",
"end_time": "2024-06-05T12:15:55.971324Z",
"output_log": null,
"error_log": "https://oasis-enterprise-sam-load-testing.northcentralus.cloudapp.azure.com/api/v2/analysis-task-statuses/16/error_log/"
},
{
"id": 17,
"task_id": "3f9364a9-a693-4f3b-b649-02bc336ac2de",
"status": "QUEUED",
"queue_name": "GEM-CHEQ-2-v2",
"name": "Prepare keys file 0",
"slug": "prepare-keys-file-0",
"pending_time": "2024-06-05T12:15:38.827020Z",
"queue_time": "2024-06-05T12:15:39.170141Z",
"start_time": null,
"end_time": null,
"output_log": null,
"error_log": null
},
{
"id": 18,
"task_id": "d71f2573-dbd4-42b6-8885-4fa121b51b45",
"status": "QUEUED",
"queue_name": "GEM-CHEQ-2-v2",
"name": "Collect keys",
"slug": "collect-keys",
"pending_time": "2024-06-05T12:15:38.827050Z",
"queue_time": "2024-06-05T12:15:39.170141Z",
"start_time": null,
"end_time": null,
"output_log": null,
"error_log": null
},
{
"id": 19,
"task_id": "4ff2f76d-97d0-4318-bca5-c53229962259",
"status": "QUEUED",
"queue_name": "GEM-CHEQ-2-v2",
"name": "Write input files",
"slug": "write-input-files",
"pending_time": "2024-06-05T12:15:38.827078Z",
"queue_time": "2024-06-05T12:15:39.170141Z",
"start_time": null,
"end_time": null,
"output_log": null,
"error_log": null
},
{
"id": 20,
"task_id": "1f7355f4-0d9d-466e-a82d-62b009088566",
"status": "QUEUED",
"queue_name": "celery-v2",
"name": "Record input files",
"slug": "record-input-files",
"pending_time": "2024-06-05T12:15:38.827106Z",
"queue_time": "2024-06-05T12:15:39.170141Z",
"start_time": null,
"end_time": null,
"output_log": null,
"error_log": null
},
{
"id": 21,
"task_id": "21a40748-a820-40c5-a839-4d9a8ffaa4c4",
"status": "QUEUED",
"queue_name": "GEM-CHEQ-2-v2",
"name": "Cleanup input generation",
"slug": "cleanup-input-generation",
"pending_time": "2024-06-05T12:15:38.827136Z",
"queue_time": "2024-06-05T12:15:39.170141Z",
"start_time": null,
"end_time": null,
"output_log": null,
"error_log": null
}
]
Issue Description
If a failed analysis has subtasks still marked with status
QUEUED,
then the auto-scaler will not spin down to zero even though nothing is running.