OasisLMF / OasisPlatform

Loss modelling platform.
BSD 3-Clause "New" or "Revised" License
42 stars 16 forks source link

Failed analyses with subtasks marked as queued #1061

Open sambles opened 1 month ago

sambles commented 1 month ago

Issue Description

If a failed analysis has subtasks marked with status QUEUED, then the auto-scaler will not spin down to zero even though nothing is running.

logs

{
  "created": "2024-06-03T15:12:20.041454Z",
  "modified": "2024-06-03T15:18:22.728742Z",
  "name": "Analysis_03062024-161219",
  "id": 5,
  "portfolio": 5,
  "model": 2,
  "status": "RUN_ERROR",
  "run_mode": "V2",
  "task_started": "2024-06-03T15:13:04.173610Z",
  "task_finished": "2024-06-03T15:18:22.664475Z",
  "complex_model_data_files": [],
  "input_file": "https://oasis-enterprise-sam-load-testing.northcentralus.cloudapp.azure.com/api/v2/analyses/5/input_file/",
  "settings_file": "https://oasis-enterprise-sam-load-testing.northcentralus.cloudapp.azure.com/api/v2/analyses/5/settings_file/",
  "settings": "https://oasis-enterprise-sam-load-testing.northcentralus.cloudapp.azure.com/api/v2/analyses/5/settings/",
  "lookup_errors_file": "https://oasis-enterprise-sam-load-testing.northcentralus.cloudapp.azure.com/api/v2/analyses/5/lookup_errors_file/",
  "lookup_success_file": "https://oasis-enterprise-sam-load-testing.northcentralus.cloudapp.azure.com/api/v2/analyses/5/lookup_success_file/",
  "lookup_validation_file": "https://oasis-enterprise-sam-load-testing.northcentralus.cloudapp.azure.com/api/v2/analyses/5/lookup_validation_file/",
  "summary_levels_file": "https://oasis-enterprise-sam-load-testing.northcentralus.cloudapp.azure.com/api/v2/analyses/5/summary_levels_file/",
  "input_generation_traceback_file": "https://oasis-enterprise-sam-load-testing.northcentralus.cloudapp.azure.com/api/v2/analyses/5/input_generation_traceback_file/",
  "output_file": null,
  "run_traceback_file": "https://oasis-enterprise-sam-load-testing.northcentralus.cloudapp.azure.com/api/v2/analyses/5/run_traceback_file/",
  "run_log_file": null,
  "storage_links": "https://oasis-enterprise-sam-load-testing.northcentralus.cloudapp.azure.com/api/v2/analyses/5/storage_links/",
  "chunking_configuration": "https://oasis-enterprise-sam-load-testing.northcentralus.cloudapp.azure.com/api/v2/analyses/5/chunking_configuration/",
  "lookup_chunks": 5,
  "analysis_chunks": 10,
  "sub_task_count": 15,
  "groups": [],
  "sub_task_list": "https://oasis-enterprise-sam-load-testing.northcentralus.cloudapp.azure.com/api/v2/analyses/5/sub_task_list/",
  "sub_task_error_ids": [],
  "status_count": {
    "TOTAL_IN_QUEUE": 3,
    "TOTAL": 15,
    "PENDING": 0,
    "QUEUED": 3,
    "STARTED": 0,
    "COMPLETED": 12,
    "CANCELLED": 0,
    "ERROR": 0
  },
  "priority": 4
}
[
  {
    "id": 128,
    "task_id": "a603de05-6d93-4111-a029-37a88bb5e8fa",
    "status": "QUEUED",
    "queue_name": "OasisLMF-PiWind-2-v2",
    "name": "Generate losses output",
    "slug": "generate_losses_output",
    "pending_time": "2024-06-03T15:13:03.829714Z",
    "queue_time": "2024-06-03T15:13:04.176875Z",
    "start_time": null,
    "end_time": null,
    "output_log": null,
    "error_log": null
  },
  {
    "id": 129,
    "task_id": "23db7750-e291-4038-a816-817b78b09970",
    "status": "QUEUED",
    "queue_name": "celery-v2",
    "name": "Record losses files",
    "slug": "record-losses-files",
    "pending_time": "2024-06-03T15:13:03.829741Z",
    "queue_time": "2024-06-03T15:13:04.176875Z",
    "start_time": null,
    "end_time": null,
    "output_log": null,
    "error_log": null
  },
  {
    "id": 130,
    "task_id": "b6b8139a-3511-41a0-9f90-d84eeb51c892",
    "status": "QUEUED",
    "queue_name": "OasisLMF-PiWind-2-v2",
    "name": "Cleanup losses generation",
    "slug": "cleanup-losses-generation",
    "pending_time": "2024-06-03T15:13:03.829767Z",
    "queue_time": "2024-06-03T15:13:04.176875Z",
    "start_time": null,
    "end_time": null,
    "output_log": null,
    "error_log": null
  }
]
2024-06-04 11:39:00,360 DEBUG: Socket message: {'time': '2024-06-04T11:39:00.316702Z', 'type': 'queue_status.updated', 'status': 'SUCCESS', 'content': [{'queue': {'name': 'OasisLMF-PiWind-2-v2', 'pending_count': 0, 'queued_count': 2, 'running_count': 0, 'queue_message_count': 0, 'worker_count': 1, 'models': [{'id': 2, 'supplier_id': 'OasisLMF', 'model_id': 'PiWind', 'version_id': '2', 'created': '2024-06-03T08:32:35.951484Z', 'modified': '2024-06-03T08:32:36.567678Z', 'data_files': [], 'settings': '/api/v2/models/2/settings/', 'versions': '/api/v2/models/2/versions/', 'scaling_configuration': '/api/v2/models/2/scaling_configuration/', 'chunking_configuration': '/api/v2/models/2/chunking_configuration/', 'groups': [], 'run_mode': 'V2'}]}, 'analyses': []}, {'queue': {'name': 'celery', 'pending_count': 0, 'queued_count': 0, 'running_count': 0, 'queue_message_count': 0, 'worker_count': 1, 'models': []}, 'analyses': []}, {'queue': {'name': 'celery-v2', 'pending_count': 0, 'queued_count': 1, 'running_count': 0, 'queue_message_count': 0, 'worker_count': 1, 'models': []}, 'analyses': []}, {'queue': {'name': 'task-controller', 'pending_count': 0, 'queued_count': 0, 'running_count': 0, 'queue_message_count': 0, 'worker_count': 1, 'models': []}, 'analyses': []}]}
2024-06-04 11:39:00,360 DEBUG: Analyses pending: {'pending-task_OasisLMF-PiWind-2-v2': {'id': None, 'tasks': 1, 'queue_names': ['OasisLMF-PiWind-2-v2'], 'priority': 4}}
2024-06-04 11:39:00,360 DEBUG: Analyses running: {}
2024-06-04 11:39:00,360 DEBUG: Model statuses: {'oasislmf-piwind-2-v2': {'tasks': 1, 'analyses': 1, 'priority': 4}, 'oasislmf-piwind-1-v1': {'tasks': 0, 'analyses': 0, 'priority': 1}}
2024-06-04 11:39:00,361 DEBUG: Scaling: [('oasislmf-piwind-2-v2', {'tasks': 1, 'analyses': 1, 'priority': 4}, <worker_deployments.WorkerDeployment object at 0x7fb7e0611450>)]
2024-06-04 11:39:00,568 DEBUG: Total desired number of workers: 1
sambles commented 1 month ago

Another example: the worker-monitor-v2 task handle_task_failure needs to update the status of the other sub-tasks on a workflow error. Mark all other subtasks which are queued or pending as CANCELLED or ERROR.

[2024-06-05 12:15:55,973: INFO/ForkPoolWorker-3] handle_task_failure[a5b2f784-3363-4c90-9d5d-80ea505dab04]: analysis_pk: 3, initiator_pk: 3, traceback: Traceback (most recent call last):
  File "/home/worker/.local/lib/python3.10/site-packages/celery/app/trace.py", line 477, in trace_task
    R = retval = fun(*args, **kwargs)
  File "/home/worker/.local/lib/python3.10/site-packages/celery/app/trace.py", line 760, in __protected_call__
    return self.run(*args, **kwargs)
  File "/home/worker/.local/lib/python3.10/site-packages/celery/app/autoretry.py", line 60, in run
    ret = task.retry(exc=exc, **retry_kwargs)
  File "/home/worker/.local/lib/python3.10/site-packages/celery/app/task.py", line 736, in retry
    raise_with_context(exc)
  File "/home/worker/.local/lib/python3.10/site-packages/celery/app/autoretry.py", line 38, in run
    return task._orig_run(*args, **kwargs)
  File "/home/worker/src/model_execution_worker/distributed_tasks.py", line 447, in run
    return fn(self, params, *args, analysis_id=analysis_id, run_data_uuid=run_data_uuid, **kwargs)
  File "/home/worker/src/model_execution_worker/distributed_tasks.py", line 508, in pre_analysis_hook
    params['pre_loc_file'] = filestore.put(
  File "/home/worker/.local/lib/python3.10/site-packages/oasis_data_manager/filestore/backends/base.py", line 318, in put
    self.fs.put(reference, storage_location)
  File "/home/worker/.local/lib/python3.10/site-packages/fsspec/implementations/dirfs.py", line 184, in put
    return self.fs.put(
  File "/home/worker/.local/lib/python3.10/site-packages/fsspec/spec.py", line 1055, in put
    self.put_file(lpath, rpath, callback=child, **kwargs)
  File "/home/worker/.local/lib/python3.10/site-packages/fsspec/implementations/local.py", line 138, in put_file
    return self.cp_file(path1, path2, **kwargs)
  File "/home/worker/.local/lib/python3.10/site-packages/fsspec/implementations/local.py", line 124, in cp_file
    shutil.copyfile(path1, path2)
  File "/usr/lib/python3.10/shutil.py", line 256, in copyfile
    with open(dst, 'wb') as fdst:
FileNotFoundError: [Errno 2] No such file or directory: '/shared-fs/analysis-3_files-6dcb6b968c774f59b53522d4bd81b07b/location.csv'
[
  {
    "id": 15,
    "task_id": "c7f94103-0d34-48cc-b62e-8095cd356766",
    "status": "COMPLETED",
    "queue_name": "GEM-CHEQ-2-v2",
    "name": "Prepare input generation params",
    "slug": "prepare-input-generation-params",
    "pending_time": "2024-06-05T12:15:38.826860Z",
    "queue_time": null,
    "start_time": "2024-06-05T12:15:39.100452Z",
    "end_time": "2024-06-05T12:15:40.359411Z",
    "output_log": "https://oasis-enterprise-sam-load-testing.northcentralus.cloudapp.azure.com/api/v2/analysis-task-statuses/15/output_log/",
    "error_log": null
  },
  {
    "id": 16,
    "task_id": "b6934042-8424-4c08-933b-197afe5963f0",
    "status": "ERROR",
    "queue_name": "GEM-CHEQ-2-v2",
    "name": "Pre analysis hook",
    "slug": "pre-analysis-hook",
    "pending_time": "2024-06-05T12:15:38.826986Z",
    "queue_time": null,
    "start_time": "2024-06-05T12:15:39.427706Z",
    "end_time": "2024-06-05T12:15:55.971324Z",
    "output_log": null,
    "error_log": "https://oasis-enterprise-sam-load-testing.northcentralus.cloudapp.azure.com/api/v2/analysis-task-statuses/16/error_log/"
  },
  {
    "id": 17,
    "task_id": "3f9364a9-a693-4f3b-b649-02bc336ac2de",
    "status": "QUEUED",
    "queue_name": "GEM-CHEQ-2-v2",
    "name": "Prepare keys file 0",
    "slug": "prepare-keys-file-0",
    "pending_time": "2024-06-05T12:15:38.827020Z",
    "queue_time": "2024-06-05T12:15:39.170141Z",
    "start_time": null,
    "end_time": null,
    "output_log": null,
    "error_log": null
  },
  {
    "id": 18,
    "task_id": "d71f2573-dbd4-42b6-8885-4fa121b51b45",
    "status": "QUEUED",
    "queue_name": "GEM-CHEQ-2-v2",
    "name": "Collect keys",
    "slug": "collect-keys",
    "pending_time": "2024-06-05T12:15:38.827050Z",
    "queue_time": "2024-06-05T12:15:39.170141Z",
    "start_time": null,
    "end_time": null,
    "output_log": null,
    "error_log": null
  },
  {
    "id": 19,
    "task_id": "4ff2f76d-97d0-4318-bca5-c53229962259",
    "status": "QUEUED",
    "queue_name": "GEM-CHEQ-2-v2",
    "name": "Write input files",
    "slug": "write-input-files",
    "pending_time": "2024-06-05T12:15:38.827078Z",
    "queue_time": "2024-06-05T12:15:39.170141Z",
    "start_time": null,
    "end_time": null,
    "output_log": null,
    "error_log": null
  },
  {
    "id": 20,
    "task_id": "1f7355f4-0d9d-466e-a82d-62b009088566",
    "status": "QUEUED",
    "queue_name": "celery-v2",
    "name": "Record input files",
    "slug": "record-input-files",
    "pending_time": "2024-06-05T12:15:38.827106Z",
    "queue_time": "2024-06-05T12:15:39.170141Z",
    "start_time": null,
    "end_time": null,
    "output_log": null,
    "error_log": null
  },
  {
    "id": 21,
    "task_id": "21a40748-a820-40c5-a839-4d9a8ffaa4c4",
    "status": "QUEUED",
    "queue_name": "GEM-CHEQ-2-v2",
    "name": "Cleanup input generation",
    "slug": "cleanup-input-generation",
    "pending_time": "2024-06-05T12:15:38.827136Z",
    "queue_time": "2024-06-05T12:15:39.170141Z",
    "start_time": null,
    "end_time": null,
    "output_log": null,
    "error_log": null
  }
]