FNNDSC / pman

A process management system written in Python
MIT License
22 stars 33 forks source link

On Kubernetes, status is incorrectly reported as "undefined" in rare cases when the job is actually finished #225

Closed jennydaman closed 1 year ago

jennydaman commented 1 year ago

https://github.com/FNNDSC/pman/blob/f6730da5a4384d1bfecda6e93ee96516f12dea70/pman/kubernetesmgr.py#L86-L102

jennydaman commented 1 year ago

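In the affected case, job.to_dict() looks like the following (note that every field of 'status' other than 'start_time' is None):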

{'api_version': 'batch/v1', 'kind': 'Job', 'metadata': {'annotations': {'batch.kubernetes.io/job-tracking': ''}, 'cluster_name': None, 'creation_timestamp': datetime.datetime(2023, 3, 17, 13, 20, 51, tzinfo=tzlocal()), 'deletion_grace_period_seconds': None, 'deletion_timestamp': None, 'finalizers': None, 'generate_name': None, 'generation': 1, 'labels': {'controller-uid': 'be512575-a3bf-43e4-9a2a-7686dc1f94db', 'job-name': 'acpo-20210708jid-8188'}, 'managed_fields': [{'api_version': 'batch/v1', 'fields_type': 'FieldsV1', 'fields_v1': {'f:spec': {'f:activeDeadlineSeconds': {}, 'f:backoffLimit': {}, 'f:completionMode': {}, 'f:completions': {}, 'f:parallelism': {}, 'f:suspend': {}, 'f:template': {'f:spec': {'f:containers': {'k:{"name":"acpo-20210708jid-8188"}': {'.': {}, 'f:command': {}, 'f:env': {'.': {}, 'k:{"name":"CHRIS_JID"}': {'.': {}, 'f:name': {}, 'f:value': {}}, 'k:{"name":"CHRIS_PIPELINE_ID"}': {'.': {}, 'f:name': {}, 'f:value': {}}, 'k:{"name":"CHRIS_PLG_INST_ID"}': {'.': {}, 'f:name': {}, 'f:value': {}}, 'k:{"name":"CHRIS_PREV_JID"}': {'.': {}, 'f:name': {}, 'f:value': {}}, 'k:{"name":"CHRIS_PREV_PLG_INST_ID"}': {'.': {}, 'f:name': {}, 'f:value': {}}, 'k:{"name":"CHRIS_WORKFLOW_ID"}': {'.': {}, 'f:name': {}, 'f:value': {}}, 'k:{"name":"CHRIS_WORKFLOW_PLG_INSTANCES"}': {'.': {}, 'f:name': {}, 'f:value': {}}, 'k:{"name":"HOME"}': {'.': {}, 'f:name': {}, 'f:value': {}}}, 'f:image': {}, 'f:imagePullPolicy': {}, 'f:name': {}, 'f:resources': {'.': {}, 'f:limits': {'.': {}, 'f:cpu': {}, 'f:memory': {}}}, 'f:securityContext': {'.': {}, 'f:allowPrivilegeEscalation': {}, 'f:capabilities': {'.': {}, 'f:drop': {}}, 'f:runAsGroup': {}, 'f:runAsUser': {}}, 'f:terminationMessagePath': {}, 'f:terminationMessagePolicy': {}, 'f:volumeMounts': {'.': {}, 'k:{"mountPath":"/share"}': {'.': {}, 'f:mountPath': {}, 'f:name': {}}}}}, 'f:dnsPolicy': {}, 'f:restartPolicy': {}, 'f:schedulerName': {}, 'f:securityContext': {}, 'f:terminationGracePeriodSeconds': {}, 'f:volumes': {'.': 
{}, 'k:{"name":"storebase"}': {'.': {}, 'f:name': {}, 'f:nfs': {'.': {}, 'f:path': {}, 'f:server': {}}}}}}, 'f:ttlSecondsAfterFinished': {}}}, 'manager': 'OpenAPI-Generator', 'operation': 'Update', 'time': datetime.datetime(2023, 3, 17, 13, 20, 51, tzinfo=tzlocal())}, {'api_version': 'batch/v1', 'fields_type': 'FieldsV1', 'fields_v1': {'f:status': {'f:ready': {}, 'f:startTime': {}, 'f:uncountedTerminatedPods': {'.': {}, 'f:succeeded': {'.': {}, 'v:"92691226-ebe0-4354-ae84-7731e51524b8"': {}}}}}, 'manager': 'kube-controller-manager', 'operation': 'Update', 'time': datetime.datetime(2023, 3, 17, 13, 21, 8, tzinfo=tzlocal())}], 'name': 'acpo-20210708jid-8188', 'namespace': 'chris', 'owner_references': None, 'resource_version': '83788497', 'self_link': None, 'uid': 'be512575-a3bf-43e4-9a2a-7686dc1f94db'}, 'spec': {'active_deadline_seconds': 604800, 'backoff_limit': 0, 'completions': 1, 'manual_selector': None, 'parallelism': 1, 'selector': {'match_expressions': None, 'match_labels': {'controller-uid': 'be512575-a3bf-43e4-9a2a-7686dc1f94db'}}, 'template': {'metadata': {'annotations': None, 'cluster_name': None, 'creation_timestamp': None, 'deletion_grace_period_seconds': None, 'deletion_timestamp': None, 'finalizers': None, 'generate_name': None, 'generation': None, 'labels': {'controller-uid': 'be512575-a3bf-43e4-9a2a-7686dc1f94db', 'job-name': 'acpo-20210708jid-8188'}, 'managed_fields': None, 'name': None, 'namespace': None, 'owner_references': None, 'resource_version': None, 'self_link': None, 'uid': None}, 'spec': {'active_deadline_seconds': None, 'affinity': None, 'automount_service_account_token': None, 'containers': [{'args': None, 'command': ['/usr/local/bin/python', '/usr/local/bin/csv2json', '--saveinputmeta', '--saveoutputmeta', '--inputFileFilter', '**/*.csv', '--tagFileFilter', '**/*.dcm', '--outputFileStem', 'prediction', '--addTags', 'PatientID,PatientName,PatientAge,StudyDate', '--pftelDB', 
'https://pftel-chris-public.apps.ocp-prod.massopen.cloud/api/v1/dylld/2023-03-17T13:10:31.158399+00:00/analysis', '/share/incoming', '/share/outgoing'], 'env': [{'name': 'CHRIS_JID', 'value': 'acpo-20210708jid-8188', 'value_from': None}, {'name': 'CHRIS_PLG_INST_ID', 'value': '8188', 'value_from': None}, {'name': 'CHRIS_PREV_PLG_INST_ID', 'value': '8186', 'value_from': None}, {'name': 'CHRIS_PREV_JID', 'value': 'acpo-20210708jid-8186', 'value_from': None}, {'name': 'CHRIS_WORKFLOW_ID', 'value': '48', 'value_from': None}, {'name': 'CHRIS_PIPELINE_ID', 'value': '72', 'value_from': None}, {'name': 'CHRIS_WORKFLOW_PLG_INSTANCES', 'value': 'landmarks-to-json:8188', 'value_from': None}, {'name': 'HOME', 'value': '/tmp', 'value_from': None}], 'env_from': None, 'image': 'fnndsc/pl-csv2json:1.0.10', 'image_pull_policy': 'IfNotPresent', 'lifecycle': None, 'liveness_probe': None, 'name': 'acpo-20210708jid-8188', 'ports': None, 'readiness_probe': None, 'resources': {'limits': {'cpu': '2', 'memory': '8000Mi'}, 'requests': None}, 'security_context': {'allow_privilege_escalation': False, 'capabilities': {'add': None, 'drop': ['ALL']}, 'privileged': None, 'proc_mount': None, 'read_only_root_filesystem': None, 'run_as_group': 1102, 'run_as_non_root': None, 'run_as_user': 7748, 'se_linux_options': None, 'windows_options': None}, 'startup_probe': None, 'stdin': None, 'stdin_once': None, 'termination_message_path': '/dev/termination-log', 'termination_message_policy': 'File', 'tty': None, 'volume_devices': None, 'volume_mounts': [{'mount_path': '/share', 'mount_propagation': None, 'name': 'storebase', 'read_only': None, 'sub_path': None, 'sub_path_expr': None}], 'working_dir': None}], 'dns_config': None, 'dns_policy': 'ClusterFirst', 'enable_service_links': None, 'ephemeral_containers': None, 'host_aliases': None, 'host_ipc': None, 'host_network': None, 'host_pid': None, 'hostname': None, 'image_pull_secrets': None, 'init_containers': None, 'node_name': None, 'node_selector': None, 
'overhead': None, 'preemption_policy': None, 'priority': None, 'priority_class_name': None, 'readiness_gates': None, 'restart_policy': 'Never', 'runtime_class_name': None, 'scheduler_name': 'default-scheduler', 'security_context': {'fs_group': None, 'run_as_group': None, 'run_as_non_root': None, 'run_as_user': None, 'se_linux_options': None, 'supplemental_groups': None, 'sysctls': None, 'windows_options': None}, 'service_account': None, 'service_account_name': None, 'share_process_namespace': None, 'subdomain': None, 'termination_grace_period_seconds': 30, 'tolerations': None, 'topology_spread_constraints': None, 'volumes': [{'aws_elastic_block_store': None, 'azure_disk': None, 'azure_file': None, 'cephfs': None, 'cinder': None, 'config_map': None, 'csi': None, 'downward_api': None, 'empty_dir': None, 'fc': None, 'flex_volume': None, 'flocker': None, 'gce_persistent_disk': None, 'git_repo': None, 'glusterfs': None, 'host_path': None, 'iscsi': None, 'name': 'storebase', 'nfs': {'path': '/ifs/RC-FS-PROD/Data/Shares/FNNDSC-e2/neuro/labs/grantlab/research/prod/galena/storage/jorge_storebase/key-acpo-20210708jid-8188', 'read_only': None, 'server': 'rc-fs-nfs.tch.harvard.edu'}, 'persistent_volume_claim': None, 'photon_persistent_disk': None, 'portworx_volume': None, 'projected': None, 'quobyte': None, 'rbd': None, 'scale_io': None, 'secret': None, 'storageos': None, 'vsphere_volume': None}]}}, 'ttl_seconds_after_finished': 86400}, 'status': {'active': None, 'completion_time': None, 'conditions': None, 'failed': None, 'start_time': datetime.datetime(2023, 3, 17, 13, 20, 51, tzinfo=tzlocal()), 'succeeded': None}}