Open — DailyDreaming opened this issue 3 years ago
Source: https://ucsc-ci.com/databiosphere/toil/-/jobs/82743
_____________________ JobServiceTest.testServiceRecursive ______________________ Traceback (most recent call last): File "/builds/databiosphere/toil/src/toil/test/src/jobServiceTest.py", line 177, in runToil Job.Runner.startToil(rootJob, options) File "/builds/databiosphere/toil/src/toil/job.py", line 1743, in startToil return toil.restart() File "/builds/databiosphere/toil/src/toil/common.py", line 862, in restart return self._runMainLoop(rootJobDescription) File "/builds/databiosphere/toil/src/toil/common.py", line 1120, in _runMainLoop jobCache=self._jobCache).run() File "/builds/databiosphere/toil/src/toil/leader.py", line 262, in run raise FailedJobsException(self.config.jobStore, self.toilState.totalFailedJobs, self.jobStore) toil.leader.FailedJobsException: The job store 'file:/tmp/toil-test-toil.test.src.jobServiceTest.JobServiceTest-testServiceRecursive-jobstore-8zv8d88g' contains 1 failed jobs: 'JobFunctionWrappingJob' kind-JobFunctionWrappingJob/instance-acedx9bf Log from job "'JobFunctionWrappingJob' kind-JobFunctionWrappingJob/instance-acedx9bf" follows: =========> [2021-04-23T15:13:45+0000] [MainThread] [I] [toil.worker] ---TOIL WORKER OUTPUT LOG--- [2021-04-23T15:13:45+0000] [MainThread] [I] [toil] Running Toil version 5.4.0a1-9fe8a768da5602f2353811cf286a59e24135b596 on host runner-hyeg35g-project-3-concurrent-04sp2d. 
[2021-04-23T15:13:45+0000] [MainThread] [D] [toil] Configuration: {'workflowID': 'f0467ef6-e095-41cb-a790-73de6e343878', 'workflowAttemptNumber': 51, 'jobStore': 'file:/tmp/toil-test-toil.test.src.jobServiceTest.JobServiceTest-testServiceRecursive-jobstore-8zv8d88g', 'logLevel': 'DEBUG', 'workDir': None, 'noStdOutErr': False, 'stats': False, 'clean': 'onSuccess', 'cleanWorkDir': 'always', 'clusterStats': None, 'restart': True, 'batchSystem': 'single_machine', 'disableAutoDeployment': False, 'environment': {}, 'statePollingWait': 1, 'maxLocalJobs': 4, 'manualMemArgs': False, 'parasolCommand': 'parasol', 'parasolMaxBatches': 1000, 'scale': 1.0, 'linkImports': True, 'moveExports': False, 'mesosMasterAddress': '10.244.2.152:5050', 'allocate_mem': True, 'kubernetesHostPath': None, 'provisioner': None, 'nodeTypes': [], 'minNodes': None, 'maxNodes': [10], 'targetTime': 1800, 'betaInertia': 0.1, 'scaleInterval': 60, 'preemptableCompensation': 0.0, 'nodeStorage': 50, 'nodeStorageOverrides': [], 'metrics': False, 'maxPreemptableServiceJobs': 9223372036854775807, 'maxServiceJobs': 9223372036854775807, 'deadlockWait': 60, 'deadlockCheckInterval': 30, 'defaultMemory': 2147483648, 'defaultCores': 1, 'defaultDisk': 2147483648, 'readGlobalFileMutableByDefault': False, 'defaultPreemptable': False, 'maxCores': 9223372036854775807, 'maxMemory': 9223372036854775807, 'maxDisk': 9223372036854775807, 'retryCount': 1, 'enableUnlimitedPreemptableRetries': False, 'doubleMem': False, 'maxJobDuration': 9223372036854775807, 'rescueJobsFrequency': 3600, 'disableCaching': False, 'disableChaining': False, 'disableJobStoreChecksumVerification': False, 'maxLogFileSize': 64000, 'writeLogs': None, 'writeLogsGzip': None, 'writeLogsFromAllJobs': False, 'sseKey': None, 'servicePollingInterval': 1.0, 'useAsync': True, 'forceDockerAppliance': False, 'runCwlInternalJobsOnWorkers': False, 'statusWait': 3600, 'disableProgress': False, 'debugWorker': False, 'disableWorkerOutputCapture': False, 'badWorker': 
0.5, 'badWorkerFailInterval': 0.1, 'cwl': False} [2021-04-23T15:13:45+0000] [MainThread] [D] [toil.deferred] Running for file /tmp/87dc614e0bb758e29408c7feecff8ab9/deferred/func6y8nwvei [2021-04-23T15:13:45+0000] [MainThread] [D] [toil.worker] Parsed job description [2021-04-23T15:13:45+0000] [MainThread] [D] [toil.worker] Job is a checkpoint [2021-04-23T15:13:45+0000] [MainThread] [D] [toil.worker] The checkpoint jobs seems to have completed okay, removing any checkpoint files to delete. [2021-04-23T15:13:45+0000] [MainThread] [I] [toil.worker] Working on job 'JobFunctionWrappingJob' kind-JobFunctionWrappingJob/instance-acedx9bf [2021-04-23T15:13:45+0000] [MainThread] [D] [toil.worker] Got a command to run: _toil files/for-job/kind-JobFunctionWrappingJob/instance-acedx9bf/cleanup/file-f7a8dcb0d153450f813d1c273d33c698/stream /builds/databiosphere/toil/src toil.test.src.jobServiceTest False [2021-04-23T15:13:45+0000] [MainThread] [D] [toil.job] Loading user module ModuleDescriptor(dirPath='/builds/databiosphere/toil/src', name='toil.test.src.jobServiceTest', fromVirtualEnv=False). Traceback (most recent call last): File "/builds/databiosphere/toil/src/toil/worker.py", line 367, in workerScript job = Job.loadJob(jobStore, jobDesc) File "/builds/databiosphere/toil/src/toil/job.py", line 2238, in loadJob jobStore.readFile(pickleFile, filename) File "/builds/databiosphere/toil/src/toil/jobStores/fileJobStore.py", line 440, in readFile self._checkJobStoreFileID(jobStoreFileID) File "/builds/databiosphere/toil/src/toil/jobStores/fileJobStore.py", line 724, in _checkJobStoreFileID raise NoSuchFileException(jobStoreFileID) toil.jobStores.abstractJobStore.NoSuchFileException: File 'files/for-job/kind-JobFunctionWrappingJob/instance-acedx9bf/cleanup/file-f7a8dcb0d153450f813d1c273d33c698/stream' does not exist. 
[2021-04-23T15:13:45+0000] [MainThread] [E] [toil.worker] Exiting the worker because of a failed job on host runner-hyeg35g-project-3-concurrent-04sp2d <========= During handling of the above exception, another exception occurred: Traceback (most recent call last): File "/builds/databiosphere/toil/src/toil/test/src/jobServiceTest.py", line 129, in testServiceRecursive self.runToil(t) File "/builds/databiosphere/toil/src/toil/test/src/jobServiceTest.py", line 182, in runToil self.fail() #Exceeded a reasonable number of restarts File "/usr/lib/python3.6/unittest/case.py", line 670, in fail raise self.failureException(msg) AssertionError: None
┆Issue is synchronized with this Jira Story ┆friendlyId: TOIL-882
This is still causing apparently spurious failures: https://ucsc-ci.com/databiosphere/toil/-/jobs/23953
➤ Melaina Legaspi commented:
We should just increase the number of restart attempts that the test's runToil helper allows before it calls self.fail().
Source: https://ucsc-ci.com/databiosphere/toil/-/jobs/82743
┆Issue is synchronized with this Jira Story ┆friendlyId: TOIL-882