Closed messersc closed 2 years ago
The current HEAD should not have this problem any more.
Our local deployment has the line indented by two levels / eight spaces.
I'm actually still seeing similar behaviour as this on HEAD... Is there a more robust way of dealing with this?
Submitted job 84 with external jobid '40190775'.
----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 44616)
Traceback (most recent call last):
File "/home/mihall/sw/miniconda3/envs/amplicon/lib/python3.10/socketserver.py", line 316, in _handle_request_noblock
self.process_request(request, client_address)
File "/home/mihall/sw/miniconda3/envs/amplicon/lib/python3.10/socketserver.py", line 347, in process_request
self.finish_request(request, client_address)
File "/home/mihall/sw/miniconda3/envs/amplicon/lib/python3.10/socketserver.py", line 360, in finish_request
self.RequestHandlerClass(request, client_address, self)
File "/home/mihall/sw/miniconda3/envs/amplicon/lib/python3.10/socketserver.py", line 747, in __init__
self.handle()
File "/home/mihall/sw/miniconda3/envs/amplicon/lib/python3.10/http/server.py", line 432, in handle
self.handle_one_request()
File "/home/mihall/sw/miniconda3/envs/amplicon/lib/python3.10/http/server.py", line 420, in handle_one_request
method()
File "/home/mihall/.config/snakemake/slurm.punim1637/slurm-sidecar.py", line 213, in do_GET
status = self.server.poll_thread.get_state(job_id)
File "/home/mihall/.config/snakemake/slurm.punim1637/slurm-sidecar.py", line 106, in get_state
self.states[jobid] = self._get_state_sacct(jobid)
File "/home/mihall/.config/snakemake/slurm.punim1637/slurm-sidecar.py", line 133, in _get_state_sacct
parsed = {x.split("|")[0]: x.split("|")[1] for x in output.strip().split("\n")}
File "/home/mihall/.config/snakemake/slurm.punim1637/slurm-sidecar.py", line 133, in <dictcomp>
parsed = {x.split("|")[0]: x.split("|")[1] for x in output.strip().split("\n")}
IndexError: list index out of range
----------------------------------------
slurm-status.py: could not query side car: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
list index out of range
Came across this again today. cc @holtgrewe
@mbhall88 meh
I'm in the middle of something right now. I'm actually using the following patch and it works pretty robustly. Does this help?
diff --git a/{{cookiecutter.profile_name}}/slurm-sidecar.py b/{{cookiecutter.profile_name}}/slurm-sidecar.py
index af18c12..a3daaaa 100755
--- a/{{cookiecutter.profile_name}}/slurm-sidecar.py
+++ b/{{cookiecutter.profile_name}}/slurm-sidecar.py
@@ -1,9 +1,9 @@
#!/usr/bin/env python3
"""Run a Snakemake v7+ sidecar process for Slurm
-This sidecar process will poll ``squeue --user [user] --format='%i,%T'``
-every 60 seconds by default (use environment variable
-``SNAKEMAKE_SLURM_SQUEUE_WAIT`` for adjusting this).
+This sidecar process will poll ``squeue --me --format='%i,%T'`` every 60
+seconds by default (use environment variable ``SNAKEMAKE_SLURM_SQUEUE_WAIT``
+for adjusting this).
Note that you have to adjust the value to fit to your ``MinJobAge`` Slurm
configuration. Jobs remain at least ``MinJobAge`` seconds known to the
@@ -63,7 +63,7 @@ class PollSqueueThread(threading.Thread):
squeue_cmd,
squeue_timeout=2,
sleep_time=0.01,
- max_tries=3,
+ max_tries=10,
*args,
**kwargs
):
@@ -103,7 +103,10 @@ class PollSqueueThread(threading.Thread):
"""Return the job state for the given jobid."""
jobid = str(jobid)
if jobid not in self.states:
- self.states[jobid] = self._get_state_sacct(jobid)
+ try:
+ self.states[jobid] = self._get_state_sacct(jobid)
+ except:
+ return "__not_seen_yet__"
return self.states.get(jobid, "__not_seen_yet__")
def register_job(self, jobid):
@@ -122,17 +125,22 @@ class PollSqueueThread(threading.Thread):
try:
logger.debug("Calling %s (try %d)", cmd, try_num)
output = subprocess.check_output(cmd, timeout=self.squeue_timeout, text=True)
- break
except subprocess.TimeoutExpired as e:
- logger.debug("Call to %s timed out (try %d of %d)", cmd, try_num, self.max_tries)
+ logger.warning("Call to %s timed out (try %d of %d)", cmd, try_num, self.max_tries)
+ continue
except subprocess.CalledProcessError as e:
- logger.debug("Call to %s failed (try %d of %d)", cmd, try_num, self.max_tries)
- if try_num >= self.max_tries:
- raise Exception("Problem with call to %s" % cmd)
- else:
- parsed = {x.split("|")[0]: x.split("|")[1] for x in output.strip().split("\n")}
- logger.debug("Returning state of %s as %s", jobid, parsed[jobid])
- return parsed[jobid]
+ logger.warning("Call to %s failed (try %d of %d)", cmd, try_num, self.max_tries)
+ continue
+ try:
+ parsed = {x.split("|")[0]: x.split("|")[1] for x in output.strip().split("\n")}
+ logger.debug("Returning state of %s as %s", jobid, parsed[jobid])
+ return parsed[jobid]
+ except IndexError:
+ logger.warning("Could not parse %s (try %d of %d)", repr(output), try_num, self.max_tries)
+ secs = try_num / 2.0
logger.info("Sleeping %f seconds", secs)
+ time.sleep(secs)
+ raise Exception("Problem with call to %s" % cmd)
def stop(self):
"""Flag thread to stop execution"""
@@ -143,7 +151,7 @@ class PollSqueueThread(threading.Thread):
"""Run the call to ``squeue``"""
cluster = CookieCutter.get_cluster_option()
try_num = 0
- cmd = [SQUEUE_CMD, "--user={}".format(os.environ.get("USER")), "--format=%i,%T", "--state=all"]
+ cmd = [SQUEUE_CMD, "--me", "--format=%i,%T", "--state=all"]
if cluster:
cmd.append(cluster)
while try_num < self.max_tries:
@@ -209,6 +217,12 @@ class JobStateHttpHandler(http.server.BaseHTTPRequestHandler):
return
# Otherwise, query job ID status
job_id = self.path[len("/job/status/") :]
+ try:
+ logger.debug("CLEMENS:")
+ logger.debug(job_id)
+ job_id=job_id.split("%20")[3]
+ except IndexError:
+ pass
logger.debug("Querying for job ID %s" % repr(job_id))
status = self.server.poll_thread.get_state(job_id)
logger.debug("Status: %s" % status)
@@ -286,8 +300,8 @@ def main():
poll_thread = PollSqueueThread(SQUEUE_WAIT, SQUEUE_CMD, name="poll-squeue")
poll_thread.start()
- # Initialize HTTP server that makes available the output of ``squeue --user [user]``
- # in a controlled fashion.
+ # Initialize HTTP server that makes available the output of ``squeue --me`` in a
+ # controlled fashion.
http_server = JobStateHttpServer(poll_thread)
http_thread = threading.Thread(name="http-server", target=http_server.serve_forever)
http_thread.start()
I'll give it a go, thanks
I was getting this error a lot
parsed = {x.split("|")[0]: x.split("|")[1] for x in output.strip().split("\n")}
IndexError: list index out of range
And that patch seems to have fixed it. I occasionally see warnings like Could not parse '' (try 1 of 10)
, but the sidecar stays running and soon after the job is probably recorded.
The patch worked for me too. I'll open a PR with that patch applied
Hi,
we have problems with the sidecar crashing. This happens when the response was empty as far as I can see from the logs. We are not sure why the response might be empty yet, but this case should be handled better. Log is below, I can post a PR if we decide on the best course of action.
Thanks, Clemens