ansible / receptor

Project Receptor is a flexible multi-service relayer with remote execution and orchestration capabilities linking controllers with executors across a mesh of nodes.

Work remains in "Running" state after the failure of a remote worker node #711

Open kurokobo opened 1 year ago

kurokobo commented 1 year ago

Description

If a remote worker node fails and is restarted, works that were running remain in the "Running" state forever and are never marked as "Completed" or "Failed".

If the worker process that should have been running no longer exists after the node is restarted, shouldn't the work be marked as "Failed"?

Version

Using upstream devel image from quay.io/ansible/receptor:devel.

$ docker compose exec foo receptorctl version
receptorctl  1.3.0+g8f8481c
receptor     1.3.0+g8f8481c

Steps to reproduce

(The files below are placed in a directory named `issue`, which becomes the Compose project name.)

  1. Prepare files

    • foo.yml

      ---
      - node:
          id: foo
      - log-level:
          level: debug
      - tcp-listener:
          port: 2222
      - control-service:
          service: control
          filename: /tmp/receptor.sock
    • bar.yml

      ---
      - node:
          id: bar
      - log-level:
          level: debug
      - tcp-peer:
          address: foo:2222
      - control-service:
          service: control
      - work-command:
          worktype: sleep
          command: sh
          params: -c "sleep 60; echo WORKER DONE"
    • docker-compose.yml

      services:
        foo:
          image: quay.io/ansible/receptor:devel
          command: "receptor -c /etc/receptor/receptor.conf"
          volumes:
            - "./foo.yml:/etc/receptor/receptor.conf"
        bar:
          image: quay.io/ansible/receptor:devel
          command: "receptor -c /etc/receptor/receptor.conf"
          volumes:
            - "./bar.yml:/etc/receptor/receptor.conf"
  2. Prepare environment

    # Start containers
    $ docker compose up -d
    [+] Running 3/3
    ⠿ Network issue_default  Created                                                     0.1s
    ⠿ Container issue-bar-1  Started                                                     0.5s
    ⠿ Container issue-foo-1  Started                                                     0.5s
    
    # Install `ps` command on bar for debugging
    $ docker compose exec bar dnf -y install procps
    ...
  3. Submit work

    # Submit work
    $ docker compose exec foo receptorctl work submit --node bar --no-payload sleep
    Result: Job Started
    Unit ID: 6lXrTbZq
    
    # Ensure the work is running
    $ docker compose exec foo receptorctl work list
    {
       "6lXrTbZq": {
           "Detail": "Running: PID 74",
           "ExtraData": {
               "Expiration": "0001-01-01T00:00:00Z",
               "LocalCancelled": false,
               "LocalReleased": false,
               "RemoteNode": "bar",
               "RemoteParams": {},
               "RemoteStarted": true,
               "RemoteUnitID": "RcilHQav",
               "RemoteWorkType": "sleep",
               "SignWork": false,
               "TLSClient": ""
           },
           "State": 1,
           "StateName": "Running",
           "StdoutSize": 0,
           "WorkType": "remote"
       }
    }
    
    # Ensure worker process and PID in the status above exist on bar
    $ docker compose exec bar ps -ef | cat
    UID          PID    PPID  C STIME TTY          TIME CMD
    root           1       0  0 20:53 ?        00:00:00 /usr/local/bin/dumb-init -- receptor -c /etc/receptor/receptor.conf
    root           7       1  0 20:53 ?        00:00:00 receptor -c /etc/receptor/receptor.conf
    root          67       7  0 20:56 ?        00:00:00 receptor --node id=worker --log-level debug --command-runner command=sh params=-c "sleep 60; echo WORKER DONE" unitdir=/tmp/receptor/bar/RcilHQav
    root          74      67  0 20:56 ?        00:00:00 sh -c sleep 60; echo WORKER DONE
    root          75      74  0 20:56 ?        00:00:00 /usr/bin/coreutils --coreutils-prog-shebang=sleep /usr/bin/sleep 60
    root          76       0  0 20:56 ?        00:00:00 ps -ef
  4. Restart executor node to simulate node failure

    # Stop and start bar to simulate node failure
    $ docker compose stop bar
    [+] Running 1/0
    ⠿ Container issue-bar-1  Stopped                                                     0.1s
    
    $ docker compose ps
    NAME                COMMAND                  SERVICE             STATUS              PORTS
    issue-bar-1    "/usr/local/bin/dumb…"   bar                 exited (143)        
    issue-foo-1    "/usr/local/bin/dumb…"   foo                 running             7323/tcp
    
    $ docker compose start bar
    [+] Running 1/1
    ⠿ Container issue-bar-1  Started                                                     0.3s
    
     # Ensure there is no forked worker process but the unit files still exist on bar
    $ docker compose exec bar ps -ef | cat
    UID          PID    PPID  C STIME TTY          TIME CMD
    root           1       0  0 20:56 ?        00:00:00 /usr/local/bin/dumb-init -- receptor -c /etc/receptor/receptor.conf
    root           7       1  2 20:56 ?        00:00:00 receptor -c /etc/receptor/receptor.conf
    root          14       0  0 20:56 ?        00:00:00 ps -ef
    
    $ docker compose exec bar ls -l /tmp/receptor/bar/RcilHQav
    total 4
    -rw-------. 1 root root 141 Dec 23 20:56 status
    -rw-------. 1 root root   0 Dec 23 20:56 status.lock
    -rw-------. 1 root root   0 Dec 23 20:56 stdin
    -rw-------. 1 root root   0 Dec 23 20:56 stdout
  5. Ensure the work is still in the "Running" state and is never marked as completed or failed

    $ docker compose exec foo receptorctl work list
    {
       "6lXrTbZq": {
           "Detail": "Running: PID 74",
           "ExtraData": {
               "Expiration": "0001-01-01T00:00:00Z",
               "LocalCancelled": false,
               "LocalReleased": false,
               "RemoteNode": "bar",
               "RemoteParams": {},
               "RemoteStarted": true,
               "RemoteUnitID": "RcilHQav",
               "RemoteWorkType": "sleep",
               "SignWork": false,
               "TLSClient": ""
           },
           "State": 1,
           "StateName": "Running",
           "StdoutSize": 0,
           "WorkType": "remote"
       }
    }

Additional information

With the current implementation, the sender of the work will wait forever for the completion of work that will never end.

It seems that on node failure, the goroutine that monitors the worker process is terminated along with the rest of the Receptor process, so the work unit is orphaned forever.

I think it would be more natural behavior to mark a work unit as "Failed" if, after Receptor restarts, there is no worker process referencing a unit directory that still exists on disk.

kurokobo commented 1 year ago

An actual case where this issue can cause problems:

In AWX, a job template is invoked on an Execution Node; in Receptor terms, an ansible-runner worker is invoked as command work on an executor node. While the job is running, the Execution Node is restarted for some reason such as a virtualization host failure or a power outage. In this case, the launched job in AWX stays in the running state until the job timeout, even though Ansible Runner is already down and the work is orphaned. AWX has no way to know that the job will never complete.