I believe I may have found a bug in the following case. Calling
/node/{node}/terminate
returns the expected response from the scheduler:
{ "ip" : "10.0.0.6", "hostname" : "10.0.0.6", "executorId" : "cassandra.clustername.node.2.executor", "targetRunState" : "TERMINATE" }
However, calling
/node/{node}/replace
produces the following response:
{ "ipToReplace" : "10.0.0.6", "success" : false, "reason" : "No such node" }
The scheduler will then accept the next relevant offer and bring up a new executor, which, of course, will not replace the dead Cassandra node.
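The failing step looks roughly like this (same assumptions: placeholder base URL, POST method). The point is that the replace call reports success: false with reason "No such node" even though /node/all still lists the node with targetRunState TERMINATE:

```python
import requests

SCHEDULER = "http://scheduler.example:18080"  # placeholder
NODE = "10.0.0.6"

# Assumption: the endpoint accepts POST, matching the path shown above.
reply = requests.post(f"{SCHEDULER}/node/{NODE}/replace").json()
if not reply.get("success"):
    # Observed here: {"ipToReplace": "10.0.0.6", "success": false, "reason": "No such node"}
    print(f"replace of {reply.get('ipToReplace')} failed: {reply.get('reason')}")
```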
Finally, here is the result from
/node/all
Note that in this sample, executor 2 was lost and executor 4 was then started by the scheduler:
{ "nodes": [ { "dataVolumes": [ { "path": "." } ], "healthCheckDetails": { "uptimeMillis": 2520291, "tokenCount": 256, "rpcServerRunning": true, "nativeTransportRunning": true, "gossipRunning": true, "gossipInitialized": true, "joined": true, "hostId": "54d41058-612c-4a95-b2f8-53ba11a6bb7a", "healthy": true, "msg": "", "version": "2.1.4", "operationMode": "NORMAL", "clusterName": "cassandra.clustername", "dataCenter": "dcos0", "rack": "rack0", "endpoint": "10.0.0.12" }, "lastHealthCheck": 1456192696347, "cassandraDaemonPid": 2783, "rackDc": { "dc": "dcos0", "rack": "rack0" }, "tasks": { "SERVER": { "taskName": "cassandra.clustername.node", "taskId": "cassandra.clustername.node.1.executor.server", "memMb": 2048, "diskMb": 6144, "cpuCores": 1 }, "METADATA": { "taskName": "cassandra.clustername.node.1.executor", "taskId": "cassandra.clustername.node.1.executor", "memMb": 32, "diskMb": 0, "cpuCores": 0.1 } }, "executorId": "cassandra.clustername.node.1.executor", "workdir": "/var/lib/mesos/slave/slaves/e0adbfa7-79a1-4624-8135-8dc42a02bd50-S4/frameworks/e0adbfa7-79a1-4624-8135-8dc42a02bd50-0002/executors/cassandra.clustername.node.1.executor/runs/0d9c620e-918a-468c-8b6e-fe41878406b7", "ip": "10.0.0.12", "hostname": "10.0.0.12", "targetRunState": "RUN", "jmxPort": 7199, "seedNode": true }, { "dataVolumes": [ { "path": "." } ], "healthCheckDetails": { "uptimeMillis": 1620251, "tokenCount": 256, "rpcServerRunning": true, "nativeTransportRunning": true, "gossipRunning": true, "gossipInitialized": true, "joined": true, "hostId": "584f8266-4ee5-47fc-9c4a-4a7a4d666d39", "healthy": true, "msg": "", "version": "2.1.4", "operationMode": "NORMAL", "clusterName": "cassandra.clustername", "dataCenter": "dcos0", "rack": "rack0", "endpoint": "10.0.0.6" }, "lastHealthCheck": 1456191916506, "cassandraDaemonPid": 7129, "rackDc": { "dc": "dcos0", "rack": "rack0" }, "tasks": { "SERVER": { "taskName": "cassandra.clustername.node", "taskId": "cassandra.clustername.node.2.executor.server", "memMb": 2048, "diskMb": 6144, "cpuCores": 1 }, "METADATA": { "taskName": "cassandra.clustername.node.2.executor", "taskId": "cassandra.clustername.node.2.executor", "memMb": 32, "diskMb": 0, "cpuCores": 0.1 } }, "executorId": "cassandra.clustername.node.2.executor", "workdir": "/var/lib/mesos/slave/slaves/e0adbfa7-79a1-4624-8135-8dc42a02bd50-S1/frameworks/e0adbfa7-79a1-4624-8135-8dc42a02bd50-0002/executors/cassandra.clustername.node.2.executor/runs/3c9f4f5c-30eb-4aaa-8de1-b4a42d95d197", "ip": "10.0.0.6", "hostname": "10.0.0.6", "targetRunState": "TERMINATE", "jmxPort": 7199, "seedNode": false }, { "dataVolumes": [ { "path": "." 
} ], "healthCheckDetails": { "uptimeMillis": 1139769, "tokenCount": 256, "rpcServerRunning": true, "nativeTransportRunning": true, "gossipRunning": true, "gossipInitialized": true, "joined": true, "hostId": "0be01ea8-b568-4260-a2c6-01be1652d12c", "healthy": true, "msg": "", "version": "2.1.4", "operationMode": "NORMAL", "clusterName": "cassandra.clustername", "dataCenter": "dcos0", "rack": "rack0", "endpoint": "10.0.0.108" }, "lastHealthCheck": 1456192695125, "cassandraDaemonPid": 1850, "rackDc": { "dc": "dcos0", "rack": "rack0" }, "tasks": { "SERVER": { "taskName": "cassandra.clustername.node", "taskId": "cassandra.clustername.node.3.executor.server", "memMb": 2048, "diskMb": 6144, "cpuCores": 1 }, "METADATA": { "taskName": "cassandra.clustername.node.3.executor", "taskId": "cassandra.clustername.node.3.executor", "memMb": 32, "diskMb": 0, "cpuCores": 0.1 } }, "executorId": "cassandra.clustername.node.3.executor", "workdir": "/var/lib/mesos/slave/slaves/e0adbfa7-79a1-4624-8135-8dc42a02bd50-S5/frameworks/e0adbfa7-79a1-4624-8135-8dc42a02bd50-0002/executors/cassandra.clustername.node.3.executor/runs/25afa14d-5a18-47b3-af12-a49eb53ced2b", "ip": "10.0.0.108", "hostname": "10.0.0.108", "targetRunState": "RUN", "jmxPort": 7199, "seedNode": true }, { "dataVolumes": [ { "path": "." } ], "healthCheckDetails": { "uptimeMillis": 0, "tokenCount": 0, "rpcServerRunning": false, "nativeTransportRunning": false, "gossipRunning": false, "gossipInitialized": false, "joined": false, "hostId": "", "healthy": false, "msg": "", "version": "", "operationMode": "", "clusterName": "", "dataCenter": "", "rack": "", "endpoint": "" }, "lastHealthCheck": 1456192650795, "cassandraDaemonPid": 1812, "rackDc": { "dc": "dcos0", "rack": "rack0" }, "tasks": { "SERVER": { "taskName": "cassandra.clustername.node", "taskId": "cassandra.clustername.node.4.executor.server", "memMb": 2048, "diskMb": 6144, "cpuCores": 1 }, "METADATA": { "taskName": "cassandra.clustername.node.4.executor", "taskId": "cassandra.clustername.node.4.executor", "memMb": 32, "diskMb": 0, "cpuCores": 0.1 } }, "executorId": "cassandra.clustername.node.4.executor", "workdir": "/var/lib/mesos/slave/slaves/e0adbfa7-79a1-4624-8135-8dc42a02bd50-S6/frameworks/e0adbfa7-79a1-4624-8135-8dc42a02bd50-0002/executors/cassandra.clustername.node.4.executor/runs/15c0e8d6-9994-4422-ae50-7fd598103bcc", "ip": "10.0.0.97", "hostname": "10.0.0.97", "targetRunState": "RUN", "jmxPort": 7199, "seedNode": false } ], "nodesToAcquire": 0, "replaceNodes": [] }
I just ran into this exact same scenario.
This repository is deprecated. Please use the latest supported framework here: https://github.com/mesosphere/dcos-cassandra-service.