I believe I may have found a bug in the following case. Calling
/node/{node}/terminate
returns the expected response from the scheduler:
{ "ip" : "10.0.0.6", "hostname" : "10.0.0.6", "executorId" : "cassandra.clustername.node.2.executor", "targetRunState" : "TERMINATE" }
However, calling
/node/{node}/replace
produces the following response:
{ "ipToReplace" : "10.0.0.6", "success" : false, "reason" : "No such node" }
The scheduler will then accept the next relevant offer and bring up a new executor, which, of course, will not replace the dead Cassandra node.
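The failing step looks roughly like this (same assumptions: placeholder base URL, POST method). The point is that the replace call reports success: false with reason "No such node" even though /node/all still lists the node with targetRunState TERMINATE:

```python
import requests

SCHEDULER = "http://scheduler.example:18080"  # placeholder
NODE = "10.0.0.6"

# Assumption: the endpoint accepts POST, matching the path shown above.
reply = requests.post(f"{SCHEDULER}/node/{NODE}/replace").json()
if not reply.get("success"):
    # Observed here: {"ipToReplace": "10.0.0.6", "success": false, "reason": "No such node"}
    print(f"replace of {reply.get('ipToReplace')} failed: {reply.get('reason')}")
```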
Finally, here is the result from
/node/all
Note that in this sample, executor 2 was lost and executor 4 was then started by the scheduler:
{ "nodes": [ { "dataVolumes": [ { "path": "." } ], "healthCheckDetails": { "uptimeMillis": 2520291, "tokenCount": 256, "rpcServerRunning": true, "nativeTransportRunning": true, "gossipRunning": true, "gossipInitialized": true, "joined": true, "hostId": "54d41058-612c-4a95-b2f8-53ba11a6bb7a", "healthy": true, "msg": "", "version": "2.1.4", "operationMode": "NORMAL", "clusterName": "cassandra.clustername", "dataCenter": "dcos0", "rack": "rack0", "endpoint": "10.0.0.12" }, "lastHealthCheck": 1456192696347, "cassandraDaemonPid": 2783, "rackDc": { "dc": "dcos0", "rack": "rack0" }, "tasks": { "SERVER": { "taskName": "cassandra.clustername.node", "taskId": "cassandra.clustername.node.1.executor.server", "memMb": 2048, "diskMb": 6144, "cpuCores": 1 }, "METADATA": { "taskName": "cassandra.clustername.node.1.executor", "taskId": "cassandra.clustername.node.1.executor", "memMb": 32, "diskMb": 0, "cpuCores": 0.1 } }, "executorId": "cassandra.clustername.node.1.executor", "workdir": "/var/lib/mesos/slave/slaves/e0adbfa7-79a1-4624-8135-8dc42a02bd50-S4/frameworks/e0adbfa7-79a1-4624-8135-8dc42a02bd50-0002/executors/cassandra.clustername.node.1.executor/runs/0d9c620e-918a-468c-8b6e-fe41878406b7", "ip": "10.0.0.12", "hostname": "10.0.0.12", "targetRunState": "RUN", "jmxPort": 7199, "seedNode": true }, { "dataVolumes": [ { "path": "." } ], "healthCheckDetails": { "uptimeMillis": 1620251, "tokenCount": 256, "rpcServerRunning": true, "nativeTransportRunning": true, "gossipRunning": true, "gossipInitialized": true, "joined": true, "hostId": "584f8266-4ee5-47fc-9c4a-4a7a4d666d39", "healthy": true, "msg": "", "version": "2.1.4", "operationMode": "NORMAL", "clusterName": "cassandra.clustername", "dataCenter": "dcos0", "rack": "rack0", "endpoint": "10.0.0.6" }, "lastHealthCheck": 1456191916506, "cassandraDaemonPid": 7129, "rackDc": { "dc": "dcos0", "rack": "rack0" }, "tasks": { "SERVER": { "taskName": "cassandra.clustername.node", "taskId": "cassandra.clustername.node.2.executor.server", "memMb": 2048, "diskMb": 6144, "cpuCores": 1 }, "METADATA": { "taskName": "cassandra.clustername.node.2.executor", "taskId": "cassandra.clustername.node.2.executor", "memMb": 32, "diskMb": 0, "cpuCores": 0.1 } }, "executorId": "cassandra.clustername.node.2.executor", "workdir": "/var/lib/mesos/slave/slaves/e0adbfa7-79a1-4624-8135-8dc42a02bd50-S1/frameworks/e0adbfa7-79a1-4624-8135-8dc42a02bd50-0002/executors/cassandra.clustername.node.2.executor/runs/3c9f4f5c-30eb-4aaa-8de1-b4a42d95d197", "ip": "10.0.0.6", "hostname": "10.0.0.6", "targetRunState": "TERMINATE", "jmxPort": 7199, "seedNode": false }, { "dataVolumes": [ { "path": "." 
} ], "healthCheckDetails": { "uptimeMillis": 1139769, "tokenCount": 256, "rpcServerRunning": true, "nativeTransportRunning": true, "gossipRunning": true, "gossipInitialized": true, "joined": true, "hostId": "0be01ea8-b568-4260-a2c6-01be1652d12c", "healthy": true, "msg": "", "version": "2.1.4", "operationMode": "NORMAL", "clusterName": "cassandra.clustername", "dataCenter": "dcos0", "rack": "rack0", "endpoint": "10.0.0.108" }, "lastHealthCheck": 1456192695125, "cassandraDaemonPid": 1850, "rackDc": { "dc": "dcos0", "rack": "rack0" }, "tasks": { "SERVER": { "taskName": "cassandra.clustername.node", "taskId": "cassandra.clustername.node.3.executor.server", "memMb": 2048, "diskMb": 6144, "cpuCores": 1 }, "METADATA": { "taskName": "cassandra.clustername.node.3.executor", "taskId": "cassandra.clustername.node.3.executor", "memMb": 32, "diskMb": 0, "cpuCores": 0.1 } }, "executorId": "cassandra.clustername.node.3.executor", "workdir": "/var/lib/mesos/slave/slaves/e0adbfa7-79a1-4624-8135-8dc42a02bd50-S5/frameworks/e0adbfa7-79a1-4624-8135-8dc42a02bd50-0002/executors/cassandra.clustername.node.3.executor/runs/25afa14d-5a18-47b3-af12-a49eb53ced2b", "ip": "10.0.0.108", "hostname": "10.0.0.108", "targetRunState": "RUN", "jmxPort": 7199, "seedNode": true }, { "dataVolumes": [ { "path": "." } ], "healthCheckDetails": { "uptimeMillis": 0, "tokenCount": 0, "rpcServerRunning": false, "nativeTransportRunning": false, "gossipRunning": false, "gossipInitialized": false, "joined": false, "hostId": "", "healthy": false, "msg": "", "version": "", "operationMode": "", "clusterName": "", "dataCenter": "", "rack": "", "endpoint": "" }, "lastHealthCheck": 1456192650795, "cassandraDaemonPid": 1812, "rackDc": { "dc": "dcos0", "rack": "rack0" }, "tasks": { "SERVER": { "taskName": "cassandra.clustername.node", "taskId": "cassandra.clustername.node.4.executor.server", "memMb": 2048, "diskMb": 6144, "cpuCores": 1 }, "METADATA": { "taskName": "cassandra.clustername.node.4.executor", "taskId": "cassandra.clustername.node.4.executor", "memMb": 32, "diskMb": 0, "cpuCores": 0.1 } }, "executorId": "cassandra.clustername.node.4.executor", "workdir": "/var/lib/mesos/slave/slaves/e0adbfa7-79a1-4624-8135-8dc42a02bd50-S6/frameworks/e0adbfa7-79a1-4624-8135-8dc42a02bd50-0002/executors/cassandra.clustername.node.4.executor/runs/15c0e8d6-9994-4422-ae50-7fd598103bcc", "ip": "10.0.0.97", "hostname": "10.0.0.97", "targetRunState": "RUN", "jmxPort": 7199, "seedNode": false } ], "nodesToAcquire": 0, "replaceNodes": [] }
I just ran into this exact same scenario.
This repository is deprecated. Please use the latest supported framework here: https://github.com/mesosphere/dcos-cassandra-service.