Open andrius-kulbis opened 7 years ago
We are seeing the same issue on our 13 node production cluster.
OS: Ubuntu 16.04.
Docker: 17.06.0-ce build 02c1d87.
Sort of seems like the overlay network gets cut off.
Output from worker node:
$ docker network inspect -v druid-stack_default
[
{
"Name": "druid-stack_default",
"Id": "rfjugygxri4o49i7y047ntrvm",
"Created": "2017-07-25T08:12:15.387933567Z",
"Scope": "swarm",
"Driver": "overlay",
"EnableIPv6": false,
"IPAM": {
"Driver": "default",
"Options": null,
"Config": [
{
"Subnet": "10.0.0.0/24",
"Gateway": "10.0.0.1"
}
]
},
"Internal": false,
"Attachable": false,
"Ingress": false,
"ConfigFrom": {
"Network": ""
},
"ConfigOnly": false,
"Containers": {
"e092693a5ad87e69a592c3a7148f45510f5ccfe21cff4873b3b28bfdc8e9ec6d": {
"Name": "druid-stack_middlemanager.3.xl3ibq8mxcdjmv57hjm7n5mi0",
"EndpointID": "aedc63f52a1406f72806a1a011512f902143737035316b22436c3a6a87139879",
"MacAddress": "02:42:0a:00:00:09",
"IPv4Address": "10.0.0.9/24",
"IPv6Address": ""
}
},
"Options": {
"com.docker.network.driver.overlay.vxlanid_list": "4097"
},
"Labels": {
"com.docker.stack.namespace": "druid-stack"
},
"Peers": [
{
"Name": "ip-172-31-23-4-039b67712e55",
"IP": "172.31.23.4"
},
{
"Name": "ip-172-31-25-132-5d6c985ab714",
"IP": "172.31.25.132"
}
],
"Services": {
"druid-stack_middlemanager": {
"VIP": "10.0.0.2",
"Ports": [],
"LocalLBIndex": 315,
"Tasks": [
{
"Name": "druid-stack_middlemanager.1.csffmb3y2p2c212xk76hsyv4u",
"EndpointID": "59c665d64ef0d2daaf7d24e94432d0a834019a915e1c0e8701c9c7b624f8c843",
"EndpointIP": "10.0.0.14",
"Info": {
"Host IP": "172.31.25.132"
}
},
{
"Name": "druid-stack_middlemanager.3.xl3ibq8mxcdjmv57hjm7n5mi0",
"EndpointID": "aedc63f52a1406f72806a1a011512f902143737035316b22436c3a6a87139879",
"EndpointIP": "10.0.0.9",
"Info": {
"Host IP": "172.31.23.4"
}
}
]
}
}
}
]
Output from manager node:
$ docker network inspect -v druid-stack_default
[
{
"Name": "druid-stack_default",
"Id": "rfjugygxri4o49i7y047ntrvm",
"Created": "2017-07-20T09:30:30.743573035Z",
"Scope": "swarm",
"Driver": "overlay",
"EnableIPv6": false,
"IPAM": {
"Driver": "default",
"Options": null,
"Config": [
{
"Subnet": "10.0.0.0/24",
"Gateway": "10.0.0.1"
}
]
},
"Internal": false,
"Attachable": false,
"Ingress": false,
"ConfigFrom": {
"Network": ""
},
"ConfigOnly": false,
"Containers": {
"b88b6c31656d9d20487453fd13fac34a845dcc5bed605d2df3522468b8e2052f": {
"Name": "druid-stack_overlord.1.o6fe453gc4cq6ua5ouu3x50eo",
"EndpointID": "1bb76125a0baf28a4cd9385ee1f2be2140d0ad2e9a9973b5a7fc7a2d2c71c121",
"MacAddress": "02:42:0a:00:00:13",
"IPv4Address": "10.0.0.19/24",
"IPv6Address": ""
},
"ea7c2074a12f672c451f90f190654412946a29e06b91a93d714c7ffe4c94f575": {
"Name": "druid-stack_historical.6.zssej1f3wf44me1kl0nyz0ivg",
"EndpointID": "84c2edc53a6aefc5a39a312bdef0c1dd0ed9e9b820fbc87fa1e8ac55c7070aaa",
"MacAddress": "02:42:0a:00:00:0c",
"IPv4Address": "10.0.0.12/24",
"IPv6Address": ""
}
},
"Options": {
"com.docker.network.driver.overlay.vxlanid_list": "4097"
},
"Labels": {
"com.docker.stack.namespace": "druid-stack"
},
"Peers": [
{
"Name": "ip-172-31-41-115-d5e6245fc4fa",
"IP": "172.31.41.115"
},
{
"Name": "ip-172-31-11-25-a02282c42c08",
"IP": "172.31.11.25"
},
{
"Name": "ip-172-31-5-18-59c51d47cc8a",
"IP": "172.31.5.18"
},
{
"Name": "ip-172-31-16-118-2cf9facff4fe",
"IP": "172.31.16.118"
},
{
"Name": "ip-172-31-11-207-bf72cc8c1e8a",
"IP": "172.31.11.207"
},
{
"Name": "ip-172-31-17-74-eb8ee2871bfd",
"IP": "172.31.17.74"
},
{
"Name": "ip-172-31-37-26-6b8cd598904c",
"IP": "172.31.37.26"
},
{
"Name": "ip-172-31-45-208-11b6a81db259",
"IP": "172.31.45.208"
},
{
"Name": "ip-172-31-16-61-f1501c1d73d0",
"IP": "172.31.16.61"
},
{
"Name": "ip-172-31-7-181-8f357e1d1087",
"IP": "172.31.7.181"
}
],
"Services": {
"druid-stack_broker": {
"VIP": "10.0.0.6",
"Ports": [],
"LocalLBIndex": 274,
"Tasks": [
{
"Name": "druid-stack_broker.3.vyceh5jmfkijbvfcq84s9mgfo",
"EndpointID": "65806d13a4628bf6bb300d73c85b3b7767f32a548e8b9f1289f3666b67ec642b",
"EndpointIP": "10.0.0.16",
"Info": {
"Host IP": "172.31.41.115"
}
}
]
},
"druid-stack_coordinator": {
"VIP": "10.0.0.4",
"Ports": [],
"LocalLBIndex": 267,
"Tasks": [
{
"Name": "druid-stack_coordinator.1.fc34fpkisf2b918200t42756p",
"EndpointID": "8ea604babad04ff060c398291ae159c6729a1935c8750380e9dd0f1cabfb7051",
"EndpointIP": "10.0.0.22",
"Info": {
"Host IP": "172.31.16.118"
}
}
]
},
"druid-stack_historical": {
"VIP": "10.0.0.8",
"Ports": [],
"LocalLBIndex": 261,
"Tasks": [
{
"Name": "druid-stack_historical.4.zx3d33yui3tcu5xps6xrfrp13",
"EndpointID": "841034e3517b50dc48c14fe74e555c6af394af87fc9813b782f22ab4303a02b9",
"EndpointIP": "10.0.0.18",
"Info": {
"Host IP": "172.31.37.26"
}
},
{
"Name": "druid-stack_historical.2.abs8xbmk77oymw0oeiy181nht",
"EndpointID": "2d420eed4b122d1eaa42c861369c96b2e65b9ed5f9cd48e127e68d10668d4a88",
"EndpointIP": "10.0.0.7",
"Info": {
"Host IP": "172.31.16.118"
}
},
{
"Name": "druid-stack_historical.6.zssej1f3wf44me1kl0nyz0ivg",
"EndpointID": "84c2edc53a6aefc5a39a312bdef0c1dd0ed9e9b820fbc87fa1e8ac55c7070aaa",
"EndpointIP": "10.0.0.12",
"Info": {
"Host IP": "172.31.5.18"
}
},
{
"Name": "druid-stack_historical.3.lrcrxrhsgju7760emy2stkmin",
"EndpointID": "36363d418db591f96bc13cc5807ac5f6bec01988096ca6caf5948cb38ecff711",
"EndpointIP": "10.0.0.20",
"Info": {
"Host IP": "172.31.17.74"
}
},
{
"Name": "druid-stack_historical.5.tmtc0yd2ak4su2ra3c2xphc3h",
"EndpointID": "72fd9e707e42d9e2c303713a6d3072f8c1442c1c52ba3af00b6cf9ecfaa06182",
"EndpointIP": "10.0.0.15",
"Info": {
"Host IP": "172.31.11.207"
}
},
{
"Name": "druid-stack_historical.1.d30he19w5f5n0yawgyp4z28fb",
"EndpointID": "79a167f6acd025b9466bc8adc995db70fce9b1bc8bab63b3ba1655c3d169431d",
"EndpointIP": "10.0.0.3",
"Info": {
"Host IP": "172.31.11.25"
}
}
]
},
"druid-stack_middlemanager": {
"VIP": "10.0.0.2",
"Ports": [],
"LocalLBIndex": 278,
"Tasks": [
{
"Name": "druid-stack_middlemanager.2.xynbk0bodkcb01o3hcqk3kgrw",
"EndpointID": "a1d9eaaf5a32335809a519729d3a28e4f26be62983cbffd28972211ed3b58b98",
"EndpointIP": "10.0.0.13",
"Info": {
"Host IP": "172.31.7.181"
}
}
]
},
"druid-stack_overlord": {
"VIP": "10.0.0.10",
"Ports": [],
"LocalLBIndex": 270,
"Tasks": [
{
"Name": "druid-stack_overlord.1.o6fe453gc4cq6ua5ouu3x50eo",
"EndpointID": "1bb76125a0baf28a4cd9385ee1f2be2140d0ad2e9a9973b5a7fc7a2d2c71c121",
"EndpointIP": "10.0.0.19",
"Info": {
"Host IP": "172.31.5.18"
}
}
]
}
}
}
]
You may notice that I recently restarted one worker node while debugging this issue; that is the druid-stack_middlemanager
task that actually appears in the swarm manager output.
Another point worth mentioning is that we are currently running with pretty much only swarm managers, in order to circumvent this issue. That is why the swarm worker output is so short relative to the swarm manager output.
Description
Having the following swarm setup:
And the service description:
Redis slave-master communication fails after restarting the docker process on the manager swarm node, and keeps failing until the docker process on the worker node is restarted.
Steps to reproduce the issue:
docker stack deploy -c redis-setup.yml redis
service docker restart
on the manager node.
Describe the results you received: Executing
docker exec -it <redis_master_container> ping redis-slave
gives me an error: ping: bad address 'redis-slave'
Describe the results you expected: Executing
docker exec -it <redis_master_container> ping redis-slave
shows the ping packets reaching redis-slave, as a sign that inter-container communication is working between containers on different nodes.
Additional information you deem important (e.g. issue happens only occasionally): After restarting the worker node, ping works as expected again.
Output of
docker version
:
Output of
docker info
:
UPDATE: the same issue occurs with the newest Docker 17.06.