mesosphere / marathon-lb

Marathon-lb is a service discovery & load balancing tool for DC/OS

Zero downtime deployments result in 503s #657

Open sgvandijk opened 4 years ago

sgvandijk commented 4 years ago

We are trying to use zero-downtime deployments (ZDDs). However, each deployment has a brief period in which the old instances are put into maintenance mode in HAProxy while the new instances are still coming up, so requests return 503 responses until the new version is fully online.

Versions:

- DC/OS: 1.12.0
- Mesos: 1.7.1
- Marathon: 1.7.174
- Marathon-LB: 1.12.3 (also tested with 1.14.1)

At the time of deployment, `dcos marathon app list --json` gives the following for the running (blue) app:

  {
    "backoffFactor": 1.15,
    "backoffSeconds": 1,
    "constraints": [
      [
        "server_group",
        "IS",
        "PrivateAgentServerGroup"
      ]
    ],
    "container": {
      "docker": {
        "forcePullImage": false,
        "image": "<redacted>",
        "parameters": [
          {
            "key": "tty",
            "value": "true"
          },
          {
            "key": "log-driver",
            "value": "none"
          }
        ],
        "privileged": false
      },
      "portMappings": [
        {
          "containerPort": 80,
          "hostPort": 0,
          "labels": {},
          "protocol": "tcp",
          "servicePort": 10002
        }
      ],
      "type": "DOCKER",
      "volumes": []
    },
    "cpus": 0.5,
    "deployments": [],
    "disk": 0,
    "env": {
      "AWS_ACCESS_KEY_ID": "<redacted>",
      "AWS_SECRET_ACCESS_KEY": "<redacted>"
    },
    "executor": "",
    "gpus": 0,
    "healthChecks": [
      {
        "delaySeconds": 15,
        "gracePeriodSeconds": 300,
        "intervalSeconds": 60,
        "ipProtocol": "IPv4",
        "maxConsecutiveFailures": 3,
        "path": "/appname/health",
        "portIndex": 0,
        "protocol": "MESOS_HTTP",
        "timeoutSeconds": 20
      }
    ],
    "id": "/appname-blue",
    "instances": 1,
    "killSelection": "YOUNGEST_FIRST",
    "labels": {
      "HAPROXY_0_ENABLED": "true",
      "HAPROXY_0_PATH": "/appname",
      "HAPROXY_0_PORT": "10002",
      "HAPROXY_0_VHOST": "<redacted>",
      "HAPROXY_APP_ID": "appname",
      "HAPROXY_DEPLOYMENT_ALT_PORT": "10003",
      "HAPROXY_DEPLOYMENT_COLOR": "blue",
      "HAPROXY_DEPLOYMENT_GROUP": "appname",
      "HAPROXY_DEPLOYMENT_STARTED_AT": "2019-11-21T14:21:31.356274",
      "HAPROXY_DEPLOYMENT_TARGET_INSTANCES": "1",
      "HAPROXY_GROUP": "external"
    },
    "maxLaunchDelaySeconds": 3600,
    "mem": 2048,
    "networks": [
      {
        "mode": "container/bridge"
      }
    ],
    "requirePorts": false,
    "tasksHealthy": 1,
    "tasksRunning": 1,
    "tasksStaged": 0,
    "tasksUnhealthy": 0,
    "unreachableStrategy": {
      "expungeAfterSeconds": 0,
      "inactiveAfterSeconds": 0
    },
    "upgradeStrategy": {
      "maximumOverCapacity": 1,
      "minimumHealthCapacity": 1
    },
    "version": "2019-11-21T14:21:32.151Z",
    "versionInfo": {
      "lastConfigChangeAt": "2019-11-21T14:21:32.151Z",
      "lastScalingAt": "2019-11-21T14:21:32.151Z"
    }
  }
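
For reference, the same state can also be read directly from the Marathon REST API (`GET /v2/apps/<app-id>`). A minimal sketch, with the master URL as a placeholder:

```python
# Sketch only: read the blue app's state from the Marathon REST API
# (GET /v2/apps/<app-id>). The master URL below is a placeholder.
import json
import urllib.request

MARATHON_URL = "http://master.mesos:8080"  # placeholder, adjust to your cluster

def get_app(app_id: str) -> dict:
    """Fetch a single app definition plus task counts from Marathon."""
    with urllib.request.urlopen(f"{MARATHON_URL}/v2/apps{app_id}") as resp:
        return json.load(resp)["app"]

if __name__ == "__main__":
    blue = get_app("/appname-blue")
    # The HAPROXY_DEPLOYMENT_* labels are what drive marathon-lb's blue/green handling.
    print(blue["labels"]["HAPROXY_DEPLOYMENT_COLOR"], blue["tasksHealthy"])
```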

The new version gets deployed by posting the following configuration to Marathon:

{
  "id": "/appname-green",
  "cpus": 0.5,
  "mem": 2048,
  "instances": 1,
  "container": {
    "type": "DOCKER",
    "docker": {
      "image": "<redacted>",
      "network": "BRIDGE",
      "portMappings": [
        {
          "hostPort": 0,
          "containerPort": 80,
          "servicePort": 10003
        }
      ],
      "parameters": [
        {
          "key": "tty",
          "value": "true"
        },
        {
          "key": "log-driver",
          "value": "none"
        }
      ]
    }
  },
  "constraints": [
    [
      "server_group",
      "IS",
      "PrivateAgentServerGroup"
    ]
  ],
  "labels": {
    "HAPROXY_GROUP": "external",
    "HAPROXY_0_VHOST": "<redacted>",
    "HAPROXY_0_PATH": "/appname",
    "HAPROXY_DEPLOYMENT_ALT_PORT": "10003",
    "HAPROXY_0_ENABLED": "true",
    "HAPROXY_APP_ID": "appname",
    "HAPROXY_DEPLOYMENT_GROUP": "appname",
    "HAPROXY_DEPLOYMENT_COLOR": "green",
    "HAPROXY_0_PORT": "10002",
    "HAPROXY_DEPLOYMENT_TARGET_INSTANCES": "1",
    "HAPROXY_DEPLOYMENT_STARTED_AT": "2019-11-21T14:58:20.311986"
  },
  "env": {
    "AWS_ACCESS_KEY_ID": "<redacted>",
    "AWS_SECRET_ACCESS_KEY": "<redacted>"
  },
  "healthChecks": [
    {
      "gracePeriodSeconds": 300,
      "intervalSeconds": 60,
      "maxConsecutiveFailures": 3,
      "portIndex": 0,
      "timeoutSeconds": 20,
      "delaySeconds": 15,
      "protocol": "MESOS_HTTP",
      "path": "/appname/health",
      "ipProtocol": "IPv4"
    }
  ]
}

AFAICT this follows the logic in the README and zdd.py.
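
For completeness, submitting the green app boils down to POSTing that JSON to Marathon's `/v2/apps` endpoint (zdd.py does this plus the health polling and port swap). A minimal sketch, with the master URL and file name as placeholders:

```python
# Sketch of the submission step only: POST the green app definition to
# Marathon's /v2/apps endpoint. Master URL and file name are placeholders;
# zdd.py additionally polls for health and swaps ports, which is omitted here.
import json
import urllib.request

MARATHON_URL = "http://master.mesos:8080"  # placeholder

def deploy_app(app_json_path: str) -> dict:
    with open(app_json_path) as f:
        app_def = json.load(f)
    req = urllib.request.Request(
        f"{MARATHON_URL}/v2/apps",
        data=json.dumps(app_def).encode(),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req) as resp:
        return json.load(resp)  # echoes the app, including its deployment id

if __name__ == "__main__":
    print(deploy_app("appname-green.json"))
```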

Checking the HAProxy stats directly after submission shows that the previous (blue) instance is put into maintenance mode immediately, while the new (green) instance is still coming up:

[screenshot: HAProxy stats with the blue backend in MAINT and the green backend not yet up]
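
The same state can be captured non-interactively from the stats CSV that marathon-lb's HAProxy exposes on port 9090; a sketch, with the marathon-lb host as a placeholder:

```python
# Sketch: print per-server status from the HAProxy stats CSV that marathon-lb
# exposes (default :9090/haproxy?stats;csv). The host is a placeholder.
import csv
import io
import urllib.request

STATS_URL = "http://marathon-lb.marathon.mesos:9090/haproxy?stats;csv"  # placeholder

def server_statuses(prefix: str = "appname") -> None:
    with urllib.request.urlopen(STATS_URL) as resp:
        text = resp.read().decode()
    # The header line starts with "# pxname,svname,..."; strip the leading "# ".
    reader = csv.DictReader(io.StringIO(text.lstrip("# ")))
    for row in reader:
        if row["pxname"].startswith(prefix) and row["svname"] not in ("FRONTEND", "BACKEND"):
            print(row["pxname"], row["svname"], row["status"])  # e.g. UP / MAINT / DOWN

if __name__ == "__main__":
    server_statuses()
```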

If the green instance takes a while to come up, it is additionally marked as DOWN:

[screenshot: HAProxy stats with the green backend marked DOWN]

During this window, until the green instance is up, requests to the app return 503 errors.
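
That window is easy to see by polling the app through marathon-lb while the deployment runs; a small sketch, with the URL as a placeholder:

```python
# Sketch: poll the app through marathon-lb during the deployment and log any
# non-200 responses, to measure the 503 window. The URL is a placeholder.
import time
import urllib.error
import urllib.request

APP_URL = "http://marathon-lb.example.com/appname/health"  # placeholder

while True:
    try:
        code = urllib.request.urlopen(APP_URL, timeout=5).status
    except urllib.error.HTTPError as e:
        code = e.code  # 503 while blue is in MAINT and green is not yet UP
    except urllib.error.URLError:
        code = None  # connection-level failure
    if code != 200:
        print(time.strftime("%H:%M:%S"), code)
    time.sleep(1)
```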

Happy to provide any more info if needed!