dcos-labs / dcos-jupyterlab-service

JupyterLab Notebook for Mesosphere DC/OS
Apache License 2.0
11 stars 11 forks source link

Jupyterlab (strict mode) Marathon-lb #44

Open redpine42 opened 5 years ago

redpine42 commented 5 years ago

I'm running DCOS 1.11.6, with Jupyterlab 1.2.0-DEV.33.7 and Marathon-lb 1.12.3. DCOS is configured for strict mode, with the necessary secrets and service accounts configured. Marathon-lb is pointing to the container port 8080 and not the vip port 8888. Health checks are failing, and Jupyterlab is unreachable through marathon-lb. All the other Jupyterlab containers are showing healthy in Marathon-lb (same container and vip ports in the marathon.json.mustache file). In the past I fixed this by setting the vip and container port to the same value (8080), which seems to be the convention with Marathon-lb.

vishnu2kmohan commented 5 years ago

@redpine42 Have you looked at trying to (modify to suit and) use:

I use those ^^^ all the time for testing and they work just fine with DC/OS in strict mode and Marathon-LB (also in its own strict mode).

We have some changes planned to make the configuration of the Jupyter Service a bit more intuitive, at which time I'll be putting those changes through the paces of strict mode(s).

vishnu2kmohan commented 5 years ago

@redpine42 We've released a new version 1.3.0-0.35.4 of the service and it's now renamed to beta-mesosphere-jupyter-service in the DC/OS Catalog

Please give this new version a spin and let me know if it resolves your issues.

redpine42 commented 5 years ago

Thanks, but I'm still having problems with marathon-lb showing it down. I've tried deploying both from the Universe catalog and from a marathon file. Below is my marathon options JSON.

{
  "service": {
    "name": "/jupyter",
    "cpus": 18,
    "mem": 61440,
    "gpu": {
      "enabled": true,
      "gpus": 2
    },
    "jupyter_password": "jupyter",
    "jupyter_conf_urls": "",
    "service_account": "jupyter",
    "service_account_secret": "jupyter/sa",
    "placement_constraints": "[]",
    "user": "nobody",
    "cmd": "/usr/local/bin/start.sh ${CONDA_DIR}/bin/jupyter lab --notebook-dir=\"${MESOS_SANDBOX}\"",
    "log_level": "INFO"
  },
  "networking": {
    "cni_network_enabled": true,
    "cni_network_name": "dcos",
    "cni_network_plugin_labels": "",
    "ingress": {
      "enabled": true,
      "hostname": ""
    }
  },
  "storage": {
    "local_persistence": {
      "enabled": true,
      "volume_size": 100000,
      "volume_path": "jupyter_data"
    },
    "s3": {
      "aws_region": "us-east-1",
      "endpoint": "jupyterlab-data",
      "use_https": true,
      "verify_ssl": true
    }
  },
  "oidc": {
    "enabled": false,
    "discovery_uri": "https://keycloak.contoso.com/auth/realms/notebook/.well-known/openid-configuration",
    "client_id": "notebook",
    "client_secret": "",
    "scope": "openid profile email",
    "authorization_params": "",
    "authorized_email": "",
    "authorized_upn": "",
    "redirect_after_logout_uri": "",
    "post_logout_redirect_uri": "",
    "tls_verify": false,
    "redirect_uri": "/oidc-redirect-callback",
    "logout_path": "/logmeout",
    "token_endpoint_auth_method": "client_secret_basic",
    "use_spartan_resolver": true
  },
  "spark": {
    "spark_master_url": "mesos://zk://zk-1.zk:2181,zk-2.zk:2181,zk-3.zk:2181,zk-4.zk:2181,zk-5.zk:2181/mesos",
    "spark_conf_cores_max": 5,
    "spark_driver_cores": 2,
    "spark_conf_executor_cores": 1,
    "spark_conf_mesos_gpus_max": 0,
    "spark_conf_mesos_executor_gpus": 0,
    "spark_driver_memory": "6g",
    "spark_conf_executor_memory": "6g",
    "spark_conf_eventlog_enabled": true,
    "spark_conf_eventlog_dir": "/mnt/mesos/sandbox/",
    "start_spark_history_server": true,
    "spark_history_fs_logdirectory": "/mnt/mesos/sandbox/",
    "spark_conf_jars_packages": "",
    "spark_conf_mesos_principal": "",
    "spark_conf_mesos_role": "",
    "spark_conf_mesos_driver_labels": "",
    "spark_conf_mesos_task_labels": "",
    "spark_conf_executor_krb5_config": "/mnt/mesos/sandbox/krb5.conf",
    "spark_conf_spark_scheduler_min_registered_resources_ratio": 1,
    "spark_conf_mesos_containerizer": "mesos",
    "spark_conf_hadoop_fs_s3a_aws_credentials_provider": "com.amazonaws.auth.InstanceProfileCredentialsProvider",
    "spark_driver_java_options": "-server -XX:+UseG1GC -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/mnt/mesos/sandbox",
    "spark_conf_executor_java_options": "-server -XX:+UseG1GC -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/mnt/mesos/sandbox",
    "spark_conf_mesos_executor_home": "/opt/spark",
    "spark_conf_executor_java_home": "/opt/jdk",
    "spark_conf_executor_hadoop_hdfs_home": "/opt/hadoop",
    "spark_conf_executor_hadoop_opts": "-Djava.library.path=/opt/hadoop/lib/native -Djava.security.krb5.conf=/mnt/mesos/sandbox/krb5.conf",
    "spark_conf_mesos_executor_docker_forcepullimage": false,
    "spark_user": "nobody"
  },
  "advanced": {
    "force_pull_jupyter_image": false,
    "force_pull_worker_image": false,
    "home": "/mnt/mesos/sandbox",
    "sandbox": "/mnt/mesos/sandbox",
    "hadoop_conf_dir": "/mnt/mesos/sandbox",
    "jupyter_config_dir": "/mnt/mesos/sandbox/.jupyter",
    "jupyter_runtime_dir": "/mnt/mesos/sandbox/.local/share/jupyter/runtime",
    "conda_envs_path": "/mnt/mesos/sandbox/conda/envs:/opt/conda/envs",
    "conda_pkgs_dir": "/mnt/mesos/sandbox/conda/pkgs:/opt/conda/pkgs",
    "dcos_dir": "/mnt/mesos/sandbox/.dcos",
    "java_opts": "-server -XX:+UseG1GC -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/mnt/mesos/sandbox",
    "nginx_log_level": "warn",
    "spark_monitor_enabled": true,
    "start_dask_distributed": false,
    "start_ray_head_node": false,
    "start_tensorboard": true,
    "tensorboard_logdir": "/mnt/mesos/sandbox",
    "term": "xterm-256color"
  },
  "healthChecks": [
    {
      "portIndex": 0,
      "protocol": "MESOS_HTTP",
      "path": "/healthz",
      "gracePeriodSeconds": 30,
      "intervalSeconds": 20,
      "timeoutSeconds": 10,
      "maxConsecutiveFailures": 3
    }
  ],
  "labels": {
    "MARATHON_SINGLE_INSTANCE_APP": "true",
    "HAPROXY_GROUP": "external",
    "HAPROXY_0_ENABLED": "true",
    "HAPROXY_0_REDIRECT_TO_HTTPS": "true",
    "HAPROXY_0_VHOST": "jupyter.redpine.com"
  }

}
vishnu2kmohan commented 5 years ago

@redpine42 We completely ignore this labels section since it's not part of the config.json schema:

  "labels": {
    "MARATHON_SINGLE_INSTANCE_APP": "true",
    "HAPROXY_GROUP": "external",
    "HAPROXY_0_ENABLED": "true",
    "HAPROXY_0_REDIRECT_TO_HTTPS": "true",
    "HAPROXY_0_VHOST": "jupyter.redpine.com"
  }

You need to set networking.ingress.hostname:

  "networking": {
    "cni_network_enabled": true,
    "cni_network_name": "dcos",
    "cni_network_plugin_labels": "",
    "ingress": {
      "enabled": true,
      "hostname": "jupyter.redpine.com"
    }
  },

And then, assuming your Public Agent(s) (or Elastic Load Balancer that fronts your Public Agent(s)) have a DNS A (or CNAME) record for jupyter.redpine.com pointing to its IP(s), you should be able to access your Notebook at https://jupyter.redpine.com/jupyter

We also ignore this healthChecks stanza:

  "healthChecks": [
    {
      "portIndex": 0,
      "protocol": "MESOS_HTTP",
      "path": "/healthz",
      "gracePeriodSeconds": 30,
      "intervalSeconds": 20,
      "timeoutSeconds": 10,
      "maxConsecutiveFailures": 3
    }
  ],
redpine42 commented 5 years ago

I added the health checks and labels based upon the cni example you provided. Other than that the rest is what comes from the gui generated json. I'll try adding the networking section.

redpine42 commented 5 years ago

In the networking section, I did have the hostname set on a previous test, with no luck.

I removed healthChecks and labels, and added the hostname, but marathon-lb still shows the backend down even though DC/OS shows jupyter as up. It returns a 503 when I hit https://jupyter.redpine.com. I did have all of this working before; the only difference was that I was using Traefik, which I wasn't able to get working in strict mode.

vishnu2kmohan commented 5 years ago

The schema for the options.json (Catalog Package Deployment UI) is not the same as the rendered Marathon App JSON.

We'll need to see the full (obfuscated) rendered Marathon JSON files for both your Marathon-LB and Mesosphere Jupyter Service to see what may be happening @redpine42

redpine42 commented 5 years ago

It's a pretty standard marathon-lb. I've hooked up other projects, such as jenkins. Here is the json for it pulled from the ui.

{
  "marathon-lb": {
    "auto-assign-service-ports": false,
    "bind-http-https": true,
    "cpus": 2,
    "haproxy_global_default_options": "redispatch,http-server-close,dontlognull",
    "haproxy-group": "external",
    "haproxy-map": true,
    "instances": 2,
    "mem": 1024,
    "minimumHealthCapacity": 0.5,
    "maximumOverCapacity": 0.2,
    "name": "marathon-lb",
    "parameters": [],
    "role": "slave_public",
    "strict-mode": false,
    "sysctl-params": "net.ipv4.tcp_tw_reuse=1 net.ipv4.tcp_fin_timeout=30 net.ipv4.tcp_max_syn_backlog=10240 net.ipv4.tcp_max_tw_buckets=400000 net.ipv4.tcp_max_orphans=60000 net.core.somaxconn=10000",
    "container-syslogd": false,
    "max-reload-retries": 10,
    "reload-interval": 10,
    "template-url": "",
    "marathon-uri": "https://marathon.mesos:8443",
    "secret_name": "marathon-lb/sa"
  }
}

Here is Jupyter pulled from the DC/OS UI

{
  "service": {
    "name": "/jupyter",
    "cpus": 18,
    "mem": 61440,
    "gpu": {
      "enabled": true,
      "gpus": 2
    },
    "jupyter_password": "jupyter",
    "jupyter_conf_urls": "",
    "service_account": "jupyter",
    "service_account_secret": "jupyter/sa",
    "placement_constraints": "[]",
    "user": "nobody",
    "cmd": "/usr/local/bin/start.sh ${CONDA_DIR}/bin/jupyter lab --notebook-dir=\"${MESOS_SANDBOX}\"",
    "log_level": "INFO"
  },
  "networking": {
    "cni_network_enabled": true,
    "cni_network_name": "dcos",
    "cni_network_plugin_labels": "",
    "ingress": {
      "enabled": true,
      "hostname": "jupyter.redpine.com"
    }
  },
  "storage": {
    "local_persistence": {
      "enabled": true,
      "volume_size": 100000,
      "volume_path": "jupyter_data"
    },
    "s3": {
      "aws_region": "us-east-1",
      "endpoint": "jupyterlab-data",
      "use_https": true,
      "verify_ssl": true
    }
  },
  "oidc": {
    "enabled": false,
    "discovery_uri": "https://keycloak.contoso.com/auth/realms/notebook/.well-known/openid-configuration",
    "client_id": "notebook",
    "client_secret": "",
    "scope": "openid profile email",
    "authorization_params": "",
    "authorized_email": "",
    "authorized_upn": "",
    "redirect_after_logout_uri": "",
    "post_logout_redirect_uri": "",
    "tls_verify": false,
    "redirect_uri": "/oidc-redirect-callback",
    "logout_path": "/logmeout",
    "token_endpoint_auth_method": "client_secret_basic",
    "use_spartan_resolver": true
  },
  "spark": {
    "spark_master_url": "mesos://zk://zk-1.zk:2181,zk-2.zk:2181,zk-3.zk:2181,zk-4.zk:2181,zk-5.zk:2181/mesos",
    "spark_conf_cores_max": 5,
    "spark_driver_cores": 2,
    "spark_conf_executor_cores": 1,
    "spark_conf_mesos_gpus_max": 0,
    "spark_conf_mesos_executor_gpus": 0,
    "spark_driver_memory": "6g",
    "spark_conf_executor_memory": "6g",
    "spark_conf_eventlog_enabled": true,
    "spark_conf_eventlog_dir": "/mnt/mesos/sandbox/",
    "start_spark_history_server": true,
    "spark_history_fs_logdirectory": "/mnt/mesos/sandbox/",
    "spark_conf_jars_packages": "",
    "spark_conf_mesos_principal": "",
    "spark_conf_mesos_role": "",
    "spark_conf_mesos_driver_labels": "",
    "spark_conf_mesos_task_labels": "",
    "spark_conf_executor_krb5_config": "/mnt/mesos/sandbox/krb5.conf",
    "spark_conf_spark_scheduler_min_registered_resources_ratio": 1,
    "spark_conf_mesos_containerizer": "mesos",
    "spark_conf_hadoop_fs_s3a_aws_credentials_provider": "com.amazonaws.auth.InstanceProfileCredentialsProvider",
    "spark_driver_java_options": "-server -XX:+UseG1GC -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/mnt/mesos/sandbox",
    "spark_conf_executor_java_options": "-server -XX:+UseG1GC -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/mnt/mesos/sandbox",
    "spark_conf_mesos_executor_home": "/opt/spark",
    "spark_conf_executor_java_home": "/opt/jdk",
    "spark_conf_executor_hadoop_hdfs_home": "/opt/hadoop",
    "spark_conf_executor_hadoop_opts": "-Djava.library.path=/opt/hadoop/lib/native -Djava.security.krb5.conf=/mnt/mesos/sandbox/krb5.conf",
    "spark_conf_mesos_executor_docker_forcepullimage": false,
    "spark_user": "nobody"
  },
  "advanced": {
    "force_pull_jupyter_image": false,
    "force_pull_worker_image": false,
    "home": "/mnt/mesos/sandbox",
    "sandbox": "/mnt/mesos/sandbox",
    "hadoop_conf_dir": "/mnt/mesos/sandbox",
    "jupyter_config_dir": "/mnt/mesos/sandbox/.jupyter",
    "jupyter_runtime_dir": "/mnt/mesos/sandbox/.local/share/jupyter/runtime",
    "conda_envs_path": "/mnt/mesos/sandbox/conda/envs:/opt/conda/envs",
    "conda_pkgs_dir": "/mnt/mesos/sandbox/conda/pkgs:/opt/conda/pkgs",
    "dcos_dir": "/mnt/mesos/sandbox/.dcos",
    "java_opts": "-server -XX:+UseG1GC -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/mnt/mesos/sandbox",
    "nginx_log_level": "warn",
    "spark_monitor_enabled": true,
    "start_dask_distributed": false,
    "start_ray_head_node": false,
    "start_tensorboard": true,
    "tensorboard_logdir": "/mnt/mesos/sandbox",
    "term": "xterm-256color"
  }
}