Open redpine42 opened 5 years ago
@redpine42 Have you looked at trying to (modify to suit and) use:
I use those ^^^ all the time for testing and they work just fine with DC/OS in strict
mode and Marathon-LB (also in its own strict
mode).
We have some changes planned to make the configuration of the Jupyter Service a bit more intuitive, at which time I'll be putting those changes through the paces of strict
mode(s).
@redpine42 We've released a new version 1.3.0-0.35.4
of the service and it's now renamed to beta-mesosphere-jupyter-service
in the DC/OS Catalog
Please give this new version a spin and let me know if it resolves your issues.
Thanks, but I'm still having problems with marathon-lb showing it down. I've tried both from the Universe catalog and a marathon file. Below is my marathon config.
{
"service": {
"name": "/jupyter",
"cpus": 18,
"mem": 61440,
"gpu": {
"enabled": true,
"gpus": 2
},
"jupyter_password": "jupyter",
"jupyter_conf_urls": "",
"service_account": "jupyter",
"service_account_secret": "jupyter/sa",
"placement_constraints": "[]",
"user": "nobody",
"cmd": "/usr/local/bin/start.sh ${CONDA_DIR}/bin/jupyter lab --notebook-dir=\"${MESOS_SANDBOX}\"",
"log_level": "INFO"
},
"networking": {
"cni_network_enabled": true,
"cni_network_name": "dcos",
"cni_network_plugin_labels": "",
"ingress": {
"enabled": true,
"hostname": ""
}
},
"storage": {
"local_persistence": {
"enabled": true,
"volume_size": 100000,
"volume_path": "jupyter_data"
},
"s3": {
"aws_region": "us-east-1",
"endpoint": "jupyterlab-data",
"use_https": true,
"verify_ssl": true
}
},
"oidc": {
"enabled": false,
"discovery_uri": "https://keycloak.contoso.com/auth/realms/notebook/.well-known/openid-configuration",
"client_id": "notebook",
"client_secret": "",
"scope": "openid profile email",
"authorization_params": "",
"authorized_email": "",
"authorized_upn": "",
"redirect_after_logout_uri": "",
"post_logout_redirect_uri": "",
"tls_verify": false,
"redirect_uri": "/oidc-redirect-callback",
"logout_path": "/logmeout",
"token_endpoint_auth_method": "client_secret_basic",
"use_spartan_resolver": true
},
"spark": {
"spark_master_url": "mesos://zk://zk-1.zk:2181,zk-2.zk:2181,zk-3.zk:2181,zk-4.zk:2181,zk-5.zk:2181/mesos",
"spark_conf_cores_max": 5,
"spark_driver_cores": 2,
"spark_conf_executor_cores": 1,
"spark_conf_mesos_gpus_max": 0,
"spark_conf_mesos_executor_gpus": 0,
"spark_driver_memory": "6g",
"spark_conf_executor_memory": "6g",
"spark_conf_eventlog_enabled": true,
"spark_conf_eventlog_dir": "/mnt/mesos/sandbox/",
"start_spark_history_server": true,
"spark_history_fs_logdirectory": "/mnt/mesos/sandbox/",
"spark_conf_jars_packages": "",
"spark_conf_mesos_principal": "",
"spark_conf_mesos_role": "",
"spark_conf_mesos_driver_labels": "",
"spark_conf_mesos_task_labels": "",
"spark_conf_executor_krb5_config": "/mnt/mesos/sandbox/krb5.conf",
"spark_conf_spark_scheduler_min_registered_resources_ratio": 1,
"spark_conf_mesos_containerizer": "mesos",
"spark_conf_hadoop_fs_s3a_aws_credentials_provider": "com.amazonaws.auth.InstanceProfileCredentialsProvider",
"spark_driver_java_options": "-server -XX:+UseG1GC -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/mnt/mesos/sandbox",
"spark_conf_executor_java_options": "-server -XX:+UseG1GC -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/mnt/mesos/sandbox",
"spark_conf_mesos_executor_home": "/opt/spark",
"spark_conf_executor_java_home": "/opt/jdk",
"spark_conf_executor_hadoop_hdfs_home": "/opt/hadoop",
"spark_conf_executor_hadoop_opts": "-Djava.library.path=/opt/hadoop/lib/native -Djava.security.krb5.conf=/mnt/mesos/sandbox/krb5.conf",
"spark_conf_mesos_executor_docker_forcepullimage": false,
"spark_user": "nobody"
},
"advanced": {
"force_pull_jupyter_image": false,
"force_pull_worker_image": false,
"home": "/mnt/mesos/sandbox",
"sandbox": "/mnt/mesos/sandbox",
"hadoop_conf_dir": "/mnt/mesos/sandbox",
"jupyter_config_dir": "/mnt/mesos/sandbox/.jupyter",
"jupyter_runtime_dir": "/mnt/mesos/sandbox/.local/share/jupyter/runtime",
"conda_envs_path": "/mnt/mesos/sandbox/conda/envs:/opt/conda/envs",
"conda_pkgs_dir": "/mnt/mesos/sandbox/conda/pkgs:/opt/conda/pkgs",
"dcos_dir": "/mnt/mesos/sandbox/.dcos",
"java_opts": "-server -XX:+UseG1GC -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/mnt/mesos/sandbox",
"nginx_log_level": "warn",
"spark_monitor_enabled": true,
"start_dask_distributed": false,
"start_ray_head_node": false,
"start_tensorboard": true,
"tensorboard_logdir": "/mnt/mesos/sandbox",
"term": "xterm-256color"
},
"healthChecks": [
{
"portIndex": 0,
"protocol": "MESOS_HTTP",
"path": "/healthz",
"gracePeriodSeconds": 30,
"intervalSeconds": 20,
"timeoutSeconds": 10,
"maxConsecutiveFailures": 3
}
],
"labels": {
"MARATHON_SINGLE_INSTANCE_APP": "true",
"HAPROXY_GROUP": "external",
"HAPROXY_0_ENABLED": "true",
"HAPROXY_0_REDIRECT_TO_HTTPS": "true",
"HAPROXY_0_VHOST": "jupyter.redpine.com"
}
}
@redpine42 We completely ignore this labels
section since it's not part of the config.json
schema:
"labels": {
"MARATHON_SINGLE_INSTANCE_APP": "true",
"HAPROXY_GROUP": "external",
"HAPROXY_0_ENABLED": "true",
"HAPROXY_0_REDIRECT_TO_HTTPS": "true",
"HAPROXY_0_VHOST": "jupyter.redpine.com"
}
You need to set networking.ingress.hostname
:
"networking": {
"cni_network_enabled": true,
"cni_network_name": "dcos",
"cni_network_plugin_labels": "",
"ingress": {
"enabled": true,
"hostname": "jupyter.redpine.com"
}
},
And then, assuming your Public Agent(s) (or Elastic Load Balancer that fronts your Public Agent(s)) have a DNS A
(or CNAME
) record for jupyter.redpine.com
pointing to its IP(s), you should be able to access your Notebook at https://jupyter.redpine.com/jupyter
We also ignore this healthChecks
stanza:
"healthChecks": [
{
"portIndex": 0,
"protocol": "MESOS_HTTP",
"path": "/healthz",
"gracePeriodSeconds": 30,
"intervalSeconds": 20,
"timeoutSeconds": 10,
"maxConsecutiveFailures": 3
}
],
I added the health checks and labels based upon the cni example you provided. Other than that the rest is what comes from the gui generated json. I'll try adding the networking section.
The networking section I did have the hostname set on a previous test with no luck.
Removed healthChecks and labels. Added hostname; still marathon-lb shows the backend down even though DC/OS shows jupyter up. Returns 503 when I hit https://jupyter.redpine.com. I did have all this working before. The only difference was I was using Traefik, which I wasn't able to get working in strict mode.
The schema for the options.json
(Catalog Package Deployment UI) is not the same as the rendered Marathon App JSON.
We'll need to see the full (obfuscated) rendered Marathon JSON files for both your Marathon-LB and Mesosphere Jupyter Service to see what may be happening @redpine42
It's a pretty standard marathon-lb. I've hooked up other projects, such as jenkins. Here is the json for it pulled from the ui.
{
"marathon-lb": {
"auto-assign-service-ports": false,
"bind-http-https": true,
"cpus": 2,
"haproxy_global_default_options": "redispatch,http-server-close,dontlognull",
"haproxy-group": "external",
"haproxy-map": true,
"instances": 2,
"mem": 1024,
"minimumHealthCapacity": 0.5,
"maximumOverCapacity": 0.2,
"name": "marathon-lb",
"parameters": [],
"role": "slave_public",
"strict-mode": false,
"sysctl-params": "net.ipv4.tcp_tw_reuse=1 net.ipv4.tcp_fin_timeout=30 net.ipv4.tcp_max_syn_backlog=10240 net.ipv4.tcp_max_tw_buckets=400000 net.ipv4.tcp_max_orphans=60000 net.core.somaxconn=10000",
"container-syslogd": false,
"max-reload-retries": 10,
"reload-interval": 10,
"template-url": "",
"marathon-uri": "https://marathon.mesos:8443",
"secret_name": "marathon-lb/sa"
}
}
Here is Jupyter pulled from the DC/OS UI
{
"service": {
"name": "/jupyter",
"cpus": 18,
"mem": 61440,
"gpu": {
"enabled": true,
"gpus": 2
},
"jupyter_password": "jupyter",
"jupyter_conf_urls": "",
"service_account": "jupyter",
"service_account_secret": "jupyter/sa",
"placement_constraints": "[]",
"user": "nobody",
"cmd": "/usr/local/bin/start.sh ${CONDA_DIR}/bin/jupyter lab --notebook-dir=\"${MESOS_SANDBOX}\"",
"log_level": "INFO"
},
"networking": {
"cni_network_enabled": true,
"cni_network_name": "dcos",
"cni_network_plugin_labels": "",
"ingress": {
"enabled": true,
"hostname": "jupyter.redpine.com"
}
},
"storage": {
"local_persistence": {
"enabled": true,
"volume_size": 100000,
"volume_path": "jupyter_data"
},
"s3": {
"aws_region": "us-east-1",
"endpoint": "jupyterlab-data",
"use_https": true,
"verify_ssl": true
}
},
"oidc": {
"enabled": false,
"discovery_uri": "https://keycloak.contoso.com/auth/realms/notebook/.well-known/openid-configuration",
"client_id": "notebook",
"client_secret": "",
"scope": "openid profile email",
"authorization_params": "",
"authorized_email": "",
"authorized_upn": "",
"redirect_after_logout_uri": "",
"post_logout_redirect_uri": "",
"tls_verify": false,
"redirect_uri": "/oidc-redirect-callback",
"logout_path": "/logmeout",
"token_endpoint_auth_method": "client_secret_basic",
"use_spartan_resolver": true
},
"spark": {
"spark_master_url": "mesos://zk://zk-1.zk:2181,zk-2.zk:2181,zk-3.zk:2181,zk-4.zk:2181,zk-5.zk:2181/mesos",
"spark_conf_cores_max": 5,
"spark_driver_cores": 2,
"spark_conf_executor_cores": 1,
"spark_conf_mesos_gpus_max": 0,
"spark_conf_mesos_executor_gpus": 0,
"spark_driver_memory": "6g",
"spark_conf_executor_memory": "6g",
"spark_conf_eventlog_enabled": true,
"spark_conf_eventlog_dir": "/mnt/mesos/sandbox/",
"start_spark_history_server": true,
"spark_history_fs_logdirectory": "/mnt/mesos/sandbox/",
"spark_conf_jars_packages": "",
"spark_conf_mesos_principal": "",
"spark_conf_mesos_role": "",
"spark_conf_mesos_driver_labels": "",
"spark_conf_mesos_task_labels": "",
"spark_conf_executor_krb5_config": "/mnt/mesos/sandbox/krb5.conf",
"spark_conf_spark_scheduler_min_registered_resources_ratio": 1,
"spark_conf_mesos_containerizer": "mesos",
"spark_conf_hadoop_fs_s3a_aws_credentials_provider": "com.amazonaws.auth.InstanceProfileCredentialsProvider",
"spark_driver_java_options": "-server -XX:+UseG1GC -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/mnt/mesos/sandbox",
"spark_conf_executor_java_options": "-server -XX:+UseG1GC -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/mnt/mesos/sandbox",
"spark_conf_mesos_executor_home": "/opt/spark",
"spark_conf_executor_java_home": "/opt/jdk",
"spark_conf_executor_hadoop_hdfs_home": "/opt/hadoop",
"spark_conf_executor_hadoop_opts": "-Djava.library.path=/opt/hadoop/lib/native -Djava.security.krb5.conf=/mnt/mesos/sandbox/krb5.conf",
"spark_conf_mesos_executor_docker_forcepullimage": false,
"spark_user": "nobody"
},
"advanced": {
"force_pull_jupyter_image": false,
"force_pull_worker_image": false,
"home": "/mnt/mesos/sandbox",
"sandbox": "/mnt/mesos/sandbox",
"hadoop_conf_dir": "/mnt/mesos/sandbox",
"jupyter_config_dir": "/mnt/mesos/sandbox/.jupyter",
"jupyter_runtime_dir": "/mnt/mesos/sandbox/.local/share/jupyter/runtime",
"conda_envs_path": "/mnt/mesos/sandbox/conda/envs:/opt/conda/envs",
"conda_pkgs_dir": "/mnt/mesos/sandbox/conda/pkgs:/opt/conda/pkgs",
"dcos_dir": "/mnt/mesos/sandbox/.dcos",
"java_opts": "-server -XX:+UseG1GC -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/mnt/mesos/sandbox",
"nginx_log_level": "warn",
"spark_monitor_enabled": true,
"start_dask_distributed": false,
"start_ray_head_node": false,
"start_tensorboard": true,
"tensorboard_logdir": "/mnt/mesos/sandbox",
"term": "xterm-256color"
}
}
I'm running DC/OS 1.11.6, with Jupyterlab 1.2.0-DEV.33.7 and Marathon-lb 1.12.3. DC/OS is configured for strict mode, with the necessary secrets and service accounts configured. Marathon-lb is pointing to the container port 8080 and not the vip port 8888. Health checks are failing, and Jupyterlab is unreachable through marathon-lb. All the other Jupyterlab containers are showing healthy in Marathon-lb (same container and vip ports in the marathon.json.mustache file). In the past I fixed this by setting the vip and container port to the same value (8080), which seems to be the convention with Marathon-lb.