Closed thpiotrowski closed 7 years ago
Hi,
can you share your zoe.conf
file and the JSON you are using to start the Jupyter notebook?
Thanks!
Hi, thanks for your help. Here are the files you asked for:
Zoe.conf :
debug = true
deployment-name = compose
dbname = zzzzzzzzzzz
dbuser = yyyyyyyyy
dbpass = xxxxxxxx
dbhost = xx.yy.zz.ww
config = /etc/zoe/zoe.conf
workspace-deployment-path = /mnt/zoe-workspaces
dbport = 5432
api-listen-uri = tcp://*:4850
influxdb-dbname = zoe
influxdb-url = http://localhost:8086
workspace-base-path = /xxx/yyyyy
overlay-network-name = my-net
listen-address = 0.0.0.0
listen-port = 5001
master-url = tcp://xxxxxxxxxxxx :pppp
auth-type = text
auth-file = zoepass.csv
scheduler-class = ZoeSimpleScheduler
scheduler-policy = FIFO
backend = Swarm
cookie-secret = changeme
log-file = stderr
backend-swarm-zk-path = /
backend-swarm-url = zk://xx.yy.zz.ww :ppp
Zapp json:
{
"name": "aml-lab",
"size": 512,
"services": [
{
"image": " xxxxxxxxxxxx:ppppp/zoerepo/spark-master:latest",
"environment": [
[
"SPARK_MASTER_IP",
"{dns_name#self}"
],
[
"HADOOP_USER_NAME",
"{user_name}"
],
[
"PYTHONHASHSEED",
"42"
]
],
"volumes": [],
"essential_count": 1,
"monitor": false,
"name": "spark-master",
"ports": [
{
"name": "Spark master web interface",
"url_template": "http://{ip_port}/",
"port_number": 8080,
"protocol": "tcp"
}
],
"resources": {
"memory": {
"min": 536870912,
"max": 536870912
},
"cores": {
"min": null,
"max": null
}
},
"startup_order": 0,
"total_count": 1,
"replicas": 1,
"command": null
},
{
"image": " xxxxxxxxxxxx:ppppp/zoerepo/spark-worker:latest",
"environment": [
[
"SPARK_WORKER_CORES",
"6"
],
[
"SPARK_WORKER_RAM",
"11273240064"
],
[
"SPARK_MASTER_IP",
"{dns_name#spark-master0}"
],
[
"SPARK_LOCAL_IP",
"{dns_name#self}"
],
[
"PYTHONHASHSEED",
"42"
],
[
"HADOOP_USER_NAME",
"{user_name}"
]
],
"essential_count": 1,
"monitor": false,
"name": "spark-worker",
"ports": [
{
"name": "Spark worker web interface",
"url_template": "http://{ip_port}/",
"port_number": 8081,
"protocol": "tcp"
}
],
"resources": {
"memory": {
"min": 12884901888,
"max": 12884901888
},
"cores": {
"min": null,
"max": null
}
},
"volumes": [],
"startup_order": 1,
"total_count": 2,
"replicas": 1,
"command": null
},
{
"image": "xxxxxxxxxxxx:ppppp/zoerepo/spark-jupyter-notebook:latest",
"environment": [
[
"SPARK_MASTER",
"spark://{dns_name#spark-master0}:7077"
],
[
"SPARK_EXECUTOR_RAM",
"11273240064"
],
[
"SPARK_DRIVER_RAM",
"2147483648"
],
[
"HADOOP_USER_NAME",
"{user_name}"
],
[
"NB_USER",
"{user_name}"
],
[
"PYTHONHASHSEED",
"42"
],
[
"NAMENODE_HOST",
"hdfs-namenode.zoe"
]
],
"volumes": [],
"essential_count": 1,
"monitor": true,
"name": "spark-jupyter",
"ports": [
{
"name": "Spark application web interface",
"url_template": "http://{ip_port}/",
"port_number": 4040,
"protocol": "tcp"
},
{
"name": "Jupyter Notebook interface",
"url_template": "http://{ip_port}/",
"port_number": 8888,
"protocol": "tcp"
}
],
"resources": {
"memory": {
"min": 4294967296,
"max": 4294967296
},
"cores": {
"min": null,
"max": null
}
},
"startup_order": 0,
"total_count": 1,
"replicas": 1,
"command": null
}
],
"version": 3,
"will_end": false
}
Best regards, Thomas
When the wrong URL is showing up, can you run the following commands?
This will give you the list of running containers:
docker -H <swarm_manager_ip>:<swarm_manager_port> ps | grep jupyter
There should be one that is called spark-jupyter-<execution id>-compose
(the container name is the last column), take its ID (first column) and run:
docker -H <swarm_manager_ip>:<swarm_manager_port> inspect <container ID>
and send me the output.
Zoe takes the ip-port mapping from Swarm as-is, so either Swarm is giving us garbage or there is an intermittent problem with parsing Swarm API output.
Thanks!
We never had the right URL showing up. Here is the output of the inspect: docker -H :4000 inspect 810b4fd5acdc
[
{
"Id": "810b4fd5acdc7f7ffdc85d4b76ff0d3affdd4a19b9886c550605557b5b7c6338",
"Created": "2017-08-10T12:47:29.170149101Z",
"Path": "tini",
"Args": [
"--",
"start-notebook.sh"
],
"State": {
"Status": "running",
"Running": true,
"Paused": false,
"Restarting": false,
"OOMKilled": false,
"Dead": false,
"Pid": 5993,
"ExitCode": 0,
"Error": "",
"StartedAt": "2017-08-10T12:47:30.981240965Z",
"FinishedAt": "0001-01-01T00:00:00Z"
},
"Image": "sha256:78d3a7af4bb3a86e1d70ed58aaad6e2f6d8f5c2496f90ca4247811ecfe21e1e8",
"ResolvConfPath": "/var/lib/docker/containers/810b4fd5acdc7f7ffdc85d4b76ff0d3affdd4a19b9886c550605557b5b7c6338/resolv.conf",
"HostnamePath": "/var/lib/docker/containers/810b4fd5acdc7f7ffdc85d4b76ff0d3affdd4a19b9886c550605557b5b7c6338/hostname",
"HostsPath": "/var/lib/docker/containers/810b4fd5acdc7f7ffdc85d4b76ff0d3affdd4a19b9886c550605557b5b7c6338/hosts",
"LogPath": "/var/lib/docker/containers/810b4fd5acdc7f7ffdc85d4b76ff0d3affdd4a19b9886c550605557b5b7c6338/810b4fd5acdc7f7ffdc85d4b76ff0d3affdd4a19b9886c550605557b5b7c6338-json.log",
"Node": {
"ID": "SGA7:TRTO:OZPZ:IH5Y:K5F7:BLCK:VQLJ:OROX:ZH6R:UKJM:UE6R:BP2A|ww.yy.ww.zz:2375",
"IP": "ww.yy.ww.zz",
"Addr": "ww.yy.ww.zz:2375",
"Name": "hostname",
"Cpus": 1,
"Memory": 3975208960,
"Labels": {
"kernelversion": "3.10.0-514.2.2.el7.x86_64",
"node.env": "tst",
"operatingsystem": "CentOS Linux 7 (Core)",
"ostype": "linux",
"storagedriver": "devicemapper"
}
},
"Name": "/spark-jupyter0-5-compose",
"RestartCount": 0,
"Driver": "devicemapper",
"MountLabel": "",
"ProcessLabel": "",
"AppArmorProfile": "",
"ExecIDs": null,
"HostConfig": {
"Binds": [
"/mnt/zoe-workspaces/admin:/mnt/workspace:rw"
],
"ContainerIDFile": "",
"LogConfig": {
"Type": "json-file",
"Config": {
"max-file": "5",
"max-size": "10m"
}
},
"NetworkMode": "my-net",
"PortBindings": {
"22/tcp": [
{
"HostIp": "",
"HostPort": ""
}
],
"4040/tcp": [
{
"HostIp": "",
"HostPort": ""
}
],
"6006/tcp": [
{
"HostIp": "",
"HostPort": ""
}
],
"8888/tcp": [
{
"HostIp": "",
"HostPort": ""
}
]
},
"RestartPolicy": {
"Name": "",
"MaximumRetryCount": 0
},
"AutoRemove": false,
"VolumeDriver": "",
"VolumesFrom": null,
"CapAdd": null,
"CapDrop": null,
"Dns": null,
"DnsOptions": null,
"DnsSearch": null,
"ExtraHosts": null,
"GroupAdd": null,
"IpcMode": "",
"Cgroup": "",
"Links": null,
"OomScoreAdj": 0,
"PidMode": "",
"Privileged": false,
"PublishAllPorts": false,
"ReadonlyRootfs": false,
"SecurityOpt": null,
"UTSMode": "",
"UsernsMode": "",
"ShmSize": 67108864,
"ConsoleSize": [
0,
0
],
"Isolation": "",
"CpuShares": 0,
"Memory": 1294967296,
"NanoCpus": 0,
"CgroupParent": "",
"BlkioWeight": 0,
"BlkioWeightDevice": null,
"BlkioDeviceReadBps": null,
"BlkioDeviceWriteBps": null,
"BlkioDeviceReadIOps": null,
"BlkioDeviceWriteIOps": null,
"CpuPeriod": 0,
"CpuQuota": 0,
"CpuRealtimePeriod": 0,
"CpuRealtimeRuntime": 0,
"CpusetCpus": "",
"CpusetMems": "",
"Devices": null,
"DeviceCgroupRules": null,
"DiskQuota": 0,
"KernelMemory": 0,
"MemoryReservation": 0,
"MemorySwap": 2589934592,
"MemorySwappiness": -1,
"OomKillDisable": false,
"PidsLimit": 0,
"Ulimits": null,
"CpuCount": 0,
"CpuPercent": 0,
"IOMaximumIOps": 0,
"IOMaximumBandwidth": 0
},
"GraphDriver": {
"Data": {
"DeviceId": "171",
"DeviceName": "docker-253:20-131075-8fcceb92a406596263e3cbe6d441046ee7887a56334957c9bec0bfb77c49ee19",
"DeviceSize": "10737418240"
},
"Name": "devicemapper"
},
"Mounts": [
{
"Source": "/mnt/zoe-workspaces/admin",
"Destination": "/mnt/workspace",
"Mode": "rw",
"RW": true,
"Propagation": "rprivate"
}
],
"Config": {
"Hostname": "spark-jupyter0-5-compose",
"Domainname": "",
"User": "",
"AttachStdin": false,
"AttachStdout": false,
"AttachStderr": false,
"ExposedPorts": {
"22/tcp": {},
"4040/tcp": {},
"6006/tcp": {},
"8888/tcp": {}
},
"Tty": false,
"OpenStdin": false,
"StdinOnce": false,
"Env": [
"EXECUTION_ID=5",
"UID=admin",
"SERVICE_NAME=spark-jupyter0",
"PYTHONHASHSEED=42",
"HADOOP_USER_NAME=admin",
"NAMENODE_HOST=hdfs-namenode.zoe",
"NB_USER=admin",
"SPARK_DRIVER_RAM=2147483648",
"DEPLOY_NAME=compose",
"PROXY_PATH=127.0.0.1",
"SPARK_EXECUTOR_RAM=11273240064",
"ZOE_WORKSPACE=/mnt/workspace",
"PATH=/opt/hadoop/bin:/opt/conda/bin:/opt/spark/bin:/opt/spark/sbin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
"SPARK_VERSION=1.6.3",
"HADOOP_VERSION=2.6.1",
"JAVA_HOME=/usr/lib/jvm/java-1.8.0",
"CONDA_DIR=/opt/conda",
"CONDA_2_DIR=/opt/conda2",
"HADOOP_HOME=/opt/hadoop",
"SPARK_HOME=/opt/spark",
"SHELL=/bin/bash",
"LANG=en_US.UTF-8",
"LANGUAGE=en_US.UTF-8",
"HADOOP_USER_CLASSPATH_FIRST=true",
"SPARK_HIVE=true",
"HADOOP_CONF_DIR=/opt/hadoop/etc/hadoop",
"PYTHONPATH=/opt/spark/python:/opt/spark/python/lib/py4j-0.9-src.zip"
],
"Cmd": [
"start-notebook.sh"
],
"Image": "qvipbdokrg01:5000/zoerepo/spark-jupyter-notebook:latest",
"Volumes": {
"/mnt/workspace": {}
},
"WorkingDir": "/root/work",
"Entrypoint": [
"tini",
"--"
],
"OnBuild": null,
"Labels": {
"build-date": "20161214",
"com.docker.swarm.id": "64da2d3c28a4c3c068dc6a3dcb78eea02a84c211219d9137469924fc625dbf08",
"hadoop_version": "hadoop2.6",
"jupyter_version": "4.3*",
"license": "GPLv2",
"maintainer": "Pierre-Arnaud Bousigues \u003cpabousigues@airfrance.fr\u003e",
"name": "CentOS Base Image",
"python_version": "3.5.*",
"spark_version": "1.6.3",
"tensorflow_version": "1.2.1",
"vendor": "CentOS",
"zoe_deployment_name": "compose",
"zoe_execution_id": "5",
"zoe_execution_name": "test",
"zoe_owner": "admin",
"zoe_service_id": "5",
"zoe_service_name": "spark-jupyter0",
"zoe_type": "app_service"
}
},
"NetworkSettings": {
"Bridge": "",
"SandboxID": "1026c55cef4141f0ce8db007a9c2e6c6817d275917ce29e1d50bf9bb6c0a35ec",
"HairpinMode": false,
"LinkLocalIPv6Address": "",
"LinkLocalIPv6PrefixLen": 0,
"Ports": {
"22/tcp": [
{
"HostIp": "ww.yy.ww.zz",
"HostPort": "32787"
}
],
"4040/tcp": [
{
"HostIp": "ww.yy.ww.zz",
"HostPort": "32786"
}
],
"6006/tcp": [
{
"HostIp": "ww.yy.ww.zz",
"HostPort": "32785"
}
],
"8888/tcp": [
{
"HostIp": "ww.yy.ww.zz",
"HostPort": "32784"
}
]
},
"SandboxKey": "/var/run/docker/netns/1026c55cef41",
"SecondaryIPAddresses": null,
"SecondaryIPv6Addresses": null,
"EndpointID": "",
"Gateway": "",
"GlobalIPv6Address": "",
"GlobalIPv6PrefixLen": 0,
"IPAddress": "",
"IPPrefixLen": 0,
"IPv6Gateway": "",
"MacAddress": "",
"Networks": {
"my-net": {
"IPAMConfig": null,
"Links": null,
"Aliases": null,
"NetworkID": "d22c39422de8d5c4c1129d0da8442ecf244b9e105167db30d6b34a8dfe8ab462",
"EndpointID": "6e662f835c02c61e0a9990f13fc21ffcd5258d675377c31e51d62bd8396e98d2",
"Gateway": "",
"IPAddress": "dd.ee.ff.gg",
"IPPrefixLen": 24,
"IPv6Gateway": "",
"GlobalIPv6Address": "",
"GlobalIPv6PrefixLen": 0,
"MacAddress": "02:42:0a:00:00:02",
"DriverOpts": null
}
}
}
}
]
I didn't notice you are using the web interface. If you use the command line client, does the right URL show up? The command is:
./zoe.py exec-get 5
I'm checking the web interface to see if there is a bug there.
Yes, the web interface was not updated to reflect the changes we made to execution objects. I'm working on a fix.
On the command line it should work, but please let me know.
Thanks!
Can you test the version in branch devel/webui-fixes ?
I fixed the execution details page and improved the looks of the login page.
There is one known issue: the login page sets a cookie, but there is no logout link anywhere to delete the cookie.
Well, the command ./zoe.py exec-get 5 doesn't return anything; it is stuck. I didn't notice anything in the log files api.err and master.err. I am going to test your fix and let you know.
The fix is working fine, thanks a lot! I am leaving on holiday today; when I get back, I'll check why zoe.py is not answering.
Ok, I will merge these changes. Thanks for reporting the issue and taking the time to test the changes.
Hello, We are currently running ZOE 0.9.7 and would like to update to the latest version, but we have an issue with the URL on the "detailed information for execution xxxx" page (http://xxxxxxxxx:5001/executions/inspect/5). For example, the Jupyter notebook link is http://spark-jupyter0-5-compose.127.0.0.1/ but it should be http://xx.yy.ww.zz:ppppp/ Have we done something wrong? Can you help, please? Regards, Thomas