alibaba / x-deeplearning

An industrial deep learning framework for high-dimension sparse data

Distributed training: Docker containers fail to start #194

Open UanonymousU opened 5 years ago

UanonymousU commented 5 years ago

2019-03-28 19:50:51,417 INFO xdl.AppMasterBase: Launching worker container [container_1553773796023_0001_01_000005]
2019-03-28 19:50:51,440 INFO xdl.AppMasterBase: container Id is container_1553773796023_0001_01_000005,node Id is slave3:42716
2019-03-28 19:50:53,848 INFO xdl.AppMasterRunner$ApplicationMasterSignalHandler: Application Master is killed by signal:15
2019-03-28 19:50:53,861 INFO impl.AMRMClientImpl: Waiting for application to be successfully unregistered.
2019-03-28 19:50:53,975 ERROR xdl.AppMasterRunner: run error! org.apache.hadoop.yarn.exceptions.ApplicationAttemptNotFoundException: Application attempt appattempt_1553773796023_0001_000001 doesn't exist in ApplicationMasterService cache.
at org.apache.hadoop.yarn.server.resourcemanager.ApplicationMasterService.allocate(ApplicationMasterService.java:404)
at org.apache.hadoop.yarn.api.impl.pb.service.ApplicationMasterProtocolPBServiceImpl.allocate(ApplicationMasterProtocolPBServiceImpl.java:60)
at org.apache.hadoop.yarn.proto.ApplicationMasterProtocol$ApplicationMasterProtocolService$2.callBlockingMethod(ApplicationMasterProtocol.java:99)
at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:523)
at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:991)
at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:872)
at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:818)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1729)
at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2678)

at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at org.apache.hadoop.yarn.ipc.RPCUtil.instantiateException(RPCUtil.java:53)
at org.apache.hadoop.yarn.ipc.RPCUtil.instantiateYarnException(RPCUtil.java:75)
at org.apache.hadoop.yarn.ipc.RPCUtil.unwrapAndThrowException(RPCUtil.java:116)
at org.apache.hadoop.yarn.api.impl.pb.client.ApplicationMasterProtocolPBClientImpl.allocate(ApplicationMasterProtocolPBClientImpl.java:79)
at sun.reflect.GeneratedMethodAccessor2.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:422)
at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeMethod(RetryInvocationHandler.java:165)
at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invoke(RetryInvocationHandler.java:157)
at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeOnce(RetryInvocationHandler.java:95)
at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:359)
at com.sun.proxy.$Proxy13.allocate(Unknown Source)
at org.apache.hadoop.yarn.client.api.impl.AMRMClientImpl.allocate(AMRMClientImpl.java:320)
at com.alibaba.xdl.AppMasterBase.waitForWorkerFinish(AppMasterBase.java:303)
at com.alibaba.xdl.AppMasterBase.run(AppMasterBase.java:182)
at com.alibaba.xdl.AppMasterRunner.main(AppMasterRunner.java:81)

Caused by: org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.yarn.exceptions.ApplicationAttemptNotFoundException): Application attempt appattempt_1553773796023_0001_000001 doesn't exist in ApplicationMasterService cache.
at org.apache.hadoop.yarn.server.resourcemanager.ApplicationMasterService.allocate(ApplicationMasterService.java:404)
at org.apache.hadoop.yarn.api.impl.pb.service.ApplicationMasterProtocolPBServiceImpl.allocate(ApplicationMasterProtocolPBServiceImpl.java:60)
at org.apache.hadoop.yarn.proto.ApplicationMasterProtocol$ApplicationMasterProtocolService$2.callBlockingMethod(ApplicationMasterProtocol.java:99)
at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:523)
at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:991)
at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:872)
at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:818)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1729)
at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2678)

at org.apache.hadoop.ipc.Client.getRpcResponse(Client.java:1511)
at org.apache.hadoop.ipc.Client.call(Client.java:1457)
at org.apache.hadoop.ipc.Client.call(Client.java:1367)
at org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:228)
at org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:116)
at com.sun.proxy.$Proxy12.allocate(Unknown Source)
at org.apache.hadoop.yarn.api.impl.pb.client.ApplicationMasterProtocolPBClientImpl.allocate(ApplicationMasterProtocolPBClientImpl.java:77)
... 13 more

Exception in thread "SIGTERM handler" java.lang.ExceptionInInitializerError
at java.nio.file.FileSystems.getDefault(FileSystems.java:176)
at java.nio.file.Paths.get(Paths.java:84)
at com.alibaba.xdl.AppMasterBase.removeZkRootPath(AppMasterBase.java:1000)
at com.alibaba.xdl.AppMasterBase.close(AppMasterBase.java:1018)
at com.alibaba.xdl.AppMasterBase.dealWithExit(AppMasterBase.java:978)
at com.alibaba.xdl.AppMasterRunner$ApplicationMasterSignalHandler.handle(AppMasterRunner.java:121)
at sun.misc.Signal$1.run(Signal.java:212)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.security.PrivilegedActionException: sun.nio.fs.UnixException: No such file or directory
at java.security.AccessController.doPrivileged(Native Method)
at java.nio.file.FileSystems$DefaultFileSystemHolder.defaultFileSystem(FileSystems.java:96)
at java.nio.file.FileSystems$DefaultFileSystemHolder.<clinit>(FileSystems.java:90)
... 8 more
Caused by: sun.nio.fs.UnixException: No such file or directory
at sun.nio.fs.UnixNativeDispatcher.getcwd(Native Method)
at sun.nio.fs.UnixFileSystem.<init>(UnixFileSystem.java:67)
at sun.nio.fs.LinuxFileSystem.<init>(LinuxFileSystem.java:39)
at sun.nio.fs.LinuxFileSystemProvider.newFileSystem(LinuxFileSystemProvider.java:46)
at sun.nio.fs.LinuxFileSystemProvider.newFileSystem(LinuxFileSystemProvider.java:39)
at sun.nio.fs.UnixFileSystemProvider.<init>(UnixFileSystemProvider.java:56)
at sun.nio.fs.LinuxFileSystemProvider.<init>(LinuxFileSystemProvider.java:41)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at java.lang.Class.newInstance(Class.java:442)
at sun.nio.fs.DefaultFileSystemProvider.createProvider(DefaultFileSystemProvider.java:48)
at sun.nio.fs.DefaultFileSystemProvider.create(DefaultFileSystemProvider.java:63)
at java.nio.file.FileSystems$DefaultFileSystemHolder.getDefaultProvider(FileSystems.java:108)
at java.nio.file.FileSystems$DefaultFileSystemHolder.access$000(FileSystems.java:89)
at java.nio.file.FileSystems$DefaultFileSystemHolder$1.run(FileSystems.java:98)
at java.nio.file.FileSystems$DefaultFileSystemHolder$1.run(FileSystems.java:96)
... 11 more
Exception in thread "main" java.lang.NoClassDefFoundError: Could not initialize class java.nio.file.FileSystems$DefaultFileSystemHolder
at java.nio.file.FileSystems.getDefault(FileSystems.java:176)
at java.nio.file.Paths.get(Paths.java:84)
at com.alibaba.xdl.AppMasterBase.removeZkRootPath(AppMasterBase.java:1000)
at com.alibaba.xdl.AppMasterBase.close(AppMasterBase.java:1018)
at com.alibaba.xdl.AppMasterBase.dealWithExit(AppMasterBase.java:978)
at com.alibaba.xdl.AppMasterRunner.main(AppMasterRunner.java:85)

UanonymousU commented 5 years ago

@songyue1104 @lovickie @largestone1982 Could you please take a look? Thanks a lot!

songyue1104 commented 5 years ago

Please describe your machine environment and the steps to reproduce.

UanonymousU commented 5 years ago

I set up a Hadoop cluster (including ZooKeeper), installed Docker on every machine in the cluster, and loaded the provided Docker image. Inside the Docker image I edited /etc/hosts to map the cluster hostnames to their IPs. Then I ran xdl_submit.py --config config.tree_init.json on the Hadoop cluster's namenode, and found that the Docker containers on the datanodes stayed in the Created state. One more thing: the docker command can be launched manually, so I suspect the problem is in how YARN schedules Docker. @songyue1104
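For context, a rough shell sketch of the setup described above. The hostnames and IP addresses are placeholders rather than values from this issue, and the submit invocation should be checked against the XDL docs:

# inside the XDL docker image on every node: map cluster hostnames to IPs (illustrative values only)
cat >> /etc/hosts <<'EOF'
192.168.1.10 master
192.168.1.11 slave1
192.168.1.12 slave2
192.168.1.13 slave3
EOF

# on the namenode: submit the tree_init job to YARN
python xdl_submit.py --config config.tree_init.json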

yiling-dc commented 5 years ago

Is this still an issue?

mengyiliu22 commented 5 years ago

How did you solve this problem? I'm running into the same issue. @UanonymousU

UanonymousU commented 5 years ago

@mengyiliu22 I fell back to launching everything manually. If you manage to get YARN to schedule it successfully, please share how.

UanonymousU commented 5 years ago

@yiling-dc For now I'm launching manually, but I'd still like YARN to be able to schedule it.

UanonymousU commented 5 years ago

Currently, running tree_init produces the following output:

hadoop bin /home/hadoop-3.1.2/bin/hadoop
CMD: /home/hadoop-3.1.2/bin/hadoop jar /usr/bin/xdl-yarn-scheduler-1.0.0-SNAPSHOT-jar-with-dependencies.jar com.alibaba.xdl.Client -c=config.tree_init.json -f=/usr/bin/xdl-yarn-scheduler-1.0.0-SNAPSHOT-jar-with-dependencies.jar -uuid=9d67056d-0f92-4ead-8322-77ec4eb8819b
2019-04-25 06:10:12,245 INFO xdl.Client: Yarn client start success.
2019-04-25 06:10:12,341 INFO xdl.Client: Create application with id:[application_1556090006599_0008] success.
2019-04-25 06:10:13,001 INFO xdl.Utils: Path:[hdfs://ns1/user/root/.xdl/application_1556090006599_0008] not exists, create success.
2019-04-25 06:10:13,001 INFO xdl.Client: Application base path:[hdfs://ns1/user/root/.xdl/application_1556090006599_0008/].
2019-04-25 06:10:13,180 INFO xdl.Client: Upload file config.tree_init.json to hdfs:/ns1/user/root/.xdl/application_1556090006599_0008/config.tree_init.json success.
2019-04-25 06:10:13,180 INFO xdl.Client: begin to upload files to hdfs
2019-04-25 06:10:13,224 INFO xdl.Client: Upload file /usr/bin/xdl-yarn-scheduler-1.0.0-SNAPSHOT-jar-with-dependencies.jar to hdfs:/ns1/user/root/.xdl/application_1556090006599_0008/xdl-yarn-scheduler-1.0.0-SNAPSHOT-jar-with-dependencies.jar success.
2019-04-25 06:12:55,130 INFO xdl.Utils: Run cmd [tar -czf /tmp/xdl_local/9d67056d-0f92-4ead-8322-77ec4eb8819b/tdm_mock.tar.gz -C /work ./tdm_mock] success.
2019-04-25 06:12:57,896 INFO xdl.Client: Upload file /work/tdm_mock to hdfs://ns1/user/root/.xdl/application_1556090006599_0008/tdm_mock.tar.gz success.
2019-04-25 06:12:57,896 INFO xdl.Client: finish uploading files to hdfs
2019-04-25 06:12:57,896 INFO xdl.Client: Upload user files success.
2019-04-25 06:12:57,897 INFO xdl.Client: ApplicationMaster start command is: [$JAVA_HOME/bin/java -Xmx256M  com.alibaba.xdl.AppMasterRunner -c=config.tree_init.json -v=tdm_mock.tar.gz -u=root -p=hdfs://ns1/user/root/.xdl/application_1556090006599_0008/ 1><LOG_DIR>/stdout 2><LOG_DIR>/stderr]
2019-04-25 06:12:57,978 INFO xdl.Client: local resources: {xdl-yarn-scheduler-1.0.0-SNAPSHOT-jar-with-dependencies.jar=resource { scheme: "hdfs" host: "ns1" port: -1 file: "/user/root/.xdl/application_1556090006599_0008/xdl-yarn-scheduler-1.0.0-SNAPSHOT-jar-with-dependencies.jar" } size: 4784145 timestamp: 1556172613221 type: FILE visibility: PUBLIC, config.tree_init.json=resource { scheme: "hdfs" host: "ns1" port: -1 file: "/user/root/.xdl/application_1556090006599_0008/config.tree_init.json" } size: 648 timestamp: 1556172613175 type: FILE visibility: PUBLIC}
2019-04-25 06:12:57,985 INFO xdl.Client: Master add CLASSPATH:/export/hadoop/hadoop-3.1.2/etc/hadoop:/export/hadoop/hadoop-3.1.2/share/hadoop/common/lib/*:/export/hadoop/hadoop-3.1.2/share/hadoop/common/*:/export/hadoop/hadoop-3.1.2/share/hadoop/hdfs:/export/hadoop/hadoop-3.1.2/share/hadoop/hdfs/lib/*:/export/hadoop/hadoop-3.1.2/share/hadoop/hdfs/*:/export/hadoop/hadoop-3.1.2/share/hadoop/mapreduce/lib/*:/export/hadoop/hadoop-3.1.2/share/hadoop/mapreduce/*:/export/hadoop/hadoop-3.1.2/share/hadoop/yarn:/export/hadoop/hadoop-3.1.2/share/hadoop/yarn/lib/*:/export/hadoop/hadoop-3.1.2/share/hadoop/yarn/*
2019-04-25 06:12:57,986 INFO xdl.Client: Setup ApplicationMaster container success.
2019-04-25 06:12:58,006 INFO conf.Configuration: found resource resource-types.xml at file:/home/hadoop-3.1.2/etc/hadoop/resource-types.xml
2019-04-25 06:12:58,016 INFO resource.ResourceUtils: Adding resource type - name = yarn.io/gpu, units = , type = COUNTABLE
2019-04-25 06:12:58,021 INFO xdl.Client: Setup application context success.
2019-04-25 06:12:58,021 INFO xdl.Client: Submitting application application_1556090006599_0008
2019-04-25 06:12:58,070 INFO impl.YarnClientImpl: Submitted application application_1556090006599_0008
2019-04-25 06:12:58,072 INFO xdl.Client: AppMaster host N/A Start waiting application: application_1556090006599_0008 ends.
2019-04-25 06:13:07,686 INFO xdl.Client: Application application_1556090006599_0008 finish with state FINISHED
2019-04-25 06:13:07,687 INFO xdl.Utils: ================================FINAL STATUS==================================
2019-04-25 06:13:07,687 INFO xdl.Utils:   application_1556090006599_0008 : KILLED 
2019-04-25 06:13:07,687 INFO xdl.Utils: ================================FINAL STATUS==================================
2019-04-25 06:13:07,695 INFO xdl.Utils: Delete the hdfs dir:hdfs://ns1/user/root/.xdl/application_1556090006599_0008/ success.

It still doesn't run correctly.
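The client output above only reports the final KILLED state; the actual reason has to come from the ApplicationMaster/container logs. A minimal way to pull them, assuming YARN log aggregation is enabled on this cluster:

# fetch all aggregated container logs (AM stdout/stderr included) for the failed application
yarn logs -applicationId application_1556090006599_0008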

mengyiliu22 commented 5 years ago

(quoting UanonymousU's tree_init output above, ending with "It still doesn't run correctly.")

This output is the same as what I get. My appmaster log shows the following:

2019-04-25 14:59:23,330 INFO xdl.AppMasterBase: container start command is $JAVA_HOME/bin/java -Xmx256M com.alibaba.xdl.ContainerRunner -c=config.tree_init.json -j=scheduler -i=0 -z=master:2181,worker1:2181 -r=/xdl -u=root -v=tdm_mock.tar.gz -cpuset=_CPULIST -cd=GPU_LIST_PLACEHOLDER 1><LOG_DIR>/stdout 2><LOG_DIR>/stderr
2019-04-25 14:59:23,331 INFO xdl.AppMasterBase: Launching scheduler container [container_1556099741086_0011_01_000002]
2019-04-25 14:59:23,407 INFO xdl.AppMasterBase: container Id is container_1556099741086_0011_01_000002,node Id is worker1:43869
2019-04-25 14:59:23,408 INFO xdl.AppMasterBase: container start command is $JAVA_HOME/bin/java -Xmx256M com.alibaba.xdl.ContainerRunner -c=config.tree_init.json -j=ps -i=0 -z=master:2181,worker1:2181 -r=/xdl -u=root -v=tdm_mock.tar.gz -cpuset=_CPULIST -cd=GPU_LIST_PLACEHOLDER 1><LOG_DIR>/stdout 2><LOG_DIR>/stderr
2019-04-25 14:59:23,408 INFO xdl.AppMasterBase: Launching ps container [container_1556099741086_0011_01_000003]
2019-04-25 14:59:23,419 INFO xdl.AppMasterBase: container Id is container_1556099741086_0011_01_000003,node Id is worker1:43869
2019-04-25 14:59:23,419 INFO xdl.AppMasterBase: container start command is $JAVA_HOME/bin/java -Xmx256M com.alibaba.xdl.ContainerRunner -c=config.tree_init.json -j=ps -i=1 -z=master:2181,worker1:2181 -r=/xdl -u=root -v=tdm_mock.tar.gz -cpuset=_CPULIST -cd=GPU_LIST_PLACEHOLDER 1><LOG_DIR>/stdout 2><LOG_DIR>/stderr
2019-04-25 14:59:23,419 INFO xdl.AppMasterBase: Launching ps container [container_1556099741086_0011_01_000004]
2019-04-25 14:59:23,429 INFO xdl.AppMasterBase: container Id is container_1556099741086_0011_01_000004,node Id is worker1:43869
2019-04-25 14:59:23,430 INFO xdl.AppMasterBase: container start command is $JAVA_HOME/bin/java -Xmx256M com.alibaba.xdl.ContainerRunner -c=config.tree_init.json -j=worker -i=0 -z=master:2181,worker1:2181 -r=/xdl -u=root -v=tdm_mock.tar.gz -cpuset=_CPULIST -cd=GPU_LIST_PLACEHOLDER 1><LOG_DIR>/stdout 2><LOG_DIR>/stderr
2019-04-25 14:59:23,430 INFO xdl.AppMasterBase: Launching worker container [container_1556099741086_0011_01_000005]
2019-04-25 14:59:23,439 INFO xdl.AppMasterBase: container Id is container_1556099741086_0011_01_000005,node Id is worker1:43869
2019-04-25 14:59:23,439 INFO xdl.AppMasterBase: container start command is $JAVA_HOME/bin/java -Xmx256M com.alibaba.xdl.ContainerRunner -c=config.tree_init.json -j=worker -i=1 -z=master:2181,worker1:2181 -r=/xdl -u=root -v=tdm_mock.tar.gz -cpuset=_CPULIST -cd=GPU_LIST_PLACEHOLDER 1><LOG_DIR>/stdout 2><LOG_DIR>/stderr
2019-04-25 14:59:23,439 INFO xdl.AppMasterBase: Launching worker container [container_1556099741086_0011_01_000006]
2019-04-25 14:59:23,448 INFO xdl.AppMasterBase: container Id is container_1556099741086_0011_01_000006,node Id is worker1:43869
2019-04-25 14:59:23,939 INFO xdl.AppMasterRunner$ApplicationMasterSignalHandler: Application Master is killed by signal:15
2019-04-25 14:59:23,945 INFO impl.AMRMClientImpl: Waiting for application to be successfully unregistered.
2019-04-25 14:59:24,004 ERROR xdl.AppMasterRunner: run error! org.apache.hadoop.yarn.exceptions.ApplicationAttemptNotFoundException: Application attempt appattempt_1556099741086_0011_000001 doesn't exist in ApplicationMasterService cache.
at org.apache.hadoop.yarn.server.resourcemanager.ApplicationMasterService.allocate(ApplicationMasterService.java:404)
at org.apache.hadoop.yarn.api.impl.pb.service.ApplicationMasterProtocolPBServiceImpl.allocate(ApplicationMasterProtocolPBServiceImpl.java:60)
at org.apache.hadoop.yarn.proto.ApplicationMasterProtocol$ApplicationMasterProtocolService$2.callBlockingMethod(ApplicationMasterProtocol.java:99)
at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:523)
at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:991)
at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:872)
at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:818)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1729)
at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2678)

at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:422)
at org.apache.hadoop.yarn.ipc.RPCUtil.instantiateException(RPCUtil.java:53)
at org.apache.hadoop.yarn.ipc.RPCUtil.instantiateYarnException(RPCUtil.java:75)
at org.apache.hadoop.yarn.ipc.RPCUtil.unwrapAndThrowException(RPCUtil.java:116)
at org.apache.hadoop.yarn.api.impl.pb.client.ApplicationMasterProtocolPBClientImpl.allocate(ApplicationMasterProtocolPBClientImpl.java:79)

UanonymousU commented 5 years ago

@mengyiliu22 My appmaster output also shows: INFO xdl.AppMasterRunner$ApplicationMasterSignalHandler: Application Master is killed by signal:15 @songyue1104 @yiling-dc
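Since the AM is being terminated with SIGTERM, the ResourceManager decided to kill the attempt, and its diagnostics usually say why (for example the AM container exceeding its memory limit, the node being lost, or the app being killed externally). A rough way to check; the ResourceManager log path is an assumption and depends on your deployment:

# application-level diagnostics reported by the ResourceManager
yarn application -status application_1556099741086_0011

# search the ResourceManager log for the kill reason (path assumed; adjust to your install)
grep -i "application_1556099741086_0011" $HADOOP_HOME/logs/*resourcemanager*.log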

mengyiliu22 commented 5 years ago

@UanonymousU When launching manually, I keep getting this message; what does it mean? I0425 09:01:59.789927 13061 scheduler_impl.cc:356] Waiting for 1 more servers

UanonymousU commented 5 years ago

The configured number of ps servers is larger than the number of ps servers that have actually connected. @mengyiliu22

mengyiliu22 commented 5 years ago

The configured number of ps servers is larger than the number of ps servers that have actually connected. @mengyiliu22

python tree_init.py --task_name=scheduler --zk_addr=zfs://master:2181,worker1:2181 --ps_num=1 --ps_cpu_cores=2 --ps_memory_m=1000 --ckpt_dir=hdfs:/

I only configured one.
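"Waiting for 1 more servers" means the scheduler is still waiting for the configured ps servers to register on the same ZooKeeper address, so even with --ps_num=1 the ps process (and at least one worker) still has to be started separately against the same --zk_addr. A sketch of what that could look like; the --task_index and --task_num flag names are my assumption based on the XDL distributed-run examples, so verify them against the docs:

# one ps process, matching --ps_num=1 on the scheduler (flag names assumed)
python tree_init.py --task_name=ps --task_index=0 --zk_addr=zfs://master:2181,worker1:2181

# at least one worker (flag names assumed)
python tree_init.py --task_name=worker --task_index=0 --task_num=1 --zk_addr=zfs://master:2181,worker1:2181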