Open zhangshiyu01 opened 8 years ago
Should I add the hosts file? I see:

    if args.cluster == 'local' or args.host_file is None or args.host_file == 'None':
        from dmlc_tracker import local
        local.submit(args)
    if args.cluster == 'sge':
        from dmlc_tracker import sge
        sge.submit(args)
    elif args.cluster == 'yarn':
        from dmlc_tracker import yarn
        print '------- go yarn ---'
        yarn.submit(args)
    elif args.cluster == 'ssh':
        from dmlc_tracker import ssh
        ssh.submit(args)
    elif args.cluster == 'mpi':
        from dmlc_tracker import mpi
        mpi.submit(args)
    else:
        raise RuntimeError('Unknown submission cluster type %s' % args.cluster)
Does it mean I need to add the hosts file when I use sge, yarn, ssh, or mpi?
This issue is closed due to lack of activity in the last 90 days. Feel free to ping me to reopen if this is still an active issue. Thanks! Also, do please check out our forum (and Chinese version) for general "how-to" questions.
I am hitting the same error. How can I fix it?
@zhangshiyu01 were you able to resolve the issue?
@zhangshiyu01 @everwind I know it's kind of late, but did you resolve the issues?
This error still happens. Have you managed to resolve it? @lanking520 @everwind @zhangshiyu01
../../tools/launch.py -n 2 --launcher yarn python train_mnist.py --network lenet --kv-store dist_sync
Traceback (most recent call last): File "/data0/ads/chenglei/mxnet/dmlc-core/tracker/dmlc_tracker/launcher.py", line 81, in <module>
main()
File "/data0/ads/chenglei/mxnet/dmlc-core/tracker/dmlc_tracker/launcher.py", line 30, in main
assert cluster is not None, 'need to have DMLC_JOB_CLUSTER'
AssertionError: need to have DMLC_JOB_CLUSTER
Exception in thread Thread-1:
Traceback (most recent call last):
File "/usr/lib/python2.7/threading.py", line 801, in __bootstrap_inner
self.run()
File "/usr/lib/python2.7/threading.py", line 754, in run
self.__target(*self.__args, **self.__kwargs)
File "/data0/ads/chenglei/mxnet/tools/../dmlc-core/tracker/dmlc_tracker/tracker.py", line 365, in <lambda>
target=(lambda: subprocess.check_call(self.cmd, env=env, shell=True)), args=())
File "/usr/lib/python2.7/subprocess.py", line 540, in check_call
raise CalledProcessError(retcode, cmd)
CalledProcessError: Command '/data0/ads/chenglei/mxnet/dmlc-core/tracker/dmlc_tracker/launcher.py python train_mnist.py --network lenet --kv-store dist_sync' returned non-zero exit status 1
yarn 2 -------------/usr/local/jdk1.6.045/bin/java -cp /usr/local/hadoop-2.4.0/etc/hadoop:/usr/local/hadoop-2.4.0/share/hadoop/common/lib/:/usr/local/hadoop-2.4.0/share/hadoop/common/:/usr/local/hadoop-2.4.0/share/hadoop/hdfs:/usr/local/hadoop-2.4.0/share/hadoop/hdfs/lib/:/usr/local/hadoop-2.4.0/share/hadoop/hdfs/:/usr/local/hadoop-2.4.0/share/hadoop/yarn/lib/:/usr/local/hadoop-2.4.0/share/hadoop/yarn/:/usr/local/hadoop-2.4.0/share/hadoop/mapreduce/lib/:/usr/local/hadoop-2.4.0/share/hadoop/mapreduce/_:/usr/local/hadoop-2.4.0/contrib/capacity-scheduler/*.jar:/data0/ads/chenglei/mxnet/tools/../dmlc-core/tracker/dmlc_tracker/../yarn/dmlc-yarn.jar org.apache.hadoop.yarn.dmlc.Client -file /data0/ads/chenglei/mxnet/tools/../dmlc-core/tracker/dmlc_tracker/../yarn/dmlc-yarn.jar -file train_mnist.py -file /data0/ads/chenglei/mxnet/dmlc-core/tracker/dmlc_tracker/launcher.py -jobname DMLC[nworker=2,nsever=2]:python -tempdir /tmp -queue default ./launcher.py python ./train_mnist.py --network lenet --kv-store dist_sync
16/08/08 15:59:55 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable Exception in thread "main" org.apache.hadoop.security.AccessControlException: Permission denied: user=ads, access=WRITE, inode="/tmp":hadoop:supergroup:drwxr-xr-x at org.apache.hadoop.hdfs.server.namenode.FSPermissionChecker.checkFsPermission(FSPermissionChecker.java:274) at org.apache.hadoop.hdfs.server.namenode.FSPermissionChecker.check(FSPermissionChecker.java:260) at org.apache.hadoop.hdfs.server.namenode.FSPermissionChecker.check(FSPermissionChecker.java:241) at org.apache.hadoop.hdfs.server.namenode.FSPermissionChecker.checkPermission(FSPermissionChecker.java:185) at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.checkPermission(FSNamesystem.java:5546) at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.checkPermission(FSNamesystem.java:5528) at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.checkAncestorAccess(FSNamesystem.java:5493) at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirsInternal(FSNamesystem.java:3632) at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirsInt(FSNamesystem.java:3602) at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirs(FSNamesystem.java:3576) at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.mkdirs(NameNodeRpcServer.java:760) at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.mkdirs(ClientNamenodeProtocolServerSideTranslatorPB.java:560) at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java) at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:585) at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:928) at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2013) at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2009) at 
java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:396) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1550) at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2007)
Caused by: org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.security.AccessControlException): Permission denied: user=ads, access=WRITE, inode="/tmp":hadoop:supergroup:drwxr-xr-x at org.apache.hadoop.hdfs.server.namenode.FSPermissionChecker.checkFsPermission(FSPermissionChecker.java:274) at org.apache.hadoop.hdfs.server.namenode.FSPermissionChecker.check(FSPermissionChecker.java:260) at org.apache.hadoop.hdfs.server.namenode.FSPermissionChecker.check(FSPermissionChecker.java:241) at org.apache.hadoop.hdfs.server.namenode.FSPermissionChecker.checkPermission(FSPermissionChecker.java:185) at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.checkPermission(FSNamesystem.java:5546) at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.checkPermission(FSNamesystem.java:5528) at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.checkAncestorAccess(FSNamesystem.java:5493) at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirsInternal(FSNamesystem.java:3632) at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirsInt(FSNamesystem.java:3602) at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirs(FSNamesystem.java:3576) at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.mkdirs(NameNodeRpcServer.java:760) at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.mkdirs(ClientNamenodeProtocolServerSideTranslatorPB.java:560) at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java) at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:585) at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:928) at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2013) at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2009) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:396) at 
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1550) at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2007)
Exception in thread Thread-2: Traceback (most recent call last): File "/usr/lib/python2.7/threading.py", line 801, in bootstrap_inner self.run() File "/usr/lib/python2.7/threading.py", line 754, in run self.__target(_self.args, _self.__kwargs) File "/data0/ads/chenglei/mxnet/tools/../dmlc-core/tracker/dmlc_tracker/yarn.py", line 114, in run subprocess.check_call(cmd, shell=True, env=env) File "/usr/lib/python2.7/subprocess.py", line 540, in check_call raise CalledProcessError(retcode, cmd) CalledProcessError: Command '/usr/local/jdk1.6.045/bin/java -cp /usr/local/hadoop-2.4.0/etc/hadoop:/usr/local/hadoop-2.4.0/share/hadoop/common/lib/:/usr/local/hadoop-2.4.0/share/hadoop/common/:/usr/local/hadoop-2.4.0/share/hadoop/hdfs:/usr/local/hadoop-2.4.0/share/hadoop/hdfs/lib/:/usr/local/hadoop-2.4.0/share/hadoop/hdfs/:/usr/local/hadoop-2.4.0/share/hadoop/yarn/lib/:/usr/local/hadoop-2.4.0/share/hadoop/yarn/:/usr/local/hadoop-2.4.0/share/hadoop/mapreduce/lib/:/usr/local/hadoop-2.4.0/share/hadoop/mapreduce/_:/usr/local/hadoop-2.4.0/contrib/capacity-scheduler/.jar:/data0/ads/chenglei/mxnet/tools/../dmlc-core/tracker/dmlc_tracker/../yarn/dmlc-yarn.jar org.apache.hadoop.yarn.dmlc.Client -file /data0/ads/chenglei/mxnet/tools/../dmlc-core/tracker/dmlc_tracker/../yarn/dmlc-yarn.jar -file train_mnist.py -file /data0/ads/chenglei/mxnet/dmlc-core/tracker/dmlc_tracker/launcher.py -jobname DMLC[nworker=2,nsever=2]:python -tempdir /tmp -queue default ./launcher.py python ./train_mnist.py --network lenet --kv-store dist_sync' returned non-zero exit status 1