Open eric-haibin-lin opened 5 years ago
http://ci.mxnet.io/blue/organizations/jenkins/GluonNLP-py3-master-gpu-integration/detail/PR-893/1/pipeline
=================================== FAILURES =================================== ______________________ test_skipgram_cbow[False-skipgram] ______________________ model = 'skipgram', fasttext = False @pytest.mark.serial @pytest.mark.remote_required @pytest.mark.gpu @pytest.mark.integration @pytest.mark.parametrize('model', ['skipgram', 'cbow']) @pytest.mark.parametrize('fasttext', [True, False]) def test_skipgram_cbow(model, fasttext): cmd = [ sys.executable, './scripts/word_embeddings/train_sg_cbow.py', '--gpu', '0', '--epochs', '2', '--model', model, '--data', 'toy', '--batch-size', '64'] cmd += ['--similarity-datasets', 'WordSim353'] cmd += ['--analogy-datasets', 'GoogleAnalogyTestSet'] if fasttext: cmd += ['--ngram-buckets', '1000'] else: cmd += ['--ngram-buckets', '0'] > subprocess.check_call(cmd) scripts/tests/test_scripts.py:46: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ popenargs = (['/var/lib/jenkins/workspace/gluon-nlp-gpu-py3-master@2/conda/gpu/py3-master/bin/python', './scripts/word_embeddings/train_sg_cbow.py', '--gpu', '0', '--epochs', '2', ...],) kwargs = {}, retcode = 1 cmd = ['/var/lib/jenkins/workspace/gluon-nlp-gpu-py3-master@2/conda/gpu/py3-master/bin/python', './scripts/word_embeddings/train_sg_cbow.py', '--gpu', '0', '--epochs', '2', ...] def check_call(*popenargs, **kwargs): """Run command with arguments. Wait for command to complete. If the exit code was zero then return, otherwise raise CalledProcessError. The CalledProcessError object will have the return code in the returncode attribute. The arguments are the same as for the call function. 
Example: check_call(["ls", "-l"]) """ retcode = call(*popenargs, **kwargs) if retcode: cmd = kwargs.get("args") if cmd is None: cmd = popenargs[0] > raise CalledProcessError(retcode, cmd) E subprocess.CalledProcessError: Command '['/var/lib/jenkins/workspace/gluon-nlp-gpu-py3-master@2/conda/gpu/py3-master/bin/python', './scripts/word_embeddings/train_sg_cbow.py', '--gpu', '0', '--epochs', '2', '--model', 'skipgram', '--data', 'toy', '--batch-size', '64', '--similarity-datasets', 'WordSim353', '--analogy-datasets', 'GoogleAnalogyTestSet', '--ngram-buckets', '0']' returned non-zero exit status 1. conda/gpu/py3-master/lib/python3.6/subprocess.py:291: CalledProcessError ----------------------------- Captured stderr call ----------------------------- INFO:root:Starting to count and construct vocabulary INFO:root:Finished to count and construct vocabulary in 0.01 seconds INFO:root:Starting to code data INFO:root:Finished to code data in 0.02 seconds Traceback (most recent call last): File "./scripts/word_embeddings/train_sg_cbow.py", line 324, in <module> train(args_) File "./scripts/word_embeddings/train_sg_cbow.py", line 182, in train embedding.initialize(ctx=context) File "/var/lib/jenkins/workspace/gluon-nlp-gpu-py3-master@2/conda/gpu/py3-master/lib/python3.6/site-packages/mxnet/gluon/block.py", line 535, in initialize self.collect_params().initialize(init, ctx, verbose, force_reinit) File "/var/lib/jenkins/workspace/gluon-nlp-gpu-py3-master@2/conda/gpu/py3-master/lib/python3.6/site-packages/mxnet/gluon/parameter.py", line 886, in initialize v.initialize(None, ctx, init, force_reinit=force_reinit) File "/var/lib/jenkins/workspace/gluon-nlp-gpu-py3-master@2/conda/gpu/py3-master/lib/python3.6/site-packages/mxnet/gluon/parameter.py", line 463, in initialize self._finish_deferred_init() File "/var/lib/jenkins/workspace/gluon-nlp-gpu-py3-master@2/conda/gpu/py3-master/lib/python3.6/site-packages/mxnet/gluon/parameter.py", line 347, in _finish_deferred_init 
self._init_impl(data, ctx) File "/var/lib/jenkins/workspace/gluon-nlp-gpu-py3-master@2/conda/gpu/py3-master/lib/python3.6/site-packages/mxnet/gluon/parameter.py", line 359, in _init_impl self._data = [data.copyto(ctx) for ctx in self._ctx_list] File "/var/lib/jenkins/workspace/gluon-nlp-gpu-py3-master@2/conda/gpu/py3-master/lib/python3.6/site-packages/mxnet/gluon/parameter.py", line 359, in <listcomp> self._data = [data.copyto(ctx) for ctx in self._ctx_list] File "/var/lib/jenkins/workspace/gluon-nlp-gpu-py3-master@2/conda/gpu/py3-master/lib/python3.6/site-packages/mxnet/ndarray/ndarray.py", line 2463, in copyto hret = NDArray(_new_alloc_handle(self.shape, other, True, self.dtype)) File "/var/lib/jenkins/workspace/gluon-nlp-gpu-py3-master@2/conda/gpu/py3-master/lib/python3.6/site-packages/mxnet/ndarray/ndarray.py", line 142, in _new_alloc_handle ctypes.byref(hdl))) File "/var/lib/jenkins/workspace/gluon-nlp-gpu-py3-master@2/conda/gpu/py3-master/lib/python3.6/site-packages/mxnet/base.py", line 253, in check_call raise MXNetError(py_str(_LIB.MXGetLastError())) mxnet.base.MXNetError: [17:09:14] include/mxnet/base.h:459: Check failed: e == cudaSuccess (304 vs. 
0) : CUDA: OS call failed or operation not supported on this OS Stack trace: [bt] (0) /var/lib/jenkins/workspace/gluon-nlp-gpu-py3-master@2/conda/gpu/py3-master/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x509cdb) [0x7efd14083cdb] [bt] (1) /var/lib/jenkins/workspace/gluon-nlp-gpu-py3-master@2/conda/gpu/py3-master/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x29eb995) [0x7efd16565995] [bt] (2) /var/lib/jenkins/workspace/gluon-nlp-gpu-py3-master@2/conda/gpu/py3-master/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x33d1ef5) [0x7efd16f4bef5] [bt] (3) /var/lib/jenkins/workspace/gluon-nlp-gpu-py3-master@2/conda/gpu/py3-master/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x8ff20d) [0x7efd1447920d] [bt] (4) /var/lib/jenkins/workspace/gluon-nlp-gpu-py3-master@2/conda/gpu/py3-master/lib/python3.6/site-packages/mxnet/libmxnet.so(MXNDArrayCreateEx+0x1ff) [0x7efd1656e2cf] [bt] (5) /var/lib/jenkins/workspace/gluon-nlp-gpu-py3-master@2/conda/gpu/py3-master/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c) [0x7efe38ca1630] [bt] (6) /var/lib/jenkins/workspace/gluon-nlp-gpu-py3-master@2/conda/gpu/py3-master/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call+0x22d) [0x7efe38ca0fed] [bt] (7) /var/lib/jenkins/workspace/gluon-nlp-gpu-py3-master@2/conda/gpu/py3-master/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(_ctypes_callproc+0x2ce) [0x7efe38cb800e] [bt] (8) /var/lib/jenkins/workspace/gluon-nlp-gpu-py3-master@2/conda/gpu/py3-master/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(+0x13a45) [0x7efe38cb8a45] =============================== warnings summary =============================== src/gluonnlp/data/registry.py:91 /var/lib/jenkins/workspace/gluon-nlp-gpu-py3-master@2/src/gluonnlp/data/registry.py:91: UserWarning: New dataset dataset.TOY registered with name toy isoverriding existing dataset scripts.machine_translation.dataset.TOY return register_(class_)
Could this be an MXNet regression? The error is: "CUDA: OS call failed or operation not supported on this OS".
http://ci.mxnet.io/blue/organizations/jenkins/GluonNLP-py3-master-gpu-integration/detail/PR-893/1/pipeline