Using the script provided above, I tested the following operators: ['Reshape', 'transpose', 'elemwise_add', 'elemwise_sub', 'sum', '_lesser_scalar', '_greater_scalar', '_minus_scalar', '_plus_scalar']. These ops all fail with the same "Invalid argument" error during the backward pass.
@samskalicky @szha @eric-haibin-lin do you have an idea what could be causing this?
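The per-op check is essentially a loop like the sketch below. This is not the exact script referenced above: the one-op bodies and the hybridize(backend='default') call used to trigger the default subgraph property are illustrative assumptions only.

import mxnet as mx
from mxnet import autograd
from mxnet.gluon import nn

# Illustrative one-op bodies for a few of the failing operators.
one_op_bodies = {
    'Reshape':      lambda F, x: F.reshape(x, shape=(-1,)),
    'transpose':    lambda F, x: F.transpose(x),
    'elemwise_add': lambda F, x: F.elemwise_add(x, x),
    'sum':          lambda F, x: F.sum(x),
    '_plus_scalar': lambda F, x: x + 1.0,
}

for name, body in one_op_bodies.items():
    net = nn.HybridLambda(body)
    net.hybridize(backend='default')   # assumed hook for the default subgraph property
    x = mx.nd.ones((2, 2))
    x.attach_grad()
    with autograd.record():
        y = net(x)
    y.backward()                       # each op above hits the same "Invalid argument" here
    print(name, x.grad.sum().asscalar())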
Here is the stack trace for this error. To obtain it I applied the following patch, which removes the API_BEGIN()/API_END() guards so that the failure is not converted into an ordinary error return and instead aborts in place, letting gdb capture the full back trace:
diff --git a/src/c_api/c_api_ndarray.cc b/src/c_api/c_api_ndarray.cc
index ebb3134ae..e0ba2791e 100644
--- a/src/c_api/c_api_ndarray.cc
+++ b/src/c_api/c_api_ndarray.cc
@@ -394,7 +394,6 @@ int MXAutogradBackwardEx(uint32_t num_output,
NDArrayHandle **grad_handles,
int **grad_stypes) {
MXAPIThreadLocalEntry<> *ret = MXAPIThreadLocalStore<>::Get();
- API_BEGIN();
std::vector<NDArray*> outputs, ograds, variables;
outputs.reserve(num_output);
@@ -430,7 +429,7 @@ int MXAutogradBackwardEx(uint32_t num_output,
*grad_handles = dmlc::BeginPtr(ret->ret_handles);
*grad_stypes = dmlc::BeginPtr(ret->out_types);
}
- API_END();
+ return 0;
}
int MXAutogradGetSymbol(NDArrayHandle handle, SymbolHandle *out) {
The build used cmake -GNinja -DCMAKE_BUILD_TYPE=Debug -DLOG_FATAL_THROW=0 -DUSE_CUDA=0 ..; ninja
Back Trace:
Note: static_alloc is set to true for internal CachedOps in the default subgraph property, hence backward goes through StaticBackward.
Thread 1 "python3" received signal SIGABRT, Aborted.
__GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:51
51 ../sysdeps/unix/sysv/linux/raise.c: No such file or directory.
(gdb) bt
#0 0x00007ffff7a22f47 in __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:51
#1 0x00007ffff7a248b1 in __GI_abort () at abort.c:79
#2 0x00007ffff7a1442a in __assert_fail_base (fmt=0x7ffff7b9ba38 "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n", assertion=assertion@entry=0x7ffff77d9750 "INTERNAL_SYSCALL_ERRNO (e, __err) != ESRCH || !robust", file=file@entry=0x7ffff77d9695 "../nptl/pthread_mutex_lock.c", line=line@entry=425, function=function@entry=0x7ffff77d9800 <__PRETTY_FUNCTION__.8909> "__pthread_mutex_lock_full") at assert.c:92
#3 0x00007ffff7a144a2 in __GI___assert_fail (assertion=assertion@entry=0x7ffff77d9750 "INTERNAL_SYSCALL_ERRNO (e, __err) != ESRCH || !robust", file=file@entry=0x7ffff77d9695 "../nptl/pthread_mutex_lock.c", line=line@entry=425, function=function@entry=0x7ffff77d9800 <__PRETTY_FUNCTION__.8909> "__pthread_mutex_lock_full") at assert.c:101
#4 0x00007ffff77cef3c in __pthread_mutex_lock_full (mutex=0x1eb5690) at ../nptl/pthread_mutex_lock.c:425
#5 0x00007fff42e242bf in __gthread_mutex_lock(__gthread_mutex_t*) (__mutex=0x1eb5690) at /usr/include/x86_64-linux-gnu/c++/7/bits/gthr-default.h:748
#6 0x00007fff42e24552 in std::mutex::lock() (this=0x1eb5690) at /usr/include/c++/7/bits/std_mutex.h:103
#7 0x00007fff42e9219a in std::lock_guard<std::mutex>::lock_guard(std::mutex&) (this=0x7fffffff5970, __m=...) at /usr/include/c++/7/bits/std_mutex.h:162
#8 0x00007fff431b49bd in mxnet::CachedOp::StaticBackward(bool, mxnet::OpStatePtr const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&) (this=0x1eeca90, retain_graph=false, state_ptr=..., inputs=std::vector of length 3, capacity 3 = {...}, reqs=std::vector of length 1, capacity 1 = {...}, outputs=std::vector of length 1, capacity 1 = {...}) at ../src/imperative/cached_op.cc:935
#9 0x00007fff431b59c6 in mxnet::CachedOp::Backward(bool, mxnet::OpStatePtr const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&) (this=0x1eeca90, retain_graph=false, state=..., inputs=std::vector of length 3, capacity 3 = {...}, reqs=std::vector of length 1, capacity 1 = {...}, outputs=std::vector of length 1, capacity 1 = {...}) at ../src/imperative/cached_op.cc:1046
#10 0x00007fff43219394 in (anonymous namespace)::InvokeOperator(const nnvm::IndexedGraph &, int, bool, const std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > &, mxnet::Context, std::vector<mxnet::OpStatePtr, std::allocator<mxnet::OpStatePtr> > *, const std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > &, const std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > &, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > *, std::vector<unsigned int, std::allocator<unsigned int> > *, std::function<void(const mxnet::OpStatePtr&)>) (idx=..., node_idx=3, retain_graph=false, arrays=std::vector of length 4, capacity 4 = {...}, ctx=..., p_states=0x7fffffff63a0, ndinputs=std::vector of length 3, capacity 3 = {...}, ndoutputs=std::vector of length 1, capacity 1 = {...}, p_req=0x7fffffff5f90, p_ref_count=0x7fffffff6380, invoke=...) at ../src/imperative/imperative_utils.cc:91
#11 0x00007fff43219faf in mxnet::imperative::RunGraph(bool, nnvm::IndexedGraph const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, unsigned long, unsigned long, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> >&&, std::vector<unsigned int, std::allocator<unsigned int> >&&, std::vector<mxnet::OpStatePtr, std::allocator<mxnet::OpStatePtr> >*, std::vector<mxnet::DispatchMode, std::allocator<mxnet::DispatchMode> > const&, bool, std::vector<mxnet::TShape, std::allocator<mxnet::TShape> >*, std::function<void (char const*, char const*, void*)> const&, bool) (retain_graph=false, idx=..., arrays=std::vector of length 4, capacity 4 = {...}, node_start=2, node_end=4, array_reqs=..., ref_count=..., p_states=0x7fffffff63a0, dispatch_modes=std::vector of length 4, capacity 4 = {...}, recording=false, shapes=0x0, callback=..., monitor_all=false)
at ../src/imperative/imperative_utils.cc:165
#12 0x00007fff431fcaa9 in mxnet::Imperative::Backward(std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, bool, bool, bool) (this=0x7fff51515ac0 <mxnet::Imperative::Get()::inst>, outputs=std::vector of length 1, capacity 1 = {...}, ograds=std::vector of length 1, capacity 1 = {...}, variables=std::vector of length 0, capacity 0, is_train=true, retain_graph=false, create_graph=false) at ../src/imperative/imperative.cc:616
#13 0x00007fff43062242 in MXAutogradBackwardEx(uint32_t, NDArrayHandle*, NDArrayHandle*, uint32_t, NDArrayHandle*, int, int, int, NDArrayHandle**, int**) (num_output=1, output_handles=0x7fff93f89de0, ograd_handles=0x7fff93f89c48, num_variables=0, var_handles=0x0, retain_graph=0, create_graph=0, is_train=1, grad_handles=0x0, grad_stypes=0x0) at ../src/c_api/c_api_ndarray.cc:419
And this is the stack trace when static_alloc for the internal CachedOp is set to false in the subgraph property. I used the following additional patch to obtain it:
diff --git a/src/operator/subgraph/default_subgraph_property.cc b/src/operator/subgraph/default_subgraph_property.cc
index ff51b6397..4228c6521 100644
--- a/src/operator/subgraph/default_subgraph_property.cc
+++ b/src/operator/subgraph/default_subgraph_property.cc
@@ -64,7 +64,7 @@ class DefaultSubgraphProperty: public SubgraphProperty {
n->attrs.name = "_CachedOp" + std::to_string(subgraph_id);
n->attrs.subgraphs.push_back(std::make_shared<nnvm::Symbol>(sym));
- std::vector<std::pair<std::string, std::string> > flags{{"static_alloc", "true"}};
+ std::vector<std::pair<std::string, std::string> > flags{};
n->attrs.parsed = std::make_shared<CachedOp>(sym, flags);
return n;
Back Trace:
Note: This time it fails in DynamicBackward (see frame #13 below).
Thread 1 "python3" received signal SIGABRT, Aborted.
__GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:51
51 ../sysdeps/unix/sysv/linux/raise.c: No such file or directory.
(gdb) bt
#0 0x00007ffff7a22f47 in __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:51
#1 0x00007ffff7a248b1 in __GI_abort () at abort.c:79
#2 0x00007fffe93a6957 in () at /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#3 0x00007fffe93acae6 in () at /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#4 0x00007fffe93acb21 in () at /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#5 0x00007fffe93acd54 in () at /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#6 0x00007fffe93d5012 in () at /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#7 0x00007fff42e355ac in __gnu_cxx::new_allocator<nnvm::NodeEntry>::allocate(unsigned long, void const*) (this=0x23893a0, __n=12297829382473034410) at /usr/include/c++/7/ext/new_allocator.h:102
#8 0x00007fff42e33c4c in std::allocator_traits<std::allocator<nnvm::NodeEntry> >::allocate(std::allocator<nnvm::NodeEntry>&, unsigned long) (__a=..., __n=12297829382473034410) at /usr/include/c++/7/bits/alloc_traits.h:436
#9 0x00007fff42e3162e in std::_Vector_base<nnvm::NodeEntry, std::allocator<nnvm::NodeEntry> >::_M_allocate(unsigned long) (this=0x23893a0, __n=12297829382473034410) at /usr/include/c++/7/bits/stl_vector.h:172
#10 0x00007fff42e2ec8e in std::vector<nnvm::NodeEntry, std::allocator<nnvm::NodeEntry> >::_M_allocate_and_copy<__gnu_cxx::__normal_iterator<nnvm::NodeEntry const*, std::vector<nnvm::NodeEntry, std::allocator<nnvm::NodeEntry> > > >(unsigned long, __gnu_cxx::__normal_iterator<nnvm::NodeEntry const*, std::vector<nnvm::NodeEntry, std::allocator<nnvm::NodeEntry> > >, __gnu_cxx::__normal_iterator<nnvm::NodeEntry const*, std::vector<nnvm::NodeEntry, std::allocator<nnvm::NodeEntry> > >) (this=0x23893a0, __n=12297829382473034410, __first={node = <error reading variable: Cannot access memory at address 0xf0000000a>, index = 15, version = 0}, __last=
{node = <error reading variable: Cannot access memory at address 0x100000010>, index = 1362897120, version = 32767}) at /usr/include/c++/7/bits/stl_vector.h:1260
#11 0x00007fff42e2c50e in std::vector<nnvm::NodeEntry, std::allocator<nnvm::NodeEntry> >::operator=(std::vector<nnvm::NodeEntry, std::allocator<nnvm::NodeEntry> > const&) (this=0x23893a0, __x=std::vector of length 0, capacity -41698 = {...}) at /usr/include/c++/7/bits/vector.tcc:206
#12 0x00007fff431c3b79 in nnvm::Graph::operator=(nnvm::Graph const&) (this=0x23893a0) at ../include/nnvm/graph.h:46
#13 0x00007fff431b3c7e in mxnet::CachedOp::DynamicBackward(bool, mxnet::OpStatePtr const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&) (this=0x1eeca90, retain_graph=false, op_state=..., inputs=std::vector of length 3, capacity 3 = {...}, reqs=std::vector of length 1, capacity 1 = {...}, outputs=std::vector of length 1, capacity 1 = {...}) at ../src/imperative/cached_op.cc:853
#14 0x00007fff431b5a00 in mxnet::CachedOp::Backward(bool, mxnet::OpStatePtr const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&) (this=0x1eeca90, retain_graph=false, state=..., inputs=std::vector of length 3, capacity 3 = {...}, reqs=std::vector of length 1, capacity 1 = {...}, outputs=std::vector of length 1, capacity 1 = {...}) at ../src/imperative/cached_op.cc:1048
#15 0x00007fff43219394 in (anonymous namespace)::InvokeOperator(const nnvm::IndexedGraph &, int, bool, const std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > &, mxnet::Context, std::vector<mxnet::OpStatePtr, std::allocator<mxnet::OpStatePtr> > *, const std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > &, const std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > &, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > *, std::vector<unsigned int, std::allocator<unsigned int> > *, std::function<void(const mxnet::OpStatePtr&)>) (idx=..., node_idx=3, retain_graph=false, arrays=std::vector of length 4, capacity 4 = {...}, ctx=..., p_states=0x7fffffff63a0, ndinputs=std::vector of length 3, capacity 3 = {...}, ndoutputs=std::vector of length 1, capacity 1 = {...}, p_req=0x7fffffff5f90, p_ref_count=0x7fffffff6380, invoke=...) at ../src/imperative/imperative_utils.cc:91
#16 0x00007fff43219faf in mxnet::imperative::RunGraph(bool, nnvm::IndexedGraph const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, unsigned long, unsigned long, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> >&&, std::vector<unsigned int, std::allocator<unsigned int> >&&, std::vector<mxnet::OpStatePtr, std::allocator<mxnet::OpStatePtr> >*, std::vector<mxnet::DispatchMode, std::allocator<mxnet::DispatchMode> > const&, bool, std::vector<mxnet::TShape, std::allocator<mxnet::TShape> >*, std::function<void (char const*, char const*, void*)> const&, bool) (retain_graph=false, idx=..., arrays=std::vector of length 4, capacity 4 = {...}, node_start=2, node_end=4, array_reqs=..., ref_count=..., p_states=0x7fffffff63a0, dispatch_modes=std::vector of length 4, capacity 4 = {...}, recording=false, shapes=0x0, callback=..., monitor_all=false)
at ../src/imperative/imperative_utils.cc:165
#17 0x00007fff431fcaa9 in mxnet::Imperative::Backward(std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, bool, bool, bool) (this=0x7fff51515ac0 <mxnet::Imperative::Get()::inst>, outputs=std::vector of length 1, capacity 1 = {...}, ograds=std::vector of length 1, capacity 1 = {...}, variables=std::vector of length 0, capacity 0, is_train=true, retain_graph=false, create_graph=false) at ../src/imperative/imperative.cc:616
#18 0x00007fff43062242 in MXAutogradBackwardEx(uint32_t, NDArrayHandle*, NDArrayHandle*, uint32_t, NDArrayHandle*, int, int, int, NDArrayHandle**, int**) (num_output=1, output_handles=0x7fff90f87de0, ograd_handles=0x7fff90f87c48, num_variables=0, var_handles=0x0, retain_graph=0, create_graph=0, is_train=1, grad_handles=0x0, grad_stypes=0x0) at ../src/c_api/c_api_ndarray.cc:419
Fixed by #19614
Description
When a computation graph is partitioned, ops are grouped into subgraphs based on the subgraph property. For CachedOp subgraphs containing a Reshape and/or transpose op, the backward pass fails.
Error Message
To Reproduce
Here's a simple script to reproduce the error with Reshape op:
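A minimal sketch of such a script (not necessarily the exact one; it assumes the Gluon hybridize(backend='default') hook is what triggers the default subgraph property, and the original script may drive partitioning differently):

import mxnet as mx
from mxnet import autograd
from mxnet.gluon import nn

# A single Reshape, wrapped so that partitioning places it inside a subgraph CachedOp.
net = nn.HybridLambda(lambda F, x: F.reshape(x, shape=(2, 2)))
net.hybridize(backend='default')   # assumed hook for the default subgraph property

x = mx.nd.arange(4)
x.attach_grad()
with autograd.record():
    y = net(x)
y.backward()                       # backward through the subgraph CachedOp fails here
print(x.grad)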
It produces a partitioned graph in which the Reshape op is wrapped inside a _CachedOp subgraph node.
Steps to reproduce
pip3 install -U --user https://repo.mxnet.io/dist/python/cpu/mxnet-2.0.0b20200710-py2.py3-none-manylinux2014_x86_64.whl
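A quick sanity check from Python that the intended nightly build is the one being imported (assuming the standard mxnet.runtime helpers):

import mxnet as mx
from mxnet.runtime import Features

print(mx.__version__)   # should correspond to the nightly wheel installed above
print(Features())       # compile-time feature flags of this build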
Environment
We recommend using our script for collecting the diagnostic information. Run the following command and paste the outputs below: