Open 18217265596 opened 10 months ago
gcc: gcc version 11.2.0, cuda: cuda/12.1.1. The error file looks like this:
/dssg/home/acct-clswg/clswg/.conda/envs/SE3/lib/python3.9/site-packages/dgl/backend/pytorch/tensor.py:449: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()
assert input.numel() == input.storage().size(), (
Error executing job with overrides: ['inference.output_prefix=example_outputs/design_ppi', 'inference.input_pdb=input.pdb', 'contigmap.contigs=[A414-588/0 20-60]', 'inference.num_designs=10000', 'denoiser.noise_scale_ca=0.5', 'denoiser.noise_scale_frame=0.5']
Traceback (most recent call last):
File "/dssg/home/acct-clswg/clswg/apps/RFdiffusion/scripts/run_inference.py", line 94, in main
px0, x_t, seq_t, plddt = sampler.sample_step(
File "/dssg/home/acct-clswg/clswg/apps/RFdiffusion/rfdiffusion/inference/model_runners.py", line 664, in sample_step
msa_prev, pair_prev, px0, state_prev, alpha, logits, plddt = self.model(msa_masked,
File "/dssg/home/acct-clswg/clswg/.conda/envs/SE3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/dssg/home/acct-clswg/clswg/apps/RFdiffusion/rfdiffusion/RoseTTAFoldModel.py", line 102, in forward
msa, pair, R, T, alpha_s, state = self.simulator(seq, msa_latent, msa_full, pair, xyz[:,:,:3],
File "/dssg/home/acct-clswg/clswg/.conda/envs/SE3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/dssg/home/acct-clswg/clswg/apps/RFdiffusion/rfdiffusion/Track_module.py", line 420, in forward
msa_full, pair, R_in, T_in, state, alpha = self.extra_block[i_m](msa_full,
File "/dssg/home/acct-clswg/clswg/.conda/envs/SE3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/dssg/home/acct-clswg/clswg/apps/RFdiffusion/rfdiffusion/Track_module.py", line 332, in forward
R, T, state, alpha = self.str2str(msa, pair, R_in, T_in, xyz, state, idx, motif_mask=motif_mask, top_k=0)
File "/dssg/home/acct-clswg/clswg/.conda/envs/SE3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/dssg/home/acct-clswg/clswg/.conda/envs/SE3/lib/python3.9/site-packages/torch/amp/autocast_mode.py", line 14, in decorate_autocast
return func(*args, **kwargs)
File "/dssg/home/acct-clswg/clswg/apps/RFdiffusion/rfdiffusion/Track_module.py", line 266, in forward
shift = self.se3(G, node.reshape(BL, -1, 1), l1_feats, edge_feats)
File "/dssg/home/acct-clswg/clswg/.conda/envs/SE3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/dssg/home/acct-clswg/clswg/apps/RFdiffusion/rfdiffusion/SE3_network.py", line 83, in forward
return self.se3(G, node_features, edge_features)
File "/dssg/home/acct-clswg/clswg/.conda/envs/SE3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/dssg/home/acct-clswg/clswg/.conda/envs/SE3/lib/python3.9/site-packages/se3_transformer-1.0.0-py3.9.egg/se3_transformer/model/transformer.py", line 150, in forward
node_feats = self.graph_modules(node_feats, edge_feats, graph=graph, basis=basis)
File "/dssg/home/acct-clswg/clswg/.conda/envs/SE3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/dssg/home/acct-clswg/clswg/.conda/envs/SE3/lib/python3.9/site-packages/se3_transformer-1.0.0-py3.9.egg/se3_transformer/model/transformer.py", line 46, in forward
input = module(input, *args, **kwargs)
File "/dssg/home/acct-clswg/clswg/.conda/envs/SE3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/dssg/home/acct-clswg/clswg/.conda/envs/SE3/lib/python3.9/site-packages/se3_transformer-1.0.0-py3.9.egg/se3_transformer/model/layers/attention.py", line 163, in forward
z = self.attention(value, key, query, graph)
File "/dssg/home/acct-clswg/clswg/.conda/envs/SE3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/dssg/home/acct-clswg/clswg/.conda/envs/SE3/lib/python3.9/site-packages/se3_transformer-1.0.0-py3.9.egg/se3_transformer/model/layers/attention.py", line 83, in forward
edge_weights = edge_softmax(graph, edge_weights)
File "/dssg/home/acct-clswg/clswg/.conda/envs/SE3/lib/python3.9/site-packages/dgl/ops/edge_softmax.py", line 136, in edge_softmax
return edge_softmax_internal(
File "/dssg/home/acct-clswg/clswg/.conda/envs/SE3/lib/python3.9/site-packages/dgl/backend/pytorch/sparse.py", line 1116, in edge_softmax
return EdgeSoftmax.apply(args)
File "/dssg/home/acct-clswg/clswg/.conda/envs/SE3/lib/python3.9/site-packages/torch/autograd/function.py", line 506, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
File "/dssg/home/acct-clswg/clswg/.conda/envs/SE3/lib/python3.9/site-packages/dgl/backend/pytorch/sparse.py", line 710, in forward
score_max = _gspmm(gidx, "copy_rhs", "max", None, score)[0]
File "/dssg/home/acct-clswg/clswg/.conda/envs/SE3/lib/python3.9/site-packages/dgl/_sparse_ops.py", line 239, in _gspmm
_CAPI_DGLKernelSpMM(
File "dgl/_ffi/_cython/./function.pxi", line 295, in dgl._ffi._cy3.core.FunctionBase.call
File "dgl/_ffi/_cython/./function.pxi", line 241, in dgl._ffi._cy3.core.FuncCall
dgl._ffi.base.DGLError: [20:18:50] /opt/dgl/src/runtime/cuda/cuda_deviceapi.cc:117: Check failed: e == cudaSuccess || e == cudaErrorCudartUnloading: CUDA: out of memory
Stack trace:
[bt] (0) /dssg/home/acct-clswg/clswg/.conda/envs/SE3/lib/python3.9/site-packages/dgl/libdgl.so(+0x8b9685) [0x151d9eca0685]
[bt] (1) /dssg/home/acct-clswg/clswg/.conda/envs/SE3/lib/python3.9/site-packages/dgl/libdgl.so(dgl::runtime::CUDADeviceAPI::AllocDataSpace(DGLContext, unsigned long, unsigned long, DGLDataType)+0x17d) [0x151d9eca1fed]
[bt] (2) /dssg/home/acct-clswg/clswg/.conda/envs/SE3/lib/python3.9/site-packages/dgl/libdgl.so(dgl::runtime::NDArray::Empty(std::vector<long, std::allocator
Set the environment variable HYDRA_FULL_ERROR=1 for a complete stack trace.
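If the truncated stack trace above is not enough, the suggestion on the last line can be applied by prefixing the same command with the variable, using the same overrides reported in the error message above:

```bash
# Re-run the same overrides with Hydra's full traceback enabled
HYDRA_FULL_ERROR=1 python /dssg/home/acct-clswg/clswg/apps/RFdiffusion/scripts/run_inference.py \
    inference.output_prefix=example_outputs/design_ppi inference.input_pdb=input.pdb \
    'contigmap.contigs=[A414-588/0 20-60]' inference.num_designs=10000 \
    denoiser.noise_scale_ca=0.5 denoiser.noise_scale_frame=0.5
```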
Now I have a clue: it's that I didn't specify the hotspots (a sketch of how they are passed is shown below).
I think this issue can be closed, but I will leave it open so that people who run into the same problem can find it.
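For reference, hotspot residues for binder design are passed through the ppi.hotspot_res override. A minimal sketch of the command with hotspots added follows; the residues A473, A490 and A510 are placeholder picks inside the A414-588 target span, not recommendations for this particular target:

```bash
# Same invocation as above, with hotspot residues specified.
# A473,A490,A510 are placeholders; choose real interface residues on the target.
python /dssg/home/acct-clswg/clswg/apps/RFdiffusion/scripts/run_inference.py \
    inference.output_prefix=example_outputs/design_ppi \
    inference.input_pdb=input.pdb \
    'contigmap.contigs=[A414-588/0 20-60]' \
    'ppi.hotspot_res=[A473,A490,A510]' \
    inference.num_designs=10000 \
    denoiser.noise_scale_ca=0.5 \
    denoiser.noise_scale_frame=0.5
```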
I'm doing binder design on an A100 with
python /dssg/home/acct-clswg/clswg/apps/RFdiffusion/scripts/run_inference.py \
    inference.output_prefix=example_outputs/design_ppi \
    inference.input_pdb=input.pdb \
    'contigmap.contigs=[A414-588/0 20-50]' \
    inference.num_designs=10000 \
    denoiser.noise_scale_ca=0.5 \
    denoiser.noise_scale_frame=0.5
but I always get CUDA out of memory after maybe 10 or 20 designs. The input PDB is about 300 aa long. Is the length of my binder reasonable, or can anything else trigger such an issue? I doubt an A100 would run out of memory on a case like this.
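If memory creeps up over successive designs, one possible workaround (not from the RFdiffusion docs, just a generic batching sketch) is to split the 10000-design run into independent batches so that each process starts with a clean GPU; the batch size of 100 and the per-batch output prefixes below are arbitrary choices:

```bash
# Sketch: run 10000 designs as 100 independent batches of 100 designs each,
# so a single process never has to hold GPU state for the whole campaign.
# GPU usage can be watched from another shell with `nvidia-smi` while a batch runs.
for i in $(seq -w 0 99); do
    python /dssg/home/acct-clswg/clswg/apps/RFdiffusion/scripts/run_inference.py \
        inference.output_prefix=example_outputs/design_ppi_batch${i} \
        inference.input_pdb=input.pdb \
        'contigmap.contigs=[A414-588/0 20-50]' \
        inference.num_designs=100 \
        denoiser.noise_scale_ca=0.5 \
        denoiser.noise_scale_frame=0.5
done
```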