Closed zhoudan-brandeis closed 1 year ago
After doing some digging, it appears that this error is not isolated to the Cosmos-HMM model; it also appears with the default cosmos fitting settings.
Which version of Tapqir are you using? Can you try using the example data from the tutorial and see if you get the same error message?
I was on 1.1.8 when this error appeared; I have since updated to 1.1.12. I will try with Grace's example data and report back
Same error with example data:
RuntimeError Traceback (most recent call last) File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/pyro/poutine/trace_messenger.py:174, in TraceHandler.__call__(self, *args, **kwargs) 173 try: --> 174 ret = self.fn(*args, **kwargs) 175 except (ValueError, RuntimeError) as e:
File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/tapqir/models/hmm.py:326, in hmm.guide(self) 325 ndx = ndx[:, None, None] --> 326 mask = Vindex(self.data.mask)[ndx].to(self.device) 327 with handlers.mask(mask=mask):
File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/pyro/ops/indexing.py:217, in Vindex.__getitem__(self, args) 216 def __getitem__(self, args): --> 217 return vindex(self._tensor, args)
File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/pyro/ops/indexing.py:152, in vindex(tensor, args) 151 if not isinstance(args, tuple): --> 152 return tensor[args] 153 if not args:
RuntimeError: indices should be either on cpu or on the same device as the indexed tensor (cpu)
The above exception was the direct cause of the following exception:
RuntimeError Traceback (most recent call last) File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/tapqir/models/model.py:220, in Model.run(self, num_iter, progress_bar) 219 try: --> 220 self.iter_loss = self.svi.step() 221 # save a checkpoint every 200 iterations
File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/pyro/infer/svi.py:145, in SVI.step(self, *args, **kwargs) 144 with poutine.trace(param_only=True) as param_capture: --> 145 loss = self.loss_and_grads(self.model, self.guide, *args, **kwargs) 147 params = set( 148 site["value"].unconstrained() for site in param_capture.trace.nodes.values() 149 )
File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/pyro/contrib/funsor/infer/elbo.py:20, in ELBO.loss_and_grads(self, model, guide, *args, **kwargs) 19 def loss_and_grads(self, model, guide, *args, **kwargs): ---> 20 loss = self.differentiable_loss(model, guide, *args, **kwargs) 21 loss.backward()
File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/tapqir/infer/elbo.py:29, in TraceMarkovEnum_ELBO.differentiable_loss(self, model, guide, *args, **kwargs) 22 with plate( 23 size=self.num_particles 24 ) if self.num_particles > 1 else contextlib.ExitStack(), enum( (...) 27 else None 28 ): ---> 29 guide_tr = trace()(guide).get_trace(*args, **kwargs) 30 model_tr = trace()(replay(model, trace=guide_tr)).get_trace(*args, **kwargs)
File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/pyro/poutine/trace_messenger.py:198, in TraceHandler.get_trace(self, *args, **kwargs) 191 """ 192 :returns: data structure 193 :rtype: pyro.poutine.Trace (...) 196 Calls this poutine and returns its trace instead of the function's return value. 197 """ --> 198 self(*args, **kwargs) 199 return self.msngr.get_trace()
File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/pyro/poutine/trace_messenger.py:180, in TraceHandler.__call__(self, *args, **kwargs) 179 exc = exc.with_traceback(traceback) --> 180 raise exc from e 181 self.msngr.trace.add_node( 182 "_RETURN", name="_RETURN", type="return", value=ret 183 )
File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/pyro/poutine/trace_messenger.py:174, in TraceHandler.__call__(self, *args, **kwargs) 173 try: --> 174 ret = self.fn(*args, **kwargs) 175 except (ValueError, RuntimeError) as e:
File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/tapqir/models/hmm.py:326, in hmm.guide(self) 325 ndx = ndx[:, None, None] --> 326 mask = Vindex(self.data.mask)[ndx].to(self.device) 327 with handlers.mask(mask=mask):
File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/pyro/ops/indexing.py:217, in Vindex.__getitem__(self, args) 216 def __getitem__(self, args): --> 217 return vindex(self._tensor, args)
File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/pyro/ops/indexing.py:152, in vindex(tensor, args) 151 if not isinstance(args, tuple): --> 152 return tensor[args] 153 if not args:
RuntimeError: indices should be either on cpu or on the same device as the indexed tensor (cpu)
Trace Shapes:
Param Sites:
gain_loc
gain_beta
init_mean 1 2
init_size 1 1
trans_mean 1 2 2
trans_size 1 2 1
lamda_loc 1
lamda_beta 1
proximity_loc
proximity_size
Sample Sites:
gain dist |
value |
init dist | 1 2
value | 1 2
trans dist | 1 2 2
value | 1 2 2
lamda dist | 1
value | 1
proximity dist |
value |
spots dist |
value 2 |
aois dist |
value 10 |
channels dist |
value 1 |
During handling of the above exception, another exception occurred:
AssertionError Traceback (most recent call last) File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/tapqir/gui.py:528, in fitCmd(b, layout, out, DEFAULTS) 526 DEFAULTS["priors"].update(layout["priors"].children[0].kwargs) 527 with out: --> 528 fit( 529 **layout.kwargs, 530 k_max=2, 531 funsor=False, 532 pykeops=True, 533 no_input=True, 534 progress_bar=tqdm_notebook, 535 ) 537 out.clear_output(wait=True)
File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/tapqir/main.py:472, in fit(model, S, cuda, nbatch_size, fbatch_size, learning_rate, num_iter, k_max, matlab, funsor, pykeops, overwrite, no_input, progress_bar) 470 model.init(learning_rate, nbatch_size, fbatch_size) 471 try: --> 472 model.run(num_iter, progress_bar=progress_bar) 473 except CudaOutOfMemoryError: 474 logger.exception("Failed to fit the data")
File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/tapqir/models/model.py:242, in Model.run(self, num_iter, progress_bar) 238 logger.warning( 239 f"Iteration #{self.iter} restarting with a new seed: {new_seed}." 240 ) 241 except RuntimeError as err: --> 242 assert err.args[0].startswith("CUDA out of memory") 243 raise CudaOutOfMemoryError() 244 else:
AssertionError:
Dan, are you running Tapqir on the lab computer (with GPU) or your own laptop? Do you have CUDA installed?
It all works fine on the lab computer for me. So I suspect it is something that might have to do with the installation if it is another computer.
I am running on the lab computer, on my @brandeis USERS account. I was able to extract AOIs just fine
Dan, can you please try out the new version of Tapqir v1.1.14? Let me know if you still encounter this issue.
Hi Yerdos, looks like 1.1.14 is working so far!