Error trying to fit data to Cosmos-HMM model

zhoudan-brandeis commented 1 year ago

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/pyro/poutine/trace_messenger.py:174, in TraceHandler.__call__(self, *args, **kwargs)
    173 try:
--> 174     ret = self.fn(*args, **kwargs)
    175 except (ValueError, RuntimeError) as e:

File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/tapqir/models/hmm.py:326, in hmm.guide(self)
    325 ndx = ndx[:, None, None]
--> 326 mask = Vindex(self.data.mask)[ndx].to(self.device)
    327 with handlers.mask(mask=mask):

File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/pyro/ops/indexing.py:217, in Vindex.__getitem__(self, args)
    216 def __getitem__(self, args):
--> 217     return vindex(self._tensor, args)

File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/pyro/ops/indexing.py:152, in vindex(tensor, args)
    151 if not isinstance(args, tuple):
--> 152     return tensor[args]
    153 if not args:

RuntimeError: indices should be either on cpu or on the same device as the indexed tensor (cpu)

The above exception was the direct cause of the following exception:

RuntimeError                              Traceback (most recent call last)
File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/tapqir/models/model.py:220, in Model.run(self, num_iter, progress_bar)
    219 try:
--> 220     self.iter_loss = self.svi.step()
    221     # save a checkpoint every 200 iterations

File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/pyro/infer/svi.py:145, in SVI.step(self, *args, **kwargs)
    144 with poutine.trace(param_only=True) as param_capture:
--> 145     loss = self.loss_and_grads(self.model, self.guide, *args, **kwargs)
    147 params = set(
    148     site["value"].unconstrained() for site in param_capture.trace.nodes.values()
    149 )

File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/pyro/contrib/funsor/infer/elbo.py:20, in ELBO.loss_and_grads(self, model, guide, *args, **kwargs)
     19 def loss_and_grads(self, model, guide, *args, **kwargs):
---> 20     loss = self.differentiable_loss(model, guide, *args, **kwargs)
     21     loss.backward()

File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/tapqir/infer/elbo.py:29, in TraceMarkovEnum_ELBO.differentiable_loss(self, model, guide, *args, **kwargs)
     22 with plate(
     23     size=self.num_particles
     24 ) if self.num_particles > 1 else contextlib.ExitStack(), enum(
   (...)
     27     else None
     28 ):
---> 29     guide_tr = trace()(guide).get_trace(*args, **kwargs)
     30     model_tr = trace()(replay(model, trace=guide_tr)).get_trace(*args, **kwargs)

File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/pyro/poutine/trace_messenger.py:198, in TraceHandler.get_trace(self, *args, **kwargs)
    191 """
    192 :returns: data structure
    193 :rtype: pyro.poutine.Trace
   (...)
    196 Calls this poutine and returns its trace instead of the function's return value.
    197 """
--> 198 self(*args, **kwargs)
    199 return self.msngr.get_trace()

File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/pyro/poutine/trace_messenger.py:180, in TraceHandler.__call__(self, *args, **kwargs)
    179     exc = exc.with_traceback(traceback)
--> 180     raise exc from e
    181 self.msngr.trace.add_node(
    182     "_RETURN", name="_RETURN", type="return", value=ret
    183 )

File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/pyro/poutine/trace_messenger.py:174, in TraceHandler.__call__(self, *args, **kwargs)
    173 try:
--> 174     ret = self.fn(*args, **kwargs)
    175 except (ValueError, RuntimeError) as e:

File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/tapqir/models/hmm.py:326, in hmm.guide(self)
    325 ndx = ndx[:, None, None]
--> 326 mask = Vindex(self.data.mask)[ndx].to(self.device)
    327 with handlers.mask(mask=mask):

File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/pyro/ops/indexing.py:217, in Vindex.__getitem__(self, args)
    216 def __getitem__(self, args):
--> 217     return vindex(self._tensor, args)

File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/pyro/ops/indexing.py:152, in vindex(tensor, args)
    151 if not isinstance(args, tuple):
--> 152     return tensor[args]
    153 if not args:

RuntimeError: indices should be either on cpu or on the same device as the indexed tensor (cpu)
 Trace Shapes:             
  Param Sites:             
      gain_loc             
     gain_beta             
     init_mean    1 3      
     init_size    1 1      
    trans_mean 1  3 3      
    trans_size 1  3 1      
     lamda_loc      1      
    lamda_beta      1      
 proximity_loc             
proximity_size             
 Sample Sites:             
     gain dist      |      
         value      |      
     init dist      | 1 3  
         value      | 1 3  
    trans dist      | 1 3 3
         value      | 1 3 3
    lamda dist      | 1    
         value      | 1    
proximity dist      |      
         value      |      
    spots dist      |      
         value    2 |      
     aois dist      |      
         value   10 |      
 channels dist      |      
         value    1 |      

During handling of the above exception, another exception occurred:

AssertionError                            Traceback (most recent call last)
File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/tapqir/gui.py:528, in fitCmd(b, layout, out, DEFAULTS)
    526 DEFAULTS["priors"].update(layout["priors"].children[0].kwargs)
    527 with out:
--> 528     fit(
    529         **layout.kwargs,
    530         k_max=2,
    531         funsor=False,
    532         pykeops=True,
    533         no_input=True,
    534         progress_bar=tqdm_notebook,
    535     )
    537 out.clear_output(wait=True)

File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/tapqir/main.py:472, in fit(model, S, cuda, nbatch_size, fbatch_size, learning_rate, num_iter, k_max, matlab, funsor, pykeops, overwrite, no_input, progress_bar)
    470 model.init(learning_rate, nbatch_size, fbatch_size)
    471 try:
--> 472     model.run(num_iter, progress_bar=progress_bar)
    473 except CudaOutOfMemoryError:
    474     logger.exception("Failed to fit the data")

File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/tapqir/models/model.py:242, in Model.run(self, num_iter, progress_bar)
    238         logger.warning(
    239             f"Iteration #{self.iter} restarting with a new seed: {new_seed}."
    240         )
    241     except RuntimeError as err:
--> 242         assert err.args[0].startswith("CUDA out of memory")
    243         raise CudaOutOfMemoryError()
    244 else:

AssertionError:

zhoudan-brandeis commented 1 year ago

After doing some digging, it appears that this error is not isolated to the Cosmos-HMM model; it also occurs with the default cosmos fitting settings.

ordabayevy commented 1 year ago

Which version of Tapqir are you using? Can you try using the example data from the tutorial and see if you get the same error message?

zhoudan-brandeis commented 1 year ago

I was on 1.1.8 when this error appeared; I have since updated to 1.1.12. I will try with Grace's example data and report back

zhoudan-brandeis commented 1 year ago

Same error with example data:

Fitting the data ... 0% 0/100000 [00:00<?, ?it/s]

RuntimeError                              Traceback (most recent call last)
File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/pyro/poutine/trace_messenger.py:174, in TraceHandler.__call__(self, *args, **kwargs)
    173 try:
--> 174     ret = self.fn(*args, **kwargs)
    175 except (ValueError, RuntimeError) as e:

File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/tapqir/models/hmm.py:326, in hmm.guide(self)
    325 ndx = ndx[:, None, None]
--> 326 mask = Vindex(self.data.mask)[ndx].to(self.device)
    327 with handlers.mask(mask=mask):

File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/pyro/ops/indexing.py:217, in Vindex.__getitem__(self, args)
    216 def __getitem__(self, args):
--> 217     return vindex(self._tensor, args)

File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/pyro/ops/indexing.py:152, in vindex(tensor, args)
    151 if not isinstance(args, tuple):
--> 152     return tensor[args]
    153 if not args:

RuntimeError: indices should be either on cpu or on the same device as the indexed tensor (cpu)

The above exception was the direct cause of the following exception:

RuntimeError                              Traceback (most recent call last)
File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/tapqir/models/model.py:220, in Model.run(self, num_iter, progress_bar)
    219 try:
--> 220     self.iter_loss = self.svi.step()
    221     # save a checkpoint every 200 iterations

File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/pyro/infer/svi.py:145, in SVI.step(self, *args, **kwargs)
    144 with poutine.trace(param_only=True) as param_capture:
--> 145     loss = self.loss_and_grads(self.model, self.guide, *args, **kwargs)
    147 params = set(
    148     site["value"].unconstrained() for site in param_capture.trace.nodes.values()
    149 )

File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/pyro/contrib/funsor/infer/elbo.py:20, in ELBO.loss_and_grads(self, model, guide, *args, **kwargs)
     19 def loss_and_grads(self, model, guide, *args, **kwargs):
---> 20     loss = self.differentiable_loss(model, guide, *args, **kwargs)
     21     loss.backward()

File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/tapqir/infer/elbo.py:29, in TraceMarkovEnum_ELBO.differentiable_loss(self, model, guide, *args, **kwargs)
     22 with plate(
     23     size=self.num_particles
     24 ) if self.num_particles > 1 else contextlib.ExitStack(), enum(
   (...)
     27     else None
     28 ):
---> 29     guide_tr = trace()(guide).get_trace(*args, **kwargs)
     30     model_tr = trace()(replay(model, trace=guide_tr)).get_trace(*args, **kwargs)

File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/pyro/poutine/trace_messenger.py:198, in TraceHandler.get_trace(self, *args, **kwargs)
    191 """
    192 :returns: data structure
    193 :rtype: pyro.poutine.Trace
   (...)
    196 Calls this poutine and returns its trace instead of the function's return value.
    197 """
--> 198 self(*args, **kwargs)
    199 return self.msngr.get_trace()

File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/pyro/poutine/trace_messenger.py:180, in TraceHandler.__call__(self, *args, **kwargs)
    179     exc = exc.with_traceback(traceback)
--> 180     raise exc from e
    181 self.msngr.trace.add_node(
    182     "_RETURN", name="_RETURN", type="return", value=ret
    183 )

File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/pyro/poutine/trace_messenger.py:174, in TraceHandler.__call__(self, *args, **kwargs)
    173 try:
--> 174     ret = self.fn(*args, **kwargs)
    175 except (ValueError, RuntimeError) as e:

File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/tapqir/models/hmm.py:326, in hmm.guide(self)
    325 ndx = ndx[:, None, None]
--> 326 mask = Vindex(self.data.mask)[ndx].to(self.device)
    327 with handlers.mask(mask=mask):

File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/pyro/ops/indexing.py:217, in Vindex.__getitem__(self, args)
    216 def __getitem__(self, args):
--> 217     return vindex(self._tensor, args)

File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/pyro/ops/indexing.py:152, in vindex(tensor, args)
    151 if not isinstance(args, tuple):
--> 152     return tensor[args]
    153 if not args:

During handling of the above exception, another exception occurred:

AssertionError                            Traceback (most recent call last)
File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/tapqir/gui.py:528, in fitCmd(b, layout, out, DEFAULTS)
    526 DEFAULTS["priors"].update(layout["priors"].children[0].kwargs)
    527 with out:
--> 528     fit(
    529         **layout.kwargs,
    530         k_max=2,
    531         funsor=False,
    532         pykeops=True,
    533         no_input=True,
    534         progress_bar=tqdm_notebook,
    535     )
    537 out.clear_output(wait=True)

File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/tapqir/main.py:472, in fit(model, S, cuda, nbatch_size, fbatch_size, learning_rate, num_iter, k_max, matlab, funsor, pykeops, overwrite, no_input, progress_bar)
    470 model.init(learning_rate, nbatch_size, fbatch_size)
    471 try:
--> 472     model.run(num_iter, progress_bar=progress_bar)
    473 except CudaOutOfMemoryError:
    474     logger.exception("Failed to fit the data")

File ~/anaconda3/envs/tapqir-env/lib/python3.8/site-packages/tapqir/models/model.py:242, in Model.run(self, num_iter, progress_bar)
    238         logger.warning(
    239             f"Iteration #{self.iter} restarting with a new seed: {new_seed}."
    240         )
    241     except RuntimeError as err:
--> 242         assert err.args[0].startswith("CUDA out of memory")
    243         raise CudaOutOfMemoryError()
    244 else:

AssertionError:

ordabayevy commented 1 year ago

Dan, are you running Tapqir on the lab computer (with GPU) or your own laptop? Do you have CUDA installed?

ordabayevy commented 1 year ago

It all works fine on the lab computer for me. So I suspect it is something that might have to do with the installation if it is another computer.

zhoudan-brandeis commented 1 year ago

I am running on the lab computer, under my @brandeis USERS account. I was able to extract AOIs just fine.

ordabayevy commented 1 year ago

Dan, can you please try out the new version of Tapqir v1.1.14? Let me know if you still encounter this issue.

zhoudan-brandeis commented 1 year ago

Hi Yerdos, looks like 1.1.14 is working so far!

gelles-brandeis / tapqir

Error trying to fit data to Cosmos-HMM model #396

Fitting the data ... 0% 0/100000 [00:00<?, ?it/s]