fastai / fastai

The fastai deep learning library
http://docs.fast.ai
Apache License 2.0
26.22k stars 7.56k forks source link

Crashing after training is complete but before saving epoch? #517

Status: Closed (QwertyCoolMT closed this issue 6 years ago)

QwertyCoolMT commented 6 years ago

```
RuntimeError                              Traceback (most recent call last)
<ipython-input> in <module>()
----> 1 learn.fit(lr, 2)

~/fastai/fastai/courses/dl1/fastai/learner.py in fit(self, lrs, n_cycle, wds, **kwargs)
    285         self.sched = None
    286         layer_opt = self.get_layer_opt(lrs, wds)
--> 287         return self.fit_gen(self.model, self.data, layer_opt, n_cycle, **kwargs)
    288
    289     def warm_up(self, lr, wds=None):

~/fastai/fastai/courses/dl1/fastai/learner.py in fit_gen(self, model, data, layer_opt, n_cycle, cycle_len, cycle_mult, cycle_save_name, best_save_name, use_clr, use_clr_beta, metrics, callbacks, use_wd_sched, norm_wds, wds_sched_mult, use_swa, swa_start, swa_eval_freq, **kwargs)
    232             metrics=metrics, callbacks=callbacks, reg_fn=self.reg_fn, clip=self.clip, fp16=self.fp16,
    233             swa_model=self.swa_model if use_swa else None, swa_start=swa_start,
--> 234             swa_eval_freq=swa_eval_freq, **kwargs)
    235
    236     def get_layer_groups(self): return self.models.get_layer_groups()

~/fastai/fastai/courses/dl1/fastai/model.py in fit(model, data, n_epochs, opt, crit, metrics, callbacks, stepper, swa_model, swa_start, swa_eval_freq, **kwargs)
    157
    158         if not all_val:
--> 159             vals = validate(model_stepper, cur_data.val_dl, metrics)
    160             stop=False
    161             for cb in callbacks: stop = stop or cb.on_epoch_end(vals)

~/fastai/fastai/courses/dl1/fastai/model.py in validate(stepper, dl, metrics)
    218         if isinstance(x,list): batch_cnts.append(len(x[0]))
    219         else: batch_cnts.append(len(x))
--> 220         loss.append(to_np(l))
    221         res.append([f(preds.data, y) for f in metrics])
    222     return [np.average(loss, 0, weights=batch_cnts)] + list(np.average(np.stack(res), 0, weights=batch_cnts))

~/fastai/fastai/courses/dl1/fastai/core.py in to_np(v)
     59     if isinstance(v, Variable): v=v.data
     60     if isinstance(v, torch.cuda.HalfTensor): v=v.float()
---> 61     return v.cpu().numpy()
     62
     63 IS_TORCH_04 = LooseVersion(torch.__version__) >= LooseVersion('0.4')

~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/tensor.py in cpu(self)
     43     def cpu(self):
     44         r"""Returns a CPU copy of this tensor if it's not already on the CPU"""
---> 45         return self.type(getattr(torch, self.__class__.__name__))
     46
     47     def double(self):

~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/cuda/__init__.py in type(self, *args, **kwargs)
    394     def type(self, *args, **kwargs):
    395         with device(self.get_device()):
--> 396             return super(_CudaBase, self).type(*args, **kwargs)
    397
    398     __new__ = _lazy_new

~/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/_utils.py in _type(self, new_type, async)
     36     if new_type.is_sparse:
     37         raise RuntimeError("Cannot cast dense tensor to sparse tensor")
---> 38     return new_type(self.size()).copy_(self, async)
     39
     40

RuntimeError: cuda runtime error (59) : device-side assert triggered at /opt/conda/conda-bld/pytorch_1518244421288/work/torch/lib/THC/generic/THCTensorCopy.c:70
```

jph00 commented 6 years ago

Please use the forums