PyTorch RuntimeError:CUDA 错误:触发设备端断言(device-side assert triggered)


我在 Google Colab 上运行的一个笔记本执行失败,并出现以下错误:

RuntimeError                              Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/fastai/ in fit(epochs, model, loss_func, opt, data, callbacks, metrics)
     93         exception = e
---> 94         raise e
     95     finally: cb_handler.on_train_end(exception)

/usr/local/lib/python3.6/dist-packages/fastai/ in fit(epochs, model, loss_func, opt, data, callbacks, metrics)
     83                 xb, yb = cb_handler.on_batch_begin(xb, yb)
---> 84                 loss = loss_batch(model, xb, yb, loss_func, opt, cb_handler)
     85                 if cb_handler.on_batch_end(loss): break

/usr/local/lib/python3.6/dist-packages/fastai/ in loss_batch(model, xb, yb, loss_func, opt, cb_handler)
     24     if opt is not None:
---> 25         loss = cb_handler.on_backward_begin(loss)
     26         loss.backward()

/usr/local/lib/python3.6/dist-packages/fastai/ in on_backward_begin(self, loss)
    223         for cb in self.callbacks:
--> 224             a = cb.on_backward_begin(**self.state_dict)
    225             if a is not None: self.state_dict['last_loss'] = a

/usr/local/lib/python3.6/dist-packages/fastai/ in on_backward_begin(self, smooth_loss, **kwargs)
    266         if self.pbar is not None and hasattr(self.pbar,'child'):
--> 267             self.pbar.child.comment = f'{smooth_loss:.4f}'

/usr/local/lib/python3.6/dist-packages/torch/ in __format__(self, format_spec)
    377         if self.dim() == 0:
--> 378             return self.item().__format__(format_spec)
    379         return object.__format__(self, format_spec)

RuntimeError: CUDA error: device-side assert triggered

During handling of the above exception, another exception occurred:

RuntimeError                              Traceback (most recent call last)
<ipython-input-33-dd390b1c8108> in <module>()
----> 1 lr_find(learn)
      2 learn.recorder.plot()

/usr/local/lib/python3.6/dist-packages/fastai/ in lr_find(learn, start_lr, end_lr, num_it, stop_div, **kwargs)
     26     cb = LRFinder(learn, start_lr, end_lr, num_it, stop_div)
     27     a = int(np.ceil(num_it/len(learn.data.train_dl)))
---> 28     learn.fit(a, start_lr, callbacks=[cb], **kwargs)
     30 def to_fp16(learn:Learner, loss_scale:float=512., flat_master:bool=False)->Learner:

/usr/local/lib/python3.6/dist-packages/fastai/ in fit(self, epochs, lr, wd, callbacks)
    160         callbacks = [cb(self) for cb in self.callback_fns] + listify(callbacks)
    161         fit(epochs, self.model, self.loss_func, opt=self.opt, data=self.data, metrics=self.metrics,
--> 162             callbacks=self.callbacks+callbacks)
    164     def create_opt(self, lr:Floats, wd:Floats=0.)->None:

/usr/local/lib/python3.6/dist-packages/fastai/ in fit(epochs, model, loss_func, opt, data, callbacks, metrics)
     93         exception = e
     94         raise e
---> 95     finally: cb_handler.on_train_end(exception)
     97 loss_func_name2activ = {'cross_entropy_loss': partial(F.softmax, dim=1), 'nll_loss': torch.exp, 'poisson_nll_loss': torch.exp,

/usr/local/lib/python3.6/dist-packages/fastai/ in on_train_end(self, exception)
    254     def on_train_end(self, exception:Union[bool,Exception])->None:
    255         "Handle end of training, `exception` is an `Exception` or False if no exceptions during training."
--> 256         self('train_end', exception=exception)
    258 class AverageMetric(Callback):

/usr/local/lib/python3.6/dist-packages/fastai/ in __call__(self, cb_name, call_mets, **kwargs)
    185         "Call through to all of the `CallbakHandler` functions."
    186         if call_mets: [getattr(met, f'on_{cb_name}')(**self.state_dict, **kwargs) for met in self.metrics]
--> 187         return [getattr(cb, f'on_{cb_name}')(**self.state_dict, **kwargs) for cb in self.callbacks]
    189     def on_train_begin(self, epochs:int, pbar:PBar, metrics:MetricFuncList)->None:

/usr/local/lib/python3.6/dist-packages/fastai/ in <listcomp>(.0)
    185         "Call through to all of the `CallbakHandler` functions."
    186         if call_mets: [getattr(met, f'on_{cb_name}')(**self.state_dict, **kwargs) for met in self.metrics]
--> 187         return [getattr(cb, f'on_{cb_name}')(**self.state_dict, **kwargs) for cb in self.callbacks]
    189     def on_train_begin(self, epochs:int, pbar:PBar, metrics:MetricFuncList)->None:

/usr/local/lib/python3.6/dist-packages/fastai/callbacks/ in on_train_end(self, **kwargs)
     45         # restore the valid_dl we turned of on `__init__`
     46         self.data.valid_dl = self.valid_dl
---> 47         self.learn.load('tmp')
     48         if hasattr(self.learn.model, 'reset'): self.learn.model.reset()
     49         print('LR Finder complete, type {learner_name}.recorder.plot() to see the graph.')

/usr/local/lib/python3.6/dist-packages/fastai/ in load(self, name, device)
    202         "Load model `name` from `self.model_dir` using `device`, defaulting to `self.data.device`."
    203         if device is None: device = self.data.device
--> 204         self.model.load_state_dict(torch.load(self.path/self.model_dir/f'{name}.pth', map_location=device))
    205         return self

/usr/local/lib/python3.6/dist-packages/torch/ in load(f, map_location, pickle_module)
    356         f = open(f, 'rb')
    357     try:
--> 358         return _load(f, map_location, pickle_module)
    359     finally:
    360         if new_fd:

/usr/local/lib/python3.6/dist-packages/torch/ in _load(f, map_location, pickle_module)
    527     unpickler = pickle_module.Unpickler(f)
    528     unpickler.persistent_load = persistent_load
--> 529     result = unpickler.load()
    531     deserialized_storage_keys = pickle_module.load(f)

/usr/local/lib/python3.6/dist-packages/torch/ in persistent_load(saved_id)
    493             if root_key not in deserialized_objects:
    494                 deserialized_objects[root_key] = restore_location(
--> 495                     data_type(size), location)
    496             storage = deserialized_objects[root_key]
    497             if view_metadata is not None:

/usr/local/lib/python3.6/dist-packages/torch/ in restore_location(storage, location)
    376     elif isinstance(map_location, torch.device):
    377         def restore_location(storage, location):
--> 378             return default_restore_location(storage, str(map_location))
    379     else:
    380         def restore_location(storage, location):

/usr/local/lib/python3.6/dist-packages/torch/ in default_restore_location(storage, location)
    102 def default_restore_location(storage, location):
    103     for _, _, fn in _package_registry:
--> 104         result = fn(storage, location)
    105         if result is not None:
    106             return result

/usr/local/lib/python3.6/dist-packages/torch/ in _cuda_deserialize(obj, location)
     84                                'to an existing device.'.format(
     85                                    device, torch.cuda.device_count()))
---> 86         return obj.cuda(device)

/usr/local/lib/python3.6/dist-packages/torch/ in _cuda(self, device, non_blocking, **kwargs)
     74         else:
     75             new_type = getattr(torch.cuda, self.__class__.__name__)
---> 76             return new_type(self.size()).copy_(self, non_blocking)

RuntimeError: cuda runtime error (59) : device-side assert triggered at /pytorch/aten/src/THC/generic/THCTensorCopy.cpp:20

错误信息中没有给出真正的原因。我尝试按照相关建议,通过在一个代码单元中强制 CUDA 在单个 GPU 上运行来获取完整的堆栈跟踪(原文此处的代码单元未包含在本文中),但没有成功。



还有其他适用于 Google Colab 的方法吗?

翻译遵循 CC BY-SA 4.0 许可协议

1 个回答

确保你的目标值(标签)从 0 开始,到"类别数 - 1"为止。例如:如果你有 100 个类别,那么目标值应该在 0 到 99 之间。

翻译遵循 CC BY-SA 4.0 许可协议

