Closed mzhadigerov closed 2 years ago
I'm getting this error while running the `eval` script:
eval
python main.py --mode eval --data_type structure --config_file structure_config.json --data_root_dir data/ --model_load_path data/model/structure.pth --debug {'lr': 5e-05, 'lr_backbone': 1e-05, 'batch_size': 2, 'weight_decay': 0.0001, 'epochs': 20, 'lr_drop': 1, 'lr_gamma': 0.9, 'clip_max_norm': 0.1, 'backbone': 'resnet18', 'num_classes': 6, 'dilation': False, 'position_embedding': 'sine', 'emphasized_weights': {}, 'enc_layers': 6, 'dec_layers': 6, 'dim_feedforward': 2048, 'hidden_dim': 256, 'dropout': 0.1, 'nheads': 8, 'num_queries': 125, 'pre_norm': True, 'masks': False, 'aux_loss': False, 'mask_loss_coef': 1, 'dice_loss_coef': 1, 'ce_loss_coef': 1, 'bbox_loss_coef': 5, 'giou_loss_coef': 2, 'eos_coef': 0.4, 'set_cost_class': 1, 'set_cost_bbox': 5, 'set_cost_giou': 2, 'device': 'cuda', 'seed': 42, 'start_epoch': 0, 'num_workers': 2, 'data_root_dir': 'data/', 'config_file': 'structure_config.json', 'data_type': 'structure', 'model_load_path': 'data/model/structure.pth', 'metrics_save_filepath': '', 'table_words_dir': None, 'mode': 'eval', 'debug': True, 'checkpoint_freq': 1, '__module__': '__main__', '__dict__': <attribute '__dict__' of 'Args' objects>, '__weakref__': <attribute '__weakref__' of 'Args' objects>, '__doc__': None} ---------------------------------------------------------------------------------------------------- loading model loading model from checkpoint loading data creating index... index created! 
Traceback (most recent call last): File "main.py", line 373, in <module> main() File "main.py", line 365, in main eval_coco(model, criterion, postprocessors, data_loader_test, dataset_test, device) File "/home/ali/AI/nexus/table-transformer/src/eval.py", line 653, in eval_coco device, None) File "/home/ali/AI/nexus/table-transformer/venv/lib/python3.7/site-packages/torch/autograd/grad_mode.py", line 15, in decorate_context return func(*args, **kwargs) File "../detr/engine.py", line 97, in evaluate outputs = model(samples) File "/home/ali/AI/nexus/table-transformer/venv/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__ result = self.forward(*input, **kwargs) File "../detr/models/detr.py", line 65, in forward hs = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos[-1])[0] File "/home/ali/AI/nexus/table-transformer/venv/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__ result = self.forward(*input, **kwargs) File "../detr/models/transformer.py", line 56, in forward memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed) File "/home/ali/AI/nexus/table-transformer/venv/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__ result = self.forward(*input, **kwargs) File "../detr/models/transformer.py", line 78, in forward src_key_padding_mask=src_key_padding_mask, pos=pos) File "/home/ali/AI/nexus/table-transformer/venv/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__ result = self.forward(*input, **kwargs) File "../detr/models/transformer.py", line 183, in forward return self.forward_pre(src, src_mask, src_key_padding_mask, pos) File "../detr/models/transformer.py", line 171, in forward_pre key_padding_mask=src_key_padding_mask)[0] File "/home/ali/AI/nexus/table-transformer/venv/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__ result = self.forward(*input, **kwargs) File 
"/home/ali/AI/nexus/table-transformer/venv/lib/python3.7/site-packages/torch/nn/modules/activation.py", line 845, in forward attn_mask=attn_mask) File "/home/ali/AI/nexus/table-transformer/venv/lib/python3.7/site-packages/torch/nn/functional.py", line 3827, in multi_head_attention_forward q = linear(query, _w, _b) File "/home/ali/AI/nexus/table-transformer/venv/lib/python3.7/site-packages/torch/nn/functional.py", line 1612, in linear output = input.matmul(weight.t()) RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasSgemm( handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)`
I was able to work around this by adding the `--device cpu` flag to the command:
--device cpu
I'm getting this error while running the `eval` script: