mlcommons / training

Reference implementations of MLPerf™ training benchmarks
https://mlcommons.org/en/groups/training
Apache License 2.0
1.6k stars 553 forks source link

Object_detection error "cannot import name '_C' from 'maskrcnn_benchmark'" #623

Closed mahmoodn closed 1 year ago

mahmoodn commented 1 year ago

Hi, when I try to run the object_detection script, I get the following error:

root@bb0af6242a75:/workspace/object_detection# cat run_and_time.sh 
#!/bin/bash

# Runs benchmark and reports time to convergence

pushd pytorch

# Single GPU training
time python tools/train_mlperf.py --config-file "configs/e2e_mask_rcnn_R_50_FPN_1x.yaml" \
       SOLVER.IMS_PER_BATCH 2 TEST.IMS_PER_BATCH 1 SOLVER.MAX_ITER 720000 SOLVER.STEPS "(480000, 640000)" SOLVER.BASE_LR 0.0025

popd
root@bb0af6242a75:/workspace/object_detection# ./run_and_time.sh 
/workspace/object_detection/pytorch /workspace/object_detection
Traceback (most recent call last):
  File "tools/train_mlperf.py", line 36, in <module>
    from maskrcnn_benchmark.engine.inference import inference
  File "/workspace/object_detection/pytorch/maskrcnn_benchmark/engine/inference.py", line 23, in <module>
    from maskrcnn_benchmark.data.datasets.evaluation import evaluate
  File "/workspace/object_detection/pytorch/maskrcnn_benchmark/data/datasets/evaluation/__init__.py", line 16, in <module>
    from .coco import coco_evaluation
  File "/workspace/object_detection/pytorch/maskrcnn_benchmark/data/datasets/evaluation/coco/__init__.py", line 14, in <module>
    from .coco_eval import do_coco_evaluation
  File "/workspace/object_detection/pytorch/maskrcnn_benchmark/data/datasets/evaluation/coco/coco_eval.py", line 23, in <module>
    from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou
  File "/workspace/object_detection/pytorch/maskrcnn_benchmark/structures/boxlist_ops.py", line 19, in <module>
    from maskrcnn_benchmark.layers import nms as _box_nms
  File "/workspace/object_detection/pytorch/maskrcnn_benchmark/layers/__init__.py", line 21, in <module>
    from .nms import nms
  File "/workspace/object_detection/pytorch/maskrcnn_benchmark/layers/nms.py", line 16, in <module>
    from maskrcnn_benchmark import _C
ImportError: cannot import name '_C' from 'maskrcnn_benchmark' (/workspace/object_detection/pytorch/maskrcnn_benchmark/__init__.py)

Any idea what causes this? The dataset download completed without problems, as shown below:

mnaderan@rtx3080:object_detection$ source download_dataset.sh
~/training/object_detection/pytorch/datasets/coco ~/training/object_detection
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 73.4M  100 73.4M    0     0  8980k      0  0:00:08  0:00:08 --:--:-- 11.0M
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.creationtime'
tar: Ignoring unknown extended header keyword 'SCHILY.dev'
tar: Ignoring unknown extended header keyword 'SCHILY.ino'
tar: Ignoring unknown extended header keyword 'SCHILY.nlink'
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.creationtime'
tar: Ignoring unknown extended header keyword 'SCHILY.dev'
tar: Ignoring unknown extended header keyword 'SCHILY.ino'
tar: Ignoring unknown extended header keyword 'SCHILY.nlink'
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.creationtime'
tar: Ignoring unknown extended header keyword 'SCHILY.dev'
tar: Ignoring unknown extended header keyword 'SCHILY.ino'
tar: Ignoring unknown extended header keyword 'SCHILY.nlink'
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.creationtime'
tar: Ignoring unknown extended header keyword 'SCHILY.dev'
tar: Ignoring unknown extended header keyword 'SCHILY.ino'
tar: Ignoring unknown extended header keyword 'SCHILY.nlink'
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.creationtime'
tar: Ignoring unknown extended header keyword 'SCHILY.dev'
tar: Ignoring unknown extended header keyword 'SCHILY.ino'
tar: Ignoring unknown extended header keyword 'SCHILY.nlink'
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 18.0G  100 18.0G    0     0  9273k      0  0:33:56  0:33:56 --:--:-- 10.3M
Archive:  train2017.zip
   creating: train2017/
 extracting: train2017/000000147328.jpg  
...
extracting: val2017/000000500826.jpg  
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  241M  100  241M    0     0  8135k      0  0:00:30  0:00:30 --:--:-- 10.6M
Archive:  annotations_trainval2017.zip
  inflating: annotations/instances_train2017.json  
  inflating: annotations/instances_val2017.json  
  inflating: annotations/captions_train2017.json  
  inflating: annotations/captions_val2017.json  
  inflating: annotations/person_keypoints_train2017.json  
  inflating: annotations/person_keypoints_val2017.json  
~/training/object_detection

$ nvidia-docker run -v /home/mahmood/training:/workspace -t -i --rm --ipc=host mlperf/object_detection /bin/bash
root@bb0af6242a75:/workspace/object_detection# ls
Dockerfile  README.md  download_dataset.sh  object_detection  pytorch  run_and_time.sh
mahmoodn commented 1 year ago

I was able to fix this by re-running the setup script inside the Docker container, which rebuilds the package (including the compiled `_C` extension that the import was missing):

root@e4601b48a00d:/workspace/object_detection# cd pytorch/
root@e4601b48a00d:/workspace/object_detection/pytorch# python setup.py clean build develop --user
running clean
running build
running build_py
creating build
...
Creating /root/.local/lib/python3.7/site-packages/maskrcnn-benchmark.egg-link (link to .)
maskrcnn-benchmark 0.1 is already the active version in easy-install.pth

Installed /workspace/object_detection/pytorch
Processing dependencies for maskrcnn-benchmark==0.1
Finished processing dependencies for maskrcnn-benchmark==0.1

root@e4601b48a00d:/workspace/object_detection/pytorch# cd ..
root@e4601b48a00d:/workspace/object_detection# ./run_and_time.sh 
/workspace/object_detection/pytorch /workspace/object_detection
:::MLLOG {"namespace": "", "time_ms": 1676295832931, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "tools/train_mlperf.py", "lineno": 216}}
:::MLLOG {"namespace": "", "time_ms": 1676295832967, "event_type": "POINT_IN_TIME", "key": "seed", "value": 2952485400, "metadata": {"file": "tools/train_mlperf.py", "lineno": 263}}
2023-02-13 13:43:52,973 maskrcnn_benchmark INFO: Using 1 GPUs
2023-02-13 13:43:52,973 maskrcnn_benchmark INFO: Namespace(config_file='configs/e2e_mask_rcnn_R_50_FPN_1x.yaml', distributed=False, local_rank=0, opts=['SOLVER.IMS_PER_BATCH', '2', 'TEST.IMS_PER_BATCH', '1', 'SOLVER.MAX_ITER', '720000', 'SOLVER.STEPS', '(480000, 640000)', 'SOLVER.BASE_LR', '0.0025'], seed=2952485400)
2023-02-13 13:43:52,973 maskrcnn_benchmark INFO: Worker 0: Setting seed 1907235436
2023-02-13 13:43:52,973 maskrcnn_benchmark INFO: Collecting env info (might take some time)
2023-02-13 13:43:53,887 maskrcnn_benchmark INFO: 
PyTorch version: 1.10.0
Is debug build: False
CUDA used to build PyTorch: 11.3
ROCM used to build PyTorch: N/A

OS: Ubuntu 18.04.5 LTS (x86_64)
GCC version: (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0
Clang version: Could not collect
CMake version: Could not collect
Libc version: glibc-2.17
...