huawei-noah / vega

AutoML tools chain
http://www.noahlab.com.hk/opensource/vega/
Other
841 stars 177 forks source link

请问在 ModelArts 上怎么配置,里边的预训练模型在哪儿下载? #276

Open marvellee1 opened 1 year ago

marvellee1 commented 1 year ago

[截图附件:原图未能保留,仅剩文件名残留]

zhangjiajin commented 1 year ago

请给出更详细的日志。

marvellee1 commented 1 year ago

(MindSpore) [ma-user examples]$vega ./nas/sp_nas/spnas_md.yml -b m -d NPU 2022-12-08 19:13:41.377 INFO ------------------------------------------------ 2022-12-08 19:13:41.377 INFO task id: 1208.191334.031 2022-12-08 19:13:41.377 INFO ------------------------------------------------ 2022-12-08 19:13:41.379 INFO configure: { "general": { "backend": "m", "device_category": "NPU", "cluster": {} }, "pipeline": [ "serial" ], "serial": { "pipe_step": { "type": "SearchPipeStep" }, "search_algorithm": { "type": "SpNasS", "max_sample": 1, "objective_keys": "AP50" }, "search_space": { "type": "SearchSpace", "hyperparameters": [ { "key": "network.backbone.code", "type": "CATEGORY", "range": [ "111-2111-211111-211" ] } ] }, "model": { "model_desc": { "type": "Faster_Rcnn_MD" } }, "trainer": { "type": "SpNasTrainerCallback", "epochs": 6, "get_train_metric_after_epoch": false, "model_statistics": false, "is_detection_trainer": true, "perfs_cmp_key": "AP50", "optimizer": { "type": "SGD", "params": { "lr": 0.03, "momentum": 0.9, "weight_decay": 0.0001 } }, "lr_scheduler": { "type": "WarmupScheduler", "by_epoch": false, "params": { "warmup_type": "linear", "warmup_iters": 2000, "warmup_ratio": 0.001, "after_scheduler_config": { "type": "MultiStepLR", "by_epoch": true, "params": { "milestones": [ 10, 20 ], "gamma": 0.1 } } } }, "loss": { "type": "SumLoss" }, "metric": { "type": "coco", "params": { "anno_path": "/home/ma-user/work/cache/datasets/COCO2017/annotations/instances_val2017.json" } } }, "dataset": { "type": "CocoDataset", "common": { "batch_size": 16, "num_parallel_workers": 1, "flip_ratio": 0.5, "expand_ratio": 1.0, "img_width": 700, "img_height": 512, "keep_ratio": true, "device_id": 0, "device_num": 1, "rank_id": 0, "python_multiprocessing": true, "coco_root": "/home/ma-user/work/cache/datasets/COCO2017", "mindrecord_dir": "/home/ma-user/work/cache/MindRecord_COCO_TRAIN", "instanceset": "annotations/instances{}.json", "coco_classes": [ "target", "fault" ], "num_classes": 
3 }, "train": { "train_data_type": "train2017" }, "val": { "val_data_type": "val2017", "test_batch_size": 64 } } }, "parallel": { "pipe_step": { "type": "SearchPipeStep", "models_folder": "{local_base_path}/output/serial/", "pretrained_folder": "{local_base_path}/output/serial/" }, "search_algorithm": { "type": "SpNasP" }, "search_space": { "type": "SearchSpace", "hyperparameters": [ { "key": "network.neck.code", "type": "CATEGORY", "range": [ [ 0, 1, 2, 3 ] ] } ] }, "model": { "model_desc": { "type": "Faster_Rcnn_MD" } }, "trainer": { "ref": "serial.trainer", "type": "SpNasTrainerCallback", "epochs": 6, "get_train_metric_after_epoch": false, "model_statistics": false, "is_detection_trainer": true, "perfs_cmp_key": "AP50", "optimizer": { "type": "SGD", "params": { "lr": 0.03, "momentum": 0.9, "weight_decay": 0.0001 } }, "lr_scheduler": { "type": "WarmupScheduler", "by_epoch": false, "params": { "warmup_type": "linear", "warmup_iters": 2000, "warmup_ratio": 0.001, "after_scheduler_config": { "type": "MultiStepLR", "by_epoch": true, "params": { "milestones": [ 10, 20 ], "gamma": 0.1 } } } }, "loss": { "type": "SumLoss" }, "metric": { "type": "coco", "params": { "anno_path": "/home/ma-user/work/cache/datasets/COCO2017/annotations/instances_val2017.json" } } }, "dataset": { "ref": "serial.dataset", "type": "CocoDataset", "common": { "batch_size": 16, "num_parallel_workers": 1, "flip_ratio": 0.5, "expand_ratio": 1.0, "img_width": 700, "img_height": 512, "keep_ratio": true, "device_id": 0, "device_num": 1, "rank_id": 0, "python_multiprocessing": true, "coco_root": "/home/ma-user/work/cache/datasets/COCO2017", "mindrecord_dir": "/home/ma-user/work/cache/MindRecord_COCO_TRAIN", "instanceset": "annotations/instances{}.json", "coco_classes": [ "target", "fault" ], "num_classes": 3 }, "train": { "train_data_type": "train2017" }, "val": { "val_data_type": "val2017", "test_batch_size": 64 } } }, "fullytrain": { "pipe_step": { "type": "TrainPipeStep", "models_folder": 
"{local_base_path}/output/parallel/", "pretrained_folder": "{local_base_path}/output/parallel/" }, "trainer": { "ref": "serial.trainer", "epochs": 24, "type": "SpNasTrainerCallback", "get_train_metric_after_epoch": false, "model_statistics": false, "is_detection_trainer": true, "perfs_cmp_key": "AP50", "optimizer": { "type": "SGD", "params": { "lr": 0.03, "momentum": 0.9, "weight_decay": 0.0001 } }, "lr_scheduler": { "type": "WarmupScheduler", "by_epoch": false, "params": { "warmup_type": "linear", "warmup_iters": 2000, "warmup_ratio": 0.001, "after_scheduler_config": { "type": "MultiStepLR", "by_epoch": true, "params": { "milestones": [ 10, 20 ], "gamma": 0.1 } } } }, "loss": { "type": "SumLoss" }, "metric": { "type": "coco", "params": { "anno_path": "/home/ma-user/work/cache/datasets/COCO2017/annotations/instances_val2017.json" } } }, "dataset": { "ref": "serial.dataset", "type": "CocoDataset", "common": { "batch_size": 16, "num_parallel_workers": 1, "flip_ratio": 0.5, "expand_ratio": 1.0, "img_width": 700, "img_height": 512, "keep_ratio": true, "device_id": 0, "device_num": 1, "rank_id": 0, "python_multiprocessing": true, "coco_root": "/home/ma-user/work/cache/datasets/COCO2017", "mindrecord_dir": "/home/ma-user/work/cache/MindRecord_COCO_TRAIN", "instanceset": "annotations/instances{}.json", "coco_classes": [ "target", "fault" ], "num_classes": 3 }, "train": { "train_data_type": "train2017" }, "val": { "val_data_type": "val2017", "test_batch_size": 64 } } }, "abs_path": true } 2022-12-08 19:13:41.379 INFO ------------------------------------------------ {} 2022-12-08 19:13:42.534 INFO ------------------------------------------------ 2022-12-08 19:13:42.534 INFO Step: serial 2022-12-08 19:13:42.534 INFO ------------------------------------------------ 2022-12-08 19:13:42.547 INFO Serial-level Sample1: expend -> swap -> expend. Success. 
2022-12-08 19:13:42.548 INFO desc:{'network.backbone.code': '1111-211-12111111112-11'} 2022-12-08 19:13:42.575 INFO submit trainer, id=1 2022-12-08 19:13:42.577 INFO Run train/val in mode: 0. 2022-12-08 19:13:42.578 INFO minspore context, mode: 0, target: Ascend, device_id: 0 2022-12-08 19:13:42.578 INFO DEVICE_ID: 0 2022-12-08 19:13:42.578 INFO Dataset_sink_mode:True. [WARNING] ME(119171:281472820496960,MainProcess):2022-12-08-19:13:43.413.059 [mindspore/common/tensor.py:1637] WARN_DEPRECATED: The usage of to_tensor is deprecated. Please use init_data [WARNING] ME(119171:281472820496960,MainProcess):2022-12-08-19:13:43.416.528 [mindspore/common/tensor.py:1637] WARN_DEPRECATED: The usage of to_tensor is deprecated. Please use init_data [WARNING] ME(119171:281472820496960,MainProcess):2022-12-08-19:13:43.432.797 [mindspore/common/tensor.py:1637] WARN_DEPRECATED: The usage of to_tensor is deprecated. Please use init_data [WARNING] ME(119171:281472820496960,MainProcess):2022-12-08-19:13:43.436.952 [mindspore/common/tensor.py:1637] WARN_DEPRECATED: The usage of to_tensor is deprecated. Please use init_data [WARNING] ME(119171:281472820496960,MainProcess):2022-12-08-19:13:43.453.311 [mindspore/common/tensor.py:1637] WARN_DEPRECATED: The usage of to_tensor is deprecated. Please use init_data [WARNING] ME(119171:281472820496960,MainProcess):2022-12-08-19:13:43.457.830 [mindspore/common/tensor.py:1637] WARN_DEPRECATED: The usage of to_tensor is deprecated. Please use init_data [WARNING] ME(119171:281472820496960,MainProcess):2022-12-08-19:13:43.472.690 [mindspore/common/tensor.py:1637] WARN_DEPRECATED: The usage of to_tensor is deprecated. Please use init_data [WARNING] ME(119171:281472820496960,MainProcess):2022-12-08-19:13:43.478.160 [mindspore/common/tensor.py:1637] WARN_DEPRECATED: The usage of to_tensor is deprecated. 
Please use init_data [WARNING] ME(119171:281472820496960,MainProcess):2022-12-08-19:13:43.498.580 [mindspore/common/tensor.py:1637] WARN_DEPRECATED: The usage of to_tensor is deprecated. Please use init_data [WARNING] ME(119171:281472820496960,MainProcess):2022-12-08-19:13:43.511.148 [mindspore/common/tensor.py:1637] WARN_DEPRECATED: The usage of to_tensor is deprecated. Please use init_data [WARNING] ME(119171:281472820496960,MainProcess):2022-12-08-19:13:43.511.647 [mindspore/common/tensor.py:1637] WARN_DEPRECATED: The usage of to_tensor is deprecated. Please use init_data [WARNING] ME(119171:281472820496960,MainProcess):2022-12-08-19:13:43.513.333 [mindspore/common/tensor.py:1637] WARN_DEPRECATED: The usage of to_tensor is deprecated. Please use init_data [WARNING] ME(119171:281472820496960,MainProcess):2022-12-08-19:13:43.513.772 [mindspore/common/tensor.py:1637] WARN_DEPRECATED: The usage of to_tensor is deprecated. Please use init_data [WARNING] ME(119171:281472820496960,MainProcess):2022-12-08-19:13:43.515.253 [mindspore/common/tensor.py:1637] WARN_DEPRECATED: The usage of to_tensor is deprecated. Please use init_data [WARNING] ME(119171:281472820496960,MainProcess):2022-12-08-19:13:43.942.775 [mindspore/common/tensor.py:1637] WARN_DEPRECATED: The usage of to_tensor is deprecated. Please use init_data [WARNING] ME(119171:281472820496960,MainProcess):2022-12-08-19:13:44.232.465 [mindspore/common/tensor.py:1637] WARN_DEPRECATED: The usage of to_tensor is deprecated. Please use init_data [WARNING] ME(119171:281472820496960,MainProcess):2022-12-08-19:13:44.301.238 [mindspore/common/tensor.py:1637] WARN_DEPRECATED: The usage of to_tensor is deprecated. Please use init_data [WARNING] ME(119171:281472820496960,MainProcess):2022-12-08-19:13:44.304.187 [mindspore/common/tensor.py:1637] WARN_DEPRECATED: The usage of to_tensor is deprecated. Please use init_data 2022-12-08 19:13:44.327 INFO Model was created. CHECKING MINDRECORD FILES ... 
[WARNING] PIPELINE(119171,ffff7f7aca40,python3.7):2022-12-08-19:13:46.765.392 [mindspore/ccsrc/pipeline/jit/pipeline.cc:173] CheckArgValid] The data types of Tensor:[[ True False False ... False False False] [ True False False ... False False False] [ True False False ... False False False] ... [ True False False ... False False False] [ True False False ... False False False] [ True False False ... False False False]] is bool, which may cause SelectKernelInfo failure for operator [AddN]. For more details, please refer to the FAQ at https://www.mindspore.cn. [ERROR] ANALYZER(119171,ffff7f7aca40,python3.7):2022-12-08-19:14:10.941.108 [mindspore/ccsrc/pipeline/jit/static_analysis/async_eval_result.cc:66] HandleException] Exception happened, check the information as below.

The function call stack (See file '/home/ma-user/work/vega-master/examples/rank_0/om/analyze_fail.dat' for more details):

0 In file /home/ma-user/.local/lib/python3.7/site-packages/vega/algorithms/nas/sp_nas/src/network_define.py(155)

    if self.reduce_flag:

1 In file /home/ma-user/.local/lib/python3.7/site-packages/vega/algorithms/nas/sp_nas/src/network_define.py(154)

    grads = self.grad(self.network, weights)(x, img_shape, gt_bboxe, gt_label, gt_num, self.sens)
            ^

2 In file /home/ma-user/.local/lib/python3.7/site-packages/vega/algorithms/nas/sp_nas/src/network_define.py(107)

    loss1, loss2, loss3, loss4, loss5, loss6 = self._backbone(x, img_shape, gt_bboxe, gt_label, gt_num)
                                               ^

3 In file /home/ma-user/.local/lib/python3.7/site-packages/vega/networks/mindspore/faster_rcnn/faster_rcnn_resnet.py(267)

    if self.training:

4 In file /home/ma-user/.local/lib/python3.7/site-packages/vega/networks/mindspore/faster_rcnn/faster_rcnn_resnet.py(330)

                                   self.cast(x[0], mstype.float32),
                                             ^

5 In file /home/ma-user/.local/lib/python3.7/site-packages/vega/networks/mindspore/faster_rcnn/fpn_neck.py(102)

    for i in range(self.fpn_layer):

6 In file /home/ma-user/.local/lib/python3.7/site-packages/vega/networks/mindspore/faster_rcnn/fpn_neck.py(111)

    for i in range(self.fpn_layer - 1, -1, -1):
    ^

7 In file /home/ma-user/.local/lib/python3.7/site-packages/vega/networks/mindspore/faster_rcnn/fpn_neck.py(106)

    y = y + (x[2] + self.interpolate1(y[self.fpn_layer - 4]),)
             ^

2022-12-08 19:14:11.8 ERROR Failed to run worker, id: 1, message: For 'Add', x.shape and y.shape are supposed to broadcast, where broadcast means that x.shape[i] = 1 or -1 or y.shape[i] = 1 or -1 or x.shape[i] = y.shape[i], but now x.shape and y.shape can not broadcast, got i: -2, x.shape: [16, 256, 32, 44], y.shape: [16, 256, 48, 80]. 2022-12-08 19:14:11.59 INFO Update Success. step_name=serial, worker_id=1 2022-12-08 19:14:11.59 INFO waiting for the workers [1] to finish 2022-12-08 19:14:11.60 INFO Best values: [] 2022-12-08 19:14:11.62 WARNING Failed to dump pareto front records, report is emplty. 2022-12-08 19:14:13.74 INFO ------------------------------------------------ 2022-12-08 19:14:13.74 INFO Pipeline end. 2022-12-08 19:14:13.74 INFO 2022-12-08 19:14:13.75 INFO task id: 1208.191334.031 2022-12-08 19:14:13.75 INFO output folder: /home/ma-user/work/vega-master/examples/tasks/1208.191334.031/output 2022-12-08 19:14:13.75 INFO 2022-12-08 19:14:13.75 INFO running time: 2022-12-08 19:14:13.76 INFO serial: 0:00:28 [2022-12-08 19:13:42.540940 - 2022-12-08 19:14:11.072154] 2022-12-08 19:14:13.76 INFO 2022-12-08 19:14:13.80 INFO result file output.csv is not existed or empty 2022-12-08 19:14:13.80 INFO ------------------------------------------------ 2022-12-08 19:14:16.178 INFO Shutdown urgently.

Exception ignored in: <function _PythonMultiprocessing.__del__ at 0xffff1a6b6710> Traceback (most recent call last): File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.7/site-packages/mindspore/dataset/engine/datasets.py", line 3165, in __del__ File "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.7/site-packages/mindspore/dataset/engine/datasets.py", line 2841, in terminate TypeError: 'NoneType' object is not callable

zhangjiajin commented 1 year ago

mindspore的版本号是多少?

marvellee1 commented 1 year ago

MindSpore 版本是 1.7.0。[截图附件:原图未能保留] 预训练模型有提供下载的地址么?

zhangjiajin commented 1 year ago

这个预训练模型在这: https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth