Open Harry-Bai opened 5 months ago
你好! 你这个看起来是从huggingface上下载预训练模型由于网络问题报错。我当时用的是timm==0.5.4,然后预训练权重是从github上下载的,下载的时候开着VPN就可以了。你把timm版本降级到0.5.4再运行看看。
好滴,谢谢大佬指点,我去试一下
现在又遇到了这样的报错,还得麻烦您再帮忙看一下
你好,这是由于你没有为你的自定义数据集设置对应的前景分割方案,一个简单的方法是去除前景分割:
好的,谢谢指点😄
好的,谢谢指点😄
大佬您好,我在训练realnet时遇到了以下报错,在前面训练扩散模型那些的时候都没有这个问题,麻烦您帮忙解答一下。 root@2b64273ad939:/home/bhy/RealNet-main# python3 -m torch.distributed.launch --nproc_per_node=1 train_realnet.py --dataset ub --class_name 1 /usr/local/lib/python3.8/dist-packages/torch/distributed/launch.py:180: FutureWarning: The module torch.distributed.launch is deprecated and will be removed in future. Use torchrun. Note that --use_env is set by default in torchrun. If your script expects
`--local_rank`
argument to be set, please change it to read from `os.environ['LOCAL_RANK']`
instead. See https://pytorch.org/docs/stable/distributed.html#launch-utility for further instructionswarnings.warn( [2024-06-06 20:47:10,760][train_realnet.py][line: 122][ INFO] args: Namespace(class_name='1', config='experiments/ub/realnet.yaml', dataset='ub', local_rank=0) [2024-06-06 20:47:10,762][train_realnet.py][line: 123][ INFO] config: {'checkpoints_path': 'experiments/ub/realnet_checkpoints/', 'criterion': [{'kwargs': {'weight': 1.0}, 'name': 'SegmentCrossEntropyLoss', 'type': 'SegmentCrossEntropyLoss'}, {'kwargs': {'weight': 1.0}, 'name': 'FeatureMSELoss', 'type': 'FeatureMSELoss'}], 'dataset': {'batch_size': 4, 'image_reader': {'kwargs': {'color_mode': 'RGB', 'image_dir': '/home/bhy/RealNet-main/data/UB/ub/'}, 'type': 'opencv'}, 'input_size': [256, 256], 'pixel_mean': [0.485, 0.456, 0.406], 'pixel_std': [0.229, 0.224, 0.225], 'test': {'meta_file': './data/UB/samples/test_1.json'}, 'train': {'anomaly_types': {'normal': 0.5, 'sdas': 0.5}, 'dtd_dir': 'data/DTD/images', 'dtd_transparency_range': [0.2, 1.0], 'hflip': False, 'meta_file': 'data/UB/samples/train_1.json', 'min_perlin_scale': 0, 'perlin_scale': 6, 'rotate': False, 'sdas_dir': 'data/UB/sdas/1', 'sdas_transparency_range': [0.5, 1.0], 'vflip': False}, 'type': 'mvtec', 'workers': 4}, 'evaluator': {'key_metric': 'mean', 'metrics': {'auc': [{'kwargs': {'avgpool_size': [16, 16]}, 'name': 'image'}, {'name': 'pixel'}]}}, 'exp_path': 'experiments/ub', 'log_path': 'experiments/ub/realnet_log/', 'net': [{'frozen': True, 'kwargs': {'backbone': 'wide_resnet50_2', 'outlayers': ['layer1', 'layer2', 'layer3', 'layer4']}, 'name': 'backbone', 'type': 'models.backbones.Backbone'}, {'frozen': True, 'kwargs': {'init_bsn': 64, 'structure': [{'layers': [{'idx': 'layer1', 'planes': 256}], 'name': 'block1', 'stride': 4}, {'layers': [{'idx': 'layer2', 'planes': 512}], 'name': 'block2', 'stride': 8}, {'layers': [{'idx': 'layer3', 'planes': 512}], 'name': 'block3', 'stride': 16}, {'layers': [{'idx': 'layer4', 'planes': 256}], 
'name': 'block4', 'stride': 32}]}, 'name': 'afs', 'prev': 'backbone', 'type': 'models.afs.AFS'}, {'kwargs': {'attention_mult': [2, 4], 'channel_mult': [1, 2, 4], 'hide_channels_ratio': 0.5, 'num_res_blocks': 2}, 'name': 'recon', 'prev': 'afs', 'type': 'models.recon.ReconstructionLayer'}, {'kwargs': {'mode_numbers': [256, 256], 'modes': ['max', 'mean'], 'num_residual_layers': 2, 'stop_grad': False}, 'name': 'rrs', 'prev': 'recon', 'type': 'models.rrs.RRS'}], 'random_seed': 100, 'saver': {'checkpoints_dir': 'realnet_checkpoints/', 'log_dir': 'realnet_log/', 'vis_dir': 'realnet_vis/'}, 'structure': [{'layers': [{'idx': 'layer1', 'planes': 256}], 'name': 'block1', 'stride': 4}, {'layers': [{'idx': 'layer2', 'planes': 512}], 'name': 'block2', 'stride': 8}, {'layers': [{'idx': 'layer3', 'planes': 512}], 'name': 'block3', 'stride': 16}, {'layers': [{'idx': 'layer4', 'planes': 256}], 'name': 'block4', 'stride': 32}], 'trainer': {'max_epoch': 1000, 'optimizer': {'kwargs': {'betas': [0.9, 0.999], 'lr': 0.0001}, 'type': 'Adam'}, 'print_freq_step': 20, 'val_freq_epoch': 5}, 'version': 'v1.0.0'} [2024-06-06 20:47:10,762][train_realnet.py][line: 124][ INFO] class name is : 1 Traceback (most recent call last): File "/usr/local/lib/python3.8/dist-packages/urllib3/connection.py", line 174, in _new_conn conn = connection.create_connection( File "/usr/local/lib/python3.8/dist-packages/urllib3/util/connection.py", line 95, in create_connection raise err File "/usr/local/lib/python3.8/dist-packages/urllib3/util/connection.py", line 85, in create_connection sock.connect(sa) socket.timeout: timed out
During handling of the above exception, another exception occurred:
Traceback (most recent call last): File "/usr/local/lib/python3.8/dist-packages/urllib3/connectionpool.py", line 714, in urlopen httplib_response = self._make_request( File "/usr/local/lib/python3.8/dist-packages/urllib3/connectionpool.py", line 403, in _make_request self._validate_conn(conn) File "/usr/local/lib/python3.8/dist-packages/urllib3/connectionpool.py", line 1053, in _validate_conn conn.connect() File "/usr/local/lib/python3.8/dist-packages/urllib3/connection.py", line 363, in connect self.sock = conn = self._new_conn() File "/usr/local/lib/python3.8/dist-packages/urllib3/connection.py", line 179, in _new_conn raise ConnectTimeoutError( urllib3.exceptions.ConnectTimeoutError: (<urllib3.connection.HTTPSConnection object at 0x7f35f422ef40>, 'Connection to huggingface.co timed out. (connect timeout=10)')
During handling of the above exception, another exception occurred:
Traceback (most recent call last): File "/usr/local/lib/python3.8/dist-packages/requests/adapters.py", line 489, in send resp = conn.urlopen( File "/usr/local/lib/python3.8/dist-packages/urllib3/connectionpool.py", line 798, in urlopen retries = retries.increment( File "/usr/local/lib/python3.8/dist-packages/urllib3/util/retry.py", line 592, in increment raise MaxRetryError(_pool, url, error or ResponseError(cause)) urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /timm/wide_resnet50_2.racm_in1k/resolve/main/pytorch_model.bin (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f35f422ef40>, 'Connection to huggingface.co timed out. (connect timeout=10)'))
During handling of the above exception, another exception occurred:
Traceback (most recent call last): File "/usr/local/lib/python3.8/dist-packages/huggingface_hub/file_download.py", line 1247, in hf_hub_download metadata = get_hf_file_metadata( File "/usr/local/lib/python3.8/dist-packages/huggingface_hub/utils/_validators.py", line 118, in _inner_fn return fn(args, kwargs) File "/usr/local/lib/python3.8/dist-packages/huggingface_hub/file_download.py", line 1624, in get_hf_file_metadata r = _request_wrapper( File "/usr/local/lib/python3.8/dist-packages/huggingface_hub/file_download.py", line 402, in _request_wrapper response = _request_wrapper( File "/usr/local/lib/python3.8/dist-packages/huggingface_hub/file_download.py", line 425, in _request_wrapper response = get_session().request(method=method, url=url, params) File "/usr/local/lib/python3.8/dist-packages/requests/sessions.py", line 587, in request resp = self.send(prep, send_kwargs) File "/usr/local/lib/python3.8/dist-packages/requests/sessions.py", line 701, in send r = adapter.send(request, kwargs) File "/usr/local/lib/python3.8/dist-packages/huggingface_hub/utils/_http.py", line 63, in send return super().send(request, args, **kwargs) File "/usr/local/lib/python3.8/dist-packages/requests/adapters.py", line 553, in send raise ConnectTimeout(e, request=request) requests.exceptions.ConnectTimeout: (MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /timm/wide_resnet50_2.racm_in1k/resolve/main/pytorch_model.bin (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f35f422ef40>, 'Connection to huggingface.co timed out. (connect timeout=10)'))"), '(Request ID: 9f5eba2b-3445-4f22-96b1-4c127c1d8a5a)')
The above exception was the direct cause of the following exception:
Traceback (most recent call last): File "train_realnet.py", line 328, in <module>
main()
File "train_realnet.py", line 131, in main
model = ModelHelper(config.net)
File "/home/bhy/RealNet-main/models/model_helper.py", line 29, in __init__
module = self.build(mtype, kwargs)
File "/home/bhy/RealNet-main/models/model_helper.py", line 36, in build
return cls(**kwargs)
File "/home/bhy/RealNet-main/models/backbones/backbone.py", line 37, in __init__
self.feature_extractor = timm.create_model(self.backbone, features_only=True,pretrained=True,
File "/usr/local/lib/python3.8/dist-packages/timm/models/_factory.py", line 117, in create_model
model = create_fn(
File "/usr/local/lib/python3.8/dist-packages/timm/models/resnet.py", line 1446, in wide_resnet50_2
return _create_resnet('wide_resnet50_2', pretrained, **dict(model_args, **kwargs))
File "/usr/local/lib/python3.8/dist-packages/timm/models/resnet.py", line 584, in _create_resnet
return build_model_with_cfg(ResNet, variant, pretrained, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/timm/models/_builder.py", line 397, in build_model_with_cfg
load_pretrained(
File "/usr/local/lib/python3.8/dist-packages/timm/models/_builder.py", line 190, in load_pretrained
state_dict = load_state_dict_from_hf(pretrained_loc)
File "/usr/local/lib/python3.8/dist-packages/timm/models/_hub.py", line 188, in load_state_dict_from_hf
cached_file = hf_hub_download(hf_model_id, filename=filename, revision=hf_revision)
File "/usr/local/lib/python3.8/dist-packages/huggingface_hub/utils/_validators.py", line 118, in _inner_fn
return fn(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/huggingface_hub/file_download.py", line 1377, in hf_hub_download
raise LocalEntryNotFoundError(
huggingface_hub.utils._errors.LocalEntryNotFoundError: An error happened while trying to locate the file on the Hub and we cannot find the requested files in the local cache. Please check your connection and try again or make sure your Internet connection is on.
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 2937395) of binary: /usr/bin/python3
Traceback (most recent call last):
File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/launch.py", line 195, in <module>
main()
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/launch.py", line 191, in main
launch(args)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/launch.py", line 176, in launch
run(args)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/run.py", line 753, in run
elastic_launch(
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/launcher/api.py", line 246, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
train_realnet.py FAILED
Failures: