Open xuanxie2000 opened 4 months ago
我在使用recognition中的训练时遇到相同问题,与数据相关,请检查你的数据或者配置文件中的路径是否正确,可以参考以下方法尝试解决:
请检查你的 `train.yaml` 中 `DATA_ROOT` 和 `name` 的配置,tfrecord 文件的最终读取路径为 `DATA_ROOT/name`。
可以在 `/TFace/recognition/torchkit/data/parser.py` 的第 54 行(line 54)找到 `class TFRecordSampleParser(object)` 的定义,在其中添加打印语句输出 `record_path`,检查它是否为你的 TFRecord 文件所在位置:
class TFRecordSampleParser(object):
""" Class for TFRecord Sample parser
"""
def __init__(self, transform) -> None:
self.transform = transform
self.file_readers = dict()
def __call__(self, record_path, offset, label):
print(f"record_path check:{record_path}")
rr = self.file_readers.get(record_path, None)
if rr is None:
rr = db.RecordReader(record_path)
self.file_readers[record_path] = rr
另,提供一个文件路径检查和错误处理的方式供参考:
import os
class TFRecordSampleParser(object):
    """Callable that decodes one image sample from a TFRecord file.

    Opened ``db.RecordReader`` objects are cached per path in
    ``file_readers`` so each file is opened at most once per instance.
    """

    def __init__(self, transform) -> None:
        """Store the transform and start with an empty reader cache.

        Args:
            transform: Callable applied to the decoded RGB image, or ``None``
                to return the decoded image unchanged.
        """
        self.transform = transform
        # Maps record_path -> already opened db.RecordReader.
        self.file_readers = dict()

    def _open_reader(self, record_path):
        """Open, cache and return the RecordReader for ``record_path``."""
        # Checked only when a file is opened for the first time; doing it on
        # every sample would add a filesystem call to each read.
        if not os.path.exists(record_path):
            raise FileNotFoundError(f"TFRecord file not found at path: {record_path}")
        try:
            reader = db.RecordReader(record_path)
        except Exception as e:
            # Chain the original exception so its traceback is preserved.
            raise RuntimeError(
                f"Error initializing RecordReader for path {record_path}: {e}"
            ) from e
        self.file_readers[record_path] = reader
        return reader

    def __call__(self, record_path, offset, label):
        """Read the record at ``offset`` and return ``(image, label)``.

        Args:
            record_path: Path of the TFRecord file to read from.
            offset: Position passed to ``RecordReader.read_record``.
            label: Returned unchanged next to the decoded image.

        Returns:
            Tuple ``(image, label)``; ``image`` is the decoded picture
            converted from BGR to RGB, after ``transform`` if one is set.

        Raises:
            FileNotFoundError: ``record_path`` does not exist when the file
                is opened for the first time.
            RuntimeError: the ``RecordReader`` could not be created.
            ValueError: the stored image bytes could not be decoded.
        """
        rr = self.file_readers.get(record_path)
        if rr is None:
            rr = self._open_reader(record_path)

        pb_data = rr.read_record(offset)
        example = example_pb2.Example()
        example.ParseFromString(pb_data)
        image_raw = example.features.feature['image'].bytes_list.value[0]

        image = cv2.imdecode(np.frombuffer(image_raw, np.uint8), cv2.IMREAD_COLOR)
        if image is None:
            # imdecode signals undecodable data by returning None; fail here
            # with the sample location instead of an opaque cvtColor error.
            raise ValueError(
                f"Failed to decode image at offset {offset} in {record_path}"
            )
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.transform is not None:
            image = self.transform(image)
        return image, label
在跑minusface模型的时候报错,有没有大佬知道为什么?
完整报错:
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -6) local_rank: 0 (pid: 867) of binary: /root/miniconda3/envs/MinusFace/bin/python3
Traceback (most recent call last):
File "/root/miniconda3/envs/MinusFace/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/root/miniconda3/envs/MinusFace/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/root/miniconda3/envs/MinusFace/lib/python3.8/site-packages/torch/distributed/launch.py", line 196, in <module>
main()
File "/root/miniconda3/envs/MinusFace/lib/python3.8/site-packages/torch/distributed/launch.py", line 192, in main
launch(args)
File "/root/miniconda3/envs/MinusFace/lib/python3.8/site-packages/torch/distributed/launch.py", line 177, in launch
run(args)
File "/root/miniconda3/envs/MinusFace/lib/python3.8/site-packages/torch/distributed/run.py", line 785, in run
elastic_launch(
File "/root/miniconda3/envs/MinusFace/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/root/miniconda3/envs/MinusFace/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
train.py FAILED
Failures: