Traceback (most recent call last):
2024-07-25T06:30:46.3968189Z File "/mnt/cache/share/deeplinkci/github/DeepLink-org/deeplink.framework/6045/Build-Ascend-910b/dipu/mmlab_pack/mmpretrain/tools/train.py", line 166, in <module>
2024-07-25T06:30:46.3969123Z main()
2024-07-25T06:30:46.3970030Z File "/mnt/cache/share/deeplinkci/github/DeepLink-org/deeplink.framework/6045/Build-Ascend-910b/dipu/mmlab_pack/mmpretrain/tools/train.py", line 162, in main
2024-07-25T06:30:46.3970936Z runner.train()
2024-07-25T06:30:46.3971950Z File "/mnt/cache/share/deeplinkci/github/DeepLink-org/deeplink.framework/6045/Build-Ascend-910b/dipu/mmlab_pack/mmengine/mmengine/runner/runner.py", line 1721, in train
2024-07-25T06:30:46.3973000Z model = self.train_loop.run() # type: ignore
2024-07-25T06:30:46.3974102Z File "/mnt/cache/share/deeplinkci/github/DeepLink-org/deeplink.framework/6045/Build-Ascend-910b/dipu/mmlab_pack/mmengine/mmengine/runner/loops.py", line 96, in run
2024-07-25T06:30:46.3975398Z self.run_epoch()
2024-07-25T06:30:46.3976429Z File "/mnt/cache/share/deeplinkci/github/DeepLink-org/deeplink.framework/6045/Build-Ascend-910b/dipu/mmlab_pack/mmengine/mmengine/runner/loops.py", line 111, in run_epoch
2024-07-25T06:30:46.3977464Z for idx, data_batch in enumerate(self.dataloader):
2024-07-25T06:30:46.3978179Z File "/mnt/cache/share/platform/cienv/pytorch/torch/utils/data/dataloader.py", line 437, in __iter__
2024-07-25T06:30:46.3978878Z self._iterator = self._get_iterator()
2024-07-25T06:30:46.3979590Z File "/mnt/cache/share/platform/cienv/pytorch/torch/utils/data/dataloader.py", line 388, in _get_iterator
2024-07-25T06:30:46.3980325Z return _MultiProcessingDataLoaderIter(self)
2024-07-25T06:30:46.3981029Z File "/mnt/cache/share/platform/cienv/pytorch/torch/utils/data/dataloader.py", line 1055, in __init__
2024-07-25T06:30:46.3981842Z current_device = torch.cuda.current_device() # choose cuda for default
2024-07-25T06:30:46.3982634Z File "/mnt/cache/share/platform/cienv/pytorch/torch/cuda/__init__.py", line 674, in current_device
2024-07-25T06:30:46.3983247Z _lazy_init()
2024-07-25T06:30:46.3983762Z File "/mnt/cache/share/platform/cienv/pytorch/torch/cuda/__init__.py", line 239, in _lazy_init
2024-07-25T06:30:46.3984460Z raise AssertionError("Torch not compiled with CUDA enabled")
Traceback (most recent call last):
2024-07-25T06:30:46.9071670Z File "/mnt/cache/share/deeplinkci/github/DeepLink-org/deeplink.framework/6045/Build-Ascend-910b/dipu/mmlab_pack/mmpretrain/tools/train.py", line 166, in <module>
2024-07-25T06:30:46.9075870Z main()
2024-07-25T06:30:46.9079095Z File "/mnt/cache/share/deeplinkci/github/DeepLink-org/deeplink.framework/6045/Build-Ascend-910b/dipu/mmlab_pack/mmpretrain/tools/train.py", line 162, in main
2024-07-25T06:30:46.9082117Z runner.train()
2024-07-25T06:30:46.9085814Z File "/mnt/cache/share/deeplinkci/github/DeepLink-org/deeplink.framework/6045/Build-Ascend-910b/dipu/mmlab_pack/mmengine/mmengine/runner/runner.py", line 1721, in train
2024-07-25T06:30:46.9089244Z model = self.train_loop.run() # type: ignore
2024-07-25T06:30:46.9093014Z File "/mnt/cache/share/deeplinkci/github/DeepLink-org/deeplink.framework/6045/Build-Ascend-910b/dipu/mmlab_pack/mmengine/mmengine/runner/loops.py", line 96, in run
2024-07-25T06:30:46.9096128Z self.run_epoch()
2024-07-25T06:30:46.9099569Z File "/mnt/cache/share/deeplinkci/github/DeepLink-org/deeplink.framework/6045/Build-Ascend-910b/dipu/mmlab_pack/mmengine/mmengine/runner/loops.py", line 111, in run_epoch
2024-07-25T06:30:46.9101040Z for idx, data_batch in enumerate(self.dataloader):
2024-07-25T06:30:46.9101557Z File "/mnt/cache/share/platform/cienv/pytorch/torch/utils/data/dataloader.py", line 634, in __next__
2024-07-25T06:30:46.9102014Z data = self._next_data()
2024-07-25T06:30:46.9102461Z File "/mnt/cache/share/platform/cienv/pytorch/torch/utils/data/dataloader.py", line 680, in _next_data
2024-07-25T06:30:46.9103016Z data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)
2024-07-25T06:30:46.9103576Z File "/mnt/cache/share/platform/cienv/pytorch/torch/utils/data/_utils/pin_memory.py", line 60, in pin_memory
2024-07-25T06:30:46.9104335Z return type(data)({k: pin_memory(sample, device) for k, sample in data.items()}) # type: ignore[call-arg]
2024-07-25T06:30:46.9105004Z File "/mnt/cache/share/platform/cienv/pytorch/torch/utils/data/_utils/pin_memory.py", line 60, in <dictcomp>
2024-07-25T06:30:46.9105835Z return type(data)({k: pin_memory(sample, device) for k, sample in data.items()}) # type: ignore[call-arg]
2024-07-25T06:30:46.9106489Z File "/mnt/cache/share/platform/cienv/pytorch/torch/utils/data/_utils/pin_memory.py", line 55, in pin_memory
2024-07-25T06:30:46.9106982Z return data.pin_memory(device)
2024-07-25T06:30:46.9107667Z File "/home/autolink/.local/lib/python3.9/site-packages/torch_dipu-0.1-py3.9-linux-x86_64.egg/torch_dipu/dipu/device.py", line 99, in _proxyFuncInst
2024-07-25T06:30:46.9108258Z return rawfunc(self, *args, **kwargs)
2024-07-25T06:30:46.9111819Z NotImplementedError: Could not run 'aten::_pin_memory' with arguments from the 'CUDA' backend.
# for other backends, pin_memory_device need to set. if not set
# default behaviour is CUDA device. if pin_memory_device is selected
# and pin_memory is not set, the default behaviour false.
if len(loader.pin_memory_device) == 0:
self._pin_memory = loader.pin_memory and torch.cuda.is_available()
self._pin_memory_device = None
else:
if not loader.pin_memory:
warn_msg = (
"pin memory device is set and pin_memory flag is not used then device pinned memory won't be used"
"please set pin_memory to true, if you need to use the device pin memory"
)
warnings.warn(warn_msg)
self._pin_memory = loader.pin_memory
self._pin_memory_device = loader.pin_memory_device
同时,经过测试,若将寒武纪 CI 脚本 run one iter 中的 srun 命令改为 bash,也会出现相同的报错(参见 this commit check)。
elif device == "camb":
# For the inference of large language models, simply compare the inference results on the current device directly with the results generated on the GPU
if "infer" in p2 and "infer" in p3:
cmd_run_one_iter = f"srun --job-name={job_name} --partition={partition} --gres={gpu_requests} --time=40 python {train_path}"
cmd_cp_one_iter = ""
else:
cmd_run_one_iter = f"bash SMART/tools/one_iter_tool/run_one_iter.sh {train_path} {config_path} {work_dir} {opt_arg}"
cmd_cp_one_iter = f"bash SMART/tools/one_iter_tool/compare_one_iter.sh {package_name} {atol} {rtol} {metric}"
# cmd_run_one_iter = f"srun --job-name={job_name} --partition={partition} --gres={gpu_requests} --time=40 sh SMART/tools/one_iter_tool/run_one_iter.sh {train_path} {config_path} {work_dir} {opt_arg}"
# cmd_cp_one_iter = f"srun --job-name={job_name} --partition={partition} --gres={gpu_requests} --time=30 sh SMART/tools/one_iter_tool/compare_one_iter.sh {package_name} {atol} {rtol} {metric}"
summary
该 PR 主要用于:修复 CI ascend test-one-iter 中与 dataloader 的 pin_memory 参数相关的报错。
问题描述
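CI ascend test-one-iter 会出现以下两种报错:`AssertionError: Torch not compiled with CUDA enabled`,以及 `NotImplementedError: Could not run 'aten::_pin_memory' with arguments from the 'CUDA' backend`。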
经过debug,发现这两个报错都与训练过程中 dataloader 中的pin-memory参数有关。 若在训练命令后加上--no-pin-memory参数,就不会出现上述报错。
通过查阅相关代码,我发现以上两种报错的根本原因是 one-iter CPU模式训练过程中尝试去使用并不存在的cuda device。 在one-iter CPU模式中,脚本将mock_cuda改成false,导致训练过程中的cuda device并没有落到厂商device上。
同时,在import dipu时,会有一些逻辑绕过torch dataloader原生的device check:
torch/utils/data/dataloader.py: _BaseDataLoaderIter类中部分代码如下:
如果用户没有传入 pin_memory_device,那么 torch 会默认 pin_memory_device 为 cuda,同时会检查 torch.cuda.is_available();若当前 cuda device 不可用,torch 会将用户传递的 pin_memory 设为 False(并将 pin_memory_device 置为 None)。
而torch_dipu实现的dataloader中,将pin_memory_device直接置为“cuda”,这会使得len(loader.pin_memory_device) != 0,即绕过了torch的cuda可用性检查。
因此训练过程中会尝试使用并不存在的cuda device,从而出现错误。
同时,经过测试,若将寒武纪 CI 脚本 run one iter 中的 srun 命令改为 bash,也会出现相同的报错(参见 this commit check)。
解决方法
在环境变量 dipu_mock_cuda = False 的情况下,跳过 dipu dataloader 中对 pin_memory_device 的初始化赋值,从而在 pin_memory = True 时会走到 torch dataloader 原生的 torch.cuda.is_available() 检查。