您好,我在尝试 tutorials 时都会遇到以下问题,请问有什么解决办法吗?谢谢指导。版本是 FATE-LLM 2.1。
[INFO] [2024-05-02 03:36:43,059] [202405020336083952670] [52:140389839259392] - [base_saver.execute_update] [line:223]: UPDATE "t_task" SET "f_update_time" = 1714621003059, "f_error_report" = 'Traceback (most recent call last):
File "/data/projects/fate/fate/python/fate/components/entrypoint/cli/component/execute_cli.py", line 147, in execute_component_from_config
component.execute(ctx, role, execution_io.get_kwargs())
File "/data/projects/fate/fate/python/fate/components/core/component_desc/component.py", line 101, in execute
return self.callback(ctx, role, kwargs)
File "/data/projects/fate/fate/python/fate/components/components/homo_nn.py", line 61, in train
train_procedure(
File "/data/projects/fate/fate/python/fate/components/components/nn/component_utils.py", line 155, in train_procedure
runner.train(train_data, validatedata, output_dir, saved_model_path)
File "/data/projects/fate/fate/python/fate/components/components/nn/runner/homo_default_runner.py", line 270, in train
trainer.train()
File "/data/projects/fate/fate/python/fate_llm/fedkseed/fedkseed.py", line 123, in train
direction_derivative_history = self.train_once(
File "/data/projects/fate/fate/python/fate_llm/fedkseed/fedkseed.py", line 154, in train_once
trainer.train()
File "/data/projects/fate/env/python/venv/lib/python3.8/site-packages/transformers/trainer.py", line 1624, in train
return inner_training_loop(
File "/data/projects/fate/env/python/venv/lib/python3.8/site-packages/transformers/trainer.py", line 1961, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs)
File "/data/projects/fate/fate/python/fate_llm/fedkseed/trainer.py", line 96, in training_step
loss = self._kseed_optimizer.kseed_zeroth_order_step(closure=closure)
File "/data/projects/fate/fate/python/fate_llm/fedkseed/optimizer.py", line 228, in kseed_zeroth_order_step
directional_derivative_value, loss_right, loss_left = self.zeroth_order_step(seed, closure)
File "/data/projects/fate/fate/python/fate_llm/fedkseed/optimizer.py", line 129, in zeroth_order_step
loss_right = closure()
File "/data/projects/fate/fate/python/fate_llm/fedkseed/trainer.py", line 90, in closure
return self.compute_loss(model, inputs, return_outputs=False).detach()
File "/data/projects/fate/env/python/venv/lib/python3.8/site-packages/transformers/trainer.py", line 2925, in compute_loss
outputs = model(inputs)
File "/data/projects/fate/env/python/venv/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, *kwargs)
File "/data/projects/fate/env/python/venv/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(args, kwargs)
File "/data/projects/fate/env/python/venv/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 175, in forward
inputs, module_kwargs = self.scatter(inputs, kwargs, self.device_ids)
File "/data/projects/fate/env/python/venv/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 197, in scatter
return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
File "/data/projects/fate/env/python/venv/lib/python3.8/site-packages/torch/nn/parallel/scatter_gather.py", line 74, in scatter_kwargs
scattered_kwargs = scatter(kwargs, target_gpus, dim) if kwargs else []
File "/data/projects/fate/env/python/venv/lib/python3.8/site-packages/torch/nn/parallel/scatter_gather.py", line 60, in scatter
res = scatter_map(inputs)
File "/data/projects/fate/env/python/venv/lib/python3.8/site-packages/torch/nn/parallel/scatter_gather.py", line 51, in scatter_map
return [type(obj)(i) for i in zip(map(scatter_map, obj.items()))]
File "/data/projects/fate/env/python/venv/lib/python3.8/site-packages/torch/nn/parallel/scatter_gather.py", line 47, in scatter_map
return list(zip(map(scatter_map, obj)))
File "/data/projects/fate/env/python/venv/lib/python3.8/site-packages/torch/nn/parallel/scatter_gather.py", line 43, in scatter_map
return Scatter.apply(target_gpus, None, dim, obj)
File "/data/projects/fate/env/python/venv/lib/python3.8/site-packages/torch/autograd/function.py", line 539, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
File "/data/projects/fate/env/python/venv/lib/python3.8/site-packages/torch/nn/parallel/_functions.py", line 96, in forward
outputs = comm.scatter(input, target_gpus, chunk_sizes, ctx.dim, streams)
File "/data/projects/fate/env/python/venv/lib/python3.8/site-packages/torch/nn/parallel/comm.py", line 187, in scatter
return tuple(torch._C._scatter(tensor, devices, chunk_sizes, dim, streams))
RuntimeError: CUDA error: peer mapping resources exhausted
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
您好,我在尝试tutorials中都能遇到以下问题,请问有什么解决办法吗,谢谢指导. 版本是Fatellm2.1 [INFO] [2024-05-02 03:36:43,059] [202405020336083952670] [52:140389839259392] - [base_saver.execute_update] [line:223]: UPDATE "t_task" SET "f_update_time" = 1714621003059, "f_error_report" = 'Traceback (most recent call last): File "/data/projects/fate/fate/python/fate/components/entrypoint/cli/component/execute_cli.py", line 147, in execute_component_from_config component.execute(ctx, role, execution_io.get_kwargs()) File "/data/projects/fate/fate/python/fate/components/core/component_desc/component.py", line 101, in execute return self.callback(ctx, role, kwargs) File "/data/projects/fate/fate/python/fate/components/components/homo_nn.py", line 61, in train train_procedure( File "/data/projects/fate/fate/python/fate/components/components/nn/component_utils.py", line 155, in train_procedure runner.train(train_data, validatedata, output_dir, saved_model_path) File "/data/projects/fate/fate/python/fate/components/components/nn/runner/homo_default_runner.py", line 270, in train trainer.train() File "/data/projects/fate/fate/python/fate_llm/fedkseed/fedkseed.py", line 123, in train direction_derivative_history = self.train_once( File "/data/projects/fate/fate/python/fate_llm/fedkseed/fedkseed.py", line 154, in train_once trainer.train() File "/data/projects/fate/env/python/venv/lib/python3.8/site-packages/transformers/trainer.py", line 1624, in train return inner_training_loop( File "/data/projects/fate/env/python/venv/lib/python3.8/site-packages/transformers/trainer.py", line 1961, in _inner_training_loop tr_loss_step = self.training_step(model, inputs) File "/data/projects/fate/fate/python/fate_llm/fedkseed/trainer.py", line 96, in training_step loss = self._kseed_optimizer.kseed_zeroth_order_step(closure=closure) File "/data/projects/fate/fate/python/fate_llm/fedkseed/optimizer.py", line 228, in kseed_zeroth_order_step directional_derivative_value, loss_right, loss_left = self.zeroth_order_step(seed, 
closure) File "/data/projects/fate/fate/python/fate_llm/fedkseed/optimizer.py", line 129, in zeroth_order_step loss_right = closure() File "/data/projects/fate/fate/python/fate_llm/fedkseed/trainer.py", line 90, in closure return self.compute_loss(model, inputs, return_outputs=False).detach() File "/data/projects/fate/env/python/venv/lib/python3.8/site-packages/transformers/trainer.py", line 2925, in compute_loss outputs = model(inputs) File "/data/projects/fate/env/python/venv/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl return self._call_impl(*args, *kwargs) File "/data/projects/fate/env/python/venv/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl return forward_call(args, kwargs) File "/data/projects/fate/env/python/venv/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 175, in forward inputs, module_kwargs = self.scatter(inputs, kwargs, self.device_ids) File "/data/projects/fate/env/python/venv/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 197, in scatter return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim) File "/data/projects/fate/env/python/venv/lib/python3.8/site-packages/torch/nn/parallel/scatter_gather.py", line 74, in scatter_kwargs scattered_kwargs = scatter(kwargs, target_gpus, dim) if kwargs else [] File "/data/projects/fate/env/python/venv/lib/python3.8/site-packages/torch/nn/parallel/scatter_gather.py", line 60, in scatter res = scatter_map(inputs) File "/data/projects/fate/env/python/venv/lib/python3.8/site-packages/torch/nn/parallel/scatter_gather.py", line 51, in scatter_map return [type(obj)(i) for i in zip(map(scatter_map, obj.items()))] File "/data/projects/fate/env/python/venv/lib/python3.8/site-packages/torch/nn/parallel/scatter_gather.py", line 47, in scatter_map return list(zip(map(scatter_map, obj))) File "/data/projects/fate/env/python/venv/lib/python3.8/site-packages/torch/nn/parallel/scatter_gather.py", 
line 43, in scatter_map return Scatter.apply(target_gpus, None, dim, obj) File "/data/projects/fate/env/python/venv/lib/python3.8/site-packages/torch/autograd/function.py", line 539, in apply return super().apply(*args, **kwargs) # type: ignore[misc] File "/data/projects/fate/env/python/venv/lib/python3.8/site-packages/torch/nn/parallel/_functions.py", line 96, in forward outputs = comm.scatter(input, target_gpus, chunk_sizes, ctx.dim, streams) File "/data/projects/fate/env/python/venv/lib/python3.8/site-packages/torch/nn/parallel/comm.py", line 187, in scatter return tuple(torch._C._scatter(tensor, devices, chunk_sizes, dim, streams)) RuntimeError: CUDA error: peer mapping resources exhausted CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. For debugging consider passing CUDA_LAUNCH_BLOCKING=1. Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.