The relevant code is as follows:
```python
# Imports inferred from usage; module names follow the repo layout
# (Logger comes from logger.py per the traceback below; the location of
# VideoRecorder is an assumption).
import json
import os
import time
from shutil import copyfile

import dmc2gym
import numpy as np
import torch

import train
import utils
from logger import Logger
from video import VideoRecorder


# Define the federated learning client
class Client:
    def __init__(self, args, domain_name, work_dir):
        print(f'args.work_dir: {args.work_dir}')
        args.work_dir = work_dir
        print(f'args.work_dir: {args.work_dir}')
        self.args = args
        self.env = dmc2gym.make(
            domain_name=domain_name,
            task_name=args.task_name,
            resource_files=args.resource_files,
            img_source=args.img_source,
            total_frames=args.total_frames,
            seed=args.seed,
            visualize_reward=False,
            from_pixels=(args.encoder_type == 'pixel'),
            height=args.image_size,
            width=args.image_size,
            frame_skip=args.action_repeat
        )
        self.env.seed(args.seed)
        self.eval_env1 = dmc2gym.make(
            domain_name='cartpole1',
            task_name=args.task_name,
            resource_files=args.eval_resource_files,
            img_source=args.img_source,
            total_frames=args.total_frames,
            seed=args.seed,
            visualize_reward=False,
            from_pixels=(args.encoder_type == 'pixel'),
            height=args.image_size,
            width=args.image_size,
            frame_skip=args.action_repeat
        )
        self.eval_env1.seed(args.seed)
        self.eval_env2 = dmc2gym.make(
            domain_name='cartpole2',
            task_name=args.task_name,
            resource_files=args.eval_resource_files,
            img_source=args.img_source,
            total_frames=args.total_frames,
            seed=args.seed,
            visualize_reward=False,
            from_pixels=(args.encoder_type == 'pixel'),
            height=args.image_size,
            width=args.image_size,
            frame_skip=args.action_repeat
        )
        self.eval_env2.seed(args.seed)
        self.eval_env3 = dmc2gym.make(
            domain_name='cartpole3',
            task_name=args.task_name,
            resource_files=args.eval_resource_files,
            img_source=args.img_source,
            total_frames=args.total_frames,
            seed=args.seed,
            visualize_reward=False,
            from_pixels=(args.encoder_type == 'pixel'),
            height=args.image_size,
            width=args.image_size,
            frame_skip=args.action_repeat
        )
        self.eval_env3.seed(args.seed)
        self.eval_env4 = dmc2gym.make(
            domain_name='cartpole4',
            task_name=args.task_name,
            resource_files=args.eval_resource_files,
            img_source=args.img_source,
            total_frames=args.total_frames,
            seed=args.seed,
            visualize_reward=False,
            from_pixels=(args.encoder_type == 'pixel'),
            height=args.image_size,
            width=args.image_size,
            frame_skip=args.action_repeat
        )
        self.eval_env4.seed(args.seed)

        # stack several consecutive frames together
        # (assign back to self.* so the stacked wrappers are what the agent
        # actually interacts with; assigning to locals discards them)
        if args.encoder_type.startswith('pixel') and not args.robosuite and args.domain_name != 'carla':
            self.env = utils.FrameStack(self.env, k=args.frame_stack)
            self.eval_env1 = utils.FrameStack(self.eval_env1, k=args.frame_stack)
            self.eval_env2 = utils.FrameStack(self.eval_env2, k=args.frame_stack)
            self.eval_env3 = utils.FrameStack(self.eval_env3, k=args.frame_stack)
            self.eval_env4 = utils.FrameStack(self.eval_env4, k=args.frame_stack)

        utils.make_dir(args.work_dir)
        self.video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
        self.model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
        self.buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer'))
        print(f'self.video_dir: {self.video_dir}')
        self.video = VideoRecorder(self.video_dir if args.save_video else None)

        # the dmc2gym wrapper standardizes actions
        assert self.env.action_space.low.min() >= -1
        assert self.env.action_space.high.max() <= 1
        print(self.env.observation_space.shape)
        print(self.env.action_space.shape)

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.replay_buffer = utils.ReplayBuffer(
            obs_shape=self.env.observation_space.shape,
            action_shape=self.env.action_space.shape,
            capacity=args.replay_buffer_capacity,
            batch_size=args.batch_size,
            device=self.device,
            is_framestack=(args.domain_name != 'carla'),
        )
        with open(os.path.join(args.work_dir, 'args.json'), 'w') as f:
            json.dump(vars(args), f, sort_keys=True, indent=4)
        self.agent = train.make_agent(
            obs_shape=self.env.observation_space.shape,
            action_shape=self.env.action_space.shape,
            args=args,
            device=self.device
        )
        self.L = Logger(args.work_dir, use_tb=args.save_tb)
        if hasattr(self.agent, 'source_code_file_path'):
            if self.agent.source_code_file_path is not None:
                code_dir = os.path.join(args.work_dir, 'code')
                os.makedirs(code_dir, exist_ok=True)
                copyfile(self.agent.source_code_file_path,
                         os.path.join(code_dir, os.path.basename(self.agent.source_code_file_path)))
        self.step = 0
        self.episode = 0
        self.episode_reward = 0

    def train(self, num_episodes=100, eval=False):
        episode_step, reward, done = 0, 0, True
        start_time = time.time()
        obs = self.env.reset()
        for e in range(num_episodes):
            if done:
                if self.args.decoder_type == 'inverse':
                    for i in range(1, self.args.k):  # fill k_obs with 0s if episode is done
                        self.replay_buffer.k_obses[self.replay_buffer.idx - i] = 0
                if self.step > 0:
                    self.L.log('train/duration', time.time() - start_time, self.step)
                    start_time = time.time()
                    self.L.dump(self.step)

                # evaluate agent periodically on the four held-out domains
                if self.episode % self.args.eval_freq == 0:
                    self.L.log('eval/episode', self.episode, self.step)
                    train.evaluate(self.eval_env1, self.agent, self.video,
                                   self.args.num_eval_episodes, self.L, self.step)
                    self.L.log('eval/episode', self.episode, self.step)
                    train.evaluate(self.eval_env2, self.agent, self.video,
                                   self.args.num_eval_episodes, self.L, self.step)
                    self.L.log('eval/episode', self.episode, self.step)
                    train.evaluate(self.eval_env3, self.agent, self.video,
                                   self.args.num_eval_episodes, self.L, self.step)
                    self.L.log('eval/episode', self.episode, self.step)
                    train.evaluate(self.eval_env4, self.agent, self.video,
                                   self.args.num_eval_episodes, self.L, self.step)
                    if self.args.save_model:
                        self.agent.save(self.model_dir, step=None)
                    if self.args.save_buffer:
                        self.replay_buffer.save(self.buffer_dir)

                self.L.log('train/episode_reward', self.episode_reward, self.step)
                obs = self.env.reset()
                done = False
                self.episode_reward = 0
                episode_step = 0
                self.episode += 1
                reward = 0
                self.L.log('train/episode', self.episode, self.step)
                if eval:
                    break

            while not done:
                # sample action for data collection
                if self.step < self.args.init_steps:
                    action = self.env.action_space.sample()
                    log_pi = utils.gym_action_space_log_prob(self.env.action_space, action)
                else:
                    with utils.eval_mode(self.agent):
                        action, log_pi = self.agent.sample_action(obs)
                # run training update
                if self.step >= self.args.init_steps:
                    num_updates = self.args.init_steps if self.step == self.args.init_steps else 1
                    for _ in range(num_updates):
                        self.agent.update(self.replay_buffer, self.L, self.step)
                curr_reward = reward
                next_obs, reward, done, _ = self.env.step(action)
                # allow infinite bootstrap
                done_bool = 0 if episode_step + 1 == self.env._max_episode_steps else float(done)
                self.episode_reward += reward
                self.replay_buffer.add(obs, action, log_pi, curr_reward, reward, next_obs, done_bool)
                if self.args.decoder_type == 'inverse':
                    np.copyto(self.replay_buffer.k_obses[self.replay_buffer.idx - self.args.k], next_obs)
                obs = next_obs
                episode_step += 1
                self.step += 1

    def get_weights(self):
        return self.agent.critic.encoder.state_dict()

    # def get_params(self):
    #     return self.agent.critic.encoder.parameters()

    def set_weights(self, weights):
        self.agent.critic.encoder.load_state_dict(weights)
        self.agent.critic_target.encoder.load_state_dict(self.agent.critic.encoder.state_dict())
        # self.agent.update_target_network()

    # def set_params(self, target_net):
    #     for param, target_param in zip(self.agent.critic.encoder.parameters(), target_net.parameters()):
    #         # target_param.data.copy_(
    #         #     tau * param.data + (1 - tau) * target_param.data
    #         # )
    #         target_param.data.copy_(
    #             param.data
    #         )


def federated_averaging(weights):
    # FedAvg over the client encoder state dicts; clone the first client's
    # tensors so the in-place += below does not mutate its live parameters
    avg_weights = {key: value.clone() for key, value in weights[0].items()}
    for key in avg_weights.keys():
        for i in range(1, len(weights)):
            avg_weights[key] += weights[i][key]
        avg_weights[key] = avg_weights[key] / len(weights)
    return avg_weights


class Server:
    def __init__(self, clients):
        self.clients = clients

    def train(self, rounds=10, num_episodes=100):
        for round in range(rounds):
            client_weights = []
            for client in self.clients:
                client.train(num_episodes=num_episodes, eval=False)
                client_weights.append(client.get_weights())
            avg_weights = federated_averaging(client_weights)
            for client in self.clients:
                client.set_weights(avg_weights)
            print(f'Round {round + 1}/{rounds} completed')
```
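For context, a minimal sketch of how these classes are presumably wired together (hypothetical driver code: `parse_args` and the per-client domain list are assumptions based on the snippet above, not taken from `RAPFedAvg5.py`):

```python
# Hypothetical driver; parse_args and the domain list are assumptions.
args = parse_args()
base_dir = args.work_dir  # Client.__init__ reassigns args.work_dir, so snapshot the base first
clients = [
    Client(args, domain, os.path.join(base_dir, domain))
    for domain in ['cartpole1', 'cartpole2', 'cartpole3', 'cartpole4']
]
server = Server(clients)
server.train(rounds=10, num_episodes=100)
```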
I don't think the error actually happens in the logging code. PyTorch launches CUDA kernels asynchronously, so a failure in an earlier kernel is only reported at the next synchronization point (here, `value.item()`). You can set the environment variable `CUDA_LAUNCH_BLOCKING=1` to double-check where the error exactly occurs.
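For example, a quick way to do this from inside the script (the variable must be set before CUDA is first initialized; launching with `CUDA_LAUNCH_BLOCKING=1 python RAPFedAvg5.py` from the shell works just as well):

```python
# Make every CUDA kernel launch synchronous so the traceback points at the
# op that actually failed; must run before torch initializes CUDA.
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

import torch  # import torch (and anything that touches CUDA) only after this
```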
I have received your message and will take a look as soon as possible. Thank you!
Thank you very much for your help! Best regards.
Hi,
I am encountering a CUDA error during the training phase when federating your dm_control `rap.py` code with the `RAPFedAvg5.py` script. Below is the error log:
```
| eval | S: 51250 | ER: 207.8327
| eval | S: 51250 | ER: 200.0906
| eval | S: 51250 | ER: 163.6263
| eval | S: 51250 | ER: 205.6595
| eval | S: 51250 | ER: 259.2034
| eval | S: 51250 | ER: 178.4320
| eval | S: 51250 | ER: 198.9506
| eval | S: 51250 | ER: 224.9817
| train | E: 206 | S: 51500 | D: 83.8 s | R: 166.9316 | BR: 0.6993 | ALOSS: -85.1009 | CLOSS: 6.5211 | RLOSS: 0.0000 | RHO: 0.0000 | MR: 0.0000 | EI: 0
```
```
Traceback (most recent call last):
  File "RAPFedAvg5.py", line 333, in <module>
    server.train(rounds, num_episodes)
  File "RAPFedAvg5.py", line 275, in train
    client.train(num_episodes=num_episodes, eval=False)
  File "RAPFedAvg5.py", line 213, in train
    self.agent.update(self.replay_buffer, self.L, self.step)
  File "/workspace/workspace3/RAP_distance/agent/rap.py", line 406, in update
    self.update_actor_and_alpha(obs, L, step)
  File "/workspace/workspace3/RAP_distance/agent/rap.py", line 244, in update_actor_and_alpha
    L.log('train_actor/loss', actor_loss, step)
  File "/workspace/workspace3/RAP_distance/logger.py", line 138, in log
    value = value.item()
RuntimeError: CUDA error: unspecified launch failure
```
Here are some additional details:

- NVIDIA-SMI: 535.183.01
- Driver Version: 535.183.01
- CUDA Version: 12.2
- GPU: NVIDIA RTX A6000
Here is the list of installed packages:
```
Package                  Version
------------------------ ------------------
absl-py                  2.1.0
backcall                 0.2.0
beautifulsoup4           4.9.3
brotlipy                 0.7.0
cachetools               5.3.3
certifi                  2020.12.5
cffi                     1.14.3
chardet                  3.0.4
cloudpickle              3.0.0
conda                    4.9.2
conda-build              3.21.4
conda-package-handling   1.7.2
contourpy                1.1.1
cryptography             3.2.1
cycler                   0.12.1
Cython                   3.0.10
decorator                4.4.2
dm-control               0.0.364194727
dm-env                   1.6
dm-tree                  0.1.8
dmc2gym                  1.0.0
dnspython                2.1.0
dotmap                   1.3.30
etils                    1.3.0
fasteners                0.19
ffmpeg                   1.4
filelock                 3.0.12
fonttools                4.51.0
future                   1.0.0
glob2                    0.7
google-auth              2.29.0
google-auth-oauthlib     1.0.0
grpcio                   1.63.0
gym                      0.20.0
gym-notices              0.0.8
h5py                     3.11.0
idna                     2.10
imageio                  2.34.1
imageio-ffmpeg           0.4.9
importlib-metadata       7.1.0
importlib-resources      6.4.0
ipython                  7.19.0
ipython-genutils         0.2.0
jedi                     0.17.2
Jinja2                   2.11.2
kiwisolver               1.4.5
kornia                   0.5.1
labmaze                  1.0.6
lazy-loader              0.4
libarchive-c             2.9
llvmlite                 0.36.0
lxml                     5.2.2
Markdown                 3.6
MarkupSafe               2.1.5
matplotlib               3.7.5
mkl-fft                  1.2.0
mkl-random               1.1.1
mkl-service              2.3.0
mujoco                   3.1.5
mujoco-py                2.1.2.14
networkx                 3.1
numba                    0.53.1
numpy                    1.21.1
oauthlib                 3.2.2
olefile                  0.46
opencv-python            4.9.0.80
packaging                24.0
parso                    0.7.0
patchelf                 0.17.2.1
pexpect                  4.8.0
pickleshare              0.7.5
Pillow                   9.5.0
pip                      20.2.4
pkginfo                  1.7.0
prompt-toolkit           3.0.8
protobuf                 5.26.1
psutil                   5.7.2
ptyprocess               0.7.0
pyasn1                   0.6.0
pyasn1-modules           0.4.0
pycosat                  0.6.3
pycparser                2.20
pygame                   2.5.2
Pygments                 2.7.4
PyOpenGL                 3.1.7
PyOpenGL-accelerate      3.1.7
pyOpenSSL                19.1.0
pyparsing                3.1.2
PySocks                  1.7.1
python-dateutil          2.9.0.post0
python-etcd              0.4.5
pytz                     2020.5
PyWavelets               1.4.1
PyYAML                   5.3.1
requests                 2.24.0
requests-oauthlib        2.0.0
robosuite                1.0.1
rsa                      4.9
ruamel-yaml              0.15.87
scikit-image             0.21.0
scikit-video             1.1.11
scipy                    1.10.1
setuptools               65.5.0
six                      1.15.0
soupsieve                2.1
tb-nightly               2.14.0a20230808
tensorboard-data-server  0.7.2
termcolor                2.4.0
tifffile                 2023.7.10
torch                    1.7.1
torchelastic             0.2.1
torchvision              0.8.2
tqdm                     4.51.0
traitlets                5.0.5
typing-extensions        3.7.4.3
urllib3                  1.25.11
wcwidth                  0.2.5
werkzeug                 3.0.3
wheel                    0.35.1
zipp                     3.18.2
```
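Given these versions, one mismatch that may be worth ruling out: torch 1.7.1 predates official sm_86 support in most prebuilt wheels, and the RTX A6000 is a compute-capability-8.6 (Ampere) GPU; such a mismatch can surface as exactly this kind of `unspecified launch failure`. A quick, self-contained check (a diagnostic sketch, not code from the repo):

```python
import torch

print(torch.__version__, torch.version.cuda)  # torch build and its CUDA toolkit version
print(torch.cuda.get_device_capability(0))    # RTX A6000 should report (8, 6)
print(torch.cuda.get_arch_list())             # architectures this torch build ships kernels for
```

If neither `sm_86` nor at least `compute_80` PTX appears in the last list, upgrading to a torch build compiled for this GPU would be the first thing to try.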
Could you provide any guidance on what might be causing this issue and how to resolve it? Any suggestions or insights would be greatly appreciated.
Thank you for your time and assistance.