Open zhiziwy opened 1 month ago
Hi, I have the same problem. Did you solve it?
Hi, I ran into this problem as well while testing on ESC-50. My device has only 1 GPU, so the command I used is:
CUDA_VISIBLE_DEVICES=0 python main.py test
Then I got the same RuntimeError.
Looking back at commit 8e1f216, where the author added single-GPU support, several lines were added to `sed_model.py` so that `validation_epoch_end` checks the number of devices. However, the author didn't apply the same change to `test_epoch_end`, so I fixed it by copying the change from `validation_epoch_end` to `test_epoch_end`.
You can replace `test_epoch_end` in `sed_model.py` with the code below:
def test_epoch_end(self, test_step_outputs):
    """Aggregate the per-step test outputs and save or log the results.

    With ``config.fl_local`` set, the step outputs are concatenated and
    written to a ``.npy`` file under ``config.heatmap_dir``. Otherwise the
    predictions and targets are concatenated, gathered across ranks when a
    process group is running, scored with ``self.evaluate_metric`` and
    logged through ``self.log``.

    Args:
        test_step_outputs: Sequence of per-step outputs. In ``fl_local``
            mode each item is indexed as ``(pred, pred_map, audio_name,
            real_len)`` and concatenated with NumPy; otherwise each item is
            indexed as ``(pred, target)`` and concatenated with ``torch.cat``.
    """
    self.device_type = next(self.parameters()).device

    if self.config.fl_local:
        pred = np.concatenate([d[0] for d in test_step_outputs], axis=0)
        pred_map = np.concatenate([d[1] for d in test_step_outputs], axis=0)
        audio_name = np.concatenate([d[2] for d in test_step_outputs], axis=0)
        real_len = np.concatenate([d[3] for d in test_step_outputs], axis=0)
        # One output file per device, so ranks do not overwrite each other.
        heatmap_file = os.path.join(
            self.config.heatmap_dir,
            self.config.test_file + "_" + str(self.device_type) + ".npy",
        )
        save_npy = [
            {
                "audio_name": audio_name[i],
                "heatmap": pred_map[i],
                "pred": pred[i],
                "real_len": real_len[i],
            }
            for i in range(len(pred))
        ]
        np.save(heatmap_file, save_npy)
        return

    pred = torch.cat([d[0] for d in test_step_outputs], dim=0)
    target = torch.cat([d[1] for d in test_step_outputs], dim=0)

    if self.config.dataset_type == "audioset":
        metric_names = ("mAP", "mAUC", "dprime")
    else:
        metric_names = ("acc",)
    # Zero placeholders: ranks other than 0 never call evaluate_metric.
    metric_dict = dict.fromkeys(metric_names, 0.)

    # Guard on the process group itself rather than on the number of visible
    # GPUs: a single-process run on a multi-GPU host has no process group, and
    # any dist.* collective would raise "Default process group has not been
    # initialized".
    if dist.is_available() and dist.is_initialized():
        world_size = dist.get_world_size()
        gather_pred = [torch.zeros_like(pred) for _ in range(world_size)]
        gather_target = [torch.zeros_like(target) for _ in range(world_size)]
        dist.barrier()
        dist.all_gather(gather_pred, pred)
        dist.all_gather(gather_target, target)
        if dist.get_rank() == 0:
            gather_pred = torch.cat(gather_pred, dim=0).cpu().numpy()
            gather_target = torch.cat(gather_target, dim=0).cpu().numpy()
            if self.config.dataset_type == "scv2":
                gather_target = np.argmax(gather_target, 1)
            metric_dict = self.evaluate_metric(gather_pred, gather_target)
            print(self.device_type, world_size, metric_dict, flush=True)
        # Only rank 0 holds real metrics (the others hold zeros), so each value
        # is scaled by the world size before being logged with sync_dist=True.
        # NOTE(review): presumably sync_dist averages across ranks, which would
        # undo the scaling - confirm against the Lightning version in use.
        for name in metric_names:
            self.log(
                name,
                metric_dict[name] * float(world_size),
                on_epoch=True,
                prog_bar=True,
                sync_dist=True,
            )
        dist.barrier()
    else:
        gather_pred = pred.cpu().numpy()
        gather_target = target.cpu().numpy()
        if self.config.dataset_type == "scv2":
            gather_target = np.argmax(gather_target, 1)
        metric_dict = self.evaluate_metric(gather_pred, gather_target)
        print(self.device_type, metric_dict, flush=True)
        for name in metric_names:
            self.log(
                name,
                metric_dict[name],
                on_epoch=True,
                prog_bar=True,
                sync_dist=False,
            )
RuntimeError: Default process group has not been initialized, please make sure to call init_process_group.