About the multi-scale testing

I am trying to implement multi-scale testing for my project based on HRNet's official source code. I downloaded their pre-trained model and ran it on the MPII test set, but I only got 91.6% instead of the 92.3% reported in the original paper. I know I should probably post this issue on the original HRNet GitHub page (I did, and I also wrote an email to the author, but I got no response).

So I am posting here, since this is a newer paper based on HRNet's source code and there are no open issues here. I have included my implementation of multi-scale testing, as well as the Matlab evaluation code that computes PCKh directly from the .mat file generated by the official code (7247 predictions). Could you please take a look and see whether there are problems with my code?

def read_scaled_image(image_file, s, center, scale, image_size, COLOR_RGB, DATA_FORMAT, image_transform):
    """Read one image and warp it to the network input size at test scale ``s``.

    Args:
        image_file: path of the image to read (passed to ``zipreader.imread``
            when ``DATA_FORMAT == 'zip'``, otherwise to ``cv2.imread``).
        s: multiplicative test-scale factor applied to ``scale``.
        center: box centre forwarded to ``get_affine_transform``.
        scale: box scale forwarded (multiplied by ``s``) to
            ``get_affine_transform``.
        image_size: output size forwarded to ``get_affine_transform`` and,
            as a tuple, to ``cv2.warpAffine``.
        COLOR_RGB: when truthy, convert the decoded BGR image to RGB.
        DATA_FORMAT: ``'zip'`` selects the zip reader; anything else reads
            from the file system.
        image_transform: callable applied to the warped image; its result is
            returned unchanged.

    Returns:
        Whatever ``image_transform`` returns for the warped image.

    Raises:
        ValueError: if the image could not be read.
    """
    read_flags = cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION
    if DATA_FORMAT == 'zip':
        from utils import zipreader
        data_numpy = zipreader.imread(image_file, read_flags)
    else:
        data_numpy = cv2.imread(image_file, read_flags)

    # cv2.imread signals failure by returning None instead of raising; fail
    # here with the file name rather than later inside cvtColor/warpAffine.
    if data_numpy is None:
        raise ValueError('Fail to read {}'.format(image_file))

    if COLOR_RGB:
        data_numpy = cv2.cvtColor(data_numpy, cv2.COLOR_BGR2RGB)

    # Same centre, box enlarged/shrunk by the test scale, no rotation.
    trans = get_affine_transform(center, s * scale, 0, image_size)
    images_warp = cv2.warpAffine(data_numpy, trans, tuple(image_size), flags=cv2.INTER_LINEAR)
    return image_transform(images_warp)

def scale_back_output(output_hm, s, output_size):
    """Map heatmaps predicted at test scale ``s`` back onto the scale-1.0 frame.

    Assuming the input crop at scale ``s`` covers ``s`` times the reference
    box around the same centre (as ``read_scaled_image`` requests via
    ``s * scale``), a point at offset ``d`` from the heatmap centre belongs at
    offset ``s * d`` in the scale-1.0 heatmap. This is realised by zero-padding
    (``s < 1``) or centre-cropping (``s > 1``) by ``size * |1 - s| / (2 * s)``
    pixels per side and then resizing bilinearly to ``output_size``.

    Args:
        output_hm: 4-D tensor; dimension 2 is treated as height and
            dimension 3 as width.
        s: test-scale factor, must be positive.
        output_size: ``[width, height]`` of the returned heatmaps.

    Returns:
        Tensor whose last two dimensions are ``(output_size[1],
        output_size[0])``. When no padding, cropping or resizing is needed
        the input tensor itself is returned.

    Raises:
        ValueError: if ``s`` is not positive.
    """
    if s <= 0:
        raise ValueError('test scale must be positive, got {}'.format(s))

    hm_h, hm_w = output_hm.size(2), output_hm.size(3)
    out_w, out_h = int(output_size[0]), int(output_size[1])

    resized_hm = output_hm
    if s != 1.0:
        # After padding/cropping by m per side and resizing back, content is
        # scaled by size / (size +/- 2m); solving for a factor of s gives
        # m = size * |1 - s| / (2 * s).
        ratio = abs(1.0 - s) / (2.0 * s)
        w_margin = int(round(hm_w * ratio))
        h_margin = int(round(hm_h * ratio))
        if s < 1.0:
            # ZeroPad2d takes (left, right, top, bottom).
            hm_padding = torch.nn.ZeroPad2d((w_margin, w_margin, h_margin, h_margin))
            resized_hm = hm_padding(output_hm)
        else:
            # Always keep at least one row/column in the crop.
            w_margin = max(0, min(w_margin, (hm_w - 1) // 2))
            h_margin = max(0, min(h_margin, (hm_h - 1) // 2))
            resized_hm = output_hm[:, :, h_margin:hm_h - h_margin, w_margin:hm_w - w_margin]

    if resized_hm.size(2) != out_h or resized_hm.size(3) != out_w:
        # interpolate() expects size=(height, width); output_size is
        # [width, height], so the two entries are swapped here.
        resized_hm = torch.nn.functional.interpolate(
            resized_hm,
            size=(out_h, out_w),
            mode='bilinear',
            align_corners=False
        )
    return resized_hm

def validate(config, val_loader, val_dataset, model, criterion, output_dir, tb_log_dir, writer_dict=None, test_scale=None, image_transform=None):
    """Evaluate ``model`` on ``val_loader`` with multi-scale (and optional flip) testing.

    For every batch the model is run once per test scale, largest scale
    first. Scale 1.0 uses the loader's own ``input``; any other scale re-reads
    the images listed in ``meta["image"]`` through ``read_scaled_image`` in a
    worker pool. Each scale's heatmaps go through ``scale_back_output`` and
    the per-scale results are averaged before the loss, the accuracy and the
    final predictions are computed.

    Args:
        config: experiment configuration. Reads ``MODEL.NUM_JOINTS``,
            ``MODEL.IMAGE_SIZE``, ``MODEL.NAME``, ``PRINT_FREQ``,
            ``TEST.SCALE_FACTOR``, ``TEST.FLIP_TEST``, ``TEST.SHIFT_HEATMAP``,
            ``DATASET.COLOR_RGB`` and ``DATASET.DATA_FORMAT``.
        val_loader: iterable of ``(input, target, target_weight, meta)``
            batches; ``meta`` must provide ``"image"``, ``"center"``,
            ``"scale"`` and ``"score"``.
        val_dataset: dataset providing ``__len__``, ``flip_pairs`` and
            ``evaluate``.
        model: network to evaluate; if it returns a list, the last element
            is used.
        criterion: called as ``criterion(output, target, target_weight)``.
        output_dir: prefix directory for debug images; also passed to
            ``val_dataset.evaluate``.
        tb_log_dir: not used in this function.
        writer_dict: optional dict with ``'writer'`` and
            ``'valid_global_steps'`` used for scalar logging.
        test_scale: iterable of scale factors; defaults to
            ``config.TEST.SCALE_FACTOR`` when ``None``.
        image_transform: callable handed to ``read_scaled_image`` for every
            image re-read at a scale other than 1.0.

    Returns:
        The performance indicator returned by ``val_dataset.evaluate``.

    Raises:
        ValueError: if the list of test scales is empty.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    acc = AverageMeter()

    # switch to evaluate mode
    model.eval()

    num_samples = len(val_dataset)
    all_preds = np.zeros((num_samples, config.MODEL.NUM_JOINTS, 3), dtype=np.float32)
    all_boxes = np.zeros((num_samples, 6))
    image_path = []
    filenames = []
    imgnums = []
    idx = 0

    PRINT_FREQ = config.PRINT_FREQ

    # NOTE(review): the IMAGE_SIZE indices are swapped here. This is harmless
    # for square inputs; confirm the order get_affine_transform / warpAffine
    # expect before using non-square inputs.
    image_size = np.array([config.MODEL.IMAGE_SIZE[1], config.MODEL.IMAGE_SIZE[0]])
    final_test_scale = test_scale if test_scale is not None else config.TEST.SCALE_FACTOR
    # Sorted once; the order does not depend on the batch.
    test_scales = sorted(final_test_scale, reverse=True)
    if not test_scales:
        raise ValueError('at least one test scale is required')

    thread_pool = multiprocessing.Pool(multiprocessing.cpu_count())
    try:
        with torch.no_grad():
            end = time.time()

            start_time = time.time()
            for i, (input, target, target_weight, meta) in enumerate(val_loader):
                target = target.cuda(non_blocking=True)
                target_weight = target_weight.cuda(non_blocking=True)

                # Per-batch metadata, shared by every scale and by the final
                # prediction step below.
                image_files = meta["image"]
                centers = meta["center"].numpy()
                scales = meta["scale"].numpy()

                outputs = []
                hm_size = None
                for s in test_scales:
                    print("Test Scale", s)
                    if s != 1.0:
                        # The loader only provides scale-1.0 crops, so other
                        # scales are re-read and re-warped in the worker pool.
                        tasks = [
                            (image_file, s, center, scale, image_size,
                             config.DATASET.COLOR_RGB, config.DATASET.DATA_FORMAT, image_transform)
                            for (image_file, center, scale) in zip(image_files, centers, scales)
                        ]
                        images_resized = thread_pool.starmap(read_scaled_image, tasks)
                        images_resized = torch.stack(images_resized, dim=0)
                    else:
                        images_resized = input

                    model_outputs = model(images_resized)
                    if isinstance(model_outputs, list):
                        model_outputs = model_outputs[-1]

                    if config.TEST.FLIP_TEST:
                        print("Test Flip")
                        input_flipped = images_resized.flip(3)
                        output_flipped = model(input_flipped)
                        if isinstance(output_flipped, list):
                            output_flipped = output_flipped[-1]

                        output_flipped = flip_back(output_flipped.cpu().numpy(), val_dataset.flip_pairs)
                        output_flipped = torch.from_numpy(output_flipped.copy()).cuda()

                        # feature is not aligned, shift flipped heatmap for higher accuracy
                        if config.TEST.SHIFT_HEATMAP:
                            output_flipped[:, :, :, 1:] = output_flipped.clone()[:, :, :, 0:-1]

                        model_outputs = 0.5 * (model_outputs + output_flipped)

                    # [width, height] of this scale's heatmaps.
                    hm_size = [model_outputs.size(3), model_outputs.size(2)]
                    outputs.append(scale_back_output(model_outputs, s, hm_size))

                # Per-scale accuracy against the target as delivered by the loader.
                target_np = target.cpu().numpy()
                for indv_output in outputs:
                    _, avg_acc, _, _ = accuracy(indv_output.cpu().numpy(), target_np)
                    print("Indv Accuracy", avg_acc)

                output = torch.stack(outputs, dim=0).mean(dim=0)

                target = scale_back_output(target, 1.0, hm_size)
                loss = criterion(output, target, target_weight)

                num_images = input.size(0)
                # measure accuracy and record loss
                losses.update(loss.item(), num_images)
                _, avg_acc, cnt, pred = accuracy(output.cpu().numpy(), target.cpu().numpy())
                print("Avg Accuracy", avg_acc)
                acc.update(avg_acc, cnt)

                # measure elapsed time
                batch_time.update(time.time() - end)
                end = time.time()

                score = meta['score'].numpy()

                preds, maxvals = get_final_preds(config, output.clone().cpu().numpy(), centers, scales)

                all_preds[idx:idx + num_images, :, 0:2] = preds[:, :, 0:2]
                all_preds[idx:idx + num_images, :, 2:3] = maxvals
                # double check this all_boxes parts
                all_boxes[idx:idx + num_images, 0:2] = centers[:, 0:2]
                all_boxes[idx:idx + num_images, 2:4] = scales[:, 0:2]
                all_boxes[idx:idx + num_images, 4] = np.prod(scales*200, 1)
                all_boxes[idx:idx + num_images, 5] = score
                image_path.extend(meta['image'])

                idx += num_images

                if i % PRINT_FREQ == 0:
                    msg = 'Test: [{0}/{1}]\t' \
                          'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' \
                          'Loss {loss.val:.4f} ({loss.avg:.4f})\t' \
                          'Accuracy {acc.val:.3f} ({acc.avg:.3f})'.format(i, len(val_loader), batch_time=batch_time, loss=losses, acc=acc)
                    logger.info(msg)

                    prefix = '{}_{}'.format(os.path.join(output_dir, 'val'), i)
                    save_debug_images(config, input, meta, target, pred*4, output, prefix)

            total_duration = time.time() - start_time
            logger.info("Total test time: {:.1f}".format(total_duration))
            name_values, perf_indicator = val_dataset.evaluate(config, all_preds, output_dir, all_boxes, image_path, filenames, imgnums)

            model_name = config.MODEL.NAME
            if isinstance(name_values, list):
                for name_value in name_values:
                    _print_name_value(name_value, model_name)
            else:
                _print_name_value(name_values, model_name)

            if writer_dict:
                writer = writer_dict['writer']
                global_steps = writer_dict['valid_global_steps']
                writer.add_scalar('valid_loss', losses.avg, global_steps)
                writer.add_scalar('valid_acc', acc.avg, global_steps)
                if isinstance(name_values, list):
                    for name_value in name_values:
                        writer.add_scalars('valid', dict(name_value), global_steps)
                else:
                    writer.add_scalars('valid', dict(name_values), global_steps)
                writer_dict['valid_global_steps'] = global_steps + 1
    finally:
        # Release the worker processes even when evaluation fails midway.
        thread_pool.close()
        thread_pool.join()
    return perf_indicator

Below is the Matlab evaluation code for the MPII test set (evalMPIITest.m). To run it, you need to download their newly released test set annotations from http://human-pose.mpi-inf.mpg.de/#download:

% Evaluate performance by comparing predictions to ground truth annotations.
%
% For every prediction set listed in PRED_IDS this script loads the
% predictions, writes them into a copy of the single-person test
% annotations, computes head-normalised distances to the ground truth and
% the resulting PCKh, prints the AUC, and finally writes a LaTeX table to
% ./latex/pckh.tex (plus optional curves to ./plots).

%%% OPTIONS %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% IDs of prediction sets to include in results
PRED_IDS = [1, 2, 3];
% Whether to also plot the PCKh curves at the end of the script
plotcurve = false;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

addpath ('eval')

fprintf('# MPII single-person pose evaluation script\n')

% PCKh thresholds
range = 0:0.01:0.5;

tableDir = './latex'; if (~exist(tableDir,'dir')), mkdir(tableDir); end
plotsDir = './plots'; if (~exist(plotsDir,'dir')), mkdir(plotsDir); end
% one header row plus one row per prediction set
tableTex = cell(length(PRED_IDS)+1,1);

% load ground truth
p = getExpParams(-1);
load([p.gtDir '/annolist_dataset_v12'], 'annolist');
load([p.gtDir '/mpii_human_pose_v1_u12'], 'RELEASE');
% keep only the test images (img_train == 0)
annolist_test = annolist(RELEASE.img_train == 0);
% evaluate on the "single person" subset only
single_person_test = RELEASE.single_person(RELEASE.img_train == 0);
% convert to annotation list with a single pose per entry
[annolist_test_flat, single_person_test_flat] = flatten_annolist(annolist_test,single_person_test);
% represent ground truth as a matrix 2x14xN_images
gt = annolist2matrix(annolist_test_flat(single_person_test_flat == 1));
% compute head size
headSize = getHeadSizeAll(annolist_test_flat(single_person_test_flat == 1));

pckAll = zeros(length(range),16,length(PRED_IDS));

for i = 1:length(PRED_IDS)
  % load predictions (.mat first, HDF5 as a fallback)
  p = getExpParams(PRED_IDS(i));
  try
    load(p.predFilename, 'preds');
  catch
    preds = h5read(p.predFilename, '/preds');
  end

  % bring predictions into N x joints x 2 layout
  if size(preds, 1) == 2
    preds = permute(preds, [3, 2, 1]);
  end

  % Check that there are the same number of predictions and ground truth
  % annotations. If this assertion fails, a likely cause is a mismatch in
  % subsets (eg predictions are for the training set but ground truth
  % annotations are for the validation set).
  % size() is used instead of length(): length() returns the largest
  % dimension of a 3-D array, which is not necessarily the sample axis.
  numPreds = size(preds, 1);
  numGt = size(gt, 3);
  fprintf('%d\n', numPreds)
  fprintf('%d\n', numGt)
  assert(numPreds == numGt);

  % overwrite the annotated joint coordinates with the predicted ones
  pred_flat = annolist_test_flat(single_person_test_flat == 1);
  for idx = 1:numPreds
    for pidx = 1:length(pred_flat(idx).annorect.annopoints.point)
      % joint ids are 0-based in the annotations, Matlab indices are 1-based
      joint = pred_flat(idx).annorect.annopoints.point(pidx).id + 1;
      xy = preds(idx, joint, :);
      pred_flat(idx).annorect.annopoints.point(pidx).x = xy(1);
      pred_flat(idx).annorect.annopoints.point(pidx).y = xy(2);
    end
  end

  % pred = annolist2matrix(pred_flat(single_person_flat == 1));
  pred = annolist2matrix(pred_flat);

  % only gt is allowed to have NaN
  pred(isnan(pred)) = inf;

  % compute distance to ground truth joints
  dist = getDistPCKh(pred,gt,headSize);

  % compute PCKh
  pck = computePCK(dist,range);

  % table row at the last threshold (range(end) = 0.5)
  [row, header] = genTablePCK(pck(end,:),p.name);
  tableTex{1} = header;
  tableTex{i+1} = row;

  pckAll(:,:,i) = pck;

  auc = area_under_curve(scale01(range),pck(:,end));
  fprintf('%s, AUC: %1.1f\n',p.name,auc);
end

% Save results
fid = fopen([tableDir '/pckh.tex'],'wt');assert(fid ~= -1);
for i=1:length(tableTex),fprintf(fid,'%s\n',tableTex{i}); end; fclose(fid);

% plot curves
bSave = true;
if (plotcurve)
    plotCurveNew(squeeze(pckAll(:,end,:)),range,PRED_IDS,'PCKh total, MPII',[plotsDir '/pckh-total-mpii'],bSave,range(1:5:end));
    plotCurveNew(squeeze(mean(pckAll(:,[1 6],:),2)),range,PRED_IDS,'PCKh ankle, MPII',[plotsDir '/pckh-ankle-mpii'],bSave,range(1:5:end));
    plotCurveNew(squeeze(mean(pckAll(:,[2 5],:),2)),range,PRED_IDS,'PCKh knee, MPII',[plotsDir '/pckh-knee-mpii'],bSave,range(1:5:end));
    plotCurveNew(squeeze(mean(pckAll(:,[3 4],:),2)),range,PRED_IDS,'PCKh hip, MPII',[plotsDir '/pckh-hip-mpii'],bSave,range(1:5:end));
    plotCurveNew(squeeze(mean(pckAll(:,[7 12],:),2)),range,PRED_IDS,'PCKh wrist, MPII',[plotsDir '/pckh-wrist-mpii'],bSave,range(1:5:end));
    plotCurveNew(squeeze(mean(pckAll(:,[8 11],:),2)),range,PRED_IDS,'PCKh elbow, MPII',[plotsDir '/pckh-elbow-mpii'],bSave,range(1:5:end));
    plotCurveNew(squeeze(mean(pckAll(:,[9 10],:),2)),range,PRED_IDS,'PCKh shoulder, MPII',[plotsDir '/pckh-shoulder-mpii'],bSave,range(1:5:end));
    plotCurveNew(squeeze(mean(pckAll(:,[13 14],:),2)),range,PRED_IDS,'PCKh head, MPII',[plotsDir '/pckh-head-mpii'],bSave,range(1:5:end));
end

display('Done.')

HuangJunJie2017 / UDP-Pose

About the multi-scale testing #32