facebookresearch / detectron2

Detectron2 is a platform for object detection, segmentation and other visual recognition tasks.
https://detectron2.readthedocs.io/en/latest/
Apache License 2.0

Semantic error in generating masks for complex structures (multiple polygons, multiple connected components, and multiple holes). #5042

Open MjdMahasneh opened 1 year ago

MjdMahasneh commented 1 year ago

Issue description:

I have segmentation labels of some plants that are in the form of contours (extracted using image processing/morphological operations).

My target task is to train a Mask RCNN model using these labels.

Accordingly, I convert them to RLE (or alternatively use them as polygons) and visualize them before training to make sure the conversion is correct.

The problem is that when visualizing the masks (see the images attached below), holes in the mask are colored/labeled as foreground (the target class) when they should be treated as background pixels.

I tried using polygons instead, taking parent/child relations into account when sorting them. I also tried feeding binary masks directly, with no difference: all formats produce the same issue.
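
To make the symptom reproducible without my data, here is a minimal, self-contained sketch (the synthetic donut-shaped mask is only a stand-in for my plant labels; it needs nothing beyond numpy, OpenCV and pycocotools):

import numpy as np
import cv2
from pycocotools import mask as mask_utils

# Synthetic "donut": a filled disc with a smaller disc removed (the hole).
gt = np.zeros((200, 200), dtype=np.uint8)
cv2.circle(gt, (100, 100), 80, 255, thickness=cv2.FILLED)
cv2.circle(gt, (100, 100), 40, 0, thickness=cv2.FILLED)
print("hole pixels in ground truth:", int(np.count_nonzero(gt[60:140, 60:140] == 0)))

# The RLE round trip on its own keeps the hole.
rle = mask_utils.encode(np.asfortranarray((gt > 0).astype(np.uint8)))
decoded = mask_utils.decode(rle)
print("RLE round trip identical:", bool(np.array_equal(decoded, (gt > 0).astype(np.uint8))))

# Rebuilding the mask from its contours the way contours_to_binary_mask below does
# (every contour drawn filled, without the hierarchy) paints the hole as foreground.
contours, hierarchy = cv2.findContours(gt, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
rebuilt = np.zeros_like(gt)
cv2.drawContours(rebuilt, contours, -1, 255, thickness=cv2.FILLED)
print("hole pixels after rebuilding from contours:", int(np.count_nonzero(rebuilt[60:140, 60:140] == 0)))

I would expect the last count to drop to (almost) zero, which matches the filled-in holes I see on my real labels.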

Instructions To Reproduce the 🐛 Bug:

  1. Full runnable code or full changes you made:

Imports and setup checks.


import torch, detectron2
TORCH_VERSION = ".".join(torch.__version__.split(".")[:2])
CUDA_VERSION = torch.__version__.split("+")[-1]
print("torch: ", TORCH_VERSION, "; cuda: ", CUDA_VERSION)
# print("detectron2:", detectron2.__version__)
print("detectron2:", detectron2.__version__)

gpu_available = torch.cuda.is_available()
print(f"GPU available: {gpu_available}")

if gpu_available:
    print(f"GPU name: {torch.cuda.get_device_name(0)}")

import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog, DatasetCatalog
import matplotlib.pyplot as plt
import time
import numpy as np
import os, json, cv2, random
from detectron2.structures import BoxMode
from pycocotools import mask as mask_utils

def cv2_imshow(image):
    cv2.imshow('image', image)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

Data prep

def convert_relative_coordinates_from_list_to_contours_to_abs_contours(relative_contours, img_width, img_height):
    pixel_contours = []
    for relative_contour in relative_contours:
        # Convert to numpy array if it's a list
        if isinstance(relative_contour, list):
            relative_contour = np.array(relative_contour)

        # Convert to pixel coordinates
        pixel_contour = relative_contour.copy().astype(np.float32)
        pixel_contour[:, 0, 0] = (pixel_contour[:, 0, 0] * img_width).astype(np.int32)
        pixel_contour[:, 0, 1] = (pixel_contour[:, 0, 1] * img_height).astype(np.int32)
        pixel_contours.append(pixel_contour.astype(np.int32))
    return pixel_contours

def fix_contours(contours, image=None):
    if contours is not None:

        ## delete the contours with less than 3 points
        for i in range(len(contours) - 1, -1, -1):
            if len(contours[i]) < 3:
                del contours[i]

        cleaned_contours = []
        for contour in contours:
            epsilon = 0.0005 * cv2.arcLength(contour, True)
            simplified_contour = cv2.approxPolyDP(contour, epsilon, True)

            ## remove contours with less than 3 points
            if len(simplified_contour) < 3:
                continue

            ## remove contours with small area
            area = cv2.contourArea(simplified_contour)
            if area < 50:
                continue
            else:
                cleaned_contours.append(simplified_contour)

        return cleaned_contours

def contours_to_binary_mask(contours, img_width, img_height):
    """Draw contours onto an image to create a binary mask."""
    mask = np.zeros((img_height, img_width), dtype=np.uint8)
    cv2.drawContours(mask, contours, -1, (255), thickness=cv2.FILLED)

    return mask
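
For comparison, below is a hierarchy-aware variant of the function above (a sketch only, with a name of my own choosing, not something I currently run in the pipeline): parent contours from RETR_CCOMP are filled with 255 first, and their child (hole) contours are then filled with 0 on top, which is the behavior I would expect to keep holes as background.

def contours_to_binary_mask_with_holes(contours, hierarchy, img_width, img_height):
    """Sketch: fill parent contours, then erase their hole children.

    hierarchy comes from cv2.findContours(..., cv2.RETR_CCOMP, ...), where
    hierarchy[0][i] = [next, previous, first_child, parent].
    """
    mask = np.zeros((img_height, img_width), dtype=np.uint8)
    for i in range(len(contours)):
        if hierarchy[0][i][3] == -1:  # top-level (parent) contour
            cv2.drawContours(mask, contours, i, 255, thickness=cv2.FILLED)
    for i in range(len(contours)):
        if hierarchy[0][i][3] != -1:  # child contour, i.e. a hole
            cv2.drawContours(mask, contours, i, 0, thickness=cv2.FILLED)
    return mask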

def extract_contours_from_mask(mask):
    """Extract contours from a binary mask using cv2.findContours with RETR_CCOMP."""
    contours, hierarchy = cv2.findContours(mask, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
    return contours, hierarchy

def convert_to_d2_format(contours, hierarchy):
    """Convert the contours extracted with cv2.findContours and RETR_CCOMP to Detectron2 format."""
    d2_segmentation = []

    for i, contour in enumerate(contours):
        # hierarchy[0][i] = [next, previous, first_child, parent] (cv2.findContours convention).
        # Skip child contours (holes) here; they are appended right after their parent below.
        if hierarchy[0][i][3] != -1:
            continue

        main_contour = contour.reshape(-1).tolist()
        d2_segmentation.append(main_contour)

        # Append this parent's child contours (holes), if any
        child_idx = hierarchy[0][i][2]
        while child_idx != -1:
            hole_contour = contours[child_idx].reshape(-1).tolist()
            d2_segmentation.append(hole_contour)
            child_idx = hierarchy[0][child_idx][0]  # next sibling

    return d2_segmentation
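
For reference, this is how the two helpers above chain together (a sketch; binary_mask here stands for any single-instance binary mask, e.g. the output of contours_to_binary_mask):

# Sketch: binary mask -> (contours, hierarchy) -> Detectron2-style polygon list,
# where each entry of `polygons` is a flat [x0, y0, x1, y1, ...] list.
contours, hierarchy = extract_contours_from_mask(binary_mask)
polygons = convert_to_d2_format(contours, hierarchy)
print("number of polygons (parents + holes):", len(polygons))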

def binary_mask_to_rle(binary_mask):
    """Convert binary mask to RLE format."""
    rle = mask_utils.encode(np.asfortranarray(binary_mask.astype(np.uint8)))
    return rle

def rle_to_binary_mask(rle):
    """Convert RLE format to binary mask."""
    binary_mask = mask_utils.decode(rle)
    return binary_mask
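
A quick sanity check for these two helpers (a sketch; the helper name is mine): the round trip should be lossless, and the RLE area should match the number of foreground pixels of the input mask.

def check_rle_roundtrip(binary_mask):
    """Sketch: verify that encoding a mask and decoding it back is lossless,
    and that the RLE area matches the foreground pixel count."""
    rle = binary_mask_to_rle(binary_mask > 0)
    decoded = rle_to_binary_mask(rle)
    assert np.array_equal(decoded, (binary_mask > 0).astype(np.uint8)), "RLE round trip changed the mask"
    assert int(mask_utils.area(rle)) == int(np.count_nonzero(binary_mask)), "RLE area mismatch"
    return rle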

def get_shape(list_of_lists):
    num_rows = len(list_of_lists)
    num_columns = max(len(sublist) for sublist in list_of_lists)
    return num_rows, num_columns

def get_data_dicts(img_dir):

    json_files = [f for f in os.listdir(img_dir) if f.endswith('.json')]

    dataset_dicts = []

    for jf in json_files:

        json_file_path = os.path.join(img_dir, jf)
        with open(json_file_path) as f:
            data = json.load(f)

        record = {}
        record["file_name"] = os.path.join(img_dir, data["image"]["file_name"])
        record["image_id"] = int(data["image"]["id"])
        record["height"] = data["image"]["height"]
        record["width"] = data["image"]["width"]

        annos = data["annotations"]
        objs = []

        # Convert normalized bbox to absolute values
        bbox = annos["bbox"]
        bbox_abs = [
            bbox[0] * record["width"],
            bbox[1] * record["height"],
            bbox[2] * record["width"],
            bbox[3] * record["height"],
        ]

        if str(annos["specie"]).lower() == 'tomato':
            category_id = 0
        elif str(annos["specie"]).lower() == 'chillies':
            category_id = 1
        else:
            raise Exception('error in categories.')

        abs_contours = convert_relative_coordinates_from_list_to_contours_to_abs_contours(annos["segmentation"], record["width"], record["height"])
        fixed_abs_contours = fix_contours(abs_contours)
        binary_mask = contours_to_binary_mask(fixed_abs_contours, record["width"], record["height"])
        rle = binary_mask_to_rle(binary_mask)

        obj = {
            "bbox": bbox_abs,
            "bbox_mode": BoxMode.XYWH_ABS,
            "segmentation": rle,
            "binary_mask": binary_mask,
            "category_id": category_id,
        }

        objs.append(obj)
        record["annotations"] = objs
        dataset_dicts.append(record)

    return dataset_dicts

Register data


def register_datasets(root_dir, class_list_file):

    with open(class_list_file, 'r') as reader:
        classes_ = [l.strip() for l in reader.readlines() if l.strip()]

    print('classes_', classes_)
    print('len(classes_)', len(classes_))

    for d in ["train", "val"]:
        DatasetCatalog.register("data_" + d, lambda d=d: get_data_dicts(os.path.join(root_dir, d)))
        MetadataCatalog.get("data_" + d).set(thing_classes=classes_)

    return len(classes_)

weed_dataset_path = 'some/path/'
weed_class_list_path = 'some/path/'

register_datasets(weed_dataset_path, weed_class_list_path)

data_metadata = MetadataCatalog.get("data_train")

Visualize some data


dataset_dicts = DatasetCatalog.get("data_train")

for d in random.sample(dataset_dicts, 15):
    img = cv2.imread(d["file_name"])
    visualizer = Visualizer(img[:, :, ::-1], metadata=data_metadata, scale=0.3)
    out = visualizer.draw_dataset_dict(d)
    cv2_imshow(out.get_image()[:, :, ::-1])

    binary_mask = d["annotations"][0]["binary_mask"]
    plt.imshow(binary_mask)
    plt.show()
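
To compare what is actually stored in an annotation against the mask it was built from, the stored RLE can also be decoded back and diffed pixel-wise (a sketch, reusing the record keys produced by get_data_dicts above):

# Sketch: diff the stored binary_mask against the decoded RLE of the same annotation.
ann = dataset_dicts[0]["annotations"][0]
decoded = mask_utils.decode(ann["segmentation"])
stored = (ann["binary_mask"] > 0).astype(np.uint8)
print("pixels where the stored mask and the decoded RLE disagree:",
      int(np.count_nonzero(stored != decoded)))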

Your Environment:

Expected behavior:

The expected behavior is that the binary mask is converted to the training/testing format while preserving the labeling given in the input labels, i.e., holes remain background.

This behavior is not specific to RLEs; I have experimented extensively with all possible formats, i.e., polygons (respecting the polygon hierarchy: parent first, then its children, and so on) as well as binary masks (ndarrays). Below is an example of the RLE versus the binary mask that is provided as input to binary_mask_to_rle(binary_mask) in my get_data_dicts() function.

Binary mask, as visualized with plt.imshow(binary_mask): (attached image "3-binary")

RLE, as visualized with cv2_imshow(out.get_image()[:, :, ::-1]): (attached image "3-vis")

github-actions[bot] commented 1 year ago

You've chosen to report an unexpected problem or bug. Unless you already know the root cause of it, please include details about it by filling the issue template. The following information is missing: "Your Environment";