albumentations-team / albumentations

Fast and flexible image augmentation library. Paper about the library: https://www.mdpi.com/2078-2489/11/2/125
https://albumentations.ai
MIT License
14k stars 1.63k forks source link

Affine rotate shear after the picture frame does not overlap the object #1845

Closed omaiyiwa closed 1 month ago

omaiyiwa commented 1 month ago

I found that after using these two options (rotate and shear), the bounding boxes no longer line up with the objects! rotate=(-15, 15), shear={"x": (-10, 10), "y": (-10, 10)}, ` self.aug = A.Compose([ A.Affine( translate_percent={"x": (-0.1, 0.1), "y": (-0.1, 0.1)}, scale={"x": (0.9, 1.1), "y": (0.9, 1.1)}, rotate=(-15, 15), shear={"x": (-10, 10), "y": (-10, 10)},

mode="edge"

            p=0.5,
        ),

` lQLPJxtnsKssLGfNAdHNASywmdSORH-464sGgTSLbntOAA_300_465 lQLPKHYwYKBX3GfNAcTNATyw2Man84B8zT0GgTSLbntOAQ_316_452

Here is my overall code, is there something wrong with it ` import os import cv2 import albumentations as A import xml.etree.ElementTree as ET

class VOCAug(object):
    """Offline augmentation of a Pascal VOC style dataset with Albumentations.

    Every ``.xml`` annotation found in ``pre_xml_path`` is read together with the
    image it names, the augmentation pipeline is applied ``num_augmentations``
    times, and each augmented image plus a matching XML annotation is written to
    the save directories.
    """

    def __init__(self,
                 pre_image_path=None,
                 pre_xml_path=None,
                 aug_image_save_path=None,
                 aug_xml_save_path=None,
                 labels=None,
                 num_augmentations=5,
                 is_show=False):
        """
        :param pre_image_path: directory containing the original images
        :param pre_xml_path: directory containing the original XML annotations
        :param aug_image_save_path: directory the augmented images are written to
        :param aug_xml_save_path: directory the augmented XML files are written to
        :param labels: list of label names; objects with other names are ignored
        :param num_augmentations: number of augmented samples produced per image
        :param is_show: if True, preview every sample and save it only when the
            ``s`` key is pressed
        :raises ValueError: if ``labels`` is None
        """
        # Validate before touching the file system; a plain ``assert`` would be
        # stripped when Python runs with -O.
        if labels is None:
            raise ValueError("labels is None!!!")

        self.pre_image_path = pre_image_path
        self.pre_xml_path = pre_xml_path
        self.aug_image_save_path = aug_image_save_path
        self.aug_xml_save_path = aug_xml_save_path
        self.labels = labels
        self.num_augmentations = num_augmentations
        self.is_show = is_show

        os.makedirs(self.aug_image_save_path, exist_ok=True)
        os.makedirs(self.aug_xml_save_path, exist_ok=True)

        # Augmentation pipeline.
        # NOTE: after rotate/shear the returned axis-aligned box encloses the
        # transformed *box*, not the object inside it, so it can be visibly
        # looser than the object.
        self.aug = A.Compose(
            [
                A.Affine(
                    translate_percent={"x": (-0.1, 0.1), "y": (-0.1, 0.1)},
                    scale={"x": (0.9, 1.1), "y": (0.9, 1.1)},
                    rotate=(-15, 15),
                    shear={"x": (-10, 10), "y": (-10, 10)},
                    # mode="edge"
                    p=0.5,
                ),

                # A.RandomBrightnessContrast(brightness_limit=0.3, contrast_limit=0.3, p=1),
                # A.GaussianBlur(p=0.7),
                # A.GaussNoise(p=0.7),
                # A.CLAHE(clip_limit=2.0, tile_grid_size=(4, 4), p=0.5),  # histogram equalization
                # A.Equalize(p=0.5),  # equalize the image histogram
                # A.OneOf([
                #     A.RGBShift(r_shift_limit=50, g_shift_limit=50, b_shift_limit=50, p=0.5),
                #     A.ChannelShuffle(p=0.3),  # randomly permute the channels
                #     A.ColorJitter(p=0.3),  # randomly change brightness, contrast, saturation, hue
                #     A.ChannelDropout(p=0.3),  # randomly drop channels
                # ], p=0.5),
                # A.OneOf([
                #     A.Flip(p=0.5),
                #     A.RandomRotate90(p=0.3),
                #     A.HorizontalFlip(p=0.5),
                # ], p=0.5),
                # A.Downscale(p=0.7),  # degrade quality by downscaling and upscaling
                # A.Emboss(p=0.2),  # emboss the input and overlay the result on the original
            ],
            # pascal_voc: [xmin, ymin, xmax, ymax] in absolute pixel coordinates
            # min_area: boxes whose area in pixels falls below this value after
            #     augmentation are removed from the returned list
            # min_visibility: in [0, 1]; boxes whose area ratio (after / before
            #     augmentation) falls below this value are removed
            bbox_params=A.BboxParams(format='pascal_voc', min_area=0., min_visibility=0.,
                                     label_fields=['category_id'])
        )
        print('--------------*--------------')
        print("labels: ", self.labels)
        print('--------------*--------------')

    def _parse_annotation(self, xml_filename):
        """Parse one XML annotation without touching the image.

        :param xml_filename: file name of the annotation inside ``pre_xml_path``
        :return: ``(bboxes, cls_id_list, image_name)`` where ``bboxes`` is a list
            of ``[xmin, ymin, xmax, ymax]`` clamped to the image size and
            ``cls_id_list`` holds the index of each label in ``self.labels``
        """
        # Passing the path lets ElementTree honour the encoding declared in the
        # XML instead of the locale's default text encoding.
        root = ET.parse(os.path.join(self.pre_xml_path, xml_filename)).getroot()
        image_name = root.find('filename').text
        size = root.find('size')
        w = int(size.find('width').text)
        h = int(size.find('height').text)

        bboxes = []
        cls_id_list = []
        for obj in root.iter('object'):
            cls_name = obj.find('name').text
            # A missing or empty <difficult> tag is treated as "not difficult".
            difficult = int(obj.findtext('difficult') or 0)
            if cls_name not in self.labels or difficult == 1:
                continue

            xml_box = obj.find('bndbox')
            # int(float(...)) also accepts coordinates written as "12.0".
            xmin, ymin, xmax, ymax = (
                int(float(xml_box.find(tag).text))
                for tag in ('xmin', 'ymin', 'xmax', 'ymax')
            )

            # Clamp annotations that run outside the image on any side.
            xmin, ymin = max(xmin, 0), max(ymin, 0)
            xmax, ymax = min(xmax, w), min(ymax, h)
            if xmax <= xmin or ymax <= ymin:
                # Zero-area boxes are rejected by the bbox pipeline; skip them.
                continue

            bboxes.append([xmin, ymin, xmax, ymax])
            cls_id_list.append(self.labels.index(cls_name))

        return bboxes, cls_id_list, image_name

    def get_xml_data(self, xml_filename):
        """Load the boxes, class ids and image described by one XML annotation.

        :param xml_filename: file name of the annotation inside ``pre_xml_path``
        :return: ``(bboxes, cls_id_list, image, image_name)``; ``image`` is
            whatever ``cv2.imread`` returns (``None`` if the file is unreadable)
        """
        bboxes, cls_id_list, image_name = self._parse_annotation(xml_filename)
        image = cv2.imread(os.path.join(self.pre_image_path, image_name))
        return bboxes, cls_id_list, image, image_name

    def aug_image(self):
        """Augment every annotated image in ``pre_xml_path`` and save the results."""
        for xml_name in sorted(os.listdir(self.pre_xml_path)):
            # Directories such as .ipynb_checkpoints may be present; keep XML only.
            if os.path.splitext(xml_name)[1].lower() != '.xml':
                continue

            bboxes, cls_id_list, image, image_name = self.get_xml_data(xml_name)
            if image is None:
                print(f"skip {xml_name}: cannot read image {image_name}")
                continue

            anno_dict = {'image': image, 'bboxes': bboxes, 'category_id': cls_id_list}
            for aug_idx in range(self.num_augmentations):
                # Result keys: "image", "bboxes", "category_id".
                augmented = self.aug(**anno_dict)
                self.save_aug_data(augmented, image_name, aug_idx, xml_filename=xml_name)

    def _replace_objects(self, root, bboxes, category_ids):
        """Replace all ``<object>`` children of ``root`` with the given boxes.

        The augmented boxes cannot be matched to the original ``<object>``
        elements by position (objects may have been skipped while reading or
        dropped by the augmentation), so the elements are rebuilt from scratch.
        """
        for stale in root.findall('object'):
            root.remove(stale)

        for box, cat_id in zip(bboxes, category_ids):
            obj = ET.SubElement(root, 'object')
            ET.SubElement(obj, 'name').text = self.labels[int(cat_id)]
            ET.SubElement(obj, 'pose').text = 'Unspecified'
            ET.SubElement(obj, 'truncated').text = '0'
            ET.SubElement(obj, 'difficult').text = '0'
            bndbox = ET.SubElement(obj, 'bndbox')
            for tag, value in zip(('xmin', 'ymin', 'xmax', 'ymax'), box[:4]):
                ET.SubElement(bndbox, tag).text = str(int(value))

    def _confirm_preview(self, image, bboxes, category_ids):
        """Show the augmented sample with its boxes; return True if ``s`` is pressed."""
        preview = image.copy()
        tl = 2
        for box, cat_id in zip(bboxes, category_ids):
            xmin, ymin, xmax, ymax = (int(v) for v in box[:4])
            text = f"{self.labels[int(cat_id)]}"
            t_size = cv2.getTextSize(text, 0, fontScale=tl / 3, thickness=tl)[0]
            cv2.rectangle(preview, (xmin, ymin - 3),
                          (xmin + t_size[0], ymin - t_size[1] - 3),
                          (0, 0, 255), -1, cv2.LINE_AA)  # filled label background
            cv2.putText(preview, text, (xmin, ymin - 2), 0, tl / 3, (255, 255, 255), tl,
                        cv2.LINE_AA)
            cv2.rectangle(preview, (xmin, ymin), (xmax, ymax), (255, 255, 0), 2)

        preview = cv2.resize(preview, (640, 384))
        cv2.namedWindow('aug_image_show', cv2.WINDOW_NORMAL)  # resizable window
        cv2.moveWindow('aug_image_show', 100, 100)  # keep the window on screen
        cv2.imshow('aug_image_show', preview)
        # Press "s" to keep this sample; any other key discards it.
        key = cv2.waitKey(0)
        return key & 0xff == ord('s')

    def save_aug_data(self, augmented, image_name, aug_idx, xml_filename=None):
        """Write one augmented image and its XML annotation.

        :param augmented: pipeline result with keys ``image``, ``bboxes`` and
            ``category_id``
        :param image_name: file name of the original image
        :param aug_idx: index of this augmentation, used in the output names
        :param xml_filename: name of the original annotation in ``pre_xml_path``;
            defaults to the image name with an ``.xml`` extension
        :return: True if the sample was saved, False if it was rejected in the
            preview window
        """
        aug_image = augmented['image']
        aug_bboxes = augmented['bboxes']
        aug_category_id = augmented['category_id']

        # splitext only strips the real extension, unlike str.replace which
        # would also rewrite the suffix text elsewhere in the name.
        base_name, image_ext = os.path.splitext(image_name)
        new_image_name = f"{base_name}_aug_{aug_idx}{image_ext}"
        new_xml_name = f"{base_name}_aug_{aug_idx}.xml"

        # Start from the original annotation so its remaining metadata is kept.
        if xml_filename is None:
            xml_filename = f"{base_name}.xml"
        root = ET.parse(os.path.join(self.pre_xml_path, xml_filename)).getroot()
        root.find('filename').text = new_image_name

        # Record the size of the augmented image.
        new_image_height, new_image_width = aug_image.shape[:2]
        size = root.find('size')
        size.find('width').text = str(new_image_width)
        size.find('height').text = str(new_image_height)

        self._replace_objects(root, aug_bboxes, aug_category_id)

        if self.is_show and not self._confirm_preview(aug_image, aug_bboxes, aug_category_id):
            return False

        cv2.imwrite(os.path.join(self.aug_image_save_path, new_image_name), aug_image)
        ET.ElementTree(root).write(os.path.join(self.aug_xml_save_path, new_xml_name))

        return True

# Directories of the original images and XML annotations
PRE_IMAGE_PATH = 'test/images'
PRE_XML_PATH = 'test/Annotations'

# Directories where the augmented images and XML annotations are saved
AUG_SAVE_IMAGE_PATH = 'test/images2'
AUG_SAVE_XML_PATH = 'test/Annotations2'

# Label list
LABELS = ['brush', 'comb', 'lron']

aug = VOCAug(
    pre_image_path=PRE_IMAGE_PATH,
    pre_xml_path=PRE_XML_PATH,
    aug_image_save_path=AUG_SAVE_IMAGE_PATH,
    aug_xml_save_path=AUG_SAVE_XML_PATH,
    labels=LABELS,
    num_augmentations=5,  # number of augmentations per image
    is_show=True,
)

aug.aug_image()

`

ternaus commented 1 month ago

Could you please:

  1. Provide image + boxes
  2. Minimal code to reproduce the issue?

Should be 50 lines max.

omaiyiwa commented 1 month ago

Could you please: 请问:

  1. Provide image + boxes提供图片+框
  2. Minimal code to reproduce the issue?重现问题的代码最少?

Should be 50 lines max.最多应为 50 行。

Hi, here is my picture with the corresponding XML, followed by the shortest possible reproduction code:

LABELS = ['brush', 'comb', 'lron'] A.Affine( rotate=(-15, 15), shear={"x": (-10, 10), "y": (-10, 10)}, p=0.5, ), bbox_params=A.BboxParams(format='pascal_voc', min_area=0., min_visibility=0., label_fields=['category_id'])

10 4 2 20_01_20240424162122783_0166

It doesn't seem to be possible to attach the XML directly, so I'll just provide the box coordinates: `

<name>brush</name>
  <xmin>1655</xmin>
  <ymin>149</ymin>
  <xmax>1852</xmax>
  <ymax>752</ymax>

<name>lron</name>
  <xmin>915</xmin>
  <ymin>0</ymin>
  <xmax>1206</xmax>
  <ymax>433</ymax>

<name>brush</name>
  <xmin>624</xmin>
  <ymin>419</ymin>
  <xmax>1053</xmax>
  <ymax>671</ymax>

`

omaiyiwa commented 1 month ago

1.4.11 version

ternaus commented 1 month ago

Here is what is going on.

Albumentations does not know the shape of the object inside the box, so it treats the object as if it occupied the whole area of the bbox. Before:

Screenshot 2024-07-24 at 15 57 27

Think of it as a black box occupying the area of the bbox:

Screenshot 2024-07-24 at 15 58 31

After rotation:

Screenshot 2024-07-24 at 15 59 05

And if we remove the black box, you will see that the new bbox is not tight around the object:

Screenshot 2024-07-24 at 16 00 10

Possible solutions: