IndexError: index 168 is out of bounds for dimension 0 with size 168 in keypointrcnn_loss

🐛 Describe the bug

Hi iam running in to the following error:

Traceback (most recent call last): File "/Users/eriktzschoppe/Documents/Masterarbeit/scripts/keyoint_detection/pytorch/train.py", line 28, in metric_logger = train_one_epoch(model, optimizer, data_loader_train, device, epoch, print_freq=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/Users/eriktzschoppe/Documents/Masterarbeit/scripts/keyoint_detection/pytorch/engine.py", line 30, in train_one_epoch loss_dict = model(images, targets) ^^^^^^^^^^^^^^^^^^^^^^ File "/Users/eriktzschoppe/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl return self._call_impl(*args, kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/Users/eriktzschoppe/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl return forward_call(*args, *kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/Users/eriktzschoppe/.local/lib/python3.12/site-packages/torchvision/models/detection/generalized_rcnn.py", line 105, in forward detections, detector_losses = self.roi_heads(features, proposals, images.image_sizes, targets) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/Users/eriktzschoppe/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl return self._call_impl(args, kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/Users/eriktzschoppe/.local/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl return forward_call(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/Users/eriktzschoppe/.local/lib/python3.12/site-packages/torchvision/models/detection/roi_heads.py", line 864, in forward rcnn_loss_keypoint = keypointrcnn_loss( ^^^^^^^^^^^^^^^^^^ File "/Users/eriktzschoppe/.local/lib/python3.12/site-packages/torchvision/models/detection/roi_heads.py", line 330, in keypointrcnn_loss keypoint_loss = F.cross_entropy(keypoint_logits[valid], keypoint_targets[valid])
IndexError: index 168 is out of bounds for dimension 0 with size 168

I can't figure out why this occurs. I want to train the build in rccn model on a custom dataset for object and keypoint detection. A good amount of my code is from https://medium.com/@alexppppp/how-to-train-a-custom-keypoint-detection-model-with-pytorch-d9af90e111da. They've done something similar. In my dataset for testing, I got 2 classes, each containing 7 keypoints. It's a bit strange because sometimes the model is training for some epochs and at some point the mentioned error occurs. I would be very thankful if sometime could help me to investigate. I'll provide a snippet of my dataset too.

Thanks
dataset.zip Training:

import torch
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader
from pathlib import Path
import json
from utils import KeypointsDataset, get_model, collate_fn
from engine import train_one_epoch

input_dir_train = Path("/Users/eriktzschoppe/Documents/Masterarbeit/daten/training_data/dataset_test/train")
input_dir_val = Path("/Users/eriktzschoppe/Documents/Masterarbeit/daten/training_data/dataset_test/val")

dataset_train = KeypointsDataset(input_dir_train)
dataset_val = KeypointsDataset(input_dir_val)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = get_model(train_keypoint_head=4, num_keypoints=7)
data_loader_train = DataLoader(dataset_train, batch_size=4, shuffle=False, collate_fn=collate_fn)
data_loader_test = DataLoader(dataset_val, batch_size=1, shuffle=False, collate_fn=collate_fn)

params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.001, momentum=0.9, weight_decay=0.0005)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.3)
num_epochs = 5

for epoch in range(num_epochs):
    metric_logger = train_one_epoch(model, optimizer, data_loader_train, device, epoch, print_freq=1)
    lr_scheduler.step()

Utils:

import torch
import json
from typing import Optional
from pathlib import Path
from torch.utils.data import Dataset
from PIL import Image
from torchvision.transforms import ToTensor
import torchvision
from torchvision.models.detection.rpn import AnchorGenerator
import datetime
import errno
import os
import time
from collections import defaultdict, deque

import torch
import torch.nn as nn
import torch.distributed as dist

class SmoothedValue:
    """Track a series of values and provide access to smoothed values over a
    window or the global series average.
    """

    def __init__(self, window_size=20, fmt=None):
        if fmt is None:
            fmt = "{median:.4f} ({global_avg:.4f})"
        self.deque = deque(maxlen=window_size)
        self.total = 0.0
        self.count = 0
        self.fmt = fmt

    def update(self, value, n=1):
        self.deque.append(value)
        self.count += n
        self.total += value * n

    def synchronize_between_processes(self):
        """
        Warning: does not synchronize the deque!
        """
        if not is_dist_avail_and_initialized():
            return
        t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda")
        dist.barrier()
        dist.all_reduce(t)
        t = t.tolist()
        self.count = int(t[0])
        self.total = t[1]

    @property
    def median(self):
        d = torch.tensor(list(self.deque))
        return d.median().item()

    @property
    def avg(self):
        d = torch.tensor(list(self.deque), dtype=torch.float32)
        return d.mean().item()

    @property
    def global_avg(self):
        return self.total / self.count

    @property
    def max(self):
        return max(self.deque)

    @property
    def value(self):
        return self.deque[-1]

    def __str__(self):
        return self.fmt.format(
            median=self.median, avg=self.avg, global_avg=self.global_avg, max=self.max, value=self.value
        )

def all_gather(data):
    """
    Run all_gather on arbitrary picklable data (not necessarily tensors)
    Args:
        data: any picklable object
    Returns:
        list[data]: list of data gathered from each rank
    """
    world_size = get_world_size()
    if world_size == 1:
        return [data]
    data_list = [None] * world_size
    dist.all_gather_object(data_list, data)
    return data_list

def reduce_dict(input_dict, average=True):
    """
    Args:
        input_dict (dict): all the values will be reduced
        average (bool): whether to do average or sum
    Reduce the values in the dictionary from all processes so that all processes
    have the averaged results. Returns a dict with the same fields as
    input_dict, after reduction.
    """
    world_size = get_world_size()
    if world_size < 2:
        return input_dict
    with torch.inference_mode():
        names = []
        values = []
        # sort the keys so that they are consistent across processes
        for k in sorted(input_dict.keys()):
            names.append(k)
            values.append(input_dict[k])
        values = torch.stack(values, dim=0)
        dist.all_reduce(values)
        if average:
            values /= world_size
        reduced_dict = {k: v for k, v in zip(names, values)}
    return reduced_dict

class MetricLogger:
    def __init__(self, delimiter="\t"):
        self.meters = defaultdict(SmoothedValue)
        self.delimiter = delimiter

    def update(self, **kwargs):
        for k, v in kwargs.items():
            if isinstance(v, torch.Tensor):
                v = v.item()
            assert isinstance(v, (float, int))
            self.meters[k].update(v)

    def __getattr__(self, attr):
        if attr in self.meters:
            return self.meters[attr]
        if attr in self.__dict__:
            return self.__dict__[attr]
        raise AttributeError(f"'{type(self).__name__}' object has no attribute '{attr}'")

    def __str__(self):
        loss_str = []
        for name, meter in self.meters.items():
            loss_str.append(f"{name}: {str(meter)}")
        return self.delimiter.join(loss_str)

    def synchronize_between_processes(self):
        for meter in self.meters.values():
            meter.synchronize_between_processes()

    def add_meter(self, name, meter):
        self.meters[name] = meter

    def log_every(self, iterable, print_freq, header=None):
        i = 0
        if not header:
            header = ""
        start_time = time.time()
        end = time.time()
        iter_time = SmoothedValue(fmt="{avg:.4f}")
        data_time = SmoothedValue(fmt="{avg:.4f}")
        space_fmt = ":" + str(len(str(len(iterable)))) + "d"
        if torch.cuda.is_available():
            log_msg = self.delimiter.join(
                [
                    header,
                    "[{0" + space_fmt + "}/{1}]",
                    "eta: {eta}",
                    "{meters}",
                    "time: {time}",
                    "data: {data}",
                    "max mem: {memory:.0f}",
                ]
            )
        else:
            log_msg = self.delimiter.join(
                [header, "[{0" + space_fmt + "}/{1}]", "eta: {eta}", "{meters}", "time: {time}", "data: {data}"]
            )
        MB = 1024.0 * 1024.0
        for obj in iterable:
            data_time.update(time.time() - end)
            yield obj
            iter_time.update(time.time() - end)
            if i % print_freq == 0 or i == len(iterable) - 1:
                eta_seconds = iter_time.global_avg * (len(iterable) - i)
                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
                if torch.cuda.is_available():
                    print(
                        log_msg.format(
                            i,
                            len(iterable),
                            eta=eta_string,
                            meters=str(self),
                            time=str(iter_time),
                            data=str(data_time),
                            memory=torch.cuda.max_memory_allocated() / MB,
                        )
                    )
                else:
                    print(
                        log_msg.format(
                            i, len(iterable), eta=eta_string, meters=str(self), time=str(iter_time), data=str(data_time)
                        )
                    )
            i += 1
            end = time.time()
        total_time = time.time() - start_time
        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
        print(f"{header} Total time: {total_time_str} ({total_time / len(iterable):.4f} s / it)")

def collate_fn(batch):
    return tuple(zip(*batch))

def mkdir(path):
    try:
        os.makedirs(path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

def setup_for_distributed(is_master):
    """
    This function disables printing when not in master process
    """
    import builtins as __builtin__

    builtin_print = __builtin__.print

    def print(*args, **kwargs):
        force = kwargs.pop("force", False)
        if is_master or force:
            builtin_print(*args, **kwargs)

    __builtin__.print = print

def is_dist_avail_and_initialized():
    if not dist.is_available():
        return False
    if not dist.is_initialized():
        return False
    return True

def get_world_size():
    if not is_dist_avail_and_initialized():
        return 1
    return dist.get_world_size()

def get_rank():
    if not is_dist_avail_and_initialized():
        return 0
    return dist.get_rank()

def is_main_process():
    return get_rank() == 0

def save_on_master(*args, **kwargs):
    if is_main_process():
        torch.save(*args, **kwargs)

def init_distributed_mode(args):
    if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
        args.rank = int(os.environ["RANK"])
        args.world_size = int(os.environ["WORLD_SIZE"])
        args.gpu = int(os.environ["LOCAL_RANK"])
    elif "SLURM_PROCID" in os.environ:
        args.rank = int(os.environ["SLURM_PROCID"])
        args.gpu = args.rank % torch.cuda.device_count()
    else:
        print("Not using distributed mode")
        args.distributed = False
        return

    args.distributed = True

    torch.cuda.set_device(args.gpu)
    args.dist_backend = "nccl"
    print(f"| distributed init (rank {args.rank}): {args.dist_url}", flush=True)
    torch.distributed.init_process_group(
        backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank
    )
    torch.distributed.barrier()
    setup_for_distributed(args.rank == 0)

class KeypointsDataset(Dataset):
    def __init__(self, input_dir):
        self.img_dir = input_dir / "images"
        self.annotations_dir = input_dir / "annotations"
        self.annotations_files = sorted(os.listdir(self.annotations_dir))
        self.images = sorted(list(self.img_dir.iterdir()))

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        image = Image.open(self.images[idx])
        image_tensor = ToTensor()(image)
        target = {}
        with open(os.path.join(self.annotations_dir, self.annotations_files[idx]), "r") as f:
            annotation = json.load(f)
        ##Convert bbox to pytorch format (x0,y0,x1,y1)
        ann_bbox = []
        for ann in annotation:
            tmp = ann["bbox"]
            x0, y0, width, height = tmp[0], tmp[1], tmp[2], tmp[3]
            x1, y1 = x0 + width, y0 + height
            ann_bbox.append([x0, y0, x1, y1])
        target = {
            "boxes": torch.Tensor(ann_bbox),
            "labels": torch.Tensor([ann["category_id"] for ann in annotation]).type(torch.int64),
            "keypoints": torch.Tensor([ann["keypoints"] for ann in annotation])
            }
        return image_tensor, target

def get_model(num_keypoints, train_keypoint_head:Optional = False, train_fpn: Optional = True, weights_path:Optional[Path] = None):

    anchor_generator = AnchorGenerator(sizes=(32, 64, 128, 256, 512), aspect_ratios=(0.25, 0.5, 0.75, 1.0, 2.0, 3.0, 4.0))
    model = torchvision.models.detection.keypointrcnn_resnet50_fpn(weights=None,
                                                                   weights_backbone=None,
                                                                   num_keypoints=num_keypoints,
                                                                   num_classes = 3, # Background is the first class, object is the second class
                                                                   rpn_anchor_generator=anchor_generator)
    for i,param in enumerate(model.parameters()):
        param.requires_grad = False

    if train_fpn==True:
      for param in model.backbone.fpn.parameters():
        param.requires_grad = True

    if weights_path:
        state_dict = torch.load(weights_path)
        model.load_state_dict(state_dict)

    out = nn.ConvTranspose2d(512, num_keypoints, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
    model.roi_heads.keypoint_predictor.kps_score_lowres = out

    return model

def collate_fn(batch):
    return tuple(zip(*batch))

Engine:

import math
import sys
import time

import torch
import torchvision.models.detection.mask_rcnn
import utils
from coco_eval import CocoEvaluator

def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter("lr", utils.SmoothedValue(window_size=1, fmt="{value:.6f}"))
    header = f"Epoch: [{epoch}]"

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1.0 / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = torch.optim.lr_scheduler.LinearLR(
            optimizer, start_factor=warmup_factor, total_iters=warmup_iters
        )

    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print(f"Loss is {loss_value}, stopping training")
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    return metric_logger

def _get_iou_types(model):
    model_without_ddp = model
    if isinstance(model, torch.nn.parallel.DistributedDataParallel):
        model_without_ddp = model.module
    iou_types = ["bbox"]
    if isinstance(model_without_ddp, torchvision.models.detection.MaskRCNN):
        iou_types.append("segm")
    if isinstance(model_without_ddp, torchvision.models.de
        iou_types.append("keypoints")
    return iou_types

Versions

[pip3] numpy==1.23.5 [pip3] torch==2.0.0 [pip3] torchaudio==2.0.0 [pip3] torchdata==0.6.0 [pip3] torchelastic==0.2.2 [pip3] torchtext==0.15.0 [pip3] torchvision==0.15.0 [pip3] triton==2.0.0 [conda] blas 1.0 mkl
[conda] ffmpeg 4.3 hf484d3e_0 pytorch [conda] mkl 2021.4.0 h06a4308_640
[conda] mkl-service 2.4.0 py310h7f8727e_0
[conda] mkl_fft 1.3.1 py310hd6ae3a3_0
[conda] mkl_random 1.2.2 py310h00e6091_0
[conda] numpy 1.23.5 py310hd5efca6_0
[conda] numpy-base 1.23.5 py310h8e6c178_0
[conda] pytorch 2.0.0 py3.10_cuda11.7_cudnn8.5.0_0 pytorch [conda] pytorch-cuda 11.7 h778d358_3 pytorch [conda] pytorch-mutex 1.0 cuda pytorch [conda] torchaudio 2.0.0 py310_cu117 pytorch [conda] torchdata 0.6.0 py310 pytorch [conda] torchelastic 0.2.2 pypi_0 pypi [conda] torchtext 0.15.0 py310 pytorch [conda] torchtriton 2.0.0 py310 pytorch [conda] torchvision 0.15.0 py310_cu117 pytorch

pytorch / vision

IndexError: index 168 is out of bounds for dimension 0 with size 168 in keypointrcnn_loss #8371

🐛 Describe the bug

Versions