I'm experiencing a bug with the VOT toolkit. The tracking recall returned by the "longterm_ar" metric is > 1 for sequences with occlusions. I'm using an oracle tracker: for every frame it simply reports the ground-truth region.
# copy pasted from https://github.com/votchallenge/integration/blob/4aa5de6e3d87026e3ef516fa3ee02ee688d741ee/python/vot.py
import os
import collections
import numpy as np
try:
import trax
except ImportError:
raise Exception("TraX support not found. Please add trax module to Python path.")
if trax._ctypes.trax_version().decode("ascii") < "4.0.0":
raise ImportError("TraX version 4.0.0 or newer is required.")
Rectangle = collections.namedtuple("Rectangle", ["x", "y", "width", "height"])
Point = collections.namedtuple("Point", ["x", "y"])
Polygon = collections.namedtuple("Polygon", ["points"])
Empty = collections.namedtuple("Empty", [])
class VOT(object):
"""Base class for VOT toolkit integration in Python.
This class is only a wrapper around the TraX protocol and can be used for single or multi-object tracking.
    The wrapper assumes that the experiment will provide new objects only at the first frame and will fail otherwise.
"""
def __init__(self, region_format, channels=None, multiobject: bool = None):
"""Constructor for the VOT wrapper.
Args:
region_format: Region format options
channels: Channels that are supported by the tracker
multiobject: Whether to use multi-object tracking
"""
assert region_format in [
trax.Region.RECTANGLE,
trax.Region.POLYGON,
trax.Region.MASK,
]
if multiobject is None:
multiobject = os.environ.get("VOT_MULTI_OBJECT", "0") == "1"
if channels is None:
channels = ["color"]
elif channels == "rgbd":
channels = ["color", "depth"]
elif channels == "rgbt":
channels = ["color", "ir"]
elif channels == "ir":
channels = ["ir"]
else:
raise Exception("Illegal configuration {}.".format(channels))
self._trax = trax.Server(
[region_format],
[trax.Image.PATH],
channels,
metadata=dict(vot="python"),
multiobject=multiobject,
)
request = self._trax.wait()
assert request.type == "initialize"
self._objects = []
assert len(request.objects) > 0 and (multiobject or len(request.objects) == 1)
for object, _ in request.objects:
if isinstance(object, trax.Polygon):
self._objects.append(Polygon([Point(x[0], x[1]) for x in object]))
elif isinstance(object, trax.Mask):
self._objects.append(object.array(True))
else:
self._objects.append(Rectangle(*object.bounds()))
self._image = [x.path() for k, x in request.image.items()]
if len(self._image) == 1:
self._image = self._image[0]
self._multiobject = multiobject
self._trax.status(request.objects)
def region(self):
"""
Returns initialization region for the first frame in single object tracking mode.
Returns:
initialization region
"""
assert not self._multiobject
return self._objects[0]
def objects(self):
"""
Returns initialization regions for the first frame in multi object tracking mode.
Returns:
initialization regions for all objects
"""
return self._objects
def report(self, status, confidence=None):
"""
Report the tracking results to the client
Arguments:
status: region for the frame or a list of regions in case of multi object tracking
confidence: confidence for the object detection, used only in single object tracking mode
"""
def convert(region):
"""Convert region to TraX format"""
# If region is None, return empty region
if region is None:
return trax.Rectangle.create(0, 0, 0, 0)
assert isinstance(region, (Empty, Rectangle, Polygon, np.ndarray))
if isinstance(region, Empty):
return trax.Rectangle.create(0, 0, 0, 0)
elif isinstance(region, Polygon):
return trax.Polygon.create([(x.x, x.y) for x in region.points])
elif isinstance(region, np.ndarray):
return trax.Mask.create(region)
else:
return trax.Rectangle.create(
region.x, region.y, region.width, region.height
)
if not self._multiobject:
properties = {}
if not confidence is None:
properties["confidence"] = confidence
status = [(convert(status), properties)]
else:
assert isinstance(status, (list, tuple))
status = [(convert(x), {}) for x in status]
self._trax.status(status, {})
def frame(self):
"""
Get a frame (image path) from client
Returns:
absolute path of the image
"""
if hasattr(self, "_image"):
image = self._image
del self._image
return image
request = self._trax.wait()
# Only the first frame can declare new objects for now
assert request.objects is None or len(request.objects) == 0
if request.type == "frame":
image = [x.path() for k, x in request.image.items()]
if len(image) == 1:
return image[0]
return image
else:
return None
def quit(self):
"""Quit the tracker"""
if hasattr(self, "_trax"):
self._trax.quit()
def __del__(self):
"""Destructor for the tracker, calls quit."""
self.quit()
# MY CODE: an oracle tracker that reports the ground-truth region for every frame
class OracleTracker:
def __init__(self, root):
with open(root, "r") as f:
lines = f.readlines()
def parse_line(line):
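            # A line containing just "0" is treated as a frame without a
            # ground-truth annotation (the target is occluded / not visible).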
if line.strip() == "0":
return Empty()
else:
x, y, width, height = [float(x) for x in line.strip().split(",")]
return Rectangle(x, y, width, height)
self.groundtruth = [parse_line(line) for line in lines]
self.i = 1 # skip initialization frame
def track(self, imagefile):
box = self.groundtruth[self.i]
score = 1.0
self.i += 1
return box, score
def main():
tracker = OracleTracker(
os.path.join(os.path.dirname(__file__), "..", "sequences", "airplane", "groundtruth.txt")
)
# *****************************************
# VOT: Create VOT handle at the beginning
    # Then get the initialization region
# and the first image
# *****************************************
handle = VOT("rectangle")
# Process the first frame
imagefile = handle.frame()
if not imagefile:
exit(0)
selection = handle.region()
while True:
# *****************************************
# VOT: Call frame method to get path of the
# current image frame. If the result is
# null, the sequence is over.
# *****************************************
imagefile = handle.frame()
if not imagefile:
break
selection, score = tracker.track(imagefile)
# *****************************************
# VOT: Report the position of the object
# every frame using report method.
# *****************************************
handle.report(selection, score)
if __name__ == "__main__":
main()
To reproduce
mkdir repro
to create a workspace folder, then add the following files to it:
config.yaml
stack.yaml
trackers.ini (a sketch is shown after this list)
tracker/tracker.py (the script listed above)
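For completeness, a minimal trackers.ini for a Python/TraX tracker looks roughly like the sketch below; the section name has to match the tracker name passed to vot evaluate, and the paths value is illustrative rather than a verbatim copy of my file:
[oracle]
label = oracle
protocol = traxpython
command = tracker
# directory that contains tracker.py; adjust to the actual workspace location
paths = <workspace>/tracker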
run the evaluation with
vot evaluate oracle
get the report with
vot report oracle
For the oracle tracker the recall should be exactly 1 by construction, but the reported value is greater than 1 for sequences with occlusions.
Possible solution
After some debugging I found out that here
https://github.com/votchallenge/toolkit/blob/c26c38b63cab831d9a494a80ca07459ddc8a5d5a/vot/analysis/longterm.py#L65-L84
the recall seems to take into account every frame, and not just the frames with a ground-truth annotation, as required by Eq. (2) in "Performance evaluation methodology for long-term single-object tracking", Lukezic et al., 2021:
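Paraphrasing Eq. (2) from memory (notation adapted): with G_t the ground-truth region in frame t, A_t(\tau_\theta) the prediction reported with confidence above the threshold \tau_\theta, \Omega(\cdot,\cdot) the region overlap, and N_g the number of annotated frames,

\mathrm{Re}(\tau_\theta) = \frac{1}{N_g} \sum_{t\,:\,G_t \neq \emptyset} \Omega\big(A_t(\tau_\theta), G_t\big)

so the sum runs only over frames that actually have a ground-truth annotation. The current code sums the overlaps over all frames above the confidence threshold while still dividing by n_visible, which is how an oracle tracker on a sequence with occluded frames can end up with a recall above 1.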
What the code should do, in my opinion, is filter out from the overlaps and confidence arrays the values for frames without a ground-truth annotation, i.e. transform this line https://github.com/votchallenge/toolkit/blob/c26c38b63cab831d9a494a80ca07459ddc8a5d5a/vot/analysis/longterm.py#L82 into:
groundtruth_index = [i for i, region in enumerate(sequence.groundtruth()) if region.type is not RegionType.SPECIAL]
confidence_filtered = confidence[groundtruth_index]
overlaps_filtered = overlaps[groundtruth_index]
subset_filtered = confidence_filtered >= threshold
recall[i] = np.sum(overlaps_filtered[subset_filtered]) / n_visible
With this fix the recall reported for the oracle tracker no longer exceeds 1.
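To make the arithmetic concrete, here is a toy, self-contained illustration; the numbers are invented, and in particular the assumption that unannotated frames contribute 1.0 to the overlaps array is only for the example (any nonzero contribution has the same effect). The proposed fix above filters by index; the same idea is expressed here with a boolean mask.
import numpy as np

# 5 frames, the last 2 without a ground-truth annotation (occlusion).
overlaps = np.array([1.0, 1.0, 1.0, 1.0, 1.0])      # per-frame overlaps seen by the analysis (assumed)
confidence = np.array([1.0, 1.0, 1.0, 1.0, 1.0])    # the oracle reports full confidence everywhere
annotated = np.array([True, True, True, False, False])
threshold = 0.5
n_visible = int(annotated.sum())                     # 3 annotated frames

# Unfiltered sum, as described in the issue: every confident frame is counted.
subset = confidence >= threshold
recall_unfiltered = np.sum(overlaps[subset]) / n_visible          # 5 / 3 > 1

# Filtered sum, as in the proposed fix: only annotated frames are counted.
subset_filtered = (confidence >= threshold) & annotated
recall_filtered = np.sum(overlaps[subset_filtered]) / n_visible   # 3 / 3 = 1.0
print(recall_unfiltered, recall_filtered)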
Related issues
Might be related to https://github.com/votchallenge/toolkit/issues/63
Thank you for the comprehensive report and the pull request; I will review it and merge it if I do not see any problems, but that will happen in July when I have some time.