lancedb / lance

Modern columnar data format for ML and LLMs implemented in Rust. Convert from Parquet in 2 lines of code for 100x faster random access, a vector index, and data versioning. Compatible with Pandas, DuckDB, Polars, and PyArrow, with more integrations coming.
https://lancedb.github.io/lance/
Apache License 2.0
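For context, the two-line Parquet conversion mentioned in the description looks roughly like this (file names are placeholders):

import lance
import pyarrow.parquet as pq

table = pq.read_table("data.parquet")  # placeholder Parquet file
lance.write_dataset(table, "data.lance")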

Performance test of file reading and using lance to read images #2635

Open mjq2020 opened 2 months ago

mjq2020 commented 2 months ago

1. Test code

import os
import argparse
import pyarrow as pa
import lance
import time
from tqdm import tqdm
from pycocotools.coco import COCO
import os.path as osp
import numpy as np
from PIL import Image

def process_images_detect(images_folder, split, schema, ann_file):
    coco = COCO(ann_file)
    images = coco.loadImgs(coco.getImgIds())
    images2id = {}
    for im_ann in images:
        images2id[im_ann["file_name"]] = im_ann["id"]

    image2ann = coco.imgToAnns

    # List the directory once instead of calling os.listdir twice.
    files = os.listdir(images_folder)
    for image_file in tqdm(files, total=len(files)):
        if not image_file.endswith(".jpg"):
            continue
        im_id = images2id[image_file]
        bboxes = []
        catids = []
        for ann in image2ann[im_id]:
            bboxes.append(ann["bbox"])
            catids.append(ann["category_id"])

        with open(osp.join(images_folder, image_file), "rb") as f:
            im = f.read()

        # Store the raw JPEG bytes and the serialized annotation arrays as binary columns.
        image_array = pa.array([im], type=pa.binary())
        filename_array = pa.array([str(image_file)], type=pa.string())
        bboxes_array = pa.array([np.asarray(bboxes).tobytes()], type=pa.binary())
        catid_array = pa.array([np.asarray(catids).tobytes()], type=pa.binary())
        labels = pa.array([image_file], type=pa.string())

        # Yield RecordBatch for each image
        yield pa.RecordBatch.from_arrays(
            [image_array, filename_array, bboxes_array, catid_array, labels],
            schema=schema,
        )

# Function to write PyArrow Table to Lance dataset
def write_to_lance(data_folder, dataset_name, schema):
    for split in ["train2017"]:
        lance_file_path = os.path.join(data_folder, f"{dataset_name}_{split}.lance")

        reader = pa.RecordBatchReader.from_batches(
            schema,
            process_images_detect(
                osp.join(data_folder,"train2017"),
                split,
                schema,
                osp.join(data_folder,"annotations/instances_train2017.json"),
            ),
        )
        lance.write_dataset(
            reader,
            lance_file_path,
            schema,
        )

def loading_into_pandas(images_folder, dataset_name):
    data_frames = {}  # Dictionary to store DataFrames for each data type

    batch_size = args.batch_size  # relies on the global args parsed in __main__
    for split in ["train2017"]:
        uri = os.path.join(images_folder, f"{dataset_name}_{split}.lance")
        ds = lance.dataset(uri)

        for batch in tqdm(
            ds.to_batches(columns=["image", "filename"], batch_size=batch_size),
            desc=f"Loading {split} batches",
        ):
            batch.to_pandas()  # result discarded; we only time the scan and conversion
    return data_frames

def load_file(data_dir):
    image_dir = osp.join(data_dir,"train2017")
    files = [osp.join(image_dir, f) for f in os.listdir(image_dir) if f.endswith(".jpg")]
    for f in tqdm(files):
        im = Image.open(f)  # note: Image.open is lazy; see the follow-up comment below

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process image dataset.")
    parser.add_argument(
        "--batch_size", type=int, default=10, help="Batch size for processing images"
    )
    parser.add_argument("--dataset", type=str, help="Path to the image dataset folder")
    parser.add_argument("--ann_file")

    args = parser.parse_args()
    dataset_path = args.dataset
    if dataset_path is None:
        raise ValueError(
            "Please provide the path to the image dataset folder using the --dataset argument."
        )
    # Extract dataset name
    dataset_name = os.path.basename(dataset_path)

    schema = pa.schema(
        [
            pa.field("image", pa.binary()),
            pa.field("filename", pa.string()),
            pa.field("bbox", pa.binary()),
            pa.field("catid", pa.binary()),
            pa.field("label", pa.string()),
        ]
    )

    # Run once first to build the .lance dataset before timing the reads below.
    # write_to_lance(dataset_path, dataset_name, schema)
    start = time.time()
    data_frames = loading_into_pandas(dataset_path, dataset_name)
    end = time.time()
    print(f"Lancedb Time(sec): {end - start:.2f}")
    start = time.time()
    load_file(dataset_path)
    end = time.time()
    print(f"File Time(sec): {end - start:.2f}")

Test data

COCO 2017 training set images, 118,287 images in total

Software and hardware information

pyarrow==15.0.0, pydantic==2.7.1, lancedb==0.10.1, pylance==0.14.1, numpy==1.26.3

Test results

When we compared the performance of Lance against reading images directly by file name, we found that on an SSD the Lance read is slower than the direct file read, while on an HDD the Lance read is faster, and by a larger margin. batch_size is set to 16. Below are the relevant screenshots of the test on the two devices:

On SSD: (screenshot)

On HDD: (screenshot)

I would like to ask whether this test result is reasonable, because it differs slightly from this test result. I would also like to know what causes it, so that we know in which cases to use Lance in the future.
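Regarding the cause: one variable worth isolating is batch_size, since per-batch overhead is amortized over more rows as it grows, and the value of 10-16 used here is quite small. A minimal sketch of such a sweep, assuming a dataset written by the script above (the uri value is a placeholder):

import time
import lance

uri = "/data/coco/coco_train2017.lance"  # placeholder path
ds = lance.dataset(uri)

# Time a full sequential scan of the image column at several batch sizes.
for batch_size in (16, 256, 1024):
    start = time.time()
    n = 0
    for batch in ds.to_batches(columns=["image"], batch_size=batch_size):
        n += batch.num_rows
    print(f"batch_size={batch_size}: {n} rows in {time.time() - start:.2f}s")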

mjq2020 commented 2 months ago

I found an error in the image loading: the open method of PIL's Image is lazy, and the image is only actually decoded after calling load(). After the fix, the test results on SSD are as follows; Lance is still slower, and this time the Lance path also fully decodes the image. (screenshot) The following is the Lance loading loop:

import io

import lance
from PIL import Image
from tqdm import tqdm

ds = lance.dataset(uri)  # uri points at the .lance dataset written above
for i in tqdm(range(118287)):
    # Random access: fetch a single row by index on each iteration.
    b = ds.take([i]).to_pydict()["image"][0]
    im = Image.open(io.BytesIO(b))
    im.load()  # force the full decode
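Not part of the original test, but a variant worth trying (my assumption, not something measured in this issue): batch the take calls so the per-call overhead is amortized over many rows. A minimal sketch, assuming the same dataset (the uri value is a placeholder):

import io

import lance
from PIL import Image
from tqdm import tqdm

uri = "/data/coco/coco_train2017.lance"  # placeholder path
ds = lance.dataset(uri)

num_rows = ds.count_rows()
step = 256  # rows fetched per take() call
for start in tqdm(range(0, num_rows, step)):
    indices = list(range(start, min(start + step, num_rows)))
    rows = ds.take(indices, columns=["image"])
    for b in rows.to_pydict()["image"]:
        im = Image.open(io.BytesIO(b))
        im.load()  # force the full decode, as in the fixed test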