lancedb / lance

Modern columnar data format for ML and LLMs implemented in Rust. Convert from Parquet in 2 lines of code for 100x faster random access, a vector index, and data versioning. Compatible with Pandas, DuckDB, Polars, and PyArrow, with more integrations coming.
https://lancedb.github.io/lance/
Apache License 2.0
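For context, the two-line Parquet conversion mentioned in the description looks roughly like this (file names are placeholders):

import lance
import pyarrow.parquet as pq

table = pq.read_table("data.parquet")  # placeholder Parquet file
lance.write_dataset(table, "data.lance")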

Performance test of file reading and using lance to read images #2635

Open mjq2020 opened 2 months ago

mjq2020 commented 2 months ago

1. Test code

import os
import argparse
import pyarrow as pa
import lance
import time
from tqdm import tqdm
from pycocotools.coco import COCO
import os.path as osp
import numpy as np
from PIL import Image

def process_images_detect(images_folder, split, schema, ann_file):
    coco = COCO(ann_file)
    images = coco.loadImgs(coco.getImgIds())
    images2id = {}
    for im_ann in images:
        images2id[im_ann["file_name"]] = im_ann["id"]

    image2ann = coco.imgToAnns

    # List the directory once instead of calling os.listdir twice.
    files = os.listdir(images_folder)
    for image_file in tqdm(files, total=len(files)):
        if not image_file.endswith(".jpg"):
            continue
        im_id = images2id[image_file]
        bboxes = []
        catids = []
        for ann in image2ann[im_id]:
            bboxes.append(ann["bbox"])
            catids.append(ann["category_id"])

        with open(osp.join(images_folder, image_file), "rb") as f:
            im = f.read()

        # Store the raw JPEG bytes and the serialized annotation arrays as binary columns.
        image_array = pa.array([im], type=pa.binary())
        filename_array = pa.array([str(image_file)], type=pa.string())
        bboxes_array = pa.array([np.asarray(bboxes).tobytes()], type=pa.binary())
        catid_array = pa.array([np.asarray(catids).tobytes()], type=pa.binary())
        labels = pa.array([image_file], type=pa.string())

        # Yield RecordBatch for each image
        yield pa.RecordBatch.from_arrays(
            [image_array, filename_array, bboxes_array, catid_array, labels],
            schema=schema,
        )

# Function to write PyArrow Table to Lance dataset
def write_to_lance(data_folder, dataset_name, schema):
    for split in ["train2017"]:
        lance_file_path = os.path.join(data_folder, f"{dataset_name}_{split}.lance")

        reader = pa.RecordBatchReader.from_batches(
            schema,
            process_images_detect(
                osp.join(data_folder,"train2017"),
                split,
                schema,
                osp.join(data_folder,"annotations/instances_train2017.json"),
            ),
        )
        lance.write_dataset(
            reader,
            lance_file_path,
            schema,
        )

def loading_into_pandas(images_folder, dataset_name):
    data_frames = {}  # Dictionary to store DataFrames for each data type

    batch_size = args.batch_size  # relies on the global args parsed in __main__
    for split in ["train2017"]:
        uri = os.path.join(images_folder, f"{dataset_name}_{split}.lance")
        ds = lance.dataset(uri)

        for batch in tqdm(
            ds.to_batches(columns=["image", "filename"], batch_size=batch_size),
            desc=f"Loading {split} batches",
        ):
            batch.to_pandas()  # result discarded; we only time the scan and conversion
    return data_frames

def load_file(data_dir):
    image_dir = osp.join(data_dir,"train2017")
    files = [osp.join(image_dir, f) for f in os.listdir(image_dir) if f.endswith(".jpg")]
    for f in tqdm(files):
        im = Image.open(f)  # note: Image.open is lazy; see the follow-up comment below

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process image dataset.")
    parser.add_argument(
        "--batch_size", type=int, default=10, help="Batch size for processing images"
    )
    parser.add_argument("--dataset", type=str, help="Path to the image dataset folder")
    parser.add_argument("--ann_file")

    args = parser.parse_args()
    dataset_path = args.dataset
    if dataset_path is None:
        raise ValueError(
            "Please provide the path to the image dataset folder using the --dataset argument."
        )
    # Extract dataset name
    dataset_name = os.path.basename(dataset_path)

    schema = pa.schema(
        [
            pa.field("image", pa.binary()),
            pa.field("filename", pa.string()),
            pa.field("bbox", pa.binary()),
            pa.field("catid", pa.binary()),
            pa.field("label", pa.string()),
        ]
    )

    # Run once first to build the .lance dataset before timing the reads below.
    # write_to_lance(dataset_path, dataset_name, schema)
    start = time.time()
    data_frames = loading_into_pandas(dataset_path, dataset_name)
    end = time.time()
    print(f"Lancedb Time(sec): {end - start:.2f}")
    start = time.time()
    load_file(dataset_path)
    end = time.time()
    print(f"File Time(sec): {end - start:.2f}")

Test data

COCO 2017 training set images, 118,287 images in total

Software and hardware information

pyarrow==15.0.0, pydantic==2.7.1, lancedb==0.10.1, pylance==0.14.1, numpy==1.26.3

Test results

When we compared the performance of Lance against reading images directly by file name, we found that on an SSD the Lance read is slower than the direct file read, while on an HDD the Lance read is faster, and by a larger margin. batch_size is set to 16. Below are the relevant screenshots of the test on the two devices:

On SSD: (screenshot)

On HDD: (screenshot)

I would like to ask whether this test result is reasonable, because it differs slightly from this test result. I would also like to know what causes it, so that we know in which cases to use Lance in the future.
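Regarding the cause: one variable worth isolating is batch_size, since per-batch overhead is amortized over more rows as it grows, and the value of 10-16 used here is quite small. A minimal sketch of such a sweep, assuming a dataset written by the script above (the uri value is a placeholder):

import time
import lance

uri = "/data/coco/coco_train2017.lance"  # placeholder path
ds = lance.dataset(uri)

# Time a full sequential scan of the image column at several batch sizes.
for batch_size in (16, 256, 1024):
    start = time.time()
    n = 0
    for batch in ds.to_batches(columns=["image"], batch_size=batch_size):
        n += batch.num_rows
    print(f"batch_size={batch_size}: {n} rows in {time.time() - start:.2f}s")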

mjq2020 commented 2 months ago

I found an error in the image loading: the open method of PIL's Image is lazy, and the image is only actually decoded after calling load(). After the fix, the test results on SSD are as follows; Lance is still slower, and this time the Lance path also fully decodes the image. (screenshot) The following is the Lance loading loop:

import io

import lance
from PIL import Image
from tqdm import tqdm

ds = lance.dataset(uri)  # uri points at the .lance dataset written above
for i in tqdm(range(118287)):
    # Random access: fetch a single row by index on each iteration.
    b = ds.take([i]).to_pydict()["image"][0]
    im = Image.open(io.BytesIO(b))
    im.load()  # force the full decode
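Not part of the original test, but a variant worth trying (my assumption, not something measured in this issue): batch the take calls so the per-call overhead is amortized over many rows. A minimal sketch, assuming the same dataset (the uri value is a placeholder):

import io

import lance
from PIL import Image
from tqdm import tqdm

uri = "/data/coco/coco_train2017.lance"  # placeholder path
ds = lance.dataset(uri)

num_rows = ds.count_rows()
step = 256  # rows fetched per take() call
for start in tqdm(range(0, num_rows, step)):
    indices = list(range(start, min(start + step, num_rows)))
    rows = ds.take(indices, columns=["image"])
    for b in rows.to_pydict()["image"]:
        im = Image.open(io.BytesIO(b))
        im.load()  # force the full decode, as in the fixed test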