Take a look at the following function:

import pandas as pd import polars as pl import pyarrow as pa import pyarrow.parquet as pq from cytotable import convert from parsl.config import Config from parsl.executors import ThreadPoolExecutor import random import sqlite3

Constants for columns

COLUMNS = ( "TableNumber", "ImageNumber", "ObjectNumber", "Metadata_Well", "Metadata_Plate", "Cytoplasm_Parent_Cells", "Cytoplasm_Parent_Nuclei", )

Modified convert_parquet function to read data chunk by chunk

def convert_parquet( input_file, output_file, cols=COLUMNS, chunk_size=150000, thread=2, initial_offset=0, offset_step=100 ):

conn = sqlite3.connect(input_file)

# Get total number of rows in the image_table for processing
total_rows = pd.read_sql_query("SELECT COUNT(*) as count FROM image_table", conn)['count'][0]

# Define the schema
schema = pa.schema([
        ('Metadata_TableNumber', pa.int64()),
        ('Metadata_ImageNumber', pa.int64()),
        ('Metadata_Well', pa.string()),
        ('Metadata_Plate', pa.string()),
        ('cytoplasm_Metadata_TableNumber', pa.int64()),
        ('cytoplasm_Metadata_ImageNumber', pa.int64()),
        ('cytoplasm_Metadata_ObjectNumber', pa.int64()),
        ('cells_Metadata_ObjectNumber', pa.int64()),
        ('nuclei_Metadata_ObjectNumber', pa.int64())
    ])

# Create a Parquet writer
pq_writer = pq.ParquetWriter(output_file, schema, compression='gzip')

offset = initial_offset

while offset < total_rows:
    query_limit = f"LIMIT {offset_step} OFFSET {offset}"
    image_df = pd.read_sql_query(f"SELECT * FROM image_table {query_limit}", conn)
    cytoplasm_df = pd.read_sql_query(f"SELECT * FROM cytoplasm_table {query_limit}", conn)
    cells_df = pd.read_sql_query(f"SELECT * FROM cells_table {query_limit}", conn)
    nuclei_df = pd.read_sql_query(f"SELECT * FROM nuclei_table {query_limit}", conn)

    image_pl = pl.from_pandas(image_df)
    cytoplasm_pl = pl.from_pandas(cytoplasm_df)
    cells_pl = pl.from_pandas(cells_df)
    nuclei_pl = pl.from_pandas(nuclei_df)

    image_filtered = image_pl.select(['Metadata_TableNumber', 'Metadata_ImageNumber', 'Metadata_Well', 'Metadata_Plate'])

    # Perform join operations
    result = (
        image_filtered
        .join(cytoplasm_pl, on=['Metadata_TableNumber', 'Metadata_ImageNumber'], how='left')
        .join(cells_pl, left_on=['Metadata_TableNumber', 'Metadata_ImageNumber', 'Metadata_ObjectNumber'], right_on=['Metadata_TableNumber', 'Metadata_ImageNumber', 'Metadata_Cytoplasm_Parent_Cells'], how='left')
        .join(nuclei_pl, left_on=['Metadata_TableNumber', 'Metadata_ImageNumber', 'Metadata_ObjectNumber'], right_on=['Metadata_TableNumber', 'Metadata_ImageNumber', 'Metadata_Cytoplasm_Parent_Nuclei'], how='left')
    )

    # Convert the result to an Arrow table
    result_arrow = result.to_arrow()

    # Write the table to the Parquet file
    pq_writer.write_table(result_arrow)

    offset += offset_step

# Close the Parquet writer
pq_writer.close()

conn.close()

"""Convert sqlite profiles to parquet"""

hash_str = str(random.getrandbits(128))
parsl_config = Config(
                    executors=[
                        ThreadPoolExecutor(
                            max_threads=thread
                        )
                    ],
                    run_dir=f'./runinfo/{hash_str}'
                )

convert(
    source_path=input_file,
    dest_path=output_file,
    identifying_columns=cols,
    dest_datatype='parquet',
    chunk_size=chunk_size,
    preset="cell-health-cellprofiler-to-cytominer-database",
    joins=None,  # No joins needed here since it's already handled
    reload_parsl_config=True,
    parsl_config=parsl_config,
    sort_output=False, 
)

When I run it in jupyter notebook, the function does not work and raise some error regarding threading, but when I run the exact same code on python script it has no issue. I guess the issue is kernel and multithreading in jupyter notebook that this library can't handle.

cytomining / CytoTable

Issue Running in Jupyter Notebook for convert_parquet Function #221

Constants for columns

Modified convert_parquet function to read data chunk by chunk