widgetti / solara

A Pure Python, React-style Framework for Scaling Your Jupyter and Web Apps
https://solara.dev
MIT License
1.9k stars 140 forks source link

Slow duckdb queries #95

Closed legout closed 1 year ago

legout commented 1 year ago

Hello,

i´ve tested duckdb with solara and found, that executing queries in a solara app is much slower (5-10x) than running the same query in a jupyter notebook.

Here is some code for reproduction:

import duckdb
import pandas as pd
import solara
import pyarrow.dataset as pds
from typing import cast, Optional
import os
import threading
from solara.alias import rv
import tempfile
import time

TEMP_PATH = tempfile.mkdtemp()

def download_dataset(name: str = "netflix_titles"):
    df = pd.read_csv(
        f"https://raw.githubusercontent.com/adamerose/datasets/master/{name}.csv"
    )
    pd.concat([df for i in range(3)]).to_parquet(
        f"{TEMP_PATH}/{name}.parquet", index=False
    )

def register_table(
    con: duckdb.DuckDBPyConnection,
    name: str,
    path: str,
):
    if not os.path.exists(path):
        download_dataset(name)

    if not con.sql("SHOW TABLES"):
        ds = pds.dataset(path)
        con.register(name, ds)
    else:
        if name not in con.sql("SHOW TABLES").df()["name"].tolist():
            ds = pds.dataset(path)
            con.register(name, ds)

ALL_DATASETS1 = [
    "country_indicators",
    "diamonds",
    "gammas",
    "gapminder",
    "googleplaystore",
    "googleplaystore_reviews",
    "happiness",
    "harry_potter_characters",
    "iris",
    "mpg",
]
ALL_DATASETS2 = [
    "netflix_titles",
    "penguins",
    "planets",
    "pokemon",
    "seinfeld_episodes",
    "seinfeld_scripts",
    "stockdata",
    "tips",
    "titanic",
    "trump_tweets",
    "us_shooting_incidents",
]
DATASETS1 = solara.reactive([])
DATASETS2 = solara.reactive([])
DDB = duckdb.connect()

@solara.component
def Page():
    query, set_query = solara.use_state(
        "SELECT * FROM googleplaystore_reviews INNER JOIN googleplaystore ON googleplaystore_reviews.App=googleplaystore.App"
    )
    query_executed, set_query_executed = solara.use_state(cast(Optional[str], None))
    ddb_cursor, set_ddb_cursor = solara.use_state(DDB.cursor())
    registered_tables, set_registered_tables = solara.use_state([])
    table_hints, set_table_hints = solara.use_state({})

    def register_tables(cancel: threading.Event):
        for name in DATASETS1.value:
            register_table(
                con=ddb_cursor,
                path=f"{TEMP_PATH}/{name}.parquet",
                name=name,
            )
            set_registered_tables(registered_tables + [name])

        for name in DATASETS2.value:
            register_table(
                con=ddb_cursor,
                path=f"{TEMP_PATH}/{name}.parquet",
                name=name,
            )
            set_registered_tables(registered_tables + [name])

        # table_names = ddb_cursor.sql("SHOW TABLES").pl()["name"]
        if len(registered_tables):
            table_hints_ = {
                table_name: ddb_cursor.sql(f"FROM {table_name}").columns
                for table_name in registered_tables
            }
            set_table_hints(table_hints_)

    def run_query(cancel: threading.Event) -> pd.DataFrame:
        if not query_executed:
            return

        df = ddb_cursor.sql(query_executed).df()

        return df

    result: solara.Result[pd.DataFrame] = solara.use_thread(
        run_query, dependencies=[query_executed]
    )

    with solara.VBox() as main:
        with solara.Card("Toggle to register the dataset."):
            solara.ToggleButtonsMultiple(
                DATASETS1, ALL_DATASETS1, on_value=register_tables
            )
            solara.ToggleButtonsMultiple(
                DATASETS2, ALL_DATASETS2, on_value=register_tables
            )

        solara.SqlCode(
            label="Query Editor",
            query=query,
            tables=table_hints,
            on_query=set_query,
            height="240px",
        )
        enable_execute = (query != query_executed) or result.error is not None

        def execute():
            if not len(registered_tables):
                register_tables()

            set_query_executed(query)
            if query == query_executed and result.error:
                result.retry()  # type: ignore

        solara.Button("Execute", on_click=execute, disabled=not enable_execute)
        if result.error:
            solara.Error(f"Ooops {result.error}")
        elif not query:
            solara.Info("No query")
        elif result.value is not None:
            solara.Markdown(
                f"<details><summary>Query</summary> `{query_executed}`</details>"
            )
            df = result.value
            solara.DataFrame(df)

        elif query_executed is not None:
            with solara.Div():
                solara.Text("Loading data...")
                rv.ProgressCircular(indeterminate=True, class_="solara-progress")

        if result.value is not None:
            with solara.Card("Download"):
                with solara.Column():
                    df = result.value
                    solara.FileDownload(
                        data=df.to_csv(index=False),
                        filename="result.csv",
                        label="Download query result",
                    )

    return main

def run_query():
    download_dataset("googleplaystore")
    download_dataset("googleplaystore_reviews")
    ddb_cursor = DDB.cursor()
    register_table(
        ddb_cursor, "googleplaystore", path=f"{TEMP_PATH}/googleplaystore.parquet"
    )
    register_table(
        ddb_cursor,
        "googleplaystore_reviews",
        path=f"{TEMP_PATH}/googleplaystore_reviews.parquet",
    )

    start_time = time.time()
    df = ddb_cursor.sql(
        "SELECT * FROM googleplaystore_reviews INNER JOIN googleplaystore ON googleplaystore_reviews.App = googleplaystore.App"
    ).df()
    print(f"Execution time: {time.time()-start_time:.2f} seconds.")

if __name__ == "__main__":
    Page()
    # run_query()
maartenbreddels commented 1 year ago

Really love how you put this together.

Took me a while to spot the issue. You are converting the data to csv on each render. We anticipated this, so data can be a callable to generate the data only when requested. I.e.:

                    solara.FileDownload(
                        data=lambda: df.to_csv(index=False),
                        filename="result.csv",
                        label="Download query result",
                    )

Does that make sense?

legout commented 1 year ago

Thanks. Makes sense. 😊