datamol-io / datamol

Molecular Processing Made Easy.
https://docs.datamol.io
Apache License 2.0
462 stars 48 forks source link

Add flexible file reader and saver #163

Closed zhu0619 closed 1 year ago

zhu0619 commented 1 year ago

Load a file without specifying the file reader explicitly; the reader is chosen solely from the file type/extension.

from typing import Optional
from typing import cast

import pandas as pd
import datamol as dm

# Suffixes that may follow a text-based extension (".csv", ".json"). The empty
# string stands for the plain, suffix-less form of the extension.
_COMPRESSED_SUFFIXES = (
    "",
    ".gz",
    ".bz2",
    ".zip",
    ".xz",
    ".zst",
    ".tar",
    ".tar.gz",
    ".tar.xz",
    ".tar.bz2",
)

# Maps each supported filetype name to the list of file extensions recognized
# for it. Insertion order is the order in which filetypes are tried.
EXTENSIONS_DICT = {
    "csv": [".csv" + suffix for suffix in _COMPRESSED_SUFFIXES],
    "excel": [".xlsx"],
    "parquet": [".parquet"],
    "json": [".json" + suffix for suffix in _COMPRESSED_SUFFIXES],
    "sdf": [".sdf"],
}

def guess_filetype(path: str) -> Optional[str]:
    """Return the filetype name matching the extension of `path`.

    The extension is compared case-insensitively against the entries of
    `EXTENSIONS_DICT`, so `data.CSV` and `data.csv` are both recognized.

    Args:
        path: Path (or URL) of the file whose type should be guessed.

    Returns:
        One of `csv, excel, parquet, json, sdf`, or `None` when `path` ends
        with none of the known extensions.
    """
    # Known extensions are all lowercase; normalize the path so that files
    # such as "DATA.CSV" are not rejected as unsupported.
    lowered = path.lower()

    for name, extensions in EXTENSIONS_DICT.items():
        # `str.endswith` accepts a tuple and checks every candidate in one call.
        if lowered.endswith(tuple(extensions)):
            return name

    return None

def open_dataframe(path: str) -> pd.DataFrame:
    """Read the file at `path` into a dataframe, picking the reader from its extension.

    Supported filetypes are `csv, excel, parquet, json, sdf`.

    Args:
        path: Location of the file to read.

    Returns:
        The content of the file as a dataframe.

    Raises:
        ValueError: If no reader matches the filetype of `path` (or the
            selected reader produced `None`).
    """

    # One reader per filetype name returned by `guess_filetype`.
    readers = {
        "csv": pd.read_csv,
        "excel": pd.read_excel,
        "parquet": pd.read_parquet,
        "json": pd.read_json,
        "sdf": lambda sdf_path: dm.read_sdf(sdf_path, as_df=True),
    }

    reader = readers.get(guess_filetype(path))
    frame = reader(path) if reader is not None else None

    if frame is None:
        raise ValueError(f"The file type of {path} is not supported.")

    # `cast` only informs the type checker; it does nothing at runtime.
    return cast(pd.DataFrame, frame)

def save_dataframe(data: pd.DataFrame, path: str):
    """Write `data` to `path`, picking the writer from the extension of `path`.

    Supported filetypes are `csv, excel, parquet, json, sdf`. The csv and
    excel writers are called with `index=False`; the parquet and json writers
    are called with their default arguments.

    Args:
        data: The dataframe to save.
        path: Destination of the file to write.

    Raises:
        ValueError: If the filetype of `path` is not one of the supported ones.
    """

    # One zero-argument writer per filetype name returned by `guess_filetype`;
    # nothing is written until the selected writer is called below.
    writers = {
        "csv": lambda: data.to_csv(path, index=False),
        "excel": lambda: data.to_excel(path, index=False),
        "parquet": lambda: data.to_parquet(path),
        "json": lambda: data.to_json(path),
        "sdf": lambda: dm.to_sdf(data, path),
    }

    write = writers.get(guess_filetype(path))
    if write is None:
        raise ValueError(f"The file type of {path} is not supported.")

    write()
dessygil commented 1 year ago

Hi @hadim, Thanks for all the help on the last issue. If you haven't begun working on this issue, would you mind if I gave it an attempt?

hadim commented 1 year ago

I haven't started working on that. If you want to give it a try, feel free!

dessygil commented 1 year ago

Hey @hadim, I'm going to get started on this, but before jumping in, I want to confirm the scope: @zhu0619 has already provided the code to be integrated into Datamol (I'm assuming under datamol/utils), so the only things left would be to integrate it and provide test cases. Did I understand this correctly?