OpenDendro / dplPy

The Dendrochronology Program Library for Python
https://opendendro.org/python/
GNU General Public License v3.0
7 stars 8 forks source link

Add the ability to read `https://`-hosted files to dplpy.readers() #66

Open tyson-swetnam opened 10 months ago

tyson-swetnam commented 10 months ago

dplpy.readers() should be able to read files both from local disk, e.g., /home/jovyan/data.rwl, and from URLs on paleo database websites, e.g., https://www.ncei.noaa.gov/pub/data/paleo/treering/measurements/africa/morc016.rwl

A minimal modification using `requests` and `io.StringIO`:

import requests
from io import StringIO

def readers(filename: str, skip_lines=0, header=False):
    # ... existing code ...

    # Determine the file format
    is_url = filename.startswith("https://")
    FORMAT = ".rwl" if filename.lower().endswith(".rwl") else ".csv"

    print("\nAttempting to read input file: " + os.path.basename(filename) + " as " + FORMAT + " format\n")

    # Handling URL input for .CSV and .RWL files
    if is_url:
        response = requests.get(filename)
        if response.status_code != 200:
            raise ValueError("Unable to download data from URL")

        # For CSV files
        if FORMAT == ".csv":
            data = StringIO(response.text)
            series_data = pd.read_csv(data, skiprows=skip_lines)

        # For RWL files
        elif FORMAT == ".rwl":
            data = StringIO(response.text)
            rwl_lines = data.readlines()
            rwl_data, first_date, last_date = read_rwl(rwl_lines)
            if rwl_data is None:
                return None
            series_data = process_rwl_data(rwl_data, first_date, last_date)

    elif filename.upper().endswith(".CSV"):
        series_data = pd.read_csv(filename, skiprows=skip_lines)
    elif filename.upper().endswith(".RWL"):
        series_data = process_rwl_pandas(filename, skip_lines, header)
    else:
        # ... existing error handling code ...

    # ... existing code for processing series_data ...

    return series_data

# New function to process RWL data from a dictionary into a DataFrame
def process_rwl_data(rwl_data, first_date, last_date):
    """Convert already-parsed RWL data into a DataFrame.

    Placeholder in this proposal: the body is to be filled with the
    conversion logic currently living in ``process_rwl_pandas``.

    Args:
        rwl_data: First value returned by ``read_rwl``; presumably a
            dictionary of series -- TODO confirm against ``read_rwl``.
        first_date: Second value returned by ``read_rwl``.
        last_date: Third value returned by ``read_rwl``.

    Returns:
        The DataFrame built from ``rwl_data``.
    """
    # ... existing logic from process_rwl_pandas function to convert rwl_data to DataFrame ...
    # ... return the DataFrame ...