MrPowers / farsante

Fake Pandas / PySpark DataFrame creator
42 stars 6 forks source link

Make it easy to generate a fake timeseries dataset #1

Open MrPowers opened 1 year ago

MrPowers commented 1 year ago

Make it simple to generate time series data, e.g. a table of timestamped observations at a fixed interval.

MrPowers commented 1 year ago

Useful code snippet to leverage:

import itertools
from datetime import datetime, timedelta

import pyarrow as pa
import pyarrow.compute as pc
from deltalake import DeltaTable, write_deltalake

def record_observations(date: datetime, nrows: int = 1000) -> pa.Table:
    """Build a table of synthetic observations that all share one timestamp.

    Nothing is fetched from an external source: the ``value`` column is
    filled by ``pyarrow.compute.random``.

    Args:
        date: Timestamp stamped on every row. Its calendar day (``date.date()``)
            fills the ``date`` column, and the full datetime fills the
            ``timestamp`` column.
        nrows: Number of rows to generate. Defaults to 1000, the size that was
            previously hard-coded.

    Returns:
        A ``pyarrow.Table`` with ``nrows`` rows and the columns ``date``,
        ``timestamp`` and ``value``.
    """
    # Derive the calendar day once; every row of this batch shares it.
    day = date.date()
    return pa.table(
        {
            "date": pa.array([day] * nrows),
            "timestamp": pa.array([date] * nrows),
            "value": pc.random(nrows),
        }
    )

# Preview the table generated for a single timestamp (noon on 2021-01-01).
record_observations(datetime(2021, 1, 1, 12)).to_pandas()

# Lazy, unbounded stream of hourly timestamps starting at 2021-01-01 00:00.
hours_iter = map(
    lambda offset: datetime(2021, 1, 1) + timedelta(hours=offset),
    itertools.count(),
)

# Take the first 100 hours and append one batch per hour to the
# "observation_data" Delta table, partitioned by the "date" column.
for timestamp in itertools.islice(hours_iter, 100):
    batch = record_observations(timestamp)
    write_deltalake("observation_data", batch, partition_by=["date"], mode="append")