class Collection:
    def __init__(self, config_name: str, scrape_time: int):
        self.config_name = config_name
        self.scrape_time = scrape_time

    def __repr__(self):
        return f"Collection({self.config_name}, scrape_time={self.scrape_time})"


def segment_reindexing_lists(collections: list[Collection], hours_per_list: float) -> list[list[Collection]]:
    # Convert hours to seconds for the max capacity per bin
    max_capacity = hours_per_list * 3600
    # Sort collections by scrape_time in descending order for efficient packing
    sorted_collections = sorted(collections, key=lambda x: x.scrape_time, reverse=True)
    # List to store bins (each bin is a list of collections)
    bins = []
    for collection in sorted_collections:
        placed = False
        # Try to place the collection in an existing bin
        for b in bins:
            if sum(item.scrape_time for item in b) + collection.scrape_time <= max_capacity:
                b.append(collection)
                placed = True
                break
        # If it can't be placed in any existing bin, create a new one
        if not placed:
            bins.append([collection])
    return bins
# Example usage
collections = [
    Collection("Collection1", scrape_time=400),
    Collection("Collection2", scrape_time=8000),
    Collection("Collection3", scrape_time=300),
    Collection("Collection4", scrape_time=900),
    Collection("Collection5", scrape_time=100),
    Collection("Collection6", scrape_time=4000),
    Collection("Collection7", scrape_time=500),
    Collection("Collection8", scrape_time=800),
    Collection("Collection9", scrape_time=1000),
    Collection("Collection10", scrape_time=700),
    Collection("Collection11", scrape_time=15),
    Collection("Collection12", scrape_time=200),
    Collection("Collection13", scrape_time=15),
    Collection("Collection14", scrape_time=8000),
    Collection("Collection15", scrape_time=100),
    Collection("Collection16", scrape_time=4000),
    Collection("Collection17", scrape_time=500),
    Collection("Collection18", scrape_time=1000),
    Collection("Collection19", scrape_time=400),
    Collection("Collection20", scrape_time=300),
]

hours_per_list = 2  # Maximum time per list in hours
result = segment_reindexing_lists(collections, hours_per_list)
for i, group in enumerate(result):  # "group" avoids shadowing the builtin bin()
    print(f"Bin {i+1}: {group}, Total scrape_time: {sum(item.scrape_time for item in group)} seconds")
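As a quick sanity check (an addition, not part of the original snippet), every multi-collection list the function returns should fit the configured budget. Only a lone collection that is itself larger than the budget can push a list over, since a collection cannot be split:

max_seconds = hours_per_list * 3600
for group in result:
    total = sum(item.scrape_time for item in group)
    # Multi-collection lists must fit the budget; a single oversized
    # collection (e.g., 8000 s against a 7200 s budget) is allowed through.
    assert len(group) == 1 or total <= max_seconds, f"list over budget: {total}s > {max_seconds}s"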
Description
When new collections are added, we need to ensure they are brought into the reindexing fold. There may be several ways to accomplish this; one option is sketched below.
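One possible approach, continuing the snippet above: diff the full set of configured collections against those already scheduled, and segment only the difference. This is a minimal sketch; find_unscheduled and the Collection21 example are hypothetical, not an existing API.

def find_unscheduled(all_collections: list[Collection], scheduled: list[Collection]) -> list[Collection]:
    # Hypothetical helper: collections are matched by config_name,
    # which is assumed to be unique per collection.
    scheduled_names = {c.config_name for c in scheduled}
    return [c for c in all_collections if c.config_name not in scheduled_names]

# Example: Collection21 was just added and is not yet scheduled.
all_collections = collections + [Collection("Collection21", scrape_time=600)]
new_collections = find_unscheduled(all_collections, scheduled=collections)
new_lists = segment_reindexing_lists(new_collections, hours_per_list=2)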
Implementation Considerations
The snippet above is a first-fit-decreasing (FFD) heuristic: collections are sorted by scrape_time in descending order, and each one goes into the first list with enough remaining capacity. FFD does not guarantee the minimum number of lists, but it is simple and typically close to optimal for one-dimensional bin packing. Note the edge case above: a collection whose scrape_time exceeds the budget (8000 seconds against a 7200-second budget) ends up alone in an over-budget list, because a collection cannot be split across lists.
Deliverable
A mechanism that detects newly added collections and segments them into time-bounded reindexing lists.
Dependencies
depends on