class Collection:
    def __init__(self, config_name: str, scrape_time: int):
        self.config_name = config_name
        self.scrape_time = scrape_time

    def __repr__(self):
        return f"Collection({self.config_name}, scrape_time={self.scrape_time})"


def segment_reindexing_lists(collections: list[Collection], hours_per_list: float) -> list[list[Collection]]:
    # Convert hours to seconds for the max capacity per bin
    max_capacity = hours_per_list * 3600
    # Sort collections by scrape_time in descending order for efficient packing
    sorted_collections = sorted(collections, key=lambda x: x.scrape_time, reverse=True)
    # List to store bins (each bin is a list of collections)
    bins = []
    for collection in sorted_collections:
        placed = False
        # Try to place the collection in an existing bin
        for b in bins:
            if sum(item.scrape_time for item in b) + collection.scrape_time <= max_capacity:
                b.append(collection)
                placed = True
                break
        # If it can't be placed in any existing bin, create a new one
        if not placed:
            bins.append([collection])
    return bins
# Example usage
collections = [
    Collection("Collection1", scrape_time=400),
    Collection("Collection2", scrape_time=8000),
    Collection("Collection3", scrape_time=300),
    Collection("Collection4", scrape_time=900),
    Collection("Collection5", scrape_time=100),
    Collection("Collection6", scrape_time=4000),
    Collection("Collection7", scrape_time=500),
    Collection("Collection8", scrape_time=800),
    Collection("Collection9", scrape_time=1000),
    Collection("Collection10", scrape_time=700),
    Collection("Collection11", scrape_time=15),
    Collection("Collection12", scrape_time=200),
    Collection("Collection13", scrape_time=15),
    Collection("Collection14", scrape_time=8000),
    Collection("Collection15", scrape_time=100),
    Collection("Collection16", scrape_time=4000),
    Collection("Collection17", scrape_time=500),
    Collection("Collection18", scrape_time=1000),
    Collection("Collection19", scrape_time=400),
    Collection("Collection20", scrape_time=300),
]

hours_per_list = 2  # Maximum time per list in hours
result = segment_reindexing_lists(collections, hours_per_list)
for i, group in enumerate(result):  # "group" avoids shadowing the builtin bin()
    print(f"Bin {i+1}: {group}, Total scrape_time: {sum(item.scrape_time for item in group)} seconds")
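As a quick sanity check (an addition, not part of the original snippet), every multi-collection list the function returns should fit the configured budget. Only a lone collection that is itself larger than the budget can push a list over, since a collection cannot be split:

max_seconds = hours_per_list * 3600
for group in result:
    total = sum(item.scrape_time for item in group)
    # Multi-collection lists must fit the budget; a single oversized
    # collection (e.g., 8000 s against a 7200 s budget) is allowed through.
    assert len(group) == 1 or total <= max_seconds, f"list over budget: {total}s > {max_seconds}s"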
Description
When new collections are added, we need to ensure they are brought into the reindexing fold. There may be several ways to accomplish this; one option is sketched below.
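One possible approach, continuing the snippet above: diff the full set of configured collections against those already scheduled, and segment only the difference. This is a minimal sketch; find_unscheduled and the Collection21 example are hypothetical, not an existing API.

def find_unscheduled(all_collections: list[Collection], scheduled: list[Collection]) -> list[Collection]:
    # Hypothetical helper: collections are matched by config_name,
    # which is assumed to be unique per collection.
    scheduled_names = {c.config_name for c in scheduled}
    return [c for c in all_collections if c.config_name not in scheduled_names]

# Example: Collection21 was just added and is not yet scheduled.
all_collections = collections + [Collection("Collection21", scrape_time=600)]
new_collections = find_unscheduled(all_collections, scheduled=collections)
new_lists = segment_reindexing_lists(new_collections, hours_per_list=2)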
Implementation Considerations
The snippet above is a first-fit-decreasing (FFD) heuristic: collections are sorted by scrape_time in descending order, and each one goes into the first list with enough remaining capacity. FFD does not guarantee the minimum number of lists, but it is simple and typically close to optimal for one-dimensional bin packing. Note the edge case above: a collection whose scrape_time exceeds the budget (8000 seconds against a 7200-second budget) ends up alone in an over-budget list, because a collection cannot be split across lists.
Deliverable
A mechanism that detects newly added collections and segments them into time-bounded reindexing lists.
Dependencies
depends on