A huge share of the time in our aevaluate runs was going to serializing part of each example to send in the evaluator trace (where we used to show the first 10k characters). Let's just not do that.
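For a sense of the cost, here is a rough, hypothetical illustration (not the actual SDK code path) of the kind of work being skipped: dumping a multi-MB example to JSON just to keep a 10k-character preview on every evaluator run. The payload size and iteration count are made up to match the benchmark below.

    import json
    import time

    example_inputs = {"key": "a" * 2_000_000}  # payload in the same range as the benchmark below

    start = time.perf_counter()
    for _ in range(1_000):  # roughly one preview per evaluator invocation
        preview = json.dumps(example_inputs)[:10_000]
    print(f"built 1000 previews in {time.perf_counter() - start:.2f}s")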
Experiment view before and after:
Flame graphs before and after:
Benchmarking code:
import asyncio
from random import randint, uniform

from langsmith import Client
from langsmith.evaluation import aevaluate

# UPLOAD_BATCH_SIZE and create_aevaluator come from elsewhere in the benchmark script
# (see the sketch below).
ls_client = Client()

async def run_abenchmark(
    n_examples=200,
    min_size=1_000_000,
    max_size=4_000_000,
    n_evaluators=5,
    min_llm_time=0.2,
    max_llm_time=1.2,
    n_concurrency=None,
):
    # Set up the dataset: every example carries a ~0.5-2 MB string payload.
    inputs = [
        {"key": "a" * randint(min_size // 2, max_size // 2)}
        for _ in range(n_examples)
    ]
    outputs = [
        {"key": "b" * randint(min_size // 2, max_size // 2)}
        for _ in range(n_examples)
    ]
    if ls_client.has_dataset(dataset_name="jake_benchmarking"):
        ls_client.delete_dataset(dataset_name="jake_benchmarking")
    print("Creating dataset...")
    dataset = ls_client.create_dataset("jake_benchmarking")
    print("Uploading examples...")
    for i in range(0, n_examples, UPLOAD_BATCH_SIZE):
        ls_client.create_examples(
            dataset_id=dataset.id,
            inputs=inputs[i:i + UPLOAD_BATCH_SIZE],
            outputs=outputs[i:i + UPLOAD_BATCH_SIZE],
        )

    # Set up evaluators that simulate LLM-judge latency.
    evaluators = []
    for i in range(n_evaluators):
        evaluators.append(
            create_aevaluator(f"jake_benchmarking_{i}", uniform(min_llm_time, max_llm_time))
        )

    # The target also simulates an LLM call, then echoes a payload of the same size.
    async def target(input):
        await asyncio.sleep(uniform(min_llm_time, max_llm_time))
        return {"value": "b" * len(input["key"])}

    print("Running evaluation...")
    await aevaluate(
        target,
        data=dataset.id,
        evaluators=evaluators,
        max_concurrency=n_concurrency,
        client=ls_client,
    )
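UPLOAD_BATCH_SIZE and create_aevaluator aren't shown above. A minimal sketch of what they might look like, assuming each evaluator just sleeps for its simulated LLM latency and returns a constant score; the names and values here are placeholders, not the actual benchmark helpers:

    import asyncio

    UPLOAD_BATCH_SIZE = 20  # placeholder value; the real batch size isn't shown above

    def create_aevaluator(name, llm_time):
        # Hypothetical helper: build an async evaluator that simulates an LLM judge
        # by sleeping, then returns a fixed feedback score under the given key.
        async def aevaluator(run, example):
            await asyncio.sleep(llm_time)
            return {"key": name, "score": 1}
        return aevaluator

With those in place, the benchmark can be kicked off from a script with asyncio:

    if __name__ == "__main__":
        asyncio.run(run_abenchmark())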