langchain-ai / langsmith-sdk

LangSmith Client SDK Implementations
https://docs.smith.langchain.com/
MIT License

perf: cut down cpu time of aevaluate by 30% on 1-4MB examples with this one trick #1217

Closed · jakerachleff closed this 1 week ago

jakerachleff commented 1 week ago

A huge percentage of our aevaluate CPU time was spent serializing part of each example to send in the evaluator trace (where we used to show the first 10k characters). Let's just not do that.
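The expensive pattern was roughly this (a sketch for illustration; the function name and constant are hypothetical, only the 10k-character preview behavior comes from the description above):

import json

MAX_PREVIEW = 10_000  # the old "first 10k characters" evaluator-trace preview

def example_preview(example_inputs: dict) -> str:
    # json.dumps walks the entire 1-4MB payload before truncation ever
    # happens, so CPU cost scales with example size, not preview size.
    return json.dumps(example_inputs)[:MAX_PREVIEW]

After this PR, we skip building the preview entirely instead of serializing and truncating on every evaluator call.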

Experiment view before and after


Flame graphs before and after


Benchmarking code
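The function below references create_aevaluator, ls_client, and UPLOAD_BATCH_SIZE, which aren't shown in this snippet. A minimal stand-in setup so it runs end to end (the helper body, batch size, and the dummy feedback score are illustrative, not the exact ones I used):

import asyncio
from random import randint, uniform

from langsmith import Client
from langsmith.evaluation import aevaluate

ls_client = Client()
UPLOAD_BATCH_SIZE = 50  # illustrative; the real batch size isn't shown here

def create_aevaluator(name: str, llm_time: float):
    # Hypothetical helper: returns an async evaluator that sleeps to
    # simulate an LLM-as-judge call, then emits a dummy score.
    async def evaluator(run, example):
        await asyncio.sleep(llm_time)
        return {"key": name, "score": 1}
    return evaluator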

async def run_abenchmark(
    n_examples=200, 
    min_size=1000000, 
    max_size=4000000, 
    n_evaluators=5,
    min_llm_time=0.2,
    max_llm_time=1.2,
    n_concurrency=None
):

    # setup dataset: each example's inputs and outputs are ~0.5-2MB
    # strings, i.e. 1-4MB per example pair (randint needs ints, hence //)
    inputs = [
        {"key": "a" * randint(min_size // 2, max_size // 2)}
        for _ in range(n_examples)
    ]
    outputs = [
        {"key": "b" * randint(min_size // 2, max_size // 2)}
        for _ in range(n_examples)
    ]

    if ls_client.has_dataset(dataset_name="jake_benchmarking"):
        ls_client.delete_dataset(dataset_name="jake_benchmarking")

    print("Creating dataset...")
    dataset = ls_client.create_dataset("jake_benchmarking")

    print("Uploading examples...")
    for i in range(0, n_examples, UPLOAD_BATCH_SIZE):
        ls_client.create_examples(
            dataset_id=dataset.id,
            inputs=inputs[i:i+UPLOAD_BATCH_SIZE],
            outputs=outputs[i:i+UPLOAD_BATCH_SIZE]
        )

    # setup evaluators: each one sleeps a fixed, randomly drawn duration
    # to simulate an LLM-as-judge call
    evaluators = []
    for i in range(n_evaluators):
        evaluators.append(
            create_aevaluator(f"jake_benchmarking_{i}", uniform(min_llm_time, max_llm_time))
        )

    async def target(inputs: dict) -> dict:
        # simulate the application's LLM call, then echo a same-sized payload
        await asyncio.sleep(uniform(min_llm_time, max_llm_time))
        return {"value": "b" * len(inputs["key"])}

    print("Running evaluation...")
    await aevaluate(
        target,
        data=dataset.id,
        evaluators=evaluators,
        max_concurrency=n_concurrency,
        client=ls_client
    )
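
And to kick it off (assuming the setup above):

if __name__ == "__main__":
    asyncio.run(run_abenchmark())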