google / JetStream

JetStream is a throughput and memory optimized engine for LLM inference on XLA devices, starting with TPUs (and GPUs in future -- PRs welcome).
Apache License 2.0
194 stars 24 forks source link

Request input/output size metrics #123

Closed Bslabe123 closed 1 month ago

Bslabe123 commented 1 month ago

Added the following metrics: `jetstream_request_input_length` and `jetstream_request_output_length` (both histograms).

Scrape results after running the following commands:

seq 50 | xargs -P 50 -n 1 curl --request POST --header "Content-type: application/json" -s localhost:8000/generate --data '{
    "prompt": "Can you provide a comprehensive and detailed overview of the history and development of artificial intelligence.",
    "max_tokens": 512
}'

seq 50 | xargs -P 50 -n 1 curl --request POST --header "Content-type: application/json" -s localhost:8000/generate --data '{
    "prompt": "Tell me about AI",
    "max_tokens": 250
}'
# HELP jetstream_request_input_length Number of input tokens per request
# TYPE jetstream_request_input_length histogram
jetstream_request_input_length_bucket{id="maxengine-server-58dc8c7895-p84bw",le="16.0"} 50.0
jetstream_request_input_length_bucket{id="maxengine-server-58dc8c7895-p84bw",le="32.0"} 100.0
jetstream_request_input_length_bucket{id="maxengine-server-58dc8c7895-p84bw",le="64.0"} 100.0
jetstream_request_input_length_bucket{id="maxengine-server-58dc8c7895-p84bw",le="128.0"} 100.0
jetstream_request_input_length_bucket{id="maxengine-server-58dc8c7895-p84bw",le="256.0"} 100.0
jetstream_request_input_length_bucket{id="maxengine-server-58dc8c7895-p84bw",le="512.0"} 100.0
jetstream_request_input_length_bucket{id="maxengine-server-58dc8c7895-p84bw",le="1024.0"} 100.0
jetstream_request_input_length_bucket{id="maxengine-server-58dc8c7895-p84bw",le="2048.0"} 100.0
jetstream_request_input_length_bucket{id="maxengine-server-58dc8c7895-p84bw",le="4096.0"} 100.0
jetstream_request_input_length_bucket{id="maxengine-server-58dc8c7895-p84bw",le="8192.0"} 100.0
jetstream_request_input_length_bucket{id="maxengine-server-58dc8c7895-p84bw",le="16384.0"} 100.0
jetstream_request_input_length_bucket{id="maxengine-server-58dc8c7895-p84bw",le="32768.0"} 100.0
jetstream_request_input_length_bucket{id="maxengine-server-58dc8c7895-p84bw",le="+Inf"} 100.0
jetstream_request_input_length_count{id="maxengine-server-58dc8c7895-p84bw"} 100.0
jetstream_request_input_length_sum{id="maxengine-server-58dc8c7895-p84bw"} 1150.0
# HELP jetstream_request_input_length_created Number of input tokens per request
# TYPE jetstream_request_input_length_created gauge
jetstream_request_input_length_created{id="maxengine-server-58dc8c7895-p84bw"} 1.7229010142987714e+09
# HELP jetstream_request_output_length Number of output tokens per request
# TYPE jetstream_request_output_length histogram
jetstream_request_output_length_bucket{id="maxengine-server-58dc8c7895-p84bw",le="1.0"} 0.0
jetstream_request_output_length_bucket{id="maxengine-server-58dc8c7895-p84bw",le="2.0"} 0.0
jetstream_request_output_length_bucket{id="maxengine-server-58dc8c7895-p84bw",le="5.0"} 0.0
jetstream_request_output_length_bucket{id="maxengine-server-58dc8c7895-p84bw",le="10.0"} 0.0
jetstream_request_output_length_bucket{id="maxengine-server-58dc8c7895-p84bw",le="20.0"} 0.0
jetstream_request_output_length_bucket{id="maxengine-server-58dc8c7895-p84bw",le="50.0"} 0.0
jetstream_request_output_length_bucket{id="maxengine-server-58dc8c7895-p84bw",le="100.0"} 0.0
jetstream_request_output_length_bucket{id="maxengine-server-58dc8c7895-p84bw",le="200.0"} 0.0
jetstream_request_output_length_bucket{id="maxengine-server-58dc8c7895-p84bw",le="500.0"} 50.0
jetstream_request_output_length_bucket{id="maxengine-server-58dc8c7895-p84bw",le="1000.0"} 100.0
jetstream_request_output_length_bucket{id="maxengine-server-58dc8c7895-p84bw",le="2000.0"} 100.0
jetstream_request_output_length_bucket{id="maxengine-server-58dc8c7895-p84bw",le="5000.0"} 100.0
jetstream_request_output_length_bucket{id="maxengine-server-58dc8c7895-p84bw",le="10000.0"} 100.0
jetstream_request_output_length_bucket{id="maxengine-server-58dc8c7895-p84bw",le="20000.0"} 100.0
jetstream_request_output_length_bucket{id="maxengine-server-58dc8c7895-p84bw",le="50000.0"} 100.0
jetstream_request_output_length_bucket{id="maxengine-server-58dc8c7895-p84bw",le="100000.0"} 100.0
jetstream_request_output_length_bucket{id="maxengine-server-58dc8c7895-p84bw",le="200000.0"} 100.0
jetstream_request_output_length_bucket{id="maxengine-server-58dc8c7895-p84bw",le="500000.0"} 100.0
jetstream_request_output_length_bucket{id="maxengine-server-58dc8c7895-p84bw",le="1e+06"} 100.0
jetstream_request_output_length_bucket{id="maxengine-server-58dc8c7895-p84bw",le="2e+06"} 100.0
jetstream_request_output_length_bucket{id="maxengine-server-58dc8c7895-p84bw",le="+Inf"} 100.0
jetstream_request_output_length_count{id="maxengine-server-58dc8c7895-p84bw"} 100.0
jetstream_request_output_length_sum{id="maxengine-server-58dc8c7895-p84bw"} 38200.0
# HELP jetstream_request_output_length_created Number of output tokens per request
# TYPE jetstream_request_output_length_created gauge
jetstream_request_output_length_created{id="maxengine-server-58dc8c7895-p84bw"} 1.722901039303407e+09