vectorch-ai / ScaleLLM

A high-performance inference system for large language models, designed for production environments.
https://docs.vectorch.com/
Apache License 2.0
317 stars 24 forks source link

feat: Added prometheus metrics #210

Closed guocuimi closed 1 month ago

guocuimi commented 1 month ago
$ curl localhost:8080/metrics
# HELP request_status_total Total number of requests by status
# TYPE request_status_total counter
request_status_total{code="UNAVAILABLE"} 0
request_status_total{code="UNAUTHENTICATED"} 0
request_status_total{code="UNIMPLEMENTED"} 0
request_status_total{code="RESOURCE_EXHAUSTED"} 0
request_status_total{code="INVALID_ARGUMENT"} 0
request_status_total{code="DEADLINE_EXCEEDED"} 0
request_status_total{code="UNKNOWN"} 0
request_status_total{code="CANCELLED"} 0
request_status_total{code="OK"} 0

# HELP request_handling_latency_seconds Request handling latency in seconds
# TYPE request_handling_latency_seconds counter
request_handling_latency_seconds{type="completion"} 0
request_handling_latency_seconds{type="chat"} 0

# HELP tokenization_latency_seconds Prompt tokenization latency in seconds
# TYPE tokenization_latency_seconds counter
tokenization_latency_seconds 0

# HELP chat_template_latency_seconds Chat template latency in seconds
# TYPE chat_template_latency_seconds counter
chat_template_latency_seconds 0

# HELP scheduling_latency_seconds Latency of scheduling in seconds
# TYPE scheduling_latency_seconds counter
scheduling_latency_seconds 6.893800000000001e-05

# HELP prompt_tokens_total Total number of prompt tokens
# TYPE prompt_tokens_total counter
prompt_tokens_total 0

# HELP generated_tokens_total Total number of generated tokens
# TYPE generated_tokens_total counter
generated_tokens_total 0

# HELP detokenization_latency_seconds Latency of detokenization in seconds
# TYPE detokenization_latency_seconds counter
detokenization_latency_seconds{mode="non-stream"} 0
detokenization_latency_seconds{mode="stream"} 0

# HELP responding_latency_seconds Latency of responding in seconds
# TYPE responding_latency_seconds counter
responding_latency_seconds{mode="non-stream"} 0
responding_latency_seconds{mode="stream"} 0

# HELP speculative_execution_latency_seconds Execution latency in seconds
# TYPE speculative_execution_latency_seconds counter
speculative_execution_latency_seconds{stage="validation"} 0
speculative_execution_latency_seconds{stage="target"} 0
speculative_execution_latency_seconds{stage="draft"} 0

# HELP num_accepted_tokens_total Total number of accepted tokens in validation
# TYPE num_accepted_tokens_total counter
num_accepted_tokens_total 0

# HELP prepare_input_latency_seconds Latency of preparing input in seconds
# TYPE prepare_input_latency_seconds counter
prepare_input_latency_seconds 0

# HELP execution_latency_seconds Execution latency in seconds
# TYPE execution_latency_seconds counter
execution_latency_seconds{stage="sampling"} 0
execution_latency_seconds{stage="logits_processing"} 0
execution_latency_seconds{stage="model"} 0

# HELP num_model_execution_total Total number of model executions
# TYPE num_model_execution_total counter
num_model_execution_total{mode="eager"} 0
num_model_execution_total{mode="cuda_graph"} 0

# HELP prefix_cache_query_latency_seconds Latency of querying prefix cache in seconds
# TYPE prefix_cache_query_latency_seconds counter
prefix_cache_query_latency_seconds 0

# HELP prefix_cache_insert_latency_seconds Latency of inserting into prefix cache in seconds
# TYPE prefix_cache_insert_latency_seconds counter
prefix_cache_insert_latency_seconds 0

# HELP prefix_cache_match_length Length of matched prefix in tokens
# TYPE prefix_cache_match_length counter
prefix_cache_match_length 0

# HELP num_pending_requests Number of pending requests in scheduler
# TYPE num_pending_requests gauge
num_pending_requests 0

# HELP num_running_requests Number of running requests in scheduler
# TYPE num_running_requests gauge
num_running_requests 0

# HELP num_waiting_requests Number of waiting requests in scheduler
# TYPE num_waiting_requests gauge
num_waiting_requests 0

# HELP num_preempted_requests Number of preempted requests in scheduler
# TYPE num_preempted_requests gauge
num_preempted_requests 0

# HELP kv_cache_utilization_perc Utilization of the kv cache in percentage
# TYPE kv_cache_utilization_perc gauge
kv_cache_utilization_perc 2.746649088112503e-05

# HELP num_blocks_in_prefix_cache Number of blocks in the prefix cache
# TYPE num_blocks_in_prefix_cache gauge
num_blocks_in_prefix_cache 0

# HELP num_free_blocks Number of free blocks in the block allocator
# TYPE num_free_blocks gauge
num_free_blocks 36407

# HELP num_blocks_in_use Effective number of blocks in use
# TYPE num_blocks_in_use gauge
num_blocks_in_use 1

# HELP end_2_end_latency_seconds Histogram of end to end latency in seconds
# TYPE end_2_end_latency_seconds histogram
end_2_end_latency_seconds_count 0
end_2_end_latency_seconds_sum 0
end_2_end_latency_seconds_bucket{le="0.2"} 0
end_2_end_latency_seconds_bucket{le="0.5"} 0
end_2_end_latency_seconds_bucket{le="1"} 0
end_2_end_latency_seconds_bucket{le="2"} 0
end_2_end_latency_seconds_bucket{le="5"} 0
end_2_end_latency_seconds_bucket{le="10"} 0
end_2_end_latency_seconds_bucket{le="15"} 0
end_2_end_latency_seconds_bucket{le="20"} 0
end_2_end_latency_seconds_bucket{le="30"} 0
end_2_end_latency_seconds_bucket{le="60"} 0
end_2_end_latency_seconds_bucket{le="+Inf"} 0