splitgraph / seafowl

Analytical database for data-driven Web applications 🪶
https://seafowl.io
Apache License 2.0
386 stars 9 forks source link

Add various object store cache Prometheus metrics #519

Closed mildbyte closed 2 months ago

mildbyte commented 2 months ago

Add various timing and sizing metrics to the object store cache:

Limitations:

This is built on top of https://github.com/splitgraph/seafowl/pull/518/ , which has some DataFusion MemoryManager metrics.

Sample metrics scrape from a TPC-H/DS run:

# HELP seafowl_object_store_cache_get_range_requested_bytes_total Bytes requested in get_range calls by DataFusion before caching
# TYPE seafowl_object_store_cache_get_range_requested_bytes_total counter
seafowl_object_store_cache_get_range_requested_bytes_total 1653431887

# HELP seafowl_object_store_requests_total Number of calls to the actual object store
# TYPE seafowl_object_store_requests_total counter
seafowl_object_store_requests_total{operation="get_range",status="success"} 73
seafowl_object_store_requests_total{operation="list",status="unknown"} 4587
seafowl_object_store_requests_total{operation="get",status="success"} 13761

# HELP seafowl_object_store_cache_hit_read_bytes_total Bytes read from the object store cache (hit)
# TYPE seafowl_object_store_cache_hit_read_bytes_total counter
seafowl_object_store_cache_hit_read_bytes_total{location="disk"} 3768333104
seafowl_object_store_cache_hit_read_bytes_total{location="memory"} 2763647

# HELP seafowl_object_store_cache_get_range_requests_total Number of get_range requests from DataFusion before caching
# TYPE seafowl_object_store_cache_get_range_requests_total counter
seafowl_object_store_cache_get_range_requests_total 23932

# HELP seafowl_object_store_cache_evicted_bytes Bytes evicted from cache
# TYPE seafowl_object_store_cache_evicted_bytes counter
seafowl_object_store_cache_evicted_bytes 0

# HELP seafowl_object_store_cache_disk_written_bytes_total Bytes written to on-disk cache
# TYPE seafowl_object_store_cache_disk_written_bytes_total counter
seafowl_object_store_cache_disk_written_bytes_total 13268070

# HELP grpc_requests Counter tracking gRPC request statistics
# TYPE grpc_requests counter
grpc_requests{path="/arrow.flight.protocol.FlightService/GetFlightInfo",status="13"} 373
grpc_requests{path="/arrow.flight.protocol.FlightService/GetFlightInfo",status="0"} 4661
grpc_requests{path="/arrow.flight.protocol.FlightService/DoGet",status="0"} 4661

# HELP seafowl_object_store_cache_warnings_total Number of times various cache race conditions were discovered (read-after-evict, double-write, double-delete)
# TYPE seafowl_object_store_cache_warnings_total counter
seafowl_object_store_cache_warnings_total{error="deletion"} 0
seafowl_object_store_cache_warnings_total{error="double_write"} 0
seafowl_object_store_cache_warnings_total{error="redownload"} 0

# HELP seafowl_object_store_cache_get_range_read_bytes_total Bytes downloaded from the upstream object store for get_range cache misses
# TYPE seafowl_object_store_cache_get_range_read_bytes_total counter
seafowl_object_store_cache_get_range_read_bytes_total 15893361

# HELP seafowl_object_store_cache_usage_bytes Approximate current occupation of the cache
# TYPE seafowl_object_store_cache_usage_bytes gauge
seafowl_object_store_cache_usage_bytes 13267128

# HELP seafowl_object_store_cache_capacity_bytes Total cache capacity
# TYPE seafowl_object_store_cache_capacity_bytes gauge
seafowl_object_store_cache_capacity_bytes 268435456

# HELP seafowl_object_store_request_latency_seconds Time-to-first-byte of various requests to the actual object store
# TYPE seafowl_object_store_request_latency_seconds summary
seafowl_object_store_request_latency_seconds{operation="get_range",status="success",quantile="0"} 0
seafowl_object_store_request_latency_seconds{operation="get_range",status="success",quantile="0.5"} 0
seafowl_object_store_request_latency_seconds{operation="get_range",status="success",quantile="0.9"} 0
seafowl_object_store_request_latency_seconds{operation="get_range",status="success",quantile="0.95"} 0
seafowl_object_store_request_latency_seconds{operation="get_range",status="success",quantile="0.99"} 0
seafowl_object_store_request_latency_seconds{operation="get_range",status="success",quantile="0.999"} 0
seafowl_object_store_request_latency_seconds{operation="get_range",status="success",quantile="1"} 0
seafowl_object_store_request_latency_seconds_sum{operation="get_range",status="success"} 0.23178057900000007
seafowl_object_store_request_latency_seconds_count{operation="get_range",status="success"} 73
seafowl_object_store_request_latency_seconds{operation="get",status="success",quantile="0"} 0.000781908
seafowl_object_store_request_latency_seconds{operation="get",status="success",quantile="0.5"} 0.0007819654274285743
seafowl_object_store_request_latency_seconds{operation="get",status="success",quantile="0.9"} 0.0007819654274285743
seafowl_object_store_request_latency_seconds{operation="get",status="success",quantile="0.95"} 0.0007819654274285743
seafowl_object_store_request_latency_seconds{operation="get",status="success",quantile="0.99"} 0.0007819654274285743
seafowl_object_store_request_latency_seconds{operation="get",status="success",quantile="0.999"} 0.0007819654274285743
seafowl_object_store_request_latency_seconds{operation="get",status="success",quantile="1"} 0.000781908
seafowl_object_store_request_latency_seconds_sum{operation="get",status="success"} 11.07166186999995
seafowl_object_store_request_latency_seconds_count{operation="get",status="success"} 13761
seafowl_object_store_request_latency_seconds{operation="list",status="unknown",quantile="0"} 0.000001859
seafowl_object_store_request_latency_seconds{operation="list",status="unknown",quantile="0.5"} 0.0000033102062906382345
seafowl_object_store_request_latency_seconds{operation="list",status="unknown",quantile="0.9"} 0.000004868619916292589
seafowl_object_store_request_latency_seconds{operation="list",status="unknown",quantile="0.95"} 0.000005156254853472572
seafowl_object_store_request_latency_seconds{operation="list",status="unknown",quantile="0.99"} 0.000006454712686792021
seafowl_object_store_request_latency_seconds{operation="list",status="unknown",quantile="0.999"} 0.000006454712686792021
seafowl_object_store_request_latency_seconds{operation="list",status="unknown",quantile="1"} 0.000012291
seafowl_object_store_request_latency_seconds_sum{operation="list",status="unknown"} 0.013296752000000018
seafowl_object_store_request_latency_seconds_count{operation="list",status="unknown"} 4587

# HELP seafowl_object_store_cache_disk_latency_seconds Time spent waiting for disk cache read / write
# TYPE seafowl_object_store_cache_disk_latency_seconds summary
seafowl_object_store_cache_disk_latency_seconds{operation="read",quantile="0"} 0.000084778
seafowl_object_store_cache_disk_latency_seconds{operation="read",quantile="0.5"} 0.00018950407426943935
seafowl_object_store_cache_disk_latency_seconds{operation="read",quantile="0.9"} 0.0002668830441480235
seafowl_object_store_cache_disk_latency_seconds{operation="read",quantile="0.95"} 0.00032911608360367857
seafowl_object_store_cache_disk_latency_seconds{operation="read",quantile="0.99"} 0.0004255619484064256
seafowl_object_store_cache_disk_latency_seconds{operation="read",quantile="0.999"} 0.0004255619484064256
seafowl_object_store_cache_disk_latency_seconds{operation="read",quantile="1"} 0.000481224
seafowl_object_store_cache_disk_latency_seconds_sum{operation="read"} 23.07007866300003
seafowl_object_store_cache_disk_latency_seconds_count{operation="read"} 29875
seafowl_object_store_cache_disk_latency_seconds{operation="write",quantile="0"} 0
seafowl_object_store_cache_disk_latency_seconds{operation="write",quantile="0.5"} 0
seafowl_object_store_cache_disk_latency_seconds{operation="write",quantile="0.9"} 0
seafowl_object_store_cache_disk_latency_seconds{operation="write",quantile="0.95"} 0
seafowl_object_store_cache_disk_latency_seconds{operation="write",quantile="0.99"} 0
seafowl_object_store_cache_disk_latency_seconds{operation="write",quantile="0.999"} 0
seafowl_object_store_cache_disk_latency_seconds{operation="write",quantile="1"} 0
seafowl_object_store_cache_disk_latency_seconds_sum{operation="write"} 0.030619615999999995
seafowl_object_store_cache_disk_latency_seconds_count{operation="write"} 81