grafana / loki

Like Prometheus, but for logs.
https://grafana.com/loki
GNU Affero General Public License v3.0
23.97k stars 3.46k forks source link

Query frontend: too many unhealthy instances in the ring #14934

Open ahsifer opened 1 week ago

ahsifer commented 1 week ago

Describe the bug: Once in a while, Loki becomes very slow and even label queries fail. I have attached log lines from the query frontend, querier, and ingester.

To Reproduce Steps to reproduce the behavior:

helm ls
NAME    NAMESPACE   REVISION    UPDATED                                 STATUS      CHART           APP VERSION
loki    default     1           2024-11-14 03:01:14.312731379 +0300 +03 deployed    loki-6.19.0     3.2.0  

Environment:

Screenshots, Promtail config, or terminal output: These are the logs from the query-frontend:

{"caller":"retry.go:107","code":"Code(500)","end":"2024-11-14T18:15:45.436Z","end_delta":"-57.250601969s","err":"rpc error: code = Code(500) desc = too many unhealthy instances in the ring","length":"5m0s","level":"error","msg":"error processing request","org_id":"fake","query":"{log_level=\"Verbose\"} |= \"\"","query_hash":3288896298,"retry_in":"4.596118935s","start":"2024-11-14T18:10:45.436Z","start_delta":"4m2.749397134s","traceID":"1fdbad8d04dbd3a4","try":2,"ts":"2024-11-14T18:14:48.185404545Z","type":"queryrange.LokiRequest"}
{"caller":"retry.go:107","code":"Code(500)","end":"2024-11-14T18:15:45.414Z","end_delta":"-52.786385373s","err":"rpc error: code = Code(500) desc = too many unhealthy instances in the ring","length":"5m0s","level":"error","msg":"error processing request","org_id":"fake","query":"{log_level=\"Verbose\"} |= \"\"","query_hash":3288896298,"retry_in":"4.464963017s","start":"2024-11-14T18:10:45.414Z","start_delta":"4m7.213613697s","traceID":"29cec51b93debe20","try":3,"ts":"2024-11-14T18:14:52.627635884Z","type":"queryrange.LokiRequest"}
{"cache_chunk_bytes_fetched":0,"cache_chunk_bytes_stored":0,"cache_chunk_download_time":"0s","cache_chunk_hit":0,"cache_chunk_req":0,"cache_index_download_time":"0s","cache_index_hit":0,"cache_index_req":0,"cache_result_download_time":"0s","cache_result_hit":0,"cache_result_query_length_served":"0s","cache_result_req":0,"cache_stats_results_download_time":"0s","cache_stats_results_hit":0,"cache_stats_results_req":0,"cache_volume_results_download_time":"0s","cache_volume_results_hit":0,"cache_volume_results_req":0,"caller":"metrics.go:223","chunk_refs_fetch_time":"2.837299ms","component":"frontend","congestion_control_latency":"0s","disable_pipeline_wrappers":"false","duration":"8.963556604s","end_delta":"-52.791194475s","index_bloom_filter_ratio":"0.00","index_post_bloom_filter_chunks":0,"index_shard_resolver_duration":"0s","index_total_chunks":0,"ingester_chunk_compressed_bytes":"24kB","ingester_chunk_decompressed_bytes":"268kB","ingester_chunk_downloaded":0,"ingester_chunk_head_bytes":"23kB","ingester_chunk_matches":1,"ingester_chunk_refs":0,"ingester_post_filter_lines":197,"ingester_requests":3,"latency":"fast","length":"5m0s","level":"info","limit":500,"lines_per_second":21,"org_id":"fake","pipeline_wrapper_filtered_lines":0,"post_filter_lines":197,"query":"{log_level=\"Verbose\"} |= ``","query_hash":2248683790,"query_referenced_structured_metadata":false,"query_type":"limited","queue_time":"227µs","range_type":"range","returned_lines":0,"shards":1,"splits":0,"start_delta":"4m7.208805422s","status":"200","step":"200ms","store_chunks_download_time":"0s","throughput":"32kB","total_bytes":"291kB","total_bytes_structured_metadata":"1.8kB","total_entries":84,"total_lines":197,"traceID":"1fdbad8d04dbd3a4","ts":"2024-11-14T18:14:52.644858332Z"}

The following attached images are from the distributor pods' ring status page: Image Image Image

The following is my Loki configuration:

# Chart deployment mode for this release.
deploymentMode: Distributed
# Cluster DNS service and namespace handed to the chart.
global:
  dnsNamespace: kube-system
  dnsService: kube-dns
# Settings under the chart's `loki` key.
loki:
  # Multi-tenancy is switched off.
  auth_enabled: false
  schemaConfig:
    configs:
      # Single schema period, effective from 2024-10-25.
      - schema: v13
        store: tsdb
        object_store: s3
        from: '2024-10-25'
        index:
          period: 24h
          prefix: loki_index_
  # Index shipper paths and request hedging for the object store.
  storage_config:
    hedging:
      at: 300ms
      max_per_second: 15
      up_to: 2
    tsdb_shipper:
      # Address is resolved by the chart's own template helper.
      index_gateway_client:
        server_address: '{{ include "loki.indexGatewayAddress" . }}'
      active_index_directory: /var/loki/tsdb-shipper-active
      cache_location: /var/loki/tsdb-shipper-cache
      cache_ttl: 48h0m0s
      resync_interval: 5m
  # HTTP/gRPC server settings shared by the Loki components.
  server:
    # Listeners
    http_listen_port: 3100
    grpc_listen_port: 9095
    http_listen_conn_limit: 0
    grpc_server_max_concurrent_streams: 256
    # HTTP timeouts
    http_server_read_timeout: 5m
    http_server_write_timeout: 5m
    http_server_idle_timeout: 5m
    # Logging and instrumentation
    log_level: debug
    log_format: json
    log_source_ips_enabled: true
    log_request_headers: true
    register_instrumentation: true
  # Per-tenant limits, grouped by the path they apply to.
  limits_config:
    # --- Ingestion ---
    ingestion_rate_strategy: local
    ingestion_rate_mb: 100000
    ingestion_burst_size_mb: 100
    per_stream_rate_limit: 16MB
    per_stream_rate_limit_burst: 64MB
    max_line_size: 256KB
    max_line_size_truncate: true
    unordered_writes: true
    reject_old_samples: true
    reject_old_samples_max_age: 1w
    discover_service_name: []
    # --- Streams ---
    max_global_streams_per_user: 0
    max_streams_per_user: 0
    # --- Queries ---
    query_timeout: 5m
    max_query_length: 31d1h
    max_query_lookback: 4320h0m0s
    max_query_series: 300
    max_streams_matchers_per_query: 300
    max_entries_limit_per_query: 10000
    max_cache_freshness_per_query: 5m
    split_queries_by_interval: 3h
    tsdb_max_query_parallelism: 256
    tsdb_max_bytes_per_shard: 600MB
    volume_enabled: true
    # --- Retention ---
    retention_period: 4320h0m0s
  commonConfig:
    # Replicate each stream to 3 ingesters so the ring can tolerate one
    # unhealthy instance. With a factor of 1 there is no tolerance, and a
    # single unhealthy ingester makes reads fail with
    # "too many unhealthy instances in the ring".
    # Matches the 3 ingester replicas configured in this file.
    replication_factor: 3
  # Ingester chunk lifecycle and write-ahead log.
  ingester:
    wal:
      enabled: true
      dir: /var/loki/wal
      checkpoint_duration: 5m0s
      replay_memory_ceiling: 3GB
      flush_on_shutdown: true
    chunk_encoding: snappy
    chunk_idle_period: 1h
    max_chunk_age: 2h0m0s
    chunk_retain_period: 10s
  # Read path: querier settings, then the slow-query log threshold.
  querier:
    query_ingesters_within: 3h0m0s
    max_concurrent: 16
  frontend:
    log_queries_longer_than: 10s
  # Compaction and retention; delete requests are stored in s3.
  compactor:
    working_directory: /tmp/loki/compactor
    compaction_interval: 10m
    delete_request_store: s3
    retention_enabled: true
    retention_delete_delay: 1h
    retention_delete_worker_count: 32
  # Tracing is turned on (the log lines in this report carry a traceID).
  tracing:
    enabled: true
  # S3-compatible object storage used for chunks.
  storage:
    type: s3
    bucketNames:
      chunks: "loki-test"
    s3:
      endpoint: s3.net
      # Redacted placeholders. They must be quoted: a plain scalar that starts
      # with `*` is parsed as a YAML alias, and the file fails to load.
      # NOTE(review): prefer injecting real credentials from a secret rather
      # than committing them in this values file.
      secretAccessKey: "****"
      accessKeyId: "****"
      signatureVersion: v4
      s3ForcePathStyle: true
      insecure: false
      http_config:
        timeout: 15s
        # NOTE(review): TLS certificate verification is disabled here;
        # confirm this is intentional (e.g. a private CA) before production.
        insecure_skip_verify: true
      backoff_config:
        min_period: 100ms
        max_period: 3s
        max_retries: 5
# Ingester workload: three replicas, each with a persistent data claim.
ingester:
  replicas: 3
  maxUnavailable: 1
  zoneAwareReplication:
    enabled: false
  persistence:
    enabled: true
    claims:
      - name: data
        storageClass: local-storage
        size: 25G
        mountPath: /var/loki/
  # Prometheus scrape annotations.
  podAnnotations:
    prometheus.io/path: '/metrics'
    prometheus.io/port: '3100'
    prometheus.io/scrape: 'true'
# Querier workload: 3 replicas, autoscaled between 3 and 6 pods.
querier:
  replicas: 3
  maxUnavailable: 2
  # Prometheus scrape annotations.
  podAnnotations:
    prometheus.io/scrape: "true"
    prometheus.io/path: "/metrics"
    prometheus.io/port: "3100"
  autoscaling:
    enabled: true
    minReplicas: 3
    maxReplicas: 6
    targetCPUUtilizationPercentage: 70
    targetMemoryUtilizationPercentage: 50
  # No extra CLI arguments. Written as an explicit empty list because a bare
  # `extraArgs:` is an implicit null, not an empty list.
  extraArgs: []
# NGINX gateway in front of Loki: two replicas, basic auth, TLS on 443,
# exposed through a NodePort service.
gateway:
  enabled: true
  replicas: 2
  maxUnavailable: 1
  # Prometheus scrape annotations.
  podAnnotations:
    prometheus.io/scrape: "true"
    prometheus.io/path: "/metrics"
    prometheus.io/port: "3100"
  # Matches the `listen 443 ssl;` directive in serverSnippet below.
  containerPort: 443
  # NOTE(review): credentials are hard-coded in plain text (admin/admin);
  # consider sourcing them from a Kubernetes secret — confirm chart support.
  basicAuth:
    enabled: true
    username: admin
    password: admin
  service:
    type: NodePort
    nodePort: 31000
    port: 80
  # TLS key pair from the `loki-gateway-tls` secret, mounted where the
  # ssl_certificate directives in serverSnippet expect it.
  extraVolumes:
    - name: loki-gateway-tls
      secret:
        secretName: loki-gateway-tls
  extraVolumeMounts:
    - name: loki-gateway-tls
      mountPath: /etc/nginx/tls
      readOnly: true
  # Probe over HTTPS on the TLS listener port.
  readinessProbe:
    failureThreshold: 3
    httpGet:
      path: /
      port: 443
      scheme: HTTPS
  nginxConfig:
    # NOTE(review): presumably the scheme used for upstream URLs — verify
    # against the chart documentation.
    schema: http
    enableIPv6: false
    # Access-log format; the block content is passed to nginx verbatim.
    logFormat: |-
      main '$remote_addr - $remote_user [$time_local]  $status '
              '"$request" $body_bytes_sent "$http_referer" '
              '"$http_user_agent" "$http_x_forwarded_for"';
    # Extra server-block directives enabling TLS on port 443.
    # NOTE(review): ssl_ciphers mixes TLS 1.3 suite names with an IANA-style
    # TLS 1.2 name; verify that the OpenSSL build in the image accepts it.
    serverSnippet: |
      listen 443 ssl;
      ssl_certificate /etc/nginx/tls/tls.crt;
      ssl_certificate_key /etc/nginx/tls/tls.key;
      ssl_protocols TLSv1.2 TLSv1.3;
      ssl_ciphers 'TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256';
      ssl_prefer_server_ciphers on;
    # Sets X-Scope-OrgID from the authenticated user only when
    # .Values.loki.tenants is set.
    httpSnippet: >-
      {{ if .Values.loki.tenants }}proxy_set_header X-Scope-OrgID $remote_user;{{ end }}
    ssl: true
# Query frontend workload.
queryFrontend:
  replicas: 2
  maxUnavailable: 1
  # Prometheus scrape annotations.
  podAnnotations:
    prometheus.io/path: '/metrics'
    prometheus.io/port: '3100'
    prometheus.io/scrape: 'true'
# Query scheduler workload.
queryScheduler:
  replicas: 2
  maxUnavailable: 1
  # Prometheus scrape annotations.
  podAnnotations:
    prometheus.io/path: '/metrics'
    prometheus.io/port: '3100'
    prometheus.io/scrape: 'true'
# Distributor workload.
distributor:
  replicas: 3
  maxUnavailable: 1
  # Prometheus scrape annotations.
  podAnnotations:
    prometheus.io/path: '/metrics'
    prometheus.io/port: '3100'
    prometheus.io/scrape: 'true'
# Compactor workload (single replica).
compactor:
  replicas: 1
  # Prometheus scrape annotations.
  podAnnotations:
    prometheus.io/path: '/metrics'
    prometheus.io/port: '3100'
    prometheus.io/scrape: 'true'
# Index gateway workload with on-disk persistence.
indexGateway:
  replicas: 2
  maxUnavailable: 1
  persistence:
    enabled: true
    inMemory: false
    size: 24G
    storageClass: local-storage
  # Prometheus scrape annotations.
  podAnnotations:
    prometheus.io/path: '/metrics'
    prometheus.io/port: '3100'
    prometheus.io/scrape: 'true'
# The Helm test and the canary are both switched off.
test:
  enabled: false
lokiCanary:
  # Arguments carry the gateway basic-auth user and password.
  extraArgs:
    - '-user=admin'
    - '-pass=admin'
  enabled: false
# Memcached image and tag.
memcached:
  image:
    tag: '1.6.32'
    repository: memcached
# Results cache settings.
resultsCache:
  enabled: true
  replicas: 2
  port: 11211
  # Cache behaviour
  defaultValidity: 12h
  timeout: 500ms
  # Sizing and connections
  allocatedMemory: 1024
  maxItemMemory: 20
  connectionLimit: 16384
  # Write-back queue
  writebackBuffer: 500000
  writebackParallelism: 8
  writebackSizeLimit: 500MB
  # Prometheus scrape annotations (port 9150).
  podAnnotations:
    prometheus.io/path: '/metrics'
    prometheus.io/port: '9150'
    prometheus.io/scrape: 'true'
# Chunks cache settings, with persistence under /data.
chunksCache:
  enabled: true
  replicas: 2
  port: 11211
  # Cache behaviour
  defaultValidity: 0s
  timeout: 2000ms
  batchSize: 8
  parallelism: 5
  # Sizing and connections
  allocatedMemory: 1024
  maxItemMemory: 16
  connectionLimit: 16384
  # Write-back queue
  writebackBuffer: 500000
  writebackParallelism: 8
  writebackSizeLimit: 500MB
  persistence:
    enabled: true
    mountPath: /data
    storageClass: local-storage
    storageSize: 25G
  # Prometheus scrape annotations (port 9150).
  podAnnotations:
    prometheus.io/path: '/metrics'
    prometheus.io/port: '9150'
    prometheus.io/scrape: 'true'
# Components scaled to zero in this deployment, listed alphabetically.
backend:
  replicas: 0
bloomCompactor:
  replicas: 0
bloomGateway:
  replicas: 0
read:
  replicas: 0
ruler:
  replicas: 0
singleBinary:
  replicas: 0
write:
  replicas: 0