grafana / loki

Like Prometheus, but for logs.
https://grafana.com/loki
GNU Affero General Public License v3.0

Slow Data Loading in Grafana-Loki #8434

Open · ramRK1 opened this issue 1 year ago

ramRK1 commented 1 year ago

We are experiencing slow query results even after reviewing our configuration and the resources assigned to Loki. Our data is properly labelled; however, we still sometimes face slow data loading. We ingest around 0.8-1 TB per month and have restricted the maximum query range to 30 days. The maximum series for a single query is currently set to 10000, although it is exceeded in a few cases.

We are looking for suggestions to improve the overall performance of Grafana Loki. Can you recommend what to look at and improve, in terms of configuration and resources, to speed up query results?
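
For reference, the restrictions described above map to keys in Loki's limits_config. A minimal sketch of the relevant settings (max_query_length is an assumption for how the 30-day query window would be enforced; it does not appear in our values below):

limits_config:
  max_query_length: 720h     # assumed: caps the query range at 30 days
  max_query_series: 10000    # the per-query series limit mentioned above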

Thank you for your time and assistance.

---
loki-distributed:  
  global:
    clusterDomain: <cluster_domain>
    dnsService: "coredns"
  serviceMonitor:
    enabled: true
    namespace: loki
    labels:
      release: prom

  loki:
    revisionHistoryLimit: 2
    schemaConfig:
      configs:
        - from: 2022-11-09
          store: boltdb-shipper
          object_store: aws
          schema: v11
          index:
            prefix: loki_index_
            period: 24h
          chunks:
            prefix: loki_chunks_
            period: 24h
    containerSecurityContext:
      allowPrivilegeEscalation: true

    config: |
      auth_enabled: false
      server:
        http_listen_port: 3100
        http_server_read_timeout: 310s
        http_server_write_timeout: 310s
        grpc_server_max_concurrent_streams: 1000
        grpc_server_max_recv_msg_size: 1073741824
        grpc_server_max_send_msg_size: 1073741824
        log_level: info
      distributor:
        ring:
          kvstore:
            store: memberlist
      memberlist:
        join_members:
          - {{ include "loki.fullname" . }}-memberlist
      ingester:
        lifecycler:
          ring:
            kvstore:
              store: memberlist
            replication_factor: 2
        chunk_idle_period: 20m
        chunk_block_size: 262144
        chunk_target_size: 2621440
        chunk_encoding: snappy
        chunk_retain_period: 45s
        max_transfer_retries: 0
        concurrent_flushes: 32
        flush_check_period: 20s
        flush_op_timeout: 10m
        max_chunk_age: 4h
        wal:
          dir: /var/loki/wal
          enabled: true
          flush_on_shutdown: true
          replay_memory_ceiling: 4GB
      limits_config:
        enforce_metric_name: true
        reject_old_samples: true
        reject_old_samples_max_age: 168h
        max_cache_freshness_per_query: 5m
        retention_period: 2160h
        ingestion_rate_mb: 1024
        ingestion_burst_size_mb: 1024
        split_queries_by_interval: 1h
        max_query_series: 10000
        max_query_parallelism: 32
        per_stream_rate_limit: 500MB    
        per_stream_rate_limit_burst: 1000MB
      {{- if .Values.loki.schemaConfig}}
      schema_config:
      {{- toYaml .Values.loki.schemaConfig | nindent 2}}
      {{- end}}
      {{- if .Values.loki.storageConfig}}
      storage_config:
      {{- if .Values.indexGateway.enabled}}
      {{- $indexGatewayClient := dict "server_address" (printf "dns:///%s:9095" (include "loki.indexGatewayFullname" .)) }}
      {{- $_ := set .Values.loki.storageConfig.boltdb_shipper "index_gateway_client" $indexGatewayClient }}
      {{- end}}
      {{- toYaml .Values.loki.storageConfig | nindent 2}}
      {{- end}}
      chunk_store_config:
        max_look_back_period: 0s

      query_scheduler:
        max_outstanding_requests_per_tenant: 500
        grpc_client_config:
          max_recv_msg_size: 1073741824
          max_send_msg_size: 209715200
          grpc_compression: 'snappy'
        use_scheduler_ring: true
        scheduler_ring:
          kvstore:
            store: memberlist
      querier:
        query_timeout: 2m
        max_concurrent: 12
        query_ingesters_within: 2h
        engine:
          timeout: 3m
          max_look_back_period: 30s
      query_range:
        align_queries_with_step: true
        parallelise_shardable_queries: true
        max_retries: 5
        cache_results: true
        results_cache:
          cache:
            enable_fifocache: false
            redis:
              endpoint: <endpoint>:<port>
              password: <password>
      frontend_worker:
        frontend_address: {{ include "loki.queryFrontendFullname" . }}:9095
        parallelism: 6
        grpc_client_config:
          grpc_compression: 'snappy'
          max_recv_msg_size: 1073741824
          max_send_msg_size: 104857600
      frontend:
        log_queries_longer_than: 5s
        compress_responses: true
        max_outstanding_per_tenant: 2048
        tail_proxy_url: http://{{ include "loki.querierFullname" . }}:3100
      compactor:
        shared_store: filesystem
        retention_enabled: true
      ruler:
        storage:
          type: local
          local:
            directory: /etc/loki/rules
        ring:
          kvstore:
            store: memberlist
        rule_path: /tmp/loki/scratch
        alertmanager_url: https://alertmanager.xx
        external_url: https://alertmanager.xx
        enable_sharding: true
    storageConfig:
      index_cache_validity: 5m
      max_chunk_batch_size: 500
      index_queries_cache_config:
        enable_fifocache: true
        redis:
          endpoint: <endpoint>:<port>
          password: <password>
          idle_timeout: 60s
          tls_insecure_skip_verify: true
          expiration: 86400s
      boltdb_shipper:
        shared_store: s3
        active_index_directory: /var/loki/boltdb/index
        cache_location: /var/loki/boltdb/cache
        cache_ttl: 24h
      aws:
        s3: <s3_url>
        s3forcepathstyle: true
        bucketnames: <bucket_name>
        endpoint: <endpoint>:<port>
        access_key_id: <user>
        secret_access_key: <password>
        insecure: true
        sse_encryption: false
        http_config:
          idle_conn_timeout: 1m
          response_header_timeout: 10s
        backoff_config:
          min_period: 150ms
          max_period: 5s
          max_retries: 5

  ingester:
    replicas: 4
    persistence:
      enabled: false
      size: 10Gi
      storageClass: erasure-coded-block-storageclass
    resources:
      requests:
        memory: "2Gi"
        cpu: "500m"
      limits:
        memory: "5Gi"
        cpu: "2000m"

  gateway:
    enabled: true
    replicas: 5
    containerSecurityContext:
      allowPrivilegeEscalation: true
    image:
      tag: 1.23-alpine
    nginxConfig:
      serverSnippet: "client_max_body_size 50M; proxy_read_timeout 560; proxy_connect_timeout 530; proxy_send_timeout 590;"
    resources:
      requests:
        memory: "1Gi"
        cpu: "2000m"
      limits:
        memory: "3Gi"
        cpu: "5000m"
    affinity: {}
    ingress:
      enabled: true
      ingressClassName: 'nginx'
      annotations:
        nginx.org/proxy-connect-timeout: "530s"
        nginx.org/proxy-read-timeout: "560s"
        nginx.org/proxy-send-timeout: "590s"
        nginx.org/client_max_body_size: "100M"

      hosts:
        - host: <host_name>
          paths:
            - path: /
              pathType: Prefix
      tls:
        - secretName: <secret_name>
          hosts:
            - <host_name>
    basicAuth:
      enabled: true
      username: <username>
      password: <password>

  distributor:
    replicas: 2
    autoscaling:
      enabled: true
      minReplicas: 2
      maxReplicas: 3
      targetCPUUtilizationPercentage: 80
      targetMemoryUtilizationPercentage: 90
    resources:
      requests:
        memory: "1Gi"
        cpu: "400m"
      limits:
        memory: "3Gi"
        cpu: "1000m"

  querier:
    replicas: 4
    autoscaling:
      enabled: true
      minReplicas: 4
      maxReplicas: 8
      targetCPUUtilizationPercentage: 80
      targetMemoryUtilizationPercentage: 90
    persistence:
      enabled: true
      size: 20Gi
      storageClass: cephfs-fs-storageclass
    resources:
      requests:
        memory: "3Gi"
        cpu: "3000m"
      limits:
        memory: "5Gi"
        cpu: "5000m"
    affinity: {}
  queryFrontend:
    replicas: 4
    autoscaling:
      enabled: true
      minReplicas: 4
      maxReplicas: 10
      targetCPUUtilizationPercentage: 50
      targetMemoryUtilizationPercentage: 70
    resources:
      requests:
        memory: "1Gi"
        cpu: "500m"
      limits:
        memory: "4Gi"
        cpu: "2000m"
    affinity: {}
  compactor:
    enabled: true
    persistence:
      enabled: true
      size: 20Gi
      storageClass: cephfs-fs-storageclass
    serviceAccount:
      create: true
    resources:
      requests:
        memory: "1Gi"
        cpu: "300m"
      limits:
        memory: "3Gi"
        cpu: "1000m"

These are the Helm chart values.yaml contents, attached above. Looking forward to your suggestions.

kaushik-manish commented 1 year ago

I have a somewhat similar configuration for my Loki setup and am facing slowness as well. Can someone help with optimizing this config?

aditya-pingsafe commented 1 year ago

@kaushik-manish @ramRK1 This issue seems related to https://github.com/grafana/loki/issues/4790. I faced the same issue, and setting storage_config.aws.http_config.response_header_timeout: 5s in my config file resolved it.
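
For anyone applying this to the loki-distributed values above, the change would land in the existing storageConfig.aws.http_config block. A minimal sketch against the poster's values (only response_header_timeout changes; the surrounding keys are copied from the config above):

loki-distributed:
  loki:
    storageConfig:
      aws:
        http_config:
          idle_conn_timeout: 1m
          # fail fast on stalled S3 responses instead of waiting the full 10s
          response_header_timeout: 5s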