grafana / mimir

Grafana Mimir provides horizontally scalable, highly available, multi-tenant, long-term storage for Prometheus.
https://grafana.com/oss/mimir/
GNU Affero General Public License v3.0
4.13k stars 529 forks source link

/cortex.Ingester/MetricsForLabelMatchers time consumption #9112

Closed wangjinxiang0522 closed 2 months ago

wangjinxiang0522 commented 2 months ago

Describe the bug

Calls to /cortex.Ingester/MetricsForLabelMatchers take a long time (see screenshot-20240827-190712).

To Reproduce

Steps to reproduce the behavior: run Mimir version 2.11.0 on Kubernetes, deployed with mimir-distributed.

Expected behavior

A clear and concise description of what you expected to happen.

Environment

# Mimir container image, pulled from reg.sprucetec.com.
image:
  pullPolicy: IfNotPresent
  tag: 'r281-93e069f'
  repository: 'reg.sprucetec.com/monitor/grafana/mimir'

# Chart-wide (`global`) settings.
global:
  podAnnotations:
    bucketSecretVersion: '0'
  # Currently disabled: load environment variables from the bucket Secret.
  # extraEnvFrom:
  #   - secretRef:
  #       name: mimir-bucket-secret
  dnsService: 'coredns'

# This turns off the built-in MinIO support
minio:
  enabled: false
# The `nginx` section is disabled; the `gateway` section is enabled instead.
nginx:
  enabled: false

# Gateway deployment (nginx image) with two replicas on the monitoring nodes.
gateway:
  enabledNonEnterprise: true
  nodeSelector:
    node-type: monitoring
  replicas: 2
  nginx:
    verboseLogging: false
    config:
      # Previously set as `gateway.nginxConfig.accessLogEnabled`. In the
      # mimir-distributed chart `nginxConfig` is the layout of the legacy
      # top-level `nginx` section; the gateway takes its nginx options from
      # `gateway.nginx.config`, so the old placement would be ignored and the
      # chart default would stay in effect.
      accessLogEnabled: false
      enableIPv6: false
    image:
      registry: reg.sprucetec.com
      repository: monitor/docker.io/nginxinc/nginx-unprivileged
      tag: 1.25-alpine
      pullPolicy: IfNotPresent
  service:
    legacyPorts: null
  resources:
    limits:
      memory: "2Gi"
      cpu: "2"
    requests:
      memory: "128Mi"
      cpu: "200m"
  # NOTE(review): GOMEMLIMIT and GOMAXPROCS are Go runtime settings; the
  # gateway container runs nginx, so they likely have no effect here - confirm.
  env:
    - name: GOMEMLIMIT
      value: "1800MiB"
    - name: GOMAXPROCS
      value: "2"

# Ingesters: 10 replicas on the monitoring nodes, zone awareness disabled.
ingester:
  replicas: 10
  zoneAwareReplication:
    enabled: false
  nodeSelector:
    node-type: monitoring
  resources:
    requests:
      cpu: 200m
      memory: 1Gi
    limits:
      cpu: '1'
      memory: 4Gi
  env:
    # Go runtime: soft memory limit at about 90% of the 4Gi container limit,
    # and one scheduler thread to match the 1-CPU limit.
    - name: GOMEMLIMIT
      value: 3686MiB
    - name: GOMAXPROCS
      value: '1'
    # Jaeger tracing: agent endpoint plus a constant sampler with parameter 1.
    - name: JAEGER_AGENT_HOST
      value: mc-collector-ta-collector.monitoring.svc.cluster.local
    - name: JAEGER_AGENT_PORT
      value: '6831'
    - name: JAEGER_SAMPLER_TYPE
      value: const
    - name: JAEGER_SAMPLER_PARAM
      value: '1'
  persistentVolume:
    enabled: true
    storageClass: monitoring-openebs-hostpath
    # storageClass: "monitoring-ceph-sc"
    size: 60Gi
  # NOTE(review): `replicas: 10` is set alongside autoscaling with
  # maxReplicas 10 - confirm which one the chart honours for ingesters.
  autoscaling:
    enabled: true
    minReplicas: 3
    maxReplicas: 10
    targetCPUUtilizationPercentage: 80
    targetMemoryUtilizationPercentage: 80
  # NOTE(review): this looks like the Mimir flag name
  # (-ingester.native-histograms-ingestion-enabled) rather than a chart value;
  # the config-file equivalent is `limits.native_histograms_ingestion_enabled`
  # under mimir.structuredConfig. Confirm whether this key has any effect here.
  native-histograms-ingestion-enabled: true

# Memcached exporter: enabled, image pulled from reg.sprucetec.com.
memcachedExporter:
  enabled: true
  image:
    pullPolicy: IfNotPresent
    tag: 'v0.14.3'
    repository: 'reg.sprucetec.com/monitor/memcached-exporter'

# Memcached image, pulled from reg.sprucetec.com.
memcached:
  image:
    pullPolicy: IfNotPresent
    tag: '1.6.25-alpine'
    repository: 'reg.sprucetec.com/monitor/memcached'

# Admin cache: two replicas on the monitoring nodes.
admin-cache:
  enabled: true
  replicas: 2
  nodeSelector:
    node-type: monitoring
  resources:
    requests:
      cpu: 100m
      memory: 128Mi
    limits:
      cpu: '2'
      memory: 8Gi
  # NOTE(review): GOMEMLIMIT/GOMAXPROCS only affect Go programs; if this
  # cache runs memcached they presumably do nothing - confirm.
  env:
    - name: GOMEMLIMIT
      value: 7372MiB
    - name: GOMAXPROCS
      value: '2'

# Chunks cache: four replicas on the monitoring nodes.
chunks-cache:
  enabled: true
  replicas: 4
  nodeSelector:
    node-type: monitoring
  resources:
    requests:
      cpu: 100m
      memory: 128Mi
    limits:
      cpu: '2'
      memory: 8Gi
  # NOTE(review): GOMEMLIMIT/GOMAXPROCS only affect Go programs; if this
  # cache runs memcached they presumably do nothing - confirm.
  env:
    - name: GOMEMLIMIT
      value: 7372MiB
    - name: GOMAXPROCS
      value: '2'
    # Jaeger tracing: agent endpoint plus a constant sampler with parameter 1.
    - name: JAEGER_AGENT_HOST
      value: mc-collector-ta-collector.monitoring.svc.cluster.local
    - name: JAEGER_AGENT_PORT
      value: '6831'
    - name: JAEGER_SAMPLER_TYPE
      value: const
    - name: JAEGER_SAMPLER_PARAM
      value: '1'

# Index cache: four replicas on the monitoring nodes.
index-cache:
  enabled: true
  replicas: 4
  nodeSelector:
    node-type: monitoring
  resources:
    requests:
      cpu: 100m
      memory: 128Mi
    limits:
      cpu: '2'
      memory: 8Gi
  # NOTE(review): GOMEMLIMIT/GOMAXPROCS only affect Go programs; if this
  # cache runs memcached they presumably do nothing - confirm.
  env:
    - name: GOMEMLIMIT
      value: 7372MiB
    - name: GOMAXPROCS
      value: '2'

# Metadata cache on the monitoring nodes (no explicit replica count set).
metadata-cache:
  enabled: true
  nodeSelector:
    node-type: monitoring
  resources:
    requests:
      cpu: 100m
      memory: 128Mi
    limits:
      cpu: '1'
      memory: 1Gi
  # NOTE(review): GOMEMLIMIT/GOMAXPROCS only affect Go programs; if this
  # cache runs memcached they presumably do nothing - confirm.
  env:
    - name: GOMEMLIMIT
      value: 900MiB
    - name: GOMAXPROCS
      value: '1'

# Results cache: four replicas on the monitoring nodes, allocatedMemory 1024.
results-cache:
  enabled: true
  replicas: 4
  allocatedMemory: 1024
  nodeSelector:
    node-type: monitoring
  resources:
    requests:
      cpu: 100m
      memory: 128Mi
    limits:
      cpu: '2'
      memory: 8Gi
  # NOTE(review): GOMEMLIMIT/GOMAXPROCS only affect Go programs; if this
  # cache runs memcached they presumably do nothing - confirm.
  env:
    - name: GOMEMLIMIT
      value: 7372MiB
    - name: GOMAXPROCS
      value: '2'
    # Jaeger tracing: agent endpoint plus a constant sampler with parameter 1.
    - name: JAEGER_AGENT_HOST
      value: mc-collector-ta-collector.monitoring.svc.cluster.local
    - name: JAEGER_AGENT_PORT
      value: '6831'
    - name: JAEGER_SAMPLER_TYPE
      value: const
    - name: JAEGER_SAMPLER_PARAM
      value: '1'

# Distributors: six replicas on the monitoring nodes.
distributor:
  replicas: 6
  nodeSelector:
    node-type: monitoring
  resources:
    requests:
      cpu: 100m
      memory: 512Mi
    limits:
      cpu: '2'
      memory: 4Gi
  env:
    # Go runtime: soft memory limit at about 90% of the 4Gi container limit,
    # scheduler threads matched to the 2-CPU limit.
    - name: GOMEMLIMIT
      value: 3684MiB
    - name: GOMAXPROCS
      value: '2'

# Queriers: five replicas on the monitoring nodes.
querier:
  replicas: 5
  nodeSelector:
    node-type: monitoring
  resources:
    requests:
      cpu: 100m
      memory: 128Mi
    limits:
      cpu: '2'
      memory: 4Gi
  env:
    # Go runtime: soft memory limit at about 90% of the 4Gi container limit,
    # scheduler threads matched to the 2-CPU limit.
    - name: GOMEMLIMIT
      value: 3684MiB
    - name: GOMAXPROCS
      value: '2'
# Disabled Jaeger tracing settings.
# NOTE(review): these are name/value pairs, which other sections of this
# file place under `env`, not `extraEnvFrom` - check before re-enabling.
#  extraEnvFrom:
#    - name: JAEGER_AGENT_HOST
#      value: mc-collector-ta-collector.monitoring.svc.cluster.local
#    - name: JAEGER_AGENT_PORT
#      value: "6831"
#    - name: JAEGER_SAMPLER_TYPE
#      value: const
#    - name: JAEGER_SAMPLER_PARAM
#      value: "1"

# Query frontends: two replicas on the monitoring nodes.
query_frontend:
  replicas: 2
  nodeSelector:
    node-type: monitoring
  resources:
    requests:
      cpu: 100m
      memory: 256Mi
    limits:
      cpu: '2'
      memory: 4Gi
  env:
    # Go runtime: soft memory limit at about 90% of the 4Gi container limit,
    # scheduler threads matched to the 2-CPU limit.
    - name: GOMEMLIMIT
      value: 3684MiB
    - name: GOMAXPROCS
      value: '2'
    # Jaeger tracing: agent endpoint plus a constant sampler with parameter 1.
    - name: JAEGER_AGENT_HOST
      value: mc-collector-ta-collector.monitoring.svc.cluster.local
    - name: JAEGER_AGENT_PORT
      value: '6831'
    - name: JAEGER_SAMPLER_TYPE
      value: const
    - name: JAEGER_SAMPLER_PARAM
      value: '1'

# Rulers: two replicas on the monitoring nodes.
ruler:
  replicas: 2
  nodeSelector:
    node-type: monitoring
  resources:
    requests:
      cpu: 100m
      memory: 256Mi
    limits:
      cpu: '2'
      memory: 4Gi
  env:
    # Go runtime: soft memory limit at about 90% of the 4Gi container limit,
    # scheduler threads matched to the 2-CPU limit.
    - name: GOMEMLIMIT
      value: 3684MiB
    - name: GOMAXPROCS
      value: '2'

# Alertmanager: two-replica StatefulSet with an 8Gi persistent volume.
alertmanager:
  replicas: 2
  statefulSet:
    enabled: true
  nodeSelector:
    node-type: monitoring
  persistentVolume:
    enabled: true
    storageClass: monitoring-openebs-hostpath
    # storageClass: "monitoring-ceph-sc"
    size: 8Gi
  resources:
    requests:
      cpu: 100m
      memory: 256Mi
    limits:
      cpu: '2'
      memory: 4Gi
  env:
    # Go runtime: soft memory limit at about 90% of the 4Gi container limit,
    # scheduler threads matched to the 2-CPU limit.
    - name: GOMEMLIMIT
      value: 3684MiB
    - name: GOMAXPROCS
      value: '2'

# Store gateways: two replicas, zone awareness disabled, 8Gi persistent volume.
# No `env` is set for this component in this file.
store_gateway:
  replicas: 2
  zoneAwareReplication:
    enabled: false
  nodeSelector:
    node-type: monitoring
  persistentVolume:
    enabled: true
    storageClass: monitoring-openebs-hostpath
    # storageClass: "monitoring-ceph-sc"
    size: 8Gi
  resources:
    requests:
      cpu: 700m
      memory: 256Mi
    limits:
      cpu: '2'
      memory: 4Gi
# Disabled Jaeger tracing settings.
# NOTE(review): these are name/value pairs, which other sections of this
# file place under `env`, not `extraEnvFrom` - check before re-enabling.
#  extraEnvFrom:
#    - name: JAEGER_AGENT_HOST
#      value: mc-collector-ta-collector.monitoring.svc.cluster.local
#    - name: JAEGER_AGENT_PORT
#      value: "6831"
#    - name: JAEGER_SAMPLER_TYPE
#      value: const
#    - name: JAEGER_SAMPLER_PARAM
#      value: "1"

# Compactors: two replicas with an 18Gi persistent volume each.
compactor:
  replicas: 2
  nodeSelector:
    node-type: monitoring
  persistentVolume:
    enabled: true
    storageClass: monitoring-openebs-hostpath
    # storageClass: "monitoring-ceph-sc"
    size: 18Gi
  resources:
    requests:
      cpu: 200m
      memory: 256Mi
    limits:
      cpu: '2'
      memory: 4Gi
  env:
    # Go runtime: soft memory limit at about 90% of the 4Gi container limit,
    # scheduler threads matched to the 2-CPU limit.
    - name: GOMEMLIMIT
      value: 3684MiB
    - name: GOMAXPROCS
      value: '2'

# Overrides exporter: enabled, two replicas on the monitoring nodes.
overrides_exporter:
  enabled: true
  replicas: 2
  nodeSelector:
    node-type: monitoring
  resources:
    limits:
      cpu: '2'
      memory: 4Gi
    requests:
      memory: 128Mi
      cpu: '100m'
  env:
    # Go runtime: soft memory limit at about 90% of the 4Gi container limit,
    # scheduler threads matched to the 2-CPU limit.
    - name: GOMEMLIMIT
      value: 3684MiB
    - name: GOMAXPROCS
      value: '2'

# Query schedulers: enabled, two replicas on the monitoring nodes.
query_scheduler:
  enabled: true
  replicas: 2
  nodeSelector:
    node-type: monitoring
  resources:
    limits:
      cpu: '2'
      memory: 4Gi
    requests:
      memory: 128Mi
      cpu: '100m'
  env:
    # Go runtime: soft memory limit at about 90% of the 4Gi container limit,
    # scheduler threads matched to the 2-CPU limit.
    - name: GOMEMLIMIT
      value: 3684MiB
    - name: GOMAXPROCS
      value: '2'
    # Jaeger tracing: agent endpoint plus a constant sampler with parameter 1.
    - name: JAEGER_AGENT_HOST
      value: mc-collector-ta-collector.monitoring.svc.cluster.local
    - name: JAEGER_AGENT_PORT
      value: '6831'
    - name: JAEGER_SAMPLER_TYPE
      value: const
    - name: JAEGER_SAMPLER_PARAM
      value: '1'

# Rollout operator: enabled, image v0.13.0 pulled from reg.sprucetec.com.
rollout_operator:
  enabled: true
  # Earlier single-string form of the image settings, kept for reference:
  #image: reg.sprucetec.com/monitor/grafana/rollout-operator:v0.13.0
  #imagePullPolicy: IfNotPresent
  image:
    pullPolicy: IfNotPresent
    tag: 'v0.13.0'
    repository: 'reg.sprucetec.com/monitor/grafana/rollout-operator'
  nodeSelector:
    node-type: monitoring
  resources:
    limits:
      cpu: '2'
      memory: 4Gi
    requests:
      memory: 128Mi
      cpu: '100m'
  env:
    # Go runtime: soft memory limit at about 90% of the 4Gi container limit,
    # scheduler threads matched to the 2-CPU limit.
    - name: GOMEMLIMIT
      value: 3684MiB
    - name: GOMAXPROCS
      value: '2'

# Mimir configuration file content, passed through the chart as structured
# config. Single-tenant; S3-compatible object storage; every ring uses the
# same external etcd cluster as its key-value store.
mimir:
  structuredConfig:
    multitenancy_enabled: false
    common:
      storage:
        backend: s3
        s3:
          # Quoted so the host:port value is always read as a string.
          endpoint: '10.12.1.116:9000'
          # NOTE: both credentials below are hard-coded literals, not values
          # injected from the environment. Prefer environment expansion (as in
          # the commented admin_client example, e.g. ${AWS_SECRET_ACCESS_KEY})
          # backed by a Kubernetes Secret.
          secret_access_key: minio
          access_key_id: minio
          # Plain HTTP to the object store.
          insecure: true
    # Uncomment when using Grafana Enterprise Metrics
    # admin_client:
    #   storage:
    #     s3:
    #       bucket_name: my-admin-bucket
    #       access_key_id: ${AWS_ACCESS_KEY_ID}
    #       endpoint: s3.amazonaws.com
    #       secret_access_key: ${AWS_SECRET_ACCESS_KEY}
    alertmanager_storage:
      s3:
        bucket_name: k8s-mimir-alertmanager
    blocks_storage:
      bucket_store:
        ignore_blocks_within: 20m  # default is 10h
      backend: s3
      s3:
        bucket_name: k8s-mimir
    ruler_storage:
      s3:
        bucket_name: k8s-mimir-rule
    memberlist:
      cluster_label: mimir
    limits:
      max_global_series_per_user: 0
      max_label_name_length: 102400
      max_label_value_length: 102400
      max_label_names_per_series: 100
      ingestion_rate: 10000000000
      out_of_order_time_window: 5m
      ruler_max_rule_groups_per_tenant: 0
      # Config-file form of -ingester.native-histograms-ingestion-enabled.
      # The values file sets `native-histograms-ingestion-enabled: true` under
      # the chart's `ingester` section, where it is not a Mimir setting; this
      # is the place Mimir reads it from.
      native_histograms_ingestion_enabled: true
    querier:
      query_store_after: 30m
      max_concurrent: 64
    ingester:
      ring:
        replication_factor: 3
        kvstore:
          store: etcd
          etcd:
            endpoints:
              - http://10.2.115.2:2379
              - http://10.2.116.2:2379
              - http://10.2.118.2:2379
    distributor:
      ring:
        kvstore:
          store: etcd
          etcd:
            endpoints:
              - http://10.2.115.2:2379
              - http://10.2.116.2:2379
              - http://10.2.118.2:2379
      remote_timeout: 30s
    query_scheduler:
      ring:
        kvstore:
          store: etcd
          etcd:
            endpoints:
              - http://10.2.115.2:2379
              - http://10.2.116.2:2379
              - http://10.2.118.2:2379
    ruler:
      query_frontend:
        address: dns:///mimir-query-frontend.monitoring.svc.cluster.local:9095
      ring:
        kvstore:
          store: etcd
          etcd:
            endpoints:
              - http://10.2.115.2:2379
              - http://10.2.116.2:2379
              - http://10.2.118.2:2379
    alertmanager:
      sharding_ring:
        kvstore:
          store: etcd
          etcd:
            endpoints:
              - http://10.2.115.2:2379
              - http://10.2.116.2:2379
              - http://10.2.118.2:2379
    compactor:
      compaction_interval: 10m
      data_dir: /data/
      deletion_delay: 1h
      first_level_compaction_wait_period: 25m
      max_closing_blocks_concurrency: 2
      max_opening_blocks_concurrency: 4
      sharding_ring:
        kvstore:
          store: etcd
          etcd:
            endpoints:
              - http://10.2.115.2:2379
              - http://10.2.116.2:2379
              - http://10.2.118.2:2379
    store_gateway:
      sharding_ring:
        kvstore:
          store: etcd
          etcd:
            endpoints:
              - http://10.2.115.2:2379
              - http://10.2.116.2:2379
              - http://10.2.118.2:2379

# ServiceMonitor for Mimir itself, created in the `monitoring` namespace and
# labelled `release: prometheus`.
metaMonitoring:
  serviceMonitor:
    enabled: true
    # interval: 60s
    labels:
      release: prometheus
    namespace: monitoring

Hi all, how should I adjust these parameters to reduce the time spent in /cortex.Ingester/MetricsForLabelMatchers? Thanks.

aknuds1 commented 2 months ago

Going to convert this to a discussion, since it's a request for help rather than reporting an actual bug.