grafana / loki

Like Prometheus, but for logs.
https://grafana.com/loki
GNU Affero General Public License v3.0

Getting err="too many failed ingesters" when running on Docker #2155

Closed · dorroddorrod closed this 4 years ago

dorroddorrod commented 4 years ago

When running Loki in separate-components mode (ingester, querier, distributor, table-manager), I get the following errors:

loki-querier | level=error ts=2020-06-01T09:22:37.9300754Z caller=pool.go:161 msg="error removing stale clients" err="too many failed ingesters"
loki-distributor | level=error ts=2020-06-01T09:22:38.2747819Z caller=pool.go:161 msg="error removing stale clients" err="too many failed ingesters"

My docker-compose.yaml:

version: '3.8'

services:

  redis:
    image: bitnami/redis:latest
    container_name: redis
    environment:
      ALLOW_EMPTY_PASSWORD: "yes"
    ports:
      - 6379

  loki-distributor:
    image: grafana/loki:1.5.0
    container_name: loki-distributor
    ports:
      - 3100
    volumes:
      - ./config.yaml:/etc/loki/config.yaml
    command: -config.file=/etc/loki/config.yaml -target=distributor

  loki-querier:
    image: grafana/loki:1.5.0
    container_name: loki-querier
    ports:
      - 3100
    volumes:
      - ./config.yaml:/etc/loki/config.yaml
    command: -config.file=/etc/loki/config.yaml -target=querier

  loki-ingester:
    image: grafana/loki:1.5.0
    container_name: loki-ingester
    ports:
      - 3100
    volumes:
      - ./config.yaml:/etc/loki/config.yaml
    command: -config.file=/etc/loki/config.yaml -target=ingester

  loki-table-manager:
    image: grafana/loki:1.5.0
    container_name: loki-table-manager
    ports:
      - 3100
    volumes:
      - ./config.yaml:/etc/loki/config.yaml
    command: -config.file=/etc/loki/config.yaml -target=table-manager

  etcd-1:
    container_name: etcd1
    image: quay.io/coreos/etcd:latest
    entrypoint: /usr/local/bin/etcd
    command:
      - '--name=etcd-1'
      - '--initial-advertise-peer-urls=http://etcd-1:2380'
      - '--listen-peer-urls=http://0.0.0.0:2380'
      - '--listen-client-urls=http://0.0.0.0:2379'
      - '--advertise-client-urls=http://etcd-1:2379'
      - '--initial-cluster-token=mys3cr3ttok3n'
      - '--heartbeat-interval=250'
      - '--election-timeout=1250'
      - '--initial-cluster=etcd-1=http://etcd-1:2380,etcd-2=http://etcd-2:2380,etcd-3=http://etcd-3:2380'
      - '--initial-cluster-state=new'
    ports:
      - 2379
    volumes:
      - etcd1:/etcd_data

  etcd-2:
    container_name: etcd2
    image: quay.io/coreos/etcd:latest
    entrypoint: /usr/local/bin/etcd
    command:
      - '--name=etcd-2'
      - '--initial-advertise-peer-urls=http://etcd-2:2380'
      - '--listen-peer-urls=http://0.0.0.0:2380'
      - '--listen-client-urls=http://0.0.0.0:2379'
      - '--advertise-client-urls=http://etcd-2:2379'
      - '--initial-cluster-token=mys3cr3ttok3n'
      - '--heartbeat-interval=250'
      - '--election-timeout=1250'
      - '--initial-cluster=etcd-1=http://etcd-1:2380,etcd-2=http://etcd-2:2380,etcd-3=http://etcd-3:2380'
      - '--initial-cluster-state=new'
    ports:
      - 2379
    volumes:
      - etcd2:/etcd_data

  etcd-3:
    container_name: etcd3
    image: quay.io/coreos/etcd:latest
    entrypoint: /usr/local/bin/etcd
    command:
      - '--name=etcd-3'
      - '--initial-advertise-peer-urls=http://etcd-3:2380'
      - '--listen-peer-urls=http://0.0.0.0:2380'
      - '--listen-client-urls=http://0.0.0.0:2379'
      - '--advertise-client-urls=http://etcd-3:2379'
      - '--initial-cluster-token=mys3cr3ttok3n'
      - '--heartbeat-interval=250'
      - '--election-timeout=1250'
      - '--initial-cluster=etcd-1=http://etcd-1:2380,etcd-2=http://etcd-2:2380,etcd-3=http://etcd-3:2380'
      - '--initial-cluster-state=new'
    ports:
      - 2379
    volumes:
      - etcd3:/etcd_data

  promtail:
    image: grafana/promtail:latest
    volumes:
      - /var/log:/var/log
      - ./promtail-config.yaml:/etc/promtail/docker-config.yaml
    command: -config.file=/etc/promtail/docker-config.yaml

  grafana:
    image: grafana/grafana:latest
    ports:
      - "3000:3000"

  e3w:
    image: soyking/e3w:latest
    volumes:
      - ./conf/config.default.ini:/app/conf/config.default.ini
    ports:
      - "8080:8080"
    depends_on:
      - etcd-3

volumes:
  etcd1:
  etcd2:
  etcd3:

My Loki config file:

auth_enabled: false

server:
  http_listen_port: 3100

ingester:
  lifecycler:
    ring:
      kvstore:
        store: etcd
        etcd:
          endpoints:
            - http://etcd-1:2379
            - http://etcd-2:2379
            - http://etcd-3:2379
          dial_timeout: 10s
          max_retries: 10
      heartbeat_timeout: 1m
      replication_factor: 3
    num_tokens: 128
    heartbeat_period: 5s
    join_after: 0s
    min_ready_duration: 10s
    interface_names:
      - "eth0"
    final_sleep: 30s
  chunk_idle_period: 5m
  chunk_retain_period: 30s

schema_config:
  configs:
    - from: 2020-05-15
      store: aws
      object_store: s3
      schema: v11
      index:
        prefix: loki_

storage_config:
  aws:
    s3: s3://*****@eu-west-1/eu-west-1-test-loki
    dynamodb:
      dynamodb_url: dynamodb://*******@eu-west-1

limits_config:
  enforce_metric_name: false
  reject_old_samples: true
  reject_old_samples_max_age: 168h

chunk_store_config:
  chunk_cache_config:
    redis:
        endpoint: "redis:6379"
        timeout: 100ms
        expiration: 0s
        max_idle_conns: 80
        max_active_conns: 0
  max_look_back_period: 0s
  write_dedupe_cache_config:
    redis:
      endpoint: "redis:6379"
      timeout: 100ms
      expiration: 0s
      max_idle_conns: 80
      max_active_conns: 0

table_manager:
  chunk_tables_provisioning:
    inactive_read_throughput: 1
    inactive_write_throughput: 1
    provisioned_read_throughput: 5
    provisioned_write_throughput: 5
  index_tables_provisioning:
    inactive_read_throughput: 1
    inactive_write_throughput: 1
    provisioned_read_throughput: 5
    provisioned_write_throughput: 5
  retention_deletes_enabled: false
  retention_period: 0s

owen-d commented 4 years ago

Hey, let's see if I can help. The error comes from the combination of the replication_factor: 3 setting and running only a single ingester replica. The ring is effectively saying "all data must be replicated 3 times", but there is only one ingester to replicate the data to. You'll need to either scale back the replication factor or scale out the ingesters. A sketch of the first option is below.
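
As a minimal sketch of the first option (assuming a single ingester is acceptable for this setup), drop the replication factor in the Loki config so the ring only expects one healthy ingester:

ingester:
  lifecycler:
    ring:
      kvstore:
        store: etcd
        etcd:
          endpoints:
            - http://etcd-1:2379
            - http://etcd-2:2379
            - http://etcd-3:2379
      heartbeat_timeout: 1m
      replication_factor: 1   # was 3; must not exceed the number of running ingesters

The alternative is to keep replication_factor: 3 and run three ingester services in the compose file (e.g. loki-ingester-1 through loki-ingester-3, hypothetical names, each using the same -config.file=/etc/loki/config.yaml -target=ingester command), so the ring has enough members to replicate to.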

dorroddorrod commented 4 years ago

Thanks, it works!