grafana / loki

Like Prometheus, but for logs.
https://grafana.com/loki
GNU Affero General Public License v3.0

Promtail: "Not ready: Unable to find any logs to tail. Please verify permissions, volumes, scrape_config, etc" #5955

Open x0rtdan opened 2 years ago

x0rtdan commented 2 years ago

Hello!

I'm trying to install Promtail with manifests and export logs from a Kubernetes cluster to Loki. Since I couldn't get the manifest installation described in the documentation working, I gave a Helm install a try.

The Helm install works fine: it labels and tags the log files and sends them to Loki.

helm upgrade --install promtail grafana/promtail -f /tmp/promtail_values.yml

Then I tried to extract the manifests from Helm using:

helm upgrade --install promtail grafana/promtail -f /tmp/promtail_values.yml --dry-run

and applied them.

Promtail errors out with

msg="GET /ready (500) 35.791µs Response: \"Not ready: Unable to find any logs to tail. Please verify permissions, volumes, scrape_config, etc.\\n\" ws: false; Accept: */*; Connection: close; User-Agent: kube-probe/1.23

Kubernetes: v1.23.4 Helm: v3.8.0+gd141386

Manifest:

# Source: promtail/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: promtail
  namespace: loki
  labels:
    helm.sh/chart: promtail-4.2.0
    app.kubernetes.io/name: promtail
    app.kubernetes.io/instance: promtail
    app.kubernetes.io/version: "2.5.0"
    app.kubernetes.io/managed-by: Helm
---
# Source: promtail/templates/secret.yaml
apiVersion: v1
kind: Secret
metadata:
  name: promtail
  namespace: loki
  labels:
    helm.sh/chart: promtail-4.2.0
    app.kubernetes.io/name: promtail
    app.kubernetes.io/instance: promtail
    app.kubernetes.io/version: "2.5.0"
    app.kubernetes.io/managed-by: Helm
stringData:
  promtail.yaml: |
    server:
      log_level: info
      http_listen_port: 3101

    clients:
      - url: http://loki-loki-simple-scalable-gateway/loki/api/v1/push

    positions:
      filename: /run/promtail/positions.yaml

    scrape_configs:
      # See also https://github.com/grafana/loki/blob/master/production/ksonnet/promtail/scrape_config.libsonnet for reference
      - job_name: kubernetes-pods
        pipeline_stages:
          - cri: {}
        kubernetes_sd_configs:
          - role: pod
        relabel_configs:
          - source_labels:
              - __meta_kubernetes_pod_controller_name
            regex: ([0-9a-z-.]+?)(-[0-9a-f]{8,10})?
            action: replace
            target_label: __tmp_controller_name
          - source_labels:
              - __meta_kubernetes_pod_label_app_kubernetes_io_name
              - __meta_kubernetes_pod_label_app
              - __tmp_controller_name
              - __meta_kubernetes_pod_name
            regex: ^;*([^;]+)(;.*)?$
            action: replace
            target_label: app
          - source_labels:
              - __meta_kubernetes_pod_label_app_kubernetes_io_component
              - __meta_kubernetes_pod_label_component
            regex: ^;*([^;]+)(;.*)?$
            action: replace
            target_label: component
          - action: replace
            source_labels:
            - __meta_kubernetes_pod_node_name
            target_label: node_name
          - action: replace
            source_labels:
            - __meta_kubernetes_namespace
            target_label: namespace
          - action: replace
            replacement: $1
            separator: /
            source_labels:
            - namespace
            - app
            target_label: job
          - action: replace
            source_labels:
            - __meta_kubernetes_pod_name
            target_label: pod
          - action: replace
            source_labels:
            - __meta_kubernetes_pod_container_name
            target_label: container
          - action: replace
            replacement: /var/log/pods/*$1/*.log
            separator: /
            source_labels:
            - __meta_kubernetes_pod_uid
            - __meta_kubernetes_pod_container_name
            target_label: __path__
          - action: replace
            regex: true/(.*)
            replacement: /var/log/pods/*$1/*.log
            separator: /
            source_labels:
            - __meta_kubernetes_pod_annotationpresent_kubernetes_io_config_hash
            - __meta_kubernetes_pod_annotation_kubernetes_io_config_hash
            - __meta_kubernetes_pod_container_name
            target_label: __path__
---
# Source: promtail/templates/clusterrole.yaml
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: promtail
  labels:
    helm.sh/chart: promtail-4.2.0
    app.kubernetes.io/name: promtail
    app.kubernetes.io/instance: promtail
    app.kubernetes.io/version: "2.5.0"
    app.kubernetes.io/managed-by: Helm
rules:
  - apiGroups:
      - ""
    resources:
      - nodes
      - nodes/proxy
      - services
      - endpoints
      - pods
    verbs:
      - get
      - watch
      - list
---
# Source: promtail/templates/clusterrolebinding.yaml
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: promtail
  labels:
    helm.sh/chart: promtail-4.2.0
    app.kubernetes.io/name: promtail
    app.kubernetes.io/instance: promtail
    app.kubernetes.io/version: "2.5.0"
    app.kubernetes.io/managed-by: Helm
subjects:
  - kind: ServiceAccount
    name: promtail
    namespace: loki
roleRef:
  kind: ClusterRole
  name: promtail
  apiGroup: rbac.authorization.k8s.io
---
# Source: promtail/templates/daemonset.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: promtail
  namespace: loki
  labels:
    helm.sh/chart: promtail-4.2.0
    app.kubernetes.io/name: promtail
    app.kubernetes.io/instance: promtail
    app.kubernetes.io/version: "2.5.0"
    app.kubernetes.io/managed-by: Helm
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: promtail
      app.kubernetes.io/instance: promtail
  updateStrategy:
    {}
  template:
    metadata:
      labels:
        app.kubernetes.io/name: promtail
        app.kubernetes.io/instance: promtail
      annotations:
        checksum/config: e94680e64beaef2d5b2debc874a8580d23b627b7e373d2a2f207d872170d15c2
    spec:
      serviceAccountName: promtail
      securityContext:
        runAsGroup: 0
        runAsUser: 0
      containers:
        - name: promtail
          image: "docker.io/grafana/promtail:2.5.0"
          imagePullPolicy: IfNotPresent
          args:
            - "-config.file=/etc/promtail/promtail.yaml"
          volumeMounts:
            - name: config
              mountPath: /etc/promtail
            - name: run
              mountPath: /run/promtail
            - mountPath: /var/lib/docker/containers
              name: containers
              readOnly: true
            - mountPath: /var/log/pods
              name: pods
              readOnly: true
          env:
            - name: HOSTNAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          ports:
            - name: http-metrics
              containerPort: 3101
              protocol: TCP
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
              - ALL
            readOnlyRootFilesystem: true
          readinessProbe:
            failureThreshold: 5
            httpGet:
              path: /ready
              port: http-metrics
            initialDelaySeconds: 10
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 1
      tolerations:
        - effect: NoSchedule
          key: node-role.kubernetes.io/master
          operator: Exists
        - effect: NoSchedule
          key: node-role.kubernetes.io/control-plane
          operator: Exists
      volumes:
        - name: config
          secret:
            secretName: promtail
        - name: run
          hostPath:
            path: /run/promtail
        - hostPath:
            path: /var/lib/docker/containers
          name: containers
        - hostPath:
            path: /var/log/pods
          name: pods

Log:

level=warn ts=2022-04-19T08:42:35.660135972Z caller=logging.go:72 msg="GET /ready (500) 113.083µs Response: \"Not ready: Unable to find any logs to tail. Please verify permissions, volumes, scrape_config, etc.\\n\" ws: false; Accept: */*; Connection: close; User-Agent: kube-probe/1.23; "
level=warn ts=2022-04-19T08:42:45.660860296Z caller=logging.go:72 msg="GET /ready (500) 32.75µs Response: \"Not ready: Unable to find any logs to tail. Please verify permissions, volumes, scrape_config, etc.\\n\" ws: false; Accept: */*; Connection: close; User-Agent: kube-probe/1.23; "
slim-bean commented 2 years ago

If I read this correctly: installing it with Helm works fine, but when you try to extract the manifests from Helm and install them yourself it doesn't work?

If that's the case, I'm not sure we can offer much support here. The Helm charts are largely community maintained and we are not really Helm experts. Given that the Helm chart itself seems to be working, the way you are extracting the manifests may well be the issue.
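One way to rule out the extraction step is to render the chart with helm template instead of scraping the --dry-run output. A sketch, reusing the release name, chart, and values file from the commands above:

$ helm template promtail grafana/promtail -n loki -f /tmp/promtail_values.yml > promtail-manifests.yaml
$ kubectl apply -n loki -f promtail-manifests.yaml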

m62534 commented 2 years ago

Check whether the hostPath exists and the DaemonSet has access to those mounts. You can exec into the Promtail pods and try to cd into /var/log/pods (or whichever path is appropriate for your k8s node setup) to see whether your pod logs exist there. If they don't, Promtail won't be able to find any logs to tail.
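For example, something along these lines (the pod name is a placeholder; the namespace and label follow the manifest posted above):

$ kubectl get pods -n loki -l app.kubernetes.io/name=promtail
$ kubectl exec -n loki -it <promtail-pod> -- ls -la /var/log/pods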

tchellomello commented 2 years ago

I'm hitting the same issue, and I can list the contents of the /var/log/pods directory from inside the container. I'll see if I can gather more information. Interestingly, I'm seeing this on only one node.

$ kubectl grep pods -n observability loki -o wide 
NAMESPACE       NAME                         READY   STATUS    RESTART   AGE   IP              NODENAME
observability   loki-stack-loki-0           1/1     Running   0         15m   10.223.28.101   p53.tatu.home
observability   loki-stack-promtail-98465   1/1     Running   0         14m   10.223.40.58    p70.tatu.home
observability   loki-stack-promtail-hqdl4   0/1     Running   0         15m   10.223.28.90    p53.tatu.home
observability   loki-stack-promtail-m67nb   1/1     Running   0         15m   10.223.42.255   t470n2.tatu.home
observability   loki-stack-promtail-m6hn4   1/1     Running   0         16m   10.223.59.3     t470n1.tatu.home

$ kubectl images -n observability loki-stack-promtail 
[Summary]: 1 namespaces, 4 pods, 4 containers and 2 different images
+----------------------------+-----------+----------------------------------+
|            Pod             | Container |              Image               |
+----------------------------+-----------+----------------------------------+
| loki-stack-promtail-98465  | promtail  | docker.io/grafana/promtail:2.6.1 |
+----------------------------+           +                                  +
| loki-stack-promtail-hqdl4  |           |                                  |
+----------------------------+           +                                  +
| loki-stack-promtail-m67nb  |           |                                  |
+----------------------------+           +----------------------------------+
| loki-stack-promtail-m6hn4  |           | docker.io/grafana/promtail:2.4.2 |
+----------------------------+-----------+----------------------------------+

From the logs, I see:

loki-stack-promtail-hqdl4 promtail W1011 03:03:11.593343       1 reflector.go:324] github.com/prometheus/prometheus/discovery/kubernetes/kubernetes.go:502: failed to list *v1.Pod: Get "https://10.96.0.1:443/api/v1/pods?fieldSelector=spec.nodeName%3Dp53.tatu.home&limit=500&resourceVersion=0": dial tcp 10.96.0.1:443: i/o timeout
loki-stack-promtail-hqdl4 promtail I1011 03:03:11.593490       1 trace.go:205] Trace[815994244]: "Reflector ListAndWatch" name:github.com/prometheus/prometheus/discovery/kubernetes/kubernetes.go:502 (11-Oct-2022 03:02:41.591) (total time: 30001ms):
loki-stack-promtail-hqdl4 promtail Trace[815994244]: ---"Objects listed" error:Get "https://10.96.0.1:443/api/v1/pods?fieldSelector=spec.nodeName%3Dp53.tatu.home&limit=500&resourceVersion=0": dial tcp 10.96.0.1:443: i/o timeout 30001ms (03:03:11.593)
loki-stack-promtail-hqdl4 promtail Trace[815994244]: [30.001581055s] [30.001581055s] END
loki-stack-promtail-hqdl4 promtail E1011 03:03:11.593516       1 reflector.go:138] github.com/prometheus/prometheus/discovery/kubernetes/kubernetes.go:502: Failed to watch *v1.Pod: failed to list *v1.Pod: Get "https://10.96.0.1:443/api/v1/pods?fieldSelector=spec.nodeName%3Dp53.tatu.home&limit=500&resourceVersion=0": dial tcp 10.96.0.1:443: i/o timeout
loki-stack-promtail-hqdl4 promtail level=warn ts=2022-10-11T03:03:19.802334186Z caller=logging.go:86 msg="GET /ready (500) 96.942µs Response: \"Not ready: Unable to find any logs to tail. Please verify permissions, volumes, scrape_config, etc.\\n\" ws: false; Accept: */*; Connection: close; User-Agent: kube-probe/1.25; "

However, from inside the container, I can see the hostPath volumes:

$ kubectl iexec -n observability promtail /bin/bash
Namespace: observability | Pod: ✔ loki-stack-promtail-hqdl4
root@loki-stack-promtail-hqdl4:/# ls -la /var/log/pods/ | head
total 12
drwxr-xr-x 31 root root 8192 Oct 11 02:46 .
drwxr-xr-x  1 root root   18 Oct 11 02:45 ..
drwxr-xr-x  3 root root   22 Oct  7 05:40 cadvisor_cadvisor-kx7lh_f189a7d6-bb3e-4011-9899-60924f7d28b7
drwxr-xr-x  3 root root   20 Oct  5 06:16 default_iperf3-kkpmf_f8184cd5-a98b-414a-a867-0301f99f6933
drwxr-xr-x  3 root root   29 Oct 10 11:37 default_ookla-speedtest-cronjob-27756697-tdp6x_035abf20-73a6-4ee3-aea4-a83b6fe82cea
drwxr-xr-x  3 root root   29 Oct 10 12:37 default_ookla-speedtest-cronjob-27756757-xj7dc_a561193b-d7da-4928-825e-e9e1a7790607
drwxr-xr-x  3 root root   29 Oct 10 13:37 default_ookla-speedtest-cronjob-27756817-clj2c_c371b08e-dacd-4b1e-b7ed-19b49ef55438
drwxr-xr-x  3 root root   29 Oct 10 14:37 default_ookla-speedtest-cronjob-27756877-28pzh_b7efbe8e-a0f2-46ba-a9c8-c8c9d3e8ffa2
drwxr-xr-x  3 root root   29 Oct 10 15:37 default_ookla-speedtest-cronjob-27756937-cf8bw_58d9b93f-e984-419c-aa66-c33b9bd2dedf

root@loki-stack-promtail-hqdl4:/# ls -la /var/log/pods/*  | wc -l 
176
tchellomello commented 2 years ago

Interestingly, I started seeing this after migrating to Kubernetes 1.25.2. The only node where I get the issue is a worker node, and I can see this:

$ kubectl get pods -A -o wide -w | grep promt
observability           loki-stack-promtail-29l5j                                 1/1     Running     0               8m35s   10.223.59.56     t470n1.tatu.home   <none>           <none>
observability           loki-stack-promtail-98465                                 1/1     Running     0               37m     10.223.40.58     p70.tatu.home      <none>           <none>
observability           loki-stack-promtail-m67nb                                 1/1     Running     0               38m     10.223.42.255    t470n2.tatu.home   <none>           <none>
observability           loki-stack-promtail-scw89                                 0/1     Running     0               8m35s   10.223.28.81     p53.tatu.home      <none>           <none>

From the logs, I can see that the call to the k8s API is timing out; however, it works from the other nodes:

loki-stack-promtail-scw89 promtail W1011 03:21:10.994690       1 reflector.go:324] github.com/prometheus/prometheus/discovery/kubernetes/kubernetes.go:502: failed to list *v1.Pod: Get "https://10.96.0.1:443/api/v1/pods?fieldSelector=spec.nodeName%3Dp53.tatu.home&limit=500&resourceVersion=0": dial tcp 10.96.0.1:443: i/o timeout

However, if I run kubectl proxy and then issue the same query, it works fine:

$ kubectl proxy
$ curl 2>/dev/null  "http://127.0.0.1:8001/api/v1/pods?fieldSelector=spec.nodeName%3Dp53.tatu.home&limit=500" | jq | head
{
  "kind": "PodList",
  "apiVersion": "v1",
  "metadata": {
    "resourceVersion": "119135761"
  },
  "items": [
    {
      "metadata": {
        "name": "cadvisor-kx7lh",

I can also achieve the same result by going directly to the k8s API using the token mounted in the pod (which also explains why the other pods are working). Interesting...
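For reference, this kind of direct check can be run from a shell inside the pod with the mounted service-account credentials. A minimal sketch, reusing the node name from the failing query above:

$ SA=/var/run/secrets/kubernetes.io/serviceaccount
$ curl --cacert $SA/ca.crt -H "Authorization: Bearer $(cat $SA/token)" \
    "https://kubernetes.default.svc/api/v1/pods?fieldSelector=spec.nodeName%3Dp53.tatu.home&limit=500"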

tchellomello commented 2 years ago

To investigate the issue I enabled EphemeralContainers so I could attach a debug container with curl and test the API timeout. It turns out there was an issue between Calico and Kubernetes 1.25.2 that was fixed in the latest Calico version.

I deployed the new Calico version, and once I rolled out the DaemonSet it worked like a charm. So it seems the error was being triggered by the API call failing on that particular node.
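A DaemonSet rollout like that can be triggered with kubectl rollout restart; the namespace and name below are taken from the pod listings above:

$ kubectl -n observability rollout restart daemonset/loki-stack-promtail
$ kubectl -n observability rollout status daemonset/loki-stack-promtail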

$ kubectl get pods -A -o wide -w | grep promtai
observability           loki-stack-promtail-95mmm                                1/1     Running   0               28s     10.223.40.24     p70.tatu.home      <none>           <none>
observability           loki-stack-promtail-c69vk                                1/1     Running   0               28s     10.223.59.11     t470n1.tatu.home   <none>           <none>
observability           loki-stack-promtail-cln9z                                1/1     Running   0               28s     10.223.28.143    p53.tatu.home      <none>           <none>
observability           loki-stack-promtail-qtkd6                                1/1     Running   0               28s     10.223.42.193    t470n2.tatu.home   <none>           <none>

@jsteppe, check your logs to see whether you are hitting the same timeout when querying the k8s API. It might point you in the right direction.

olyhao commented 1 year ago

I also encountered the same problem. After updating the Promtail ConfigMap and adding a keep rule to capture only specific pod logs, Promtail on one node failed to start.
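A minimal sketch of such a keep rule (the label and value here are hypothetical): on a node where no discovered pod matches the rule, Promtail ends up with zero targets and the /ready endpoint keeps returning 500.

relabel_configs:
  # Hypothetical selector: only keep pods labelled app=myapp; all other targets are dropped.
  - source_labels: [__meta_kubernetes_pod_label_app]
    action: keep
    regex: myapp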

hopik007 commented 1 year ago

same "problem". I have specified specific app labels for the collection of logs only for particular pods and pods for this app not running on all nodes. So promtail is not running on node where no pods match labels. So it is probably correct but monitoring yell at me that loki "daemonset mismatch"

Kunone commented 1 year ago

I'm experiencing the same issue. Specifically, only the Promtail instance running on a node without any pods being monitored by the associated configuration is generating a warning. This behavior appears to be expected, since there are no logs to capture and thus the liveness/readiness checks fail.

vojtechvelkjop commented 1 year ago

Yes, all of the Promtail DaemonSet pods should be up, regardless of whether there are any logs to tail, just like the Splunk and Elasticsearch collectors...

WeAreHadock commented 1 year ago

Hi, any update on this issue? If Promtail is running on a node where there aren't any logs to collect, the readiness probe fails. Is this behavior expected?

renanqts commented 1 year ago

This is what I'm using to find the logs in EKS; I hope it helps you:

          - action: replace
            replacement: /var/log/pods/*$1/*.log
            separator: /
            source_labels:
              - __meta_kubernetes_pod_uid
              - __meta_kubernetes_pod_container_name
            target_label: __path__
          - action: replace
            replacement: /var/log/pods/*$1/*.log
            regex: true/(.*)
            separator: /
            source_labels:
              - __meta_kubernetes_pod_annotationpresent_kubernetes_io_config_hash
              - __meta_kubernetes_pod_annotation_kubernetes_io_config_hash
              - __meta_kubernetes_pod_container_name
            target_label: __path__
fabiopaiva commented 1 year ago

I'm combining the labels before matching; maybe it helps you:

config:
  snippets:
    extraRelabelConfigs:
      # Combine labels in order to keep logs from NGINX Ingress and Promtail # https://github.com/grafana/loki/issues/808#issuecomment-592698307
      - source_labels: [ __meta_kubernetes_pod_label_app_kubernetes_io_instance, __meta_kubernetes_pod_label_app_kubernetes_io_name ]
        separator: ';'
        target_label: combined_labels
      - source_labels: [ combined_labels ]
        action: keep
        regex: alb-ingress-nginx;.*|.*;promtail
ep4sh commented 5 months ago

Hi everyone, I'm facing the same issue when trying to select specific pods with a pod selector via kubernetes_sd_configs.

My config is like the following:

      scrape_configs:
        # See also https://github.com/grafana/loki/blob/master/production/ksonnet/promtail/scrape_config.libsonnet for reference
        - job_name: kubernetes-pods
          kubernetes_sd_configs:
            - role: pod
              selectors:
              - role: pod
                label: "app=myapp"
          relabel_configs:
            - source_labels: [ __meta_kubernetes_namespace ]
              regex: '[^(myapp.*)].+'
              action: drop
            - source_labels:
                - __meta_kubernetes_pod_label_app_kubernetes_io_name
                - __meta_kubernetes_pod_label_app
                - __tmp_controller_name
                - __meta_kubernetes_pod_name
              regex: ^;*([^;]+)(;.*)?$
              action: replace
              target_label: app
            - action: replace
              source_labels:
              - __meta_kubernetes_pod_name
              target_label: pod
            - action: replace
              source_labels:
              - __meta_kubernetes_pod_container_name
              target_label: container
            - action: replace
              replacement: /var/log/pods/*$1/*.log
              separator: /
              source_labels:
              - __meta_kubernetes_pod_uid
              - __meta_kubernetes_pod_container_name
              target_label: __path__
            - action: replace
              regex: true/(.*)
              replacement: /var/log/pods/*$1/*.log
              separator: /
              source_labels:
              - __meta_kubernetes_pod_annotationpresent_kubernetes_io_config_hash
              - __meta_kubernetes_pod_annotation_kubernetes_io_config_hash
              - __meta_kubernetes_pod_container_name
              target_label: __path__
          pipeline_stages:
            - cri: {}
            - match:
                selector: '{app="myapp"}'
                stages:
                - json:
                    expressions:
                      level: level
                      method: method
                      status_code: status_code
                      path: path
                      msg: msg
                - labels:
                    level:
                    method:
                    status_code:
                    path:
                    msg:
The Promtail log then shows:

level=warn ts=2024-05-28T12:31:13.758554373Z caller=logging.go:126 traceID=63d4bda14f0ee10b msg="GET /ready (500) 41.249µs Response: \"Not ready: Unable to find any logs to tail. Please verify permissions, volumes, scrape_config, etc.\\n\" ws: false; Accept: */*; Connection: close; User-Agent: kube-probe/1.28;