fluent / fluentd-kubernetes-daemonset

Fluentd daemonset for Kubernetes and its Docker image
Apache License 2.0
1.26k stars 982 forks source link

config error file="/fluentd/etc/fluent.conf" error_class=Fluent::ConfigError error="Invalid Kubernetes API v1 endpoint https://10.96.0.1:443/api: Timed out connecting to server" #1411

Closed kavita1205 closed 8 months ago

kavita1205 commented 1 year ago

Hi Team,

A few of our GPU servers are not able to send logs to Splunk. I am getting the error below. Can someone please help me here?

config error file="/fluentd/etc/fluent.conf" error_class=Fluent::ConfigError error="Invalid Kubernetes API v1 endpoint https://10.96.0.1:443/api: Timed out connecting to server"

Here is the values.yaml file

COMPUTED VALUES:
USER-SUPPLIED VALUES: null
global:
  kubernetes:
    clusterName: cluster_name
  logLevel: info
  metrics:
    service:
      enabled: true
      headless: true
  monitoring_agent_enabled: null
  monitoring_agent_index_name: null
  prometheus_enabled: null
  serviceMonitor:
    additionalLabels: {}
    enabled: false
    interval: ""
    metricsPort: 24231
    scrapeTimeout: 10s
  splunk:
    hec:
      host: splunk-hec.oi.com
      indexRouting: null
      insecureSSL: true
      port: 8088
      protocol: https
      token: 779EE032-1473-40F8-AA19-********
splunk-kubernetes-logging:
  affinity: {}
  buffer:
    '@type': memory
    chunk_limit_records: 100000
    chunk_limit_size: 20m
    flush_interval: 5s
    flush_thread_count: 1
    overflow_action: block
    retry_max_times: 5
    retry_type: periodic
    total_limit_size: 600m
  charEncodingUtf8: false
  containers:
    logFormatType: json
    path: /var/log
    pathDest: /var/lib/docker/containers
    removeBlankEvents: true
  customFilters: {}
  enabled: true
  extraVolumeMounts: []
  extraVolumes: []
  fluentd:
    path: /var/log/containers/*.log
  global:
    kubernetes:
      clusterName: cluster_name
    logLevel: info
    metrics:
      service:
        enabled: true
        headless: true
    serviceMonitor:
      additionalLabels: {}
      enabled: false
      interval: ""
      metricsPort: 24231
      scrapeTimeout: 10s
    splunk:
      hec:
        caFile: null
        clientCert: null
        clientKey: null
        host: splunk-hec.oi.com
        indexName: null
        insecureSSL: true
        port: 8088
        protocol: https
        token: 779EE032-1473-40F8-AA19-*******
  image:
    name: splunk/fluentd-hec
    pullPolicy: IfNotPresent
    registry: docker.io
    tag: 1.2.8
    usePullSecret: false
  indexFields: []
  journalLogPath: /var/log/journal
  k8sMetadata:
    cache_ttl: 3600
    podLabels:
    - app
    - k8s-app
    - release
    watch: true
  kubernetes:
    clusterName: ***-m**-lv
    securityContext: false
  logs:
    dns-controller:
      from:
        pod: dns-controller
      multiline:
        firstline: /^\w[0-1]\d[0-3]\d/
      sourcetype: kube:dns-controller
      timestampExtraction:
        format: '%m%d %H:%M:%S.%N'
        regexp: \w(?<time>[0-1]\d[0-3]\d [^\s]*)
    dns-sidecar:
      from:
        container: sidecar
        pod: kube-dns
      multiline:
        firstline: /^\w[0-1]\d[0-3]\d/
      sourcetype: kube:kubedns-sidecar
      timestampExtraction:
        format: '%m%d %H:%M:%S.%N'
        regexp: \w(?<time>[0-1]\d[0-3]\d [^\s]*)
    dnsmasq:
      from:
        pod: kube-dns
      multiline:
        firstline: /^\w[0-1]\d[0-3]\d/
      sourcetype: kube:dnsmasq
      timestampExtraction:
        format: '%m%d %H:%M:%S.%N'
        regexp: \w(?<time>[0-1]\d[0-3]\d [^\s]*)
    docker:
      from:
        journald:
          unit: docker.service
      sourcetype: kube:docker
      timestampExtraction:
        format: '%Y-%m-%dT%H:%M:%S.%NZ'
        regexp: time="(?<time>\d{4}-\d{2}-\d{2}T[0-2]\d:[0-5]\d:[0-5]\d.\d{9}Z)"
    etcd:
      from:
        container: etcd-container
        pod: etcd-server
      timestampExtraction:
        format: '%Y-%m-%d %H:%M:%S.%N'
        regexp: (?<time>\d{4}-\d{2}-\d{2} [0-2]\d:[0-5]\d:[0-5]\d\.\d{6})
    etcd-events:
      from:
        container: etcd-container
        pod: etcd-server-events
      timestampExtraction:
        format: '%Y-%m-%d %H:%M:%S.%N'
        regexp: (?<time>\d{4}-[0-1]\d-[0-3]\d [0-2]\d:[0-5]\d:[0-5]\d\.\d{6})
    etcd-minikube:
      from:
        container: etcd
        pod: etcd-minikube
      timestampExtraction:
        format: '%Y-%m-%d %H:%M:%S.%N'
        regexp: (?<time>\d{4}-\d{2}-\d{2} [0-2]\d:[0-5]\d:[0-5]\d\.\d{6})
    kube-apiserver:
      from:
        pod: kube-apiserver
      multiline:
        firstline: /^\w[0-1]\d[0-3]\d/
      sourcetype: kube:kube-apiserver
      timestampExtraction:
        format: '%m%d %H:%M:%S.%N'
        regexp: \w(?<time>[0-1]\d[0-3]\d [^\s]*)
    kube-audit:
      from:
        file:
          path: /var/log/kube-apiserver-audit.log
      sourcetype: kube:apiserver-audit
      timestampExtraction:
        format: '%Y-%m-%dT%H:%M:%SZ'
    kube-controller-manager:
      from:
        pod: kube-controller-manager
      multiline:
        firstline: /^\w[0-1]\d[0-3]\d/
      sourcetype: kube:kube-controller-manager
      timestampExtraction:
        format: '%m%d %H:%M:%S.%N'
        regexp: \w(?<time>[0-1]\d[0-3]\d [^\s]*)
    kube-dns-autoscaler:
      from:
        container: autoscaler
        pod: kube-dns-autoscaler
      multiline:
        firstline: /^\w[0-1]\d[0-3]\d/
      sourcetype: kube:kube-dns-autoscaler
      timestampExtraction:
        format: '%m%d %H:%M:%S.%N'
        regexp: \w(?<time>[0-1]\d[0-3]\d [^\s]*)
    kube-proxy:
      from:
        pod: kube-proxy
      multiline:
        firstline: /^\w[0-1]\d[0-3]\d/
      sourcetype: kube:kube-proxy
      timestampExtraction:
        format: '%m%d %H:%M:%S.%N'
        regexp: \w(?<time>[0-1]\d[0-3]\d [^\s]*)
    kube-scheduler:
      from:
        pod: kube-scheduler
      multiline:
        firstline: /^\w[0-1]\d[0-3]\d/
      sourcetype: kube:kube-scheduler
      timestampExtraction:
        format: '%m%d %H:%M:%S.%N'
        regexp: \w(?<time>[0-1]\d[0-3]\d [^\s]*)
    kubedns:
      from:
        pod: kube-dns
      multiline:
        firstline: /^\w[0-1]\d[0-3]\d/
      sourcetype: kube:kubedns
      timestampExtraction:
        format: '%m%d %H:%M:%S.%N'
        regexp: \w(?<time>[0-1]\d[0-3]\d [^\s]*)
    kubelet:
      from:
        journald:
          unit: kubelet.service
      multiline:
        firstline: /^\w[0-1]\d[0-3]\d/
      sourcetype: kube:kubelet
      timestampExtraction:
        format: '%m%d %H:%M:%S.%N'
        regexp: \w(?<time>[0-1]\d[0-3]\d [^\s]*)
  nodeSelector:
    beta.kubernetes.io/os: linux
  podSecurityPolicy:
    apiGroup: policy
    apparmor_security: true
    create: false
  rbac:
    create: true
    openshiftPrivilegedSccBinding: false
  resources:
    requests:
      cpu: 100m
      memory: 200Mi
  secret:
    create: true
  sendAllMetadata: false
  serviceAccount:
    create: true
  sourcetypePrefix: kube
  splunk:
    hec:
      indexName: ml_logs
      indexRouting: false
      indexRoutingDefaultIndex: default
    ingest_api: {}
  tolerations:
  - effect: NoSchedule
    key: node-role.kubernetes.io/master
splunk-kubernetes-metrics:
  affinity: {}
  aggregatorBuffer:
    '@type': memory
    chunk_limit_records: 10000
    chunk_limit_size: 10m
    flush_interval: 5s
    flush_thread_count: 1
    overflow_action: block
    retry_max_times: 5
    retry_type: periodic
    total_limit_size: 400m
  aggregatorTolerations: {}
  buffer:
    '@type': memory
    chunk_limit_records: 10000
    chunk_limit_size: 10m
    flush_interval: 5s
    flush_thread_count: 1
    overflow_action: block
    retry_max_times: 5
    retry_type: periodic
    total_limit_size: 400m
  customFilters: {}
  enabled: true
  global:
    kubernetes:
      clusterName: cluster_name
    logLevel: info
    metrics:
      service:
        enabled: true
        headless: true
    serviceMonitor:
      additionalLabels: {}
      enabled: false
      interval: ""
      metricsPort: 24231
      scrapeTimeout: 10s
    splunk:
      hec:
        caFile: null
        clientCert: null
        clientKey: null
        host: splunk-hec.oi.com
        indexName: null
        insecureSSL: true
        port: 8088
        protocol: https
        token: 779EE032-1473-40F8-AA19-*******
  image:
    name: splunk/k8s-metrics
    pullPolicy: IfNotPresent
    registry: docker.io
    tag: 1.1.7
    usePullSecret: false
  imageAgg:
    name: splunk/k8s-metrics-aggr
    pullPolicy: IfNotPresent
    registry: docker.io
    tag: 1.1.7
    usePullSecret: false
  kubernetes:
    clusterName: ***-ml-lv
    insecureSSL: true
    kubeletAddress: '"#{ENV[''KUBERNETES_NODE_IP'']}"'
    kubeletPort: 10250
    useRestClientSSL: true
  metricsInterval: 60s
  nodeSelector:
    beta.kubernetes.io/os: linux
  podSecurityPolicy:
    apiGroup: policy
    apparmor_security: true
    create: false
  rbac:
    create: true
  resources:
    fluent:
      limits:
        cpu: 200m
        memory: 300Mi
      requests:
        cpu: 200m
        memory: 300Mi
  secret:
    create: true
  serviceAccount:
    create: true
    name: splunk-kubernetes-metrics
    usePullSecrets: false
  splunk:
    hec:
      indexName: em_metrics
  tolerations:
  - effect: NoSchedule
    key: node-role.kubernetes.io/master
splunk-kubernetes-objects:
  affinity: {}
  buffer:
    '@type': memory
    chunk_limit_records: 10000
    chunk_limit_size: 20m
    flush_interval: 5s
    flush_thread_count: 1
    overflow_action: block
    retry_max_times: 5
    retry_type: periodic
    total_limit_size: 600m
  checkpointFile:
    name: kubernetes-objects.pos
  customFilters: {}
  enabled: true
  global:
    kubernetes:
      clusterName: cluster_name
    logLevel: info
    metrics:
      service:
        enabled: true
        headless: true
    serviceMonitor:
      additionalLabels: {}
      enabled: false
      interval: ""
      metricsPort: 24231
      scrapeTimeout: 10s
    splunk:
      hec:
        caFile: null
        clientCert: null
        clientKey: null
        host: splunk-hec.oi.com
        indexName: null
        insecureSSL: true
        port: 8088
        protocol: https
        token: 779EE032-1473-40F8-AA19-****
  image:
    name: splunk/kube-objects
    pullPolicy: IfNotPresent
    registry: docker.io
    tag: 1.1.8
    usePullSecret: false
  indexFields: []
  kubernetes:
    clusterName: xperi-ml
    insecureSSL: true
  nodeSelector:
    beta.kubernetes.io/os: linux
  objects:
    apps:
      v1:
      - interval: 60s
        name: daemon_sets
    core:
      v1:
      - interval: 60s
        name: pods
      - interval: 60s
        name: nodes
      - interval: 60s
        name: namespaces
      - interval: 60s
        name: persistent_volumes
      - interval: 60s
        name: persistent_volume_claims
      - mode: watch
        name: events
      - interval: 60s
        name: services
  podSecurityPolicy:
    apiGroup: policy
    apparmor_security: true
    create: false
  rbac:
    create: true
  resources:
    requests:
      cpu: 100m
      memory: 200Mi
  secret:
    create: true
  serviceAccount:
    create: true
    name: splunk-kubernetes-objects
    usePullSecrets: false
  splunk:
    hec:
      indexName: em_meta
  tolerations: []

DaemonSet YAML file

[kavsingh@sjc2-nixutil01 ~]$ kubectl get ds -n splunk-sck lv-splunk-connect-splunk-kubernetes-logging -o yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
  annotations:
    deprecated.daemonset.template.generation: "1"
    meta.helm.sh/release-name: lv-splunk-connect
    meta.helm.sh/release-namespace: splunk-sck
  creationTimestamp: "2021-12-12T20:51:14Z"
  generation: 1
  labels:
    app: splunk-kubernetes-logging
    app.kubernetes.io/managed-by: Helm
    chart: splunk-kubernetes-logging-1.4.10
    engine: fluentd
    heritage: Helm
    release: lv-splunk-connect
  name: lv-splunk-connect-splunk-kubernetes-logging
  namespace: splunk-sck
  resourceVersion: "361624389"
  selfLink: /apis/apps/v1/namespaces/splunk-sck/daemonsets/lv-splunk-connect-splunk-kubernetes-logging
  uid: b8ddba03-f084-4162-a380-32b2e112dcb1
spec:
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      app: splunk-kubernetes-logging
      release: lv-splunk-connect
  template:
    metadata:
      annotations:
        checksum/config: ea7843ca58d9389c5480c7d8c53c6669e8c93c96c792c5b62c7a51264b9ff6ea
      creationTimestamp: null
      labels:
        app: splunk-kubernetes-logging
        release: lv-splunk-connect
    spec:
      containers:
      - env:
        - name: K8S_NODE_NAME
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: spec.nodeName
        - name: MY_NAMESPACE
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: metadata.namespace
        - name: MY_POD_NAME
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: metadata.name
        - name: SPLUNK_HEC_TOKEN
          valueFrom:
            secretKeyRef:
              key: splunk_hec_token
              name: splunk-kubernetes-logging
        image: docker.io/splunk/fluentd-hec:1.2.8
        imagePullPolicy: IfNotPresent
        livenessProbe:
          failureThreshold: 3
          httpGet:
            path: /api/plugins.json
            port: 24220
            scheme: HTTP
          initialDelaySeconds: 60
          periodSeconds: 60
          successThreshold: 1
          timeoutSeconds: 1
        name: splunk-fluentd-k8s-logs
        resources:
          requests:
            cpu: 100m
            memory: 200Mi
        securityContext:
          privileged: false
          runAsUser: 0
        terminationMessagePath: /dev/termination-log
        terminationMessagePolicy: File
        volumeMounts:
        - mountPath: /var/log
          name: varlog
        - mountPath: /var/lib/docker/containers
          name: varlogdest
          readOnly: true
        - mountPath: /var/log/journal
          name: journallogpath
          readOnly: true
        - mountPath: /fluentd/etc
          name: conf-configmap
        - mountPath: /fluentd/etc/splunk
          name: secrets
          readOnly: true
      dnsPolicy: ClusterFirst
      nodeSelector:
        beta.kubernetes.io/os: linux
      restartPolicy: Always
      schedulerName: default-scheduler
      securityContext: {}
      serviceAccount: lv-splunk-connect-splunk-kubernetes-logging
      serviceAccountName: lv-splunk-connect-splunk-kubernetes-logging
      terminationGracePeriodSeconds: 30
      tolerations:
      - effect: NoSchedule
        key: node-role.kubernetes.io/master
      volumes:
      - hostPath:
          path: /var/log
          type: ""
        name: varlog
      - hostPath:
          path: /var/lib/docker/containers
          type: ""
        name: varlogdest
      - hostPath:
          path: /var/log/journal
          type: ""
        name: journallogpath
      - configMap:
          defaultMode: 420
          name: lv-splunk-connect-splunk-kubernetes-logging
        name: conf-configmap
      - name: secrets
        secret:
          defaultMode: 420
          secretName: splunk-kubernetes-logging
  updateStrategy:
    rollingUpdate:
      maxUnavailable: 1
    type: RollingUpdate
status:
  currentNumberScheduled: 62
  desiredNumberScheduled: 62
  numberAvailable: 57
  numberMisscheduled: 2
  numberReady: 57
  numberUnavailable: 5
  observedGeneration: 1
  updatedNumberScheduled: 62

environment:

Kubernetes version (use kubectl version): 1.17; Splunk Connect for Kubernetes Helm chart version: 1.4.10

kavita1205 commented 1 year ago

Can someone help me here?

blakehawkins commented 1 year ago

I have the same issue and have not found a solution yet.

github-actions[bot] commented 1 year ago

This issue has been automatically marked as stale because it has been open 90 days with no activity. Remove stale label or comment or this issue will be closed in 30 days

blakehawkins commented 1 year ago

Resolved this by switching to fluent-bit

github-actions[bot] commented 9 months ago

This issue has been automatically marked as stale because it has been open 90 days with no activity. Remove stale label or comment or this issue will be closed in 30 days

github-actions[bot] commented 8 months ago

This issue was automatically closed because it remained stale for 30 days.