vectordotdev / vector

A high-performance observability data pipeline.
https://vector.dev
Mozilla Public License 2.0

vector::topology: An error occurred that Vector couldn't handle: failed to encode record: BufferTooSmall #21631

Closed kefiras closed 1 day ago

kefiras commented 1 day ago


Problem

We are seeing the errors below on our Vector instance, which receives logs over HTTP from an OpenShift cluster (also running Vector).

2024-10-28T12:36:57.561312Z ERROR transform{component_kind="transform" component_id=kubernetes_application component_type=remap}: vector_buffers::topology::channel::sender: Disk buffer writer has encountered an unrecoverable error.
2024-10-28T12:36:57.653952Z ERROR transform{component_kind="transform" component_id=kubernetes_application component_type=remap}: vector::topology: An error occurred that Vector couldn't handle: failed to encode record: BufferTooSmall.
2024-10-28T12:36:57.654063Z  INFO vector: Vector has stopped.
2024-10-28T12:36:57.655341Z  INFO vector::topology::running: Shutting down... Waiting on running components. remaining_components="splunk_container_logs, splunk_kube_audit, kubernetes_audit, fluentbit, auditd, journal_logs, containerlogs, route_logs, kubernetes_infrastructure, httpd" time_remaining="59 seconds left"
2024-10-28T12:36:57.966692Z ERROR transform{component_kind="transform" component_id=route_logs component_type=route}: vector::topology: An error occurred that Vector couldn't handle: receiver disconnected.
2024-10-28T12:36:58.019845Z ERROR transform{component_kind="transform" component_id=containerlogs component_type=remap}: vector::topology: An error occurred that Vector couldn't handle: receiver disconnected.

Our disk buffer is set to 1 GB, as per the configuration below. Does this mean it is running out of space, or is this some other sort of issue?
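For reference, this is the buffer block used on both Splunk HEC sinks in the configuration below (excerpt only; the comments are added here for clarity, and max_size is in bytes, so 1000000000 is roughly 1 GB):

buffer:
  type: "disk"          # disk-backed buffer; records are encoded and written to data files on disk
  max_size: 1000000000  # bytes, i.e. ~1 GB
  # when_full is not set; as far as I understand it then defaults to "block"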

Configuration

cat /etc/vector/vector.yaml 
sources:
  httpd:
    type: "http_server"
    address: "0.0.0.0:8080"
    auth:
      username: "$VECTOR_HTTP_USERNAME"
      password: "$VECTOR_HTTP_PASSWORD"
    tls:
      enabled: true
      crt_file: "/etc/vector/tls/cert.pem"
      key_file: "/etc/vector/tls/cert.key"
  vector_metrics:
    type: "internal_metrics"
# all container logs
transforms:
  containerlogs:
    type: "remap"
    inputs: 
      - "httpd"
    source: ". = parse_json!(.message)"
    #reroute_dropped: true

  # route logs based on log_type field (infrastructure|application|audit|journal)
  route_logs:
    type: route
    inputs:
      - "containerlogs"
    route:
      kubernetes_infrastructure: |
        exists(.kubernetes) && .log_type == "infrastructure"
      kubernetes_application: |
        exists(.kubernetes) && .log_type == "application" 
      kubernetes_audit: |
        .log_type == "audit" && .tag == ".k8s-audit.log"
      journal: |
        .log_type == "infrastructure" && .tag == ".journal.system"
      auditd: |
        .log_type == "audit" && .tag == ".linux-audit.log"
  # Rules for application logs; drops logs without an orgid label on the namespace
  kubernetes_application:
    inputs:
      - "route_logs.kubernetes_application"
    type: "remap"
    drop_on_abort: true
    #reroute_dropped: true
    source: |
      if exists(.kubernetes.labels.company_example_com_app) {
        .app = .kubernetes.labels.company_example_com_app
      } else if exists(.kubernetes.labels.app) {
        .app = .kubernetes.labels.app
      } else if exists(.kubernetes.labels."k8s-app") {
        .app = .kubernetes.labels."k8s-app"
      } else if exists(.kubernetes.labels.app_kubernetes_io_name) {
        .app = .kubernetes.labels.app_kubernetes_io_name
      } else if exists(.kubernetes.labels.name) {
        .app = .kubernetes.labels.name
      } else {
        .app = .kubernetes.namespace_name
      }
      .container_id = .kubernetes.container_id
      # No info for init containers
      # https://github.com/vectordotdev/vector/issues/18665
      if is_null(.kubernetes.container_image) {
        .container_image = "unknown"
      } else {
        .container_image = .kubernetes.container_image
      }
      .service = .kubernetes.container_name
      .namespace = .kubernetes.namespace_name
      .pod_name = .kubernetes.pod_name
      .pod_id = .kubernetes.pod_id
      .pod_owner = .kubernetes.pod_owner
      .clustername = "cluster1"
      .datacenter = "zone1"
      if exists(.kubernetes.namespace_labels.company_example_com_appenv) {
        .env = .kubernetes.namespace_labels.company_example_com_appenv
      } else {
        .env = "dev"
      }
      if exists(.kubernetes.namespace_labels.company_example_com_orgid) {
        .orgid = .kubernetes.namespace_labels.company_example_com_orgid
      } else {
        .orgid = "example"
      }
      if exists(.kubernetes.labels.company_example_com_family) {
        .family = .kubernetes.labels.company_example_com_family
      } else {
        .family = .kubernetes.namespace_name
      }
      #.time = del(to_unix_timestamp(to_timestamp!(.@timestamp)))
      .host = .hostname
      # parse string to timestamp type
      .ts = parse_timestamp!(."@timestamp", "%FT%T%.9fZ")

  # Rules for infrastructure logs
  kubernetes_infrastructure:
    inputs: 
      - "route_logs.kubernetes_infrastructure"
    type: "remap"
    #reroute_dropped: true
    source: |
      # Openshift Logging swaps [./] with [_]
      # https://github.com/openshift/cluster-logging-operator/blob/627b0c7f8c993f89250756d9601d1a632b024c94/internal/generator/vector/normalize/dedot_labels.go#L10
      # oc get secret collector-config -o yaml | yq '.data."vector.toml"' | base64 -d  | grep transforms.httpout_dedot -A 30
      if exists(.kubernetes.labels.company_example_com_app) {
        .app = .kubernetes.labels.company_example_com_app
      } else if exists(.kubernetes.labels.app) {
        .app = .kubernetes.labels.app
      } else if exists(.kubernetes.labels."k8s-app") {
        .app = .kubernetes.labels."k8s-app"
      } else if exists(.kubernetes.labels.app_kubernetes_io_name) {
        .app = .kubernetes.labels.app_kubernetes_io_name
      } else if exists(.kubernetes.labels.name) {
        .app = .kubernetes.labels.name
      } else {
        .app = .kubernetes.namespace_name
      }
      .container_id = .kubernetes.container_id
      # No info for init containers
      # https://github.com/vectordotdev/vector/issues/18665
      if is_null(.kubernetes.container_image) {
        .container_image = "unknown"
      } else {
        .container_image = .kubernetes.container_image
      }
      .service = .kubernetes.container_name
      .namespace = .kubernetes.namespace_name
      .pod_name = .kubernetes.pod_name
      .pod_id = .kubernetes.pod_id
      .pod_owner = .kubernetes.pod_owner
      .clustername = "cluster1"
      .datacenter = "zone1"
      .env = "dev"
      .orgid = "example"
      .family = .kubernetes.namespace_name
      .host = .hostname
      # parse string to timestamp type
      .ts = parse_timestamp!(."@timestamp", "%FT%T%.9fZ")

  # Rules for kube-audit logs
  kubernetes_audit:
    inputs:
      - "route_logs.kubernetes_audit"
    type: "remap"
    #reroute_dropped: true
    source: |
      .user = .user.username
      .clustername = "cluster1"
      .message = .requestURI
      .orgid = "example"
      .status_code = .responseStatus.code
      del(.annotations)
      del(.apiVersion)
      del(.auditID)
      del(.k8s_audit_level)
      del(.kind)
      del(.level)
      del(.objectRef)
      del(.openshift)
      del(.requestReceivedTimestamp)
      del(.source_type)
      del(.stage)
      del(.stageTimestamp)
      .ts = parse_timestamp!(."@timestamp", "%FT%T%.9fZ")
      .host = .hostname
  journal_logs:
    inputs:
      - "route_logs.journal"
    type: "remap"
    #reroute_dropped: true
    source: |
      .facility = .systemd.u.SYSLOG_FACILITY
      .pid = .systemd.t.PID
      .app = .systemd.u.SYSLOG_IDENTIFIER
      if .level == "emergency" {
        .severity = 0
      } else if .level == "alert" {
        .severity = 1
      } else if .level == "critical" {
        .severity = 2
      } else if .level == "error" {
        .severity = 3
      } else if .level == "warning" {
        .severity = 4
      } else if .level == "notice" {
        .severity = 5
      } else if .level == "info" {
        .severity = 6
      } else if .level == "debug" {
        .severity = 7
      }
  auditd:
    inputs:
      - "route_logs.auditd"
    type: "remap"
    source: |
      .facility = 3
      .pid = 1
      .app = "auditd"
      .severity = 6

sinks:
  prometheus_exporter:
    type: "prometheus_exporter"
    inputs:
      - "vector_metrics"
    address: "0.0.0.0:9100"
  fluentbit:
    type: http
    inputs:
      - "journal_logs"
      - "auditd"
    uri: "http://localhost:9880"
    encoding:
      codec: json
    buffer:
      when_full: drop_newest
  splunk_container_logs:
    type: "splunk_hec_logs"
    inputs: 
      - "kubernetes_infrastructure"
      - "kubernetes_application"
    endpoint: "https://localhost:8088"
    default_token: "$SPLUNK_HEC_TOKEN"
    encoding:
      codec: "raw_message"
    tls:
      verify_certificate: false
    source: "{{ orgid }}:{{ family }}:{{ app }}:{{ service }}:{{ env }}"
    sourcetype: "docker:company"
    timestamp_key: "ts"
    index: "{{ orgid }}"
    indexed_fields: 
      - "log_type"
      - "app"
      - "clustername"
      - "datacenter"
      - "container_image"
      - "service"
      - "env"
      - "family"
      - "namespace"
      - "pod_name"
      - "pod_id"
      - "pod_owner"
    buffer:
      type: "disk"
      max_size: 1000000000
  splunk_kube_audit:
    type: "splunk_hec_logs"
    inputs:
      - "kubernetes_audit"
    endpoint: "https://localhost:8088"
    default_token: "$SPLUNK_HEC_TOKEN"
    encoding:
      codec: "raw_message"
    tls:
      verify_certificate: false
    source: "company_openshift_audit_logs"
    sourcetype: "company:k8s:audit"
    index: "{{ orgid }}"
    indexed_fields:
      - "user"
      - "verb"
      - "clustername"
      - "sourceIPs"
      - "status_code"
      - "log_type"
    timestamp_key: "ts"
    buffer:
      type: "disk"
      max_size: 1000000000
api:
  enabled: true
  address: "0.0.0.0:8686"

Version

vector 0.36.0 (x86_64-unknown-linux-musl a5e48bb 2024-02-13 14:43:11.911392615)

Debug Output

No response

Example Data

No response

Additional Context

We are running on OpenShift.

References

No response

jszwedko commented 1 day ago

Hi @kefiras ! I think this is a duplicate of https://github.com/vectordotdev/vector/issues/18346 so I'll close this one, but let me know if you disagree!
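For anyone hitting the same crash before that issue is resolved: since the failure comes from the disk buffer writer ("failed to encode record: BufferTooSmall"), one possible stopgap, offered here as an untested sketch rather than a confirmed fix, is to switch the affected sinks to a memory buffer, which avoids the on-disk record encoding at the cost of durability across restarts:

buffer:
  type: "memory"
  max_events: 10000     # hypothetical size; tune for your throughput and event size
  when_full: "block"    # or "drop_newest" if backpressure on upstream senders is not acceptable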