open-telemetry / opentelemetry-helm-charts

OpenTelemetry Helm Charts
https://opentelemetry.io
Apache License 2.0

opentelemetry-collector fails to deploy on IPv6 EKS cluster #816

Open DavidS-ovm opened 1 year ago

DavidS-ovm commented 1 year ago

I've followed the instructions on the Honeycomb blog, which was pretty easy, and was slightly surprised when the deployed collectors failed with:

Error: invalid configuration: receivers::jaeger: invalid port number for the gRPC endpoint: endpoint is not formatted correctly: address 2a05:d01c:997:3200:abcd::1:14250: too many colons in address                                                                                                                                                       

I've eventually got it to work with the following Terraform, which should also be easy to translate into a regular values.yaml (a rough YAML equivalent is sketched after the two resources below):

resource "helm_release" "otelcol-node" {
  name       = "otelcol-node"
  repository = "https://open-telemetry.github.io/opentelemetry-helm-charts"
  chart      = "opentelemetry-collector"
  timeout    = 60
  values = [yamlencode({
    mode = "daemonset"
    resources = {
      requests = {
        cpu    = "100m"
        memory = "128Mi"
      }
    }
    config = {
      exporters = {
        otlp = {
          endpoint = "api.honeycomb.io:443"
          headers = {
            "X-Honeycomb-Team"    = var.honeycomb_api_key
            "X-Honeycomb-Dataset" = "kubernetes-metrics"
          }
        }
      }
      receivers = {
        jaeger = null
        otlp = {
          protocols = {
            grpc = {
              endpoint = "[$${env:MY_POD_IP}]:4317"
            }
            http = {
              endpoint = "[$${env:MY_POD_IP}]:4318"
            }
          }
        }
        prometheus = null
        zipkin     = null
      }
      service = {
        telemetry = {
          metrics = {
            address = "[$${env:MY_POD_IP}]:8888"
          }
        }
        pipelines = {
          traces = {
            receivers = ["otlp"]
            exporters = ["otlp"]
          }
          metrics = {
            receivers = ["otlp"]
            exporters = ["otlp"]
          }
          logs = {
            receivers = ["otlp"]
            exporters = ["otlp"]
          }
        }
      }
    }
    presets = {
      hostMetrics          = { enabled = true }
      kubeletMetrics       = { enabled = true }
      kubernetesAttributes = { enabled = true }
      logsCollection       = { enabled = true }
    }
  })]
}

resource "helm_release" "otelcol-cluster" {
  name       = "otelcol-cluster"
  repository = "https://open-telemetry.github.io/opentelemetry-helm-charts"
  chart      = "opentelemetry-collector"
  timeout    = 60
  values = [yamlencode({
    mode = "deployment"
    resources = {
      requests = {
        cpu    = "100m"
        memory = "128Mi"
      }
    }
    config = {
      exporters = {
        otlp = {
          endpoint = "api.honeycomb.io:443"
          headers = {
            "X-Honeycomb-Team"    = var.honeycomb_api_key
            "X-Honeycomb-Dataset" = "kubernetes-metrics"
          }
        }
      }
      receivers = {
        jaeger = null
        otlp = {
          protocols = {
            grpc = {
              endpoint = "[$${env:MY_POD_IP}]:4317"
            }
            http = {
              endpoint = "[$${env:MY_POD_IP}]:4318"
            }
          }
        }
        prometheus = null
        zipkin     = null
      }
      service = {
        telemetry = {
          metrics = {
            address = "[$${env:MY_POD_IP}]:8888"
          }
        }
        pipelines = {
          traces = {
            receivers = ["otlp"]
            exporters = ["otlp"]
          }
          metrics = {
            receivers = ["otlp"]
            exporters = ["otlp"]
          }
          logs = {
            receivers = ["otlp"]
            exporters = ["otlp"]
          }
        }
      }
    }
    presets = {
      clusterMetrics       = { enabled = true }
      kubernetesAttributes = { enabled = true }
      kubernetesEvents     = { enabled = true }
    }
  })]
}
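
For reference, the otelcol-node (daemonset) release above corresponds roughly to the following values.yaml; it is a direct transcription of the yamlencode block, with the Honeycomb API key left as a placeholder:

mode: daemonset
resources:
  requests:
    cpu: 100m
    memory: 128Mi
config:
  exporters:
    otlp:
      endpoint: api.honeycomb.io:443
      headers:
        X-Honeycomb-Team: <your Honeycomb API key>
        X-Honeycomb-Dataset: kubernetes-metrics
  receivers:
    jaeger: null
    prometheus: null
    zipkin: null
    otlp:
      protocols:
        grpc:
          endpoint: "[${env:MY_POD_IP}]:4317"
        http:
          endpoint: "[${env:MY_POD_IP}]:4318"
  service:
    telemetry:
      metrics:
        address: "[${env:MY_POD_IP}]:8888"
    pipelines:
      traces:
        receivers: [otlp]
        exporters: [otlp]
      metrics:
        receivers: [otlp]
        exporters: [otlp]
      logs:
        receivers: [otlp]
        exporters: [otlp]
presets:
  hostMetrics:
    enabled: true
  kubeletMetrics:
    enabled: true
  kubernetesAttributes:
    enabled: true
  logsCollection:
    enabled: true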

When the IP address ends in ::, I'm still getting a single agent stuck crashlooping on:

Error: failed to get config: cannot resolve the configuration: expanding ${env:MY_POD_IP}, expected convertable to string value type, got map["2a05:d01c:997:3200:3bb0:":<nil>](map[string]interface {})

TylerHelmuth commented 1 year ago

@DavidS-ovm it looks like you solved your issue by setting .Values.config.receivers.jaeger to null and then not including it in the traces pipeline, but it is not clear to me why your install got an error. By default the Helm chart configures the jaeger receiver like this:

  receivers:
    jaeger:
      protocols:
        grpc:
          endpoint: ${env:MY_POD_IP}:14250
        thrift_http:
          endpoint: ${env:MY_POD_IP}:14268
        thrift_compact:
          endpoint: ${env:MY_POD_IP}:6831

and it appears that env:MY_POD_IP is formatted correctly for the other receivers.

TylerHelmuth commented 1 year ago

Our CI does not test this chart in an IPv6 environment, so I'm guessing that's the issue.

DavidS-ovm commented 1 year ago

@DavidS-ovm it looks like you solved your issue by setting .Values.config.receivers.jaeger to null and then not including it in the traces pipeline, but it is not clear to me why your install got an error. By default the Helm chart configures the jaeger receiver like this:

  receivers:
    jaeger:
      protocols:
        grpc:
          endpoint: ${env:MY_POD_IP}:14250
        thrift_http:
          endpoint: ${env:MY_POD_IP}:14268
        thrift_compact:
          endpoint: ${env:MY_POD_IP}:6831

and it appears that env:MY_POD_IP is formatted correctly for the other receivers.

The collector aborts after encountering the first error; everything else also needed to be fixed. The correct syntax for IP:PORT in IPv6 is [abcd:...:ef09]:PORT (note the extra square brackets).
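
Applied to the default jaeger receiver quoted above, the bracketed form would look roughly like this (same ports, just bracketed and quoted so YAML keeps the whole value a plain string):

  receivers:
    jaeger:
      protocols:
        grpc:
          endpoint: "[${env:MY_POD_IP}]:14250"
        thrift_http:
          endpoint: "[${env:MY_POD_IP}]:14268"
        thrift_compact:
          endpoint: "[${env:MY_POD_IP}]:6831"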

TylerHelmuth commented 1 year ago

I don't think we can do that without breaking IPv4 support by default.

DavidS-ovm commented 1 year ago

Yeah, I was pretty surprised that k8s doesn't provide any indication in status.podIP whether it is a v4 or v6 address.
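
For context, MY_POD_IP typically comes straight from the Downward API in the pod spec, which passes through whatever status.podIP happens to be, v4 or v6 alike, along these lines:

env:
  - name: MY_POD_IP
    valueFrom:
      fieldRef:
        apiVersion: v1
        fieldPath: status.podIP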

kentquirk commented 1 year ago

This sounds like it might point to a bug in the hostport parsing -- Go's SplitHostPort function knows how to do this.

TylerHelmuth commented 1 year ago

@kentquirk the values.yaml is building the endpoint string itself, with the assumption that the k8s Downward API sets MY_POD_IP with a usable IP address.

DavidS-ovm commented 1 year ago

... with the assumption that the k8s Downward API sets MY_POD_IP with a usable IP address.

... with a usable IPv4 address.

DavidS-ovm commented 1 year ago

For future reference, this is the "minimal" config I settled on to deploy a k8s metrics/logs-only collector:

resource "helm_release" "otelcol-node" {
  name       = "otelcol-node"
  repository = "https://open-telemetry.github.io/opentelemetry-helm-charts"
  chart      = "opentelemetry-collector"
  timeout    = 60
  values = [yamlencode({
    mode = "daemonset"
    resources = {
      requests = {
        cpu    = "100m"
        memory = "128Mi"
      }
    }
    config = {
      exporters = {
        otlp = {
          endpoint = "api.honeycomb.io:443"
          headers = {
            "X-Honeycomb-Team"    = var.honeycomb_api_key
            "X-Honeycomb-Dataset" = "kubernetes-metrics"
          }
        }
      }
      receivers = {
        jaeger     = null
        otlp       = null
        prometheus = null
        zipkin     = null
        hostmetrics = {
          collection_interval = "60s"
        }
        kubeletstats = {
          collection_interval = "60s"
        }
      }
      service = {
        telemetry = {
          metrics = {
            address = "localhost:8888"
          }
        }
        pipelines = {
          traces = null
          metrics = {
            receivers = []
            exporters = ["otlp"]
          }
          logs = {
            receivers = []
            exporters = ["otlp"]
          }
        }
      }
    }
    presets = {
      hostMetrics          = { enabled = true }
      kubeletMetrics       = { enabled = true }
      kubernetesAttributes = { enabled = true }
      logsCollection       = { enabled = true }
    }
  })]
}

resource "helm_release" "otelcol-cluster" {
  name       = "otelcol-cluster"
  repository = "https://open-telemetry.github.io/opentelemetry-helm-charts"
  chart      = "opentelemetry-collector"
  timeout    = 60
  values = [yamlencode({
    mode = "deployment"
    resources = {
      requests = {
        cpu    = "100m"
        memory = "128Mi"
      }
    }
    config = {
      exporters = {
        otlp = {
          endpoint = "api.honeycomb.io:443"
          headers = {
            "X-Honeycomb-Team"    = var.honeycomb_api_key
            "X-Honeycomb-Dataset" = "kubernetes-metrics"
          }
        }
      }
      receivers = {
        jaeger     = null
        otlp       = null
        prometheus = null
        zipkin     = null
        k8s_cluster = {
          collection_interval = "60s"
        }
      }
      service = {
        telemetry = {
          metrics = {
            address = "localhost:8888"
          }
        }
        pipelines = {
          traces = null
          metrics = {
            receivers = []
            exporters = ["otlp"]
          }
          logs = {
            receivers = []
            exporters = ["otlp"]
          }
        }
      }
    }
    presets = {
      clusterMetrics       = { enabled = true }
      kubernetesAttributes = { enabled = true }
      kubernetesEvents     = { enabled = true }
    }
  })]
}

darwin67 commented 7 months ago

Found this issue after encountering something similar, but also slightly different.

For future reference for people coming to this: simply replacing ${env:MY_POD_IP}:<port> with the IPv6 format [${env:MY_POD_IP}]:<port> can result in occasional failures like this:

Error: failed to resolve config: cannot resolve the configuration: expanding ${env:MY_POD_IP}, expected convertable to string value type, got map["2600:1f16:96c:6866:53c0:":<nil>](map[string]interface {})
2023/12/06 19:47:24 collector server run finished with error: failed to resolve config: cannot resolve the configuration: expanding ${env:MY_POD_IP}, expected convertable to string value type, got map["2600:1f16:96c:6866:53c0:":<nil>](map[string]interface {})

And the pods will go into crash loops.

It's unclear why the expansion logic sees MY_POD_IP as a map instead of a string. For people running an IPv6 cluster, it's probably easier to just replace them all with [::]:<port>, which is similar to 0.0.0.0:<port> in IPv4.

Or maybe just :<port>, since Go should know how to treat that as both IPv4 and IPv6.
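
In values.yaml terms, that wildcard approach would look something like this sketch ([::] binds every interface on an IPv6 stack, much as 0.0.0.0 does on IPv4):

config:
  receivers:
    otlp:
      protocols:
        grpc:
          endpoint: "[::]:4317"
        http:
          endpoint: "[::]:4318"
  service:
    telemetry:
      metrics:
        address: "[::]:8888"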

DavidS-ovm commented 7 months ago

It's unclear why the expansion logic sees MY_POD_IP as a map instead of a string.

Because YAML treats [, :, and ] as load-bearing characters that get parsed as structure. The value needs to be quoted (probably in the template).
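
A minimal illustration, assuming the expansion re-parses the env value as YAML on its own (the address is the one from the error above):

# An IPv6 address that ends in "::" leaves a bare ":" at the end of the line,
# which YAML treats as a mapping indicator, so the value parses as a map with a
# null value rather than a string:
2a05:d01c:997:3200:3bb0::          # => {"2a05:d01c:997:3200:3bb0:": null}
# An address that does not end in "::" has no trailing indicator and stays a
# plain string, which is why only some of the pods hit the error.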

darwin67 commented 6 months ago

It was quoted, but I still got that error, and it's not consistent. Anyway, :<port> works fine, so it might be better to change the default to that format.

Vinaum8 commented 2 months ago

I'll try with => "'${env:MY_POD_IP}':8888"

Vinaum8 commented 2 months ago

  ## The chart only includes the loggingexporter by default
  ## If you want to send your data somewhere you need to 
  ## configure an exporter, such as the otlpexporter
  # Base collector configuration.
  config: 
    exporters: 
      otlp: 
        endpoint: "tempo-distributor.observability.svc.cluster.local:4317"
        tls: 
          insecure: true
    extensions:
      # The health_check extension is mandatory for this chart.
      # Without the health_check extension the collector will fail the readiness and liveliness probes.
      # The health_check extension can be modified, but should never be removed.
      health_check:
        endpoint: "0.0.0.0:13133"
      memory_ballast: {}
    processors:
      batch: {}
      # If set to null, will be overridden with values based on k8s resource limits
      memory_limiter: null
    receivers:
      jaeger:
        protocols:
          grpc:
            endpoint: "0.0.0.0:14250"
          thrift_http:
            endpoint: "0.0.0.0:14268"
          thrift_compact:
            endpoint: "0.0.0.0:6831"
      otlp:
        protocols:
          grpc:
            endpoint: "0.0.0.0:4317"
          http:
            endpoint: "0.0.0.0:4318"
      prometheus:
        config:
          scrape_configs:
            - job_name: opentelemetry-collector
              scrape_interval: 10s
              static_configs:
                - targets:
                    - "0.0.0.0:8888"
      zipkin:
        endpoint: "0.0.0.0:9411"
    service:
      telemetry:
        metrics:
          address: "0.0.0.0:8888"
      extensions:
        - health_check
        - memory_ballast
      pipelines:
        logs:
          exporters:
            - debug
          processors:
            - memory_limiter
            - batch
          receivers:
            - otlp
        metrics:
          exporters:
            - debug
          processors:
            - memory_limiter
            - batch
          receivers:
            - otlp
            - prometheus
        traces:
          exporters:
            - debug
          processors:
            - memory_limiter
            - batch
          receivers:
            - otlp
            - jaeger
            - zipkin

These Helm values work for me on an IPv6 EKS cluster.