open-telemetry / opentelemetry-collector

Collector crash v0.29.0 #3533

Closed vishiy closed 3 years ago

vishiy commented 3 years ago

Describe the bug
I took the latest v0.29.0 release and was running a prometheus receiver --> otlp exporter pipeline, and am seeing random crashes. See below for two of the stacks I saw in two crashes within a few minutes.

Environment "Ubuntu 20.04" go version go1.15.13 linux/amd64

Below is the stack for the first crash:

panic: runtime error: index out of range [0] with length 0

goroutine 151 [running]:
go.opentelemetry.io/collector/internal/data/protogen/metrics/v1.(*InstrumentationLibraryMetrics).MarshalToSizedBuffer(0xc06f548600, 0xc080ed0000, 0xc1514, 0xc1514, 0xc1514, 0x7fa1ea5b4560, 0xc080ed0000)
  /home/runner/go/pkg/mod/go.opentelemetry.io/collector@v0.29.0/internal/data/protogen/metrics/v1/metrics.pb.go:1895 +0x251
go.opentelemetry.io/collector/internal/data/protogen/metrics/v1.(*ResourceMetrics).MarshalToSizedBuffer(0xc06f5485c0, 0xc080ed0000, 0xc1514, 0xc1514, 0x45120c, 0xc2000, 0x29599c0)
  /home/runner/go/pkg/mod/go.opentelemetry.io/collector@v0.29.0/internal/data/protogen/metrics/v1/metrics.pb.go:1848 +0xc5
go.opentelemetry.io/collector/internal/data/protogen/collector/metrics/v1.(*ExportMetricsServiceRequest).MarshalToSizedBuffer(0xc078096ba0, 0xc080ed0000, 0xc1514, 0xc1514, 0x40d725, 0x2be4720, 0x2f724c0)
  /home/runner/go/pkg/mod/go.opentelemetry.io/collector@v0.29.0/internal/data/protogen/collector/metrics/v1/metrics_service.pb.go:257 +0xbf
go.opentelemetry.io/collector/internal/data/protogen/collector/metrics/v1.(*ExportMetricsServiceRequest).Marshal(0xc078096ba0, 0x2f724c0, 0xc078096ba0, 0x7fa1a850e6d8, 0xc078096ba0, 0xc000050c01)
  /home/runner/go/pkg/mod/go.opentelemetry.io/collector@v0.29.0/internal/data/protogen/collector/metrics/v1/metrics_service.pb.go:237 +0x7a
google.golang.org/protobuf/internal/impl.legacyMarshal(0x3617120, 0xc0738faaa0, 0x0, 0x0, 0x0, 0x0, 0x7fa1a850be20, 0x2f724c0, 0xc078096ba0, 0xc06bc7cf80, ...)
  /home/runner/go/pkg/mod/google.golang.org/protobuf@v1.26.0/internal/impl/legacy_message.go:404 +0xb0
google.golang.org/protobuf/proto.MarshalOptions.marshal(0xc073000001, 0x0, 0x0, 0x0, 0x3617120, 0xc0738faaa0, 0xc0738faaa0, 0x3617120, 0xc0738faaa0, 0x2f724c0, ...)
  /home/runner/go/pkg/mod/google.golang.org/protobuf@v1.26.0/proto/encode.go:163 +0x2c7
google.golang.org/protobuf/proto.MarshalOptions.MarshalAppend(0x2000001, 0x0, 0x0, 0x0, 0x3589860, 0xc0738faaa0, 0x2f724c0, 0x7fa1a850bdf0, 0xc0557be0c0, 0x0, ...)
  /home/runner/go/pkg/mod/google.golang.org/protobuf@v1.26.0/proto/encode.go:122 +0x98
github.com/golang/protobuf/proto.marshalAppend(0x0, 0x0, 0x0, 0x7fa1a850bdf0, 0xc078096ba0, 0x100c06914d000, 0x0, 0x0, 0x0, 0x4, ...)
  /home/runner/go/pkg/mod/github.com/golang/protobuf@v1.5.2/proto/wire.go:40 +0xc7
github.com/golang/protobuf/proto.Marshal(...)
  /home/runner/go/pkg/mod/github.com/golang/protobuf@v1.5.2/proto/wire.go:23
google.golang.org/grpc/encoding/proto.codec.Marshal(0x2f724c0, 0xc078096ba0, 0x20301d, 0x0, 0x417187, 0x20301d, 0x20301d)
  /home/runner/go/pkg/mod/google.golang.org/grpc@v1.38.0/encoding/proto/proto.go:45 +0x6d
google.golang.org/grpc.encode(0x7fa1a850bd80, 0x4a8ae40, 0x2f724c0, 0xc078096ba0, 0xc0760b8800, 0x0, 0xc000050c00, 0x0, 0xc0738faa50)
  /home/runner/go/pkg/mod/google.golang.org/grpc@v1.38.0/rpc_util.go:593 +0x52
google.golang.org/grpc.prepareMsg(0x2f724c0, 0xc078096ba0, 0x7fa1a850bd80, 0x4a8ae40, 0x0, 0x0, 0x35cc3c0, 0xc00014d220, 0x3f, 0x0, ...)
  /home/runner/go/pkg/mod/google.golang.org/grpc@v1.38.0/stream.go:1590 +0x107
google.golang.org/grpc.(*clientStream).SendMsg(0xc01a143b00, 0x2f724c0, 0xc078096ba0, 0x0, 0x0)
  /home/runner/go/pkg/mod/google.golang.org/grpc@v1.38.0/stream.go:771 +0x165
google.golang.org/grpc.invoke(0x35e34c0, 0xc01aec8a20, 0x3175865, 0x3f, 0x2f724c0, 0xc078096ba0, 0x2f52480, 0x4a8ae40, 0xc0001bc000, 0xc0760b86a0, ...)
  /home/runner/go/pkg/mod/google.golang.org/grpc@v1.38.0/call.go:70 +0xe4
google.golang.org/grpc.(*ClientConn).Invoke(0xc0001bc000, 0x35e34c0, 0xc01aec8a20, 0x3175865, 0x3f, 0x2f724c0, 0xc078096ba0, 0x2f52480, 0x4a8ae40, 0xc0008062c0, ...)
  /home/runner/go/pkg/mod/google.golang.org/grpc@v1.38.0/call.go:37 +0x1b3
go.opentelemetry.io/collector/internal/data/protogen/collector/metrics/v1.(*metricsServiceClient).Export(0xc000804050, 0x35e34c0, 0xc01aec8a20, 0xc078096ba0, 0xc0008062c0, 0x1, 0x1, 0xc06bc7d600, 0x46f24b, 0xc01780d818)
  /home/runner/go/pkg/mod/go.opentelemetry.io/collector@v0.29.0/internal/data/protogen/collector/metrics/v1/metrics_service.pb.go:177 +0xbc
go.opentelemetry.io/collector/internal/pdatagrpc.(*metricsClient).Export(0xc0008062a0, 0x35e34c0, 0xc01aec8a20, 0xc078096ba0, 0xc0008062c0, 0x1, 0x1, 0xc0738faa30, 0xc01780d810, 0x12a05ef44)
  /home/runner/go/pkg/mod/go.opentelemetry.io/collector@v0.29.0/internal/pdatagrpc/metrics.go:60 +0x7c
go.opentelemetry.io/collector/exporter/otlpexporter.(*grpcSender).exportMetrics(0xc000808120, 0x35e34c0, 0xc01aec8a20, 0xc078096ba0, 0xc01aec8a20, 0x35e3500)
  /home/runner/go/pkg/mod/go.opentelemetry.io/collector@v0.29.0/exporter/otlpexporter/otlp.go:131 +0x95
go.opentelemetry.io/collector/exporter/otlpexporter.(*exporter).pushMetrics(0xc0007e00b0, 0x35e34c0, 0xc01aec8a20, 0xc078096ba0, 0x4a31cb, 0x35e3500)
  /home/runner/go/pkg/mod/go.opentelemetry.io/collector@v0.29.0/exporter/otlpexporter/otlp.go:74 +0x51
go.opentelemetry.io/collector/exporter/exporterhelper.(*metricsRequest).export(0xc0760b8640, 0x35e34c0, 0xc01aec8a20, 0x35e34c0, 0xc01aec8a20)
  /home/runner/go/pkg/mod/go.opentelemetry.io/collector@v0.29.0/exporter/exporterhelper/metrics.go:54 +0x47
go.opentelemetry.io/collector/exporter/exporterhelper.(*timeoutSender).send(0xc0007d0310, 0x35f6540, 0xc0760b8640, 0x0, 0x0)
  /home/runner/go/pkg/mod/go.opentelemetry.io/collector@v0.29.0/exporter/exporterhelper/common.go:229 +0x88
go.opentelemetry.io/collector/exporter/exporterhelper.(*retrySender).send(0xc0005b8000, 0x35f6540, 0xc0760b8640, 0x30e2f93, 0x8)
  /home/runner/go/pkg/mod/go.opentelemetry.io/collector@v0.29.0/exporter/exporterhelper/queued_retry.go:274 +0x299
go.opentelemetry.io/collector/exporter/exporterhelper.(*metricsSenderWithObservability).send(0xc0005d4020, 0x35f6540, 0xc0760b8640, 0x35f6540, 0xc0760b8640)
  /home/runner/go/pkg/mod/go.opentelemetry.io/collector@v0.29.0/exporter/exporterhelper/metrics.go:120 +0xc7
go.opentelemetry.io/collector/exporter/exporterhelper.(*queuedRetrySender).start.func1(0x2cfe880, 0xc0760b8640)
  /home/runner/go/pkg/mod/go.opentelemetry.io/collector@v0.29.0/exporter/exporterhelper/queued_retry.go:154 +0x75
github.com/jaegertracing/jaeger/pkg/queue.ConsumerFunc.Consume(0xc0008062d0, 0x2cfe880, 0xc0760b8640)
  /home/runner/go/pkg/mod/github.com/jaegertracing/jaeger@v1.23.0/pkg/queue/bounded_queue.go:104 +0x3a
github.com/jaegertracing/jaeger/pkg/queue.(*BoundedQueue).StartConsumersWithFactory.func1(0xc00080c0d0, 0xc00014c000)
  /home/runner/go/pkg/mod/github.com/jaegertracing/jaeger@v1.23.0/pkg/queue/bounded_queue.go:83 +0xe9
created by github.com/jaegertracing/jaeger/pkg/queue.(*BoundedQueue).StartConsumersWithFactory
  /home/runner/go/pkg/mod/github.com/jaegertracing/jaeger@v1.23.0/pkg/queue/bounded_queue.go:73 +0xc5


Below is the stack for the second crash:

panic: runtime error: slice bounds out of range [-850:]

goroutine 184 [running]:
go.opentelemetry.io/collector/internal/data/protogen/metrics/v1.(*Metric).MarshalToSizedBuffer(0xc007966900, 0xc082952000, 0x6622, 0x3233ce, 0x291f9, 0x6623, 0x0)
  /home/runner/go/pkg/mod/go.opentelemetry.io/collector@v0.29.0/internal/data/protogen/metrics/v1/metrics.pb.go:1943 +0x3b0
go.opentelemetry.io/collector/internal/data/protogen/metrics/v1.(*InstrumentationLibraryMetrics).MarshalToSizedBuffer(0xc0797e6440, 0xc082952000, 0x2262f5, 0x3233ce, 0xd7, 0x2262fa, 0x0)
  /home/runner/go/pkg/mod/go.opentelemetry.io/collector@v0.29.0/internal/data/protogen/metrics/v1/metrics.pb.go:1895 +0xc5
go.opentelemetry.io/collector/internal/data/protogen/metrics/v1.(*ResourceMetrics).MarshalToSizedBuffer(0xc0797e63c0, 0xc082952000, 0x2262f5, 0x3233ce, 0xfcc5a, 0x2262f6, 0x0)
  /home/runner/go/pkg/mod/go.opentelemetry.io/collector@v0.29.0/internal/data/protogen/metrics/v1/metrics.pb.go:1848 +0xc5
go.opentelemetry.io/collector/internal/data/protogen/collector/metrics/v1.(*ExportMetricsServiceRequest).MarshalToSizedBuffer(0xc036ad19c0, 0xc082952000, 0x3233ce, 0x3233ce, 0x40d725, 0x2be4720, 0x2f724c0)
  /home/runner/go/pkg/mod/go.opentelemetry.io/collector@v0.29.0/internal/data/protogen/collector/metrics/v1/metrics_service.pb.go:257 +0xbf
go.opentelemetry.io/collector/internal/data/protogen/collector/metrics/v1.(*ExportMetricsServiceRequest).Marshal(0xc036ad19c0, 0x2f724c0, 0xc036ad19c0, 0x7fc9fc10b120, 0xc036ad19c0, 0xc000381c01)
  /home/runner/go/pkg/mod/go.opentelemetry.io/collector@v0.29.0/internal/data/protogen/collector/metrics/v1/metrics_service.pb.go:237 +0x7a
google.golang.org/protobuf/internal/impl.legacyMarshal(0x3617120, 0xc002102bb0, 0x0, 0x0, 0x0, 0x0, 0x7fc9fc10b0e0, 0x2f724c0, 0xc036ad19c0, 0xc046b84f80, ...)
  /home/runner/go/pkg/mod/google.golang.org/protobuf@v1.26.0/internal/impl/legacy_message.go:404 +0xb0
google.golang.org/protobuf/proto.MarshalOptions.marshal(0xc002000001, 0x0, 0x0, 0x0, 0x3617120, 0xc002102bb0, 0xc002102bb0, 0x3617120, 0xc002102bb0, 0x2f724c0, ...)
  /home/runner/go/pkg/mod/google.golang.org/protobuf@v1.26.0/proto/encode.go:163 +0x2c7
google.golang.org/protobuf/proto.MarshalOptions.MarshalAppend(0x2000001, 0x0, 0x0, 0x0, 0x3589860, 0xc002102bb0, 0x2f724c0, 0x7fc9fc10b0b0, 0xc024465b90, 0x0, ...)
  /home/runner/go/pkg/mod/google.golang.org/protobuf@v1.26.0/proto/encode.go:122 +0x98
github.com/golang/protobuf/proto.marshalAppend(0x0, 0x0, 0x0, 0x7fc9fc10b0b0, 0xc036ad19c0, 0x1000000000200, 0xc018defd7e, 0x1, 0x0, 0x4, ...)
  /home/runner/go/pkg/mod/github.com/golang/protobuf@v1.5.2/proto/wire.go:40 +0xc7
github.com/golang/protobuf/proto.Marshal(...)
  /home/runner/go/pkg/mod/github.com/golang/protobuf@v1.5.2/proto/wire.go:23
google.golang.org/grpc/encoding/proto.codec.Marshal(0x2f724c0, 0xc036ad19c0, 0x20300f, 0x0, 0x43c61c, 0x20300f, 0x20300f)
  /home/runner/go/pkg/mod/google.golang.org/grpc@v1.38.0/encoding/proto/proto.go:45 +0x6d
google.golang.org/grpc.encode(0x7fc9fc10b040, 0x4a8ae40, 0x2f724c0, 0xc036ad19c0, 0xc03e6d8f80, 0x0, 0xc000381c00, 0x0, 0xc002102b50)
  /home/runner/go/pkg/mod/google.golang.org/grpc@v1.38.0/rpc_util.go:593 +0x52
google.golang.org/grpc.prepareMsg(0x2f724c0, 0xc036ad19c0, 0x7fc9fc10b040, 0x4a8ae40, 0x0, 0x0, 0x35cc3c0, 0xc000152000, 0x3f, 0x0, ...)
  /home/runner/go/pkg/mod/google.golang.org/grpc@v1.38.0/stream.go:1590 +0x107
google.golang.org/grpc.(*clientStream).SendMsg(0xc01c5b4240, 0x2f724c0, 0xc036ad19c0, 0x0, 0x0)
  /home/runner/go/pkg/mod/google.golang.org/grpc@v1.38.0/stream.go:771 +0x165
google.golang.org/grpc.invoke(0x35e34c0, 0xc03db26840, 0x3175865, 0x3f, 0x2f724c0, 0xc036ad19c0, 0x2f52480, 0x4a8ae40, 0xc000479500, 0xc03e6d8e20, ...)
  /home/runner/go/pkg/mod/google.golang.org/grpc@v1.38.0/call.go:70 +0xe4
google.golang.org/grpc.(*ClientConn).Invoke(0xc000479500, 0x35e34c0, 0xc03db26840, 0x3175865, 0x3f, 0x2f724c0, 0xc036ad19c0, 0x2f52480, 0x4a8ae40, 0xc0008846a0, ...)
  /home/runner/go/pkg/mod/google.golang.org/grpc@v1.38.0/call.go:37 +0x1b3
go.opentelemetry.io/collector/internal/data/protogen/collector/metrics/v1.(*metricsServiceClient).Export(0xc000011780, 0x35e34c0, 0xc03db26840, 0xc036ad19c0, 0xc0008846a0, 0x1, 0x1, 0xc0355bd600, 0x46f24b, 0xc043116a58)
  /home/runner/go/pkg/mod/go.opentelemetry.io/collector@v0.29.0/internal/data/protogen/collector/metrics/v1/metrics_service.pb.go:177 +0xbc
go.opentelemetry.io/collector/internal/pdatagrpc.(*metricsClient).Export(0xc000884680, 0x35e34c0, 0xc03db26840, 0xc036ad19c0, 0xc0008846a0, 0x1, 0x1, 0xc002102b30, 0xc043116a50, 0x12a05f00c)
  /home/runner/go/pkg/mod/go.opentelemetry.io/collector@v0.29.0/internal/pdatagrpc/metrics.go:60 +0x7c
go.opentelemetry.io/collector/exporter/otlpexporter.(*grpcSender).exportMetrics(0xc0003c4c60, 0x35e34c0, 0xc03db26840, 0xc036ad19c0, 0xc03db26840, 0x35e3500)
  /home/runner/go/pkg/mod/go.opentelemetry.io/collector@v0.29.0/exporter/otlpexporter/otlp.go:131 +0x95
go.opentelemetry.io/collector/exporter/otlpexporter.(*exporter).pushMetrics(0xc000827de0, 0x35e34c0, 0xc03db26840, 0xc036ad19c0, 0x4a31cb, 0x35e3500)
  /home/runner/go/pkg/mod/go.opentelemetry.io/collector@v0.29.0/exporter/otlpexporter/otlp.go:74 +0x51
go.opentelemetry.io/collector/exporter/exporterhelper.(*metricsRequest).export(0xc03e6d8dc0, 0x35e34c0, 0xc03db26840, 0x35e34c0, 0xc03db26840)
  /home/runner/go/pkg/mod/go.opentelemetry.io/collector@v0.29.0/exporter/exporterhelper/metrics.go:54 +0x47
go.opentelemetry.io/collector/exporter/exporterhelper.(*timeoutSender).send(0xc00005b900, 0x35f6540, 0xc03e6d8dc0, 0x0, 0x0)
  /home/runner/go/pkg/mod/go.opentelemetry.io/collector@v0.29.0/exporter/exporterhelper/common.go:229 +0x88
go.opentelemetry.io/collector/exporter/exporterhelper.(*retrySender).send(0xc0003c4a20, 0x35f6540, 0xc03e6d8dc0, 0x30e2f93, 0x8)
  /home/runner/go/pkg/mod/go.opentelemetry.io/collector@v0.29.0/exporter/exporterhelper/queued_retry.go:274 +0x299
go.opentelemetry.io/collector/exporter/exporterhelper.(*metricsSenderWithObservability).send(0xc00087da40, 0x35f6540, 0xc03e6d8dc0, 0x35f6540, 0xc03e6d8dc0)
  /home/runner/go/pkg/mod/go.opentelemetry.io/collector@v0.29.0/exporter/exporterhelper/metrics.go:120 +0xc7
go.opentelemetry.io/collector/exporter/exporterhelper.(*queuedRetrySender).start.func1(0x2cfe880, 0xc03e6d8dc0)
  /home/runner/go/pkg/mod/go.opentelemetry.io/collector@v0.29.0/exporter/exporterhelper/queued_retry.go:154 +0x75
github.com/jaegertracing/jaeger/pkg/queue.ConsumerFunc.Consume(0xc0008846b0, 0x2cfe880, 0xc03e6d8dc0)
  /home/runner/go/pkg/mod/github.com/jaegertracing/jaeger@v1.23.0/pkg/queue/bounded_queue.go:104 +0x3a
github.com/jaegertracing/jaeger/pkg/queue.(*BoundedQueue).StartConsumersWithFactory.func1(0xc000799360, 0xc0009674a0)
  /home/runner/go/pkg/mod/github.com/jaegertracing/jaeger@v1.23.0/pkg/queue/bounded_queue.go:83 +0xe9
created by github.com/jaegertracing/jaeger/pkg/queue.(*BoundedQueue).StartConsumersWithFactory
  /home/runner/go/pkg/mod/github.com/jaegertracing/jaeger@v1.23.0/pkg/queue/bounded_queue.go:73 +0xc5

bogdandrutu commented 3 years ago

/cc @dashpole @rakyll @Aneurysm9 @odeke-em @alolita

sincejune commented 3 years ago

I tried both v0.29.0 and the main branch. They both work well for me.

Maybe it's related to your configuration and data. Could you share your config? That would help with debugging.

vishiy commented 3 years ago

Nothing to do with the config. It's just a prom receiver and an otlp exporter. Just run it for a longer time. I see this in 0.28 as well. So far 0.27 seems to be running fine… Seems like a regression in marshalling…

Aneurysm9 commented 3 years ago

The stack trace definitely looks like an error during marshalling, but I'm perplexed as to how that is possible. The first trace seems to be pointing at this code:

    if len(m.Metrics) > 0 {
        for iNdEx := len(m.Metrics) - 1; iNdEx >= 0; iNdEx-- {
            {
                size, err := m.Metrics[iNdEx].MarshalToSizedBuffer(dAtA[:i])

Absent a race condition with another goroutine modifying m.Metrics, that last line (which is where the panic happened) should never be reached when len(m.Metrics) == 0.
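
For illustration, here is a minimal, hypothetical Go sketch of that kind of race (the `request`/`metric` types below are made-up stand-ins, not collector or pdata types). Assuming the usual gogo-generated scheme, Marshal first computes Size(), allocates a buffer of exactly that size, and MarshalToSizedBuffer then writes the fields backwards from the end of that buffer, so a concurrent mutation of the metrics slice in between can produce exactly the two panics reported above: an index out of range when an element disappears, or negative slice bounds when the data grows past the precomputed size.

    // Hypothetical sketch only: request/metric stand in for the generated message
    // types; nothing here is collector code.
    package main

    type metric struct{ payload []byte }

    type request struct{ metrics []metric }

    // size mimics the generated Size(): the number of bytes Marshal will allocate.
    func (r *request) size() int {
        n := 0
        for _, m := range r.metrics {
            n += len(m.payload)
        }
        return n
    }

    // marshalToSizedBuffer mimics the generated code: walk the slice in reverse and
    // copy each element back-to-front into the presized buffer.
    func (r *request) marshalToSizedBuffer(dAtA []byte) {
        i := len(dAtA)
        for idx := len(r.metrics) - 1; idx >= 0; idx-- {
            p := r.metrics[idx].payload // would panic "index out of range" if the slice had shrunk
            i -= len(p)                 // i goes negative if the data grew after size() ...
            copy(dAtA[i:], p)           // ... and this slicing then panics: slice bounds out of range [-N:]
        }
    }

    func main() {
        req := &request{metrics: []metric{{payload: make([]byte, 64)}}}

        // A second goroutine keeps mutating the same request with no synchronization,
        // standing in for whatever is still writing to shared data while it is exported.
        go func() {
            for {
                req.metrics = append(req.metrics, metric{payload: make([]byte, 64)})
            }
        }()

        // Export path: size first, then marshal; any mutation in between breaks the math.
        for {
            buf := make([]byte, req.size())
            req.marshalToSizedBuffer(buf)
        }
    }

Running a sketch like this under `go run -race` flags the conflicting accesses immediately, which would also be the quickest way to confirm whether shared data is being mutated while the exporter is marshalling it.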

bogdandrutu commented 3 years ago

@vishiy do you by any chance have multiple exporters (like the logging one) or multiple pipelines defined in the config?

vishiy commented 3 years ago

@bogdandrutu it's just one pipeline with a prom receiver and an otlp exporter. There are also batch and resource processors. I will share the config later today as I am traveling.

vishiy commented 3 years ago

Please find the config below:

exporters:
  otlp:
    endpoint: 127.0.0.1:55680
    insecure: true
    compression: "gzip"
processors:
  batch:
    send_batch_size: 8192
    timeout: 200ms
    send_batch_max_size: 10000
  resource:
    attributes:
    - key: cluster
      value: "$customResourceId"
      action: "upsert"
receivers:
    prometheus:
      config:
        scrape_configs:
        - job_name: node
          scheme: http
          scrape_interval: 30s
          kubernetes_sd_configs:
          - role: endpoints
            namespaces:
              names:
              - monitoring
          relabel_configs:
          - action: keep
            source_labels: [__meta_kubernetes_endpoints_name]
            regex: prom1-prometheus-node-exporter

        - job_name: kube-state-metrics
          scrape_interval: 30s
          static_configs:
          - targets: ['prom1-kube-state-metrics.monitoring.svc.cluster.local:8080']

        - job_name: kube-apiserver
          scrape_interval: 30s
          kubernetes_sd_configs:
          - role: endpoints
          scheme: https
          tls_config:
            ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
            insecure_skip_verify: true
          bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
          relabel_configs:
          - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
            action: keep
            regex: default;kubernetes;https

        - job_name: kube-proxy
          scrape_interval: 30s
          kubernetes_sd_configs:
          - role: pod
          relabel_configs:
          - action: keep
            source_labels: [__meta_kubernetes_namespace,__meta_kubernetes_pod_name]
            separator: '/'
            regex: 'kube-system/kube-proxy.+'
          - source_labels:
            - __address__
            action: replace
            target_label: __address__
            regex: (.+?)(\:\d+)?
            replacement: $$1:10249

        - job_name: cadvisor
          scheme: https
          metrics_path: /metrics/cadvisor
          scrape_interval: 30s
          tls_config:
            ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
            insecure_skip_verify: true
          bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
          kubernetes_sd_configs:
          - role: node

        - job_name: kube-dns
          scheme: http
          metrics_path: /metrics
          scrape_interval: 30s
          relabel_configs:
          - action: keep
            source_labels: [__meta_kubernetes_namespace,__meta_kubernetes_pod_name]
            separator: '/'
            regex: 'kube-system/coredns.+'
          - source_labels: [__meta_kubernetes_pod_container_port_name]
            action: keep
            regex: metrics
          - source_labels: [__meta_kubernetes_pod_name]
            target_label: pod
          kubernetes_sd_configs:
          - role: pod

        - job_name: kubelet
          scheme: https
          metrics_path: /metrics
          scrape_interval: 30s
          tls_config:
            ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
            insecure_skip_verify: true
          bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
          relabel_configs:
          - source_labels: [__metrics_path__]
            regex: (.*)
            target_label: metrics_path
          kubernetes_sd_configs:
          - role: node

service:
  pipelines:
    metrics:
      receivers: [prometheus]
      exporters: [otlp]
      processors: [batch, resource]

bogdandrutu commented 3 years ago

Marking this as stale. Sorry @vishiy, if you still have this issue please reopen.

tim-mwangi commented 3 years ago

Hello,

I'm seeing this issue with the opentelemetry collector v0.33.0 as well. It does take a while to crash, though. When I was using v0.29.0 I saw it after 2 weeks of the collector pod running (I've deployed a collector instance in k8s). Then I upgraded to v0.33.0 and did not see it for several deployments that ran for several hours, but this one time, after running for 6 hours, it crashed, restarted, and then continued running. So the crash is unpredictable.

My config when I deployed v0.33.0 did have 2 exporters defined for metrics. I've now switched to just one exporter and will see if it crashes.

Config:

# opentelemetry collector v0.33.0
extensions:
  health_check:
  pprof:
    endpoint: 0.0.0.0:1777
  zpages:
    endpoint: 0.0.0.0:55679

receivers:
  otlp:
    protocols:
      grpc:
        max_recv_msg_size_mib: 4
      http:
  opencensus:

  # Collect own metrics
  prometheus:
    config:
      scrape_configs:
        - job_name: "my-collector"
          scrape_interval: 10s
          static_configs:
            - targets: ["0.0.0.0:8888"]
  jaeger:
    protocols:
      grpc:
      thrift_binary:
      thrift_compact:
      thrift_http:
  zipkin:

processors:
  batch:
    timeout: 200ms
    send_batch_size: 8192
    send_batch_max_size: 10000
  # Some other custom processors

exporters:
  otlp:
    endpoint: "myendpoint:443"
    compression: gzip
  prometheus:
    endpoint: "0.0.0.0:8889"
    namespace: traceableai

service:
  pipelines:
    traces:
      receivers: [otlp, opencensus, zipkin, jaeger, ]
      processors: [batch] # Some other custom trace processors in this list too
      exporters: [otlp]
    metrics:
      receivers: [otlp, opencensus, prometheus]
      processors: [batch]
      exporters: [otlp, prometheus] # 2 exporters. Now testing with 1 - otlp - to see if it crashes

  extensions: [health_check, pprof, zpages]

Crash log:

panic: runtime error: slice bounds out of range [-5942:]

goroutine 278 [running]:
go.opentelemetry.io/collector/model/internal/data/protogen/metrics/v1.(*Metric).MarshalToSizedBuffer(0xc0065cef40, 0xc00e8c6000, 0x454f0, 0x46d1a, 0x46d1a, 0x25, 0x0)
  /src/vendor/go.opentelemetry.io/collector/model/internal/data/protogen/metrics/v1/metrics.pb.go:2171 +0x505
go.opentelemetry.io/collector/model/internal/data/protogen/metrics/v1.(*InstrumentationLibraryMetrics).MarshalToSizedBuffer(0xc00dbd0be0, 0xc00e8c6000, 0x454f0, 0x46d1a, 0x9b, 0x0, 0x0)
  /src/vendor/go.opentelemetry.io/collector/model/internal/data/protogen/metrics/v1/metrics.pb.go:2123 +0x1ef
go.opentelemetry.io/collector/model/internal/data/protogen/metrics/v1.(*ResourceMetrics).MarshalToSizedBuffer(0xc000b460f0, 0xc00e8c6000, 0x454f0, 0x46d1a, 0x1827, 0x0, 0x0)
  /src/vendor/go.opentelemetry.io/collector/model/internal/data/protogen/metrics/v1/metrics.pb.go:2069 +0x1ef
go.opentelemetry.io/collector/model/internal/data/protogen/collector/metrics/v1.(*ExportMetricsServiceRequest).MarshalToSizedBuffer(0xc00ca38720, 0xc00e8c6000, 0x46d1a, 0x46d1a, 0x6d3125, 0xdfe3e0, 0x7fcac4a62ae0)
  /src/vendor/go.opentelemetry.io/collector/model/internal/data/protogen/collector/metrics/v1/metrics_service.pb.go:257 +0x10f
go.opentelemetry.io/collector/model/internal/data/protogen/collector/metrics/v1.(*ExportMetricsServiceRequest).Marshal(0xc00ca38720, 0x7fcac4a62ae0, 0xc00ca38720, 0x7fcadc13c520, 0xc00ca38720, 0x1)
  /src/vendor/go.opentelemetry.io/collector/model/internal/data/protogen/collector/metrics/v1/metrics_service.pb.go:237 +0x7c
google.golang.org/protobuf/internal/impl.legacyMarshal(0xfce938, 0xc00e691950, 0x0, 0x0, 0x0, 0x0, 0x7fcac4a62ae0, 0x7fcadc13c4e0, 0x7fcac4a62ae0, 0xc00ca38720, ...)
  /src/vendor/google.golang.org/protobuf/internal/impl/legacy_message.go:404 +0xb3
google.golang.org/protobuf/proto.MarshalOptions.marshal(0xc00e000001, 0x0, 0x0, 0x0, 0xfce938, 0xc00e691950, 0xc00e691950, 0xfce938, 0xc00e691950, 0x7fcac4a62ae0, ...)
  /src/vendor/google.golang.org/protobuf/proto/encode.go:163 +0x25e
google.golang.org/protobuf/proto.MarshalOptions.MarshalAppend(0x7fcac4000001, 0x0, 0x0, 0x0, 0xfba200, 0xc00e691950, 0x7fcac4a62ae0, 0x7fcadc13c4b0, 0xc00a4f2c78, 0xc8eac5, ...)
  /src/vendor/google.golang.org/protobuf/proto/encode.go:122 +0x98
github.com/golang/protobuf/proto.marshalAppend(0x0, 0x0, 0x0, 0x7fcadc13c4b0, 0xc00ca38720, 0xc00a4f2e00, 0xc00e6918f0, 0xc00a4f2e00, 0x0, 0x0, ...)
  /src/vendor/github.com/golang/protobuf/proto/wire.go:40 +0xdc
github.com/golang/protobuf/proto.Marshal(...)
  /src/vendor/github.com/golang/protobuf/proto/wire.go:23
google.golang.org/grpc/encoding/proto.codec.Marshal(0x7fcac4a62ae0, 0xc00ca38720, 0x0, 0xfc56f0, 0x7fcadd93e160, 0x0, 0x0)
  /src/vendor/google.golang.org/grpc/encoding/proto/proto.go:45 +0x6d
google.golang.org/grpc.encode(0x7fcadd93e160, 0x1420b38, 0x7fcac4a62ae0, 0xc00ca38720, 0x7fca6d955ba8, 0x8, 0x18, 0x7fcb0675b108, 0x18)
  /src/vendor/google.golang.org/grpc/rpc_util.go:594 +0x52
google.golang.org/grpc.prepareMsg(0x7fcac4a62ae0, 0xc00ca38720, 0x7fcadd93e160, 0x1420b38, 0x0, 0x0, 0xfc56f0, 0xc000100730, 0x3f, 0x0, ...)
  /src/vendor/google.golang.org/grpc/stream.go:1603 +0x10f
google.golang.org/grpc.(*clientStream).SendMsg(0xc00dadc5a0, 0x7fcac4a62ae0, 0xc00ca38720, 0x0, 0x0)
  /src/vendor/google.golang.org/grpc/stream.go:784 +0x165
google.golang.org/grpc.invoke(0xfc6b20, 0xc00e6a1500, 0x7fcac3c84223, 0x3f, 0x7fcac4a62ae0, 0xc00ca38720, 0x7fcac4a38700, 0x1420b38, 0xc000c4ee00, 0xc00cb37d00, ...)
  /src/vendor/google.golang.org/grpc/call.go:70 +0xe7
github.com/MyOrg/myproduct/pkg/auth.(*Authenticator).Unary.func1(0xfc6ae8, 0xc00dad9500, 0x7fcac3c84223, 0x3f, 0x7fcac4a62ae0, 0xc00ca38720, 0x7fcac4a38700, 0x1420b38, 0xc000c4ee00, 0xf113f0, ...)
  /src/pkg/auth/authenticator.go:64 +0xf1
google.golang.org/grpc.(*ClientConn).Invoke(0xc000c4ee00, 0xfc6ae8, 0xc00dad9500, 0x7fcac3c84223, 0x3f, 0x7fcac4a62ae0, 0xc00ca38720, 0x7fcac4a38700, 0x1420b38, 0xc000b7df30, ...)
  /src/vendor/google.golang.org/grpc/call.go:35 +0x10b
go.opentelemetry.io/collector/model/internal/data/protogen/collector/metrics/v1.(*metricsServiceClient).Export(0xc000fc9250, 0xfc6ae8, 0xc00dad9500, 0xc00ca38720, 0xc000b7df30, 0x1, 0x1, 0xc00a4f3268, 0xc002790000, 0xc000c6e900)
  /src/vendor/go.opentelemetry.io/collector/model/internal/data/protogen/collector/metrics/v1/metrics_service.pb.go:177 +0xbe
go.opentelemetry.io/collector/model/otlpgrpc.(*metricsClient).Export(0xc000b7df10, 0xfc6ae8, 0xc00dad9500, 0xc00ca38720, 0xc000b7df30, 0x1, 0x1, 0xc00e6918e0, 0xc000032d70, 0x12a05e590)
  /src/vendor/go.opentelemetry.io/collector/model/otlpgrpc/metrics.go:60 +0x8d
go.opentelemetry.io/collector/exporter/otlpexporter.(*grpcSender).exportMetrics(0xc000c373e0, 0xfc6ae8, 0xc00dad9500, 0xc00ca38720, 0xc00dad9500, 0xfc6b20)
  /src/vendor/go.opentelemetry.io/collector/exporter/otlpexporter/otlp.go:131 +0x95
go.opentelemetry.io/collector/exporter/otlpexporter.(*exporter).pushMetrics(0xc000b7c050, 0xfc6ae8, 0xc00dad9500, 0xc00ca38720, 0x7fcac15aa72d, 0xfc6b20)
  /src/vendor/go.opentelemetry.io/collector/exporter/otlpexporter/otlp.go:74 +0x53
go.opentelemetry.io/collector/exporter/exporterhelper.(*metricsRequest).export(0xc00cb37c80, 0xfc6ae8, 0xc00dad9500, 0xfc6ae8, 0xc00dad9500)
  /src/vendor/go.opentelemetry.io/collector/exporter/exporterhelper/metrics.go:52 +0x49
go.opentelemetry.io/collector/exporter/exporterhelper.(*timeoutSender).send(0xc000b58640, 0x7fcac4ceccf8, 0xc00cb37c80, 0x0, 0x0)
  /src/vendor/go.opentelemetry.io/collector/exporter/exporterhelper/common.go:228 +0x8a
go.opentelemetry.io/collector/exporter/exporterhelper.(*retrySender).send(0xc000b54c80, 0x7fcac4ceccf8, 0xc00cb37c80, 0xfc6b20, 0xc00e6a14a0)
  /src/vendor/go.opentelemetry.io/collector/exporter/exporterhelper/queued_retry.go:281 +0x486
go.opentelemetry.io/collector/exporter/exporterhelper.(*metricsSenderWithObservability).send(0xc000018738, 0x7fcac4ceccf8, 0xc00cb37c80, 0x7fcac4ceccf8, 0xc00cb37c80)
  /src/vendor/go.opentelemetry.io/collector/exporter/exporterhelper/metrics.go:117 +0x9e
go.opentelemetry.io/collector/exporter/exporterhelper.(*queuedRetrySender).start.func1(0x7fcac47a2140, 0xc00cb37c80)
  /src/vendor/go.opentelemetry.io/collector/exporter/exporterhelper/queued_retry.go:155 +0x77
github.com/jaegertracing/jaeger/pkg/queue.ConsumerFunc.Consume(0xc000b7df40, 0x7fcac47a2140, 0xc00cb37c80)
  /src/vendor/github.com/jaegertracing/jaeger/pkg/queue/bounded_queue.go:104 +0x3c
github.com/jaegertracing/jaeger/pkg/queue.(*BoundedQueue).StartConsumersWithFactory.func1(0xc000c34dc0, 0xc0004cf630)
  /src/vendor/github.com/jaegertracing/jaeger/pkg/queue/bounded_queue.go:83 +0xeb
created by github.com/jaegertracing/jaeger/pkg/queue.(*BoundedQueue).StartConsumersWithFactory
  /src/vendor/github.com/jaegertracing/jaeger/pkg/queue/bounded_queue.go:73 +0xcb

I thought I should comment on this issue instead of creating a new one. I can open one if you want. Thanks!

CC @bogdandrutu @vishiy