DataDog / datadog-agent

Main repository for Datadog Agent
https://docs.datadoghq.com/
Apache License 2.0
2.9k stars 1.21k forks source link

Datadog Agents are no longer pulling kubernetes.* metrics after upgrade from 7.33.1 to 7.34.0 #11475

Open gberenice opened 2 years ago

gberenice commented 2 years ago

Output of the info page (if this is a bug): running agent status -c /etc/datadog-agent/datadog.yaml hangs after printing the following:

Getting the status from the agent.
2022-03-28 14:06:58 UTC | CORE | WARN | (pkg/util/log/log.go:592 in func1) | Deactivating Autoconfig will disable most components. It's recommended to use autoconfig_exclude_features and autoconfig_include_features to activate/deactivate features selectively

agent config -c /etc/datadog-agent/datadog.yaml

ac_exclude: []
ac_include: []
ac_load_timeout: 30000
ad_config_poll_interval: 10
additional_checksd: /etc/datadog-agent/checks.d
additional_endpoints: {}
admission_controller:
  certificate:
    expiration_threshold: 720
    secret_name: webhook-certificate
    validity_bound: 8760
  enabled: false
  inject_config:
    enabled: true
    endpoint: /injectconfig
  inject_tags:
    enabled: true
    endpoint: /injecttags
  mutate_unlabelled: false
  namespace_selector_fallback: false
  pod_owners_cache_validity: 10
  port: 8000
  service_name: datadog-admission-controller
  timeout_seconds: 10
  webhook_name: datadog-webhook
aggregator_buffer_size: 100
aggregator_flush_metrics_and_serialize_in_parallel: true
aggregator_flush_metrics_and_serialize_in_parallel_buffer_size: 4000
aggregator_flush_metrics_and_serialize_in_parallel_chan_size: 200
aggregator_stop_timeout: 2
aggregator_use_tags_store: false
allow_arbitrary_tags: false
allow_python_path_heuristics_failure: false
api_key: ********************************
apm_config:
  apm_non_local_traffic: true
  enabled: true
  max_cpu_percent: 0
  max_memory: 0
  receiver_port: 8126
  remote_tagger: true
  telemetry:
    enabled: true
  windows_pipe_buffer_size: 1000000
  windows_pipe_security_descriptor: D:AI(A;;GA;;;WD)
app_key: ****************************************
appsec_config:
  appsec_dd_url: ""
  enabled: true
  max_payload_size: 5242880
auth_token_file_path: ""
auto_exit:
  noprocess:
    enabled: false
    excluded_processes: []
  validation_period: 60
autoconf_template_dir: /datadog/check_configs
autoconf_template_url_timeout: 5
autoconfig_exclude_features: []
autoconfig_from_environment: true
autoconfig_include_features: []
azure_hostname_style: os
basic_telemetry_add_container_tags: false
bosh_id: ""
c_core_dump: false
c_stacktrace_collection: false
cf_os_hostname_aliasing: false
check_runners: 4
check_sampler_bucket_commits_count_expiry: 2
check_sampler_expire_metrics: true
check_sampler_stateful_metric_expiration_time: 25h0m0s
checks_tag_cardinality: low
clc_runner_enabled: false
clc_runner_host: ""
clc_runner_id: ""
clc_runner_port: 5005
clc_runner_server_readheader_timeout: 10
clc_runner_server_write_timeout: 15
cloud_foundry: false
cloud_foundry_bbs:
  ca_file: ""
  cert_file: ""
  env_exclude: []
  env_include: []
  key_file: ""
  poll_interval: 15
  url: https://bbs.service.cf.internal:8889
cloud_foundry_cc:
  apps_batch_size: 5000
  client_id: ""
  client_secret: ""
  poll_interval: 60
  skip_ssl_validation: false
  url: https://cloud-controller-ng.service.cf.internal:9024
cloud_foundry_garden:
  listen_address: /var/vcap/data/garden/garden.sock
  listen_network: unix
cloud_provider_metadata:
- aws
- gcp
- azure
- alibaba
- oracle
cluster_agent:
  auth_token: ********
  cmd_port: 5005
  enabled: false
  kubernetes_service_name: datadog-cluster-agent
  server:
    idle_timeout_seconds: 60
    read_timeout_seconds: 2
    write_timeout_seconds: 2
  tagging_fallback: false
  url: ""
cluster_checks:
  advanced_dispatching_enabled: false
  clc_runners_port: 5005
  cluster_tag_name: cluster_name
  enabled: false
  extra_tags: []
  node_expiration_timeout: 30
  warmup_duration: 30
cluster_name: ""
cmd.check.fullsketches: false
cmd_host: localhost
cmd_port: 5001
collect_ec2_tags: false
collect_gce_tags: true
collect_kubernetes_events: true
compliance_config:
  check_interval: 20m0s
  check_max_events_per_run: 100
  dir: /etc/datadog-agent/compliance.d
  enabled: false
  endpoints:
    batch_max_concurrent_send: 0
    batch_max_content_size: 1000000
    batch_max_size: 1000
    batch_wait: 5
    compression_level: 6
    connection_reset_interval: 0
    logs_no_ssl: false
    sender_backoff_base: 1
    sender_backoff_factor: 2
    sender_backoff_max: 120
    sender_recovery_interval: 2
    sender_recovery_reset: false
    use_compression: true
    use_v2_api: true
  run_path: /opt/datadog-agent/run
conf_path: .
confd_path: /etc/datadog-agent/conf.d
container_cgroup_prefix: ""
container_cgroup_root: /sys/fs/cgroup/
container_env_as_tags: {}
container_exclude:
- name:datadog-agent
container_exclude_logs: []
container_exclude_metrics: []
container_exclude_stopped_age: 22
container_include: []
container_include_logs: []
container_include_metrics: []
container_labels_as_tags: {}
container_lifecycle:
  enabled: false
container_proc_root: /proc
containerd_namespace: ""
cri_connection_timeout: 1
cri_query_timeout: 5
cri_socket_path: ""
database_monitoring:
  activity:
    batch_max_concurrent_send: 0
    batch_max_content_size: 1000000
    batch_max_size: 1000
    batch_wait: 5
    compression_level: 6
    connection_reset_interval: 0
    logs_no_ssl: false
    sender_backoff_base: 1
    sender_backoff_factor: 2
    sender_backoff_max: 120
    sender_recovery_interval: 2
    sender_recovery_reset: false
    use_compression: true
    use_v2_api: true
  metrics:
    batch_max_concurrent_send: 0
    batch_max_content_size: 1000000
    batch_max_size: 1000
    batch_wait: 5
    compression_level: 6
    connection_reset_interval: 0
    logs_no_ssl: false
    sender_backoff_base: 1
    sender_backoff_factor: 2
    sender_backoff_max: 120
    sender_recovery_interval: 2
    sender_recovery_reset: false
    use_compression: true
    use_v2_api: true
  samples:
    batch_max_concurrent_send: 0
    batch_max_content_size: 1000000
    batch_max_size: 1000
    batch_wait: 5
    compression_level: 6
    connection_reset_interval: 0
    logs_no_ssl: false
    sender_backoff_base: 1
    sender_backoff_factor: 2
    sender_backoff_max: 120
    sender_recovery_interval: 2
    sender_recovery_reset: false
    use_compression: true
    use_v2_api: true
default_integration_http_timeout: 9
disable_cluster_name_tag_key: false
disable_file_logging: false
disable_py3_validation: false
disable_unsafe_yaml: true
docker_env_as_tags: {}
docker_labels_as_tags: {}
docker_query_timeout: 5
dogstatsd_buffer_size: 8192
dogstatsd_capture_depth: 0
dogstatsd_capture_path: ""
dogstatsd_context_expiry_seconds: 300
dogstatsd_disable_verbose_logs: false
dogstatsd_entity_id_precedence: false
dogstatsd_eol_required: []
dogstatsd_expiry_seconds: 300
dogstatsd_mapper_cache_size: 1000
dogstatsd_metrics_stats_enable: false
dogstatsd_non_local_traffic: false
dogstatsd_origin_detection: false
dogstatsd_packet_buffer_flush_timeout: 100ms
dogstatsd_packet_buffer_size: 32
dogstatsd_pipe_name: ""
dogstatsd_port: 8125
dogstatsd_queue_size: 1024
dogstatsd_so_rcvbuf: 0
dogstatsd_socket: ""
dogstatsd_stats_buffer: 10
dogstatsd_stats_enable: false
dogstatsd_stats_port: 5000
dogstatsd_string_interner_size: 4096
dogstatsd_tag_cardinality: low
dogstatsd_tags: []
ec2_metadata_timeout: 300
ec2_metadata_token_lifetime: 21600
ec2_prefer_imdsv2: false
ec2_use_windows_prefix_detection: false
ecs_agent_container_name: ecs-agent
ecs_agent_url: ""
ecs_collect_resource_tags_ec2: false
ecs_metadata_timeout: 500
ecs_resource_tags_replace_colon: false
eks_fargate: true
enable_events_stream_payload_serialization: true
enable_gohai: true
enable_json_stream_shared_compressor_buffers: true
enable_metadata_collection: true
enable_payloads:
  events: true
  json_to_v1_intake: true
  series: true
  service_checks: true
  sketches: true
enable_service_checks_stream_payload_serialization: true
enable_sketch_stream_payload_serialization: true
enable_stream_payload_serialization: true
enhanced_metrics: true
env: dev-internal
exclude_gce_tags:
- kube-env
- kubelet-config
- containerd-configure-sh
- startup-script
- shutdown-script
- configure-sh
- sshKeys
- ssh-keys
- user-data
- cli-cert
- ipsec-cert
- ssl-cert
- google-container-manifest
- bosh_settings
- windows-startup-script-ps1
- common-psm1
- k8s-node-setup-psm1
- serial-port-logging-enable
- enable-oslogin
- disable-address-manager
- disable-legacy-endpoints
- windows-keys
- kubeconfig
exclude_pause_container: true
experimental:
  otlp:
    internal_traces_port: 5003
    metrics:
      tag_cardinality: low
    metrics_enabled: true
    traces_enabled: true
expvar_port: "5000"
external_metrics:
  aggregator: avg
external_metrics_provider:
  api_key: ""
  app_key: ""
  batch_window: 10
  bucket_size: 300
  config: {}
  enabled: false
  endpoint: ""
  local_copy_refresh_rate: 30
  max_age: 120
  port: 443
  refresh_period: 30
  rollup: 30
  use_datadogmetric_crd: false
  wpa_controller: false
extra_config_providers: []
extra_listeners: []
extra_tags: []
flare_stripped_keys: []
force_tls_12: false
forwarder_apikey_validation_interval: 60
forwarder_backoff_base: 2
forwarder_backoff_factor: 2
forwarder_backoff_max: 64
forwarder_connection_reset_interval: 0
forwarder_flush_to_disk_mem_ratio: 0.5
forwarder_high_prio_buffer_size: 100
forwarder_low_prio_buffer_size: 100
forwarder_num_workers: 1
forwarder_outdated_file_in_days: 10
forwarder_recovery_interval: 2
forwarder_recovery_reset: false
forwarder_requeue_buffer_size: 100
forwarder_stop_timeout: 2
forwarder_storage_max_disk_ratio: 0.8
forwarder_storage_max_size_in_bytes: 0
forwarder_storage_path: ""
forwarder_timeout: 20
gce_metadata_timeout: 1000
gce_send_project_id_tag: false
go_core_dump: false
gui_port: -1
health_port: 0
heroku_dyno: false
histogram_aggregates:
- max
- median
- avg
- count
histogram_copy_to_distribution: false
histogram_copy_to_distribution_prefix: ""
histogram_percentiles:
- "0.95"
host_aliases: []
hostname: ""
hostname_file: ""
hostname_force_config_as_canonical: false
hostname_fqdn: false
hpa_configmap_name: datadog-custom-metrics
hpa_watcher_gc_period: 300
hpa_watcher_polling_freq: 10
ignore_autoconf: []
ignore_host_etc: false
internal_profiling:
  block_profile_rate: 0
  cpu_duration: 1m0s
  enable_goroutine_stacktraces: false
  enabled: false
  mutex_profile_fraction: 0
  period: 5m0s
inventories_enabled: true
inventories_max_interval: 600
inventories_min_interval: 300
iot_host: false
ipc_address: localhost
jmx_check_period: 15000
jmx_collection_timeout: 60
jmx_custom_jars: []
jmx_log_file: ""
jmx_max_restarts: 3
jmx_reconnection_thread_pool_size: 3
jmx_reconnection_timeout: 60
jmx_restart_interval: 5
jmx_thread_pool_size: 3
jmx_use_cgroup_memory_limit: false
jmx_use_container_support: true
kube_cache_sync_timeout_seconds: 5
kube_resources_namespace: ""
kubelet_auth_token_path: ""
kubelet_cache_pods_duration: 5
kubelet_client_ca: ""
kubelet_client_crt: ""
kubelet_client_key: ""
kubelet_listener_polling_interval: 5
kubelet_tls_verify: false
kubelet_wait_on_missing_container: 0
kubernetes_apiserver_ca_path: ""
kubernetes_apiserver_client_timeout: 10
kubernetes_apiserver_tls_verify: true
kubernetes_apiserver_use_protobuf: false
kubernetes_collect_metadata_tags: true
kubernetes_event_collection_timeout: 100
kubernetes_http_kubelet_port: 10255
kubernetes_https_kubelet_port: 10250
kubernetes_informers_resync_period: 300
kubernetes_kubeconfig_path: ""
kubernetes_kubelet_host: 10.111.21.241
kubernetes_kubelet_nodename: fargate-ip-10-111-21-241.ec2.internal
kubernetes_map_services_on_ip: false
kubernetes_metadata_tag_update_freq: 60
kubernetes_namespace_labels_as_tags: {}
kubernetes_node_annotations_as_host_aliases:
- cluster.k8s.io/machine
kubernetes_node_labels_as_tags: {}
kubernetes_pod_annotations_as_tags: {}
kubernetes_pod_expiration_duration: 900
kubernetes_pod_labels_as_tags: '{"app.kubernetes.io/name": "kube_app_name","app.kubernetes.io/version":
  "kube_app_version"}'
leader_election: true
leader_lease_duration: "60"
log_all_goroutines_when_unhealthy: false
log_enabled: false
log_file: ""
log_file_max_rolls: 1
log_file_max_size: 10Mb
log_format_json: false
log_format_rfc3339: false
log_level: WARN
log_payloads: false
log_to_console: true
log_to_syslog: false
logging_frequency: 500
logs_config:
  aggregation_timeout: 1000
  auditor_ttl: 23
  auto_multi_line_default_match_threshold: 0.48
  auto_multi_line_default_match_timeout: 30
  auto_multi_line_default_sample_size: 500
  auto_multi_line_detection: false
  auto_multi_line_extra_patterns: []
  batch_max_concurrent_send: 0
  batch_max_content_size: 1000000
  batch_max_size: 1000
  batch_wait: 5
  close_timeout: 60
  compression_level: 6
  connection_reset_interval: 0
  container_collect_all: false
  dd_port: 10516
  dd_url_443: agent-443-intake.logs.datadoghq.com
  dev_mode_use_proto: true
  docker_client_read_timeout: 30
  docker_container_force_use_file: false
  docker_container_use_file: true
  expected_tags_duration: 0s
  file_scan_period: 10
  frame_size: 9000
  k8s_container_use_file: false
  logs_no_ssl: false
  open_files_limit: 100
  run_path: /opt/datadog-agent/run
  sender_backoff_base: 1
  sender_backoff_factor: 2
  sender_backoff_max: 120
  sender_recovery_interval: 2
  sender_recovery_reset: false
  socks5_proxy_address: ""
  stop_grace_period: 30
  tagger_warmup_duration: 0
  use_compression: true
  use_http: false
  use_podman_logs: false
  use_port_443: false
  use_tcp: false
  use_v2_api: true
  validate_pod_container_id: false
logs_enabled: false
memtrack_enabled: true
metadata_endpoints_max_hostname_size: 255
metrics_port: "5000"
network_config:
  conntrack_init_timeout: 10s
  dns_recorded_query_types: []
  enable_dns_by_querytype: false
  enable_gateway_lookup: false
  ignore_conntrack_init_failure: false
network_devices:
  metadata:
    batch_max_concurrent_send: 0
    batch_max_content_size: 1000000
    batch_max_size: 1000
    batch_wait: 5
    compression_level: 6
    connection_reset_interval: 0
    logs_no_ssl: false
    sender_backoff_base: 1
    sender_backoff_factor: 2
    sender_backoff_max: 120
    sender_recovery_interval: 2
    sender_recovery_reset: false
    use_compression: true
    use_v2_api: true
  namespace: default
no_proxy_nonexact_match: false
orchestrator_explorer:
  container_scrubbing:
    enabled: true
  custom_sensitive_words: []
  enabled: false
  extra_tags: []
proc_root: /proc
process_config:
  dd_agent_bin: /opt/datadog-agent/bin/agent/agent
  disable_realtime_checks: false
  enabled: "true"
  grpc_connection_timeout_secs: 60
  log_file: /var/log/datadog/process-agent.log
  process_discovery:
    enabled: true
    interval: 4h0m0s
  remote_tagger: false
procfs_path: /proc
prometheus_scrape:
  enabled: false
  service_endpoints: false
  version: 1
python_version: "3"
python3_linter_timeout: 120
remote_configuration:
  clients:
    ttl_seconds: 30s
  config_root: ""
  director_root: ""
  enabled: false
  endpoint: ""
  key: ""
  refresh_interval: 1m0s
remote_tagger_timeout_seconds: 30
run_path: /opt/datadog-agent/run
runtime_security_config:
  agent_monitoring_events: true
  cookie_cache_size: 100
  custom_sensitive_words: []
  dentry_cache_size: 1024
  enable_approvers: true
  enable_kernel_filters: true
  enable_remote_configuration: false
  enable_runtime_compiled_constants: false
  enabled: false
  endpoints:
    batch_max_concurrent_send: 0
    batch_max_content_size: 1000000
    batch_max_size: 1000
    batch_wait: 5
    compression_level: 6
    connection_reset_interval: 0
    logs_no_ssl: false
    sender_backoff_base: 1
    sender_backoff_factor: 2
    sender_backoff_max: 120
    sender_recovery_interval: 2
    sender_recovery_reset: false
    use_compression: true
    use_v2_api: true
  erpc_dentry_resolution_enabled: true
  event_server:
    burst: 40
    rate: 10
    retention: 6
  events_stats:
    polling_interval: 20
    tags_cardinality: high
  flush_discarder_window: 3
  load_controller:
    control_period: 2
    discarder_timeout: 60
    events_count_threshold: 20000
  log_patterns: []
  map_dentry_resolution_enabled: true
  pid_cache_size: 10000
  policies:
    dir: /etc/datadog-agent/runtime-security.d
  remote_tagger: true
  run_path: /opt/datadog-agent/run
  self_test:
    enabled: true
  socket: /opt/datadog-agent/run/runtime-security.sock
  syscall_monitor:
    enabled: false
secret_backend_arguments: []
secret_backend_command: ""
secret_backend_command_allow_group_exec_perm: false
secret_backend_output_max_size: 1048576
secret_backend_skip_checks: false
secret_backend_timeout: 30
security_agent:
  cmd_port: 5010
  expvar_port: 5011
  log_file: /var/log/datadog/security-agent.log
  remote_tagger: true
serializer_max_payload_size: 2621440
serializer_max_uncompressed_payload_size: 4194304
server_timeout: 30
serverless:
  logs_enabled: true
service_monitoring_config:
  enabled: false
skip_ssl_validation: false
snmp_traps_config:
  bind_host: localhost
  community_strings: ********
  port: 162
  stop_timeout: 5
snmp_traps_enabled: false
sslkeylogfile: ""
statsd_forward_host: ""
statsd_forward_port: 0
statsd_metric_blocklist: []
statsd_metric_namespace: ""
statsd_metric_namespace_blacklist:
- datadog.agent
- datadog.dogstatsd
- datadog.process
- datadog.trace_agent
- datadog.tracer
- activemq
- activemq_58
- airflow
- cassandra
- confluent
- hazelcast
- hive
- ignite
- jboss
- jvm
- kafka
- presto
- sidekiq
- solr
- tomcat
- runtime
syslog_key: ""
syslog_pem: ""
syslog_rfc: false
syslog_tls_verify: true
syslog_uri: ""
system_probe_config:
  apt_config_dir: /etc/apt
  bpf_debug: false
  bpf_dir: /opt/datadog-agent/embedded/share/system-probe/ebpf
  closed_channel_size: 500
  collect_dns_domains: true
  collect_dns_stats: true
  collect_local_dns: false
  conntrack_max_state_size: 131072
  conntrack_rate_limit: 500
  debug_port: 0
  dest_excludes: {}
  disable_dns_inspection: false
  disable_ipv6: false
  disable_tcp: false
  disable_udp: false
  dns_timeout_in_s: 15
  dogstatsd_host: 127.0.0.1
  dogstatsd_port: 8125
  enable_conntrack: true
  enable_conntrack_all_namespaces: true
  enable_kernel_header_download: false
  enable_oom_kill: false
  enable_runtime_compiler: false
  enable_tcp_queue_length: false
  enable_tracepoints: false
  enabled: false
  excluded_linux_versions: []
  external: false
  internal_profiling:
    api_key: ********************************
    block_profile_rate: 0
    cpu_duration: 1m0s
    enable_goroutine_stacktraces: false
    enabled: false
    env: dev-internal
    mutex_profile_fraction: 0
    period: 5m0s
    profile_dd_url: ""
    site: datadoghq.com
  kernel_header_dirs: []
  kernel_header_download_dir: /var/tmp/datadog-agent/system-probe/kernel-headers
  log_file: /var/log/datadog/system-probe.log
  log_level: WARN
  max_closed_connections_buffered: 50000
  max_connection_state_buffered: 75000
  max_conns_per_message: 600
  max_dns_stats: 20000
  max_tracked_connections: 65536
  offset_guess_threshold: 400
  process_config:
    enabled: false
  runtime_compiler_output_dir: /var/tmp/datadog-agent/system-probe/build
  source_excludes: {}
  sysprobe_socket: /opt/datadog-agent/run/sysprobe.sock
  windows:
    driver_buffer_size: 1024
    enable_monotonic_count: false
  yum_repos_dir: /etc/yum.repos.d
  zypper_repos_dir: /etc/zypp/repos.d
tag_value_split_separator: {}
tags:
- cluster_name:oc-dev-internal-eks-cluster
- environment:dev
- customer:internal
telemetry:
  dogstatsd:
    aggregator_channel_latency_buckets: []
    listeners_channel_latency_buckets: []
    listeners_latency_buckets: []
  enabled: false
tracemalloc_blacklist: ""
tracemalloc_debug: false
tracemalloc_exclude: ""
tracemalloc_include: ""
tracemalloc_whitelist: ""
use_dogstatsd: true
use_proxy_for_cloud_metadata: false
use_v2_api:
  series: false
vector:
  metrics:
    enabled: false
    url: ""
windows_use_pythonpath: false

agent diagnose -c /etc/datadog-agent/datadog.yaml

...
=== Running Cluster Agent availability diagnosis ===
===> FAIL

=== Running Containerd availability diagnosis ===
[ERROR] errorf: Containerd init error: temporary failure in containerdutil, will retry later: failed to dial "/var/run/containerd/containerd.sock": context deadline exceeded - 1648476488488103709
===> FAIL

=== Running Docker availability diagnosis ===
===> FAIL

=== Running EC2 Metadata availability diagnosis ===
2022-03-28 14:08:08 UTC | CORE | ERROR | (pkg/util/containerd/containerd_util.go:96 in NewContainerdUtil) | Containerd init error: temporary failure in containerdutil, will retry later: failed to dial "/var/run/containerd/containerd.sock": context deadline exceeded
===> FAIL
...
=== Running Kubelet availability diagnosis ===
===> PASS

=== Running Kubernetes API Server availability diagnosis ===
===> FAIL
...

Describe what happened: Datadog Agents are running as sidecar containers in an EKS Fargate cluster. After we upgraded from 7.33.1 to 7.34.0, kubernetes.* metrics (such as kubernetes.memory.limits) are no longer visible in Datadog. The Agent configuration was not changed; only the version was updated.

Note that kubernetes_state.* metrics are still available.

Describe what you expected: Kubelet metrics kubernetes.* are available.

Steps to reproduce the issue: deploy the Datadog Agent as a sidecar container using the following Helm template:

- image: datadog/agent:7
  name: datadog-agent

  ports:
  - containerPort: 8125
    name: dogstatsdport
    protocol: UDP

  - containerPort: 8126
    name: traceport
    protocol: TCP

  env:
  - name: DD_API_KEY
    valueFrom:
      secretKeyRef:
        name: datadog-secrets
        key: api-key

  - name: DD_APP_KEY
    valueFrom:
      secretKeyRef:
        name: datadog-secrets
        key: app-key

  - name: DD_ENV
    value: {{ include "oc-lib.environmentCustomerPair" . }}

  - name: DD_TAGS
    value: cluster_name:{{ include "oc-lib.clusterName" . }} environment:{{ include "oc-lib.environment" . }} customer:{{ include "oc-lib.customer" . }}

  - name: DD_KUBERNETES_POD_LABELS_AS_TAGS
    value: '{"app.kubernetes.io/name": "kube_app_name","app.kubernetes.io/version": "kube_app_version"}'

  - name: DD_COLLECT_KUBERNETES_EVENTS
    value: "true"

  - name: DD_LEADER_ELECTION
    value: "true"

  - name: DD_PROCESS_AGENT_ENABLED
    value: "true"

  - name: DD_LOG_LEVEL
    value: "WARN"

  - name: DD_LOGS_ENABLED
    value: "false"

  - name: DD_LOGS_CONFIG_CONTAINER_COLLECT_ALL
    value: "false"

  - name: DD_CONTAINER_EXCLUDE
    value: "name:datadog-agent"

  - name: DD_APM_ENABLED
    value: "true"

  - name: DD_EKS_FARGATE
    value: "true"

  - name: DD_KUBELET_TLS_VERIFY
    value: "false"

  - name: DD_KUBERNETES_KUBELET_NODENAME
    valueFrom:
      fieldRef:
        apiVersion: v1
        fieldPath: spec.nodeName

  - name: DD_KUBERNETES_KUBELET_HOST
    valueFrom:
      fieldRef:
        fieldPath: status.hostIP

  resources:
    requests:
      memory: "256Mi"
      cpu: "200m"
    limits:
      memory: "256Mi"
      cpu: "200m"

Additional environment details (Operating System, Cloud provider, etc.): Kubernetes version: 1.21; EKS platform version: eks.5

mscanlon72 commented 2 years ago

Likewise here, but I think the issue for me occurred before the version change. However, I also just updated from 7.33.1 -> 7.34.0.