argoproj / argo-cd

Declarative Continuous Deployment for Kubernetes
https://argo-cd.readthedocs.io
Apache License 2.0

StatefulSet with volume claim causes diff with `ServerSideApply=true` #11143

Closed Cowboy-coder closed 4 months ago

Cowboy-coder commented 1 year ago

Checklist:

  • [x] I've searched in the docs and FAQ for my answer: https://bit.ly/argocd-faq.
  • [x] I've included steps to reproduce the bug.
  • [x] I've pasted the output of argocd version.

Describe the bug

As described in https://github.com/argoproj/argo-cd/issues/11074, I get an OutOfSync diff when installing Loki using ServerSideApply. I haven't double-checked, but I'm pretty sure this is not a Loki-specific issue.

Live manifest:

```
apiVersion: apps/v1
kind: StatefulSet
metadata:
  annotations:
    kubectl.kubernetes.io/last-applied-configuration: >
      {"apiVersion":"apps/v1","kind":"StatefulSet","metadata":{"annotations":{},"labels":{"app.kubernetes.io/component":"write","app.kubernetes.io/instance":"loki","app.kubernetes.io/managed-by":"Helm","app.kubernetes.io/name":"loki","app.kubernetes.io/part-of":"memberlist","app.kubernetes.io/version":"2.6.1","argocd.argoproj.io/instance":"logging","helm.sh/chart":"loki-3.2.0"},"name":"loki-write","namespace":"logging"},"spec":{"podManagementPolicy":"Parallel","replicas":3,"revisionHistoryLimit":10,"selector":{"matchLabels":{"app.kubernetes.io/component":"write","app.kubernetes.io/instance":"loki","app.kubernetes.io/name":"loki"}},"serviceName":"loki-write-headless","template":{"metadata":{"annotations":{"checksum/config":"dc4356fb9c8ae2285982e39f348eaa3087a7bd09084224adb6915903fdf04574"},"labels":{"app.kubernetes.io/component":"write","app.kubernetes.io/instance":"loki","app.kubernetes.io/name":"loki","app.kubernetes.io/part-of":"memberlist"}},"spec":{"affinity":{"podAntiAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":[{"labelSelector":{"matchLabels":{"app.kubernetes.io/component":"write","app.kubernetes.io/instance":"loki","app.kubernetes.io/name":"loki"}},"topologyKey":"kubernetes.io/hostname"}]}},"automountServiceAccountToken":true,"containers":[{"args":["-config.file=/etc/loki/config/config.yaml","-target=write"],"env":[{"name":"AWS_ACCESS_KEY_ID","valueFrom":{"secretKeyRef":{"key":"AWS_ACCESS_KEY_ID","name":"loki-s3"}}},{"name":"AWS_SECRET_ACCESS_KEY","valueFrom":{"secretKeyRef":{"key":"AWS_SECRET_ACCESS_KEY","name":"loki-s3"}}}],"image":"docker.io/grafana/loki:2.6.1","imagePullPolicy":"IfNotPresent","name":"write","ports":[{"containerPort":3100,"name":"http-metrics","protocol":"TCP"},{"containerPort":9095,"name":"grpc","protocol":"TCP"},{"containerPort":7946,"name":"http-memberlist","protocol":"TCP"}],"readinessProbe":{"httpGet":{"path":"/ready","port":"http-metrics"},"initialDelaySeconds":30,"timeoutSeconds":1},"resources":{},"securityContext":{"allowPrivilegeEscalation":false,"capabilities":{"drop":["ALL"]},"readOnlyRootFilesystem":true},"volumeMounts":[{"mountPath":"/etc/loki/config","name":"config"},{"mountPath":"/var/loki","name":"data"}]}],"securityContext":{"fsGroup":10001,"runAsGroup":10001,"runAsNonRoot":true,"runAsUser":10001},"serviceAccountName":"loki","terminationGracePeriodSeconds":300,"volumes":[{"configMap":{"name":"loki"},"name":"config"}]}},"updateStrategy":{"rollingUpdate":{"partition":0}},"volumeClaimTemplates":[{"metadata":{"name":"data"},"spec":{"accessModes":["ReadWriteOnce"],"resources":{"requests":{"storage":"10Gi"}},"storageClassName":"openebs-hostpath"}}]}}
  creationTimestamp: '2022-10-26T09:13:13Z'
  generation: 1
  labels:
    app.kubernetes.io/component: write
    app.kubernetes.io/instance: loki
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/name: loki
    app.kubernetes.io/part-of: memberlist
    app.kubernetes.io/version: 2.6.1
    argocd.argoproj.io/instance: logging
    helm.sh/chart: loki-3.2.0
  managedFields:
    - apiVersion: apps/v1
      fieldsType: FieldsV1
      fieldsV1:
        'f:metadata':
          'f:labels':
            'f:app.kubernetes.io/component': {}
            'f:app.kubernetes.io/instance': {}
            'f:app.kubernetes.io/managed-by': {}
            'f:app.kubernetes.io/name': {}
            'f:app.kubernetes.io/part-of': {}
            'f:app.kubernetes.io/version': {}
            'f:argocd.argoproj.io/instance': {}
            'f:helm.sh/chart': {}
        'f:spec':
          'f:podManagementPolicy': {}
          'f:replicas': {}
          'f:revisionHistoryLimit': {}
          'f:selector': {}
          'f:serviceName': {}
          'f:template':
            'f:metadata':
              'f:annotations':
                'f:checksum/config': {}
              'f:labels':
                'f:app.kubernetes.io/component': {}
                'f:app.kubernetes.io/instance': {}
                'f:app.kubernetes.io/name': {}
                'f:app.kubernetes.io/part-of': {}
            'f:spec':
              'f:affinity':
                'f:podAntiAffinity':
                  'f:requiredDuringSchedulingIgnoredDuringExecution': {}
              'f:automountServiceAccountToken': {}
              'f:containers':
                'k:{"name":"write"}':
                  .: {}
                  'f:args': {}
                  'f:env':
                    'k:{"name":"AWS_ACCESS_KEY_ID"}':
                      .: {}
                      'f:name': {}
                      'f:valueFrom':
                        'f:secretKeyRef': {}
                    'k:{"name":"AWS_SECRET_ACCESS_KEY"}':
                      .: {}
                      'f:name': {}
                      'f:valueFrom':
                        'f:secretKeyRef': {}
                  'f:image': {}
                  'f:imagePullPolicy': {}
                  'f:name': {}
                  'f:ports':
                    'k:{"containerPort":3100,"protocol":"TCP"}':
                      .: {}
                      'f:containerPort': {}
                      'f:name': {}
                      'f:protocol': {}
                    'k:{"containerPort":7946,"protocol":"TCP"}':
                      .: {}
                      'f:containerPort': {}
                      'f:name': {}
                      'f:protocol': {}
                    'k:{"containerPort":9095,"protocol":"TCP"}':
                      .: {}
                      'f:containerPort': {}
                      'f:name': {}
                      'f:protocol': {}
                  'f:readinessProbe':
                    'f:httpGet':
                      'f:path': {}
                      'f:port': {}
                    'f:initialDelaySeconds': {}
                    'f:timeoutSeconds': {}
                  'f:resources': {}
                  'f:securityContext':
                    'f:allowPrivilegeEscalation': {}
                    'f:capabilities':
                      'f:drop': {}
                    'f:readOnlyRootFilesystem': {}
                  'f:volumeMounts':
                    'k:{"mountPath":"/etc/loki/config"}':
                      .: {}
                      'f:mountPath': {}
                      'f:name': {}
                    'k:{"mountPath":"/var/loki"}':
                      .: {}
                      'f:mountPath': {}
                      'f:name': {}
              'f:securityContext':
                'f:fsGroup': {}
                'f:runAsGroup': {}
                'f:runAsNonRoot': {}
                'f:runAsUser': {}
              'f:serviceAccountName': {}
              'f:terminationGracePeriodSeconds': {}
              'f:volumes':
                'k:{"name":"config"}':
                  .: {}
                  'f:configMap':
                    'f:name': {}
                  'f:name': {}
          'f:updateStrategy':
            'f:rollingUpdate':
              'f:partition': {}
          'f:volumeClaimTemplates': {}
      manager: argocd-controller
      operation: Apply
      time: '2022-10-28T07:36:52Z'
    - apiVersion: apps/v1
      fieldsType: FieldsV1
      fieldsV1:
        'f:metadata':
          'f:annotations': {}
          'f:labels':
            .: {}
            'f:app.kubernetes.io/component': {}
            'f:app.kubernetes.io/managed-by': {}
            'f:app.kubernetes.io/name': {}
            'f:app.kubernetes.io/part-of': {}
            'f:app.kubernetes.io/version': {}
            'f:helm.sh/chart': {}
        'f:spec':
          'f:podManagementPolicy': {}
          'f:replicas': {}
          'f:revisionHistoryLimit': {}
          'f:selector': {}
          'f:serviceName': {}
          'f:template':
            'f:metadata':
              'f:annotations':
                .: {}
                'f:checksum/config': {}
              'f:labels':
                .: {}
                'f:app.kubernetes.io/component': {}
                'f:app.kubernetes.io/instance': {}
                'f:app.kubernetes.io/name': {}
                'f:app.kubernetes.io/part-of': {}
            'f:spec':
              'f:affinity':
                .: {}
                'f:podAntiAffinity':
                  .: {}
                  'f:requiredDuringSchedulingIgnoredDuringExecution': {}
              'f:automountServiceAccountToken': {}
              'f:containers':
                'k:{"name":"write"}':
                  .: {}
                  'f:args': {}
                  'f:env':
                    .: {}
                    'k:{"name":"AWS_ACCESS_KEY_ID"}':
                      .: {}
                      'f:name': {}
                      'f:valueFrom':
                        .: {}
                        'f:secretKeyRef': {}
                    'k:{"name":"AWS_SECRET_ACCESS_KEY"}':
                      .: {}
                      'f:name': {}
                      'f:valueFrom':
                        .: {}
                        'f:secretKeyRef': {}
                  'f:image': {}
                  'f:imagePullPolicy': {}
                  'f:name': {}
                  'f:ports':
                    .: {}
                    'k:{"containerPort":3100,"protocol":"TCP"}':
                      .: {}
                      'f:containerPort': {}
                      'f:name': {}
                      'f:protocol': {}
                    'k:{"containerPort":7946,"protocol":"TCP"}':
                      .: {}
                      'f:containerPort': {}
                      'f:name': {}
                      'f:protocol': {}
                    'k:{"containerPort":9095,"protocol":"TCP"}':
                      .: {}
                      'f:containerPort': {}
                      'f:name': {}
                      'f:protocol': {}
                  'f:readinessProbe':
                    .: {}
                    'f:failureThreshold': {}
                    'f:httpGet':
                      .: {}
                      'f:path': {}
                      'f:port': {}
                      'f:scheme': {}
                    'f:initialDelaySeconds': {}
                    'f:periodSeconds': {}
                    'f:successThreshold': {}
                    'f:timeoutSeconds': {}
                  'f:resources': {}
                  'f:securityContext':
                    .: {}
                    'f:allowPrivilegeEscalation': {}
                    'f:capabilities':
                      .: {}
                      'f:drop': {}
                    'f:readOnlyRootFilesystem': {}
                  'f:terminationMessagePath': {}
                  'f:terminationMessagePolicy': {}
                  'f:volumeMounts':
                    .: {}
                    'k:{"mountPath":"/etc/loki/config"}':
                      .: {}
                      'f:mountPath': {}
                      'f:name': {}
                    'k:{"mountPath":"/var/loki"}':
                      .: {}
                      'f:mountPath': {}
                      'f:name': {}
              'f:dnsPolicy': {}
              'f:restartPolicy': {}
              'f:schedulerName': {}
              'f:securityContext':
                .: {}
                'f:fsGroup': {}
                'f:runAsGroup': {}
                'f:runAsNonRoot': {}
                'f:runAsUser': {}
              'f:serviceAccount': {}
              'f:serviceAccountName': {}
              'f:terminationGracePeriodSeconds': {}
              'f:volumes':
                .: {}
                'k:{"name":"config"}':
                  .: {}
                  'f:configMap':
                    .: {}
                    'f:defaultMode': {}
                    'f:name': {}
                  'f:name': {}
          'f:updateStrategy':
            'f:rollingUpdate':
              .: {}
              'f:partition': {}
            'f:type': {}
      manager: argocd-application-controller
      operation: Update
      time: '2022-10-26T09:13:13Z'
    - apiVersion: apps/v1
      fieldsType: FieldsV1
      fieldsV1:
        'f:status':
          'f:availableReplicas': {}
          'f:collisionCount': {}
          'f:currentReplicas': {}
          'f:currentRevision': {}
          'f:observedGeneration': {}
          'f:readyReplicas': {}
          'f:replicas': {}
          'f:updateRevision': {}
          'f:updatedReplicas': {}
      manager: kube-controller-manager
      operation: Update
      subresource: status
      time: '2022-10-26T09:18:53Z'
    - apiVersion: apps/v1
      fieldsType: FieldsV1
      fieldsV1:
        'f:metadata':
          'f:annotations':
            'f:kubectl.kubernetes.io/last-applied-configuration': {}
          'f:labels':
            'f:app.kubernetes.io/instance': {}
            'f:argocd.argoproj.io/instance': {}
      manager: argocd-controller
      operation: Update
      time: '2022-10-28T07:19:13Z'
  name: loki-write
  namespace: logging
  resourceVersion: '46346521'
  uid: 159449f2-01c3-4ee1-8b91-2e1e90c1e9eb
spec:
  podManagementPolicy: Parallel
  replicas: 3
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      app.kubernetes.io/component: write
      app.kubernetes.io/instance: loki
      app.kubernetes.io/name: loki
  serviceName: loki-write-headless
  template:
    metadata:
      annotations:
        checksum/config: dc4356fb9c8ae2285982e39f348eaa3087a7bd09084224adb6915903fdf04574
      creationTimestamp: null
      labels:
        app.kubernetes.io/component: write
        app.kubernetes.io/instance: loki
        app.kubernetes.io/name: loki
        app.kubernetes.io/part-of: memberlist
    spec:
      affinity:
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            - labelSelector:
                matchLabels:
                  app.kubernetes.io/component: write
                  app.kubernetes.io/instance: loki
                  app.kubernetes.io/name: loki
              topologyKey: kubernetes.io/hostname
      automountServiceAccountToken: true
      containers:
        - args:
            - '-config.file=/etc/loki/config/config.yaml'
            - '-target=write'
          env:
            - name: AWS_ACCESS_KEY_ID
              valueFrom:
                secretKeyRef:
                  key: AWS_ACCESS_KEY_ID
                  name: loki-s3
            - name: AWS_SECRET_ACCESS_KEY
              valueFrom:
                secretKeyRef:
                  key: AWS_SECRET_ACCESS_KEY
                  name: loki-s3
          image: 'docker.io/grafana/loki:2.6.1'
          imagePullPolicy: IfNotPresent
          name: write
          ports:
            - containerPort: 3100
              name: http-metrics
              protocol: TCP
            - containerPort: 9095
              name: grpc
              protocol: TCP
            - containerPort: 7946
              name: http-memberlist
              protocol: TCP
          readinessProbe:
            failureThreshold: 3
            httpGet:
              path: /ready
              port: http-metrics
              scheme: HTTP
            initialDelaySeconds: 30
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 1
          resources: {}
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
            readOnlyRootFilesystem: true
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
          volumeMounts:
            - mountPath: /etc/loki/config
              name: config
            - mountPath: /var/loki
              name: data
      dnsPolicy: ClusterFirst
      restartPolicy: Always
      schedulerName: default-scheduler
      securityContext:
        fsGroup: 10001
        runAsGroup: 10001
        runAsNonRoot: true
        runAsUser: 10001
      serviceAccount: loki
      serviceAccountName: loki
      terminationGracePeriodSeconds: 300
      volumes:
        - configMap:
            defaultMode: 420
            name: loki
          name: config
  updateStrategy:
    rollingUpdate:
      partition: 0
    type: RollingUpdate
  volumeClaimTemplates:
    - apiVersion: v1
      kind: PersistentVolumeClaim
      metadata:
        creationTimestamp: null
        name: data
      spec:
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 10Gi
        storageClassName: openebs-hostpath
        volumeMode: Filesystem
      status:
        phase: Pending
status:
  availableReplicas: 3
  collisionCount: 0
  currentReplicas: 3
  currentRevision: loki-write-68f4b7bcfc
  observedGeneration: 1
  readyReplicas: 3
  replicas: 3
  updateRevision: loki-write-68f4b7bcfc
  updatedReplicas: 3
```
Desired manifest:

```
apiVersion: apps/v1
kind: StatefulSet
metadata:
  labels:
    app.kubernetes.io/component: write
    app.kubernetes.io/instance: loki
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/name: loki
    app.kubernetes.io/part-of: memberlist
    app.kubernetes.io/version: 2.6.1
    argocd.argoproj.io/instance: logging
    helm.sh/chart: loki-3.2.0
  name: loki-write
  namespace: logging
spec:
  podManagementPolicy: Parallel
  replicas: 3
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      app.kubernetes.io/component: write
      app.kubernetes.io/instance: loki
      app.kubernetes.io/name: loki
  serviceName: loki-write-headless
  template:
    metadata:
      annotations:
        checksum/config: dc4356fb9c8ae2285982e39f348eaa3087a7bd09084224adb6915903fdf04574
      labels:
        app.kubernetes.io/component: write
        app.kubernetes.io/instance: loki
        app.kubernetes.io/name: loki
        app.kubernetes.io/part-of: memberlist
    spec:
      affinity:
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            - labelSelector:
                matchLabels:
                  app.kubernetes.io/component: write
                  app.kubernetes.io/instance: loki
                  app.kubernetes.io/name: loki
              topologyKey: kubernetes.io/hostname
      automountServiceAccountToken: true
      containers:
        - args:
            - '-config.file=/etc/loki/config/config.yaml'
            - '-target=write'
          env:
            - name: AWS_ACCESS_KEY_ID
              valueFrom:
                secretKeyRef:
                  key: AWS_ACCESS_KEY_ID
                  name: loki-s3
            - name: AWS_SECRET_ACCESS_KEY
              valueFrom:
                secretKeyRef:
                  key: AWS_SECRET_ACCESS_KEY
                  name: loki-s3
          image: 'docker.io/grafana/loki:2.6.1'
          imagePullPolicy: IfNotPresent
          name: write
          ports:
            - containerPort: 3100
              name: http-metrics
              protocol: TCP
            - containerPort: 9095
              name: grpc
              protocol: TCP
            - containerPort: 7946
              name: http-memberlist
              protocol: TCP
          readinessProbe:
            httpGet:
              path: /ready
              port: http-metrics
            initialDelaySeconds: 30
            timeoutSeconds: 1
          resources: {}
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
            readOnlyRootFilesystem: true
          volumeMounts:
            - mountPath: /etc/loki/config
              name: config
            - mountPath: /var/loki
              name: data
      securityContext:
        fsGroup: 10001
        runAsGroup: 10001
        runAsNonRoot: true
        runAsUser: 10001
      serviceAccountName: loki
      terminationGracePeriodSeconds: 300
      volumes:
        - configMap:
            name: loki
          name: config
  updateStrategy:
    rollingUpdate:
      partition: 0
  volumeClaimTemplates:
    - metadata:
        name: data
      spec:
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 10Gi
        storageClassName: openebs-hostpath
```

I also saw this old issue https://github.com/argoproj/argo-cd/issues/4126 related to what looks like the same problem.

To Reproduce

Install this helm chart: https://github.com/grafana/loki/tree/main/production/helm/loki
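
For example, an Application roughly like the following reproduces it for me (a minimal sketch; the destination, project, and chart version just reflect my setup, and ServerSideApply=true is the relevant part):

```
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: logging
  namespace: argocd
spec:
  destination:
    namespace: logging
    server: https://kubernetes.default.svc
  project: default
  source:
    chart: loki
    repoURL: https://grafana.github.io/helm-charts
    targetRevision: 3.2.0
  syncPolicy:
    syncOptions:
      # The diff only appears when the resource is applied server-side.
      - ServerSideApply=true
```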

Expected behavior

No diff.

Screenshots

(screenshot of the OutOfSync diff)

Version

argocd: v2.4.11+3d9e9f2.dirty
  BuildDate: 2022-08-22T19:32:10Z
  GitCommit: 3d9e9f2f95b7801b90377ecfc4073e5f0f07205b
  GitTreeState: dirty
  GoVersion: go1.19
  Compiler: gc
  Platform: darwin/amd64
WARN[0000] Failed to invoke grpc call. Use flag --grpc-web in grpc calls. To avoid this warning message, use flag --grpc-web. 
argocd-server: v2.5.0+b895da4

Logs

Paste any relevant application logs here.

maikelpoot commented 1 year ago

We ran into the same problem, not only with a StatefulSet but also with a custom resource where an optional field causes the resource to stay out of sync. This only occurred in applications with ServerSideApply enabled.

(screenshot)

password is the default value for the property if omitted
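
For illustration, the default comes from the CRD's structural schema, roughly like this hypothetical excerpt:

```
# Hypothetical excerpt of the CRD's openAPIV3Schema: because `password`
# declares a default, the API server fills the field in during a
# server-side apply even when the manifest omits it, so the resource
# keeps showing as out of sync.
openAPIV3Schema:
  type: object
  properties:
    spec:
      type: object
      properties:
        password:
          type: string
          default: password
```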

leoluz commented 1 year ago

We ran into the same problem, not only with a StatefulSet but also with a custom resource where an optional field causes the resource to stay out of sync. This only occurred in applications with ServerSideApply enabled.

Hi @maikelpoot. This is a different issue and is tracked in https://github.com/argoproj/argo-cd/issues/11139

mananpreetsingh commented 1 year ago

+1 We are also experiencing exactly the same issue.

sathieu commented 1 year ago

Probably related to https://github.com/kubernetes/kubernetes/pull/87706 (current version here). But I don't understand why the conversion is done...

EDIT: https://github.com/kubernetes/website/pull/38981

apelisse commented 1 year ago

Server-side apply doesn't ask for a conversion, but the apiserver does lots of conversions internally to operate on the StatefulSet. Without server-side apply, the object is not sent to the server, so nothing happens. With server-side apply, the conversion acts exactly like a default in that case (the conversion defaults these fields), so they end up appearing in the dry-run output. (I don't know much about how any of this works, but I suspect the diff is done through a dry-run request.)

i.e. in practice, it's the same thing as https://github.com/argoproj/argo-cd/issues/11143#issuecomment-1324956158

linhng98 commented 1 year ago

+1 This issue appears with ServerSideApply in any Helm chart that uses volumeClaimTemplates (Harbor, Vault, ECK, NATS, ...)

leoluz commented 1 year ago

Server-side apply doesn't ask for a conversion, but the apiserver does lots of conversions internally to operate on the StatefulSet. Without server-side apply, the object is not sent to the server, so nothing happens. With server-side apply, the conversion acts exactly like a default in that case (the conversion defaults these fields), so they end up appearing in the dry-run output. (I don't know much about how any of this works, but I suspect the diff is done through a dry-run request.)

@apelisse Thanks for jumping into the discussion. In the current version of Argo CD Server-Side Apply, we don't do a dry-run request. We tried to follow the same approach used by CSA, aiming for efficiency and avoiding that additional request to the k8s API server. For that, I checked how the k8s API server does the conversions and duplicated that logic in the Argo CD controller. Unfortunately, that is causing issues with default fields. I suspect it is related to the OpenAPI document that we provide to the managedfields.NewGVKParser() constructor. I couldn't find a way to retrieve an OpenAPI document that contains the schemas' default values.
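
Roughly, the flow looks like this (a simplified sketch of the idea, not the actual Argo CD code; the kubeconfig path and error handling are illustrative):

```
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/apimachinery/pkg/util/managedfields"
	"k8s.io/client-go/discovery"
	"k8s.io/client-go/tools/clientcmd"
	"k8s.io/kube-openapi/pkg/util/proto"
)

func main() {
	// Connect using a local kubeconfig (illustrative path).
	cfg, err := clientcmd.BuildConfigFromFlags("", "/home/me/.kube/config")
	if err != nil {
		panic(err)
	}
	dc, err := discovery.NewDiscoveryClientForConfig(cfg)
	if err != nil {
		panic(err)
	}

	// Fetch the cluster's OpenAPI v2 document. It describes the schemas but
	// does not carry the server-side default values, which is why a diff
	// computed purely from it misses defaulted fields.
	doc, err := dc.OpenAPISchema()
	if err != nil {
		panic(err)
	}
	models, err := proto.NewOpenAPIData(doc)
	if err != nil {
		panic(err)
	}

	// The parser mentioned above: it only knows what the document tells it.
	parser, err := managedfields.NewGVKParser(models, false)
	if err != nil {
		panic(err)
	}
	sts := parser.Type(schema.GroupVersionKind{Group: "apps", Version: "v1", Kind: "StatefulSet"})
	fmt.Println("found StatefulSet schema:", sts != nil)
}
```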

Going forward, I believe that dry-run is a better way to achieve more accurate diffs, as (I believe) it will handle cases where the resource is configured with mutating webhooks. I have an Argo CD proposal to implement dry-run during diff calculation (https://github.com/argoproj/argo-cd/issues/11574). I am currently negotiating time to dedicate to the implementation.

apelisse commented 1 year ago

I couldn't find a way to retrieve an OpenAPI document that contains the definitions of schemas default values.

Yeah, we'd love to have more defaults in there, but you'll never get all of them anyway (mostly, as you mentioned, because of mutating webhooks and such). Note that dry-run doesn't work well for multi-resource cases where there's a dependency (a namespace must be created before the resources in that namespace can be created, a CRD must be created before the CRs can be created, etc.). Let me know if I can help with anything.
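
For example, you can observe that defaulting directly with a server-side dry-run (here loki-write.yaml stands for the rendered chart output from this issue):

```
kubectl apply --server-side --dry-run=server -f loki-write.yaml -o yaml
```

The object that comes back already contains fields like volumeMode: Filesystem and the apiVersion/kind stanzas inside volumeClaimTemplates, which are exactly the extra fields visible in the live manifest above.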

alita1991 commented 1 year ago

+1 I hit the same issue today on PVCs and on the ServiceMonitor CR from the Prometheus stack.

anthonyalayo commented 1 year ago

I hit the same issue today with Loki.

blackliner commented 11 months ago

Same issue with kuik

https://github.com/enix/kube-image-keeper/blob/eab1d909dd058a71154d1aaaf86d62b3f7bbceb4/helm/kube-image-keeper/templates/registry-statefulset.yaml#L108-L110

(screenshot)

mtrin commented 11 months ago

For now I'm just patching the StatefulSets in kustomize/Helm:

  - target:
      kind: StatefulSet
      name: mimir-alertmanager
    patch: |-
      - op: remove
        path: /spec/volumeClaimTemplates
      - op: add
        path: /spec/volumeClaimTemplates
        value:
          - apiVersion: v1
            kind: PersistentVolumeClaim
            metadata:
              name: storage
            spec:
              accessModes:
                - ReadWriteOnce
              resources:
                requests:
                  storage: 1Gi
  - target:
      kind: StatefulSet
      name: mimir-compactor
    patch: |-
      - op: remove
        path: /spec/volumeClaimTemplates
      - op: add
        path: /spec/volumeClaimTemplates
        value:
          - apiVersion: v1
            kind: PersistentVolumeClaim
            metadata:
              name: storage
            spec:
              accessModes:
                - ReadWriteOnce
              resources:
                requests:
                  storage: 1Gi
  - target:
      kind: StatefulSet
      name: mimir-store-gateway
    patch: |-
      - op: remove
        path: /spec/volumeClaimTemplates
      - op: add
        path: /spec/volumeClaimTemplates
        value:
          - apiVersion: v1
            kind: PersistentVolumeClaim
            metadata:
              name: storage
            spec:
              accessModes:
                - ReadWriteOnce
              resources:
                requests:
                  storage: 1Gi

evanrich commented 10 months ago

Any updates on this? Is there a way to ignore the differences in the first screenshot? I have multiple Helm charts complaining about apiVersion and kind, just like the screenshot.

kevinetore commented 10 months ago

+1 I have the same issue, multiple Helm charts.

sgsollie commented 10 months ago

Based on https://argo-cd.readthedocs.io/en/stable/user-guide/diffing and a post I saw on a related issue: https://github.com/argoproj/argo-cd/issues/11074#issuecomment-1298825859

This ignoreDifferences block in your Application definition for the kube-prometheus stack (where you're server-side applying Loki) will give you the shiny green checkmark:

ignoreDifferences:
  - group: monitoring.coreos.com
    kind: ServiceMonitor
    jqPathExpressions:
      - '.spec.endpoints[]?.relabelings[]?.action'
  - group: apps
    kind: StatefulSet
    jqPathExpressions:
      - '.spec.volumeClaimTemplates[]?'

For my use case, I am not too worried about Argo "thinking" there is a diff for these StatefulSets, because we mostly use default settings for the kube-prometheus stack. We install it and forget about it.

Others of you may not want this workaround. Really, the bug in how Argo is diffing needs addressing.

crenshaw-dev commented 10 months ago

Server-side diffing is still in Leo's queue (he recently built a PoC), so a proper fix is on its way. But I don't have an ETA yet.

sgsollie commented 10 months ago

Server-side diffing is still in Leo's queue (he recently built a PoC), so a proper fix is on its way. But I don't have an ETA yet.

Hey @crenshaw-dev, thanks for the update on that. Appreciate all you guys' efforts.

Hariharasuthan99 commented 9 months ago

We have run into this issue in our organization as well. Eagerly waiting for the fix. :)

guillermoelia commented 8 months ago

Hi fine folk of ArgoCD, any news from the front? I just happened to hit this today as well.

vl-kp commented 8 months ago

We got the same issue.

MohammedNoureldin commented 7 months ago

I have got a similar issue with Loki, probably related?

(screenshot)

sathieu commented 7 months ago

@MohammedNoureldin Yes.

crenshaw-dev commented 7 months ago

Talked with Leo about this yesterday. He's working through how to manage caching the diff.

Vinaum8 commented 4 months ago

(quotes the original issue description in full)

I resolved this.

I deactivated `- ServerSideApply=true` in the Argo CD Application and it worked for me! :D

leoluz commented 4 months ago

The ServerSide Diff feature is merged and available in Argo CD 2.10-RC1. It should address this and other diff problems when ServerSide Apply is enabled.

I am closing this for now; feel free to reopen if the issue persists.
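
Per the diff-strategies documentation, it can be enabled globally or per Application; a sketch of both knobs (the Application name below is a placeholder):

```
# Globally, via the argocd-cmd-params-cm ConfigMap:
apiVersion: v1
kind: ConfigMap
metadata:
  name: argocd-cmd-params-cm
  namespace: argocd
data:
  controller.diff.server.side: "true"
---
# Or per Application, via annotation:
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: my-app
  namespace: argocd
  annotations:
    argocd.argoproj.io/compare-options: ServerSideDiff=true
```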

STollenaar commented 3 months ago

Are there any special upgrade steps, or do we need to add that ignoreDifferences part? Still seeing this issue with the v2.10.0+2175939 version. (screenshots)

ilbarone87 commented 3 months ago

Are there any special upgrade steps, or do we need to add that ignoreDifferences part? Still seeing this issue with the v2.10.0+2175939 version.

(screenshots)

Still here with a Vault deployment. The diff is reporting the volumeClaimTemplates as not in sync.

ForbiddenEra commented 3 months ago

(quotes @sgsollie's ignoreDifferences workaround from earlier in the thread)

<3

MohammedNoureldin commented 3 months ago

Like the last couple of comments, I also enabled server-side diffing for a specific application (the Prometheus Operator CRDs), but I still see a lot of diffs that cause my Application to be reported as out of sync. The same goes for Loki.

I also tried the mutation webhook option, but this made no difference either.

  annotations:
    argocd.argoproj.io/compare-options: ServerSideDiff=true,IncludeMutationWebhook=true

Is there anything we are missing?

@STollenaar or anybody, did you find a solution for this?

Here is a screenshot to show an example:

(screenshot)

ForbiddenEra commented 3 months ago

Like the last couple of comments, I also enabled server-side diffing for a specific application (the Prometheus Operator CRDs), but I still see a lot of diffs that cause my Application to be reported as out of sync. The same goes for Loki.

@MohammedNoureldin - For me, also using Prometheus-Operator, the fix posted by @sgsollie that I replied to and <3'd worked great.

sstarcher commented 3 months ago

With 2.10.1 we are still seeing diffs with server-side apply for volume claims. (screenshot)

crenshaw-dev commented 3 months ago

@MohammedNoureldin the diff you're seeing makes sense to me... the contents on-cluster are actually simply different from the contents in git.

Others, can you confirm that you're still seeing a diff after upgrading and enabling server-side-diff (which is enabled separately from server-side apply)?

sstarcher commented 3 months ago

I have not, so that's my issue, thank you. I'll go ensure that's enabled: https://argo-cd.readthedocs.io/en/latest/user-guide/diff-strategies/

MohammedNoureldin commented 3 months ago

Thank you for replying, @crenshaw-dev

I thought at the beginning that it was an issue in Argo CD, but it is not.

Actually, I figured it out. Installing the kube-prometheus chart and the Loki chart together is the issue. Loki installs an older version of the CRDs, which causes the issue. Preventing Loki from installing its CRDs, to rely only on the CRDs from Grafana, also seems not to be working (I am not 100% sure about this; it needs to be confirmed).

So it was kind of a cycle: each one of these reports out-of-sync after syncing the other one.

Yes, I activated server-side diffs (and also apply). I am not sure if it is relevant now, after finding that the source of the issue is not Argo.
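
A quick way to check which version of a CRD is actually installed (a rough sketch; the CRD name is one example from the Prometheus stack, and I am assuming the chart stamps a version annotation, as kube-prometheus-stack does):

```
# Prints the CRD's annotations, e.g. operator.prometheus.io/version,
# so you can see which chart/version last wrote it.
kubectl get crd servicemonitors.monitoring.coreos.com -o jsonpath='{.metadata.annotations}{"\n"}'
```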

STollenaar commented 3 months ago

I was able to have it not show up when I used the serverSideDiff option. But enabling it would only work when I used the annotation on the Application, for some reason.

lud0v1c commented 2 months ago

Can confirm that the argocd.argoproj.io/compare-options: ServerSideDiff=true annotation stopped 6 annoying StatefulSet PVCs from showing as out of sync, just like OP described. Once this gets out of beta, I hope it works as well globally.

ForbiddenEra commented 2 months ago

With 2.10.1 we are still seeing diffs with server-side apply for volume claims. (screenshot)

BTW, the ignoreDifferences workaround works for me even w/SSA.

dioni-dev commented 1 month ago

I am having the same issue, and I am running the latest Argo CD version... omg

lifeofmoo commented 1 month ago

I believe I am also experiencing this issue - I don't see the Out of Sync issue if I am explicit and have a PVC instead of a volumeClaimTemplate.

https://github.com/argoproj/argo-cd/issues/17968

Is the fix to annotate the App with argocd.argoproj.io/compare-options: ServerSideDiff=true ?

apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: bp-knowledge-graph-api
  namespace: argocd
  annotations:
    argocd.argoproj.io/compare-options: ServerSideDiff=true
spec:
  destination:
    namespace: bp-knowledge-graph
    server: https://kubernetes.default.svc
  project: bp-knowledge-graph
  source:
    path: bp-knowledge-graph-api/overlays/dev
    repoURL: https://github.com/ME-Ltd/my-repo
    targetRevision: HEAD
  syncPolicy:
    automated: {}

Because this didn't work for me.

(screenshot)

MohammedNoureldin commented 1 week ago

Does anyone have the issue in Argo CD 2.10.0 where the whole Argo CD instance seems to be kind of stuck when enabling server-side diff? The Refresh button does nothing, and most/all resources look like they are stuck. The logs show a nil pointer issue.

I am not sure what the exact source of this issue is.

time="2024-05-23T17:10:33Z" level=error msg="Recovered from panic: runtime error: invalid memory address or nil pointer dereference\ngoroutine 265 [running]:\nruntime/debug.Stack()\n\t/usr/local/go/src/runtime/debug/stack.go:24 +0x64\ngithub.com/argoproj/argo-cd/v2/controller.(*ApplicationController).processAppRefreshQueueItem.func1()\n\t/go/src/github.com/argoproj/argo-cd/controller/appcontroller.go:1449 +0x50\npanic({0x2f42ca0?, 0x696bca0?})\n\t/usr/local/go/src/runtime/panic.go:920 +0x26c\nk8s.io/apimachinery/pkg/util/managedfields.(*GvkParser).Type(...)\n\t/go/pkg/mod/k8s.io/apimachinery@v0.26.11/pkg/util/managedfields/gvkparser.go:43\ngithub.com/argoproj/gitops-engine/pkg/diff.removeWebhookMutation(0x40073c6038, 0x400d3ea6d0, 0x0, {0x1?, 0x365b150?})\n\t/go/pkg/mod/github.com/argoproj/gitops-engine@v0.7.1-0.20240122213038-792124280fcc/pkg/diff/diff.go:210 +0xa0\ngithub.com/argoproj/gitops-engine/pkg/diff.serverSideDiff(0x400d3ea670, 0x400d3ea6d0, {0x4003e61858, 0x8, 0x8})\n\t/go/pkg/mod/github.com/argoproj/gitops-engine@v0.7.1-0.20240122213038-792124280fcc/pkg/diff/diff.go:176 +0x21c\ngithub.com/argoproj/gitops-engine/pkg/diff.ServerSideDiff(0x400d3ea6d0?, 0x4003e61858?, {0x4003e61858?, 0x8?, 0x4945f00?})\n\t/go/pkg/mod/github.com/argoproj/gitops-engine@v0.7.1-0.20240122213038-792124280fcc/pkg/diff/diff.go:138 +0x28\ngithub.com/argoproj/gitops-engine/pkg/diff.Diff(0x400d3ea358, 0x400d3ea360, {0x4003e61858, 0x8, 0x8})\n\t/go/pkg/mod/github.com/argoproj/gitops-engine@v0.7.1-0.20240122213038-792124280fcc/pkg/diff/diff.go:88 +0x1c0\ngithub.com/argoproj/gitops-engine/pkg/diff.DiffArray({0x40090dc400, 0x2d, 0x4003e617f8?}, {0x40090dc200, 0x2d?, 0x3378d20?}, {0x4003e61858, 0x8, 0x8})\n\t/go/pkg/mod/github.com/argoproj/gitops-engine@v0.7.1-0.20240122213038-792124280fcc/pkg/diff/diff.go:814 +0x11c\ngithub.com/argoproj/argo-cd/v2/util/argo/diff.StateDiffs({0x400e9ca480?, 0x400abbd500?, 0x26?}, {0x40090dc000?, 0x4006e01880?, 0x4?}, {0x4959ef0, 0x4007f08370?})\n\t/go/src/github.com/argoproj/argo-cd/util/argo/diff/diff.go:310 +0x5a8\ngithub.com/argoproj/argo-cd/v2/controller.(*appStateManager).CompareAppState(0x400030a460, 0x4000e74800, 0x4008a43680, {0x4008a1f240, 0x4, 0x4}, {0x4006e01880?, 0x4, 0x4}, 0x0?, ...)\n\t/go/src/github.com/argoproj/argo-cd/controller/state.go:649 +0x2e24\ngithub.com/argoproj/argo-cd/v2/controller.(*ApplicationController).processAppRefreshQueueItem(0x4000b99520)\n\t/go/src/github.com/argoproj/argo-cd/controller/appcontroller.go:1560 +0xe44\ngithub.com/argoproj/argo-cd/v2/controller.(*ApplicationController).Run.func3()\n\t/go/src/github.com/argoproj/argo-cd/controller/appcontroller.go:798 +0x2c\nk8s.io/apimachinery/pkg/util/wait.BackoffUntil.func1(0x40009c56b0?)\n\t/go/pkg/mod/k8s.io/apimachinery@v0.26.11/pkg/util/wait/wait.go:157 +0x40\nk8s.io/apimachinery/pkg/util/wait.BackoffUntil(0x4000a12bd0?, {0x49047c0, 0x4000f48990}, 0x1, 0x400029baa0)\n\t/go/pkg/mod/k8s.io/apimachinery@v0.26.11/pkg/util/wait/wait.go:158 +0x90\nk8s.io/apimachinery/pkg/util/wait.JitterUntil(0x40009c5f80?, 0x3b9aca00, 0x0, 0x20?, 0x40009c4ba0?)\n\t/go/pkg/mod/k8s.io/apimachinery@v0.26.11/pkg/util/wait/wait.go:135 +0x80\nk8s.io/apimachinery/pkg/util/wait.Until(0x40009c4ff0?, 0x40009c5170?, 0x4000a12240?)\n\t/go/pkg/mod/k8s.io/apimachinery@v0.26.11/pkg/util/wait/wait.go:92 +0x28\ncreated by github.com/argoproj/argo-cd/v2/controller.(*ApplicationController).Run in goroutine 117\n\t/go/src/github.com/argoproj/argo-cd/controller/appcontroller.go:797 +0x5ec\n"
evanrich commented 1 week ago

(quotes @MohammedNoureldin's comment above, including the panic log)

Recent versions were royally jacked for me. I ended up rolling back to version 6.7.18 of the chart, which equals Argo CD 2.10.9; this sort of works.
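
For reference, the rollback is just pinning the chart version (the release name argocd and the repo alias argo are assumptions about my setup):

```
helm upgrade argocd argo/argo-cd -n argocd --version 6.7.18
```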