litmuschaos / litmus

Litmus helps SREs and developers practice chaos engineering in a Cloud-native way. Chaos experiments are published at the ChaosHub (https://hub.litmuschaos.io). Community notes is at https://hackmd.io/a4Zu_sH4TZGeih-xCimi3Q
https://litmuschaos.io
Apache License 2.0
4.45k stars 698 forks source link

errorInYamlDescription in 3.4.0 UI when submitting an existing 2.14 Workflow Experiment #4479

Open sebay opened 9 months ago

sebay commented 9 months ago

What happened: When I try to create a new Experiment from an existing 2.14.0 yaml I get an error without any detail except: "errorInYamlDescription". I am also unable to edit or run the experiment.

What you expected to happen: It should be possible to import existing 2.14.0 experiment

Where can this issue be corrected? (optional)

How to reproduce it (as minimally and precisely as possible):

kind: Workflow
apiVersion: argoproj.io/v1alpha1
metadata:
  name: custom-active-broker-container-kill
  namespace: application
  labels:
    subject: custom-active-broker-container-kill
spec:
  templates:
    - name: custom-chaos
      inputs: {}
      outputs: {}
      metadata: {}
      steps:
        - - name: install-chaos-experiments
            template: install-chaos-experiments
            arguments: {}
        - - name: check-ocp-prometheus-is-up
            template: check-ocp-prometheus-is-up
            arguments: {}          
        - - name: generate-traffic
            template: generate-traffic
            arguments: {}         
        - - name: container-kill-active
            template: container-kill-active
            arguments: {}
        - - name: stop-traffic
            template: stop-traffic
            arguments: {}
    - name: install-chaos-experiments
      inputs:
        artifacts:
          - name: container-kill-aqh
            path: /tmp/container-kill-aqh.yaml
            raw:
              data: |
                apiVersion: litmuschaos.io/v1alpha1
                description:
                  message: |
                    Kills a container belonging to an application pod 
                kind: ChaosExperiment
                metadata:
                  name: container-kill
                  labels:
                    name: container-kill
                    app.kubernetes.io/part-of: litmus
                    app.kubernetes.io/component: chaosexperiment
                    app.kubernetes.io/version: latest
                spec:
                  definition:
                    scope: Namespaced
                    permissions:
                      - apiGroups:
                          - ""
                        resources:
                          - pods
                        verbs:
                          - create
                          - delete
                          - get
                          - list
                          - patch
                          - update
                          - deletecollection
                      - apiGroups:
                          - ""
                        resources:
                          - events
                        verbs:
                          - create
                          - get
                          - list
                          - patch
                          - update
                      - apiGroups:
                          - ""
                        resources:
                          - configmaps
                        verbs:
                          - get
                          - list
                      - apiGroups:
                          - ""
                        resources:
                          - pods/log
                        verbs:
                          - get
                          - list
                          - watch
                      - apiGroups:
                          - ""
                        resources:
                          - pods/exec
                        verbs:
                          - get
                          - list
                          - create
                      - apiGroups:
                          - apps
                        resources:
                          - deployments
                          - statefulsets
                          - replicasets
                          - daemonsets
                        verbs:
                          - list
                          - get
                      - apiGroups:
                          - apps.openshift.io
                        resources:
                          - deploymentconfigs
                        verbs:
                          - list
                          - get
                      - apiGroups:
                          - ""
                        resources:
                          - replicationcontrollers
                        verbs:
                          - get
                          - list
                      - apiGroups:
                          - argoproj.io
                        resources:
                          - rollouts
                        verbs:
                          - list
                          - get
                      - apiGroups:
                          - batch
                        resources:
                          - jobs
                        verbs:
                          - create
                          - list
                          - get
                          - delete
                          - deletecollection
                      - apiGroups:
                          - litmuschaos.io
                        resources:
                          - chaosengines
                          - chaosexperiments
                          - chaosresults
                        verbs:
                          - create
                          - list
                          - get
                          - patch
                          - update
                          - delete
                    image: xxx/litmuschaos/go-runner:3.4.0
                    imagePullPolicy: IfNotPresent
                    args:
                      - -c
                      - ./experiments -name container-kill
                    command:
                      - /bin/bash
                    env:
                      - name: TARGET_CONTAINER
                        value: ""
                      - name: RAMP_TIME
                        value: ""
                      - name: LIB
                        value: litmus
                      - name: TARGET_PODS
                        value: ""
                      - name: CHAOS_INTERVAL
                        value: "10"
                      - name: SIGNAL
                        value: SIGKILL
                      - name: SOCKET_PATH
                        value: /var/run/docker.sock
                      - name: CONTAINER_RUNTIME
                        value: docker
                      - name: TOTAL_CHAOS_DURATION
                        value: "20"
                      - name: PODS_AFFECTED_PERC
                        value: ""
                      - name: NODE_LABEL
                        value: ""
                      - name: LIB_IMAGE
                        value: regustry/litmuschaos/go-runner:3.4.0
                      - name: SEQUENCE
                        value: parallel
                    labels:
                      name: container-kill
                      app.kubernetes.io/part-of: litmus
                      app.kubernetes.io/component: experiment-job
                      app.kubernetes.io/runtime-api-usage: "true"
                      app.kubernetes.io/version: 3.4.0
      outputs: {}
      metadata: {}
      container:
        name: ""
        image: xxx/litmuschaos/k8s:3.4.0
        command:
          - sh
          - -c
        args:
          - kubectl apply -f /tmp/container-kill-aqh.yaml -n
            {{workflow.parameters.adminModeNamespace}} &&  sleep 30
        resources: {}                        
    - name: check-ocp-prometheus-is-up
      inputs: {}
      outputs: {}
      metadata: {}
      container:
        name: "check-prometheus"
        image: xxx/curlimages/curl
        imagePullPolicy: IfNotPresent
        command:
          - sh
          - -c
        args:
          - echo "Checking OCP Prometheus is up";
            PROM=$(curl -s -o /dev/null -w "%{http_code}" -k https://xx/graph);
            [ ! $PROM == "200" ] && echo "OCP Prometheus is down https://xx/graph" && exit 1;
            exit 0;
        # Not needed
        env:
        - name: OCP_PROMETHEUS
          valueFrom:
            configMapKeyRef:
              name: experiment-config
              key: refkit.client.prometheus
        resources: {}            
    - name: generate-traffic
      inputs: {}
      outputs: {}
      metadata: {}
      container:
        name: "testkit"
        image: registry/curlimages/curl
        imagePullPolicy: IfNotPresent
        command:
          - sh
          - -c
        args:
          - echo "Running Testkit";
            cd /tmp;
            cp /opt/scripts/refkitapp_send.sh /tmp/;
            chmod +x /tmp/rsend.sh;
            /tmp/send.sh;
        env:
        volumeMounts:
          - mountPath: /opt/scripts
            name: script-refkit-configmap
            readOnly: false       
        resources: {}
      volumes:
      - configMap:
          name: script-refkit-configmap
        name: script-refkit-configmap
    - name: container-kill-active
      inputs:
        artifacts:
          - name: container-kill
            path: /tmp/chaosengine-container-kill.yaml
            raw:
              data: |
                apiVersion: litmuschaos.io/v1alpha1
                kind: ChaosEngine
                metadata:
                  namespace: "{{workflow.parameters.adminModeNamespace}}"
                  generateName: container-kill
                  labels:
                    workflow_run_id: "{{workflow.uid}}"
                spec:
                  engineState: active
                  #terminationGracePeriodSeconds: 30
                  appinfo:
                    appns: application
                    applabel: node-type=message-routing-primary
                    appkind: statefulset
                  chaosServiceAccount: litmus-admin
                  jobCleanUpPolicy: retain
                  #jobCleanUpPolicy: delete
                  experiments:
                    - name: container-kill
                      spec:           
                        probe:
                        - name: "check-probe-rate"
                          type: "cmdProbe"
                          cmdProbe/inputs:
                            #command: curl ... | jq .data.result[0].value[1]'
                            command: > 
                              curl -k "https://xxx' | sed 's/\".*//' | sed  's/\..*//' | sed 's/result/0/'
                            comparator:
                              type: 'float'
                              criteria: '>='
                              value: '40'
                          mode: "OnChaos"
                          runProperties:  
                            probeTimeout: 2
                            interval: 10
                            retry: 3
                            probePollingInterval: 5
                            #stopOnFailure: true                            
                        components:               
                          env:           
                            - name: TOTAL_CHAOS_DURATION
                              value: "120"
                            - name: CHAOS_INTERVAL
                              value: "10"
                            - name: CONTAINER_RUNTIME
                              value: containerd
                            - name: SOCKET_PATH
                              value: /run/containerd/containerd.sock
                            - name: PODS_AFFECTED_PERC
                              value: ""
                            - name: TARGET_CONTAINER
                              value: ""                            
                            - name: RAMP_TIME
                              value: "30"
      outputs: {}
      metadata:
        labels:
          weight: "10"
      container:
        name: ""
        image: registry/litmuschaos/litmus-checker:3.4.0
        imagePullPolicy: IfNotPresent
        args:
          - -file=/tmp/chaosengine-container-kill.yaml
          - -saveName=/tmp/engine-name
        resources: {}

    - name: stop-traffic
      inputs: {}
      outputs: {}
      metadata: {}
      container:
        name: "testkit"
        image: registry/curlimages/curl
        imagePullPolicy: IfNotPresent
        command:
          - sh
          - -c
        args:
          - echo "Stopping Testkit";
            curl -k -X POST "https://xx" -H "accept:*/*" -d "";
        env:
        resources: {}

  entrypoint: custom-chaos

  arguments:
    parameters:
      - name: adminModeNamespace
        value: application
      - name: appNamespace
        value: default
      - name: appLabel
        value: dummy

  serviceAccountName: argo-chaos

  #podGC:
  #  strategy: OnWorkflowCompletion
  #  deleteDelayDuration: 30s

  securityContext:
    runAsUser: 1000
    runAsNonRoot: true

Anything else we need to know?:

SarthakJain26 commented 8 months ago

@Jonsy13 PTAL

dvdklnr commented 5 months ago

I'm seeing this issue as well. I'm wondering if I should just go back to version 2.x rather than fight 3.0 behavior.

Jonsy13 commented 5 months ago

Hi @sebay @dvdklnr,

Thanks for trying 3.x ChaosCenter. There are few changes done as part of 3.x version -

In the provided manifest, There are 3 steps which are free flow steps and not Chaos fault step

- - name: check-ocp-prometheus-is-up
    template: check-ocp-prometheus-is-up
    arguments: {}          
- - name: generate-traffic
    template: generate-traffic
    arguments: {}
- - name: stop-traffic
    template: stop-traffic
    arguments: {}

Since these steps are not chaos fault steps, you cannot use them. If you remove them and trying using same experiment in 3.x, it will work. I tested with your experiment itself.

On the error description, Yes this can be enhanced better. @hrishavjha can check that!

dvdklnr commented 5 months ago

I tried converting the previous workflow manifest to a version with probe annotation and now I'm getting a "failure to unmarshal chaosengine" error when trying to run the experiment.

Speaking of probe annotations - I can't find these k8s probe objects , is there a way to directly edit the probe

Here is the latest manifest:

apiVersion: argoproj.io/v1alpha1
kind: Workflow
metadata:
  name: gke-az-rollout-health-check
  namespace: litmus
  labels:
    workflow_name: "gke-az-rollout-health-check"
    subject: "gke-az-rollout-health-check_litmus"
  annotations:
    categories: "gcp,availability"
    definition: "gke-az-chaos-rollout-health"
    vendor: "CNCF"
spec:
  arguments:
    parameters:
      - name: TOTAL_CHAOS_DURATION
        value: "300"
      - name: GCP_PROJECT_ID
        value: "company-service-staging"
      - name: REGION
        value: "us-west1"
      - name: ZONE
        value: "us-west1-c"
      - name: ROLLOUT_NAMESPACE
        value: "company-staging-ns"
      - name: ROLLOUT_NAME
        value: "atlantis"
  entrypoint: gke-az-chaos
  serviceAccountName: argo-chaos
  templates:
    - name: gke-az-chaos
      steps:
        - - name: install-chaos-faults
            template: install-chaos-faults
        - - name: gcp-az-chaos
            template: gcp-az-chaos
        - - name: revert-chaos
            template: revert-chaos

    - name: install-chaos-faults
      inputs:
        artifacts:
          - name: gcp-az-chaos
            path: /tmp/gcp-az-chaos.yaml
            raw:
              data: |
                apiVersion: litmuschaos.io/v1alpha1
                kind: ChaosExperiment
                metadata:
                  name: gcp-az-chaos
                  namespace: {{workflow.namespace}}
                spec:
                  definition:
                    scope: cluster
                    permissions:
                      - apiGroups: [""]
                        resources: ["pods"]
                        verbs: ["get","list","patch","update"]
                    image: "litmuschaos/go-runner:latest"
                    args:
                    - -c
                    - ./experiments -name gcp-az-chaos
                    command:
                    - /bin/bash
                    env:
                    - name: TOTAL_CHAOS_DURATION
                      value: '{{workflow.parameters.TOTAL_CHAOS_DURATION}}'
                    - name: CLOUD_PROVIDER
                      value: 'gcp'
                    - name: PROJECT_ID
                      value: '{{workflow.parameters.GCP_PROJECT_ID}}'
                    - name: REGION
                      value: '{{workflow.parameters.REGION}}'
                    - name: ZONE
                      value: '{{workflow.parameters.ZONE}}'
                    - name: ZONE_SELECTION
                      value: 'single_zone'
                    labels:
                      name: gcp-az-chaos
      container:
        image: litmuschaos/k8s:latest
        command: [sh, -c]
        args:
          ["kubectl apply -f /tmp/gcp-az-chaos.yaml -n {{workflow.namespace}}"]

    - name: gcp-az-chaos
      inputs:
        artifacts:
          - name: gcp-az-chaos
            path: /tmp/chaosengine-gcp-az-chaos.yaml
            raw:
              data: |
                apiVersion: litmuschaos.io/v1alpha1
                kind: ChaosEngine
                metadata:
                  namespace: {{workflow.namespace}}
                  labels:
                    context: "{{workflow.parameters.appNamespace}}_kube-proxy"
                    workflow_run_id: "{{ workflow.uid }}"
                    workflow_name: gcp-az-chaos
                  annotations:
                    probeRef: '[{"name":"atlantis-health","mode":"SOT"}]'
                  generateName: gcp-az-chaos
                spec:
                  engineState: 'active'
                  annotationCheck: 'false'
                  appinfo:
                    appns: '{{workflow.parameters.ROLLOUT_NAMESPACE}}'
                    applabel: 'app={{workflow.parameters.ROLLOUT_NAME}}'
                    appkind: 'rollout'
                  chaosServiceAccount: argo-chaos
                  experiments:
                    - name: gcp-az-chaos
                      spec:
                        components:
                          env:
                            - name: TOTAL_CHAOS_DURATION
                              value: "60"
                            - name: CHAOS_INTERVAL
                              value: "10"
                            - name: FORCE
                              value: "false"

      container:
        image: litmuschaos/litmus-checker:latest
        args:
          - -file=/tmp/chaosengine-gcp-az-chaos.yaml
          - -saveName=/tmp/engine-name

    - name: revert-chaos
      container:
        image: litmuschaos/k8s:latest
        command: [sh, -c]
        args:
          - "kubectl delete chaosengine -l 'workflow_run_id={{workflow.uid}}' -n {{workflow.namespace}}"

edit: updated markup