kubernetes-sigs / descheduler

Descheduler for Kubernetes
https://sigs.k8s.io/descheduler
Apache License 2.0
4.23k stars 645 forks source link

Endless descheduling of pods with node affinity preferredDuringSchedulingIgnoredDuringExecution and enough resources available on not tainted node but not on a tainted node #1410

Open dbrcelum opened 1 month ago

dbrcelum commented 1 month ago

What version of descheduler are you using?

descheduler version: 0.29.0/0.30.0

Does this issue reproduce with the latest release? yes

Which descheduler CLI options are you using?

logging-format: json
v: 4

Please provide a copy of your descheduler policy config file

deschedulerPolicy:
  strategies:
    LowNodeUtilization:
      enabled: false
      params:
        nodeResourceUtilizationThresholds:
          targetThresholds:
            cpu: 50
            memory: 50
            pods: 50
          thresholds:
            cpu: 20
            memory: 20
            pods: 20
    RemoveDuplicates:
      enabled: false
    RemovePodsHavingTooManyRestarts:
      enabled: false
      params:
        podsHavingTooManyRestarts:
          includingInitContainers: true
          podRestartThreshold: 100
    RemovePodsViolatingInterPodAntiAffinity:
      enabled: false
    RemovePodsViolatingNodeAffinity:
      enabled: true
      params:
        nodeAffinityType:
        - preferredDuringSchedulingIgnoredDuringExecution
        nodeFit: true
    RemovePodsViolatingNodeTaints:
      enabled: false
    RemovePodsViolatingTopologySpreadConstraint:
      enabled: false
      params:
        includeSoftConstraints: false

What k8s version are you using (kubectl version)? v1.28.3

kubectl version Output
$ kubectl version
```
Client Version: v1.28.3
Kustomize Version: v5.0.4-0.20230601165947-6ce0bf390ce3
Server Version: v1.28.3
```

What did you do? Given a deployment with nodeAffinity

      affinity:
        nodeAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              preference:
                matchExpressions:
                  - key: kubernetes.azure.com/scalesetpriority
                    operator: In
                    values:
                      - spot

and not having enough resources on the tainted (spot) node pool, but enough on an untainted node pool, leads to the following behaviour:

What did you expect to see? The following test has been created to demonstrate the expected behaviour:

// TestRespectPodsViolatingNodeAffinity reproduces the reported bug: a pod with a
// preferred (soft) node affinity must NOT be evicted when the only node matching
// the preference does not have enough free resources to actually host the pod.
func TestRespectPodsViolatingNodeAffinity(t *testing.T) {
    nodeLabelKey := "kubernetes.io/desiredNode"
    nodeLabelValue := "yes"
    // Node that matches the preferred affinity, but deliberately tiny
    // (capacity 10/10/10) so the 1000-CPU / 50-memory pod cannot fit on it.
    nodeWithLabels := test.BuildTestNode("nodeWithLabels", 10, 10, 10, nil)
    nodeWithLabels.Labels["kubernetes.azure.com/scalesetpriority"] = "spot"

    // Two large nodes without the preferred label; the pod fits on either.
    nodeWithoutLabels2 := test.BuildTestNode("nodeWithoutLabels2", 2000, 3000, 10, nil)
    nodeWithoutLabels3 := test.BuildTestNode("nodeWithoutLabels3", 2000, 3000, 10, nil)

    // Labeled but cordoned node; not referenced by the single test case below.
    unschedulableNodeWithLabels := test.BuildTestNode("unschedulableNodeWithLabels", 2000, 3000, 10, nil)
    unschedulableNodeWithLabels.Labels[nodeLabelKey] = nodeLabelValue
    unschedulableNodeWithLabels.Spec.Unschedulable = true

    // addPodsToNode builds the pod under test, bound to the given node, with the
    // requested affinity type attached. NOTE(review): pod1 is constructed and
    // configured below but never returned — only podWithNodeAffinity takes part
    // in the test.
    addPodsToNode := func(node *v1.Node, deletionTimestamp *metav1.Time, affinityType string) []*v1.Pod {
        podWithNodeAffinity := test.BuildTestPod("podWithNodeAffinity", 1000, 50, node.Name, nil)
        podWithNodeAffinity.Spec.Affinity = &v1.Affinity{
            NodeAffinity: &v1.NodeAffinity{},
        }

        switch affinityType {
        case "requiredDuringSchedulingIgnoredDuringExecution":
            // Hard affinity: pod may only run on nodes labeled desiredNode=yes.
            podWithNodeAffinity.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution = &v1.NodeSelector{
                NodeSelectorTerms: []v1.NodeSelectorTerm{
                    {
                        MatchExpressions: []v1.NodeSelectorRequirement{
                            {
                                Key:      nodeLabelKey,
                                Operator: "In",
                                Values: []string{
                                    nodeLabelValue,
                                },
                            },
                        },
                    },
                },
            }
        case "preferredDuringSchedulingIgnoredDuringExecution":
            // Soft affinity: pod prefers (weight 10) spot nodes, matching the
            // label carried by nodeWithLabels only.
            podWithNodeAffinity.Spec.Affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution = []v1.PreferredSchedulingTerm{
                {
                    Weight: 10,
                    Preference: v1.NodeSelectorTerm{
                        MatchExpressions: []v1.NodeSelectorRequirement{
                            {
                                Key:      "kubernetes.azure.com/scalesetpriority",
                                Operator: "In",
                                Values: []string{
                                    "spot",
                                },
                            },
                        },
                    },
                },
            }
        case "requiredDuringSchedulingRequiredDuringExecution":
            // Intentionally no affinity attached for this type.
        default:
            t.Fatalf("Invalid affinity type %s", affinityType)
        }

        // pod1 is built here but dropped before return (see note above).
        pod1 := test.BuildTestPod("pod1", 100, 0, node.Name, nil)

        // Owner references make the pods look controller-managed, so the
        // default evictor does not reject them as bare pods.
        podWithNodeAffinity.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
        pod1.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()

        podWithNodeAffinity.DeletionTimestamp = deletionTimestamp
        pod1.DeletionTimestamp = deletionTimestamp

        return []*v1.Pod{
            podWithNodeAffinity,
        }
    }

    var uint1 uint = 10
    tests := []struct {
        description                    string
        nodes                          []*v1.Node
        pods                           []*v1.Pod
        expectedEvictedPodCount        uint
        maxPodsToEvictPerNode          *uint
        maxNoOfPodsToEvictPerNamespace *uint
        args                           RemovePodsViolatingNodeAffinityArgs
        nodefit                        bool
    }{
        {
            // The preferred node (nodeWithLabels) exists but lacks capacity;
            // evicting would just bounce the pod between the large unlabeled
            // nodes forever, so the expected eviction count is 0.
            description:             "Pod is scheduled on node without matching labels, and schedulable node where pod could fit is available but no having enough resources, should not evict [preferred affinity]",
            expectedEvictedPodCount: 0,
            args: RemovePodsViolatingNodeAffinityArgs{
                NodeAffinityType: []string{"preferredDuringSchedulingIgnoredDuringExecution"},
            },
            pods:                  addPodsToNode(nodeWithoutLabels2, nil, "preferredDuringSchedulingIgnoredDuringExecution"),
            nodes:                 []*v1.Node{nodeWithoutLabels2, nodeWithoutLabels3, nodeWithLabels},
            maxPodsToEvictPerNode: &uint1,
            nodefit:               true,
        },
    }

    for _, tc := range tests {
        t.Run(tc.description, func(t *testing.T) {
            ctx, cancel := context.WithCancel(context.Background())
            defer cancel()

            // Seed the fake clientset with all nodes and pods of the case.
            var objs []runtime.Object
            for _, node := range tc.nodes {
                objs = append(objs, node)
            }
            for _, pod := range tc.pods {
                objs = append(objs, pod)
            }
            fakeClient := fake.NewSimpleClientset(objs...)

            sharedInformerFactory := informers.NewSharedInformerFactory(fakeClient, 0)
            podInformer := sharedInformerFactory.Core().V1().Pods().Informer()

            getPodsAssignedToNode, err := podutil.BuildGetPodsAssignedToNodeFunc(podInformer)
            if err != nil {
                t.Errorf("Build get pods assigned to node function error: %v", err)
            }

            // Informers must be started and synced before the plugin queries them.
            sharedInformerFactory.Start(ctx.Done())
            sharedInformerFactory.WaitForCacheSync(ctx.Done())

            eventRecorder := &events.FakeRecorder{}

            podEvictor := evictions.NewPodEvictor(
                fakeClient,
                policyv1.SchemeGroupVersion.String(),
                false, // dry-run disabled
                tc.maxPodsToEvictPerNode,
                tc.maxNoOfPodsToEvictPerNamespace,
                tc.nodes,
                false,
                eventRecorder,
            )

            // NodeFit is the knob under test: with it enabled the evictor is
            // supposed to consider whether the pod fits elsewhere.
            defaultevictorArgs := &defaultevictor.DefaultEvictorArgs{
                EvictLocalStoragePods:   false,
                EvictSystemCriticalPods: false,
                IgnorePvcPods:           false,
                EvictFailedBarePods:     false,
                NodeFit:                 tc.nodefit,
            }

            evictorFilter, err := defaultevictor.New(
                defaultevictorArgs,
                &frameworkfake.HandleImpl{
                    ClientsetImpl:                 fakeClient,
                    GetPodsAssignedToNodeFuncImpl: getPodsAssignedToNode,
                    SharedInformerFactoryImpl:     sharedInformerFactory,
                },
            )
            if err != nil {
                t.Fatalf("Unable to initialize the plugin: %v", err)
            }

            handle := &frameworkfake.HandleImpl{
                ClientsetImpl:                 fakeClient,
                GetPodsAssignedToNodeFuncImpl: getPodsAssignedToNode,
                PodEvictorImpl:                podEvictor,
                SharedInformerFactoryImpl:     sharedInformerFactory,
                EvictorFilterImpl:             evictorFilter.(frameworktypes.EvictorPlugin),
            }

            plugin, err := New(
                &RemovePodsViolatingNodeAffinityArgs{
                    NodeAffinityType: tc.args.NodeAffinityType,
                },
                handle,
            )
            if err != nil {
                t.Fatalf("Unable to initialize the plugin: %v", err)
            }

            // Run one descheduling pass and compare the eviction count.
            plugin.(frameworktypes.DeschedulePlugin).Deschedule(ctx, tc.nodes)
            actualEvictedPodCount := podEvictor.TotalEvicted()
            if actualEvictedPodCount != tc.expectedEvictedPodCount {
                t.Errorf("Test %#v failed, expected %v pod evictions, but got %v pod evictions\n", tc.description, tc.expectedEvictedPodCount, actualEvictedPodCount)
            }
        })
    }
}

The pod should not be descheduled - see

=== RUN   TestRespectPodsViolatingNodeAffinity/Pod_is_scheduled_on_node_without_matching_labels,_and_schedulable_node_where_pod_could_fit_is_available_but_no_having_enough_resources,_should_not_evict_[preferred_affinity]
I0521 14:51:01.007609   31700 node.go:157] "Pod does not fit on any other node" pod:="default/podWithNodeAffinity" node:="nodeWithLabels" error:="[insufficient cpu, insufficient memory]"
I0521 14:51:01.009266   31700 node.go:154] "Pod fits on node" pod="default/podWithNodeAffinity" node="nodeWithoutLabels3"
I0521 14:51:01.009266   31700 defaultevictor.go:207] "pod does fit on other node" pod="default/podWithNodeAffinity"
I0521 14:51:01.009266   31700 node.go:186] "Pod fits on node" pod="default/podWithNodeAffinity" node="nodeWithoutLabels2"
I0521 14:51:01.009266   31700 node.go:186] "Pod fits on node" pod="default/podWithNodeAffinity" node="nodeWithoutLabels3"
I0521 14:51:01.009266   31700 node.go:189] "Pod does not fit on any node" pod:="default/podWithNodeAffinity" node:="nodeWithLabels" error:="[insufficient memory, insufficient cpu]"
I0521 14:51:01.009266   31700 node.go:170] "Pod fits on node" pod="default/podWithNodeAffinity" node="nodeWithoutLabels2"
I0521 14:51:01.009266   31700 predicates.go:301] "node has weight for pod" pod="default/podWithNodeAffinity" node="nodeWithoutLabels2" sum weight=0
I0521 14:51:01.009266   31700 node.go:325] "Pod has total weight on node " pod="default/podWithNodeAffinity" node="nodeWithoutLabels2" best weight=0
I0521 14:51:01.009266   31700 predicates.go:301] "node has weight for pod" pod="default/podWithNodeAffinity" node="nodeWithoutLabels3" sum weight=0
I0521 14:51:01.009266   31700 node.go:325] "Pod has total weight on node " pod="default/podWithNodeAffinity" node="nodeWithoutLabels3" best weight=0
I0521 14:51:01.009266   31700 node.go:340] "Pod has weight on node " node="default/podWithNodeAffinity" best weight=0
I0521 14:51:01.009266   31700 predicates.go:301] "node has weight for pod" pod="default/podWithNodeAffinity" node="nodeWithoutLabels2" sum weight=0
I0521 14:51:01.009266   31700 node.go:325] "Pod has total weight on node " pod="default/podWithNodeAffinity" node="nodeWithoutLabels2" best weight=0
I0521 14:51:01.009266   31700 node_affinity.go:108] "filtering on preferredDuringSchedulingIgnoredDuringExecution " node affinity=true evict filter =true fits other node=true best node weight=0 currentNodeWeight=0
    --- PASS: TestRespectPodsViolatingNodeAffinity/Pod_is_scheduled_on_node_without_matching_labels,_and_schedulable_node_where_pod_could_fit_is_available_but_no_having_enough_resources,_should_not_evict_[preferred_affinity] (0.11s)
PASS

What did you see instead? The pod got endlessly descheduled.

=== RUN   TestRemovePodsViolatingNodeAffinity/Pod_is_scheduled_on_node_without_matching_labels,_and_schedulable_node_where_pod_could_fit_is_available,_should_not_evict_[preferred_affinity]
I0521 13:35:32.301233   26416 node.go:157] "Pod does not fit on any other node" pod:="default/podWithNodeAffinity" node:="nodeWithLabels" error:="[insufficient cpu, insufficient memory]"
I0521 13:35:32.302299   26416 node.go:154] "Pod fits on node" pod="default/podWithNodeAffinity" node="nodeWithoutLabels3"
I0521 13:35:32.302299   26416 defaultevictor.go:207] "pod does fit on other node" pod="default/podWithNodeAffinity"
I0521 13:35:32.302299   26416 node.go:170] "Pod fits on node" pod="default/podWithNodeAffinity" node="nodeWithoutLabels2"
I0521 13:35:32.302299   26416 predicates.go:301] "node has weight for pod" pod="default/podWithNodeAffinity" node="nodeWithoutLabels2" sum weight=0
I0521 13:35:32.302299   26416 node.go:308] "Pod has total weight on node " pod="default/podWithNodeAffinity" node="nodeWithoutLabels2" best weight=0
I0521 13:35:32.302299   26416 predicates.go:301] "node has weight for pod" pod="default/podWithNodeAffinity" node="nodeWithoutLabels3" sum weight=0
I0521 13:35:32.302299   26416 node.go:308] "Pod has total weight on node " pod="default/podWithNodeAffinity" node="nodeWithoutLabels3" best weight=0
I0521 13:35:32.302299   26416 predicates.go:301] "node has weight for pod" pod="default/podWithNodeAffinity" node="nodeWithLabels" sum weight=10
I0521 13:35:32.302299   26416 node.go:308] "Pod has total weight on node " pod="default/podWithNodeAffinity" node="nodeWithLabels" best weight=10
I0521 13:35:32.302299   26416 node.go:323] "Pod has weight on node " node="default/podWithNodeAffinity" best weight=10
I0521 13:35:32.302299   26416 predicates.go:301] "node has weight for pod" pod="default/podWithNodeAffinity" node="nodeWithoutLabels2" sum weight=0
I0521 13:35:32.302299   26416 node.go:308] "Pod has total weight on node " pod="default/podWithNodeAffinity" node="nodeWithoutLabels2" best weight=0
I0521 13:35:32.302299   26416 node_affinity.go:107] "filtering on preferredDuringSchedulingIgnoredDuringExecution " node affinity=true evict filter =true fits other node=true best node weight=10 currentNodeWeight=0
    node_affinity_test.go:244: Test "Pod is scheduled on node without matching labels, and schedulable node where pod could fit is available, should not evict [preferred affinity]" failed, expected 0 pod evictions, but got 1 pod evictions
    --- FAIL: TestRemovePodsViolatingNodeAffinity/Pod_is_scheduled_on_node_without_matching_labels,_and_schedulable_node_where_pod_could_fit_is_available,_should_not_evict_[preferred_affinity] (0.11s)

FAIL

Analysis: I traced this behaviour to node_affinity.go line 105:

            filterFunc := func(pod *v1.Pod, node *v1.Node, nodes []*v1.Node) bool {
                return utils.PodHasNodeAffinity(pod, utils.PreferredDuringSchedulingIgnoredDuringExecution) &&
                    d.handle.Evictor().Filter(pod) &&
                    nodeutil.PodFitsAnyNode(d.handle.GetPodsAssignedToNodeFunc(), pod, nodes) &&
                                       // Here all nodes are taken into account -> also where not enough resources are available
                                      // if there is a tainted node with not enough resources it will deschedule the pod
                    (nodeutil.GetBestNodeWeightGivenPodPreferredAffinity(pod, nodes) > nodeutil.GetNodeWeightGivenPodPreferredAffinity(pod, node))
            }

As a working example for debugging purposes, I tested the following code (without claiming this is the best way to solve it):

            filterFunc := func(pod *v1.Pod, node *v1.Node, nodes []*v1.Node) bool {
                                fittingNodes := nodeutil.PodFittingNodes(d.handle.GetPodsAssignedToNodeFunc(), pod, nodes)
                return utils.PodHasNodeAffinity(pod, utils.PreferredDuringSchedulingIgnoredDuringExecution) &&
                    d.handle.Evictor().Filter(pod) &&
                    nodeutil.PodFitsAnyNode(d.handle.GetPodsAssignedToNodeFunc(), pod, nodes) &&
                    (nodeutil.GetBestNodeWeightGivenPodPreferredAffinity(pod, fittingNodes) > nodeutil.GetNodeWeightGivenPodPreferredAffinity(pod, node))
            }

// PodFittingNodes returns the subset of nodes on which the given pod fits,
// i.e. those for which NodeFit reports no errors. It is intended as the node
// set to feed into GetBestNodeWeightGivenPodPreferredAffinity so that nodes
// the pod cannot actually land on do not influence the best-weight result.
func PodFittingNodes(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, nodes []*v1.Node) []*v1.Node {
	var fittingNodes []*v1.Node
	for _, node := range nodes {
		// NodeFit returns the reasons the pod cannot run on this node;
		// an empty list means the pod fits.
		errors := NodeFit(nodeIndexer, pod, node)
		if len(errors) == 0 {
			klog.InfoS("Pod fits on node", "pod", klog.KObj(pod), "node", klog.KObj(node))
			fittingNodes = append(fittingNodes, node)
		} else {
			// Fixed log message: this branch is evaluated per node, so say
			// "does not fit on node" (not "on any node"), and drop the stray
			// colons from the structured-log keys to match the success branch.
			klog.InfoS("Pod does not fit on node",
				"pod", klog.KObj(pod), "node", klog.KObj(node), "err", utilerrors.NewAggregate(errors).Error())
		}
	}

	return fittingNodes
}