kubernetes-sigs / descheduler

Descheduler for Kubernetes
https://sigs.k8s.io/descheduler
Apache License 2.0
4.23k stars 645 forks source link

Endless descheduling of pods with node affinity preferredDuringSchedulingIgnoredDuringExecution and enough resources available on not tainted node but not on a tainted node #1410

Open dbrcelum opened 1 month ago

dbrcelum commented 1 month ago

What version of descheduler are you using?

descheduler version: 0.29.0/0.30.0

Does this issue reproduce with the latest release? yes

Which descheduler CLI options are you using?

logging-format: json
v: 4

Please provide a copy of your descheduler policy config file

deschedulerPolicy:
  strategies:
    LowNodeUtilization:
      enabled: false
      params:
        nodeResourceUtilizationThresholds:
          targetThresholds:
            cpu: 50
            memory: 50
            pods: 50
          thresholds:
            cpu: 20
            memory: 20
            pods: 20
    RemoveDuplicates:
      enabled: false
    RemovePodsHavingTooManyRestarts:
      enabled: false
      params:
        podsHavingTooManyRestarts:
          includingInitContainers: true
          podRestartThreshold: 100
    RemovePodsViolatingInterPodAntiAffinity:
      enabled: false
    RemovePodsViolatingNodeAffinity:
      enabled: true
      params:
        nodeAffinityType:
        - preferredDuringSchedulingIgnoredDuringExecution
        nodeFit: true
    RemovePodsViolatingNodeTaints:
      enabled: false
    RemovePodsViolatingTopologySpreadConstraint:
      enabled: false
      params:
        includeSoftConstraints: false

What k8s version are you using (kubectl version)? v1.28.3

kubectl version Output
$ kubectl version
```
Client Version: v1.28.3
Kustomize Version: v5.0.4-0.20230601165947-6ce0bf390ce3
Server Version: v1.28.3
```

What did you do? Given a deployment with nodeAffinity

      affinity:
        nodeAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              preference:
                matchExpressions:
                  - key: kubernetes.azure.com/scalesetpriority
                    operator: In
                    values:
                      - spot

and not having enough resources on the tainted (spot) node pool, but enough on an untainted node pool, leads to the following behaviour:

What did you expect to see? The following test has been created to demonstrate the expected behaviour:

// TestRespectPodsViolatingNodeAffinity reproduces the reported bug: a pod with a
// preferred (soft) node affinity must NOT be evicted when the only node matching
// the preference does not have enough free resources to actually host the pod.
func TestRespectPodsViolatingNodeAffinity(t *testing.T) {
    nodeLabelKey := "kubernetes.io/desiredNode"
    nodeLabelValue := "yes"
    // Node that matches the preferred affinity, but deliberately tiny
    // (capacity 10/10/10) so the 1000-CPU / 50-memory pod cannot fit on it.
    nodeWithLabels := test.BuildTestNode("nodeWithLabels", 10, 10, 10, nil)
    nodeWithLabels.Labels["kubernetes.azure.com/scalesetpriority"] = "spot"

    // Two large nodes without the preferred label; the pod fits on either.
    nodeWithoutLabels2 := test.BuildTestNode("nodeWithoutLabels2", 2000, 3000, 10, nil)
    nodeWithoutLabels3 := test.BuildTestNode("nodeWithoutLabels3", 2000, 3000, 10, nil)

    // Labeled but cordoned node; not referenced by the single test case below.
    unschedulableNodeWithLabels := test.BuildTestNode("unschedulableNodeWithLabels", 2000, 3000, 10, nil)
    unschedulableNodeWithLabels.Labels[nodeLabelKey] = nodeLabelValue
    unschedulableNodeWithLabels.Spec.Unschedulable = true

    // addPodsToNode builds the pod under test, bound to the given node, with the
    // requested affinity type attached. NOTE(review): pod1 is constructed and
    // configured below but never returned — only podWithNodeAffinity takes part
    // in the test.
    addPodsToNode := func(node *v1.Node, deletionTimestamp *metav1.Time, affinityType string) []*v1.Pod {
        podWithNodeAffinity := test.BuildTestPod("podWithNodeAffinity", 1000, 50, node.Name, nil)
        podWithNodeAffinity.Spec.Affinity = &v1.Affinity{
            NodeAffinity: &v1.NodeAffinity{},
        }

        switch affinityType {
        case "requiredDuringSchedulingIgnoredDuringExecution":
            // Hard affinity: pod may only run on nodes labeled desiredNode=yes.
            podWithNodeAffinity.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution = &v1.NodeSelector{
                NodeSelectorTerms: []v1.NodeSelectorTerm{
                    {
                        MatchExpressions: []v1.NodeSelectorRequirement{
                            {
                                Key:      nodeLabelKey,
                                Operator: "In",
                                Values: []string{
                                    nodeLabelValue,
                                },
                            },
                        },
                    },
                },
            }
        case "preferredDuringSchedulingIgnoredDuringExecution":
            // Soft affinity: pod prefers (weight 10) spot nodes, matching the
            // label carried by nodeWithLabels only.
            podWithNodeAffinity.Spec.Affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution = []v1.PreferredSchedulingTerm{
                {
                    Weight: 10,
                    Preference: v1.NodeSelectorTerm{
                        MatchExpressions: []v1.NodeSelectorRequirement{
                            {
                                Key:      "kubernetes.azure.com/scalesetpriority",
                                Operator: "In",
                                Values: []string{
                                    "spot",
                                },
                            },
                        },
                    },
                },
            }
        case "requiredDuringSchedulingRequiredDuringExecution":
            // Intentionally no affinity attached for this type.
        default:
            t.Fatalf("Invalid affinity type %s", affinityType)
        }

        // pod1 is built here but dropped before return (see note above).
        pod1 := test.BuildTestPod("pod1", 100, 0, node.Name, nil)

        // Owner references make the pods look controller-managed, so the
        // default evictor does not reject them as bare pods.
        podWithNodeAffinity.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
        pod1.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()

        podWithNodeAffinity.DeletionTimestamp = deletionTimestamp
        pod1.DeletionTimestamp = deletionTimestamp

        return []*v1.Pod{
            podWithNodeAffinity,
        }
    }

    var uint1 uint = 10
    tests := []struct {
        description                    string
        nodes                          []*v1.Node
        pods                           []*v1.Pod
        expectedEvictedPodCount        uint
        maxPodsToEvictPerNode          *uint
        maxNoOfPodsToEvictPerNamespace *uint
        args                           RemovePodsViolatingNodeAffinityArgs
        nodefit                        bool
    }{
        {
            // The preferred node (nodeWithLabels) exists but lacks capacity;
            // evicting would just bounce the pod between the large unlabeled
            // nodes forever, so the expected eviction count is 0.
            description:             "Pod is scheduled on node without matching labels, and schedulable node where pod could fit is available but no having enough resources, should not evict [preferred affinity]",
            expectedEvictedPodCount: 0,
            args: RemovePodsViolatingNodeAffinityArgs{
                NodeAffinityType: []string{"preferredDuringSchedulingIgnoredDuringExecution"},
            },
            pods:                  addPodsToNode(nodeWithoutLabels2, nil, "preferredDuringSchedulingIgnoredDuringExecution"),
            nodes:                 []*v1.Node{nodeWithoutLabels2, nodeWithoutLabels3, nodeWithLabels},
            maxPodsToEvictPerNode: &uint1,
            nodefit:               true,
        },
    }

    for _, tc := range tests {
        t.Run(tc.description, func(t *testing.T) {
            ctx, cancel := context.WithCancel(context.Background())
            defer cancel()

            // Seed the fake clientset with all nodes and pods of the case.
            var objs []runtime.Object
            for _, node := range tc.nodes {
                objs = append(objs, node)
            }
            for _, pod := range tc.pods {
                objs = append(objs, pod)
            }
            fakeClient := fake.NewSimpleClientset(objs...)

            sharedInformerFactory := informers.NewSharedInformerFactory(fakeClient, 0)
            podInformer := sharedInformerFactory.Core().V1().Pods().Informer()

            getPodsAssignedToNode, err := podutil.BuildGetPodsAssignedToNodeFunc(podInformer)
            if err != nil {
                t.Errorf("Build get pods assigned to node function error: %v", err)
            }

            // Informers must be started and synced before the plugin queries them.
            sharedInformerFactory.Start(ctx.Done())
            sharedInformerFactory.WaitForCacheSync(ctx.Done())

            eventRecorder := &events.FakeRecorder{}

            podEvictor := evictions.NewPodEvictor(
                fakeClient,
                policyv1.SchemeGroupVersion.String(),
                false, // dry-run disabled
                tc.maxPodsToEvictPerNode,
                tc.maxNoOfPodsToEvictPerNamespace,
                tc.nodes,
                false,
                eventRecorder,
            )

            // NodeFit is the knob under test: with it enabled the evictor is
            // supposed to consider whether the pod fits elsewhere.
            defaultevictorArgs := &defaultevictor.DefaultEvictorArgs{
                EvictLocalStoragePods:   false,
                EvictSystemCriticalPods: false,
                IgnorePvcPods:           false,
                EvictFailedBarePods:     false,
                NodeFit:                 tc.nodefit,
            }

            evictorFilter, err := defaultevictor.New(
                defaultevictorArgs,
                &frameworkfake.HandleImpl{
                    ClientsetImpl:                 fakeClient,
                    GetPodsAssignedToNodeFuncImpl: getPodsAssignedToNode,
                    SharedInformerFactoryImpl:     sharedInformerFactory,
                },
            )
            if err != nil {
                t.Fatalf("Unable to initialize the plugin: %v", err)
            }

            handle := &frameworkfake.HandleImpl{
                ClientsetImpl:                 fakeClient,
                GetPodsAssignedToNodeFuncImpl: getPodsAssignedToNode,
                PodEvictorImpl:                podEvictor,
                SharedInformerFactoryImpl:     sharedInformerFactory,
                EvictorFilterImpl:             evictorFilter.(frameworktypes.EvictorPlugin),
            }

            plugin, err := New(
                &RemovePodsViolatingNodeAffinityArgs{
                    NodeAffinityType: tc.args.NodeAffinityType,
                },
                handle,
            )
            if err != nil {
                t.Fatalf("Unable to initialize the plugin: %v", err)
            }

            // Run one descheduling pass and compare the eviction count.
            plugin.(frameworktypes.DeschedulePlugin).Deschedule(ctx, tc.nodes)
            actualEvictedPodCount := podEvictor.TotalEvicted()
            if actualEvictedPodCount != tc.expectedEvictedPodCount {
                t.Errorf("Test %#v failed, expected %v pod evictions, but got %v pod evictions\n", tc.description, tc.expectedEvictedPodCount, actualEvictedPodCount)
            }
        })
    }
}

The pod should not be descheduled - see

=== RUN   TestRespectPodsViolatingNodeAffinity/Pod_is_scheduled_on_node_without_matching_labels,_and_schedulable_node_where_pod_could_fit_is_available_but_no_having_enough_resources,_should_not_evict_[preferred_affinity]
I0521 14:51:01.007609   31700 node.go:157] "Pod does not fit on any other node" pod:="default/podWithNodeAffinity" node:="nodeWithLabels" error:="[insufficient cpu, insufficient memory]"
I0521 14:51:01.009266   31700 node.go:154] "Pod fits on node" pod="default/podWithNodeAffinity" node="nodeWithoutLabels3"
I0521 14:51:01.009266   31700 defaultevictor.go:207] "pod does fit on other node" pod="default/podWithNodeAffinity"
I0521 14:51:01.009266   31700 node.go:186] "Pod fits on node" pod="default/podWithNodeAffinity" node="nodeWithoutLabels2"
I0521 14:51:01.009266   31700 node.go:186] "Pod fits on node" pod="default/podWithNodeAffinity" node="nodeWithoutLabels3"
I0521 14:51:01.009266   31700 node.go:189] "Pod does not fit on any node" pod:="default/podWithNodeAffinity" node:="nodeWithLabels" error:="[insufficient memory, insufficient cpu]"
I0521 14:51:01.009266   31700 node.go:170] "Pod fits on node" pod="default/podWithNodeAffinity" node="nodeWithoutLabels2"
I0521 14:51:01.009266   31700 predicates.go:301] "node has weight for pod" pod="default/podWithNodeAffinity" node="nodeWithoutLabels2" sum weight=0
I0521 14:51:01.009266   31700 node.go:325] "Pod has total weight on node " pod="default/podWithNodeAffinity" node="nodeWithoutLabels2" best weight=0
I0521 14:51:01.009266   31700 predicates.go:301] "node has weight for pod" pod="default/podWithNodeAffinity" node="nodeWithoutLabels3" sum weight=0
I0521 14:51:01.009266   31700 node.go:325] "Pod has total weight on node " pod="default/podWithNodeAffinity" node="nodeWithoutLabels3" best weight=0
I0521 14:51:01.009266   31700 node.go:340] "Pod has weight on node " node="default/podWithNodeAffinity" best weight=0
I0521 14:51:01.009266   31700 predicates.go:301] "node has weight for pod" pod="default/podWithNodeAffinity" node="nodeWithoutLabels2" sum weight=0
I0521 14:51:01.009266   31700 node.go:325] "Pod has total weight on node " pod="default/podWithNodeAffinity" node="nodeWithoutLabels2" best weight=0
I0521 14:51:01.009266   31700 node_affinity.go:108] "filtering on preferredDuringSchedulingIgnoredDuringExecution " node affinity=true evict filter =true fits other node=true best node weight=0 currentNodeWeight=0
    --- PASS: TestRespectPodsViolatingNodeAffinity/Pod_is_scheduled_on_node_without_matching_labels,_and_schedulable_node_where_pod_could_fit_is_available_but_no_having_enough_resources,_should_not_evict_[preferred_affinity] (0.11s)
PASS

What did you see instead? The pod got endlessly descheduled.

=== RUN   TestRemovePodsViolatingNodeAffinity/Pod_is_scheduled_on_node_without_matching_labels,_and_schedulable_node_where_pod_could_fit_is_available,_should_not_evict_[preferred_affinity]
I0521 13:35:32.301233   26416 node.go:157] "Pod does not fit on any other node" pod:="default/podWithNodeAffinity" node:="nodeWithLabels" error:="[insufficient cpu, insufficient memory]"
I0521 13:35:32.302299   26416 node.go:154] "Pod fits on node" pod="default/podWithNodeAffinity" node="nodeWithoutLabels3"
I0521 13:35:32.302299   26416 defaultevictor.go:207] "pod does fit on other node" pod="default/podWithNodeAffinity"
I0521 13:35:32.302299   26416 node.go:170] "Pod fits on node" pod="default/podWithNodeAffinity" node="nodeWithoutLabels2"
I0521 13:35:32.302299   26416 predicates.go:301] "node has weight for pod" pod="default/podWithNodeAffinity" node="nodeWithoutLabels2" sum weight=0
I0521 13:35:32.302299   26416 node.go:308] "Pod has total weight on node " pod="default/podWithNodeAffinity" node="nodeWithoutLabels2" best weight=0
I0521 13:35:32.302299   26416 predicates.go:301] "node has weight for pod" pod="default/podWithNodeAffinity" node="nodeWithoutLabels3" sum weight=0
I0521 13:35:32.302299   26416 node.go:308] "Pod has total weight on node " pod="default/podWithNodeAffinity" node="nodeWithoutLabels3" best weight=0
I0521 13:35:32.302299   26416 predicates.go:301] "node has weight for pod" pod="default/podWithNodeAffinity" node="nodeWithLabels" sum weight=10
I0521 13:35:32.302299   26416 node.go:308] "Pod has total weight on node " pod="default/podWithNodeAffinity" node="nodeWithLabels" best weight=10
I0521 13:35:32.302299   26416 node.go:323] "Pod has weight on node " node="default/podWithNodeAffinity" best weight=10
I0521 13:35:32.302299   26416 predicates.go:301] "node has weight for pod" pod="default/podWithNodeAffinity" node="nodeWithoutLabels2" sum weight=0
I0521 13:35:32.302299   26416 node.go:308] "Pod has total weight on node " pod="default/podWithNodeAffinity" node="nodeWithoutLabels2" best weight=0
I0521 13:35:32.302299   26416 node_affinity.go:107] "filtering on preferredDuringSchedulingIgnoredDuringExecution " node affinity=true evict filter =true fits other node=true best node weight=10 currentNodeWeight=0
    node_affinity_test.go:244: Test "Pod is scheduled on node without matching labels, and schedulable node where pod could fit is available, should not evict [preferred affinity]" failed, expected 0 pod evictions, but got 1 pod evictions
    --- FAIL: TestRemovePodsViolatingNodeAffinity/Pod_is_scheduled_on_node_without_matching_labels,_and_schedulable_node_where_pod_could_fit_is_available,_should_not_evict_[preferred_affinity] (0.11s)

FAIL

Analysis: I traced this behaviour to node_affinity.go line 105:

            filterFunc := func(pod *v1.Pod, node *v1.Node, nodes []*v1.Node) bool {
                return utils.PodHasNodeAffinity(pod, utils.PreferredDuringSchedulingIgnoredDuringExecution) &&
                    d.handle.Evictor().Filter(pod) &&
                    nodeutil.PodFitsAnyNode(d.handle.GetPodsAssignedToNodeFunc(), pod, nodes) &&
                                       // Here all nodes are taken into account -> also where not enough resources are available
                                      // if there is a tainted node with not enough resources it will deschedule the pod
                    (nodeutil.GetBestNodeWeightGivenPodPreferredAffinity(pod, nodes) > nodeutil.GetNodeWeightGivenPodPreferredAffinity(pod, node))
            }

As a working example for debugging purposes, I tested the following code (without claiming this is the best way to solve it):

            filterFunc := func(pod *v1.Pod, node *v1.Node, nodes []*v1.Node) bool {
                                fittingNodes := nodeutil.PodFittingNodes(d.handle.GetPodsAssignedToNodeFunc(), pod, nodes)
                return utils.PodHasNodeAffinity(pod, utils.PreferredDuringSchedulingIgnoredDuringExecution) &&
                    d.handle.Evictor().Filter(pod) &&
                    nodeutil.PodFitsAnyNode(d.handle.GetPodsAssignedToNodeFunc(), pod, nodes) &&
                    (nodeutil.GetBestNodeWeightGivenPodPreferredAffinity(pod, fittingNodes) > nodeutil.GetNodeWeightGivenPodPreferredAffinity(pod, node))
            }

// PodFittingNodes returns the subset of nodes on which the given pod fits,
// i.e. those for which NodeFit reports no errors. It is intended as the node
// set to feed into GetBestNodeWeightGivenPodPreferredAffinity so that nodes
// the pod cannot actually land on do not influence the best-weight result.
func PodFittingNodes(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, nodes []*v1.Node) []*v1.Node {
	var fittingNodes []*v1.Node
	for _, node := range nodes {
		// NodeFit returns the reasons the pod cannot run on this node;
		// an empty list means the pod fits.
		errors := NodeFit(nodeIndexer, pod, node)
		if len(errors) == 0 {
			klog.InfoS("Pod fits on node", "pod", klog.KObj(pod), "node", klog.KObj(node))
			fittingNodes = append(fittingNodes, node)
		} else {
			// Fixed log message: this branch is evaluated per node, so say
			// "does not fit on node" (not "on any node"), and drop the stray
			// colons from the structured-log keys to match the success branch.
			klog.InfoS("Pod does not fit on node",
				"pod", klog.KObj(pod), "node", klog.KObj(node), "err", utilerrors.NewAggregate(errors).Error())
		}
	}

	return fittingNodes
}