volcano-sh / devices

Device plugins for Volcano, e.g. GPU
Apache License 2.0
97 stars 41 forks source link

Improve the logic of finding the candidate pod in the Allocate RPC #25

Open shinytang6 opened 2 years ago

shinytang6 commented 2 years ago

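Currently, in the device plugin's Allocate RPC, we need to find the candidate pod according to the container in the request. The code below does this by taking only the first container request's device count and comparing it against the GPU resource of each GPU container in the pending pods. If there are multiple GPU containers in one pod, this matching is ambiguous, so the logic for finding the candidate pod is unreliable.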

func (m *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) {
    var reqCount uint
    for _, req := range reqs.ContainerRequests {
        reqCount += uint(len(req.DevicesIDs))
    }

    responses := pluginapi.AllocateResponse{}

    firstContainerReq := reqs.ContainerRequests[0]
    firstContainerReqDeviceCount := uint(len(firstContainerReq.DevicesIDs))

    availablePods := podSlice{}
    pendingPods, err := m.kubeInteractor.GetPendingPodsOnNode()
    if err != nil {
        return nil, err
    }
    for _, pod := range pendingPods {
        current := pod
        if IsGPURequiredPod(&current) && !IsGPUAssignedPod(&current) && !IsShouldDeletePod(&current) {
            availablePods = append(availablePods, &current)
        }
    }

    sort.Sort(availablePods)

    var candidatePod *v1.Pod
    for _, pod := range availablePods {
        for i, c := range pod.Spec.Containers {
            if !IsGPURequiredContainer(&c) {
                continue
            }

            if GetGPUResourceOfContainer(&pod.Spec.Containers[i]) == firstContainerReqDeviceCount {
                klog.Infof("Got candidate Pod %s(%s), the device count is: %d", pod.UID, c.Name, firstContainerReqDeviceCount)
                candidatePod = pod
                goto Allocate
            }
        }
    }

        ....
Thor-wl commented 2 years ago

/cc @wpeng102 @william-wang