k8snetworkplumbingwg / sriov-network-operator

Operator for provisioning and configuring SR-IOV CNI plugin and device plugin
Apache License 2.0
85 stars 114 forks source link

CDI: two sriovnodepolicy configs(8 vfs) but only four deviceNode #735

Closed cyclinder closed 4 months ago

cyclinder commented 4 months ago

I have two sriovnodepolicy configs, see below:

root@controller-node-1:/home/cyclinder/sriov# kubectl get sriovnetworknodepolicies.sriovnetwork.openshift.io -A -o wide
NAMESPACE     NAME      AGE
kube-system   policy1   43m
kube-system   policy2   7m44s
root@controller-node-1:/home/cyclinder/sriov# kubectl get sriovnetworknodestates.sriovnetwork.openshift.io -n kube-system -o yaml
apiVersion: v1
items:
- apiVersion: sriovnetwork.openshift.io/v1
  kind: SriovNetworkNodeState
  metadata:
    annotations:
      sriovnetwork.openshift.io/current-state: Idle
      sriovnetwork.openshift.io/desired-state: Idle
    creationTimestamp: "2024-07-16T03:50:05Z"
    generation: 5
    name: worker-node-1
    namespace: kube-system
    ownerReferences:
    - apiVersion: sriovnetwork.openshift.io/v1
      blockOwnerDeletion: true
      controller: true
      kind: SriovOperatorConfig
      name: default
      uid: 2427ae73-ef95-4f57-aa85-c681ff9a48bb
    resourceVersion: "40147316"
    uid: 67f59ed6-85a3-4913-a1bf-3697dd008310
  spec:
    interfaces:
    - name: enp4s0f0np0
      numVfs: 4
      pciAddress: "0000:04:00.0"
      vfGroups:
      - isRdma: true
        policyName: policy1
        resourceName: rdma_resource
        vfRange: 0-3
    - name: enp4s0f1np1
      numVfs: 4
      pciAddress: "0000:04:00.1"
      vfGroups:
      - isRdma: true
        policyName: policy2
        resourceName: rdma_resource1
        vfRange: 0-3
  status:
    interfaces:
    - Vfs:
      - deviceID: "1018"
        driver: mlx5_core
        mac: e6:bc:60:22:14:6c
        mtu: 1500
        name: enp4s0f0v0
        pciAddress: "0000:04:00.2"
        vendor: 15b3
        vfID: 0
      - deviceID: "1018"
        driver: mlx5_core
        mac: a2:d7:89:ad:5d:b7
        mtu: 1500
        name: enp4s0f0v1
        pciAddress: "0000:04:00.3"
        vendor: 15b3
        vfID: 1
      - deviceID: "1018"
        driver: mlx5_core
        mac: d2:0b:3f:c9:ab:a4
        mtu: 1500
        name: enp4s0f0v2
        pciAddress: "0000:04:00.4"
        vendor: 15b3
        vfID: 2
      - deviceID: "1018"
        driver: mlx5_core
        mac: 4e:37:ab:b2:68:d7
        mtu: 1500
        name: enp4s0f0v3
        pciAddress: "0000:04:00.5"
        vendor: 15b3
        vfID: 3
      deviceID: "1017"
      driver: mlx5_core
      eSwitchMode: legacy
      linkSpeed: 25000 Mb/s
      linkType: ETH
      mac: 04:3f:72:d0:d2:b2
      mtu: 1500
      name: enp4s0f0np0
      numVfs: 4
      pciAddress: "0000:04:00.0"
      totalvfs: 4
      vendor: 15b3
    - Vfs:
      - deviceID: "1018"
        driver: mlx5_core
        mac: 3e:3a:7f:af:11:99
        mtu: 1500
        name: enp4s0f1v0
        pciAddress: "0000:04:00.6"
        vendor: 15b3
        vfID: 0
      - deviceID: "1018"
        driver: mlx5_core
        mac: 6e:c1:0e:52:ea:d8
        mtu: 1500
        name: enp4s0f1v1
        pciAddress: "0000:04:00.7"
        vendor: 15b3
        vfID: 1
      - deviceID: "1018"
        driver: mlx5_core
        mac: 8e:c8:1d:fc:69:0d
        mtu: 1500
        name: enp4s0f1v2
        pciAddress: "0000:04:01.0"
        vendor: 15b3
        vfID: 2
      - deviceID: "1018"
        driver: mlx5_core
        mac: 52:4c:5c:b1:1d:44
        mtu: 1500
        name: enp4s0f1v3
        pciAddress: "0000:04:01.1"
        vendor: 15b3
        vfID: 3
      deviceID: "1017"
      driver: mlx5_core
      eSwitchMode: legacy
      linkSpeed: 10000 Mb/s
      linkType: ETH
      mac: 04:3f:72:d0:d2:b3
      mtu: 1500
      name: enp4s0f1np1
      numVfs: 4
      pciAddress: "0000:04:00.1"
      totalvfs: 4
      vendor: 15b3
    syncStatus: Succeeded
kind: List
metadata:
  resourceVersion: ""
root@worker-node-1:~# cat /var/run/cdi/sriov-dp-spidernet.io.yaml
cdiVersion: 0.5.0
containerEdits: {}
devices:
- containerEdits:
    deviceNodes:
    - hostPath: /dev/infiniband/issm6
      path: /dev/infiniband/issm6
      permissions: rw
    - hostPath: /dev/infiniband/umad6
      path: /dev/infiniband/umad6
      permissions: rw
    - hostPath: /dev/infiniband/uverbs6
      path: /dev/infiniband/uverbs6
      permissions: rw
    - hostPath: /dev/infiniband/rdma_cm
      path: /dev/infiniband/rdma_cm
      permissions: rw
  name: "0000:04:00.6"
- containerEdits:
    deviceNodes:
    - hostPath: /dev/infiniband/issm7
      path: /dev/infiniband/issm7
      permissions: rw
    - hostPath: /dev/infiniband/umad7
      path: /dev/infiniband/umad7
      permissions: rw
    - hostPath: /dev/infiniband/uverbs7
      path: /dev/infiniband/uverbs7
      permissions: rw
    - hostPath: /dev/infiniband/rdma_cm
      path: /dev/infiniband/rdma_cm
      permissions: rw
  name: "0000:04:00.7"
- containerEdits:
    deviceNodes:
    - hostPath: /dev/infiniband/issm8
      path: /dev/infiniband/issm8
      permissions: rw
    - hostPath: /dev/infiniband/umad8
      path: /dev/infiniband/umad8
      permissions: rw
    - hostPath: /dev/infiniband/uverbs8
      path: /dev/infiniband/uverbs8
      permissions: rw
    - hostPath: /dev/infiniband/rdma_cm
      path: /dev/infiniband/rdma_cm
      permissions: rw
  name: "0000:04:01.0"
- containerEdits:
    deviceNodes:
    - hostPath: /dev/infiniband/issm9
      path: /dev/infiniband/issm9
      permissions: rw
    - hostPath: /dev/infiniband/umad9
      path: /dev/infiniband/umad9
      permissions: rw
    - hostPath: /dev/infiniband/uverbs9
      path: /dev/infiniband/uverbs9
      permissions: rw
    - hostPath: /dev/infiniband/rdma_cm
      path: /dev/infiniband/rdma_cm
      permissions: rw
  name: "0000:04:01.1"
kind: spidernet.io/net-pci
zeeke commented 4 months ago

Hi @cyclinder, I don't see the problem here. Both policies have been applied to the node state:

interfaces:
    - name: enp4s0f0np0
      numVfs: 4
      pciAddress: "0000:04:00.0"
      vfGroups:
      - isRdma: true
        policyName: policy1
        resourceName: rdma_resource
        vfRange: 0-3
    - name: enp4s0f1np1
      numVfs: 4
      pciAddress: "0000:04:00.1"
      vfGroups:
      - isRdma: true
        policyName: policy2
        resourceName: rdma_resource1
        vfRange: 0-3
cyclinder commented 4 months ago

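We have 8 VFs, so the CDI file should also contain 8 devices (one entry, with its deviceNodes, per VF), right? It only contains the 4 VFs of enp4s0f1np1 (0000:04:00.6 through 0000:04:01.1); the 4 VFs of enp4s0f0np0 are missing.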

adrianchiris commented 4 months ago

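@cyclinder, are there any other files in /var/run/cdi or under /etc/cdi?

Also, can you provide the following? (The requested list is missing from this copy of the thread; judging by the reply below, it was the device-plugin-config ConfigMap, the sriov-device-plugin DaemonSet, and the default SriovOperatorConfig.)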

cyclinder commented 4 months ago

> are there any other files in /var/run/cdi or under /etc/cdi?

No, there is only the sriov-dp-spidernet.io.yaml file under /var/run/cdi.

root@controller-node-1:/home/cyclinder/sriov# kubectl get configmaps -n kube-system device-plugin-config -o json | jq '.data'
{
  "worker-node-1": "{\"resourceList\":[{\"resourceName\":\"rdma_resource\",\"selectors\":{\"pfNames\":[\"enp4s0f0np0\"],\"IsRdma\":true,\"NeedVhostNet\":false},\"SelectorObj\":null},{\"resourceName\":\"rdma_resource1\",\"selectors\":{\"pfNames\":[\"enp4s0f1np1\"],\"IsRdma\":true,\"NeedVhostNet\":false},\"SelectorObj\":null}]}"
}

root@controller-node-1:/home/cyclinder/sriov# kubectl get ds -n kube-system sriov-device-plugin -o yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
  annotations:
    deprecated.daemonset.template.generation: "3"
    kubernetes.io/description: |
      This daemon set launches the SR-IOV network device plugin on each node.
    release.openshift.io/version: ""
  creationTimestamp: "2024-07-16T05:49:50Z"
  generation: 3
  name: sriov-device-plugin
  namespace: kube-system
  ownerReferences:
  - apiVersion: sriovnetwork.openshift.io/v1
    blockOwnerDeletion: true
    controller: true
    kind: SriovOperatorConfig
    name: default
    uid: 2427ae73-ef95-4f57-aa85-c681ff9a48bb
  resourceVersion: "40147369"
  uid: b6c01d09-98ed-4c9e-87bf-2d9f4cfa2096
spec:
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      app: sriov-device-plugin
  template:
    metadata:
      creationTimestamp: null
      labels:
        app: sriov-device-plugin
        component: network
        openshift.io/component: network
        type: infra
    spec:
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: node-role.kubernetes.io/worker
                operator: In
                values:
                - ""
            - matchExpressions:
              - key: node-role.kubernetes.io/worker
                operator: In
                values:
                - ""
      containers:
      - args:
        - --log-level=10
        - --resource-prefix=spidernet.io
        - --config-file=/etc/pcidp/$(NODE_NAME)
        - --use-cdi
        env:
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: spec.nodeName
        image: ghcr.m.daocloud.io/k8snetworkplumbingwg/sriov-network-device-plugin
        imagePullPolicy: Always
        name: sriov-device-plugin
        resources:
          requests:
            cpu: 10m
            memory: 50Mi
        securityContext:
          privileged: true
        terminationMessagePath: /dev/termination-log
        terminationMessagePolicy: File
        volumeMounts:
        - mountPath: /var/lib/kubelet/device-plugins
          name: devicesock
        - mountPath: /var/lib/kubelet/plugins_registry
          name: plugins-registry
        - mountPath: /etc/pcidp/
          name: config-volume
          readOnly: true
        - mountPath: /var/run/k8s.cni.cncf.io/devinfo/dp
          name: device-info
        - mountPath: /var/run/cdi
          name: dynamic-cdi
      dnsPolicy: ClusterFirst
      hostNetwork: true
      nodeSelector:
        kubernetes.io/os: linux
        node-role.kubernetes.io/worker: ""
      priorityClassName: system-node-critical
      restartPolicy: Always
      schedulerName: default-scheduler
      securityContext: {}
      serviceAccount: sriov-device-plugin
      serviceAccountName: sriov-device-plugin
      terminationGracePeriodSeconds: 30
      tolerations:
      - operator: Exists
      volumes:
      - hostPath:
          path: /var/lib/kubelet/device-plugins
          type: ""
        name: devicesock
      - hostPath:
          path: /var/lib/kubelet/plugins_registry
          type: ""
        name: plugins-registry
      - configMap:
          defaultMode: 420
          name: device-plugin-config
        name: config-volume
      - hostPath:
          path: /var/run/k8s.cni.cncf.io/devinfo/dp
          type: DirectoryOrCreate
        name: device-info
      - hostPath:
          path: /var/run/cdi
          type: DirectoryOrCreate
        name: dynamic-cdi
  updateStrategy:
    rollingUpdate:
      maxSurge: 0
      maxUnavailable: 33%
    type: RollingUpdate

root@controller-node-1:/home/cyclinder/sriov# kubectl get sriovoperatorconfigs.sriovnetwork.openshift.io -n kube-system default -o yaml
apiVersion: sriovnetwork.openshift.io/v1
kind: SriovOperatorConfig
metadata:
  annotations:
    meta.helm.sh/release-name: sriov-operator
    meta.helm.sh/release-namespace: kube-system
  creationTimestamp: "2024-07-16T03:27:39Z"
  generation: 3
  labels:
    app.kubernetes.io/managed-by: Helm
  name: default
  namespace: kube-system
  resourceVersion: "40134294"
  uid: 2427ae73-ef95-4f57-aa85-c681ff9a48bb
spec:
  configurationMode: daemon
  disableDrain: false
  enableInjector: false
  enableOperatorWebhook: false
  logLevel: 2
  useCDI: true
adrianchiris commented 4 months ago

a couple more questions @cyclinder :

If you restart the device plugin, will the CDI file contain all devices? And if you put both PFs under the same resource (i.e. the same policy), will you then have all 8 VFs?

I'm thinking sriov-device-plugin doesn't handle merging the CDI specs of multiple resources well.

If that's the case, can you create an issue in sriov-network-device-plugin?

cyclinder commented 4 months ago

thanks @adrianchiris!

> if you restart device plugin will CDI file contain all devices?

Still no

> if you put both PFs under the same resource (i.e. same policy), will you then have all 8 VFs?

Yes.

> if that's the case, can you create an issue in sriov-network-device-plugin?

Sure, I will create an issue for this in sriov-network-device-plugin.

adrianchiris commented 4 months ago

Great, thanks @cyclinder. Closing this one.