Closed cyclinder closed 4 months ago
Hi @cyclinder, I'm not seeing the problem here. The policies are applied to the nodestate:
interfaces:
- name: enp4s0f0np0
  numVfs: 4
  pciAddress: "0000:04:00.0"
  vfGroups:
  - isRdma: true
    policyName: policy1
    resourceName: rdma_resource
    vfRange: 0-3
- name: enp4s0f1np1
  numVfs: 4
  pciAddress: "0000:04:00.1"
  vfGroups:
  - isRdma: true
    policyName: policy2
    resourceName: rdma_resource1
    vfRange: 0-3
We have 8 VFs, so we should also have 8 deviceNodes in the CDI file, right?
@cyclinder, are there any other files in /var/run/cdi or under /etc/cdi?
Also, can you provide the following?
> are there any other files in /var/run/cdi or under /etc/cdi?

No, there is only the sriov-dp-spidernet.io.yaml file under /var/run/cdi.
root@controller-node-1:/home/cyclinder/sriov# kubectl get configmaps -n kube-system device-plugin-config -o json | jq '.data'
{
"worker-node-1": "{\"resourceList\":[{\"resourceName\":\"rdma_resource\",\"selectors\":{\"pfNames\":[\"enp4s0f0np0\"],\"IsRdma\":true,\"NeedVhostNet\":false},\"SelectorObj\":null},{\"resourceName\":\"rdma_resource1\",\"selectors\":{\"pfNames\":[\"enp4s0f1np1\"],\"IsRdma\":true,\"NeedVhostNet\":false},\"SelectorObj\":null}]}"
}
root@controller-node-1:/home/cyclinder/sriov# kubectl get ds -n kube-system sriov-device-plugin -o yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
  annotations:
    deprecated.daemonset.template.generation: "3"
    kubernetes.io/description: |
      This daemon set launches the SR-IOV network device plugin on each node.
    release.openshift.io/version: ""
  creationTimestamp: "2024-07-16T05:49:50Z"
  generation: 3
  name: sriov-device-plugin
  namespace: kube-system
  ownerReferences:
  - apiVersion: sriovnetwork.openshift.io/v1
    blockOwnerDeletion: true
    controller: true
    kind: SriovOperatorConfig
    name: default
    uid: 2427ae73-ef95-4f57-aa85-c681ff9a48bb
  resourceVersion: "40147369"
  uid: b6c01d09-98ed-4c9e-87bf-2d9f4cfa2096
spec:
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      app: sriov-device-plugin
  template:
    metadata:
      creationTimestamp: null
      labels:
        app: sriov-device-plugin
        component: network
        openshift.io/component: network
        type: infra
    spec:
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: node-role.kubernetes.io/worker
                operator: In
                values:
                - ""
            - matchExpressions:
              - key: node-role.kubernetes.io/worker
                operator: In
                values:
                - ""
      containers:
      - args:
        - --log-level=10
        - --resource-prefix=spidernet.io
        - --config-file=/etc/pcidp/$(NODE_NAME)
        - --use-cdi
        env:
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: spec.nodeName
        image: ghcr.m.daocloud.io/k8snetworkplumbingwg/sriov-network-device-plugin
        imagePullPolicy: Always
        name: sriov-device-plugin
        resources:
          requests:
            cpu: 10m
            memory: 50Mi
        securityContext:
          privileged: true
        terminationMessagePath: /dev/termination-log
        terminationMessagePolicy: File
        volumeMounts:
        - mountPath: /var/lib/kubelet/device-plugins
          name: devicesock
        - mountPath: /var/lib/kubelet/plugins_registry
          name: plugins-registry
        - mountPath: /etc/pcidp/
          name: config-volume
          readOnly: true
        - mountPath: /var/run/k8s.cni.cncf.io/devinfo/dp
          name: device-info
        - mountPath: /var/run/cdi
          name: dynamic-cdi
      dnsPolicy: ClusterFirst
      hostNetwork: true
      nodeSelector:
        kubernetes.io/os: linux
        node-role.kubernetes.io/worker: ""
      priorityClassName: system-node-critical
      restartPolicy: Always
      schedulerName: default-scheduler
      securityContext: {}
      serviceAccount: sriov-device-plugin
      serviceAccountName: sriov-device-plugin
      terminationGracePeriodSeconds: 30
      tolerations:
      - operator: Exists
      volumes:
      - hostPath:
          path: /var/lib/kubelet/device-plugins
          type: ""
        name: devicesock
      - hostPath:
          path: /var/lib/kubelet/plugins_registry
          type: ""
        name: plugins-registry
      - configMap:
          defaultMode: 420
          name: device-plugin-config
        name: config-volume
      - hostPath:
          path: /var/run/k8s.cni.cncf.io/devinfo/dp
          type: DirectoryOrCreate
        name: device-info
      - hostPath:
          path: /var/run/cdi
          type: DirectoryOrCreate
        name: dynamic-cdi
  updateStrategy:
    rollingUpdate:
      maxSurge: 0
      maxUnavailable: 33%
    type: RollingUpdate
root@controller-node-1:/home/cyclinder/sriov# kubectl get sriovoperatorconfigs.sriovnetwork.openshift.io -n kube-system default -o yaml
apiVersion: sriovnetwork.openshift.io/v1
kind: SriovOperatorConfig
metadata:
  annotations:
    meta.helm.sh/release-name: sriov-operator
    meta.helm.sh/release-namespace: kube-system
  creationTimestamp: "2024-07-16T03:27:39Z"
  generation: 3
  labels:
    app.kubernetes.io/managed-by: Helm
  name: default
  namespace: kube-system
  resourceVersion: "40134294"
  uid: 2427ae73-ef95-4f57-aa85-c681ff9a48bb
spec:
  configurationMode: daemon
  disableDrain: false
  enableInjector: false
  enableOperatorWebhook: false
  logLevel: 2
  useCDI: true
A couple more questions, @cyclinder:
If you restart the device plugin, will the CDI file contain all devices? If you put both PFs under the same resource (i.e. the same policy), will you then have all 7 VFs?
I'm thinking sriov-device-plugin doesn't handle merging the CDI specs of multiple resources well.
If that's the case, can you create an issue in sriov-network-device-plugin?
thanks @adrianchiris!
> if you restart device plugin will CDI file contain all devices?

Still no.

> if you put both PFs under the same resource (i.e. same policy), will you then have all 7 VFs?

Yes.

> if that's the case, can you create an issue in sriov-network-device-plugin?

Sure, I will create an issue for this in sriov-network-device-plugin.
Great, thanks @cyclinder. Closing this one.
I have two sriovnodepolicy configs, see below: