Closed rbo closed 2 years ago
Adding a bridge to the main interface is not supported, but it is technically possible.
Let's first try it on only one master:
oc apply -f - <<EOF
apiVersion: nmstate.io/v1
kind: NodeNetworkConfigurationPolicy
metadata:
name: br1-on-eno0
spec:
nodeSelector:
kubernetes.io/hostname: inf6.coe.muc.redhat.com
desiredState:
interfaces:
- name: br1
description: Linux bridge with eno1 as a port
type: linux-bridge
state: up
ipv4:
enabled: true
dhcp: true
bridge:
options:
stp:
enabled: false
port:
- name: eno1
EOF
$ oc get nncp,nnce,nns
NAME STATUS
nodenetworkconfigurationpolicy.nmstate.io/br1-on-eno0 Degraded
NAME STATUS
nodenetworkconfigurationenactment.nmstate.io/inf6.coe.muc.redhat.com.br1-on-eno0 Failing
NAME AGE
nodenetworkstate.nmstate.io/inf4.coe.muc.redhat.com 89d
nodenetworkstate.nmstate.io/inf5.coe.muc.redhat.com 89d
nodenetworkstate.nmstate.io/inf6.coe.muc.redhat.com 89d
$
$ oc describe nodenetworkconfigurationenactment.nmstate.io/inf6.coe.muc.redhat.com.br1-on-eno0
Name: inf6.coe.muc.redhat.com.br1-on-eno0
Namespace:
Labels: app.kubernetes.io/component=network
app.kubernetes.io/managed-by=cnao-operator
app.kubernetes.io/part-of=hyperconverged-cluster
app.kubernetes.io/version=4.10.2
nmstate.io/node=inf6.coe.muc.redhat.com
nmstate.io/policy=br1-on-eno0
Annotations: <none>
API Version: nmstate.io/v1beta1
Kind: NodeNetworkConfigurationEnactment
Metadata:
Creation Timestamp: 2022-06-26T14:30:48Z
Generation: 1
Managed Fields:
API Version: nmstate.io/v1beta1
Fields Type: FieldsV1
fieldsV1:
f:metadata:
f:labels:
.:
f:app.kubernetes.io/component:
f:app.kubernetes.io/managed-by:
f:app.kubernetes.io/part-of:
f:app.kubernetes.io/version:
f:nmstate.io/node:
f:nmstate.io/policy:
f:ownerReferences:
.:
k:{"uid":"5ec8385f-6b3b-4002-82f9-4df7171fae8d"}:
Manager: manager
Operation: Update
Time: 2022-06-26T14:30:48Z
API Version: nmstate.io/v1beta1
Fields Type: FieldsV1
fieldsV1:
f:status:
.:
f:conditions:
f:desiredState:
.:
f:interfaces:
f:desiredStateMetaInfo:
.:
f:time:
f:version:
f:policyGeneration:
Manager: manager
Operation: Update
Subresource: status
Time: 2022-06-26T14:30:48Z
Owner References:
API Version: v1
Kind: Node
Name: inf6.coe.muc.redhat.com
UID: 5ec8385f-6b3b-4002-82f9-4df7171fae8d
Resource Version: 135181409
UID: 1cfdd105-701a-4923-a0dd-db4a7b510d3d
Status:
Conditions:
Last Hearbeat Time: 2022-06-26T14:30:51Z
Last Transition Time: 2022-06-26T14:30:51Z
Reason: FailedToConfigure
Status: False
Type: Progressing
Last Hearbeat Time: 2022-06-26T14:30:51Z
Last Transition Time: 2022-06-26T14:30:51Z
Message: error reconciling NodeNetworkConfigurationPolicy at desired state apply: ,
failed to execute nmstatectl set --no-commit --timeout 480: 'exit status 1'
libnmstate.error.NmstateLibnmError
Activate profile uuid:97e78279-bdc2-4555-a5cd-b94fe10e33da iface:eno1 type
ethernet failed
reason=<enum NM_ACTIVE_CONNECTION_STATE_REASON_DEVICE_DISCONNECTED of type NM.ActiveConnectionStateReason><enum NM_DEVICE_STATE_REASON_UNKNOWN of type NM.DeviceStateReason>
Message Encoded: H4sIAAAAAAAA/+xdfW/bNrf/v5+C2D/uAL/o3TKxDsjTpkNxt2xou2d4MBQCRVEOEZkUKNpJ7m6++wVJyZZjOXESqbEeuGhRWZTO7/B3+HoORRIhuACCYM4wzSibgwuekAsir7m4es9ZSudLgSTl7A+eUXwLkAQJKaggCSgkkgSgPM9uIRi+ASmiGUmA5IDcELyUBLCFfgbLDBREgtGI8RHmiwVV15IuCF9K4IUWBANyQ6WWuCyAPQCDARhMloWYZDSe5LfykjN3HEwKKskoR/gKzUkx2YivXY7zW+g6PgR/FkT8hQSjbK5+qLwNCiIHgBYgIbkgGEmSDMGyIGCgczEAlBWSoGT8BoBr82oxVhdvf3iigB9+fONYjjOygpETANuDrgV9axhYFhCcS1D9+XD+rz9/URcXJgNgRURBOYPAHltj54lCzpQSSk9jImMhCP4ZUCaJSBEmxQCCv/8ZxIImczJQSTxX1i30dSFz/T9hKM5IMoDgI8oKcnc3BIOcC2leZmihXh0Qxu3BEAxWGWL6tQVPdIIUS3alUvTFSKJ5CUuTkUCsBF6gmwEEnjXzhmCwoEqEc3f3Tf0dgkFCCiyo1k2J/JWy5Q0waoNrKi+BQgeoAAhozYZgQPOVpyUnl1jl46tYkiGoZUbdULKrDMRC669ZUr+XuVb6Nte/MgU5Kpm6+3bXbAvP32dQU4d+QwzNiajsCuyxa42tPbJm++xa3DIMEJa6ZHwmUlCyMlWPkgRgXU8hIPKSCEak4aaQSEiSNEP5e4vQc6BSymhxuQ9rug/rU1UqQcZVpVLMgyW7YvyaAcQScE2zDMQE0DnjqsVJlkKVbnlJtIYr3TDtAbUPyuB7QVStw5cEX+WcMvkwbVNnj9T3GwETLuaTVBCSkOJK8nyyXRImm0cnNsAaPwEpFwBlGUjIimJSPBH9sTw9Yh9vXxGmRc4FBHgpBGESMJORsuH/Z0CrFuWfwYrIy8RJXA+TQN+o6thWgqqk6p2oqmMqtaEKLuRyAIHt+dYQDNKsbD8GseAowajQtT3j10RE5fPLTNIqQSyZarjV5TIffDMNg1EKJYkgRVG1guVPhZuS0ILQ9YgNnSRNYUpSH/o48ZWYXJCU3kQZUU1RoJqrFcpoEmWp1O9yQVZEVE8SIchOmmnUFghHNVA3gBoPaixowDBnUvAsI0I9wlfFqLgtJFlsp60Z5Dlhq6i4phJrJjWjKq85MRII485At+CrmwyxqLgtIm8azrattJ20ayeVvm0oU0031gp8y/re5rJcDwYzbS4SQhTHnVrLgRoOkhBaCBqwl1rrpuo7BVlww+z6fkRVp6U4jVFBIm2R6oGMY5RVP6TMygclL8qrjJjxS9WHD8EAzYm54WpDLdBNLXvqTiFwpDrTyPTG27d0b21p1vjNbU2sKHDtV+YsqBa4vuHeu5EUMioHE6qwKbsneaRbq6hYLjYd9zLJg+h/ieCbxEje1CQ1PSBuNgIMp/ve3kkV9dR5nD/0LONRjoSk2gjVU1gZG8toQSRKkEQbTTIUk8pGc10UqnekzCLKLomgss5Rqh7WtVaPsup1tRp2bdfQqkPe35r67VVP75HqaVtj1xnPgnGwUyEd636FnAWzsCC4sUJWaaZCHtwwOEFA4CzQDcPMgYHnhV22DI4HNR7UWNCAFYLylekbU63nN21OlNMta6rfO8ZU8lc7LW5HxjyQUhtBD5mmdgY9TLok1LKghoNkBi0PGrA1KUrd7RaxqhLVBCQXamyo36ooMpWJsnmJsbFAda9nVrBiGAbWbgXrwgwaCxqwNsygB4WBhRMPebujxSrh2EeLxAoR9M1o0ZrBMPRQl9YgDtR4UGNBA9bhaJHIy1kcOghNd0f064RjtxG2rAAiS9vIn8Ip8ju1EXagxoMaC/rfwUZWMpsGyHZ3bLROOHYbeRinMAlNPbJgSjy7Sxt5BGo8qLGgAevWRh7Brm174Y6N1gnHbqMUOzF
0fTMzJjBxQtyljVICNR7UWNCAdWujJHXi2dROdvujKuHYbRRiL4AB0TYK1djJ7rStCwnUeFBjQft7tHUuiRNsx7ttXZVw7DaahnYKA7cawdkkcbu00RRBjQc1FjRg3dlILpm1ZRt9Y8cm9yXudyq1aaADJq22a42t8W7v47jPt8jBRSOxphbEpfPRg1OcdtrEJg7UeFBjwRQP1r1VEBMXx2i3t6oSjr2WpWGSwtTM/8MYpjO7U89giqDGgxoL2q14BvfXMsK4e98j1DDKa/QIJdvV68EJ7JY1ShM84vnAD3g+ahTUld9m5sCG4nnZaMiApwaA0JnBKYGxB1N3UDLs3We4YT76GgyTBxiOxXbrq34/m9MHK+2DnPoBDEKIfGgRmHjQc9ctS2IHFvadhjl/lXDsLYvj4RjGUzOf9KGDjZ+iM89iADUe1FjQgHU7xgqtIPWseHeMtU44dhvNLCuErmn9/RDGoet0aaOZAzUe1FjQgHVrI991ptbU3vXLrBOO3kbh1IczM1dJUxg73fplZghqPKixvotfxieJi/CU7NqoSjh2Gzmh68LAr3z+Uxt16m12ENR4UGNBA9atjWI3QQ6Kd220Tjh2G3mWm8Ag1TbyptCZTTuNdHkO1HhQY0ED1ulI17k/DnOOYxyGHhiHqRy57tRHCE13itY64diLlh0mHgxsXbRcHzp+2Klb1kZQ40GNBcPv4JZFOPTDxE13bLROOHYbhR724dR0ow6G08DutIkOA6jxoMaCdsdNdMa3LJPxXXtknOcxwleDR9Ym+W6wZZn6e3XDPM9z5EzHzY6jsFO/EYS7iLbzAszd+O/2X2MWwZfynjZRihY0u1Uvab23liJJFGfKLo7vmeVDkmOuFy9dqWY707bDPC9tR1dEFLrn1UBrUy/Zusaso8lmOVHJxcR2dH/EabopLhjhSxLhjElB0vIVcy9DhYyWBdm6SW5yKkixfU8ILrbuLAuSbN2g2z9lce9nhOYVzoJIQbGmQxXJnAiz1mcIjohN07pMAq/GZxXdP15Gbd3H94fRcu3P8RLatyJaW8pzIrUtUnfXBp+4bYvbKj52YrS10npvinVitk1m63H2E7NtMltf9Xhitk1m62sVT8y2yWzdS3pitk1m67G2E7NtMluPvpyYbXVsUFtvfGK21bFB7fvZE7NtMltf0XZittUyW1vRc2K21R6stsbjxGyrrUHta4wTs622BrVvKE7MtslsPVT9X8ys/xJmzWYJ/y1xQ+sVmUTs9vEyuk1nL2IyPeP06AOHPeOzTyGunlF7/BGunhHanzBMD4nth9+1h8T2I7zVQ2L7Ed3qIbH9WETQQ2L7EYHpIbH9CMD0kNh+xF96SGw/FhD0jNjjX6fdM0L7EyjsIbH9iBP2kNh+hAl7SGw/Ii49JLYfQcIeEtuPuPZrEvtQhHDfzrunOFeHHJs9pk9hrzbpbfig/+T36qYg7+4SdRr6dsP07k7/p+lwuwzv7gJ0cup2wvTucTanZQpdUb170NPJ79sJ0w0bVJ0cwd0wvbuN/ilI1AnTDZt4nxbpdMJ0w/5WJ49cN0zvHh5wctF1wnTD9rSnlWjdML27yewpUNIJ0w3HLJ3WqnXCdMMpEacV160y3HCs22lBaydMNxzTcVp60Q3Tu8d3neKEhumYc3kgz498oZlaFoSTfvmeW/8ws0s6jz7+1ys2exGw7hWjffLP94rY4x/S9orO/oT5e0drP5y+vaO1H7Ot3tHaD4dM72jtRzSzd7T2wyXeO1r7sXSnd7T2I9DeO1r7sSand7T2w8naO1r7scS3d7T2I2TbO1r74f/vHa39WDTTO1r7serrUFq9fbTe31M0ucT5y2i9F5yaI0mukdbDtsauM7Zte+z4WqH60Wot6N4Q1cwou3pqgTBqzoKxNXGshnCbOeWQJFEh8NbzQQd52inmz8yR7YRja2xN7J3zobpQ8rl10Z46Y9cyigYtKHpA3LuZz62jVR8uI+ZE9sPLSDf
ZuOTlaa8Hh+43Kh1JFl5iCdOo+H3PiV0W/q1sVOGphmyY5+1jKlOlSkeShRdZoqFI9S0j5dHB90qU2YN6Jw/rc4aPpjit1d/dQbs/2tvHQP7zy4/j+9W/J2bkmz7xOlsf5owl5UzXIq35EDwwBagr84QsG/LaBDJjHcoFleo915kGQQvY3vfK5EFAr5lJdxd7ugf7290bx3KckRWMnADYHnQt6FvDYOoBwbkE1Z8P5//68xd1cUGLnAsIPjFJRIowASinYIHwaJUhBuiccUGSFwuNhQX4qhjR9Z2HJfuPSMaXiM1JAhiR11xcAX30fO2o+mpgU1VjIi8VQ2YMvj6mXs/j1ufLE6YYV1PMjygryNYZ8A1pa5EqvciJTrUtS5+BjZaSjxiZc0lRWRL0i6rhWOYZudHz6mVmmiAxonyl5UguUTZapVUhMFd/fzPH8WdJvk8bZbHayfGOB4NzOAug/iYfGpeSOYvf9rWGEeZMCp5l5tD/WGi+anc3LThly5tRLGgy1wU5ikXOhYx4rnJWaI3u864Kj05Y8EQnSLE0zau+GElT4v/+Z0CTkVC2NE8jRYxnzVR1W1Alwrm7+3Z3t69YP1ZONiWwXG13aNE+XLAa8LRatjei6wta9I92JZcBsvYlV2ub2pdchUk6kFz6XtuXXDnLO5BcRo7bl1yF+tuXXC15aV9y5Y9sX3K1nqR9yVV8qgPJZZy2A8nlWrAOJJeO+jYkJ6SgomGEUPZjqrepd2CFbO5aVd+qejvTWXXdzX1TcAkpsKB5NYD8VXW/wKgNrqm8BAodoAIgoDWrj2G0cxqCr2KphhqbzJQ3NiOEv79VoxQ93ylqz6ibCbt/Z+OlVneVnhUZ5dDh3rBq3+hh75DqwC7+r7PPF58ufgHg0x+rAKhJG9BTTFDmDezbyizwAC2qQgWuLwkDKM+zW8rm68Ki87BHj+lL9ai2rXsdRZq+W3ypJuGzNGnaveF1NGnaseN1NGnaGu6lmsyepUnTN4GvpEnDN3Ovo0nTd+yvo0nTZmAv1CS0nsdJwy4Kr6NJ07fvr6NJ024pr6NJ065PL9XEfl6JbdiK8XU0adpz7HU0afrC/XU0adp15qWaOHtG5mfFLcPAeEYh+DNP1IA8FzylGQHLJU3gbEqmoTOdjeIEOyPP9/0R8nEyimdeSmyLuG6CAFXTBajHwGqMCSs/nNJJyH3zhdA9SKuzJKlUgoDgNAwD3xl5U98beU6QjMLYxSN3FvgBCRMc+tNhqU8s7KHRpz7gfUSnfXOYrplKKaPF5V61pq9CVTtKYUlXO2wdot1GOaMbONiOUz/co9t7zhjRmmkFV9oVDCijkiJJEmgw31WY77YgMWcjXc3e/UTYcgEufovO3n/99O/z6P3vFxfn779++v0i+vL17Ou5uX/2VdV3nmpR4OK3seaCbHT4ooT93JyFwH5SFoolxoQkW1kY7uZhqDIRdZCJIUjI6r7gD+f//vT+vJT26Q+F8PHTlrAPZEUxqUTo1yMdfHn3k/5vrd5ZTa+Pv579En36Ev129uXr+Wfwf/sf+vXsP+efnejz+dmH/+zkAa21/6ig9tnhsMazq2L+YB18mXLParHAQY37NNjXuB9YAdeA79Z437P2hd6+2rdN7GeeZTHCV0BygC8Jvso5ZRJMuJhPUqHqY3EleT65MC6v3xBDcyIm79ePTuwHeQy9fQG9989EE5XG5Ibg5dNx9zge12LX7dCL5K7DFS3LXbszW5a7dvu3LHft9G9Z7tp93rLcdTCoZbnrwFjLctehj7bLQxVgarv8dlDfYmG1XgyqyF2LcnVQtwdmIoy33hz2qtna2vHlyAvqOpR/3Gp2UZ+24rNtl9cqhnqw3H3z7pfqe5jcpxeDx+USxp229eyK16cPBx6X+7T2+iA+7ZblPWE4cRiPTx/+HGj3auVAH+Sq9r9Vyz9vAHiY3KdPNB6Xm/G2tXz6NOAwuU8f/j4uF+W0TXHVIsEXi2x/4r52zXw
VCBMt7e2CFxIIggmTAKMsAxkq5I/wDQAfaUbAD5NlISYxZRO20L4MLLMfhiCjjADbHgLKwE8Lniwz8vMbpXTGURIRJsVtpFHfDsr33r2zx9ZYr2zHnBU8I5FZoqI3L9wIH/z49sdt8IzGk/xWXnLmjoNJQSUZ5QhfoTkpakrVLsf5baXi1NUqLhBlWj1B5FIwgMS8GKdLht+qqxbxXCfQgAWRW3h5nt22juV7GksL12g6WwVakUjyKKHF1bA9NM+abtCM21Rj1uHePRM7o3GJOWFEKgRKRA07tO9lNDJaaM9bYZR5m2fLOWXFEDAiza0hWBFB09vIrPQebun6BEM8pp5tG0M0aKW1NZqNTarRpXhb07IFtRaTEmSjleO6NYOVsFqfi8UfxqtavC1Ilo4xZ5LcyB8rDTlL6bx9BUvMmoozu66hhjVlSmkVYXkzvkZURijLItN0PaVp2EYv81jnxzb83IMw1RbRggC5yM0X5W82ssb6xvjC/PpV3T9XdzpyWaeIZiSBQBBUcPa4H/nz+dmX3y+qMMaHT1/KJ84/POpX/qwhfm4OhZRy/7z4n4vf/7rYEw8pJbwZ/H8AAAD//9Q+pEKR3AAA
Reason: FailedToConfigure
Status: True
Type: Failing
Last Hearbeat Time: 2022-06-26T14:30:51Z
Last Transition Time: 2022-06-26T14:30:51Z
Reason: FailedToConfigure
Status: False
Type: Available
Last Hearbeat Time: 2022-06-26T14:30:51Z
Last Transition Time: 2022-06-26T14:30:48Z
Reason: FailedToConfigure
Status: False
Type: Pending
Last Hearbeat Time: 2022-06-26T14:30:51Z
Last Transition Time: 2022-06-26T14:30:48Z
Reason: SuccessfullyConfigured
Status: False
Type: Aborted
Desired State:
Interfaces:
Bridge:
Options:
Stp:
Enabled: false
Port:
Name: eno1
Vlan:
Mode: trunk
Trunk - Tags:
Id - Range:
Max: 4094
Min: 2
Description: Linux bridge with eno1 as a port
ipv4:
Dhcp: true
Enabled: true
Name: br1
State: up
Type: linux-bridge
Desired State Meta Info:
Time: 2022-06-26T14:30:48Z
Version: 0
Policy Generation: 1
Events: <none>
$
Internal Chat with eng. https://coreos.slack.com/archives/C019X3PEF2B/p1656329450916889
sh-4.4# nmcli con
NAME UUID TYPE DEVICE
eno1 d5a9f6ee-14b1-4bb3-a985-707a038fa7da ethernet eno1
Wired connection 1 1a73a011-fa77-3efe-8b31-7b5f3d27da82 ethernet --
Wired connection 2 226c83b3-e24d-3d37-9585-ea208604ae20 ethernet --
Wired connection 3 1128951e-55bc-3d6e-b50f-9299d12c9301 ethernet --
eno1 bd7f1c09-65a0-4497-aa49-8aa586e0661a ethernet --
sh-4.4#
eno1 is duplicated; let's delete the second one by its UUID:
nmcli con delete bd7f1c09-65a0-4497-aa49-8aa586e0661a
With OpenShift SDN, it is technically possible to share the main interface between SDN traffic and VMs attached via a bridge to the local network, unless you use the OpenShift internal load balancer (keepalived/haproxy). With OVNKubernetes, it is not possible.
RFE to improve this: https://issues.redhat.com/browse/CNV-12835
Recommended solutions:
feature.node.kubernetes.io/network-sriov.capable="true"
Draft SriovNetworkNodePolicy (SNNP):
apiVersion: sriovnetwork.openshift.io/v1
kind: SriovNetworkNodePolicy
metadata:
name: policy-x710
namespace: openshift-sriov-network-operator
spec:
resourceName: x710
nodeSelector:
feature.node.kubernetes.io/network-sriov.capable: "true"
numVfs: 8
nicSelector:
vendor: "8086"
deviceID: "1572"
rootDevices:
- 0000:19:00.0
MCP degraded... :-( let's fix it.
$ oc get mcp,nodes
NAME CONFIG UPDATED UPDATING DEGRADED MACHINECOUNT READYMACHINECOUNT UPDATEDMACHINECOUNT DEGRADEDMACHINECOUNT AGE
machineconfigpool.machineconfiguration.openshift.io/master rendered-master-92fd6629cbc7cc1d9dfba674cbe8f9af False True True 3 2 2 1 98d
machineconfigpool.machineconfiguration.openshift.io/worker rendered-worker-ff90ae481b4352e9da99a3404870bf72 True False False 0 0 0 0 98d
NAME STATUS ROLES AGE VERSION
node/inf4.coe.muc.redhat.com Ready,SchedulingDisabled master,worker 98d v1.23.5+3afdacb
node/inf5.coe.muc.redhat.com Ready master,worker 98d v1.23.5+3afdacb
node/inf6.coe.muc.redhat.com Ready master,worker 98d v1.23.5+3afdacb
$ oc get node/inf4.coe.muc.redhat.com -o yaml | grep -A5 machineconfiguration.openshift.io/reason
machineconfiguration.openshift.io/reason: 'unexpected on-disk state validating
against rendered-master-333e27092c98d0c51e65d1ce9e27f6e0: expected target osImageURL
"quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:e73e648434bcc46d6c8538afb6f958ed2f93bd766768d1d3498e4b9afea369ae",
have "quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:16124c9abedf4b8f6f6bfc625efcafdd445610778f2186079981687d49cd03ee"'
machineconfiguration.openshift.io/state: Degraded
sriovnetwork.openshift.io/state: Idle
oc debug node/inf4.coe.muc.redhat.com
Starting pod/inf4coemucredhatcom-debug ...
To use host binaries, run `chroot /host`
Pod IP: 10.32.96.4
If you don't see a command prompt, try pressing enter.
sh-4.4# chroot /host
sh-4.4# rpm-ostree status
State: idle
Warning: failed to finalize previous deployment
ostree-finalize-staged.service: Failed with result 'timeout'.
check `journalctl -b -1 -u ostree-finalize-staged.service`
Deployments:
* pivot://quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:16124c9abedf4b8f6f6bfc625efcafdd445610778f2186079981687d49cd03ee
CustomOrigin: Managed by machine-config-operator
Version: 410.84.202205111833-0 (2022-05-11T18:37:05Z)
pivot://quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:3888e5e253b8caf99bc6517f98f4de6f2b39548c1bcf6b3a92b0ad0ac0ba72ed
CustomOrigin: Managed by machine-config-operator
Version: 410.84.202204112301-0 (2022-04-11T23:04:43Z)
sh-4.4# journalctl -b -1 -u ostree-finalize-staged.service
-- Logs begin at Thu 2022-05-19 18:38:43 UTC, end at Wed 2022-07-06 07:50:06 UTC. --
Jul 05 15:06:42 inf4.coe.muc.redhat.com systemd[1]: Started OSTree Finalize Staged Deployment.
Jul 05 15:07:59 inf4.coe.muc.redhat.com systemd[1]: Stopping OSTree Finalize Staged Deployment...
Jul 05 15:08:00 inf4.coe.muc.redhat.com ostree[1504579]: Finalizing staged deployment
Jul 05 15:08:01 inf4.coe.muc.redhat.com ostree[1504579]: Copying /etc changes: 18 modified, 0 removed, 1911 added
Jul 05 15:08:01 inf4.coe.muc.redhat.com ostree[1504579]: Copying /etc changes: 18 modified, 0 removed, 1911 added
Jul 05 15:13:00 inf4.coe.muc.redhat.com systemd[1]: ostree-finalize-staged.service: Stopping timed out. Terminating.
Jul 05 15:18:00 inf4.coe.muc.redhat.com systemd[1]: ostree-finalize-staged.service: State 'stop-sigterm' timed out. Killing.
Jul 05 15:18:00 inf4.coe.muc.redhat.com systemd[1]: ostree-finalize-staged.service: Killing process 1504579 (ostree) with signal SIGKILL.
Jul 05 15:23:00 inf4.coe.muc.redhat.com systemd[1]: ostree-finalize-staged.service: Processes still around after SIGKILL. Ignoring.
Jul 05 15:28:00 inf4.coe.muc.redhat.com systemd[1]: ostree-finalize-staged.service: State 'stop-final-sigterm' timed out. Killing.
Jul 05 15:28:00 inf4.coe.muc.redhat.com systemd[1]: ostree-finalize-staged.service: Killing process 1504579 (ostree) with signal SIGKILL.
Jul 05 15:33:01 inf4.coe.muc.redhat.com systemd[1]: ostree-finalize-staged.service: Processes still around after final SIGKILL. Entering failed mode.
Jul 05 15:33:01 inf4.coe.muc.redhat.com systemd[1]: ostree-finalize-staged.service: Failed with result 'timeout'.
Jul 05 15:33:01 inf4.coe.muc.redhat.com systemd[1]: Stopped OSTree Finalize Staged Deployment.
Jul 05 15:33:01 inf4.coe.muc.redhat.com systemd[1]: ostree-finalize-staged.service: Consumed 1.482s CPU time
sh-4.4#
Let's try to reboot the node...
ssh -i ~/.ssh/coe-muc -l core 10.32.96.4
Warning: Permanently added '10.32.96.4' (ED25519) to the list of known hosts.
Red Hat Enterprise Linux CoreOS 410.84.202205111833-0
Part of OpenShift 4.10, RHCOS is a Kubernetes native operating system
managed by the Machine Config Operator (`clusteroperator/machine-config`).
WARNING: Direct SSH access to machines is not recommended; instead,
make configuration changes via `machineconfig` objects:
https://docs.openshift.com/container-platform/4.10/architecture/architecture-rhcos.html
---
[core@inf4 ~]$ sudo su -
[root@inf4 ~]# rpm-ostree status
State: idle
Deployments:
* pivot://quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:16124c9abedf4b8f6f6bfc625efcafdd445610778f2186079981687d49cd03ee
CustomOrigin: Managed by machine-config-operator
Version: 410.84.202205111833-0 (2022-05-11T18:37:05Z)
pivot://quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:3888e5e253b8caf99bc6517f98f4de6f2b39548c1bcf6b3a92b0ad0ac0ba72ed
CustomOrigin: Managed by machine-config-operator
Version: 410.84.202204112301-0 (2022-04-11T23:04:43Z)
[root@inf4 ~]#
$ oc get pods -n openshift-machine-config-operator -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
machine-config-controller-96bd949bb-lk2gv 1/1 Running 0 16h 10.130.0.107 inf6.coe.muc.redhat.com <none> <none>
machine-config-daemon-24ml4 2/2 Running 2 17h 10.32.96.5 inf5.coe.muc.redhat.com <none> <none>
machine-config-daemon-bpqvm 2/2 Running 2 17h 10.32.96.6 inf6.coe.muc.redhat.com <none> <none>
machine-config-daemon-vqkdx 2/2 Running 4 17h 10.32.96.4 inf4.coe.muc.redhat.com <none> <none>
machine-config-operator-5c8759c885-6v4z4 1/1 Running 0 17h 10.130.0.96 inf6.coe.muc.redhat.com <none> <none>
machine-config-server-6p8pl 1/1 Running 2 17h 10.32.96.4 inf4.coe.muc.redhat.com <none> <none>
machine-config-server-97nnb 1/1 Running 1 17h 10.32.96.5 inf5.coe.muc.redhat.com <none> <none>
machine-config-server-ghrl7 1/1 Running 1 17h 10.32.96.6 inf6.coe.muc.redhat.com <none> <none>
$ oc logs pod machine-config-daemon-vqkdx -n openshift-machine-config-operator | tail
Error from server (NotFound): pods "pod" not found
$ oc logs machine-config-daemon-vqkdx -n openshift-machine-config-operator -c machine-config-daemon | tail
I0706 08:01:15.774898 5293 update.go:1956] Disk currentConfig rendered-master-333e27092c98d0c51e65d1ce9e27f6e0 overrides node's currentConfig annotation rendered-master-92fd6629cbc7cc1d9dfba674cbe8f9af
I0706 08:01:15.777038 5293 daemon.go:1193] Validating against pending config rendered-master-333e27092c98d0c51e65d1ce9e27f6e0
E0706 08:01:15.777084 5293 writer.go:135] Marking Degraded due to: unexpected on-disk state validating against rendered-master-333e27092c98d0c51e65d1ce9e27f6e0: expected target osImageURL "quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:e73e648434bcc46d6c8538afb6f958ed2f93bd766768d1d3498e4b9afea369ae", have "quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:16124c9abedf4b8f6f6bfc625efcafdd445610778f2186079981687d49cd03ee"
I0706 08:02:15.793981 5293 daemon.go:910] Current config: rendered-master-92fd6629cbc7cc1d9dfba674cbe8f9af
I0706 08:02:15.793997 5293 daemon.go:911] Desired config: rendered-master-333e27092c98d0c51e65d1ce9e27f6e0
I0706 08:02:15.798150 5293 daemon.go:538] Detected a login session before the daemon took over on first boot
I0706 08:02:15.798158 5293 daemon.go:539] Applying annotation: machineconfiguration.openshift.io/ssh
I0706 08:02:15.813086 5293 update.go:1956] Disk currentConfig rendered-master-333e27092c98d0c51e65d1ce9e27f6e0 overrides node's currentConfig annotation rendered-master-92fd6629cbc7cc1d9dfba674cbe8f9af
I0706 08:02:15.816048 5293 daemon.go:1193] Validating against pending config rendered-master-333e27092c98d0c51e65d1ce9e27f6e0
E0706 08:02:15.816095 5293 writer.go:135] Marking Degraded due to: unexpected on-disk state validating against rendered-master-333e27092c98d0c51e65d1ce9e27f6e0: expected target osImageURL "quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:e73e648434bcc46d6c8538afb6f958ed2f93bd766768d1d3498e4b9afea369ae", have "quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:16124c9abedf4b8f6f6bfc625efcafdd445610778f2186079981687d49cd03ee"
$
=> https://access.redhat.com/solutions/4951051
touch /run/machine-config-daemon-force
on node inf4
Again:
E0706 08:07:24.726911 5293 writer.go:135] Marking Degraded due to: failed to update OS to quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:e73e648434bcc46d6c8538afb6f958ed2f93bd766768d1d3498e4b9afea369ae : error running rpm-ostree rebase --experimental /srv/repo:2783635c5c8421c077117928c96680bbe9d13168c3b463ecce2b9ae58e06f986 --custom-origin-url pivot://quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:e73e648434bcc46d6c8538afb6f958ed2f93bd766768d1d3498e4b9afea369ae --custom-origin-description Managed by machine-config-operator: error: opendir(/srv/repo): No such file or directory
: exit status 1
Let's try https://access.redhat.com/solutions/5598401
[root@inf4 ~]# export IMAGE=quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:e73e648434bcc46d6c8538afb6f958ed2f93bd766768d1d3498e4b9afea369ae
[root@inf4 ~]# echo $IMAGE
quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:e73e648434bcc46d6c8538afb6f958ed2f93bd766768d1d3498e4b9afea369ae
[root@inf4 ~]# /run/bin/machine-config-daemon pivot "${IMAGE}"
I0706 08:09:56.737134 41878 run.go:18] Running: nice -- ionice -c 3 oc image extract --path /:/run/mco-machine-os-content/os-content-1533141495 --registry-config /var/lib/kubelet/config.json quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:e73e648434bcc46d6c8538afb6f958ed2f93bd766768d1d3498e4b9afea369ae
I0706 08:10:10.661293 41878 rpm-ostree.go:325] Running captured: rpm-ostree status --json
I0706 08:10:10.723617 41878 rpm-ostree.go:233] Previous pivot: quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:16124c9abedf4b8f6f6bfc625efcafdd445610778f2186079981687d49cd03ee
I0706 08:10:12.516343 41878 rpm-ostree.go:265] Pivoting to: 410.84.202206212304-0 (2783635c5c8421c077117928c96680bbe9d13168c3b463ecce2b9ae58e06f986)
I0706 08:10:12.516360 41878 rpm-ostree.go:297] Executing rebase from repo path /run/mco-machine-os-content/os-content-1533141495/srv/repo with customImageURL pivot://quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:e73e648434bcc46d6c8538afb6f958ed2f93bd766768d1d3498e4b9afea369ae and checksum 2783635c5c8421c077117928c96680bbe9d13168c3b463ecce2b9ae58e06f986
I0706 08:10:12.516379 41878 update.go:1941] Running: rpm-ostree rebase --experimental /run/mco-machine-os-content/os-content-1533141495/srv/repo:2783635c5c8421c077117928c96680bbe9d13168c3b463ecce2b9ae58e06f986 --custom-origin-url pivot://quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:e73e648434bcc46d6c8538afb6f958ed2f93bd766768d1d3498e4b9afea369ae --custom-origin-description Managed by machine-config-operator
0 metadata, 0 content objects imported; 0 bytes content written
Staging deployment... done
Upgraded:
conmon 2:2.0.29-1.module+el8.4.0+11822+6cc1e7d7 -> 2:2.0.29-5.rhaos4.10.el8
cri-o 1.23.2-10.rhaos4.10.gitf5a1c11.el8 -> 1.23.3-6.rhaos4.10.git74543e3.el8
cups-libs 1:2.2.6-38.el8 -> 1:2.2.6-38.el8_4.1
dracut 049-135.git20210121.el8_4.1 -> 049-137.git20220131.el8_4.1
dracut-network 049-135.git20210121.el8_4.1 -> 049-137.git20220131.el8_4.1
dracut-squash 049-135.git20210121.el8_4.1 -> 049-137.git20220131.el8_4.1
expat 2.2.5-4.el8_4.2 -> 2.2.5-4.el8_4.3
grub2-common 1:2.02-99.el8_4.1 -> 1:2.02-99.el8_4.9
grub2-efi-x64 1:2.02-99.el8_4.1 -> 1:2.02-99.el8_4.9
grub2-pc 1:2.02-99.el8_4.1 -> 1:2.02-99.el8_4.9
grub2-pc-modules 1:2.02-99.el8_4.1 -> 1:2.02-99.el8_4.9
grub2-tools 1:2.02-99.el8_4.1 -> 1:2.02-99.el8_4.9
grub2-tools-extra 1:2.02-99.el8_4.1 -> 1:2.02-99.el8_4.9
grub2-tools-minimal 1:2.02-99.el8_4.1 -> 1:2.02-99.el8_4.9
kernel 4.18.0-305.45.1.el8_4 -> 4.18.0-305.49.1.el8_4
kernel-core 4.18.0-305.45.1.el8_4 -> 4.18.0-305.49.1.el8_4
kernel-modules 4.18.0-305.45.1.el8_4 -> 4.18.0-305.49.1.el8_4
kernel-modules-extra 4.18.0-305.45.1.el8_4 -> 4.18.0-305.49.1.el8_4
libgcc 8.4.1-1.el8 -> 8.4.1-1.1.el8_4
libgomp 8.4.1-1.el8 -> 8.4.1-1.1.el8_4
libstdc++ 8.4.1-1.el8 -> 8.4.1-1.1.el8_4
mokutil 1:0.3.0-11.el8 -> 1:0.3.0-11.el8_4.1
openshift-clients 4.10.0-202205052127.p0.g04ad1b5.assembly.stream.el8 -> 4.10.0-202206211856.p0.g45460a5.assembly.stream.el8
openvswitch2.15 2.15.0-87.el8fdp -> 2.15.0-99.el8fdp
qemu-guest-agent 15:4.2.0-48.module+el8.4.0+14171+d59d4af6.4 -> 15:4.2.0-49.module+el8.4.0+15174+49839dd8.6
shim-x64 15.4-2.el8_1 -> 15.6-1.el8
systemd 239-45.el8_4.9 -> 239-45.el8_4.10
systemd-journal-remote 239-45.el8_4.9 -> 239-45.el8_4.10
systemd-libs 239-45.el8_4.9 -> 239-45.el8_4.10
systemd-pam 239-45.el8_4.9 -> 239-45.el8_4.10
systemd-udev 239-45.el8_4.9 -> 239-45.el8_4.10
xz 5.2.4-3.el8 -> 5.2.4-4.el8_4
xz-libs 5.2.4-3.el8 -> 5.2.4-4.el8_4
zlib 1.2.11-17.el8 -> 1.2.11-18.el8_4
Run "systemctl reboot" to start a reboot
[root@inf4 ~]# systemctl reboot
[root@inf4 ~]# Connection to 10.32.96.4 closed by remote host.
[core@inf4 ~]$ rpm-ostree status
State: idle
Deployments:
● pivot://quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:e73e648434bcc46d6c8538afb6f958ed2f93bd766768d1d3498e4b9afea369ae
CustomOrigin: Managed by machine-config-operator
Version: 410.84.202206212304-0 (2022-06-21T23:07:15Z)
pivot://quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:16124c9abedf4b8f6f6bfc625efcafdd445610778f2186079981687d49cd03ee
CustomOrigin: Managed by machine-config-operator
Version: 410.84.202205111833-0 (2022-05-11T18:37:05Z)
[core@inf4 ~]$
$ oc get nodes,mcp
NAME STATUS ROLES AGE VERSION
node/inf4.coe.muc.redhat.com Ready master,worker 98d v1.23.5+3afdacb
node/inf5.coe.muc.redhat.com Ready master,worker 98d v1.23.5+3afdacb
node/inf6.coe.muc.redhat.com Ready master,worker 98d v1.23.5+3afdacb
NAME CONFIG UPDATED UPDATING DEGRADED MACHINECOUNT READYMACHINECOUNT UPDATEDMACHINECOUNT DEGRADEDMACHINECOUNT AGE
machineconfigpool.machineconfiguration.openshift.io/master rendered-master-333e27092c98d0c51e65d1ce9e27f6e0 True False False 3 3 3 0 98d
machineconfigpool.machineconfiguration.openshift.io/worker rendered-worker-ff90ae481b4352e9da99a3404870bf72 True False False 0 0 0 0 98d
$
SOLVED, let's get back to networking
$ oc apply -f - <<EOF
> apiVersion: sriovnetwork.openshift.io/v1
kind: SriovNetworkNodePolicy
metadata:
name: policy-x710
namespace: openshift-sriov-network-operator
spec:
resourceName: x710
nodeSelector:
feature.node.kubernetes.io/network-sriov.capable: "true"
numVfs: 16
nicSelector:
vendor: "8086"
deviceID: "1572"
rootDevices:
- 0000:19:00.0
> EOF
sriovnetworknodepolicy.sriovnetwork.openshift.io/policy-x710 created
$
apiVersion: sriovnetwork.openshift.io/v1
kind: SriovNetwork
metadata:
name: x710-coe-infra-vms
namespace: openshift-sriov-network-operator
spec:
resourceName: x710
networkNamespace: coe-infra-vms
Enable intel_iommu
apiVersion: machineconfiguration.openshift.io/v1
kind: MachineConfig
metadata:
labels:
machineconfiguration.openshift.io/role: master
name: 100-master-iommu
spec:
config:
ignition:
version: 3.2.0
kernelArguments:
- intel_iommu=on
Also added the kernel argument iommu=pt.
The VM/pod always gets:
{"component":"virt-launcher","level":"error","msg":"unsupported configuration: host doesn't support passthrough of host PCI devices","pos":"qemuHostdevPreparePCIDevicesCheckSupport:187","subcomponent":"libvirt","thread":"42","timestamp":"2022-07-06T18:33:56.648000Z"}
{"component":"virt-launcher","level":"error","msg":"unsupported configuration: pci backend driver 'default' is not supported","pos":"virHostdevGetPCIHostDevice:253","subcomponent":"libvirt","thread":"42","timestamp":"2022-07-06T18:33:56.648000Z"}
https://kubevirt.io/user-guide/virtual_machines/interfaces_and_networks/#how-to-expose-sr-iov-vfs-to-kubevirt - didn't help.
Let's have a chat with cnv eng. https://coreos.slack.com/archives/C017V3R4M08/p1657132571061319
Hm, I changed my SriovNetworkNodePolicy
from deviceType: netdevice
to deviceType: vfio-pci
Maybe this helps?
based on https://github.com/kubevirt/kubevirt/issues/3035#issuecomment-598077943
The VM booted but doesn't get an IP address; let's disable spoof checking with spoofChk: "off"
apiVersion: sriovnetwork.openshift.io/v1
kind: SriovNetwork
metadata:
name: x710-coe-infra-vms
namespace: openshift-sriov-network-operator
spec:
resourceName: x710
networkNamespace: coe-infra-vms
spoofChk: "off"
More and more, I have come to the conclusion that SR-IOV Virtual Function <-> Linux software bridge will not work,
because all VFs are connected to the PF via an internal switch (VEB/VEPA), and it looks like the MAC addresses from the software bridge (fdb) are not "forwarded" to the internal switch, or the internal switch is not able to learn other MAC addresses.
According to https://downloadmirror.intel.com/732255/readme.txt, it should be possible to read the mac_address and mac_list, e.g. via cat /sys/class/net/p1p2/device/sriov/1/mac_list
Sadly, the device/sriov path is not available on my nodes.
I also found the following statement in several places:
Software bridging does not work with SR-IOV Virtual Functions
SR-IOV Virtual Functions are unable to send or receive traffic between VMs using emulated connections on a Linux Software bridge and connections that use SR-IOV VFs.
For example: https://dl.dell.com/manuals/all-products/esuprt_ser_stor_net/esuprt_pedge_srvr_ethnt_nic/intel-pro-adapters_user's%20guide4_en-us.pdf
Additional useful resources:
At node inf6, I patched a second network cable into a different network with DHCP:
apiVersion: nmstate.io/v1
kind: NodeNetworkConfigurationPolicy
metadata:
name: ucsbr1
spec:
nodeSelector:
kubernetes.io/hostname: inf6.coe.muc.redhat.com
maxUnavailable: 3
desiredState:
interfaces:
- name: ucsbr1
description: Linux bridge with eno2 as a port
type: linux-bridge
state: up
ipv4:
enabled: false
bridge:
options:
stp:
enabled: false
port:
- name: eno2
apiVersion: "k8s.cni.cncf.io/v1"
kind: NetworkAttachmentDefinition
metadata:
name: ucsbr1
annotations:
k8s.v1.cni.cncf.io/resourceName: bridge.network.kubevirt.io/ucsbr1
spec:
config: '{
"cniVersion": "0.3.1",
"name": "ucsbr1",
"type": "cnv-bridge",
"bridge": "ucsbr1"
}'
Attached the VM to ucsbr1; DHCP works like a charm.
Let's get back to plain sr-iov configuration:
apiVersion: sriovnetwork.openshift.io/v1
kind: SriovNetworkNodePolicy
metadata:
name: policy-x710-vfio-pci
namespace: openshift-sriov-network-operator
spec:
deviceType: vfio-pci
nicSelector:
deviceID: "1572"
rootDevices:
- "0000:19:00.0"
vendor: "8086"
nodeSelector:
feature.node.kubernetes.io/network-sriov.capable: "true"
numVfs: 32
resourceName: x710_vfio_pci
And watch:
$ oc get sriovnetworknodestates -n openshift-sriov-network-operator -o custom-columns=NAME:.metadata.name,SYNCSTATUS:.status.syncStatus
NAME SYNCSTATUS
inf4.coe.muc.redhat.com InProgress
inf5.coe.muc.redhat.com InProgress
inf6.coe.muc.redhat.com InProgress
Check SR-IOV Network
$ oc get sriovnetwork/x710-coe-infra-vms -o yaml
apiVersion: sriovnetwork.openshift.io/v1
kind: SriovNetwork
metadata:
annotations:
kubectl.kubernetes.io/last-applied-configuration: |
{"apiVersion":"sriovnetwork.openshift.io/v1","kind":"SriovNetwork","metadata":{"annotations":{},"name":"x710-coe-infra-vms","namespace":"openshift-sriov-network-operator"},"spec":{"networkNamespace":"coe-infra-vms","resourceName":"x710_vfio_pci","spoofChk":"off"}}
operator.sriovnetwork.openshift.io/last-network-namespace: coe-infra-vms
creationTimestamp: "2022-07-06T08:31:00Z"
finalizers:
- netattdef.finalizers.sriovnetwork.openshift.io
generation: 3
name: x710-coe-infra-vms
namespace: openshift-sriov-network-operator
resourceVersion: "161570622"
uid: 0e419974-fcd2-416e-99cb-8e9536c7e7d7
spec:
networkNamespace: coe-infra-vms
resourceName: x710_vfio_pci
spoofChk: "off"
apiVersion: nmstate.io/v1
kind: NodeNetworkConfigurationPolicy
metadata:
name: remove-bridges
spec:
nodeSelector:
feature.node.kubernetes.io/network-sriov.capable: "true"
maxUnavailable: 3
desiredState:
interfaces:
- name: ucsbr1
description: Linux bridge with eno2 as a port
type: linux-bridge
state: absent
- name: coebr1
description: Linux bridge with eno2 as a port
type: linux-bridge
state: absent
Check:
$ oc get nncp,nnce
NAME STATUS
nodenetworkconfigurationpolicy.nmstate.io/remove-bridges Available
NAME STATUS
nodenetworkconfigurationenactment.nmstate.io/inf4.coe.muc.redhat.com.remove-bridges Available
nodenetworkconfigurationenactment.nmstate.io/inf5.coe.muc.redhat.com.remove-bridges Available
nodenetworkconfigurationenactment.nmstate.io/inf6.coe.muc.redhat.com.remove-bridges Available
Delete nncp
$ oc delete nodenetworkconfigurationpolicy.nmstate.io/remove-bridges
nodenetworkconfigurationpolicy.nmstate.io "remove-bridges" deleted
$ oc delete network-attachment-definitions.k8s.cni.cncf.io -n coe-infra-vms coebr1 second ucsbr1
networkattachmentdefinition.k8s.cni.cncf.io "coebr1" deleted
networkattachmentdefinition.k8s.cni.cncf.io "second" deleted
networkattachmentdefinition.k8s.cni.cncf.io "ucsbr1" deleted
MetalLB installed; works like a charm.
Added an nmstate config to the COE OCP cluster to provide network access to VMs.