mcornea opened this issue 5 years ago
/cc @phoracek
@mcornea the OpenShift release may be outdated; @celebdor pushed some changes to machineconfig and MCO to fix keepalived. On your setup it is likely outdated and therefore does not configure the keepalived IPs on the new default interface.
I'm using the version provided by a fresh deployment with install-scripts. How can we update the version to get the deployment back in a working state?
Please check with crictl whether you have a keepalived-monitor container.
It is there:
[kni@rhhi-node-worker-0 ~]$ for master in {0..2}; do ssh core@rhhi-node-master-$master 'sudo crictl ps | grep keepalived-monitor';done
dd5a6585e9133 32c724f01ed6fafe3daf62eabd0151413696e27229bd7f59e0ca749fe0d529ce About an hour ago Running keepalived-monitor 1 4d3d37d44d982
cea8db649fa84 32c724f01ed6fafe3daf62eabd0151413696e27229bd7f59e0ca749fe0d529ce About an hour ago Running keepalived-monitor 1 ad4f8e58aa58f
1abadaa14931e 32c724f01ed6fafe3daf62eabd0151413696e27229bd7f59e0ca749fe0d529ce About an hour ago Running keepalived-monitor 1 6a637becec22d
Logs from one of the keepalived-monitor containers:
Note that 192.168.123.1 is the default router; there are no etcd SRV records configured there.
[root@rhhi-node-master-1 core]# cat /etc/resolv.conf
# Generated by NetworkManager
search rhhi-virt-cluster.qe.lab.redhat.com
nameserver 192.168.123.6
nameserver 192.168.123.1
[root@rhhi-node-master-1 core]# crictl logs -f cea8db649fa84
time="2019-09-23T15:07:48Z" level=info msg="Config change detected" new config="{{rhhi-virt-cluster qe.lab.redhat.com 192.168.123.5 99 192.168.123.6 94 192.168.123.10 68 24 0} {0 0 0 [{etcd-2.rhhi-virt-cluster.qe.lab.redhat.com. 192.168.123.114 0} {etcd-0.rhhi-virt-cluster.qe.lab.redhat.com. 192.168.123.126 0}]} 192.168.123.130 rhhi-node-master-1 rhhi-node-etcd-1 ens4 [192.168.123.1]}"
time="2019-09-23T15:07:48Z" level=info msg="Runtimecfg rendering template" path=/etc/keepalived/keepalived.conf
time="2019-09-23T15:21:59Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:21:59Z" level=info msg="Config change detected" new config="{{rhhi-virt-cluster qe.lab.redhat.com 192.168.123.5 99 192.168.123.6 94 192.168.123.10 68 24 0} {0 0 0 []} 192.168.123.130 rhhi-node-master-1 rhhi-node-etcd-1 brext [192.168.123.1]}"
time="2019-09-23T15:21:59Z" level=info msg="Runtimecfg rendering template" path=/etc/keepalived/keepalived.conf
time="2019-09-23T15:22:19Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:22:39Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:22:59Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:23:19Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:23:39Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:23:59Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:24:19Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:24:39Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:24:59Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:25:19Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:25:39Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:25:59Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:26:19Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:26:39Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:26:59Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:27:19Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:27:39Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:27:59Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:28:19Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:28:39Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:28:59Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:29:19Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:29:39Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:29:59Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:30:19Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:30:39Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:30:59Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:31:19Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:31:39Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:31:59Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:32:19Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:32:39Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:32:59Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:33:19Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:33:39Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:33:59Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:34:19Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:34:39Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:34:59Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:35:19Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:35:39Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:35:59Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:36:19Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:36:39Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:36:59Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:37:19Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:37:39Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:37:59Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:38:19Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:38:39Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:38:59Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:39:19Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:39:39Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:39:59Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:40:19Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:40:39Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:40:59Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:41:19Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:41:39Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:41:59Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:42:19Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:42:39Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:42:59Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:43:19Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:43:39Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:43:59Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:44:19Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:44:39Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:44:59Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:45:19Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:45:39Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:45:59Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:46:19Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:46:39Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:46:59Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:47:19Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:47:39Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:48:00Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:48:20Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:48:40Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:49:00Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:49:20Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:49:40Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:50:00Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:50:20Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:50:40Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:51:00Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:51:20Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:51:40Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:52:00Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:52:20Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:52:40Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:53:00Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:53:20Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:53:40Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:54:00Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:54:20Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:54:40Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:55:00Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:55:20Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:55:40Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:56:00Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:56:20Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:56:40Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:57:00Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:57:20Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:57:40Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:58:00Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:58:20Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:58:40Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:59:00Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:59:20Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T15:59:40Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T16:00:00Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T16:00:20Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T16:00:40Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T16:01:00Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T16:01:20Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
time="2019-09-23T16:01:40Z" level=info msg="Failed to get Etcd SRV members" err="lookup _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com on 192.168.123.1:53: no such host"
Apart from the DNS issue, which should not affect the keepalived functionality, it seems the change was detected. Can you confirm that all masters have the IPs on brext and that /etc/keepalived/keepalived.conf points to brext?
Are those master IPs correct? The same as before installing CNV?
Yes, IPs look correct:
[kni@rhhi-node-worker-0 ~]$ for master in {0..2}; do dig +short rhhi-node-master-$master; ssh core@rhhi-node-master-$master 'ip a s dev brext; sudo cat /etc/keepalived/keepalived.conf'; echo '########################';done
192.168.123.126
71: brext: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 52:54:00:dd:42:24 brd ff:ff:ff:ff:ff:ff
inet 192.168.123.126/24 brd 192.168.123.255 scope global dynamic noprefixroute brext
valid_lft 3098sec preferred_lft 3098sec
inet6 fe80::97a2:f95:f7da:a30a/64 scope link noprefixroute
valid_lft forever preferred_lft forever
vrrp_script chk_ocp {
script "/usr/bin/curl -o /dev/null -kLs https://0:6443/readyz"
interval 1
weight 50
}
vrrp_script chk_dns {
script "/usr/bin/host -t SRV _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com localhost"
interval 1
weight 50
}
# TODO: Improve this check. The port is assumed to be alive.
# Need to assess what is the ramification if the port is not there.
vrrp_script chk_ingress {
script "/usr/bin/curl -o /dev/null -kLs http://0:1936/healthz"
interval 1
weight 50
}
vrrp_instance rhhi-virt-cluster_API {
state BACKUP
interface brext
virtual_router_id 99
priority 40
advert_int 1
authentication {
auth_type PASS
auth_pass rhhi-virt-cluster_api_vip
}
virtual_ipaddress {
192.168.123.5/24
}
track_script {
chk_ocp
}
}
vrrp_instance rhhi-virt-cluster_DNS {
state BACKUP
interface brext
virtual_router_id 94
priority 40
advert_int 1
authentication {
auth_type PASS
auth_pass rhhi-virt-cluster_dns_vip
}
virtual_ipaddress {
192.168.123.6/24
}
track_script {
chk_dns
}
}
vrrp_instance rhhi-virt-cluster_INGRESS {
state BACKUP
interface brext
virtual_router_id 68
priority 40
advert_int 1
authentication {
auth_type PASS
auth_pass cluster_uuid_ingress_vip
}
virtual_ipaddress {
192.168.123.10/24
}
track_script {
chk_ingress
}
}
########################
192.168.123.130
61: brext: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 52:54:00:bf:3f:46 brd ff:ff:ff:ff:ff:ff
inet 192.168.123.130/24 brd 192.168.123.255 scope global dynamic noprefixroute brext
valid_lft 3182sec preferred_lft 3182sec
inet6 fe80::fb17:8f64:42ab:5263/64 scope link noprefixroute
valid_lft forever preferred_lft forever
vrrp_script chk_ocp {
script "/usr/bin/curl -o /dev/null -kLs https://0:6443/readyz"
interval 1
weight 50
}
vrrp_script chk_dns {
script "/usr/bin/host -t SRV _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com localhost"
interval 1
weight 50
}
# TODO: Improve this check. The port is assumed to be alive.
# Need to assess what is the ramification if the port is not there.
vrrp_script chk_ingress {
script "/usr/bin/curl -o /dev/null -kLs http://0:1936/healthz"
interval 1
weight 50
}
vrrp_instance rhhi-virt-cluster_API {
state BACKUP
interface brext
virtual_router_id 99
priority 40
advert_int 1
authentication {
auth_type PASS
auth_pass rhhi-virt-cluster_api_vip
}
virtual_ipaddress {
192.168.123.5/24
}
track_script {
chk_ocp
}
}
vrrp_instance rhhi-virt-cluster_DNS {
state BACKUP
interface brext
virtual_router_id 94
priority 40
advert_int 1
authentication {
auth_type PASS
auth_pass rhhi-virt-cluster_dns_vip
}
virtual_ipaddress {
192.168.123.6/24
}
track_script {
chk_dns
}
}
vrrp_instance rhhi-virt-cluster_INGRESS {
state BACKUP
interface brext
virtual_router_id 68
priority 40
advert_int 1
authentication {
auth_type PASS
auth_pass cluster_uuid_ingress_vip
}
virtual_ipaddress {
192.168.123.10/24
}
track_script {
chk_ingress
}
}
########################
192.168.123.114
70: brext: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 52:54:00:b7:79:20 brd ff:ff:ff:ff:ff:ff
inet 192.168.123.114/24 brd 192.168.123.255 scope global dynamic noprefixroute brext
valid_lft 3229sec preferred_lft 3229sec
inet6 fe80::d0a2:6ba6:24ea:514a/64 scope link noprefixroute
valid_lft forever preferred_lft forever
vrrp_script chk_ocp {
script "/usr/bin/curl -o /dev/null -kLs https://0:6443/readyz"
interval 1
weight 50
}
vrrp_script chk_dns {
script "/usr/bin/host -t SRV _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com localhost"
interval 1
weight 50
}
# TODO: Improve this check. The port is assumed to be alive.
# Need to assess what is the ramification if the port is not there.
vrrp_script chk_ingress {
script "/usr/bin/curl -o /dev/null -kLs http://0:1936/healthz"
interval 1
weight 50
}
vrrp_instance rhhi-virt-cluster_API {
state BACKUP
interface brext
virtual_router_id 99
priority 40
advert_int 1
authentication {
auth_type PASS
auth_pass rhhi-virt-cluster_api_vip
}
virtual_ipaddress {
192.168.123.5/24
}
track_script {
chk_ocp
}
}
vrrp_instance rhhi-virt-cluster_DNS {
state BACKUP
interface brext
virtual_router_id 94
priority 40
advert_int 1
authentication {
auth_type PASS
auth_pass rhhi-virt-cluster_dns_vip
}
virtual_ipaddress {
192.168.123.6/24
}
track_script {
chk_dns
}
}
vrrp_instance rhhi-virt-cluster_INGRESS {
state BACKUP
interface brext
virtual_router_id 68
priority 40
advert_int 1
authentication {
auth_type PASS
auth_pass cluster_uuid_ingress_vip
}
virtual_ipaddress {
192.168.123.10/24
}
track_script {
chk_ingress
}
}
########################
Can we have the logs for the keepalived containers?
The reload and reconfiguration happened. For some reason the health checks all fail though. I would need access to one such deployment.
curl: (7) Failed to connect to 0 port 6443: Connection refused
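If it helps, the three tracked checks from keepalived.conf above can be run by hand on a master to see which one fails (a sketch; the commands are copied from the vrrp_script definitions):

```shell
# chk_ocp: apiserver readiness on the node
curl -o /dev/null -w 'readyz:  %{http_code}\n' -kLs https://0:6443/readyz
# chk_dns: etcd SRV lookup through the local resolver
host -t SRV _etcd-server-ssl._tcp.rhhi-virt-cluster.qe.lab.redhat.com localhost
# chk_ingress: router health endpoint
curl -o /dev/null -w 'healthz: %{http_code}\n' -kLs http://0:1936/healthz
```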
I have another env with the same issues. As stated by @celebdor, we tested healthz on the node and it's working fine:
[core@rhhi-node-master-0 ~]$ curl -k https://0.0.0.0:6443/healthz
ok[core@rhhi-node-master-0 ~]$
Looks like the kube-apiserver container is not running; attaching the kube-apiserver container log: kubeapiserver.log
On the other env these are the logs from keepalived and keepalived-monitor for master-0: keepalived-monitor.log keepalived.log
I see NetworkManager is re-applying the DHCP config (could this end up overwriting the API VIP set by keepalived?):
Sep 23 17:08:47 rhhi-node-master-0 systemd[1]: NetworkManager-dispatcher.service: Consumed 33ms CPU time
Sep 23 17:23:22 rhhi-node-master-0 NetworkManager[1074]: <info> [1569259402.5646] audit: op="checkpoint-create" arg="/org/freedesktop/NetworkManager/Checkpoint/23" pid=11140 uid=0 result="success"
Sep 23 17:23:22 rhhi-node-master-0 NetworkManager[1074]: <info> [1569259402.5802] settings-connection[0x562eebb6c210,92dc629b-fc4e-3641-9fd2-1b5e0cdb107c]: write: successfully updated (ifcfg-rh: update /etc/sysconfig/network-scripts/ifcfg-Wired_connection_2)
Sep 23 17:23:22 rhhi-node-master-0 NetworkManager[1074]: <info> [1569259402.5805] audit: op="connection-update" uuid="92dc629b-fc4e-3641-9fd2-1b5e0cdb107c" name="Wired connection 2" pid=11140 uid=0 result="success"
Sep 23 17:23:22 rhhi-node-master-0 NetworkManager[1074]: <info> [1569259402.5890] settings-connection[0x562eebb6c4d0,680ae8d5-4701-4578-97b9-b531263ca043]: write: successfully updated (ifcfg-rh: update /etc/sysconfig/network-scripts/ifcfg-brext)
Sep 23 17:23:22 rhhi-node-master-0 NetworkManager[1074]: <info> [1569259402.5895] audit: op="connection-update" uuid="680ae8d5-4701-4578-97b9-b531263ca043" name="brext" pid=11140 uid=0 result="success"
Sep 23 17:23:22 rhhi-node-master-0 NetworkManager[1074]: <info> [1569259402.6034] audit: op="device-reapply" interface="brext" ifindex=60 pid=11140 uid=0 result="success"
Sep 23 17:23:22 rhhi-node-master-0 NetworkManager[1074]: <info> [1569259402.6185] audit: op="device-reapply" interface="ens4" ifindex=3 pid=11140 uid=0 result="success"
Sep 23 17:23:22 rhhi-node-master-0 NetworkManager[1074]: <info> [1569259402.8263] checkpoint[0x562eebad05f0]: destroy /org/freedesktop/NetworkManager/Checkpoint/23
Sep 23 17:23:22 rhhi-node-master-0 NetworkManager[1074]: <info> [1569259402.8272] audit: op="checkpoint-destroy" arg="/org/freedesktop/NetworkManager/Checkpoint/23" pid=11140 uid=0 result="success"
Sep 23 17:32:15 rhhi-node-master-0 NetworkManager[1074]: <info> [1569259935.0428] dhcp4 (brext): address 192.168.123.114
Sep 23 17:32:15 rhhi-node-master-0 NetworkManager[1074]: <info> [1569259935.0429] dhcp4 (brext): plen 24 (255.255.255.0)
Sep 23 17:32:15 rhhi-node-master-0 NetworkManager[1074]: <info> [1569259935.0429] dhcp4 (brext): gateway 192.168.123.1
Sep 23 17:32:15 rhhi-node-master-0 NetworkManager[1074]: <info> [1569259935.0429] dhcp4 (brext): lease time 3600
Sep 23 17:32:15 rhhi-node-master-0 NetworkManager[1074]: <info> [1569259935.0430] dhcp4 (brext): hostname 'rhhi-node-master-0'
Sep 23 17:32:15 rhhi-node-master-0 NetworkManager[1074]: <info> [1569259935.0430] dhcp4 (brext): nameserver '192.168.123.6'
Sep 23 17:32:15 rhhi-node-master-0 NetworkManager[1074]: <info> [1569259935.0430] dhcp4 (brext): nameserver '192.168.123.1'
Sep 23 17:32:15 rhhi-node-master-0 NetworkManager[1074]: <info> [1569259935.0430] dhcp4 (brext): domain name 'rhhi-virt-cluster.qe.lab.redhat.com'
Sep 23 17:32:15 rhhi-node-master-0 NetworkManager[1074]: <info> [1569259935.0430] dhcp (brext): domain search 'rhhi-virt-cluster.qe.lab.redhat.com.'
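One way to confirm that a NetworkManager re-apply is what removes the VIP would be to watch address events on the bridge while it happens (a sketch, not something that was run in this thread):

```shell
# print address add/delete events for brext; a "Deleted ... 192.168.123.5/24" event
# right after an NM device-reapply would confirm the suspicion above
sudo ip monitor address | grep --line-buffered brext
```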
Note: the VIPs get restored after restarting the keepalived containers manually:
[kni@rhhi-node-worker-0 ~]$ for node in {0..2}; do ssh core@rhhi-node-master-$node 'ip a s dev brext';done
79: brext: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 52:54:00:12:e9:c6 brd ff:ff:ff:ff:ff:ff
inet 192.168.123.128/24 brd 192.168.123.255 scope global dynamic noprefixroute brext
valid_lft 3364sec preferred_lft 3364sec
inet6 fe80::dd60:989b:2f05:3624/64 scope link noprefixroute
valid_lft forever preferred_lft forever
65: brext: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 52:54:00:af:13:e4 brd ff:ff:ff:ff:ff:ff
inet 192.168.123.139/24 brd 192.168.123.255 scope global dynamic noprefixroute brext
valid_lft 3492sec preferred_lft 3492sec
inet6 fe80::a7ca:cd27:4250:7509/64 scope link noprefixroute
valid_lft forever preferred_lft forever
59: brext: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 52:54:00:15:1c:0c brd ff:ff:ff:ff:ff:ff
inet 192.168.123.127/24 brd 192.168.123.255 scope global dynamic noprefixroute brext
valid_lft 3234sec preferred_lft 3234sec
inet6 fe80::1d74:af19:57ff:de1d/64 scope link noprefixroute
valid_lft forever preferred_lft forever
[kni@rhhi-node-worker-0 ~]$ for node in {0..2}; do ssh core@rhhi-node-master-$node 'KEEPALIVED_CONTAINER_ID=$(sudo crictl ps | grep "keepalived " | cut -f1 -d " "); sudo crictl stop $KEEPALIVED_CONTAINER_ID';done
c6d5645ebfc19
bdde4b2cca0ed
9a4c3b75f28c2
[kni@rhhi-node-worker-0 ~]$ for node in {0..2}; do ssh core@rhhi-node-master-$node 'ip a s dev brext';done
79: brext: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 52:54:00:12:e9:c6 brd ff:ff:ff:ff:ff:ff
inet 192.168.123.128/24 brd 192.168.123.255 scope global dynamic noprefixroute brext
valid_lft 3256sec preferred_lft 3256sec
inet 192.168.123.5/24 scope global secondary brext
valid_lft forever preferred_lft forever
inet 192.168.123.6/24 scope global secondary brext
valid_lft forever preferred_lft forever
inet 192.168.123.10/24 scope global secondary brext
valid_lft forever preferred_lft forever
inet6 fe80::dd60:989b:2f05:3624/64 scope link noprefixroute
valid_lft forever preferred_lft forever
65: brext: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 52:54:00:af:13:e4 brd ff:ff:ff:ff:ff:ff
inet 192.168.123.139/24 brd 192.168.123.255 scope global dynamic noprefixroute brext
valid_lft 3383sec preferred_lft 3383sec
inet6 fe80::a7ca:cd27:4250:7509/64 scope link noprefixroute
valid_lft forever preferred_lft forever
59: brext: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 52:54:00:15:1c:0c brd ff:ff:ff:ff:ff:ff
inet 192.168.123.127/24 brd 192.168.123.255 scope global dynamic noprefixroute brext
valid_lft 3126sec preferred_lft 3126sec
inet6 fe80::1d74:af19:57ff:de1d/64 scope link noprefixroute
valid_lft forever preferred_lft forever
Maybe there is a race condition between kubernetes-nmstate and keepalived, since both are configuring the interface, and restarting keepalived overrides it again.
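To check that hypothesis, the device-reapply events could be correlated with the keepalived container log on the same node (a sketch, assuming the same container naming as in the outputs above):

```shell
# timestamps of NetworkManager re-applies on brext
sudo journalctl -u NetworkManager | grep 'device-reapply.*interface="brext"'
# recent keepalived log lines to compare against
sudo crictl logs "$(sudo crictl ps | awk '/ keepalived /{print $1; exit}')" 2>&1 | tail -n 30
```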
On the other environment restarting keepalived works too.
I have also restarted the three master nodes and it goes back to normal:
[kni@rhhi-node-worker-0 ~]$ for node in {0..2}; do ssh core@rhhi-node-master-$node 'ip a s dev brext';done
4: brext: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 52:54:00:61:1e:9f brd ff:ff:ff:ff:ff:ff
inet 192.168.123.114/24 brd 192.168.123.255 scope global dynamic noprefixroute brext
valid_lft 3398sec preferred_lft 3398sec
inet 192.168.123.5/24 scope global secondary brext
valid_lft forever preferred_lft forever
inet 192.168.123.6/24 scope global secondary brext
valid_lft forever preferred_lft forever
inet 192.168.123.10/24 scope global secondary brext
valid_lft forever preferred_lft forever
inet6 fe80::a4e:dfe8:e84d:fb49/64 scope link noprefixroute
valid_lft forever preferred_lft forever
4: brext: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 52:54:00:82:d2:66 brd ff:ff:ff:ff:ff:ff
inet 192.168.123.138/24 brd 192.168.123.255 scope global dynamic noprefixroute brext
valid_lft 3398sec preferred_lft 3398sec
inet6 fe80::3e1b:54c3:7cfa:afdb/64 scope link noprefixroute
valid_lft forever preferred_lft forever
4: brext: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 52:54:00:53:d9:92 brd ff:ff:ff:ff:ff:ff
inet 192.168.123.129/24 brd 192.168.123.255 scope global dynamic noprefixroute brext
valid_lft 3393sec preferred_lft 3393sec
inet6 fe80::acf5:3879:7fb9:5c80/64 scope link noprefixroute
valid_lft forever preferred_lft forever
After some time the VIPs are gone (I suspect kubernetes-nmstate or NetworkManager is re-applying and overriding the brext config done by keepalived):
[kni@rhhi-node-worker-0 ~]$ oc get pod --all-namespaces
^C
[kni@rhhi-node-worker-0 ~]$ for node in {0..2}; do ssh core@rhhi-node-master-$node 'ip a s dev brext';done
4: brext: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 52:54:00:61:1e:9f brd ff:ff:ff:ff:ff:ff
inet 192.168.123.114/24 brd 192.168.123.255 scope global dynamic noprefixroute brext
valid_lft 1975sec preferred_lft 1975sec
inet6 fe80::a4e:dfe8:e84d:fb49/64 scope link noprefixroute
valid_lft forever preferred_lft forever
4: brext: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 52:54:00:82:d2:66 brd ff:ff:ff:ff:ff:ff
inet 192.168.123.138/24 brd 192.168.123.255 scope global dynamic noprefixroute brext
valid_lft 3332sec preferred_lft 3332sec
inet6 fe80::3e1b:54c3:7cfa:afdb/64 scope link noprefixroute
valid_lft forever preferred_lft forever
4: brext: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 52:54:00:53:d9:92 brd ff:ff:ff:ff:ff:ff
inet 192.168.123.129/24 brd 192.168.123.255 scope global dynamic noprefixroute brext
valid_lft 1971sec preferred_lft 1971sec
inet6 fe80::acf5:3879:7fb9:5c80/64 scope link noprefixroute
valid_lft forever preferred_lft forever
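Since oc hangs while the API VIP is gone, the apiserver can still be reached on one of the master node IPs directly to confirm the control plane itself is up (a sketch, not something run in the thread; the IP is one of the master addresses above):

```shell
# bypass the missing API VIP by talking to a master's node IP directly
oc --server=https://192.168.123.114:6443 --insecure-skip-tls-verify=true get nodes
```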
I also see some ifcfg scripts there for brext; are those supposed to be gone?
Sep 24 08:33:00 rhhi-node-master-0 NetworkManager[1248]: <info> [1569313980.3588] settings-connection[0x55bfea5e2250,680ae8d5-4701-4578-97b9-b531263ca043]: write: successfully updated (ifcfg-rh: update /etc/sysconfig/network-scripts/ifcfg-brext)
@qinqon does the loss of IP happen immediately after the configuration or only after some time? I'm hoping the issue is caused by the missing ip command in the image. That may be causing re-reconciliation and removal of the IP (maybe?). ifcfg is created by NetworkManager for backward compatibility AFAIK.
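A quick way to check whether the ip binary is present in the handler image would be (a sketch, assuming a shell is available inside the container):

```shell
# look for the ip binary inside the running nmstate-handler container
HANDLER=$(sudo crictl ps | awk '/nmstate-handler/{print $1; exit}')
sudo crictl exec "$HANDLER" sh -c 'command -v ip || echo "ip not found in image"'
```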
Well, it happens after the install scripts finish, and that can be some time after the configuration.
Another issue is that we don't have API connectivity until keepalived is up; I don't know if this is an issue.
@qinqon that is expected. keepalived IPs are used for API connection.
After fixing our kubernetes-nmstate-handler and installing the ip command there, the API and INGRESS VIPs are present and stay steady after restarting keepalived, but the DNS VIP is not there.
@celebdor do you know if there is any issue with the DNS VIP?
Note: the VIPs are lost after restarting nmstate-handler container:
[root@rhhi-node-master-0 core]# ip a s dev brext
79: brext: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 52:54:00:12:e9:c6 brd ff:ff:ff:ff:ff:ff
inet 192.168.123.128/24 brd 192.168.123.255 scope global dynamic noprefixroute brext
valid_lft 3228sec preferred_lft 3228sec
inet 192.168.123.5/24 scope global secondary brext
valid_lft forever preferred_lft forever
inet 192.168.123.6/24 scope global secondary brext
valid_lft forever preferred_lft forever
inet6 fe80::dd60:989b:2f05:3624/64 scope link noprefixroute
valid_lft forever preferred_lft forever
[root@rhhi-node-master-0 core]# crictl ps | grep nmstate-handler
3ca5491b1d19a 1e9b425297396fc41fc6d79cf2f523ab85bfc429840bca314c04376dc5fd26f6 2 minutes ago Running nmstate-handler 1 036c980c7d2bc
[root@rhhi-node-master-0 core]# crictl stop $(crictl ps | awk '/nmstate-handler/ {print $1}')
3ca5491b1d19a
[root@rhhi-node-master-0 core]# crictl ps | grep nmstate-handler
[root@rhhi-node-master-0 core]# crictl ps | grep nmstate-handler
[root@rhhi-node-master-0 core]# crictl ps | grep nmstate-handler
[root@rhhi-node-master-0 core]# ip a s dev brext
79: brext: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 52:54:00:12:e9:c6 brd ff:ff:ff:ff:ff:ff
inet 192.168.123.128/24 brd 192.168.123.255 scope global dynamic noprefixroute brext
valid_lft 3191sec preferred_lft 3191sec
inet 192.168.123.5/24 scope global secondary brext
valid_lft forever preferred_lft forever
inet 192.168.123.6/24 scope global secondary brext
valid_lft forever preferred_lft forever
inet6 fe80::dd60:989b:2f05:3624/64 scope link noprefixroute
valid_lft forever preferred_lft forever
[root@rhhi-node-master-0 core]# crictl ps | grep nmstate-handler
[root@rhhi-node-master-0 core]# crictl ps | grep nmstate-handler
[root@rhhi-node-master-0 core]# ip a s dev brext
79: brext: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 52:54:00:12:e9:c6 brd ff:ff:ff:ff:ff:ff
inet 192.168.123.128/24 brd 192.168.123.255 scope global dynamic noprefixroute brext
valid_lft 3186sec preferred_lft 3186sec
inet 192.168.123.5/24 scope global secondary brext
valid_lft forever preferred_lft forever
inet 192.168.123.6/24 scope global secondary brext
valid_lft forever preferred_lft forever
inet6 fe80::dd60:989b:2f05:3624/64 scope link noprefixroute
valid_lft forever preferred_lft forever
[root@rhhi-node-master-0 core]# crictl ps | grep nmstate-handler
[root@rhhi-node-master-0 core]# crictl ps | grep nmstate-handler
[root@rhhi-node-master-0 core]# crictl ps | grep nmstate-handler
ecf6bc9dd9b67 1e9b425297396fc41fc6d79cf2f523ab85bfc429840bca314c04376dc5fd26f6 Less than a second ago Running nmstate-handler 2 036c980c7d2bc
[root@rhhi-node-master-0 core]# ip a s dev brext
79: brext: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 52:54:00:12:e9:c6 brd ff:ff:ff:ff:ff:ff
inet 192.168.123.128/24 brd 192.168.123.255 scope global dynamic noprefixroute brext
valid_lft 3181sec preferred_lft 3181sec
inet6 fe80::dd60:989b:2f05:3624/64 scope link noprefixroute
valid_lft forever preferred_lft forever
@mcornea please wait for the new build of kubernetes-nmstate (we are having some troubles making a release, I will notify you once it is done)
We have deployed with the fixed kubernetes-nmstate-handler container and configure-network.sh succeeded.
And brext on the nodes is all right:
[kni@rhhi-node-worker-0 CNV]$ for node in {0..2}; do ssh core@rhhi-node-master-$node 'ip a s dev brext';done
130: brext: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 52:54:00:f8:a8:bb brd ff:ff:ff:ff:ff:ff
inet 192.168.123.121/24 brd 192.168.123.255 scope global dynamic noprefixroute brext
valid_lft 3209sec preferred_lft 3209sec
inet6 fe80::5be8:7e7f:822:d886/64 scope link noprefixroute
valid_lft forever preferred_lft forever
64: brext: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 52:54:00:02:81:3b brd ff:ff:ff:ff:ff:ff
inet 192.168.123.131/24 brd 192.168.123.255 scope global dynamic noprefixroute brext
valid_lft 3210sec preferred_lft 3210sec
inet 192.168.123.6/24 scope global secondary brext
valid_lft forever preferred_lft forever
inet 192.168.123.5/24 scope global secondary brext
valid_lft forever preferred_lft forever
inet6 fe80::973b:d980:79bd:3a80/64 scope link noprefixroute
valid_lft forever preferred_lft forever
75: brext: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 52:54:00:1d:7c:6e brd ff:ff:ff:ff:ff:ff
inet 192.168.123.114/24 brd 192.168.123.255 scope global dynamic noprefixroute brext
valid_lft 3212sec preferred_lft 3212sec
inet 192.168.123.10/24 scope global secondary brext
valid_lft forever preferred_lft forever
inet6 fe80::778c:b57d:ee95:1cb5/64 scope link noprefixroute
valid_lft forever preferred_lft forever
The new CNV version with nmstate fixed should be out.
Can we close this?
It is not tested yet.
The API VIP is lost after the CNV/configure-network.sh script gets run:
11:15:01 TASK [Run CNV make] **** echo 'Creating registry secret'", "+ cat", "+ oc create -f -", "+ echo 'Creating OperatorGroup'", "+ cat", "+ oc create -f -", "+ echo 'Creating OperatorSource'", "+ cat", "+ oc create -f -", "+ echo 'Give the cluster 30 seconds to create the catalogSourceConfig...'", "+ sleep 30", "+ cat", "+ oc apply -f -", "+ echo 'Give the cluster 30 seconds to process catalogSourceConfig...'", "+ sleep 30", "+ oc wait deploy hco-catalogsource-config --for condition=available -n openshift-marketplace --timeout=360s", "++ seq 1 10", "+ for i in $(seq 1 $RETRIES)", "+ echo 'Waiting for packagemanifest '\''kubevirt-hyperconverged'\'' to be created in namespace '\''openshift-cnv'\''...'", "+ oc get packagemanifest -n openshift-cnv kubevirt-hyperconverged", "+ break", "+ echo 'Creating Subscription'", "+ cat", "+ oc create -f -", "+ echo 'Give OLM 60 seconds to process the subscription...'", "+ sleep 60", "+ sed 's/approved: false/approved: true/'", "+ oc apply -n openshift-cnv -f -", "++ oc get installplan -n openshift-cnv --no-headers", "++ grep kubevirt-hyperconverged-operator.v2.1.0", "++ awk '{print $1}'", "+ oc get installplan -o yaml -n openshift-cnv install-d8857", "Warning: oc apply should be used on resource created by either oc create --save-config or oc apply", "+ echo 'Give OLM 60 seconds to process the installplan...'", "+ sleep 60", "++ oc get pods -n openshift-cnv", "++ grep hco-operator", "++ head -1", "++ awk '{ print $1 }'", "+ oc wait pod hco-operator-76f448c546-z7tl4 --for condition=Ready -n openshift-cnv --timeout=360s", "+ echo 'Creating the HCO'\''s Custom Resource'", "+ cat", "+ oc create -f -", "+ echo 'Waiting for HCO to get fully deployed'", "+ oc wait -n openshift-cnv hyperconverged hyperconverged-cluster --for condition=Available --timeout=10m", "++ grep machineCIDR ../OpenShift/install-config.yaml", "++ sed 's/\(.\): \(.\)/\2/'", "+ MACHINE_CIDR=192.168.123.0/24", "+ BRIDGE_NAME=brext", "+ export KUBECONFIG=../OpenShift/ocp/auth/kubeconfig", "+ KUBECONFIG=../OpenShift/ocp/auth/kubeconfig", "++ oc get nodes -o 'jsonpath={range .items[]}{.metadata.name} {end}'", "+ nodes='rhhi-node-master-0 rhhi-node-master-1 rhhi-node-master-2 '", "+ echo 'Configuring networks on nodes'", "+ for node in $nodes", "+ echo 'Detecting the default interface'", "++ oc get nodenetworkstate rhhi-node-master-0 -o 'jsonpath={.status.currentState.routes.running[?(@.destination==\"192.168.123.0/24\")].next-hop-interface}'", "++ cut -d ' ' -f 1", "+ default_iface=ens4", "+ '[' ens4 == brext ']'", "+ echo 'Detecting MAC address of the default interface'", "++ oc get nodenetworkstate rhhi-node-master-0 -o 'jsonpath={.status.currentState.interfaces[?(@.name==\"ens4\")].mac-address}'", "+ default_iface_mac=52:54:00:DD:42:24", "+ echo 'Applying node network configuration policy'", "+ cat", "+ oc apply -f -", "+ for node in $nodes", "+ echo 'Detecting the default interface'", "++ oc get nodenetworkstate rhhi-node-master-1 -o 'jsonpath={.status.currentState.routes.running[?(@.destination==\"192.168.123.0/24\")].next-hop-interface}'", "++ cut -d ' ' -f 1", "+ default_iface=ens4", "+ '[' ens4 == brext ']'", "+ echo 'Detecting MAC address of the default interface'", "++ oc get nodenetworkstate rhhi-node-master-1 -o 'jsonpath={.status.currentState.interfaces[?(@.name==\"ens4\")].mac-address}'", "+ default_iface_mac=52:54:00:BF:3F:46", "+ echo 'Applying node network configuration policy'", "+ cat", "+ oc apply -f -", "+ for node in $nodes", "+ echo 'Detecting the default interface'", 
"++ oc get nodenetworkstate rhhi-node-master-2 -o 'jsonpath={.status.currentState.routes.running[?(@.destination==\"192.168.123.0/24\")].next-hop-interface}'", "++ cut -d ' ' -f 1", "+ default_iface=ens4", "+ '[' ens4 == brext ']'", "+ echo 'Detecting MAC address of the default interface'", "++ oc get nodenetworkstate rhhi-node-master-2 -o 'jsonpath={.status.currentState.interfaces[?(@.name==\"ens4\")].mac-address}'", "+ default_iface_mac=52:54:00:B7:79:20", "+ echo 'Applying node network configuration policy'", "+ cat", "+ oc apply -f -", "+ echo 'Waiting until the configuration is done, it may take up to 5 minutes until keepalived gets reconfigured'", "+ for node in $nodes", "++ oc get nodenetworkstate rhhi-node-master-0 -o 'jsonpath={.status.currentState.routes.running[?(@.destination==\"192.168.123.0/24\")].next-hop-interface}'", "+ '[' 'ens4 ens4' == brext ']'", "+ sleep 10", "++ oc get nodenetworkstate rhhi-node-master-0 -o 'jsonpath={.status.currentState.routes.running[?(@.destination==\"192.168.123.0/24\")].next-hop-interface}'", "+ '[' brext == brext ']'", "+ oc wait node rhhi-node-master-0 --for condition=Ready --timeout=10m", "Unable to connect to the server: dial tcp 192.168.123.5:6443: connect: no route to host", "make[1]: [Makefile:9: deploy] Error 1", "make: [Makefile:25: CNV] Error 2"], "stdout": "", "stdout_lines": []}
Checking the brext bridges on the master nodes, none of them has the API VIP:
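(For reference, such a check could look like the following; a sketch, not the actual command or output from the thread.)

```shell
# confirm whether any master currently holds the API VIP 192.168.123.5
for node in {0..2}; do
  ssh core@rhhi-node-master-$node 'hostname; ip -4 addr show dev brext | grep 192.168.123.5 || echo "  no API VIP"'
done
```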