openshift / origin

Conformance test suite for OpenShift
http://www.openshift.org
Apache License 2.0
8.48k stars 4.7k forks source link

OKD 3.10 deployment fails at "TASK [Approve bootstrap nodes]" #20570

Closed yuanlinios closed 6 years ago

yuanlinios commented 6 years ago

okd 3.10 deployment fails at "TASK [Approve bootstrap nodes]"

Version
OS: centos atomic host 7.5

git describe
openshift-ansible-3.10.27-2-38-gcd1f413

oc version
oc v3.10.0+7eee6f8-2
kubernetes v1.10.0+b81c8f8
features: Basic-Auth GSSAPI Kerberos SPNEGO

Server https://master-blue.lab.local:443
openshift v3.10.0+7eee6f8-2
kubernetes v1.10.0+b81c8f8
Steps To Reproduce
  1. ansible-playbook -i openshift-ansible/playbooks/deploy_cluster.yml
Current Result

the deployment fails at "TASK [Approve bootstrap nodes]"

Expected Result

the deployment should be successful

Additional Information

"TASK [Approve bootstrap nodes]" failed in timeout. When it was in progress, I monitored with "oc get csr -w", but no pending csr:

oc get csr -w
NAME                                                   AGE       REQUESTOR                                                 CONDITION
csr-5cp8c                                              23m       system:admin                                              Approved
csr-7d2qz                                              23m       system:admin                                              Approved
csr-bdh24                                              30m       system:admin                                              Approved
csr-bm8q6                                              30m       system:admin                                              Approved
csr-chnwr                                              10m       system:admin                                              Approved
csr-dkknq                                              30m       system:admin                                              Approved
csr-h2qgb                                              30m       system:admin                                              Approved
csr-ll8m4                                              10m       system:admin                                              Approved
csr-lxwhg                                              23m       system:admin                                              Approved
csr-q2wcq                                              30m       system:admin                                              Approved
csr-vsmgf                                              30m       system:admin                                              Approved
csr-xnwrh                                              10m       system:admin                                              Approved
node-csr-3Jop4yGuAAe64622vzC8YgfHgphN4g1Dn3RDxMIofwM   23m       system:serviceaccount:openshift-infra:node-bootstrapper   Approved
node-csr-gkAjyM96d1PnN0pjnOGqnHDWUH6K6_05RCv6EedLhcs   23m       system:serviceaccount:openshift-infra:node-bootstrapper   Approved
node-csr-i6DyU7KE4gx_AwfjlahKdqNzuazoj7m_2ZBwOnyuEZU   23m       system:serviceaccount:openshift-infra:node-bootstrapper   Approved
node-csr-imq9rrCYCr2X-E0jd2r7HLX-x90k8iDgN1xL6PLADVU   23m       system:serviceaccount:openshift-infra:node-bootstrapper   Approved

error message

fatal: [atomic21a.lab.local]: FAILED! => {"changed": true, "finished": false, "msg": "Timed out accepting certificate signing requests. 
...ignoring

my inventory file

[OSEv3:children]
masters
glusterfs
nodes
etcd

[masters]
atomic21a.lab.local
atomic22a.lab.local
atomic26a.lab.local

[etcd]
atomic21a.lab.local
atomic22a.lab.local
atomic26a.lab.local

[glusterfs]
centos33a.lab.local glusterfs_ip='10.65.150.33' glusterfs_devices='["/dev/sdc"]'
centos38a.lab.local glusterfs_ip='10.65.150.38' glusterfs_devices='["/dev/sdc"]'
centos39a.lab.local glusterfs_ip='10.65.150.39' glusterfs_devices='["/dev/sdc"]'

[nodes]
atomic21a.lab.local openshift_node_group_name='node-config-master'
atomic22a.lab.local openshift_node_group_name='node-config-master'
atomic26a.lab.local openshift_node_group_name='node-config-master'

atomic23a.lab.local openshift_node_group_name='node-config-infra'
atomic27a.lab.local openshift_node_group_name='node-config-infra'

atomic24a.lab.local openshift_node_group_name='node-config-compute'
atomic28a.lab.local openshift_node_group_name='node-config-compute'

[OSEv3:vars]
ansible_ssh_user=centos
ansible_become=true
debug_level=4
containerized=true
openshift_deployment_type=origin

openshift_release=v3.10
openshift_image_tag=v3.10.0
osm_etcd_image=registry.fedoraproject.org/latest/etcd:latest
openshift_storage_glusterfs_heketi_version=latest

openshift_storage_glusterfs_is_native=false
openshift_storage_glusterfs_heketi_is_native=true

openshift_storage_glusterfs_heketi_executor=ssh
openshift_storage_glusterfs_heketi_ssh_port=22
openshift_storage_glusterfs_heketi_ssh_user=centos
openshift_storage_glusterfs_heketi_ssh_sudo=true
openshift_storage_glusterfs_heketi_ssh_keyfile=/home/centos/.ssh/id_rsa

openshift_master_default_subdomain=blue.example.com
openshift_master_cluster_hostname=master-blue.lab.local
openshift_master_cluster_public_hostname=openshift.blue.example.com

openshift_install_examples=true
docker_version="1.13.1"

openshift_master_identity_providers=[{'name': 'htpasswd_auth', 'login': 'true', 'challenge': 'true', 'kind': 'HTPasswdPasswordIdentityProvider', 'file': '/etc/origin/master/htpasswd'}]
openshift_master_htpasswd_users={'admin1':'$apr1$yz54/VRD$eFo7CdUOKvDEUMLSlKsWb.', 'admin2':'$apr1$5OAY0cwA$CG0PiQTBFCbSaE9e72gvc/', 'user1':'$apr1$fbxdNP5w$nR7bMhB4nhtPphccR1wJK/', 'user2':'$apr1$ziyR8wv2$KVroMcHBoNdXXwSCGgrXM.', 'user3':'$apr1$VdhQBVHQ$bPVimMEEpmH7r37bAnJv80'}

openshift_router_selector='node-role.kubernetes.io/infra=true'
openshift_registry_selector='node-rule.kubernets.io/infra=true'
osm_default_node_selector='node-rule.kubernetes.io/compute=true'
openshift_master_cluster_method=native

openshift_storage_glusterfs_wipe=true
openshift_storage_glusterfs_block_deploy=true
openshift_storage_glusterfs_block_host_vol_create=true
openshift_storage_glusterfs_block_host_vol_size=20
openshift_storage_glusterfs_block_host_vol_max=5
openshift_storage_glusterfs_block_storageclass=true
openshift_storage_glusterfs_registry_block_deploy=false
openshift_storage_glusterfs_storageclass=true
openshift_storage_glusterfs_storageclass_default=false
openshift_master_dynamic_provisioning_enabled=true

openshift_hosted_registry_storage_kind=glusterfs
openshift_hosted_registry_storage_volume_size=10Gi

os_sdn_network_plugin_name='redhat/openshift-ovs-networkpolicy'

osm_cluster_network_cidr=10.1.0.0/16
osm_host_subnet_length=8
openshift_portal_net=10.2.0.0/16
openshift_master_external_ip_network_cidrs=['0.0.0.0/0']
openshift_master_ingress_ip_network_cidr=172.29.0.0/16

openshift_master_api_port=443
openshift_master_console_port=443

openshift_use_dnsmasq=true
openshift_clock_enabled=true
openshift_enable_origin_repo=true
openshift_repos_enable_testing=true

openshift_enable_service_catalog=true
template_service_broker_install=true
ansible_service_broker_install=true
Reamer commented 6 years ago

I added a simple pause before ansible task "Approve bootstrap nodes". After this I had no problems.

simple git diff

diff --git a/playbooks/openshift-node/private/join.yml b/playbooks/openshift-node/private/join.yml
index 5b8869a..e8fcf3e 100644
--- a/playbooks/openshift-node/private/join.yml
+++ b/playbooks/openshift-node/private/join.yml
@@ -37,6 +37,10 @@
     debug:
       msg: "{{ l_nodes_to_join }}"

+  - name: Manual Pause
+    pause:
+      seconds: 5
+
   - name: Approve bootstrap nodes
     oc_adm_csr:
       nodes: "{{ l_nodes_to_join }}"

Let me know, if this works for you.

yuanlinios commented 6 years ago

Thanks, it works now.

And I noticed there are some typos in my inventory file ...

Reamer commented 6 years ago

@yuanlinios It's now working because of what? Because of the pause or your correction of a typo in your inventory file?

debianmaster commented 6 years ago

I am facing the same issue on Fedora / OKD.

yuanlinios commented 6 years ago

Hello @Reamer

Yes, I followed your tips to add pause in that task. And the task works. But after that I got some other irrelevant errors which result from some typos in my inventory file

@debianmaster can you post your inventory file?

debianmaster commented 6 years ago

[masters]
master1.localdomain ansible_port=31335

[etcd]
master1.localdomain  ansible_port=31335

[nodes]
master1.localdomain  openshift_node_group_name="node-config-all-in-one" ansible_port=31335
node1.localdomain openshift_node_group_name="node-config-all-in-one" ansible_port=31333
node2.localdomain openshift_node_group_name="node-config-all-in-one" ansible_port=31334

# Create an OSEv3 group that contains the masters and nodes groups
[OSEv3:children]
masters
nodes
etcd

[OSEv3:vars]
openshift_node_groups=[{"name":"node-config-all-in-one","labels":["node-role.kubernetes.io/master=true","node-role.kubernetes.io/infra=true","node-role.kubernetes.io/compute=true"]},{"name":"node-config-compute","labels":["node-role.kubernetes.io/compute=true"]}]
ansible_user=fedora
ansible_become=true
host_key_checking=false
ansible_python_interpreter=/bin/python3
openshift_deployment_type=origin
openshift_release="3.10"
openshift_master_default_subdomain=apps.cloud.run.io
openshift_master_cluster_hostname=cloud.run.io
openshift_master_api_port=31332
openshift_master_console_port=31332
debug_level=2
openshift_enable_docker_excluder=False
containerized=false
openshift_enable_excluders=false
#openshift_disable_check=disk_availability,memory_availability,docker_image_availability
openshift_disable_check=disk_availability,docker_storage,memory_availability,docker_image_availability,package_availability,package_version
oreg_url=docker-registry:5000/openshift/origin-${component}:${version}
openshift_examples_modify_imagestreams=true
openshift_docker_insecure_registries=docker-registry:5000
openshift_master_bootstrap_auto_approve=true
openshift_master_bootstrap_auto_approver_node_selector={"node-role.kubernetes.io/compute":"true"}

@yuanlinios