openshift / openshift-ansible

Install and config an OpenShift 3.x cluster
https://try.openshift.com
Apache License 2.0

Failing to Verify Heketi Service #9943

Closed · creiche closed 6 years ago

creiche commented 6 years ago

Description

Unable to install GlusterFS using the release-3.10 branch or the RPM install. The playbook references the heketi pod from the app-storage namespace while targeting the infra-storage namespace, so it cannot find the pod and fails.

Version
ansible 2.6.3
  config file = /etc/ansible/ansible.cfg
  configured module search path = [u'/root/.ansible/plugins/modules', u'/usr/share/ansible/plugins/modules']
  ansible python module location = /usr/lib/python2.7/site-packages/ansible
  executable location = /usr/bin/ansible
  python version = 2.7.5 (default, Jul 13 2018, 13:06:57) [GCC 4.8.5 20150623 (Red Hat 4.8.5-28)]

git describe:
openshift-ansible-3.10.43-1-10-g189969f

rpm:
openshift-ansible-3.10.43-1.git.0.4794155.el7.noarch
Steps To Reproduce

1. Launch deploy_cluster.yml with glusterfs turned on
2. Launch playbooks/openshift-glusterfs/config.yml

Expected Results

Expected a successful GlusterFS deployment.

Observed Results

Failure

TASK [openshift_storage_glusterfs : Verify heketi service] ****************************************************************************************************************************************************
Thursday 06 September 2018  09:58:31 -0400 (0:00:00.140)       0:07:45.894 ****
fatal: [okd-m01]: FAILED! => {"changed": false, "cmd": ["oc", "--config=/tmp/openshift-glusterfs-ansible-utOBWE/admin.kubeconfig", "rsh", "--namespace=infra-storage", "heketi-storage-1-r2gn8", "heketi-cli", "-s", "http://localhost:8080", "--user", "admin", "--secret", "hKNT+9EkuXrwCjQZGeo+bV3maS9l4gGBXccdTg5cgk0=", "cluster", "list"], "delta": "0:00:00.196217", "end": "2018-09-06 09:58:32.172870", "msg": "non-zero return code", "rc": 1, "start": "2018-09-06 09:58:31.976653", "stderr": "Error from server (NotFound): pods \"heketi-storage-1-r2gn8\" not found", "stderr_lines": ["Error from server (NotFound): pods \"heketi-storage-1-r2gn8\" not found"], "stdout": "", "stdout_lines": []}
Additional Information

CentOS 7.5

[OSEv3:children]
masters
nodes
etcd
glusterfs_registry
glusterfs
lb

[OSEv3:vars]
# admin user created in previous section
ansible_become=true
openshift_deployment_type=origin

# use HTPasswd for authentication
openshift_master_identity_providers=[{'name': 'htpasswd_auth', 'login': 'true', 'challenge': 'true', 'kind': 'HTPasswdPasswordIdentityProvider'}]

# define default sub-domain for Master node
openshift_master_default_subdomain=okd.cloud.pvt
# allow unencrypted connection within cluster
openshift_docker_insecure_registries=172.30.0.0/16

#GlusterFS
openshift_master_dynamic_provisioning_enabled=True

openshift_metrics_install_metrics=true
openshift_metrics_hawkular_nodeselector={"node-role.kubernetes.io/infra": "true"}
openshift_metrics_cassandra_nodeselector={"node-role.kubernetes.io/infra": "true"}
openshift_metrics_heapster_nodeselector={"node-role.kubernetes.io/infra": "true"}
openshift_metrics_storage_kind=dynamic
openshift_metrics_storage_volume_size=10Gi
openshift_metrics_cassandra_pvc_storage_class_name="glusterfs-registry-block"

openshift_logging_install_logging=true
openshift_logging_kibana_nodeselector={"node-role.kubernetes.io/infra": "true"}
openshift_logging_curator_nodeselector={"node-role.kubernetes.io/infra": "true"}
openshift_logging_es_nodeselector={"node-role.kubernetes.io/infra": "true"}
openshift_logging_storage_kind=dynamic
openshift_logging_es_pvc_size=10Gi
openshift_logging_es_pvc_storage_class_name="glusterfs-registry-block"

openshift_storage_glusterfs_namespace=app-storage
openshift_storage_glusterfs_storageclass=true
openshift_storage_glusterfs_storageclass_default=false
openshift_storage_glusterfs_block_deploy=true
openshift_storage_glusterfs_block_host_vol_size=100
openshift_storage_glusterfs_block_storageclass=true
openshift_storage_glusterfs_block_storageclass_default=false

openshift_storage_glusterfs_registry_namespace=infra-storage
openshift_storage_glusterfs_registry_block_deploy=true
openshift_storage_glusterfs_registry_block_host_vol_size=50
openshift_storage_glusterfs_registry_block_storageclass=true
openshift_storage_glusterfs_registry_block_storageclass_default=false

# Registry
openshift_hosted_registry_storage_kind=glusterfs
openshift_hosted_registry_storage_volume_size=25Gi
openshift_hosted_registry_selector='node-role.kubernetes.io/infra=true'

openshift_master_cluster_method=native
openshift_master_cluster_hostname=okd.cloud.pvt
openshift_master_cluster_public_hostname=okd.cloud.pvt

[lb]
okd-lb

[glusterfs_registry]
okd-in0[1:3] glusterfs_devices='[ "/dev/sdd" ]'

[glusterfs]
okd-n0[1:3] glusterfs_devices='[ "/dev/sdd" ]'

[masters]
okd-m0[1:3]
creiche commented 6 years ago

As you can see, the playbook is looking for the heketi pod in the wrong namespace:

oc get pods --namespace app-storage
NAME                                          READY     STATUS    RESTARTS   AGE
glusterblock-storage-provisioner-dc-1-tdcjj   1/1       Running   0          14m
glusterfs-storage-7dfk2                       1/1       Running   0          17m
glusterfs-storage-kjf2l                       1/1       Running   0          17m
glusterfs-storage-t6c5q                       1/1       Running   0          17m
heketi-storage-1-r2gn8                        1/1       Running   0          14m

 oc get pods --namespace infra-storage
NAME                             READY     STATUS    RESTARTS   AGE
deploy-heketi-registry-1-v4246   1/1       Running   0          13m
glusterfs-registry-9xd2k         1/1       Running   0          14m
glusterfs-registry-fzkb8         1/1       Running   0          14m
glusterfs-registry-qc8dt         1/1       Running   0          14m
creiche commented 6 years ago

It appears that when the gluster role runs for the second time it does not reset heketi_pod, so when it hits

- name: Set heketi-cli command
  set_fact:
    glusterfs_heketi_client: "{% if glusterfs_heketi_is_native %}{{ openshift_client_binary }} --config={{ mktemp.stdout }}/admin.kubeconfig rsh --namespace={{ glusterfs_namespace }} {%if heketi_pod is defined %}{{ heketi_pod.metadata.name }}{% elif deploy_heketi_pod is defined %}{{ deploy_heketi_pod.metadata.name }}{% endif %} {% endif %}{{ glusterfs_heketi_cli }} -s http://{% if glusterfs_heketi_is_native %}localhost:8080{% else %}{{ glusterfs_heketi_url }}:{{ glusterfs_heketi_port }}{% endif %} --user admin {% if glusterfs_heketi_admin_key is defined %}--secret '{{ glusterfs_heketi_admin_key }}'{% endif %}"

It grabs the heketi_pod variable rather than the deploy_heketi_pod one.
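
For reference, here is a minimal, hypothetical playbook (not from the repo) that reproduces the behavior: facts set with set_fact persist for the host across role runs, so the pod name captured during the first (app-storage) pass still wins when the template is evaluated again for infra-storage:

- hosts: localhost
  gather_facts: false
  tasks:
  # First pass (app-storage) records the running heketi pod.
  - set_fact:
      heketi_pod: { metadata: { name: "heketi-storage-1-r2gn8" } }

  # Second pass (infra-storage) starts here; heketi_pod is never reset, it only
  # finds a deploy-heketi pod and records it separately.
  - set_fact:
      deploy_heketi_pod: { metadata: { name: "deploy-heketi-registry-1-v4246" } }

  # Same precedence as the role's template: heketi_pod is defined, so it wins.
  - debug:
      msg: "{% if heketi_pod is defined %}{{ heketi_pod.metadata.name }}{% elif deploy_heketi_pod is defined %}{{ deploy_heketi_pod.metadata.name }}{% endif %}"
    # prints heketi-storage-1-r2gn8, i.e. the pod from the wrong namespace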

crmarques commented 6 years ago

I'm facing the same issue, with RHEL 7.5 and Ansible 2.6.3.

Somehow the second CNS deployment (glusterfs_registry) selects the wrong pod (the one from the app-storage namespace) in the "Set heketi-cli command" Ansible task, even though the heketi-cli command itself says "infra-storage".

But there is another hidden issue. In the glusterfs_registry CNS deployment, the DeploymentConfig "deploy-heketi-registry" creates a pod "deploy-heketi-registry-1-xxxxx", but the deployment never finishes.

Below are the logs:

[root@xxxxxxx ~]# oc logs deploy-heketi-registry-1-rqtsz -n infra-storage
stat: cannot stat '/var/lib/heketi/heketi.db': No such file or directory
Heketi 6.0.0
[heketi] ERROR 2018/09/06 16:32:42 /src/github.com/heketi/heketi/apps/glusterfs/app.go:100: invalid log level:
[heketi] INFO 2018/09/06 16:32:42 Loaded kubernetes executor
[heketi] INFO 2018/09/06 16:32:42 Block: Auto Create Block Hosting Volume set to true
[heketi] INFO 2018/09/06 16:32:42 Block: New Block Hosting Volume size 208 GB
[heketi] INFO 2018/09/06 16:32:42 GlusterFS Application Loaded
[heketi] INFO 2018/09/06 16:32:42 Started Node Health Cache Monitor
Authorization loaded
Listening on port 8080
[heketi] INFO 2018/09/06 16:32:52 Starting Node Health Status refresh
[heketi] INFO 2018/09/06 16:32:52 Cleaned 0 nodes from health cache
[heketi] INFO 2018/09/06 16:34:42 Starting Node Health Status refresh
[heketi] INFO 2018/09/06 16:34:42 Cleaned 0 nodes from health cache
...

creiche commented 6 years ago

Yeah, it looks like on the second run, when it creates infra-storage, it sees the heketi_pod var from the first run, so it never launches the piece that deploys heketi after starting up the deploy-heketi pod.
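
Roughly, the gating looks something like this (paraphrased for illustration only; heketi_deploy.yml is a stand-in name, not the actual file in the role):

# Illustration only: a guard like this skips the deploy step whenever a
# heketi_pod fact is already set, even if it came from the previous namespace.
- include_tasks: heketi_deploy.yml   # stand-in name for the deploy step
  when:
  - glusterfs_heketi_is_native
  - heketi_pod is not defined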

crmarques commented 6 years ago

I think I figured out the problem (or part of it). heketi_pod_check.yml runs during the first CNS setup and sets the variables "deploy_heketi_pod" and "heketi_pod". When the second CNS deployment runs and fails to deploy "deploy-heketi-registry", those variables are not properly updated, so the second run of heketi_load.yml uses the stale "heketi_pod", causing this failure.

There are two problems here: (i) the second CNS DeploymentConfig "deploy-heketi-registry" fails; (ii) the second run of heketi_pod_check.yml does not reset the variables, masking the real problem.

I have no idea yet what is causing (i).

creiche commented 6 years ago

Here is a workaround I came up with. I'm not sure it's the best fix, but it got GlusterFS installed for me.

diff --git a/roles/openshift_storage_glusterfs/tasks/heketi_load.yml b/roles/openshift_storage_glusterfs/tasks/heketi_load.yml
index 713f520..37083ec 100644
--- a/roles/openshift_storage_glusterfs/tasks/heketi_load.yml
+++ b/roles/openshift_storage_glusterfs/tasks/heketi_load.yml
@@ -1,7 +1,7 @@
 ---
 - name: Set heketi-cli command
   set_fact:
-    glusterfs_heketi_client: "{% if glusterfs_heketi_is_native %}{{ openshift_client_binary }} --config={{ mktemp.stdout }}/admin.kubeconfig rsh --namespace={{ glusterfs_namespace }} {%if heketi_pod i
+    glusterfs_heketi_client: "{% if glusterfs_heketi_is_native %}{{ openshift_client_binary }} --config={{ mktemp.stdout }}/admin.kubeconfig rsh --namespace={{ glusterfs_namespace }} {% if ((heketi_po

 - name: Verify heketi service
   command: "{{ glusterfs_heketi_client }} cluster list"
@@ -13,7 +13,7 @@
     dest: "{{ mktemp.stdout }}/topology.json"

 - name: Place heketi topology on heketi Pod
-  shell: "{{ openshift_client_binary }} --config={{ mktemp.stdout }}/admin.kubeconfig exec --namespace={{ glusterfs_namespace }} -i {%if heketi_pod is defined %}{{ heketi_pod.metadata.name }}{% elif d
+  shell: "{{ openshift_client_binary }} --config={{ mktemp.stdout }}/admin.kubeconfig exec --namespace={{ glusterfs_namespace }} -i {%if ((heketi_pod is defined) and (heketi_pod != \"\")) %}{{ heketi_
   when:
   - glusterfs_heketi_is_native

diff --git a/roles/openshift_storage_glusterfs/tasks/main.yml b/roles/openshift_storage_glusterfs/tasks/main.yml
index 8378f2b..7038e21 100644
--- a/roles/openshift_storage_glusterfs/tasks/main.yml
+++ b/roles/openshift_storage_glusterfs/tasks/main.yml
@@ -5,6 +5,9 @@
   when:
   - groups.glusterfs | default([]) | count > 0

+- set_fact:
+    heketi_pod: ""
+
 - import_tasks: glusterfs_registry.yml
   when: >
     groups.glusterfs_registry | default([]) | count > 0
DanyC97 commented 6 years ago

@jarrpa fyi

crmarques commented 6 years ago

It did not work for me. It did solve the wrong-variable issue from the first run, but now I'm hitting the root problem: for some reason, "deploy-heketi-registry-1-xxxxx" is still stuck in the infra-storage namespace =/

Nevertheless, if you don't mind me saying so, I would put the following code:

- set_fact:
    heketi_pod: ""
    deploy_heketi_pod: ""

in the roles/openshift_storage_glusterfs/tasks/heketi_pod_check.yml file, which is the one that creates these variables, just to make it a bit clearer why we are setting them to "". Not a big change...
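
Something along these lines, as a sketch (the surrounding tasks are only paraphrased, not the actual file contents):

---
# roles/openshift_storage_glusterfs/tasks/heketi_pod_check.yml (sketch)
# Reset both facts up front so values left over from a previous
# glusterfs / glusterfs_registry run can never leak into this one.
- set_fact:
    heketi_pod: ""
    deploy_heketi_pod: ""

# ... the existing pod lookups for the current namespace would follow here and
# repopulate heketi_pod / deploy_heketi_pod when matching pods are found ...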

crmarques commented 6 years ago

@creiche, when you run

oc get pods --namespace app-storage
oc get pods --namespace infra-storage

do you get similar pods for both namespaces?

I got this:

[root@xxxxx ~]# oc get pods --namespace app-storage
NAME                                          READY     STATUS    RESTARTS   AGE
deploy-heketi-storage-1-9gk7s                 1/1       Running   0          2h
glusterblock-storage-provisioner-dc-1-jmmcj   1/1       Running   0          32m
glusterfs-storage-bpnq9                       1/1       Running   0          6h
glusterfs-storage-dm89l                       1/1       Running   0          6h
glusterfs-storage-gvg56                       1/1       Running   0          6h
heketi-storage-1-zwthc                        1/1       Running   0          6h

[root@xxxxx~]# oc get pods --namespace infra-storage
NAME                             READY     STATUS    RESTARTS   AGE
deploy-heketi-registry-1-cxsp4   1/1       Running   0          31m
glusterfs-registry-ntctz         1/1       Running   0          6h
glusterfs-registry-qs8p5         1/1       Running   0          6h
glusterfs-registry-z5qsh         1/1       Running   0          6h
creiche commented 6 years ago

Mine fully deployed. I am able to request storage via the GlusterFS storage classes now (a sample PVC sketch follows the output below):

[root@xxxxx ~]# oc get pods --namespace app-storage
NAME                                          READY     STATUS    RESTARTS   AGE
glusterblock-storage-provisioner-dc-1-986qc   1/1       Running   0          6h
glusterfs-storage-6f4wb                       1/1       Running   0          6h
glusterfs-storage-jgzxk                       1/1       Running   0          6h
glusterfs-storage-nzps5                       1/1       Running   0          6h
heketi-storage-1-wrgtg                        1/1       Running   1          6h

[root@xxxxx ~]# oc get pods --namespace infra-storage
NAME                                           READY     STATUS    RESTARTS   AGE
glusterblock-registry-provisioner-dc-1-rfp5p   1/1       Running   0          6h
glusterfs-registry-bxgdc                       1/1       Running   0          6h
glusterfs-registry-w64rl                       1/1       Running   0          6h
glusterfs-registry-x8ptm                       1/1       Running   0          6h
heketi-registry-1-p785k                        1/1       Running   0          6h

[root@xxxxx ~]# oc get sc
NAME                       PROVISIONER                AGE
glusterfs-registry-block   gluster.org/glusterblock   6h
glusterfs-storage          kubernetes.io/glusterfs    6h
glusterfs-storage-block    gluster.org/glusterblock   1d

[root@xxxxx ~]# oc describe storageclass
Name:                  glusterfs-registry-block
IsDefaultClass:        No
Annotations:           <none>
Provisioner:           gluster.org/glusterblock
Parameters:            chapauthenabled=true,hacount=3,restsecretname=heketi-registry-admin-secret-block,restsecretnamespace=infra-storage,resturl=http://heketi-registry.infra-storage.svc:8080,restuser=admin
AllowVolumeExpansion:  <unset>
MountOptions:          <none>
ReclaimPolicy:         Delete
VolumeBindingMode:     Immediate
Events:                <none>

Name:                  glusterfs-storage
IsDefaultClass:        No
Annotations:           <none>
Provisioner:           kubernetes.io/glusterfs
Parameters:            resturl=http://heketi-storage.app-storage.svc:8080,restuser=admin,secretName=heketi-storage-admin-secret,secretNamespace=app-storage
AllowVolumeExpansion:  <unset>
MountOptions:          <none>
ReclaimPolicy:         Delete
VolumeBindingMode:     Immediate
Events:                <none>

Name:                  glusterfs-storage-block
IsDefaultClass:        No
Annotations:           <none>
Provisioner:           gluster.org/glusterblock
Parameters:            chapauthenabled=true,hacount=3,restsecretname=heketi-storage-admin-secret-block,restsecretnamespace=app-storage,resturl=http://heketi-storage.app-storage.svc:8080,restuser=admin
AllowVolumeExpansion:  <unset>
MountOptions:          <none>
ReclaimPolicy:         Delete
VolumeBindingMode:     Immediate
Events:                <none>

[root@xxxxx ~]# oc describe pv
Name:            registry-volume
Labels:          <none>
Annotations:     <none>
Finalizers:      [kubernetes.io/pv-protection]
StorageClass:
Status:          Released
Claim:           default/registry-claim
Reclaim Policy:  Retain
Access Modes:    RWX
Capacity:        25Gi
Node Affinity:   <none>
Message:
Source:
    Type:           Glusterfs (a Glusterfs mount on the host that shares a pod's lifetime)
    EndpointsName:  glusterfs-registry-endpoints
    Path:           glusterfs-registry-volume
    ReadOnly:       false
Events:             <none>
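
For anyone following along, here is a minimal PVC sketch (hypothetical claim name and project) that requests storage from the glusterfs-storage class shown above:

kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  name: example-claim        # hypothetical name
  namespace: myproject       # hypothetical project
spec:
  accessModes:
  - ReadWriteOnce
  resources:
    requests:
      storage: 1Gi
  storageClassName: glusterfs-storage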
creiche commented 6 years ago

@crmarques Just checking: did you run

ansible-playbook -e "openshift_storage_glusterfs_wipe=true" playbooks/openshift-glusterfs/openshift-uninstall.yml

before trying to rerun after fixing the variables?

crmarques commented 6 years ago

No, I didn't! I really appreciate the tip!!!

jarrpa commented 6 years ago

Jumping in to chime in that it's playbooks/openshift-glusterfs/uninstall.yml, and that I'm working on a proper fix for this. :)

stimko68 commented 6 years ago

FYI, I had this same issue with a fresh 3.10 install, but once I applied the task changes from #9971 to my own tasks it worked perfectly. Thanks, @jarrpa!