redhat-performance / jetpack

Automated deployment of OpenStack in Red Hat's Labs
Apache License 2.0

Jetpack deployment doesn't support Dell fc640 #290

Closed pradiptapks closed 3 years ago

pradiptapks commented 3 years ago

Multiple issues were noticed while deploying Jetpack on the Dell FC640 hardware model.

One of the traces:

TASK [Clear redfish job queues]
failed: [localhost] (item=[u'dell', {u'stderr_lines': [], u'ansible_loop_var': u'item', u'end': u'2020-11-06 09:52:59.531953', u'failed': False,
u'stdout': u'e19-h20-b01-fc640.rdu2.scalelab.redhat.com', u'changed': True, u'rc': 0, u'item': u'1', u'cmd': u'echo "e19-h20-b01-fc640.rdu2.scalelab.redhat.com"\n',
u'stderr': u'', u'delta': u'0:00:00.003771', u'invocation': {u'module_args': {u'creates': None, u'executable': None, u'_uses_shell': True, u'strip_empty_ends': True,
u'_raw_params': u'echo "e19-h20-b01-fc640.rdu2.scalelab.redhat.com"\n', u'removes': None, u'argv': None, u'warn': True, u'chdir': None, u'stdin_add_newline': True, u'stdin': None}},
u'stdout_lines': [u'e19-h20-b01-fc640.rdu2.scalelab.redhat.com'], u'start': u'2020-11-06 09:52:59.528182'}]) =>
{"ansible_loop_var": "item", "attempts": 10, "changed": true,
"cmd": "source /home/psahoo/badfish/.venv/bin/activate\npython3 badfish.py -H mgmt-e19-h20-b01-fc640.rdu2.scalelab.redhat.com  -u quads -p rdu2@559 --clear-jobs --force\n",
"delta": "0:00:00.600602", "end": "2020-11-06 09:58:12.592519",
"item": ["dell", {"ansible_loop_var": "item", "changed": true, "cmd": "echo \"e19-h20-b01-fc640.rdu2.scalelab.redhat.com\"\n", "delta": "0:00:00.003771", "end": "2020-11-06 09:52:59.531953", "failed": false,
"invocation": {"module_args": {"_raw_params": "echo \"e19-h20-b01-fc640.rdu2.scalelab.redhat.com\"\n", "_uses_shell": true, "argv": null, "chdir": null, "creates": null, "executable": null, "removes": null, "stdin": null, "stdin_add_newline": true, "strip_empty_ends": true, "warn": true}},
"item": "1", "rc": 0, "start": "2020-11-06 09:52:59.528182", "stderr": "", "stderr_lines": [],
"stdout": "e19-h20-b01-fc640.rdu2.scalelab.redhat.com", "stdout_lines": ["e19-h20-b01-fc640.rdu2.scalelab.redhat.com"]}],
"msg": "non-zero return code", "rc": 1, "start": "2020-11-06 09:58:11.991917",
"stderr": "- ERROR    - Failed to communicate with mgmt-e19-h20-b01-fc640.rdu2.scalelab.redhat.com\n- ERROR    - There was something wrong executing Badfish.",
"stderr_lines": ["- ERROR    - Failed to communicate with mgmt-e19-h20-b01-fc640.rdu2.scalelab.redhat.com", "- ERROR    - There was something wrong executing Badfish."],
"stdout": "", "stdout_lines": []}

During the OSP 13 (with RHEL 7.9) deployment, the tasks below fail intermittently:

https://github.com/redhat-performance/jetpack/blob/master/setup_undercloud.yml#L58-L112
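
These tasks (clearing the Redfish job queues, waiting for the iDRAC to be responsive, and setting the boot order to director mode) talk to the nodes' iDRACs and retry in an until/retries loop. As a rough sketch of that pattern only, not the exact playbook code (the loop is simplified, and variable names such as badfish_venv and chassis_password are borrowed from other parts of the repo and the trace above), the failing "Clear redfish job queues" task looks roughly like this:

# Sketch only: loop and variables simplified; see setup_undercloud.yml for the real task.
- name: Clear redfish job queues
  shell: |
    source {{ badfish_venv }}/bin/activate
    python3 badfish.py -H mgmt-{{ item }} -u quads -p {{ chassis_password }} --clear-jobs --force
  args:
    chdir: "{{ ansible_user_dir }}/badfish/src/badfish"
  register: clear_jobs
  until: clear_jobs is succeeded
  retries: 3
  delay: 30

In the trace above the Badfish command itself keeps exiting non-zero ("Failed to communicate with mgmt-e19-h20-b01-fc640..."), which is why the task still fails after all of its retry attempts.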

Increasing the existing retries values alone does not fix the issue; the following changes were applied as a workaround:

diff --git a/bootstrap.yml b/bootstrap.yml
index 341491e..3201729 100644
--- a/bootstrap.yml
+++ b/bootstrap.yml
@@ -51,7 +51,8 @@
       - block:
           - name: set machines_types of overcloud nodes
             set_fact:
-              machine_types: "{{ machine_types|default([]) + [item.pm_addr.split('.')[0].split('-')[4]] }}"
+            #  machine_types: "{{ machine_types|default([]) + [item.pm_addr.split('.')[0].split('-')[4]] }}"
+              machine_types: "{{ machine_types|default([]) + [ (lab_name == 'scale') | ternary(item.pm_addr.split('.')[0].split('-')[4], item.pm_addr.split('.')[0].split('-')[3]) ] }}"
             with_items: "{{ oc_instackenv_content.nodes }}"
             vars:
               oc_instackenv_content: "{{ lookup('file', '{{ overcloud_instackenv_path }}') | from_json }}"
diff --git a/composable.yml b/composable.yml
index be5364c..37253b5 100644
--- a/composable.yml
+++ b/composable.yml
@@ -43,7 +43,7 @@
       lineinfile:
         path: "/home/stack/roles/Compute{{ item }}.yaml"
         regexp: '  HostnameFormatDefault:'
-        line: "  HostnameFormatDefault: '%stackname%-compute{{ item }}-%index%'"
+        line: "  HostnameFormatDefault: 'compute{{ item }}-%index%'"
       with_items: "{{ machine_types }}"

     - name: set roles

diff --git a/group_vars/all.yml b/group_vars/all.yml
index cc7bd40..24cdec1 100644
--- a/group_vars/all.yml
+++ b/group_vars/all.yml
@@ -70,6 +72,9 @@ scale:
       r930:
         rhel7_interfaces: [em1, em2, p1p1, p1p2]
         rhel8_interfaces: []
+      fc640:
+        rhel7_interfaces: [em2, p2p1, p2p2]
+        rhel8_interfaces: [eno2, ens2f0, ens2f1]
diff --git a/setup_undercloud.yml b/setup_undercloud.yml
index 84a2f9c..48027ae 100644
--- a/setup_undercloud.yml
+++ b/setup_undercloud.yml
@@ -4,7 +4,7 @@
       chassis_password:  "{{ instackenv_content.nodes[0].pm_password }}"
       osp_rhel_mapping:
         10: 7.7
-        13: 7.7
+        13: 7.9
         14: 7.7
         15: 8.0
         16: 8.1
@@ -30,6 +30,25 @@
       delay: 10
       with_items: "{{ async_install }}"

+    - block:
+        - name: prepare RHEL79.repo (OSP13)
+          template:
+            src: RHEL.repo.j2
+            dest: /etc/yum.repos.d/RHEL79.repo
+          delegate_to: "{{ undercloud_hostname }}"
+          vars:
+            ansible_python_interpreter: "{{ python_interpreter }}"
+            ansible_user: "root"
+
+        - name: update os to RHEL 7.9
+          shell: |
+             yum update -y
+          delegate_to: "{{ undercloud_hostname }}"
+          vars:
+            ansible_python_interpreter: "{{ python_interpreter }}"
+            ansible_user: "root"
+      when: osp_release == 13
+
     - name: list oc_instackenv_content
       shell: |
         echo "{{ (oc_instackenv_content.nodes[item | int].pm_addr | replace('mgmt-','') | replace('-drac', '')) }}"
@@ -48,7 +67,7 @@
         - "{{ host_list.results }}"
       register: clear_jobs
       until: clear_jobs is succeeded
-      retries: 3
+      retries: 10
       delay: 30

     - name: Wait for iDrac to be responsive
@@ -63,7 +82,7 @@
         - "{{ host_list.results }}"
       register: wait_for_idrac
       until: wait_for_idrac is succeeded
-      retries: 20
+      retries: 60
       delay: 30

     - name: Set the boot order to director mode (Dell)
@@ -76,7 +95,7 @@
       with_together:
         - "{{ vendors }}"
         - "{{ host_list.results }}"
-      retries: 5
+      retries: 20
       delay: 3
       register: result
       until: result.rc == 0
diff --git a/tasks/install_os.yml b/tasks/install_os.yml
index 545ff0a..df057fc 100644
--- a/tasks/install_os.yml
+++ b/tasks/install_os.yml
@@ -17,10 +17,16 @@
     os_install: "{{ needed_os }}"
   when: rhel_stdout == "" or (needed_os.split()[1] not in rhel_stdout) or (os_search_string not in rhel_stdout)

+- name: set os_install before updating os RHEL 7.9
+  set_fact:
+    os_install: "RHEL 7.8"
+  when: needed_os == "RHEL 7.9"
+
 - name: set os_install for forceful provisioning
   set_fact:
     os_install: "{{ needed_os }}"
-  when: force_reprovision == true
+  #when: force_reprovision == true
+  when: force_reprovision == true and needed_os != "RHEL 7.9"

 - name: Reboot if OS install needed
   block:
@@ -82,7 +88,7 @@
     - name: power cycle hypervisor (Dell)
       shell: |
         source {{ badfish_venv }}/bin/activate
-        python3 badfish.py -H mgmt-{{ hypervisor_host }}  -u quads -p {{ chassis_password }} --reboot-only
+        python3 badfish.py -H mgmt-{{ hypervisor_host }}  -u quads -p {{ chassis_password }} --power-cycle
       args:
         chdir: "{{ ansible_user_dir }}/badfish/src/badfish"
       when: vendor is defined and vendor == "dell"
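
For context on the bootstrap.yml hunk above: machine_types is parsed out of the pm_addr hostname, and the dash-separated field that carries the machine model depends on the lab's naming convention, hence the ternary on lab_name. A hypothetical debug task (hostname copied from the trace above, field indexes from the patch) illustrates the parsing:

# Illustration only, not part of the patch.
- name: show machine type parsed from pm_addr
  debug:
    msg: "{{ 'mgmt-e19-h20-b01-fc640.rdu2.scalelab.redhat.com'.split('.')[0].split('-')[4] }}"  # -> fc640 in the scale lab; the patch uses index 3 for other labs
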
stale[bot] commented 3 years ago

This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. Thank you for your contributions.