IBM / ibm-spectrum-scale-install-infra

Spectrum Scale Installation and Configuration
Apache License 2.0

AWS add_node failed with an error #116

Closed · gandhisanjayv closed this 4 years ago

gandhisanjayv commented 4 years ago

```
mmcloudworkflows add_nodes --stack_name gsanjay-may12 --node_type storage --num-instances 2 . .
```

```
2020-05-20 22:32:27,451 - common_utils.py:89 - local_stream_execution() - INFO - TASK [core/cluster : storage | Change existing NSDs] *****************************************************************************************************

2020-05-20 22:32:30,450 - common_utils.py:89 - local_stream_execution() - INFO - failed: [ip-10-0-1-221.us-east-2.compute.internal] (item={'diff': [], 'src': '/root/.ansible/tmp/ansible-tmp-1590013943.7272623-26510-128113254993684/source', 'changed': True, 'group': 'root', 'uid': 0, 'dest': '/var/tmp/StanzaFile.fs1', 'checksum': 'fb533d004eb64337dd9be35f821ff7a0ca01c8fb', 'md5sum': 'cf77c5d3a4bea43e2a5247d5d877ad21', 'owner': 'root', 'state': 'file', 'gid': 0, 'secontext': 'unconfined_u:object_r:admin_home_t:s0', 'mode': '0644', 'invocation': {'module_args': {'directory_mode': None, 'force': True, 'remote_src': None, '_original_basename': 'StanzaFile.j2', 'owner': None, 'follow': False, 'local_follow': None, 'group': None, 'unsafe_writes': None, 'setype': None, 'content': None, 'serole': None, 'dest': '/var/tmp/StanzaFile.fs1', 'selevel': None, 'regexp': None, 'validate': None, 'src': '/root/.ansible/tmp/ansible-tmp-1590013943.7272623-26510-128113254993684/source', 'checksum': 'fb533d004eb64337dd9be35f821ff7a0ca01c8fb', 'seuser': None, 'delimiter': None, 'mode': None, 'attributes': None, 'backup': False}}, 'size': 1465, 'failed': False, 'item': 'fs1', 'ansible_loop_var': 'item'}) => {"ansible_loop_var": "item", "changed": true, "cmd": ["/usr/lpp/mmfs/bin/mmchnsd", "-F", "/var/tmp/StanzaFile.fs1.nsd"], "delta": "0:00:02.690057", "end": "2020-05-20 22:32:30.406573", "item": {"ansible_loop_var": "item", "changed": true, "checksum": "fb533d004eb64337dd9be35f821ff7a0ca01c8fb", "dest": "/var/tmp/StanzaFile.fs1", "diff": [], "failed": false, "gid": 0, "group": "root", "invocation": {"module_args": {"_original_basename": "StanzaFile.j2", "attributes": null, "backup": false, "checksum": "fb533d004eb64337dd9be35f821ff7a0ca01c8fb", "content": null, "delimiter": null, "dest": "/var/tmp/StanzaFile.fs1", "directory_mode": null, "follow": false, "force": true, "group": null, "local_follow": null, "mode": null, "owner": null, "regexp": null, "remote_src": null, "selevel": null, "serole": null, "setype": null, "seuser": null, "src": "/root/.ansible/tmp/ansible-tmp-1590013943.7272623-26510-128113254993684/source", "unsafe_writes": null, "validate": null}}, "item": "fs1", "md5sum": "cf77c5d3a4bea43e2a5247d5d877ad21", "mode": "0644", "owner": "root", "secontext": "unconfined_u:object_r:admin_home_t:s0", "size": 1465, "src": "/root/.ansible/tmp/ansible-tmp-1590013943.7272623-26510-128113254993684/source", "state": "file", "uid": 0}, "msg": "non-zero return code", "rc": 1, "start": "2020-05-20 22:32:27.716516", "stderr": "mmchnsd: Duplicate disk specified: nsd5\nmmchnsd: Duplicate disk specified: nsd6\nmmchnsd: Duplicate disk specified: nsd7\nmmchnsd: Duplicate disk specified: nsd8\nmmchnsd: Command failed. Examine previous error messages to determine cause.", "stderr_lines": ["mmchnsd: Duplicate disk specified: nsd5", "mmchnsd: Duplicate disk specified: nsd6", "mmchnsd: Duplicate disk specified: nsd7", "mmchnsd: Duplicate disk specified: nsd8", "mmchnsd: Command failed. 
Examine previous error messages to determine cause."], "stdout": "mmchnsd: Processing disk nsd1\nmmchnsd: Processing disk nsd2\nmmchnsd: Processing disk nsd3\nmmchnsd: Processing disk nsd4\nmmchnsd: Processing disk desconlynsd\nmmchnsd: Processing disk nsd5\nmmchnsd: Processing disk nsd6\nmmchnsd: Processing disk nsd7\nmmchnsd: Processing disk nsd8\nmmchnsd: Processing disk nsd5\nmmchnsd: Processing disk nsd6\nmmchnsd: Processing disk nsd7\nmmchnsd: Processing disk nsd8", "stdout_lines": ["mmchnsd: Processing disk nsd1", "mmchnsd: Processing disk nsd2", "mmchnsd: Processing disk nsd3", "mmchnsd: Processing disk nsd4", "mmchnsd: Processing disk desconlynsd", "mmchnsd: Processing disk nsd5", "mmchnsd: Processing disk nsd6", "mmchnsd: Processing disk nsd7", "mmchnsd: Processing disk nsd8", "mmchnsd: Processing disk nsd5", "mmchnsd: Processing disk nsd6", "mmchnsd: Processing disk nsd7", "mmchnsd: Processing disk nsd8"]}

2020-05-20 22:32:30,453 - common_utils.py:89 - local_stream_execution() - INFO -

2020-05-20 22:32:30,453 - common_utils.py:89 - local_stream_execution() - INFO - NO MORE HOSTS LEFT ***************************************************************************************************************************************

2020-05-20 22:32:30,455 - common_utils.py:89 - local_stream_execution() - INFO -

2020-05-20 22:32:30,455 - common_utils.py:89 - local_stream_execution() - INFO - PLAY RECAP ***********************************************************************************************************************************************

2020-05-20 22:32:30,455 - common_utils.py:89 - local_stream_execution() - INFO - ip-10-0-1-120.us-east-2.compute.internal : ok=59   changed=0    unreachable=0    failed=0    skipped=32   rescued=0    ignored=0

2020-05-20 22:32:30,455 - common_utils.py:89 - local_stream_execution() - INFO - ip-10-0-1-201.us-east-2.compute.internal : ok=59   changed=0    unreachable=0    failed=0    skipped=32   rescued=0    ignored=0

2020-05-20 22:32:30,455 - common_utils.py:89 - local_stream_execution() - INFO - ip-10-0-1-221.us-east-2.compute.internal : ok=97   changed=16   unreachable=0    failed=1    skipped=60   rescued=0    ignored=0

2020-05-20 22:32:30,455 - common_utils.py:89 - local_stream_execution() - INFO - ip-10-0-1-239.us-east-2.compute.internal : ok=59   changed=0    unreachable=0    failed=0    skipped=32   rescued=0    ignored=0

2020-05-20 22:32:30,455 - common_utils.py:89 - local_stream_execution() - INFO - ip-10-0-1-26.us-east-2.compute.internal : ok=59   changed=8    unreachable=0    failed=0    skipped=25   rescued=0    ignored=0

2020-05-20 22:32:30,455 - common_utils.py:89 - local_stream_execution() - INFO - ip-10-0-3-113.us-east-2.compute.internal : ok=59   changed=0    unreachable=0    failed=0    skipped=32   rescued=0    ignored=0

2020-05-20 22:32:30,455 - common_utils.py:89 - local_stream_execution() - INFO - ip-10-0-3-228.us-east-2.compute.internal : ok=59   changed=8    unreachable=0    failed=0    skipped=25   rescued=0    ignored=0

2020-05-20 22:32:30,456 - common_utils.py:89 - local_stream_execution() - INFO - ip-10-0-3-47.us-east-2.compute.internal : ok=59   changed=0    unreachable=0    failed=0    skipped=32   rescued=0    ignored=0

2020-05-20 22:32:30,456 - common_utils.py:89 - local_stream_execution() - INFO - localhost                  : ok=4    changed=1    unreachable=0    failed=0    skipped=0    rescued=0    ignored=0

2020-05-20 22:32:30,456 - common_utils.py:89 - local_stream_execution() - INFO -

Traceback (most recent call last):
  File "/usr/lpp/mmfs/bin/ibm_cloud_workflows/mm_cloud_workflow_add_nodes", line 339, in <module>
    CLOUD_PLAYBOOK_PATH)])
  File "/usr/lpp/mmfs/bin/ibm_cloud_workflows/ibm_cloud_utils/common_utils.py", line 93, in local_stream_execution
    process_hd.args)
```
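Note that the stdout above shows mmchnsd processing nsd5 through nsd8 twice, so the generated stanza file apparently repeats NSD names the cluster already knows about. A rough way to confirm which names collide (a sketch only; it assumes the stanza file carries one `nsd=<name>` attribute per NSD and reuses the /var/tmp/StanzaFile.fs1.nsd path from the failing task):

```bash
# List NSD names present both in the cluster and in the generated stanza file;
# any name printed here is a "Duplicate disk specified" candidate.
grep -o 'nsd=[^ ]*' /var/tmp/StanzaFile.fs1.nsd | cut -d= -f2 | sort > /tmp/stanza.nsds
/usr/lpp/mmfs/bin/mmlsnsd | awk '$1 == "fs1" {print $2}' | sort > /tmp/cluster.nsds
comm -12 /tmp/cluster.nsds /tmp/stanza.nsds    # expected here: nsd5 nsd6 nsd7 nsd8
```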

```
[root@ip-10-0-1-221 ~]# mmlscluster

GPFS cluster information
========================
  GPFS cluster name:         gsanjay-may12.us-east-2.compute.internal
  GPFS cluster id:           6286266572994866134
  GPFS UID domain:           gsanjay-may12.us-east-2.compute.internal
  Remote shell command:      /usr/bin/ssh
  Remote file copy command:  /usr/bin/scp
  Repository type:           CCR

 Node  Daemon node name                          IP address  Admin node name                           Designation
-------------------------------------------------------------------------------------------------------------------
   1   ip-10-0-1-221.us-east-2.compute.internal  10.0.1.221  ip-10-0-1-221.us-east-2.compute.internal  quorum-manager
   2   ip-10-0-1-239.us-east-2.compute.internal  10.0.1.239  ip-10-0-1-239.us-east-2.compute.internal  quorum-manager
   3   ip-10-0-3-47.us-east-2.compute.internal   10.0.3.47   ip-10-0-3-47.us-east-2.compute.internal   quorum-manager
   4   ip-10-0-1-201.us-east-2.compute.internal  10.0.1.201  ip-10-0-1-201.us-east-2.compute.internal
   5   ip-10-0-1-120.us-east-2.compute.internal  10.0.1.120  ip-10-0-1-120.us-east-2.compute.internal
   6   ip-10-0-3-113.us-east-2.compute.internal  10.0.3.113  ip-10-0-3-113.us-east-2.compute.internal
   7   ip-10-0-1-26.us-east-2.compute.internal   10.0.1.26   ip-10-0-1-26.us-east-2.compute.internal
   8   ip-10-0-3-228.us-east-2.compute.internal  10.0.3.228  ip-10-0-3-228.us-east-2.compute.internal

[root@ip-10-0-1-221 ~]# mmgetstate -a

 Node number  Node name        GPFS state
-------------------------------------------
       1      ip-10-0-1-221    active
       2      ip-10-0-1-239    active
       3      ip-10-0-3-47     active
       4      ip-10-0-1-201    active
       5      ip-10-0-1-120    active
       6      ip-10-0-3-113    active
       7      ip-10-0-1-26     active
       8      ip-10-0-3-228    active
[root@ip-10-0-1-221 ~]# mmlsnsd

 File system   Disk name       NSD servers
------------------------------------------------------------------------------
 fs1           nsd1            ip-10-0-1-221.us-east-2.compute.internal
 fs1           nsd2            ip-10-0-1-221.us-east-2.compute.internal
 fs1           nsd3            ip-10-0-3-47.us-east-2.compute.internal
 fs1           nsd4            ip-10-0-3-47.us-east-2.compute.internal
 fs1           desconlynsd     ip-10-0-1-239.us-east-2.compute.internal
 fs1           nsd5            ip-10-0-1-120.us-east-2.compute.internal
 fs1           nsd6            ip-10-0-1-120.us-east-2.compute.internal
 fs1           nsd7            ip-10-0-3-113.us-east-2.compute.internal
 fs1           nsd8            ip-10-0-3-113.us-east-2.compute.internal

[root@ip-10-0-1-221 ~]# ssh ip-10-0-1-26 lsblk
NAME    MAJ:MIN RM  SIZE RO TYPE MOUNTPOINT
xvda    202:0    0  100G  0 disk
├─xvda1 202:1    0    1M  0 part
└─xvda2 202:2    0  100G  0 part /
xvdf    202:80   0   10G  0 disk
xvdg    202:96   0   10G  0 disk
[root@ip-10-0-1-221 ~]#
```
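For what it's worth, lsblk shows the new node's disks (xvdf, xvdg) still raw, while mmlsnsd already lists nsd5 through nsd8 served by the previously added nodes. Mapping NSD names to their backing devices should confirm the collision is on names rather than on the new disks (a minimal check; the nsd[5-8] pattern just matches the names from the error):

```bash
# Show which block device backs each of the colliding NSD names; they should
# resolve to disks on ip-10-0-1-120 / ip-10-0-3-113, not to the unconfigured
# xvdf/xvdg devices on the newest nodes.
/usr/lpp/mmfs/bin/mmlsnsd -m | grep -E 'nsd[5-8]'
```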
gandhisanjayv commented 4 years ago

Logs are under s3://logs-aws1.3/ansibleissue116 (region: Ohio, stack: gsanjay-may12).

gandhisanjayv commented 4 years ago

Steps to recreate (see the sketch below):

1. Create a multi-AZ stack with 2 storage nodes, 2 compute nodes, and 2 disks per node.
2. Add two storage nodes.
3. Fill the filesystem to 100%.
4. Add two more storage nodes; it should fail with the above error.
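A condensed shell version of those steps (a sketch: the add_nodes invocation is taken from the report above, while the /gpfs/fs1 mount point used to fill the filesystem is an assumption):

```bash
# 1) After creating the multi-AZ stack, add the first pair of storage nodes.
mmcloudworkflows add_nodes --stack_name gsanjay-may12 --node_type storage --num-instances 2

# 2) Fill fs1 to 100%; the /gpfs/fs1 mount point is an assumption.
dd if=/dev/zero of=/gpfs/fs1/filler bs=1M || true   # dd exits nonzero on ENOSPC

# 3) Add two more storage nodes; mmchnsd now reports
#    "Duplicate disk specified" for nsd5..nsd8.
mmcloudworkflows add_nodes --stack_name gsanjay-may12 --node_type storage --num-instances 2
```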