Closed Madhura-08 closed 2 years ago
Created a custom build with the above branch (https://eos-jenkins.colo.seagate.com/job/GitHub-custom-ci-builds/job/generic/job/custom-ci/7745/console). Deployed a 6-node cluster (1 master node + 5 worker nodes) with configuration sns: 3+2+0 and dix: 1+3+0. Ref.: https://eos-jenkins.colo.seagate.com/job/Cortx-Automation/job/RGW/job/setup-cortx-rgw-cluster/12218/console
[root@ssc-vm-g2-rhev4-1630 ~]# kubectl get pods
NAME READY STATUS RESTARTS AGE
cortx-consul-client-5hrls 1/1 Running 0 139m
cortx-consul-client-ghdgd 1/1 Running 0 139m
cortx-consul-client-nrcg5 1/1 Running 0 139m
cortx-consul-client-s2bhz 1/1 Running 0 139m
cortx-consul-client-swm88 1/1 Running 0 139m
cortx-consul-server-0 1/1 Running 0 139m
cortx-consul-server-1 1/1 Running 0 139m
cortx-consul-server-2 1/1 Running 0 139m
cortx-control-6c6f76ccb8-dwkt4 1/1 Running 0 139m
cortx-data-g0-0 3/3 Running 0 139m
cortx-data-g0-1 3/3 Running 0 139m
cortx-data-g0-2 3/3 Running 0 139m
cortx-data-g0-3 3/3 Running 0 139m
cortx-data-g0-4 3/3 Running 0 139m
cortx-data-g1-0 3/3 Running 0 139m
cortx-data-g1-1 3/3 Running 0 139m
cortx-data-g1-2 3/3 Running 0 139m
cortx-data-g1-3 3/3 Running 0 139m
cortx-data-g1-4 3/3 Running 0 139m
cortx-ha-5c7dcfc58c-52z2v 3/3 Running 0 139m
cortx-kafka-0 1/1 Running 0 139m
cortx-kafka-1 1/1 Running 0 139m
cortx-kafka-2 1/1 Running 0 139m
cortx-server-0 2/2 Running 0 139m
cortx-server-1 2/2 Running 0 139m
cortx-server-2 2/2 Running 0 139m
cortx-server-3 2/2 Running 0 139m
cortx-server-4 2/2 Running 0 139m
cortx-zookeeper-0 1/1 Running 0 139m
cortx-zookeeper-1 1/1 Running 0 139m
cortx-zookeeper-2 1/1 Running 0 139m
[root@ssc-vm-g2-rhev4-1630 ~]# kubectl exec -it cortx-data-g0-1 -c cortx-hax -- /bin/bash
[root@cortx-data-g0-1 /]# hctl status -d
Bytecount:
critical : 0
damaged : 0
degraded : 0
healthy : 2516582
Data pool:
# fid name
0x6f00000000000001:0x0 'storage-set-1__sns'
Profile:
# fid name: pool(s)
0x7000000000000001:0x0 'Profile_the_pool': 'storage-set-1__sns' 'storage-set-1__dix' None
Services:
cortx-data-g0-0.cortx-data-headless.cortx.svc.cluster.local
[started] hax 0x7200000000000001:0x0 inet:tcp:cortx-data-g0-0.cortx-data-headless.cortx.svc.cluster.local@22001
[started] ioservice 0x7200000000000001:0x1 inet:tcp:cortx-data-g0-0.cortx-data-headless.cortx.svc.cluster.local@21002
[started] confd 0x7200000000000001:0x2 inet:tcp:cortx-data-g0-0.cortx-data-headless.cortx.svc.cluster.local@21001
cortx-data-g0-1.cortx-data-headless.cortx.svc.cluster.local
[started] hax 0x7200000000000001:0x3 inet:tcp:cortx-data-g0-1.cortx-data-headless.cortx.svc.cluster.local@22001
[started] ioservice 0x7200000000000001:0x4 inet:tcp:cortx-data-g0-1.cortx-data-headless.cortx.svc.cluster.local@21002
[started] confd 0x7200000000000001:0x5 inet:tcp:cortx-data-g0-1.cortx-data-headless.cortx.svc.cluster.local@21001
cortx-data-g0-2.cortx-data-headless.cortx.svc.cluster.local (RC)
[started] hax 0x7200000000000001:0x6 inet:tcp:cortx-data-g0-2.cortx-data-headless.cortx.svc.cluster.local@22001
[started] ioservice 0x7200000000000001:0x7 inet:tcp:cortx-data-g0-2.cortx-data-headless.cortx.svc.cluster.local@21002
[started] confd 0x7200000000000001:0x8 inet:tcp:cortx-data-g0-2.cortx-data-headless.cortx.svc.cluster.local@21001
cortx-data-g0-3.cortx-data-headless.cortx.svc.cluster.local
[started] hax 0x7200000000000001:0x9 inet:tcp:cortx-data-g0-3.cortx-data-headless.cortx.svc.cluster.local@22001
[started] ioservice 0x7200000000000001:0xa inet:tcp:cortx-data-g0-3.cortx-data-headless.cortx.svc.cluster.local@21002
[started] confd 0x7200000000000001:0xb inet:tcp:cortx-data-g0-3.cortx-data-headless.cortx.svc.cluster.local@21001
cortx-data-g0-4.cortx-data-headless.cortx.svc.cluster.local
[started] hax 0x7200000000000001:0xc inet:tcp:cortx-data-g0-4.cortx-data-headless.cortx.svc.cluster.local@22001
[started] ioservice 0x7200000000000001:0xd inet:tcp:cortx-data-g0-4.cortx-data-headless.cortx.svc.cluster.local@21002
[started] confd 0x7200000000000001:0xe inet:tcp:cortx-data-g0-4.cortx-data-headless.cortx.svc.cluster.local@21001
cortx-data-g1-0.cortx-data-headless.cortx.svc.cluster.local
[started] hax 0x7200000000000001:0xf inet:tcp:cortx-data-g1-0.cortx-data-headless.cortx.svc.cluster.local@22001
[started] ioservice 0x7200000000000001:0x10 inet:tcp:cortx-data-g1-0.cortx-data-headless.cortx.svc.cluster.local@21002
[started] confd 0x7200000000000001:0x11 inet:tcp:cortx-data-g1-0.cortx-data-headless.cortx.svc.cluster.local@21001
cortx-data-g1-1.cortx-data-headless.cortx.svc.cluster.local
[started] hax 0x7200000000000001:0x12 inet:tcp:cortx-data-g1-1.cortx-data-headless.cortx.svc.cluster.local@22001
[started] ioservice 0x7200000000000001:0x13 inet:tcp:cortx-data-g1-1.cortx-data-headless.cortx.svc.cluster.local@21002
[started] confd 0x7200000000000001:0x14 inet:tcp:cortx-data-g1-1.cortx-data-headless.cortx.svc.cluster.local@21001
cortx-data-g1-2.cortx-data-headless.cortx.svc.cluster.local
[started] hax 0x7200000000000001:0x15 inet:tcp:cortx-data-g1-2.cortx-data-headless.cortx.svc.cluster.local@22001
[started] ioservice 0x7200000000000001:0x16 inet:tcp:cortx-data-g1-2.cortx-data-headless.cortx.svc.cluster.local@21002
[started] confd 0x7200000000000001:0x17 inet:tcp:cortx-data-g1-2.cortx-data-headless.cortx.svc.cluster.local@21001
cortx-data-g1-3.cortx-data-headless.cortx.svc.cluster.local
[started] hax 0x7200000000000001:0x18 inet:tcp:cortx-data-g1-3.cortx-data-headless.cortx.svc.cluster.local@22001
[started] ioservice 0x7200000000000001:0x19 inet:tcp:cortx-data-g1-3.cortx-data-headless.cortx.svc.cluster.local@21002
[started] confd 0x7200000000000001:0x1a inet:tcp:cortx-data-g1-3.cortx-data-headless.cortx.svc.cluster.local@21001
cortx-data-g1-4.cortx-data-headless.cortx.svc.cluster.local
[started] hax 0x7200000000000001:0x1b inet:tcp:cortx-data-g1-4.cortx-data-headless.cortx.svc.cluster.local@22001
[started] ioservice 0x7200000000000001:0x1c inet:tcp:cortx-data-g1-4.cortx-data-headless.cortx.svc.cluster.local@21002
[started] confd 0x7200000000000001:0x1d inet:tcp:cortx-data-g1-4.cortx-data-headless.cortx.svc.cluster.local@21001
cortx-server-0.cortx-server-headless.cortx.svc.cluster.local
[started] hax 0x7200000000000001:0x1e inet:tcp:cortx-server-0.cortx-server-headless.cortx.svc.cluster.local@22001
[started] rgw_s3 0x7200000000000001:0x1f inet:tcp:cortx-server-0.cortx-server-headless.cortx.svc.cluster.local@22501
cortx-server-1.cortx-server-headless.cortx.svc.cluster.local
[started] hax 0x7200000000000001:0x20 inet:tcp:cortx-server-1.cortx-server-headless.cortx.svc.cluster.local@22001
[started] rgw_s3 0x7200000000000001:0x21 inet:tcp:cortx-server-1.cortx-server-headless.cortx.svc.cluster.local@22501
cortx-server-2.cortx-server-headless.cortx.svc.cluster.local
[started] hax 0x7200000000000001:0x22 inet:tcp:cortx-server-2.cortx-server-headless.cortx.svc.cluster.local@22001
[started] rgw_s3 0x7200000000000001:0x23 inet:tcp:cortx-server-2.cortx-server-headless.cortx.svc.cluster.local@22501
cortx-server-3.cortx-server-headless.cortx.svc.cluster.local
[started] hax 0x7200000000000001:0x24 inet:tcp:cortx-server-3.cortx-server-headless.cortx.svc.cluster.local@22001
[started] rgw_s3 0x7200000000000001:0x25 inet:tcp:cortx-server-3.cortx-server-headless.cortx.svc.cluster.local@22501
cortx-server-4.cortx-server-headless.cortx.svc.cluster.local
[started] hax 0x7200000000000001:0x26 inet:tcp:cortx-server-4.cortx-server-headless.cortx.svc.cluster.local@22001
[started] rgw_s3 0x7200000000000001:0x27 inet:tcp:cortx-server-4.cortx-server-headless.cortx.svc.cluster.local@22501
Devices:
cortx-data-g0-0.cortx-data-headless.cortx.svc.cluster.local
[online] /dev/sdd
[online] /dev/sde
[online] /dev/sdc
cortx-data-g0-1.cortx-data-headless.cortx.svc.cluster.local
[online] /dev/sdd
[online] /dev/sde
[online] /dev/sdc
cortx-data-g0-2.cortx-data-headless.cortx.svc.cluster.local
[online] /dev/sdd
[online] /dev/sde
[online] /dev/sdc
cortx-data-g0-3.cortx-data-headless.cortx.svc.cluster.local
[online] /dev/sdd
[online] /dev/sde
[online] /dev/sdc
cortx-data-g0-4.cortx-data-headless.cortx.svc.cluster.local
[online] /dev/sdd
[online] /dev/sde
[online] /dev/sdc
cortx-data-g1-0.cortx-data-headless.cortx.svc.cluster.local
[online] /dev/sdg
[online] /dev/sdh
[online] /dev/sdf
cortx-data-g1-1.cortx-data-headless.cortx.svc.cluster.local
[online] /dev/sdg
[online] /dev/sdh
[online] /dev/sdf
cortx-data-g1-2.cortx-data-headless.cortx.svc.cluster.local
[online] /dev/sdg
[online] /dev/sdh
[online] /dev/sdf
cortx-data-g1-3.cortx-data-headless.cortx.svc.cluster.local
[online] /dev/sdg
[online] /dev/sdh
[online] /dev/sdf
cortx-data-g1-4.cortx-data-headless.cortx.svc.cluster.local
[online] /dev/sdg
[online] /dev/sdh
[online] /dev/sdf
cortx-server-0.cortx-server-headless.cortx.svc.cluster.local
cortx-server-1.cortx-server-headless.cortx.svc.cluster.local
cortx-server-2.cortx-server-headless.cortx.svc.cluster.local
cortx-server-3.cortx-server-headless.cortx.svc.cluster.local
cortx-server-4.cortx-server-headless.cortx.svc.cluster.local
Didn't see the HA pod hanging or any init container restarts. Logs for the init container:
Init Containers:
cortx-setup:
Container ID: docker://e089394703e9b41782e93dfb31e496372b66be4b060394dd9e42e2a7cf38a7e8
Image: cortx-docker.colo.seagate.com/seagate/cortx-control:2.0.0-7745-custom-ci
Image ID: docker-pullable://cortx-docker.colo.seagate.com/seagate/cortx-control@sha256:6d22db78509fef63ac0b82206b2c14c9ec31a782bf4415cbe2d9b0c88e30089e
Port: <none>
Host Port: <none>
Command:
/bin/sh
Args:
-c
export TAIL_MACHINE_ID="$(echo -n $(hostname -f) | md5sum | head --bytes=32)"
# Exit all tail jobs when finished
trap 'kill $(jobs -p)' EXIT
tail -F --quiet --lines=0 /etc/cortx/log/utils/$TAIL_MACHINE_ID/utils_setup.log 2> /dev/null &
tail -F --quiet --lines=0 /etc/cortx/log/ha/$TAIL_MACHINE_ID/ha_setup.log 2> /dev/null &
tail -F --quiet --lines=0 /etc/cortx/log/ha/$TAIL_MACHINE_ID/event_manager.log 2> /dev/null &
/opt/seagate/cortx/provisioner/bin/cortx_deploy -f /etc/cortx/solution -c $CORTX_CONFSTORE_URL
State: Terminated
Reason: Completed
Exit Code: 0
Started: Mon, 05 Sep 2022 02:21:48 -0600
Finished: Mon, 05 Sep 2022 02:23:05 -0600
Ready: True
Restart Count: 0
Environment:
CORTX_CONFSTORE_URL: consul://cortx-consul-server:8500/conf
NODE_NAME: (v1:spec.nodeName)
POD_NAME: cortx-ha-5c7dcfc58c-52z2v (v1:metadata.name)
Mounts:
/etc/cortx from data (rw)
/etc/cortx/solution from cortx-configuration (rw)
/etc/cortx/solution/secret from configuration-secrets (ro)
/etc/cortx/solution/ssl from cortx-ssl-cert (rw)
/var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-t8xnr (ro)
Containers:
HA logs:
2022-09-05 08:23:02 event_manager [65]: INFO [__init__] All cluster element are loaded, Ready to process alerts .................
2022-09-05 08:23:02 event_manager [65]: INFO [process_event] SystemHealth: Processing node:node:b826a2c52ce5471fe1edb7b9e87b01de with status unknown
2022-09-05 08:23:02 event_manager [65]: INFO [_update] SystemHealth: Updated status for node:node:b826a2c52ce5471fe1edb7b9e87b01de
2022-09-05 08:23:02 event_manager [65]: INFO [init_evaluators] Initialize all the health evaluator elements
2022-09-05 08:23:02 event_manager [65]: INFO [init_evaluators] HealthEvaluator ha.core.system_health.health_evaluators.cluster_health_evaluator.ClusterHealthEvaluator is initalized...
2022-09-05 08:23:02 event_manager [65]: INFO [init_evaluators] HealthEvaluator ha.core.system_health.health_evaluators.site_health_evaluator.SiteHealthEvaluator is initalized...
2022-09-05 08:23:02 event_manager [65]: INFO [init_evaluators] HealthEvaluator ha.core.system_health.health_evaluators.rack_health_evaluator.RackHealthEvaluator is initalized...
2022-09-05 08:23:02 event_manager [65]: INFO [__init__] All cluster element are loaded, Ready to process alerts .................
2022-09-05 08:23:02 event_manager [65]: INFO [process_event] SystemHealth: Processing node:node:7e9fb523c141ce689ddcb02a60870c63 with status unknown
2022-09-05 08:23:02 event_manager [65]: INFO [_update] SystemHealth: Updated status for node:node:7e9fb523c141ce689ddcb02a60870c63
2022-09-05 08:23:02 event_manager [65]: INFO [init_evaluators] Initialize all the health evaluator elements
2022-09-05 08:23:02 event_manager [65]: INFO [init_evaluators] HealthEvaluator ha.core.system_health.health_evaluators.cluster_health_evaluator.ClusterHealthEvaluator is initalized...
2022-09-05 08:23:02 event_manager [65]: INFO [init_evaluators] HealthEvaluator ha.core.system_health.health_evaluators.site_health_evaluator.SiteHealthEvaluator is initalized...
2022-09-05 08:23:02 event_manager [65]: INFO [init_evaluators] HealthEvaluator ha.core.system_health.health_evaluators.rack_health_evaluator.RackHealthEvaluator is initalized...
2022-09-05 08:23:02 event_manager [65]: INFO [__init__] All cluster element are loaded, Ready to process alerts .................
2022-09-05 08:23:02 event_manager [65]: INFO [process_event] SystemHealth: Processing node:node:d62dc0d2c3ee12aee9b946379519133a with status unknown
2022-09-05 08:23:02 event_manager [65]: INFO [_update] SystemHealth: Updated status for node:node:d62dc0d2c3ee12aee9b946379519133a
2022-09-05 08:23:02 event_manager [65]: INFO [process] config command is successful
2022-09-05 08:23:02 utils_setup [89]: INFO [main] Starting utils_setup init
2022-09-05 08:23:03 utils_setup [89]: INFO [init] MessageBus initialized as kafka
2022-09-05 08:23:04 utils_setup [89]: INFO [main] Command <__main__.InitCmd object at 0x7fed25310ba8> init finished with exit code 0
2022-09-05 08:23:04 cortx_setup [53]: INFO [_provision_components] /opt/seagate/cortx/ha/bin/ha_setup init --config consul://cortx-consul-server:8500/conf --services all
2022-09-05 08:23:05 cortx_setup [53]: INFO [cluster_deploy] Finished cluster bootstrap on 7d9ee991ae6ce628a0e10294bdebaa6b:cortx-ha
cc. @Madhura-08, @mssawant
HA pod deployment is facing problems and timing out. The probable cause is that the pod is unable to connect to Kafka.
Solution: Implementing retries when connecting to Kafka will help avoid the timeout issue during deployment.
Signed-off-by: Madhura Mande <madhura.mande@seagate.com>
Problem Statement
https://jts.seagate.com/browse/CORTX-31665
Design
Coding
Testing
Review Checklist
Review Checklist
Documentation
Checklist for Author