openark / orchestrator

MySQL replication topology management and HA
Apache License 2.0
5.63k stars 930 forks source link

Replication does not re-attach to the new master after failover #1071

Closed MonkeyFang closed 4 years ago

MonkeyFang commented 4 years ago
MonkeyFang commented 4 years ago

1.orchestrator.conf.json

{
  "Debug": true,
  "EnableSyslog": false,
  "ListenAddress": ":3000",

  "MySQLTopologyUser": "orchestrator",
  "MySQLTopologyPassword": "orch_topology_password",
  "MySQLTopologyCredentialsConfigFile": "",
  "MySQLTopologySSLPrivateKeyFile": "",
  "MySQLTopologySSLCertFile": "",
  "MySQLTopologySSLCAFile": "",
  "MySQLTopologySSLSkipVerify": true,
  "MySQLTopologyUseMutualTLS": false,
  "MySQLOrchestratorHost": "172.21.203.23",
  "MySQLOrchestratorPort": 3306,
  "MySQLOrchestratorDatabase": "orchestrator",
  "MySQLOrchestratorUser": "orchestrator",
  "MySQLOrchestratorPassword": "orch_topology_password",
  "MySQLOrchestratorCredentialsConfigFile": "",
  "MySQLOrchestratorSSLPrivateKeyFile": "",
  "MySQLOrchestratorSSLCertFile": "",
  "MySQLOrchestratorSSLCAFile": "",
  "MySQLOrchestratorSSLSkipVerify": true,
  "MySQLOrchestratorUseMutualTLS": false,
  "MySQLConnectTimeoutSeconds": 1,
  "DefaultInstancePort": 3306,
  "DiscoverByShowSlaveHosts": true,
  "InstancePollSeconds": 5,
  "DiscoveryIgnoreReplicaHostnameFilters": [
    "a_host_i_want_to_ignore[.]example[.]com",
    ".*[.]ignore_all_hosts_from_this_domain[.]example[.]com",
    "a_host_with_extra_port_i_want_to_ignore[.]example[.]com:3307"
  ],
  "UnseenInstanceForgetHours": 240,
  "SnapshotTopologiesIntervalHours": 0,
  "InstanceBulkOperationsWaitTimeoutSeconds": 10,

  "HostnameResolveMethod": "default",
  "MySQLHostnameResolveMethod": "@@hostname",
  "SkipBinlogServerUnresolveCheck": true,
  "ExpiryHostnameResolvesMinutes": 60,
  "RejectHostnameResolvePattern": "",
  "ReasonableReplicationLagSeconds": 10,
  "ProblemIgnoreHostnameFilters": [],
  "VerifyReplicationFilters": false,
  "ReasonableMaintenanceReplicationLagSeconds": 20,
  "CandidateInstanceExpireMinutes": 60,
  "AuditLogFile": "",
  "AuditToSyslog": false,
  "RemoveTextFromHostnameDisplay": ".mydomain.com:3306",
  "ReadOnly": false,
  "AuthenticationMethod": "",
  "HTTPAuthUser": "",
  "HTTPAuthPassword": "",
  "AuthUserHeader": "",
  "PowerAuthUsers": [
    "*"
  ],
  "ClusterNameToAlias": {
    "127.0.0.1": "test suite"
  },

  "ReplicationLagQuery": "",
  "DetectClusterAliasQuery": "SELECT SUBSTRING_INDEX(@@hostname, '.', 1)",
  "DetectClusterDomainQuery": "",
  "DetectInstanceAliasQuery": "",
  "DetectPromotionRuleQuery": "",
  "DataCenterPattern": "[.]([^.]+)[.][^.]+[.]mydomain[.]com",
  "PhysicalEnvironmentPattern": "[.]([^.]+[.][^.]+)[.]mydomain[.]com",
  "PromotionIgnoreHostnameFilters": [],
  "DetectSemiSyncEnforcedQuery": "",
  "ServeAgentsHttp": false,
  "AgentsServerPort": ":3001",
  "AgentsUseSSL": false,
  "AgentsUseMutualTLS": false,
  "AgentSSLSkipVerify": false,
  "AgentSSLPrivateKeyFile": "",
  "AgentSSLCertFile": "",
  "AgentSSLCAFile": "",
  "AgentSSLValidOUs": [],
  "UseSSL": false,
  "UseMutualTLS": false,
  "SSLSkipVerify": false,
  "SSLPrivateKeyFile": "",
  "SSLCertFile": "",
  "SSLCAFile": "",
  "SSLValidOUs": [],
  "URLPrefix": "",
  "StatusEndpoint": "/api/status",
  "StatusSimpleHealth": true,
  "StatusOUVerify": false,
  "AgentPollMinutes": 60,
  "UnseenAgentForgetHours": 6,
  "StaleSeedFailMinutes": 60,
  "SeedAcceptableBytesDiff": 8192,
  "PseudoGTIDPattern": "",
  "PseudoGTIDPatternIsFixedSubstring": false,
  "PseudoGTIDMonotonicHint": "asc:",
  "DetectPseudoGTIDQuery": "",
  "BinlogEventsChunkSize": 10000,
  "SkipBinlogEventsContaining": [],
  "ReduceReplicationAnalysisCount": true,
  "FailureDetectionPeriodBlockMinutes": 60,
  "RecoveryPeriodBlockSeconds": 3600,
  "RecoveryIgnoreHostnameFilters": [],
  "RecoverMasterClusterFilters": [
    "*"
  ],
  "RecoverIntermediateMasterClusterFilters": [
    "*"
  ],
  "OnFailureDetectionProcesses": [
    "echo 'Detected {failureType} on {failureCluster}. Affected replicas: {countSlaves}' >> /tmp/recovery.log"
  ],
  "PreGracefulTakeoverProcesses": [
    "echo 'Planned takeover about to take place on {failureCluster}. Master will switch to read_only' >> /tmp/recovery.log"
  ],
  "PreFailoverProcesses": [
    "echo 'Will recover from {failureType} on {failureCluster}' >> /tmp/recovery.log"
  ],
  "PostFailoverProcesses": [
    "echo '(for all types) Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Successor: {successorHost}:{successorPort}' >> /tmp/recovery.log"
  ],
  "PostUnsuccessfulFailoverProcesses": [],
  "PostMasterFailoverProcesses": [
    "echo 'Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Promoted: {successorHost}:{successorPort}' >> /tmp/recovery.log"
  ],
  "PostIntermediateMasterFailoverProcesses": [
    "echo 'Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Successor: {successorHost}:{successorPort}' >> /tmp/recovery.log"
  ],
  "PostGracefulTakeoverProcesses": [
    "echo 'Planned takeover complete' >> /tmp/recovery.log"
  ],

  "RaftEnabled": true,
  "RaftDataDir": "/var/lib/orchestrator",
  "RaftBind": "172.21.203.23",
  "DefaultRaftPort": 10008,
  "RaftNodes": [
    "172.21.203.23",
    "172.21.203.24",
    "172.21.203.33"
  ],

  "CoMasterRecoveryMustPromoteOtherCoMaster": true,
  "DetachLostSlavesAfterMasterFailover": true,
  "ApplyMySQLPromotionAfterMasterFailover": true,
  "PreventCrossDataCenterMasterFailover": false,
  "PreventCrossRegionMasterFailover": false,
  "MasterFailoverDetachReplicaMasterHost": false,
  "MasterFailoverLostInstancesDowntimeMinutes": 0,
  "PostponeReplicaRecoveryOnLagMinutes": 0,
  "OSCIgnoreHostnameFilters": [],
  "GraphiteAddr": "",
  "GraphitePath": "",
  "GraphiteConvertHostnameDotsToUnderscores": true,
  "ConsulAddress": "",
  "ConsulAclToken": ""
}

2. What I want to do: I have three MySQL nodes, one master and two slaves which point to the master. When I killed the master, a new master was elected, but the other slave was not pointed to the new master; it continued trying to connect to the old one.

3. What I expect: after the old master dies, a new master is chosen and the other slave points to the new master.

4. The message before I started the test. I executed the command below:
orchestrator-client -c replication-analysis
zb-search-test-203-31:3306 (cluster zb-search-test-203-31:3306): NoFailoverSupportStructureWarning
I don't know why; I had no issues when following the cookbook and documentation.

5. The /tmp/recovery.log:
Will recover from DeadMaster on zb-search-test-203-31:3306
Recovered from DeadMaster on zb-search-test-203-31:3306. Failed: zb-search-test-203-31:3306; Promoted: zb-search-test-203-30:3306
(for all types) Recovered from DeadMaster on zb-search-test-203-31:3306. Failed: zb-search-test-203-31:3306; Successor: zb-search-test-203-30:3306

6. Where is the orchestrator runtime log? Is it /tmp/recovery.log? Could I get other messages?

shlomi-noach commented 4 years ago

NoFailoverSupportStructureWarning: do both your replicas have:

See https://github.com/github/orchestrator/blob/master/docs/configuration-recovery.md#mysql-configuration

MonkeyFang commented 4 years ago

Yes, the variables that you mentioned are already set.

MonkeyFang commented 4 years ago

here is my mysql configuration file

[client]
port            = 3306
socket          = /data/mysql/mysql3306/log/3306.sock
host            = localhost

[mysqld]
report_host = '172.21.203.31'
#skip-grant-tables
# generic configuration options
port            = 3306
socket          = /data/mysql/mysql3306/log/3306.sock
user            = mysql

basedir             = /usr/local/mysql
datadir             = /data/mysql/mysql3306/data
tmpdir              = /data/mysql/mysql3306/data
log-bin             = /data/mysql/mysql3306/log/3306-bin
log_error           = /data/mysql/mysql3306/log/error.err
pid-file            = /data/mysql/mysql3306/log/3306.pid
slow_query_log_file = /data/mysql/mysql3306/log/slowquery.log
relay_log           = /data/mysql/mysql3306/log/3306-relay-bin

innodb_buffer_pool_size=3072M  
#innodb_additional_mem_pool_size=20m
max_connections = 500
#innodb_numa_interleave=on
plugin_dir=/usr/local/mysql/lib/plugin
init_connect = 'SET NAMES utf8mb4'
character_set_server = utf8mb4
collation_server=utf8mb4_general_ci
skip-name-resolve
skip-slave-start
lower_case_table_names=1
character-set-client-handshake=0
sql_mode=ONLY_FULL_GROUP_BY,NO_AUTO_VALUE_ON_ZERO,STRICT_TRANS_TABLES,NO_AUTO_CREATE_USER,NO_ENGINE_SUBSTITUTION

expire_logs_days=7
back_log = 256
#skip-networking
table_open_cache_instances=16

max_connect_errors = 10
table_open_cache = 2048
#external-locking
max_allowed_packet = 1024M
metadata_locks_hash_instances=256
binlog_cache_size = 1M
max_heap_table_size = 16M
read_buffer_size = 2M
read_rnd_buffer_size = 1M
sort_buffer_size = 2M
join_buffer_size = 1M
thread_cache_size = 64
query_cache_type=0
query_cache_size = 0
query_cache_limit = 2M
ft_min_word_len = 4
default-storage-engine = innodb
thread_stack = 192K
#transaction_isolation = repeatable-read
transaction_isolation = READ-COMMITTED
tmp_table_size = 8M
binlog_format=row
slow_query_log
long_query_time = 1
#must unique id
server-id = 231

#*** MyISAM Specific options
key_buffer_size = 64M
bulk_insert_buffer_size = 64M
myisam_sort_buffer_size = 128M
myisam_max_sort_file_size = 2G
myisam_repair_threads = 1
myisam_recover_options
# *** INNODB Specific options ***
innodb_buffer_pool_instances = 2
innodb_data_file_path = ibdata1:512M;ibdata2:512M:autoextend
innodb_write_io_threads = 8
innodb_read_io_threads = 8
innodb_thread_concurrency = 4
innodb_flush_log_at_trx_commit = 2
innodb_log_buffer_size = 16M
innodb_log_file_size = 768M
innodb_log_files_in_group = 4
innodb_max_dirty_pages_pct = 90
innodb_flush_method=O_DIRECT
innodb_lock_wait_timeout = 120
#innodb_adaptive_hash_index_partitions = 1
innodb_checksum_algorithm=strict_crc32

autocommit=1
explicit_defaults_for_timestamp=on
auto_increment_increment=1
auto_increment_offset=1
concurrent_insert=1
connect_timeout=10
default_week_format=0
delayed_insert_limit=100
delayed_insert_timeout=300
delayed_queue_size=1000
delay_key_write=on
div_precision_increment=4
ft_query_expansion_limit=20
group_concat_max_len=1024
innodb_autoinc_lock_mode=1
innodb_concurrency_tickets=500
innodb_old_blocks_pct=37
innodb_old_blocks_time=0
innodb_open_files=65535
innodb_purge_batch_size=20
innodb_purge_threads=1
innodb_doublewrite=0
innodb_read_ahead_threshold=56
innodb_rollback_on_timeout=off
innodb_stats_method=nulls_equal
innodb_stats_on_metadata=off
innodb_stats_sample_pages=8
innodb_strict_mode=off
innodb_table_locks=on
innodb_file_per_table=1
innodb_thread_sleep_delay=10000
interactive_timeout=1800
key_cache_age_threshold=300
key_cache_block_size=1024
key_cache_division_limit=100
log_queries_not_using_indexes=off
low_priority_updates=0
net_read_timeout=30
net_retry_count=10
net_write_timeout=60
query_alloc_block_size=8192
query_prealloc_size=8192
slow_launch_time=2
table_definition_cache=5000
wait_timeout=60
slave_checkpoint_group=512
slave_checkpoint_period=300
slave_pending_jobs_size_max=64M
master_info_repository = TABLE
relay_log_info_repository = TABLE
relay_log_recovery = 1
#gtid_mode=OFF
#enforce_gtid_consistency=0

log_slave_updates

# sync remi
plugin_load = "rpl_semi_sync_master=semisync_master.so;rpl_semi_sync_slave=semisync_slave.so"
rpl_semi_sync_master_enabled = 1
rpl_semi_sync_master_timeout = 1000
rpl_semi_sync_slave_enabled = 1

plugin_dir=/usr/local/mysql/lib/mysql/plugin
log_timestamps = SYSTEM
slave_parallel_type = LOGICAL_CLOCK
slave_parallel_workers = 8
slave_preserve_commit_order = ON
innodb_undo_tablespaces=4
show_compatibility_56 = ON
#sync_relay_log = 1
sync_master_info = 1
sync_relay_log_info = 1
master-info-repository  = TABLE
relay-log-info-repository = TABLE
log-slave-updates = 1
relay_log_recovery = ON

#gtid
gtid-mode=ON
enforce_gtid_consistency=on

[mysqldump]
# Do not buffer the whole result set in memory before writing it to
# file. Required for dumping very large tables
quick

max_allowed_packet = 1024M

[mysqladmin]
user=zabbix
password=jfa;uata
socket          = /data/mysql/mysql3306/log/3306.sock
[mysql]
user=zabbix
password=jfa;uata
prompt=percona-5.7.22-22-->\u@\d>
socket          = /data/mysql/mysql3306/log/3306.sock
default-character-set = utf8mb4

# Only allow UPDATEs and DELETEs that use keys.
#safe-updates

[myisamchk]
key_buffer_size = 512M
sort_buffer_size = 32K
read_buffer = 8M
write_buffer = 8M

[mysqlhotcopy]
interactive-timeout

[mysqld_safe]
open-files-limit = 65535
shlomi-noach commented 4 years ago

Please paste the output of:

orchestrator-client -c topology -alias your-cluster

orchestrator-client -c replication-analysis

MonkeyFang commented 4 years ago

[root@zb-mongodb-test-203-23 log]# orchestrator-client -c replication-analysis
zb-mongodb-test-203-23:3306 (cluster zb-mongodb-test-203-23:3306): MasterSingleSlaveNotReplicating

The cluster which I used for the test is not found:
[root@zb-mongodb-test-203-23 log]# orchestrator-client -c topology -i zb-search-test-203-31:3306
zb-search-test-203-31:3306 [0s,ok,5.7.22-22-log,rw,ROW,>>,GTID,downtimed]

zb-mongodb-test-203-23:3306 is the orchestrator backend MySQL:
[root@zb-mongodb-test-203-23 log]# orchestrator-client -c topology -i zb-mongodb-test-203-23:3306
zb-mongodb-test-203-23:3306 [0s,ok,5.7.22-22-log,rw,ROW,>>]

MonkeyFang commented 4 years ago

The version I used is the orchestrator-3.1.4 tarball; I have now replaced it with orchestrator-3.0.14, but that did not solve the problem.

MonkeyFang commented 4 years ago

I dropped the backend database and recreated it. In version 3.1.4, the result of the command orchestrator-client -c replication-analysis is:
orchestrator-client -c replication-analysis
zb-search-test-203-31:3306 (cluster zb-search-test-203-31:3306): NoFailoverSupportStructureWarning
but in version 3.0.14 it has no result.

MonkeyFang commented 4 years ago
[root@zb-mongodb-test-203-23 orchestrator]# orchestrator-client -c clusters
zb-mongodb-test-203-23:3306
zb-search-test-203-31:3306
[root@zb-mongodb-test-203-23 orchestrator]# orchestrator-client -c topology -i zb-mongodb-test-203-23:3306
zb-mongodb-test-203-23:3306   [0s,ok,5.7.22-22-log,rw,ROW,>>]
+ zb-mongodb-test-203-24:3306 [0s,ok,5.7.22-22-log,rw,ROW,>>]
[root@zb-mongodb-test-203-23 orchestrator]# orchestrator-client -c topology -i zb-search-test-203-31:3306
zb-search-test-203-31:3306   [0s,ok,5.7.22-22-log,rw,ROW,>>,GTID]
+ zb-search-test-203-29:3306 [0s,ok,5.7.22-22-log,rw,ROW,>>,GTID]
+ zb-search-test-203-30:3306 [0s,ok,5.7.22-22-log,rw,ROW,>>,GTID]
[root@zb-mongodb-test-203-23 orchestrator]# orchestrator-client -c replication-analysis
[root@zb-mongodb-test-203-23 orchestrator]# 

And then I stopped the MySQL instance on zb-search-test-203-31 of cluster zb-search-test-203-31:3306:

[root@zb-mongodb-test-203-23 orchestrator]# orchestrator-client -c topology -i zb-search-test-203-31:3306
zb-search-test-203-31:3306 [unknown,invalid,5.7.22-22-log,rw,ROW,>>,downtimed]
[root@zb-mongodb-test-203-23 orchestrator]# orchestrator-client -c replication-analysis
zb-search-test-203-31:3306 (cluster zb-search-test-203-31:3306): DeadMasterWithoutSlaves

the recovery.log

2 Detected UnreachableMaster on zb-search-test-203-31:3306. Affected replicas: 2
2 Detected DeadMaster on zb-search-test-203-31:3306. Affected replicas: 2
3 Will recover from DeadMaster on zb-search-test-203-31:3306
4 Recovered from DeadMaster on zb-search-test-203-31:3306. Failed: zb-search-test-203-31:3306; Promoted: zb-search-test-203-30:3306
5 (for all types) Recovered from DeadMaster on zb-search-test-203-31:3306. Failed: zb-search-test-203-31:3306; Successor: zb-search-test-203-30:3306

the orchestrator.conf.json

{
  "Debug": true,
  "EnableSyslog": true,
  "ListenAddress": ":3000",
  "MySQLTopologyUser": "orchestrator",
  "MySQLTopologyPassword": "orch_topology_password",
  "MySQLTopologyCredentialsConfigFile": "",
  "MySQLTopologySSLPrivateKeyFile": "",
  "MySQLTopologySSLCertFile": "",
  "MySQLTopologySSLCAFile": "",
  "MySQLTopologySSLSkipVerify": true,
  "MySQLTopologyUseMutualTLS": false,
  "MySQLOrchestratorHost": "172.21.203.23",
  "MySQLOrchestratorPort": 3306,
  "MySQLOrchestratorDatabase": "orchestrator",
  "MySQLOrchestratorUser": "orchestrator",
  "MySQLOrchestratorPassword": "orch_topology_password",
  "MySQLOrchestratorCredentialsConfigFile": "",
  "MySQLOrchestratorSSLPrivateKeyFile": "",
  "MySQLOrchestratorSSLCertFile": "",
  "MySQLOrchestratorSSLCAFile": "",
  "MySQLOrchestratorSSLSkipVerify": true,
  "MySQLOrchestratorUseMutualTLS": false,
  "MySQLConnectTimeoutSeconds": 1,
  "MySQLTopologyReadTimeoutSeconds": 3,
  "MySQLDiscoveryReadTimeoutSeconds": 3,
  "DefaultInstancePort": 3306,
  "DiscoverByShowSlaveHosts": true,
  "InstancePollSeconds": 3,
  "UnseenInstanceForgetHours": 240,
  "SnapshotTopologiesIntervalHours": 0,
  "InstanceBulkOperationsWaitTimeoutSeconds": 10,
  "HostnameResolveMethod": "default",
  "MySQLHostnameResolveMethod": "@@hostname",
  "SkipBinlogServerUnresolveCheck": true,
  "SkipMaxScaleCheck":true,
  "ExpiryHostnameResolvesMinutes": 60,
  "RejectHostnameResolvePattern": "",
  "ReasonableReplicationLagSeconds": 10,
  "ProblemIgnoreHostnameFilters": [],
  "VerifyReplicationFilters": false,
  "ReasonableMaintenanceReplicationLagSeconds": 20,
  "CandidateInstanceExpireMinutes": 1440,
  "AuditLogFile": "",
  "AuditToSyslog": false,
  "RemoveTextFromHostnameDisplay": ":3306",
  "ReadOnly": false,
  "AuthenticationMethod": "",
  "HTTPAuthUser": "",
  "HTTPAuthPassword": "",
  "AuthUserHeader": "",
  "PowerAuthUsers": [
    "*"
  ],
  "ClusterNameToAlias": {
    "127.0.0.1": "test suite"
  },
  "SlaveLagQuery": "",
  "PhysicalEnvironmentPattern": "",
  "PromotionIgnoreHostnameFilters": [],
  "DetachLostReplicasAfterMasterFailover": true,
  "DetectSemiSyncEnforcedQuery": "SELECT 0 AS semisync FROM DUAL WHERE NOT EXISTS (SELECT 1 FROM performance_schema.global_variables WHERE VARIABLE_NAME = 'rpl_semi_sync_master_wait_no_slave' AND VARIABLE_VALUE = 'ON') UNION SELECT 1 FROM DUAL WHERE EXISTS (SELECT 1 FROM performance_schema.global_variables WHERE VARIABLE_NAME = 'rpl_semi_sync_master_wait_no_slave' AND VARIABLE_VALUE = 'ON')",
  "ServeAgentsHttp": false,
  "AgentsServerPort": ":3001",
  "AgentsUseSSL": false,
  "AgentsUseMutualTLS": false,
  "AgentSSLSkipVerify": false,
  "AgentSSLPrivateKeyFile": "",
  "AgentSSLCertFile": "",
  "AgentSSLCAFile": "",
  "AgentSSLValidOUs": [],
  "UseSSL": false,
  "UseMutualTLS": false,
  "SSLSkipVerify": false,
  "SSLPrivateKeyFile": "",
  "SSLCertFile": "",
  "SSLCAFile": "",
  "SSLValidOUs": [],
  "URLPrefix": "",
  "StatusEndpoint": "/api/status",
  "StatusSimpleHealth": true,
  "StatusOUVerify": false,
  "AgentPollMinutes": 60,
  "UnseenAgentForgetHours": 6,
  "StaleSeedFailMinutes": 60,
  "SeedAcceptableBytesDiff": 8192,
  "AutoPseudoGTID":false,
  "BinlogEventsChunkSize": 10000,
  "SkipBinlogEventsContaining": [],
  "ReduceReplicationAnalysisCount": true,
  "FailureDetectionPeriodBlockMinutes": 60,
  "RecoveryPeriodBlockSeconds": 31,
  "RecoveryIgnoreHostnameFilters": [],
  "RecoverMasterClusterFilters": ["*"],
  "RecoverIntermediateMasterClusterFilters": ["*"],
  "OnFailureDetectionProcesses": [
    "echo '2 Detected {failureType} on {failureCluster}. Affected replicas: {countSlaves}' >> /tmp/recovery.log"
  ],
  "PreGracefulTakeoverProcesses": [
    "echo '1 Planned takeover about to take place on {failureCluster}. Master will switch to read_only' >> /tmp/recovery.log"
  ],
  "PreFailoverProcesses": [
    "echo '3 Will recover from {failureType} on {failureCluster}' >> /tmp/recovery.log"
  ],
  "PostMasterFailoverProcesses": [
    "echo '4 Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Promoted: {successorHost}:{successorPort}' >> /tmp/recovery.log"
  ],
  "PostFailoverProcesses": [
    "echo '5 (for all types) Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Successor: {successorHost}:{successorPort}' >> /tmp/recovery.log"
  ],
  "PostUnsuccessfulFailoverProcesses": [
    "echo '8 >> /tmp/recovery.log'"
  ],
  "PostIntermediateMasterFailoverProcesses": [
    "echo '6 Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Successor: {successorHost}:{successorPort}' >> /tmp/recovery.log"
  ],
  "PostGracefulTakeoverProcesses": [
    "echo '7 Planned takeover complete' >> /tmp/recovery.log"
  ],
  "CoMasterRecoveryMustPromoteOtherCoMaster": true,
  "DetachLostSlavesAfterMasterFailover": true,
  "ApplyMySQLPromotionAfterMasterFailover": true,
  "PreventCrossDataCenterMasterFailover": false,
  "MasterFailoverDetachSlaveMasterHost": false,
  "MasterFailoverLostInstancesDowntimeMinutes": 0,
  "PostponeSlaveRecoveryOnLagMinutes": 0,
  "OSCIgnoreHostnameFilters": [],
  "GraphiteAddr": "",
  "GraphitePath": "",
  "GraphiteConvertHostnameDotsToUnderscores": true,

  "RaftEnabled": true,
  "BackendDB": "mysql",
  "RaftBind": "172.21.203.23",
  "RaftDataDir": "/var/lib/orchestrator",
  "DefaultRaftPort": 10008,
  "RaftNodes": [
    "172.21.203.23",
    "172.21.203.24",
    "172.21.203.33"
    ],
 "ConsulAddress": "",
 "ConsulAclToken": ""
}

the my.cnf

====
[client]
port            = 3306
socket          = /data/mysql/mysql3306/log/3306.sock
host            = localhost

[mysqld]
report_host = '172.21.203.31'
#skip-grant-tables
# generic configuration options
port            = 3306
socket          = /data/mysql/mysql3306/log/3306.sock
user            = mysql

basedir             = /usr/local/mysql
datadir             = /data/mysql/mysql3306/data
tmpdir              = /data/mysql/mysql3306/data
log-bin             = /data/mysql/mysql3306/log/3306-bin
log_error           = /data/mysql/mysql3306/log/error.err
pid-file            = /data/mysql/mysql3306/log/3306.pid
slow_query_log_file = /data/mysql/mysql3306/log/slowquery.log
relay_log           = /data/mysql/mysql3306/log/3306-relay-bin

innodb_buffer_pool_size=3072M  
#innodb_additional_mem_pool_size=20m
max_connections = 500
#innodb_numa_interleave=on
plugin_dir=/usr/local/mysql/lib/plugin
init_connect = 'SET NAMES utf8mb4'
character_set_server = utf8mb4
collation_server=utf8mb4_general_ci
skip-name-resolve
skip-slave-start
lower_case_table_names=1
character-set-client-handshake=0
sql_mode=ONLY_FULL_GROUP_BY,NO_AUTO_VALUE_ON_ZERO,STRICT_TRANS_TABLES,NO_AUTO_CREATE_USER,NO_ENGINE_SUBSTITUTION

expire_logs_days=7
back_log = 256
#skip-networking
table_open_cache_instances=16

max_connect_errors = 10
table_open_cache = 2048
#external-locking
max_allowed_packet = 1024M
metadata_locks_hash_instances=256
binlog_cache_size = 1M
max_heap_table_size = 16M
read_buffer_size = 2M
read_rnd_buffer_size = 1M
sort_buffer_size = 2M
join_buffer_size = 1M
thread_cache_size = 64
query_cache_type=0
query_cache_size = 0
query_cache_limit = 2M
ft_min_word_len = 4
default-storage-engine = innodb
thread_stack = 192K
#transaction_isolation = repeatable-read
transaction_isolation = READ-COMMITTED
tmp_table_size = 8M
binlog_format=row
slow_query_log
long_query_time = 1
#must unique id
server-id = 231

#*** MyISAM Specific options
key_buffer_size = 64M
bulk_insert_buffer_size = 64M
myisam_sort_buffer_size = 128M
myisam_max_sort_file_size = 2G
myisam_repair_threads = 1
myisam_recover_options
# *** INNODB Specific options ***
innodb_buffer_pool_instances = 2
innodb_data_file_path = ibdata1:512M;ibdata2:512M:autoextend
innodb_write_io_threads = 8
innodb_read_io_threads = 8
innodb_thread_concurrency = 4
innodb_flush_log_at_trx_commit = 2
innodb_log_buffer_size = 16M
innodb_log_file_size = 768M
innodb_log_files_in_group = 4
innodb_max_dirty_pages_pct = 90
innodb_flush_method=O_DIRECT
innodb_lock_wait_timeout = 120
#innodb_adaptive_hash_index_partitions = 1
innodb_checksum_algorithm=strict_crc32

autocommit=1
explicit_defaults_for_timestamp=on
auto_increment_increment=1
auto_increment_offset=1
concurrent_insert=1
connect_timeout=10
default_week_format=0
delayed_insert_limit=100
delayed_insert_timeout=300
delayed_queue_size=1000
delay_key_write=on
div_precision_increment=4
ft_query_expansion_limit=20
group_concat_max_len=1024
innodb_autoinc_lock_mode=1
innodb_concurrency_tickets=500
innodb_old_blocks_pct=37
innodb_old_blocks_time=0
innodb_open_files=65535
innodb_purge_batch_size=20
innodb_purge_threads=1
innodb_doublewrite=0
innodb_read_ahead_threshold=56
innodb_rollback_on_timeout=off
innodb_stats_method=nulls_equal
innodb_stats_on_metadata=off
innodb_stats_sample_pages=8
innodb_strict_mode=off
innodb_table_locks=on
innodb_file_per_table=1
innodb_thread_sleep_delay=10000
interactive_timeout=1800
key_cache_age_threshold=300
key_cache_block_size=1024
key_cache_division_limit=100
log_queries_not_using_indexes=off
low_priority_updates=0
net_read_timeout=30
net_retry_count=10
net_write_timeout=60
query_alloc_block_size=8192
query_prealloc_size=8192
slow_launch_time=2
table_definition_cache=5000
wait_timeout=60
slave_checkpoint_group=512
slave_checkpoint_period=300
slave_pending_jobs_size_max=64M
master_info_repository = TABLE
relay_log_info_repository = TABLE
relay_log_recovery = 1
#gtid_mode=OFF
#enforce_gtid_consistency=0

log_slave_updates

# sync remi
plugin_load = "rpl_semi_sync_master=semisync_master.so;rpl_semi_sync_slave=semisync_slave.so"
rpl_semi_sync_master_enabled = 1
rpl_semi_sync_master_timeout = 1000
rpl_semi_sync_slave_enabled = 1

plugin_dir=/usr/local/mysql/lib/mysql/plugin
log_timestamps = SYSTEM
slave_parallel_type = LOGICAL_CLOCK
slave_parallel_workers = 8
slave_preserve_commit_order = ON
innodb_undo_tablespaces=4
show_compatibility_56 = ON
#sync_relay_log = 1
sync_master_info = 1
sync_relay_log_info = 1
master-info-repository  = TABLE
relay-log-info-repository = TABLE
log-slave-updates = 1
relay_log_recovery = ON

#gtid
gtid-mode=ON
enforce_gtid_consistency=on

[mysqldump]
# Do not buffer the whole result set in memory before writing it to
# file. Required for dumping very large tables
quick

max_allowed_packet = 1024M

[mysqladmin]
user=zabbix
password=jfa;uata
socket          = /data/mysql/mysql3306/log/3306.sock
[mysql]
user=zabbix
password=jfa;uata
prompt=percona-5.7.22-22-->\u@\d>
socket          = /data/mysql/mysql3306/log/3306.sock
default-character-set = utf8mb4

# Only allow UPDATEs and DELETEs that use keys.
#safe-updates

[myisamchk]
key_buffer_size = 512M
sort_buffer_size = 32K
read_buffer = 8M
write_buffer = 8M

[mysqlhotcopy]
interactive-timeout

[mysqld_safe]
open-files-limit = 65535
shlomi-noach commented 4 years ago

Last question: on your replicas, can you run show slave status? I suspect you may have auto_position: 0. That is, you have GTID configured but not effectively _enabled_ on the replicas.

MonkeyFang commented 4 years ago

1. row Slave_IO_State: Waiting for master to send event Master_Host: 172.21.203.31 Master_User: repl Master_Port: 3306 Connect_Retry: 60 Master_Log_File: 3306-bin.000017 Read_Master_Log_Pos: 154 Relay_Log_File: 3306-relay-bin.000002 Relay_Log_Pos: 365 Relay_Master_Log_File: 3306-bin.000017 Slave_IO_Running: Yes Slave_SQL_Running: Yes Replicate_Do_DB: Replicate_Ignore_DB: Replicate_Do_Table: Replicate_Ignore_Table: Replicate_Wild_Do_Table: Replicate_Wild_Ignore_Table: Last_Errno: 0 Last_Error: Skip_Counter: 0 Exec_Master_Log_Pos: 154 Relay_Log_Space: 571 Until_Condition: None Until_Log_File: Until_Log_Pos: 0 Master_SSL_Allowed: No Master_SSL_CA_File: Master_SSL_CA_Path: Master_SSL_Cert: Master_SSL_Cipher: Master_SSL_Key: Seconds_Behind_Master: 0 Master_SSL_Verify_Server_Cert: No Last_IO_Errno: 0 Last_IO_Error: Last_SQL_Errno: 0 Last_SQL_Error: Replicate_Ignore_Server_Ids: Master_Server_Id: 231 Master_UUID: 1f593746-475f-11ea-a7b1-005056841937 Master_Info_File: mysql.slave_master_info SQL_Delay: 0 SQL_Remaining_Delay: NULL Slave_SQL_Running_State: Slave has read all relay log; waiting for more updates Master_Retry_Count: 86400 Master_Bind: Last_IO_Error_Timestamp: Last_SQL_Error_Timestamp: Master_SSL_Crl: Master_SSL_Crlpath: Retrieved_Gtid_Set: Executed_Gtid_Set: Auto_Position: 1 Replicate_Rewrite_DB: Channel_Name: Master_TLS_Version: 1 row in set (0.00 sec)

ERROR: No query specified

MonkeyFang commented 4 years ago

Do I need MASTER_AUTO_POSITION=0?

shlomi-noach commented 4 years ago

This looks good.

OK, last, can you start zb-search-test-203-31, make the topology good, then paste the JSON output of:

orchestrator-client -c api -path cluster/zb-search-test-203-31:3306
MonkeyFang commented 4 years ago

[root@zb-mongodb-test-203-23 mysql3306]# orchestrator-client -c api -path cluster/zb-search-test-203-31:3306 [ { "Key": { "Hostname": "zb-search-test-203-29", "Port": 3306 }, "InstanceAlias": "", "Uptime": 88119, "ServerID": 20329, "ServerUUID": "74e4a99d-4d5b-11ea-ace8-00505684bd9d", "Version": "5.7.22-22-log", "VersionComment": "Percona Server (GPL), Release 22, Revision f62d93c", "FlavorName": "Percona", "ReadOnly": false, "Binlog_format": "ROW", "BinlogRowImage": "FULL", "LogBinEnabled": true, "LogSlaveUpdatesEnabled": true, "SelfBinlogCoordinates": { "LogFile": "3306-bin.000008", "LogPos": 154, "Type": 0 }, "MasterKey": { "Hostname": "", "Port": 0 }, "MasterUUID": "", "AncestryUUID": "74e4a99d-4d5b-11ea-ace8-00505684bd9d", "IsDetachedMaster": false, "Slave_SQL_Running": false, "Slave_IO_Running": false, "ReplicationSQLThreadState": -1, "ReplicationIOThreadState": -1, "HasReplicationFilters": false, "GTIDMode": "ON", "SupportsOracleGTID": true, "UsingOracleGTID": false, "UsingMariaDBGTID": false, "UsingPseudoGTID": false, "ReadBinlogCoordinates": { "LogFile": "", "LogPos": 0, "Type": 0 }, "ExecBinlogCoordinates": { "LogFile": "", "LogPos": 0, "Type": 0 }, "IsDetached": false, "RelaylogCoordinates": { "LogFile": "", "LogPos": 0, "Type": 1 }, "LastSQLError": "", "LastIOError": "", "SecondsBehindMaster": { "Int64": 0, "Valid": false }, "SQLDelay": 0, "ExecutedGtidSet": "", "GtidPurged": "", "GtidErrant": "", "SlaveLagSeconds": { "Int64": 0, "Valid": false }, "SlaveHosts": [ { "Hostname": "zb-search-test-203-31", "Port": 3306 }, { "Hostname": "zb-search-test-203-30", "Port": 3306 } ], "ClusterName": "zb-search-test-203-29:3306", "SuggestedClusterAlias": "", "DataCenter": "", "PhysicalEnvironment": "", "ReplicationDepth": 0, "IsCoMaster": false, "HasReplicationCredentials": false, "ReplicationCredentialsAvailable": false, "SemiSyncEnforced": true, "SemiSyncMasterEnabled": true, "SemiSyncReplicaEnabled": false, "LastSeenTimestamp": "2020-02-13 16:11:41", 
"IsLastCheckValid": true, "IsUpToDate": true, "IsRecentlyChecked": true, "SecondsSinceLastSeen": { "Int64": 3, "Valid": true }, "CountMySQLSnapshots": 0, "IsCandidate": false, "PromotionRule": "neutral", "IsDowntimed": false, "DowntimeReason": "", "DowntimeOwner": "", "DowntimeEndTimestamp": "", "ElapsedDowntime": 0, "UnresolvedHostname": "", "AllowTLS": false, "LastDiscoveryLatency": 4230551 }, { "Key": { "Hostname": "zb-search-test-203-30", "Port": 3306 }, "InstanceAlias": "", "Uptime": 1832, "ServerID": 407, "ServerUUID": "121ed19e-475f-11ea-97ee-005056848280", "Version": "5.7.22-22-log", "VersionComment": "Percona Server (GPL), Release 22, Revision f62d93c", "FlavorName": "Percona", "ReadOnly": false, "Binlog_format": "ROW", "BinlogRowImage": "FULL", "LogBinEnabled": true, "LogSlaveUpdatesEnabled": true, "SelfBinlogCoordinates": { "LogFile": "3306-bin.000013", "LogPos": 154, "Type": 0 }, "MasterKey": { "Hostname": "zb-search-test-203-29", "Port": 3306 }, "MasterUUID": "74e4a99d-4d5b-11ea-ace8-00505684bd9d", "AncestryUUID": "74e4a99d-4d5b-11ea-ace8-00505684bd9d,121ed19e-475f-11ea-97ee-005056848280", "IsDetachedMaster": false, "Slave_SQL_Running": true, "Slave_IO_Running": true, "ReplicationSQLThreadState": 1, "ReplicationIOThreadState": 1, "HasReplicationFilters": false, "GTIDMode": "ON", "SupportsOracleGTID": true, "UsingOracleGTID": true, "UsingMariaDBGTID": false, "UsingPseudoGTID": false, "ReadBinlogCoordinates": { "LogFile": "3306-bin.000008", "LogPos": 154, "Type": 0 }, "ExecBinlogCoordinates": { "LogFile": "3306-bin.000008", "LogPos": 154, "Type": 0 }, "IsDetached": false, "RelaylogCoordinates": { "LogFile": "3306-relay-bin.000002", "LogPos": 365, "Type": 1 }, "LastSQLError": "", "LastIOError": "", "SecondsBehindMaster": { "Int64": 0, "Valid": true }, "SQLDelay": 0, "ExecutedGtidSet": "", "GtidPurged": "", "GtidErrant": "", "SlaveLagSeconds": { "Int64": 0, "Valid": true }, "SlaveHosts": [], "ClusterName": "zb-search-test-203-29:3306", 
"SuggestedClusterAlias": "", "DataCenter": "", "PhysicalEnvironment": "", "ReplicationDepth": 1, "IsCoMaster": false, "HasReplicationCredentials": true, "ReplicationCredentialsAvailable": true, "SemiSyncEnforced": true, "SemiSyncMasterEnabled": true, "SemiSyncReplicaEnabled": true, "LastSeenTimestamp": "2020-02-13 16:11:42", "IsLastCheckValid": true, "IsUpToDate": true, "IsRecentlyChecked": true, "SecondsSinceLastSeen": { "Int64": 2, "Valid": true }, "CountMySQLSnapshots": 0, "IsCandidate": false, "PromotionRule": "neutral", "IsDowntimed": false, "DowntimeReason": "", "DowntimeOwner": "", "DowntimeEndTimestamp": "", "ElapsedDowntime": 0, "UnresolvedHostname": "", "AllowTLS": false, "LastDiscoveryLatency": 4355600 }, { "Key": { "Hostname": "zb-search-test-203-31", "Port": 3306 }, "InstanceAlias": "", "Uptime": 3069, "ServerID": 231, "ServerUUID": "1f593746-475f-11ea-a7b1-005056841937", "Version": "5.7.22-22-log", "VersionComment": "Percona Server (GPL), Release 22, Revision f62d93c", "FlavorName": "Percona", "ReadOnly": false, "Binlog_format": "ROW", "BinlogRowImage": "FULL", "LogBinEnabled": true, "LogSlaveUpdatesEnabled": true, "SelfBinlogCoordinates": { "LogFile": "3306-bin.000018", "LogPos": 154, "Type": 0 }, "MasterKey": { "Hostname": "zb-search-test-203-29", "Port": 3306 }, "MasterUUID": "74e4a99d-4d5b-11ea-ace8-00505684bd9d", "AncestryUUID": "74e4a99d-4d5b-11ea-ace8-00505684bd9d,1f593746-475f-11ea-a7b1-005056841937", "IsDetachedMaster": false, "Slave_SQL_Running": true, "Slave_IO_Running": true, "ReplicationSQLThreadState": 1, "ReplicationIOThreadState": 1, "HasReplicationFilters": false, "GTIDMode": "ON", "SupportsOracleGTID": true, "UsingOracleGTID": false, "UsingMariaDBGTID": false, "UsingPseudoGTID": false, "ReadBinlogCoordinates": { "LogFile": "3306-bin.000008", "LogPos": 154, "Type": 0 }, "ExecBinlogCoordinates": { "LogFile": "3306-bin.000008", "LogPos": 154, "Type": 0 }, "IsDetached": false, "RelaylogCoordinates": { "LogFile": "3306-relay-bin.000002", 
"LogPos": 319, "Type": 1 }, "LastSQLError": "", "LastIOError": "", "SecondsBehindMaster": { "Int64": 0, "Valid": true }, "SQLDelay": 0, "ExecutedGtidSet": "", "GtidPurged": "", "GtidErrant": "", "SlaveLagSeconds": { "Int64": 0, "Valid": true }, "SlaveHosts": [], "ClusterName": "zb-search-test-203-29:3306", "SuggestedClusterAlias": "", "DataCenter": "", "PhysicalEnvironment": "", "ReplicationDepth": 1, "IsCoMaster": false, "HasReplicationCredentials": true, "ReplicationCredentialsAvailable": true, "SemiSyncEnforced": true, "SemiSyncMasterEnabled": false, "SemiSyncReplicaEnabled": true, "LastSeenTimestamp": "2020-02-13 16:11:42", "IsLastCheckValid": true, "IsUpToDate": true, "IsRecentlyChecked": true, "SecondsSinceLastSeen": { "Int64": 2, "Valid": true }, "CountMySQLSnapshots": 0, "IsCandidate": false, "PromotionRule": "neutral", "IsDowntimed": false, "DowntimeReason": "", "DowntimeOwner": "", "DowntimeEndTimestamp": "", "ElapsedDowntime": 0, "UnresolvedHostname": "", "AllowTLS": false, "LastDiscoveryLatency": 4112055 } ]

shlomi-noach commented 4 years ago

@MonkeyFang thank you -- pro tip, use three backticks to format your code; use `jq .` to prettify your JSON output. It's very difficult for me to keep formatting these comments and they're unreadable. Please consider how this looks when I try to investigate the issue. Screen Shot 2020-02-13 at 10 19 29

shlomi-noach commented 4 years ago

here's that JSON again, in readable form:

[
  {
    "Key": {
      "Hostname": "zb-search-test-203-29",
      "Port": 3306
    },
    "InstanceAlias": "",
    "Uptime": 88119,
    "ServerID": 20329,
    "ServerUUID": "74e4a99d-4d5b-11ea-ace8-00505684bd9d",
    "Version": "5.7.22-22-log",
    "VersionComment": "Percona Server (GPL), Release 22, Revision f62d93c",
    "FlavorName": "Percona",
    "ReadOnly": false,
    "Binlog_format": "ROW",
    "BinlogRowImage": "FULL",
    "LogBinEnabled": true,
    "LogSlaveUpdatesEnabled": true,
    "SelfBinlogCoordinates": {
      "LogFile": "3306-bin.000008",
      "LogPos": 154,
      "Type": 0
    },
    "MasterKey": {
      "Hostname": "",
      "Port": 0
    },
    "MasterUUID": "",
    "AncestryUUID": "74e4a99d-4d5b-11ea-ace8-00505684bd9d",
    "IsDetachedMaster": false,
    "Slave_SQL_Running": false,
    "Slave_IO_Running": false,
    "ReplicationSQLThreadState": -1,
    "ReplicationIOThreadState": -1,
    "HasReplicationFilters": false,
    "GTIDMode": "ON",
    "SupportsOracleGTID": true,
    "UsingOracleGTID": false,
    "UsingMariaDBGTID": false,
    "UsingPseudoGTID": false,
    "ReadBinlogCoordinates": {
      "LogFile": "",
      "LogPos": 0,
      "Type": 0
    },
    "ExecBinlogCoordinates": {
      "LogFile": "",
      "LogPos": 0,
      "Type": 0
    },
    "IsDetached": false,
    "RelaylogCoordinates": {
      "LogFile": "",
      "LogPos": 0,
      "Type": 1
    },
    "LastSQLError": "",
    "LastIOError": "",
    "SecondsBehindMaster": {
      "Int64": 0,
      "Valid": false
    },
    "SQLDelay": 0,
    "ExecutedGtidSet": "",
    "GtidPurged": "",
    "GtidErrant": "",
    "SlaveLagSeconds": {
      "Int64": 0,
      "Valid": false
    },
    "SlaveHosts": [
      {
        "Hostname": "zb-search-test-203-31",
        "Port": 3306
      },
      {
        "Hostname": "zb-search-test-203-30",
        "Port": 3306
      }
    ],
    "ClusterName": "zb-search-test-203-29:3306",
    "SuggestedClusterAlias": "",
    "DataCenter": "",
    "PhysicalEnvironment": "",
    "ReplicationDepth": 0,
    "IsCoMaster": false,
    "HasReplicationCredentials": false,
    "ReplicationCredentialsAvailable": false,
    "SemiSyncEnforced": true,
    "SemiSyncMasterEnabled": true,
    "SemiSyncReplicaEnabled": false,
    "LastSeenTimestamp": "2020-02-13 16:11:41",
    "IsLastCheckValid": true,
    "IsUpToDate": true,
    "IsRecentlyChecked": true,
    "SecondsSinceLastSeen": {
      "Int64": 3,
      "Valid": true
    },
    "CountMySQLSnapshots": 0,
    "IsCandidate": false,
    "PromotionRule": "neutral",
    "IsDowntimed": false,
    "DowntimeReason": "",
    "DowntimeOwner": "",
    "DowntimeEndTimestamp": "",
    "ElapsedDowntime": 0,
    "UnresolvedHostname": "",
    "AllowTLS": false,
    "LastDiscoveryLatency": 4230551
  },
  {
    "Key": {
      "Hostname": "zb-search-test-203-30",
      "Port": 3306
    },
    "InstanceAlias": "",
    "Uptime": 1832,
    "ServerID": 407,
    "ServerUUID": "121ed19e-475f-11ea-97ee-005056848280",
    "Version": "5.7.22-22-log",
    "VersionComment": "Percona Server (GPL), Release 22, Revision f62d93c",
    "FlavorName": "Percona",
    "ReadOnly": false,
    "Binlog_format": "ROW",
    "BinlogRowImage": "FULL",
    "LogBinEnabled": true,
    "LogSlaveUpdatesEnabled": true,
    "SelfBinlogCoordinates": {
      "LogFile": "3306-bin.000013",
      "LogPos": 154,
      "Type": 0
    },
    "MasterKey": {
      "Hostname": "zb-search-test-203-29",
      "Port": 3306
    },
    "MasterUUID": "74e4a99d-4d5b-11ea-ace8-00505684bd9d",
    "AncestryUUID": "74e4a99d-4d5b-11ea-ace8-00505684bd9d,121ed19e-475f-11ea-97ee-005056848280",
    "IsDetachedMaster": false,
    "Slave_SQL_Running": true,
    "Slave_IO_Running": true,
    "ReplicationSQLThreadState": 1,
    "ReplicationIOThreadState": 1,
    "HasReplicationFilters": false,
    "GTIDMode": "ON",
    "SupportsOracleGTID": true,
    "UsingOracleGTID": true,
    "UsingMariaDBGTID": false,
    "UsingPseudoGTID": false,
    "ReadBinlogCoordinates": {
      "LogFile": "3306-bin.000008",
      "LogPos": 154,
      "Type": 0
    },
    "ExecBinlogCoordinates": {
      "LogFile": "3306-bin.000008",
      "LogPos": 154,
      "Type": 0
    },
    "IsDetached": false,
    "RelaylogCoordinates": {
      "LogFile": "3306-relay-bin.000002",
      "LogPos": 365,
      "Type": 1
    },
    "LastSQLError": "",
    "LastIOError": "",
    "SecondsBehindMaster": {
      "Int64": 0,
      "Valid": true
    },
    "SQLDelay": 0,
    "ExecutedGtidSet": "",
    "GtidPurged": "",
    "GtidErrant": "",
    "SlaveLagSeconds": {
      "Int64": 0,
      "Valid": true
    },
    "SlaveHosts": [],
    "ClusterName": "zb-search-test-203-29:3306",
    "SuggestedClusterAlias": "",
    "DataCenter": "",
    "PhysicalEnvironment": "",
    "ReplicationDepth": 1,
    "IsCoMaster": false,
    "HasReplicationCredentials": true,
    "ReplicationCredentialsAvailable": true,
    "SemiSyncEnforced": true,
    "SemiSyncMasterEnabled": true,
    "SemiSyncReplicaEnabled": true,
    "LastSeenTimestamp": "2020-02-13 16:11:42",
    "IsLastCheckValid": true,
    "IsUpToDate": true,
    "IsRecentlyChecked": true,
    "SecondsSinceLastSeen": {
      "Int64": 2,
      "Valid": true
    },
    "CountMySQLSnapshots": 0,
    "IsCandidate": false,
    "PromotionRule": "neutral",
    "IsDowntimed": false,
    "DowntimeReason": "",
    "DowntimeOwner": "",
    "DowntimeEndTimestamp": "",
    "ElapsedDowntime": 0,
    "UnresolvedHostname": "",
    "AllowTLS": false,
    "LastDiscoveryLatency": 4355600
  },
  {
    "Key": {
      "Hostname": "zb-search-test-203-31",
      "Port": 3306
    },
    "InstanceAlias": "",
    "Uptime": 3069,
    "ServerID": 231,
    "ServerUUID": "1f593746-475f-11ea-a7b1-005056841937",
    "Version": "5.7.22-22-log",
    "VersionComment": "Percona Server (GPL), Release 22, Revision f62d93c",
    "FlavorName": "Percona",
    "ReadOnly": false,
    "Binlog_format": "ROW",
    "BinlogRowImage": "FULL",
    "LogBinEnabled": true,
    "LogSlaveUpdatesEnabled": true,
    "SelfBinlogCoordinates": {
      "LogFile": "3306-bin.000018",
      "LogPos": 154,
      "Type": 0
    },
    "MasterKey": {
      "Hostname": "zb-search-test-203-29",
      "Port": 3306
    },
    "MasterUUID": "74e4a99d-4d5b-11ea-ace8-00505684bd9d",
    "AncestryUUID": "74e4a99d-4d5b-11ea-ace8-00505684bd9d,1f593746-475f-11ea-a7b1-005056841937",
    "IsDetachedMaster": false,
    "Slave_SQL_Running": true,
    "Slave_IO_Running": true,
    "ReplicationSQLThreadState": 1,
    "ReplicationIOThreadState": 1,
    "HasReplicationFilters": false,
    "GTIDMode": "ON",
    "SupportsOracleGTID": true,
    "UsingOracleGTID": false,
    "UsingMariaDBGTID": false,
    "UsingPseudoGTID": false,
    "ReadBinlogCoordinates": {
      "LogFile": "3306-bin.000008",
      "LogPos": 154,
      "Type": 0
    },
    "ExecBinlogCoordinates": {
      "LogFile": "3306-bin.000008",
      "LogPos": 154,
      "Type": 0
    },
    "IsDetached": false,
    "RelaylogCoordinates": {
      "LogFile": "3306-relay-bin.000002",
      "LogPos": 319,
      "Type": 1
    },
    "LastSQLError": "",
    "LastIOError": "",
    "SecondsBehindMaster": {
      "Int64": 0,
      "Valid": true
    },
    "SQLDelay": 0,
    "ExecutedGtidSet": "",
    "GtidPurged": "",
    "GtidErrant": "",
    "SlaveLagSeconds": {
      "Int64": 0,
      "Valid": true
    },
    "SlaveHosts": [],
    "ClusterName": "zb-search-test-203-29:3306",
    "SuggestedClusterAlias": "",
    "DataCenter": "",
    "PhysicalEnvironment": "",
    "ReplicationDepth": 1,
    "IsCoMaster": false,
    "HasReplicationCredentials": true,
    "ReplicationCredentialsAvailable": true,
    "SemiSyncEnforced": true,
    "SemiSyncMasterEnabled": false,
    "SemiSyncReplicaEnabled": true,
    "LastSeenTimestamp": "2020-02-13 16:11:42",
    "IsLastCheckValid": true,
    "IsUpToDate": true,
    "IsRecentlyChecked": true,
    "SecondsSinceLastSeen": {
      "Int64": 2,
      "Valid": true
    },
    "CountMySQLSnapshots": 0,
    "IsCandidate": false,
    "PromotionRule": "neutral",
    "IsDowntimed": false,
    "DowntimeReason": "",
    "DowntimeOwner": "",
    "DowntimeEndTimestamp": "",
    "ElapsedDowntime": 0,
    "UnresolvedHostname": "",
    "AllowTLS": false,
    "LastDiscoveryLatency": 4112055
  }
]
MonkeyFang commented 4 years ago

I think I have found the reason: previously I had only set DetachLostSlavesAfterMasterFailover to true, but had not set DetachLostReplicasAfterMasterFailover. Once I set it, the operation was successful. Finally, could you please point me to a place where I can find information about the orchestrator metadata tables? I also could not find any information about the DetachLostReplicasAfterMasterFailover parameter in the documentation.

MonkeyFang commented 4 years ago

[root@zb-mongodb-test-203-23 mysql3306]# orchestrator-client -c api -path cluster/zb-search-test-203-31:3306 | jq [ { "Key": { "Hostname": "zb-search-test-203-29", "Port": 3306 }, "InstanceAlias": "", "Uptime": 88760, "ServerID": 20329, "ServerUUID": "74e4a99d-4d5b-11ea-ace8-00505684bd9d", "Version": "5.7.22-22-log", "VersionComment": "Percona Server (GPL), Release 22, Revision f62d93c", "FlavorName": "Percona", "ReadOnly": false, "Binlog_format": "ROW", "BinlogRowImage": "FULL", "LogBinEnabled": true, "LogSlaveUpdatesEnabled": true, "SelfBinlogCoordinates": { "LogFile": "3306-bin.000008", "LogPos": 154, "Type": 0 }, "MasterKey": { "Hostname": "", "Port": 0 }, "MasterUUID": "", "AncestryUUID": "74e4a99d-4d5b-11ea-ace8-00505684bd9d", "IsDetachedMaster": false, "Slave_SQL_Running": false, "Slave_IO_Running": false, "ReplicationSQLThreadState": -1, "ReplicationIOThreadState": -1, "HasReplicationFilters": false, "GTIDMode": "ON", "SupportsOracleGTID": true, "UsingOracleGTID": false, "UsingMariaDBGTID": false, "UsingPseudoGTID": false, "ReadBinlogCoordinates": { "LogFile": "", "LogPos": 0, "Type": 0 }, "ExecBinlogCoordinates": { "LogFile": "", "LogPos": 0, "Type": 0 }, "IsDetached": false, "RelaylogCoordinates": { "LogFile": "", "LogPos": 0, "Type": 1 }, "LastSQLError": "", "LastIOError": "", "SecondsBehindMaster": { "Int64": 0, "Valid": false }, "SQLDelay": 0, "ExecutedGtidSet": "", "GtidPurged": "", "GtidErrant": "", "SlaveLagSeconds": { "Int64": 0, "Valid": false }, "SlaveHosts": [ { "Hostname": "zb-search-test-203-30", "Port": 3306 }, { "Hostname": "zb-search-test-203-31", "Port": 3306 } ], "ClusterName": "zb-search-test-203-29:3306", "SuggestedClusterAlias": "", "DataCenter": "", "PhysicalEnvironment": "", "ReplicationDepth": 0, "IsCoMaster": false, "HasReplicationCredentials": false, "ReplicationCredentialsAvailable": false, "SemiSyncEnforced": true, "SemiSyncMasterEnabled": true, "SemiSyncReplicaEnabled": false, "LastSeenTimestamp": "2020-02-13 16:22:22", 
"IsLastCheckValid": true, "IsUpToDate": true, "IsRecentlyChecked": true, "SecondsSinceLastSeen": { "Int64": 0, "Valid": true }, "CountMySQLSnapshots": 0, "IsCandidate": false, "PromotionRule": "neutral", "IsDowntimed": false, "DowntimeReason": "", "DowntimeOwner": "", "DowntimeEndTimestamp": "", "ElapsedDowntime": 0, "UnresolvedHostname": "", "AllowTLS": false, "LastDiscoveryLatency": 6101748 }, { "Key": { "Hostname": "zb-search-test-203-30", "Port": 3306 }, "InstanceAlias": "", "Uptime": 2472, "ServerID": 407, "ServerUUID": "121ed19e-475f-11ea-97ee-005056848280", "Version": "5.7.22-22-log", "VersionComment": "Percona Server (GPL), Release 22, Revision f62d93c", "FlavorName": "Percona", "ReadOnly": false, "Binlog_format": "ROW", "BinlogRowImage": "FULL", "LogBinEnabled": true, "LogSlaveUpdatesEnabled": true, "SelfBinlogCoordinates": { "LogFile": "3306-bin.000013", "LogPos": 154, "Type": 0 }, "MasterKey": { "Hostname": "zb-search-test-203-29", "Port": 3306 }, "MasterUUID": "74e4a99d-4d5b-11ea-ace8-00505684bd9d", "AncestryUUID": "74e4a99d-4d5b-11ea-ace8-00505684bd9d,121ed19e-475f-11ea-97ee-005056848280", "IsDetachedMaster": false, "Slave_SQL_Running": true, "Slave_IO_Running": true, "ReplicationSQLThreadState": 1, "ReplicationIOThreadState": 1, "HasReplicationFilters": false, "GTIDMode": "ON", "SupportsOracleGTID": true, "UsingOracleGTID": true, "UsingMariaDBGTID": false, "UsingPseudoGTID": false, "ReadBinlogCoordinates": { "LogFile": "3306-bin.000008", "LogPos": 154, "Type": 0 }, "ExecBinlogCoordinates": { "LogFile": "3306-bin.000008", "LogPos": 154, "Type": 0 }, "IsDetached": false, "RelaylogCoordinates": { "LogFile": "3306-relay-bin.000002", "LogPos": 365, "Type": 1 }, "LastSQLError": "", "LastIOError": "", "SecondsBehindMaster": { "Int64": 0, "Valid": true }, "SQLDelay": 0, "ExecutedGtidSet": "", "GtidPurged": "", "GtidErrant": "", "SlaveLagSeconds": { "Int64": 0, "Valid": true }, "SlaveHosts": [], "ClusterName": "zb-search-test-203-29:3306", 
"SuggestedClusterAlias": "", "DataCenter": "", "PhysicalEnvironment": "", "ReplicationDepth": 1, "IsCoMaster": false, "HasReplicationCredentials": true, "ReplicationCredentialsAvailable": true, "SemiSyncEnforced": true, "SemiSyncMasterEnabled": true, "SemiSyncReplicaEnabled": true, "LastSeenTimestamp": "2020-02-13 16:22:22", "IsLastCheckValid": true, "IsUpToDate": true, "IsRecentlyChecked": true, "SecondsSinceLastSeen": { "Int64": 0, "Valid": true }, "CountMySQLSnapshots": 0, "IsCandidate": false, "PromotionRule": "neutral", "IsDowntimed": false, "DowntimeReason": "", "DowntimeOwner": "", "DowntimeEndTimestamp": "", "ElapsedDowntime": 0, "UnresolvedHostname": "", "AllowTLS": false, "LastDiscoveryLatency": 7191532 }, { "Key": { "Hostname": "zb-search-test-203-31", "Port": 3306 }, "InstanceAlias": "", "Uptime": 3709, "ServerID": 231, "ServerUUID": "1f593746-475f-11ea-a7b1-005056841937", "Version": "5.7.22-22-log", "VersionComment": "Percona Server (GPL), Release 22, Revision f62d93c", "FlavorName": "Percona", "ReadOnly": false, "Binlog_format": "ROW", "BinlogRowImage": "FULL", "LogBinEnabled": true, "LogSlaveUpdatesEnabled": true, "SelfBinlogCoordinates": { "LogFile": "3306-bin.000018", "LogPos": 154, "Type": 0 }, "MasterKey": { "Hostname": "zb-search-test-203-29", "Port": 3306 }, "MasterUUID": "74e4a99d-4d5b-11ea-ace8-00505684bd9d", "AncestryUUID": "74e4a99d-4d5b-11ea-ace8-00505684bd9d,1f593746-475f-11ea-a7b1-005056841937", "IsDetachedMaster": false, "Slave_SQL_Running": true, "Slave_IO_Running": true, "ReplicationSQLThreadState": 1, "ReplicationIOThreadState": 1, "HasReplicationFilters": false, "GTIDMode": "ON", "SupportsOracleGTID": true, "UsingOracleGTID": false, "UsingMariaDBGTID": false, "UsingPseudoGTID": false, "ReadBinlogCoordinates": { "LogFile": "3306-bin.000008", "LogPos": 154, "Type": 0 }, "ExecBinlogCoordinates": { "LogFile": "3306-bin.000008", "LogPos": 154, "Type": 0 }, "IsDetached": false, "RelaylogCoordinates": { "LogFile": "3306-relay-bin.000002", 
"LogPos": 319, "Type": 1 }, "LastSQLError": "", "LastIOError": "", "SecondsBehindMaster": { "Int64": 0, "Valid": true }, "SQLDelay": 0, "ExecutedGtidSet": "", "GtidPurged": "", "GtidErrant": "", "SlaveLagSeconds": { "Int64": 0, "Valid": true }, "SlaveHosts": [], "ClusterName": "zb-search-test-203-29:3306", "SuggestedClusterAlias": "", "DataCenter": "", "PhysicalEnvironment": "", "ReplicationDepth": 1, "IsCoMaster": false, "HasReplicationCredentials": true, "ReplicationCredentialsAvailable": true, "SemiSyncEnforced": true, "SemiSyncMasterEnabled": false, "SemiSyncReplicaEnabled": true, "LastSeenTimestamp": "2020-02-13 16:22:22", "IsLastCheckValid": true, "IsUpToDate": true, "IsRecentlyChecked": true, "SecondsSinceLastSeen": { "Int64": 0, "Valid": true }, "CountMySQLSnapshots": 0, "IsCandidate": false, "PromotionRule": "neutral", "IsDowntimed": false, "DowntimeReason": "", "DowntimeOwner": "", "DowntimeEndTimestamp": "", "ElapsedDowntime": 0, "UnresolvedHostname": "", "AllowTLS": false, "LastDiscoveryLatency": 6531977 } ]

shlomi-noach commented 4 years ago
  1. Sorry, I lost you. I'll be honest that I find it hard to follow up and the text is unclear. Can you please clarify?

  2. The JSON you pasted shows zb-search-test-203-29 to be the master. In your previous comments zb-search-test-203-31 was the master; let's please sync on a single working scenario, again, this confuses me very much.

MonkeyFang commented 4 years ago

OK, I'm sorry. I made many other attempts; please give me a little time.

MonkeyFang commented 4 years ago

[root@zb-mongodb-test-203-23 ~]# orchestrator-client -c api -path cluster/zb-search-test-203-31:3306 | jq [ { "Key": { "Hostname": "zb-search-test-203-29", "Port": 3306 }, "InstanceAlias": "", "Uptime": 89236, "ServerID": 20329, "ServerUUID": "74e4a99d-4d5b-11ea-ace8-00505684bd9d", "Version": "5.7.22-22-log", "VersionComment": "Percona Server (GPL), Release 22, Revision f62d93c", "FlavorName": "Percona", "ReadOnly": true, "Binlog_format": "ROW", "BinlogRowImage": "FULL", "LogBinEnabled": true, "LogSlaveUpdatesEnabled": true, "SelfBinlogCoordinates": { "LogFile": "3306-bin.000008", "LogPos": 154, "Type": 0 }, "MasterKey": { "Hostname": "zb-search-test-203-31", "Port": 3306 }, "MasterUUID": "", "AncestryUUID": "1f593746-475f-11ea-a7b1-005056841937,74e4a99d-4d5b-11ea-ace8-00505684bd9d", "IsDetachedMaster": false, "Slave_SQL_Running": false, "Slave_IO_Running": false, "ReplicationSQLThreadState": 0, "ReplicationIOThreadState": 0, "HasReplicationFilters": false, "GTIDMode": "ON", "SupportsOracleGTID": true, "UsingOracleGTID": false, "UsingMariaDBGTID": false, "UsingPseudoGTID": false, "ReadBinlogCoordinates": { "LogFile": "3306-bin.000019", "LogPos": 154, "Type": 0 }, "ExecBinlogCoordinates": { "LogFile": "3306-bin.000019", "LogPos": 154, "Type": 0 }, "IsDetached": false, "RelaylogCoordinates": { "LogFile": "3306-relay-bin.000001", "LogPos": 4, "Type": 1 }, "LastSQLError": "", "LastIOError": "", "SecondsBehindMaster": { "Int64": 0, "Valid": false }, "SQLDelay": 0, "ExecutedGtidSet": "", "GtidPurged": "", "GtidErrant": "", "SlaveLagSeconds": { "Int64": 0, "Valid": false }, "SlaveHosts": [], "ClusterName": "zb-search-test-203-31:3306", "SuggestedClusterAlias": "", "DataCenter": "", "PhysicalEnvironment": "", "ReplicationDepth": 1, "IsCoMaster": false, "HasReplicationCredentials": true, "ReplicationCredentialsAvailable": true, "SemiSyncEnforced": true, "SemiSyncMasterEnabled": false, "SemiSyncReplicaEnabled": false, "LastSeenTimestamp": "2020-02-13 16:30:18", 
"IsLastCheckValid": true, "IsUpToDate": true, "IsRecentlyChecked": true, "SecondsSinceLastSeen": { "Int64": 2, "Valid": true }, "CountMySQLSnapshots": 0, "IsCandidate": false, "PromotionRule": "neutral", "IsDowntimed": true, "DowntimeReason": "lost-in-recovery", "DowntimeOwner": "zb-search-test-203-33", "DowntimeEndTimestamp": "2021-02-12 16:30:19", "ElapsedDowntime": 73000000000, "UnresolvedHostname": "", "AllowTLS": false, "LastDiscoveryLatency": 4772712 }, { "Key": { "Hostname": "zb-search-test-203-30", "Port": 3306 }, "InstanceAlias": "", "Uptime": 2950, "ServerID": 407, "ServerUUID": "121ed19e-475f-11ea-97ee-005056848280", "Version": "5.7.22-22-log", "VersionComment": "Percona Server (GPL), Release 22, Revision f62d93c", "FlavorName": "Percona", "ReadOnly": false, "Binlog_format": "ROW", "BinlogRowImage": "FULL", "LogBinEnabled": true, "LogSlaveUpdatesEnabled": true, "SelfBinlogCoordinates": { "LogFile": "3306-bin.000013", "LogPos": 154, "Type": 0 }, "MasterKey": { "Hostname": "zb-search-test-203-31", "Port": 3306 }, "MasterUUID": "1f593746-475f-11ea-a7b1-005056841937", "AncestryUUID": "1f593746-475f-11ea-a7b1-005056841937,121ed19e-475f-11ea-97ee-005056848280", "IsDetachedMaster": false, "Slave_SQL_Running": true, "Slave_IO_Running": true, "ReplicationSQLThreadState": 1, "ReplicationIOThreadState": 1, "HasReplicationFilters": false, "GTIDMode": "ON", "SupportsOracleGTID": true, "UsingOracleGTID": true, "UsingMariaDBGTID": false, "UsingPseudoGTID": false, "ReadBinlogCoordinates": { "LogFile": "3306-bin.000019", "LogPos": 154, "Type": 0 }, "ExecBinlogCoordinates": { "LogFile": "3306-bin.000019", "LogPos": 154, "Type": 0 }, "IsDetached": false, "RelaylogCoordinates": { "LogFile": "3306-relay-bin.000002", "LogPos": 365, "Type": 1 }, "LastSQLError": "", "LastIOError": "", "SecondsBehindMaster": { "Int64": 0, "Valid": true }, "SQLDelay": 0, "ExecutedGtidSet": "", "GtidPurged": "", "GtidErrant": "", "SlaveLagSeconds": { "Int64": 0, "Valid": true }, "SlaveHosts": [], 
"ClusterName": "zb-search-test-203-31:3306", "SuggestedClusterAlias": "", "DataCenter": "", "PhysicalEnvironment": "", "ReplicationDepth": 1, "IsCoMaster": false, "HasReplicationCredentials": true, "ReplicationCredentialsAvailable": true, "SemiSyncEnforced": true, "SemiSyncMasterEnabled": false, "SemiSyncReplicaEnabled": true, "LastSeenTimestamp": "2020-02-13 16:30:20", "IsLastCheckValid": true, "IsUpToDate": true, "IsRecentlyChecked": true, "SecondsSinceLastSeen": { "Int64": 0, "Valid": true }, "CountMySQLSnapshots": 0, "IsCandidate": false, "PromotionRule": "neutral", "IsDowntimed": false, "DowntimeReason": "", "DowntimeOwner": "", "DowntimeEndTimestamp": "", "ElapsedDowntime": 0, "UnresolvedHostname": "", "AllowTLS": false, "LastDiscoveryLatency": 4600162 }, { "Key": { "Hostname": "zb-search-test-203-31", "Port": 3306 }, "InstanceAlias": "", "Uptime": 4187, "ServerID": 231, "ServerUUID": "1f593746-475f-11ea-a7b1-005056841937", "Version": "5.7.22-22-log", "VersionComment": "Percona Server (GPL), Release 22, Revision f62d93c", "FlavorName": "Percona", "ReadOnly": false, "Binlog_format": "ROW", "BinlogRowImage": "FULL", "LogBinEnabled": true, "LogSlaveUpdatesEnabled": true, "SelfBinlogCoordinates": { "LogFile": "3306-bin.000019", "LogPos": 154, "Type": 0 }, "MasterKey": { "Hostname": "", "Port": 0 }, "MasterUUID": "", "AncestryUUID": "1f593746-475f-11ea-a7b1-005056841937", "IsDetachedMaster": false, "Slave_SQL_Running": false, "Slave_IO_Running": false, "ReplicationSQLThreadState": -1, "ReplicationIOThreadState": -1, "HasReplicationFilters": false, "GTIDMode": "ON", "SupportsOracleGTID": true, "UsingOracleGTID": false, "UsingMariaDBGTID": false, "UsingPseudoGTID": false, "ReadBinlogCoordinates": { "LogFile": "", "LogPos": 0, "Type": 0 }, "ExecBinlogCoordinates": { "LogFile": "", "LogPos": 0, "Type": 0 }, "IsDetached": false, "RelaylogCoordinates": { "LogFile": "", "LogPos": 0, "Type": 1 }, "LastSQLError": "", "LastIOError": "", "SecondsBehindMaster": { "Int64": 0, 
"Valid": false }, "SQLDelay": 0, "ExecutedGtidSet": "", "GtidPurged": "", "GtidErrant": "", "SlaveLagSeconds": { "Int64": 0, "Valid": false }, "SlaveHosts": [ { "Hostname": "zb-search-test-203-30", "Port": 3306 } ], "ClusterName": "zb-search-test-203-31:3306", "SuggestedClusterAlias": "", "DataCenter": "", "PhysicalEnvironment": "", "ReplicationDepth": 0, "IsCoMaster": false, "HasReplicationCredentials": false, "ReplicationCredentialsAvailable": false, "SemiSyncEnforced": true, "SemiSyncMasterEnabled": true, "SemiSyncReplicaEnabled": false, "LastSeenTimestamp": "2020-02-13 16:30:20", "IsLastCheckValid": true, "IsUpToDate": true, "IsRecentlyChecked": true, "SecondsSinceLastSeen": { "Int64": 0, "Valid": true }, "CountMySQLSnapshots": 0, "IsCandidate": false, "PromotionRule": "neutral", "IsDowntimed": false, "DowntimeReason": "", "DowntimeOwner": "", "DowntimeEndTimestamp": "", "ElapsedDowntime": 0, "UnresolvedHostname": "", "AllowTLS": false, "LastDiscoveryLatency": 4658914 } ]

MonkeyFang commented 4 years ago

I have confirmed that all configuration remains the same; I just changed the orchestrator version from 3.0.14 to 3.1.4. [root@zb-mongodb-test-203-23 orchestrator]# orchestrator-client -c replication-analysis zb-search-test-203-31:3306 (cluster zb-search-test-203-31:3306): NoFailoverSupportStructureWarning

Note: the orchestrator backend database had already been completely recreated.

In this situation, when the master dies, the other slave cannot point to the new master.

shlomi-noach commented 4 years ago

I'm not sure what's up with your setup, but this is suspicious:

"ExecutedGtidSet": "",
"GtidPurged": "",

how is it possible you're using GTID and have a completely empty set?

Sorry, I'm not sure I can assist much more on the orchestrator side, but this seems like the issue.

shlomi-noach commented 4 years ago

closing as stale