I have a leader selection program, I test the code and find a problem, here is it, if I cut the network, the session will timeout(Expected), but the client does't reconnect to server even the network get normal but throw SessionExpired error. That really confuse me. The code is as follows.
class BaseZKWatcher(object):
def __init__(self, path):
self.path = path
self.zk = KazooClient(config.zookeeper_server, timeout=10)
self.zk.start()
self.zk.add_listener(self.asger_listener)
def status(self):
return self.zk.state
def create_node(self):
pass
def do_action(self):
pass
def asger_listener(self, state):
if state == KazooState.LOST:
logger.info('zookeeper connection timeout, state is LOST')
while True: # I want to recreate the node and check the state
try:
time.sleep(10)
self.create_node()
self.zk.get_children(path=self.path, watch=self.asger_watcher)
logger.info('reconnect zookeeper completed')
break
except Exception, _:
traceback.print_exc()
elif state == KazooState.SUSPENDED:
logger.info('zookeeper connection timeout, state is SUSPENDED')
elif state == KazooState.CONNECTED:
logger.info('zookeeper connected, state is CONNECTED')
else:
logger.info('unknown state')
def asger_watcher(self, event):
if event.state == "CONNECTED" and event.type == "CREATED" or event.type == "DELETED" or event.type == "CHANGED" or event.type == "CHILD":
logger.info('child node change event occurred')
self.do_action()
else:
logger.info('unidentified event occurred')
class HAMaster(BaseZKWatcher):
def __init__(self, path):
super(HAMaster, self).__init__(path)
self.hostname = socket.gethostname()
self.is_leader = False
def create_node(self):
node = self.path + '/node'
try:
self.zk.create(path=node, value=self.hostname, ephemeral=True, sequence=True, makepath=True)
except NodeExistsError as e:
logger.info(str(e))
def do_action(self):
logger.info('node %s start select master' % self.hostname)
node_list = self.zk.get_children(path=self.path, watch=self.asger_watcher)
leader_node = min(node_list)
leader_hostname = self.zk.get(self.path + '/' + leader_node)[0]
if leader_hostname == self.hostname:
if not self.is_leader:
self.is_leader = True
logger.info('node %s is selected as master, it was not master before, scheduler start' % self.hostname)
else:
logger.info('node %s is selected as master, it was master before' % self.hostname)
else:
if self.is_leader:
self.is_leader = False
logger.info('node %s is selected as slave, it was master before, scheduler shut down' % self.hostname)
else:
logger.info('node %s is selected as slave, it was not master before' % self.hostname)
logger.info('node %s select master finished' % self.hostname)
with open(config.is_leader_file, 'w') as f:
f.write(str(self.is_leader))
If we lost the connection, I think client will reconnect to the server and I try to recreate the node and check the state in the while loop. But, seems the client will not reconnect to the server.
if state == KazooState.LOST:
logger.info('zookeeper connection timeout, state is LOST')
while True: # I want to recreate the node and check the state
try:
time.sleep(10)
self.create_node()
self.zk.get_children(path=self.path, watch=self.asger_watcher)
logger.info('reconnect zookeeper completed')
break
except Exception, _:
traceback.print_exc()
I have a leader selection program, I test the code and find a problem, here is it, if I cut the network, the session will timeout(Expected), but the client does't reconnect to server even the network get normal but throw SessionExpired error. That really confuse me. The code is as follows.
If we lost the connection, I think client will reconnect to the server and I try to recreate the node and check the state in the while loop. But, seems the client will not reconnect to the server.