python-zk / kazoo

Kazoo is a high-level Python library that makes it easier to use Apache Zookeeper.
https://kazoo.readthedocs.io
Apache License 2.0
1.3k stars 387 forks source link

Kazoo client doesn't reconnect to server automatically if session timeout due to network interruption #600

Open LiuPeien opened 4 years ago

LiuPeien commented 4 years ago

I have a leader-election program. While testing it I found a problem: if I cut the network, the session times out (as expected), but the client doesn't reconnect to the server even after the network is back to normal; instead it throws a SessionExpired error. That really confuses me. The code is as follows.

class BaseZKWatcher(object):
    """Base class that watches the children of a ZooKeeper path.

    The constructor opens a Kazoo client, starts it and registers
    :meth:`asger_listener` as the connection-state listener. Subclasses
    supply the actual behavior by overriding :meth:`create_node` and
    :meth:`do_action`, which are no-ops here.
    """

    # Seconds to wait before each attempt to rebuild state after the
    # session has been lost.
    RECOVERY_RETRY_INTERVAL = 10

    # Event types that are treated as "the watched children changed".
    CHILD_CHANGE_EVENT_TYPES = ("CREATED", "DELETED", "CHANGED", "CHILD")

    def __init__(self, path):
        """Connect to ZooKeeper and start listening for state changes.

        :param path: ZooKeeper path whose children are watched.
        """
        self.path = path
        self.zk = KazooClient(config.zookeeper_server, timeout=10)
        self.zk.start()
        self.zk.add_listener(self.asger_listener)

    def status(self):
        """Return the current connection state reported by the client."""
        return self.zk.state

    def create_node(self):
        """Create this participant's node; overridden by subclasses."""
        pass

    def do_action(self):
        """React to a change of the watched children; overridden by subclasses."""
        pass

    def asger_listener(self, state):
        """Log connection-state changes and schedule recovery on LOST.

        Kazoo's documentation requires that state listeners do not block.
        The recovery work (sleeping, recreating the node, re-registering
        the watch) is therefore handed to ``zk.handler.spawn`` instead of
        being executed inside the listener itself.

        :param state: the new ``KazooState`` value.
        """
        if state == KazooState.LOST:
            logger.info('zookeeper connection timeout, state is LOST')
            self.zk.handler.spawn(self._recover_session)
        elif state == KazooState.SUSPENDED:
            logger.info('zookeeper connection timeout, state is SUSPENDED')
        elif state == KazooState.CONNECTED:
            logger.info('zookeeper connected, state is CONNECTED')
        else:
            logger.info('unknown state')

    def _recover_session(self):
        """Retry until the node is recreated and the child watch is re-set.

        Runs outside the listener so the client is free to re-establish
        its connection while this loop is waiting. Every failed attempt
        prints its traceback and is retried after the retry interval.
        """
        while True:
            time.sleep(self.RECOVERY_RETRY_INTERVAL)
            try:
                self.create_node()
                self.zk.get_children(path=self.path, watch=self.asger_watcher)
            except Exception:
                # Best effort: keep retrying until ZooKeeper is reachable.
                traceback.print_exc()
            else:
                logger.info('reconnect zookeeper completed')
                return

    @staticmethod
    def _is_child_change(event):
        """Return True when *event* is a child change seen while connected.

        The connected check applies to every event type. The original
        unparenthesized ``and``/``or`` chain only applied it to
        ``CREATED``.
        """
        return (event.state == "CONNECTED"
                and event.type in BaseZKWatcher.CHILD_CHANGE_EVENT_TYPES)

    def asger_watcher(self, event):
        """Watch callback: run :meth:`do_action` on child-change events.

        :param event: watch event exposing ``state`` and ``type``.
        """
        if self._is_child_change(event):
            logger.info('child node change event occurred')
            self.do_action()
        else:
            logger.info('unidentified event occurred')

class HAMaster(BaseZKWatcher):
    """Leader election participant identified by its hostname.

    Each participant creates an ephemeral sequential node holding its
    hostname under the watched path. The participant whose hostname is
    stored in the smallest child node is considered the master, and the
    result is written to ``config.is_leader_file``.
    """

    def __init__(self, path):
        """Connect via the base class and record this host's name.

        :param path: ZooKeeper path under which candidates register.
        """
        super(HAMaster, self).__init__(path)
        self.hostname = socket.gethostname()
        self.is_leader = False

    @staticmethod
    def _to_bytes(value):
        """Return *value* as a byte string (``None`` becomes ``b''``).

        Kazoo stores and returns node data as bytes. Normalizing both
        sides keeps the hostname comparison meaningful on Python 3, and
        it is a no-op for Python 2 ``str`` values.
        """
        if value is None:
            return b''
        if isinstance(value, bytes):
            return value
        return value.encode('utf-8')

    def create_node(self):
        """Register this host as a candidate under the watched path."""
        node = self.path + '/node'
        try:
            self.zk.create(path=node, value=self._to_bytes(self.hostname),
                           ephemeral=True, sequence=True, makepath=True)
        except NodeExistsError as e:
            logger.info(str(e))

    def do_action(self):
        """Re-run the election and persist whether this host is master.

        Re-registers the child watch, picks the smallest child node as the
        leader and compares its stored hostname with this host's name. If
        there are no candidate nodes at all, this host is treated as not
        being master instead of failing on ``min()`` of an empty list.
        """
        logger.info('node %s start select master' % self.hostname)
        node_list = self.zk.get_children(path=self.path, watch=self.asger_watcher)

        if node_list:
            leader_node = min(node_list)
            leader_hostname = self.zk.get(self.path + '/' + leader_node)[0]
            elected = self._to_bytes(leader_hostname) == self._to_bytes(self.hostname)
        else:
            logger.info('no candidate node found under %s' % self.path)
            elected = False

        if elected:
            if not self.is_leader:
                self.is_leader = True
                logger.info('node %s is selected as master, it was not master before, scheduler start' % self.hostname)
            else:
                logger.info('node %s is selected as master, it was master before' % self.hostname)
        else:
            if self.is_leader:
                self.is_leader = False
                logger.info('node %s is selected as slave, it was master before, scheduler shut down' % self.hostname)
            else:
                logger.info('node %s is selected as slave, it was not master before' % self.hostname)
        logger.info('node %s select master finished' % self.hostname)

        with open(config.is_leader_file, 'w') as f:
            f.write(str(self.is_leader))

If the connection is lost, I expect the client to reconnect to the server, so I try to recreate the node and check the state in the while loop. However, it seems the client does not reconnect to the server.

if state == KazooState.LOST:
    logger.info('zookeeper connection timeout, state is LOST')
    # NOTE(review): this loop runs inside the state listener; Kazoo's
    # documentation says listeners must not block, so this is the likely
    # reason the client never reconnects - confirm, and consider moving
    # the loop to zk.handler.spawn(...).
    while True:  # I want to recreate the node and check the state
        try:
            time.sleep(10)
            self.create_node()
            self.zk.get_children(path=self.path, watch=self.asger_watcher)
            logger.info('reconnect zookeeper completed')
            break
        except Exception:
            # Python 3 compatible form of the original "except Exception, _".
            traceback.print_exc()
StephenSorriaux commented 4 years ago

Hello,

Can you please specify the Kazoo version you are using? I think this has been fixed since version 2.7.0 of Kazoo.

Alex-wwei commented 3 years ago

Hello, I have the same question with version 2.8.0 of Kazoo. Is there a solution?