#!/usr/bin/python
#
# Copyright 2016 Pinterest, Inc
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Utilities for using gevent based kazoo zk client.

This module contains a KazooClientManager, which provides a KazooClient,
a high level zookeeper client; it also contains a ServerSet implementation,
which can be used to participate in a server set or to monitor changes to
a server set.

To join a server set::

    server_set = ServerSet("/.../path_to_root_znode", zk_host_list)
    server_set.join(8080)

The sample code above tries to join the server set in the background and
keeps retrying whenever the zookeeper client has trouble communicating
with zookeeper; once the connection with zookeeper is re-established, the
endpoint joins the server set again. However, if you want to try to join
a server set once, synchronously check whether you succeeded, and apply
some custom logic based on the outcome, you can do the following
instead::

    try:
        g = server_set.join(8080, keep_retrying=False)
        g.get()  # if it fails, an exception will be thrown here
    except Exception:
        ...

Note that in the code above, keep_retrying is passed in as False, which
means joining the server set is only tried once. If it succeeds, the
server set keeps maintaining the endpoint's membership; if it fails, the
exception is raised to the containing greenlet, and the caller can handle
it in whatever way is desired; membership in the server set is not
maintained by the server set in this case.

To monitor a server set, provide a callback function that accepts a
generator of all endpoints in the server set, and then simply do the
following::

    def on_change(endpoints):
        for endpoint in endpoints:
            ...
        ...

    server_set.monitor(on_change)

Similarly the caller can also choose to try to place the monitor only once::

    try:
        g = server_set.monitor(on_change, keep_retrying=False)
        g.get()
    except Exception:
        ...  # e.g. fall back to a default list of endpoints, and then
        # have the server set keep retrying in the background
        server_set.monitor(on_change)

As for DataWatcher, one can use it in a similar way as monitoring a server
set, except that the callback should expect two parameters: the value of
the znode and the stat of the znode. The stat parameter carries the
version number of the corresponding znode, which can be used in the
callback to discard out-of-order notifications, should that ever happen.

For example::

    def on_change(value, stat):
        ...

    watcher = DataWatcher("/.../path_to_znode", zk_hosts)
    watcher.watch(on_change)
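
To guard against out-of-order notifications, the callback can compare
stat.version against the last version it has seen, for example (a sketch;
last_version is a hypothetical single-element holder)::

    last_version = [-1]

    def on_change(value, stat):
        # Discard notifications that arrive out of order.
        if stat.version <= last_version[0]:
            return
        last_version[0] = stat.version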

If you want to do it synchronously, you can do the following instead::

    try:
        g = watcher.watch(on_change, keep_retrying=False)
        g.get()
    except Exception:
       ...  # Deal with the exception.

"""
from collections import namedtuple
import functools
import logging
import os
import random
import sys
import socket
import time

import gevent
from decorators import SingletonMetaclass
from kazoo.client import KazooClient
from kazoo.handlers.gevent import SequentialGeventHandler
from kazoo.protocol.states import KazooState
from utils import dummy_statsd
from utils import hostname
from utils import _escape_path_for_stats_name
from file_watch import FileWatch

log = logging.getLogger(__name__)


class NoClientException(Exception):
    """Special exception when the underlying client is ``None``."""
    pass


class KazooClientManager(object):
    """Singleton manager for maintaining a healthy underlying zk client."""

    __metaclass__ = SingletonMetaclass

    MIN_HEALTH_LOG_INTERVAL_IN_SECS = 60

    def __init__(self, zk_hosts, max_num_consecutive_failures=3,
                 health_check_interval=30, sc=dummy_statsd, start_timeout=25.0,
                 session_timeout=16.0):

        """Constructor.

        Args:
            zk_hosts: a list of host:port. For example,
                the list should have the following format:
                ['observerzookeeper010:2181',
                 'observerzookeeper011:2181',
                 'observerzookeeper012:2181',
                 'observerzookeeper013:2181',
                 'observerzookeeper014:2181',
                 'observerzookeeper015:2181']
            max_num_consecutive_failures: the max number of consecutive
                failures that can be tolerated by the buddy health checking
                greenlet before it attempts to recreate a new connection.
            health_check_interval: the number of seconds between health
                checks.
            sc: stats_client. See example implementation in utils.py. You can
                plug the stats_client with any stats backend, for example tsd.
            start_timeout: the start timeout (in float) of kazoo client
                connecting to Zookeeper
            session_timeout: a timeout (in float) for zookeeper sessions.
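
        Example, a minimal construction sketch (the hostnames below are
        placeholders)::

            manager = KazooClientManager(['zk001:2181', 'zk002:2181'])
            client = manager.get_client()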

        """
        self.zk_hosts = zk_hosts
        self.start_timeout = start_timeout
        self.session_timeout = session_timeout
        # The list keeping track of the callbacks to be invoked when the
        # client has to be torn down and recreated.
        self._client_callbacks = []
        self._max_num_consecutive_failures = max_num_consecutive_failures
        self._health_check_interval = health_check_interval
        self._last_success_health_check_ts = time.time()
        self._last_healthy_log_ts = None
        self._is_destroyed = False
        self._sc = sc

        # Trying to connect to zookeeper.
        self._start("Failed to connect to zk.", True)

    def is_current_client(self, client):
        """Check whether the client held by the caller is still current.

        Args:
            client: the kazoo client checked out by the caller.

        Returns:
            True if it is the current underlying client held by the
            manager; otherwise, False.

        """
        return client is self._client

    def get_client(self, no_throw=True):
        """Get the current underlying client, could be None.

        Args:
            no_throw: whether to suppress the exception when the underlying
                client is None.

        Returns:
            The underlying client kept by this manager.

        Throws:
            If no_throw is False, raise NoClientException when the
            underlying client is None.

        """
        if self._client is None and not no_throw:
            raise NoClientException()

        return self._client

    def on_client_change(self, func):
        """"Register a callback when the client has to be re-established.

        The underlying client is only stopped and recreated when it is messed
        up so badly that it cannot be repaired. In this case, we will have to
        tear down the existing client and try to recreate a new connection.
        Note that this should only happen very very rarely.

        Args:
            func: the callback function invoked when client is torn down and
                recreated.
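
        Example, a sketch of a callback that re-establishes state on the
        new client (the path is a placeholder)::

            def on_new_client(client):
                client.ensure_path('/some/path')

            KazooClientManager(zk_hosts).on_client_change(on_new_client)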

        """
        self._client_callbacks.append(func)

    def _start(self, err_msg, spawn_monit=False):
        if self._is_destroyed:
            return

        self._client = None
        # The session timeout is configurable; see _get_session_timeout().
        try:
            host_list = self.zk_hosts
            client = KazooClient(
                hosts=",".join(host_list),
                timeout=self._get_session_timeout(),
                max_retries=3,
                handler=SequentialGeventHandler())

            # Guard the start with an explicit gevent timeout to protect
            # us from corner cases where starting the client fails to
            # respect the start timeout passed in below.
            with gevent.Timeout(seconds=self._get_start_timeout() + 5):
                client.start(timeout=self._get_start_timeout())
            client.ensure_path("/")
            self._last_success_health_check_ts = time.time()
            log.info("Successfully started kazoo client.")
            self._client = client
        except (Exception, gevent.Timeout):
            self._sc.increment("errors.zk.client.start.failure",
                               tags={'host': hostname},
                               sample_rate=1)
            log.exception(err_msg)
        finally:
            if spawn_monit:
                self._monit_greenlet = gevent.spawn(self._monit)
                gevent.sleep(0)

    def _get_session_timeout(self):
        """Get zookeeper timeout setting.
        Returns:
            A float number as the number of seconds.

        """
        return self.session_timeout

    def _get_start_timeout(self):
        """Get zookeeper startup timeout setting.
        Returns:
            A float number as the number of seconds.

        """
        return self.start_timeout

    def _stop_client(self):
        """Best effort to stop the client."""
        try:
            # Make sure not to mistake this scenario with failing to stop
            # client.
            if self._client is None:
                log.info("Kazoo client is None.")
                return

            _retry((Exception,), tries=3, delay=1, backoff=2,
                   sleep_func=gevent.sleep)(self._client.stop)()

            log.info("Successfully stopped kazoo client.")
        except (Exception, gevent.Timeout):
            self._sc.increment("errors.zk.client.stop.failure",
                               tags={'host': hostname},
                               sample_rate=1)
            log.exception("Failed to stop kazoo client.")

    def _dispatch_client_change_callback(self, client):
        if self._is_destroyed:
            return
        log.info("Start dispatching client change callback.")
        for callback in self._client_callbacks:
            try:
                callback(client)
            except (Exception, gevent.Timeout):
                self._sc.increment("errors.zk.client.change_callback.failure",
                                   tags={'host': hostname},
                                   sample_rate=1)
                log.exception("Failed to exec client change callback.")

    def _reconnect(self):
        log.info("Try to reconnect to zk.")
        self._stop_client()
        if self._is_destroyed:
            return False
        self._start("Failed to reconnect to zk.")
        if self._client and self._client.connected:
            gevent.spawn(self._dispatch_client_change_callback, self._client)
            return True
        self._sc.increment("zk.client.reconnect.failure",
                           tags={'host': hostname},
                           sample_rate=1)
        return False

    def _log_zk_healthy(self):
        # Suppress zk health log.
        if (self._last_healthy_log_ts is None or
                (self._last_success_health_check_ts -
                 self._last_healthy_log_ts >
                 KazooClientManager.MIN_HEALTH_LOG_INTERVAL_IN_SECS)):
            log.info("Underlying zookeeper connection is healthy.")
            self._last_healthy_log_ts = self._last_success_health_check_ts

    def _monit(self):
        """Buddy greenlet to renew client when things are messed up badly.

        Every _health_check_interval seconds, this buddy greenlet wakes up
        to perform a health check on the underlying client. If this check
        fails _max_num_consecutive_failures times in a row, it will attempt
        to drop the current client and establish a new one, and upon
        successfully establishing the new client, invoke the client change
        callbacks.

        Ideally we shouldn't have to do anything like this with managing
        connection to zk servers, however, during testing kazoo client
        sometimes gets into a state where it can never recover on its own.
        This should happen very very rarely.

        """
        num_failures = 0
        while not self._is_destroyed:
            try:
                reconnect = False

                # Check the healthiness of the client
                try:
                    if self._client:
                        self._client.exists("/")
                        # This only happens once per health check interval.
                        self._last_success_health_check_ts = time.time()
                        self._log_zk_healthy()
                        # Reset the number of failures after a success
                        num_failures = 0
                    else:
                        reconnect = True
                except (Exception, gevent.Timeout):
                    self._sc.increment("errors.zk.health_check.failure",
                                       tags={'host': hostname},
                                       sample_rate=1)
                    log.exception("Failed to check existence of zk root: %d.",
                                  num_failures)
                    # Reset _last_healthy_log_ts in case of failure.
                    self._last_healthy_log_ts = None
                    # Client is not healthy
                    num_failures += 1
                    if num_failures >= self._max_num_consecutive_failures:
                        reconnect = True
                finally:
                    # If the client manager has lost connection to
                    # zookeeper for more than 5 mins, log an error and
                    # force a reconnect.
                    if num_failures:
                        unhealthy_duration_in_secs = (
                            time.time() - self._last_success_health_check_ts)
                        if unhealthy_duration_in_secs > 5 * 60:
                            # TODO: it'd be great if we can have
                            # gauge and hookup an alert with it, or it happens
                            # rarely enough, just kill this process and let
                            # supervisor restart it, but it feels a bit scary
                            # to do that, because that has the potential to
                            # bring down the whole service.
                            log.error(
                                "Failed to connect to zookeeper for %.2f "
                                "seconds", unhealthy_duration_in_secs)
                            # In this case, reconnect right now instead of
                            # waiting for three failures in a row. I've
                            # seen a python process suspended on ngapp for
                            # about 20 mins.
                            reconnect = True

                if reconnect:
                    if self._reconnect():
                        # Reset the number of failures experienced from monit
                        num_failures = 0

                gevent.sleep(self._health_check_interval)
            except gevent.GreenletExit:
                log.exception("Zookeeper connection monit is stopped.")
                raise
            except BaseException:
                # Just log the exception and keep going in this case.
                # We catch BaseException here because we don't want this
                # greenlet to stop for any reason other than being
                # explicitly killed. To make the process respond to
                # SystemExit or KeyboardInterrupt, this greenlet should be
                # killed explicitly.
                log.exception("Unexpected exception in zookeeper monit")
                # Give other greenlet a chance.
                gevent.sleep(0)

    def _destroy(self):
        """This will cause new attempts to reconnect to stop.

        However, this doesn't guarantee the current ongoing attempt to
        reconnect is stopped. And no futher underlying health check on the
        current client will be done.

        This should only be called when an endpoint is ready to be
        terminated.

        DONT CALL THIS METHOD WITHOUT CONSULTING SOMEONE IN INFROPS.

        """
        self._is_destroyed = True


class WatcherUtil(object):
    """Helpers shared by DataWatcher and ServerSet for retrying operations
    across client reconnections and replacements."""

    # Sentinel returned when the underlying kazoo client has been replaced.
    CLIENT_CHANGED = object()

    @staticmethod
    def _get_waiting_in_secs(waiting_in_secs,
                             num_retries,
                             max_waiting_in_secs):
        """Retrieve the waiting time in seconds.

        This method uses exponential back-off in figuring out the number of
        seconds to wait; however, the max wait time shouldn't be more than
        what is specified via max_waiting_in_seconds.

        Args:
            waiting_in_secs: waiting time in seconds.
            num_retries: number of retries, starting from 0.
            max_waiting_in_secs: maximum waiting time in seconds.

        Returns:
            The number of seconds to wait.
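
            For example, with waiting_in_secs=5 and num_retries=2, the
            base wait is 5 * 2**2 = 20 seconds, plus triangular jitter
            within +/-20% (here +/-4 seconds), capped at
            max_waiting_in_secs.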

        """
        # Make the backoff grow even faster.
        waiting_in_secs *= 2**num_retries
        jitter = waiting_in_secs * 0.2
        waiting_in_secs += random.triangular(-jitter, jitter)
        return min(waiting_in_secs, max_waiting_in_secs)

    @staticmethod
    def _keep_retrying_till_client_change(
            zk_hosts, client, msg, waiting_in_secs, max_waiting_in_secs,
            is_stopped, func, *args, **kwargs):
        """Keep retrying the function until the underlying client is changed.

        If the function throws, this method will keep retrying the function
        until it succeeds or the underlying client maintained by
        KazooClientManager is changed.

        Args:
            zk_hosts: the zk endpoint list
            client: a kazoo client
            msg: the message to be logged when the function throws
            waiting_in_secs: the waiting time in seconds, when the
                operations on zookeeper fail.
            max_waiting_in_secs: the max number of seconds to wait.
            is_stopped: a callable returning whether to stop retrying.
            func: the function to be invoked.
            args: the arguments to be passed into the function.
            kwargs: the named arguments to be passed into the function.

        Returns:
            The result of the function passed in, or None in case that the
            underlying client is changed.

        """
        num_retries = 0
        while ((not is_stopped or not is_stopped()) and
               KazooClientManager(zk_hosts).is_current_client(client)):
            try:
                if client is None:
                    log.info("Client is None, go to sleep.")
                    # When waiting for a client don't do exponential back-off
                    gevent.sleep(
                        WatcherUtil._get_waiting_in_secs(
                            waiting_in_secs, 0, max_waiting_in_secs))
                else:
                    num_retries += 1
                    result = func(client, *args, **kwargs)
                    log.info("%s succeeded.", func)
                    return result
            except (Exception, gevent.Timeout):
                log.exception(msg)
                gevent.sleep(WatcherUtil._get_waiting_in_secs(
                    waiting_in_secs, num_retries, max_waiting_in_secs))

        log.info("Kazoo client is changed.")
        return WatcherUtil.CLIENT_CHANGED

    @staticmethod
    def _do_it(zk_hosts, keep_retrying, keep_retrying_on_reconnect, err_msg,
               waiting_in_seconds, max_waiting_in_secs, is_stopped, command):
        """Handles the complexity of client reconnection and replacement.

        Args:
            zk_hosts: the zk endpoint list
            keep_retrying: whether to only perform the initial operation
                once or to keep retrying until success; if keep_retrying
                is set to True, this ought to be done in a separate
                greenlet or thread to avoid possible indefinite blocking.
            keep_retrying_on_reconnect: some of the commands already
                handle client reconnection; in that case, this method
                doesn't need to handle the reconnection event any more.
            err_msg: the error message to be logged when the operation
                fails.
            waiting_in_seconds: the waiting time in seconds, when the
                operations on zookeeper fail.
            max_waiting_in_secs: the max waiting time in seconds,
                when the operations on zookeeper fail.
            is_stopped: a callable returning whether to stop retrying.
            command: the operation to be performed; can only be join
                or monitor in the current implementation.
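
        A command is any callable that accepts the current kazoo client,
        for example (a sketch; the path is a placeholder)::

            def command(client):
                client.ensure_path('/some/path')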

        """

        def on_reconnect(state):
            # Here we handle the client reconnection.
            if state == KazooState.CONNECTED:
                # Spawn to not block the session event handling greenlet
                gevent.spawn(WatcherUtil._keep_retrying_till_client_change,
                             zk_hosts,
                             KazooClientManager(zk_hosts).get_client(),
                             err_msg, waiting_in_seconds, max_waiting_in_secs,
                             is_stopped, command)

        def inner_do_it(keep_retrying, client):
            if not keep_retrying and (not is_stopped or not is_stopped()):
                # The initial invocation.
                result = command(client)
            else:
                # This path only invoked when client is replaced with a new
                # one, we need to keep retrying until it succeeds or the
                # underlying client is being replaced again.
                result = WatcherUtil._keep_retrying_till_client_change(
                    zk_hosts, client, err_msg, waiting_in_seconds,
                    max_waiting_in_secs, is_stopped, command)

            # register reconnection handler if needed.
            if (keep_retrying_on_reconnect and
                    result is not WatcherUtil.CLIENT_CHANGED and
                    client is not None):
                client.add_listener(on_reconnect)

            return result

        result = None

        # Register a partial function; when invoked, the client current at
        # that moment is passed in, so the operation is retried on the new
        # client.
        # We need to register this before performing the operation in the
        # current greenlet to avoid missing a client change event.
        KazooClientManager(zk_hosts).on_client_change(
            functools.partial(inner_do_it, True))

        while True:
            log.debug("Start processing %s.", command)
            client = KazooClientManager(zk_hosts).get_client()
            result = inner_do_it(keep_retrying, client)
            # inner_do_it returns either because the underlying client
            # changed or because we chose not to keep retrying.
            if client is KazooClientManager(zk_hosts).get_client():
                log.debug("Stop because the command succeeded: %s.", command)
                break

            if not keep_retrying:
                log.debug("Stop because no keeping retrying: %s.", command)
                break

        return result

    @staticmethod
    def spawn(keep_retrying, keep_retrying_on_reconnect, err_msg,
              waiting_in_seconds, command, zk_hosts, max_waiting_in_secs=300,
              is_stopped=None):
        """Spawn a greenlet to complete the operation on the server set.

        If keep_retrying is set to False, this method will block on the
        initial operation and raise whatever exception is raised in the
        initial operation.

        Args:
            keep_retrying: whether to keep retrying the initial operation
                until it succeeds.
            keep_retrying_on_reconnect: whether to keep retrying the
                operation upon client reconnection.
            err_msg: the error message to log when the operation fails.
            waiting_in_seconds: the waiting time in seconds, when the
                operations on zookeeper fail.
            command: the operation to be done using the underlying client.
            zk_hosts: the zk hosts list.
            max_waiting_in_secs: the max waiting time in seconds,
                when the operations on zookeeper fail.
            is_stopped: a callable returning whether to stop retrying.

        Returns:
            A greenlet; if keep_retrying is True, please avoid blocking on
            the returned greenlet indefinitely.

        """
        g = gevent.spawn(
            WatcherUtil._do_it, zk_hosts, keep_retrying,
            keep_retrying_on_reconnect, err_msg, waiting_in_seconds,
            max_waiting_in_secs, is_stopped, command)
        if not keep_retrying:
            # wait till the greenlet is done and raise exception if needed
            g.get()
        return g


class DataWatcher(object):
    def __init__(self, path, zk_hosts, waiting_in_secs=5, sc=dummy_statsd,
                 file_path=None):
        """Constructor for the data watcher.

        Args:
            path: the path to the znode to watch the data change on.
            zk_hosts: the zookeeper endpoint list.
                the list should have the following format:
                ['observerzookeeper010:2181',
                 'observerzookeeper011:2181',
                 'observerzookeeper012:2181',
                 'observerzookeeper013:2181',
                 'observerzookeeper014:2181',
                 'observerzookeeper015:2181']
            waiting_in_secs: the waiting time in seconds, when the
                operations on zookeeper fail; by default, 5 seconds.
            sc: the statsd client
            file_path: the local file to watch. When it is set, the data
                watcher does not talk to zk for monitoring data changes.
        """
        self._path = path
        self._path_stats_name = _escape_path_for_stats_name(path)
        self._waiting_in_secs = waiting_in_secs
        self.zk_hosts = zk_hosts
        self._sc = sc
        self._file_path = file_path
        self._file_path_stats_name = _escape_path_for_stats_name(file_path)
        # Stat allows returning the stat of the data with a 'version'
        # attribute, mirroring kazoo's ZnodeStat for the local-file case.
        self.Stat = namedtuple('Stat', ['version'])

    def get_data(self):
        """Get the data associated with the znode path. When a local file
           is set, the function returns data in the local file.
           Otherwise it returns the latest data in zookeeper.

        Returns:
            The data in the format of a tuple (value,
            `kazoo.protocol.states.ZnodeStat`) of the znode. When a local
            file is used, the stat contains a version attribute which
            is the same as the mtime of the file.

        Raises:
            `kazoo.exceptions.NoNodeError` if the node doesn't exist;
            `kazoo.exceptions.ZookeeperError` if the server returns a
            non-zero error code; `NoClientException` if the underlying
            client is not set up correctly yet by `KazooClientManager`.

            If the local file is set but does not exist or is not
            accessible, OSError is raised.
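
        Example (a sketch; the path and hosts below are placeholders)::

            watcher = DataWatcher('/config/some_znode', zk_hosts)
            value, stat = watcher.get_data()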

        """
        if self._file_path:
            with open(self._file_path, 'r') as f:
                value = f.read()
                # Set version to be the mtime of the file
                stat = self.Stat(version=os.path.getmtime(self._file_path))
            return (value, stat)
        else:
            client = KazooClientManager(self.zk_hosts).get_client(no_throw=False)
            return client.get(self._path)

    def watch(self, func, keep_retrying=True):
        """Register a callback for data changes.

        This method will spawn a greenlet to monitor the data in zk or in
        a local file. If the caller wants to try monitoring once and apply
        custom error handling, the caller can set keep_retrying to False;
        otherwise, the caller should set keep_retrying to True, in which
        case all errors are transparent to the caller, who can just make
        the call and forget about it. The former gives the caller a chance
        to use a default list of servers if the initial attempt to monitor
        fails.

        If _file_path is provided when the DataWatcher is initialized, the
        callback is triggered on file changes; otherwise, it is triggered
        on data changes of the zookeeper path.

        Args:
            func: on data changes, func will be invoked with the value and
                stat of the data; stat is supposed to have a 'version'
                field.
            keep_retrying: whether to keep retrying to place the watch
                initially, or just try once and, if it throws an exception,
                apply the custom exception handling.

        Returns:
            A greenlet, if keep_retrying is True, please avoid blocking on
            the returned greenlet indefinitely.

        """
        if self._file_path:
            # NOTE: we use gevent.spawn instead of WatcherUtil.spawn for
            # the file based datawatch to avoid accidentally instantiating
            # a zk connection. DONT combine these two code paths without
            # serious discussion with infra and ops.
            g = gevent.spawn(FileWatch(sc=self._sc).add_watch,
                             self._file_path,
                             func,
                             keep_retrying=keep_retrying,
                             backoff_in_secs=self._waiting_in_secs)
            if not keep_retrying:
                g.get()
            return g

        def kazoo_datawatch(client):
            try:
                log.info("Try to place zk data watch: %s.",
                         str(self._path))
                client.DataWatch(self._path, func, allow_session_lost=True)
                log.info("Successfully placed zk data watch: %s." % self._path)
            except (Exception, gevent.Timeout):
                self._sc.increment("errors.zk.datawatch.failure",
                                   tags={'path': self._path_stats_name},
                                   sample_rate=1)
                raise
        err_msg = "Failed to place zk data watch: %s." % self._path
        return WatcherUtil.spawn(keep_retrying, False, err_msg,
                                 self._waiting_in_secs, kazoo_datawatch,
                                 self.zk_hosts)


class ServerSet(object):
    def __init__(self, path, zk_hosts, waiting_in_secs=5, sc=dummy_statsd,
                 file_path=None):
        """Constructor for the server set.

        Args:
            path: the root path of the server set.
            zk_hosts: the zk hosts list.
            waiting_in_secs: the waiting time in seconds, when the
                operations on zookeeper fail; by default, 5 seconds.
            sc: the statsd client.
            file_path: the local file to watch. When it is set, the server
                set does not talk to zk for monitoring data changes.
        """
        self._path = path
        self._path_stats_name = _escape_path_for_stats_name(path)
        self._waiting_in_secs = waiting_in_secs
        self._is_destroyed = False
        self.zk_hosts = zk_hosts
        self._sc = sc
        self._file_path = file_path
        self._file_path_stats_name = _escape_path_for_stats_name(file_path)

    def get_endpoints(self):
        """Retrieve the endpoints in the server set.

        Returns:
            The current endpoints in the server set as a set of strings of
            the format: host:port.

        Throws:
            NoClientException, when the underlying client is None;
            or Exception, since this method doesn't make effort to be
            bullet-proof, it will pass the exception thrown by the
            underlying client to the caller. Normally only join and monitor
            are interesting to the upstream callers.
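
        Example (a sketch; the path is a placeholder)::

            server_set = ServerSet('/discovery/some_service', zk_hosts)
            endpoints = server_set.get_endpoints()
            # e.g. set(['10.1.2.3:8080', '10.1.2.4:8080'])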

        """
        client = KazooClientManager(self.zk_hosts).get_client(no_throw=False)
        party = client.ShallowParty(self._path)
        return set(party)

    @staticmethod
    def _create_endpoint(port, use_ip):
        host_name = socket.gethostname()
        ip_addr = socket.gethostbyname(host_name)
        if use_ip:
            return "%s:%d" % (ip_addr, port)
        else:
            return "%s:%d" % (host_name, port)

    def join(self, port, use_ip=True, keep_retrying=True, data=None, node_name=None):
        """If use_ip is set and if the hosts ip_addr starts with 10., this method
        joins the zk serverset using 'ip_addr:port' else it uses 'hostname:port'.

        This method will spawn a greenlet to join the server set. If the
        caller wants to try to join once and apply custom error handling, the
        caller can set keep_retrying is set to False; otherwise,
        the caller should set keep_retrying to True, all the errors are
        transparent to the caller, the caller can just make the call and
        forget about it.

        This method always talks to zookeeper directly, no matter the local
        file is provided or not.

        Args:
            port: the port used when registering in zk as "host:port".
            use_ip: use the internal ip when registering in zk,
                i.e. "ip:port".
            keep_retrying: whether to keep retrying to join the server
                set initially, or just try once.
            data: data to save in the zk node. By default, it's the string
                "host:port" ("ip:port" if use_ip).
            node_name: name of the zk node. By default, it's the string
                "host:port" ("ip:port" if use_ip).

        Returns:
            A greenlet. If keep_retrying is True, please avoid blocking on
            the returned greenlet indefinitely. If keep_retrying is False,
            this method blocks until the initial join attempt completes
            and raises whatever exception that attempt raised.
        """
        endpoint = node_name if node_name else ServerSet._create_endpoint(port, use_ip)

        def is_destroyed():
            return self._is_destroyed

        def kazoo_join(client):
            if self._is_destroyed:
                return
            try:
                log.info(
                    "Try to join server set: %s, %s.", self._path, endpoint)
                party = client.ShallowParty(self._path, endpoint)
                if data:
                    party.data = data
                party.join()
                if self._is_destroyed:
                    # The server set was destroyed while we were joining;
                    # best effort to remove the node we just created.
                    children = client.get_children(self._path)
                    if children:
                        for child in children:
                            if child.endswith(endpoint):
                                client.delete(self._path + "/" + child)
                log.info("Successfully join server set: %s, %s.", self._path,
                         endpoint)
            except (Exception, gevent.Timeout):
                self._sc.increment("errors.zk.serverset.join.failure",
                                   tags={'path': self._path_stats_name},
                                   sample_rate=1)
                raise

        log.info("Spawn a greenlet to join serverset: %s, %s." % (self._path, endpoint))
        err_msg = "Failed to join server set: %s, %s." % (self._path, endpoint)
        return WatcherUtil.spawn(keep_retrying, True, err_msg,
                                 self._waiting_in_secs, kazoo_join,
                                 self.zk_hosts, is_stopped=is_destroyed)

    def monitor(self, func, keep_retrying=True):
        """Register a callback for server set changes.

        This method will spawn a greenlet to monitor the server set. If
        the caller wants to try monitoring once and apply custom error
        handling, the caller can set keep_retrying to False; otherwise,
        the caller should set keep_retrying to True, in which case all
        errors are transparent to the caller, who can just make the call
        and forget about it. The former gives the caller a chance to use
        a default list of servers if the initial attempt to monitor fails.

        If _file_path is set, the server set watches changes in the local
        file; otherwise, it watches changes in zookeeper.

        Args:
            func: on server set changes, func will be invoked with an
                iterable of strings of the format: host:port.
            keep_retrying: whether to keep retrying to monitor the server
                set initially, or just try once and, if it throws an
                exception, apply the custom exception handling.

        Returns:
            A greenlet; if keep_retrying is True, please avoid blocking on
            the returned greenlet indefinitely.
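
        Example (a sketch; update_connection_pool is a hypothetical
        handler)::

            def on_change(endpoints):
                update_connection_pool(list(endpoints))

            server_set.monitor(on_change)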

        """
        server_set_type = 'zk'
        path_stats_name = self._path_stats_name
        path = self._path
        if self._file_path:
            server_set_type = 'file'
            path_stats_name = self._file_path_stats_name
            path = self._file_path

        def on_server_set_change(children):
            """The callback invoked when the server set changes.

            Args:
                children: when a local file is not set, a list of strings
                    in the format of XXX-host_name:port; when a local file
                    is set, hosts in the file are in the format of
                    host_name:port.
            """
            self._sc.increment(
                "{}.serverset.monitor.change".format(server_set_type),
                tags={'path': path_stats_name},
                sample_rate=0.0001)
            log.debug("Children change observed: %s, %s.", path, children)
            if not self._file_path:
                # Child znode names look like "<prefix>-host:port"; strip
                # everything up to and including the first "-".
                func((child[(child.find("-") + 1):] for child in children))
            else:
                func(children)

        if self._file_path:
            # NOTE: we use gevent.spawn instead of WatcherUtil.spawn for
            # the file based serverset to avoid accidentally instantiating
            # a zk connection. DONT combine these two code paths without
            # serious discussion with infra and ops.
            g = gevent.spawn(FileWatch(sc=self._sc).add_watch,
                             self._file_path,
                             on_server_set_change,
                             watch_type='serverset',
                             keep_retrying=keep_retrying,
                             backoff_in_secs=self._waiting_in_secs)
            if not keep_retrying:
                g.get()
            return g

        def kazoo_monitor(client):
            try:
                log.debug("Try to place serverset zk watch: %s." % self._path)
                client.ChildrenWatch(
                    self._path, on_server_set_change, allow_session_lost=True)
                log.debug("Successfully placed children watch: %s.",
                          self._path)
            except (Exception, gevent.Timeout):
                self._sc.increment("errors.zk.serverset.monitor.failure",
                                   tags={'path': self._path_stats_name},
                                   sample_rate=1)
                raise

        err_msg = "Failed to monitor zk server set: %s." % self._path

        return WatcherUtil.spawn(keep_retrying, False, err_msg,
                                 self._waiting_in_secs, kazoo_monitor,
                                 self.zk_hosts)

    def _destroy(self, endpoint):
        """Best effort to take the endpoint out of serverset.

        However, there is no guaranttee that after this call the endpoint is
        out of the serverset right away. It is only guarantteed to be out of
        the serverset after session timeout.

        This should only be called when an endpoint is ready to be
        terminated.

        DONT CALL THIS METHOD WITHOUT CONSULTING SOMEONE IN INFROPS.

        """
        KazooClientManager(self.zk_hosts)._destroy()
        self._is_destroyed = True
        client = KazooClientManager(self.zk_hosts).get_client()
        if client:
            children = client.get_children(self._path)
            if children:
                for child in children:
                    if child.endswith(endpoint):
                        client.delete(self._path + "/" + child)


def _retry(ExceptionToCheck, tries=4, delay=3, backoff=2, logger=log,
           sleep_func=time.sleep, max_delay=sys.maxint):
    """Retry calling the decorated function using an exponential backoff.

    http://www.saltycrane.com/blog/2009/11/trying-out-retry-decorator-python/
    original from: http://wiki.python.org/moin/PythonDecoratorLibrary#Retry

    Args:
        ExceptionToCheck: exception to check. May be a tuple of
            exceptions to check.
        tries: an integer, number of times to try (not retry) before
            giving up.
        delay: an integer, initial delay between retries in seconds.
        backoff: an integer, backoff multiplier e.g. value of 2 will
            double the delay each retry
        logger: logging.Logger instance, logger to use. By default,
            the module-level ``log`` logger is used; if None is explicitly
            specified by the caller, ``print`` is used.
        sleep_func: the sleep function to be used for waiting between
            retries. By default, it is ``time.sleep``,
            but it could also be gevent.sleep if we are using this with
            gevent.
        max_delay: the max number of seconds to wait between retries.

    Returns:
        Decorator function.
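
    Example, retrying a flaky call with gevent-friendly sleeps (a
    sketch)::

        @_retry((Exception,), tries=3, delay=1, backoff=2,
                sleep_func=gevent.sleep)
        def flaky_call():
            ...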

    """
    def deco_retry(f):

        @functools.wraps(f)
        def f_retry(*args, **kwargs):
            mtries, mdelay = tries, delay
            while mtries > 1:
                try:
                    return f(*args, **kwargs)
                except ExceptionToCheck as e:
                    if logger is not None:
                        logger.warning(
                            "%s, Retrying in %d seconds...", e, mdelay)
                    else:
                        print "%s, Retrying in %d seconds..." % (e, mdelay)
                    sleep_func(mdelay)
                    mtries -= 1
                    mdelay *= backoff
                    # Don't wait more than max_delay allowed.
                    if mdelay > max_delay:
                        mdelay = max_delay
            # The last try; let any exception propagate to the caller.
            return f(*args, **kwargs)

        return f_retry  # The actual decorator.

    return deco_retry