package org.apache.helix.manager.zk;

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Timer;
import java.util.concurrent.TimeUnit;


import org.apache.helix.BaseDataAccessor;
import org.apache.helix.ClusterMessagingService;
import org.apache.helix.ConfigAccessor;
import org.apache.helix.HelixAdmin;
import org.apache.helix.HelixConstants.ChangeType;
import org.apache.helix.HelixDataAccessor;
import org.apache.helix.HelixException;
import org.apache.helix.HelixManager;
import org.apache.helix.HelixManagerProperties;
import org.apache.helix.HelixManagerProperty;
import org.apache.helix.HelixPropertyFactory;
import org.apache.helix.HelixTimerTask;
import org.apache.helix.InstanceType;
import org.apache.helix.LiveInstanceInfoProvider;
import org.apache.helix.PreConnectCallback;
import org.apache.helix.PropertyKey;
import org.apache.helix.PropertyKey.Builder;
import org.apache.helix.PropertyPathBuilder;
import org.apache.helix.PropertyType;
import org.apache.helix.SystemPropertyKeys;
import org.apache.helix.api.listeners.ClusterConfigChangeListener;
import org.apache.helix.api.listeners.ConfigChangeListener;
import org.apache.helix.api.listeners.ControllerChangeListener;
import org.apache.helix.api.listeners.CurrentStateChangeListener;
import org.apache.helix.api.listeners.CustomizedStateChangeListener;
import org.apache.helix.api.listeners.CustomizedStateConfigChangeListener;
import org.apache.helix.api.listeners.CustomizedStateRootChangeListener;
import org.apache.helix.api.listeners.CustomizedViewChangeListener;
import org.apache.helix.api.listeners.CustomizedViewRootChangeListener;
import org.apache.helix.api.listeners.ExternalViewChangeListener;
import org.apache.helix.api.listeners.IdealStateChangeListener;
import org.apache.helix.api.listeners.InstanceConfigChangeListener;
import org.apache.helix.api.listeners.LiveInstanceChangeListener;
import org.apache.helix.api.listeners.MessageListener;
import org.apache.helix.api.listeners.ResourceConfigChangeListener;
import org.apache.helix.api.listeners.ScopedConfigChangeListener;
import org.apache.helix.controller.GenericHelixController;
import org.apache.helix.controller.pipeline.Pipeline;
import org.apache.helix.healthcheck.ParticipantHealthReportCollector;
import org.apache.helix.healthcheck.ParticipantHealthReportCollectorImpl;
import org.apache.helix.healthcheck.ParticipantHealthReportTask;
import org.apache.helix.messaging.DefaultMessagingService;
import org.apache.helix.model.BuiltInStateModelDefinitions;
import org.apache.helix.model.HelixConfigScope.ConfigScopeProperty;
import org.apache.helix.model.LiveInstance;
import org.apache.helix.monitoring.ZKPathDataDumpTask;
import org.apache.helix.monitoring.mbeans.HelixCallbackMonitor;
import org.apache.helix.monitoring.mbeans.MonitorLevel;
import org.apache.helix.msdcommon.exception.InvalidRoutingDataException;
import org.apache.helix.participant.HelixStateMachineEngine;
import org.apache.helix.participant.StateMachineEngine;
import org.apache.helix.util.HelixUtil;
import org.apache.helix.zookeeper.api.client.HelixZkClient;
import org.apache.helix.zookeeper.api.client.RealmAwareZkClient;
import org.apache.helix.zookeeper.datamodel.ZNRecord;
import org.apache.helix.zookeeper.impl.factory.DedicatedZkClientFactory;
import org.apache.helix.zookeeper.impl.factory.HelixZkClientFactory;
import org.apache.helix.zookeeper.impl.factory.SharedZkClientFactory;
import org.apache.helix.zookeeper.zkclient.IZkStateListener;
import org.apache.helix.zookeeper.zkclient.exception.ZkInterruptedException;
import org.apache.helix.zookeeper.zkclient.serialize.PathBasedZkSerializer;
import org.apache.zookeeper.Watcher.Event.EventType;
import org.apache.zookeeper.Watcher.Event.KeeperState;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.helix.zookeeper.datamodel.serializer.ChainedPathZkSerializer;
import org.apache.helix.zookeeper.datamodel.serializer.ZNRecordSerializer;

public class ZKHelixManager implements HelixManager, IZkStateListener {
  // Class-wide logger; declared final so the shared reference cannot be reassigned.
  private static final Logger LOG = LoggerFactory.getLogger(ZKHelixManager.class);

  // Key string "allowParticipantAutoJoin"; where it is read is not visible in this part of the file.
  public static final String ALLOW_PARTICIPANT_AUTO_JOIN = "allowParticipantAutoJoin";
  private static final int FLAPPING_TIME_WINDOW = 300000; // Default to 300 sec
  public static final int DEFAULT_MAX_DISCONNECT_THRESHOLD = 600; // Default to be a large number
  private static final int DEFAULT_WAIT_CONNECTED_TIMEOUT = 10 * 1000;  // wait until connected for up to 10 seconds.

  // Identity and connection settings; all of these are assigned once in the constructor.
  protected final String _zkAddress;
  private final String _clusterName;
  private final String _instanceName;
  private final InstanceType _instanceType;
  private final int _waitForConnectedTimeout; // wait time for testing connect
  private final int _sessionTimeout; // client side session timeout, will be overridden by server timeout. Disconnect after timeout
  private final int _connectionInitTimeout; // client timeout to init connect
  // Callbacks registered through addPreConnectCallback().
  private final List<PreConnectCallback> _preConnectCallbacks;
  // Registered callback handlers; iterated and mutated under synchronized (this).
  protected final List<CallbackHandler> _handlers;
  private final HelixManagerProperties _properties;
  private final HelixManagerProperty _helixManagerProperty;
  // May be null: the four-argument constructor passes null for it.
  private final HelixManagerStateListener _stateListener;

  /**
   * helix version#
   */
  // Taken from _properties.getVersion() in the constructor.
  private final String _version;
  // Passed to ParticipantHealthReportTask when the participant timer tasks are built.
  private int _reportLatency;

  // Current ZK client; null until createClient() assigns it.
  protected RealmAwareZkClient _zkclient;
  private final DefaultMessagingService _messagingService;
  // One monitor per ChangeType, populated in the constructor.
  private Map<ChangeType, HelixCallbackMonitor> _callbackMonitors;

  // Parsed in the constructor; falls back to MonitorLevel.DEFAULT on an unrecognized value.
  private final MonitorLevel _monitorLevel;

  // Accessors rebuilt by createClient() each time a new client is installed.
  private BaseDataAccessor<ZNRecord> _baseDataAccessor;
  private ZKHelixDataAccessor _dataAccessor;
  private final Builder _keyBuilder;
  private ConfigAccessor _configAccessor;
  // Lazily created by getHelixPropertyStore(); reset to null in disconnect().
  private ZkHelixPropertyStore<ZNRecord> _helixPropertyStore;
  protected LiveInstanceInfoProvider _liveInstanceInfoProvider = null;

  // Hex session id written by waitUntilConnected(); volatile because it is read without locking.
  private volatile String _sessionId;

  /**
   * Keep track of the timestamps at which the zk state became Disconnected. If the zk state
   * has become Disconnected more than _maxDisconnectThreshold times within a
   * _flappingTimeWindowMs window, disconnect the zkHelixManager.
   */
  // Disconnect timestamps, oldest first; isFlapping() reads the last entry as the most recent.
  private final List<Long> _disconnectTimeHistory = new ArrayList<>();
  // Length of the window, in milliseconds, over which disconnects are counted.
  private final int _flappingTimeWindowMs;
  // isFlapping() reports true once the history holds more entries than this.
  private final int _maxDisconnectThreshold;

  /**
   * participant fields
   */
  // Set to null by the constructor for the CONTROLLER and SPECTATOR cases.
  private final StateMachineEngine _stateMachineEngine;
  // Tasks iterated by startTimerTasks() and stopTimerTasks().
  private final List<HelixTimerTask> _timerTasks = new ArrayList<>();

  // Set to null by the constructor for the CONTROLLER and SPECTATOR cases.
  private final ParticipantHealthReportCollectorImpl _participantHealthInfoCollector;
  // Reset to null in disconnect(); where it is set is not visible in this part of the file.
  private Long _sessionStartTime;
  private ParticipantManager _participantManager;

  /**
   * controller fields
   */
  // Created in connect() for the CONTROLLER type when still null; cleared in disconnect().
  private GenericHelixController _controller;
  // Defaults to DEFAULT and TASK; replaceable through setEnabledControlPipelineTypes().
  private Set<Pipeline.Type> _enabledPipelineTypes;
  private CallbackHandler _leaderElectionHandler = null;
  // The constructor adds a StatusDumpTask here for controller-type instances.
  protected final List<HelixTimerTask> _controllerTimerTasks = new ArrayList<>();

  /**
   * status dump timer-task
   */
  // Timer task that owns a daemon Timer and builds a ZKPathDataDumpTask for the given manager.
  protected static class StatusDumpTask extends HelixTimerTask {
    // Manager handed to the ZKPathDataDumpTask created in start().
    final HelixManager helixController;

    public StatusDumpTask(HelixManager helixController) {
      this.helixController = helixController;

    // Creates the timer only when none exists, so a repeated start() leaves a running timer alone.
    public synchronized void start() {
      long initialDelay = 0;
      long period = 15 * 60 * 1000; // 15 minutes
      long timeThresholdNoChangeForStatusUpdates = 15 * 60 * 1000; // 15 minutes
      long timeThresholdNoChangeForErrors = 24 * 60 * 60 * 1000; // 1 day
      int maximumNumberOfLeafNodesAllowed = 100;

      if (_timer == null) {"Start StatusDumpTask");
        // Second argument 'true' makes the timer thread a daemon thread.
        _timer = new Timer("StatusDumpTimerTask", true);
        // NOTE(review): the call that receives this task, initialDelay and period is not visible
        // here; presumably a fixed-rate schedule on _timer - confirm.
            new ZKPathDataDumpTask(helixController, timeThresholdNoChangeForStatusUpdates,
                timeThresholdNoChangeForErrors, maximumNumberOfLeafNodesAllowed), initialDelay,

    // Drops the timer reference when one exists.
    // NOTE(review): no cancel call is visible before the reference is cleared - confirm.
    public synchronized void stop() {
      if (_timer != null) {"Stop StatusDumpTask");
        _timer = null;

  /**
   * Creates a manager without a state listener; delegates to the five-argument constructor
   * with a null stateListener.
   */
  public ZKHelixManager(String clusterName, String instanceName, InstanceType instanceType,
      String zkAddress) {
    this(clusterName, instanceName, instanceType, zkAddress, null);

  /**
   * Creates a manager with the given state listener; the HelixManagerProperty is looked up
   * from HelixPropertyFactory using zkAddress and clusterName.
   */
  public ZKHelixManager(String clusterName, String instanceName, InstanceType instanceType,
      String zkAddress, HelixManagerStateListener stateListener) {
    this(clusterName, instanceName, instanceType, zkAddress, stateListener,
        HelixPropertyFactory.getInstance().getHelixManagerProperty(zkAddress, clusterName));

  /**
   * Main constructor: stores identity and configuration, builds the callback monitors and
   * performs the instance-type specific setup. It does not open a ZK connection itself.
   *
   * @param instanceName may be null, in which case a name is derived from the local host name
   * @throws IllegalArgumentException for an unrecognized instance type
   */
  public ZKHelixManager(String clusterName, String instanceName, InstanceType instanceType,
      String zkAddress, HelixManagerStateListener stateListener,
      HelixManagerProperty helixManagerProperty) {"Create a zk-based cluster manager. zkSvr: " + zkAddress + ", clusterName: "
        + clusterName + ", instanceName: " + instanceName + ", type: " + instanceType);

    _zkAddress = zkAddress;
    _clusterName = clusterName;
    _instanceType = instanceType;

    // Derive "<canonical-host>-<instanceType>" when no name was supplied; fall back to
    // "UNKNOWN" if the local host name cannot be resolved.
    if (instanceName == null) {
      try {
        instanceName =
            InetAddress.getLocalHost().getCanonicalHostName() + "-" + instanceType.toString();
      } catch (UnknownHostException e) {
        // can ignore it"Unable to get host name. Will set it to UNKNOWN, mostly ignorable", e);
        instanceName = "UNKNOWN";

    _instanceName = instanceName;
    _enabledPipelineTypes =
        Sets.newHashSet(Pipeline.Type.DEFAULT, Pipeline.Type.TASK);
    _preConnectCallbacks = new ArrayList<>();
    _handlers = new ArrayList<>();
    _properties = new HelixManagerProperties(SystemPropertyKeys.CLUSTER_MANAGER_VERSION);
    _version = _properties.getVersion();

    _keyBuilder = new Builder(clusterName);
    _messagingService = new DefaultMessagingService(this);
    // One callback monitor per change type; a JMException is logged and does not abort construction.
    try {
      _callbackMonitors = new HashMap<>();
      for (ChangeType changeType : ChangeType.values()) {
        HelixCallbackMonitor callbackMonitor =
            new HelixCallbackMonitor(instanceType, clusterName, instanceName, changeType);
        _callbackMonitors.put(changeType, callbackMonitor);
    } catch (JMException e) {
      LOG.error("Error in creating callback monitor.", e);

    _stateListener = stateListener;
    _helixManagerProperty = helixManagerProperty;

    // NOTE(review): the right-hand sides of the assignments below are cut off in this view;
    // the remaining text suggests each value is read from a system property - confirm.
     * use system property if available
    _flappingTimeWindowMs = HelixUtil.getSystemPropertyAsInt(SystemPropertyKeys.FLAPPING_TIME_WINDOW,

    _maxDisconnectThreshold = HelixUtil

    _sessionTimeout = HelixUtil.getSystemPropertyAsInt(SystemPropertyKeys.ZK_SESSION_TIMEOUT,

    _connectionInitTimeout = HelixUtil

    _waitForConnectedTimeout = HelixUtil

    _reportLatency = HelixUtil

    // An unparsable monitor level falls back to DEFAULT instead of failing construction.
    MonitorLevel configuredMonitorLevel;
    try {
      configuredMonitorLevel = MonitorLevel.valueOf(
    } catch (IllegalArgumentException ex) {
      LOG.warn("Unrecognizable Monitor Level configuration. Use DEFAULT monitor level.", ex);
      configuredMonitorLevel = MonitorLevel.DEFAULT;
    _monitorLevel = configuredMonitorLevel;

    // NOTE(review): several case labels and break statements are not visible below; the groups
    // appear to be participant, CONTROLLER, controller-participant and SPECTATOR - confirm.
     * instance type specific init
    switch (instanceType) {
      _stateMachineEngine = new HelixStateMachineEngine(this);
      _participantHealthInfoCollector =
          new ParticipantHealthReportCollectorImpl(this, _instanceName);
          .add(new ParticipantHealthReportTask(_participantHealthInfoCollector, _reportLatency));
    case CONTROLLER:
      _stateMachineEngine = null;
      _participantHealthInfoCollector = null;
      _controllerTimerTasks.add(new StatusDumpTask(this));

      _stateMachineEngine = new HelixStateMachineEngine(this);
      _participantHealthInfoCollector =
          new ParticipantHealthReportCollectorImpl(this, _instanceName);

          .add(new ParticipantHealthReportTask(_participantHealthInfoCollector, _reportLatency));
      _controllerTimerTasks.add(new StatusDumpTask(this));
    case SPECTATOR:
      _stateMachineEngine = null;
      _participantHealthInfoCollector = null;
      throw new IllegalArgumentException("unrecognized type: " + instanceType);

  /**
   * Replaces the set of enabled controller pipeline types.
   *
   * @param types pipeline types to enable; stored as given, without copying
   * @throws IllegalStateException if this instance is neither CONTROLLER nor
   *         CONTROLLER_PARTICIPANT
   */
  public void setEnabledControlPipelineTypes(Set<Pipeline.Type> types) {
    if (!InstanceType.CONTROLLER.equals(_instanceType) && !InstanceType.CONTROLLER_PARTICIPANT
        .equals(_instanceType)) {
      throw new IllegalStateException(
          String.format("Cannot enable control pipeline for instance type %s", _instanceType));
    _enabledPipelineTypes = types;

  /**
   * Removes the handlers registered for the given listener on the given property key path.
   *
   * @return always true in the code visible here
   */
  @Override public boolean removeListener(PropertyKey key, Object listener) {"Removing listener: " + listener + " on path: " + key.getPath() + " from cluster: "
        + _clusterName + " by instance: " + _instanceName);

    synchronized (this) {
      // First pass collects matches, so _handlers is not modified while it is being iterated.
      List<CallbackHandler> toRemove = new ArrayList<>();
      for (CallbackHandler handler : _handlers) {
        // compare property-key path and listener reference
        if (handler.getPath().equals(key.getPath()) && handler.getListener().equals(listener)) {


      // handler.reset() may modify the handlers list, so do it outside the iteration
      // NOTE(review): the bodies of both loops are not visible in this view.
      for (CallbackHandler handler : toRemove) {

    return true;

  // No-argument form of the connection check.
  // NOTE(review): the body is not visible in this view.
  void checkConnected() {

  /**
   * Check whether HelixManager is connected; if it is not connected,
   * wait and check again before returning.
   * @param timeout a wait is attempted only when this value is greater than 0
   */
  // Throws HelixException when the client is missing or closed, or when it is still not
  // connected after the optional wait.
  void checkConnected(long timeout) {
    if (_zkclient == null || _zkclient.isClosed()) {
      throw new HelixException(
          "HelixManager (ZkClient) is not connected. Call HelixManager#connect()");

    boolean isConnected = isConnected();
    // Wait only when not yet connected and the caller asked for a positive timeout.
    // NOTE(review): the wait below uses _waitForConnectedTimeout rather than the 'timeout'
    // argument, while the error message further down reports 'timeout' - confirm which is intended.
    if (!isConnected && timeout > 0) {
          "zkClient to " + _zkAddress + " is not connected, wait for " + _waitForConnectedTimeout
              + "ms.");
      isConnected = _zkclient.waitUntilConnected(_waitForConnectedTimeout, TimeUnit.MILLISECONDS);

    if (!isConnected) {
      LOG.error("zkClient is not connected after waiting " + timeout + "ms."
          + ", clusterName: " + _clusterName + ", zkAddress: " + _zkAddress);
      throw new HelixException(
          "HelixManager is not connected within retry timeout for cluster " + _clusterName);

  /**
   * Registers a listener on a property key by creating a CallbackHandler and adding it to
   * _handlers. A handler with an equal path and listener is treated as a duplicate.
   *
   * @param listener listener object handed to the new CallbackHandler
   * @param propertyKey key whose path is watched
   * @param changeType change type passed to the handler
   * @param eventType ZooKeeper event types passed to the handler
   */
  void addListener(Object listener, PropertyKey propertyKey, ChangeType changeType,
      EventType[] eventType) {

    // Only used in the log message at the end of the method.
    PropertyType type = propertyKey.getType();

    synchronized (this) {
      for (CallbackHandler handler : _handlers) {
        // compare property-key path and listener reference
        if (handler.getPath().equals(propertyKey.getPath())
            && handler.getListener().equals(listener)) {
"Listener: " + listener + " on path: " + propertyKey.getPath()
              + " already exists. skip add");


      // NOTE(review): the remaining constructor arguments are cut off in this view.
      CallbackHandler newHandler =
          new CallbackHandler(this, _zkclient, propertyKey, listener, eventType, changeType,

      _handlers.add(newHandler);"Added listener: " + listener + " for type: " + type + " to path: "
          + newHandler.getPath());

  /**
   * Registers the listener on the idealStates() key with ChangeType.IDEAL_STATE, watching
   * NodeChildrenChanged events.
   */
  public void addIdealStateChangeListener(final IdealStateChangeListener listener) throws Exception {
    addListener(listener, new Builder(_clusterName).idealStates(), ChangeType.IDEAL_STATE,
        new EventType[] { EventType.NodeChildrenChanged });

  /**
   * Overload for the org.apache.helix.IdealStateChangeListener type; registers it on the same
   * idealStates() key with the same change type and event types as the other overload.
   */
  public void addIdealStateChangeListener(final org.apache.helix.IdealStateChangeListener listener)
      throws Exception {
    addListener(listener, new Builder(_clusterName).idealStates(), ChangeType.IDEAL_STATE,
        new EventType[] { EventType.NodeChildrenChanged });

  /**
   * Registers the listener on the liveInstances() key with ChangeType.LIVE_INSTANCE, watching
   * NodeChildrenChanged events.
   */
  public void addLiveInstanceChangeListener(LiveInstanceChangeListener listener) throws Exception {
    addListener(listener, new Builder(_clusterName).liveInstances(), ChangeType.LIVE_INSTANCE,
        new EventType[] { EventType.NodeChildrenChanged });

  /**
   * Overload for the org.apache.helix.LiveInstanceChangeListener type; registers it on the same
   * liveInstances() key with the same change type and event types as the other overload.
   */
  public void addLiveInstanceChangeListener(org.apache.helix.LiveInstanceChangeListener listener)
      throws Exception {
    addListener(listener, new Builder(_clusterName).liveInstances(), ChangeType.LIVE_INSTANCE,
        new EventType[] { EventType.NodeChildrenChanged });

  /**
   * Registers a ConfigChangeListener on the instanceConfigs() key with
   * ChangeType.INSTANCE_CONFIG.
   */
  public void addConfigChangeListener(ConfigChangeListener listener) throws Exception {
    // NOTE(review): the event-type array is cut off in this view; only NodeChildrenChanged is visible.
    addListener(listener, new Builder(_clusterName).instanceConfigs(), ChangeType.INSTANCE_CONFIG,
        new EventType[] { EventType.NodeChildrenChanged

  /**
   * Registers the listener on the instanceConfigs() key with ChangeType.INSTANCE_CONFIG.
   */
  public void addInstanceConfigChangeListener(InstanceConfigChangeListener listener)
      throws Exception {
    // NOTE(review): the event-type array is cut off in this view; only NodeChildrenChanged is visible.
    addListener(listener, new Builder(_clusterName).instanceConfigs(), ChangeType.INSTANCE_CONFIG,
        new EventType[] { EventType.NodeChildrenChanged

  /**
   * Overload for the org.apache.helix.InstanceConfigChangeListener type; registers it on the
   * instanceConfigs() key with ChangeType.INSTANCE_CONFIG.
   */
  public void addInstanceConfigChangeListener(org.apache.helix.InstanceConfigChangeListener listener)
      throws Exception {
    // NOTE(review): the event-type array is cut off in this view; only NodeChildrenChanged is visible.
    addListener(listener, new Builder(_clusterName).instanceConfigs(), ChangeType.INSTANCE_CONFIG,
        new EventType[] { EventType.NodeChildrenChanged

  /**
   * Registers the listener on the resourceConfigs() key with ChangeType.RESOURCE_CONFIG.
   */
  public void addResourceConfigChangeListener(ResourceConfigChangeListener listener) throws Exception{
    // NOTE(review): the event-type array is cut off in this view; only NodeChildrenChanged is visible.
    addListener(listener, new Builder(_clusterName).resourceConfigs(), ChangeType.RESOURCE_CONFIG,
        new EventType[] { EventType.NodeChildrenChanged

  /**
   * Registers the listener on the customizedStateConfig() key with
   * ChangeType.CUSTOMIZED_STATE_CONFIG.
   */
  public void addCustomizedStateConfigChangeListener(
      CustomizedStateConfigChangeListener listener) throws Exception {
    // NOTE(review): the contents of the event-type array are not visible in this view.
    addListener(listener, new Builder(_clusterName).customizedStateConfig(),
        ChangeType.CUSTOMIZED_STATE_CONFIG, new EventType[] {

  /**
   * Registers the listener on the clusterConfig() key with ChangeType.CLUSTER_CONFIG.
   * NOTE(review): the method name looks like a typo of "addClusterConfigChangeListener";
   * confirm against the HelixManager interface before renaming.
   */
  public void addClusterfigChangeListener(ClusterConfigChangeListener listener) throws Exception{
    // Unlike most registrations here, the first visible event type is NodeDataChanged.
    // NOTE(review): the rest of the array is cut off in this view.
    addListener(listener, new Builder(_clusterName).clusterConfig(), ChangeType.CLUSTER_CONFIG,
        new EventType[] { EventType.NodeDataChanged

  /**
   * Registers a scoped config listener with ChangeType.CONFIG on the property key selected
   * by the scope. If no key is selected, an error is logged and nothing is registered.
   *
   * @param listener listener to register
   * @param scope config scope used to choose the property key
   */
  public void addConfigChangeListener(ScopedConfigChangeListener listener, ConfigScopeProperty scope)
      throws Exception {
    Builder keyBuilder = new Builder(_clusterName);

    PropertyKey propertyKey = null;
    // NOTE(review): a case label before the instanceConfigs() assignment and the break
    // statements are not visible in this view.
    switch (scope) {
    case CLUSTER:
      propertyKey = keyBuilder.clusterConfigs();
      propertyKey = keyBuilder.instanceConfigs();
    case RESOURCE:
      propertyKey = keyBuilder.resourceConfigs();

    if (propertyKey != null) {
      // NOTE(review): the contents of the event-type array are not visible in this view.
      addListener(listener, propertyKey, ChangeType.CONFIG, new EventType[] {
    } else {
      LOG.error("Can't add listener to config scope: " + scope);

  /**
   * Overload for the org.apache.helix.ScopedConfigChangeListener type; casts the listener to
   * the imported ScopedConfigChangeListener type and delegates to that overload.
   */
  public void addConfigChangeListener(org.apache.helix.ScopedConfigChangeListener listener, ConfigScopeProperty scope)
      throws Exception {
    addConfigChangeListener((ScopedConfigChangeListener) listener, scope);

  // TODO: Decide whether this is still needed, since ClusterMessagingService is already exposed.
  /**
   * Registers the listener on the messages(instanceName) key with ChangeType.MESSAGE, watching
   * NodeChildrenChanged events.
   */
  public void addMessageListener(MessageListener listener, String instanceName) {
    addListener(listener, new Builder(_clusterName).messages(instanceName), ChangeType.MESSAGE,
        new EventType[] { EventType.NodeChildrenChanged });

  /**
   * Overload for the org.apache.helix.MessageListener type; registers it on the same
   * messages(instanceName) key with ChangeType.MESSAGE.
   */
  public void addMessageListener(org.apache.helix.MessageListener listener, String instanceName) {
    addListener(listener, new Builder(_clusterName).messages(instanceName), ChangeType.MESSAGE,
        new EventType[] { EventType.NodeChildrenChanged });

  /**
   * Registers the listener on the controllerMessages() key with
   * ChangeType.MESSAGES_CONTROLLER, watching NodeChildrenChanged events.
   */
  public void addControllerMessageListener(MessageListener listener) {
    addListener(listener, new Builder(_clusterName).controllerMessages(),
        ChangeType.MESSAGES_CONTROLLER, new EventType[] { EventType.NodeChildrenChanged });

  /**
   * Overload for the org.apache.helix.MessageListener type; registers it on the same
   * controllerMessages() key with ChangeType.MESSAGES_CONTROLLER.
   */
  public void addControllerMessageListener(org.apache.helix.MessageListener listener) {
    addListener(listener, new Builder(_clusterName).controllerMessages(),
        ChangeType.MESSAGES_CONTROLLER, new EventType[] { EventType.NodeChildrenChanged });

  /**
   * Registers the listener on the currentStates(instanceName, sessionId) key with
   * ChangeType.CURRENT_STATE.
   */
  public void addCurrentStateChangeListener(CurrentStateChangeListener listener,
      String instanceName, String sessionId) throws Exception {
    // NOTE(review): the event-type array is cut off in this view; only NodeChildrenChanged is visible.
    addListener(listener, new Builder(_clusterName).currentStates(instanceName, sessionId),
        ChangeType.CURRENT_STATE, new EventType[] { EventType.NodeChildrenChanged

  /**
   * Overload for the org.apache.helix.CurrentStateChangeListener type; registers it on the
   * currentStates(instanceName, sessionId) key with ChangeType.CURRENT_STATE.
   */
  public void addCurrentStateChangeListener(org.apache.helix.CurrentStateChangeListener listener,
      String instanceName, String sessionId) throws Exception {
    // NOTE(review): the event-type array is cut off in this view; only NodeChildrenChanged is visible.
    addListener(listener, new Builder(_clusterName).currentStates(instanceName, sessionId),
        ChangeType.CURRENT_STATE, new EventType[] { EventType.NodeChildrenChanged

  /**
   * Registers the listener on the customizedStatesRoot(instanceName) key with
   * ChangeType.CUSTOMIZED_STATE_ROOT, watching NodeChildrenChanged events.
   */
  public void addCustomizedStateRootChangeListener(CustomizedStateRootChangeListener listener,
      String instanceName) throws Exception {
    addListener(listener, new Builder(_clusterName).customizedStatesRoot(instanceName),
        ChangeType.CUSTOMIZED_STATE_ROOT, new EventType[]{EventType.NodeChildrenChanged});

  /**
   * Registers a listener for the customizedStates(instanceName, customizedStateType) key with
   * ChangeType.CUSTOMIZED_STATE, watching NodeChildrenChanged events.
   */
  public void addCustomizedStateChangeListener(CustomizedStateChangeListener listener,
      String instanceName, String customizedStateType) throws Exception {
    // NOTE(review): the opening of this call is not visible in this view; by analogy with the
    // sibling methods it is presumably addListener(listener, ...) - confirm.
        new Builder(_clusterName).customizedStates(instanceName, customizedStateType),
        ChangeType.CUSTOMIZED_STATE, new EventType[]{EventType.NodeChildrenChanged});

  /**
   * Registers the listener on the externalViews() key with ChangeType.EXTERNAL_VIEW, watching
   * NodeChildrenChanged events.
   */
  public void addExternalViewChangeListener(ExternalViewChangeListener listener) throws Exception {
    addListener(listener, new Builder(_clusterName).externalViews(), ChangeType.EXTERNAL_VIEW,
        new EventType[] { EventType.NodeChildrenChanged });

  /**
   * Registers the listener on the customizedView(customizedStateType) key with
   * ChangeType.CUSTOMIZED_VIEW.
   */
  public void addCustomizedViewChangeListener(CustomizedViewChangeListener listener, String customizedStateType)
      throws Exception {
    // NOTE(review): the contents of the event-type array are not visible in this view.
    addListener(listener, new Builder(_clusterName).customizedView(customizedStateType),
        ChangeType.CUSTOMIZED_VIEW, new EventType[] {

  /**
   * Registers the listener on the customizedViews() key with
   * ChangeType.CUSTOMIZED_VIEW_ROOT.
   */
  public void addCustomizedViewRootChangeListener(CustomizedViewRootChangeListener listener) throws Exception {
    // NOTE(review): the contents of the event-type array are not visible in this view.
    addListener(listener, new Builder(_clusterName).customizedViews(),
        ChangeType.CUSTOMIZED_VIEW_ROOT, new EventType[] {

  /**
   * Registers the listener with ChangeType.TARGET_EXTERNAL_VIEW, watching NodeChildrenChanged
   * events.
   * NOTE(review): the key used is externalViews(), the same one the plain external-view
   * listener uses; confirm that a target-external-view key is not intended here.
   */
  public void addTargetExternalViewChangeListener(ExternalViewChangeListener listener) throws Exception {
    addListener(listener, new Builder(_clusterName).externalViews(), ChangeType.TARGET_EXTERNAL_VIEW,
        new EventType[] { EventType.NodeChildrenChanged });

  /**
   * Overload for the org.apache.helix.ExternalViewChangeListener type; registers it on the
   * externalViews() key with ChangeType.EXTERNAL_VIEW.
   */
  public void addExternalViewChangeListener(org.apache.helix.ExternalViewChangeListener listener) throws Exception {
    addListener(listener, new Builder(_clusterName).externalViews(), ChangeType.EXTERNAL_VIEW,
        new EventType[] { EventType.NodeChildrenChanged });

  /**
   * Registers the listener on the controller() key with ChangeType.CONTROLLER, watching
   * NodeChildrenChanged events.
   */
  public void addControllerListener(ControllerChangeListener listener) {
    addListener(listener, new Builder(_clusterName).controller(), ChangeType.CONTROLLER,
        new EventType[] { EventType.NodeChildrenChanged });

  /**
   * Overload for the org.apache.helix.ControllerChangeListener type; registers it on the
   * controller() key with ChangeType.CONTROLLER.
   */
  public void addControllerListener(org.apache.helix.ControllerChangeListener listener) {
    addListener(listener, new Builder(_clusterName).controller(), ChangeType.CONTROLLER,
        new EventType[] { EventType.NodeChildrenChanged });

  /**
   * @return the data accessor built by createClient(); null if no client has been created yet
   */
  public HelixDataAccessor getHelixDataAccessor() {
    return _dataAccessor;

  /**
   * @return the config accessor built by createClient(); null if no client has been created yet
   */
  public ConfigAccessor getConfigAccessor() {
    return _configAccessor;

  /**
   * @return the cluster name given at construction
   */
  public String getClusterName() {
    return _clusterName;

  /**
   * Returns a string that can be used to connect to the metadata store for this HelixManager
   * instance, i.e. for ZKHelixManager this has the format "{zookeeper-address}:{port}".
   * @return a string used to connect to the metadata store
   */
  // Returns the ZK address passed to the constructor, unchanged.
  public String getMetadataStoreConnectionString() {
    return _zkAddress;

  /**
   * @return the instance name; when none was supplied, the constructor derives one from the
   *         local host name or uses "UNKNOWN"
   */
  public String getInstanceName() {
    return _instanceName;

  /**
   * Builds a new ZkBaseDataAccessor over the current _zkclient.
   *
   * @return a fresh accessor; nothing is cached here
   */
  BaseDataAccessor<ZNRecord> createBaseDataAccessor() {
    ZkBaseDataAccessor<ZNRecord> baseDataAccessor = new ZkBaseDataAccessor<>(_zkclient);
    return baseDataAccessor;

  /**
   * Add the Helix built-in state model definitions if they do not already exist.
   */
  // Walks every BuiltInStateModelDefinitions value.
  // NOTE(review): the statement that creates each definition is not visible in this view.
  private void addBuiltInStateModelDefinitions() {
    for (BuiltInStateModelDefinitions def : BuiltInStateModelDefinitions.values()) {
      // creation succeeds only if not exist

  /**
   * Decides from _monitorLevel whether only the root path is monitored.
   *
   * @return false for MonitorLevel.ALL; otherwise see the notes below
   */
  private boolean isMonitorRootPathOnly() {
    switch (_monitorLevel) {
    case ALL:
      return false;
      // NOTE(review): the case label for this branch is not visible in this view.
      return true;
      // Otherwise, apply the default policy. Emitting full metrics for controllers only.
      // NOTE(review): the second operand of this expression is cut off in this view.
      return !_instanceType.equals(InstanceType.CONTROLLER) && !_instanceType

  /**
   * Creates a new single-realm ZK client, installs it as _zkclient, rebuilds the accessors on
   * top of it and then waits for a ZK session to be established, with a retry loop.
   *
   * @throws Exception a HelixException is rethrown immediately; other exceptions are logged
   *         and rethrown once retryCount reaches 3
   */
  void createClient() throws Exception {
    // Built outside the lock; only the swap and the accessor rebuild run under synchronized (this).
    final RealmAwareZkClient newClient = createSingleRealmZkClient();

    synchronized (this) {
      // NOTE(review): what happens to a previously installed client is not visible in this view.
      if (_zkclient != null) {
      _zkclient = newClient;

      _baseDataAccessor = createBaseDataAccessor();

      _dataAccessor = new ZKHelixDataAccessor(_clusterName, _instanceType, _baseDataAccessor);
      _configAccessor = new ConfigAccessor(_zkclient);

      // NOTE(review): the controller-specific setup inside this branch is not visible in this view.
      if (_instanceType == InstanceType.CONTROLLER
          || _instanceType == InstanceType.CONTROLLER_PARTICIPANT) {

    // subscribe to state change
    int retryCount = 0;
    while (retryCount < 3) {
      try {
        final long sessionId =
            _zkclient.waitForEstablishedSession(_connectionInitTimeout, TimeUnit.MILLISECONDS);
         * This listener is subscribed after SyncConnected and firing new session events,
         * which means this listener has not yet handled new session, so we have to handle new
         * session here just for this listener.
      } catch (HelixException e) {
        LOG.error("fail to createClient.", e);
        throw e;
      } catch (Exception e) {
        // NOTE(review): the increment of retryCount is not visible in this view.
        LOG.error("fail to createClient. retry " + retryCount, e);
        if (retryCount == 3) {
          throw e;

  /**
   * Connects this manager. An already-connected manager only logs a warning. For the
   * CONTROLLER type a GenericHelixController is created first when none exists.
   *
   * @throws Exception any failure during connect is logged and rethrown
   */
  public void connect() throws Exception {"ClusterManager.connect()");
    if (isConnected()) {
      LOG.warn("Cluster manager: " + _instanceName + " for cluster: " + _clusterName
          + " already connected. skip connect");

    switch (_instanceType) {
    case CONTROLLER:
      // Reuse an existing controller; otherwise build one with the enabled pipeline types.
      if (_controller == null) {
        _controller = new GenericHelixController(_clusterName, _enabledPipelineTypes);

    // NOTE(review): the statements inside both try blocks are not visible in this view.
    try {
    } catch (Exception e) {
      LOG.error("fail to connect " + _instanceName, e);
      try {
      } catch (Exception ex) {
        // if zk connection fails to be created, disconnect may throw exception about reporting disconnect to ZK.
      throw e;

  /**
   * Disconnects this manager. The finally block handles the controller, participant manager
   * and callback monitors, then clears the cached property store, controller, leader election
   * handler, participant manager and session start time.
   * NOTE(review): many statements in this method are not visible in this view.
   */
  public void disconnect() {
    if (_zkclient == null || _zkclient.isClosed()) {"instanceName: " + _instanceName + " already disconnected");
    }"disconnect " + _instanceName + "(" + _instanceType + ") from " + _clusterName);

    try {
       * stop all timer tasks

       * shutdown thread pool first to avoid reset() being invoked in the middle of state
       * transition

      // TODO reset user defined handlers only
      // TODO Fix the issue that when connection disconnected, reset handlers will be blocked. -- JJ
      // This is because reset logic contains ZK operations.

      if (_leaderElectionHandler != null) {

    } finally {
      // Local snapshots, so the null checks and the later use see the same reference.
      GenericHelixController controller = _controller;
      if (controller != null) {
        try {
        } catch (InterruptedException e) {
"Interrupted shutting down GenericHelixController", e);

      ParticipantManager participantManager = _participantManager;
      if (participantManager != null) {

      for (HelixCallbackMonitor callbackMonitor : _callbackMonitors.values()) {

      // Dropped so getHelixPropertyStore() builds a fresh store on its next call.
      _helixPropertyStore = null;

      synchronized (this) {
        if (_controller != null) {
          _controller = null;
          _leaderElectionHandler = null;

        if (_participantManager != null) {
          _participantManager = null;

        if (_zkclient != null) {
      _sessionStartTime = null;"Cluster manager: " + _instanceName + " disconnected");

  /**
   * @return the last session id recorded in _sessionId; may be stale, see the TODO below
   */
  public String getSessionId() {
    // TODO: session id should be updated after zk client is connected.
    // Otherwise, this session id might be an expired one.
    return _sessionId;

  /**
   * Reports whether the ZK client is currently connected, without blocking.
   *
   * @return false when the client is null or closed, or when the check is interrupted;
   *         otherwise the result of waitUntilConnected with a zero timeout
   */
  public boolean isConnected() {
    if (_zkclient == null || _zkclient.isClosed()) {
      return false;
    // Don't check ZkConnection state, which is different from ZkClient's watcher state.
    // ZkConnection state is the internal state of the connection, which can be different from the
    // ZkClient state due to internal thread/retry logic.
    try {
      return _zkclient.waitUntilConnected(0, TimeUnit.MILLISECONDS);
    } catch (ZkInterruptedException ex) {
      return false;

  /**
   * @return always 0; no notification time is tracked here
   */
  public long getLastNotificationTime() {
    return 0;

  // Registers a pre-connect callback.
  // NOTE(review): the statement that stores the callback is not visible in this view;
  // presumably it is added to _preConnectCallbacks - confirm.
  public void addPreConnectCallback(PreConnectCallback callback) {"Adding preconnect callback: " + callback);

  /**
   * Checks whether this instance is the current cluster leader.
   *
   * @return true only when the leader node read from controllerLeader() names this instance
   *         and its ephemeral owner equals this manager's session id; false for
   *         non-controller instance types, when not connected, when no leader node exists,
   *         or when the check throws
   */
  public boolean isLeader() {
    String warnLogPrefix = String
        .format("Instance %s is not leader of cluster %s due to", _instanceName, _clusterName);
    if (_instanceType != InstanceType.CONTROLLER
        && _instanceType != InstanceType.CONTROLLER_PARTICIPANT) {
          .format("%s instance type %s does not match to CONTROLLER/CONTROLLER_PARTICIPANT",
      return false;

    if (!isConnected()) {
      LOG.warn(String.format("%s HelixManager is not connected", warnLogPrefix));
      return false;

    try {
      LiveInstance leader = _dataAccessor.getProperty(_keyBuilder.controllerLeader());
      if (leader != null) {
        String leaderName = leader.getInstanceName();
        String sessionId = leader.getEphemeralOwner();
        // A null sessionId would throw here; the catch below logs it and the method returns false.
        if (leaderName != null && leaderName.equals(_instanceName) && sessionId
            .equals(_sessionId)) {
          return true;
            .format("%s current session %s does not match leader session %s", warnLogPrefix,
                _sessionId, sessionId));
      } else {
        LOG.warn(String.format("%s leader ZNode is null", warnLogPrefix));
    } catch (Exception e) {
      LOG.warn(String.format("%s exception happen when session check", warnLogPrefix), e);
    return false;

  /**
   * Returns the cluster property store, creating it on first use.
   *
   * @return the cached store; built as an AutoFallbackPropertyStore over a new
   *         ZkBaseDataAccessor when none is cached
   */
  public synchronized ZkHelixPropertyStore<ZNRecord> getHelixPropertyStore() {

    if (_helixPropertyStore == null) {
      String path = PropertyPathBuilder.propertyStore(_clusterName);
      // Second location handed to the store as its fallback path.
      String fallbackPath = String.format("/%s/%s", _clusterName, "HELIX_PROPERTYSTORE");
      _helixPropertyStore =
          new AutoFallbackPropertyStore<>(new ZkBaseDataAccessor<>(_zkclient),
              path, fallbackPath);

    return _helixPropertyStore;

  /**
   * Returns an admin tool backed by the current ZK client.
   *
   * @return a new ZKHelixAdmin on every call while the client exists and is open; otherwise null
   */
  public synchronized HelixAdmin getClusterManagmentTool() {
    if (_zkclient != null && !_zkclient.isClosed()) {
      return new ZKHelixAdmin(_zkclient);

    // Also reached when the client exists but is closed, although the message only mentions null.
    LOG.error("Couldn't get ZKClusterManagementTool because zkclient is null");
    return null;

  /**
   * @return the messaging service created in the constructor; available before connect()
   */
  public ClusterMessagingService getMessagingService() {
    // The caller can register message handler factories on messaging service before the
    // helix manager is connected. Thus we do not do connected check here.
    return _messagingService;

  /**
   * @return the instance type given at construction
   */
  public InstanceType getInstanceType() {
    return _instanceType;

  /**
   * @return the version string read from the manager properties at construction
   */
  public String getVersion() {
    return _version;

  /**
   * @return the HelixManagerProperties instance created in the constructor
   */
  public HelixManagerProperties getProperties() {
    return _properties;

  /**
   * @return the state machine engine; null where the constructor set it to null
   *         (the CONTROLLER and SPECTATOR cases)
   */
  public StateMachineEngine getStateMachineEngine() {
    return _stateMachineEngine;

  // TODO: rename this and not expose this function as part of interface
  // Iterates over every task in _timerTasks.
  // NOTE(review): the loop body is not visible in this view; presumably it starts each task - confirm.
  public void startTimerTasks() {
    for (HelixTimerTask task : _timerTasks) {

  // Iterates over every task in _timerTasks.
  // NOTE(review): the loop body is not visible in this view; presumably it stops each task - confirm.
  public void stopTimerTasks() {
    for (HelixTimerTask task : _timerTasks) {

  /**
   * Stores the provider in _liveInstanceInfoProvider, replacing any previous one.
   *
   * @param liveInstanceInfoProvider provider to store; not validated here
   */
  public void setLiveInstanceInfoProvider(LiveInstanceInfoProvider liveInstanceInfoProvider) {
    _liveInstanceInfoProvider = liveInstanceInfoProvider;

  /**
   * Wait until we get a non-zero session id. Note that we might lose the zk connection
   * right after reading the session id, but it is OK to get a stale session id because
   * another handle-new-session callback will correct it.
   */
  // Loops until the client reports connected and the hex session id is not "0",
  // refreshing _sessionId on every pass.
  void waitUntilConnected() {
    boolean isConnected;
    do {
      isConnected =
          _zkclient.waitUntilConnected(HelixZkClient.DEFAULT_CONNECTION_TIMEOUT, TimeUnit.MILLISECONDS);
      if (!isConnected) {
        LOG.error("fail to connect zkserver: " + _zkAddress + " in "
            + HelixZkClient.DEFAULT_CONNECTION_TIMEOUT + "ms. expiredSessionId: " + _sessionId
            + ", clusterName: " + _clusterName);

      // Stored as a hex string, which is why the loop condition compares against "0".
      _sessionId = ZKUtil.toHexSessionId(_zkclient.getSessionId());

       * at the time we read session-id, zkconnection might be lost again
       * wait until we get a non-zero session-id
    } while (!isConnected || "0".equals(_sessionId));"Handling new session, session id: " + _sessionId + ", instance: " + _instanceName
        + ", instanceTye: " + _instanceType + ", cluster: " + _clusterName);

  /**
   * Iterates over a snapshot of the given handlers while holding this manager's monitor.
   *
   * NOTE(review): the statement(s) inside the loop, the call wrapping the "init handler" message
   * and all closing braces are absent from this copy; the existing comment implies that
   * {@code handler.init()} is invoked per handler. Confirm against the upstream source.
   *
   * @param handlers handlers to iterate over; {@code null} is accepted and skips the loop
   */
  void initHandlers(List<CallbackHandler> handlers) {
    synchronized (this) {
      if (handlers != null) {
        // get a copy of the list and iterate over the copy list
        // in case handler.init() modifies the original handler list
        List<CallbackHandler> tmpHandlers = new ArrayList<>(handlers);
        for (CallbackHandler handler : tmpHandlers) {
"init handler: " + handler.getPath() + ", " + handler.getListener());

  /**
   * Iterates over a snapshot of {@code _handlers} while holding this manager's monitor.
   *
   * NOTE(review): the statement(s) inside the loop, the call wrapping the "reset handler" message
   * and all closing braces are absent from this copy; the existing comment implies that
   * {@code handler.reset()} is invoked per handler. Confirm against the upstream source.
   *
   * @param isShutdown not referenced by any visible line; presumably forwarded to the handler
   *                   reset call (verify)
   */
  void resetHandlers(boolean isShutdown) {
    synchronized (this) {
      // A null _handlers list skips the loop entirely.
      if (_handlers != null) {
        // get a copy of the list and iterate over the copy list
        // in case handler.reset() modifies the original handler list
        List<CallbackHandler> tmpHandlers = new ArrayList<>(_handlers);
        for (CallbackHandler handler : tmpHandlers) {
"reset handler: " + handler.getPath() + ", " + handler.getListener());

  /**
   * If the zk state has changed into Disconnected more than maxDisconnectThreshold times within
   * the previous flappingTimeWindowMs milliseconds, we assume that something is wrong and
   * disconnect the ZKHelixManager from zk.
   */
  boolean isFlapping() {
    // Reports flapping when the number of retained disconnect timestamps exceeds
    // _maxDisconnectThreshold; an empty history is never flapping.
    if (_disconnectTimeHistory.size() == 0) {
      return false;
    // The last element is treated as the most recent disconnect time.
    long mostRecentTimestamp = _disconnectTimeHistory.get(_disconnectTimeHistory.size() - 1);

    // Remove disconnect history timestamp that are older than flappingTimeWindowMs ago
    // NOTE(review): the loop body and closing braces are absent from this copy; presumably the
    // oldest entry (index 0) is removed on each iteration. Confirm against the upstream source.
    while ((_disconnectTimeHistory.get(0) + _flappingTimeWindowMs) < mostRecentTimestamp) {
    return _disconnectTimeHistory.size() > _maxDisconnectThreshold;

  /**
   * Reacts to a ZooKeeper connection state change by switching on the new state.
   *
   * <p>Visible behaviour: on {@code Disconnected}, when {@link #isFlapping()} is true, an error
   * message is built, a PARTICIPANT instance is disabled through the admin tool, and
   * {@code _stateListener} (if set) is notified through {@code onDisconnected}. On
   * {@code Expired} a warning is logged.
   *
   * NOTE(review): this copy has lost its closing braces, the calls wrapping the dangling log
   * strings, and apparently some statements (e.g. whatever records the disconnect timestamp and
   * the body of the "disconnect" try block). Confirm against the upstream source.
   *
   * @param state the new ZooKeeper keeper state
   */
  public void handleStateChanged(KeeperState state) {
    switch (state) {
    case SyncConnected:"KeeperState: " + state + ", instance: " + _instanceName + ", type: " + _instanceType);
    case Disconnected:
       * Track the time stamp that the disconnected happens, then check history and see if
       * we should disconnect the helix-manager
      if (isFlapping()) {
        // Message later handed to the state listener wrapped in a HelixException.
        String errorMsg = "instanceName: " + _instanceName + " is flapping. disconnect it. "
            + " maxDisconnectThreshold: " + _maxDisconnectThreshold + " disconnects in "
            + _flappingTimeWindowMs + "ms.";

        // Only disable the instance when it's instance type is PARTICIPANT
        if (_instanceType.equals(InstanceType.PARTICIPANT)) {
          LOG.warn("instanceName: " + _instanceName
              + " is flapping. Since it is a participant, disable it.");
          try {
            // enableInstance(..., false) disables the instance; failures are logged only.
            getClusterManagmentTool().enableInstance(_clusterName, _instanceName, false);
          } catch (Exception e) {
            LOG.error("Failed to disable participant before disconnecting participant.", e);

        try {
          // TODO Call disconnect in another thread.
          // handleStateChanged is triggered in ZkClient eventThread. The disconnect logic will
          // interrupt this thread. This issue prevents the ZkClient.close() from complete. So the
          // client is left in a strange state.
        } catch (Exception ex) {
          LOG.error("Disconnect HelixManager is not completely done.", ex);

        if (_stateListener != null) {
          try {
            // Listener failures are logged and do not propagate.
            _stateListener.onDisconnected(this, new HelixException(errorMsg));
          } catch (Exception e) {
            LOG.warn("stateListener.onDisconnected callback fails", e);
    // if not flapping, share the continuous logic with Expired case
    case Expired:
      LOG.warn("KeeperState:" + state + ", SessionId: " + _sessionId + ", instance: "
          + _instanceName + ", type: " + _instanceType);
    default:"KeeperState:" + state + ", currentSessionId: " + _sessionId + ", instance: "
          + _instanceName + ", type: " + _instanceType);

  /**
   * Called after the zookeeper session has expired and a new session has been established. This
   * method may cause a session race condition when creating ephemeral nodes. Internally, this
   * method calls {@link #handleNewSession(String)} with a null value as the sessionId parameter,
   * which results in later creating the ephemeral node in the session of the latest zk connection.
   * But please note that the session of the latest zk connection might not be the expected session.
   * This is the session race condition issue.
   * To avoid the race condition issue, please use {@link #handleNewSession(String)}.
   * @deprecated
   * This method is deprecated, because it may cause a session race condition when creating
   * ephemeral nodes. It is kept for backward compatibility in case a user class extends this class.
   * Please use {@link #handleNewSession(String)} instead, which takes care of the race condition.
   * @throws Exception If any error occurs.
   */
  // NOTE(review): the body and closing brace of this no-argument overload are absent from this
  // copy of the file; restore them from the upstream source.
  public void handleNewSession() throws Exception {

  /**
   * Called after the zookeeper session has expired and a new session has been established. This
   * method handles a new session with its session id passed in. Before handling, this method
   * waits until the zk client is connected to the zk service and gets a non-zero session id (the
   * current actual session id). If the passed-in (expected) session id does not match the current
   * actual session id, the expected session id is expired and will NOT be handled.
   * @param sessionId the new session's id. The ephemeral nodes are expected to be created in this
   *                  session. If this session id is expired, ephemeral nodes should not be created.
   * @throws Exception if any error occurs during handling new session
   */
  public void handleNewSession(String sessionId) throws Exception {
    // NOTE(review): this copy of the method is heavily truncated. Block-comment delimiters,
    // closing braces and many statements are missing: the wait call, the early return for stale
    // sessions, the cleanup calls, the switch-case bodies, the handler init and the listener
    // callback. Only the statements that remain are annotated below.
     * TODO: after removing I0ItecIZkStateListenerImpl, null session should be checked and
     *  discarded.
     * Null session is still a special case here, which is treated as non-session aware operation.
     * This special case could still potentially cause race condition, so null session should NOT
     * be acceptable, once I0ItecIZkStateListenerImpl is removed. Currently this special case
     * is kept for backward compatibility.

    // Wait until we get a non-zero session id. Otherwise, getSessionId() might be null.

     * Filter out stale sessions. If a session id is not null and not the same as current session
     * id, this session is expired. With this filtering, expired sessions are NOT handled,
     * so performance is expected to improve.
    // A non-null expected id that differs from the current session id is treated as expired.
    if (sessionId != null && !getSessionId().equals(sessionId)) {
      LOG.warn("Session is expired and not handled. Expected: {}. Actual: {}.", sessionId,

     * When a null session id is passed in, we will take current session's id for following
     * operations. Please note that current session might not be the one we expect to handle,
     * because the one we expect might be already expired when the zk event is waiting in the
     * event queue. Why we use current session here is for backward compatibility with the old
     * method handleNewSession().
    if (sessionId == null) {
      // Fall back to the manager's current session id.
      sessionId = getSessionId();
      LOG.debug("Session id: <null> is passed in. Current session id: {} will be used.", sessionId);
    }"Handle new session, instance: {}, type: {}, session id: {}.", _instanceName,
        _instanceType,  sessionId);

     * stop all timer tasks, reset all handlers, make sure cleanup completed for previous session
     * disconnect if fail to cleanup
    if (_leaderElectionHandler != null) {

     * clean up write-through cache

     * from here on, we are dealing with new session
    // Refuse to continue when the cluster structure is not set up in ZooKeeper.
    if (!ZKUtil.isClusterSetup(_clusterName, _zkclient)) {
      throw new HelixException("Cluster structure is not set up for cluster: " + _clusterName);

    // Record the wall-clock time at which handling of the new session starts.
    _sessionStartTime = System.currentTimeMillis();

    // Per-instance-type handling; the case bodies are missing from this copy.
    switch (_instanceType) {
    case CONTROLLER:
    case SPECTATOR:


     * init handlers
     * ok to init message handler and data-accessor twice
     * the second init will be skipped (see CallbackHandler)

    if (_stateListener != null) {
      try {
      } catch (Exception e) {
        // Listener failures are logged and do not propagate.
        LOG.warn("stateListener.onConnected callback fails", e);

  /**
   * Replaces {@code _participantManager} with a new {@code ParticipantManager} built for the
   * given session id.
   *
   * NOTE(review): the body of the null check, any statement that follows the assignment, and the
   * closing braces are absent from this copy. Confirm against the upstream source.
   *
   * @param sessionId session id passed to the {@code ParticipantManager} constructor
   * @throws Exception declared by the signature; the throwing call is not visible here
   */
  void handleNewSessionAsParticipant(final String sessionId) throws Exception {
    if (_participantManager != null) {
    _participantManager =
        new ParticipantManager(this, _zkclient, _sessionTimeout, _liveInstanceInfoProvider,
            _preConnectCallbacks, sessionId, _helixManagerProperty);

  // TODO: pass in session id and make this method session aware to avoid potential session race
  //  condition.
  /**
   * Ensures a leader-election callback handler exists for this controller.
   *
   * <p>When {@code _leaderElectionHandler} is {@code null}, a new {@code CallbackHandler} is
   * created on the controller property key with a {@code DistributedLeaderElection} listener,
   * subscribed to NodeChildrenChanged, NodeDeleted and NodeCreated events.
   *
   * NOTE(review): the body of the non-null branch and the closing braces are absent from this
   * copy. Confirm against the upstream source.
   */
  void handleNewSessionAsController() {
    if (_leaderElectionHandler != null) {
    } else {
      _leaderElectionHandler =
          new CallbackHandler(this, _zkclient, _keyBuilder.controller(),
              new DistributedLeaderElection(this, _controller, _controllerTimerTasks),
              new EventType[] {
                  EventType.NodeChildrenChanged, EventType.NodeDeleted, EventType.NodeCreated
              }, ChangeType.CONTROLLER, _callbackMonitors.get(ChangeType.CONTROLLER));

  public ParticipantHealthReportCollector getHealthReportCollector() {
    return _participantHealthInfoCollector;

  /**
   * Logs a session establishment failure and forwards it to {@code _stateListener}, when one is
   * set, through {@code onDisconnected}.
   *
   * NOTE(review): the log message announces a disconnect, but no disconnect call is visible in
   * this copy; a statement and the closing braces appear to be missing. Confirm against the
   * upstream source.
   *
   * @param error the failure reported while establishing the session
   * @throws Exception declared by the signature; nothing visible here throws explicitly
   */
  public void handleSessionEstablishmentError(Throwable error) throws Exception {
    LOG.warn("Handling Session Establishment Error. Disconnect Helix Manager.", error);

    if (_stateListener != null) {
      _stateListener.onDisconnected(this, error);

  public Long getSessionStartTime() {
    return _sessionStartTime;

  /**
   * Prepares the connection config and client config based on the internal parameters given to
   * HelixManager in order to create a ZkClient instance to use. Note that a shared ZkClient
   * instance will be created if connecting as an ADMINISTRATOR to minimize the cost of creating
   * ZkConnections.
   */
  private RealmAwareZkClient createSingleRealmZkClient() {
    // NOTE(review): this copy is truncated. The connection-config builder chain, the client-config
    // setup, the trailing arguments of both resolveZkClient calls and the closing braces are
    // missing. shardingKey and zkSerializer are unused in the lines that remain; presumably they
    // feed the missing configuration calls.
    final String shardingKey = buildShardingKey();
    PathBasedZkSerializer zkSerializer =
        ChainedPathZkSerializer.builder(new ZNRecordSerializer()).build();

    RealmAwareZkClient.RealmAwareZkConnectionConfig connectionConfig =
        new RealmAwareZkClient.RealmAwareZkConnectionConfig.Builder()

    RealmAwareZkClient.RealmAwareZkClientConfig clientConfig =
        new RealmAwareZkClient.RealmAwareZkClientConfig();


    // ADMINISTRATOR instances resolve their client through the shared factory; every other
    // instance type uses the dedicated factory.
    if (_instanceType == InstanceType.ADMINISTRATOR) {
      return resolveZkClient(SharedZkClientFactory.getInstance(), connectionConfig,

    return resolveZkClient(DedicatedZkClientFactory.getInstance(), connectionConfig,

  /**
   * Resolves what type of ZkClient this HelixManager should use based on whether the
   * MULTI_ZK_ENABLED system config is set or not. Two types of ZkClients are available:
   * 1) If MULTI_ZK_ENABLED is set to true, we create a dedicated RealmAwareZkClient
   * that provides full ZkClient functionalities and connects to the correct ZK by querying
   * MetadataStoreDirectoryService.
   * 2) Otherwise, we create a dedicated HelixZkClient which plainly connects to
   * the ZK address given.
   */
  private RealmAwareZkClient resolveZkClient(HelixZkClientFactory zkClientFactory,
      RealmAwareZkClient.RealmAwareZkConnectionConfig connectionConfig,
      RealmAwareZkClient.RealmAwareZkClientConfig clientConfig) {
    // NOTE(review): closing braces and the tail of the helixZkConnectionConfig statement are
    // missing from this copy of the method.
    // Boolean.getBoolean reads the JVM system property named by MULTI_ZK_ENABLED.
    if (Boolean.getBoolean(SystemPropertyKeys.MULTI_ZK_ENABLED)) {
      try {
        // Create realm-aware ZkClient.
        return zkClientFactory.buildZkClient(connectionConfig, clientConfig);
      } catch (IllegalArgumentException | IOException | InvalidRoutingDataException e) {
        // Wrap the failure, keeping the cause and naming the sharding key that was used.
        throw new HelixException("Not able to connect on realm-aware mode for sharding key: "
            + connectionConfig.getZkRealmShardingKey(), e);

    // If multi-zk mode is not enabled, create HelixZkClient with the provided zk address.
    HelixZkClient.ZkClientConfig helixZkClientConfig = clientConfig.createHelixZkClientConfig();
    HelixZkClient.ZkConnectionConfig helixZkConnectionConfig =
        new HelixZkClient.ZkConnectionConfig(_zkAddress)

    return zkClientFactory.buildZkClient(helixZkConnectionConfig, helixZkClientConfig);

  /**
   * Builds the sharding key for this manager's cluster: the cluster name with a leading
   * {@code '/'}, added only when the name does not already start with one.
   *
   * NOTE(review): {@code charAt(0)} assumes {@code _clusterName} is non-empty; an empty name
   * would throw {@link StringIndexOutOfBoundsException}. Verify that callers guarantee this.
   *
   * @return the cluster name prefixed with {@code '/'}
   */
  private String buildShardingKey() {
    return _clusterName.charAt(0) == '/' ? _clusterName : "/" + _clusterName;