/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs;

import java.io.FileOutputStream;
import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.Properties;
import java.util.ArrayList;
import java.net.InetSocketAddress;

import junit.framework.Assert;

import org.apache.zookeeper.server.ZooKeeperServer;
import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException;
import org.apache.zookeeper.server.persistence.FileTxnSnapLog;
import org.apache.zookeeper.server.NIOServerCnxn;
import org.apache.zookeeper.server.ServerConfig;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.protocol.AvatarConstants;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.hdfs.server.common.HdfsConstants;
import org.apache.hadoop.hdfs.server.common.HdfsConstants.StartupOption;
import org.apache.hadoop.hdfs.server.namenode.AvatarNode;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.datanode.AvatarDataNode;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.net.StaticMapping;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.net.DNSToSwitchMapping;

/**
 * This class manages an Avatar/HDFS cluster with all nodes running
 * locally. To synchronize the AvatarNodes, it uses a local ZooKeeper
 * server.
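 *
 * <p>A minimal usage sketch (illustrative only; assumes a JUnit-style test
 * driving the cluster):
 * <pre>{@code
 *   Configuration conf = new Configuration();
 *   MiniAvatarCluster.createAndStartZooKeeper();
 *   MiniAvatarCluster cluster =
 *       new MiniAvatarCluster(conf, 2, true, null, null);
 *   try {
 *     FileSystem fs = cluster.getFileSystem();
 *     // ... exercise the cluster through fs ...
 *   } finally {
 *     cluster.shutDown();
 *     MiniAvatarCluster.shutDownZooKeeper();
 *   }
 * }</pre>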
 */
public class MiniAvatarCluster {

  public static final String NAMESERVICE_ID_PREFIX = "nameserviceId";
  public static int currNSId = 0;

  public static class DataNodeProperties {
    public AvatarDataNode datanode;
    public Configuration conf;
    public String[] dnArgs;

    DataNodeProperties(AvatarDataNode node, Configuration conf, String[] args) {
      this.datanode = node;
      this.conf = conf;
      this.dnArgs = args;
    }
  }

  private static enum AvatarState {
    ACTIVE,
    STANDBY,
    DEAD
  }
  
  public static class AvatarInfo {
    public AvatarNode avatar;
    AvatarState state;
    int nnPort;
    int nnDnPort;
    int httpPort;
    int rpcPort;
    String startupOption;
    
    AvatarInfo(AvatarNode avatar, AvatarState state,
               int nnPort, int nnDnPort, int httpPort,
               int rpcPort, String startupOption) {
      this.avatar = avatar;
      this.state = state;
      this.nnPort = nnPort;
      this.nnDnPort = nnDnPort;
      this.httpPort = httpPort;
      this.rpcPort = rpcPort;
      this.startupOption = startupOption;
    }
  }

  private static final Log LOG = LogFactory.getLog(MiniAvatarCluster.class);

  private static final String DEFAULT_TEST_DIR = 
    "build/contrib/highavailability/test/data";
  private static final String TEST_DIR =
    new File(System.getProperty("test.build.data", DEFAULT_TEST_DIR)).
    getAbsolutePath();

  private static final String ZK_DATA_DIR = TEST_DIR + "/zk.data";
  private static final String ZK_CONF_FILE = TEST_DIR + "/zk.conf";

  private static final int zkClientPort = MiniDFSCluster.getFreePort();
  private static String baseAvatarDir;
  private static String dataDir;
  private int numDataNodes;
  private boolean format;
  private String[] racks;
  private String[] hosts;
  private boolean federation;
  private NameNodeInfo[] nameNodes;
  private Configuration conf;
  
  public class NameNodeInfo {
    Configuration conf;
    public ArrayList<AvatarInfo> avatars = null;
    private final String fsimagelocal0Dir;
    private final String fsimagelocal1Dir;
    private final String fsimage0Dir;
    private final String fsimage1Dir;
    private final String fseditslocal0Dir;
    private final String fseditslocal1Dir;
    private final String fsedits0Dir;
    private final String fsedits1Dir;

    private final int nnPort;
    private final int nn0Port;
    private final int nn1Port;
    private final int nnDnPort;
    private final int nnDn0Port;
    private final int nnDn1Port;
    private final int httpPort;
    private final int http0Port;
    private final int http1Port;
    private final int rpcPort;
    private final int rpc0Port;
    private final int rpc1Port;

    private Configuration clientConf;
    private Configuration a0Conf;
    private Configuration a1Conf;
    private final String avatarDir;
    String nameserviceId;
    
    NameNodeInfo(int nnIndex) {
      avatarDir = baseAvatarDir;
      fsimagelocal0Dir = avatarDir + "/fsimagelocal0";
      fsimagelocal1Dir = avatarDir + "/fsimagelocal1";
      fsimage0Dir = avatarDir + "/fsimage0";
      fsimage1Dir = avatarDir + "/fsimage1";
      fseditslocal0Dir = avatarDir + "/fseditslocal0";
      fseditslocal1Dir = avatarDir + "/fseditslocal1";
      fsedits0Dir = avatarDir + "/fsedits0";
      fsedits1Dir = avatarDir + "/fsedits1";

      rpcPort = nnPort = MiniDFSCluster.getFreePort();
      nnDnPort = MiniDFSCluster.getFreePort();
      httpPort = MiniDFSCluster.getFreePort();
      rpc0Port = nn0Port = MiniDFSCluster.getFreePorts(2);
      nnDn0Port = MiniDFSCluster.getFreePort();
      http0Port = MiniDFSCluster.getFreePort();
      rpc1Port = nn1Port = MiniDFSCluster.getFreePorts(2);
      nnDn1Port = MiniDFSCluster.getFreePort();
      http1Port = MiniDFSCluster.getFreePort();
    }
    
    public void setAvatarNodes(ArrayList<AvatarInfo> avatars) {
      this.avatars = avatars;
    }
    
    public void initClientConf(Configuration conf) {
      clientConf = new Configuration(conf);
      clientConf.set("fs.default.name", "hdfs://localhost:" + nnPort);
      clientConf.set("fs.default.name0", "hdfs://localhost:" + nn0Port);
      clientConf.set("fs.default.name1", "hdfs://localhost:" + nn1Port);
      clientConf.set("dfs.namenode.dn-address", "localhost:" + nnDnPort);
      clientConf.set("dfs.namenode.dn-address0", "localhost:" + nnDn0Port);
      clientConf.set("dfs.namenode.dn-address1", "localhost:" + nnDn1Port);
      clientConf.set("fs.hdfs.impl",
          "org.apache.hadoop.hdfs.DistributedAvatarFileSystem");
      clientConf.setBoolean("fs.hdfs.impl.disable.cache", true);
      // Lower the number of retries to close connections quickly.
      clientConf.setInt("ipc.client.connect.max.retries", 3);
    }
    
    public void initGeneralConf(Configuration conf, String nameserviceId) {
      // overwrite relevant settings
      initClientConf(conf);
      this.nameserviceId = nameserviceId;
      // avatar nodes
      if (federation) {
        conf.set("dfs.namenode.rpc-address0", "localhost:" + rpc0Port);
        conf.set("dfs.namenode.rpc-address1", "localhost:" + rpc1Port);
      } else {
        conf.set("fs.default.name", "hdfs://localhost:" + nnPort);
        conf.set("fs.default.name0", "hdfs://localhost:" + nn0Port);
        conf.set("fs.default.name1", "hdfs://localhost:" + nn1Port);
        conf.set("dfs.namenode.dn-address", "localhost:" + nnDnPort);
        conf.set("dfs.http.address", "localhost:" + httpPort);
      }
      // Enable avatar testing framework for unit tests.
      conf.setFloat("dfs.avatarnode.failover.sample.percent", 1.0f);

      conf.set("dfs.namenode.dn-address0", "localhost:" + nnDn0Port);
      conf.set("dfs.namenode.dn-address1", "localhost:" + nnDn1Port);
      conf.set("dfs.http.address0", "localhost:" + http0Port);
      conf.set("dfs.http.address1", "localhost:" + http1Port);
      
      conf.set("dfs.name.dir.shared0", fsimage0Dir);
      conf.set("dfs.name.dir.shared1", fsimage1Dir);
      conf.set("dfs.name.edits.dir.shared0", fsedits0Dir);
      conf.set("dfs.name.edits.dir.shared1", fsedits1Dir);
      conf.setInt("dfs.safemode.extension", 1000);
      // These two ipc parameters help RPC connections to shut down quickly in
      // unit tests.
      conf.setInt("ipc.client.connect.max.retries", 3);
      conf.setInt("ipc.client.connect.timeout", 2000);
      // We need to disable the filesystem cache so that unit tests and
      // MiniAvatarCluster don't end up sharing FileSystem objects.
      if (federation) {
        for (String key: AvatarNode.AVATARSERVICE_SPECIFIC_KEYS) {
          String value = conf.get(key);
          if (value != null) {
            String newKey = DFSUtil.getNameServiceIdKey(key, nameserviceId);
            conf.set(newKey, value);
            conf.set(key, "");
          }
        }
        String rpcKey = DFSUtil.getNameServiceIdKey(
            AvatarNode.DFS_NAMENODE_RPC_ADDRESS_KEY, nameserviceId);
        conf.set(rpcKey, "localhost:" + rpcPort);
        String dnKey = DFSUtil.getNameServiceIdKey(
            NameNode.DATANODE_PROTOCOL_ADDRESS, nameserviceId);
        conf.set(dnKey, "localhost:" + nnDnPort);
        String httpKey = DFSUtil.getNameServiceIdKey(
            NameNode.DFS_NAMENODE_HTTP_ADDRESS_KEY, nameserviceId);
        conf.set(httpKey, "localhost:" + httpPort);
      }
    }
    
    public void updateAvatarConf(Configuration newConf) {
      conf = new Configuration(newConf);
      if (federation) {
        conf.set(FSConstants.DFS_FEDERATION_NAMESERVICE_ID, nameserviceId);
      }
      
      // server config for avatar nodes
      a0Conf = new Configuration(conf);
      a1Conf = new Configuration(conf);

      a0Conf.set("dfs.name.dir", fsimagelocal0Dir);
      a0Conf.set("dfs.name.edits.dir", fseditslocal0Dir);
      a0Conf.set("fs.checkpoint.dir", avatarDir + "/checkpoint0");

      a1Conf.set("dfs.name.dir", fsimagelocal1Dir);
      a1Conf.set("dfs.name.edits.dir", fseditslocal1Dir);
      a1Conf.set("fs.checkpoint.dir", avatarDir + "/checkpoint1");
    }
    
    public void createAvatarDirs() {
      new File(fsimagelocal0Dir).mkdirs();
      new File(fsimagelocal1Dir).mkdirs();
      new File(fsimage0Dir).mkdirs();
      new File(fsimage1Dir).mkdirs();
      new File(fseditslocal0Dir).mkdirs();
      new File(fseditslocal1Dir).mkdirs();
      new File(fsedits0Dir).mkdirs();
      new File(fsedits1Dir).mkdirs();
    }
    
    public void cleanupAvatarDirs() throws IOException {
      String[] files = new String[] {fsimagelocal0Dir, fsimagelocal1Dir,
          fsimage0Dir, fsimage1Dir, fseditslocal0Dir, fseditslocal1Dir,
          fsedits0Dir, fsedits1Dir
      };
      for (String filename : files) {
        FileUtil.fullyDelete(new File(filename));
      }
    }
  }

  private static ZooKeeperServer zooKeeper;
  private static NIOServerCnxn.Factory cnxnFactory;

  private ArrayList<DataNodeProperties> dataNodes = 
    new ArrayList<DataNodeProperties>();

  public MiniAvatarCluster(Configuration conf,
                           int numDataNodes,
                           boolean format,
                           String[] racks,
                           String[] hosts) 
    throws IOException, ConfigException, InterruptedException {
    this(conf, numDataNodes, format, racks, hosts, 1, false);
  }
  /**
   * Modify the config and start up the servers.  The RPC and info ports for
   * the servers are guaranteed to be free ports.
   * <p>
   * NameNode and DataNode directory creation and configuration will be
   * managed by this class.
   *
   * @param conf the base configuration to use in starting the servers.  This
   *          will be modified as necessary.
   * @param numDataNodes Number of DataNodes to start; may be zero
   * @param format if true, format the NameNode and DataNodes before starting up
   * @param racks array of strings indicating the rack that each DataNode is on
   * @param hosts array of strings indicating the hostname of each DataNode
   * @param numNameNodes Number of NameNodes to start
   * @param federation if true, start the cluster with a federation configuration
   */
  public MiniAvatarCluster(Configuration conf,
                           int numDataNodes,
                           boolean format,
                           String[] racks,
                           String[] hosts,
                           int numNameNodes,
                           boolean federation) 
    throws IOException, ConfigException, InterruptedException {

    final String testDir = TEST_DIR + "/" + conf.get(MiniDFSCluster.DFS_CLUSTER_ID, "");
    baseAvatarDir = testDir + "/avatar";
    dataDir = testDir + "/data";
    this.conf = conf;
    this.numDataNodes = numDataNodes;
    this.format = format;
    this.racks = racks;
    this.hosts = hosts;
    
    conf.setInt("dfs.secondary.info.port", 0);
    conf.set("fs.ha.zookeeper.prefix", "/hdfs");
    conf.set("fs.ha.zookeeper.quorum", "localhost:" + zkClientPort);
    
    // datanodes
    conf.set("dfs.datanode.address", "localhost:0");
    conf.set("dfs.datanode.http.address", "localhost:0");
    conf.set("dfs.datanode.ipc.address", "localhost:0");
    conf.set("dfs.datanode.dns.interface", "lo");
    conf.set("dfs.namenode.dns.interface", "lo");

    // other settings
    conf.setBoolean("dfs.permissions", false);
    conf.setBoolean("dfs.persist.blocks", true);
    conf.set("fs.hdfs.impl",
             "org.apache.hadoop.hdfs.DistributedAvatarFileSystem");
    conf.setLong("dfs.blockreport.initialDelay", 0);
    conf.setClass("topology.node.switch.mapping.impl", 
                  StaticMapping.class, DNSToSwitchMapping.class);

    this.federation = federation;
    Collection<String> nameserviceIds = DFSUtil.getNameServiceIds(conf);
    if (nameserviceIds.size() > 1) {
      this.federation = true;
    }
    if (!federation && numNameNodes != 1) {
      throw new IOException(
          "Only 1 namenode is allowed in a non-federation cluster.");
    }
    nameNodes = new NameNodeInfo[numNameNodes];
    for (int nnIndex = 0; nnIndex < numNameNodes; nnIndex++) {
      nameNodes[nnIndex] = new NameNodeInfo(nnIndex);
      if (format)
        nameNodes[nnIndex].cleanupAvatarDirs();
      nameNodes[nnIndex].createAvatarDirs();
    }
    if (!federation) {
      nameNodes[0].initGeneralConf(conf, null);
    } else {
      if (nameserviceIds.isEmpty()) {
        for (int i = 0; i < nameNodes.length; i++) {
          nameserviceIds.add(NAMESERVICE_ID_PREFIX + getNSId());
        }
      }
      initFederationConf(conf, nameserviceIds);
    }

    if (this.format) {
      File data_dir = new File(dataDir);
      if (data_dir.exists() && !FileUtil.fullyDelete(data_dir)) {
        throw new IOException("Cannot remove data directory: " + data_dir);
      }
    }
  
    // Need to start datanodes before avatarnodes: the primary starts up in
    // safemode, and the standby waits for the primary to exit safemode during
    // its own startup. If we started the avatarnodes first with a non-empty
    // FSImage and FSEdits, the primary would wait for datanode block reports
    // while the standby waited for the primary to leave safemode; since we
    // would never return from the standby initialization, the datanodes would
    // never start, and we would deadlock.
    registerZooKeeperNodes();
    startDataNodes();
    startAvatarNodes();
    waitAvatarNodesActive();

    waitDataNodesActive();

    waitExitSafeMode();
  }
  
  private void initFederationConf(Configuration conf,
      Collection<String> nameserviceIds) {
    String nameserviceIdList = "";
    int nnIndex = 0;
    for (String nameserviceId : nameserviceIds) {
      // Create comma separated list of nameserviceIds
      if (nameserviceIdList.length() > 0) {
        nameserviceIdList += ",";
      }
      nameserviceIdList += nameserviceId;
      nameNodes[nnIndex].initGeneralConf(conf, nameserviceId);
      nnIndex++;
    }
    conf.set(FSConstants.DFS_FEDERATION_NAMESERVICES, nameserviceIdList);
  }

  private static ServerConfig createZooKeeperConf() 
    throws IOException, ConfigException {
    
    // create conf file
    File zkConfDir = new File(TEST_DIR);
    zkConfDir.mkdirs();
    File zkConfFile = new File(ZK_CONF_FILE);
    zkConfFile.delete();
    zkConfFile.createNewFile();

    Properties zkConfProps = new Properties();
    zkConfProps.setProperty("tickTime", "2000");
    zkConfProps.setProperty("dataDir", ZK_DATA_DIR);
    zkConfProps.setProperty("clientPort", new Integer(zkClientPort).toString());
    zkConfProps.setProperty("maxClientCnxns", "30");
    zkConfProps.store(new FileOutputStream(zkConfFile), "");

    // create config object
    ServerConfig zkConf = new ServerConfig();
    zkConf.parse(ZK_CONF_FILE);

    return zkConf;
  }

  private static ServerConfig getZooKeeperConf() throws Exception {
    if (new File(ZK_CONF_FILE).exists()) {
      ServerConfig zkConf = new ServerConfig();
      zkConf.parse(ZK_CONF_FILE);

      return zkConf;
    } else {
      return createZooKeeperConf();
    }
  }

  public static boolean clearZooKeeperData() throws Exception {
    ServerConfig zkConf = getZooKeeperConf();
    File dataLogDir = new File(zkConf.getDataLogDir());
    File dataDir = new File(zkConf.getDataDir());
    return (FileUtil.fullyDelete(dataLogDir) && FileUtil.fullyDelete(dataDir));
  }
    
  public static void createAndStartZooKeeper() 
    throws IOException, ConfigException, InterruptedException {
    ServerConfig zkConf = createZooKeeperConf();

    zooKeeper = new ZooKeeperServer();
    FileTxnSnapLog ftxn = new 
      FileTxnSnapLog(new File(zkConf.getDataLogDir()),
                     new File(zkConf.getDataDir()));
    zooKeeper.setTxnLogFactory(ftxn);
    zooKeeper.setTickTime(zkConf.getTickTime());
    zooKeeper.setMinSessionTimeout(zkConf.getMinSessionTimeout());
    zooKeeper.setMaxSessionTimeout(zkConf.getMaxSessionTimeout());

    cnxnFactory =
      new NIOServerCnxn.Factory(zkConf.getClientPortAddress(),
                                zkConf.getMaxClientCnxns());
    cnxnFactory.startup(zooKeeper);

  }

  private void registerZooKeeperNode(int nnPrimaryPort, int nnDnPrimaryPort,
      int httpPrimaryPort, int rpcPrimaryPort, NameNodeInfo nni) throws IOException {
    AvatarZooKeeperClient zkClient = new AvatarZooKeeperClient(nni.conf, null);
    zkClient.registerPrimary("localhost:" + nni.nnPort, 
    "localhost:" + nnPrimaryPort);
    zkClient.registerPrimary("localhost:" + nni.nnDnPort, 
    "localhost:" + nnDnPrimaryPort);
    zkClient.registerPrimary("localhost:" + nni.httpPort,
    "localhost:" + httpPrimaryPort);
    zkClient.registerPrimary("localhost:" + nni.rpcPort,
    "localhost:" + rpcPrimaryPort);
    try {
      zkClient.shutdown();
    } catch (InterruptedException ie) {
      throw new IOException("zkClient.shutdown() interrupted");
    }
    LOG.info("Closed zk client connection for registerZookeeper");
  }

  void clearZooKeeperNode(int nnIndex) throws IOException {
    NameNodeInfo nni = this.nameNodes[nnIndex];
    AvatarZooKeeperClient zkClient = new AvatarZooKeeperClient(nni.conf, null);
    zkClient.clearPrimary("localhost:" + nni.httpPort);
    zkClient.clearPrimary("localhost:" + nni.nnPort);
    zkClient.clearPrimary("localhost:" + nni.nnDnPort);
    zkClient.clearPrimary("localhost:" + nni.rpcPort);
    try {
      zkClient.shutdown();
    } catch (InterruptedException ie) {
      throw new IOException("zkClient.shutdown() interrupted");
    }
    LOG.info("Closed zk client connection for clearZKNode");
  }

  static Configuration getServerConf(String startupOption,
      NameNodeInfo nni) {
    // namenode should use DFS, not DAFS

    if (startupOption.
               equals(AvatarConstants.StartupOption.NODEZERO.getName())) {
      return new Configuration(nni.a0Conf);
    } else if (startupOption.
               equals(AvatarConstants.StartupOption.NODEONE.getName())) {
      return new Configuration(nni.a1Conf);
    } else {
      throw new IllegalArgumentException("invalid avatar");
    }
  }
  
  private void registerZooKeeperNodes() throws IOException {
    for (NameNodeInfo nni : this.nameNodes) {
      nni.updateAvatarConf(this.conf);
      registerZooKeeperNode(nni.nn0Port, nni.nnDn0Port, nni.http0Port,
          nni.rpc0Port, nni);
    }
  }

  private void startAvatarNodes() throws IOException {
    for (NameNodeInfo nni: this.nameNodes) {
      nni.updateAvatarConf(this.conf);
      startAvatarNode(nni, null);
    }
  }

  private void startAvatarNode(NameNodeInfo nni, StartupOption operation) throws IOException {
    registerZooKeeperNode(nni.nn0Port, nni.nnDn0Port, nni.http0Port,
        nni.rpc0Port, nni);

    if (format) {
      LOG.info("formatting");
      // Start the NameNode
      String[] a0FormatArgs; 
      ArrayList<String> argList = new ArrayList<String>();
      argList.add(AvatarConstants.StartupOption.
          NODEZERO.getName());
      argList.add(AvatarConstants.StartupOption.
          FORMATFORCE.getName());
      if (federation) {
        argList.add(StartupOption.SERVICE.getName());
        argList.add(nni.nameserviceId);
      }
      a0FormatArgs = new String[argList.size()];
      argList.toArray(a0FormatArgs);
      AvatarNode.createAvatarNode(a0FormatArgs, 
                                  getServerConf(AvatarConstants.StartupOption.
                                                NODEZERO.getName(), nni));
    }
    ArrayList<AvatarInfo> avatars = new ArrayList<AvatarInfo>(2);
    {
      LOG.info("starting avatar 0");
      String[] a0Args; 
      ArrayList<String> argList = new ArrayList<String>();
      if (operation != null) {
        argList.add(operation.getName());
      }
      argList.add(AvatarConstants.StartupOption.NODEZERO.getName());
      if (federation) {
        argList.add(StartupOption.SERVICE.getName());
        argList.add(nni.nameserviceId);
      }
      a0Args = new String[argList.size()];
      argList.toArray(a0Args);
      AvatarNode a0 = AvatarNode.createAvatarNode(a0Args,
          getServerConf(AvatarConstants.StartupOption.NODEZERO.getName(), nni));

      avatars.add(new AvatarInfo(a0,
                                 AvatarState.ACTIVE,
                                 nni.nn0Port, nni.nnDn0Port, nni.http0Port, nni.rpc0Port,
                                 AvatarConstants.StartupOption.NODEZERO.
                                 getName()));
      
      // wait for up to 10 seconds until the ACTIVE is initialized
      for (int i = 0; i < 10; i++) {
        if (a0.isInitialized())
          break;
        LOG.info("Waiting for the ACTIVE to be initialized...");
        try {
          Thread.sleep(1000);
        } catch (InterruptedException e) {
          throw new IOException(
              "Received interruption when initializing ACTIVE node");
        }
      }
      if (!a0.isInitialized()) {
        throw new IOException("The ACTIVE cannot be initialized");
      }
    }

    {
      LOG.info("starting avatar 1");
      String[] a1Args; 
      ArrayList<String> argList = new ArrayList<String>();
      argList.add(AvatarConstants.StartupOption.NODEONE.getName());
      argList.add(AvatarConstants.StartupOption.STANDBY.getName());
      argList.add(AvatarConstants.StartupOption.REGULAR.getName());
      if (federation) {
        argList.add(StartupOption.SERVICE.getName());
        argList.add(nni.nameserviceId);
      }
      a1Args = new String[argList.size()];
      argList.toArray(a1Args);
      avatars.add(new AvatarInfo(
          AvatarNode.createAvatarNode(a1Args,
              getServerConf(AvatarConstants.StartupOption.NODEONE.getName(), nni)),
          AvatarState.STANDBY,
          nni.nn1Port, nni.nnDn1Port, nni.http1Port, nni.rpc1Port,
          AvatarConstants.StartupOption.NODEONE.getName()));
    }

    for (AvatarInfo avatar: avatars) {
      if (avatar.avatar == null) {
        throw new IOException("Cannot create avatar nodes");
      }
      Assert.assertTrue(
          avatar.avatar.getConf().getBoolean("dfs.persist.blocks", false));
    }
    nni.setAvatarNodes(avatars);
    DFSUtil.setGenericConf(nni.conf, nni.nameserviceId, 
        AvatarNode.AVATARSERVICE_SPECIFIC_KEYS);
    nni.updateAvatarConf(nni.conf);
  }

  public void restartAvatarNodes() throws Exception {
    shutDownAvatarNodes();
    for (NameNodeInfo nni : this.nameNodes) {
      nni.avatars.clear();
    }
    this.format = false;
    Thread.sleep(10000);
    startAvatarNodes();
    waitAvatarNodesActive();

    waitDataNodesActive();

    waitExitSafeMode();
  }

  public void shutDownDataNode(int i) throws IOException, InterruptedException {
    dataNodes.get(i).datanode.shutdown();
  }

  public void shutDownDataNodes() throws IOException, InterruptedException {
    for (int i = 0; i < dataNodes.size(); i++) {
      LOG.info("shutting down data node " + i);
      shutDownDataNode(i);
      LOG.info("data node " + i + " shut down");
    }
  }

  public void shutDownAvatarNodes() throws IOException, InterruptedException {
    for (NameNodeInfo nni : this.nameNodes) {
      for (AvatarInfo avatar: nni.avatars) {
        if (avatar.state == AvatarState.ACTIVE || 
            avatar.state == AvatarState.STANDBY) {
          LOG.info("shutdownAvatar");
          avatar.avatar.shutdown(true);
        }
      }
    }

    try {
      Thread.sleep(1000);
    } catch (InterruptedException ignore) {
      // do nothing
    }
  }

  public static void shutDownZooKeeper() throws IOException, InterruptedException {
    cnxnFactory.shutdown();
    cnxnFactory.join();
    LOG.info("Zookeeper Connection Factory shutdown");
    if (zooKeeper.isRunning()) {
      zooKeeper.shutdown();
    }
    LOG.info("Zookeepr Server shutdown");
  }
  
  /**
   * Shut down the cluster
   */
  public void shutDown() throws IOException, InterruptedException {
    System.out.println("Shutting down the Mini Avatar Cluster");
    // this doesn't work, so just leave the datanodes running,
    // they won't interfere with the next run
    shutDownDataNodes();
    
    shutDownAvatarNodes(); 

  }

  private void startDataNodes() throws IOException {
    if (racks != null && numDataNodes > racks.length ) {
      throw new IllegalArgumentException( "The length of racks [" + 
                                          racks.length +
                                          "] is less than the number " +
                                          "of datanodes [" +
                                          numDataNodes + "].");
    }
    if (hosts != null && numDataNodes > hosts.length ) {
      throw new IllegalArgumentException( "The length of hosts [" + 
                                          hosts.length +
                                          "] is less than the number " +
                                          "of datanodes [" +
                                          numDataNodes + "].");
    }

    //Generate some hostnames if required
    if (racks != null && hosts == null) {
      LOG.info("Generating host names for datanodes");
      hosts = new String[numDataNodes];
      for (int i = 0; i < numDataNodes; i++) {
        hosts[i] = "host" + i + ".foo.com";
      }
    }
    
    
    String[] dnArgs = { HdfsConstants.StartupOption.REGULAR.getName() };
    
    for (int i = 0; i < numDataNodes; i++) {
      Configuration dnConf = new Configuration(conf);

      File dir1 = new File(dataDir, "data"+(2*i+1));
      File dir2 = new File(dataDir, "data"+(2*i+2));
      dir1.mkdirs();
      dir2.mkdirs();
      if (!dir1.isDirectory() || !dir2.isDirectory()) { 
        throw new IOException("Mkdirs failed to create directory for DataNode "
                              + i + ": " + dir1 + " or " + dir2);
      }
      dnConf.set("dfs.data.dir", dir1.getPath() + "," + dir2.getPath()); 

      LOG.info("Starting DataNode " + i + " with dfs.data.dir: " 
                         + dnConf.get("dfs.data.dir"));
      
      if (hosts != null) {
        dnConf.set("slave.host.name", hosts[i]);
        LOG.info("Starting DataNode " + i + " with hostname set to: " 
                           + dnConf.get("slave.host.name"));
      }

      if (racks != null) {
        String name = hosts[i];
        LOG.info("Adding node with hostname : " + name + " to rack "+
                           racks[i]);
        StaticMapping.addNodeToRack(name,
                                    racks[i]);
      }
      Configuration newconf = new Configuration(dnConf); // save config
      if (hosts != null) {
        NetUtils.addStaticResolution(hosts[i], "localhost");
      }
      AvatarDataNode dn = AvatarDataNode.instantiateDataNode(dnArgs, dnConf);
      //since the HDFS does things based on IP:port, we need to add the mapping
      //for IP:port to rackId
      
      String ipAddr = dn.getSelfAddr().getAddress().getHostAddress();
      if (racks != null) {
        int port = dn.getSelfAddr().getPort();
        System.out.println("Adding node with IP:port : " + ipAddr + ":" + port+
                            " to rack " + racks[i]);
        StaticMapping.addNodeToRack(ipAddr + ":" + port,
                                  racks[i]);
      }
      dn.runDatanodeDaemon();
      dataNodes.add(new DataNodeProperties(dn, newconf, dnArgs));

    }

  }

  public void waitAvatarNodesActive() {
    for (int nnIndex = 0; nnIndex < this.nameNodes.length; nnIndex++) {
      waitAvatarNodesActive(nnIndex);
    }
  }

  public void waitAvatarNodesActive(int nnIndex) {
    NameNodeInfo nni = this.nameNodes[nnIndex];
    for (AvatarInfo avatar: nni.avatars) {
      while (avatar.avatar.getNameNodeDNAddress() == null) {
        try {
          LOG.info("waiting for avatar");
          Thread.sleep(200);
        } catch (InterruptedException ignore) {
          // do nothing
        }
      }
    }
  }

  /* Wait for datanodes to become active for all namespaces. */
  public void waitDataNodesActive() throws IOException {
    for (int nnIndex = 0; nnIndex < this.nameNodes.length; nnIndex++) {
      waitDataNodesActive(nnIndex);
    }
  }
  
  /* Wait for datanodes to become active for a specific namespace. */
  public void waitDataNodesActive(int nnIndex) throws IOException {
    DistributedAvatarFileSystem dafs = null;

    int liveDataNodes = 0;
    // make sure all datanodes are alive
    while(liveDataNodes != numDataNodes) {
      try {
        dafs = getFileSystem(nnIndex);
        LOG.info("waiting for data nodes... ");
        Thread.sleep(200);
        LOG.info("waiting for data nodes : live=" + liveDataNodes + ", total=" + numDataNodes);
        liveDataNodes = dafs.getLiveDataNodeStats(false).length;
      } catch (Exception e) {
        LOG.warn("Exception waiting for datanodes : ", e);
      } finally {
        if (dafs != null) {
          dafs.close();
        }
      }
    }
  }
  
  private void checkSingleNameNode() {
    if (nameNodes.length != 1) {
      throw new IllegalArgumentException("It's not a single namenode cluster, use index instead.");
    }
  }

  public AvatarInfo getPrimaryAvatar(int nnIndex) {
    return getAvatarByState(nnIndex, AvatarState.ACTIVE);
  }
  
  public AvatarInfo getStandbyAvatar(int nnIndex) {
    return getAvatarByState(nnIndex, AvatarState.STANDBY);
  }
  
  private AvatarInfo getDeadAvatar(int nnIndex) {
    return getAvatarByState(nnIndex, AvatarState.DEAD);
  }

  private AvatarInfo getAvatarByState(int nnIndex, AvatarState state) {
    for (AvatarInfo avatar: this.nameNodes[nnIndex].avatars) {
      if (avatar.state == state) {
        return avatar;
      }
    }
    return null;
  }

  /**
   * Return true if primary avatar has left safe mode
   */
  private boolean hasLeftSafeMode(int nnIndex) throws IOException {
    AvatarInfo primary = getPrimaryAvatar(nnIndex);

    return (primary != null && !primary.avatar.isInSafeMode() && 
            primary.avatar.getStats()[0] != 0);
  }

  private void waitExitSafeMode() throws IOException {
    for (int nnIndex=0; nnIndex < this.nameNodes.length; nnIndex++) {
      // wait until the primary avatar has left safe mode
      while (!hasLeftSafeMode(nnIndex)) {
        try {
          LOG.info("waiting until the primary avatar has left safe mode");
          Thread.sleep(50);
        } catch (InterruptedException ignore) {
          // do nothing
        }
      }
    }
  }

  public DistributedAvatarFileSystem getFileSystem()
      throws IOException {
    checkSingleNameNode();
    return getFileSystem(0);
  }

  /**
   * Get the DistributedAvatarFileSystem for the namenode at the given index.
   */
  public DistributedAvatarFileSystem getFileSystem(int nnIndex)
      throws IOException {
    FileSystem fs = FileSystem
        .get(this.nameNodes[nnIndex].clientConf);

    if (!(fs instanceof DistributedAvatarFileSystem)) {
      throw new IOException("fs is not avatar fs");
    }

    return (DistributedAvatarFileSystem) fs;
  }

  /**
   * Kill the primary avatar node of the single namenode and clear its
   * ZooKeeper registration.
   */
  public void killPrimary() throws IOException {
    checkSingleNameNode();
    killPrimary(0, true);
  }
  
  public void killPrimary(int nnIndex) throws IOException {
    killPrimary(nnIndex, true);
  }
  
  public void killPrimary(boolean clearZK) throws IOException {
    checkSingleNameNode();
    killPrimary(0, clearZK);
  }

  /**
   * Kill the primary avatar node.
   * @param nnIndex index of the namenode whose primary avatar should be killed
   * @param clearZK if true, clear the primary's ZooKeeper registration
   */
  public void killPrimary(int nnIndex, boolean clearZK) throws IOException {
    AvatarInfo primary = getPrimaryAvatar(nnIndex);
    if (primary != null) {
      if (clearZK) {
        clearZooKeeperNode(nnIndex);
      }

      primary.avatar.shutdown(true);
      
      primary.avatar = null;
      primary.state = AvatarState.DEAD;

      try {
        Thread.sleep(1000);
      } catch (InterruptedException ignore) {
        // do nothing
      }
      
    } else {
      throw new IOException("can't kill primary avatar, already dead");
    }
  }

  public void killStandby() throws IOException {
    checkSingleNameNode();
    killStandby(0);
  }

  /**
   * Kill the standby avatar node.
   */
  public void killStandby(int nnIndex) throws IOException {
    AvatarInfo standby = getStandbyAvatar(nnIndex);
    if (standby != null) {
      standby.avatar.shutdown(true);
      
      standby.avatar = null;
      standby.state = AvatarState.DEAD;

      try {
        Thread.sleep(1000);
      } catch (InterruptedException ignore) {
        // do nothing
      }
      
    } else {
      LOG.info("can't kill standby avatar, already dead");
    }
  }

  public void failOver() throws IOException {
    failOver(false);
  }

  public void failOver(boolean force) throws IOException {
    checkSingleNameNode();
    failOver(0, force);
  }

  /**
   * Make the standby avatar the new primary avatar. Kills the old
   * primary avatar first if necessary.
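   *
   * <p>Typical failover sequence in a test (illustrative sketch):
   * <pre>{@code
   *   cluster.failOver(0);        // kills the primary if needed, promotes the standby
   *   cluster.restartStandby(0);  // bring the killed avatar back as the new standby
   * }</pre>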
   */
  public void failOver(int nnIndex) throws IOException {
    failOver(nnIndex, false);
  }

  public void failOver(int nnIndex, boolean force) throws IOException {
    if (getPrimaryAvatar(nnIndex) != null) {
      LOG.info("killing primary avatar before failover");
      killPrimary(nnIndex);
    }

    AvatarInfo standby = getStandbyAvatar(nnIndex);
    if (standby == null) {
      throw new IOException("no standby avatar running");
    }

    standby.avatar.setAvatar(AvatarConstants.Avatar.ACTIVE, force);
    standby.state = AvatarState.ACTIVE;
    registerZooKeeperNode(standby.nnPort, standby.nnDnPort, standby.httpPort,
        standby.rpcPort, this.nameNodes[nnIndex]);
  }
  
  public void restartStandby() throws IOException {
    checkSingleNameNode();
    restartStandby(0);
  }
  /**
   * Restart a dead avatar node as a standby avatar.
   */
  public void restartStandby(int nnIndex) throws IOException {
    AvatarInfo dead = getDeadAvatar(nnIndex);
    if (getPrimaryAvatar(nnIndex) == null || dead == null) {
      throw new IOException("cannot start standby avatar: " +
                            "primary or dead avatar not found");
      
    }
    LOG.info("restarting " + dead.startupOption + " as standby");
    NameNodeInfo nni = this.nameNodes[nnIndex];
    String[] args; 
    ArrayList<String> argList = new ArrayList<String>();
    argList.add(dead.startupOption);
    argList.add(AvatarConstants.StartupOption.STANDBY.getName());
    argList.add(AvatarConstants.StartupOption.REGULAR.getName());
    if (federation) {
      argList.add(StartupOption.SERVICE.getName());
      argList.add(nni.nameserviceId);
    }
    args = new String[argList.size()];
    argList.toArray(args);
    dead.avatar = AvatarNode.createAvatarNode(args, 
        getServerConf(dead.startupOption, nni));
    dead.state = AvatarState.STANDBY;

    if (dead.avatar == null) {
      throw new IOException("cannot start avatar node");
    }
  }
  
  /**
   * Return the NameNodeInfo for the namenode at the given index.
   */
  public NameNodeInfo getNameNode(int nnIndex) {
    return this.nameNodes[nnIndex];
  }
  
  public ArrayList<DataNodeProperties> getDataNodeProperties() {
    return dataNodes;
  }
  
  /**
   * Gets a list of the started DataNodes.  May be empty.
   */
  public ArrayList<AvatarDataNode> getDataNodes() {
    ArrayList<AvatarDataNode> list = new ArrayList<AvatarDataNode>();
    for (int i = 0; i < dataNodes.size(); i++) {
      AvatarDataNode node = dataNodes.get(i).datanode;
      list.add(node);
    }
    return list;
  }
  
  /**
   * Return the number of namenodes.
   */
  public int getNumNameNodes() {
    return this.nameNodes.length;
  }
  
  /**
   * Add a namenode to the cluster and start it. The configuration of the
   * datanodes in the cluster is refreshed so they register with the new namenode.
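   *
   * <p>Illustrative sketch (assumes {@code conf} is the same Configuration the
   * cluster was started with):
   * <pre>{@code
   *   NameNodeInfo newNameNode = cluster.addNameNode(conf);
   * }</pre>
   *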
   * @return newly started namenode
   */
  public NameNodeInfo addNameNode(Configuration conf)
      throws IOException {
    if(!federation) {
      throw new IOException("cannot add namenode to non-federated cluster");
    }
    int nnIndex = nameNodes.length;
    int numNameNodes = nameNodes.length + 1;
    NameNodeInfo[] newlist = new NameNodeInfo[numNameNodes];
    System.arraycopy(nameNodes, 0, newlist, 0, nameNodes.length);
    nameNodes = newlist;
    nameNodes[nnIndex] = new NameNodeInfo(nnIndex);
    
    NameNodeInfo nni = nameNodes[nnIndex];
    nni.createAvatarDirs();
    String nameserviceId = NAMESERVICE_ID_PREFIX + getNSId();
    String nameserviceIds = conf.get(FSConstants.DFS_FEDERATION_NAMESERVICES);
    nameserviceIds += "," + nameserviceId;
    nni.initGeneralConf(conf, nameserviceId);
    conf.set(FSConstants.DFS_FEDERATION_NAMESERVICES, nameserviceIds);
    
    nni.updateAvatarConf(conf);
    startAvatarNode(nni, null);

    // Refresh datanodes with the newly started namenode
    for (DataNodeProperties dn : dataNodes) {
      DataNode datanode = dn.datanode;
      datanode.refreshNamenodes(conf);
    }
    // Wait for new namenode to get registrations from all the datanodes
    waitDataNodesActive(nnIndex);
    return nni;
  }
  
  private void updateAvatarConfWithServiceId(Configuration dstConf, Configuration srcConf,
      String nameserviceId) {
    for (String key: AvatarNode.AVATARSERVICE_SPECIFIC_KEYS) {
      String federationKey = DFSUtil.getNameServiceIdKey(
          key, nameserviceId);
      String value = srcConf.get(federationKey);
      if (value != null) {
        dstConf.set(federationKey, value);
      }
    }
    for (String key: NameNode.NAMESERVICE_SPECIFIC_KEYS) {
      String federationKey = DFSUtil.getNameServiceIdKey(
          key, nameserviceId);
      String value = srcConf.get(federationKey);
      if (value != null) {
        dstConf.set(federationKey, value);
      }
    }
  }
  
  /**
   * Add another cluster to the current cluster and start it. The configuration
   * of the datanodes in the cluster is refreshed to register with the new
   * namenodes.
   */
  public void addCluster(MiniAvatarCluster cluster, boolean format)
      throws IOException, InterruptedException{
    if(!federation || !cluster.federation) {
      throw new IOException("Cannot handle non-federated cluster");
    }
    if (cluster.dataNodes.size() > this.dataNodes.size()) {
      throw new IOException("Cannot merge: new cluster has more datanodes the old one.");
    }
    this.shutDown();
    cluster.shutDown();
    
    int nnIndex = nameNodes.length;
    int numNameNodes = nameNodes.length + cluster.nameNodes.length;
    NameNodeInfo[] newlist = new NameNodeInfo[numNameNodes];
    System.arraycopy(nameNodes, 0, newlist, 0, nameNodes.length);
    System.arraycopy(cluster.nameNodes, 0, newlist, nameNodes.length, 
        cluster.nameNodes.length);
    nameNodes = newlist;
    String newNameserviceIds = cluster.conf.get(FSConstants.DFS_FEDERATION_NAMESERVICES);
    String nameserviceIds = conf.get(FSConstants.DFS_FEDERATION_NAMESERVICES);
    nameserviceIds += "," + newNameserviceIds;
    this.format = format;
    conf.set(FSConstants.DFS_FEDERATION_NAMESERVICES, nameserviceIds);

    int i;
    for (i = 0; i < nameNodes.length; i++) {
      NameNodeInfo nni = nameNodes[i];
      String nameserviceId = nni.nameserviceId;
      nni.initGeneralConf(nni.conf, nni.nameserviceId);
      nni.updateAvatarConf(nni.conf);
      for (int dnIndex = 0; dnIndex < dataNodes.size(); dnIndex++) {
        Configuration dstConf = dataNodes.get(dnIndex).conf;
        if (i >= nnIndex) {
          String dataStr = cluster.dataNodes.get(dnIndex).conf.get("dfs.data.dir");
          dstConf.set("dfs.merge.data.dir." + nameserviceId, dataStr);
        }
        updateAvatarConfWithServiceId(dstConf, nni.conf, nameserviceId);
      }
    }

    for (DataNodeProperties dn : dataNodes) {
      dn.conf.set(FSConstants.DFS_FEDERATION_NAMESERVICES, nameserviceIds);
      dn.datanode = AvatarDataNode.instantiateDataNode(dn.dnArgs, 
          new Configuration(dn.conf));
      dn.datanode.runDatanodeDaemon();
    }

    for (i = 0; i < nameNodes.length; i++) {
      NameNodeInfo nni = nameNodes[i];

      if (i < nnIndex) {
        startAvatarNode(nni, StartupOption.UPGRADE);
      } else {
        startAvatarNode(nni, null);
      }
    }
    waitAvatarNodesActive();
    waitDataNodesActive();
    waitExitSafeMode();
  }

  public synchronized boolean restartDataNodes() throws IOException,
      InterruptedException {
    return restartDataNodes(true);
  }
  
  /*
   * Restart all datanodes
   */
  public synchronized boolean restartDataNodes(boolean waitActive)
      throws IOException, InterruptedException {
    shutDownDataNodes();
    int i = 0;
    for (DataNodeProperties dn : dataNodes) {
      i++;
      LOG.info("Restart Datanode " + i);
      // Use the same port since dn is identified by host:port.
      int port = dn.datanode.getSelfAddr().getPort();
      dn.conf.set("dfs.datanode.address", "localhost:" + port);
      dn.datanode = AvatarDataNode.instantiateDataNode(dn.dnArgs, 
          new Configuration(dn.conf));
      dn.datanode.runDatanodeDaemon();
      if (waitActive) {
        waitDataNodeInitialized(dn.datanode);
      }
    }
    if (waitActive) {
      waitDataNodesActive();
    }
    return true;
  }
  
  /**
   * Wait until the datanode is initialized, or throw an IOException.
   * @param dn the AvatarDataNode to wait for
   * @throws IOException when some ServicePair threads are dead.
   */
  public synchronized void waitDataNodeInitialized(AvatarDataNode dn) throws IOException {
    if (dn == null) {
      return ;
    }
    boolean initialized = false;
    while (!initialized) {
      initialized = true;
      for (int i = 0; i<nameNodes.length; i++) { 
        InetSocketAddress nameNodeAddr = new InetSocketAddress("localhost",
            getNameNode(i).avatars.get(0).nnDnPort);
        if (!dn.initialized(nameNodeAddr)) {
          initialized = false;
          break;
        }
      }
      try {
        Thread.sleep(100);
      } catch (Exception e) {
      }
    }
  }
  
  public int getNamespaceId(int index) {
    return this.nameNodes[index].avatars.get(0).avatar.getNamespaceID();
  }
  
  static public int getNSId() {
    return MiniAvatarCluster.currNSId++;
  }
}