/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.balancer;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.EOFException;
import java.io.OutputStream;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.text.DateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.Formatter;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;

import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DFSClient;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.protocol.AlreadyBeingCreatedException;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.ClientProtocol;
import org.apache.hadoop.hdfs.protocol.DataTransferProtocol;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.hdfs.protocol.LocatedBlocksWithMetaInfo;
import org.apache.hadoop.hdfs.protocol.FSConstants.DatanodeReportType;
import org.apache.hadoop.hdfs.protocol.ReplaceBlockHeader;
import org.apache.hadoop.hdfs.server.common.HdfsConstants;
import org.apache.hadoop.hdfs.server.common.Util;
import org.apache.hadoop.hdfs.server.namenode.BlockPlacementPolicy;
import org.apache.hadoop.hdfs.server.namenode.BlockPlacementPolicyDefault;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol;
import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations.BlockWithLocations;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.retry.RetryPolicies;
import org.apache.hadoop.io.retry.RetryPolicy;
import org.apache.hadoop.io.retry.RetryProxy;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.net.NetworkTopology;
import org.apache.hadoop.security.UnixUserGroupInformation;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/** <p>The balancer is a tool that balances disk space usage on an HDFS cluster
 * when some datanodes become full or when new empty nodes join the cluster.
 * The tool is deployed as an application program that can be run by the
 * cluster administrator on a live HDFS cluster while applications
 * adding and deleting files.
 *
 * <p>SYNOPSIS
 * <pre>
 * To start:
 *      bin/start-balancer.sh [-threshold <threshold>]
 *      Example: bin/ start-balancer.sh
 *                     start the balancer with a default threshold of 10%
 *               bin/ start-balancer.sh -threshold 5
 *                     start the balancer with a threshold of 5%
 * To stop:
 *      bin/ stop-balancer.sh
 * </pre>
 *
 * <p>DESCRIPTION
 * <p>The threshold parameter is a fraction in the range of (0%, 100%) with a
 * default value of 10%. The threshold sets a target for whether the cluster
 * is balanced. A cluster is balanced if for each datanode, the utilization
 * of the node (ratio of used space at the node to total capacity of the node)
 * differs from the utilization of the (ratio of used space in the cluster
 * to total capacity of the cluster) by no more than the threshold value.
 * The smaller the threshold, the more balanced a cluster will become.
 * It takes more time to run the balancer for small threshold values.
 * Also for a very small threshold the cluster may not be able to reach the
 * balanced state when applications write and delete files concurrently.
 *
 * <p>The tool moves blocks from highly utilized datanodes to poorly
 * utilized datanodes iteratively. In each iteration a datanode moves or
 * receives no more than the lesser of 10G bytes or the threshold fraction
 * of its capacity. Each iteration runs no more than 20 minutes.
 * At the end of each iteration, the balancer obtains updated datanodes
 * information from the namenode.
 *
 * <p>A system property that limits the balancer's use of bandwidth is
 * defined in the default configuration file:
 * <pre>
 * <property>
 *   <name>dfs.balance.bandwidthPerSec</name>
 *   <value>1048576</value>
 * <description>  Specifies the maximum bandwidth that each datanode
 * can utilize for the balancing purpose in term of the number of bytes
 * per second. </description>
 * </property>
 * </pre>
 *
 * <p>This property determines the maximum speed at which a block will be
 * moved from one datanode to another. The default value is 1MB/s. The higher
 * the bandwidth, the faster a cluster can reach the balanced state,
 * but with greater competition with application processes. If an
 * administrator changes the value of this property in the configuration
 * file, the change is observed when HDFS is next restarted.
 *
 * <p>MONITERING BALANCER PROGRESS
 * <p>After the balancer is started, an output file name where the balancer
 * progress will be recorded is printed on the screen.  The administrator
 * can monitor the running of the balancer by reading the output file.
 * The output shows the balancer's status iteration by iteration. In each
 * iteration it prints the starting time, the iteration number, the total
 * number of bytes that have been moved in the previous iterations,
 * the total number of bytes that are left to move in order for the cluster
 * to be balanced, and the number of bytes that are being moved in this
 * iteration. Normally "Bytes Already Moved" is increasing while "Bytes Left
 * To Move" is decreasing.
 *
 * <p>Running multiple instances of the balancer in an HDFS cluster is
 * prohibited by the tool.
 *
 * <p>The balancer automatically exits when any of the following five
 * conditions is satisfied:
 * <ol>
 * <li>The cluster is balanced;
 * <li>No block can be moved;
 * <li>No block has been moved for five consecutive iterations;
 * <li>An IOException occurs while communicating with the namenode;
 * <li>Another balancer is running.
 * </ol>
 *
 * <p>Upon exit, a balancer returns an exit code and prints one of the
 * following messages to the output file in corresponding to the above exit
 * reasons:
 * <ol>
 * <li>The cluster is balanced. Exiting
 * <li>No block can be moved. Exiting...
 * <li>No block has been moved for 3 iterations. Exiting...
 * <li>Received an IO exception: failure reason. Exiting...
 * <li>Another balancer is running. Exiting...
 * </ol>
 *
 * <p>The administrator can interrupt the execution of the balancer at any
 * time by running the command "stop-balancer.sh" on the machine where the
 * balancer is running.
 */

public class Balancer implements Tool {
  private static final Log LOG =
      LogFactory.getLog(Balancer.class.getName());
  final private static long MAX_BLOCKS_SIZE_TO_FETCH = 2*1024*1024*1024L; //2GB

  /** The maximum number of concurrent blocks moves for
   * balancing purpose at a datanode
   */
  public final static int MAX_NUM_CONCURRENT_MOVES = 5;
  public static int maxConcurrentMoves = MAX_NUM_CONCURRENT_MOVES;
  private static long maxIterationTime = 20*60*1000L; //20 mins
  private Configuration conf;
  private static double threshold = 10D;
  
  private InetSocketAddress namenodeAddress;
  private NamenodeProtocol namenode;
  private ClientProtocol client;
  private FileSystem fs;
  private OutputStream out = null;

  private final static Random rnd = new Random();
  private int namespaceId;

  // all data node lists
  private Collection<Source> overUtilizedDatanodes
                               = new LinkedList<Source>();
  private Collection<Source> aboveAvgUtilizedDatanodes
                               = new LinkedList<Source>();
  private Collection<BalancerDatanode> belowAvgUtilizedDatanodes
                               = new LinkedList<BalancerDatanode>();
  private Collection<BalancerDatanode> underUtilizedDatanodes
                               = new LinkedList<BalancerDatanode>();

  private Collection<Source> sources
                               = new HashSet<Source>();
  private Collection<BalancerDatanode> targets
                               = new HashSet<BalancerDatanode>();

  private Map<Block, BalancerBlock> globalBlockList
                 = new HashMap<Block, BalancerBlock>();
  private MovedBlocks movedBlocks = new MovedBlocks();
  private Map<String, BalancerDatanode> datanodes
                 = new HashMap<String, BalancerDatanode>();

  private NetworkTopology cluster = new NetworkTopology();

  private double avgRemaining = 0.0D;
  final static private int MOVER_THREAD_POOL_SIZE = 1000;
  private static int moveThreads = MOVER_THREAD_POOL_SIZE;
  private ExecutorService moverExecutor = null;
  final static private int DISPATCHER_THREAD_POOL_SIZE = 200;
  private ExecutorService dispatcherExecutor = null;

  /* This class keeps track of a scheduled block move */
  private class PendingBlockMove {
    private BalancerBlock block;
    private Source source;
    private BalancerDatanode proxySource;
    private BalancerDatanode target;
    private Socket sock;

    /** constructor */
    private PendingBlockMove() {
    }

    /* choose a block & a proxy source for this pendingMove
     * whose source & target have already been chosen.
     *
     * Return true if a block and its proxy are chosen; false otherwise
     */
    private boolean chooseBlockAndProxy() {
      // iterate all source's blocks until find a good one
      for (Iterator<BalancerBlock> blocks=
        source.getBlockIterator(); blocks.hasNext();) {
        if (markMovedIfGoodBlock(blocks.next())) {
          blocks.remove();
          return true;
        }
      }
      return false;
    }

    /* Return true if the given block is good for the tentative move;
     * If it is good, add it to the moved list to marked as "Moved".
     * A block is good if
     * 1. it is a good candidate; see isGoodBlockCandidate
     * 2. can find a proxy source that's not busy for this move
     */
    private boolean markMovedIfGoodBlock(BalancerBlock block) {
      synchronized(block) {
        synchronized(movedBlocks) {
          if (isGoodBlockCandidate(source, target, block)) {
            this.block = block;
            if ( chooseProxySource() ) {
              movedBlocks.add(block);
              if (LOG.isDebugEnabled()) {
                LOG.debug("Decided to move block "+ block.getBlockId()
                    +" with a length of "+StringUtils.byteDesc(block.getNumBytes())
                    + " bytes from " + source.getName()
                    + " to " + target.getName()
                    + " using proxy source " + proxySource.getName() );
              }
              return true;
            }
          }
        }
      }
      return false;
    }

    /* Now we find out source, target, and block, we need to find a proxy
     *
     * @return true if a proxy is found; otherwise false
     */
    private boolean chooseProxySource() {
      // check if there is replica which is on the same rack with the target
      for (BalancerDatanode loc : block.getLocations()) {
        if (cluster.isOnSameRack(loc.getDatanode(), target.getDatanode())) {
          if (loc.addPendingBlock(this)) {
            proxySource = loc;
            return true;
          }
        }
      }
      // find out a non-busy replica
      for (BalancerDatanode loc : block.getLocations()) {
        if (loc.addPendingBlock(this)) {
          proxySource = loc;
          return true;
        }
      }
      return false;
    }

    /* Dispatch the block move task to the proxy source & wait for the response
     */
    private void dispatch() {
      sock = new Socket();
      DataOutputStream out = null;
      DataInputStream in = null;
      try {
        sock.connect(NetUtils.createSocketAddr(
            target.datanode.getName()), HdfsConstants.READ_TIMEOUT);
        sock.setKeepAlive(true);
        out = new DataOutputStream( new BufferedOutputStream(
            sock.getOutputStream(), FSConstants.BUFFER_SIZE));
        sendRequest(out);
        in = new DataInputStream( new BufferedInputStream(
            sock.getInputStream(), FSConstants.BUFFER_SIZE));
        receiveResponse(in);
        bytesMoved.inc(block.getNumBytes());
        LOG.info( "Moving block " + block.getBlock().getBlockId() +
              " from "+ source.getName() + " to " +
              target.getName() + " through " +
              proxySource.getName() +
              " is succeeded." );
      } catch (IOException e) {
        LOG.warn("Error moving block "+block.getBlockId()+
            " from " + source.getName() + " to " +
            target.getName() + " through " +
            proxySource.getName() +
            ": "+e.getMessage());
        if (e instanceof EOFException) {
          LOG.warn("Moving block " + block.getBlockId() +
            " was cancelled because the time exceeded the limit");
        }
      } finally {
        IOUtils.closeStream(out);
        IOUtils.closeStream(in);
        IOUtils.closeSocket(sock);

        proxySource.removePendingBlock(this);
        synchronized(target) {
          target.removePendingBlock(this);
        }

        synchronized (this ) {
          reset();
        }
        synchronized (Balancer.this) {
          Balancer.this.notifyAll();
        }
      }
    }

    /* Send a block replace request to the output stream*/
    private void sendRequest(DataOutputStream out) throws IOException {
      /* Write the header */
      ReplaceBlockHeader replaceBlockHeader = new ReplaceBlockHeader(
          DataTransferProtocol.DATA_TRANSFER_VERSION, namespaceId,
          block.getBlock().getBlockId(), block.getBlock().getGenerationStamp(),
          source.getStorageID(), proxySource.getDatanode());
      replaceBlockHeader.writeVersionAndOpCode(out);
      replaceBlockHeader.write(out);
      out.flush();
    }

    /* Receive a block copy response from the input stream */
    private void receiveResponse(DataInputStream in) throws IOException {
      short status = in.readShort();
      if (status != DataTransferProtocol.OP_STATUS_SUCCESS) {
        throw new IOException("block move is failed");
      }
    }

    /* reset the object */
    private void reset() {
      block = null;
      source = null;
      proxySource = null;
      target = null;
    }

    public void closeSocket() {
      try {
        this.sock.shutdownInput();
      } catch (IOException ex) {
        LOG.error("Error shutting down the socket to cancel block transfer");
      }
    }

    /* start a thread to dispatch the block move */
    private void scheduleBlockMove() {
      moverExecutor.execute(new Runnable() {
        public void run() {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Starting moving "+ block.getBlockId() +
                " from " + proxySource.getName() + " to " + target.getName());
          }
          dispatch();
        }
      });
    }
  }

  /* A class for keeping track of blocks in the Balancer */
  static private class BalancerBlock {
    private Block block; // the block
    private List<BalancerDatanode> locations
            = new ArrayList<BalancerDatanode>(3); // its locations

    /* Constructor */
    private BalancerBlock(Block block) {
      this.block = block;
    }

    /* clean block locations */
    private synchronized void clearLocations() {
      locations.clear();
    }

    /* add a location */
    private synchronized void addLocation(BalancerDatanode datanode) {
      if (!locations.contains(datanode)) {
        locations.add(datanode);
      }
    }

    /* Return if the block is located on <code>datanode</code> */
    private synchronized boolean isLocatedOnDatanode(
        BalancerDatanode datanode) {
      return locations.contains(datanode);
    }

    /* Return its locations */
    private synchronized List<BalancerDatanode> getLocations() {
      return locations;
    }

    /* Return the block */
    private Block getBlock() {
      return block;
    }

    /* Return the block id */
    private long getBlockId() {
      return block.getBlockId();
    }

    /* Return the length of the block */
    private long getNumBytes() {
      return block.getNumBytes();
    }
  }

  /* The class represents a desired move of bytes between two nodes
   * and the target.
   * An object of this class is stored in a source node.
   */
  static private class NodeTask {
    private BalancerDatanode datanode; //target node
    private long size;  //bytes scheduled to move

    /* constructor */
    private NodeTask(BalancerDatanode datanode, long size) {
      this.datanode = datanode;
      this.size = size;
    }

    /* Get the node */
    private BalancerDatanode getDatanode() {
      return datanode;
    }

    /* Get the number of bytes that need to be moved */
    private long getSize() {
      return size;
    }
  }
  
  /* Return the utilization of a datanode */
  static private double getRemaining(DatanodeInfo datanode) {
    return ((double)datanode.getRemaining())/datanode.getCapacity()*100;
  }
  
  /* A class that keeps track of a datanode in Balancer */
  private static class BalancerDatanode implements Writable {
    final private static long MAX_SIZE_TO_MOVE = 10*1024*1024*1024L; //10GB
    final DatanodeInfo datanode;
    final double remaining;
    final long maxSizeToMove;
    protected long scheduledSize = 0L;
    //  blocks being moved but not confirmed yet
    private List<PendingBlockMove> pendingBlocks =
      new ArrayList<PendingBlockMove>(maxConcurrentMoves);

    /* Constructor
     * Depending on avgutil & threshold, calculate maximum bytes to move
     */
    private BalancerDatanode(
        DatanodeInfo node, double avgRemaining, double threshold) {
      datanode = node;
      remaining = Balancer.getRemaining(node);
      long sizeToMove; 
      
      if (remaining + threshold <= avgRemaining 
          || remaining - threshold  >= avgRemaining) {
        sizeToMove = (long)(threshold*datanode.getCapacity()/100);
      } else {
        sizeToMove =
          (long)(Math.abs(avgRemaining-remaining)*datanode.getCapacity()/100);
      }
      if (remaining > avgRemaining) {
        sizeToMove = Math.min(datanode.getRemaining(), sizeToMove);
      }
      this.maxSizeToMove = Math.min(MAX_SIZE_TO_MOVE, sizeToMove);
    }

    /** Get the datanode */
    protected DatanodeInfo getDatanode() {
      return datanode;
    }

    /** Get the name of the datanode */
    protected String getName() {
      return datanode.getName();
    }

    /* Get the storage id of the datanode */
    protected String getStorageID() {
      return datanode.getStorageID();
    }

    /** Decide if still need to move more bytes */
    protected boolean isMoveQuotaFull() {
      return scheduledSize<maxSizeToMove;
    }

    /** Return the total number of bytes that need to be moved */
    protected long availableSizeToMove() {
      return maxSizeToMove-scheduledSize;
    }

    /* increment scheduled size */
    protected void incScheduledSize(long size) {
      scheduledSize += size;
    }

    /* Check if the node can schedule more blocks to move */
    synchronized private boolean isPendingQNotFull() {
      if ( pendingBlocks.size() < maxConcurrentMoves ) {
        return true;
      }
      return false;
    }

    /* Check if all the dispatched moves are done */
    synchronized private boolean isPendingQEmpty() {
      return pendingBlocks.isEmpty();
    }

    synchronized private void killPending() {
      for (PendingBlockMove pendingBlock : pendingBlocks) {
        pendingBlock.closeSocket();
      }
    }

    /* Add a scheduled block move to the node */
    private synchronized boolean addPendingBlock(
        PendingBlockMove pendingBlock) {
      if (isPendingQNotFull()) {
        return pendingBlocks.add(pendingBlock);
      }
      return false;
    }

    /* Remove a scheduled block move from the node */
    private synchronized boolean  removePendingBlock(
        PendingBlockMove pendingBlock) {
      return pendingBlocks.remove(pendingBlock);
    }

    /** The following two methods support the Writable interface */
    /** Deserialize */
    public void readFields(DataInput in) throws IOException {
      datanode.readFields(in);
    }

    /** Serialize */
    public void write(DataOutput out) throws IOException {
      datanode.write(out);
    }
  }

  /** A node that can be the sources of a block move */
  private class Source extends BalancerDatanode {
    /* A thread that initiates a block move
     * and waits for block move to complete */
    private class BlockMoveDispatcher implements Runnable {
      private long startTime;
      BlockMoveDispatcher(long time) {
        this.startTime = time;
      }
      public void run() {
        dispatchBlocks(startTime);
      }
    }

    private ArrayList<NodeTask> nodeTasks = new ArrayList<NodeTask>(2);
    private long blocksToReceive = 0L;
    /* source blocks point to balancerBlocks in the global list because
     * we want to keep one copy of a block in balancer and be aware that
     * the locations are changing over time.
     */
    private List<BalancerBlock> srcBlockList
            = new ArrayList<BalancerBlock>();

    /* constructor */
    private Source(DatanodeInfo node, double avgRemaining, double threshold) {
      super(node, avgRemaining, threshold);
    }

    /** Add a node task */
    private void addNodeTask(NodeTask task) {
      assert (task.datanode != this) :
        "Source and target are the same " + datanode.getName();
      incScheduledSize(task.getSize());
      nodeTasks.add(task);
    }

    /* Return an iterator to this source's blocks */
    private Iterator<BalancerBlock> getBlockIterator() {
      return srcBlockList.iterator();
    }

    /* fetch new blocks of this source from namenode and
     * update this source's block list & the global block list
     * Return the total size of the received blocks in the number of bytes.
     */
    private long getBlockList() throws IOException {
      BlockWithLocations[] newBlocks = namenode.getBlocks(datanode,
        (long)Math.min(MAX_BLOCKS_SIZE_TO_FETCH, blocksToReceive)).getBlocks();
      long bytesReceived = 0;
      for (BlockWithLocations blk : newBlocks) {
        bytesReceived += blk.getBlock().getNumBytes();
        BalancerBlock block;
        synchronized(globalBlockList) {
          block = globalBlockList.get(blk.getBlock());
          if (block==null) {
            block = new BalancerBlock(blk.getBlock());
            globalBlockList.put(blk.getBlock(), block);
          } else {
            block.clearLocations();
          }

          synchronized (block) {
            // update locations
            for ( String location : blk.getDatanodes() ) {
              BalancerDatanode datanode = datanodes.get(location);
              if (datanode != null) { // not an unknown datanode
                block.addLocation(datanode);
              }
            }
          }
          if (!srcBlockList.contains(block) && isGoodBlockCandidate(block)) {
            // filter bad candidates
            srcBlockList.add(block);
          }
        }
      }
      return bytesReceived;
    }

    /* Decide if the given block is a good candidate to move or not */
    private boolean isGoodBlockCandidate(BalancerBlock block) {
      for (NodeTask nodeTask : nodeTasks) {
        if (Balancer.this.isGoodBlockCandidate(this, nodeTask.datanode, block)) {
          return true;
        }
      }
      return false;
    }

    /* Return a block that's good for the source thread to dispatch immediately
     * The block's source, target, and proxy source are determined too.
     * When choosing proxy and target, source & target throttling
     * has been considered. They are chosen only when they have the capacity
     * to support this block move.
     * The block should be dispatched immediately after this method is returned.
     */
    private PendingBlockMove chooseNextBlockToMove() {
      for ( Iterator<NodeTask> tasks=nodeTasks.iterator(); tasks.hasNext(); ) {
        NodeTask task = tasks.next();
        BalancerDatanode target = task.getDatanode();
        PendingBlockMove pendingBlock = new PendingBlockMove();
        if ( target.addPendingBlock(pendingBlock) ) {
          // target is not busy, so do a tentative block allocation
          pendingBlock.source = this;
          pendingBlock.target = target;
          if ( pendingBlock.chooseBlockAndProxy() ) {
            long blockSize = pendingBlock.block.getNumBytes();
            scheduledSize -= blockSize;
            task.size -= blockSize;
            if (task.size == 0) {
              tasks.remove();
            }
            return pendingBlock;
          } else {
            // cancel the tentative move
            target.removePendingBlock(pendingBlock);
          }
        }
      }
      return null;
    }

    /* iterate all source's blocks to remove moved ones */
    private void filterMovedBlocks() {
      for (Iterator<BalancerBlock> blocks=getBlockIterator();
            blocks.hasNext();) {
        if (movedBlocks.contains(blocks.next())) {
          blocks.remove();
        }
      }
    }

    private static final int SOURCE_BLOCK_LIST_MIN_SIZE=5;
    /* Return if should fetch more blocks from namenode */
    private boolean shouldFetchMoreBlocks() {
      return srcBlockList.size()<SOURCE_BLOCK_LIST_MIN_SIZE &&
                 blocksToReceive>0;
    }

    /* This method iteratively does the following:
     * it first selects a block to move,
     * then sends a request to the proxy source to start the block move
     * when the source's block list falls below a threshold, it asks
     * the namenode for more blocks.
     * It terminates when it has dispatch enough block move tasks or
     * it has received enough blocks from the namenode, or
     * the elapsed time of the iteration has exceeded the max time limit.
     */
    private static final long MAX_ITERATION_TIME = 20*60*1000L; //20 mins
    private void dispatchBlocks(long startTime) {
      this.blocksToReceive = 2*scheduledSize;
      boolean isTimeUp = false;
      while(!isTimeUp && scheduledSize>0 &&
          (!srcBlockList.isEmpty() || blocksToReceive>0)) {

        // check if time is up or not
        // Even if not sent everything the iteration is over
        if (Util.now()-startTime > maxIterationTime) {
          isTimeUp = true;
          continue;
        }

        PendingBlockMove pendingBlock = chooseNextBlockToMove();
        if (pendingBlock != null) {
          // move the block
          pendingBlock.scheduleBlockMove();
          continue;
        }

        /* Since we can not schedule any block to move,
         * filter any moved blocks from the source block list and
         * check if we should fetch more blocks from the namenode
         */
        filterMovedBlocks(); // filter already moved blocks

        if (shouldFetchMoreBlocks()) {
          // fetch new blocks
          try {
            blocksToReceive -= getBlockList();
            continue;
          } catch (IOException e) {
            LOG.warn(StringUtils.stringifyException(e));
            return;
          }
        }

        /* Now we can not schedule any block to move and there are
         * no new blocks added to the source block list, so we wait.
         */
        try {
          synchronized(Balancer.this) {
            Balancer.this.wait(1000);  // wait for targets/sources to be idle
          }
        } catch (InterruptedException ignored) {
        }
      }
    }
  }

  /*
   * Check that this Balancer is compatible with the Block Placement Policy
   * used by the Namenode.
   *
   * In case it is not compatible, throw IllegalArgumentException
   *
   * */
  private void checkReplicationPolicyCompatibility(Configuration conf) {
    if (!(BlockPlacementPolicy.getInstance(conf, null, null, null, null, null)
        instanceof BlockPlacementPolicyDefault)) {
      throw new IllegalArgumentException("Configuration lacks BlockPlacementPolicyDefault");
    }
  }

  /** Default constructor */
  Balancer() {
  }

  /** Construct a balancer from the given configuration */
  Balancer(Configuration conf) {
    setConf(conf);
    checkReplicationPolicyCompatibility(conf);
  }

  /** Construct a balancer from the given configuration and threshold */
  Balancer(Configuration conf, double threshold) {
    setConf(conf);
    checkReplicationPolicyCompatibility(conf);
    Balancer.threshold = threshold;
  }

  /**
   * Run a balancer
   * @param args
   */
  public static void main(String[] args) {
    try {
      System.exit( ToolRunner.run(null, new Balancer(), args) );
    } catch (Throwable e) {
      LOG.error(StringUtils.stringifyException(e));
      System.exit(-1);
    }

  }

  private static void printUsage(Options opts) {
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp("balancer", opts);
  }

  /* parse argument to get the threshold */
  private double checkThreshold(int value) {
    double threshold = (double) value;
    try {
      if (threshold < 0 || threshold > 100) {
        throw new NumberFormatException();
      }
      LOG.info("Using a threshold of " + threshold);
    } catch (NumberFormatException e) {
      System.err.println("Expect a double parameter in the range of [0, 100]: "
          + value);
      throw e;
    }
    return threshold;
  }

  /* Initialize balancer. It sets the value of the threshold, and
   * builds the communication proxies to
   * namenode as a client and a secondary namenode and retry proxies
   * when connection fails.
   */
  private void init(InetSocketAddress namenodeAddress) throws IOException {
    this.namenodeAddress = namenodeAddress;
    this.namenode = createNamenode(namenodeAddress, conf);
    this.client = DFSClient.createNamenode(namenodeAddress, conf);
    this.fs = FileSystem.get(NameNode.getUri(namenodeAddress), conf);
    this.moverExecutor = Executors.newFixedThreadPool(moveThreads);
    int dispatchThreads = (int)Math.max(1, moveThreads/maxConcurrentMoves);
    this.dispatcherExecutor = Executors.newFixedThreadPool(dispatchThreads);
    /* Check if there is another balancer running.
     * Exit if there is another one running.
     */
    this.out = checkAndMarkRunningBalancer();
    if (out == null) {
      throw new IOException("Another balancer is running");
    }
    // get namespace id
    LocatedBlocksWithMetaInfo locations = client.openAndFetchMetaInfo(BALANCER_ID_PATH.toString(), 0L, 1L);
    this.namespaceId = locations.getNamespaceID();
  }

  /* Build a NamenodeProtocol connection to the namenode and
   * set up the retry policy */
  private static NamenodeProtocol createNamenode(InetSocketAddress nameNodeAddr, Configuration conf)
    throws IOException {
    RetryPolicy timeoutPolicy = RetryPolicies.exponentialBackoffRetry(
        5, 200, TimeUnit.MILLISECONDS);
    Map<Class<? extends Exception>,RetryPolicy> exceptionToPolicyMap =
      new HashMap<Class<? extends Exception>, RetryPolicy>();
    RetryPolicy methodPolicy = RetryPolicies.retryByException(
        timeoutPolicy, exceptionToPolicyMap);
    Map<String,RetryPolicy> methodNameToPolicyMap =
        new HashMap<String, RetryPolicy>();
    methodNameToPolicyMap.put("getBlocks", methodPolicy);

    UserGroupInformation ugi;
    try {
      ugi = UnixUserGroupInformation.login(conf);
    } catch (javax.security.auth.login.LoginException e) {
      throw new IOException(StringUtils.stringifyException(e));
    }

    return (NamenodeProtocol) RetryProxy.create(
        NamenodeProtocol.class,
        RPC.getProxy(NamenodeProtocol.class,
            NamenodeProtocol.versionID,
            nameNodeAddr,
            ugi,
            conf,
            NetUtils.getDefaultSocketFactory(conf)),
        methodNameToPolicyMap);
  }

  /* Shuffle datanode array */
  static private void shuffleArray(DatanodeInfo[] datanodes) {
    for (int i=datanodes.length; i>1; i--) {
      int randomIndex = rnd.nextInt(i);
      DatanodeInfo tmp = datanodes[randomIndex];
      datanodes[randomIndex] = datanodes[i-1];
      datanodes[i-1] = tmp;
    }
  }
  
  /* get all live datanodes of a cluster and their disk usage
   * decide the number of bytes need to be moved
   */
  private long initNodes() throws IOException {
    return initNodes(client.getDatanodeReport(DatanodeReportType.LIVE));
  }

  /* Given a data node set, build a network topology and decide
   * over-utilized datanodes, above average utilized datanodes,
   * below average utilized datanodes, and underutilized datanodes.
   * The input data node set is shuffled before the datanodes
   * are put into the over-utilized datanodes, above average utilized
   * datanodes, below average utilized datanodes, and
   * underutilized datanodes lists. This will add some randomness
   * to the node matching later on.
   *
   * @return the total number of bytes that are
   *                needed to move to make the cluster balanced.
   * @param datanodes a set of datanodes
   */
  private long initNodes(DatanodeInfo[] datanodes) {
    // compute average remaining
    long totalCapacity=0L, totalRemainingSpace=0L;
    for (DatanodeInfo datanode : datanodes) {
      if (datanode.isDecommissioned() || datanode.isDecommissionInProgress()) {
        continue; // ignore decommissioning or decommissioned nodes
      }
      totalCapacity += datanode.getCapacity();
      totalRemainingSpace += datanode.getRemaining();
    }
    avgRemaining = ((double)totalRemainingSpace)/totalCapacity*100;

    /*create network topology and all data node lists:
     * overloaded, above-average, below-average, and underloaded
     * we alternates the accessing of the given datanodes array either by
     * an increasing order or a decreasing order.
     */
    long overLoadedBytes = 0L, underLoadedBytes = 0L;
    shuffleArray(datanodes);
    for (DatanodeInfo datanode : datanodes) {
      if (datanode.isDecommissioned() || datanode.isDecommissionInProgress()) {
        continue; // ignore decommissioning or decommissioned nodes
      }
      cluster.add(datanode);
      BalancerDatanode datanodeS;
      if (getRemaining(datanode) < avgRemaining) {
        datanodeS = new Source(datanode, avgRemaining, threshold);
        if (isAboveAvgUtilized(datanodeS)) {
          this.aboveAvgUtilizedDatanodes.add((Source)datanodeS);
        } else {
          assert(isOverUtilized(datanodeS)) :
            datanodeS.getName()+ "is not an overUtilized node";
          this.overUtilizedDatanodes.add((Source)datanodeS);
          overLoadedBytes += (long)((avgRemaining - threshold - 
              datanodeS.remaining)*datanodeS.datanode.getCapacity()/100.0);
        }
      } else {
        datanodeS = new BalancerDatanode(datanode, avgRemaining, threshold);
        if ( isBelowAvgUtilized(datanodeS)) {
          this.belowAvgUtilizedDatanodes.add(datanodeS);
        } else {
          assert (isUnderUtilized(datanodeS)) :
            datanodeS.getName()+ "is not an underUtilized node";
          this.underUtilizedDatanodes.add(datanodeS);
          underLoadedBytes += (long)((datanodeS.remaining - avgRemaining-
              threshold)*datanodeS.datanode.getCapacity()/100.0);
        }
      }
      this.datanodes.put(datanode.getStorageID(), datanodeS);
    }

    //logging
    logImbalancedNodes();

    assert (this.datanodes.size() ==
      overUtilizedDatanodes.size()+underUtilizedDatanodes.size()+
      aboveAvgUtilizedDatanodes.size()+belowAvgUtilizedDatanodes.size())
      : "Mismatched number of datanodes";

    // return number of bytes to be moved in order to make the cluster balanced
    return Math.max(overLoadedBytes, underLoadedBytes);
  }

  /* log the over utilized & under utilized nodes */
  private void logImbalancedNodes() {
    StringBuilder msg = new StringBuilder();
    msg.append(overUtilizedDatanodes.size());
    msg.append(" over utilized nodes:");
    for (Source node : overUtilizedDatanodes) {
      msg.append( " " );
      msg.append( node.getName() );
    }
    LOG.info(msg);
    msg = new StringBuilder();
    msg.append(underUtilizedDatanodes.size());
    msg.append(" under utilized nodes: ");
    for (BalancerDatanode node : underUtilizedDatanodes) {
      msg.append( " " );
      msg.append( node.getName() );
    }
    LOG.info(msg);
  }

  /* Decide all <source, target> pairs and
   * the number of bytes to move from a source to a target
   * Maximum bytes to be moved per node is
   * Min(1 Band worth of bytes,  MAX_SIZE_TO_MOVE).
   * Return total number of bytes to move in this iteration
   */
  private long chooseNodes() {
    // Match nodes on the same rack first
    chooseNodes(true);
    // Then match nodes on different racks
    chooseNodes(false);

    assert (datanodes.size() ==
      overUtilizedDatanodes.size()+underUtilizedDatanodes.size()+
      aboveAvgUtilizedDatanodes.size()+belowAvgUtilizedDatanodes.size()+
      sources.size()+targets.size())
      : "Mismatched number of datanodes";

    long bytesToMove = 0L;
    for (Source src : sources) {
      bytesToMove += src.scheduledSize;
    }
    return bytesToMove;
  }

  /* if onRack is true, decide all <source, target> pairs
   * where source and target are on the same rack; Otherwise
   * decide all <source, target> pairs where source and target are
   * on different racks
   */
  private void chooseNodes(boolean onRack) {
    /* first step: match each overUtilized datanode (source) to
     * one or more underUtilized datanodes (targets).
     */
    chooseTargets(underUtilizedDatanodes.iterator(), onRack);

    /* match each remaining overutilized datanode (source) to
     * below average utilized datanodes (targets).
     * Note only overutilized datanodes that haven't had that max bytes to move
     * satisfied in step 1 are selected
     */
    chooseTargets(belowAvgUtilizedDatanodes.iterator(), onRack);

    /* match each remaining underutilized datanode to
     * above average utilized datanodes.
     * Note only underutilized datanodes that have not had that max bytes to
     * move satisfied in step 1 are selected.
     */
    chooseSources(aboveAvgUtilizedDatanodes.iterator(), onRack);
  }

  /* choose targets from the target candidate list for each over utilized
   * source datanode. OnRackTarget determines if the chosen target
   * should be on the same rack as the source
   */
  private void chooseTargets(
      Iterator<BalancerDatanode> targetCandidates, boolean onRackTarget ) {
    for (Iterator<Source> srcIterator = overUtilizedDatanodes.iterator();
        srcIterator.hasNext();) {
      Source source = srcIterator.next();
      while (chooseTarget(source, targetCandidates, onRackTarget)) {
      }
      if (!source.isMoveQuotaFull()) {
        srcIterator.remove();
      }
    }
    return;
  }

  /* choose sources from the source candidate list for each under utilized
   * target datanode. onRackSource determines if the chosen source
   * should be on the same rack as the target
   */
  private void chooseSources(
      Iterator<Source> sourceCandidates, boolean onRackSource) {
    for (Iterator<BalancerDatanode> targetIterator =
      underUtilizedDatanodes.iterator(); targetIterator.hasNext();) {
      BalancerDatanode target = targetIterator.next();
      while (chooseSource(target, sourceCandidates, onRackSource)) {
      }
      if (!target.isMoveQuotaFull()) {
        targetIterator.remove();
      }
    }
    return;
  }

  /* For the given source, choose targets from the target candidate list.
   * OnRackTarget determines if the chosen target
   * should be on the same rack as the source
   */
  private boolean chooseTarget(Source source,
      Iterator<BalancerDatanode> targetCandidates, boolean onRackTarget) {
    if (!source.isMoveQuotaFull()) {
      return false;
    }
    boolean foundTarget = false;
    BalancerDatanode target = null;
    while (!foundTarget && targetCandidates.hasNext()) {
      target = targetCandidates.next();
      if (!target.isMoveQuotaFull()) {
        targetCandidates.remove();
        continue;
      }
      if (onRackTarget) {
        // choose from on-rack nodes
        if (cluster.isOnSameRack(source.datanode, target.datanode)) {
          foundTarget = true;
        }
      } else {
        // choose from off-rack nodes
        if (!cluster.isOnSameRack(source.datanode, target.datanode)) {
          foundTarget = true;
        }
      }
    }
    if (foundTarget) {
      assert(target != null):"Choose a null target";
      long size = Math.min(source.availableSizeToMove(),
          target.availableSizeToMove());
      NodeTask nodeTask = new NodeTask(target, size);
      source.addNodeTask(nodeTask);
      target.incScheduledSize(nodeTask.getSize());
      sources.add(source);
      targets.add(target);
      if (!target.isMoveQuotaFull()) {
        targetCandidates.remove();
      }
      LOG.info("Decided to move "+StringUtils.byteDesc(size)+" bytes from "
          +source.datanode.getName() + " to " + target.datanode.getName());
      return true;
    }
    return false;
  }

  /* For the given target, choose sources from the source candidate list.
   * OnRackSource determines if the chosen source
   * should be on the same rack as the target
   */
  private boolean chooseSource(BalancerDatanode target,
      Iterator<Source> sourceCandidates, boolean onRackSource) {
    if (!target.isMoveQuotaFull()) {
      return false;
    }
    boolean foundSource = false;
    Source source = null;
    while (!foundSource && sourceCandidates.hasNext()) {
      source = sourceCandidates.next();
      if (!source.isMoveQuotaFull()) {
        sourceCandidates.remove();
        continue;
      }
      if (onRackSource) {
        // choose from on-rack nodes
        if ( cluster.isOnSameRack(source.getDatanode(), target.getDatanode())) {
          foundSource = true;
        }
      } else {
        // choose from off-rack nodes
        if (!cluster.isOnSameRack(source.datanode, target.datanode)) {
          foundSource = true;
        }
      }
    }
    if (foundSource) {
      assert(source != null):"Choose a null source";
      long size = Math.min(source.availableSizeToMove(),
          target.availableSizeToMove());
      NodeTask nodeTask = new NodeTask(target, size);
      source.addNodeTask(nodeTask);
      target.incScheduledSize(nodeTask.getSize());
      sources.add(source);
      targets.add(target);
      if ( !source.isMoveQuotaFull()) {
        sourceCandidates.remove();
      }
      LOG.info("Decided to move "+StringUtils.byteDesc(size)+" bytes from "
          +source.datanode.getName() + " to " + target.datanode.getName());
      return true;
    }
    return false;
  }

  private static class BytesMoved {
    private long bytesMoved = 0L;;
    private synchronized void inc( long bytes ) {
      bytesMoved += bytes;
    }

    private long get() {
      return bytesMoved;
    }
  };
  private BytesMoved bytesMoved = new BytesMoved();
  private int notChangedIterations = 0;

  /* Start a thread to dispatch block moves for each source.
   * The thread selects blocks to move & sends request to proxy source to
   * initiate block move. The process is flow controlled. Block selection is
   * blocked if there are too many un-confirmed block moves.
   * Return the total number of bytes successfully moved in this iteration.
   */
  private long dispatchBlockMoves() throws InterruptedException {
    long bytesLastMoved = bytesMoved.get();

    Future<?>[] futures = new Future<?>[sources.size()];
    int i=0;
    for (Source source : sources) {
      futures[i++] = dispatcherExecutor.submit(
                        source.new BlockMoveDispatcher(Util.now()));
    }

    // wait for all dispatcher threads to finish
    for (Future<?> future : futures) {
      try {
        future.get();
      } catch (ExecutionException e) {
        LOG.warn("Dispatcher thread failed", e.getCause());
      }
    }

    // wait for all block moving to be done
    waitForMoveCompletion();

    return bytesMoved.get()-bytesLastMoved;
  }

  // The sleeping period before checking if block move is completed again
  static private long blockMoveWaitTime = 30000L;
  // How many blockMoveWait to wait until stopping the move
  private static final int MAX_WAIT_ITERATIONS = 1;

  /** set the sleeping period for block move completion check */
  static void setBlockMoveWaitTime(long time) {
    blockMoveWaitTime = time;
  }

  /* wait for all block move confirmations
   * by checking each target's pendingMove queue
   */
  private void waitForMoveCompletion() {
    boolean shouldWait;
    int waitedIterations = 0;
    do {
      shouldWait = false;
      for (BalancerDatanode target : targets) {
        if (!target.isPendingQEmpty()) {
          shouldWait = true;
        }
      }
      if (shouldWait) {
        try {
          if (waitedIterations > MAX_WAIT_ITERATIONS) {
            for (BalancerDatanode target : targets) {
              target.killPending();
            }
            continue;
          }
          waitedIterations++;
          Thread.sleep(blockMoveWaitTime);
        } catch (InterruptedException ignored) {
        }
      }
    } while (shouldWait);
  }

  /** This window makes sure to keep blocks that have been moved within 1.5 hour.
   * Old window has blocks that are older;
   * Current window has blocks that are more recent;
   * Cleanup method triggers the check if blocks in the old window are
   * more than 1.5 hour old. If yes, purge the old window and then
   * move blocks in current window to old window.
   */
  private static class MovedBlocks {
    private long lastCleanupTime = System.currentTimeMillis();
    private static long winWidth = 5400*1000L; // 1.5 hour
    final private static int CUR_WIN = 0;
    final private static int OLD_WIN = 1;
    final private static int NUM_WINS = 2;
    final private List<HashMap<Block, BalancerBlock>> movedBlocks =
      new ArrayList<HashMap<Block, BalancerBlock>>(NUM_WINS);

    /* initialize the moved blocks collection */
    private MovedBlocks() {
      movedBlocks.add(new HashMap<Block,BalancerBlock>());
      movedBlocks.add(new HashMap<Block,BalancerBlock>());
    }
    
    public void setWinWidth(Configuration conf) {
      winWidth = conf.getLong(
          "dfs.balancer.movedWinWidth", 5400*1000L);
    }
    
    /* add a block thus marking a block to be moved */
    synchronized private void add(BalancerBlock block) {
      movedBlocks.get(CUR_WIN).put(block.getBlock(), block);
    }

    /* check if a block is marked as moved */
    synchronized private boolean contains(BalancerBlock block) {
      return contains(block.getBlock());
    }

    /* check if a block is marked as moved */
    synchronized private boolean contains(Block block) {
      return movedBlocks.get(CUR_WIN).containsKey(block) ||
        movedBlocks.get(OLD_WIN).containsKey(block);
    }

    /* remove old blocks */
    synchronized private void cleanup() {
      long curTime = System.currentTimeMillis();
      // check if old win is older than winWidth
      if (lastCleanupTime + winWidth <= curTime) {
        // purge the old window
        movedBlocks.set(OLD_WIN, movedBlocks.get(CUR_WIN));
        movedBlocks.set(CUR_WIN, new HashMap<Block, BalancerBlock>());
        lastCleanupTime = curTime;
      }
    }
  }

  /* Decide if it is OK to move the given block from source to target
   * A block is a good candidate if
   * 1. the block is not in the process of being moved/has not been moved;
   * 2. the block does not have a replica on the target;
   * 3. doing the move does not reduce the number of racks that the block has
   */
  private boolean isGoodBlockCandidate(Source source,
      BalancerDatanode target, BalancerBlock block) {
    // check if the block is moved or not
    if (movedBlocks.contains(block)) {
        return false;
    }
    if (block.isLocatedOnDatanode(target)) {
      return false;
    }

    boolean goodBlock = false;
    if (cluster.isOnSameRack(source.getDatanode(), target.getDatanode())) {
      // good if source and target are on the same rack
      goodBlock = true;
    } else {
      boolean notOnSameRack = true;
      synchronized (block) {
        for (BalancerDatanode loc : block.locations) {
          if (cluster.isOnSameRack(loc.datanode, target.datanode)) {
            notOnSameRack = false;
            break;
          }
        }
      }
      if (notOnSameRack) {
        // good if target is target is not on the same rack as any replica
        goodBlock = true;
      } else {
        // good if source is on the same rack as on of the replicas
        for (BalancerDatanode loc : block.locations) {
          if (loc != source &&
              cluster.isOnSameRack(loc.datanode, source.datanode)) {
            goodBlock = true;
            break;
          }
        }
      }
    }
    return goodBlock;
  }

  /* reset all fields in a balancer preparing for the next iteration */
  private void resetData() {
    this.cluster = new NetworkTopology();
    this.overUtilizedDatanodes.clear();
    this.aboveAvgUtilizedDatanodes.clear();
    this.belowAvgUtilizedDatanodes.clear();
    this.underUtilizedDatanodes.clear();
    this.datanodes.clear();
    this.sources.clear();
    this.targets.clear();
    this.avgRemaining = 0.0D;
    cleanGlobalBlockList();
    this.movedBlocks.cleanup();
  }

  /* Remove all blocks from the global block list except for the ones in the
   * moved list.
   */
  private void cleanGlobalBlockList() {
    for (Iterator<Block> globalBlockListIterator=globalBlockList.keySet().iterator();
    globalBlockListIterator.hasNext();) {
      Block block = globalBlockListIterator.next();
      if(!movedBlocks.contains(block)) {
        globalBlockListIterator.remove();
      }
    }
  }

  /* Return true if the given datanode is overUtilized */
  private boolean isOverUtilized(BalancerDatanode datanode) {
    return datanode.remaining < (avgRemaining-threshold);
  }

  /* Return true if the given datanode is above average utilized
   * but not overUtilized */
  private boolean isAboveAvgUtilized(BalancerDatanode datanode) {
    return (datanode.remaining >= (avgRemaining-threshold))
        && (datanode.remaining < avgRemaining);
  }

  /* Return true if the given datanode is underUtilized */
  private boolean isUnderUtilized(BalancerDatanode datanode) {
    return datanode.remaining > (avgRemaining+threshold);
  }

  /* Return true if the given datanode is below average utilized
   * but not underUtilized */
  private boolean isBelowAvgUtilized(BalancerDatanode datanode) {
    return (datanode.remaining <= (avgRemaining+threshold))
           && (datanode.remaining < avgRemaining);
  }

  @SuppressWarnings(value = { "static-access" })
  private Options setupOptions() {
    Options cliOpts = new Options();
    cliOpts.addOption(OptionBuilder.hasArg().hasArgs(1).withDescription(
        "percentage of disk capacity. Default is 10")
        .isRequired(false).create("threshold"));
    cliOpts.addOption(OptionBuilder.hasArg().hasArgs(1).isRequired(false)
        .withDescription("The length of an iteration in minutes. " +
                          "Default is " + maxIterationTime/(60 * 1000)).
        create("iter_len"));
    cliOpts.addOption(OptionBuilder.hasArg().hasArgs(1).isRequired(false)
        .withDescription("The number of blocks to move in parallel to " +
        "one node. Default is " + MAX_NUM_CONCURRENT_MOVES).
        create("node_par_moves"));
    cliOpts.addOption(OptionBuilder.hasArg().hasArgs(1).isRequired(false)
        .withDescription("The number of blocks to move in parallel " +
        "in total for the cluster. Default is " + MOVER_THREAD_POOL_SIZE)
        .create("par_moves"));
    return cliOpts;
  }
  
  // Exit status
  final public static int SUCCESS = 1;
  final public static int IN_PROGRESS = 0;
  final public static int ALREADY_RUNNING = -1;
  final public static int NO_MOVE_BLOCK = -2;
  final public static int NO_MOVE_PROGRESS = -3;
  final public static int IO_EXCEPTION = -4;
  final public static int ILLEGAL_ARGS = -5;
  final public static int INTERRUPTED = -6;
  
  public int run(String[] args) throws Exception {
    final long startTime = Util.now();
    try {
      checkReplicationPolicyCompatibility(conf);
      final List<InetSocketAddress> namenodes = DFSUtil.getClientRpcAddresses(conf, null);
      parse(args);
      return Balancer.run(namenodes, conf);
    } catch (IOException e) {
      System.out.println(e + ".  Exiting ...");
      return IO_EXCEPTION;
    } catch (InterruptedException e) {
      System.out.println(e + ".  Exiting ...");
      return INTERRUPTED;
    } catch (Exception e) {
      e.printStackTrace();
      return ILLEGAL_ARGS; 
    } finally {
      System.out.println("Balancing took " + time2Str(Util.now()-startTime));
    }
  }
  
  /** parse command line arguments */
  private void parse(String[] args) {
    Options cliOpts = setupOptions();
    BasicParser parser = new BasicParser();
    CommandLine cl = null;
    try {
      try {
       cl = parser.parse(cliOpts, args);
      } catch (ParseException ex) {
        throw new IllegalArgumentException("args = " + Arrays.toString(args));
      }

      int newThreshold = Integer.parseInt(cl.getOptionValue("threshold", "10"));
      int iterationTime = Integer.parseInt(cl.getOptionValue("iter_len",
                                 String.valueOf(maxIterationTime/(60 * 1000))));
      maxConcurrentMoves = Integer.parseInt(cl.getOptionValue("node_par_moves",
                                     String.valueOf(MAX_NUM_CONCURRENT_MOVES)));
      moveThreads = Integer.parseInt(cl.getOptionValue("par_moves",
                                     String.valueOf(MOVER_THREAD_POOL_SIZE)));
      maxIterationTime = iterationTime * 60 * 1000L;
      
      threshold = checkThreshold(newThreshold);
      System.out.println("Running with threshold of " + threshold
          + " and iteration time of " + maxIterationTime + " milliseconds");
    } catch (RuntimeException e) {
      printUsage(cliOpts);
      throw e;
    }
  }

  
  /**
   * Balance all namenodes.
   * For each iteration,
   * for each namenode,
   * execute a {@link Balancer} to work through all datanodes once.  
   */
  static int run(List<InetSocketAddress> namenodes,
      Configuration conf) throws IOException, InterruptedException {
    final long sleeptime = 2*conf.getLong("dfs.heartbeat.interval", 3);
    LOG.info("namenodes = " + namenodes);
    Formatter formatter = new Formatter(System.out);
    System.out.println("Time Stamp               Iteration#  Bytes Already Moved  Bytes Left To Move  Bytes Being Moved  Iterations Left  Seconds Left");
    
    final List<Balancer> balancers 
        = new ArrayList<Balancer>(namenodes.size());
    try {
      for(InetSocketAddress isa : namenodes) {
        try{
          Balancer b = new Balancer(conf);
          b.init(isa);
          balancers.add(b);
        } catch (IOException e) {
          e.printStackTrace();
          LOG.error("Cannot connect to namenode: " + isa);
        }
      }
    
      boolean done = false;
      for(int iterations = 0; !done && balancers.size() > 0; iterations++) {
        done = true;
        Collections.shuffle(balancers);
        Iterator<Balancer> iter = balancers.iterator();
        while (iter.hasNext()) {
          Balancer b = iter.next();
          b.resetData();
          final int r = b.run(iterations, formatter);
          if (r == IN_PROGRESS) {
            done = false;
          } else if (r != SUCCESS) {
            //Remove this balancer
            b.close();
            LOG.info("Namenode " + b.namenodeAddress + " balancing exits...");
            iter.remove();
            continue;
          }
        }

        if (!done) {
          Thread.sleep(sleeptime);
        }
      }
    } finally {
      for(Balancer b : balancers) {
        b.close();
      }
    }
    return SUCCESS;
  }

  public int run(int iterations, Formatter formatter){
    try {
      /* get all live datanodes of a cluster and their disk usage
       * decide the number of bytes need to be moved
       */
      long bytesLeftToMove = initNodes();
      if (bytesLeftToMove == 0) {
        System.out.println("The cluster is balanced. Exiting...");
        return SUCCESS;
      } else {
        LOG.info( "Need to move "+ StringUtils.byteDesc(bytesLeftToMove)
            +" bytes to make the cluster balanced." );
      }

      /* Decide all the nodes that will participate in the block move and
       * the number of bytes that need to be moved from one node to another
       * in this iteration. Maximum bytes to be moved per node is
       * Min(1 Band worth of bytes,  MAX_SIZE_TO_MOVE).
       */
      final long bytesToMove = chooseNodes();
      if (bytesToMove == 0) {
        System.out.println("No block can be moved. Exiting...");
        return NO_MOVE_BLOCK;
      } else {
        LOG.info( "Will move " + StringUtils.byteDesc(bytesToMove) +
            "bytes in this iteration");
      }

      long moved = bytesMoved.get();
      String iterationsLeft = "N/A";
      String timeLeft = "N/A";
      if (iterations != 0 && moved != 0) {
        long bytesPerIteration = moved / iterations;
        long iterLeft = bytesLeftToMove / bytesPerIteration;
        iterationsLeft = String.valueOf(iterLeft );
        long secondsPerIteration = (maxIterationTime + blockMoveWaitTime)/1000;
        long secondsLeft = secondsPerIteration * iterLeft;
        long daysLeft = TimeUnit.SECONDS.toDays(secondsLeft);
        timeLeft = "";
        if (daysLeft > 0) {
          timeLeft = timeLeft + daysLeft + "d ";
        }
        long hoursLeft = TimeUnit.SECONDS.toHours(secondsLeft) -
                         TimeUnit.DAYS.toHours(daysLeft);
        if (hoursLeft > 0) {
          timeLeft = timeLeft + hoursLeft + "h ";
        }
        long minutesLeft = TimeUnit.SECONDS.toMinutes(secondsLeft) -
                           TimeUnit.HOURS.toMinutes(hoursLeft) -
                           TimeUnit.DAYS.toMinutes(daysLeft);
        timeLeft = timeLeft + minutesLeft + "m";

      }

      formatter.format("%-24s %10d  %19s  %18s  %17s  %15s  %12s\n",
          DateFormat.getDateTimeInstance().format(new Date()),
          iterations,
          StringUtils.byteDesc(bytesMoved.get()),
          StringUtils.byteDesc(bytesLeftToMove),
          StringUtils.byteDesc(bytesToMove),
          iterationsLeft,
          timeLeft
          );

      /* For each pair of <source, target>, start a thread that repeatedly
       * decide a block to be moved and its proxy source,
       * then initiates the move until all bytes are moved or no more block
       * available to move.
       * Exit no byte has been moved for 5 consecutive iterations.
       */
      if (dispatchBlockMoves() > 0) {
        notChangedIterations = 0;
      } else {
        notChangedIterations++;
        if (notChangedIterations >= 5) {
          System.out.println(
              "No block has been moved for 5 iterations. Exiting...");
          return NO_MOVE_PROGRESS;
        }
      }

      // clean all lists
      resetData();

      return IN_PROGRESS;
    } catch (IllegalArgumentException ae) {
      ae.printStackTrace();
      return ILLEGAL_ARGS;
    } catch (IOException e) {
      e.printStackTrace();
      System.out.println("Received an IO exception: " + e.getMessage() +
          " . Exiting...");
      return IO_EXCEPTION;
    } catch (InterruptedException e) {
      e.printStackTrace();
      return INTERRUPTED;
    } catch (Exception ex) {
      ex.printStackTrace();
      return ILLEGAL_ARGS;
    } finally {
    }
  }
  
  /** Close the connection. */
  void close() {
    // shutdown thread pools
    dispatcherExecutor.shutdownNow();
    moverExecutor.shutdownNow();
    // close the output file
    IOUtils.closeStream(out);
    if (fs != null) {
      try {
        fs.delete(BALANCER_ID_PATH, true);
      } catch(IOException ignored) {
      }
    }
  }
  
  private Path BALANCER_ID_PATH = new Path("/system/balancer.id");
  /* The idea for making sure that there is no more than one balancer
   * running in an HDFS is to create a file in the HDFS, writes the IP address
   * of the machine on which the balancer is running to the file, but did not
   * close the file until the balancer exits.
   * This prevents the second balancer from running because it can not
   * creates the file while the first one is running.
   *
   * This method checks if there is any running balancer and
   * if no, mark yes if no.
   * Note that this is an atomic operation.
   *
   * Return null if there is a running balancer; otherwise the output stream
   * to the newly created file.
   */
  private OutputStream checkAndMarkRunningBalancer() throws IOException {
    try {
      DataOutputStream out = fs.create(BALANCER_ID_PATH);
      out. writeBytes(InetAddress.getLocalHost().getHostName());
      out.flush();
      return out;
    } catch(RemoteException e) {
      if(AlreadyBeingCreatedException.class.getName().equals(e.getClassName())){
        return null;
      } else {
        throw e;
      }
    }
  }

  /* Given elaspedTime in ms, return a printable string */
  private static String time2Str(long elapsedTime) {
    String unit;
    double time = elapsedTime;
    if (elapsedTime < 1000) {
      unit = "milliseconds";
    } else if (elapsedTime < 60*1000) {
      unit = "seconds";
      time = time/1000;
    } else if (elapsedTime < 3600*1000) {
      unit = "minutes";
      time = time/(60*1000);
    } else {
      unit = "hours";
      time = time/(3600*1000);
    }

    return time+" "+unit;
  }
  
  /** return this balancer's configuration */
  public Configuration getConf() {
    return conf;
  }

  /** set this balancer's configuration */
  public void setConf(Configuration conf) {
    this.conf = conf;
    movedBlocks.setWinWidth(conf);
  }
  
}