/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.namenode;

import org.apache.commons.logging.*;

import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.hdfs.DFSClient;
import org.apache.hadoop.hdfs.DFSLocatedBlocks;
import org.apache.hadoop.hdfs.DFSInputStream;
import org.apache.hadoop.hdfs.protocol.*;
import org.apache.hadoop.hdfs.server.common.Storage.*;
import org.apache.hadoop.hdfs.server.namenode.BlocksMap.BlockInfo;
import org.apache.hadoop.hdfs.server.namenode.FSImage.CheckpointStates;
import org.apache.hadoop.hdfs.server.namenode.LeaseManager.*;
import org.apache.hadoop.hdfs.server.namenode.WaitingRoom.*;
import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol;
import org.apache.hadoop.hdfs.server.protocol.SnapshotProtocol;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.ipc.*;
import org.apache.hadoop.util.Daemon;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.hdfs.util.LightWeightLinkedSet;

import java.io.*;
import java.net.*;
import java.util.*;
import java.util.concurrent.*;

import org.apache.hadoop.metrics.jvm.JvmMetrics;

/**
 * The SnapshotNode is responsible for taking periodic
 * snapshots of the HDFS. The current design only allows
 * one SnapshotNode per cluster.
 * The SnapshotNode is a daemon that periodically wakes
 * up (determined by the schedule specified in the configuration),
 * triggers a periodic snapshot and then goes back to sleep.
 * The SnapshotNode uses the Namesystem's jetty server to
 * retrieve files.
 */
public class SnapshotNode implements SnapshotProtocol {

  // Logger for this class; the nested LeaseUpdateWorker logs through it too.
  // NOTE(review): the initializer expression is not visible in this view.
  public static final Log LOG =

  // Sub-directory appended to tempDir when the download directories are prepared.
  public static final String CURRENT_DIR = "/current";
  // NOTE(review): public and non-final, so any caller could reassign the prefix;
  // it is used like a constant — confirm before tightening.
  public static String SSNAME = "dfs_snapshot_"; // prefix of ss files

  private Configuration conf; // configuration handed to the constructor

  private String fileServer; // jetty image server namenode listens on
  private FileSystem dfs; // file system obtained from FileSystem.get(conf)

  private String tempDir; // temp dir to download files from namenode
  private String ssDir; // path to store snapshots in

  private Daemon purgeThread; // daemon wrapping the WaitingRoom purger

  // Executor that runs LeaseUpdateWorker tasks; assigned in updateLeasedFiles().
  private ExecutorService leaseUpdateThreadPool;
  // Maximum pool size, read from "fs.snapshot.leaseupdatethreads" in init().
  private int maxLeaseUpdateThreads;

  private Server server; // RPC Server
  private InetSocketAddress serverAddress = null; // RPC server address

  // RPC proxy to the namenode and the address it was created for.
  private NamenodeProtocol namenode;
  private InetSocketAddress nameNodeAddr;

  /**
   * Creates a SnapshotNode that uses the given configuration.
   * An IOException raised during construction is handled by the catch
   * block below, which writes an error to the log.
   * @param conf configuration stored in this node
   */
  public SnapshotNode(Configuration conf) {
    try {
      this.conf = conf;
      // NOTE(review): the statement that can raise IOException is not visible
      // in this view — presumably the initialization call; confirm.
    } catch (IOException e) {
      // NOTE(review): the caught exception is not handed to the logger, so the
      // cause of the failure does not reach the log; consider logging e too.
      LOG.error("Failed to start SnapshotNode");

  /**
   * Initialize the SnapshotNode.
   * @throws IOException
   */
  // Reads the snapshot settings from conf, connects to the file system and the
  // namenode, creates the waiting-room purge daemon and starts the RPC server.
  private void init() throws IOException {
    // Snapshot directory and local download directory, with their defaults.
    ssDir = conf.get("fs.snapshot.dir", "/.SNAPSHOT");
    tempDir = conf.get("fs.snapshot.tempdir", "/tmp/snapshot");

    fileServer = getImageServer();
    dfs = FileSystem.get(conf);

    Path ssPath = new Path(ssDir);
    // NOTE(review): the statement guarded by this check is not visible in this
    // view — presumably it creates the snapshot directory; confirm.
    if (!dfs.exists(ssPath)) {

    // Upper bound for the lease-update thread pool (default 100).
    maxLeaseUpdateThreads = conf.getInt("fs.snapshot.leaseupdatethreads", 100);

    // Waiting room purge thread
    purgeThread = new Daemon((new WaitingRoom(conf)).getPurger());

    // Get namenode rpc connection
    nameNodeAddr = NameNode.getAddress(conf);
    namenode = (NamenodeProtocol) RPC.waitForProxy(NamenodeProtocol.class,
                               NamenodeProtocol.versionID, nameNodeAddr, conf);

    // Snapshot RPC Server
    InetSocketAddress socAddr = SnapshotNode.getAddress(conf);
    // Number of RPC handlers (default 10).
    int handlerCount = conf.getInt("fs.snapshot.handler.count", 10);
    server = RPC.getServer(this, socAddr.getHostName(), socAddr.getPort(),
                           handlerCount, false, conf);
    // The rpc-server port can be ephemeral... ensure we have the correct info
    serverAddress = server.getListenerAddress();
    LOG.info("SnapshotNode up at: " + serverAddress);

    server.start(); // start rpc server

  /**
   * Converts an address string into a socket address.
   * @param address address string handed to NetUtils.createSocketAddr
   * @return the socket address produced by NetUtils
   */
  private static InetSocketAddress getAddress(String address) {
    return NetUtils.createSocketAddr(address);

  /**
   * Returns the address of the SnapshotNode RPC server.
   * The value of "fs.snapshot.server.address" is used when it is set;
   * otherwise the address falls back to localhost on port 60000.
   * @param conf configuration to read the address from
   * @return the socket address for the configured (or default) host:port
   */
  public static InetSocketAddress getAddress(Configuration conf) {
    String nodeport = conf.get("fs.snapshot.server.address");
    if (nodeport == null) {
      nodeport = "localhost:" + 60000; // DEFAULT PORT
    return getAddress(nodeport);

  /**
   * Returns the version of the protocol served by this node.
   * @param protocol fully qualified class name of the requested protocol
   * @param clientVersion version reported by the client (not read in the
   *        lines visible here)
   * @return SnapshotProtocol.versionID when SnapshotProtocol is requested
   * @throws IOException if any other protocol name is requested
   */
  public long getProtocolVersion(String protocol, long clientVersion) throws IOException {
    if (protocol.equals(SnapshotProtocol.class.getName())) {
      return SnapshotProtocol.versionID;

    throw new IOException("Unknown protocol to snapshot node: " + protocol);

  /**
   * Returns the protocol signature for this server by delegating to
   * ProtocolSignature.getProtocolSignature.
   * NOTE(review): the end of the delegating call is not visible in this view.
   * @param protocol name of the requested protocol
   * @param clientVersion version reported by the client
   * @param clientMethodsHash hash of the client's method set
   * @throws IOException declared by the signature; presumably raised by the
   *         delegate — confirm
   */
  public ProtocolSignature getProtocolSignature(String protocol, long clientVersion, 
                                         int clientMethodsHash) throws IOException {
    return ProtocolSignature.getProtocolSignature(this, protocol, clientVersion, 

  /**
   * Makes sure the local download directories exist: tempDir itself and the
   * CURRENT_DIR sub-directory inside it. Afterwards iterates over the files
   * already present in that sub-directory.
   * @throws IOException if either path exists but is not a directory
   */
  void prepareDownloadDirs() throws IOException {
    // Check if temp dir exists
    File temp = new File(tempDir);
    // The result of mkdirs() is ignored; a failed creation is caught by the
    // isDirectory() check on the next line.
    if (!temp.exists()) temp.mkdirs();
    if(!temp.isDirectory()) throw new IOException("Temp Dir: " +
                               tempDir + " is not a directory.");

    // Check if current dir in temp exists
    temp = new File(tempDir + CURRENT_DIR);
    // Same pattern as above: create if missing, then verify.
    if (!temp.exists()) temp.mkdir();
    if(!temp.isDirectory()) throw new IOException("Current in Temp Dir: " +
                           tempDir + CURRENT_DIR + " is not a directory.");

    // Delete all previously downloaded files
    // NOTE(review): File.listFiles() returns null when an I/O error occurs,
    // which would raise a NullPointerException here; the loop body is not
    // visible in this view.
    for (File f: temp.listFiles()) {

  /**
   * Shut down the snapshot node and its attached daemons.
   */
  // Stops the waiting-room purger (when one was created) and the RPC server.
  public void shutdown() {
    if (purgeThread != null) {
      // The daemon's runnable is the purger created in init().
      // NOTE(review): what is done with the purger is not visible in this view.
      WaitingRoomPurger purger = (WaitingRoomPurger) purgeThread.getRunnable();

    // server is null when the RPC server was never created.
    if (server != null) server.stop();

  /**
   * Shut down only the waiting room purger; the RPC server is left untouched.
   */
  // Acts on the waiting-room purger only; unlike shutdown(), the RPC server is
  // not touched in the lines visible here.
  public void shutdownWaitingRoomPurger() {
    if (purgeThread != null) {
      // NOTE(review): what is done with the purger is not visible in this view.
      WaitingRoomPurger purger = (WaitingRoomPurger) purgeThread.getRunnable();


  /**
   * Lists the snapshots stored in the snapshot directory.
   * Only plain files whose name starts with the snapshot prefix are
   * considered; directories and other files are skipped.
   * @return the collected snapshot entries as an array
   * @throws FileNotFoundException if the snapshot directory does not exist
   * @throws IOException if the snapshot path is not a directory
   */
  public String[] listSnapshots() throws IOException {
    Path ssPath = new Path(ssDir);

    if (!dfs.exists(ssPath)) {
      throw new FileNotFoundException("Snapshot dir doesn't exist");

    FileStatus ssStatus = dfs.getFileStatus(ssPath);
      if (!ssStatus.isDir()) {
        throw new IOException("ssDir " + ssDir  +" is not a directory");

    FileStatus[] files = dfs.listStatus(ssPath);
    List<String> ssIds = new ArrayList<String>();

    // Separate snapshot files
    for (FileStatus ss: files) {
      if (ss.isDir()) continue; // skips dirs
      String name = ss.getPath().getName();
      // NOTE(review): this literal duplicates the SSNAME field; if SSNAME is
      // ever changed the two drift apart — consider using SSNAME here.
      // NOTE(review): the statement that fills ssIds is not visible in this view.
      if (!name.startsWith("dfs_snapshot_")) continue;

    // Copy the collected ids into an array for the RPC return value.
    String[] rtn = new String[ssIds.size()];
    for (int i = 0; i < ssIds.size(); i++) {
      rtn[i] = ssIds.get(i);

    return rtn;

  /**
   * Returns the file status of the snapshot file with the given id.
   * The file is looked up at ssDir + "/" + SSNAME + id.
   * @param id snapshot id (the part of the file name after the prefix)
   * @return the status reported by the file system for that path
   * @throws IOException propagated from the file system lookup
   */
  public FileStatus getSnapshotFileStatus(String id) throws IOException {
    Path ss = new Path(ssDir + "/" + SSNAME + id);
    return dfs.getFileStatus(ss);

  /**
   * Deletes the snapshot file with the given id.
   * The file is located at ssDir + "/" + SSNAME + id.
   * @param id snapshot id (the part of the file name after the prefix)
   * @return the result reported by the file system's delete call
   * @throws IOException propagated from the file system
   */
  public boolean deleteSnapshot(String id) throws IOException {
    Path fileToDelete = new Path(ssDir + "/" + SSNAME + id);
    // Second argument false: presumably a non-recursive delete — confirm
    // against the FileSystem API in use.
    return dfs.delete(fileToDelete, false);

  /**
   * Loads the given snapshot into a fresh FSImage/FSNamesystem and returns
   * the located blocks of every file found at or below the given path.
   * @param snapshotId id of the snapshot to read
   * @param path path inside the snapshot's namespace
   * @return one entry per file collected by getAllLocatedBlocks
   * @throws IOException if no inode exists at path in the snapshot, or if
   *         reading the snapshot fails
   */
  public LocatedBlocksWithMetaInfo[] getLocatedBlocks(String snapshotId,
      String path)
  throws IOException {
    FSImage fsImage = new FSImage();
    FSNamesystem namesystem = new FSNamesystem(fsImage, conf);
    Path ssPath = new Path(ssDir + "/" + SSNAME + snapshotId);
    // NOTE(review): no close() of this stream appears in the visible lines;
    // confirm it is released on both the success and the failure path.
    FSDataInputStream in = dfs.open(ssPath);
    fsImage.loadFSImage(new File(ssPath.toString()), in);
    INode inode = namesystem.dir.getInode(path);

    if (inode == null) {
      throw new IOException("File/dir at " + path + 
                            " does not exist in snapshot " + snapshotId);

    List<LocatedBlocksWithMetaInfo> blocks = new ArrayList<LocatedBlocksWithMetaInfo>();
    getAllLocatedBlocks(inode, blocks); // fill blocks with LocatedBlocks for all files

    // Copy the list into an array for the return value.
    // NOTE(review): the next line is cut off in this view.
    LocatedBlocksWithMetaInfo[] blocksArr = new LocatedBlocksWithMetaInfo[blocks
    for (int i = 0; i < blocksArr.length; ++i) {
      blocksArr[i] = blocks.get(i);

    return blocksArr;

  /**
   * Creates a snapshot with the given id. The step comments below describe
   * the sequence: download, merge, optional lease update, save.
   * NOTE(review): the statements for the download, merge and lease-update
   * steps are not visible in this view; only the final save call is.
   * @param snapshotId id used in the snapshot file name
   * @param updateLeases whether to run the lease-update step
   * @throws IOException propagated from the individual steps
   */
  public void createSnapshot(String snapshotId, boolean updateLeases) throws IOException {
    // Create new SnapshotStore
    SnapshotStorage ssStore = new SnapshotStorage(conf, new File(tempDir));

    // Download image & edit files from namenode

    // Merge image and edit files

    // Update file lengths for leased files (optional)
    if (updateLeases) {

    // Save snapshot
    saveSnapshot(ssStore, snapshotId);

  /**
   * Walks the namespace tree rooted at inode. Directories are descended
   * recursively; for each file a LocatedBlocks object is built from the
   * file's blocks.
   * NOTE(review): the statements that add the result to the blocks list are
   * not visible in this view.
   * @param inode directory or file to start from
   * @param blocks output list handed through the recursion
   * @throws IOException declared by the signature
   */
  private void getAllLocatedBlocks(INode inode,
      List<LocatedBlocksWithMetaInfo> blocks)
  throws IOException {
    if (inode.isDirectory()) {
      INodeDirectory dir = (INodeDirectory) inode;
      for (INode child: dir.getChildren()) {
        getAllLocatedBlocks(child, blocks);
    } else {
      INodeFile file = (INodeFile) inode;
      BlockInfo[] fileBlocks = file.getBlocks();
      List<LocatedBlock> lb = new ArrayList<LocatedBlock>();
      for (BlockInfo block: fileBlocks) {
        // DatanodeInfo is unavailable, so set as empty for now
        lb.add(new LocatedBlock(block, new DatanodeInfo[0]));

      // File length is taken from the inode's content summary.
      LocatedBlocks locatedBlocks =  new LocatedBlocks(
                             file.computeContentSummary().getLength(), // flength
                             lb, // blks
                             false); // isUnderConstruction

      // Update DatanodeInfo from NN

  /**
   * Writes the snapshot to a temporary file under "/tmp/" on dfs and then
   * renames it to its final location ssDir + "/" + SSNAME + id.
   * @param ssStore storage object that writes the snapshot to the stream
   * @param id snapshot id used in both file names
   * @throws IOException if the rename reports failure, or propagated from
   *         the file system
   */
  void saveSnapshot(SnapshotStorage ssStore, String id) throws IOException {
    // Create new snapshot in temp file
    Path tmpPath = new Path("/tmp/" + SSNAME + id);
    // NOTE(review): no close() of this stream appears in the visible lines;
    // confirm it is closed before the rename below.
    FSDataOutputStream out = dfs.create(tmpPath);
    ssStore.saveSnapshot(tmpPath.toString(), out);

    // Rename snapshot
    Path ssPath = new Path(ssDir + "/" + SSNAME + id);
    if (!dfs.rename(tmpPath, ssPath)) {
      throw new IOException("Could not rename temp snapshot file");

  /**
   * Merge step for the downloaded files.
   * NOTE(review): only the FSNamesystem construction is visible in this view;
   * the rest of the method is not shown.
   * @param ssStore storage object the namesystem is built on
   * @throws IOException declared by the signature
   */
  void doMerge(SnapshotStorage ssStore) throws IOException {
    // The local is not read in the visible lines; presumably the constructor
    // attaches the namesystem to ssStore — confirm.
    FSNamesystem namesystem = new FSNamesystem(ssStore, conf);

  /**
   * Create a snapshot whose id is the current system time
   * in milliseconds.
   */
  // Convenience overload: uses System.currentTimeMillis() as the snapshot id
  // and enables the lease-update step.
  void createSnapshot() throws IOException {
    createSnapshot(Long.toString(System.currentTimeMillis()), true);

  /**
   * Convenience overload that creates a snapshot with the given id and the
   * lease-update step enabled.
   * @param id snapshot id
   * @throws IOException propagated from createSnapshot(String, boolean)
   */
  void createSnapshot(String id) throws IOException {
    createSnapshot(id, true);

  /**
   * Tries to get the most up-to-date lengths of files under construction.
   */
  // Runs a LeaseUpdateWorker for every leased path, waits for the pool to
  // finish, then asks the namenode for the lengths of the blocks the workers
  // queued in blocksForNN.
  void updateLeasedFiles(SnapshotStorage ssStore) throws IOException {
    FSNamesystem fsNamesys = ssStore.getFSNamesystem();
    // Shared with all workers; LeaseUpdateWorker synchronizes on this list.
    List<Block> blocksForNN = new ArrayList<Block>();

    // NOTE(review): with an unbounded LinkedBlockingQueue a ThreadPoolExecutor
    // never grows beyond its core size (1 here), so maxLeaseUpdateThreads would
    // have no effect — see the ThreadPoolExecutor Javadoc on queuing.
    // NOTE(review): the TimeUnit argument is not visible in this view.
    leaseUpdateThreadPool = new ThreadPoolExecutor(1, maxLeaseUpdateThreads, 60, 
                                                new LinkedBlockingQueue<Runnable>());

    // Try to update lengths for leases from DN
    LightWeightLinkedSet<Lease> sortedLeases = fsNamesys.leaseManager.getSortedLeases();
    Iterator<Lease> itr = sortedLeases.iterator();
    while (itr.hasNext()) {
      Lease lease = itr.next();
      for (String path : lease.getPaths()) {
        // Update file lengths using worker threads to increase throughput
        // NOTE(review): the start of this statement is not visible in this view.
                   new LeaseUpdateWorker(conf, path, fsNamesys, blocksForNN));

    // NOTE(review): awaitTermination() only returns true after a shutdown
    // request; no shutdown() call is visible in this view — confirm one exists.
    try {
      // Wait till update tasks finish successfully (max 20 mins?)
      if (!leaseUpdateThreadPool.awaitTermination(1200, TimeUnit.SECONDS)) {
        throw new IOException("Updating lease files failed");
    } catch (InterruptedException e) {
        // NOTE(review): the interrupt status is not restored and the cause is
        // not attached to the new exception.
        throw new IOException("Snapshot creation interrupted while updating leased files");

    // Fetch block lengths for renamed/deleted leases from NN
    long[] blockIds = new long[blocksForNN.size()];

    for (int i = 0; i < blocksForNN.size(); ++i) {
      blockIds[i] = blocksForNN.get(i).getBlockId();

    // lengths[i] corresponds to blockIds[i]; -1 marks a block whose length
    // could not be determined.
    long[] lengths = namenode.getBlockLengths(blockIds);

    for (int i = 0; i < blocksForNN.size(); ++i) {
      if (lengths[i] == -1) {
        // Couldn't update block length, keep preferred length
        LOG.error("Couldn't update length for block " + blocksForNN.get(i));
      } else {

  /**
   * Download fsimage, edits and edits.new files from the name-node.
   * Files will be downloaded into CURRENT_DIR.
   * @throws IOException
   */
  // Downloads fsimage, edits and (when present) edits.new from the namenode's
  // image server, repeating the whole download until the checkpoint signature
  // taken before and after the transfer indicates a consistent set of files.
  void downloadSnapshotFiles(SnapshotStorage ssStore) throws IOException {
    CheckpointSignature start = namenode.getCheckpointSignature();
    CheckpointSignature end = null;
    boolean success;

    // NOTE(review): the loop has no retry limit in the visible lines.
    do {
      // Clear temp files

      // get fsimage
      String fileId = "getimage=1";
      File[] srcNames = ssStore.getImageFiles();
      assert srcNames.length == 1 : "No snapshot temporary dir.";
      TransferFsImage.getFileClient(fileServer, fileId, srcNames, false);
      LOG.info("Downloaded file " + srcNames[0].getName() + " size " +
               srcNames[0].length() + " bytes.");

      // get edits file
      fileId = "getedit=1";
      srcNames = ssStore.getEditsFiles();
      assert srcNames.length == 1 : "No snapshot temporary dir.";
      TransferFsImage.getFileClient(fileServer, fileId, srcNames, false);
      LOG.info("Downloaded file " + srcNames[0].getName() + " size " +
               srcNames[0].length() + " bytes.");

      // get edits.new file (only if in the middle of ckpt)
      try {
        fileId = "geteditnew=1";
        srcNames = ssStore.getEditsNewFiles();
        assert srcNames.length == 1 : "No snapshot temporary dir.";
        TransferFsImage.getFileClient(fileServer, fileId, srcNames, false);
        LOG.info("Downloaded file " + srcNames[0].getName() + " size " +
               srcNames[0].length() + " bytes.");
      } catch (FileNotFoundException e) {
        // do nothing
        // A missing edits.new is expected when no checkpoint is in progress.

      end = namenode.getCheckpointSignature();

      // Are the downloaded files consistent?
      // Consistent means: the checkpoint time did not change during the
      // download and the namenode is not in the UPLOAD_DONE state.
      success = end.checkpointTime == start.checkpointTime && 
                end.checkpointState != CheckpointStates.UPLOAD_DONE;

      // The latest signature becomes the baseline for the next attempt.
      start = end;
    } while (!success);

  /**
   * Returns the address of the jetty image server that the Namenode is
   * listening on.
   * @throws IOException
   */
  // Resolves the namenode's HTTP server address from the configuration.
  // Throws IOException when the default file system URI does not use the
  // "hdfs" scheme.
  private String getImageServer() throws IOException {
    URI fsName = FileSystem.getDefaultUri(conf);

    // Literal-first equals keeps the check safe when the scheme is null.
    if (!"hdfs".equals(fsName.getScheme())) {
      throw new IOException("This is not a DFS");

    // The three configuration keys are handed to NetUtils, which builds the
    // address from them.
    return NetUtils.getServerAddress(conf, "dfs.info.bindAddress", 
                             "dfs.info.port", "dfs.http.address");

  // FSImage subclass used while building a snapshot: it loads the downloaded
  // image in doMerge() and writes the snapshot out in saveSnapshot().
  static class SnapshotStorage extends FSImage {
    Configuration conf; // configuration handed to the constructor
    File tempDir; // local directory handed to the constructor
    DataOutputStream out; // not assigned or read in the lines visible here

    // Stores the configuration and the temp directory.
    // NOTE(review): any further setup in this constructor is not visible in
    // this view.
    public SnapshotStorage(Configuration conf, File tempDir) throws IOException {
      this.conf = conf;
      this.tempDir = tempDir;

    // Always reports that no conversion is needed for the storage directory.
    public boolean isConversionNeeded(StorageDirectory sd) {
      return false;

     * Merge image and edit log (in memory).
     * Files to merge include fsimage, edits, and possibly edits.new
     * @throws IOException
    void doMerge() throws IOException {
      // Use the first IMAGE_AND_EDITS storage directory; fail if none exists.
      StorageDirectory sdTemp = null;
      Iterator<StorageDirectory> it = dirIterator(NameNodeDirType.IMAGE_AND_EDITS);
      if (it.hasNext()) {
        sdTemp = it.next();
      } else {
        throw new IOException("Could not locate snapshot temp directory.");

      // NOTE(review): only the image load is visible here; loading of the
      // edits files is not shown in this view.
      loadFSImage(getImageFile(sdTemp, NameNodeFile.IMAGE));

     * Writes snapshot to the OutputStream.
     * @param out Stream to write snapshot to
    void saveSnapshot(String dest, DataOutputStream out) throws IOException {
      // dest and out are passed straight through to saveFSImage.
      saveFSImage(dest, out);

  private class LeaseUpdateWorker implements Runnable {
    String path;
    Configuration conf;
    List<Block> blocks;
    FSNamesystem fsNamesys;

    public LeaseUpdateWorker(Configuration conf, String path, 
                             FSNamesystem namesystem, List<Block> blocks) {
      this.path = path;
      this.conf = conf;
      this.blocks = blocks;
      this.fsNamesys = namesystem;
    public void run() {
      boolean error = false;
      INodeFile node = null;
      DFSClient client = null;

      try {
        client = new DFSClient(conf);

	LOG.info("Trying to update lease for file at " + path);

        // verify that path exists in namespace
        node = fsNamesys.dir.getFileINode(path);
        if (node == null) {
          error = true;
        if (!node.isUnderConstruction()) {
          error = true;
      catch (IOException e) {
        error = true;

      // Could not find inode in FSNamespace, quit now
      if (error) {
        LOG.error("Couldn't update length for leased file at " + path +
                  " because file not in namespace");

      BlockInfo[] blks = node.getBlocks();

      // If NN has not leased out any block, return
      if (blks.length == 0) return;

      int index = blks.length - 1; // index of last file block

      LOG.info("Block at index " + index + " being written for file at  " +

      // Pessimistically update last block length from DataNode. 
      // File could have been renamed, and a new file created in its place.
      try { 
        DFSInputStream stm = client.open(path);
        DFSLocatedBlocks locBlks = stm.fetchLocatedBlocks();

        if (locBlks.locatedBlockCount() >= blks.length) {
          if (blks[index] != null && locBlks.get(index) != null) {
            if (blks[index].getBlockId() == locBlks.get(index).getBlock().getBlockId()) {

        client.close(); // close dfs client
      catch (IOException e) {

      // If file was renamed/deleted, set block length to preferred size
      // and add it to list of blocks which we should try to update from NN
      LOG.info("Couldn't update block " + blks[index] + " for file " +
               "at " + path + " from DN. Setting length to preferred length " +
               "and queuing block to be checked from NN for updated length.");

      synchronized(blocks) {      