Java Code Examples for org.apache.hadoop.hdfs.MiniDFSCluster#restartDataNode()

The following examples show how to use org.apache.hadoop.hdfs.MiniDFSCluster#restartDataNode() . You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: TestPendingCorruptDnMessages.java    From hadoop with Apache License 2.0 5 votes vote down vote up
private static boolean wipeAndRestartDn(MiniDFSCluster cluster, int dnIndex)
    throws IOException {
  // stop the DN, reformat it, then start it again with the same xfer port.
  DataNodeProperties dnProps = cluster.stopDataNode(dnIndex);
  cluster.formatDataNodeDirs();
  return cluster.restartDataNode(dnProps, true);
}
 
Example 2
Source File: TestProcessCorruptBlocks.java    From hadoop with Apache License 2.0 5 votes vote down vote up
/**
 * The corrupt block has to be removed when the number of valid replicas
 * matches replication factor for the file. In this test, the above 
 * condition is achieved by increasing the number of good replicas by 
 * replicating on a new Datanode. 
 * The test strategy : 
 *   Bring up Cluster with 3 DataNodes
 *   Create a file  of replication factor 3
 *   Corrupt one replica of a block of the file 
 *   Verify that there are still 2 good replicas and 1 corrupt replica 
 *     (corrupt replica should not be removed since number of good replicas
 *      (2) is less  than replication factor (3)) 
 *   Start a new data node 
 *   Verify that the a new replica is created and corrupt replica is
 *   removed.
 * 
 */
@Test
public void testByAddingAnExtraDataNode() throws Exception {
  Configuration conf = new HdfsConfiguration();
  conf.setLong(DFSConfigKeys.DFS_BLOCKREPORT_INTERVAL_MSEC_KEY, 1000L);
  conf.set(DFSConfigKeys.DFS_NAMENODE_REPLICATION_PENDING_TIMEOUT_SEC_KEY, Integer.toString(2));
  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).numDataNodes(4).build();
  FileSystem fs = cluster.getFileSystem();
  final FSNamesystem namesystem = cluster.getNamesystem();
  DataNodeProperties dnPropsFourth = cluster.stopDataNode(3);

  try {
    final Path fileName = new Path("/foo1");
    DFSTestUtil.createFile(fs, fileName, 2, (short) 3, 0L);
    DFSTestUtil.waitReplication(fs, fileName, (short) 3);

    ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, fileName);
    corruptBlock(cluster, fs, fileName, 0, block);

    DFSTestUtil.waitReplication(fs, fileName, (short) 2);

    assertEquals(2, countReplicas(namesystem, block).liveReplicas());
    assertEquals(1, countReplicas(namesystem, block).corruptReplicas());

    cluster.restartDataNode(dnPropsFourth);

    DFSTestUtil.waitReplication(fs, fileName, (short) 3);

    assertEquals(3, countReplicas(namesystem, block).liveReplicas());
    assertEquals(0, countReplicas(namesystem, block).corruptReplicas());
  } finally {
    cluster.shutdown();
  }
}
 
Example 3
Source File: TestProcessCorruptBlocks.java    From hadoop with Apache License 2.0 5 votes vote down vote up
private void corruptBlock(MiniDFSCluster cluster, FileSystem fs, final Path fileName,
    int dnIndex, ExtendedBlock block) throws IOException {
  // corrupt the block on datanode dnIndex
  // the indexes change once the nodes are restarted.
  // But the datadirectory will not change
  assertTrue(cluster.corruptReplica(dnIndex, block));

  DataNodeProperties dnProps = cluster.stopDataNode(0);

  // Each datanode has multiple data dirs, check each
  for (int dirIndex = 0; dirIndex < 2; dirIndex++) {
    final String bpid = cluster.getNamesystem().getBlockPoolId();
    File storageDir = cluster.getStorageDir(dnIndex, dirIndex);
    File dataDir = MiniDFSCluster.getFinalizedDir(storageDir, bpid);
    File scanLogFile = new File(dataDir, "dncp_block_verification.log.curr");
    if (scanLogFile.exists()) {
      // wait for one minute for deletion to succeed;
      for (int i = 0; !scanLogFile.delete(); i++) {
        assertTrue("Could not delete log file in one minute", i < 60);
        try {
          Thread.sleep(1000);
        } catch (InterruptedException ignored) {
        }
      }
    }
  }

  // restart the detained so the corrupt replica will be detected
  cluster.restartDataNode(dnProps);
}
 
Example 4
Source File: TestProcessCorruptBlocks.java    From big-c with Apache License 2.0 5 votes vote down vote up
private void corruptBlock(MiniDFSCluster cluster, FileSystem fs, final Path fileName,
    int dnIndex, ExtendedBlock block) throws IOException {
  // corrupt the block on datanode dnIndex
  // the indexes change once the nodes are restarted.
  // But the datadirectory will not change
  assertTrue(cluster.corruptReplica(dnIndex, block));

  DataNodeProperties dnProps = cluster.stopDataNode(0);

  // Each datanode has multiple data dirs, check each
  for (int dirIndex = 0; dirIndex < 2; dirIndex++) {
    final String bpid = cluster.getNamesystem().getBlockPoolId();
    File storageDir = cluster.getStorageDir(dnIndex, dirIndex);
    File dataDir = MiniDFSCluster.getFinalizedDir(storageDir, bpid);
    File scanLogFile = new File(dataDir, "dncp_block_verification.log.curr");
    if (scanLogFile.exists()) {
      // wait for one minute for deletion to succeed;
      for (int i = 0; !scanLogFile.delete(); i++) {
        assertTrue("Could not delete log file in one minute", i < 60);
        try {
          Thread.sleep(1000);
        } catch (InterruptedException ignored) {
        }
      }
    }
  }

  // restart the detained so the corrupt replica will be detected
  cluster.restartDataNode(dnProps);
}
 
Example 5
Source File: TestPendingCorruptDnMessages.java    From big-c with Apache License 2.0 5 votes vote down vote up
private static boolean wipeAndRestartDn(MiniDFSCluster cluster, int dnIndex)
    throws IOException {
  // stop the DN, reformat it, then start it again with the same xfer port.
  DataNodeProperties dnProps = cluster.stopDataNode(dnIndex);
  cluster.formatDataNodeDirs();
  return cluster.restartDataNode(dnProps, true);
}
 
Example 6
Source File: TestProcessCorruptBlocks.java    From big-c with Apache License 2.0 5 votes vote down vote up
/**
 * The corrupt block has to be removed when the number of valid replicas
 * matches replication factor for the file. In this test, the above 
 * condition is achieved by increasing the number of good replicas by 
 * replicating on a new Datanode. 
 * The test strategy : 
 *   Bring up Cluster with 3 DataNodes
 *   Create a file  of replication factor 3
 *   Corrupt one replica of a block of the file 
 *   Verify that there are still 2 good replicas and 1 corrupt replica 
 *     (corrupt replica should not be removed since number of good replicas
 *      (2) is less  than replication factor (3)) 
 *   Start a new data node 
 *   Verify that the a new replica is created and corrupt replica is
 *   removed.
 * 
 */
@Test
public void testByAddingAnExtraDataNode() throws Exception {
  Configuration conf = new HdfsConfiguration();
  conf.setLong(DFSConfigKeys.DFS_BLOCKREPORT_INTERVAL_MSEC_KEY, 1000L);
  conf.set(DFSConfigKeys.DFS_NAMENODE_REPLICATION_PENDING_TIMEOUT_SEC_KEY, Integer.toString(2));
  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).numDataNodes(4).build();
  FileSystem fs = cluster.getFileSystem();
  final FSNamesystem namesystem = cluster.getNamesystem();
  DataNodeProperties dnPropsFourth = cluster.stopDataNode(3);

  try {
    final Path fileName = new Path("/foo1");
    DFSTestUtil.createFile(fs, fileName, 2, (short) 3, 0L);
    DFSTestUtil.waitReplication(fs, fileName, (short) 3);

    ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, fileName);
    corruptBlock(cluster, fs, fileName, 0, block);

    DFSTestUtil.waitReplication(fs, fileName, (short) 2);

    assertEquals(2, countReplicas(namesystem, block).liveReplicas());
    assertEquals(1, countReplicas(namesystem, block).corruptReplicas());

    cluster.restartDataNode(dnPropsFourth);

    DFSTestUtil.waitReplication(fs, fileName, (short) 3);

    assertEquals(3, countReplicas(namesystem, block).liveReplicas());
    assertEquals(0, countReplicas(namesystem, block).corruptReplicas());
  } finally {
    cluster.shutdown();
  }
}
 
Example 7
Source File: TestStandbyIsHot.java    From big-c with Apache License 2.0 4 votes vote down vote up
/**
 * Regression test for HDFS-2795:
 *  - Start an HA cluster with a DN.
 *  - Write several blocks to the FS with replication 1.
 *  - Shutdown the DN
 *  - Wait for the NNs to declare the DN dead. All blocks will be under-replicated.
 *  - Restart the DN.
 * In the bug, the standby node would only very slowly notice the blocks returning
 * to the cluster.
 */
@Test(timeout=60000)
public void testDatanodeRestarts() throws Exception {
  Configuration conf = new Configuration();
  conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, 1024);
  // We read from the standby to watch block locations
  HAUtil.setAllowStandbyReads(conf, true);
  conf.setLong(DFSConfigKeys.DFS_NAMENODE_ACCESSTIME_PRECISION_KEY, 0);
  conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1);
  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
    .nnTopology(MiniDFSNNTopology.simpleHATopology())
    .numDataNodes(1)
    .build();
  try {
    NameNode nn0 = cluster.getNameNode(0);
    NameNode nn1 = cluster.getNameNode(1);

    cluster.transitionToActive(0);
    
    // Create 5 blocks.
    DFSTestUtil.createFile(cluster.getFileSystem(0), 
        TEST_FILE_PATH, 5*1024, (short)1, 1L);
    
    HATestUtil.waitForStandbyToCatchUp(nn0, nn1);
    
    // Stop the DN.
    DataNode dn = cluster.getDataNodes().get(0);
    String dnName = dn.getDatanodeId().getXferAddr(); 
    DataNodeProperties dnProps = cluster.stopDataNode(0);
    
    // Make sure both NNs register it as dead.
    BlockManagerTestUtil.noticeDeadDatanode(nn0, dnName);
    BlockManagerTestUtil.noticeDeadDatanode(nn1, dnName);
    
    BlockManagerTestUtil.updateState(nn0.getNamesystem().getBlockManager());
    BlockManagerTestUtil.updateState(nn1.getNamesystem().getBlockManager());
    assertEquals(5, nn0.getNamesystem().getUnderReplicatedBlocks());
    
    // The SBN will not have any blocks in its neededReplication queue
    // since the SBN doesn't process replication.
    assertEquals(0, nn1.getNamesystem().getUnderReplicatedBlocks());
    
    LocatedBlocks locs = nn1.getRpcServer().getBlockLocations(
        TEST_FILE, 0, 1);
    assertEquals("Standby should have registered that the block has no replicas",
        0, locs.get(0).getLocations().length);
    
    cluster.restartDataNode(dnProps);
    // Wait for both NNs to re-register the DN.
    cluster.waitActive(0);
    cluster.waitActive(1);
    
    BlockManagerTestUtil.updateState(nn0.getNamesystem().getBlockManager());
    BlockManagerTestUtil.updateState(nn1.getNamesystem().getBlockManager());
    assertEquals(0, nn0.getNamesystem().getUnderReplicatedBlocks());
    assertEquals(0, nn1.getNamesystem().getUnderReplicatedBlocks());
    
    locs = nn1.getRpcServer().getBlockLocations(
        TEST_FILE, 0, 1);
    assertEquals("Standby should have registered that the block has replicas again",
        1, locs.get(0).getLocations().length);
  } finally {
    cluster.shutdown();
  }
}
 
Example 8
Source File: TestNodeCount.java    From RDFS with Apache License 2.0 4 votes vote down vote up
public void testNodeCount() throws Exception {
  // start a mini dfs cluster of 2 nodes
  final Configuration conf = new Configuration();
  conf.setInt("dfs.replication.interval", 10);
  final short REPLICATION_FACTOR = (short)2;
  final MiniDFSCluster cluster = 
    new MiniDFSCluster(conf, REPLICATION_FACTOR, true, null);
  try {
    final FSNamesystem namesystem = cluster.getNameNode().namesystem;
    final FileSystem fs = cluster.getFileSystem();
    
    // populate the cluster with a one block file
    final Path FILE_PATH = new Path("/testfile");
    DFSTestUtil.createFile(fs, FILE_PATH, 1L, REPLICATION_FACTOR, 1L);
    DFSTestUtil.waitReplication(fs, FILE_PATH, REPLICATION_FACTOR);
    Block block = DFSTestUtil.getFirstBlock(fs, FILE_PATH);

    // keep a copy of all datanode descriptor
    DatanodeDescriptor[] datanodes = (DatanodeDescriptor[])
       namesystem.heartbeats.toArray(new DatanodeDescriptor[REPLICATION_FACTOR]);
    
    // start two new nodes
    cluster.startDataNodes(conf, 2, true, null, null);
    cluster.waitActive(false);
    
    LOG.info("Bringing down first DN");
    // bring down first datanode
    DatanodeDescriptor datanode = datanodes[0];
    DataNodeProperties dnprop = cluster.stopDataNode(datanode.getName());
    // make sure that NN detects that the datanode is down
    synchronized (namesystem.heartbeats) {
      datanode.setLastUpdate(0); // mark it dead
      namesystem.heartbeatCheck();
    }

    LOG.info("Waiting for block to be replicated");
    // the block will be replicated
    DFSTestUtil.waitReplication(fs, FILE_PATH, REPLICATION_FACTOR);

    LOG.info("Restarting first datanode");
    // restart the first datanode
    cluster.restartDataNode(dnprop);
    cluster.waitActive(false);

    LOG.info("Waiting for excess replicas to be detected");
    
    // check if excessive replica is detected
    waitForExcessReplicasToChange(namesystem, block, 1);

    LOG.info("Finding a non-excess node");

    // find out a non-excess node
    Iterator<DatanodeDescriptor> iter = namesystem.blocksMap.nodeIterator(block);
    DatanodeDescriptor nonExcessDN = null;
    while (iter.hasNext()) {
      DatanodeDescriptor dn = iter.next();
      Collection<Block> blocks = namesystem.excessReplicateMap.get(dn.getStorageID());
      if (blocks == null || !blocks.contains(block) ) {
        nonExcessDN = dn;
        break;
      }
    }
    assertTrue(nonExcessDN!=null);

    LOG.info("Stopping non-excess node: " + nonExcessDN);
    // bring down non excessive datanode
    dnprop = cluster.stopDataNode(nonExcessDN.getName());
    // make sure that NN detects that the datanode is down
    synchronized (namesystem.heartbeats) {
      nonExcessDN.setLastUpdate(0); // mark it dead
      namesystem.heartbeatCheck();
    }
    
    LOG.info("Waiting for live replicas to hit repl factor");
    // The block should be replicated
    NumberReplicas num;
    do {
     namesystem.readLock();
     try {
       num = namesystem.countNodes(block);
     } finally {
       namesystem.readUnlock();
     }
    } while (num.liveReplicas() != REPLICATION_FACTOR);
    
    LOG.info("Restarting first DN");
    // restart the first datanode
    cluster.restartDataNode(dnprop);
    cluster.waitActive(false);
    // check if excessive replica is detected
    LOG.info("Waiting for excess replicas to be detected");
    waitForExcessReplicasToChange(namesystem, block, 2);
  } finally {
    cluster.shutdown();
  }
}
 
Example 9
Source File: TestUnderReplicatedBlocks.java    From RDFS with Apache License 2.0 4 votes vote down vote up
public void testUnderReplicationWithDecommissionDataNode() throws Exception {
  final Configuration conf = new Configuration();
  final short REPLICATION_FACTOR = (short)1;
  File f = new File(HOST_FILE_PATH);
  if (f.exists()) {
    f.delete();
  }
  conf.set("dfs.hosts.exclude", HOST_FILE_PATH);
  LOG.info("Start the cluster");
  final MiniDFSCluster cluster = 
    new MiniDFSCluster(conf, REPLICATION_FACTOR, true, null);
  try {
    final FSNamesystem namesystem = cluster.getNameNode().namesystem;
    final FileSystem fs = cluster.getFileSystem();
    DatanodeDescriptor[] datanodes = (DatanodeDescriptor[])
          namesystem.heartbeats.toArray(
              new DatanodeDescriptor[REPLICATION_FACTOR]);
    assertEquals(1, datanodes.length);
    // populate the cluster with a one block file
    final Path FILE_PATH = new Path("/testfile2");
    DFSTestUtil.createFile(fs, FILE_PATH, 1L, REPLICATION_FACTOR, 1L);
    DFSTestUtil.waitReplication(fs, FILE_PATH, REPLICATION_FACTOR);
    Block block = DFSTestUtil.getFirstBlock(fs, FILE_PATH);

    // shutdown the datanode
    DataNodeProperties dnprop = shutdownDataNode(cluster, datanodes[0]);
    assertEquals(1, namesystem.getMissingBlocksCount()); // one missing block
    assertEquals(0, namesystem.getNonCorruptUnderReplicatedBlocks());

    // Make the only datanode to be decommissioned
    LOG.info("Decommission the datanode " + dnprop);
    addToExcludeFile(namesystem.getConf(), datanodes);
    namesystem.refreshNodes(namesystem.getConf());      
    
    // bring up the datanode
    cluster.restartDataNode(dnprop);

    // Wait for block report
    LOG.info("wait for its block report to come in");
    NumberReplicas num;
    long startTime = System.currentTimeMillis();
    do {
     namesystem.readLock();
     try {
       num = namesystem.countNodes(block);
     } finally {
       namesystem.readUnlock();
     }
     Thread.sleep(1000);
     LOG.info("live: " + num.liveReplicas() 
         + "Decom: " + num.decommissionedReplicas());
    } while (num.decommissionedReplicas() != 1 &&
        System.currentTimeMillis() - startTime < 30000);
    assertEquals("Decommissioning Replicas doesn't reach 1", 
        1, num.decommissionedReplicas());
    assertEquals(1, namesystem.getNonCorruptUnderReplicatedBlocks());
    assertEquals(0, namesystem.getMissingBlocksCount());
  } finally {
    cluster.shutdown();
  }
}
 
Example 10
Source File: TestDataNodeMultipleRegistrations.java    From big-c with Apache License 2.0 4 votes vote down vote up
@Test
public void testDNWithInvalidStorageWithHA() throws Exception {
  MiniDFSNNTopology top = new MiniDFSNNTopology()
    .addNameservice(new MiniDFSNNTopology.NSConf("ns1")
      .addNN(new MiniDFSNNTopology.NNConf("nn0").setClusterId("cluster-1"))
      .addNN(new MiniDFSNNTopology.NNConf("nn1").setClusterId("cluster-1")));

  top.setFederation(true);

  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).nnTopology(top)
      .numDataNodes(0).build();
  try {
    cluster.startDataNodes(conf, 1, true, null, null);
    // let the initialization be complete
    Thread.sleep(10000);
    DataNode dn = cluster.getDataNodes().get(0);
    assertTrue("Datanode should be running", dn.isDatanodeUp());
    assertEquals("BPOfferService should be running", 1,
        dn.getAllBpOs().length);
    DataNodeProperties dnProp = cluster.stopDataNode(0);

    cluster.getNameNode(0).stop();
    cluster.getNameNode(1).stop();
    Configuration nn1 = cluster.getConfiguration(0);
    Configuration nn2 = cluster.getConfiguration(1);
    // setting up invalid cluster
    StartupOption.FORMAT.setClusterId("cluster-2");
    DFSTestUtil.formatNameNode(nn1);
    MiniDFSCluster.copyNameDirs(FSNamesystem.getNamespaceDirs(nn1),
        FSNamesystem.getNamespaceDirs(nn2), nn2);
    cluster.restartNameNode(0, false);
    cluster.restartNameNode(1, false);
    cluster.restartDataNode(dnProp);
    
    // let the initialization be complete
    Thread.sleep(10000);
    dn = cluster.getDataNodes().get(0);
    assertFalse("Datanode should have shutdown as only service failed",
        dn.isDatanodeUp());
  } finally {
    cluster.shutdown();
  }
}
 
Example 11
Source File: TestOverReplicatedBlocks.java    From big-c with Apache License 2.0 4 votes vote down vote up
/** Test processOverReplicatedBlock can handle corrupt replicas fine.
 * It make sure that it won't treat corrupt replicas as valid ones 
 * thus prevents NN deleting valid replicas but keeping
 * corrupt ones.
 */
@Test
public void testProcesOverReplicateBlock() throws Exception {
  Configuration conf = new HdfsConfiguration();
  conf.setLong(DFSConfigKeys.DFS_DATANODE_SCAN_PERIOD_HOURS_KEY, 100L);
  conf.setLong(DFSConfigKeys.DFS_BLOCKREPORT_INTERVAL_MSEC_KEY, 1000L);
  conf.set(
      DFSConfigKeys.DFS_NAMENODE_REPLICATION_PENDING_TIMEOUT_SEC_KEY,
      Integer.toString(2));
  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
  FileSystem fs = cluster.getFileSystem();

  try {
    final Path fileName = new Path("/foo1");
    DFSTestUtil.createFile(fs, fileName, 2, (short)3, 0L);
    DFSTestUtil.waitReplication(fs, fileName, (short)3);
    
    // corrupt the block on datanode 0
    ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, fileName);
    assertTrue(cluster.corruptReplica(0, block));
    DataNodeProperties dnProps = cluster.stopDataNode(0);
    // remove block scanner log to trigger block scanning
    File scanCursor = new File(new File(MiniDFSCluster.getFinalizedDir(
        cluster.getInstanceStorageDir(0, 0),
        cluster.getNamesystem().getBlockPoolId()).getParent()).getParent(),
        "scanner.cursor");
    //wait for one minute for deletion to succeed;
    for(int i = 0; !scanCursor.delete(); i++) {
      assertTrue("Could not delete " + scanCursor.getAbsolutePath() +
          " in one minute", i < 60);
      try {
        Thread.sleep(1000);
      } catch (InterruptedException ignored) {}
    }
    
    // restart the datanode so the corrupt replica will be detected
    cluster.restartDataNode(dnProps);
    DFSTestUtil.waitReplication(fs, fileName, (short)2);
    
    String blockPoolId = cluster.getNamesystem().getBlockPoolId();
    final DatanodeID corruptDataNode = 
      DataNodeTestUtils.getDNRegistrationForBP(
          cluster.getDataNodes().get(2), blockPoolId);
       
    final FSNamesystem namesystem = cluster.getNamesystem();
    final BlockManager bm = namesystem.getBlockManager();
    final HeartbeatManager hm = bm.getDatanodeManager().getHeartbeatManager();
    try {
      namesystem.writeLock();
      synchronized(hm) {
        // set live datanode's remaining space to be 0 
        // so they will be chosen to be deleted when over-replication occurs
        String corruptMachineName = corruptDataNode.getXferAddr();
        for (DatanodeDescriptor datanode : hm.getDatanodes()) {
          if (!corruptMachineName.equals(datanode.getXferAddr())) {
            datanode.getStorageInfos()[0].setUtilizationForTesting(100L, 100L, 0, 100L);
            datanode.updateHeartbeat(
                BlockManagerTestUtil.getStorageReportsForDatanode(datanode),
                0L, 0L, 0, 0, null);
          }
        }

        // decrease the replication factor to 1; 
        NameNodeAdapter.setReplication(namesystem, fileName.toString(), (short)1);

        // corrupt one won't be chosen to be excess one
        // without 4910 the number of live replicas would be 0: block gets lost
        assertEquals(1, bm.countNodes(block.getLocalBlock()).liveReplicas());
      }
    } finally {
      namesystem.writeUnlock();
    }
    
  } finally {
    cluster.shutdown();
  }
}
 
Example 12
Source File: TestBlocksWithNotEnoughRacks.java    From big-c with Apache License 2.0 4 votes vote down vote up
@Test
public void testCorruptBlockRereplicatedAcrossRacks() throws Exception {
  Configuration conf = getConf();
  short REPLICATION_FACTOR = 2;
  int fileLen = 512;
  final Path filePath = new Path("/testFile");
  // Datanodes are spread across two racks
  String racks[] = {"/rack1", "/rack1", "/rack2", "/rack2"};
  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
    .numDataNodes(racks.length).racks(racks).build();
  final FSNamesystem ns = cluster.getNameNode().getNamesystem();

  try {
    // Create a file with one block with a replication factor of 2
    final FileSystem fs = cluster.getFileSystem();
    
    DFSTestUtil.createFile(fs, filePath, fileLen, REPLICATION_FACTOR, 1L);
    final String fileContent = DFSTestUtil.readFile(fs, filePath);

    ExtendedBlock b = DFSTestUtil.getFirstBlock(fs, filePath);
    DFSTestUtil.waitForReplication(cluster, b, 2, REPLICATION_FACTOR, 0);

    // Corrupt a replica of the block
    int dnToCorrupt = DFSTestUtil.firstDnWithBlock(cluster, b);
    assertTrue(cluster.corruptReplica(dnToCorrupt, b));

    // Restart the datanode so blocks are re-scanned, and the corrupt
    // block is detected.
    cluster.restartDataNode(dnToCorrupt);

    // Wait for the namenode to notice the corrupt replica
    DFSTestUtil.waitCorruptReplicas(fs, ns, filePath, b, 1);

    // The rack policy is still respected
    DFSTestUtil.waitForReplication(cluster, b, 2, REPLICATION_FACTOR, 0);

    // Ensure all replicas are valid (the corrupt replica may not
    // have been cleaned up yet).
    for (int i = 0; i < racks.length; i++) {
      String blockContent = cluster.readBlockOnDataNode(i, b);
      if (blockContent != null && i != dnToCorrupt) {
        assertEquals("Corrupt replica", fileContent, blockContent);
      }
    }
  } finally {
    cluster.shutdown();
  }
}
 
Example 13
Source File: TestNodeCount.java    From big-c with Apache License 2.0 4 votes vote down vote up
@Test
public void testNodeCount() throws Exception {
  // start a mini dfs cluster of 2 nodes
  final Configuration conf = new HdfsConfiguration();
  final MiniDFSCluster cluster = 
    new MiniDFSCluster.Builder(conf).numDataNodes(REPLICATION_FACTOR).build();
  try {
    final FSNamesystem namesystem = cluster.getNamesystem();
    final BlockManager bm = namesystem.getBlockManager();
    final HeartbeatManager hm = bm.getDatanodeManager().getHeartbeatManager();
    final FileSystem fs = cluster.getFileSystem();
    
    // populate the cluster with a one block file
    final Path FILE_PATH = new Path("/testfile");
    DFSTestUtil.createFile(fs, FILE_PATH, 1L, REPLICATION_FACTOR, 1L);
    DFSTestUtil.waitReplication(fs, FILE_PATH, REPLICATION_FACTOR);
    ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, FILE_PATH);

    // keep a copy of all datanode descriptor
    final DatanodeDescriptor[] datanodes = hm.getDatanodes();
    
    // start two new nodes
    cluster.startDataNodes(conf, 2, true, null, null);
    cluster.waitActive();
    
    // bring down first datanode
    DatanodeDescriptor datanode = datanodes[0];
    DataNodeProperties dnprop = cluster.stopDataNode(datanode.getXferAddr());
    
    // make sure that NN detects that the datanode is down
    BlockManagerTestUtil.noticeDeadDatanode(
        cluster.getNameNode(), datanode.getXferAddr());
    
    // the block will be replicated
    DFSTestUtil.waitReplication(fs, FILE_PATH, REPLICATION_FACTOR);

    // restart the first datanode
    cluster.restartDataNode(dnprop);
    cluster.waitActive();
    
    // check if excessive replica is detected (transient)
    initializeTimeout(TIMEOUT);
    while (countNodes(block.getLocalBlock(), namesystem).excessReplicas() == 0) {
      checkTimeout("excess replicas not detected");
    }
    
    // find out a non-excess node
    DatanodeDescriptor nonExcessDN = null;
    for(DatanodeStorageInfo storage : bm.blocksMap.getStorages(block.getLocalBlock())) {
      final DatanodeDescriptor dn = storage.getDatanodeDescriptor();
      Collection<Block> blocks = bm.excessReplicateMap.get(dn.getDatanodeUuid());
      if (blocks == null || !blocks.contains(block.getLocalBlock()) ) {
        nonExcessDN = dn;
        break;
      }
    }
    assertTrue(nonExcessDN!=null);
    
    // bring down non excessive datanode
    dnprop = cluster.stopDataNode(nonExcessDN.getXferAddr());
    // make sure that NN detects that the datanode is down
    BlockManagerTestUtil.noticeDeadDatanode(
        cluster.getNameNode(), nonExcessDN.getXferAddr());

    // The block should be replicated
    initializeTimeout(TIMEOUT);
    while (countNodes(block.getLocalBlock(), namesystem).liveReplicas() != REPLICATION_FACTOR) {
      checkTimeout("live replica count not correct", 1000);
    }

    // restart the first datanode
    cluster.restartDataNode(dnprop);
    cluster.waitActive();

    // check if excessive replica is detected (transient)
    initializeTimeout(TIMEOUT);
    while (countNodes(block.getLocalBlock(), namesystem).excessReplicas() != 2) {
      checkTimeout("excess replica count not equal to 2");
    }

  } finally {
    cluster.shutdown();
  }
}
 
Example 14
Source File: TestNetworkTopology.java    From hadoop with Apache License 2.0 4 votes vote down vote up
@Test(timeout=180000)
public void testInvalidNetworkTopologiesNotCachedInHdfs() throws Exception {
  // start a cluster
  Configuration conf = new HdfsConfiguration();
  MiniDFSCluster cluster = null;
  try {
    // bad rack topology
    String racks[] = { "/a/b", "/c" };
    String hosts[] = { "foo1.example.com", "foo2.example.com" };
    cluster = new MiniDFSCluster.Builder(conf).numDataNodes(2).
        racks(racks).hosts(hosts).build();
    cluster.waitActive();
    
    NamenodeProtocols nn = cluster.getNameNodeRpc();
    Assert.assertNotNull(nn);
    
    // Wait for one DataNode to register.
    // The other DataNode will not be able to register up because of the rack mismatch.
    DatanodeInfo[] info;
    while (true) {
      info = nn.getDatanodeReport(DatanodeReportType.LIVE);
      Assert.assertFalse(info.length == 2);
      if (info.length == 1) {
        break;
      }
      Thread.sleep(1000);
    }
    // Set the network topology of the other node to the match the network
    // topology of the node that came up.
    int validIdx = info[0].getHostName().equals(hosts[0]) ? 0 : 1;
    int invalidIdx = validIdx == 1 ? 0 : 1;
    StaticMapping.addNodeToRack(hosts[invalidIdx], racks[validIdx]);
    LOG.info("datanode " + validIdx + " came up with network location " + 
      info[0].getNetworkLocation());

    // Restart the DN with the invalid topology and wait for it to register.
    cluster.restartDataNode(invalidIdx);
    Thread.sleep(5000);
    while (true) {
      info = nn.getDatanodeReport(DatanodeReportType.LIVE);
      if (info.length == 2) {
        break;
      }
      if (info.length == 0) {
        LOG.info("got no valid DNs");
      } else if (info.length == 1) {
        LOG.info("got one valid DN: " + info[0].getHostName() +
            " (at " + info[0].getNetworkLocation() + ")");
      }
      Thread.sleep(1000);
    }
    Assert.assertEquals(info[0].getNetworkLocation(),
                        info[1].getNetworkLocation());
  } finally {
    if (cluster != null) {
      cluster.shutdown();
    }
  }
}
 
Example 15
Source File: TestNetworkTopology.java    From big-c with Apache License 2.0 4 votes vote down vote up
@Test(timeout=180000)
public void testInvalidNetworkTopologiesNotCachedInHdfs() throws Exception {
  // start a cluster
  Configuration conf = new HdfsConfiguration();
  MiniDFSCluster cluster = null;
  try {
    // bad rack topology
    String racks[] = { "/a/b", "/c" };
    String hosts[] = { "foo1.example.com", "foo2.example.com" };
    cluster = new MiniDFSCluster.Builder(conf).numDataNodes(2).
        racks(racks).hosts(hosts).build();
    cluster.waitActive();
    
    NamenodeProtocols nn = cluster.getNameNodeRpc();
    Assert.assertNotNull(nn);
    
    // Wait for one DataNode to register.
    // The other DataNode will not be able to register up because of the rack mismatch.
    DatanodeInfo[] info;
    while (true) {
      info = nn.getDatanodeReport(DatanodeReportType.LIVE);
      Assert.assertFalse(info.length == 2);
      if (info.length == 1) {
        break;
      }
      Thread.sleep(1000);
    }
    // Set the network topology of the other node to the match the network
    // topology of the node that came up.
    int validIdx = info[0].getHostName().equals(hosts[0]) ? 0 : 1;
    int invalidIdx = validIdx == 1 ? 0 : 1;
    StaticMapping.addNodeToRack(hosts[invalidIdx], racks[validIdx]);
    LOG.info("datanode " + validIdx + " came up with network location " + 
      info[0].getNetworkLocation());

    // Restart the DN with the invalid topology and wait for it to register.
    cluster.restartDataNode(invalidIdx);
    Thread.sleep(5000);
    while (true) {
      info = nn.getDatanodeReport(DatanodeReportType.LIVE);
      if (info.length == 2) {
        break;
      }
      if (info.length == 0) {
        LOG.info("got no valid DNs");
      } else if (info.length == 1) {
        LOG.info("got one valid DN: " + info[0].getHostName() +
            " (at " + info[0].getNetworkLocation() + ")");
      }
      Thread.sleep(1000);
    }
    Assert.assertEquals(info[0].getNetworkLocation(),
                        info[1].getNetworkLocation());
  } finally {
    if (cluster != null) {
      cluster.shutdown();
    }
  }
}
 
Example 16
Source File: TestDataNodeMultipleRegistrations.java    From hadoop with Apache License 2.0 4 votes vote down vote up
@Test
public void testDNWithInvalidStorageWithHA() throws Exception {
  MiniDFSNNTopology top = new MiniDFSNNTopology()
    .addNameservice(new MiniDFSNNTopology.NSConf("ns1")
      .addNN(new MiniDFSNNTopology.NNConf("nn0").setClusterId("cluster-1"))
      .addNN(new MiniDFSNNTopology.NNConf("nn1").setClusterId("cluster-1")));

  top.setFederation(true);

  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).nnTopology(top)
      .numDataNodes(0).build();
  try {
    cluster.startDataNodes(conf, 1, true, null, null);
    // let the initialization be complete
    Thread.sleep(10000);
    DataNode dn = cluster.getDataNodes().get(0);
    assertTrue("Datanode should be running", dn.isDatanodeUp());
    assertEquals("BPOfferService should be running", 1,
        dn.getAllBpOs().length);
    DataNodeProperties dnProp = cluster.stopDataNode(0);

    cluster.getNameNode(0).stop();
    cluster.getNameNode(1).stop();
    Configuration nn1 = cluster.getConfiguration(0);
    Configuration nn2 = cluster.getConfiguration(1);
    // setting up invalid cluster
    StartupOption.FORMAT.setClusterId("cluster-2");
    DFSTestUtil.formatNameNode(nn1);
    MiniDFSCluster.copyNameDirs(FSNamesystem.getNamespaceDirs(nn1),
        FSNamesystem.getNamespaceDirs(nn2), nn2);
    cluster.restartNameNode(0, false);
    cluster.restartNameNode(1, false);
    cluster.restartDataNode(dnProp);
    
    // let the initialization be complete
    Thread.sleep(10000);
    dn = cluster.getDataNodes().get(0);
    assertFalse("Datanode should have shutdown as only service failed",
        dn.isDatanodeUp());
  } finally {
    cluster.shutdown();
  }
}
 
Example 17
Source File: TestOverReplicatedBlocks.java    From RDFS with Apache License 2.0 4 votes vote down vote up
/** Test processOverReplicatedBlock can handle corrupt replicas fine.
 * It make sure that it won't treat corrupt replicas as valid ones 
 * thus prevents NN deleting valid replicas but keeping
 * corrupt ones.
 */
public void testProcesOverReplicateBlock() throws IOException {
  Configuration conf = new Configuration();
  conf.setLong("dfs.blockreport.intervalMsec", 1000L);
  conf.set("dfs.replication.pending.timeout.sec", Integer.toString(2));
  MiniDFSCluster cluster = new MiniDFSCluster(conf, 3, true, null);
  FileSystem fs = cluster.getFileSystem();

  try {
    int namespaceId = cluster.getNameNode().getNamespaceID();
    final Path fileName = new Path("/foo1");
    DFSTestUtil.createFile(fs, fileName, 2, (short)3, 0L);
    DFSTestUtil.waitReplication(fs, fileName, (short)3);
    
    // corrupt the block on datanode 0
    Block block = DFSTestUtil.getFirstBlock(fs, fileName);
    TestDatanodeBlockScanner.corruptReplica(block.getBlockName(), 0, cluster);
    DataNodeProperties dnProps = cluster.stopDataNode(0);
    // remove block scanner log to trigger block scanning
    File scanLog = new File(cluster.getBlockDirectory("data1").getParent(), "dncp_block_verification.log.curr");
    //wait for one minute for deletion to succeed;
    scanLog.delete();
    
    // restart the datanode so the corrupt replica will be detected
    cluster.restartDataNode(dnProps);
    DFSTestUtil.waitReplication(fs, fileName, (short)2);
    
    final DatanodeID corruptDataNode = 
      cluster.getDataNodes().get(2).getDNRegistrationForNS(namespaceId);
    final FSNamesystem namesystem = cluster.getNameNode().getNamesystem();
    synchronized (namesystem.heartbeats) {
      // set live datanode's remaining space to be 0 
      // so they will be chosen to be deleted when over-replication occurs
      for (DatanodeDescriptor datanode : namesystem.heartbeats) {
        if (!corruptDataNode.equals(datanode)) {
          datanode.updateHeartbeat(100L, 100L, 0L, 100L, 0);
        }
      }
    }
      
    // decrease the replication factor to 1; 
    namesystem.setReplication(fileName.toString(), (short)1);
    waitReplication(namesystem, block, (short)1);
    
    // corrupt one won't be chosen to be excess one
    // without 4910 the number of live replicas would be 0: block gets lost
    assertEquals(1, namesystem.countNodes(block).liveReplicas());

    // Test the case when multiple calls to setReplication still succeeds.
    System.out.println("Starting next test with file foo2.");
    final Path fileName2 = new Path("/foo1");
    DFSTestUtil.createFile(fs, fileName2, 2, (short)3, 0L);
    DFSTestUtil.waitReplication(fs, fileName2, (short)3);
    LocatedBlocks lbs = namesystem.getBlockLocations(
               fileName2.toString(), 0, 10);
    Block firstBlock = lbs.get(0).getBlock();
    namesystem.setReplication(fileName2.toString(), (short)2);
    namesystem.setReplication(fileName2.toString(), (short)1);
    
    // wait upto one minute for excess replicas to get deleted. It is not
    // immediate because excess replicas are being handled asyncronously.
    waitReplication(namesystem, firstBlock, (short)1);
    assertEquals(1, namesystem.countNodes(firstBlock).liveReplicas());
  } finally {
    cluster.shutdown();
  }
}
 
Example 18
Source File: TestBlocksWithNotEnoughRacks.java    From hadoop with Apache License 2.0 4 votes vote down vote up
@Test
public void testCorruptBlockRereplicatedAcrossRacks() throws Exception {
  Configuration conf = getConf();
  short REPLICATION_FACTOR = 2;
  int fileLen = 512;
  final Path filePath = new Path("/testFile");
  // Datanodes are spread across two racks
  String racks[] = {"/rack1", "/rack1", "/rack2", "/rack2"};
  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
    .numDataNodes(racks.length).racks(racks).build();
  final FSNamesystem ns = cluster.getNameNode().getNamesystem();

  try {
    // Create a file with one block with a replication factor of 2
    final FileSystem fs = cluster.getFileSystem();
    
    DFSTestUtil.createFile(fs, filePath, fileLen, REPLICATION_FACTOR, 1L);
    final String fileContent = DFSTestUtil.readFile(fs, filePath);

    ExtendedBlock b = DFSTestUtil.getFirstBlock(fs, filePath);
    DFSTestUtil.waitForReplication(cluster, b, 2, REPLICATION_FACTOR, 0);

    // Corrupt a replica of the block
    int dnToCorrupt = DFSTestUtil.firstDnWithBlock(cluster, b);
    assertTrue(cluster.corruptReplica(dnToCorrupt, b));

    // Restart the datanode so blocks are re-scanned, and the corrupt
    // block is detected.
    cluster.restartDataNode(dnToCorrupt);

    // Wait for the namenode to notice the corrupt replica
    DFSTestUtil.waitCorruptReplicas(fs, ns, filePath, b, 1);

    // The rack policy is still respected
    DFSTestUtil.waitForReplication(cluster, b, 2, REPLICATION_FACTOR, 0);

    // Ensure all replicas are valid (the corrupt replica may not
    // have been cleaned up yet).
    for (int i = 0; i < racks.length; i++) {
      String blockContent = cluster.readBlockOnDataNode(i, b);
      if (blockContent != null && i != dnToCorrupt) {
        assertEquals("Corrupt replica", fileContent, blockContent);
      }
    }
  } finally {
    cluster.shutdown();
  }
}
 
Example 19
Source File: TestNodeCount.java    From hadoop with Apache License 2.0 4 votes vote down vote up
@Test
public void testNodeCount() throws Exception {
  // start a mini dfs cluster of 2 nodes
  final Configuration conf = new HdfsConfiguration();
  final MiniDFSCluster cluster = 
    new MiniDFSCluster.Builder(conf).numDataNodes(REPLICATION_FACTOR).build();
  try {
    final FSNamesystem namesystem = cluster.getNamesystem();
    final BlockManager bm = namesystem.getBlockManager();
    final HeartbeatManager hm = bm.getDatanodeManager().getHeartbeatManager();
    final FileSystem fs = cluster.getFileSystem();
    
    // populate the cluster with a one block file
    final Path FILE_PATH = new Path("/testfile");
    DFSTestUtil.createFile(fs, FILE_PATH, 1L, REPLICATION_FACTOR, 1L);
    DFSTestUtil.waitReplication(fs, FILE_PATH, REPLICATION_FACTOR);
    ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, FILE_PATH);

    // keep a copy of all datanode descriptor
    final DatanodeDescriptor[] datanodes = hm.getDatanodes();
    
    // start two new nodes
    cluster.startDataNodes(conf, 2, true, null, null);
    cluster.waitActive();
    
    // bring down first datanode
    DatanodeDescriptor datanode = datanodes[0];
    DataNodeProperties dnprop = cluster.stopDataNode(datanode.getXferAddr());
    
    // make sure that NN detects that the datanode is down
    BlockManagerTestUtil.noticeDeadDatanode(
        cluster.getNameNode(), datanode.getXferAddr());
    
    // the block will be replicated
    DFSTestUtil.waitReplication(fs, FILE_PATH, REPLICATION_FACTOR);

    // restart the first datanode
    cluster.restartDataNode(dnprop);
    cluster.waitActive();
    
    // check if excessive replica is detected (transient)
    initializeTimeout(TIMEOUT);
    while (countNodes(block.getLocalBlock(), namesystem).excessReplicas() == 0) {
      checkTimeout("excess replicas not detected");
    }
    
    // find out a non-excess node
    DatanodeDescriptor nonExcessDN = null;
    for(DatanodeStorageInfo storage : bm.blocksMap.getStorages(block.getLocalBlock())) {
      final DatanodeDescriptor dn = storage.getDatanodeDescriptor();
      Collection<Block> blocks = bm.excessReplicateMap.get(dn.getDatanodeUuid());
      if (blocks == null || !blocks.contains(block.getLocalBlock()) ) {
        nonExcessDN = dn;
        break;
      }
    }
    assertTrue(nonExcessDN!=null);
    
    // bring down non excessive datanode
    dnprop = cluster.stopDataNode(nonExcessDN.getXferAddr());
    // make sure that NN detects that the datanode is down
    BlockManagerTestUtil.noticeDeadDatanode(
        cluster.getNameNode(), nonExcessDN.getXferAddr());

    // The block should be replicated
    initializeTimeout(TIMEOUT);
    while (countNodes(block.getLocalBlock(), namesystem).liveReplicas() != REPLICATION_FACTOR) {
      checkTimeout("live replica count not correct", 1000);
    }

    // restart the first datanode
    cluster.restartDataNode(dnprop);
    cluster.waitActive();

    // check if excessive replica is detected (transient)
    initializeTimeout(TIMEOUT);
    while (countNodes(block.getLocalBlock(), namesystem).excessReplicas() != 2) {
      checkTimeout("excess replica count not equal to 2");
    }

  } finally {
    cluster.shutdown();
  }
}
 
Example 20
Source File: TestOverReplicatedBlocks.java    From hadoop-gpu with Apache License 2.0 4 votes vote down vote up
/** Test processOverReplicatedBlock can handle corrupt replicas fine.
 * It make sure that it won't treat corrupt replicas as valid ones 
 * thus prevents NN deleting valid replicas but keeping
 * corrupt ones.
 */
public void testProcesOverReplicateBlock() throws IOException {
  Configuration conf = new Configuration();
  conf.setLong("dfs.blockreport.intervalMsec", 1000L);
  conf.set("dfs.replication.pending.timeout.sec", Integer.toString(2));
  MiniDFSCluster cluster = new MiniDFSCluster(conf, 3, true, null);
  FileSystem fs = cluster.getFileSystem();

  try {
    final Path fileName = new Path("/foo1");
    DFSTestUtil.createFile(fs, fileName, 2, (short)3, 0L);
    DFSTestUtil.waitReplication(fs, fileName, (short)3);
    
    // corrupt the block on datanode 0
    Block block = DFSTestUtil.getFirstBlock(fs, fileName);
    TestDatanodeBlockScanner.corruptReplica(block.getBlockName(), 0);
    DataNodeProperties dnProps = cluster.stopDataNode(0);
    // remove block scanner log to trigger block scanning
    File scanLog = new File(System.getProperty("test.build.data"),
        "dfs/data/data1/current/dncp_block_verification.log.curr");
    //wait for one minute for deletion to succeed;
    for(int i=0; !scanLog.delete(); i++) {
      assertTrue("Could not delete log file in one minute", i < 60);
      try {
        Thread.sleep(1000);
      } catch (InterruptedException ignored) {}
    }
    
    // restart the datanode so the corrupt replica will be detected
    cluster.restartDataNode(dnProps);
    DFSTestUtil.waitReplication(fs, fileName, (short)2);
    
    final DatanodeID corruptDataNode = 
      cluster.getDataNodes().get(2).dnRegistration;
    final FSNamesystem namesystem = FSNamesystem.getFSNamesystem();
    synchronized (namesystem.heartbeats) {
      // set live datanode's remaining space to be 0 
      // so they will be chosen to be deleted when over-replication occurs
      for (DatanodeDescriptor datanode : namesystem.heartbeats) {
        if (!corruptDataNode.equals(datanode)) {
          datanode.updateHeartbeat(100L, 100L, 0L, 0);
        }
      }
      
      // decrease the replication factor to 1; 
      namesystem.setReplication(fileName.toString(), (short)1);

      // corrupt one won't be chosen to be excess one
      // without 4910 the number of live replicas would be 0: block gets lost
      assertEquals(1, namesystem.countNodes(block).liveReplicas());
    }
  } finally {
    cluster.shutdown();
  }
}