Java Code Examples for org.apache.hadoop.hdfs.MiniDFSCluster.DataNodeProperties

The following examples show how to use org.apache.hadoop.hdfs.MiniDFSCluster.DataNodeProperties. These examples are extracted from open source projects.
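Most of the examples below follow the same pattern: MiniDFSCluster.stopDataNode() returns a DataNodeProperties handle that captures the stopped datanode and its startup configuration, and MiniDFSCluster.restartDataNode() uses that handle to bring the same datanode back, optionally keeping its transfer port. As a minimal sketch of that pattern (not taken from any of the projects below; it assumes only the MiniDFSCluster builder and restart APIs that the examples themselves use, and the class name is illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.MiniDFSCluster.DataNodeProperties;

public class DataNodeRestartSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new HdfsConfiguration();
    MiniDFSCluster cluster =
        new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
    try {
      cluster.waitActive();
      // Stop the first datanode; the returned DataNodeProperties keeps the
      // datanode's configuration and startup arguments so it can be revived.
      DataNodeProperties dnProps = cluster.stopDataNode(0);
      // ... exercise the cluster while the datanode is down ...
      // Restart the same datanode; the boolean asks the cluster to keep the
      // datanode's original transfer port.
      cluster.restartDataNode(dnProps, true);
      cluster.waitActive();
    } finally {
      cluster.shutdown();
    }
  }
}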
Example 1
Source Project: hbase   Source File: TestAsyncLogRolling.java    License: Apache License 2.0
@Test
public void testLogRollOnDatanodeDeath() throws IOException, InterruptedException {
  dfsCluster.startDataNodes(TEST_UTIL.getConfiguration(), 3, true, null, null);
  tableName = getName();
  Table table = createTestTable(tableName);
  TEST_UTIL.waitUntilAllRegionsAssigned(table.getName());
  doPut(table, 1);
  server = TEST_UTIL.getRSForFirstRegionInTable(table.getName());
  RegionInfo hri = server.getRegions(table.getName()).get(0).getRegionInfo();
  AsyncFSWAL wal = (AsyncFSWAL) server.getWAL(hri);
  int numRolledLogFiles = AsyncFSWALProvider.getNumRolledLogFiles(wal);
  DatanodeInfo[] dnInfos = wal.getPipeline();
  DataNodeProperties dnProp = TEST_UTIL.getDFSCluster().stopDataNode(dnInfos[0].getName());
  TEST_UTIL.getDFSCluster().restartDataNode(dnProp);
  doPut(table, 2);
  assertEquals(numRolledLogFiles + 1, AsyncFSWALProvider.getNumRolledLogFiles(wal));
}
 
Example 2
Source Project: hbase   Source File: TestFanOutOneBlockAsyncDFSOutput.java    License: Apache License 2.0
@Test
public void testConnectToDatanodeFailed()
  throws IOException, ClassNotFoundException, NoSuchMethodException, IllegalAccessException,
  InvocationTargetException, InterruptedException, NoSuchFieldException {
  Field xceiverServerDaemonField = DataNode.class.getDeclaredField("dataXceiverServer");
  xceiverServerDaemonField.setAccessible(true);
  Class<?> xceiverServerClass =
    Class.forName("org.apache.hadoop.hdfs.server.datanode.DataXceiverServer");
  Method numPeersMethod = xceiverServerClass.getDeclaredMethod("getNumPeers");
  numPeersMethod.setAccessible(true);
  // make one datanode broken
  DataNodeProperties dnProp = CLUSTER.stopDataNode(0);
  Path f = new Path("/test");
  EventLoop eventLoop = EVENT_LOOP_GROUP.next();
  try (FanOutOneBlockAsyncDFSOutput output = FanOutOneBlockAsyncDFSOutputHelper.createOutput(FS,
    f, true, false, (short) 3, FS.getDefaultBlockSize(), eventLoop, CHANNEL_CLASS)) {
    // should exclude the dead dn when retry so here we only have 2 DNs in pipeline
    assertEquals(2, output.getPipeline().length);
  } finally {
    CLUSTER.restartDataNode(dnProp);
  }
}
 
Example 3
Source Project: hadoop   Source File: TestRollingUpgrade.java    License: Apache License 2.0
private static void rollbackRollingUpgrade(Path foo, Path bar,
    Path file, byte[] data,
    MiniDFSCluster cluster) throws IOException {
  final DataNodeProperties dnprop = cluster.stopDataNode(0);
  cluster.restartNameNode("-rollingUpgrade", "rollback");
  cluster.restartDataNode(dnprop, true);

  final DistributedFileSystem dfs = cluster.getFileSystem();
  Assert.assertTrue(dfs.exists(foo));
  Assert.assertFalse(dfs.exists(bar));
  AppendTestUtil.checkFullFile(dfs, file, data.length, data);
}
 
Example 4
Source Project: hadoop   Source File: TestPendingCorruptDnMessages.java    License: Apache License 2.0
private static boolean wipeAndRestartDn(MiniDFSCluster cluster, int dnIndex)
    throws IOException {
  // stop the DN, reformat it, then start it again with the same xfer port.
  DataNodeProperties dnProps = cluster.stopDataNode(dnIndex);
  cluster.formatDataNodeDirs();
  return cluster.restartDataNode(dnProps, true);
}
 
Example 5
Source Project: hadoop   Source File: TestProcessCorruptBlocks.java    License: Apache License 2.0
/**
 * The corrupt block has to be removed when the number of valid replicas
 * matches the replication factor for the file. In this test, that
 * condition is achieved by increasing the number of good replicas by
 * replicating on a new datanode.
 * The test strategy:
 *   Bring up a cluster with 3 DataNodes
 *   Create a file with replication factor 3
 *   Corrupt one replica of a block of the file
 *   Verify that there are still 2 good replicas and 1 corrupt replica
 *     (the corrupt replica should not be removed since the number of good
 *      replicas (2) is less than the replication factor (3))
 *   Start a new datanode
 *   Verify that a new replica is created and the corrupt replica is
 *   removed.
 */
@Test
public void testByAddingAnExtraDataNode() throws Exception {
  Configuration conf = new HdfsConfiguration();
  conf.setLong(DFSConfigKeys.DFS_BLOCKREPORT_INTERVAL_MSEC_KEY, 1000L);
  conf.set(DFSConfigKeys.DFS_NAMENODE_REPLICATION_PENDING_TIMEOUT_SEC_KEY, Integer.toString(2));
  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).numDataNodes(4).build();
  FileSystem fs = cluster.getFileSystem();
  final FSNamesystem namesystem = cluster.getNamesystem();
  DataNodeProperties dnPropsFourth = cluster.stopDataNode(3);

  try {
    final Path fileName = new Path("/foo1");
    DFSTestUtil.createFile(fs, fileName, 2, (short) 3, 0L);
    DFSTestUtil.waitReplication(fs, fileName, (short) 3);

    ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, fileName);
    corruptBlock(cluster, fs, fileName, 0, block);

    DFSTestUtil.waitReplication(fs, fileName, (short) 2);

    assertEquals(2, countReplicas(namesystem, block).liveReplicas());
    assertEquals(1, countReplicas(namesystem, block).corruptReplicas());

    cluster.restartDataNode(dnPropsFourth);

    DFSTestUtil.waitReplication(fs, fileName, (short) 3);

    assertEquals(3, countReplicas(namesystem, block).liveReplicas());
    assertEquals(0, countReplicas(namesystem, block).corruptReplicas());
  } finally {
    cluster.shutdown();
  }
}
 
Example 6
Source Project: hadoop   Source File: TestProcessCorruptBlocks.java    License: Apache License 2.0
private void corruptBlock(MiniDFSCluster cluster, FileSystem fs, final Path fileName,
    int dnIndex, ExtendedBlock block) throws IOException {
  // Corrupt the block on datanode dnIndex. The indexes change once the
  // nodes are restarted, but the data directory will not change.
  assertTrue(cluster.corruptReplica(dnIndex, block));

  DataNodeProperties dnProps = cluster.stopDataNode(0);

  // Each datanode has multiple data dirs, check each
  for (int dirIndex = 0; dirIndex < 2; dirIndex++) {
    final String bpid = cluster.getNamesystem().getBlockPoolId();
    File storageDir = cluster.getStorageDir(dnIndex, dirIndex);
    File dataDir = MiniDFSCluster.getFinalizedDir(storageDir, bpid);
    File scanLogFile = new File(dataDir, "dncp_block_verification.log.curr");
    if (scanLogFile.exists()) {
      // wait up to one minute for the deletion to succeed
      for (int i = 0; !scanLogFile.delete(); i++) {
        assertTrue("Could not delete log file in one minute", i < 60);
        try {
          Thread.sleep(1000);
        } catch (InterruptedException ignored) {
        }
      }
    }
  }

  // restart the datanode so the corrupt replica will be detected
  cluster.restartDataNode(dnProps);
}
 
Example 7
Source Project: hadoop   Source File: TestDecommissioningStatus.java    License: Apache License 2.0
/**
 * Verify the support for decommissioning a datanode that is already dead.
 * Under this scenario the datanode should immediately be marked as
 * DECOMMISSIONED.
 */
@Test(timeout=120000)
public void testDecommissionDeadDN() throws Exception {
  Logger log = Logger.getLogger(DecommissionManager.class);
  log.setLevel(Level.DEBUG);
  DatanodeID dnID = cluster.getDataNodes().get(0).getDatanodeId();
  String dnName = dnID.getXferAddr();
  DataNodeProperties stoppedDN = cluster.stopDataNode(0);
  DFSTestUtil.waitForDatanodeState(cluster, dnID.getDatanodeUuid(),
      false, 30000);
  FSNamesystem fsn = cluster.getNamesystem();
  final DatanodeManager dm = fsn.getBlockManager().getDatanodeManager();
  DatanodeDescriptor dnDescriptor = dm.getDatanode(dnID);
  decommissionNode(fsn, localFileSys, dnName);
  dm.refreshNodes(conf);
  BlockManagerTestUtil.recheckDecommissionState(dm);
  assertTrue(dnDescriptor.isDecommissioned());

  // Add the node back
  cluster.restartDataNode(stoppedDN, true);
  cluster.waitActive();

  // Call refreshNodes on FSNamesystem with empty exclude file to remove the
  // datanode from decommissioning list and make it available again.
  writeConfigFile(localFileSys, excludeFile, null);
  dm.refreshNodes(conf);
}
 
Example 8
Source Project: hadoop   Source File: TestDataTransferKeepalive.java    License: Apache License 2.0
/**
 * Test for the case where the client begins to read a long block, but doesn't
 * read bytes off the stream quickly. The datanode should time out sending the
 * chunks and the transceiver should die, even if it has a long keepalive.
 */
@Test(timeout=300000)
public void testSlowReader() throws Exception {
  // Set a client socket cache expiry time much longer than 
  // the datanode-side expiration time.
  final long CLIENT_EXPIRY_MS = 600000L;
  Configuration clientConf = new Configuration(conf);
  clientConf.setLong(DFS_CLIENT_SOCKET_CACHE_EXPIRY_MSEC_KEY, CLIENT_EXPIRY_MS);
  clientConf.set(DFS_CLIENT_CONTEXT, "testSlowReader");
  DistributedFileSystem fs =
      (DistributedFileSystem)FileSystem.get(cluster.getURI(),
          clientConf);
  // Restart the DN with a shorter write timeout.
  DataNodeProperties props = cluster.stopDataNode(0);
  props.conf.setInt(DFS_DATANODE_SOCKET_WRITE_TIMEOUT_KEY,
      WRITE_TIMEOUT);
  props.conf.setInt(DFS_DATANODE_SOCKET_REUSE_KEEPALIVE_KEY,
      120000);
  assertTrue(cluster.restartDataNode(props, true));
  dn = cluster.getDataNodes().get(0);
  // Wait for heartbeats to avoid a startup race where we
  // try to write the block while the DN is still starting.
  cluster.triggerHeartbeats();
  
  DFSTestUtil.createFile(fs, TEST_FILE, 1024*1024*8L, (short)1, 0L);
  FSDataInputStream stm = fs.open(TEST_FILE);
  stm.read();
  assertXceiverCount(1);

  GenericTestUtils.waitFor(new Supplier<Boolean>() {
    public Boolean get() {
      // DN should time out in sendChunks, and this should force
      // the xceiver to exit.
      return getXceiverCountWithoutServer() == 0;
    }
  }, 500, 50000);

  IOUtils.closeStream(stm);
}
 
Example 9
Source Project: big-c   Source File: TestRollingUpgrade.java    License: Apache License 2.0
private static void rollbackRollingUpgrade(Path foo, Path bar,
    Path file, byte[] data,
    MiniDFSCluster cluster) throws IOException {
  final DataNodeProperties dnprop = cluster.stopDataNode(0);
  cluster.restartNameNode("-rollingUpgrade", "rollback");
  cluster.restartDataNode(dnprop, true);

  final DistributedFileSystem dfs = cluster.getFileSystem();
  Assert.assertTrue(dfs.exists(foo));
  Assert.assertFalse(dfs.exists(bar));
  AppendTestUtil.checkFullFile(dfs, file, data.length, data);
}
 
Example 10
Source Project: big-c   Source File: TestPendingCorruptDnMessages.java    License: Apache License 2.0
private static boolean wipeAndRestartDn(MiniDFSCluster cluster, int dnIndex)
    throws IOException {
  // stop the DN, reformat it, then start it again with the same xfer port.
  DataNodeProperties dnProps = cluster.stopDataNode(dnIndex);
  cluster.formatDataNodeDirs();
  return cluster.restartDataNode(dnProps, true);
}
 
Example 11
Source Project: big-c   Source File: TestProcessCorruptBlocks.java    License: Apache License 2.0
/**
 * The corrupt block has to be removed when the number of valid replicas
 * matches the replication factor for the file. In this test, that
 * condition is achieved by increasing the number of good replicas by
 * replicating on a new datanode.
 * The test strategy:
 *   Bring up a cluster with 3 DataNodes
 *   Create a file with replication factor 3
 *   Corrupt one replica of a block of the file
 *   Verify that there are still 2 good replicas and 1 corrupt replica
 *     (the corrupt replica should not be removed since the number of good
 *      replicas (2) is less than the replication factor (3))
 *   Start a new datanode
 *   Verify that a new replica is created and the corrupt replica is
 *   removed.
 */
@Test
public void testByAddingAnExtraDataNode() throws Exception {
  Configuration conf = new HdfsConfiguration();
  conf.setLong(DFSConfigKeys.DFS_BLOCKREPORT_INTERVAL_MSEC_KEY, 1000L);
  conf.set(DFSConfigKeys.DFS_NAMENODE_REPLICATION_PENDING_TIMEOUT_SEC_KEY, Integer.toString(2));
  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).numDataNodes(4).build();
  FileSystem fs = cluster.getFileSystem();
  final FSNamesystem namesystem = cluster.getNamesystem();
  DataNodeProperties dnPropsFourth = cluster.stopDataNode(3);

  try {
    final Path fileName = new Path("/foo1");
    DFSTestUtil.createFile(fs, fileName, 2, (short) 3, 0L);
    DFSTestUtil.waitReplication(fs, fileName, (short) 3);

    ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, fileName);
    corruptBlock(cluster, fs, fileName, 0, block);

    DFSTestUtil.waitReplication(fs, fileName, (short) 2);

    assertEquals(2, countReplicas(namesystem, block).liveReplicas());
    assertEquals(1, countReplicas(namesystem, block).corruptReplicas());

    cluster.restartDataNode(dnPropsFourth);

    DFSTestUtil.waitReplication(fs, fileName, (short) 3);

    assertEquals(3, countReplicas(namesystem, block).liveReplicas());
    assertEquals(0, countReplicas(namesystem, block).corruptReplicas());
  } finally {
    cluster.shutdown();
  }
}
 
Example 12
Source Project: big-c   Source File: TestProcessCorruptBlocks.java    License: Apache License 2.0
private void corruptBlock(MiniDFSCluster cluster, FileSystem fs, final Path fileName,
    int dnIndex, ExtendedBlock block) throws IOException {
  // Corrupt the block on datanode dnIndex. The indexes change once the
  // nodes are restarted, but the data directory will not change.
  assertTrue(cluster.corruptReplica(dnIndex, block));

  DataNodeProperties dnProps = cluster.stopDataNode(0);

  // Each datanode has multiple data dirs, check each
  for (int dirIndex = 0; dirIndex < 2; dirIndex++) {
    final String bpid = cluster.getNamesystem().getBlockPoolId();
    File storageDir = cluster.getStorageDir(dnIndex, dirIndex);
    File dataDir = MiniDFSCluster.getFinalizedDir(storageDir, bpid);
    File scanLogFile = new File(dataDir, "dncp_block_verification.log.curr");
    if (scanLogFile.exists()) {
      // wait up to one minute for the deletion to succeed
      for (int i = 0; !scanLogFile.delete(); i++) {
        assertTrue("Could not delete log file in one minute", i < 60);
        try {
          Thread.sleep(1000);
        } catch (InterruptedException ignored) {
        }
      }
    }
  }

  // restart the datanode so the corrupt replica will be detected
  cluster.restartDataNode(dnProps);
}
 
Example 13
Source Project: big-c   Source File: TestDecommissioningStatus.java    License: Apache License 2.0
/**
 * Verify the support for decommissioning a datanode that is already dead.
 * Under this scenario the datanode should immediately be marked as
 * DECOMMISSIONED.
 */
@Test(timeout=120000)
public void testDecommissionDeadDN() throws Exception {
  Logger log = Logger.getLogger(DecommissionManager.class);
  log.setLevel(Level.DEBUG);
  DatanodeID dnID = cluster.getDataNodes().get(0).getDatanodeId();
  String dnName = dnID.getXferAddr();
  DataNodeProperties stoppedDN = cluster.stopDataNode(0);
  DFSTestUtil.waitForDatanodeState(cluster, dnID.getDatanodeUuid(),
      false, 30000);
  FSNamesystem fsn = cluster.getNamesystem();
  final DatanodeManager dm = fsn.getBlockManager().getDatanodeManager();
  DatanodeDescriptor dnDescriptor = dm.getDatanode(dnID);
  decommissionNode(fsn, localFileSys, dnName);
  dm.refreshNodes(conf);
  BlockManagerTestUtil.recheckDecommissionState(dm);
  assertTrue(dnDescriptor.isDecommissioned());

  // Add the node back
  cluster.restartDataNode(stoppedDN, true);
  cluster.waitActive();

  // Call refreshNodes on FSNamesystem with empty exclude file to remove the
  // datanode from decommissioning list and make it available again.
  writeConfigFile(localFileSys, excludeFile, null);
  dm.refreshNodes(conf);
}
 
Example 14
Source Project: big-c   Source File: TestDataTransferKeepalive.java    License: Apache License 2.0
/**
 * Test for the case where the client begins to read a long block, but doesn't
 * read bytes off the stream quickly. The datanode should time out sending the
 * chunks and the transceiver should die, even if it has a long keepalive.
 */
@Test(timeout=300000)
public void testSlowReader() throws Exception {
  // Set a client socket cache expiry time much longer than 
  // the datanode-side expiration time.
  final long CLIENT_EXPIRY_MS = 600000L;
  Configuration clientConf = new Configuration(conf);
  clientConf.setLong(DFS_CLIENT_SOCKET_CACHE_EXPIRY_MSEC_KEY, CLIENT_EXPIRY_MS);
  clientConf.set(DFS_CLIENT_CONTEXT, "testSlowReader");
  DistributedFileSystem fs =
      (DistributedFileSystem)FileSystem.get(cluster.getURI(),
          clientConf);
  // Restart the DN with a shorter write timeout.
  DataNodeProperties props = cluster.stopDataNode(0);
  props.conf.setInt(DFS_DATANODE_SOCKET_WRITE_TIMEOUT_KEY,
      WRITE_TIMEOUT);
  props.conf.setInt(DFS_DATANODE_SOCKET_REUSE_KEEPALIVE_KEY,
      120000);
  assertTrue(cluster.restartDataNode(props, true));
  dn = cluster.getDataNodes().get(0);
  // Wait for heartbeats to avoid a startup race where we
  // try to write the block while the DN is still starting.
  cluster.triggerHeartbeats();
  
  DFSTestUtil.createFile(fs, TEST_FILE, 1024*1024*8L, (short)1, 0L);
  FSDataInputStream stm = fs.open(TEST_FILE);
  stm.read();
  assertXceiverCount(1);

  GenericTestUtils.waitFor(new Supplier<Boolean>() {
    public Boolean get() {
      // DN should time out in sendChunks, and this should force
      // the xceiver to exit.
      return getXceiverCountWithoutServer() == 0;
    }
  }, 500, 50000);

  IOUtils.closeStream(stm);
}
 
Example 15
Source Project: RDFS   Source File: TestFileAppend4.java    License: Apache License 2.0
private void runDNRestartCorruptType(CorruptionType corrupt) throws Exception {
  cluster = new MiniDFSCluster(conf, 3, true, null);
  FileSystem fs1 = cluster.getFileSystem();
  try {
    short rep = 3; // replication
    assertTrue(BLOCK_SIZE%4 == 0);

    file1 = new Path("/dnDeath.dat");

    // write 1/2 block & close
    stm = fs1.create(file1, true, 1024, rep, 4096);
    AppendTestUtil.write(stm, 0, 1024);
    stm.sync();
    loseLeases(fs1);

    DFSOutputStream dfso = (DFSOutputStream)stm.getWrappedStream();
    dfso.abortForTests();

    // close the primary DN
    DataNodeProperties badDN = cluster.stopDataNode(0);

    // Truncate the block on the primary DN
    corruptDataNode(0, corrupt);

    // Start the DN back up
    cluster.restartDataNode(badDN);

    // Recover the lease
    FileSystem fs2 = AppendTestUtil.createHdfsWithDifferentUsername(fs1.getConf());
    recoverFile(fs2);

    assertFileSize(fs2, 1024);
    checkFile(fs2, 1024);
  } finally {
    // explicitly do not shut down fs1, since it's been frozen up by
    // killing the DataStreamer and not allowing recovery
    cluster.shutdown();
  }
}
 
Example 16
Source Project: RDFS   Source File: TestFileAppend4.java    License: Apache License 2.0
/**
 * Test that the restart of a DN and the subsequent pipeline recovery do not cause
 * a file to become prematurely considered "complete", when it's a fresh file
 * with no .append() called.
 */
public void testNotPrematurelyCompleteWithFailureNotReopened() throws Exception {
  LOG.info("START");
  cluster = new MiniDFSCluster(conf, 3, true, null);
  NameNode nn = cluster.getNameNode();
  FileSystem fs1 = cluster.getFileSystem();
  try {
    short rep = 3; // replication

    file1 = new Path("/delayedReceiveBlock");

    stm = fs1.create(file1, true, (int)BLOCK_SIZE*2, rep, 64*1024*1024);
    LOG.info("======== Writing");
    AppendTestUtil.write(stm, 0, 1024*1024);

    LOG.info("======== Waiting for a block allocation");
    waitForBlockReplication(fs1, "/delayedReceiveBlock", 0, 3000);

    LOG.info("======== Checking not complete");
    assertFalse(NameNodeAdapter.checkFileProgress(nn.namesystem, "/delayedReceiveBlock", true));

    // Stop one of the DNs, don't restart
    MiniDFSCluster.DataNodeProperties dnprops = cluster.stopDataNode(0);

    // Write some more data
    AppendTestUtil.write(stm, 0, 1024*1024);

    // Make sure we don't see the file as complete
    LOG.info("======== Checking progress");
    assertFalse(NameNodeAdapter.checkFileProgress(nn.namesystem, "/delayedReceiveBlock", true));
    LOG.info("======== Closing");
    stm.close();

  } finally {
    LOG.info("======== Cleaning up");
    fs1.close();
    cluster.shutdown();
  }
}
 
Example 17
Source Project: RDFS   Source File: TestUnderReplicatedBlocks.java    License: Apache License 2.0
private DataNodeProperties shutdownDataNode(MiniDFSCluster cluster, DatanodeDescriptor datanode) {
  LOG.info("shutdown datanode: " + datanode.getName());
  DataNodeProperties dnprop = cluster.stopDataNode(datanode.getName());
  FSNamesystem namesystem = cluster.getNameNode().namesystem;
  // make sure that NN detects that the datanode is down
  synchronized (namesystem.heartbeats) {
    datanode.setLastUpdate(0); // mark it dead
    namesystem.heartbeatCheck();
  }
  return dnprop;
}
 
Example 18
Source Project: hadoop   Source File: TestLeaseRecovery.java    License: Apache License 2.0
/**
 * Block recovery when the meta file does not have CRCs for all chunks in
 * the block file.
 */
@Test
public void testBlockRecoveryWithLessMetafile() throws Exception {
  Configuration conf = new Configuration();
  conf.set(DFSConfigKeys.DFS_BLOCK_LOCAL_PATH_ACCESS_USER_KEY,
      UserGroupInformation.getCurrentUser().getShortUserName());
  cluster = new MiniDFSCluster.Builder(conf).numDataNodes(1).build();
  Path file = new Path("/testRecoveryFile");
  DistributedFileSystem dfs = cluster.getFileSystem();
  FSDataOutputStream out = dfs.create(file);
  int count = 0;
  while (count < 2 * 1024 * 1024) {
    out.writeBytes("Data");
    count += 4;
  }
  out.hsync();
  // abort the original stream
  ((DFSOutputStream) out.getWrappedStream()).abort();

  LocatedBlocks locations = cluster.getNameNodeRpc().getBlockLocations(
      file.toString(), 0, count);
  ExtendedBlock block = locations.get(0).getBlock();
  DataNode dn = cluster.getDataNodes().get(0);
  BlockLocalPathInfo localPathInfo = dn.getBlockLocalPathInfo(block, null);
  File metafile = new File(localPathInfo.getMetaPath());
  assertTrue(metafile.exists());

  // reduce the block meta file size
  RandomAccessFile raf = new RandomAccessFile(metafile, "rw");
  raf.setLength(metafile.length() - 20);
  raf.close();

  // restart the DN so the replica becomes RWR (ReplicaWaitingToBeRecovered)
  DataNodeProperties dnProp = cluster.stopDataNode(0);
  cluster.restartDataNode(dnProp, true);

  // try to recover the lease
  DistributedFileSystem newdfs = (DistributedFileSystem) FileSystem
      .newInstance(cluster.getConfiguration(0));
  count = 0;
  while (++count < 10 && !newdfs.recoverLease(file)) {
    Thread.sleep(1000);
  }
  assertTrue("File should be closed", newdfs.recoverLease(file));

}
 
Example 19
Source Project: hadoop   Source File: TestFileAppend.java    License: Apache License 2.0
/**
 * Old replica of the block should not be accepted as valid for append/read
 */
@Test
public void testFailedAppendBlockRejection() throws Exception {
  Configuration conf = new HdfsConfiguration();
  conf.set("dfs.client.block.write.replace-datanode-on-failure.enable",
      "false");
  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3)
      .build();
  DistributedFileSystem fs = null;
  try {
    fs = cluster.getFileSystem();
    Path path = new Path("/test");
    FSDataOutputStream out = fs.create(path);
    out.writeBytes("hello\n");
    out.close();

    // stop one datanode
    DataNodeProperties dnProp = cluster.stopDataNode(0);
    String dnAddress = dnProp.datanode.getXferAddress().toString();
    if (dnAddress.startsWith("/")) {
      dnAddress = dnAddress.substring(1);
    }

    // append again to bump genstamps
    for (int i = 0; i < 2; i++) {
      out = fs.append(path);
      out.writeBytes("helloagain\n");
      out.close();
    }

    // re-open to make the block state under construction
    out = fs.append(path);
    cluster.restartDataNode(dnProp, true);
    // wait till the block report comes
    Thread.sleep(2000);
    // check the block locations; they should not contain the restarted datanode
    BlockLocation[] locations = fs.getFileBlockLocations(path, 0,
        Long.MAX_VALUE);
    String[] names = locations[0].getNames();
    for (String node : names) {
      if (node.equals(dnAddress)) {
        fail("Failed append should not be present in latest block locations.");
      }
    }
    out.close();
  } finally {
    IOUtils.closeStream(fs);
    cluster.shutdown();
  }
}
 
Example 20
Source Project: hadoop   Source File: TestDFSClientExcludedNodes.java    License: Apache License 2.0
@Test(timeout=60000)
public void testExcludedNodesForgiveness() throws IOException {
  // Forgive nodes in under 2.5s for this test case.
  conf.setLong(
      DFSConfigKeys.DFS_CLIENT_WRITE_EXCLUDE_NODES_CACHE_EXPIRY_INTERVAL,
      2500);
  // We'll be using a 512-byte block size just for these tests,
  // so make sure the bytes-per-checksum setting matches it.
  conf.setInt("io.bytes.per.checksum", 512);
  cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
  List<DataNodeProperties> props = cluster.dataNodes;
  FileSystem fs = cluster.getFileSystem();
  Path filePath = new Path("/testForgivingExcludedNodes");

  // 256 bytes data chunk for writes
  byte[] bytes = new byte[256];
  for (int index=0; index<bytes.length; index++) {
    bytes[index] = '0';
  }

  // File with a 512 bytes block size
  FSDataOutputStream out = fs.create(filePath, true, 4096, (short) 3, 512);

  // Write a block to all 3 DNs (2x256bytes).
  out.write(bytes);
  out.write(bytes);
  out.hflush();

  // Remove two DNs, to put them into the exclude list.
  DataNodeProperties two = cluster.stopDataNode(2);
  DataNodeProperties one = cluster.stopDataNode(1);

  // Write another block.
  // At this point, we have two nodes already in excluded list.
  out.write(bytes);
  out.write(bytes);
  out.hflush();

  // Bring back the older DNs, since they are going to be forgiven only
  // after this previous block write.
  Assert.assertEquals(true, cluster.restartDataNode(one, true));
  Assert.assertEquals(true, cluster.restartDataNode(two, true));
  cluster.waitActive();

  // Sleep for 5s to let the excluded nodes expire from the excludes
  // list (i.e. be forgiven after the configured wait period).
  // [Sleeping just in case the restart of the DNs completed in under 5s,
  // since otherwise we'd end up quickly excluding those again.]
  ThreadUtil.sleepAtLeastIgnoreInterrupts(5000);

  // Terminate the last good DN, so that the write below can succeed only
  // if the other two DNs have been forgiven by now.
  cluster.stopDataNode(0);

  try {
    // Attempt writing another block, which should still pass
    // because the previous two DNs should have been forgiven by now,
    // while the last good DN gets added to the excludes list this time.
    out.write(bytes);
    out.hflush();
    out.close();
  } catch (Exception e) {
    fail("Excluded DataNodes should be forgiven after a while and " +
         "not cause file writing exception of: '" + e.getMessage() + "'");
  }
}
 
Example 21
Source Project: hadoop   Source File: TestSafeMode.java    License: Apache License 2.0
/**
 * Test that the NN initializes its under-replicated blocks queue
 * before it is ready to exit safemode (HDFS-1476)
 */
@Test(timeout=45000)
public void testInitializeReplQueuesEarly() throws Exception {
  LOG.info("Starting testInitializeReplQueuesEarly");
  // Spray the blocks around the cluster when we add DNs instead of
  // concentrating all blocks on the first node.
  BlockManagerTestUtil.setWritingPrefersLocalNode(
      cluster.getNamesystem().getBlockManager(), false);
  
  cluster.startDataNodes(conf, 2, true, StartupOption.REGULAR, null);
  cluster.waitActive();

  LOG.info("Creating files");
  DFSTestUtil.createFile(fs, TEST_PATH, 15*BLOCK_SIZE, (short)1, 1L);
  
  LOG.info("Stopping all DataNodes");
  List<DataNodeProperties> dnprops = Lists.newLinkedList();
  dnprops.add(cluster.stopDataNode(0));
  dnprops.add(cluster.stopDataNode(0));
  dnprops.add(cluster.stopDataNode(0));
  
  cluster.getConfiguration(0).setFloat(
      DFSConfigKeys.DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY, 1f/15f);
  
  LOG.info("Restarting NameNode");
  cluster.restartNameNode();
  final NameNode nn = cluster.getNameNode();
  
  String status = nn.getNamesystem().getSafemode();
  assertEquals("Safe mode is ON. The reported blocks 0 needs additional " +
      "15 blocks to reach the threshold 0.9990 of total blocks 15." + NEWLINE +
      "The number of live datanodes 0 has reached the minimum number 0. " +
      "Safe mode will be turned off automatically once the thresholds " +
      "have been reached.", status);
  assertFalse("Mis-replicated block queues should not be initialized " +
      "until threshold is crossed",
      NameNodeAdapter.safeModeInitializedReplQueues(nn));
  
  LOG.info("Restarting one DataNode");
  cluster.restartDataNode(dnprops.remove(0));

  // Wait for block reports from all attached storages of
  // the restarted DN to come in.
  GenericTestUtils.waitFor(new Supplier<Boolean>() {
    @Override
    public Boolean get() {
      return getLongCounter("StorageBlockReportOps", getMetrics(NN_METRICS)) ==
          cluster.getStoragesPerDatanode();
    }
  }, 10, 10000);

  final int safe = NameNodeAdapter.getSafeModeSafeBlocks(nn);
  assertTrue("Expected first block report to make some blocks safe.", safe > 0);
  assertTrue("Did not expect first block report to make all blocks safe.", safe < 15);

  assertTrue(NameNodeAdapter.safeModeInitializedReplQueues(nn));

  // Ensure that UnderReplicatedBlocks goes up to 15 - safe. Misreplicated
  // blocks are processed asynchronously so this may take a few seconds.
  // Failure here will manifest as a test timeout.
  BlockManagerTestUtil.updateState(nn.getNamesystem().getBlockManager());
  long underReplicatedBlocks = nn.getNamesystem().getUnderReplicatedBlocks();
  while (underReplicatedBlocks != (15 - safe)) {
    LOG.info("UnderReplicatedBlocks expected=" + (15 - safe) +
             ", actual=" + underReplicatedBlocks);
    Thread.sleep(100);
    BlockManagerTestUtil.updateState(nn.getNamesystem().getBlockManager());
    underReplicatedBlocks = nn.getNamesystem().getUnderReplicatedBlocks();
  }
  
  cluster.restartDataNodes();
}
 
Example 22
Source Project: hadoop   Source File: TestStandbyIsHot.java    License: Apache License 2.0
/**
 * Regression test for HDFS-2795:
 *  - Start an HA cluster with a DN.
 *  - Write several blocks to the FS with replication 1.
 *  - Shutdown the DN
 *  - Wait for the NNs to declare the DN dead. All blocks will be under-replicated.
 *  - Restart the DN.
 * In the bug, the standby node would only very slowly notice the blocks returning
 * to the cluster.
 */
@Test(timeout=60000)
public void testDatanodeRestarts() throws Exception {
  Configuration conf = new Configuration();
  conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, 1024);
  // We read from the standby to watch block locations
  HAUtil.setAllowStandbyReads(conf, true);
  conf.setLong(DFSConfigKeys.DFS_NAMENODE_ACCESSTIME_PRECISION_KEY, 0);
  conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1);
  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
    .nnTopology(MiniDFSNNTopology.simpleHATopology())
    .numDataNodes(1)
    .build();
  try {
    NameNode nn0 = cluster.getNameNode(0);
    NameNode nn1 = cluster.getNameNode(1);

    cluster.transitionToActive(0);
    
    // Create 5 blocks.
    DFSTestUtil.createFile(cluster.getFileSystem(0), 
        TEST_FILE_PATH, 5*1024, (short)1, 1L);
    
    HATestUtil.waitForStandbyToCatchUp(nn0, nn1);
    
    // Stop the DN.
    DataNode dn = cluster.getDataNodes().get(0);
    String dnName = dn.getDatanodeId().getXferAddr(); 
    DataNodeProperties dnProps = cluster.stopDataNode(0);
    
    // Make sure both NNs register it as dead.
    BlockManagerTestUtil.noticeDeadDatanode(nn0, dnName);
    BlockManagerTestUtil.noticeDeadDatanode(nn1, dnName);
    
    BlockManagerTestUtil.updateState(nn0.getNamesystem().getBlockManager());
    BlockManagerTestUtil.updateState(nn1.getNamesystem().getBlockManager());
    assertEquals(5, nn0.getNamesystem().getUnderReplicatedBlocks());
    
    // The SBN will not have any blocks in its neededReplication queue
    // since the SBN doesn't process replication.
    assertEquals(0, nn1.getNamesystem().getUnderReplicatedBlocks());
    
    LocatedBlocks locs = nn1.getRpcServer().getBlockLocations(
        TEST_FILE, 0, 1);
    assertEquals("Standby should have registered that the block has no replicas",
        0, locs.get(0).getLocations().length);
    
    cluster.restartDataNode(dnProps);
    // Wait for both NNs to re-register the DN.
    cluster.waitActive(0);
    cluster.waitActive(1);
    
    BlockManagerTestUtil.updateState(nn0.getNamesystem().getBlockManager());
    BlockManagerTestUtil.updateState(nn1.getNamesystem().getBlockManager());
    assertEquals(0, nn0.getNamesystem().getUnderReplicatedBlocks());
    assertEquals(0, nn1.getNamesystem().getUnderReplicatedBlocks());
    
    locs = nn1.getRpcServer().getBlockLocations(
        TEST_FILE, 0, 1);
    assertEquals("Standby should have registered that the block has replicas again",
        1, locs.get(0).getLocations().length);
  } finally {
    cluster.shutdown();
  }
}
 
Example 23
Source Project: hadoop   Source File: TestPendingCorruptDnMessages.java    License: Apache License 2.0
@Test
public void testChangedStorageId() throws IOException, URISyntaxException,
    InterruptedException {
  HdfsConfiguration conf = new HdfsConfiguration();
  conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1);
  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
      .numDataNodes(1)
      .nnTopology(MiniDFSNNTopology.simpleHATopology())
      .build();
  
  try {
    cluster.transitionToActive(0);
    
    FileSystem fs = HATestUtil.configureFailoverFs(cluster, conf);
    OutputStream out = fs.create(filePath);
    out.write("foo bar baz".getBytes());
    out.close();
    
    HATestUtil.waitForStandbyToCatchUp(cluster.getNameNode(0),
        cluster.getNameNode(1));
    
    // Change the gen stamp of the block on datanode to go back in time (gen
    // stamps start at 1000)
    ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, filePath);
    assertTrue(cluster.changeGenStampOfBlock(0, block, 900));
    
    // Stop the DN so the replica with the changed gen stamp will be reported
    // when this DN starts up.
    DataNodeProperties dnProps = cluster.stopDataNode(0);
    
    // Restart the namenode so that when the DN comes up it will see an initial
    // block report.
    cluster.restartNameNode(1, false);
    assertTrue(cluster.restartDataNode(dnProps, true));
    
    // Wait until the standby NN queues up the corrupt block in the pending DN
    // message queue.
    while (cluster.getNamesystem(1).getBlockManager()
        .getPendingDataNodeMessageCount() < 1) {
      ThreadUtil.sleepAtLeastIgnoreInterrupts(1000);
    }
    
    assertEquals(1, cluster.getNamesystem(1).getBlockManager()
        .getPendingDataNodeMessageCount());
    String oldStorageId = getRegisteredDatanodeUid(cluster, 1);
    
    // Reformat/restart the DN.
    assertTrue(wipeAndRestartDn(cluster, 0));
    
    // Give the DN time to start up and register, which will cause the
    // DatanodeManager to dissociate the old storage ID from the DN xfer addr.
    String newStorageId = "";
    do {
      ThreadUtil.sleepAtLeastIgnoreInterrupts(1000);
      newStorageId = getRegisteredDatanodeUid(cluster, 1);
      System.out.println("====> oldStorageId: " + oldStorageId +
          " newStorageId: " + newStorageId);
    } while (newStorageId.equals(oldStorageId));
    
    assertEquals(0, cluster.getNamesystem(1).getBlockManager()
        .getPendingDataNodeMessageCount());
    
    // Now try to fail over.
    cluster.transitionToStandby(0);
    cluster.transitionToActive(1);
  } finally {
    cluster.shutdown();
  }
}
 
Example 24
Source Project: hadoop   Source File: TestDecommissioningStatus.java    License: Apache License 2.0
/**
 * Verify a DN remains in DECOMMISSION_INPROGRESS state if it is marked
 * as dead before decommission has completed. That will allow the DN to resume
 * the replication process after it rejoins the cluster.
 */
@Test(timeout=120000)
public void testDecommissionStatusAfterDNRestart() throws Exception {
  DistributedFileSystem fileSys =
      (DistributedFileSystem)cluster.getFileSystem();

  // Create a file with one block. That block has one replica.
  Path f = new Path("decommission.dat");
  DFSTestUtil.createFile(fileSys, f, fileSize, fileSize, fileSize,
      (short)1, seed);

  // Find the DN that owns the only replica.
  RemoteIterator<LocatedFileStatus> fileList = fileSys.listLocatedStatus(f);
  BlockLocation[] blockLocations = fileList.next().getBlockLocations();
  String dnName = blockLocations[0].getNames()[0];

  // Decommission the DN.
  FSNamesystem fsn = cluster.getNamesystem();
  final DatanodeManager dm = fsn.getBlockManager().getDatanodeManager();
  decommissionNode(fsn, localFileSys, dnName);
  dm.refreshNodes(conf);

  // Stop the DN when decommission is in progress.
  // Given that DFS_DATANODE_BALANCE_BANDWIDTHPERSEC_KEY is set to 1 and the
  // size of the block, the decommission will take much longer than the test
  // timeout value to complete. So when stopDataNode is called,
  // decommission should be in progress.
  DataNodeProperties dataNodeProperties = cluster.stopDataNode(dnName);
  final List<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
  while (true) {
    dm.fetchDatanodes(null, dead, false);
    if (dead.size() == 1) {
      break;
    }
    Thread.sleep(1000);
  }

  // Force removal of the dead node's blocks.
  BlockManagerTestUtil.checkHeartbeat(fsn.getBlockManager());

  // Force DatanodeManager to check decommission state.
  BlockManagerTestUtil.recheckDecommissionState(dm);

  // Verify that the DN remains in DECOMMISSION_INPROGRESS state.
  assertTrue("the node should be DECOMMISSION_IN_PROGRESSS",
      dead.get(0).isDecommissionInProgress());

  // Check DatanodeManager#getDecommissioningNodes, make sure it returns
  // the node as decommissioning, even if it's dead
  List<DatanodeDescriptor> decomlist = dm.getDecommissioningNodes();
  assertTrue("The node should be be decommissioning", decomlist.size() == 1);
  
  // Delete the under-replicated file, which should let the 
  // DECOMMISSION_IN_PROGRESS node become DECOMMISSIONED
  cleanupFile(fileSys, f);
  BlockManagerTestUtil.recheckDecommissionState(dm);
  assertTrue("the node should be decommissioned",
      dead.get(0).isDecommissioned());

  // Add the node back
  cluster.restartDataNode(dataNodeProperties, true);
  cluster.waitActive();

  // Call refreshNodes on FSNamesystem with empty exclude file.
  // This will remove the datanodes from decommissioning list and
  // make them available again.
  writeConfigFile(localFileSys, excludeFile, null);
  dm.refreshNodes(conf);
}
 
Example 25
Source Project: hadoop   Source File: TestNodeCount.java    License: Apache License 2.0
@Test
public void testNodeCount() throws Exception {
  // start a mini dfs cluster of 2 nodes
  final Configuration conf = new HdfsConfiguration();
  final MiniDFSCluster cluster = 
    new MiniDFSCluster.Builder(conf).numDataNodes(REPLICATION_FACTOR).build();
  try {
    final FSNamesystem namesystem = cluster.getNamesystem();
    final BlockManager bm = namesystem.getBlockManager();
    final HeartbeatManager hm = bm.getDatanodeManager().getHeartbeatManager();
    final FileSystem fs = cluster.getFileSystem();
    
    // populate the cluster with a one block file
    final Path FILE_PATH = new Path("/testfile");
    DFSTestUtil.createFile(fs, FILE_PATH, 1L, REPLICATION_FACTOR, 1L);
    DFSTestUtil.waitReplication(fs, FILE_PATH, REPLICATION_FACTOR);
    ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, FILE_PATH);

    // keep a copy of all datanode descriptors
    final DatanodeDescriptor[] datanodes = hm.getDatanodes();
    
    // start two new nodes
    cluster.startDataNodes(conf, 2, true, null, null);
    cluster.waitActive();
    
    // bring down first datanode
    DatanodeDescriptor datanode = datanodes[0];
    DataNodeProperties dnprop = cluster.stopDataNode(datanode.getXferAddr());
    
    // make sure that NN detects that the datanode is down
    BlockManagerTestUtil.noticeDeadDatanode(
        cluster.getNameNode(), datanode.getXferAddr());
    
    // the block will be replicated
    DFSTestUtil.waitReplication(fs, FILE_PATH, REPLICATION_FACTOR);

    // restart the first datanode
    cluster.restartDataNode(dnprop);
    cluster.waitActive();
    
    // check if an excess replica is detected (transient)
    initializeTimeout(TIMEOUT);
    while (countNodes(block.getLocalBlock(), namesystem).excessReplicas() == 0) {
      checkTimeout("excess replicas not detected");
    }
    
    // find out a non-excess node
    DatanodeDescriptor nonExcessDN = null;
    for(DatanodeStorageInfo storage : bm.blocksMap.getStorages(block.getLocalBlock())) {
      final DatanodeDescriptor dn = storage.getDatanodeDescriptor();
      Collection<Block> blocks = bm.excessReplicateMap.get(dn.getDatanodeUuid());
      if (blocks == null || !blocks.contains(block.getLocalBlock()) ) {
        nonExcessDN = dn;
        break;
      }
    }
    assertTrue(nonExcessDN!=null);
    
    // bring down a non-excess datanode
    dnprop = cluster.stopDataNode(nonExcessDN.getXferAddr());
    // make sure that NN detects that the datanode is down
    BlockManagerTestUtil.noticeDeadDatanode(
        cluster.getNameNode(), nonExcessDN.getXferAddr());

    // The block should be replicated
    initializeTimeout(TIMEOUT);
    while (countNodes(block.getLocalBlock(), namesystem).liveReplicas() != REPLICATION_FACTOR) {
      checkTimeout("live replica count not correct", 1000);
    }

    // restart the first datanode
    cluster.restartDataNode(dnprop);
    cluster.waitActive();

    // check if an excess replica is detected (transient)
    initializeTimeout(TIMEOUT);
    while (countNodes(block.getLocalBlock(), namesystem).excessReplicas() != 2) {
      checkTimeout("excess replica count not equal to 2");
    }

  } finally {
    cluster.shutdown();
  }
}
 
Example 26
Source Project: hadoop   Source File: TestOverReplicatedBlocks.java    License: Apache License 2.0
/** Test that processOverReplicatedBlock can handle corrupt replicas fine.
 * It makes sure that corrupt replicas won't be treated as valid ones,
 * which would otherwise let the NN delete valid replicas while keeping
 * corrupt ones.
 */
@Test
public void testProcesOverReplicateBlock() throws Exception {
  Configuration conf = new HdfsConfiguration();
  conf.setLong(DFSConfigKeys.DFS_DATANODE_SCAN_PERIOD_HOURS_KEY, 100L);
  conf.setLong(DFSConfigKeys.DFS_BLOCKREPORT_INTERVAL_MSEC_KEY, 1000L);
  conf.set(
      DFSConfigKeys.DFS_NAMENODE_REPLICATION_PENDING_TIMEOUT_SEC_KEY,
      Integer.toString(2));
  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
  FileSystem fs = cluster.getFileSystem();

  try {
    final Path fileName = new Path("/foo1");
    DFSTestUtil.createFile(fs, fileName, 2, (short)3, 0L);
    DFSTestUtil.waitReplication(fs, fileName, (short)3);
    
    // corrupt the block on datanode 0
    ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, fileName);
    assertTrue(cluster.corruptReplica(0, block));
    DataNodeProperties dnProps = cluster.stopDataNode(0);
    // remove block scanner log to trigger block scanning
    File scanCursor = new File(new File(MiniDFSCluster.getFinalizedDir(
        cluster.getInstanceStorageDir(0, 0),
        cluster.getNamesystem().getBlockPoolId()).getParent()).getParent(),
        "scanner.cursor");
    // wait up to one minute for the deletion to succeed
    for(int i = 0; !scanCursor.delete(); i++) {
      assertTrue("Could not delete " + scanCursor.getAbsolutePath() +
          " in one minute", i < 60);
      try {
        Thread.sleep(1000);
      } catch (InterruptedException ignored) {}
    }
    
    // restart the datanode so the corrupt replica will be detected
    cluster.restartDataNode(dnProps);
    DFSTestUtil.waitReplication(fs, fileName, (short)2);
    
    String blockPoolId = cluster.getNamesystem().getBlockPoolId();
    final DatanodeID corruptDataNode = 
      DataNodeTestUtils.getDNRegistrationForBP(
          cluster.getDataNodes().get(2), blockPoolId);
       
    final FSNamesystem namesystem = cluster.getNamesystem();
    final BlockManager bm = namesystem.getBlockManager();
    final HeartbeatManager hm = bm.getDatanodeManager().getHeartbeatManager();
    try {
      namesystem.writeLock();
      synchronized(hm) {
        // set live datanodes' remaining space to 0
        // so they will be chosen for deletion when over-replication occurs
        String corruptMachineName = corruptDataNode.getXferAddr();
        for (DatanodeDescriptor datanode : hm.getDatanodes()) {
          if (!corruptMachineName.equals(datanode.getXferAddr())) {
            datanode.getStorageInfos()[0].setUtilizationForTesting(100L, 100L, 0, 100L);
            datanode.updateHeartbeat(
                BlockManagerTestUtil.getStorageReportsForDatanode(datanode),
                0L, 0L, 0, 0, null);
          }
        }

        // decrease the replication factor to 1; 
        NameNodeAdapter.setReplication(namesystem, fileName.toString(), (short)1);

        // the corrupt replica won't be chosen to be the excess one;
        // without 4910 the number of live replicas would be 0: the block would be lost
        assertEquals(1, bm.countNodes(block.getLocalBlock()).liveReplicas());
      }
    } finally {
      namesystem.writeUnlock();
    }
    
  } finally {
    cluster.shutdown();
  }
}
 
Example 27
Source Project: hadoop   Source File: TestPendingInvalidateBlock.java    License: Apache License 2.0
/**
 * Test whether we can delay the deletion of unknown blocks in DataNode's
 * first several block reports.
 */
@Test
public void testPendingDeleteUnknownBlocks() throws Exception {
  final int fileNum = 5; // 5 files
  final Path[] files = new Path[fileNum];
  final DataNodeProperties[] dnprops = new DataNodeProperties[REPLICATION];
  // create a group of files, each file contains 1 block
  for (int i = 0; i < fileNum; i++) {
    files[i] = new Path("/file" + i);
    DFSTestUtil.createFile(dfs, files[i], BLOCKSIZE, REPLICATION, i);
  }
  // wait until all DataNodes have replicas
  waitForReplication();
  for (int i = REPLICATION - 1; i >= 0; i--) {
    dnprops[i] = cluster.stopDataNode(i);
  }
  Thread.sleep(2000);
  // delete 2 files, we still have 3 files remaining so that we can cover
  // every DN storage
  for (int i = 0; i < 2; i++) {
    dfs.delete(files[i], true);
  }

  // restart NameNode
  cluster.restartNameNode(false);
  InvalidateBlocks invalidateBlocks = (InvalidateBlocks) Whitebox
      .getInternalState(cluster.getNamesystem().getBlockManager(),
          "invalidateBlocks");
  InvalidateBlocks mockIb = Mockito.spy(invalidateBlocks);
  Mockito.doReturn(1L).when(mockIb).getInvalidationDelay();
  Whitebox.setInternalState(cluster.getNamesystem().getBlockManager(),
      "invalidateBlocks", mockIb);

  Assert.assertEquals(0L, cluster.getNamesystem().getPendingDeletionBlocks());
  // restart DataNodes
  for (int i = 0; i < REPLICATION; i++) {
    cluster.restartDataNode(dnprops[i], true);
  }
  cluster.waitActive();

  for (int i = 0; i < REPLICATION; i++) {
    DataNodeTestUtils.triggerBlockReport(cluster.getDataNodes().get(i));
  }
  Thread.sleep(2000);
  // make sure we have received block reports by checking the total block #
  Assert.assertEquals(3, cluster.getNamesystem().getBlocksTotal());
  Assert.assertEquals(4, cluster.getNamesystem().getPendingDeletionBlocks());

  cluster.restartNameNode(true);
  Thread.sleep(6000);
  Assert.assertEquals(3, cluster.getNamesystem().getBlocksTotal());
  Assert.assertEquals(0, cluster.getNamesystem().getPendingDeletionBlocks());
}
 
Example 28
Source Project: hadoop   Source File: TestDataNodeMultipleRegistrations.java    License: Apache License 2.0
@Test
public void testDNWithInvalidStorageWithHA() throws Exception {
  MiniDFSNNTopology top = new MiniDFSNNTopology()
    .addNameservice(new MiniDFSNNTopology.NSConf("ns1")
      .addNN(new MiniDFSNNTopology.NNConf("nn0").setClusterId("cluster-1"))
      .addNN(new MiniDFSNNTopology.NNConf("nn1").setClusterId("cluster-1")));

  top.setFederation(true);

  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).nnTopology(top)
      .numDataNodes(0).build();
  try {
    cluster.startDataNodes(conf, 1, true, null, null);
    // wait for the initialization to complete
    Thread.sleep(10000);
    DataNode dn = cluster.getDataNodes().get(0);
    assertTrue("Datanode should be running", dn.isDatanodeUp());
    assertEquals("BPOfferService should be running", 1,
        dn.getAllBpOs().length);
    DataNodeProperties dnProp = cluster.stopDataNode(0);

    cluster.getNameNode(0).stop();
    cluster.getNameNode(1).stop();
    Configuration nn1 = cluster.getConfiguration(0);
    Configuration nn2 = cluster.getConfiguration(1);
    // setting up invalid cluster
    StartupOption.FORMAT.setClusterId("cluster-2");
    DFSTestUtil.formatNameNode(nn1);
    MiniDFSCluster.copyNameDirs(FSNamesystem.getNamespaceDirs(nn1),
        FSNamesystem.getNamespaceDirs(nn2), nn2);
    cluster.restartNameNode(0, false);
    cluster.restartNameNode(1, false);
    cluster.restartDataNode(dnProp);
    
    // wait for the initialization to complete
    Thread.sleep(10000);
    dn = cluster.getDataNodes().get(0);
    assertFalse("Datanode should have shutdown as only service failed",
        dn.isDatanodeUp());
  } finally {
    cluster.shutdown();
  }
}
 
Example 29
Source Project: big-c   Source File: TestLeaseRecovery.java    License: Apache License 2.0
/**
 * Block recovery when the meta file does not have CRCs for all chunks in
 * the block file.
 */
@Test
public void testBlockRecoveryWithLessMetafile() throws Exception {
  Configuration conf = new Configuration();
  conf.set(DFSConfigKeys.DFS_BLOCK_LOCAL_PATH_ACCESS_USER_KEY,
      UserGroupInformation.getCurrentUser().getShortUserName());
  cluster = new MiniDFSCluster.Builder(conf).numDataNodes(1).build();
  Path file = new Path("/testRecoveryFile");
  DistributedFileSystem dfs = cluster.getFileSystem();
  FSDataOutputStream out = dfs.create(file);
  int count = 0;
  while (count < 2 * 1024 * 1024) {
    out.writeBytes("Data");
    count += 4;
  }
  out.hsync();
  // abort the original stream
  ((DFSOutputStream) out.getWrappedStream()).abort();

  LocatedBlocks locations = cluster.getNameNodeRpc().getBlockLocations(
      file.toString(), 0, count);
  ExtendedBlock block = locations.get(0).getBlock();
  DataNode dn = cluster.getDataNodes().get(0);
  BlockLocalPathInfo localPathInfo = dn.getBlockLocalPathInfo(block, null);
  File metafile = new File(localPathInfo.getMetaPath());
  assertTrue(metafile.exists());

  // reduce the block meta file size
  RandomAccessFile raf = new RandomAccessFile(metafile, "rw");
  raf.setLength(metafile.length() - 20);
  raf.close();

  // restart the DN so the replica becomes RWR (ReplicaWaitingToBeRecovered)
  DataNodeProperties dnProp = cluster.stopDataNode(0);
  cluster.restartDataNode(dnProp, true);

  // try to recover the lease
  DistributedFileSystem newdfs = (DistributedFileSystem) FileSystem
      .newInstance(cluster.getConfiguration(0));
  count = 0;
  while (++count < 10 && !newdfs.recoverLease(file)) {
    Thread.sleep(1000);
  }
  assertTrue("File should be closed", newdfs.recoverLease(file));

}
 
Example 30
Source Project: big-c   Source File: TestFileAppend.java    License: Apache License 2.0
/**
 * Old replica of the block should not be accepted as valid for append/read
 */
@Test
public void testFailedAppendBlockRejection() throws Exception {
  Configuration conf = new HdfsConfiguration();
  conf.set("dfs.client.block.write.replace-datanode-on-failure.enable",
      "false");
  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3)
      .build();
  DistributedFileSystem fs = null;
  try {
    fs = cluster.getFileSystem();
    Path path = new Path("/test");
    FSDataOutputStream out = fs.create(path);
    out.writeBytes("hello\n");
    out.close();

    // stop one datanode
    DataNodeProperties dnProp = cluster.stopDataNode(0);
    String dnAddress = dnProp.datanode.getXferAddress().toString();
    if (dnAddress.startsWith("/")) {
      dnAddress = dnAddress.substring(1);
    }

    // append again to bump genstamps
    for (int i = 0; i < 2; i++) {
      out = fs.append(path);
      out.writeBytes("helloagain\n");
      out.close();
    }

    // re-open to make the block state under construction
    out = fs.append(path);
    cluster.restartDataNode(dnProp, true);
    // wait till the block report comes
    Thread.sleep(2000);
    // check the block locations; they should not contain the restarted datanode
    BlockLocation[] locations = fs.getFileBlockLocations(path, 0,
        Long.MAX_VALUE);
    String[] names = locations[0].getNames();
    for (String node : names) {
      if (node.equals(dnAddress)) {
        fail("Failed append should not be present in latest block locations.");
      }
    }
    out.close();
  } finally {
    IOUtils.closeStream(fs);
    cluster.shutdown();
  }
}