org.apache.hadoop.hdfs.protocol.RecoveryInProgressException Java Examples

The following examples show how to use org.apache.hadoop.hdfs.protocol.RecoveryInProgressException. Each example notes the project and source file it was taken from.
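Before the examples, here is a minimal sketch of the typical client-side pattern (the same one Example #1 exercises): the exception is raised on the server side, so a client sees it wrapped in an org.apache.hadoop.ipc.RemoteException and matches it by class name, then waits and retries. The class name, retry budget, and sleep interval below are illustrative assumptions, not part of any Hadoop API.

import java.io.IOException;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.protocol.RecoveryInProgressException;
import org.apache.hadoop.ipc.RemoteException;

public class AppendWithRecoveryRetry {

  /** Retry fs.append() while block/lease recovery is still in progress for the file. */
  public static FSDataOutputStream appendWhenRecovered(FileSystem fs, Path path)
      throws IOException, InterruptedException {
    for (int attempt = 0; attempt < 5; attempt++) {   // illustrative retry budget
      try {
        return fs.append(path);
      } catch (RemoteException re) {
        // The server-side exception travels as a RemoteException; match it by class name.
        if (!re.getClassName().equals(RecoveryInProgressException.class.getName())) {
          throw re;                                   // some other failure, surface it
        }
        Thread.sleep(1000L);                          // give the recovery time to finish
      }
    }
    throw new IOException("Recovery still in progress after retries: " + path);
  }
}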
Example #1
Source File: TestReadWhileWriting.java    From hadoop with Apache License 2.0
/** Try opening a file for append. */
private static FSDataOutputStream append(FileSystem fs, Path p) throws Exception {
  for(int i = 0; i < 10; i++) {
    try {
      return fs.append(p);
    } catch(RemoteException re) {
      if (re.getClassName().equals(RecoveryInProgressException.class.getName())) {
      AppendTestUtil.LOG.info("Will sleep and retry, i=" + i + ", p=" + p, re);
        Thread.sleep(1000);
      }
      else
        throw re;
    }
  }
  throw new IOException("Cannot append to " + p);
}
 
Example #2
Source File: TestBlockRecovery.java    From hadoop with Apache License 2.0
/**
 * BlockRecoveryFI_05. One DN throws RecoveryInProgressException.
 *
 * @throws IOException
 *           in case of an error
 */
@Test
public void testRecoveryInProgressException()
  throws IOException, InterruptedException {
  if(LOG.isDebugEnabled()) {
    LOG.debug("Running " + GenericTestUtils.getMethodName());
  }
  DataNode spyDN = spy(dn);
  doThrow(new RecoveryInProgressException("Replica recovery is in progress")).
     when(spyDN).initReplicaRecovery(any(RecoveringBlock.class));
  Daemon d = spyDN.recoverBlocks("fake NN", initRecoveringBlocks());
  d.join();
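  // one datanode reported that recovery is already in progress, so the recovery
  // daemon must abort without ever synchronizing the block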
  verify(spyDN, never()).syncBlock(
      any(RecoveringBlock.class), anyListOf(BlockRecord.class));
}
 
Example #3
Source File: FsDatasetImpl.java    From hadoop with Apache License 2.0
/** static version of {@link #initReplicaRecovery(RecoveringBlock)}. */
static ReplicaRecoveryInfo initReplicaRecovery(String bpid, ReplicaMap map,
    Block block, long recoveryId, long xceiverStopTimeout) throws IOException {
  final ReplicaInfo replica = map.get(bpid, block.getBlockId());
  LOG.info("initReplicaRecovery: " + block + ", recoveryId=" + recoveryId
      + ", replica=" + replica);

  //check replica
  if (replica == null) {
    return null;
  }

  //stop writer if there is any
  if (replica instanceof ReplicaInPipeline) {
    final ReplicaInPipeline rip = (ReplicaInPipeline)replica;
    rip.stopWriter(xceiverStopTimeout);

    //check replica bytes on disk.
    if (rip.getBytesOnDisk() < rip.getVisibleLength()) {
      throw new IOException("THIS IS NOT SUPPOSED TO HAPPEN:"
          + " getBytesOnDisk() < getVisibleLength(), rip=" + rip);
    }

    //check the replica's files
    checkReplicaFiles(rip);
  }

  //check generation stamp
  if (replica.getGenerationStamp() < block.getGenerationStamp()) {
    throw new IOException(
        "replica.getGenerationStamp() < block.getGenerationStamp(), block="
        + block + ", replica=" + replica);
  }

  //check recovery id
  if (replica.getGenerationStamp() >= recoveryId) {
    throw new IOException("THIS IS NOT SUPPOSED TO HAPPEN:"
        + " replica.getGenerationStamp() >= recoveryId = " + recoveryId
        + ", block=" + block + ", replica=" + replica);
  }

  //check RUR
  final ReplicaUnderRecovery rur;
  if (replica.getState() == ReplicaState.RUR) {
    rur = (ReplicaUnderRecovery)replica;
    if (rur.getRecoveryID() >= recoveryId) {
      throw new RecoveryInProgressException(
          "rur.getRecoveryID() >= recoveryId = " + recoveryId
          + ", block=" + block + ", rur=" + rur);
    }
    final long oldRecoveryID = rur.getRecoveryID();
    rur.setRecoveryID(recoveryId);
    LOG.info("initReplicaRecovery: update recovery id for " + block
        + " from " + oldRecoveryID + " to " + recoveryId);
  }
  else {
    rur = new ReplicaUnderRecovery(replica, recoveryId);
    map.add(bpid, rur);
    LOG.info("initReplicaRecovery: changing replica state for "
        + block + " from " + replica.getState()
        + " to " + rur.getState());
  }
  return rur.createInfo();
}
 
Example #4
Source File: DataNode.java    From hadoop with Apache License 2.0
/** Recover a block */
private void recoverBlock(RecoveringBlock rBlock) throws IOException {
  ExtendedBlock block = rBlock.getBlock();
  String blockPoolId = block.getBlockPoolId();
  DatanodeID[] datanodeids = rBlock.getLocations();
  List<BlockRecord> syncList = new ArrayList<BlockRecord>(datanodeids.length);
  int errorCount = 0;

  //check generation stamps
  for(DatanodeID id : datanodeids) {
    try {
      BPOfferService bpos = blockPoolManager.get(blockPoolId);
      DatanodeRegistration bpReg = bpos.bpRegistration;
      InterDatanodeProtocol datanode = bpReg.equals(id)?
          this: DataNode.createInterDataNodeProtocolProxy(id, getConf(),
              dnConf.socketTimeout, dnConf.connectToDnViaHostname);
      ReplicaRecoveryInfo info = callInitReplicaRecovery(datanode, rBlock);
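      // only replicas that are at least as new as the block and actually contain
      // data participate in the sync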
      if (info != null &&
          info.getGenerationStamp() >= block.getGenerationStamp() &&
          info.getNumBytes() > 0) {
        syncList.add(new BlockRecord(id, datanode, info));
      }
    } catch (RecoveryInProgressException ripE) {
      InterDatanodeProtocol.LOG.warn(
          "Recovery for replica " + block + " on data-node " + id
          + " is already in progress. Recovery id = "
          + rBlock.getNewGenerationStamp() + " is aborted.", ripE);
      return;
    } catch (IOException e) {
      ++errorCount;
      InterDatanodeProtocol.LOG.warn(
          "Failed to obtain replica info for block (=" + block 
          + ") from datanode (=" + id + ")", e);
    }
  }

  if (errorCount == datanodeids.length) {
    throw new IOException("All datanodes failed: block=" + block
        + ", datanodeids=" + Arrays.asList(datanodeids));
  }

  syncBlock(rBlock, syncList);
}
 
Example #5
Source File: HoodieLogFormatWriter.java    From hudi with Apache License 2.0
private void handleAppendExceptionOrRecoverLease(Path path, RemoteException e)
    throws IOException, InterruptedException {
  if (e.getMessage().contains(APPEND_UNAVAILABLE_EXCEPTION_MESSAGE)) {
    // This issue happens when all replicas of a file's last block are down and/or being decommissioned.
    // fs.append() writes to the last block of a file, or allocates a new block when the last one is full.
    // When many DataNodes are decommissioned at once, the DataNodes holding every replica of a block may be
    // decommissioned together. The blocks are then re-replicated to other active DataNodes, but that can take
    // time (on the order of a few hours). During this window, if fs.append() is invoked on a file whose last
    // block is eligible for append, the NameNode throws an exception saying it cannot find any active replica
    // of the last block. More information: https://issues.apache.org/jira/browse/HDFS-6325
    LOG.warn("Failed to open an append stream to the log file. Opening a new log file.", e);
    // Rollover the current log file (since cannot get a stream handle) and create new one
    this.logFile = logFile.rollOver(fs, rolloverLogWriteToken);
    createNewFile();
  } else if (e.getClassName().contentEquals(AlreadyBeingCreatedException.class.getName())) {
    LOG.warn("Another task executor writing to the same log file(" + logFile + ". Rolling over");
    // Rollover the current log file (since cannot get a stream handle) and create new one
    this.logFile = logFile.rollOver(fs, rolloverLogWriteToken);
    createNewFile();
  } else if (e.getClassName().contentEquals(RecoveryInProgressException.class.getName())
      && (fs instanceof DistributedFileSystem)) {
    // this happens when either another task executor writing to this file died or
    // data node is going down. Note that we can only try to recover lease for a DistributedFileSystem.
    // ViewFileSystem unfortunately does not support this operation
    LOG.warn("Trying to recover log on path " + path);
    if (FSUtils.recoverDFSFileLease((DistributedFileSystem) fs, path)) {
      LOG.warn("Recovered lease on path " + path);
      // try again
      this.output = fs.append(path, bufferSize);
    } else {
      LOG.warn("Failed to recover lease on path " + path);
      throw new HoodieException(e);
    }
  } else {
    // When fs.append() has failed and an exception is thrown, by closing the output stream
    // we shall force hdfs to release the lease on the log file. When Spark retries this task (with
    // new attemptId, say taskId.1) it will be able to acquire lease on the log file (as output stream was
    // closed properly by taskId.0).
    //
    // If close() call were to fail throwing an exception, our best bet is to rollover to a new log file.
    try {
      close();
    } catch (Exception ce) {
      // close() itself failed, so the lease may still be held; rolling over is the best option left
      LOG.warn("Failed to close the output stream for " + fs.getClass().getName() + " on path " + path
          + ". Rolling over to a new log file.", ce);
      this.logFile = logFile.rollOver(fs, rolloverLogWriteToken);
      createNewFile();
      return;
    }
    // The output stream was closed and the lease on the log file released; surface the original
    // append failure so the task can be retried against the released file.
    throw new HoodieIOException("Failed to append to the output stream ", e);
  }
}