Java Code Examples for org.apache.hadoop.yarn.api.records.ContainerExitStatus#ABORTED

The following examples show how to use org.apache.hadoop.yarn.api.records.ContainerExitStatus#ABORTED. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example 1

Source File: RMAppAttemptImpl.java From hadoop with Apache License 2.0

5 votes

/**
 * Tells whether this attempt should count towards the maximum number of
 * attempt retries, based on the exit status of its AM container.
 *
 * @return {@code false} when the AM container exit status is PREEMPTED,
 *     ABORTED, DISKS_FAILED or KILLED_BY_RESOURCEMANAGER; {@code true} for
 *     every other exit status
 */
@Override
public boolean shouldCountTowardsMaxAttemptRetry() {
  // Take the lock before entering the try block. If lock() were inside the
  // try and failed, the finally clause would call unlock() on a lock this
  // thread does not hold and hide the original failure.
  this.readLock.lock();
  try {
    int exitStatus = getAMContainerExitStatus();
    return !(exitStatus == ContainerExitStatus.PREEMPTED
        || exitStatus == ContainerExitStatus.ABORTED
        || exitStatus == ContainerExitStatus.DISKS_FAILED
        || exitStatus == ContainerExitStatus.KILLED_BY_RESOURCEMANAGER);
  } finally {
    this.readLock.unlock();
  }
}

Example 2

Source File: RMContainerAllocator.java From hadoop with Apache License 2.0

5 votes

/**
 * Builds the task attempt event that corresponds to a finished container.
 *
 * @param cont status of the finished container
 * @param attemptID id of the task attempt the event is created for
 * @return a TA_KILL event when the exit status is ABORTED or PREEMPTED,
 *     otherwise a TA_CONTAINER_COMPLETED event
 */
@VisibleForTesting
public TaskAttemptEvent createContainerFinishedEvent(ContainerStatus cont,
    TaskAttemptId attemptID) {
  final int exitStatus = cont.getExitStatus();
  // ABORTED and PREEMPTED both mean the container was killed by the framework
  final boolean killedByFramework =
      exitStatus == ContainerExitStatus.ABORTED
          || exitStatus == ContainerExitStatus.PREEMPTED;
  final TaskAttemptEventType eventType = killedByFramework
      ? TaskAttemptEventType.TA_KILL
      : TaskAttemptEventType.TA_CONTAINER_COMPLETED;
  return new TaskAttemptEvent(attemptID, eventType);
}

Example 3

Source File: RMAppAttemptImpl.java From big-c with Apache License 2.0

5 votes

/**
 * Decides, from the AM container exit status, whether this attempt counts
 * towards the maximum number of attempt retries.
 *
 * @return {@code true} unless the AM container exit status is PREEMPTED,
 *     ABORTED, DISKS_FAILED or KILLED_BY_RESOURCEMANAGER
 */
@Override
public boolean shouldCountTowardsMaxAttemptRetry() {
  // lock() is called outside the try block on purpose: unlock() in the
  // finally clause must only run once the lock has actually been acquired.
  this.readLock.lock();
  try {
    int exitStatus = getAMContainerExitStatus();
    return !(exitStatus == ContainerExitStatus.PREEMPTED
        || exitStatus == ContainerExitStatus.ABORTED
        || exitStatus == ContainerExitStatus.DISKS_FAILED
        || exitStatus == ContainerExitStatus.KILLED_BY_RESOURCEMANAGER);
  } finally {
    this.readLock.unlock();
  }
}

Example 4

Source File: RMContainerAllocator.java From big-c with Apache License 2.0

5 votes

/**
 * Creates the event to send to a task attempt once its container finished.
 *
 * @param cont status of the finished container
 * @param attemptID id of the task attempt that receives the event
 * @return a TA_CONTAINER_COMPLETED event, or a TA_KILL event when the
 *     container exit status is ABORTED or PREEMPTED
 */
@VisibleForTesting
public TaskAttemptEvent createContainerFinishedEvent(ContainerStatus cont,
    TaskAttemptId attemptID) {
  final int status = cont.getExitStatus();
  if (status != ContainerExitStatus.ABORTED
      && status != ContainerExitStatus.PREEMPTED) {
    return new TaskAttemptEvent(attemptID,
        TaskAttemptEventType.TA_CONTAINER_COMPLETED);
  }
  // killed by framework
  return new TaskAttemptEvent(attemptID, TaskAttemptEventType.TA_KILL);
}

Example 5

Source File: YarnService.java From incubator-gobblin with Apache License 2.0

5 votes

/**
 * Decide from the exit status of a completed container whether its replacement
 * should try to start on the same node. Exit statuses that point to a disk or node
 * failure make the replacement go to a different node.
 *
 * @param containerExitStatus exit status of the completed container
 * @return {@code false} for DISKS_FAILED and ABORTED; for any other status, the value
 *     of the container host affinity setting
 */
private boolean shouldStickToTheSameNode(int containerExitStatus) {
  // ABORTED is grouped with DISKS_FAILED: it is most likely caused by a node
  // failure, because the application itself will not release containers.
  final boolean nodeLooksUnhealthy =
      containerExitStatus == ContainerExitStatus.DISKS_FAILED
          || containerExitStatus == ContainerExitStatus.ABORTED;
  if (nodeLooksUnhealthy) {
    return false;
  }
  // Every other status sticks to the same node only if host affinity is enabled.
  return this.containerHostAffinityEnabled;
}

Example 6

Source File: AbstractApplicationMaster.java From Scribengin with GNU Affero General Public License v3.0

5 votes

/**
 * Handles the statuses of completed containers: updates the container
 * counters, requests replacements for containers that did not succeed, and
 * marks the work as done once every container has completed successfully.
 *
 * @param statuses statuses of the containers that have completed
 */
public void onContainersCompleted(List<ContainerStatus> statuses) {
  LOG.info("onContainersCompleted");
  for (ContainerStatus finished : statuses) {
    assert (finished.getState() == ContainerState.COMPLETE);

    final int code = finished.getExitStatus();
    if (code == ContainerExitStatus.SUCCESS) {
      completedContainerCount.incrementAndGet();
      continue;
    }

    // ABORTED is the only non-success status not counted as a failure
    if (code != ContainerExitStatus.ABORTED) {
      failedContainerCount.incrementAndGet();
    }
    // every non-success exit gives back its allocated/requested slot
    allocatedContainerCount.decrementAndGet();
    requestedContainerCount.decrementAndGet();
    recordFailedCommand(finished.getContainerId());
  }

  // number of containers still missing from the requested total
  final int shortfall = totalContainerCount - requestedContainerCount.get();
  requestedContainerCount.addAndGet(shortfall);

  // need to reallocate failed containers (no iterations when shortfall <= 0)
  for (int pending = shortfall; pending > 0; pending--) {
    ContainerRequest req = setupContainerReqForRM();
    resourceManager.addContainerRequest(req);
  }

  if (completedContainerCount.get() == totalContainerCount) {
    done = true;
  }
}

Example 7

Source File: ApplicationMaster.java From TensorFlowOnYARN with Apache License 2.0

4 votes

/**
 * Handles the statuses of completed containers. Containers not launched by
 * this attempt are ignored. For the others the counters are updated, then
 * replacement containers are requested for any shortfall, and the work is
 * marked as done once the completed count reaches the configured total.
 *
 * @param completedContainers statuses of the containers that have completed
 */
@SuppressWarnings("unchecked")
@Override
public void onContainersCompleted(List<ContainerStatus> completedContainers) {
  LOG.info("Got response from RM for container ask, completedCnt="
      + completedContainers.size());

  for (ContainerStatus finished : completedContainers) {
    LOG.info(appAttemptId + " got container status for containerID="
        + finished.getContainerId() + ", state=" + finished.getState()
        + ", exitStatus=" + finished.getExitStatus()
        + ", diagnostics=" + finished.getDiagnostics());

    // only COMPLETE containers are expected in this callback
    assert (finished.getState() == ContainerState.COMPLETE);

    // skip containers this attempt did not launch - they probably belong
    // to a previous attempt
    if (!launchedContainers.contains(finished.getContainerId())) {
      LOG.info("Ignoring completed status of "
          + finished.getContainerId()
          + "; unknown container(probably launched by previous attempt)");
      continue;
    }

    // update the completed/failed counters according to the exit status
    final int exitStatus = finished.getExitStatus();
    if (exitStatus == 0) {
      // container completed successfully
      completedContainerNum.incrementAndGet();
      LOG.info("Container completed successfully." + ", containerId="
          + finished.getContainerId());
    } else if (exitStatus == ContainerExitStatus.ABORTED) {
      // container was killed by framework, possibly preempted; give the
      // slot back so that a replacement is requested below. No release is
      // needed here as it would be done by the RM.
      allocatedContainerNum.decrementAndGet();
      requestedContainerNum.decrementAndGet();
    } else {
      // shell script failed; counts as completed
      completedContainerNum.incrementAndGet();
      failedContainerNum.incrementAndGet();
    }
  }

  // ask for more containers if any failed
  final int shortfall = args.totalContainerNum - requestedContainerNum.get();
  requestedContainerNum.addAndGet(shortfall);

  // no iterations when shortfall <= 0
  for (int issued = 0; issued < shortfall; issued++) {
    ContainerRequest containerAsk = setupContainerAskForRM();
    amRMClient.addContainerRequest(containerAsk);
  }

  if (completedContainerNum.get() == args.totalContainerNum) {
    done = true;
  }
}

Example 8

Source File: ApplicationMaster.java From hadoop with Apache License 2.0

4 votes

/**
 * Handles the statuses of completed containers: updates the counters,
 * publishes a container end event when a timeline client is set, requests
 * replacement containers for any shortfall, and marks the work as done once
 * the completed count reaches the total.
 *
 * @param completedContainers statuses of the containers that have completed
 */
@SuppressWarnings("unchecked")
@Override
public void onContainersCompleted(List<ContainerStatus> completedContainers) {
  LOG.info("Got response from RM for container ask, completedCnt="
      + completedContainers.size());

  for (ContainerStatus finished : completedContainers) {
    LOG.info(appAttemptID + " got container status for containerID="
        + finished.getContainerId() + ", state=" + finished.getState()
        + ", exitStatus=" + finished.getExitStatus()
        + ", diagnostics=" + finished.getDiagnostics());

    // only COMPLETE containers are expected in this callback
    assert (finished.getState() == ContainerState.COMPLETE);

    // update the completed/failed counters according to the exit status
    final int exitStatus = finished.getExitStatus();
    if (exitStatus == 0) {
      // container completed successfully
      numCompletedContainers.incrementAndGet();
      LOG.info("Container completed successfully." + ", containerId="
          + finished.getContainerId());
    } else if (exitStatus == ContainerExitStatus.ABORTED) {
      // container was killed by framework, possibly preempted; give the
      // slot back so that a replacement is requested below. No release is
      // needed here as it would be done by the RM.
      numAllocatedContainers.decrementAndGet();
      numRequestedContainers.decrementAndGet();
    } else {
      // shell script failed; counts as completed
      numCompletedContainers.incrementAndGet();
      numFailedContainers.incrementAndGet();
    }

    // the end event is published for every exit status
    if (timelineClient != null) {
      publishContainerEndEvent(
          timelineClient, finished, domainId, appSubmitterUgi);
    }
  }

  // ask for more containers if any failed
  final int shortfall = numTotalContainers - numRequestedContainers.get();
  numRequestedContainers.addAndGet(shortfall);

  // no iterations when shortfall <= 0
  for (int issued = 0; issued < shortfall; issued++) {
    ContainerRequest containerAsk = setupContainerAskForRM();
    amRMClient.addContainerRequest(containerAsk);
  }

  if (numCompletedContainers.get() == numTotalContainers) {
    done = true;
  }
}

Example 9

Source File: ResourceSchedulerWrapper.java From hadoop with Apache License 2.0

4 votes

/**
 * Updates the queue metrics with the resources released by the containers
 * that a node update reports as completed.
 *
 * <p>For a container that exited with SUCCESS, the released resources are
 * taken from the matching live container of its application. For a container
 * that exited with ABORTED, they are taken from (and removed from) the
 * preemption map. Any other exit status releases zero memory and vcores.
 * Containers whose application is no longer known to the scheduler are
 * skipped.
 *
 * @param eventWrapper node update event; its RM node is cast to
 *     {@code RMNodeWrapper}
 */
private void updateQueueWithNodeUpdate(
        NodeUpdateSchedulerEventWrapper eventWrapper) {
  RMNodeWrapper node = (RMNodeWrapper) eventWrapper.getRMNode();
  List<UpdatedContainerInfo> containerList = node.getContainerUpdates();
  for (UpdatedContainerInfo info : containerList) {
    for (ContainerStatus status : info.getCompletedContainers()) {
      ContainerId containerId = status.getContainerId();
      SchedulerAppReport app = scheduler.getSchedulerAppInfo(
              containerId.getApplicationAttemptId());

      if (app == null) {
        // this happens for the AM container
        // The app has already been removed when the NM sends the release
        // information.
        continue;
      }

      String queue =
          appQueueMap.get(containerId.getApplicationAttemptId()
            .getApplicationId());
      int releasedMemory = 0, releasedVCores = 0;
      if (status.getExitStatus() == ContainerExitStatus.SUCCESS) {
        for (RMContainer rmc : app.getLiveContainers()) {
          // Compare ids by value. A reference comparison (==) only matches
          // when both sides are the very same object, so an equal id held
          // in a different instance would never be found.
          if (containerId.equals(rmc.getContainerId())) {
            releasedMemory += rmc.getContainer().getResource().getMemory();
            releasedVCores += rmc.getContainer()
                    .getResource().getVirtualCores();
            break;
          }
        }
      } else if (status.getExitStatus() == ContainerExitStatus.ABORTED) {
        // remove() returns the mapped resource (or null when absent), so a
        // single call replaces the containsKey/get/remove sequence.
        Resource preResource = preemptionContainerMap.remove(containerId);
        if (preResource != null) {
          releasedMemory += preResource.getMemory();
          releasedVCores += preResource.getVirtualCores();
        }
      }
      // update queue counters
      updateQueueMetrics(queue, releasedMemory, releasedVCores);
    }
  }
}

Example 10

Source File: ApplicationMaster.java From big-c with Apache License 2.0

4 votes

/**
 * Processes the statuses of completed containers. Each status updates the
 * container counters and, when a timeline client is set, is published as a
 * container end event. Afterwards replacement containers are requested for
 * any shortfall and the work is marked as done once the completed count
 * reaches the total.
 *
 * @param completedContainers statuses of the containers that have completed
 */
@SuppressWarnings("unchecked")
@Override
public void onContainersCompleted(List<ContainerStatus> completedContainers) {
  LOG.info("Got response from RM for container ask, completedCnt="
      + completedContainers.size());

  for (ContainerStatus current : completedContainers) {
    LOG.info(appAttemptID + " got container status for containerID="
        + current.getContainerId() + ", state="
        + current.getState() + ", exitStatus="
        + current.getExitStatus() + ", diagnostics="
        + current.getDiagnostics());

    // non complete containers should not be here
    assert (current.getState() == ContainerState.COMPLETE);

    // increment counters for completed/failed containers
    switch (current.getExitStatus()) {
      case 0:
        // container completed successfully
        numCompletedContainers.incrementAndGet();
        LOG.info("Container completed successfully." + ", containerId="
            + current.getContainerId());
        break;
      case ContainerExitStatus.ABORTED:
        // container was killed by framework, possibly preempted
        // we should re-try as the container was lost for some reason;
        // releasing it is not needed as that would be done by the RM
        numAllocatedContainers.decrementAndGet();
        numRequestedContainers.decrementAndGet();
        break;
      default:
        // shell script failed
        // counts as completed
        numCompletedContainers.incrementAndGet();
        numFailedContainers.incrementAndGet();
        break;
    }

    // published regardless of how the container exited
    if (timelineClient != null) {
      publishContainerEndEvent(
          timelineClient, current, domainId, appSubmitterUgi);
    }
  }

  // ask for more containers if any failed
  final int missing = numTotalContainers - numRequestedContainers.get();
  numRequestedContainers.addAndGet(missing);

  // the loop body never runs when missing <= 0
  for (int left = missing; left > 0; left--) {
    ContainerRequest containerAsk = setupContainerAskForRM();
    amRMClient.addContainerRequest(containerAsk);
  }

  if (numCompletedContainers.get() == numTotalContainers) {
    done = true;
  }
}

Example 11

Source File: ResourceSchedulerWrapper.java From big-c with Apache License 2.0

4 votes

/**
 * Reports to the queue metrics the resources released by the containers a
 * node update lists as completed.
 *
 * <p>SUCCESS containers release the resources of the matching live container
 * of their application; ABORTED containers release the resources recorded
 * for them in the preemption map, and that entry is removed. Other exit
 * statuses release nothing. Containers whose application the scheduler no
 * longer knows are skipped.
 *
 * @param eventWrapper node update event; its RM node is cast to
 *     {@code RMNodeWrapper}
 */
private void updateQueueWithNodeUpdate(
        NodeUpdateSchedulerEventWrapper eventWrapper) {
  RMNodeWrapper node = (RMNodeWrapper) eventWrapper.getRMNode();
  List<UpdatedContainerInfo> containerList = node.getContainerUpdates();
  for (UpdatedContainerInfo info : containerList) {
    for (ContainerStatus status : info.getCompletedContainers()) {
      ContainerId containerId = status.getContainerId();
      SchedulerAppReport app = scheduler.getSchedulerAppInfo(
              containerId.getApplicationAttemptId());

      if (app == null) {
        // this happens for the AM container
        // The app has already been removed when the NM sends the release
        // information.
        continue;
      }

      String queue =
          appQueueMap.get(containerId.getApplicationAttemptId()
            .getApplicationId());
      int releasedMemory = 0, releasedVCores = 0;
      if (status.getExitStatus() == ContainerExitStatus.SUCCESS) {
        for (RMContainer rmc : app.getLiveContainers()) {
          // Use equals(), not ==: two ContainerId objects can describe the
          // same container without being the same instance.
          if (containerId.equals(rmc.getContainerId())) {
            releasedMemory += rmc.getContainer().getResource().getMemory();
            releasedVCores += rmc.getContainer()
                    .getResource().getVirtualCores();
            break;
          }
        }
      } else if (status.getExitStatus() == ContainerExitStatus.ABORTED) {
        // A single remove() both looks up and drops the entry; it returns
        // null when the container is not in the preemption map.
        Resource preResource = preemptionContainerMap.remove(containerId);
        if (preResource != null) {
          releasedMemory += preResource.getMemory();
          releasedVCores += preResource.getVirtualCores();
        }
      }
      // update queue counters
      updateQueueMetrics(queue, releasedMemory, releasedVCores);
    }
  }
}

Example 12

Source File: YarnService.java From incubator-gobblin with Apache License 2.0

4 votes

/**
 * Handle the completion of a container. A new container will be requested to replace the one
 * that just exited. Depending on the exit status and if container host affinity is enabled,
 * the new container may or may not try to be started on the same node.
 *
 * A container completes in either of the following conditions: 1) some error happens in the
 * container and caused the container to exit, 2) the container gets killed due to some reason,
 * for example, if it runs over the allowed amount of virtual or physical memory, 3) the container
 * gets preempted by the ResourceManager, or 4) the container gets stopped by the ApplicationMaster.
 * A replacement container is needed in all but the last case.
 *
 * No replacement is requested when the container was aborted after its release had been
 * requested, when a shutdown is in progress, or when the Helix instance of the container has
 * exceeded the maximum number of retries.
 *
 * @param containerStatus the status reported for the completed container
 */
protected void handleContainerCompletion(ContainerStatus containerStatus) {
  // Stop tracking the container; the entry is null when the container id was not in the map.
  Map.Entry<Container, String> completedContainerEntry = this.containerMap.remove(containerStatus.getContainerId());
  //Get the Helix instance name for the completed container. Because callbacks are processed asynchronously, we might
  //encounter situations where handleContainerCompletion() is called before onContainersAllocated(), resulting in the
  //containerId missing from the containerMap.
  String completedInstanceName = completedContainerEntry == null?  UNKNOWN_HELIX_INSTANCE : completedContainerEntry.getValue();

  LOGGER.info(String.format("Container %s running Helix instance %s has completed with exit status %d",
      containerStatus.getContainerId(), completedInstanceName, containerStatus.getExitStatus()));

  if (!Strings.isNullOrEmpty(containerStatus.getDiagnostics())) {
    LOGGER.info(String.format("Received the following diagnostics information for container %s: %s",
        containerStatus.getContainerId(), containerStatus.getDiagnostics()));
  }

  if (containerStatus.getExitStatus() == ContainerExitStatus.ABORTED) {
    if (this.releasedContainerCache.getIfPresent(containerStatus.getContainerId()) != null) {
      // The release of this container was requested, so it is not replaced. Its Helix
      // instance name (when known) goes back to the pool of unused names.
      LOGGER.info("Container release requested, so not spawning a replacement for containerId {}", containerStatus.getContainerId());
      if (completedContainerEntry != null) {
        LOGGER.info("Adding instance {} to the pool of unused instances", completedInstanceName);
        this.unusedHelixInstanceNames.add(completedInstanceName);
      }
      return;
    } else {
      LOGGER.info("Container {} aborted due to lost NM", containerStatus.getContainerId());
      // Container release was not requested. Likely, the container was running on a node on which the NM died.
      // In this case, RM assumes that the containers are "lost", even though the container process may still be
      // running on the node. We need to ensure that the Helix instances running on the orphaned containers
      // are fenced off from the Helix cluster to avoid double publishing and state being committed by the
      // instances.
      if (!UNKNOWN_HELIX_INSTANCE.equals(completedInstanceName)) {
        String clusterName = this.helixManager.getClusterName();
        //Disable the orphaned instance.
        if (HelixUtils.isInstanceLive(helixManager, completedInstanceName)) {
          LOGGER.info("Disabling the Helix instance {}", completedInstanceName);
          this.helixManager.getClusterManagmentTool().enableInstance(clusterName, completedInstanceName, false);
        }
      }
    }
  }

  // No replacement container is requested while a shutdown is in progress.
  if (this.shutdownInProgress) {
    return;
  }
  // Retry accounting and instance-name reuse only apply to containers that were being tracked.
  if(completedContainerEntry != null) {
    // Count this completion against the retry counter of the Helix instance.
    this.helixInstanceRetryCount.putIfAbsent(completedInstanceName, new AtomicInteger(0));
    int retryCount = this.helixInstanceRetryCount.get(completedInstanceName).incrementAndGet();

    // Populate event metadata
    Optional<ImmutableMap.Builder<String, String>> eventMetadataBuilder = Optional.absent();
    if (this.eventSubmitter.isPresent()) {
      eventMetadataBuilder = Optional.of(buildContainerStatusEventMetadata(containerStatus));
      eventMetadataBuilder.get().put(GobblinYarnEventConstants.EventMetadata.HELIX_INSTANCE_ID, completedInstanceName);
      eventMetadataBuilder.get().put(GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_RETRY_ATTEMPT, retryCount + "");
    }

    // The retry limit is only enforced when helixInstanceMaxRetries is positive. Once it is
    // exceeded, the completion event is still submitted but no replacement is requested.
    if (this.helixInstanceMaxRetries > 0 && retryCount > this.helixInstanceMaxRetries) {
      if (this.eventSubmitter.isPresent()) {
        this.eventSubmitter.get()
            .submit(GobblinYarnEventConstants.EventNames.HELIX_INSTANCE_COMPLETION, eventMetadataBuilder.get().build());
      }

      LOGGER.warn("Maximum number of retries has been achieved for Helix instance " + completedInstanceName);
      return;
    }

    // Add the Helix instance name of the completed container to the set of unused
    // instance names so they can be reused by a replacement container.
    LOGGER.info("Adding instance {} to the pool of unused instances", completedInstanceName);
    this.unusedHelixInstanceNames.add(completedInstanceName);

    if (this.eventSubmitter.isPresent()) {
      this.eventSubmitter.get()
          .submit(GobblinYarnEventConstants.EventNames.HELIX_INSTANCE_COMPLETION, eventMetadataBuilder.get().build());
    }
  }
  LOGGER.info(String.format("Requesting a new container to replace %s to run Helix instance %s", containerStatus.getContainerId(), completedInstanceName));
  // The request carries the completed container only when the replacement should stick to
  // the same node and the completed container is known; otherwise it carries no container.
  this.eventBus.post(new NewContainerRequest(
      shouldStickToTheSameNode(containerStatus.getExitStatus()) && completedContainerEntry != null ?
          Optional.of(completedContainerEntry.getKey()) : Optional.<Container>absent()));
}