org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType Java Examples

The following examples show how to use org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: AbstractYarnScheduler.java    From hadoop with Apache License 2.0 6 votes vote down vote up
protected void releaseContainers(List<ContainerId> containers,
    SchedulerApplicationAttempt attempt) {
  for (ContainerId containerId : containers) {
    RMContainer rmContainer = getRMContainer(containerId);
    if (rmContainer == null) {
      if (System.currentTimeMillis() - ResourceManager.getClusterTimeStamp()
          < nmExpireInterval) {
        LOG.info(containerId + " doesn't exist. Add the container"
            + " to the release request cache as it maybe on recovery.");
        synchronized (attempt) {
          attempt.getPendingRelease().add(containerId);
        }
      } else {
        RMAuditLogger.logFailure(attempt.getUser(),
          AuditConstants.RELEASE_CONTAINER,
          "Unauthorized access or invalid container", "Scheduler",
          "Trying to release container not owned by app or with invalid id.",
          attempt.getApplicationId(), containerId);
      }
    }
    completedContainer(rmContainer,
      SchedulerUtils.createAbnormalContainerStatus(containerId,
        SchedulerUtils.RELEASED_CONTAINER), RMContainerEventType.RELEASED);
  }
}
 
Example #2
Source File: AbstractYarnScheduler.java    From big-c with Apache License 2.0 6 votes vote down vote up
public synchronized void adjusctContianerResouce(ContainerId contianerId, Resource resource){
 
 RMContainer container = getRMContainer(contianerId);
 //get original resource
 Resource allocatedResource = container.getAllocatedResource();
 Resource marginalResource=Resources.subtract(allocatedResource, resource);
 
 if(marginalResource.getMemory() > 0 ){		  
  LOG.info("increase memory on continaer"+contianerId.getContainerId()+"by"+marginalResource.getMemory());		  
 }else{		  
  LOG.info("decrease memory on continaer"+contianerId.getContainerId()+"by"+marginalResource.getMemory());
 }
 
 if(marginalResource.getVirtualCores() > 0){
  LOG.info("increase cores on contianer"+contianerId.getContainerId()+"by"+marginalResource.getVirtualCores());
 }else{
  LOG.info("dncrease cores on contianer"+contianerId.getContainerId()+"by"+marginalResource.getVirtualCores());
 }
 
 SchedulerNode schedulerNode = getSchedulerNode(container.getAllocatedNode());
 //now inform the schedulerNode to adjust resurce assumption
 schedulerNode.addAvailableResource(marginalResource);
 //now inform RMContinaer to adjuct its allocatedResource
 container.handle(new RMContainerResourceUpdateEvent(contianerId, RMContainerEventType.RESOURCE_UPDATE, resource));
 	  
}
 
Example #3
Source File: FifoScheduler.java    From hadoop with Apache License 2.0 6 votes vote down vote up
private synchronized void removeNode(RMNode nodeInfo) {
  FiCaSchedulerNode node = getNode(nodeInfo.getNodeID());
  if (node == null) {
    return;
  }
  // Kill running containers
  for(RMContainer container : node.getRunningContainers()) {
    completedContainer(container, 
        SchedulerUtils.createAbnormalContainerStatus(
            container.getContainerId(), 
            SchedulerUtils.LOST_CONTAINER),
            RMContainerEventType.KILL);
  }
  
  //Remove the node
  this.nodes.remove(nodeInfo.getNodeID());
  updateMaximumAllocation(node, false);
  
  // Update cluster metrics
  Resources.subtractFrom(clusterResource, node.getRMNode().getTotalCapability());
}
 
Example #4
Source File: FifoScheduler.java    From big-c with Apache License 2.0 6 votes vote down vote up
private synchronized void removeNode(RMNode nodeInfo) {
  FiCaSchedulerNode node = getNode(nodeInfo.getNodeID());
  if (node == null) {
    return;
  }
  // Kill running containers
  for(RMContainer container : node.getRunningContainers()) {
    completedContainer(container, 
        SchedulerUtils.createAbnormalContainerStatus(
            container.getContainerId(), 
            SchedulerUtils.LOST_CONTAINER),
            RMContainerEventType.KILL);
  }
  
  //Remove the node
  this.nodes.remove(nodeInfo.getNodeID());
  updateMaximumAllocation(node, false);
  
  // Update cluster metrics
  Resources.subtractFrom(clusterResource, node.getRMNode().getTotalCapability());
}
 
Example #5
Source File: FSAppAttempt.java    From hadoop with Apache License 2.0 5 votes vote down vote up
synchronized public void containerCompleted(RMContainer rmContainer,
    ContainerStatus containerStatus, RMContainerEventType event) {
  
  Container container = rmContainer.getContainer();
  ContainerId containerId = container.getId();
  
  // Remove from the list of newly allocated containers if found
  newlyAllocatedContainers.remove(rmContainer);
  
  // Inform the container
  rmContainer.handle(
      new RMContainerFinishedEvent(
          containerId,
          containerStatus, 
          event)
      );
  LOG.info("Completed container: " + rmContainer.getContainerId() + 
      " in state: " + rmContainer.getState() + " event:" + event);
  
  // Remove from the list of containers
  liveContainers.remove(rmContainer.getContainerId());

  RMAuditLogger.logSuccess(getUser(), 
      AuditConstants.RELEASE_CONTAINER, "SchedulerApp", 
      getApplicationId(), containerId);
  
  // Update usage metrics 
  Resource containerResource = rmContainer.getContainer().getResource();
  queue.getMetrics().releaseResources(getUser(), 1, containerResource);
  this.attemptResourceUsage.decUsed(containerResource);

  // remove from preemption map if it is completed
  preemptionMap.remove(rmContainer);

  // Clear resource utilization metrics cache.
  lastMemoryAggregateAllocationUpdateTime = -1;
}
 
Example #6
Source File: CapacityScheduler.java    From big-c with Apache License 2.0 5 votes vote down vote up
@Lock(CapacityScheduler.class)
@Override
protected synchronized void completedContainer(RMContainer rmContainer,
    ContainerStatus containerStatus, RMContainerEventType event) {
  if (rmContainer == null) {
    LOG.info("Null container completed...");
    return;
  }
  
  Container container = rmContainer.getContainer();
  
  // Get the application for the finished container
  FiCaSchedulerApp application =
      getCurrentAttemptForContainer(container.getId());
  ApplicationId appId =
      container.getId().getApplicationAttemptId().getApplicationId();
  if (application == null) {
    LOG.info("Container " + container + " of" + " unknown application "
        + appId + " completed or suspended with event " + event);
    return;
  }
  
  // Get the node on which the container was allocated
  FiCaSchedulerNode node = getNode(container.getNodeId());
  
  // Inform the queue
  LeafQueue queue = (LeafQueue)application.getQueue();
  queue.completedContainer(clusterResource, application, node, 
      rmContainer, containerStatus, event, null, true);

  LOG.info("Application attempt " + application.getApplicationAttemptId()
      + " released container " + container.getId() + " on node: " + node
      + " with event: " + event);
}
 
Example #7
Source File: CapacityScheduler.java    From hadoop with Apache License 2.0 5 votes vote down vote up
@Override
public void killContainer(RMContainer cont) {
  if (LOG.isDebugEnabled()) {
    LOG.debug("KILL_CONTAINER: container" + cont.toString());
  }
  recoverResourceRequestForContainer(cont);
  completedContainer(cont, SchedulerUtils.createPreemptedContainerStatus(
    cont.getContainerId(), SchedulerUtils.PREEMPTED_CONTAINER),
    RMContainerEventType.KILL);
}
 
Example #8
Source File: CapacityScheduler.java    From hadoop with Apache License 2.0 5 votes vote down vote up
@Override
public void dropContainerReservation(RMContainer container) {
  if(LOG.isDebugEnabled()){
    LOG.debug("DROP_RESERVATION:" + container.toString());
  }
  completedContainer(container,
      SchedulerUtils.createAbnormalContainerStatus(
          container.getContainerId(),
          SchedulerUtils.UNRESERVED_CONTAINER),
      RMContainerEventType.KILL);
}
 
Example #9
Source File: CapacityScheduler.java    From hadoop with Apache License 2.0 5 votes vote down vote up
@Lock(CapacityScheduler.class)
@Override
protected synchronized void completedContainer(RMContainer rmContainer,
    ContainerStatus containerStatus, RMContainerEventType event) {
  if (rmContainer == null) {
    LOG.info("Null container completed...");
    return;
  }
  
  Container container = rmContainer.getContainer();
  
  // Get the application for the finished container
  FiCaSchedulerApp application =
      getCurrentAttemptForContainer(container.getId());
  ApplicationId appId =
      container.getId().getApplicationAttemptId().getApplicationId();
  if (application == null) {
    LOG.info("Container " + container + " of" + " unknown application "
        + appId + " completed with event " + event);
    return;
  }
  
  // Get the node on which the container was allocated
  FiCaSchedulerNode node = getNode(container.getNodeId());
  
  // Inform the queue
  LeafQueue queue = (LeafQueue)application.getQueue();
  queue.completedContainer(clusterResource, application, node, 
      rmContainer, containerStatus, event, null, true);

  LOG.info("Application attempt " + application.getApplicationAttemptId()
      + " released container " + container.getId() + " on node: " + node
      + " with event: " + event);
}
 
Example #10
Source File: CapacityScheduler.java    From big-c with Apache License 2.0 5 votes vote down vote up
@Override
public void killContainer(RMContainer cont) {
  if (LOG.isDebugEnabled()) {
    LOG.debug("KILL_CONTAINER: container" + cont.toString());
  }
  recoverResourceRequestForContainer(cont);
  completedContainer(cont, SchedulerUtils.createPreemptedContainerStatus(
  	      cont.getContainerId(), SchedulerUtils.PREEMPTED_CONTAINER),
  	      RMContainerEventType.KILL);
}
 
Example #11
Source File: FairScheduler.java    From hadoop with Apache License 2.0 5 votes vote down vote up
protected void warnOrKillContainer(RMContainer container) {
  ApplicationAttemptId appAttemptId = container.getApplicationAttemptId();
  FSAppAttempt app = getSchedulerApp(appAttemptId);
  FSLeafQueue queue = app.getQueue();
  LOG.info("Preempting container (prio=" + container.getContainer().getPriority() +
      "res=" + container.getContainer().getResource() +
      ") from queue " + queue.getName());
  
  Long time = app.getContainerPreemptionTime(container);

  if (time != null) {
    // if we asked for preemption more than maxWaitTimeBeforeKill ms ago,
    // proceed with kill
    if (time + waitTimeBeforeKill < getClock().getTime()) {
      ContainerStatus status =
        SchedulerUtils.createPreemptedContainerStatus(
          container.getContainerId(), SchedulerUtils.PREEMPTED_CONTAINER);

      recoverResourceRequestForContainer(container);
      // TODO: Not sure if this ever actually adds this to the list of cleanup
      // containers on the RMNode (see SchedulerNode.releaseContainer()).
      completedContainer(container, status, RMContainerEventType.KILL);
      LOG.info("Killing container" + container +
          " (after waiting for premption for " +
          (getClock().getTime() - time) + "ms)");
    }
  } else {
    // track the request in the FSAppAttempt itself
    app.addPreemption(container, getClock().getTime());
  }
}
 
Example #12
Source File: TestLeafQueue.java    From hadoop with Apache License 2.0 5 votes vote down vote up
static LeafQueue stubLeafQueue(LeafQueue queue) {
  
  // Mock some methods for ease in these unit tests
  
  // 1. LeafQueue.createContainer to return dummy containers
  doAnswer(
      new Answer<Container>() {
        @Override
        public Container answer(InvocationOnMock invocation) 
            throws Throwable {
          final FiCaSchedulerApp application = 
              (FiCaSchedulerApp)(invocation.getArguments()[0]);
          final ContainerId containerId =                 
              TestUtils.getMockContainerId(application);

          Container container = TestUtils.getMockContainer(
              containerId,
              ((FiCaSchedulerNode)(invocation.getArguments()[1])).getNodeID(), 
              (Resource)(invocation.getArguments()[2]),
              ((Priority)invocation.getArguments()[3]));
          return container;
        }
      }
    ).
    when(queue).createContainer(
            any(FiCaSchedulerApp.class), 
            any(FiCaSchedulerNode.class), 
            any(Resource.class),
            any(Priority.class)
            );
  
  // 2. Stub out LeafQueue.parent.completedContainer
  CSQueue parent = queue.getParent();
  doNothing().when(parent).completedContainer(
      any(Resource.class), any(FiCaSchedulerApp.class), any(FiCaSchedulerNode.class), 
      any(RMContainer.class), any(ContainerStatus.class), 
      any(RMContainerEventType.class), any(CSQueue.class), anyBoolean());
  
  return queue;
}
 
Example #13
Source File: YarnNodeCapacityManager.java    From incubator-myriad with Apache License 2.0 5 votes vote down vote up
@Override
public void beforeCompletedContainer(RMContainer rmContainer, ContainerStatus containerStatus, RMContainerEventType type) {
  if (type.equals(RMContainerEventType.KILL) || type.equals(RMContainerEventType.RELEASED)) {
    LOGGER.info("{} completed with exit status {}, killing cooresponding mesos task.", rmContainer.getContainerId().toString(), type);
    removeYarnTask(rmContainer);
  }
}
 
Example #14
Source File: FairScheduler.java    From hadoop with Apache License 2.0 5 votes vote down vote up
/**
 * Process a heartbeat update from a node.
 */
private synchronized void nodeUpdate(RMNode nm) {
  long start = getClock().getTime();
  if (LOG.isDebugEnabled()) {
    LOG.debug("nodeUpdate: " + nm + " cluster capacity: " + clusterResource);
  }
  eventLog.log("HEARTBEAT", nm.getHostName());
  FSSchedulerNode node = getFSSchedulerNode(nm.getNodeID());
  
  List<UpdatedContainerInfo> containerInfoList = nm.pullContainerUpdates();
  List<ContainerStatus> newlyLaunchedContainers = new ArrayList<ContainerStatus>();
  List<ContainerStatus> completedContainers = new ArrayList<ContainerStatus>();
  for(UpdatedContainerInfo containerInfo : containerInfoList) {
    newlyLaunchedContainers.addAll(containerInfo.getNewlyLaunchedContainers());
    completedContainers.addAll(containerInfo.getCompletedContainers());
  } 
  // Processing the newly launched containers
  for (ContainerStatus launchedContainer : newlyLaunchedContainers) {
    containerLaunchedOnNode(launchedContainer.getContainerId(), node);
  }

  // Process completed containers
  for (ContainerStatus completedContainer : completedContainers) {
    ContainerId containerId = completedContainer.getContainerId();
    LOG.debug("Container FINISHED: " + containerId);
    completedContainer(getRMContainer(containerId),
        completedContainer, RMContainerEventType.FINISHED);
  }

  if (continuousSchedulingEnabled) {
    if (!completedContainers.isEmpty()) {
      attemptScheduling(node);
    }
  } else {
    attemptScheduling(node);
  }

  long duration = getClock().getTime() - start;
  fsOpDurations.addNodeUpdateDuration(duration);
}
 
Example #15
Source File: FairScheduler.java    From hadoop with Apache License 2.0 5 votes vote down vote up
private synchronized void removeNode(RMNode rmNode) {
  FSSchedulerNode node = getFSSchedulerNode(rmNode.getNodeID());
  // This can occur when an UNHEALTHY node reconnects
  if (node == null) {
    return;
  }
  Resources.subtractFrom(clusterResource, rmNode.getTotalCapability());
  updateRootQueueMetrics();

  // Remove running containers
  List<RMContainer> runningContainers = node.getRunningContainers();
  for (RMContainer container : runningContainers) {
    completedContainer(container,
        SchedulerUtils.createAbnormalContainerStatus(
            container.getContainerId(),
            SchedulerUtils.LOST_CONTAINER),
        RMContainerEventType.KILL);
  }

  // Remove reservations, if any
  RMContainer reservedContainer = node.getReservedContainer();
  if (reservedContainer != null) {
    completedContainer(reservedContainer,
        SchedulerUtils.createAbnormalContainerStatus(
            reservedContainer.getContainerId(),
            SchedulerUtils.LOST_CONTAINER),
        RMContainerEventType.KILL);
  }

  nodes.remove(rmNode.getNodeID());
  queueMgr.getRootQueue().setSteadyFairShare(clusterResource);
  queueMgr.getRootQueue().recomputeSteadyShares();
  updateMaximumAllocation(node, false);
  LOG.info("Removed node " + rmNode.getNodeAddress() +
      " cluster capacity: " + clusterResource);
}
 
Example #16
Source File: FairScheduler.java    From hadoop with Apache License 2.0 5 votes vote down vote up
/**
 * Clean up a completed container.
 */
@Override
protected synchronized void completedContainer(RMContainer rmContainer,
    ContainerStatus containerStatus, RMContainerEventType event) {
  if (rmContainer == null) {
    LOG.info("Null container completed...");
    return;
  }

  Container container = rmContainer.getContainer();

  // Get the application for the finished container
  FSAppAttempt application =
      getCurrentAttemptForContainer(container.getId());
  ApplicationId appId =
      container.getId().getApplicationAttemptId().getApplicationId();
  if (application == null) {
    LOG.info("Container " + container + " of" +
        " unknown application attempt " + appId +
        " completed with event " + event);
    return;
  }

  // Get the node on which the container was allocated
  FSSchedulerNode node = getFSSchedulerNode(container.getNodeId());

  if (rmContainer.getState() == RMContainerState.RESERVED) {
    application.unreserve(rmContainer.getReservedPriority(), node);
  } else {
    application.containerCompleted(rmContainer, containerStatus, event);
    node.releaseContainer(container);
    updateRootQueueMetrics();
  }

  LOG.info("Application attempt " + application.getApplicationAttemptId()
      + " released container " + container.getId() + " on node: " + node
      + " with event: " + event);
}
 
Example #17
Source File: SchedulerApplicationAttempt.java    From hadoop with Apache License 2.0 5 votes vote down vote up
@SuppressWarnings("unchecked")
public synchronized void containerLaunchedOnNode(ContainerId containerId,
    NodeId nodeId) {
  // Inform the container
  RMContainer rmContainer = getRMContainer(containerId);
  if (rmContainer == null) {
    // Some unknown container sneaked into the system. Kill it.
    rmContext.getDispatcher().getEventHandler()
      .handle(new RMNodeCleanContainerEvent(nodeId, containerId));
    return;
  }

  rmContainer.handle(new RMContainerEvent(containerId,
      RMContainerEventType.LAUNCHED));
}
 
Example #18
Source File: TestReservations.java    From hadoop with Apache License 2.0 5 votes vote down vote up
static LeafQueue stubLeafQueue(LeafQueue queue) {

    // Mock some methods for ease in these unit tests

    // 1. LeafQueue.createContainer to return dummy containers
    doAnswer(new Answer<Container>() {
      @Override
      public Container answer(InvocationOnMock invocation) throws Throwable {
        final FiCaSchedulerApp application = (FiCaSchedulerApp) (invocation
            .getArguments()[0]);
        final ContainerId containerId = TestUtils
            .getMockContainerId(application);

        Container container = TestUtils.getMockContainer(containerId,
            ((FiCaSchedulerNode) (invocation.getArguments()[1])).getNodeID(),
            (Resource) (invocation.getArguments()[2]),
            ((Priority) invocation.getArguments()[3]));
        return container;
      }
    }).when(queue).createContainer(any(FiCaSchedulerApp.class),
        any(FiCaSchedulerNode.class), any(Resource.class), any(Priority.class));

    // 2. Stub out LeafQueue.parent.completedContainer
    CSQueue parent = queue.getParent();
    doNothing().when(parent).completedContainer(any(Resource.class),
        any(FiCaSchedulerApp.class), any(FiCaSchedulerNode.class),
        any(RMContainer.class), any(ContainerStatus.class),
        any(RMContainerEventType.class), any(CSQueue.class), anyBoolean());

    return queue;
  }
 
Example #19
Source File: CapacityScheduler.java    From big-c with Apache License 2.0 5 votes vote down vote up
@Override
public void dropContainerReservation(RMContainer container) {
  if(LOG.isDebugEnabled()){
    LOG.debug("DROP_RESERVATION:" + container.toString());
  }
  completedContainer(container,
      SchedulerUtils.createAbnormalContainerStatus(
          container.getContainerId(),
          SchedulerUtils.UNRESERVED_CONTAINER),
      RMContainerEventType.KILL);
}
 
Example #20
Source File: CapacityScheduler.java    From big-c with Apache License 2.0 5 votes vote down vote up
/**
 * Process node labels update on a node.
 * 
 * TODO: Currently capacity scheduler will kill containers on a node when
 * labels on the node changed. It is a simply solution to ensure guaranteed
 * capacity on labels of queues. When YARN-2498 completed, we can let
 * preemption policy to decide if such containers need to be killed or just
 * keep them running.
 */
private synchronized void updateLabelsOnNode(NodeId nodeId,
    Set<String> newLabels) {
  FiCaSchedulerNode node = nodes.get(nodeId);
  if (null == node) {
    return;
  }
  
  // labels is same, we don't need do update
  if (node.getLabels().size() == newLabels.size()
      && node.getLabels().containsAll(newLabels)) {
    return;
  }
  
  // Kill running containers since label is changed
  for (RMContainer rmContainer : node.getRunningContainers()) {
    ContainerId containerId = rmContainer.getContainerId();
    completedContainer(rmContainer, 
        ContainerStatus.newInstance(containerId,
            ContainerState.COMPLETE, 
            String.format(
                "Container=%s killed since labels on the node=%s changed",
                containerId.toString(), nodeId.toString()),
            ContainerExitStatus.KILLED_BY_RESOURCEMANAGER),
        RMContainerEventType.KILL);
  }
  
  // Unreserve container on this node
  RMContainer reservedContainer = node.getReservedContainer();
  if (null != reservedContainer) {
    dropContainerReservation(reservedContainer);
  }
  
  // Update node labels after we've done this
  node.updateLabels(newLabels);
}
 
Example #21
Source File: CapacityScheduler.java    From big-c with Apache License 2.0 5 votes vote down vote up
private synchronized void nodeUpdate(RMNode nm) {
  if (LOG.isDebugEnabled()) {
    LOG.debug("nodeUpdate: " + nm + " clusterResources: " + clusterResource);
  }

  FiCaSchedulerNode node = getNode(nm.getNodeID());
  
  List<UpdatedContainerInfo> containerInfoList = nm.pullContainerUpdates();
  List<ContainerStatus> newlyLaunchedContainers = new ArrayList<ContainerStatus>();
  List<ContainerStatus> completedContainers = new ArrayList<ContainerStatus>();
  for(UpdatedContainerInfo containerInfo : containerInfoList) {
    newlyLaunchedContainers.addAll(containerInfo.getNewlyLaunchedContainers());
    completedContainers.addAll(containerInfo.getCompletedContainers());
  }
  
  // Processing the newly launched containers
  for (ContainerStatus launchedContainer : newlyLaunchedContainers) {
    containerLaunchedOnNode(launchedContainer, node);
    LOG.info("Container LAUNCHED:"+launchedContainer.getContainerId());
  }

  // Process completed containers
  for (ContainerStatus completedContainer : completedContainers) {
    ContainerId containerId = completedContainer.getContainerId();
    LOG.info("Container FINISHED: " + containerId);
    completedContainer(getRMContainer(containerId), 
        completedContainer, RMContainerEventType.FINISHED);
  }
  
  // Now node data structures are up to date and ready for scheduling.
  if(LOG.isDebugEnabled()) 
  {
    LOG.info("Node being looked for scheduling " + nm
      + " availableResource: " + node.getAvailableResource());
  }
}
 
Example #22
Source File: FifoScheduler.java    From big-c with Apache License 2.0 5 votes vote down vote up
private synchronized void doneApplicationAttempt(
    ApplicationAttemptId applicationAttemptId,
    RMAppAttemptState rmAppAttemptFinalState, boolean keepContainers)
    throws IOException {
  FiCaSchedulerApp attempt = getApplicationAttempt(applicationAttemptId);
  SchedulerApplication<FiCaSchedulerApp> application =
      applications.get(applicationAttemptId.getApplicationId());
  if (application == null || attempt == null) {
    throw new IOException("Unknown application " + applicationAttemptId + 
    " has completed!");
  }

  // Kill all 'live' containers
  for (RMContainer container : attempt.getLiveContainers()) {
    if (keepContainers
        && container.getState().equals(RMContainerState.RUNNING)) {
      // do not kill the running container in the case of work-preserving AM
      // restart.
      LOG.info("Skip killing " + container.getContainerId());
      continue;
    }
    completedContainer(container,
      SchedulerUtils.createAbnormalContainerStatus(
        container.getContainerId(), SchedulerUtils.COMPLETED_APPLICATION),
      RMContainerEventType.KILL);
  }

  // Clean up pending requests, metrics etc.
  attempt.stop(rmAppAttemptFinalState);
}
 
Example #23
Source File: SchedulerApplicationAttempt.java    From big-c with Apache License 2.0 5 votes vote down vote up
@SuppressWarnings("unchecked")
public synchronized void containerLaunchedOnNode(ContainerId containerId,
    NodeId nodeId) {
  // Inform the container
  RMContainer rmContainer = getRMContainer(containerId);
  if (rmContainer == null) {
    // Some unknown container sneaked into the system. Kill it.
    rmContext.getDispatcher().getEventHandler()
      .handle(new RMNodeCleanContainerEvent(nodeId, containerId));
    return;
  }

  rmContainer.handle(new RMContainerEvent(containerId,
      RMContainerEventType.LAUNCHED));
}
 
Example #24
Source File: SchedulerApplicationAttempt.java    From big-c with Apache License 2.0 5 votes vote down vote up
public synchronized ContainersAndNMTokensAllocation
    pullNewlyAllocatedContainersAndNMTokens() {
  List<Container> returnContainerList =
      new ArrayList<Container>(newlyAllocatedContainers.size());
  List<NMToken> nmTokens = new ArrayList<NMToken>();
  for (Iterator<RMContainer> i = newlyAllocatedContainers.iterator(); i
    .hasNext();) {
    RMContainer rmContainer = i.next();
    Container container = rmContainer.getContainer();
    try {
      // create container token and NMToken altogether.
      container.setContainerToken(rmContext.getContainerTokenSecretManager()
        .createContainerToken(container.getId(), container.getNodeId(),
          getUser(), container.getResource(), container.getPriority(),
          rmContainer.getCreationTime(), this.logAggregationContext));
      NMToken nmToken =
          rmContext.getNMTokenSecretManager().createAndGetNMToken(getUser(),
            getApplicationAttemptId(), container);
      if (nmToken != null) {
        nmTokens.add(nmToken);
      }
    } catch (IllegalArgumentException e) {
      // DNS might be down, skip returning this container.
      LOG.error("Error trying to assign container token and NM token to" +
          " an allocated container " + container.getId(), e);
      continue;
    }
    returnContainerList.add(container);
    i.remove();
    rmContainer.handle(new RMContainerEvent(rmContainer.getContainerId(),
      RMContainerEventType.ACQUIRED));
  }
  return new ContainersAndNMTokensAllocation(returnContainerList, nmTokens);
}
 
Example #25
Source File: FairScheduler.java    From big-c with Apache License 2.0 5 votes vote down vote up
protected void warnOrKillContainer(RMContainer container) {
  ApplicationAttemptId appAttemptId = container.getApplicationAttemptId();
  FSAppAttempt app = getSchedulerApp(appAttemptId);
  FSLeafQueue queue = app.getQueue();
  LOG.info("Preempting container (prio=" + container.getContainer().getPriority() +
      "res=" + container.getContainer().getResource() +
      ") from queue " + queue.getName());
  
  Long time = app.getContainerPreemptionTime(container);

  if (time != null) {
    // if we asked for preemption more than maxWaitTimeBeforeKill ms ago,
    // proceed with kill
    if (time + waitTimeBeforeKill < getClock().getTime()) {
      ContainerStatus status =
        SchedulerUtils.createPreemptedContainerStatus(
          container.getContainerId(), SchedulerUtils.PREEMPTED_CONTAINER);

      recoverResourceRequestForContainer(container);
      // TODO: Not sure if this ever actually adds this to the list of cleanup
      // containers on the RMNode (see SchedulerNode.releaseContainer()).
      completedContainer(container, status, RMContainerEventType.KILL);
      LOG.info("Killing container" + container +
          " (after waiting for premption for " +
          (getClock().getTime() - time) + "ms)");
    }
  } else {
    // track the request in the FSAppAttempt itself
    app.addPreemption(container, getClock().getTime());
  }
}
 
Example #26
Source File: FairScheduler.java    From big-c with Apache License 2.0 5 votes vote down vote up
/**
 * Clean up a completed container.
 */
@Override
protected synchronized void completedContainer(RMContainer rmContainer,
    ContainerStatus containerStatus, RMContainerEventType event) {
  if (rmContainer == null) {
    LOG.info("Null container completed...");
    return;
  }

  Container container = rmContainer.getContainer();

  // Get the application for the finished container
  FSAppAttempt application =
      getCurrentAttemptForContainer(container.getId());
  ApplicationId appId =
      container.getId().getApplicationAttemptId().getApplicationId();
  if (application == null) {
    LOG.info("Container " + container + " of" +
        " unknown application attempt " + appId +
        " completed with event " + event);
    return;
  }

  // Get the node on which the container was allocated
  FSSchedulerNode node = getFSSchedulerNode(container.getNodeId());

  if (rmContainer.getState() == RMContainerState.RESERVED) {
    application.unreserve(rmContainer.getReservedPriority(), node);
  } else {
    application.containerCompleted(rmContainer, containerStatus, event);
    node.releaseContainer(container,container.getResource());
    updateRootQueueMetrics();
  }

  LOG.info("Application attempt " + application.getApplicationAttemptId()
      + " released container " + container.getId() + " on node: " + node
      + " with event: " + event);
}
 
Example #27
Source File: FairScheduler.java    From big-c with Apache License 2.0 5 votes vote down vote up
private synchronized void removeNode(RMNode rmNode) {
  FSSchedulerNode node = getFSSchedulerNode(rmNode.getNodeID());
  // This can occur when an UNHEALTHY node reconnects
  if (node == null) {
    return;
  }
  Resources.subtractFrom(clusterResource, rmNode.getTotalCapability());
  updateRootQueueMetrics();

  // Remove running containers
  List<RMContainer> runningContainers = node.getRunningContainers();
  for (RMContainer container : runningContainers) {
    completedContainer(container,
        SchedulerUtils.createAbnormalContainerStatus(
            container.getContainerId(),
            SchedulerUtils.LOST_CONTAINER),
        RMContainerEventType.KILL);
  }

  // Remove reservations, if any
  RMContainer reservedContainer = node.getReservedContainer();
  if (reservedContainer != null) {
    completedContainer(reservedContainer,
        SchedulerUtils.createAbnormalContainerStatus(
            reservedContainer.getContainerId(),
            SchedulerUtils.LOST_CONTAINER),
        RMContainerEventType.KILL);
  }

  nodes.remove(rmNode.getNodeID());
  queueMgr.getRootQueue().setSteadyFairShare(clusterResource);
  queueMgr.getRootQueue().recomputeSteadyShares();
  updateMaximumAllocation(node, false);
  LOG.info("Removed node " + rmNode.getNodeAddress() +
      " cluster capacity: " + clusterResource);
}
 
Example #28
Source File: FairScheduler.java    From big-c with Apache License 2.0 5 votes vote down vote up
/**
 * Process a heartbeat update from a node.
 */
private synchronized void nodeUpdate(RMNode nm) {
  long start = getClock().getTime();
  if (LOG.isDebugEnabled()) {
    LOG.debug("nodeUpdate: " + nm + " cluster capacity: " + clusterResource);
  }
  eventLog.log("HEARTBEAT", nm.getHostName());
  FSSchedulerNode node = getFSSchedulerNode(nm.getNodeID());
  
  List<UpdatedContainerInfo> containerInfoList = nm.pullContainerUpdates();
  List<ContainerStatus> newlyLaunchedContainers = new ArrayList<ContainerStatus>();
  List<ContainerStatus> completedContainers = new ArrayList<ContainerStatus>();
  for(UpdatedContainerInfo containerInfo : containerInfoList) {
    newlyLaunchedContainers.addAll(containerInfo.getNewlyLaunchedContainers());
    completedContainers.addAll(containerInfo.getCompletedContainers());
  } 
  // Processing the newly launched containers
  for (ContainerStatus launchedContainer : newlyLaunchedContainers) {
    containerLaunchedOnNode(launchedContainer, node);
  }

  // Process completed containers
  for (ContainerStatus completedContainer : completedContainers) {
    ContainerId containerId = completedContainer.getContainerId();
    LOG.debug("Container FINISHED: " + containerId);
    completedContainer(getRMContainer(containerId),
        completedContainer, RMContainerEventType.FINISHED);
  }

  if (continuousSchedulingEnabled) {
    if (!completedContainers.isEmpty()) {
      attemptScheduling(node);
    }
  } else {
    attemptScheduling(node);
  }

  long duration = getClock().getTime() - start;
  fsOpDurations.addNodeUpdateDuration(duration);
}
 
Example #29
Source File: FSAppAttempt.java    From big-c with Apache License 2.0 5 votes vote down vote up
synchronized public void containerCompleted(RMContainer rmContainer,
    ContainerStatus containerStatus, RMContainerEventType event) {
  
  Container container = rmContainer.getContainer();
  ContainerId containerId = container.getId();
  
  // Remove from the list of newly allocated containers if found
  newlyAllocatedContainers.remove(rmContainer);
  
  // Inform the container
  rmContainer.handle(
      new RMContainerFinishedEvent(
          containerId,
          containerStatus, 
          event)
      );
  LOG.info("Completed container: " + rmContainer.getContainerId() + 
      " in state: " + rmContainer.getState() + " event:" + event);
  
  // Remove from the list of containers
  liveContainers.remove(rmContainer.getContainerId());

  RMAuditLogger.logSuccess(getUser(), 
      AuditConstants.RELEASE_CONTAINER, "SchedulerApp", 
      getApplicationId(), containerId);
  
  // Update usage metrics 
  Resource containerResource = rmContainer.getContainer().getResource();
  queue.getMetrics().releaseResources(getUser(), 1, containerResource);
  Resources.subtractFrom(currentConsumption, containerResource);

  // remove from preemption map if it is completed
  preemptionMap.remove(rmContainer);

  // Clear resource utilization metrics cache.
  lastMemoryAggregateAllocationUpdateTime = -1;
}
 
Example #30
Source File: AbstractYarnScheduler.java    From big-c with Apache License 2.0 5 votes vote down vote up
protected void releaseContainers(List<ContainerId> containers,
    SchedulerApplicationAttempt attempt) {
  for (ContainerId containerId : containers) {
    RMContainer rmContainer = getRMContainer(containerId);
    if (rmContainer == null) {
      if (System.currentTimeMillis() - ResourceManager.getClusterTimeStamp()
          < nmExpireInterval) {
        LOG.info(containerId + " doesn't exist. Add the container"
            + " to the release request cache as it maybe on recovery.");
        synchronized (attempt) {
          attempt.getPendingRelease().add(containerId);
        }
      } else {
        RMAuditLogger.logFailure(attempt.getUser(),
          AuditConstants.RELEASE_CONTAINER,
          "Unauthorized access or invalid container", "Scheduler",
          "Trying to release container not owned by app or with invalid id.",
          attempt.getApplicationId(), containerId);
      }
    }
    
    LOG.info("Container FINISHED by AbstractYarnScheduler by releaseContainers: " + containerId);
    completedContainer(rmContainer,
      SchedulerUtils.createAbnormalContainerStatus(containerId,
        SchedulerUtils.RELEASED_CONTAINER), RMContainerEventType.RELEASED);
  }
}