Java Code Examples for org.apache.hadoop.yarn.api.records.NodeState#UNHEALTHY

The following examples show how to use org.apache.hadoop.yarn.api.records.NodeState#UNHEALTHY. You can vote up the examples you like or vote down the ones you don't, and go to the original project or source file by following the links above each example. You may also check out related API usage in the sidebar.
Example 1
Source File: RMNodeImpl.java    From hadoop with Apache License 2.0 6 votes vote down vote up
/**
 * Processes a node status event and decides whether the node stays
 * {@link NodeState#UNHEALTHY} or moves to {@link NodeState#RUNNING}.
 *
 * <p>The latest heartbeat response, health report and health-report time
 * carried by the event are always recorded on {@code rmNode}. Only when the
 * reported status is healthy are the scheduler and the nodes-list manager
 * notified and the metrics updated.
 *
 * @param rmNode the node whose state is being updated
 * @param event the incoming event; it is cast to {@link RMNodeStatusEvent}
 * @return {@code RUNNING} if the node reports itself healthy, otherwise
 *         {@code UNHEALTHY}
 */
@Override
public NodeState transition(RMNodeImpl rmNode, RMNodeEvent event) {
  final RMNodeStatusEvent heartbeat = (RMNodeStatusEvent) event;

  // Record the most recent heartbeat response and the reported health data.
  rmNode.latestNodeHeartBeatResponse = heartbeat.getLatestResponse();
  final NodeHealthStatus reported = heartbeat.getNodeHealthStatus();
  rmNode.setHealthReport(reported.getHealthReport());
  rmNode.setLastHealthReportTime(reported.getLastHealthReportTime());

  // Still unhealthy: nothing to announce, remain in the current state.
  if (!reported.getIsNodeHealthy()) {
    return NodeState.UNHEALTHY;
  }

  // Healthy again: tell the scheduler first, then the nodes-list manager.
  rmNode.context.getDispatcher().getEventHandler().handle(
      new NodeAddedSchedulerEvent(rmNode));
  rmNode.context.getDispatcher().getEventHandler().handle(
      new NodesListManagerEvent(
          NodesListManagerEventType.NODE_USABLE, rmNode));

  // NOTE(review): open question carried over from the original code --
  // should the metrics be updated before the notifications above, since
  // listeners are likely to query them as soon as they are notified?
  rmNode.updateMetricsForRejoinedNode(NodeState.UNHEALTHY);
  return NodeState.RUNNING;
}
 
Example 2
Source File: MockNodes.java    From hadoop with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a mock {@link RMNode} for tests.
 *
 * <p>When {@code hostName} is {@code null} a default of {@code "host" + hostnum}
 * is used. The default is applied before the node address is derived, so the
 * address and the node id agree on the host name (previously a {@code null}
 * host name produced the address {@code "null:<hostnum>"}).
 *
 * @param rack rack number; the rack name becomes {@code "rack" + rack}
 * @param perNode resource passed through to the mock node
 * @param state state the mock node reports; {@link NodeState#UNHEALTHY}
 *        yields a {@code null} health report, any other state "HealthyMe"
 * @param httpAddr HTTP address passed through to the mock node
 * @param hostnum host number, used in the default host name, in the node
 *        address and as the mock node's numeric id argument
 * @param hostName host name, or {@code null} to use the default
 * @param port port used to create the {@link NodeId}
 * @param labels node labels passed through to the mock node
 * @return the newly created mock node
 */
private static RMNode buildRMNode(int rack, final Resource perNode,
    NodeState state, String httpAddr, int hostnum, String hostName, int port,
    Set<String> labels) {
  final String rackName = "rack"+ rack;
  final int nid = hostnum;
  // Resolve the default host name BEFORE it is used to build the address.
  if (hostName == null) {
    hostName = "host"+ nid;
  }
  final String nodeAddr = hostName + ":" + nid;
  final NodeId nodeID = NodeId.newInstance(hostName, port);

  final String httpAddress = httpAddr;
  String healthReport = (state == NodeState.UNHEALTHY) ? null : "HealthyMe";
  return new MockRMNodeImpl(nodeID, nodeAddr, httpAddress, perNode,
      rackName, healthReport, 0, nid, hostName, state, labels);
}
 
Example 3
Source File: TestResourceTrackerService.java    From hadoop with Apache License 2.0 6 votes vote down vote up
/**
 * Polls the state of {@code nm1} in the resource manager and then verifies
 * both the node state and the cluster-wide unhealthy-node metric.
 *
 * <p>Polling continues (up to 20 waits of 100 ms each) for as long as
 * "node is not UNHEALTHY" equals {@code health}. With {@code health == true}
 * this therefore waits for the node to become UNHEALTHY; with
 * {@code health == false} it waits for the node to leave UNHEALTHY.
 *
 * @param rm resource manager whose node map is inspected
 * @param nm1 node manager whose node id is looked up
 * @param health flag compared against "node is not UNHEALTHY" (see above)
 * @param count expected value of the unhealthy node-manager metric
 * @throws Exception if waiting is interrupted or the lookup fails
 */
private void checkUnealthyNMCount(MockRM rm, MockNM nm1, boolean health,
    int count) throws Exception {
  final int maxWaits = 20;
  for (int attempt = 0; attempt < maxWaits; attempt++) {
    final NodeState current =
        rm.getRMContext().getRMNodes().get(nm1.getNodeId()).getState();
    final boolean notUnhealthy = current != NodeState.UNHEALTHY;
    if (notUnhealthy != health) {
      // Node reached the awaited state; stop polling.
      break;
    }
    synchronized (this) {
      wait(100);
    }
  }

  // Re-read the state after polling and fail if it never changed.
  final NodeState finalState =
      rm.getRMContext().getRMNodes().get(nm1.getNodeId()).getState();
  final boolean stillNotUnhealthy = finalState != NodeState.UNHEALTHY;
  Assert.assertFalse(stillNotUnhealthy == health);
  Assert.assertEquals("Unhealthy metrics not incremented", count,
      ClusterMetrics.getMetrics().getUnhealthyNMs());
}
 
Example 4
Source File: RMNodeImpl.java    From big-c with Apache License 2.0 6 votes vote down vote up
/**
 * Handles a status event for a node and returns the state it should be in
 * afterwards: {@link NodeState#RUNNING} when the node reports itself
 * healthy, {@link NodeState#UNHEALTHY} otherwise.
 *
 * <p>Regardless of the outcome, the heartbeat response, the health report
 * and the time of the health report are copied from the event onto
 * {@code rmNode}.
 *
 * @param rmNode the node being transitioned
 * @param event the triggering event, cast to {@link RMNodeStatusEvent}
 * @return the resulting node state
 */
@Override
public NodeState transition(RMNodeImpl rmNode, RMNodeEvent event) {
  final RMNodeStatusEvent nodeStatus = (RMNodeStatusEvent) event;

  // Keep the newest heartbeat response and health information on the node.
  rmNode.latestNodeHeartBeatResponse = nodeStatus.getLatestResponse();
  final NodeHealthStatus health = nodeStatus.getNodeHealthStatus();
  rmNode.setHealthReport(health.getHealthReport());
  rmNode.setLastHealthReportTime(health.getLastHealthReportTime());

  final boolean healthy = health.getIsNodeHealthy();
  if (!healthy) {
    // No recovery reported; the node remains unhealthy.
    return NodeState.UNHEALTHY;
  }

  // Recovery: announce the node to the scheduler, then mark it usable.
  rmNode.context.getDispatcher().getEventHandler().handle(
      new NodeAddedSchedulerEvent(rmNode));
  rmNode.context.getDispatcher().getEventHandler().handle(
      new NodesListManagerEvent(
          NodesListManagerEventType.NODE_USABLE, rmNode));

  // NOTE(review): the original author asked whether metrics should be
  // updated before the notifications, because notified parties will very
  // likely query them right away. The ordering is left as it was.
  rmNode.updateMetricsForRejoinedNode(NodeState.UNHEALTHY);
  return NodeState.RUNNING;
}
 
Example 5
Source File: MockNodes.java    From big-c with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a mock {@link RMNode} for use in tests.
 *
 * <p>A {@code null} {@code hostName} is replaced by {@code "host" + hostnum}.
 * This replacement now happens before the node address is composed, so a
 * {@code null} host name no longer leaks into the address as the literal
 * text {@code "null"}.
 *
 * @param rack rack number, turned into the rack name {@code "rack" + rack}
 * @param perNode resource handed to the mock node
 * @param state node state; for {@link NodeState#UNHEALTHY} the health report
 *        is {@code null}, for every other state it is "HealthyMe"
 * @param httpAddr HTTP address handed to the mock node
 * @param hostnum host number used for the default host name, the node
 *        address suffix and the mock node's numeric id argument
 * @param hostName host name, or {@code null} for the default
 * @param port port for the created {@link NodeId}
 * @param labels node labels handed to the mock node
 * @return the mock node
 */
private static RMNode buildRMNode(int rack, final Resource perNode,
    NodeState state, String httpAddr, int hostnum, String hostName, int port,
    Set<String> labels) {
  final String rackName = "rack"+ rack;
  final int nid = hostnum;
  // Default the host name first so every derived value sees the same name.
  if (hostName == null) {
    hostName = "host"+ nid;
  }
  final String nodeAddr = hostName + ":" + nid;
  final NodeId nodeID = NodeId.newInstance(hostName, port);

  final String httpAddress = httpAddr;
  String healthReport = (state == NodeState.UNHEALTHY) ? null : "HealthyMe";
  return new MockRMNodeImpl(nodeID, nodeAddr, httpAddress, perNode,
      rackName, healthReport, 0, nid, hostName, state, labels);
}
 
Example 6
Source File: TestResourceTrackerService.java    From big-c with Apache License 2.0 6 votes vote down vote up
/**
 * Waits for the node behind {@code nm1} to change state, then asserts on
 * the node state and on the unhealthy node-manager metric.
 *
 * <p>The wait loop runs while "state is not UNHEALTHY" equals
 * {@code health}, pausing 100 ms per round for at most 20 rounds. After the
 * loop the same condition is asserted to be false, and the metric reported
 * by {@code ClusterMetrics} is asserted to equal {@code count}.
 *
 * @param rm resource manager providing the node map
 * @param nm1 node manager whose node is checked
 * @param health value compared against "state is not UNHEALTHY"
 * @param count expected number of unhealthy node managers
 * @throws Exception if the wait is interrupted or a lookup fails
 */
private void checkUnealthyNMCount(MockRM rm, MockNM nm1, boolean health,
    int count) throws Exception {
  int roundsLeft = 20;
  while (roundsLeft > 0) {
    final boolean healthyNow =
        rm.getRMContext().getRMNodes().get(nm1.getNodeId()).getState()
            != NodeState.UNHEALTHY;
    if (healthyNow != health) {
      // Awaited state change observed.
      break;
    }
    roundsLeft--;
    synchronized (this) {
      wait(100);
    }
  }

  // Verify against a fresh read of the node state.
  final boolean healthyAtEnd =
      rm.getRMContext().getRMNodes().get(nm1.getNodeId()).getState()
          != NodeState.UNHEALTHY;
  Assert.assertFalse(healthyAtEnd == health);
  Assert.assertEquals("Unhealthy metrics not incremented", count,
      ClusterMetrics.getMetrics().getUnhealthyNMs());
}
 
Example 7
Source File: NodesPage.java    From hadoop with Apache License 2.0 4 votes vote down vote up
/**
 * Renders the nodes table: one header row followed by one row per node
 * that passes the state and label filters.
 *
 * <p>Filters come from {@code $(NODE_STATE)} and {@code $(NODE_LABEL, ...)}
 * (presumably page/request parameters -- confirm against the view base
 * class). With no state filter, UNHEALTHY nodes are skipped. With a state
 * filter of DECOMMISSIONED, LOST or REBOOTED the rows are taken from the
 * inactive-node map instead of the active one.
 *
 * @param html the block the table is written into
 */
@Override
protected void render(Block html) {
  html._(MetricsOverviewTable.class);

  ResourceScheduler sched = rm.getResourceScheduler();
  String type = $(NODE_STATE);
  // Label filter defaults to ANY when the parameter is absent.
  String labelFilter = $(NODE_LABEL, CommonNodeLabelsManager.ANY).trim();
  // Header row; the cells written per node below follow this column order.
  TBODY<TABLE<Hamlet>> tbody =
      html.table("#nodes").thead().tr()
          .th(".nodelabels", "Node Labels")
          .th(".rack", "Rack")
          .th(".state", "Node State")
          .th(".nodeaddress", "Node Address")
          .th(".nodehttpaddress", "Node HTTP Address")
          .th(".lastHealthUpdate", "Last health-update")
          .th(".healthReport", "Health-report")
          .th(".containers", "Containers")
          .th(".mem", "Mem Used")
          .th(".mem", "Mem Avail")
          .th(".vcores", "VCores Used")
          .th(".vcores", "VCores Avail")
          .th(".gcores", "GCores Used")
          .th(".gcores", "GCores Avail")
          .th(".nodeManagerVersion", "Version")._()._().tbody();
  NodeState stateFilter = null;
  if (type != null && !type.isEmpty()) {
    // NOTE(review): Enum valueOf throws IllegalArgumentException for a name
    // that is not a NodeState constant; nothing here catches it.
    stateFilter = NodeState.valueOf(StringUtils.toUpperCase(type));
  }
  // Start from the active nodes; switched to inactive nodes below if needed.
  Collection<RMNode> rmNodes = this.rm.getRMContext().getRMNodes().values();
  boolean isInactive = false;
  if (stateFilter != null) {
    switch (stateFilter) {
    case DECOMMISSIONED:
    case LOST:
    case REBOOTED:
      rmNodes = this.rm.getRMContext().getInactiveRMNodes().values();
      isInactive = true;
      break;
    default:
      // NOTE(review): this branch is reached for every other state filter,
      // so the wording of the message may be misleading.
      LOG.debug("Unexpected state filter for inactive RM node");
    }
  }
  for (RMNode ni : rmNodes) {
    if (stateFilter != null) {
      // Explicit filter: keep only nodes in exactly that state.
      NodeState state = ni.getState();
      if (!stateFilter.equals(state)) {
        continue;
      }
    } else {
      // No filter. User is asking for all nodes. Make sure you skip the
      // unhealthy nodes.
      if (ni.getState() == NodeState.UNHEALTHY) {
        continue;
      }
    }
    // Besides state, we need to filter label as well.
    if (!labelFilter.equals(RMNodeLabelsManager.ANY)) {
      if (labelFilter.isEmpty()) {
        // Empty label filter means only shows nodes without label
        if (!ni.getNodeLabels().isEmpty()) {
          continue;
        }
      } else if (!ni.getNodeLabels().contains(labelFilter)) {
        // Only nodes have given label can show on web page.
        continue;
      }
    }
    NodeInfo info = new NodeInfo(ni, sched);
    // Memory figures are narrowed to int here and scaled by BYTES_IN_MB
    // when displayed below.
    int usedMemory = (int) info.getUsedMemory();
    int availableMemory = (int) info.getAvailableMemory();
    TR<TBODY<TABLE<Hamlet>>> row =
        tbody.tr().td(StringUtils.join(",", info.getNodeLabels()))
            .td(info.getRack()).td(info.getState()).td(info.getNodeId());
    if (isInactive) {
      // Inactive nodes get a placeholder instead of an HTTP link.
      row.td()._("N/A")._();
    } else {
      String httpAddress = info.getNodeHTTPAddress();
      row.td().a("//" + httpAddress, httpAddress)._();
    }
    // Each of the time/memory cells carries the raw number in the title of
    // a <br> element next to the formatted text (presumably so the table
    // can sort on the raw value -- TODO confirm).
    row.td().br().$title(String.valueOf(info.getLastHealthUpdate()))._()
        ._(Times.format(info.getLastHealthUpdate()))._()
        .td(info.getHealthReport())
        .td(String.valueOf(info.getNumContainers())).td().br()
        .$title(String.valueOf(usedMemory))._()
        ._(StringUtils.byteDesc(usedMemory * BYTES_IN_MB))._().td().br()
        .$title(String.valueOf(availableMemory))._()
        ._(StringUtils.byteDesc(availableMemory * BYTES_IN_MB))._()
        .td(String.valueOf(info.getUsedVirtualCores()))
        .td(String.valueOf(info.getAvailableVirtualCores()))
        .td(String.valueOf(info.getUsedGpuCores()))
        .td(String.valueOf(info.getAvailableGpuCores()))
        .td(ni.getNodeManagerVersion())._();
  }
  // Close the table body and the table.
  tbody._()._();
}
 
Example 8
Source File: RMNodeImpl.java    From hadoop with Apache License 2.0 4 votes vote down vote up
/**
 * Handles a status event for a node and returns its next state.
 *
 * <p>The heartbeat response and health details from the event are always
 * stored on {@code rmNode}. If the node reports itself unhealthy, its
 * pending update queue is cleared, the scheduler and nodes-list manager are
 * notified, metrics are updated and {@link NodeState#UNHEALTHY} is
 * returned. Otherwise container statuses are processed, a scheduler update
 * is sent if {@code nextHeartBeat} was set, and {@link NodeState#RUNNING}
 * is returned.
 *
 * @param rmNode the node being updated
 * @param event the triggering event, cast to {@link RMNodeStatusEvent}
 * @return {@code UNHEALTHY} if the node reports itself unhealthy,
 *         otherwise {@code RUNNING}
 */
@Override
public NodeState transition(RMNodeImpl rmNode, RMNodeEvent event) {

  RMNodeStatusEvent statusEvent = (RMNodeStatusEvent) event;

  // Switch the last heartbeatresponse.
  rmNode.latestNodeHeartBeatResponse = statusEvent.getLatestResponse();

  NodeHealthStatus remoteNodeHealthStatus = 
      statusEvent.getNodeHealthStatus();
  rmNode.setHealthReport(remoteNodeHealthStatus.getHealthReport());
  rmNode.setLastHealthReportTime(
      remoteNodeHealthStatus.getLastHealthReportTime());
  if (!remoteNodeHealthStatus.getIsNodeHealthy()) {
    LOG.info("Node " + rmNode.nodeId + " reported UNHEALTHY with details: "
        + remoteNodeHealthStatus.getHealthReport());
    // Drop any updates still queued for this node.
    rmNode.nodeUpdateQueue.clear();
    // Inform the scheduler
    rmNode.context.getDispatcher().getEventHandler().handle(
        new NodeRemovedSchedulerEvent(rmNode));
    // Then mark the node unusable for the nodes-list manager.
    rmNode.context.getDispatcher().getEventHandler().handle(
        new NodesListManagerEvent(
            NodesListManagerEventType.NODE_UNUSABLE, rmNode));
    // Update metrics
    // rmNode.getState() is read here, before this method returns the new
    // state, and passed together with the target state UNHEALTHY.
    rmNode.updateMetricsForDeactivatedNode(rmNode.getState(),
        NodeState.UNHEALTHY);
    return NodeState.UNHEALTHY;
  }

  // Healthy path: process the container statuses carried by the event.
  rmNode.handleContainerStatus(statusEvent.getContainers());

  // Send a scheduler update only if the flag was set, and reset the flag.
  if(rmNode.nextHeartBeat) {
    rmNode.nextHeartBeat = false;
    rmNode.context.getDispatcher().getEventHandler().handle(
        new NodeUpdateSchedulerEvent(rmNode));
  }

  // Update DTRenewer in secure mode to keep these apps alive. Today this is
  // needed for log-aggregation to finish long after the apps are gone.
  if (UserGroupInformation.isSecurityEnabled()) {
    rmNode.context.getDelegationTokenRenewer().updateKeepAliveApplications(
      statusEvent.getKeepAliveAppIds());
  }

  return NodeState.RUNNING;
}
 
Example 9
Source File: NodesPage.java    From big-c with Apache License 2.0 4 votes vote down vote up
/**
 * Writes the nodes table into {@code html}: a header row plus one row for
 * every node that passes the state and label filters.
 *
 * <p>The filters are read via {@code $(NODE_STATE)} and
 * {@code $(NODE_LABEL, ...)} (presumably page/request parameters -- confirm
 * against the view base class). Without a state filter, UNHEALTHY nodes are
 * left out. A state filter of DECOMMISSIONED, LOST or REBOOTED makes the
 * method list the inactive-node map rather than the active one.
 *
 * @param html the block the table is written into
 */
@Override
protected void render(Block html) {
  html._(MetricsOverviewTable.class);

  ResourceScheduler sched = rm.getResourceScheduler();
  String type = $(NODE_STATE);
  // Label filter defaults to ANY when the parameter is absent.
  String labelFilter = $(NODE_LABEL, CommonNodeLabelsManager.ANY).trim();
  // Header row; the cells written per node below follow this column order.
  TBODY<TABLE<Hamlet>> tbody =
      html.table("#nodes").thead().tr()
          .th(".nodelabels", "Node Labels")
          .th(".rack", "Rack")
          .th(".state", "Node State")
          .th(".nodeaddress", "Node Address")
          .th(".nodehttpaddress", "Node HTTP Address")
          .th(".lastHealthUpdate", "Last health-update")
          .th(".healthReport", "Health-report")
          .th(".containers", "Containers")
          .th(".mem", "Mem Used")
          .th(".mem", "Mem Avail")
          .th(".vcores", "VCores Used")
          .th(".vcores", "VCores Avail")
          .th(".nodeManagerVersion", "Version")._()._().tbody();
  NodeState stateFilter = null;
  if (type != null && !type.isEmpty()) {
    // NOTE(review): Enum valueOf throws IllegalArgumentException for a name
    // that is not a NodeState constant; nothing here catches it.
    stateFilter = NodeState.valueOf(StringUtils.toUpperCase(type));
  }
  // Start from the active nodes; switched to inactive nodes below if needed.
  Collection<RMNode> rmNodes = this.rm.getRMContext().getRMNodes().values();
  boolean isInactive = false;
  if (stateFilter != null) {
    switch (stateFilter) {
    case DECOMMISSIONED:
    case LOST:
    case REBOOTED:
      rmNodes = this.rm.getRMContext().getInactiveRMNodes().values();
      isInactive = true;
      break;
    default:
      // NOTE(review): this branch is reached for every other state filter,
      // so the wording of the message may be misleading.
      LOG.debug("Unexpected state filter for inactive RM node");
    }
  }
  for (RMNode ni : rmNodes) {
    if (stateFilter != null) {
      // Explicit filter: keep only nodes in exactly that state.
      NodeState state = ni.getState();
      if (!stateFilter.equals(state)) {
        continue;
      }
    } else {
      // No filter. User is asking for all nodes. Make sure you skip the
      // unhealthy nodes.
      if (ni.getState() == NodeState.UNHEALTHY) {
        continue;
      }
    }
    // Besides state, we need to filter label as well.
    if (!labelFilter.equals(RMNodeLabelsManager.ANY)) {
      if (labelFilter.isEmpty()) {
        // Empty label filter means only shows nodes without label
        if (!ni.getNodeLabels().isEmpty()) {
          continue;
        }
      } else if (!ni.getNodeLabels().contains(labelFilter)) {
        // Only nodes have given label can show on web page.
        continue;
      }
    }
    NodeInfo info = new NodeInfo(ni, sched);
    // Memory figures are narrowed to int here and scaled by BYTES_IN_MB
    // when displayed below.
    int usedMemory = (int) info.getUsedMemory();
    int availableMemory = (int) info.getAvailableMemory();
    TR<TBODY<TABLE<Hamlet>>> row =
        tbody.tr().td(StringUtils.join(",", info.getNodeLabels()))
            .td(info.getRack()).td(info.getState()).td(info.getNodeId());
    if (isInactive) {
      // Inactive nodes get a placeholder instead of an HTTP link.
      row.td()._("N/A")._();
    } else {
      String httpAddress = info.getNodeHTTPAddress();
      row.td().a("//" + httpAddress, httpAddress)._();
    }
    // Each of the time/memory cells carries the raw number in the title of
    // a <br> element next to the formatted text (presumably so the table
    // can sort on the raw value -- TODO confirm).
    row.td().br().$title(String.valueOf(info.getLastHealthUpdate()))._()
        ._(Times.format(info.getLastHealthUpdate()))._()
        .td(info.getHealthReport())
        .td(String.valueOf(info.getNumContainers())).td().br()
        .$title(String.valueOf(usedMemory))._()
        ._(StringUtils.byteDesc(usedMemory * BYTES_IN_MB))._().td().br()
        .$title(String.valueOf(availableMemory))._()
        ._(StringUtils.byteDesc(availableMemory * BYTES_IN_MB))._()
        .td(String.valueOf(info.getUsedVirtualCores()))
        .td(String.valueOf(info.getAvailableVirtualCores()))
        .td(ni.getNodeManagerVersion())._();
  }
  // Close the table body and the table.
  tbody._()._();
}
 
Example 10
Source File: RMNodeImpl.java    From big-c with Apache License 2.0 4 votes vote down vote up
/**
 * Processes a node status event and returns the node's next state.
 *
 * <p>The heartbeat response, health report and health-report time from the
 * event are recorded on {@code rmNode} in every case. An unhealthy report
 * clears the node's update queue, notifies the scheduler and the nodes-list
 * manager, updates metrics and yields {@link NodeState#UNHEALTHY}. A
 * healthy report processes container statuses, optionally sends a
 * scheduler update (when {@code nextHeartBeat} is set) and yields
 * {@link NodeState#RUNNING}.
 *
 * @param rmNode the node being updated
 * @param event the triggering event, cast to {@link RMNodeStatusEvent}
 * @return {@code UNHEALTHY} if the node reports itself unhealthy,
 *         otherwise {@code RUNNING}
 */
@Override
public NodeState transition(RMNodeImpl rmNode, RMNodeEvent event) {

  RMNodeStatusEvent statusEvent = (RMNodeStatusEvent) event;

  // Switch the last heartbeatresponse.
  rmNode.latestNodeHeartBeatResponse = statusEvent.getLatestResponse();

  NodeHealthStatus remoteNodeHealthStatus = 
      statusEvent.getNodeHealthStatus();
  rmNode.setHealthReport(remoteNodeHealthStatus.getHealthReport());
  rmNode.setLastHealthReportTime(
      remoteNodeHealthStatus.getLastHealthReportTime());
  if (!remoteNodeHealthStatus.getIsNodeHealthy()) {
    LOG.info("Node " + rmNode.nodeId + " reported UNHEALTHY with details: "
        + remoteNodeHealthStatus.getHealthReport());
    // Drop any updates still queued for this node.
    rmNode.nodeUpdateQueue.clear();
    // Inform the scheduler
    rmNode.context.getDispatcher().getEventHandler().handle(
        new NodeRemovedSchedulerEvent(rmNode));
    // Then mark the node unusable for the nodes-list manager.
    rmNode.context.getDispatcher().getEventHandler().handle(
        new NodesListManagerEvent(
            NodesListManagerEventType.NODE_UNUSABLE, rmNode));
    // Update metrics
    // rmNode.getState() is read here, before this method returns the new
    // state, and passed together with the target state UNHEALTHY.
    rmNode.updateMetricsForDeactivatedNode(rmNode.getState(),
        NodeState.UNHEALTHY);
    return NodeState.UNHEALTHY;
  }

  // Healthy path: process the container statuses carried by the event.
  rmNode.handleContainerStatus(statusEvent.getContainers());

  // Send a scheduler update only if the flag was set, and reset the flag.
  if(rmNode.nextHeartBeat) {
    rmNode.nextHeartBeat = false;
    rmNode.context.getDispatcher().getEventHandler().handle(
        new NodeUpdateSchedulerEvent(rmNode));
  }

  // Update DTRenewer in secure mode to keep these apps alive. Today this is
  // needed for log-aggregation to finish long after the apps are gone.
  if (UserGroupInformation.isSecurityEnabled()) {
    rmNode.context.getDelegationTokenRenewer().updateKeepAliveApplications(
      statusEvent.getKeepAliveAppIds());
  }

  return NodeState.RUNNING;
}