org.apache.kafka.common.metrics.stats.Total Java Examples

The following examples show how to use org.apache.kafka.common.metrics.stats.Total. They are drawn from open source projects; the source file, originating project, and license are noted above each example.
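All of the examples follow the same basic pattern: obtain a Metrics registry, create a Sensor, attach a Total stat under a MetricName, and record values against the sensor. The sketch below is a minimal, self-contained illustration of that pattern; the metric group, tag, and sensor names are made up for the example. Note that in newer Kafka clients (roughly 2.4 onward) Total is deprecated in favor of org.apache.kafka.common.metrics.stats.CumulativeSum, which behaves the same way.

import java.util.Collections;
import java.util.Map;

import org.apache.kafka.common.MetricName;
import org.apache.kafka.common.metrics.Metrics;
import org.apache.kafka.common.metrics.Sensor;
import org.apache.kafka.common.metrics.stats.Total;

public class TotalExample {
  public static void main(String[] args) {
    try (Metrics metrics = new Metrics()) {
      Map<String, String> tags = Collections.singletonMap("service", "example");
      Sensor sensor = metrics.sensor("messages-processed");

      MetricName totalName = metrics.metricName("messages-processed-total", "example-group",
          "The total number of messages processed", tags);
      sensor.add(totalName, new Total());   // Total accumulates every value recorded on the sensor

      sensor.record(3.0);
      sensor.record(2.0);

      // Prints 5.0: the running sum of all recorded values
      System.out.println(metrics.metrics().get(totalName).metricValue());
    }
  }
}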
Example #1
Source File: ConsumerCollector.java    From ksql-fork-with-deep-learning-function with Apache License 2.0
private List<TopicSensors.SensorMetric<ConsumerRecord>> buildSensors(String key) {

    List<TopicSensors.SensorMetric<ConsumerRecord>> sensors = new ArrayList<>();

    // Note: synchronized due to metrics registry not handling concurrent add/check-exists
    // activity in a reliable way
    synchronized (this.metrics) {
      addSensor(key, "consumer-messages-per-sec", new Rate(), sensors, false);
      addSensor(key, "consumer-total-messages", new Total(), sensors, false);
      addSensor(key, "consumer-failed-messages", new Total(), sensors, true);
      addSensor(key, "consumer-total-message-bytes", new Total(), sensors, false,
          (r) -> {
            if (r == null) {
              return 0.0;
            } else {
              return ((double) r.serializedValueSize() + r.serializedKeySize());
            }
          });
      addSensor(key, "failed-messages-per-sec", new Rate(), sensors, true);
    }
    return sensors;
  }
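The addSensor helper used above is defined elsewhere in ConsumerCollector and is not part of this snippet. Stripped of the project-specific TopicSensors.SensorMetric wrapper, the registration it performs is presumably just the standard Kafka Metrics pattern; the sketch below is an assumption about that underlying step (with an illustrative metric group and tag), not the actual ksql-fork code.

  // Hypothetical sketch of the registration step behind addSensor; the real helper also
  // wraps the sensor and metric in a TopicSensors.SensorMetric for later recording.
  private Sensor registerStat(String key, String name, MeasurableStat stat) {
    Sensor sensor = metrics.sensor(key + "-" + name);
    sensor.add(new MetricName(name, "consumer-metrics", "", Collections.singletonMap("key", key)), stat);
    return sensor;
  }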
 
Example #2
Source File: TaskJmxReporter.java    From mirus with BSD 3-Clause "New" or "Revised" License
private void ensureMetricsCreated(ConnectorTaskId taskId) {
  Map<String, String> tags = getTaskLevelTags(taskId);
  MetricName taskMetric =
      getMetric(
          FAILED_TASK_ATTEMPTS_METRIC_NAME + "-count",
          TASK_CONNECTOR_JMX_GROUP_NAME,
          "count of restart attempts to a failed task",
          taskLevelJmxTags,
          tags);

  if (!metrics.metrics().containsKey(taskMetric)) {
    Sensor sensor = getSensor(taskId.toString());
    sensor.add(taskMetric, new Total());
    logger.info("Added the task {} to the list of JMX metrics", taskId);
    logger.debug("Updated set of JMX metrics is {}", metrics.metrics());
  }
}
 
Example #3
Source File: ConsumeService.java    From kafka-monitor with Apache License 2.0
@Override
public synchronized void start() {
  if (_running.compareAndSet(false, true)) {
    _consumeThread.start();
    LOG.info("{}/ConsumeService started.", _name);

    Sensor topicPartitionCount = metrics.sensor("topic-partitions");
    DescribeTopicsResult describeTopicsResult = _adminClient.describeTopics(Collections.singleton(_topic));
    Map<String, KafkaFuture<TopicDescription>> topicResultValues = describeTopicsResult.values();
    KafkaFuture<TopicDescription> topicDescriptionKafkaFuture = topicResultValues.get(_topic);
    TopicDescription topicDescription = null;
    try {
      topicDescription = topicDescriptionKafkaFuture.get();
    } catch (InterruptedException | ExecutionException e) {
      LOG.error("Exception occurred while getting the topicDescriptionKafkaFuture for topic: {}", _topic, e);
    }
    @SuppressWarnings("ConstantConditions")
    double partitionCount = topicDescription.partitions().size();
    topicPartitionCount.add(
        new MetricName("topic-partitions-count", METRIC_GROUP_NAME, "The total number of partitions for the topic.", tags), new Total(partitionCount));
  }
}
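Unlike most of the other examples, this one uses the Total(double) constructor to seed the stat with an initial value (the partition count) instead of starting at zero and recording increments. A minimal sketch of that behavior, with illustrative metric and group names and with metrics and tags assumed to be in scope:

Sensor partitions = metrics.sensor("topic-partitions");
partitions.add(metrics.metricName("topic-partitions-count", "example-group", tags), new Total(12.0));
// The metric reports 12.0 immediately; any later sensor.record(n) calls would add to it.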
 
Example #4
Source File: CommitAvailabilityMetrics.java    From kafka-monitor with Apache License 2.0
/**
 * Metrics for calculating the offset commit availability of a consumer.
 * @param metrics the commit offset metrics
 * @param tags the associated tags, e.g. kmf.services:name=single-cluster-monitor
 */
public CommitAvailabilityMetrics(final Metrics metrics, final Map<String, String> tags) {
  LOG.info("{} called.", this.getClass().getSimpleName());
  _offsetsCommitted = metrics.sensor("offsets-committed");
  _offsetsCommitted.add(new MetricName("offsets-committed-total", METRIC_GROUP_NAME,
      "The total number of offsets per second that are committed.", tags), new Total());

  _failedCommitOffsets = metrics.sensor("failed-commit-offsets");
  _failedCommitOffsets.add(new MetricName("failed-commit-offsets-avg", METRIC_GROUP_NAME,
      "The average number of offsets per second that have failed.", tags), new Rate());
  _failedCommitOffsets.add(new MetricName("failed-commit-offsets-total", METRIC_GROUP_NAME,
      "The total number of offsets per second that have failed.", tags), new Total());

  metrics.addMetric(new MetricName("offsets-committed-avg", METRIC_GROUP_NAME, "The average offset commits availability.", tags),
    (MetricConfig config, long now) -> {
      Object offsetCommitTotal = metrics.metrics().get(metrics.metricName("offsets-committed-total", METRIC_GROUP_NAME, tags)).metricValue();
      Object offsetCommitFailTotal = metrics.metrics().get(metrics.metricName("failed-commit-offsets-total", METRIC_GROUP_NAME, tags)).metricValue();
      if (offsetCommitTotal != null && offsetCommitFailTotal != null) {
        double offsetsCommittedCount = (double) offsetCommitTotal;
        double offsetsCommittedErrorCount = (double) offsetCommitFailTotal;
        return offsetsCommittedCount / (offsetsCommittedCount + offsetsCommittedErrorCount);
      } else {
        return 0;
      }
    });
}
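The offsets-committed and failed-commit-offsets sensors declared here are recorded from the consumer's offset-commit callback elsewhere in kafka-monitor. A rough usage sketch of how the Total-backed metrics get incremented (illustrative, not the exact kafka-monitor code; consumer is assumed to be a KafkaConsumer in scope):

consumer.commitAsync((offsets, exception) -> {
  if (exception == null) {
    _offsetsCommitted.record();        // adds 1 to offsets-committed-total (Total)
  } else {
    _failedCommitOffsets.record();     // adds 1 to failed-commit-offsets-total and feeds the Rate
  }
});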
 
Example #5
Source File: ProducerCollector.java    From ksql-fork-with-deep-learning-function with Apache License 2.0
private List<TopicSensors.SensorMetric<ProducerRecord>> buildSensors(String key) {
  List<TopicSensors.SensorMetric<ProducerRecord>> sensors = new ArrayList<>();

  // Note: synchronized due to metrics registry not handling concurrent add/check-exists
  // activity in a reliable way
  synchronized (metrics) {
    addSensor(key, "messages-per-sec", new Rate(), sensors, false);
    addSensor(key, "total-messages", new Total(), sensors, false);
    addSensor(key, "failed-messages", new Total(), sensors, true);
    addSensor(key, "failed-messages-per-sec", new Rate(), sensors, true);
  }
  return sensors;
}
 
Example #6
Source File: ProduceService.java    From kafka-monitor with Apache License 2.0
public ProduceMetrics(Metrics metrics, final Map<String, String> tags) {
    this.metrics = metrics;
    this.tags = tags;


    _recordsProducedPerPartition = new ConcurrentHashMap<>();
    _produceErrorPerPartition = new ConcurrentHashMap<>();


    recordsProduce = metrics.sensor("records-produced");
    recordsProduce.add(new MetricName("records-produced-total", METRIC_GROUP_NAME, "The total number of records that are produced", tags), new Total());
    errorProduce = metrics.sensor("error-produce");
    errorProduce.add(new MetricName("error-produce-total", METRIC_GROUP_NAME, "", tags), new Total());

    metrics.addMetric(new MetricName("produce-availability-avg", METRIC_GROUP_NAME, "The average produce availability", tags),
            (config, now) -> {
                double availabilitySum = 0.0;
                // Overall availability is the sum of each partition's availability divided by the total number of partitions
                // Per-partition availability is the successful send rate divided by the sum of the success and error rates
                int num = partitionNum.get();

                for (int partition = 0; partition < num; partition++) {
                    double recordsProduced = produceMetrics.metrics.metrics().get(new MetricName("records-produced-rate-partition-" + partition, METRIC_GROUP_NAME, tags)).value();
                    double produceError = produceMetrics.metrics.metrics().get(new MetricName("produce-error-rate-partition-" + partition, METRIC_GROUP_NAME, tags)).value();

                    if (Double.isNaN(produceError) || Double.isInfinite(produceError)) {
                        produceError = 0;
                    }
                    if (recordsProduced + produceError > 0) {
                        availabilitySum += recordsProduced / (recordsProduced + produceError);
                    }
                }
                return availabilitySum / num;
            });


}
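The per-partition metrics read inside the availability gauge (records-produced-rate-partition-N and produce-error-rate-partition-N) are registered per partition elsewhere in the service. The sketch below is an assumption about what that registration plausibly looks like, reusing the fields declared in this constructor; it is not the exact kafka-monitor code.

    // Hypothetical sketch: register the per-partition rate sensors the gauge above reads.
    void addPartitionSensors(int partition) {
        Sensor produceSensor = metrics.sensor("records-produced-partition-" + partition);
        produceSensor.add(new MetricName("records-produced-rate-partition-" + partition, METRIC_GROUP_NAME,
                "The rate of records produced to partition " + partition, tags), new Rate());
        _recordsProducedPerPartition.put(partition, produceSensor);

        Sensor errorSensor = metrics.sensor("produce-error-partition-" + partition);
        errorSensor.add(new MetricName("produce-error-rate-partition-" + partition, METRIC_GROUP_NAME,
                "The rate of produce errors for partition " + partition, tags), new Rate());
        _produceErrorPerPartition.put(partition, errorSensor);
    }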
 
Example #7
Source File: ConnectorJmxReporter.java    From mirus with BSD 3-Clause "New" or "Revised" License
private void ensureMetricsCreated(String connectorName) {

    Map<String, String> connectorTags = getConnectorLevelTags(connectorName);

    MetricName runningMetric =
        getMetric(
            RUNNING_TASK_METRIC_NAME + "-count",
            CONNECTOR_JMX_GROUP_NAME,
            "count of running tasks per connector",
            connectorLevelJmxTags,
            connectorTags);
    MetricName pausedMetric =
        getMetric(
            PAUSED_TASK_METRIC_NAME + "-count",
            CONNECTOR_JMX_GROUP_NAME,
            "count of paused tasks per connector",
            connectorLevelJmxTags,
            connectorTags);
    MetricName failedMetric =
        getMetric(
            FAILED_TASK_METRIC_NAME + "-count",
            CONNECTOR_JMX_GROUP_NAME,
            "count of failed tasks per connector",
            connectorLevelJmxTags,
            connectorTags);
    MetricName unassignedMetric =
        getMetric(
            UNASSIGNED_TASK_METRIC_NAME + "-count",
            CONNECTOR_JMX_GROUP_NAME,
            "count of unassigned tasks per connector",
            connectorLevelJmxTags,
            connectorTags);
    MetricName destroyedMetric =
        getMetric(
            DESTROYED_TASK_METRIC_NAME + "-count",
            CONNECTOR_JMX_GROUP_NAME,
            "count of destroyed tasks per connector",
            connectorLevelJmxTags,
            connectorTags);

    MetricName totalAttemptsPerConnectorMetric =
        getMetric(
            FAILED_TASK_ATTEMPTS_METRIC_NAME + "-count",
            CONNECTOR_JMX_GROUP_NAME,
            "count of failed task restart attempts per connector",
            connectorLevelJmxTags,
            connectorTags);

    MetricName restartAttemptsPerConnectorMetric =
        getMetric(
            FAILED_CONNECTOR_ATTEMPTS_METRIC_NAME + "-count",
            CONNECTOR_JMX_GROUP_NAME,
            "count of failed connector restart attempts per connector",
            connectorLevelJmxTags,
            connectorTags);

    if (!metrics.metrics().containsKey(runningMetric)) {
      metrics
          .sensor(calculateSensorName(allStates.get("RUNNING"), connectorName))
          .add(runningMetric, new Value());
    }
    if (!metrics.metrics().containsKey(pausedMetric)) {
      metrics
          .sensor(calculateSensorName(allStates.get("PAUSED"), connectorName))
          .add(pausedMetric, new Value());
    }

    if (!metrics.metrics().containsKey(failedMetric)) {
      metrics
          .sensor(calculateSensorName(allStates.get("FAILED"), connectorName))
          .add(failedMetric, new Value());
    }
    if (!metrics.metrics().containsKey(unassignedMetric)) {
      metrics
          .sensor(calculateSensorName(allStates.get("UNASSIGNED"), connectorName))
          .add(unassignedMetric, new Value());
    }
    if (!metrics.metrics().containsKey(destroyedMetric)) {
      metrics
          .sensor(calculateSensorName(allStates.get("DESTROYED"), connectorName))
          .add(destroyedMetric, new Value());
    }
    if (!metrics.metrics().containsKey(totalAttemptsPerConnectorMetric)) {
      metrics
          .sensor(FAILED_TASK_ATTEMPTS_METRIC_NAME + connectorName)
          .add(totalAttemptsPerConnectorMetric, new Total());
    }

    if (!metrics.metrics().containsKey(restartAttemptsPerConnectorMetric)) {
      metrics
          .sensor(FAILED_CONNECTOR_ATTEMPTS_METRIC_NAME + connectorName)
          .add(restartAttemptsPerConnectorMetric, new Total());
    }
  }
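This reporter mixes two stat types: Value, which reports only the most recently recorded number (used here for the per-state task counts, which are presumably re-recorded whenever connector state is refreshed), and Total, which keeps a cumulative sum (used for counting restart attempts). A tiny illustration of the difference, with made-up metric names:

  Sensor s = metrics.sensor("illustration");
  s.add(metrics.metricName("last-value", "example-group"), new Value());
  s.add(metrics.metricName("running-total", "example-group"), new Total());
  s.record(3.0);
  s.record(5.0);
  // "last-value" now reports 5.0 (the most recent record); "running-total" reports 8.0 (the sum)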
 
Example #8
Source File: ConsumeMetrics.java    From kafka-monitor with Apache License 2.0
public ConsumeMetrics(final Metrics metrics, Map<String, String> tags, int latencyPercentileMaxMs,
    int latencyPercentileGranularityMs) {

  _bytesConsumed = metrics.sensor("bytes-consumed");
  _bytesConsumed.add(new MetricName("bytes-consumed-rate", METRIC_GROUP_NAME, "The average number of bytes per second that are consumed", tags), new Rate());

  _consumeError = metrics.sensor("consume-error");
  _consumeError.add(new MetricName("consume-error-rate", METRIC_GROUP_NAME, "The average number of errors per second", tags), new Rate());
  _consumeError.add(new MetricName("consume-error-total", METRIC_GROUP_NAME, "The total number of errors", tags), new Total());

  _recordsConsumed = metrics.sensor("records-consumed");
  _recordsConsumed.add(new MetricName("records-consumed-rate", METRIC_GROUP_NAME, "The average number of records per second that are consumed", tags), new Rate());
  _recordsConsumed.add(new MetricName("records-consumed-total", METRIC_GROUP_NAME, "The total number of records that are consumed", tags), new Total());

  _recordsDuplicated = metrics.sensor("records-duplicated");
  _recordsDuplicated.add(new MetricName("records-duplicated-rate", METRIC_GROUP_NAME, "The average number of records per second that are duplicated", tags), new Rate());
  _recordsDuplicated.add(new MetricName("records-duplicated-total", METRIC_GROUP_NAME, "The total number of records that are duplicated", tags), new Total());

  _recordsLost = metrics.sensor("records-lost");
  _recordsLost.add(new MetricName("records-lost-rate", METRIC_GROUP_NAME, "The average number of records per second that are lost", tags), new Rate());
  _recordsLost.add(new MetricName("records-lost-total", METRIC_GROUP_NAME, "The total number of records that are lost", tags), new Total());

  _recordsDelayed = metrics.sensor("records-delayed");
  _recordsDelayed.add(new MetricName("records-delayed-rate", METRIC_GROUP_NAME, "The average number of records per second that are either lost or arrive after maximum allowed latency under SLA", tags), new Rate());
  _recordsDelayed.add(new MetricName("records-delayed-total", METRIC_GROUP_NAME, "The total number of records that are either lost or arrive after maximum allowed latency under SLA", tags), new Total());

  _recordsDelay = metrics.sensor("records-delay");
  _recordsDelay.add(new MetricName("records-delay-ms-avg", METRIC_GROUP_NAME, "The average latency of records from producer to consumer", tags), new Avg());
  _recordsDelay.add(new MetricName("records-delay-ms-max", METRIC_GROUP_NAME, "The maximum latency of records from producer to consumer", tags), new Max());

  // There are 2 extra buckets used for values smaller than 0.0 or larger than max, respectively.
  int bucketNum = latencyPercentileMaxMs / latencyPercentileGranularityMs + 2;
  int sizeInBytes = 4 * bucketNum;
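  // For example, with latencyPercentileMaxMs = 5000 and latencyPercentileGranularityMs = 1,
  // bucketNum is 5002 and the histogram is allotted 20008 bytes (4 bytes per bucket).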
  _recordsDelay.add(new Percentiles(sizeInBytes, latencyPercentileMaxMs, Percentiles.BucketSizing.CONSTANT,
      new Percentile(new MetricName("records-delay-ms-99th", METRIC_GROUP_NAME, "The 99th percentile latency of records from producer to consumer", tags), 99.0),
      new Percentile(new MetricName("records-delay-ms-999th", METRIC_GROUP_NAME, "The 99.9th percentile latency of records from producer to consumer", tags), 99.9),
      new Percentile(new MetricName("records-delay-ms-9999th", METRIC_GROUP_NAME, "The 99.99th percentile latency of records from producer to consumer", tags), 99.99)));

  metrics.addMetric(new MetricName("consume-availability-avg", METRIC_GROUP_NAME, "The average consume availability", tags),
    (config, now) -> {
      double recordsConsumedRate = (double) metrics.metrics().get(metrics.metricName("records-consumed-rate", METRIC_GROUP_NAME, tags)).metricValue();
      double recordsLostRate = (double) metrics.metrics().get(metrics.metricName("records-lost-rate", METRIC_GROUP_NAME, tags)).metricValue();
      double recordsDelayedRate = (double) metrics.metrics().get(metrics.metricName("records-delayed-rate", METRIC_GROUP_NAME, tags)).metricValue();

      if (Double.isNaN(recordsLostRate))
        recordsLostRate = 0;
      if (Double.isNaN(recordsDelayedRate))
        recordsDelayedRate = 0;

      return recordsConsumedRate + recordsLostRate > 0
          ? (recordsConsumedRate - recordsDelayedRate) / (recordsConsumedRate + recordsLostRate) : 0;
    });
}
 
Example #9
Source File: ProduceMetrics.java    From kafka-monitor with Apache License 2.0
public ProduceMetrics(final Metrics metrics, final Map<String, String> tags, int latencyPercentileGranularityMs,
    int latencyPercentileMaxMs, AtomicInteger partitionNumber, boolean treatZeroThroughputAsUnavailable) {
  _metrics = metrics;
  _tags = tags;

  _recordsProducedPerPartition = new ConcurrentHashMap<>();
  _produceErrorPerPartition = new ConcurrentHashMap<>();
  _produceErrorInLastSendPerPartition = new ConcurrentHashMap<>();

  _recordsProduced = metrics.sensor("records-produced");
  _recordsProduced.add(
      new MetricName("records-produced-rate", XinfraMonitorConstants.METRIC_GROUP_NAME_PRODUCE_SERVICE,
          "The average number of records per second that are produced", tags), new Rate());
  _recordsProduced.add(
      new MetricName("records-produced-total", XinfraMonitorConstants.METRIC_GROUP_NAME_PRODUCE_SERVICE,
          "The total number of records that are produced", tags), new Total());

  _produceError = metrics.sensor("produce-error");
  _produceError.add(new MetricName("produce-error-rate", XinfraMonitorConstants.METRIC_GROUP_NAME_PRODUCE_SERVICE,
      "The average number of errors per second", tags), new Rate());
  _produceError.add(new MetricName("produce-error-total", XinfraMonitorConstants.METRIC_GROUP_NAME_PRODUCE_SERVICE,
      "The total number of errors", tags), new Total());

  _produceDelay = metrics.sensor("produce-delay");
  _produceDelay.add(new MetricName("produce-delay-ms-avg", XinfraMonitorConstants.METRIC_GROUP_NAME_PRODUCE_SERVICE,
      "The average delay in ms for produce request", tags), new Avg());
  _produceDelay.add(new MetricName("produce-delay-ms-max", XinfraMonitorConstants.METRIC_GROUP_NAME_PRODUCE_SERVICE,
      "The maximum delay in ms for produce request", tags), new Max());

  // There are 2 extra buckets used for values smaller than 0.0 or larger than max, respectively.
  int bucketNum = latencyPercentileMaxMs / latencyPercentileGranularityMs + 2;
  int sizeInBytes = 4 * bucketNum;
  _produceDelay.add(new Percentiles(sizeInBytes, latencyPercentileMaxMs, Percentiles.BucketSizing.CONSTANT,
      new Percentile(new MetricName("produce-delay-ms-99th", XinfraMonitorConstants.METRIC_GROUP_NAME_PRODUCE_SERVICE,
          "The 99th percentile delay in ms for produce request", tags), 99.0), new Percentile(
      new MetricName("produce-delay-ms-999th", XinfraMonitorConstants.METRIC_GROUP_NAME_PRODUCE_SERVICE,
          "The 99.9th percentile delay in ms for produce request", tags), 99.9), new Percentile(
      new MetricName("produce-delay-ms-9999th", XinfraMonitorConstants.METRIC_GROUP_NAME_PRODUCE_SERVICE,
          "The 99.99th percentile delay in ms for produce request", tags), 99.99)));

  metrics.addMetric(
      new MetricName("produce-availability-avg", XinfraMonitorConstants.METRIC_GROUP_NAME_PRODUCE_SERVICE,
          "The average produce availability", tags), (config, now) -> {
      double availabilitySum = 0.0;
      int partitionNum = partitionNumber.get();
      for (int partition = 0; partition < partitionNum; partition++) {
        double recordsProduced = (double) metrics.metrics()
            .get(metrics.metricName("records-produced-rate-partition-" + partition,
                XinfraMonitorConstants.METRIC_GROUP_NAME_PRODUCE_SERVICE, tags))
            .metricValue();
        double produceError = (double) metrics.metrics()
            .get(metrics.metricName("produce-error-rate-partition-" + partition,
                XinfraMonitorConstants.METRIC_GROUP_NAME_PRODUCE_SERVICE, tags))
            .metricValue();
        // If there is no error, error rate sensor may expire and the value may be NaN. Treat NaN as 0 for error rate.
        if (Double.isNaN(produceError) || Double.isInfinite(produceError)) {
          produceError = 0;
        }
        // If there was any produce attempt (success or failure) to this partition, use its success ratio as its availability.
        if (recordsProduced + produceError > 0) {
          availabilitySum += recordsProduced / (recordsProduced + produceError);
        } else if (!treatZeroThroughputAsUnavailable) {
          // If user configures treatZeroThroughputAsUnavailable to be false, a partition's availability
          // is 1.0 as long as there is no exception thrown from producer.
          // This allows a Kafka admin to monitor exactly the availability experienced by Kafka users, whose
          // producers will block and retry for a certain amount of time based on their configuration (e.g. retries, retry.backoff.ms).
          // Note that if it takes a long time for messages to be retried and sent, the latency in the ConsumeService
          // will increase, which reduces ConsumeAvailability if the latency exceeds consume.latency.sla.ms.
          // If the timeout is set to more than 60 seconds (the current samples window duration),
          // the error sample might expire before the next error can be produced.
          // In order to detect an offline partition with a high producer timeout config, the error status during the
          // last send is also checked before declaring 1.0 availability for the partition.
          Boolean lastSendError = _produceErrorInLastSendPerPartition.get(partition);
          if (lastSendError == null || !lastSendError) {
            availabilitySum += 1.0;
          }
        }
      }

      // Assign equal weight to per-partition availability when calculating overall availability
      return availabilitySum / partitionNum;
    }
  );
}
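As a worked illustration of the availability formula above, with assumed numbers: for three partitions where partition 0 produces 10 records/s with no errors (ratio 1.0), partition 1 produces 8 records/s with 2 errors/s (ratio 0.8), and partition 2 sees no traffic at all, the gauge reports (1.0 + 0.8 + 1.0) / 3 ≈ 0.93 when treatZeroThroughputAsUnavailable is false and the last send to partition 2 did not fail, and (1.0 + 0.8 + 0.0) / 3 = 0.6 otherwise.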