java source code of ChronixRDD

/*
 * Copyright (C) 2016 QAware GmbH
 *
 *    Licensed under the Apache License, Version 2.0 (the "License");
 *    you may not use this file except in compliance with the License.
 *    You may obtain a copy of the License at
 *
 *        http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing, software
 *    distributed under the License is distributed on an "AS IS" BASIS,
 *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *    See the License for the specific language governing permissions and
 *    limitations under the License.
 */
package de.qaware.chronix.spark.api.java;

import de.qaware.chronix.spark.api.java.functions.DimensionFilterFunction;
import de.qaware.chronix.storage.solr.timeseries.metric.*;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.math3.stat.regression.SimpleRegression;
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.DoubleFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import scala.Tuple2;
import scala.reflect.ClassTag;
import scala.reflect.ClassTag$;

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

/**
 * ChronixRDD: A time series implementation based on Chronix.
 * <p>
 * The RDD represents a set of time series - the unit of parallelization
 * is one time series. A ChronixRDD can be created by and is bound to
 * a ChronixSparkContext.
 * <p>
 * The RDD contains time series specific actions and transformations. For
 * query purposes please use ChronixSparkContext as only the Context can
 * perform predicate / aggregation pushdown right now.
 *
 * @see ChronixSparkContext
 */
public class ChronixRDD extends JavaRDD<MetricTimeSeries> {

    private static ClassTag MTS_TYPE = ClassTag$.MODULE$.apply(MetricTimeSeries.class);

    public ChronixRDD(JavaRDD<MetricTimeSeries> tsRdd) {
        super(tsRdd.rdd(), MTS_TYPE);
    }

    /**
     * Action: Return iterator on contained time series.
     *
     * @return iterator on time series
     */
    public Iterator<MetricTimeSeries> iterator() {
        return this.collect().iterator();
    }


    /**
     * Action: Compute the mean value.
     *
     * @return mean value over all contained time series.
     */
    public double mean() {
        return getValuesAsRdd().mean();
    }

    /**
     * Action: Compute the approximate mean value.
     *
     * @return approximate mean value over all contained time series.
     */
    public double approxMean() {
        return getValuesAsRdd().meanApprox(1000).getFinalValue().mean();
    }

    /**
     * Action: Compute the min value.
     *
     * @return min value over all contained time series.
     */
    public double min() {
        return getValuesAsRdd().min();
    }

    /**
     * Action: Compute the max value.
     *
     * @return max value over all contained time series.
     */
    public double max() {
        return getValuesAsRdd().max();
    }


    /**
     * Action: Calculates the slope of a linear regression of every time series.
     *
     * Where: value = slope * timestamp
     * .. or:     y = slope * x
     *
     * @return the slopes (simple linear regression) of each an every time series in the RDD
     */
    public JavaDoubleRDD getSlopes() {
        return this.mapToDouble((DoubleFunction<MetricTimeSeries>) mts -> {
                    SimpleRegression regression = new SimpleRegression();
            mts.points().forEach(p -> regression.addData(p.getTimestamp(), p.getValue()));
                    return regression.getSlope();
                }
        );
    }

    /**
     * Transformation: Filters the contained time series according the given predicate.
     *
     * @param filter the predicate
     * @return the time series matching the predicate
     */
    public ChronixRDD filterTimeSeries(Function<MetricTimeSeries, Boolean> filter) {
        JavaRDD<MetricTimeSeries> mts = this.filter(filter);
        return new ChronixRDD(mts);
    }

    /**
     * Transformation: Filters MetricTimeSeries according their dimensional values.
     *
     * @param dimensionFilter dimensions (key) and their value
     * @return time series according the filter criteria
     * @see DimensionFilterFunction
     */
    public ChronixRDD filterByDimensions(Map<MetricDimension, String> dimensionFilter) {
        return this.filterTimeSeries(new DimensionFilterFunction(dimensionFilter));
    }

    /**
     * Transformation: Filters MetricTimeSeries according their dimensional values.
     *
     * @param dimensionFilter dimensions (key) and their value
     * @param ignoreNull      also include time series if dimensional value is null
     * @return time series according the filter criteria
     */
    public ChronixRDD filterByDimensions(Map<MetricDimension, String> dimensionFilter, boolean ignoreNull) {
        return this.filterTimeSeries(new DimensionFilterFunction(dimensionFilter, ignoreNull));
    }

    /**
     * Transformation: Joins the time series according their identity.
     *
     * @return joined time series
     */
    public ChronixRDD joinChunks() {
        JavaPairRDD<MetricTimeSeriesKey, Iterable<MetricTimeSeries>> groupRdd
                = this.groupBy(MetricTimeSeriesKey::new);

        JavaPairRDD<MetricTimeSeriesKey, MetricTimeSeries> joinedRdd
                = groupRdd.mapValues((Function<Iterable<MetricTimeSeries>, MetricTimeSeries>) mtsIt -> {
            MetricTimeSeriesOrdering ordering = new MetricTimeSeriesOrdering();
            List<MetricTimeSeries> orderedChunks = ordering.immutableSortedCopy(mtsIt);
            MetricTimeSeries result = null;
            for (MetricTimeSeries mts : orderedChunks) {
                if (result == null) {
                    result = new MetricTimeSeries
                            .Builder(mts.getMetric())
                            .attributes(mts.attributes()).build();
                }
                result.addAll(mts.getTimestampsAsArray(), mts.getValuesAsArray());
            }
            return result;
        });

        JavaRDD<MetricTimeSeries> resultJavaRdd =
                joinedRdd.map((Tuple2<MetricTimeSeriesKey, MetricTimeSeries> mtTuple) -> mtTuple._2);

        return new ChronixRDD(resultJavaRdd);
    }

    /**
     * Transformation: Get all values as JavaDoubleRDD.
     *
     * @return a RDD with all observation values
     */
    public JavaDoubleRDD getValuesAsRdd() {
        return this.flatMapToDouble(mts -> Arrays.asList(ArrayUtils.toObject(mts.getValuesAsArray())).iterator());
    }

    /**
     * Action: Counts the number of observations.
     *
     * @return the number of overall observations in all time series
     */
    public long countObservations() {
        JavaDoubleRDD sizesRdd = this.mapToDouble(
                (DoubleFunction<MetricTimeSeries>) value -> (double) value.size());
        return sizesRdd.sum().longValue();
    }

    /**
     * Transformation: Transforms the ChronixRDD into a RDD of MetricObservations (pair of timestamp & value + dimensions).
     *
     * @return RDD of MetricObservations
     */
    public JavaRDD<MetricObservation> toObservations() {
        return this.flatMap((FlatMapFunction<MetricTimeSeries, MetricObservation>) ts -> ts.points().map(point -> {
            //null-safe read of dimensional values
            String host = ts.attributes().get(MetricDimension.HOST) == null ? null
                    : ts.attributes().get(MetricDimension.HOST).toString();
            String series = ts.attributes().get(MetricDimension.MEASUREMENT_SERIES) == null ? null
                    : ts.attributes().get(MetricDimension.MEASUREMENT_SERIES).toString();
            String process = ts.attributes().get(MetricDimension.PROCESS) == null ? null
                    : ts.attributes().get(MetricDimension.PROCESS).toString();
            String group = ts.attributes().get(MetricDimension.METRIC_GROUP) == null ? null
                    : ts.attributes().get(MetricDimension.METRIC_GROUP).toString();
            String ag = ts.attributes().get(MetricDimension.AGGREGATION_LEVEL) == null ? null
                    : ts.attributes().get(MetricDimension.AGGREGATION_LEVEL).toString();
            //convert Point/MetricTimeSeries to MetricObservation
            return new MetricObservation(
                    ts.getMetric(),
                    host, series, process, group, ag,
                    point.getTimestamp(),
                    point.getValue()
            );
        }).iterator());
    }

    /**
     * Transformation: Derives a Spark SQL DataFrame from a ChronixRDD.
     * <p>
     * The DataFrame contains the following columns:
     * <ul>
     * <li>for each dimension (@see: de.qaware.chronix.storage.solr.timeseries.metric.MetricDimension) one column</li>
     * <li>one column for the observations' timestamp</li>
     * <li>one column for the measurement value at the observation timestamp</li>
     * </ul>
     *
     * @param sqlContext an open SQLContext
     * @return a DataFrame containing the ChronixRDD data
     */
    public Dataset<Row> toDataFrame(SQLContext sqlContext) {
        return sqlContext.createDataFrame(
                this.toObservations(),
                MetricObservation.class
        );
    }

    /**
     * Transformation: Derives a Spark Dataset of observations from a ChronixRDD.
     * <p>
     * WARNING: The Dataset-API is currently not stable. So this is an experimental feature on Chronix Spark.
     * We've flagged it deprecated to express its experimental nature (which is semantically symmetric to legacy methods).
     * More on the Datasets API here:
     * <a href="https://databricks.com/blog/2016/01/04/introducing-spark-datasets.html">databricks Blog: Introducing Spark Datasets (4/1/2016)</a>.
     *
     * @param sqlContext an open SQLContext
     * @return a Dataset of MetricObservations
     */
    public Dataset<MetricObservation> toObservationsDataset(SQLContext sqlContext) {
        return sqlContext.createDataset(this.toObservations().rdd(), Encoders.bean(MetricObservation.class));
    }
}