/**
 * Copyright 2017 The OpenZipkin Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */
package zipkin.sparkstreaming;

import com.google.auto.value.AutoValue;
import com.google.auto.value.extension.memoized.Memoized;
import java.io.Closeable;
import java.io.IOException;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import scala.Tuple2;
import zipkin.Span;
import zipkin.internal.Util;
import zipkin.storage.StorageComponent;

public abstract class SparkStreamingJob implements Closeable {

  /**
   * Entry point for configuring a job: prepares a map of default Spark properties and creates the
   * generated {@code AutoValue_SparkStreamingJob.Builder}.
   *
   * @return a new {@link Builder}
   */
  public static Builder newBuilder() {
    // Insertion-ordered map of default Spark properties; the only default visible here turns the
    // Spark UI off.
    Map<String, String> conf = new LinkedHashMap<>();
    conf.put("spark.ui.enabled", "false");
    // NOTE(review): this return statement is cut off (no chained calls, semicolon or closing brace
    // follow), so how "conf" and other defaults reach the builder is not visible -- confirm.
    return new AutoValue_SparkStreamingJob.Builder()

  /** Collects the settings of a {@link SparkStreamingJob}; {@link #build()} returns the job. */
  public interface Builder {
    /**
     * The spark master used for this job. Defaults to "local[*]".
     *
     * <p>local[*] master lets us run and test the job locally without setting up a Spark cluster.
     */
    Builder master(String master);

    /** When set, this indicates which jars to distribute to the cluster. */
    Builder jars(List<String> jars);

    /** Overrides the properties used to create a {@link SparkConf}. */
    Builder conf(Map<String, String> conf);

    /**
     * The time interval, in milliseconds, at which streaming data will be divided into batches.
     * Defaults to 10s.
     */
    Builder batchDuration(long batchDurationMillis);

    /** Produces a stream of serialized span messages (thrift or json lists). */
    Builder streamFactory(StreamFactory streamFactory);

    /** Conditionally adjusts spans grouped by trace ID. For example, pruning data. */
    Builder adjusters(List<Adjuster> adjusters);

    /** Accepts spans grouped by trace ID. For example, writing to a {@link StorageComponent}. */
    Builder consumer(Consumer consumer);

    /** Log4J level used for the "zipkin" category. Important when running in a cluster. */
    Builder zipkinLogLevel(String zipkinLogLevel);

    /** Returns the {@link SparkStreamingJob} described by this builder. */
    SparkStreamingJob build();

  // Accessors below share their names with the Builder setters. Implementations are expected to
  // come from the generated AutoValue_SparkStreamingJob class -- NOTE(review): no @AutoValue
  // annotation is visible in this part of the file; confirm.

  /** Corresponds to {@link Builder#master(String)}. */
  abstract String master();

  /** Jar paths; when non-empty, {@link #jsc()} passes them to {@code SparkConf.setJars}. */
  abstract List<String> jars();

  /** Spark properties; {@link #jsc()} copies every entry onto the {@link SparkConf}. */
  abstract Map<String, String> conf();

  /** Batch interval; {@link #jsc()} wraps it in a {@link Duration} for the streaming context. */
  abstract long batchDuration();

  /** Corresponds to {@link Builder#streamFactory(StreamFactory)}. */
  abstract StreamFactory streamFactory();

  /** Adjusters that {@link #start()} hands to the adjust-and-consume step. */
  abstract List<Adjuster> adjusters();

  /**
   * Consumer that {@link #start()} hands to the adjust-and-consume step; {@link #close()} closes
   * it when it implements {@link Closeable}.
   */
  abstract Consumer consumer();

  /** Log level that {@link #start()} passes to {@code LogInitializer.create}. */
  abstract String zipkinLogLevel();

  /**
   * Flipped to true by the first {@link #start()} call; later {@code start()} calls return early
   * and {@link #awaitTermination()} only blocks once this is true.
   */
  final AtomicBoolean started = new AtomicBoolean(false);

  /**
   * Creates a {@link JavaStreamingContext} from a new {@link SparkConf} populated with
   * {@link #jars()} and {@link #conf()}, batching at {@link #batchDuration()}.
   *
   * <p>NOTE(review): nothing visible here caches the result, so each call would construct a new
   * context. {@code Memoized} is imported by this file, so an annotation may be missing -- confirm.
   */
  JavaStreamingContext jsc() {
    // "true" asks SparkConf to also load spark.* values from Java system properties.
    // NOTE(review): the statement has no terminating semicolon, so chained calls (e.g. applying
    // master()) appear to be missing -- confirm against the complete file.
    SparkConf conf = new SparkConf(true)
    // Leave SparkConf's jar list untouched unless jars were configured.
    if (!jars().isEmpty()) conf.setJars(jars().toArray(new String[0]));
    // Apply the configured properties; set after construction, so they win over loaded defaults.
    for (Map.Entry<String, String> entry : conf().entrySet()) {
      conf.set(entry.getKey(), entry.getValue());
    return new JavaStreamingContext(conf, new Duration(batchDuration()));

  /**
   * Starts the streaming job. Use {@link #close()} to stop it.
   *
   * <p>Only the first call does any work; later calls return immediately.
   *
   * @return this job, so calls can be chained
   */
  public SparkStreamingJob start() {
    // Atomically flip "started"; when it was already true a previous call did the setup.
    if (!started.compareAndSet(false, true)) return this;

    Runnable logInitializer = LogInitializer.create(zipkinLogLevel());
    logInitializer.run(); // Ensures local log commands emit
    // NOTE(review): the next two lines read as trailing arguments of a call whose opening and
    // closing are not present (their types match streamSpansToStorage's last two parameters) --
    // confirm against the complete file.
        new AutoValue_ReadSpans(logInitializer),
        new AutoValue_AdjustAndConsumeSpansSharingTraceId(logInitializer, adjusters(), consumer())

    return this;

  /**
   * Use this to block on {@link #close()}.
   *
   * <p>Returns immediately when {@link #start()} has not been called.
   */
  public void awaitTermination() {
    // NOTE(review): this waits on whatever jsc() returns; jsc() shows no caching in this file, so
    // confirm it yields the same context that start() used.
    if (started.get()) jsc().awaitTermination();

  // NOTE: this is intentionally static to remind us that all state passed in must be serializable
  // Otherwise, tasks cannot be distributed across the cluster.
  /**
   * Wires the streaming pipeline: decodes messages into spans, keys each span by trace ID and
   * processes the resulting per-trace stream.
   *
   * @param stream serialized span messages
   * @param readSpans flat-map function turning each message into zero or more {@link Span}s
   * @param adjustAndConsumeSpansSharingTraceId step for spans sharing a trace ID; NOTE(review): not
   *     referenced in the visible part of this method -- presumably used inside foreachRDD
   */
  static void streamSpansToStorage(
      JavaDStream<byte[]> stream,
      ReadSpans readSpans,
      AdjustAndConsumeSpansSharingTraceId adjustAndConsumeSpansSharingTraceId
  ) {
    JavaDStream<Span> spans = stream.flatMap(readSpans);

    // TODO: plug in some filter to drop spans regardless of trace ID
    // spans = spans.filter(spanFilter);

    // Key every span by the lower-hex form of its (traceIdHigh, traceId) pair.
    // NOTE(review): the chain stops without a semicolon, and mapToPair alone yields one Span per
    // key rather than the declared Iterable<Span>, so a grouping step appears to be missing.
    JavaPairDStream<String, Iterable<Span>> tracesById = spans
        .mapToPair(s -> new Tuple2<>(Util.toLowerHex(s.traceIdHigh, s.traceId), s))

    // NOTE(review): the body of this lambda is not present here -- confirm.
    tracesById.foreachRDD(rdd -> {

  /**
   * Closes the {@link #consumer()} when it implements {@link Closeable}; other consumers are left
   * alone.
   *
   * <p>NOTE(review): nothing visible here stops the streaming context -- confirm.
   *
   * @throws IOException if closing the consumer fails
   */
  @Override public void close() throws IOException {
    // not sure how to get spark to close things
    if (consumer() instanceof Closeable) {
      ((Closeable) consumer()).close();

  SparkStreamingJob() {