/*
 * Copyright (c) 2014, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */

package com.cloudera.oryx.lambda.batch;

import java.io.IOException;

import com.typesafe.config.Config;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction2;
import org.apache.spark.streaming.Time;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.cloudera.oryx.api.TopicProducer;
import com.cloudera.oryx.api.batch.BatchLayerUpdate;
import com.cloudera.oryx.common.settings.ConfigUtils;
import com.cloudera.oryx.lambda.TopicProducerImpl;

/**
 * Function that executes the batch layer update on each new batch of data arriving from
 * the streaming framework, combining it with past data read from persistent storage.
 *
 * @param <K> type of key read from input topic
 * @param <M> type of message read from input topic
 * @param <U> type of model message written to the update topic
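 *
 * <p>A minimal usage sketch (hypothetical variable names; in Oryx this wiring is done by
 * the batch layer itself rather than by user code):</p>
 *
 * <pre>{@code
 *   JavaPairDStream<K,M> dStream = ...;
 *   dStream.foreachRDD(new BatchUpdateFunction<>(config, keyClass, messageClass,
 *       keyWritableClass, messageWritableClass, dataDir, modelDir, update, streamingContext));
 * }</pre>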
 */
final class BatchUpdateFunction<K,M,U> implements VoidFunction2<JavaPairRDD<K,M>,Time> {

  private static final Logger log = LoggerFactory.getLogger(BatchUpdateFunction.class);

  private final Class<K> keyClass;
  private final Class<M> messageClass;
  private final Class<? extends Writable> keyWritableClass;
  private final Class<? extends Writable> messageWritableClass;
  private final String dataDirString;
  private final String modelDirString;
  private final BatchLayerUpdate<K,M,U> updateInstance;
  private final String updateBroker;
  private final String updateTopic;
  private final JavaSparkContext sparkContext;

  BatchUpdateFunction(Config config,
                      Class<K> keyClass,
                      Class<M> messageClass,
                      Class<? extends Writable> keyWritableClass,
                      Class<? extends Writable> messageWritableClass,
                      String dataDirString,
                      String modelDirString,
                      BatchLayerUpdate<K,M,U> updateInstance,
                      JavaStreamingContext streamingContext) {
    this.keyClass = keyClass;
    this.messageClass = messageClass;
    this.keyWritableClass = keyWritableClass;
    this.messageWritableClass = messageWritableClass;
    this.dataDirString = dataDirString;
    this.modelDirString = modelDirString;
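    // The update broker and topic are optional; if either is unset, call() runs the
    // update without publishing to an update topic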
    this.updateBroker = ConfigUtils.getOptionalString(config, "oryx.update-topic.broker");
    this.updateTopic = ConfigUtils.getOptionalString(config, "oryx.update-topic.message.topic");
    this.updateInstance = updateInstance;
    this.sparkContext = streamingContext.sparkContext();
  }

  @Override
  public void call(JavaPairRDD<K,M> newData, Time timestamp)
      throws IOException, InterruptedException {

    if (newData.isEmpty()) {
      log.info("No data in current generation's RDD; nothing to do");
      return;
    }

    log.info("Beginning update at {}", timestamp);

    Configuration hadoopConf = sparkContext.hadoopConfiguration();
    if (hadoopConf.getResource("core-site.xml") == null) {
      log.warn("Hadoop config like core-site.xml was not found; " +
               "is the Hadoop config directory on the classpath?");
    }

    JavaPairRDD<K,M> pastData;
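    // Past input, if any, was saved as Hadoop "part-*" files under subdirectories
    // of the data directory; the glob matches all of them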
    Path inputPathPattern = new Path(dataDirString + "/*/part-*");
    FileSystem fs = FileSystem.get(inputPathPattern.toUri(), hadoopConf);
    FileStatus[] inputPathStatuses = fs.globStatus(inputPathPattern);
    if (inputPathStatuses == null || inputPathStatuses.length == 0) {

      log.info("No past data at path(s) {}", inputPathPattern);
      pastData = null;

    } else {

      log.info("Found past data at path(s) like {}", inputPathStatuses[0].getPath());
      Configuration updatedConf = new Configuration(hadoopConf);
      updatedConf.set(FileInputFormat.INPUT_DIR, joinFSPaths(fs, inputPathStatuses));

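      // Read past data back as SequenceFiles of Writable key/message pairs; the unchecked
      // cast is needed because the wildcard Writable classes can't be expressed in
      // newAPIHadoopRDD's type parameters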
      @SuppressWarnings("unchecked")
      JavaPairRDD<Writable,Writable> pastWritableData = (JavaPairRDD<Writable,Writable>)
          sparkContext.newAPIHadoopRDD(updatedConf,
                                       SequenceFileInputFormat.class,
                                       keyWritableClass,
                                       messageWritableClass);

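      // Translate the Writable keys and messages back into the declared key/message types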
      pastData = pastWritableData.mapToPair(
          new WritableToValueFunction<>(keyClass,
                                        messageClass,
                                        keyWritableClass,
                                        messageWritableClass));
    }

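    // Run the update, publishing model updates to the update topic when one is configured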
    if (updateTopic == null || updateBroker == null) {
      log.info("Not producing updates to update topic since none was configured");
      updateInstance.runUpdate(sparkContext,
                               timestamp.milliseconds(),
                               newData,
                               pastData,
                               modelDirString,
                               null);
    } else {
      // This TopicProducer should not be async: it generally sends one large model update,
      // which must be delivered reliably before subsequent updates rather than buffered
      try (TopicProducer<String,U> producer =
               new TopicProducerImpl<>(updateBroker, updateTopic, false)) {
        updateInstance.runUpdate(sparkContext,
                                 timestamp.milliseconds(),
                                 newData,
                                 pastData,
                                 modelDirString,
                                 producer);
      }
    }
  }

  /**
   * @param fs file system used to fully qualify each path
   * @param statuses {@link FileStatus}es whose paths are to be joined
   * @return the qualified paths, escaped and joined into one comma-separated String
   * @see FileInputFormat#addInputPath(org.apache.hadoop.mapreduce.Job, Path)
   */
  private static String joinFSPaths(FileSystem fs, FileStatus[] statuses) {
    StringBuilder joined = new StringBuilder();
    for (FileStatus status : statuses) {
      if (joined.length() > 0) {
        joined.append(',');
      }
      Path path = fs.makeQualified(status.getPath());
      joined.append(StringUtils.escapeString(path.toString()));
    }
    return joined.toString();
  }

}