/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.gobblin.compaction.mapreduce;

import java.io.IOException;
import java.net.URI;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;

import org.apache.commons.io.FilenameUtils;
import org.apache.commons.math3.primes.Primes;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.github.rholder.retry.Retryer;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Throwables;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import com.google.common.primitives.Ints;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
import com.typesafe.config.ConfigRenderOptions;

import org.apache.gobblin.compaction.dataset.Dataset;
import org.apache.gobblin.compaction.dataset.DatasetHelper;
import org.apache.gobblin.compaction.event.CompactionSlaEventHelper;
import org.apache.gobblin.config.ConfigBuilder;
import org.apache.gobblin.configuration.ConfigurationKeys;
import org.apache.gobblin.metrics.GobblinMetrics;
import org.apache.gobblin.metrics.event.EventSubmitter;
import org.apache.gobblin.util.ExecutorsUtils;
import org.apache.gobblin.util.FileListUtils;
import org.apache.gobblin.util.HadoopUtils;
import org.apache.gobblin.util.RecordCountProvider;
import org.apache.gobblin.util.WriterUtils;
import org.apache.gobblin.util.executors.ScalingThreadPoolExecutor;
import org.apache.gobblin.util.recordcount.LateFileRecordCountProvider;
import org.apache.gobblin.util.retry.RetryerFactory;

import static org.apache.gobblin.compaction.mapreduce.MRCompactor.COMPACTION_PREFIX;
import static org.apache.gobblin.util.retry.RetryerFactory.*;

/**
 * This class is responsible for configuring and running a single MR job.
 * It should be extended by a subclass that properly configures the mapper and reducer related classes.
 *
 * The properties that control the number of reducers are compaction.job.target.output.file.size and
 * compaction.job.max.num.reducers. The number of reducers is the smaller of
 * [total input size] / [compaction.job.target.output.file.size] + 1 and [compaction.job.max.num.reducers].
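 * For example, with 5 GB of input, the default 512 MB target file size and the default cap of 900 reducers,
 * the job uses min(5120 MB / 512 MB + 1, 900) = 11 reducers; since 11 is already prime, the prime-reducer
 * adjustment leaves it unchanged.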
 *
 * If {@value MRCompactor#COMPACTION_JOB_LATE_DATA_MOVEMENT_TASK} is set to true, no MR job is launched.
 * Instead, the files listed in {@value MRCompactor#COMPACTION_JOB_LATE_DATA_FILES} are simply copied to a
 * 'late' subdirectory within the output directory.
 *
 * @author Ziyang Liu
 * @deprecated Please use {@link org.apache.gobblin.compaction.mapreduce.MRCompactionTask}
 *  and {@link org.apache.gobblin.compaction.source.CompactionSource} to launch MR jobs instead.
 *  The new flow uses simpler logic to trigger compaction and more reliable verification criteria
 *  than the timestamp-only check used here.
 */
@SuppressWarnings("deprecation")
public abstract class MRCompactorJobRunner implements Runnable, Comparable<MRCompactorJobRunner> {

  private static final Logger LOG = LoggerFactory.getLogger(MRCompactorJobRunner.class);

  private static final String COMPACTION_JOB_PREFIX = "compaction.job.";

  /**
   * Properties related to the compaction job of a dataset.
   */
  public static final String COMPACTION_JOB_OUTPUT_DIR_PERMISSION = COMPACTION_JOB_PREFIX + "output.dir.permission";
  public static final String COMPACTION_JOB_TARGET_OUTPUT_FILE_SIZE =
      COMPACTION_JOB_PREFIX + "target.output.file.size";
  public static final long DEFAULT_COMPACTION_JOB_TARGET_OUTPUT_FILE_SIZE = 536870912; // 512 MB
  public static final String COMPACTION_JOB_MAX_NUM_REDUCERS = COMPACTION_JOB_PREFIX + "max.num.reducers";
  public static final int DEFAULT_COMPACTION_JOB_MAX_NUM_REDUCERS = 900;
  private static final String COMPACTION_JOB_OVERWRITE_OUTPUT_DIR = COMPACTION_JOB_PREFIX + "overwrite.output.dir";
  private static final boolean DEFAULT_COMPACTION_JOB_OVERWRITE_OUTPUT_DIR = false;
  private static final String COMPACTION_JOB_ABORT_UPON_NEW_DATA = COMPACTION_JOB_PREFIX + "abort.upon.new.data";
  private static final boolean DEFAULT_COMPACTION_JOB_ABORT_UPON_NEW_DATA = false;
  private static final String COMPACTION_COPY_LATE_DATA_THREAD_POOL_SIZE =
      COMPACTION_JOB_PREFIX + "copy.latedata.thread.pool.size";
  private static final int DEFAULT_COMPACTION_COPY_LATE_DATA_THREAD_POOL_SIZE = 5;

  // If true, the MR job will use either 1 reducer or a prime number of reducers.
  public static final String COMPACTION_JOB_USE_PRIME_REDUCERS = COMPACTION_JOB_PREFIX + "use.prime.reducers";
  public static final boolean DEFAULT_COMPACTION_JOB_USE_PRIME_REDUCERS = true;

  public static final String HADOOP_JOB_NAME = "Gobblin MR Compaction";
  private static final long MR_JOB_CHECK_COMPLETE_INTERVAL_MS = 5000;
  private final boolean isRetryEnabled;
  private final String tmpFsUri;

  public enum Policy {

    // The job runner is permitted to publish the data.
    DO_PUBLISH_DATA,

    // The job runner can proceed with the compaction for now but should not publish the data.
    DO_NOT_PUBLISH_DATA,

    // The job runner should abort asap without publishing data.
    ABORT_ASAP
  }

  public enum Status {
    ABORTED,
    COMMITTED,
    RUNNING
  }

  protected final Dataset dataset;
  protected final FileSystem fs;
  protected final FileSystem tmpFs;
  protected final FsPermission perm;
  protected final boolean shouldDeduplicate;
  protected final boolean outputDeduplicated;
  protected final boolean recompactFromDestPaths;
  protected final boolean recompactAllData;
  protected final boolean renameSourceDir;
  protected final boolean usePrimeReducers;
  protected final EventSubmitter eventSubmitter;
  private final RecordCountProvider inputRecordCountProvider;
  private final RecordCountProvider outputRecordCountProvider;
  private final LateFileRecordCountProvider lateInputRecordCountProvider;
  private final LateFileRecordCountProvider lateOutputRecordCountProvider;
  private final DatasetHelper datasetHelper;
  private final int copyLateDataThreadPoolSize;
  private final String outputExtension;

  private volatile Policy policy = Policy.DO_NOT_PUBLISH_DATA;
  private volatile Status status = Status.RUNNING;
  private final Cache<Path, List<Path>> applicablePathCache;

  static final String COMPACTION_RETRY_PREFIX = COMPACTION_JOB_PREFIX + "retry.";
  static final String COMPACTION_RETRY_ENABLED = COMPACTION_RETRY_PREFIX + "enabled";
  static final String COMPACTION_TMP_FS = COMPACTION_PREFIX + "tmp.fs";

  static final Config COMPACTION_RETRY_DEFAULTS;

  static {
    Map<String, Object> configMap =
        ImmutableMap.<String, Object>builder()
            .put(RETRY_TIME_OUT_MS, TimeUnit.MINUTES.toMillis(2L))   // Overall retry timeout of 2 minutes
            .put(RETRY_INTERVAL_MS, TimeUnit.SECONDS.toMillis(5L))   // Initial retry interval of 5 seconds
            .put(RETRY_MULTIPLIER, 2L)                                // Multiply the interval by 2 on every attempt
            .put(RETRY_TYPE, RetryType.EXPONENTIAL.name())
            .build();
    COMPACTION_RETRY_DEFAULTS = ConfigFactory.parseMap(configMap);
  }

  protected final Config retrierConfig;

  protected MRCompactorJobRunner(Dataset dataset, FileSystem fs) {
    this.dataset = dataset;
    this.fs = fs;
    this.perm = HadoopUtils.deserializeFsPermission(this.dataset.jobProps(), COMPACTION_JOB_OUTPUT_DIR_PERMISSION,
        FsPermission.getDefault());
    this.recompactFromDestPaths = this.dataset.jobProps().getPropAsBoolean(
        MRCompactor.COMPACTION_RECOMPACT_FROM_DEST_PATHS, MRCompactor.DEFAULT_COMPACTION_RECOMPACT_FROM_DEST_PATHS);
    this.recompactAllData = this.dataset.jobProps().getPropAsBoolean(
        MRCompactor.COMPACTION_RECOMPACT_ALL_DATA, MRCompactor.DEFAULT_COMPACTION_RECOMPACT_ALL_DATA);
    this.renameSourceDir = this.dataset.jobProps().getPropAsBoolean(
        MRCompactor.COMPACTION_RENAME_SOURCE_DIR_ENABLED, MRCompactor.DEFAULT_COMPACTION_RENAME_SOURCE_DIR_ENABLED);

    Preconditions.checkArgument(this.dataset.jobProps().contains(MRCompactor.COMPACTION_SHOULD_DEDUPLICATE),
        String.format("Missing property %s for dataset %s", MRCompactor.COMPACTION_SHOULD_DEDUPLICATE, this.dataset));
    this.shouldDeduplicate = this.dataset.jobProps().getPropAsBoolean(MRCompactor.COMPACTION_SHOULD_DEDUPLICATE);

    this.outputDeduplicated = this.dataset.jobProps().getPropAsBoolean(MRCompactor.COMPACTION_OUTPUT_DEDUPLICATED,
        MRCompactor.DEFAULT_COMPACTION_OUTPUT_DEDUPLICATED);

    this.usePrimeReducers = this.dataset.jobProps().getPropAsBoolean(COMPACTION_JOB_USE_PRIME_REDUCERS,
        DEFAULT_COMPACTION_JOB_USE_PRIME_REDUCERS);

    this.eventSubmitter = new EventSubmitter.Builder(
        GobblinMetrics.get(this.dataset.jobProps().getProp(ConfigurationKeys.JOB_NAME_KEY)).getMetricContext(),
        MRCompactor.COMPACTION_TRACKING_EVENTS_NAMESPACE).build();

    this.copyLateDataThreadPoolSize = this.dataset.jobProps().getPropAsInt(COMPACTION_COPY_LATE_DATA_THREAD_POOL_SIZE,
        DEFAULT_COMPACTION_COPY_LATE_DATA_THREAD_POOL_SIZE);

    this.tmpFsUri = this.dataset.jobProps().getProp(COMPACTION_TMP_FS, null);

    try {
      LOG.info("Tmp fs uri: " + this.tmpFsUri);
      if (this.tmpFsUri != null) {
        this.tmpFs = FileSystem.get(new URI(this.tmpFsUri), new Configuration());
      } else {
        this.tmpFs = this.fs;
      }
    } catch (Exception e) {
      throw new RuntimeException("Failed to get FileSystem from tmp fs uri", e);
    }

    try {
      this.inputRecordCountProvider = (RecordCountProvider) Class
          .forName(this.dataset.jobProps().getProp(MRCompactor.COMPACTION_INPUT_RECORD_COUNT_PROVIDER,
              MRCompactor.DEFAULT_COMPACTION_INPUT_RECORD_COUNT_PROVIDER))
          .newInstance();
      this.outputRecordCountProvider = (RecordCountProvider) Class
          .forName(this.dataset.jobProps().getProp(MRCompactor.COMPACTION_OUTPUT_RECORD_COUNT_PROVIDER,
              MRCompactor.DEFAULT_COMPACTION_OUTPUT_RECORD_COUNT_PROVIDER))
          .newInstance();
      this.lateInputRecordCountProvider = new LateFileRecordCountProvider(this.inputRecordCountProvider);
      this.lateOutputRecordCountProvider = new LateFileRecordCountProvider(this.outputRecordCountProvider);
      this.isRetryEnabled = this.dataset.jobProps().getPropAsBoolean(COMPACTION_RETRY_ENABLED, false);
    } catch (Exception e) {
      throw new RuntimeException("Failed to instantiate RecordCountProvider", e);
    }

    this.applicablePathCache = CacheBuilder.newBuilder().maximumSize(2000).build();
    this.datasetHelper = new DatasetHelper(this.dataset, this.fs, this.getApplicableFileExtensions());

    this.outputExtension = this.dataset.jobProps().getProp(MRCompactor.COMPACTION_FILE_EXTENSION, ".avro");

    if (this.isRetryEnabled) {
      this.retrierConfig = ConfigBuilder.create()
          .loadProps(this.dataset.jobProps().getProperties(), COMPACTION_RETRY_PREFIX)
          .build()
          .withFallback(COMPACTION_RETRY_DEFAULTS);

      LOG.info("Retry enabled for compaction publish :"+ retrierConfig.root().render(ConfigRenderOptions.concise()));
    } else {
      this.retrierConfig = WriterUtils.NO_RETRY_CONFIG;
      LOG.info("Retry disabled for compaction");
    }

  }

  @Override
  public void run() {
    Configuration conf = HadoopUtils.getConfFromState(this.dataset.jobProps());

    // Turn on mapreduce output compression by default
    if (conf.get("mapreduce.output.fileoutputformat.compress") == null && conf.get("mapred.output.compress") == null) {
      conf.setBoolean("mapreduce.output.fileoutputformat.compress", true);
    }

    // Disable delegation token cancellation by default
    if (conf.get("mapreduce.job.complete.cancel.delegation.tokens") == null) {
      conf.setBoolean("mapreduce.job.complete.cancel.delegation.tokens", false);
    }

    try {
      DateTime compactionTimestamp = getCompactionTimestamp();
      LOG.info("MR Compaction Job Timestamp " + compactionTimestamp.getMillis());
      if (this.dataset.jobProps().getPropAsBoolean(MRCompactor.COMPACTION_JOB_LATE_DATA_MOVEMENT_TASK, false)) {
        List<Path> newLateFilePaths = Lists.newArrayList();
        for (String filePathString : this.dataset.jobProps()
            .getPropAsList(MRCompactor.COMPACTION_JOB_LATE_DATA_FILES)) {
          if (FilenameUtils.isExtension(filePathString, getApplicableFileExtensions())) {
            newLateFilePaths.add(new Path(filePathString));
          }
        }

        Path lateDataOutputPath = this.outputDeduplicated ? this.dataset.outputLatePath() : this.dataset.outputPath();
        LOG.info(String.format("Copying %d late data files to %s", newLateFilePaths.size(), lateDataOutputPath));
        if (this.outputDeduplicated) {
          if (!this.fs.exists(lateDataOutputPath)) {
            if (!this.fs.mkdirs(lateDataOutputPath)) {
              throw new RuntimeException(
                  String.format("Failed to create late data output directory: %s.", lateDataOutputPath.toString()));
            }
          }
        }
        this.copyDataFiles(lateDataOutputPath, newLateFilePaths);
        if (this.outputDeduplicated) {
          dataset.checkIfNeedToRecompact(datasetHelper);
        }
        this.status = Status.COMMITTED;
      } else {
        if (this.fs.exists(this.dataset.outputPath()) && !canOverwriteOutputDir()) {
          LOG.warn(String.format("Output paths %s exists. Will not compact %s.", this.dataset.outputPath(),
              this.dataset.inputPaths()));
          this.status = Status.COMMITTED;
          return;
        }
        addJars(conf);
        Job job = Job.getInstance(conf);
        this.configureJob(job);
        this.submitAndWait(job);
        if (shouldPublishData(compactionTimestamp)) {
          // remove all invalid empty files due to speculative task execution
          List<Path> goodPaths = CompactionJobConfigurator.getGoodFiles(job, this.dataset.outputTmpPath(), this.tmpFs,
              ImmutableList.of("avro"));

          if (!this.recompactAllData && this.recompactFromDestPaths) {
            // append new files without deleting output directory
            addGoodFilesToOutputPath(goodPaths);
            // clean up late data from outputLateDirectory, which has been set to inputPath
            deleteFilesByPaths(this.dataset.inputPaths());
          } else {
            moveTmpPathToOutputPath();
            if (this.recompactFromDestPaths) {
              deleteFilesByPaths(this.dataset.additionalInputPaths());
            }
          }
          submitSlaEvent(job);
          LOG.info("Successfully published data for input folder " + this.dataset.inputPaths());
          this.status = Status.COMMITTED;
        } else {
          LOG.info("Data not published for input folder " + this.dataset.inputPaths() + " due to incompleteness");
          this.status = Status.ABORTED;
          return;
        }
      }
      if (renameSourceDir) {
        MRCompactor.renameSourceDirAsCompactionComplete(this.fs, this.dataset);
      } else {
        this.markOutputDirAsCompleted(compactionTimestamp);
      }
      this.submitRecordsCountsEvent();
    } catch (Throwable t) {
      throw Throwables.propagate(t);
    }
  }

  /**
   * For regular compactions, the compaction timestamp is the time the compaction job starts.
   *
   * If this is a recompaction from output paths, the compaction timestamp remains the same as the previously
   * persisted compaction time. Such a recompaction doesn't consume input data, so whether a file in the input
   * folder is considered a late file next time should still be based on the previous compaction timestamp.
   */
  private DateTime getCompactionTimestamp() throws IOException {
    DateTimeZone timeZone = DateTimeZone.forID(
        this.dataset.jobProps().getProp(MRCompactor.COMPACTION_TIMEZONE, MRCompactor.DEFAULT_COMPACTION_TIMEZONE));

    if (!this.recompactFromDestPaths) {
      return new DateTime(timeZone);
    }

    Set<Path> inputPaths = getInputPaths();
    long maxTimestamp = Long.MIN_VALUE;
    for (FileStatus status : FileListUtils.listFilesRecursively(this.fs, inputPaths)) {
      maxTimestamp = Math.max(maxTimestamp, status.getModificationTime());
    }
    return maxTimestamp == Long.MIN_VALUE ? new DateTime(timeZone) : new DateTime(maxTimestamp, timeZone);
  }

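  /**
   * Copy the given late data files to the output directory in parallel, using a scaling thread pool of up to
   * {@link #COMPACTION_COPY_LATE_DATA_THREAD_POOL_SIZE} threads. Each file name is converted to the output
   * naming scheme and given a unique 'late' file name to avoid clashing with existing output files.
   */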
  private void copyDataFiles(final Path outputDirectory, List<Path> inputFilePaths) throws IOException {
    ExecutorService executor = ScalingThreadPoolExecutor.newScalingThreadPool(0, this.copyLateDataThreadPoolSize, 100,
        ExecutorsUtils.newThreadFactory(Optional.of(LOG), Optional.of(this.dataset.getName() + "-copy-data")));

    List<Future<?>> futures = Lists.newArrayList();
    for (final Path filePath : inputFilePaths) {
      Future<Void> future = executor.submit(new Callable<Void>() {
        @Override
        public Void call() throws Exception {
          Path convertedFilePath = MRCompactorJobRunner.this.outputRecordCountProvider.convertPath(
              LateFileRecordCountProvider.restoreFilePath(filePath),
              MRCompactorJobRunner.this.outputExtension,
              MRCompactorJobRunner.this.inputRecordCountProvider);
          String targetFileName = convertedFilePath.getName();
          Path outPath = MRCompactorJobRunner.this.lateOutputRecordCountProvider.constructLateFilePath(targetFileName,
              MRCompactorJobRunner.this.fs, outputDirectory);
          HadoopUtils.copyPath(MRCompactorJobRunner.this.fs, filePath, MRCompactorJobRunner.this.fs, outPath, true,
              MRCompactorJobRunner.this.fs.getConf());
          LOG.debug(String.format("Copied %s to %s.", filePath, outPath));
          return null;
        }
      });
      futures.add(future);
    }
    try {
      for (Future<?> future : futures) {
        future.get();
      }
    } catch (ExecutionException | InterruptedException e) {
      throw new IOException("Failed to copy file.", e);
    } finally {
      ExecutorsUtils.shutdownExecutorService(executor, Optional.of(LOG));
    }
  }

  private boolean canOverwriteOutputDir() {
    return this.dataset.jobProps().getPropAsBoolean(COMPACTION_JOB_OVERWRITE_OUTPUT_DIR,
        DEFAULT_COMPACTION_JOB_OVERWRITE_OUTPUT_DIR) || this.recompactFromDestPaths;
  }

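  /**
   * Add all jar files under the directory specified by {@link MRCompactor#COMPACTION_JARS} to the
   * DistributedCache so they are available on the job's classpath. This is a no-op if the property is not set.
   */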
  private void addJars(Configuration conf) throws IOException {
    if (!this.dataset.jobProps().contains(MRCompactor.COMPACTION_JARS)) {
      return;
    }
    Path jarFileDir = new Path(this.dataset.jobProps().getProp(MRCompactor.COMPACTION_JARS));
    for (FileStatus status : this.fs.listStatus(jarFileDir)) {
      DistributedCache.addFileToClassPath(status.getPath(), conf, this.fs);
    }
  }

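  /**
   * Configure the job name, input/output paths, mapper and reducer. If deduplication is disabled for this
   * dataset, the job runs map-only with zero reducers.
   */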
  protected void configureJob(Job job) throws IOException {
    job.setJobName(HADOOP_JOB_NAME + " (" + this.dataset.getDatasetName() + ")");
    configureInputAndOutputPaths(job);
    configureMapper(job);
    configureReducer(job);
    if (!this.shouldDeduplicate) {
      job.setNumReduceTasks(0);
    }
  }

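  /**
   * Add all input paths of the dataset to the job and set the job output to the dataset's tmp output path.
   */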
  private void configureInputAndOutputPaths(Job job) throws IOException {
    for (Path inputPath : getInputPaths()) {
      FileInputFormat.addInputPath(job, inputPath);
    }

    // The MR output path must not exist when the MR job starts, so delete it if it exists.
    this.tmpFs.delete(this.dataset.outputTmpPath(), true);
    FileOutputFormat.setOutputPath(job, this.dataset.outputTmpPath());
  }

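  /**
   * Get the union of the dataset's input paths and additional input paths.
   */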
  private Set<Path> getInputPaths() {
    return ImmutableSet.<Path> builder().addAll(this.dataset.inputPaths()).addAll(this.dataset.additionalInputPaths())
        .build();
  }

  public Dataset getDataset() {
    return this.dataset;
  }

  protected void configureMapper(Job job) {
    setInputFormatClass(job);
    setMapperClass(job);
    setMapOutputKeyClass(job);
    setMapOutputValueClass(job);
  }

  protected void configureReducer(Job job) throws IOException {
    setOutputFormatClass(job);
    setReducerClass(job);
    setOutputKeyClass(job);
    setOutputValueClass(job);
    setNumberOfReducers(job);
  }

  protected abstract void setInputFormatClass(Job job);

  protected abstract void setMapperClass(Job job);

  protected abstract void setMapOutputKeyClass(Job job);

  protected abstract void setMapOutputValueClass(Job job);

  protected abstract void setOutputFormatClass(Job job);

  protected abstract void setReducerClass(Job job);

  protected abstract void setOutputKeyClass(Job job);

  protected abstract void setOutputValueClass(Job job);

  protected abstract Collection<String> getApplicableFileExtensions();

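  /**
   * Set the number of reducers to min([total input size] / [target file size] + 1, [max reducers]).
   * When {@link #COMPACTION_JOB_USE_PRIME_REDUCERS} is enabled and the result is not 1, the count is rounded up
   * to the smallest prime greater than or equal to that value.
   */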
  protected void setNumberOfReducers(Job job) throws IOException {
    long inputSize = getInputSize();
    long targetFileSize = getTargetFileSize();
    int numReducers = Math.min(Ints.checkedCast(inputSize / targetFileSize) + 1, getMaxNumReducers());
    if (this.usePrimeReducers && numReducers != 1) {
      numReducers = Primes.nextPrime(numReducers);
    }
    job.setNumReduceTasks(numReducers);
  }

  private long getInputSize() throws IOException {
    long inputSize = 0;
    for (Path inputPath : this.getInputPaths()) {
      inputSize += this.fs.getContentSummary(inputPath).getLength();
    }
    return inputSize;
  }

  private long getTargetFileSize() {
    return this.dataset.jobProps().getPropAsLong(COMPACTION_JOB_TARGET_OUTPUT_FILE_SIZE,
        DEFAULT_COMPACTION_JOB_TARGET_OUTPUT_FILE_SIZE);
  }

  private int getMaxNumReducers() {
    return this.dataset.jobProps().getPropAsInt(COMPACTION_JOB_MAX_NUM_REDUCERS,
        DEFAULT_COMPACTION_JOB_MAX_NUM_REDUCERS);
  }

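  /**
   * Submit the MR job and block until it completes, polling every {@link #MR_JOB_CHECK_COMPLETE_INTERVAL_MS} ms.
   * If the policy becomes {@link Policy#ABORT_ASAP} while the job is running, the job is killed and this method
   * returns without throwing. Throws a {@link RuntimeException} if the job finishes unsuccessfully.
   */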
  private void submitAndWait(Job job) throws ClassNotFoundException, IOException, InterruptedException {
    job.submit();
    MRCompactor.addRunningHadoopJob(this.dataset, job);
    LOG.info(String.format("MR job submitted for dataset %s, input %s, url: %s", this.dataset, getInputPaths(),
        job.getTrackingURL()));
    while (!job.isComplete()) {
      if (this.policy == Policy.ABORT_ASAP) {
        LOG.info(String.format(
            "MR job for dataset %s, input %s killed due to input data incompleteness. Will try again later.",
            this.dataset, getInputPaths()));
        job.killJob();
        return;
      }
      Thread.sleep(MR_JOB_CHECK_COMPLETE_INTERVAL_MS);
    }
    if (!job.isSuccessful()) {
      throw new RuntimeException(String.format("MR job failed for topic %s, input %s, url: %s", this.dataset,
          getInputPaths(), job.getTrackingURL()));
    }
  }

  /**
   * Data should be published if: (1) this.policy == {@link Policy#DO_PUBLISH_DATA}; and (2) either
   * compaction.job.abort.upon.new.data=false, or no new data has appeared in the input folder since jobStartTime.
   */
  private boolean shouldPublishData(DateTime jobStartTime) throws IOException {
    if (this.policy != Policy.DO_PUBLISH_DATA) {
      return false;
    }
    if (!this.dataset.jobProps().getPropAsBoolean(COMPACTION_JOB_ABORT_UPON_NEW_DATA,
        DEFAULT_COMPACTION_JOB_ABORT_UPON_NEW_DATA)) {
      return true;
    }
    for (Path inputPath : getInputPaths()) {
      if (findNewDataSinceCompactionStarted(inputPath, jobStartTime)) {
        return false;
      }
    }
    return true;
  }

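  /**
   * Return true if any file under inputPath has a modification time later than jobStartTime.
   */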
  private boolean findNewDataSinceCompactionStarted(Path inputPath, DateTime jobStartTime) throws IOException {
    for (FileStatus fstat : FileListUtils.listFilesRecursively(this.fs, inputPath)) {
      DateTime fileModificationTime = new DateTime(fstat.getModificationTime());
      if (fileModificationTime.isAfter(jobStartTime)) {
        LOG.info(String.format("Found new file %s in input folder %s after compaction started. Will abort compaction.",
            fstat.getPath(), inputPath));
        return true;
      }
    }
    return false;
  }

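  /**
   * Mark the compaction as complete by writing the compaction timestamp to the
   * {@link MRCompactor#COMPACTION_COMPLETE_FILE_NAME} file in the output directory.
   */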
  private void markOutputDirAsCompleted(DateTime jobStartTime) throws IOException {
    Path completionFilePath = new Path(this.dataset.outputPath(), MRCompactor.COMPACTION_COMPLETE_FILE_NAME);
    try (FSDataOutputStream completionFileStream = this.fs.create(completionFilePath)) {
      completionFileStream.writeLong(jobStartTime.getMillis());
    }
  }

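  /**
   * Replace the dataset's output path with the contents of its tmp output path: delete the existing output path,
   * optionally wait (with retries) for the deletion to become visible, create the parent directory with the
   * configured permission, and move the tmp output into place.
   */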
  private void moveTmpPathToOutputPath() throws IOException {
    Retryer<Void> retryer = RetryerFactory.newInstance(this.retrierConfig);

    LOG.info(String.format("Moving %s to %s", this.dataset.outputTmpPath(), this.dataset.outputPath()));

    this.fs.delete(this.dataset.outputPath(), true);

    if (this.isRetryEnabled) {
      try {
        retryer.call(() -> {
          if (fs.exists(this.dataset.outputPath())) {
            throw new IOException("Path " + this.dataset.outputPath() + " exists however it should not. Will wait more.");
          }
          return null;
        });
      } catch (Exception e) {
        throw new IOException(e);
      }
    }

    WriterUtils.mkdirsWithRecursivePermissionWithRetry(this.fs, this.dataset.outputPath().getParent(), this.perm,
        this.retrierConfig);

    Log.info("Moving from fs: ("+MRCompactorJobRunner.this.tmpFs.getUri()+") path: "+ this.dataset.outputTmpPath() + " to "+ "fs: ("+ FileSystem.get(this.dataset.outputPath().getParent().toUri(), this.fs.getConf()).getUri()+") output path: " + this.dataset.outputPath());
    HadoopUtils.movePath (MRCompactorJobRunner.this.tmpFs, this.dataset.outputTmpPath(), FileSystem.get(this.dataset.outputPath().getParent().toUri(), this.fs.getConf()), this.dataset.outputPath(), false, this.fs.getConf()) ;
  }

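  /**
   * Move the given files from the tmp filesystem into the dataset's output path, giving each one a unique
   * 'late' file name so that existing output files are not overwritten.
   */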
  private void addGoodFilesToOutputPath (List<Path> goodPaths) throws IOException {
    for (Path path: goodPaths) {
      String fileName = path.getName();
      LOG.info(String.format("Adding %s to %s", path.toString(), this.dataset.outputPath()));
      Path outPath = MRCompactorJobRunner.this.lateOutputRecordCountProvider.constructLateFilePath(fileName,
          MRCompactorJobRunner.this.fs, this.dataset.outputPath());

      HadoopUtils.movePath(MRCompactorJobRunner.this.tmpFs, path,
            FileSystem.get(this.dataset.outputPath().getParent().toUri(), this.fs.getConf()), outPath, false, this.fs.getConf());
    }
  }


  private void deleteFilesByPaths(Set<Path> paths) throws IOException {
    for (Path path : paths) {
      HadoopUtils.deletePathAndEmptyAncestors(this.fs, path, true);
    }
  }

  /**
   * Tell the {@link MRCompactorJobRunner} that it can go ahead and publish the data.
   */
  public void proceed() {
    this.policy = Policy.DO_PUBLISH_DATA;
  }

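  /**
   * Tell the {@link MRCompactorJobRunner} to abort as soon as possible without publishing the data.
   */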
  public void abort() {
    this.policy = Policy.ABORT_ASAP;
  }

  /**
   * The status of the MRCompactorJobRunner.
   * @return RUNNING, COMMITTED or ABORTED.
   */
  public Status status() {
    return this.status;
  }

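  /**
   * Order job runners by descending dataset priority, so that runners for higher-priority datasets sort first.
   */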
  @Override
  public int compareTo(MRCompactorJobRunner o) {
    return Double.compare(o.dataset.priority(), this.dataset.priority());
  }

  /**
   * Get the list of file {@link Path}s in the given dataDir, which satisfy the extension requirements
   *  of {@link #getApplicableFileExtensions()}.
   */
  private List<Path> getApplicableFilePaths(final Path dataDir, final FileSystem fs) throws IOException {
    try {
      return applicablePathCache.get(dataDir, new Callable<List<Path>>() {

        @Override
        public List<Path> call() throws Exception {
          if (!MRCompactorJobRunner.this.fs.exists(dataDir)) {
            return Lists.newArrayList();
          }
          List<Path> paths = Lists.newArrayList();
          for (FileStatus fileStatus : FileListUtils.listFilesRecursively(fs, dataDir,
              new PathFilter() {
            @Override
            public boolean accept(Path path) {
              for (String validExtension : getApplicableFileExtensions()) {
                if (path.getName().endsWith(validExtension)) {
                  return true;
                }
              }
              return false;
            }
          })) {
            paths.add(fileStatus.getPath());
          }
          return paths;
        }
      });
    } catch (ExecutionException e) {
      throw new IOException(e);
    }
  }

  /**
   * Submit an event when compaction MR job completes
   */
  private void submitSlaEvent(Job job) {
    try {
      CompactionSlaEventHelper
          .getEventSubmitterBuilder(this.dataset, Optional.of(job), this.fs)
          .eventSubmitter(this.eventSubmitter)
          .eventName(CompactionSlaEventHelper.COMPACTION_COMPLETED_EVENT_NAME)
          .additionalMetadata(
              CompactionSlaEventHelper.LATE_RECORD_COUNT,
              Long.toString(this.lateOutputRecordCountProvider.getRecordCount(this.getApplicableFilePaths(this.dataset
                  .outputLatePath(), this.fs))))
          .additionalMetadata(
              CompactionSlaEventHelper.REGULAR_RECORD_COUNT,
              Long.toString(this.outputRecordCountProvider.getRecordCount(this.getApplicableFilePaths(this.dataset
                  .outputPath(), this.fs))))
          .additionalMetadata(CompactionSlaEventHelper.RECOMPATED_METADATA_NAME,
              Boolean.toString(this.dataset.needToRecompact())).build().submit();
    } catch (Throwable e) {
      LOG.warn("Failed to submit compaction completed event:" + e, e);
    }
  }

  /**
   * Submit an event reporting late record counts and non-late record counts.
   */
  private void submitRecordsCountsEvent() {
    long lateOutputRecordCount = this.datasetHelper.getLateOutputRecordCount();
    long outputRecordCount = this.datasetHelper.getOutputRecordCount();

    try {
      CompactionSlaEventHelper
          .getEventSubmitterBuilder(this.dataset, Optional.<Job> absent(), this.fs)
          .eventSubmitter(this.eventSubmitter)
          .eventName(CompactionSlaEventHelper.COMPACTION_RECORD_COUNT_EVENT)
          .additionalMetadata(CompactionSlaEventHelper.DATASET_OUTPUT_PATH, this.dataset.outputPath().toString())
          .additionalMetadata(
              CompactionSlaEventHelper.LATE_RECORD_COUNT,
              Long.toString(lateOutputRecordCount))
          .additionalMetadata(
              CompactionSlaEventHelper.REGULAR_RECORD_COUNT,
              Long.toString(outputRecordCount))
          .additionalMetadata(CompactionSlaEventHelper.NEED_RECOMPACT, Boolean.toString(this.dataset.needToRecompact()))
          .build().submit();
    } catch (Throwable e) {
      LOG.warn("Failed to submit late event count:" + e, e);
    }
  }
}