package org.hammerlab.hadoop.splits

import java.io.IOException
import java.util

import org.apache.hadoop.fs.{ FileStatus, FileSystem, Path ⇒ HPath }
import org.apache.hadoop.mapred.{ JobConf, SequenceFileInputFormat }
import org.apache.hadoop.mapreduce.JobContext
import org.apache.hadoop.mapreduce.lib.input

import scala.collection.JavaConverters._

/**
 * Old-API ([[org.apache.hadoop.mapred]]) [[SequenceFileInputFormat]] that never splits its input files and lists them
 * ordered by the index extracted from each file's basename, so that data is loaded back with the same splits it was
 * written with.
 *
 * @tparam K key type of the sequence file records
 * @tparam V value type of the sequence file records
 */
class UnsplittableSequenceFileInputFormat[K, V]
  extends SequenceFileInputFormat[K, V] {

  /** Always `false`, regardless of the file system or file passed in. */
  override def isSplitable(fs: FileSystem, filename: HPath): Boolean = false

  /**
   * List the job's input files sorted by the index that [[PartFileBasename]] extracts from each basename, ensuring that
   * partitions are read back in the same order they were written.
   *
   * Should be unnecessary as of Hadoop 2.8 / 3.x; see https://issues.apache.org/jira/browse/HADOOP-10798
   *
   * @param job configuration handed to the parent's `listStatus`
   * @return the parent's file statuses, reordered by partition index
   * @throws IllegalArgumentException if a listed file's basename is not matched by [[PartFileBasename]]
   */
  override def listStatus(job: JobConf): Array[FileStatus] = {
    val statuses = super.listStatus(job)

    statuses.sortBy {
      status ⇒
        status.getPath.getName match {
          case PartFileBasename(index) ⇒ index
          case name ⇒
            // Anything that doesn't look like a partition file is unexpected: fail instead of guessing its position
            throw new IllegalArgumentException(s"Bad partition file: $name")
        }
    }
  }
}

/**
 * New-API ([[org.apache.hadoop.mapreduce]]) [[input.SequenceFileInputFormat]] that never splits its input files and
 * lists them ordered by the index extracted from each file's basename.
 *
 * @tparam K key type of the sequence file records
 * @tparam V value type of the sequence file records
 */
class UnsplittableNewSequenceFileInputFormat[K, V]
  extends input.SequenceFileInputFormat[K, V] {

  /** Always `false`, regardless of the job context or file passed in. */
  override def isSplitable(context: JobContext, filename: HPath): Boolean = false

  /**
   * List the job's input files sorted by the index that [[PartFileBasename]] extracts from each basename, ensuring that
   * partitions are read back in the same order they were written.
   *
   * Should be unnecessary as of Hadoop 2.8 / 3.x; see https://issues.apache.org/jira/browse/HADOOP-10798
   *
   * @param job context handed to the parent's `listStatus`
   * @return the parent's file statuses, reordered by partition index
   * @throws IllegalArgumentException if a listed file's basename is not matched by [[PartFileBasename]]
   */
  override def listStatus(job: JobContext): util.List[FileStatus] = {
    // View the parent's Java list as a Scala collection so it can be sorted with `sortBy`
    val statuses = super.listStatus(job).asScala

    val ordered =
      statuses.sortBy {
        status ⇒
          status.getPath.getName match {
            case PartFileBasename(index) ⇒ index
            case name ⇒
              // Anything that doesn't look like a partition file is unexpected: fail instead of guessing its position
              throw new IllegalArgumentException(s"Bad partition file: $name")
          }
      }

    ordered.asJava
  }
}