org.apache.hadoop.fs.LocatedFileStatus Scala Examples

The following examples show how to use org.apache.hadoop.fs.LocatedFileStatus. Each example notes the open-source project it was taken from, its source file, and its license.
Example 1
Source File: HdfsOps.scala    From eel-sdk    with Apache License 2.0
package io.eels

import com.sksamuel.exts.Logging
import io.eels.util.{HdfsIterator, PathIterator}
import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus, Path}

object HdfsOps extends Logging {

  def makePathVisible(path: Path)(implicit fs: FileSystem): Unit = {
    if (path.getName.startsWith(".")) {
      logger.info(s"Making $path visible by stripping leading .")
      val dest = new Path(path.getParent, path.getName.drop(1))
      fs.rename(path, dest)
    }
  }

  def findFiles(path: Path, recursive: Boolean, fs: FileSystem): Iterator[LocatedFileStatus] = {
    HdfsIterator.remote(fs.listFiles(path, recursive))
  }

  def mkdirsp(path: Path, fs: FileSystem): Boolean = PathIterator(path).forall(fs.mkdirs)
} 
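For context, here is a minimal usage sketch for HdfsOps. It is not part of the original project: the /data/landing path and the staged file name are illustrative assumptions, and the FileSystem is taken from the default Hadoop Configuration.

import io.eels.HdfsOps
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object HdfsOpsUsage extends App {
  // assumption: the default Configuration points at the cluster (or local FS) to scan
  implicit val fs: FileSystem = FileSystem.get(new Configuration())

  // findFiles wraps Hadoop's RemoteIterator in a plain Scala Iterator
  HdfsOps.findFiles(new Path("/data/landing"), recursive = true, fs)
    .filter(_.getLen > 0)
    .foreach(status => println(status.getPath))

  // strip the leading dot from a staged file so downstream readers pick it up
  HdfsOps.makePathVisible(new Path("/data/landing/.part-0000"))
}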
Example 2
Source File: HivePartitionScanner.scala    From eel-sdk    with Apache License 2.0
package io.eels.component.hive

import com.sksamuel.exts.Logging
import com.typesafe.config.{Config, ConfigFactory}
import io.eels.component.hive.partition.PartitionMetaData
import io.eels.schema.PartitionConstraint
import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus}

// scans partitions for files, returning the files and the meta data object for each partition
class HivePartitionScanner(implicit fs: FileSystem) extends Logging {

  private val config: Config = ConfigFactory.load()
  private val missingPartitionAction: String = config.getString("eel.hive.source.missingPartitionAction")

  def scan(partitions: Seq[PartitionMetaData],
           constraints: Seq[PartitionConstraint] = Nil): Map[PartitionMetaData, Seq[LocatedFileStatus]] = {
    logger.debug(s"Scanning ${partitions.size} partitions for applicable files ${partitions.map(_.location).mkString(", ").take(100)}")

    // first we filter out any partitions not matching the constraints
    val filteredPartitions = partitions.filter { meta =>
      constraints.forall(_.eval(meta.partition))
    }
    logger.debug(s"Filtered partitions: ${filteredPartitions.map(_.location).mkString(", ")})")

    // next, we check that the directories that the partitions point to actually exist
    // this will avoid a situation where a location exists in the metastore but not on disk
    val extantPartitions = filteredPartitions.filter { partition =>
      if (fs.exists(partition.location)) {
        true
      } else {
        if (missingPartitionAction == "error") {
          throw new IllegalStateException(s"Partition [${partition.name}] was specified in the hive metastore at [${partition.location}] but did not exist on disk. To disable these exceptions set eel.hive.source.missingPartitionAction=warn or eel.hive.source.missingPartitionAction=none")
        } else if (missingPartitionAction == "warn") {
          logger.warn(s"Partition [${partition.name}] was specified in the hive metastore at [${partition.location}] but did not exist on disk. To disable these warnings set eel.hive.source.missingPartitionAction=none")
          false
        } else {
          false
        }
      }
    }

    // next we grab all the data files from each of these partitions
    extantPartitions.map { meta =>
      meta -> HiveFileScanner(meta.location, false)
    }.toMap
  }
} 
Example 3
Source File: HiveFileScanner.scala    From eel-sdk    with Apache License 2.0
package io.eels.component.hive

import com.sksamuel.exts.Logging
import com.typesafe.config.ConfigFactory
import io.eels.util.HdfsIterator
import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus, Path}

// given a hadoop path, will look for files inside that path that match the
// configured settings for hidden files
// does not return directories
object HiveFileScanner extends Logging {

  private val config = ConfigFactory.load()
  private val ignoreHiddenFiles = config.getBoolean("eel.hive.source.ignoreHiddenFiles")
  private val hiddenFilePattern = config.getString("eel.hive.source.hiddenFilePattern")

  // returns true if the given file should be skipped, based on the config settings
  private def skip(file: LocatedFileStatus): Boolean = {
    file.getLen == 0L || ignoreHiddenFiles && file.getPath.getName.matches(hiddenFilePattern)
  }

  def apply(path: Path, recursive: Boolean)(implicit fs: FileSystem): Seq[LocatedFileStatus] = {
    logger.debug(s"Scanning $path, filtering=$ignoreHiddenFiles, pattern=$hiddenFilePattern")
    val files: List[LocatedFileStatus] = if (fs.exists(path)) {
      val files = fs.listFiles(path, recursive)
      HdfsIterator.remote(files)
          .filter(_.isFile)
          .filterNot(skip)
          .toList
    } else {
      Nil
    }
    logger.debug(s"Scanner found ${files.size} files")
    files
  }
} 
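A minimal usage sketch, assuming eel's reference.conf supplies the eel.hive.source.* settings the scanner reads; the warehouse directory below is a hypothetical placeholder.

import io.eels.component.hive.HiveFileScanner
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object HiveFileScannerUsage extends App {
  implicit val fs: FileSystem = FileSystem.get(new Configuration())

  // hypothetical table location; zero-length and hidden files are filtered out per the config
  val files = HiveFileScanner(new Path("/user/hive/warehouse/mydb.db/mytable"), recursive = false)
  files.foreach(f => println(s"${f.getPath.getName} (${f.getLen} bytes)"))
}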
Example 4
Source File: HiveTableFilesFn.scala    From eel-sdk    with Apache License 2.0
package io.eels.component.hive

import com.sksamuel.exts.Logging
import io.eels.component.hive.partition.PartitionMetaData
import io.eels.schema.{Partition, PartitionConstraint}
import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus, Path}
import org.apache.hadoop.hive.metastore.IMetaStoreClient


object HiveTableFilesFn extends Logging {

  def apply(dbName: String,
            tableName: String,
            tableLocation: Path,
            partitionConstraints: Seq[PartitionConstraint])
           (implicit fs: FileSystem, client: IMetaStoreClient): Map[Partition, Seq[LocatedFileStatus]] = {

    val ops = new HiveOps(client)

    // when we have no partitions, this will scan just the table folder directly for files
    def rootScan(): Map[Partition, Seq[LocatedFileStatus]] = {
      Map(Partition.empty -> HiveFileScanner(tableLocation, false))
    }

    def partitionsScan(partitions: Seq[PartitionMetaData]): Map[Partition, Seq[LocatedFileStatus]] = {
      new HivePartitionScanner().scan(partitions, partitionConstraints)
        .map { case (key, value) => key.partition -> value }
    }

    // the table may or may not have partitions.
    //
    // 1. If we do have partitions then we need to scan the path of each partition
    // (and each partition may be located anywhere outside of the table root)
    //
    // 2. If we do not have partitions then we can simply scan the table root.

    // we go to the metastore as we need the locations of the partitions not the values
    val partitions = ops.partitionsMetaData(dbName, tableName)
    if (partitions.isEmpty && partitionConstraints.nonEmpty) {
      sys.error("Constraints were used on a table that was not partitioned")
    } else if (partitions.isEmpty) {
      logger.debug(s"No partitions for $tableName; performing root table scan")
      rootScan()
    } else partitionsScan(partitions)
  }
} 
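A hypothetical wiring sketch for HiveTableFilesFn. The database, table, and location names are placeholders, and the metastore client is built directly from a HiveConf rather than through eel's own plumbing, so treat this as an assumption-laden illustration rather than the project's recommended setup.

import io.eels.component.hive.HiveTableFilesFn
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.metastore.{HiveMetaStoreClient, IMetaStoreClient}

object HiveTableFilesUsage extends App {
  implicit val fs: FileSystem = FileSystem.get(new Configuration())
  implicit val client: IMetaStoreClient = new HiveMetaStoreClient(new HiveConf())

  // no partition constraints: scans every partition (or the table root if unpartitioned)
  val filesByPartition = HiveTableFilesFn(
    "mydb", "mytable", new Path("/user/hive/warehouse/mydb.db/mytable"), Nil)

  filesByPartition.foreach { case (partition, files) =>
    println(s"$partition -> ${files.size} files")
  }
}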
Example 5
Source File: HiveFilePublisher.scala    From eel-sdk    with Apache License 2.0
package io.eels.component.hive

import com.sksamuel.exts.io.Using
import io.eels.datastream.{Subscription, Publisher, Subscriber}
import io.eels.schema.{Partition, StructType}
import io.eels.{Predicate, _}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus}


class HiveFilePublisher(dialect: HiveDialect,
                        file: LocatedFileStatus,
                        metastoreSchema: StructType,
                        projectionSchema: StructType,
                        predicate: Option[Predicate],
                        partition: Partition)
                       (implicit fs: FileSystem, conf: Configuration) extends Publisher[Seq[Row]] with Using {
  require(projectionSchema.fieldNames.forall { it => it == it.toLowerCase() }, "Use only lower-case field names with Hive")

  override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = {

    val partitionMap: Map[String, Any] = partition.entries.map { it => (it.key, it.value) }.toMap

    // the schema we send to the dialect must have any partition fields removed, because those
    // fields won't exist in the data files: partition values are generally not written to the
    // files themselves but are instead derived from the partition path.
    val projectionFields = projectionSchema.fields.filterNot(field => partition.containsKey(field.name))
    val projectionWithoutPartitions = StructType(projectionFields)

    // since we removed the partition fields from the target schema, we must repopulate them after the read
    // we also need to throw away the dummy field if we had an empty schema
    val publisher = dialect.input(file.getPath, metastoreSchema, projectionWithoutPartitions, predicate)
    publisher.subscribe(new Subscriber[Seq[Row]] {
      override def subscribed(s: Subscription): Unit = subscriber.subscribed(s)
      override def next(chunk: Seq[Row]): Unit = {
        val aligned = chunk.map { row =>
          if (projectionFields.isEmpty) {
            val values = projectionSchema.fieldNames().map(partitionMap.apply)
            Row(projectionSchema, values.toVector)
          } else {
            RowUtils.rowAlign(row, projectionSchema, partitionMap)
          }
        }
        subscriber.next(aligned)
      }
      override def completed(): Unit = subscriber.completed()
      override def error(t: Throwable): Unit = subscriber.error(t)
    })
  }
} 
Example 6
Source File: ShapeInputFormat.scala    From magellan    with Apache License 2.0
package magellan.mapreduce

import com.google.common.base.Stopwatch
import magellan.io.{ShapeKey, ShapeWritable}
import org.apache.commons.logging.LogFactory
import org.apache.hadoop.fs.{LocatedFileStatus, Path}
import org.apache.hadoop.mapreduce.lib.input._
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, TaskAttemptContext}

import scala.collection.JavaConversions._
import scala.collection.mutable.ListBuffer

private[magellan] class ShapeInputFormat
  extends FileInputFormat[ShapeKey, ShapeWritable] {

  private val log = LogFactory.getLog(classOf[ShapeInputFormat])

  override def createRecordReader(inputSplit: InputSplit,
    taskAttemptContext: TaskAttemptContext) = {
    new ShapefileReader
  }

  override def isSplitable(context: JobContext, filename: Path): Boolean = true

  override def getSplits(job: JobContext): java.util.List[InputSplit] = {
    val splitInfos = SplitInfos.SPLIT_INFO_MAP.get()
    computeSplits(job, splitInfos)
  }

  private def computeSplits(
       job: JobContext,
       splitInfos: scala.collection.Map[String, Array[Long]]) = {

    val sw = new Stopwatch().start
    val splits = ListBuffer[InputSplit]()
    val files = listStatus(job)
    for (file <- files) {
      val path = file.getPath
      val length = file.getLen
      val blkLocations = if (file.isInstanceOf[LocatedFileStatus]) {
        file.asInstanceOf[LocatedFileStatus].getBlockLocations
      } else {
        val fs = path.getFileSystem(job.getConfiguration)
        fs.getFileBlockLocations(file, 0, length)
      }
      val key = path.getName.split("\\.shp$")(0)
      if (splitInfos == null || !splitInfos.containsKey(key)) {
        val blkIndex = getBlockIndex(blkLocations, 0)
        splits += makeSplit(path, 0, length, blkLocations(blkIndex).getHosts,
          blkLocations(blkIndex).getCachedHosts)
      } else {
        // the supplied offsets mark the start of each record block; each split ends where
        // the next one begins, with the final split running to the end of the file
        val startOffsets = splitInfos(key).toSeq
        val endOffsets = startOffsets.drop(1) ++ Seq(length)
        startOffsets.zip(endOffsets).foreach { case (startOffset: Long, endOffset: Long) =>
          val blkIndex = getBlockIndex(blkLocations, startOffset)
          splits += makeSplit(path, startOffset, endOffset - startOffset, blkLocations(blkIndex).getHosts,
            blkLocations(blkIndex).getCachedHosts)
        }
      }
    }
    sw.stop
    if (log.isDebugEnabled) {
      log.debug("Total # of splits generated by getSplits: " + splits.size + ", TimeTaken: " + sw.elapsedMillis)
    }
    splits
  }
}

object SplitInfos {

  // TODO: Can we get rid of this hack to pass split calculation to the Shapefile Reader?
  val SPLIT_INFO_MAP = new ThreadLocal[scala.collection.Map[String, Array[Long]]]

} 
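Because SPLIT_INFO_MAP is a ThreadLocal, the split offsets must be seeded on the thread that plans the job before getSplits runs. A minimal sketch with made-up offsets, keyed by the shapefile name without its .shp extension (the same key computeSplits derives from the path):

import magellan.mapreduce.SplitInfos

object SeedSplitInfos extends App {
  // hypothetical offsets: each value marks the start of a record block inside points.shp
  val offsets: scala.collection.Map[String, Array[Long]] = Map("points" -> Array(0L, 4096L, 8192L))

  // must be set on the thread that later calls getSplits, since the map is thread-local
  SplitInfos.SPLIT_INFO_MAP.set(offsets)
}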
Example 7
Source File: SerializableFileStatus.scala    From delta    with Apache License 2.0
package org.apache.spark.sql.delta.util

import java.util.Objects

import org.apache.hadoop.fs.{FileStatus, LocatedFileStatus, Path}


case class SerializableFileStatus(
    path: String,
    length: Long,
    isDir: Boolean,
    modificationTime: Long) {

  // Important note! This is very expensive to compute, but we don't want to cache it
  // as a `val` because Paths internally contain URIs and therefore consume lots of memory.
  def getPath: Path = new Path(path)
  def getLen: Long = length
  def getModificationTime: Long = modificationTime
  def isDirectory: Boolean = isDir

  def toFileStatus: FileStatus = {
    new LocatedFileStatus(
      new FileStatus(length, isDir, 0, 0, modificationTime, new Path(path)),
      null)
  }

  override def equals(obj: Any): Boolean = obj match {
    case other: SerializableFileStatus =>
      // We only compare the paths to stay consistent with FileStatus.equals.
      Objects.equals(path, other.path)
    case _ => false
  }

  override def hashCode(): Int = {
    // We only use the path to stay consistent with FileStatus.hashCode.
    Objects.hashCode(path)
  }
}

object SerializableFileStatus {
  def fromStatus(status: FileStatus): SerializableFileStatus = {
    SerializableFileStatus(
      Option(status.getPath).map(_.toString).orNull,
      status.getLen,
      status.isDirectory,
      status.getModificationTime)
  }

  val EMPTY: SerializableFileStatus = fromStatus(new FileStatus())
} 
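A round-trip sketch, assuming any file readable through the default FileSystem (the path below is a placeholder). Note that toFileStatus wraps the result in a LocatedFileStatus with null block locations, so only path, length, directory flag, and modification time survive the round trip.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.sql.delta.util.SerializableFileStatus

object DeltaFileStatusRoundTrip extends App {
  val fs = FileSystem.get(new Configuration())
  // hypothetical path; substitute any existing file
  val status = fs.getFileStatus(new Path("/tmp/example.parquet"))

  val serializable = SerializableFileStatus.fromStatus(status)
  println(s"${serializable.path} (${serializable.length} bytes)")

  val restored = serializable.toFileStatus
  println(restored.getModificationTime)
}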
Example 8
Source File: SerializableFileStatus.scala    From parquet-index    with Apache License 2.0
package com.github.lightcopy.util

import org.apache.hadoop.fs.{BlockLocation, FileStatus, LocatedFileStatus, Path}


object SerializableFileStatus {
  def fromFileStatus(status: FileStatus): SerializableFileStatus = {
    val blockLocations = status match {
      case f: LocatedFileStatus =>
        f.getBlockLocations.map { loc =>
          SerializableBlockLocation(loc.getNames, loc.getHosts, loc.getOffset, loc.getLength)
        }
      case _ =>
        Array.empty[SerializableBlockLocation]
    }

    SerializableFileStatus(
      status.getPath.toString,
      status.getLen,
      status.isDirectory,
      status.getReplication,
      status.getBlockSize,
      status.getModificationTime,
      status.getAccessTime,
      blockLocations)
  }

  def toFileStatus(status: SerializableFileStatus): FileStatus = {
    val blockLocations = status.blockLocations.map { loc =>
      new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length)
    }

    new LocatedFileStatus(
      new FileStatus(
        status.length,
        status.isDir,
        status.blockReplication,
        status.blockSize,
        status.modificationTime,
        new Path(status.path)),
      blockLocations)
  }
}
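Unlike the Delta Lake variant above, this version also carries replication, block size, access time, and block locations. A round-trip sketch, assuming a directory with at least one entry on the default FileSystem (the /tmp path is a placeholder):

import com.github.lightcopy.util.SerializableFileStatus
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object IndexFileStatusRoundTrip extends App {
  val fs = FileSystem.get(new Configuration())
  // listLocatedStatus yields LocatedFileStatus, so block locations are captured too
  val located = fs.listLocatedStatus(new Path("/tmp")).next()

  val serializable = SerializableFileStatus.fromFileStatus(located)
  val restored = SerializableFileStatus.toFileStatus(serializable)
  println(s"${restored.getPath} (${restored.getLen} bytes, replication ${restored.getReplication})")
}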