org.apache.hadoop.fs.FileStatus Scala Examples

The following examples show how to use org.apache.hadoop.fs.FileStatus. Each example is extracted from an open-source project; the source file, originating project, and license are noted in the heading above each example.
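Before the project examples, here is a minimal sketch of the FileStatus accessors most of them rely on (getPath, getLen, isDirectory, getModificationTime). The directory path below is a hypothetical placeholder; any Hadoop-compatible filesystem behaves the same way.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}

object FileStatusBasics {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    val dir = new Path("/tmp/data") // hypothetical directory
    val fs: FileSystem = dir.getFileSystem(conf)

    // listStatus returns one FileStatus per child entry of the directory
    val statuses: Array[FileStatus] = fs.listStatus(dir)
    statuses.foreach { status =>
      println(s"${status.getPath}: len=${status.getLen}, " +
        s"dir=${status.isDirectory}, mtime=${status.getModificationTime}")
    }
  }
}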
Example 1
Source File: TextFileFormat.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.text

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, Path}
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, TextOutputFormat}
import org.apache.hadoop.util.ReflectionUtils

import org.apache.spark.TaskContext
import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRowWriter}
import org.apache.spark.sql.catalyst.util.CompressionCodecs
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.util.SerializableConfiguration


  def getCompressionExtension(context: TaskAttemptContext): String = {
    // Set the compression extension, similar to code in TextOutputFormat.getDefaultWorkFile
    if (FileOutputFormat.getCompressOutput(context)) {
      val codecClass = FileOutputFormat.getOutputCompressorClass(context, classOf[GzipCodec])
      ReflectionUtils.newInstance(codecClass, context.getConfiguration).getDefaultExtension
    } else {
      ""
    }
  }
} 
Example 2
Source File: SerializableFileStatus.scala    From delta   with Apache License 2.0
package org.apache.spark.sql.delta.util

import java.util.Objects

import org.apache.hadoop.fs.{FileStatus, LocatedFileStatus, Path}


case class SerializableFileStatus(
    path: String,
    length: Long,
    isDir: Boolean,
    modificationTime: Long) {

  // Important note! This is very expensive to compute, but we don't want to cache it
  // as a `val` because Paths internally contain URIs and therefore consume lots of memory.
  def getPath: Path = new Path(path)
  def getLen: Long = length
  def getModificationTime: Long = modificationTime
  def isDirectory: Boolean = isDir

  def toFileStatus: FileStatus = {
    new LocatedFileStatus(
      new FileStatus(length, isDir, 0, 0, modificationTime, new Path(path)),
      null)
  }

  override def equals(obj: Any): Boolean = obj match {
    case other: SerializableFileStatus =>
      // We only compare the paths to stay consistent with FileStatus.equals.
      Objects.equals(path, other.path)
    case _ => false
  }

  override def hashCode(): Int = {
    // We only use the path to stay consistent with FileStatus.hashCode.
    Objects.hashCode(path)
  }
}

object SerializableFileStatus {
  def fromStatus(status: FileStatus): SerializableFileStatus = {
    SerializableFileStatus(
      Option(status.getPath).map(_.toString).orNull,
      status.getLen,
      status.isDirectory,
      status.getModificationTime)
  }

  val EMPTY: SerializableFileStatus = fromStatus(new FileStatus())
} 
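A minimal usage sketch for the class above: convert a live FileStatus into the serializable form and rebuild a FileStatus from it. The file path is a hypothetical placeholder.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.delta.util.SerializableFileStatus

object SerializableFileStatusRoundTrip {
  def main(args: Array[String]): Unit = {
    val file = new Path("/tmp/delta-table/part-00000.parquet") // hypothetical path
    val fs = file.getFileSystem(new Configuration())

    // FileStatus -> SerializableFileStatus -> FileStatus
    val serializable = SerializableFileStatus.fromStatus(fs.getFileStatus(file))
    val restored = serializable.toFileStatus
    assert(restored.getLen == serializable.length)
    assert(restored.getPath.toString == serializable.path)
  }
}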
Example 3
Source File: BgenSchemaInferrer.scala    From glow   with Apache License 2.0
package io.projectglow.bgen

import com.google.common.io.LittleEndianDataInputStream
import org.apache.hadoop.fs.{FileStatus, Path}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.StructType

import io.projectglow.common.{BgenOptions, CommonOptions, VariantSchemas, WithUtils}
import io.projectglow.sql.util.SerializableConfiguration


object BgenSchemaInferrer {
  def inferSchema(
      spark: SparkSession,
      files: Seq[FileStatus],
      options: Map[String, String]): StructType = {
    val shouldIncludeSampleIds = options.get(CommonOptions.INCLUDE_SAMPLE_IDS).forall(_.toBoolean)
    if (!shouldIncludeSampleIds) {
      return VariantSchemas.bgenDefaultSchema(hasSampleIds = false)
    }

    val sampleIdsFromSampleFile =
      BgenFileFormat.getSampleIds(options, spark.sparkContext.hadoopConfiguration)
    if (sampleIdsFromSampleFile.isDefined) {
      return VariantSchemas.bgenDefaultSchema(hasSampleIds = true)
    }

    val serializableConf = new SerializableConfiguration(spark.sparkContext.hadoopConfiguration)
    val ignoreExtension = options.get(BgenOptions.IGNORE_EXTENSION_KEY).exists(_.toBoolean)
    val bgenPaths =
      files.filter { fs =>
        fs.getLen > 0 && (fs
          .getPath
          .toString
          .endsWith(BgenFileFormat.BGEN_SUFFIX) || ignoreExtension)
      }.map(_.getPath.toString)
    val hasSampleIds = spark
      .sparkContext
      .parallelize(bgenPaths)
      .map { path =>
        val hPath = new Path(path)
        val hadoopFs = hPath.getFileSystem(serializableConf.value)
        WithUtils.withCloseable(hadoopFs.open(hPath)) { stream =>
          val littleEndianDataInputStream = new LittleEndianDataInputStream(stream)
          new BgenHeaderReader(littleEndianDataInputStream)
            .readHeader(None)
            .sampleIds
            .exists(_.nonEmpty)
        }
      }
      .collect()
      .exists(identity)

    VariantSchemas.bgenDefaultSchema(hasSampleIds)
  }
} 
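The filtering step above is a common FileStatus pattern: keep only non-empty files whose paths carry the expected suffix (or keep everything when extension checks are disabled). A standalone sketch of just that step, with the suffix as a hypothetical parameter:

import org.apache.hadoop.fs.FileStatus

object FileStatusFiltering {
  def selectPaths(
      files: Seq[FileStatus],
      suffix: String, // e.g. ".bgen" (hypothetical)
      ignoreExtension: Boolean): Seq[String] = {
    files
      .filter(fs => fs.getLen > 0 && (fs.getPath.toString.endsWith(suffix) || ignoreExtension))
      .map(_.getPath.toString)
  }
}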
Example 4
Source File: XyzRelation.scala    From spark-iqmulus   with Apache License 2.0
package fr.ign.spark.iqmulus.xyz

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types._
import org.apache.spark.sql.sources.HadoopFsRelation
import org.apache.spark.sql.Row
import org.apache.spark.rdd.RDD
import org.apache.hadoop.fs.FileStatus

object XyzRelation {

  val xyzrgbSchema = StructType(Array(
    StructField("x", FloatType, false),
    StructField("y", FloatType, false),
    StructField("z", FloatType, false),
    StructField("r", ByteType, false),
    StructField("g", ByteType, false),
    StructField("b", ByteType, false)
  ))

  val xyzSchema = StructType(Array(
    StructField("x", FloatType, false),
    StructField("y", FloatType, false),
    StructField("z", FloatType, false)
  ))

}

class XyzRelation(
  override val paths: Array[String],
  maybeDataSchema: Option[StructType],
  override val userDefinedPartitionColumns: Option[StructType],
  parameters: Map[String, String]
)(@transient val sqlContext: SQLContext)
    extends HadoopFsRelation {

  override lazy val dataSchema = maybeDataSchema.getOrElse(XyzRelation.xyzrgbSchema)

  override def prepareJobForWrite(job: org.apache.hadoop.mapreduce.Job): org.apache.spark.sql.sources.OutputWriterFactory = ???

  override def buildScan(inputs: Array[FileStatus]): RDD[Row] = {
    val lines = sqlContext.sparkContext.textFile(inputs.map(_.getPath).mkString("", ",", ""))
    val dataTypes = dataSchema.fields.map(_.dataType)
    lines map (line => Row.fromSeq((line.split("\t") zip dataTypes).map {
      case (x, StringType) => x
      case (x, ByteType) => x.toByte
      case (x, ShortType) => x.toShort
      case (x, IntegerType) => x.toInt
      case (x, LongType) => x.toLong
      case (x, FloatType) => x.toFloat
      case (x, DoubleType) => x.toDouble
      case _ => null
    }.padTo(dataTypes.size, null)))
  }

} 
Example 5
Source File: S3ALineCountWritebackSuite.scala    From cloud-integration   with Apache License 2.0
package com.cloudera.spark.cloud.s3

import scala.concurrent.duration._
import scala.language.postfixOps

import com.cloudera.spark.cloud.common.CloudSuiteWithCSVDatasource
import org.apache.hadoop.fs.{FileStatus, Path}


class S3ALineCountWritebackSuite extends CloudSuiteWithCSVDatasource with S3ATestSetup {

  init()

  def init(): Unit = {
    // propagate S3 credentials
    if (enabled) {
      initFS()
    }
  }

  override def enabled: Boolean = super.enabled && hasCSVTestFile

  override def cleanFSInTeardownEnabled: Boolean = true

  after {
    cleanFilesystemInTeardown()
  }

  ctest("LineCountWriteback",
    "Execute the LineCount example with the results written back to the test filesystem.") {
    val sourceFile = getTestCSVPath()
    val sourceFS = sourceFile.getFileSystem(getConf)
    val sourceInfo = sourceFS.getFileStatus(sourceFile)
    val sparkConf = newSparkConf()
    sparkConf.setAppName("LineCount")
    val destDir = testPath(filesystem, "LineCountWriteback")
    assert(0 === S3ALineCount.action(sparkConf,
      Array(sourceFile.toString, destDir.toString)))


    val status = filesystem.getFileStatus(destDir)
    assert(status.isDirectory, s"Not a directory: $status")

    // only a small fraction of the source data is needed
    val expectedLen = sourceInfo.getLen / 1024

    def validateChildSize(qualifier: String, files: Seq[FileStatus]) = {
      val (filenames, size) = enumFileSize(destDir, files)
      logInfo(s"total size of $qualifier = $size bytes from ${files.length} files: $filenames")
      assert(size >= expectedLen, s"$qualifier size $size in files $filenames" +
          s" smaller than exoected length $expectedLen")
    }

    val stdInterval = interval(100 milliseconds)
    val appId = eventually(timeout(20 seconds), stdInterval) {
      validateChildSize("descendants",
        listFiles(filesystem, destDir, true)
            .filter(f => f.getPath.getName != "_SUCCESS"))

      validateChildSize("children",
        filesystem.listStatus(destDir,
          pathFilter(p => p.getName != "_SUCCESS")).toSeq)
    }
  }

  private def enumFileSize(destDir: Path, files: Seq[FileStatus]): (String, Long)  = {
    assert(files.nonEmpty, s"No files in destination directory $destDir")
    var size = 0L
    val filenames = new StringBuffer()
    files.foreach { f =>
      size += f.getLen
      filenames.append(" ").append(f.getPath)
    }
    (filenames.toString, size)
  }

} 
Example 6
Source File: SerializableFileStatus.scala    From parquet-index   with Apache License 2.0
package com.github.lightcopy.util

import org.apache.hadoop.fs.{BlockLocation, FileStatus, LocatedFileStatus, Path}


object SerializableFileStatus {
  def fromFileStatus(status: FileStatus): SerializableFileStatus = {
    val blockLocations = status match {
      case f: LocatedFileStatus =>
        f.getBlockLocations.map { loc =>
          SerializableBlockLocation(loc.getNames, loc.getHosts, loc.getOffset, loc.getLength)
        }
      case _ =>
        Array.empty[SerializableBlockLocation]
    }

    SerializableFileStatus(
      status.getPath.toString,
      status.getLen,
      status.isDirectory,
      status.getReplication,
      status.getBlockSize,
      status.getModificationTime,
      status.getAccessTime,
      blockLocations)
  }

  def toFileStatus(status: SerializableFileStatus): FileStatus = {
    val blockLocations = status.blockLocations.map { loc =>
      new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length)
    }

    new LocatedFileStatus(
      new FileStatus(
        status.length,
        status.isDir,
        status.blockReplication,
        status.blockSize,
        status.modificationTime,
        new Path(status.path)),
      blockLocations)
  }
} 
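The SerializableFileStatus and SerializableBlockLocation case classes used by the object above were not included in the extracted snippet. Plausible definitions reconstructed from their usage (the field names come from the snippet; the field types are inferred and therefore assumptions):

case class SerializableBlockLocation(
    names: Array[String],
    hosts: Array[String],
    offset: Long,
    length: Long)

case class SerializableFileStatus(
    path: String,
    length: Long,
    isDir: Boolean,
    blockReplication: Int,
    blockSize: Long,
    modificationTime: Long,
    accessTime: Long,
    blockLocations: Array[SerializableBlockLocation])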
Example 7
Source File: SageMakerProtobufFileFormat.scala    From sagemaker-spark   with Apache License 2.0
package com.amazonaws.services.sagemaker.sparksdk.protobuf

import org.apache.hadoop.fs.FileStatus
import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriter, OutputWriterFactory}
import org.apache.spark.sql.sources.DataSourceRegister
import org.apache.spark.sql.types.StructType

class SageMakerProtobufFileFormat extends FileFormat with DataSourceRegister {

  override def inferSchema(sparkSession: SparkSession,
                           options: Map[String, String],
                           files: Seq[FileStatus]):
  Option[StructType] = {
    Option.empty
  }

  override def shortName(): String = "sagemaker"

  override def toString: String = "SageMaker"

  override def prepareWrite(
                             sparkSession: SparkSession,
                             job: Job,
                             options: Map[String, String],
                             dataSchema: StructType): OutputWriterFactory = {
    new OutputWriterFactory {
      override def newInstance(
                                path: String,
                                dataSchema: StructType,
                                context: TaskAttemptContext): OutputWriter = {
        new SageMakerProtobufWriter(path, context, dataSchema, options)
      }

      override def getFileExtension(context: TaskAttemptContext): String = {
        ".pbr"
      }
    }
  }
} 
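Since prepareWrite is implemented while inferSchema returns Option.empty, this format is used on the write path. A hedged sketch of selecting it by its registered short name; the output path is hypothetical, and the column layout (a double label plus a vector of features) is an assumption about what SageMakerProtobufWriter expects:

import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

object SageMakerProtobufWriteSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("sagemaker-protobuf-write").getOrCreate()
    import spark.implicits._

    // Column names/types are an assumption about the writer's expected input.
    val df = Seq((1.0, Vectors.dense(0.1, 0.2)), (0.0, Vectors.dense(0.3, 0.4)))
      .toDF("label", "features")

    // "sagemaker" is the shortName() registered above; the path is a placeholder.
    df.write.format("sagemaker").save("/tmp/sagemaker-protobuf")
  }
}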
Example 8
Source File: SparkHadoopUtilSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.deploy

import java.security.PrivilegedExceptionAction

import scala.util.Random

import org.apache.hadoop.fs.FileStatus
import org.apache.hadoop.fs.permission.{FsAction, FsPermission}
import org.apache.hadoop.security.UserGroupInformation
import org.scalatest.Matchers

import org.apache.spark.SparkFunSuite

class SparkHadoopUtilSuite extends SparkFunSuite with Matchers {
  test("check file permission") {
    import FsAction._
    val testUser = s"user-${Random.nextInt(100)}"
    val testGroups = Array(s"group-${Random.nextInt(100)}")
    val testUgi = UserGroupInformation.createUserForTesting(testUser, testGroups)

    testUgi.doAs(new PrivilegedExceptionAction[Void] {
      override def run(): Void = {
        val sparkHadoopUtil = new SparkHadoopUtil

        // If file is owned by user and user has access permission
        var status = fileStatus(testUser, testGroups.head, READ_WRITE, READ_WRITE, NONE)
        sparkHadoopUtil.checkAccessPermission(status, READ) should be(true)
        sparkHadoopUtil.checkAccessPermission(status, WRITE) should be(true)

        // If file is owned by user but user has no access permission
        status = fileStatus(testUser, testGroups.head, NONE, READ_WRITE, NONE)
        sparkHadoopUtil.checkAccessPermission(status, READ) should be(false)
        sparkHadoopUtil.checkAccessPermission(status, WRITE) should be(false)

        val otherUser = s"test-${Random.nextInt(100)}"
        val otherGroup = s"test-${Random.nextInt(100)}"

        // If file is owned by user's group and user's group has access permission
        status = fileStatus(otherUser, testGroups.head, NONE, READ_WRITE, NONE)
        sparkHadoopUtil.checkAccessPermission(status, READ) should be(true)
        sparkHadoopUtil.checkAccessPermission(status, WRITE) should be(true)

        // If file is owned by user's group but user's group has no access permission
        status = fileStatus(otherUser, testGroups.head, READ_WRITE, NONE, NONE)
        sparkHadoopUtil.checkAccessPermission(status, READ) should be(false)
        sparkHadoopUtil.checkAccessPermission(status, WRITE) should be(false)

        // If file is owned by other user and this user has access permission
        status = fileStatus(otherUser, otherGroup, READ_WRITE, READ_WRITE, READ_WRITE)
        sparkHadoopUtil.checkAccessPermission(status, READ) should be(true)
        sparkHadoopUtil.checkAccessPermission(status, WRITE) should be(true)

        // If file is owned by other user but this user has no access permission
        status = fileStatus(otherUser, otherGroup, READ_WRITE, READ_WRITE, NONE)
        sparkHadoopUtil.checkAccessPermission(status, READ) should be(false)
        sparkHadoopUtil.checkAccessPermission(status, WRITE) should be(false)

        null
      }
    })
  }

  private def fileStatus(
      owner: String,
      group: String,
      userAction: FsAction,
      groupAction: FsAction,
      otherAction: FsAction): FileStatus = {
    new FileStatus(0L,
      false,
      0,
      0L,
      0L,
      0L,
      new FsPermission(userAction, groupAction, otherAction),
      owner,
      group,
      null)
  }
} 
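The fileStatus helper above uses the ten-argument FileStatus constructor to build a synthetic status carrying only ownership and permission metadata. A standalone sketch of the same constructor, with hypothetical owner and group names, reading the fields back:

import org.apache.hadoop.fs.FileStatus
import org.apache.hadoop.fs.permission.{FsAction, FsPermission}

object SyntheticFileStatus {
  def main(args: Array[String]): Unit = {
    val status = new FileStatus(
      0L, false, 0, 0L, 0L, 0L,
      new FsPermission(FsAction.READ_WRITE, FsAction.READ, FsAction.NONE),
      "alice", "analysts", null) // owner/group are hypothetical; no Path is attached
    println(status.getOwner)                    // alice
    println(status.getGroup)                    // analysts
    println(status.getPermission.getUserAction) // READ_WRITE
  }
}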
Example 9
Source File: FileStreamSinkLog.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.net.URI

import org.apache.hadoop.fs.{FileStatus, Path}
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.internal.SQLConf


class FileStreamSinkLog(
    metadataLogVersion: Int,
    sparkSession: SparkSession,
    path: String)
  extends CompactibleFileStreamLog[SinkFileStatus](metadataLogVersion, sparkSession, path) {

  private implicit val formats = Serialization.formats(NoTypeHints)

  protected override val fileCleanupDelayMs = sparkSession.sessionState.conf.fileSinkLogCleanupDelay

  protected override val isDeletingExpiredLog = sparkSession.sessionState.conf.fileSinkLogDeletion

  protected override val defaultCompactInterval =
    sparkSession.sessionState.conf.fileSinkLogCompactInterval

  require(defaultCompactInterval > 0,
    s"Please set ${SQLConf.FILE_SINK_LOG_COMPACT_INTERVAL.key} (was $defaultCompactInterval) " +
      "to a positive value.")

  override def compactLogs(logs: Seq[SinkFileStatus]): Seq[SinkFileStatus] = {
    val deletedFiles = logs.filter(_.action == FileStreamSinkLog.DELETE_ACTION).map(_.path).toSet
    if (deletedFiles.isEmpty) {
      logs
    } else {
      logs.filter(f => !deletedFiles.contains(f.path))
    }
  }
}

object FileStreamSinkLog {
  val VERSION = 1
  val DELETE_ACTION = "delete"
  val ADD_ACTION = "add"
} 
Example 10
Source File: MetadataLogFileIndex.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import scala.collection.mutable

import org.apache.hadoop.fs.{FileStatus, Path}

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.types.StructType



class MetadataLogFileIndex(
    sparkSession: SparkSession,
    path: Path,
    userPartitionSchema: Option[StructType])
  extends PartitioningAwareFileIndex(sparkSession, Map.empty, userPartitionSchema) {

  private val metadataDirectory = new Path(path, FileStreamSink.metadataDir)
  logInfo(s"Reading streaming file log from $metadataDirectory")
  private val metadataLog =
    new FileStreamSinkLog(FileStreamSinkLog.VERSION, sparkSession, metadataDirectory.toUri.toString)
  private val allFilesFromLog = metadataLog.allFiles().map(_.toFileStatus).filterNot(_.isDirectory)
  private var cachedPartitionSpec: PartitionSpec = _

  override protected val leafFiles: mutable.LinkedHashMap[Path, FileStatus] = {
    new mutable.LinkedHashMap ++= allFilesFromLog.map(f => f.getPath -> f)
  }

  override protected val leafDirToChildrenFiles: Map[Path, Array[FileStatus]] = {
    allFilesFromLog.toArray.groupBy(_.getPath.getParent)
  }

  override def rootPaths: Seq[Path] = path :: Nil

  override def refresh(): Unit = { }

  override def partitionSpec(): PartitionSpec = {
    if (cachedPartitionSpec == null) {
      cachedPartitionSpec = inferPartitioning()
    }
    cachedPartitionSpec
  }
} 
Example 11
Source File: KyuubiDistributedCacheManagerSuite.scala    From kyuubi   with Apache License 2.0
package org.apache.spark.deploy.yarn

import java.net.URI

import scala.collection.mutable.{HashMap, Map}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}
import org.apache.hadoop.yarn.api.records.{LocalResource, LocalResourceType, LocalResourceVisibility}
import org.apache.hadoop.yarn.util.ConverterUtils
import org.apache.spark.{KyuubiSparkUtil, SparkFunSuite}
import org.mockito.Mockito.when
import org.scalatest.mock.MockitoSugar

import yaooqinn.kyuubi.utils.ReflectUtils

class KyuubiDistributedCacheManagerSuite extends SparkFunSuite with MockitoSugar {

  class MockClientDistributedCacheManager extends ClientDistributedCacheManager {
    override def getVisibility(conf: Configuration, uri: URI, statCache: Map[URI, FileStatus]):
    LocalResourceVisibility = {
      LocalResourceVisibility.PRIVATE
    }
  }

  test("add resource") {
    val fs = mock[FileSystem]
    val conf = new Configuration()
    val destPath = new Path("file:///foo.bar.com:8080/tmp/testing")
    val localResources = HashMap[String, LocalResource]()
    val statCache = HashMap[URI, FileStatus]()
    val status = new FileStatus()
    when(fs.getFileStatus(destPath)).thenReturn(status)
    val fileLink = "link"
    ReflectUtils.setFieldValue(
      KyuubiDistributedCacheManager, "cacheManager", new MockClientDistributedCacheManager)
    KyuubiDistributedCacheManager.addResource(
      fs, conf, destPath, localResources, LocalResourceType.FILE, fileLink, statCache)
    val res = localResources(fileLink)
    assert(res.getVisibility === LocalResourceVisibility.PRIVATE)
    assert(ConverterUtils.getPathFromYarnURL(res.getResource) === destPath)
    assert(res.getSize === 0)
    assert(res.getTimestamp === 0)
    assert(res.getType === LocalResourceType.FILE)
    val status2 = new FileStatus(
      10, false, 1, 1024, 10,
      10, null, KyuubiSparkUtil.getCurrentUserName, null, new Path("/tmp/testing2"))
    val destPath2 = new Path("file:///foo.bar.com:8080/tmp/testing2")
    when(fs.getFileStatus(destPath2)).thenReturn(status2)
    val fileLink2 = "link2"
    KyuubiDistributedCacheManager.addResource(
      fs, conf, destPath2, localResources, LocalResourceType.FILE, fileLink2, statCache)
    val res2 = localResources(fileLink2)
    assert(res2.getVisibility === LocalResourceVisibility.PRIVATE)
    assert(ConverterUtils.getPathFromYarnURL(res2.getResource) === destPath2)
    assert(res2.getSize === 10)
    assert(res2.getTimestamp === 10)
    assert(res2.getType === LocalResourceType.FILE)
  }

  test("add resource when link null") {
    val distMgr = new MockClientDistributedCacheManager()
    val fs = mock[FileSystem]
    val conf = new Configuration()
    val destPath = new Path("file:///foo.bar.com:8080/tmp/testing")
    ReflectUtils.setFieldValue(KyuubiDistributedCacheManager, "cacheManager", distMgr)
    val localResources = HashMap[String, LocalResource]()
    val statCache = HashMap[URI, FileStatus]()
    when(fs.getFileStatus(destPath)).thenReturn(new FileStatus())
    intercept[Exception] {
      KyuubiDistributedCacheManager.addResource(
        fs, conf, destPath, localResources, LocalResourceType.FILE, null, statCache)
    }
    assert(localResources.get("link") === None)
    assert(localResources.size === 0)
  }

  test("test addResource archive") {
    val distMgr = new MockClientDistributedCacheManager()
    ReflectUtils.setFieldValue(KyuubiDistributedCacheManager, "cacheManager", distMgr)
    val fs = mock[FileSystem]
    val conf = new Configuration()
    val destPath = new Path("file:///foo.bar.com:8080/tmp/testing")
    val localResources = HashMap[String, LocalResource]()
    val statCache = HashMap[URI, FileStatus]()
    val realFileStatus = new FileStatus(10, false, 1, 1024, 10, 10, null, "testOwner",
      null, new Path("/tmp/testing"))
    when(fs.getFileStatus(destPath)).thenReturn(realFileStatus)

    KyuubiDistributedCacheManager.addResource(
      fs, conf, destPath, localResources, LocalResourceType.ARCHIVE, "link", statCache)
    val resource = localResources("link")
    assert(resource.getVisibility === LocalResourceVisibility.PRIVATE)
    assert(ConverterUtils.getPathFromYarnURL(resource.getResource) === destPath)
    assert(resource.getTimestamp === 10)
    assert(resource.getSize === 10)
    assert(resource.getType === LocalResourceType.ARCHIVE)

  }

} 
Example 12
Source File: KyuubiDistributedCacheManager.scala    From kyuubi   with Apache License 2.0
package org.apache.spark.deploy.yarn

import java.net.URI

import scala.collection.mutable.{HashMap, Map}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}
import org.apache.hadoop.yarn.api.records.{LocalResource, LocalResourceType}


  def addResource(
      fs: FileSystem,
      conf: Configuration,
      destPath: Path,
      localResources: HashMap[String, LocalResource],
      resourceType: LocalResourceType,
      link: String,
      statCache: Map[URI, FileStatus]): Unit = {
    cacheManager.addResource(fs, conf, destPath,
      localResources, resourceType, link, statCache, appMasterOnly = true)
  }
} 
Example 13
Source File: FileStreamSourceSuite.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.io.File
import java.net.URI

import scala.util.Random

import org.apache.hadoop.fs.{FileStatus, Path, RawLocalFileSystem}

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.execution.streaming.ExistsThrowsExceptionFileSystem._
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.StructType

class FileStreamSourceSuite extends SparkFunSuite with SharedSQLContext {

  import FileStreamSource._

  test("SeenFilesMap") {
    val map = new SeenFilesMap(maxAgeMs = 10)

    map.add("a", 5)
    assert(map.size == 1)
    map.purge()
    assert(map.size == 1)

    // Add a new entry and purge should be no-op, since the gap is exactly 10 ms.
    map.add("b", 15)
    assert(map.size == 2)
    map.purge()
    assert(map.size == 2)

    // Add a new entry that's more than 10 ms newer than the first entry. We should be able to purge now.
    map.add("c", 16)
    assert(map.size == 3)
    map.purge()
    assert(map.size == 2)

    // Override existing entry shouldn't change the size
    map.add("c", 25)
    assert(map.size == 2)

    // Not a new file because we have seen c before
    assert(!map.isNewFile("c", 20))

    // Not a new file because timestamp is too old
    assert(!map.isNewFile("d", 5))

    // Finally a new file: never seen and not too old
    assert(map.isNewFile("e", 20))
  }

  test("SeenFilesMap should only consider a file old if it is earlier than last purge time") {
    val map = new SeenFilesMap(maxAgeMs = 10)

    map.add("a", 20)
    assert(map.size == 1)

    // Timestamp 5 should still be considered a new file because the purge time should be 0
    assert(map.isNewFile("b", 9))
    assert(map.isNewFile("b", 10))

    // Once purged, the purge time should be 10, and b would then be an old file if its timestamp is less than 10.
    map.purge()
    assert(!map.isNewFile("b", 9))
    assert(map.isNewFile("b", 10))
  }

  testWithUninterruptibleThread("do not recheck that files exist during getBatch") {
    withTempDir { temp =>
      spark.conf.set(
        s"fs.$scheme.impl",
        classOf[ExistsThrowsExceptionFileSystem].getName)
      // add the metadata entries as a pre-req
      val dir = new File(temp, "dir") // use a non-existent directory to test whether the log creates the dir
      val metadataLog =
        new FileStreamSourceLog(FileStreamSourceLog.VERSION, spark, dir.getAbsolutePath)
      assert(metadataLog.add(0, Array(FileEntry(s"$scheme:///file1", 100L, 0))))

      val newSource = new FileStreamSource(spark, s"$scheme:///", "parquet", StructType(Nil), Nil,
        dir.getAbsolutePath, Map.empty)
      // this method should throw an exception if `fs.exists` is called during resolveRelation
      newSource.getBatch(None, FileStreamSourceOffset(1))
    }
  }
}


  override def listStatus(file: Path): Array[FileStatus] = {
    val emptyFile = new FileStatus()
    emptyFile.setPath(file)
    Array(emptyFile)
  }
}

object ExistsThrowsExceptionFileSystem {
  val scheme = s"FileStreamSourceSuite${math.abs(Random.nextInt)}fs"
} 
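The class declaration enclosing the floating listStatus override above was dropped by the extractor. A plausible reconstruction, assuming the class extends RawLocalFileSystem and fails on exists (which is what the in-test comment "this method should throw an exception if fs.exists is called" suggests):

class ExistsThrowsExceptionFileSystem extends RawLocalFileSystem {
  import ExistsThrowsExceptionFileSystem._

  override def getUri: URI = URI.create(s"$scheme:///")

  // Fail loudly if anything calls fs.exists during getBatch.
  override def exists(f: Path): Boolean =
    throw new IllegalArgumentException("Exists shouldn't have been called!")

  // Repeated from the snippet above so the class compiles: one empty FileStatus per listed path.
  override def listStatus(file: Path): Array[FileStatus] = {
    val emptyFile = new FileStatus()
    emptyFile.setPath(file)
    Array(emptyFile)
  }
}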
Example 14
Source File: FileStreamSinkLog.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import org.apache.hadoop.fs.{FileStatus, Path}
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization
import org.json4s.jackson.Serialization.{read, write}

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.internal.SQLConf


class FileStreamSinkLog(
    metadataLogVersion: String,
    sparkSession: SparkSession,
    path: String)
  extends CompactibleFileStreamLog[SinkFileStatus](metadataLogVersion, sparkSession, path) {

  private implicit val formats = Serialization.formats(NoTypeHints)

  protected override val fileCleanupDelayMs = sparkSession.sessionState.conf.fileSinkLogCleanupDelay

  protected override val isDeletingExpiredLog = sparkSession.sessionState.conf.fileSinkLogDeletion

  protected override val defaultCompactInterval =
    sparkSession.sessionState.conf.fileSinkLogCompactInterval

  require(defaultCompactInterval > 0,
    s"Please set ${SQLConf.FILE_SINK_LOG_COMPACT_INTERVAL.key} (was $defaultCompactInterval) " +
      "to a positive value.")

  override def compactLogs(logs: Seq[SinkFileStatus]): Seq[SinkFileStatus] = {
    val deletedFiles = logs.filter(_.action == FileStreamSinkLog.DELETE_ACTION).map(_.path).toSet
    if (deletedFiles.isEmpty) {
      logs
    } else {
      logs.filter(f => !deletedFiles.contains(f.path))
    }
  }
}

object FileStreamSinkLog {
  val VERSION = "v1"
  val DELETE_ACTION = "delete"
  val ADD_ACTION = "add"
} 
Example 15
Source File: BinaryFileReader.scala    From mmlspark   with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark

import com.microsoft.ml.spark.core.env.StreamUtilities
import com.microsoft.ml.spark.core.schema.BinaryFileSchema
import com.microsoft.ml.spark.core.utils.AsyncUtils
import org.apache.commons.io.IOUtils
import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}
import org.apache.spark.binary.BinaryFileFormat
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.binary.ConfUtils
import org.apache.spark.sql.types.BinaryType

import scala.concurrent.{ExecutionContext, Future}
import scala.concurrent.duration.Duration

object BinaryFileReader {

  private def recursePath(fileSystem: FileSystem,
                          path: Path,
                          pathFilter: FileStatus => Boolean,
                          visitedSymlinks: Set[Path]): Array[Path] ={
    val filteredPaths = fileSystem.listStatus(path).filter(pathFilter)
    val filteredDirs = filteredPaths.filter(fs => fs.isDirectory & !visitedSymlinks(fs.getPath))
    val symlinksFound = visitedSymlinks ++ filteredDirs.filter(_.isSymlink).map(_.getPath)
    filteredPaths.map(_.getPath) ++ filteredDirs.map(_.getPath)
      .flatMap(p => recursePath(fileSystem, p, pathFilter, symlinksFound))
  }

  def recursePath(fileSystem: FileSystem, path: Path, pathFilter: FileStatus => Boolean): Array[Path] ={
    recursePath(fileSystem, path, pathFilter, Set())
  }

  
  def readFromPaths(df: DataFrame,
                    pathCol: String,
                    bytesCol: String,
                    concurrency: Int,
                    timeout: Int
                   ): DataFrame = {
    val outputSchema = df.schema.add(bytesCol, BinaryType, nullable = true)
    val encoder = RowEncoder(outputSchema)
    val hconf = ConfUtils.getHConf(df)

    df.mapPartitions { rows =>
      val futures = rows.map {row: Row =>
        Future {
            val path = new Path(row.getAs[String](pathCol))
            val fs = path.getFileSystem(hconf.value)
            val bytes = StreamUtilities.using(fs.open(path)) {is => IOUtils.toByteArray(is)}.get
            val ret = Row.merge(Seq(row, Row(bytes)): _*)
            ret
          }(ExecutionContext.global)
      }
      AsyncUtils.bufferedAwait(
        futures,concurrency, Duration.fromNanos(timeout*(20^6).toLong))(ExecutionContext.global)
    }(encoder)
  }

} 
Example 16
Source File: HadoopFileSystemLogStore.scala    From delta   with Apache License 2.0
package org.apache.spark.sql.delta.storage

import java.io.{BufferedReader, FileNotFoundException, InputStreamReader}
import java.nio.charset.StandardCharsets.UTF_8
import java.nio.file.FileAlreadyExistsException
import java.util.UUID

import scala.collection.JavaConverters._

import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession


  protected def writeWithRename(
      path: Path, actions: Iterator[String], overwrite: Boolean = false): Unit = {
    val fs = path.getFileSystem(getHadoopConfiguration)

    if (!fs.exists(path.getParent)) {
      throw new FileNotFoundException(s"No such file or directory: ${path.getParent}")
    }
    if (overwrite) {
      val stream = fs.create(path, true)
      try {
        actions.map(_ + "\n").map(_.getBytes(UTF_8)).foreach(stream.write)
      } finally {
        stream.close()
      }
    } else {
      if (fs.exists(path)) {
        throw new FileAlreadyExistsException(path.toString)
      }
      val tempPath = createTempPath(path)
      var streamClosed = false // This flag avoids closing the stream twice
      var renameDone = false // This flag lets us skip the delete operation in most cases
      val stream = fs.create(tempPath)
      try {
        actions.map(_ + "\n").map(_.getBytes(UTF_8)).foreach(stream.write)
        stream.close()
        streamClosed = true
        try {
          if (fs.rename(tempPath, path)) {
            renameDone = true
          } else {
            if (fs.exists(path)) {
              throw new FileAlreadyExistsException(path.toString)
            } else {
              throw new IllegalStateException(s"Cannot rename $tempPath to $path")
            }
          }
        } catch {
          case _: org.apache.hadoop.fs.FileAlreadyExistsException =>
            throw new FileAlreadyExistsException(path.toString)
        }
      } finally {
        if (!streamClosed) {
          stream.close()
        }
        if (!renameDone) {
          fs.delete(tempPath, false)
        }
      }
    }
  }

  protected def createTempPath(path: Path): Path = {
    new Path(path.getParent, s".${path.getName}.${UUID.randomUUID}.tmp")
  }

  override def invalidateCache(): Unit = {}
} 
Example 17
Source File: FileNames.scala    From delta   with Apache License 2.0
package org.apache.spark.sql.delta.util

import org.apache.spark.sql.delta.DeltaErrors
import org.apache.hadoop.fs.{FileStatus, Path}


  def getFileVersion(path: Path): Long = {
    if (isCheckpointFile(path)) {
      checkpointVersion(path)
    } else if (isDeltaFile(path)) {
      deltaVersion(path)
    } else if (isChecksumFile(path)) {
      checksumVersion(path)
    } else {
      // scalastyle:off throwerror
      throw new AssertionError(
        s"Unexpected file type found in transaction log: $path")
      // scalastyle:on throwerror
    }
  }
} 
Example 18
Source File: HdfsUtils.scala    From etl-light   with MIT License
package yamrcraft.etlite.utils

import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}

import scala.collection.mutable

object HdfsUtils {

  def renameFiles(fromBase: Path, toBase: Path, fs: FileSystem) = {
    if (fs.exists(fromBase)) {
      val filesToMove = listFiles(fromBase, fs)
      println("files to move:")
      filesToMove foreach (p => println("+++" + p.toString))
      filesToMove foreach { file =>
        val relPath = relativize(fromBase, file)
        val toPath = new Path(toBase, relPath)
        fs.mkdirs(toPath.getParent)
        fs.rename(file, toPath)
        println(" file renamed to: " + toPath.toString)
      }
    }
  }

  def relativize(base: Path, files: List[Path]) = {
    files map (file => new Path(base.toUri.relativize(file.toUri).getPath))
  }

  def relativize(base: Path, file: Path): Path = {
    new Path(base.toUri.relativize(file.toUri).getPath)
  }

  def listFiles(path: Path, fs: FileSystem): List[Path] = {
    val statusList = mutable.MutableList[FileStatus]()
    traverse(path, statusList, fs)
    statusList.map(status => new Path(status.getPath.toUri.getPath)).toList
  }

  private def traverse(path: Path, list: mutable.MutableList[FileStatus], fs: FileSystem): Unit = {
    fs.listStatus(path) foreach { status =>
      if (!status.isDirectory) {
        list += status
      } else {
        traverse(status.getPath, list, fs)
      }
    }
  }

} 
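A hedged usage sketch for the helpers above: recursively list the files under one base directory and then move them under another. The local paths are hypothetical placeholders, and FileSystem.get picks up whatever filesystem the default configuration points at.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

import yamrcraft.etlite.utils.HdfsUtils

object HdfsUtilsUsage {
  def main(args: Array[String]): Unit = {
    val fs = FileSystem.get(new Configuration())
    val from = new Path("/tmp/etl/work") // hypothetical source directory
    val to = new Path("/tmp/etl/done")   // hypothetical target directory

    HdfsUtils.listFiles(from, fs).foreach(println)
    HdfsUtils.renameFiles(from, to, fs)
  }
}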
Example 19
Source File: FileStreamSourceSuite.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.io.File
import java.net.URI

import scala.util.Random

import org.apache.hadoop.fs.{FileStatus, Path, RawLocalFileSystem}

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.execution.streaming.ExistsThrowsExceptionFileSystem._
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.StructType

class FileStreamSourceSuite extends SparkFunSuite with SharedSQLContext {

  import FileStreamSource._

  test("SeenFilesMap") {
    val map = new SeenFilesMap(maxAgeMs = 10)

    map.add("a", 5)
    assert(map.size == 1)
    map.purge()
    assert(map.size == 1)

    // Add a new entry and purge should be no-op, since the gap is exactly 10 ms.
    map.add("b", 15)
    assert(map.size == 2)
    map.purge()
    assert(map.size == 2)

    // Add a new entry that's more than 10 ms newer than the first entry. We should be able to purge now.
    map.add("c", 16)
    assert(map.size == 3)
    map.purge()
    assert(map.size == 2)

    // Override existing entry shouldn't change the size
    map.add("c", 25)
    assert(map.size == 2)

    // Not a new file because we have seen c before
    assert(!map.isNewFile("c", 20))

    // Not a new file because timestamp is too old
    assert(!map.isNewFile("d", 5))

    // Finally a new file: never seen and not too old
    assert(map.isNewFile("e", 20))
  }

  test("SeenFilesMap should only consider a file old if it is earlier than last purge time") {
    val map = new SeenFilesMap(maxAgeMs = 10)

    map.add("a", 20)
    assert(map.size == 1)

    // Timestamp 5 should still be considered a new file because the purge time should be 0
    assert(map.isNewFile("b", 9))
    assert(map.isNewFile("b", 10))

    // Once purged, the purge time should be 10, and b would then be an old file if its timestamp is less than 10.
    map.purge()
    assert(!map.isNewFile("b", 9))
    assert(map.isNewFile("b", 10))
  }

  testWithUninterruptibleThread("do not recheck that files exist during getBatch") {
    withTempDir { temp =>
      spark.conf.set(
        s"fs.$scheme.impl",
        classOf[ExistsThrowsExceptionFileSystem].getName)
      // add the metadata entries as a pre-req
      val dir = new File(temp, "dir") // use a non-existent directory to test whether the log creates the dir
      val metadataLog =
        new FileStreamSourceLog(FileStreamSourceLog.VERSION, spark, dir.getAbsolutePath)
      assert(metadataLog.add(0, Array(FileEntry(s"$scheme:///file1", 100L, 0))))

      val newSource = new FileStreamSource(spark, s"$scheme:///", "parquet", StructType(Nil), Nil,
        dir.getAbsolutePath, Map.empty)
      // this method should throw an exception if `fs.exists` is called during resolveRelation
      newSource.getBatch(None, FileStreamSourceOffset(1))
    }
  }
}


  override def listStatus(file: Path): Array[FileStatus] = {
    val emptyFile = new FileStatus()
    emptyFile.setPath(file)
    Array(emptyFile)
  }
}

object ExistsThrowsExceptionFileSystem {
  val scheme = s"FileStreamSourceSuite${math.abs(Random.nextInt)}fs"
} 
Example 20
Source File: FileStreamSinkLog.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import org.apache.hadoop.fs.{FileStatus, Path}
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization
import org.json4s.jackson.Serialization.{read, write}

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.internal.SQLConf


class FileStreamSinkLog(
    metadataLogVersion: String,
    sparkSession: SparkSession,
    path: String)
  extends CompactibleFileStreamLog[SinkFileStatus](metadataLogVersion, sparkSession, path) {

  private implicit val formats = Serialization.formats(NoTypeHints)

  protected override val fileCleanupDelayMs = sparkSession.sessionState.conf.fileSinkLogCleanupDelay

  protected override val isDeletingExpiredLog = sparkSession.sessionState.conf.fileSinkLogDeletion

  protected override val defaultCompactInterval =
    sparkSession.sessionState.conf.fileSinkLogCompactInterval

  require(defaultCompactInterval > 0,
    s"Please set ${SQLConf.FILE_SINK_LOG_COMPACT_INTERVAL.key} (was $defaultCompactInterval) " +
      "to a positive value.")

  override def compactLogs(logs: Seq[SinkFileStatus]): Seq[SinkFileStatus] = {
    val deletedFiles = logs.filter(_.action == FileStreamSinkLog.DELETE_ACTION).map(_.path).toSet
    if (deletedFiles.isEmpty) {
      logs
    } else {
      logs.filter(f => !deletedFiles.contains(f.path))
    }
  }
}

object FileStreamSinkLog {
  val VERSION = "v1"
  val DELETE_ACTION = "delete"
  val ADD_ACTION = "add"
} 
Example 21
Source File: ArrowFileFormat.scala    From OAP   with Apache License 2.0
package com.intel.oap.spark.sql.execution.datasources.arrow

import scala.collection.JavaConverters._

import com.intel.oap.spark.sql.execution.datasources.arrow.ArrowFileFormat.UnsafeItr
import com.intel.oap.spark.sql.execution.datasources.v2.arrow.{ArrowFilters, ArrowOptions}
import com.intel.oap.spark.sql.execution.datasources.v2.arrow.ArrowSQLConf._
import org.apache.arrow.dataset.scanner.ScanOptions
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileStatus
import org.apache.hadoop.mapreduce.Job

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriterFactory, PartitionedFile}
import org.apache.spark.sql.execution.datasources.v2.arrow.ArrowUtils
import org.apache.spark.sql.sources.{DataSourceRegister, Filter}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap;

class ArrowFileFormat extends FileFormat with DataSourceRegister with Serializable {

  val batchSize = 4096

  def convert(files: Seq[FileStatus], options: Map[String, String]): Option[StructType] = {
    ArrowUtils.readSchema(files, new CaseInsensitiveStringMap(options.asJava))
  }

  override def inferSchema(
                            sparkSession: SparkSession,
                            options: Map[String, String],
                            files: Seq[FileStatus]): Option[StructType] = {
    convert(files, options)
  }

  override def prepareWrite(
                             sparkSession: SparkSession,
                             job: Job,
                             options: Map[String, String],
                             dataSchema: StructType): OutputWriterFactory = {
    throw new UnsupportedOperationException("Write is not supported for Arrow source")
  }

  override def supportBatch(sparkSession: SparkSession, dataSchema: StructType): Boolean = true

  override def buildReaderWithPartitionValues(sparkSession: SparkSession,
      dataSchema: StructType,
      partitionSchema: StructType,
      requiredSchema: StructType,
      filters: Seq[Filter],
      options: Map[String, String],
      hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = {
    (file: PartitionedFile) => {

      val sqlConf = sparkSession.sessionState.conf;
      val enableFilterPushDown = sqlConf.arrowFilterPushDown
      val factory = ArrowUtils.makeArrowDiscovery(
        file.filePath, new ArrowOptions(
          new CaseInsensitiveStringMap(
            options.asJava).asScala.toMap))

      // todo predicate validation / pushdown
      val dataset = factory.finish();

      val filter = if (enableFilterPushDown) {
        ArrowFilters.translateFilters(filters)
      } else {
        org.apache.arrow.dataset.filter.Filter.EMPTY
      }

      val scanOptions = new ScanOptions(requiredSchema.map(f => f.name).toArray,
        filter, batchSize)
      val scanner = dataset.newScan(scanOptions)
      val itrList = scanner
        .scan()
        .iterator()
        .asScala
        .map(task => task.scan())
        .toList

      val itr = itrList
        .toIterator
        .flatMap(itr => itr.asScala)
        .map(vsr => ArrowUtils.loadVsr(vsr, file.partitionValues, partitionSchema, dataSchema))
      new UnsafeItr(itr).asInstanceOf[Iterator[InternalRow]]
    }
  }

  override def shortName(): String = "arrow"
}

object ArrowFileFormat {
  class UnsafeItr[T](delegate: Iterator[T]) extends Iterator[T] {
    override def hasNext: Boolean = delegate.hasNext

    override def next(): T = delegate.next()
  }
} 
Example 22
Source File: ArrowTable.scala    From OAP   with Apache License 2.0
package com.intel.oap.spark.sql.execution.datasources.v2.arrow

import org.apache.hadoop.fs.FileStatus

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.connector.read.ScanBuilder
import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder}
import org.apache.spark.sql.execution.datasources.FileFormat
import org.apache.spark.sql.execution.datasources.v2.FileTable
import org.apache.spark.sql.execution.datasources.v2.arrow.ArrowUtils
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap

case class ArrowTable(
    name: String,
    sparkSession: SparkSession,
    options: CaseInsensitiveStringMap,
    paths: Seq[String],
    userSpecifiedSchema: Option[StructType],
    fallbackFileFormat: Class[_ <: FileFormat])
    extends FileTable(sparkSession, options, paths, userSpecifiedSchema) {

  override def inferSchema(files: Seq[FileStatus]): Option[StructType] = {
    ArrowUtils.readSchema(files, options)
  }

  override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = {
    ArrowScanBuilder(sparkSession, fileIndex, schema, dataSchema, options)
  }

  override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = {
    throw new UnsupportedOperationException // fixme implement later
  }

  override def formatName: String = "ARROW"
} 
Example 23
Source File: OapIndexFileFormat.scala    From OAP   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.oap.index

import org.apache.hadoop.fs.FileStatus
import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}
import org.apache.parquet.hadoop.util.ContextUtil

import org.apache.spark.internal.Logging
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriterFactory}
import org.apache.spark.sql.execution.datasources.oap.OapFileFormat
import org.apache.spark.sql.types.StructType

private[index] class OapIndexFileFormat
  extends FileFormat
  with Logging
  with Serializable {

  override def inferSchema(
      sparkSession: SparkSession,
      options: Map[String, String],
      files: Seq[FileStatus]): Option[StructType] = None

  
  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory = {

    val configuration = ContextUtil.getConfiguration(job)

    configuration.set(OapIndexFileFormat.ROW_SCHEMA, dataSchema.json)
    configuration.set(OapIndexFileFormat.INDEX_TYPE, options("indexType"))
    configuration.set(OapIndexFileFormat.INDEX_NAME, options("indexName"))
    configuration.set(OapIndexFileFormat.INDEX_TIME, options("indexTime"))
    configuration.set(OapIndexFileFormat.IS_APPEND, options("isAppend"))

    new OutputWriterFactory {
      override def getFileExtension(context: TaskAttemptContext): String =
        OapFileFormat.OAP_INDEX_EXTENSION

      override def newInstance(path: String, dataSchema: StructType, context: TaskAttemptContext) =
        new OapIndexOutputWriter(path, context)
    }
  }
}

private[index] object OapIndexFileFormat {
  val ROW_SCHEMA: String = "org.apache.spark.sql.oap.row.attributes"
  val INDEX_TYPE: String = "org.apache.spark.sql.oap.index.type"
  val INDEX_NAME: String = "org.apache.spark.sql.oap.index.name"
  val INDEX_TIME: String = "org.apache.spark.sql.oap.index.time"
  val IS_APPEND: String = "org.apache.spark.sql.oap.index.append"
}

case class IndexBuildResult(dataFile: String, rowCount: Long, fingerprint: String, parent: String) 
Example 24
Source File: Checksum.scala    From schedoscope   with Apache License 2.0
package org.schedoscope.dsl.transformations

import java.security.MessageDigest

import org.apache.hadoop.fs.{FileStatus, Path}
import org.schedoscope.Schedoscope
import org.schedoscope.scheduler.driver.FilesystemDriver._

import scala.Array.canBuildFrom
import scala.collection.mutable.HashMap

object Checksum {
  private def md5 = MessageDigest.getInstance("MD5")

  private def listFiles(path: String): Array[FileStatus] = {
    val files = fileSystem(path, Schedoscope.settings.hadoopConf).globStatus(new Path(path))
    if (files != null)
      files
    else Array()
  }

  private def fileChecksum(path: String) =
    if (path == null)
      "null-checksum"
    else if (path.endsWith(".jar"))
      path
    else try {
      val cs = fileSystem(path, Schedoscope.settings.hadoopConf).getFileChecksum(new Path(path))
      if (cs == null)
        path
      else
        cs.toString()
    } catch {
      case _: Throwable => path
    }

  def fileChecksums(paths: List[String], recursive: Boolean): List[String] =
    paths.flatMap(path => {
      if (fileSystem(path, Schedoscope.settings.hadoopConf).isFile(new Path(path)))
        List(fileChecksum(path))
      else if (recursive)
        fileChecksums(listFiles(path + "/*").map(f => f.getPath.toString()).toList, recursive)
      else
        List()
    }).sorted

  val resourceHashCache = new HashMap[List[String], List[String]]()

  def resourceHashes(resources: List[String]): List[String] = synchronized {
    resourceHashCache.getOrElseUpdate(resources, fileChecksums(resources, true))
  }

  val defaultDigest = "0"

  def digest(stringsToDigest: String*): String =
    if (stringsToDigest.isEmpty)
      defaultDigest
    else
      md5.digest(stringsToDigest.sorted.mkString.toCharArray().map(_.toByte)).map("%02X" format _).mkString

  object SchemaChecksum {
    val checksumProperty = "schema.checksum"
  }

  object TransformationChecksum {
    val checksumProperty = "transformation.checksum"
    val timestampProperty = "transformation.timestamp"
  }

} 
Example 25
Source File: UnsplittableSequenceFileInputFormat.scala    From spark-util   with Apache License 2.0
package org.hammerlab.hadoop.splits

import java.io.IOException
import java.util

import org.apache.hadoop.fs.{ FileStatus, FileSystem, Path ⇒ HPath }
import org.apache.hadoop.mapred.{ JobConf, SequenceFileInputFormat }
import org.apache.hadoop.mapreduce.JobContext
import org.apache.hadoop.mapreduce.lib.input

import scala.collection.JavaConverters._



  override def listStatus(job: JobContext): util.List[FileStatus] =
    super
      .listStatus(job)
      .asScala
      .sortBy {
        _.getPath.getName match {
          case PartFileBasename(idx) ⇒
            idx
          case basename ⇒
            throw new IllegalArgumentException(s"Bad partition file: $basename")
        }
      }
      .asJava
} 
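The PartFileBasename extractor used in the sort key above is defined elsewhere in spark-util and is not part of the snippet. A hypothetical minimal version, assuming part files are named like part-00000 (with an optional suffix):

object PartFileBasename {
  private val re = """part-[rm]?-?(\d+).*""".r
  def unapply(basename: String): Option[Int] =
    basename match {
      case re(idx) => Some(idx.toInt)
      case _ => None
    }
}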
Example 26
Source File: FileStreamSinkLog.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.net.URI

import org.apache.hadoop.fs.{FileStatus, Path}
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.internal.SQLConf


class FileStreamSinkLog(
    metadataLogVersion: Int,
    sparkSession: SparkSession,
    path: String)
  extends CompactibleFileStreamLog[SinkFileStatus](metadataLogVersion, sparkSession, path) {

  private implicit val formats = Serialization.formats(NoTypeHints)

  protected override val fileCleanupDelayMs = sparkSession.sessionState.conf.fileSinkLogCleanupDelay

  protected override val isDeletingExpiredLog = sparkSession.sessionState.conf.fileSinkLogDeletion

  protected override val defaultCompactInterval =
    sparkSession.sessionState.conf.fileSinkLogCompactInterval

  require(defaultCompactInterval > 0,
    s"Please set ${SQLConf.FILE_SINK_LOG_COMPACT_INTERVAL.key} (was $defaultCompactInterval) " +
      "to a positive value.")

  override def compactLogs(logs: Seq[SinkFileStatus]): Seq[SinkFileStatus] = {
    val deletedFiles = logs.filter(_.action == FileStreamSinkLog.DELETE_ACTION).map(_.path).toSet
    if (deletedFiles.isEmpty) {
      logs
    } else {
      logs.filter(f => !deletedFiles.contains(f.path))
    }
  }
}

object FileStreamSinkLog {
  val VERSION = 1
  val DELETE_ACTION = "delete"
  val ADD_ACTION = "add"
} 
Example 27
Source File: MetadataLogFileIndex.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import scala.collection.mutable

import org.apache.hadoop.fs.{FileStatus, Path}

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.types.StructType



class MetadataLogFileIndex(
    sparkSession: SparkSession,
    path: Path,
    userSpecifiedSchema: Option[StructType])
  extends PartitioningAwareFileIndex(sparkSession, Map.empty, userSpecifiedSchema) {

  private val metadataDirectory = new Path(path, FileStreamSink.metadataDir)
  logInfo(s"Reading streaming file log from $metadataDirectory")
  private val metadataLog =
    new FileStreamSinkLog(FileStreamSinkLog.VERSION, sparkSession, metadataDirectory.toUri.toString)
  private val allFilesFromLog = metadataLog.allFiles().map(_.toFileStatus).filterNot(_.isDirectory)
  private var cachedPartitionSpec: PartitionSpec = _

  override protected val leafFiles: mutable.LinkedHashMap[Path, FileStatus] = {
    new mutable.LinkedHashMap ++= allFilesFromLog.map(f => f.getPath -> f)
  }

  override protected val leafDirToChildrenFiles: Map[Path, Array[FileStatus]] = {
    allFilesFromLog.groupBy(_.getPath.getParent)
  }

  override def rootPaths: Seq[Path] = path :: Nil

  override def refresh(): Unit = { }

  override def partitionSpec(): PartitionSpec = {
    if (cachedPartitionSpec == null) {
      cachedPartitionSpec = inferPartitioning()
    }
    cachedPartitionSpec
  }
} 
Example 28
Source File: FileStreamSourceSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.io.File
import java.net.URI

import scala.util.Random

import org.apache.hadoop.fs.{FileStatus, Path, RawLocalFileSystem}

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.execution.streaming.ExistsThrowsExceptionFileSystem._
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.StructType

class FileStreamSourceSuite extends SparkFunSuite with SharedSQLContext {

  import FileStreamSource._

  test("SeenFilesMap") {
    val map = new SeenFilesMap(maxAgeMs = 10)

    map.add("a", 5)
    assert(map.size == 1)
    map.purge()
    assert(map.size == 1)

    // Add a new entry and purge should be no-op, since the gap is exactly 10 ms.
    map.add("b", 15)
    assert(map.size == 2)
    map.purge()
    assert(map.size == 2)

    // Add a new entry that's more than 10 ms newer than the first entry. We should be able to purge now.
    map.add("c", 16)
    assert(map.size == 3)
    map.purge()
    assert(map.size == 2)

    // Override existing entry shouldn't change the size
    map.add("c", 25)
    assert(map.size == 2)

    // Not a new file because we have seen c before
    assert(!map.isNewFile("c", 20))

    // Not a new file because timestamp is too old
    assert(!map.isNewFile("d", 5))

    // Finally a new file: never seen and not too old
    assert(map.isNewFile("e", 20))
  }

  test("SeenFilesMap should only consider a file old if it is earlier than last purge time") {
    val map = new SeenFilesMap(maxAgeMs = 10)

    map.add("a", 20)
    assert(map.size == 1)

    // Timestamp 5 should still be considered a new file because the purge time should be 0
    assert(map.isNewFile("b", 9))
    assert(map.isNewFile("b", 10))

    // Once purged, the purge time should be 10, and b would then be an old file if its timestamp is less than 10.
    map.purge()
    assert(!map.isNewFile("b", 9))
    assert(map.isNewFile("b", 10))
  }

  testWithUninterruptibleThread("do not recheck that files exist during getBatch") {
    withTempDir { temp =>
      spark.conf.set(
        s"fs.$scheme.impl",
        classOf[ExistsThrowsExceptionFileSystem].getName)
      // add the metadata entries as a pre-req
      val dir = new File(temp, "dir") // use a non-existent directory to test whether the log creates the dir
      val metadataLog =
        new FileStreamSourceLog(FileStreamSourceLog.VERSION, spark, dir.getAbsolutePath)
      assert(metadataLog.add(0, Array(FileEntry(s"$scheme:///file1", 100L, 0))))

      val newSource = new FileStreamSource(spark, s"$scheme:///", "parquet", StructType(Nil), Nil,
        dir.getAbsolutePath, Map.empty)
      // this method should throw an exception if `fs.exists` is called during resolveRelation
      newSource.getBatch(None, LongOffset(1))
    }
  }
}


  override def listStatus(file: Path): Array[FileStatus] = {
    val emptyFile = new FileStatus()
    emptyFile.setPath(file)
    Array(emptyFile)
  }
}

object ExistsThrowsExceptionFileSystem {
  val scheme = s"FileStreamSourceSuite${math.abs(Random.nextInt)}fs"
} 
Example 29
Source File: FileStreamSinkLog.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import org.apache.hadoop.fs.{FileStatus, Path}
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization
import org.json4s.jackson.Serialization.{read, write}

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.internal.SQLConf


class FileStreamSinkLog(
    metadataLogVersion: String,
    sparkSession: SparkSession,
    path: String)
  extends CompactibleFileStreamLog[SinkFileStatus](metadataLogVersion, sparkSession, path) {

  private implicit val formats = Serialization.formats(NoTypeHints)

  protected override val fileCleanupDelayMs = sparkSession.sessionState.conf.fileSinkLogCleanupDelay

  protected override val isDeletingExpiredLog = sparkSession.sessionState.conf.fileSinkLogDeletion

  protected override val compactInterval = sparkSession.sessionState.conf.fileSinkLogCompactInterval
  require(compactInterval > 0,
    s"Please set ${SQLConf.FILE_SINK_LOG_COMPACT_INTERVAL.key} (was $compactInterval) " +
      "to a positive value.")

  protected override def serializeData(data: SinkFileStatus): String = {
    write(data)
  }

  protected override def deserializeData(encodedString: String): SinkFileStatus = {
    read[SinkFileStatus](encodedString)
  }

  override def compactLogs(logs: Seq[SinkFileStatus]): Seq[SinkFileStatus] = {
    val deletedFiles = logs.filter(_.action == FileStreamSinkLog.DELETE_ACTION).map(_.path).toSet
    if (deletedFiles.isEmpty) {
      logs
    } else {
      logs.filter(f => !deletedFiles.contains(f.path))
    }
  }
}

object FileStreamSinkLog {
  val VERSION = "v1"
  val DELETE_ACTION = "delete"
  val ADD_ACTION = "add"
}