org.apache.hadoop.fs.PathFilter Scala Examples

The following examples show how to use org.apache.hadoop.fs.PathFilter. Each example is taken from an open-source project; the source file, project, and license are noted above the code.
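Before the project excerpts, a minimal self-contained sketch of the interface itself may help: PathFilter exposes a single accept(Path) method that FileSystem.listStatus (and FileInputFormat) consult when listing a directory. The object name and directory path below are placeholders.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path, PathFilter}

object PathFilterDemo {
  def main(args: Array[String]): Unit = {
    val dir = new Path("/tmp/data") // placeholder directory
    val fs: FileSystem = dir.getFileSystem(new Configuration())

    // Skip Hadoop/Spark metadata files such as _SUCCESS and _metadata.
    val filter = new PathFilter {
      override def accept(path: Path): Boolean = !path.getName.startsWith("_")
    }

    fs.listStatus(dir, filter).foreach(status => println(status.getPath))
  }
}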
Example 1
Source File: ParquetCompatibilityTest.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.parquet

import scala.collection.JavaConversions._

import org.apache.hadoop.fs.{Path, PathFilter}
import org.apache.parquet.hadoop.ParquetFileReader
import org.apache.parquet.schema.MessageType

import org.apache.spark.sql.QueryTest


private[sql] abstract class ParquetCompatibilityTest extends QueryTest with ParquetTest {
  protected def readParquetSchema(path: String): MessageType = {
    // By default, skip metadata files such as _SUCCESS and _metadata.
    readParquetSchema(path, { path => !path.getName.startsWith("_") })
  }
  // Reads the Parquet schema of the files under `path` accepted by `pathFilter`.
  protected def readParquetSchema(path: String, pathFilter: Path => Boolean): MessageType = {
    val fsPath = new Path(path)
    val fs = fsPath.getFileSystem(configuration)
    val parquetFiles = fs.listStatus(fsPath, new PathFilter {
      override def accept(path: Path): Boolean = pathFilter(path)
    }).toSeq

    // Read all footers in parallel; the boolean flag skips row-group metadata.
    val footers = ParquetFileReader.readAllFootersInParallel(configuration, parquetFiles, true)
    footers.head.getParquetMetadata.getFileMetaData.getSchema
  }

  protected def logParquetSchema(path: String): Unit = {
    // Log the schema of the Parquet file written by parquet-avro.
    logInfo(
      s"""Schema of the Parquet file written by parquet-avro:
         |${readParquetSchema(path)}
       """.stripMargin)
  }
}
// Companion object with helpers for the Parquet compatibility tests.
object ParquetCompatibilityTest {
  // Returns null for every third index, to exercise nullable columns in tests.
  def makeNullable[T <: AnyRef](i: Int)(f: => T): T = {
    if (i % 3 == 0) null.asInstanceOf[T] else f
  }
} 
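A hypothetical call site inside a suite that extends ParquetCompatibilityTest; the path and filter predicate are placeholders.

// Read the merged schema of the part-files only, ignoring _SUCCESS etc.
val schema: MessageType =
  readParquetSchema("/tmp/parquet-out", p => p.getName.startsWith("part-"))
logInfo(s"Schema under /tmp/parquet-out:\n$schema")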
Example 2
Source File: HadoopUtils.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.ml.image

import scala.language.existentials
import scala.util.Random

import org.apache.commons.io.FilenameUtils
import org.apache.hadoop.conf.{Configuration, Configured}
import org.apache.hadoop.fs.{Path, PathFilter}
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat

import org.apache.spark.sql.SparkSession

private object RecursiveFlag {

  /**
   * Runs `f` with a sampling PathFilter installed in the Hadoop configuration
   * (when sampleRatio < 1), then restores the previous filter, if any, and
   * clears the sampling parameters.
   */
  def withPathFilter[T](
      sampleRatio: Double,
      spark: SparkSession,
      seed: Long)(f: => T): T = {
    val sampleImages = sampleRatio < 1
    if (sampleImages) {
      val flagName = FileInputFormat.PATHFILTER_CLASS
      val hadoopConf = spark.sparkContext.hadoopConfiguration
      val old = Option(hadoopConf.getClass(flagName, null))
      hadoopConf.setDouble(SamplePathFilter.ratioParam, sampleRatio)
      hadoopConf.setLong(SamplePathFilter.seedParam, seed)
      hadoopConf.setClass(flagName, classOf[SamplePathFilter], classOf[PathFilter])
      try f finally {
        hadoopConf.unset(SamplePathFilter.ratioParam)
        hadoopConf.unset(SamplePathFilter.seedParam)
        old match {
          case Some(v) => hadoopConf.setClass(flagName, v, classOf[PathFilter])
          case None => hadoopConf.unset(flagName)
        }
      }
    } else {
      f
    }
  }
} 
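Both image-related examples configure a SamplePathFilter that the excerpts never define. Here is a minimal sketch of what such a filter could look like, assuming the ratioParam/seedParam configuration keys used above; the key strings and the file heuristic are illustrative, not necessarily Spark's actual values.

import org.apache.commons.io.FilenameUtils
import org.apache.hadoop.conf.{Configuration, Configured}
import org.apache.hadoop.fs.{Path, PathFilter}

import scala.util.Random

// Keeps roughly `ratio` of the files it sees; directories always pass so
// that recursive listings still descend into them.
class SamplePathFilter extends Configured with PathFilter {
  private val random = new Random()
  private var ratio: Double = 1.0

  // Hadoop instantiates the filter reflectively and injects the conf here.
  override def setConf(conf: Configuration): Unit = {
    if (conf != null) {
      ratio = conf.getDouble(SamplePathFilter.ratioParam, 1.0)
      random.setSeed(conf.getLong(SamplePathFilter.seedParam, 0L))
    }
  }

  override def accept(path: Path): Boolean =
    !SamplePathFilter.isFile(path) || random.nextDouble() < ratio
}

object SamplePathFilter {
  val ratioParam = "sampleRatio" // illustrative key names
  val seedParam = "sampleSeed"

  // Heuristic: treat paths with an extension as files.
  def isFile(path: Path): Boolean =
    FilenameUtils.getExtension(path.toString).nonEmpty
}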
Example 3
Source File: ParquetCompatibilityTest.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.parquet

import scala.collection.JavaConverters.{collectionAsScalaIterableConverter, mapAsJavaMapConverter, seqAsJavaListConverter}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{Path, PathFilter}
import org.apache.parquet.hadoop.api.WriteSupport
import org.apache.parquet.hadoop.api.WriteSupport.WriteContext
import org.apache.parquet.hadoop.{ParquetFileReader, ParquetWriter}
import org.apache.parquet.io.api.RecordConsumer
import org.apache.parquet.schema.{MessageType, MessageTypeParser}

import org.apache.spark.sql.QueryTest

// Companion helpers for the Parquet compatibility suites; the abstract test
// class from the original file is omitted from this excerpt.
object ParquetCompatibilityTest {

  // Adapts parquet's WriteSupport so that each "record" is a function which
  // writes itself through the low-level RecordConsumer API.
  private class DirectWriteSupport(schema: MessageType, metadata: Map[String, String])
    extends WriteSupport[RecordConsumer => Unit] {

    private var recordConsumer: RecordConsumer = _

    override def init(configuration: Configuration): WriteContext =
      new WriteContext(schema, metadata.asJava)

    override def prepareForWrite(recordConsumer: RecordConsumer): Unit =
      this.recordConsumer = recordConsumer

    override def write(recordWriter: RecordConsumer => Unit): Unit =
      recordWriter.apply(recordConsumer)
  }

  // Writes the given records to `path` using the message-type `schema` text
  // and attaches `metadata` as extra key/value file metadata.
  def writeDirect(
      path: String,
      schema: String,
      metadata: Map[String, String],
      recordWriters: (RecordConsumer => Unit)*): Unit = {
    val messageType = MessageTypeParser.parseMessageType(schema)
    val writeSupport = new DirectWriteSupport(messageType, metadata)
    val parquetWriter = new ParquetWriter[RecordConsumer => Unit](new Path(path), writeSupport)
    try recordWriters.foreach(parquetWriter.write) finally parquetWriter.close()
  }
} 
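A hypothetical invocation, driving the RecordConsumer by hand to write one record; the output path and schema are placeholders.

// Write a single record with one int32 column.
ParquetCompatibilityTest.writeDirect(
  "/tmp/direct.parquet",                  // placeholder output path
  "message root { required int32 id; }",  // Parquet message-type schema
  Map.empty[String, String],
  { rc: RecordConsumer =>
    rc.startMessage()
    rc.startField("id", 0)
    rc.addInteger(42)
    rc.endField("id", 0)
    rc.endMessage()
  })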
Example 4
Source File: HadoopUtils.scala    From spark-images   with Apache License 2.0
package org.apache.spark.image

import java.nio.file.Paths

import org.apache.commons.io.FilenameUtils

import scala.sys.process._
import org.apache.hadoop.conf.{Configuration, Configured}
import org.apache.hadoop.fs.{Path, PathFilter}
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.spark.sql.SparkSession
import scala.language.existentials
import scala.util.Random

object RecursiveFlag {

  /**
   * Installs `value` as the job's PathFilter class and sets or clears the
   * sampling ratio; returns the previously configured filter class so the
   * caller can restore it later.
   */
  def setPathFilter(
      value: Option[Class[_]],
      sampleRatio: Option[Double] = None,
      spark: SparkSession): Option[Class[_]] = {
    val flagName = FileInputFormat.PATHFILTER_CLASS
    val hadoopConf = spark.sparkContext.hadoopConfiguration
    val old = Option(hadoopConf.getClass(flagName, null))
    if (sampleRatio.isDefined) {
      hadoopConf.setDouble(SamplePathFilter.ratioParam, sampleRatio.get)
    } else {
      hadoopConf.unset(SamplePathFilter.ratioParam)
    }

    value match {
      case Some(v) => hadoopConf.setClass(flagName, v, classOf[PathFilter])
      case None => hadoopConf.unset(flagName)
    }
    old
  }
}
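A hypothetical set-and-restore call site, assuming an active SparkSession named spark and the SamplePathFilter sketched earlier; the input path is a placeholder.

// Install the sampling filter, read, then restore the previous filter.
val previous = RecursiveFlag.setPathFilter(
  Some(classOf[SamplePathFilter]), sampleRatio = Some(0.1), spark = spark)
try {
  // Any FileInputFormat-based read now sees only ~10% of the files.
  val files = spark.sparkContext.binaryFiles("/data/images")
  println(s"sampled ${files.count()} files")
} finally {
  RecursiveFlag.setPathFilter(previous, spark = spark)
}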