org.apache.hadoop.mapreduce.Job Scala Examples

The following examples show how to use org.apache.hadoop.mapreduce.Job. Each example is an excerpt from an open-source project; the source file, project, and license are noted above the code.
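For orientation, here is a minimal, self-contained sketch of the Job API that the examples below build on. It is not taken from any of the listed projects; the mapper and reducer classes are placeholders, which is why those two lines are left commented out.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{IntWritable, Text}
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat

object MinimalJobSketch {
  def main(args: Array[String]): Unit = {
    // Job.getInstance is the supported factory; the no-arg `new Job()` constructor is deprecated.
    val job = Job.getInstance(new Configuration(), "minimal job sketch")
    job.setJarByClass(getClass)
    // job.setMapperClass(classOf[MyMapper])     // placeholder: supply your own Mapper
    // job.setReducerClass(classOf[MyReducer])   // placeholder: supply your own Reducer
    job.setOutputKeyClass(classOf[Text])
    job.setOutputValueClass(classOf[IntWritable])
    FileInputFormat.addInputPath(job, new Path(args(0)))
    FileOutputFormat.setOutputPath(job, new Path(args(1)))
    // Submit and block until the job finishes.
    System.exit(if (job.waitForCompletion(true)) 0 else 1)
  }
}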
Example 1
Source File: TextFileFormat.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.text

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, Path}
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, TextOutputFormat}
import org.apache.hadoop.util.ReflectionUtils

import org.apache.spark.TaskContext
import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRowWriter}
import org.apache.spark.sql.catalyst.util.CompressionCodecs
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.util.SerializableConfiguration


  def getCompressionExtension(context: TaskAttemptContext): String = {
    // Set the compression extension, similar to code in TextOutputFormat.getDefaultWorkFile
    if (FileOutputFormat.getCompressOutput(context)) {
      val codecClass = FileOutputFormat.getOutputCompressorClass(context, classOf[GzipCodec])
      ReflectionUtils.newInstance(codecClass, context.getConfiguration).getDefaultExtension
    } else {
      ""
    }
  }
} 
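A rough usage sketch, not part of the original file: the returned extension is typically appended when naming the per-task work file, mirroring TextOutputFormat.getDefaultWorkFile. The stagingDir, fileNamePrefix, and context parameters stand in for values supplied by the enclosing writer.

// Sketch only: append the codec-specific extension (e.g. ".gz") to the work-file name.
def workFilePath(stagingDir: String, fileNamePrefix: String, context: TaskAttemptContext): Path = {
  val extension = getCompressionExtension(context)
  new Path(stagingDir, s"$fileNamePrefix.txt$extension")
}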
Example 2
Source File: HBaseUtils.scala    From bigdata-examples   with Apache License 2.0
package com.timeyang.common.util

import com.timeyang.common.config.BaseConf
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos
import org.apache.hadoop.hbase.util.Base64
import org.apache.hadoop.mapreduce.Job


object HBaseUtils {

  def createHbaseOutputJob(tableName: String): Job = {
    val conf = HBaseUtils.newConf()
    conf.set(TableOutputFormat.OUTPUT_TABLE, tableName)
    val job = Job.getInstance(conf)
    job.setOutputKeyClass(classOf[ImmutableBytesWritable])
    job.setOutputValueClass(classOf[Put])
    job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
    job
  }

} 
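A hypothetical usage sketch (not from the original file): the Job produced above is typically consumed by handing its configuration to Spark's saveAsNewAPIHadoopDataset on an RDD of Puts. The table name and the RDD are placeholders.

import org.apache.spark.rdd.RDD

val job = HBaseUtils.createHbaseOutputJob("my_table")            // "my_table" is a placeholder table name
val puts: RDD[(ImmutableBytesWritable, Put)] = ???               // build the mutations elsewhere
puts.saveAsNewAPIHadoopDataset(job.getConfiguration)             // Spark writes through TableOutputFormat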
Example 3
Source File: CommitFailureTestSource.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.sources

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}

import org.apache.spark.TaskContext
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.execution.datasources.{OutputWriter, OutputWriterFactory}
import org.apache.spark.sql.types.StructType

class CommitFailureTestSource extends SimpleTextSource {
  
  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory =
    new OutputWriterFactory {
      override def newInstance(
          stagingDir: String,
          fileNamePrefix: String,
          dataSchema: StructType,
          context: TaskAttemptContext): OutputWriter = {
        new SimpleTextOutputWriter(stagingDir, fileNamePrefix, context) {
          var failed = false
          TaskContext.get().addTaskFailureListener { (t: TaskContext, e: Throwable) =>
            failed = true
            SimpleTextRelation.callbackCalled = true
          }

          override val path: String = new Path(stagingDir, fileNamePrefix).toString

          override def write(row: Row): Unit = {
            if (SimpleTextRelation.failWriter) {
              sys.error("Intentional task writer failure for testing purpose.")

            }
            super.write(row)
          }

          override def close(): Unit = {
            super.close()
            sys.error("Intentional task commitment failure for testing purpose.")
          }
        }
      }
    }

  override def shortName(): String = "commit-failure-test"
} 
Example 4
Source File: GcsConnectorUtil.scala    From scio   with Apache License 2.0
package com.spotify.scio.parquet

import com.spotify.scio.ScioContext
import com.spotify.scio.util.ScioUtil
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat

private[parquet] object GcsConnectorUtil {
  def setCredentials(job: Job): Unit =
    // These are needed since `FileInputFormat.setInputPaths` validates paths locally and
    // requires the user's GCP credentials.
    sys.env.get("GOOGLE_APPLICATION_CREDENTIALS") match {
      case Some(json) =>
        job.getConfiguration
          .set("fs.gs.auth.service.account.json.keyfile", json)
      case None =>
        // Client id/secret of Google-managed project associated with the Cloud SDK
        job.getConfiguration
          .setBoolean("fs.gs.auth.service.account.enable", false)
        job.getConfiguration.set("fs.gs.auth.client.id", "32555940559.apps.googleusercontent.com")
        job.getConfiguration
          .set("fs.gs.auth.client.secret", "ZmssLNjJy2998hD4CTg2ejr2")
    }

  def unsetCredentials(job: Job): Unit = {
    job.getConfiguration.unset("fs.gs.auth.service.account.json.keyfile")
    job.getConfiguration.unset("fs.gs.auth.service.account.enable")
    job.getConfiguration.unset("fs.gs.auth.client.id")
    job.getConfiguration.unset("fs.gs.auth.client.secret")
  }

  def setInputPaths(sc: ScioContext, job: Job, path: String): Unit = {
    // This is needed since `FileInputFormat.setInputPaths` validates paths locally and requires
    // the user's GCP credentials.
    GcsConnectorUtil.setCredentials(job)

    FileInputFormat.setInputPaths(job, path)

    // It will interfere with credentials in Dataflow workers
    if (!ScioUtil.isLocalRunner(sc.options.getRunner)) {
      GcsConnectorUtil.unsetCredentials(job)
    }
  }
} 
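A hedged sketch of how these helpers are used within the com.spotify.scio.parquet package (the GCS path is a placeholder): a fresh Job carries the Hadoop configuration, and setInputPaths applies credentials only for the local path validation.

val job = Job.getInstance()
GcsConnectorUtil.setInputPaths(sc, job, "gs://my-bucket/data/*.parquet")   // sc: ScioContext; path is a placeholder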
Example 5
Source File: LasRelation.scala    From spark-iqmulus   with Apache License 2.0
package fr.ign.spark.iqmulus.las

import fr.ign.spark.iqmulus.{ BinarySectionRelation, BinarySection }
import org.apache.hadoop.fs.{ FileSystem, Path }
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.sources.OutputWriterFactory
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.sql.types._
import scala.util.{ Try, Success, Failure }

class LasRelation(
  override val paths: Array[String],
  override val maybeDataSchema: Option[StructType],
  override val userDefinedPartitionColumns: Option[StructType],
  parameters: Map[String, String]
)(@transient val sqlContext: SQLContext)
    extends BinarySectionRelation(parameters) {

  def format = parameters.get("lasformat").map(_.toByte)
  def minor = parameters.get("minor").map(_.toByte).getOrElse(Version.minorDefault)
  def major = parameters.get("major").map(_.toByte).getOrElse(Version.majorDefault)
  def version = parameters.get("version").map(Version.fromString)
    .getOrElse(Version(major, minor))

  lazy val headers: Array[LasHeader] = paths flatMap { location =>
    Try {
      val path = new Path(location)
      val fs = FileSystem.get(path.toUri, sqlContext.sparkContext.hadoopConfiguration)
      val dis = fs.open(path)
      try LasHeader.read(location, dis)
      finally {
        dis.close
        fs.close
      }
    } match {
      case Success(h) => Some(h)
      case Failure(e) => logWarning(s"Skipping $location : ${e.getMessage}"); None
    }
  }

  override def sections: Array[BinarySection] = headers.map(_.toBinarySection(paths))

  override def prepareJobForWrite(job: Job): OutputWriterFactory = {
    new LasOutputWriterFactory(format, version)
  }
} 
Example 6
Source File: TimelyImplicits.scala    From Mastering-Spark-for-Data-Science   with MIT License
package io.gzet.timeseries.timely

import io.gzet.utils.spark.accumulo.AccumuloConfig
import org.apache.accumulo.core.client.ClientConfiguration
import org.apache.accumulo.core.client.mapreduce.{AbstractInputFormat, InputFormatBase}
import org.apache.accumulo.core.client.security.tokens.PasswordToken
import org.apache.accumulo.core.data.Range
import org.apache.accumulo.core.security.Authorizations
import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

import scala.collection.JavaConversions._

object TimelyImplicits {

  implicit class AccumuloReader(sc: SparkContext) {

    def timely(accumuloConfig: AccumuloConfig, rowPrefix: Option[String] = None): RDD[Metric] = {

      val conf = sc.hadoopConfiguration
      val job = Job.getInstance(conf)
      val clientConfig: ClientConfiguration = new ClientConfiguration()
        .withInstance(accumuloConfig.accumuloInstance)
        .withZkHosts(accumuloConfig.zookeeperHosts)

      val authorizations = new Authorizations(List("INTERNAL", "CONFIDENTIAL", "SECRET").map(_.getBytes()))

      AbstractInputFormat.setConnectorInfo(job, accumuloConfig.accumuloUser, new PasswordToken(accumuloConfig.accumuloPassword))
      AbstractInputFormat.setZooKeeperInstance(job, clientConfig)
      AbstractInputFormat.setScanAuthorizations(job, authorizations)
      InputFormatBase.setInputTableName(job, "timely.metrics")

      if(rowPrefix.isDefined) {
        val ranges = List(Range.prefix(rowPrefix.get))
        InputFormatBase.setRanges(job, ranges)
      }

      val rdd = sc.newAPIHadoopRDD(job.getConfiguration,
        classOf[AccumuloTimelyInputFormat],
        classOf[NullWritable],
        classOf[TimelyWritable]
      ) values

      rdd map {
        timely =>
          val Array(tagK, tagV) = timely.getMetricType.split("=", 2)
          Metric(
            timely.getMetric,
            timely.getTime,
            timely.getMetricValue,
            Map(tagK -> tagV)
          )
      }
    }
  }

} 
Example 7
Source File: AccumuloLoader.scala    From Mastering-Spark-for-Data-Science   with MIT License
package io.gzet.community.accumulo

import org.apache.accumulo.core.client.mapreduce.AccumuloOutputFormat
import org.apache.accumulo.core.client.security.tokens.PasswordToken
import org.apache.accumulo.core.client.{BatchWriterConfig, ClientConfiguration}
import org.apache.accumulo.core.data.Mutation
import org.apache.accumulo.core.security.ColumnVisibility
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

class AccumuloLoader(config: AccumuloConfig) extends Serializable {

  def persist(sc: SparkContext, accumuloTable: String, rdd: RDD[(String, String)], blacklist: Set[String] = Set()) = {

    val conf = sc.hadoopConfiguration
    val job = Job.getInstance(conf)
    val clientConfig: ClientConfiguration = new ClientConfiguration()
      .withInstance(config.accumuloInstance)
      .withZkHosts(config.zookeeperHosts)

    AccumuloOutputFormat.setConnectorInfo(job, config.accumuloUser, new PasswordToken(config.accumuloPassword))
    AccumuloOutputFormat.setBatchWriterOptions(job, new BatchWriterConfig)
    AccumuloOutputFormat.setZooKeeperInstance(job, clientConfig)
    AccumuloOutputFormat.setCreateTables(job, true)

    val bList = sc.broadcast(blacklist)

    val mutationRdd = rdd.map({ case (from, to) =>
      val visibility = {
        if(bList.value.contains(from) || bList.value.contains(to)){
          new ColumnVisibility(AccumuloAuthorization.BLACKLIST)
        } else {
          new ColumnVisibility("")
        }
      }
      val mutation = new Mutation(from)
      mutation.put("associated", to, visibility, "1")
      (new Text(accumuloTable), mutation)
    })

    mutationRdd.saveAsNewAPIHadoopFile(
      "",
      classOf[Text],
      classOf[Mutation],
      classOf[AccumuloOutputFormat],
      job.getConfiguration
    )

  }


}

object AccumuloAuthorization {
  final val BLACKLIST = "BLACKLIST"
} 
Example 8
Source File: AccumuloReader.scala    From Mastering-Spark-for-Data-Science   with MIT License
package io.gzet.community.accumulo

import org.apache.accumulo.core.client.{IteratorSetting, ClientConfiguration}
import org.apache.accumulo.core.client.mapreduce.{AccumuloInputFormat, AbstractInputFormat, InputFormatBase}
import org.apache.accumulo.core.client.security.tokens.PasswordToken
import org.apache.accumulo.core.security.Authorizations
import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

import scala.language.postfixOps

class AccumuloReader(config: AccumuloConfig) extends Serializable {

  def read(sc: SparkContext, accumuloTable: String, authorization: Option[String] = None): RDD[EdgeWritable] = {

    val conf = sc.hadoopConfiguration
    val job = Job.getInstance(conf)
    val clientConfig: ClientConfiguration = new ClientConfiguration()
      .withInstance(config.accumuloInstance)
      .withZkHosts(config.zookeeperHosts)

    AbstractInputFormat.setConnectorInfo(job, config.accumuloUser, new PasswordToken(config.accumuloPassword))
    AbstractInputFormat.setZooKeeperInstance(job, clientConfig)
    if(authorization.isDefined)
      AbstractInputFormat.setScanAuthorizations(job, new Authorizations(authorization.get))

    val is = new IteratorSetting(
      1,
      "summingCombiner",
      "org.apache.accumulo.core.iterators.user.SummingCombiner"
    )

    is.addOption("all", "")
    is.addOption("columns", "associated")
    is.addOption("lossy", "TRUE")
    is.addOption("type", "STRING")

    InputFormatBase.addIterator(job, is)
    InputFormatBase.setInputTableName(job, accumuloTable)

    sc.newAPIHadoopRDD(job.getConfiguration,
      classOf[AccumuloGraphxInputFormat],
      classOf[NullWritable],
      classOf[EdgeWritable]
    ) values

  }
} 
Example 9
Source File: SageMakerProtobufFileFormat.scala    From sagemaker-spark   with Apache License 2.0
package com.amazonaws.services.sagemaker.sparksdk.protobuf

import org.apache.hadoop.fs.FileStatus
import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriter, OutputWriterFactory}
import org.apache.spark.sql.sources.DataSourceRegister
import org.apache.spark.sql.types.StructType

class SageMakerProtobufFileFormat extends FileFormat with DataSourceRegister {

  override def inferSchema(sparkSession: SparkSession,
                           options: Map[String, String],
                           files: Seq[FileStatus]):
  Option[StructType] = {
    Option.empty
  }

  override def shortName(): String = "sagemaker"

  override def toString: String = "SageMaker"

  override def prepareWrite(
                             sparkSession: SparkSession,
                             job: Job,
                             options: Map[String, String],
                             dataSchema: StructType): OutputWriterFactory = {
    new OutputWriterFactory {
      override def newInstance(
                                path: String,
                                dataSchema: StructType,
                                context: TaskAttemptContext): OutputWriter = {
        new SageMakerProtobufWriter(path, context, dataSchema, options)
      }

      override def getFileExtension(context: TaskAttemptContext): String = {
        ".pbr"
      }
    }
  }
} 
Example 10
Source File: CommitFailureTestSource.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.sources

import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}

import org.apache.spark.TaskContext
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.{OutputWriter, OutputWriterFactory}
import org.apache.spark.sql.types.StructType

class CommitFailureTestSource extends SimpleTextSource {
  
  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory =
    new OutputWriterFactory {
      override def newInstance(
          path: String,
          dataSchema: StructType,
          context: TaskAttemptContext): OutputWriter = {
        new SimpleTextOutputWriter(path, dataSchema, context) {
          var failed = false
          TaskContext.get().addTaskFailureListener { (t: TaskContext, e: Throwable) =>
            failed = true
            SimpleTextRelation.callbackCalled = true
          }

          override def write(row: InternalRow): Unit = {
            if (SimpleTextRelation.failWriter) {
              sys.error("Intentional task writer failure for testing purpose.")

            }
            super.write(row)
          }

          override def close(): Unit = {
            super.close()
            sys.error("Intentional task commitment failure for testing purpose.")
          }
        }
      }

      override def getFileExtension(context: TaskAttemptContext): String = ""
    }

  override def shortName(): String = "commit-failure-test"
} 
Example 11
Source File: CassandraTest.scala    From spark1.52   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples
import java.nio.ByteBuffer
import java.util.SortedMap
import scala.collection.JavaConversions._
import org.apache.cassandra.db.IColumn
import org.apache.cassandra.hadoop.ColumnFamilyOutputFormat
import org.apache.cassandra.hadoop.ConfigHelper
import org.apache.cassandra.hadoop.ColumnFamilyInputFormat
import org.apache.cassandra.thrift._
import org.apache.cassandra.utils.ByteBufferUtil
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.{SparkConf, SparkContext}

object CassandraTest {

  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("casDemo")
    // Get a SparkContext
    val sc = new SparkContext(sparkConf)

    // Build the job configuration with ConfigHelper provided by Cassandra
    val job = new Job()
    job.setInputFormatClass(classOf[ColumnFamilyInputFormat])

    val host: String = args(1)
    val port: String = args(2)

    ConfigHelper.setInputInitialAddress(job.getConfiguration(), host)
    ConfigHelper.setInputRpcPort(job.getConfiguration(), port)
    ConfigHelper.setOutputInitialAddress(job.getConfiguration(), host)
    ConfigHelper.setOutputRpcPort(job.getConfiguration(), port)
    ConfigHelper.setInputColumnFamily(job.getConfiguration(), "casDemo", "Words")
    ConfigHelper.setOutputColumnFamily(job.getConfiguration(), "casDemo", "WordCount")

   

    sc.stop()
  }
}
// scalastyle:on println 
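The body of the original example (reading the Words column family, computing counts, and writing them back) is elided above. As a hedged sketch only, a Job configured this way is typically consumed through newAPIHadoopRDD with the Cassandra input format's key and value types:

// Sketch only: read the configured column family as an RDD keyed by row key.
val casRdd = sc.newAPIHadoopRDD(
  job.getConfiguration(),
  classOf[ColumnFamilyInputFormat],
  classOf[ByteBuffer],
  classOf[SortedMap[ByteBuffer, IColumn]])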
Example 12
Source File: CommitFailureTestSource.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.sql.sources

import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}

import org.apache.spark.TaskContext
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.execution.datasources.{OutputWriter, OutputWriterFactory}
import org.apache.spark.sql.types.StructType

class CommitFailureTestSource extends SimpleTextSource {
  
  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory =
    new OutputWriterFactory {
      override def newInstance(
          path: String,
          dataSchema: StructType,
          context: TaskAttemptContext): OutputWriter = {
        new SimpleTextOutputWriter(path, context) {
          var failed = false
          TaskContext.get().addTaskFailureListener { (t: TaskContext, e: Throwable) =>
            failed = true
            SimpleTextRelation.callbackCalled = true
          }

          override def write(row: Row): Unit = {
            if (SimpleTextRelation.failWriter) {
              sys.error("Intentional task writer failure for testing purpose.")

            }
            super.write(row)
          }

          override def close(): Unit = {
            super.close()
            sys.error("Intentional task commitment failure for testing purpose.")
          }
        }
      }

      override def getFileExtension(context: TaskAttemptContext): String = ""
    }

  override def shortName(): String = "commit-failure-test"
} 
Example 13
Source File: DistCpWrapper.scala    From m3d-engine   with Apache License 2.0
package com.adidas.analytics.util

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.tools.{DistCp, DistCpOptions}

import scala.collection.JavaConversions._


class DistCpWrapper(conf: Configuration, sources: Seq[Path], target: Path) {

  private val baseOptions = new DistCpOptions(sources, target)

  def run(mapsNum: Int = 10, atomic: Boolean = false, overwrite: Boolean = false): Job = {
    val options = new DistCpOptions(baseOptions)
    options.setAppend(false)
    options.setBlocking(true)
    options.setSyncFolder(false)
    options.setDeleteMissing(false)

    options.setMaxMaps(mapsNum)
    options.setOverwrite(overwrite)
    options.setAtomicCommit(atomic)

    new DistCp(conf, options).execute()
  }
}

object DistCpWrapper {

  def apply(conf: Configuration, sources: Seq[Path], target: Path): DistCpWrapper = {
    new DistCpWrapper(conf, sources, target)
  }
} 
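A hypothetical usage sketch (paths are placeholders): wrap a Hadoop configuration and a set of source paths, run the copy, and inspect the resulting Job.

val conf = new Configuration()
val sources = Seq(new Path("/data/in/a"), new Path("/data/in/b"))   // placeholder source paths
val job = DistCpWrapper(conf, sources, new Path("/data/out")).run(mapsNum = 5, overwrite = true)
println(s"DistCp finished successfully: ${job.isSuccessful}")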
Example 14
Source File: FlinkExample.scala    From carbondata   with Apache License 2.0
package org.apache.carbondata.examples

import org.apache.flink.api.java.ExecutionEnvironment
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapreduce.Job

import org.apache.carbondata.examples.util.ExampleUtils
import org.apache.carbondata.hadoop.CarbonProjection
import org.apache.carbondata.hadoop.api.{CarbonInputFormat, CarbonTableInputFormat}

// Write carbondata file by spark and read it by flink
// scalastyle:off println
object FlinkExample {

  def main(args: Array[String]): Unit = {
    // write carbondata file by spark
    val cc = ExampleUtils.createCarbonSession("FlinkExample")
    val path = ExampleUtils.writeSampleCarbonFile(cc, "carbon1")

    // read two columns by flink
    val projection = new CarbonProjection
    projection.addColumn("c1")  // column c1
    projection.addColumn("c3")  // column c3
    val conf = new Configuration()
    CarbonInputFormat.setColumnProjection(conf, projection)

    val env = ExecutionEnvironment.getExecutionEnvironment
    val ds = env.readHadoopFile(
      new CarbonTableInputFormat[Array[Object]],
      classOf[Void],
      classOf[Array[Object]],
      path,
      new Job(conf)
    )

    // print result
    val result = ds.collect()
    for (i <- 0 until result.size()) {
      println(result.get(i).f1.mkString(","))
    }

    // delete carbondata file
    ExampleUtils.cleanSampleCarbonFile(cc, "carbon1")
  }
}
// scalastyle:on println 
Example 15
Source File: CommitFailureTestSource.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.sql.sources

import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}

import org.apache.spark.TaskContext
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.execution.datasources.{OutputWriter, OutputWriterFactory}
import org.apache.spark.sql.types.StructType

class CommitFailureTestSource extends SimpleTextSource {
  
  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory =
    new OutputWriterFactory {
      override def newInstance(
          path: String,
          dataSchema: StructType,
          context: TaskAttemptContext): OutputWriter = {
        new SimpleTextOutputWriter(path, context) {
          var failed = false
          TaskContext.get().addTaskFailureListener { (t: TaskContext, e: Throwable) =>
            failed = true
            SimpleTextRelation.callbackCalled = true
          }

          override def write(row: Row): Unit = {
            if (SimpleTextRelation.failWriter) {
              sys.error("Intentional task writer failure for testing purpose.")

            }
            super.write(row)
          }

          override def close(): Unit = {
            super.close()
            sys.error("Intentional task commitment failure for testing purpose.")
          }
        }
      }

      override def getFileExtension(context: TaskAttemptContext): String = ""
    }

  override def shortName(): String = "commit-failure-test"
} 
Example 16
Source File: ArrowFileFormat.scala    From OAP   with Apache License 2.0
package com.intel.oap.spark.sql.execution.datasources.arrow

import scala.collection.JavaConverters._

import com.intel.oap.spark.sql.execution.datasources.arrow.ArrowFileFormat.UnsafeItr
import com.intel.oap.spark.sql.execution.datasources.v2.arrow.{ArrowFilters, ArrowOptions}
import com.intel.oap.spark.sql.execution.datasources.v2.arrow.ArrowSQLConf._
import org.apache.arrow.dataset.scanner.ScanOptions
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileStatus
import org.apache.hadoop.mapreduce.Job

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriterFactory, PartitionedFile}
import org.apache.spark.sql.execution.datasources.v2.arrow.ArrowUtils
import org.apache.spark.sql.sources.{DataSourceRegister, Filter}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap;

class ArrowFileFormat extends FileFormat with DataSourceRegister with Serializable {

  val batchSize = 4096

  def convert(files: Seq[FileStatus], options: Map[String, String]): Option[StructType] = {
    ArrowUtils.readSchema(files, new CaseInsensitiveStringMap(options.asJava))
  }

  override def inferSchema(
                            sparkSession: SparkSession,
                            options: Map[String, String],
                            files: Seq[FileStatus]): Option[StructType] = {
    convert(files, options)
  }

  override def prepareWrite(
                             sparkSession: SparkSession,
                             job: Job,
                             options: Map[String, String],
                             dataSchema: StructType): OutputWriterFactory = {
    throw new UnsupportedOperationException("Write is not supported for Arrow source")
  }

  override def supportBatch(sparkSession: SparkSession, dataSchema: StructType): Boolean = true

  override def buildReaderWithPartitionValues(sparkSession: SparkSession,
      dataSchema: StructType,
      partitionSchema: StructType,
      requiredSchema: StructType,
      filters: Seq[Filter],
      options: Map[String, String],
      hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = {
    (file: PartitionedFile) => {

      val sqlConf = sparkSession.sessionState.conf;
      val enableFilterPushDown = sqlConf.arrowFilterPushDown
      val factory = ArrowUtils.makeArrowDiscovery(
        file.filePath, new ArrowOptions(
          new CaseInsensitiveStringMap(
            options.asJava).asScala.toMap))

      // todo predicate validation / pushdown
      val dataset = factory.finish();

      val filter = if (enableFilterPushDown) {
        ArrowFilters.translateFilters(filters)
      } else {
        org.apache.arrow.dataset.filter.Filter.EMPTY
      }

      val scanOptions = new ScanOptions(requiredSchema.map(f => f.name).toArray,
        filter, batchSize)
      val scanner = dataset.newScan(scanOptions)
      val itrList = scanner
        .scan()
        .iterator()
        .asScala
        .map(task => task.scan())
        .toList

      val itr = itrList
        .toIterator
        .flatMap(itr => itr.asScala)
        .map(vsr => ArrowUtils.loadVsr(vsr, file.partitionValues, partitionSchema, dataSchema))
      new UnsafeItr(itr).asInstanceOf[Iterator[InternalRow]]
    }
  }

  override def shortName(): String = "arrow"
}

object ArrowFileFormat {
  class UnsafeItr[T](delegate: Iterator[T]) extends Iterator[T] {
    override def hasNext: Boolean = delegate.hasNext

    override def next(): T = delegate.next()
  }
} 
Example 17
Source File: OapIndexCommitProtocolSuite.scala    From OAP   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.oap.index

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.MRJobConfig
import org.apache.hadoop.mapreduce.TaskAttemptID
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.mapreduce.task.JobContextImpl
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl

import org.apache.spark.sql.test.oap.SharedOapContext
import org.apache.spark.util.Utils

class OapIndexCommitProtocolSuite extends SharedOapContext {
  test("newTaskTempFile") {
    val attempt = "attempt_200707121733_0001_m_000000_0"
    val taskID = TaskAttemptID.forName(attempt)
    val jobID = taskID.getJobID.toString
    val outDir = Utils.createTempDir().getAbsolutePath
    val job = Job.getInstance()
    FileOutputFormat.setOutputPath(job, new Path(outDir))
    val conf = job.getConfiguration()
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt)
    val jobContext = new JobContextImpl(conf, taskID.getJobID())
    val taskContext = new TaskAttemptContextImpl(conf, taskID)
    val commitProtocol = new OapIndexCommitProtocol(jobID, outDir)
    // test task temp path
    val pendingDirName = "_temporary_" + jobID
    commitProtocol.setupJob(jobContext)
    commitProtocol.setupTask(taskContext)
    val tempFile = new Path(commitProtocol.newTaskTempFile(taskContext, None, "test"))
    val expectedJobAttemptPath = new Path(new Path(outDir, pendingDirName), "0")
    val expectedTaskWorkPath = new Path(new Path(expectedJobAttemptPath, pendingDirName), attempt)
    assert(tempFile.getParent == expectedTaskWorkPath)
  }
} 
Example 18
Source File: ReadOnlyParquetFileFormat.scala    From OAP   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.parquet

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.Job

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.OutputWriterFactory
import org.apache.spark.sql.types.StructType


class ReadOnlyParquetFileFormat extends ParquetFileFormat {

  override def isSplitable(
      sparkSession: SparkSession,
      options: Map[String, String],
      path: Path): Boolean = false

  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory =
    throw new UnsupportedOperationException("ReadOnlyParquetFileFormat not support write operation")
} 
Example 19
Source File: OapIndexFileFormat.scala    From OAP   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.oap.index

import org.apache.hadoop.fs.FileStatus
import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}
import org.apache.parquet.hadoop.util.ContextUtil

import org.apache.spark.internal.Logging
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriterFactory}
import org.apache.spark.sql.execution.datasources.oap.OapFileFormat
import org.apache.spark.sql.types.StructType

private[index] class OapIndexFileFormat
  extends FileFormat
  with Logging
  with Serializable {

  override def inferSchema(
      sparkSession: SparkSession,
      options: Map[String, String],
      files: Seq[FileStatus]): Option[StructType] = None

  
  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory = {

    val configuration = ContextUtil.getConfiguration(job)

    configuration.set(OapIndexFileFormat.ROW_SCHEMA, dataSchema.json)
    configuration.set(OapIndexFileFormat.INDEX_TYPE, options("indexType"))
    configuration.set(OapIndexFileFormat.INDEX_NAME, options("indexName"))
    configuration.set(OapIndexFileFormat.INDEX_TIME, options("indexTime"))
    configuration.set(OapIndexFileFormat.IS_APPEND, options("isAppend"))

    new OutputWriterFactory {
      override def getFileExtension(context: TaskAttemptContext): String =
        OapFileFormat.OAP_INDEX_EXTENSION

      override def newInstance(path: String, dataSchema: StructType, context: TaskAttemptContext) =
        new OapIndexOutputWriter(path, context)
    }
  }
}

private[index] object OapIndexFileFormat {
  val ROW_SCHEMA: String = "org.apache.spark.sql.oap.row.attributes"
  val INDEX_TYPE: String = "org.apache.spark.sql.oap.index.type"
  val INDEX_NAME: String = "org.apache.spark.sql.oap.index.name"
  val INDEX_TIME: String = "org.apache.spark.sql.oap.index.time"
  val IS_APPEND: String = "org.apache.spark.sql.oap.index.append"
}

case class IndexBuildResult(dataFile: String, rowCount: Long, fingerprint: String, parent: String) 
Example 20
Source File: ReadOnlyNativeOrcFileFormat.scala    From OAP   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.orc

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.Job

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.OutputWriterFactory
import org.apache.spark.sql.types.StructType


class ReadOnlyNativeOrcFileFormat extends OrcFileFormat {
  override def isSplitable(
      sparkSession: SparkSession,
      options: Map[String, String],
      path: Path): Boolean = false

  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory =
    throw new UnsupportedOperationException(
      "ReadOnlyNativeOrcFileFormat doesn't support write operation")
} 
Example 21
Source File: ReadOnlyOrcFileFormat.scala    From OAP   with Apache License 2.0
package org.apache.spark.sql.hive.orc

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.Job

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.OutputWriterFactory
import org.apache.spark.sql.types.StructType


class ReadOnlyOrcFileFormat extends OrcFileFormat {
  override def isSplitable(
      sparkSession: SparkSession,
      options: Map[String, String],
      path: Path): Boolean = false

  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory =
    throw new UnsupportedOperationException("ReadOnlyOrcFileFormat not support write operation")
} 
Example 22
Source File: NodesWithGeohash.scala    From schedoscope   with Apache License 2.0
package schedoscope.example.osm.processed

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, LazyOutputFormat, TextOutputFormat}
import org.schedoscope.dsl.View
import org.schedoscope.dsl.storageformats.TextFile
import org.schedoscope.dsl.transformations.MapreduceTransformation
import schedoscope.example.osm.mapreduce.GeohashMapper

case class NodesWithGeohash() extends View {

  val id = fieldOf[Long]("The node ID")
  val version = fieldOf[Int]("OSM version - ignored")
  val userId = fieldOf[Int]("OSM user ID - ignored")
  val tstamp = fieldOf[String]("Timestamp of node creation")
  val longitude = fieldOf[Double]("Longitude of the node")
  val latitude = fieldOf[Double]("Latitude of the node")
  val geohash = fieldOf[String]("A geoencoded area string")

  val stageNodes = dependsOn { () =>
    schedoscope.example.osm.stage.Nodes()
      .affects(n => Seq(
        n.id -> id,
        n.version -> version,
        n.userId -> userId,
        n.tstamp -> tstamp,
        n.longitude -> longitude,
        n.longitude -> geohash,
        n.latitude -> latitude,
        n.latitude -> geohash
      ))
  }

  transformVia(() =>
    MapreduceTransformation(
      this,
      (conf: Map[String, Any]) => {
        val job = Job.getInstance
        LazyOutputFormat.setOutputFormatClass(job, classOf[TextOutputFormat[Text, NullWritable]])
        job.setJobName(this.urlPath)
        job.setJarByClass(classOf[GeohashMapper])
        job.setMapperClass(classOf[GeohashMapper])
        job.setNumReduceTasks(0)
        FileInputFormat.setInputPaths(job, conf.get("input_path").get.toString)
        FileOutputFormat.setOutputPath(job, new Path(conf.get("output_path").get.toString))
        val cfg = job.getConfiguration();
        if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
          cfg.set("mapreduce.job.credentials.binary",
            System.getenv("HADOOP_TOKEN_FILE_LOCATION"))
        }
        job
      }).configureWith(
      Map(
        "input_path" -> stageNodes().fullPath,
        "output_path" -> fullPath)))

  comment("nodes, extended with geohash")

  storedAs(TextFile(fieldTerminator = "\\t", lineTerminator = "\\n"))
} 
Example 23
Source File: MapreduceDriverTest.scala    From schedoscope   with Apache License 2.0
package org.schedoscope.scheduler.driver

import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths}

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.scalatest.{FlatSpec, Matchers}
import org.schedoscope.dsl.View
import org.schedoscope.dsl.transformations.{FailingMapper, MapreduceTransformation}
import org.schedoscope.test.resources.LocalTestResources
import org.schedoscope.test.resources.TestDriverRunCompletionHandlerCallCounter._

class MapreduceDriverTest extends FlatSpec with Matchers with TestFolder {
  lazy val driver = new LocalTestResources().driverFor[MapreduceTransformation]("mapreduce")

  def invalidJob: (Map[String, Any]) => Job = (m: Map[String, Any]) => Job.getInstance

  def failingJob: (Map[String, Any]) => Job = (m: Map[String, Any]) => {
    writeData()
    val job = Job.getInstance
    job.setMapperClass(classOf[FailingMapper])
    FileInputFormat.setInputPaths(job, new Path(inputPath("")))
    FileOutputFormat.setOutputPath(job, new Path(outputPath(System.nanoTime.toString)))
    job
  }

  def identityJob: (Map[String, Any]) => Job = (m: Map[String, Any]) => {
    writeData()
    val job = Job.getInstance
    FileInputFormat.setInputPaths(job, new Path(inputPath("")))
    FileOutputFormat.setOutputPath(job, new Path(outputPath(System.nanoTime.toString)))
    job
  }

  case class DummyView() extends View

  def writeData() {
    Files.write(Paths.get(s"${inputPath("")}/file.txt"), "some data".getBytes(StandardCharsets.UTF_8))
  }

  "MapreduceDriver" should "have transformation name Mapreduce" in {
    driver.transformationName shouldBe "mapreduce"
  }

  it should "execute Mapreduce transformations synchronously" in {
    val driverRunState = driver.runAndWait(MapreduceTransformation(new DummyView(), identityJob))

    driverRunState shouldBe a[DriverRunSucceeded[_]]
  }

  it should "execute another Mapreduce transformations synchronously" in {
    val driverRunState = driver.runAndWait(MapreduceTransformation(new DummyView(), identityJob))

    driverRunState shouldBe a[DriverRunSucceeded[_]]
  }

  it should "execute Mapreduce transformations asynchronously" in {
    val driverRunHandle = driver.run(MapreduceTransformation(new DummyView(), identityJob))

    var runWasAsynchronous = false

    while (driver.getDriverRunState(driverRunHandle).isInstanceOf[DriverRunOngoing[_]])
      runWasAsynchronous = true

    runWasAsynchronous shouldBe true
    driver.getDriverRunState(driverRunHandle) shouldBe a[DriverRunSucceeded[_]]
  }

  it should "execute Mapreduce transformations and return errors when running asynchronously" in {
    val driverRunHandle = driver.run(MapreduceTransformation(new DummyView(), failingJob))

    var runWasAsynchronous = false

    while (driver.getDriverRunState(driverRunHandle).isInstanceOf[DriverRunOngoing[_]])
      runWasAsynchronous = true

    // runWasAsynchronous shouldBe true FIXME: isn't asynchronous, why?
    driver.getDriverRunState(driverRunHandle) shouldBe a[DriverRunFailed[_]]
  }

  it should "call its DriverRunCompletitionHandlers' driverRunCompleted upon request" in {
    val runHandle = driver.run(MapreduceTransformation(new DummyView(), identityJob))

    while (driver.getDriverRunState(runHandle).isInstanceOf[DriverRunOngoing[_]]) {}

    driver.driverRunCompleted(runHandle)

    driverRunCompletedCalled(runHandle, driver.getDriverRunState(runHandle)) shouldBe true
  }

  it should "call its DriverRunCompletitionHandlers' driverRunStarted upon request" in {
    val runHandle = driver.run(MapreduceTransformation(new DummyView(), identityJob))

    driver.driverRunStarted(runHandle)

    driverRunStartedCalled(runHandle) shouldBe true
  }
} 
Example 24
Source File: DistCpTransformation.scala    From schedoscope   with Apache License 2.0
package org.schedoscope.dsl.transformations

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.tools.{DistCp, DistCpOptions}
import org.schedoscope.dsl.View
import org.schedoscope.scheduler.driver.{DriverRunState, MapreduceDriver}

import scala.collection.JavaConverters._

object DistCpTransformation {

  def copyToView(sourceView: View, targetView: View): DistCpTransformation = {
    val target = targetView.fullPath.split("/").dropRight(1).mkString("/")
    DistCpTransformation(targetView, List(sourceView.fullPath), target)
  }

  def copyToDirToView(sourcePath: String, targetView: View): DistCpTransformation = {
    val target = targetView.fullPath.split("/").drop(1).mkString("/")
    DistCpTransformation(targetView, List(sourcePath), target)
  }

  def copyToFileToView(sourceFile: String, targetView: View): DistCpTransformation = {
    DistCpTransformation(targetView, List(sourceFile), targetView.fullPath)
  }

}

case class DistCpTransformation(v: View,
                                var sources: List[String],
                                var target: String,
                                deleteViewPath: Boolean = false,
                                config: Configuration = new Configuration())
  extends MapreduceBaseTransformation {

  var directoriesToDelete = if (deleteViewPath) List(v.fullPath) else List()

  override def stringsToChecksum: List[String] = target :: sources

  override def fileResourcesToChecksum = List()


  override val cleanupAfterJob: (Job, MapreduceDriver, DriverRunState[MapreduceBaseTransformation]) =>
    DriverRunState[MapreduceBaseTransformation] = (_, __, completionRunState) => completionRunState

  lazy val job: Job = {
    val distCp = new DistCp(config, distCpOptions)
    val createJob = distCp.getClass.getDeclaredMethod("createJob")
    createJob.setAccessible(true)
    val job = createJob.invoke(distCp).asInstanceOf[Job]
    val prepareFileListing = distCp.getClass.getDeclaredMethod("prepareFileListing", job.getClass)
    prepareFileListing.setAccessible(true)
    prepareFileListing.invoke(distCp, job)
    job
  }

  def distCpOptions: DistCpOptions = if (configuration.nonEmpty) {
    DistCpConfiguration
      .fromConfig(configuration.toMap)
      .toDistCpOptions(sources.map(new Path(_)), new Path(target))
  } else {
    val s = sources.map(new Path(_)).asJava
    new DistCpOptions(s, new Path(target))
  }
} 
Example 25
Source File: MapreduceTransformation.scala    From schedoscope   with Apache License 2.0
package org.schedoscope.dsl.transformations

import java.net.URI

import org.apache.commons.lang3.StringUtils
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.mapreduce.{Job, MRJobConfig}
import org.schedoscope.Schedoscope
import org.schedoscope.dsl.View
import org.schedoscope.scheduler.driver.{DriverRunState, MapreduceDriver}
import org.schedoscope.scheduler.service.ViewTransformationStatus


case class MapreduceTransformation(v: View,
                                   createJob: (Map[String, Any]) => Job,
                                   cleanupAfterJob: (Job, MapreduceDriver, DriverRunState[MapreduceBaseTransformation]) => DriverRunState[MapreduceBaseTransformation] = (_, __, completionRunState) => completionRunState,
                                   dirsToDelete: List[String] = List(),
                                   deleteViewPath: Boolean = true) extends MapreduceBaseTransformation {

  lazy val job = createJob(configuration.toMap)

  var directoriesToDelete = dirsToDelete ++ (if (deleteViewPath) List(v.fullPath) else List())

  description = StringUtils.abbreviate(v.urlPath, 100)
}

trait MapreduceBaseTransformation extends Transformation {

  def name = "mapreduce"

  val cleanupAfterJob: (Job, MapreduceDriver, DriverRunState[MapreduceBaseTransformation]) => DriverRunState[MapreduceBaseTransformation]

  val v: View

  val job: Job

  var directoriesToDelete: List[String]

  override def fileResourcesToChecksum = {
    val jarName = try {
      job.getConfiguration().get(MRJobConfig.JAR).split("/").last
    } catch {
      case _: Throwable => null
    }

    Schedoscope.settings
      .getDriverSettings("mapreduce")
      .libJarsHdfs
      .filter(lj => jarName == null || lj.contains(jarName))
  }

  override def viewTransformationStatus = ViewTransformationStatus(
    name,
    Some(Map(
      "input" -> job.getConfiguration().get(FileInputFormat.INPUT_DIR),
      "output" -> job.getConfiguration().get(FileOutputFormat.OUTDIR))))

  def configure() {
    // if job jar hasn't been registered, add all mapreduce libjars
    // to distributed cache
    if (job.getConfiguration().get(MRJobConfig.JAR) == null) {
      fileResourcesToChecksum.foreach(r => {
        try {
          job.addCacheFile(new URI(r))
        } catch {
          case _: Throwable => Unit
        }
      })
    }
    configuration.foreach { case (k, v) => if (v == null) job.getConfiguration.unset(k) else job.getConfiguration.set(k, v.toString) }
  }
} 
Example 26
Source File: CommitFailureTestSource.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.sources

import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}

import org.apache.spark.TaskContext
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.{OutputWriter, OutputWriterFactory}
import org.apache.spark.sql.types.StructType

class CommitFailureTestSource extends SimpleTextSource {
  
  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory =
    new OutputWriterFactory {
      override def newInstance(
          path: String,
          dataSchema: StructType,
          context: TaskAttemptContext): OutputWriter = {
        new SimpleTextOutputWriter(path, dataSchema, context) {
          var failed = false
          TaskContext.get().addTaskFailureListener { (t: TaskContext, e: Throwable) =>
            failed = true
            SimpleTextRelation.callbackCalled = true
          }

          override def write(row: InternalRow): Unit = {
            if (SimpleTextRelation.failWriter) {
              sys.error("Intentional task writer failure for testing purpose.")

            }
            super.write(row)
          }

          override def close(): Unit = {
            super.close()
            sys.error("Intentional task commitment failure for testing purpose.")
          }
        }
      }

      override def getFileExtension(context: TaskAttemptContext): String = ""
    }

  override def shortName(): String = "commit-failure-test"
} 
Example 27
Source File: ConfOnlyTAC.scala    From flint   with Apache License 2.0
package com.twosigma.flint.hadoop

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapreduce.{ Counter, TaskAttemptID, Job, TaskAttemptContext }

// This exists just because of a quirk of the record reader api.
case class ConfOnlyTAC(_conf: Configuration) extends Job with TaskAttemptContext {
  // JobContextImpl and JobContext
  override def getConfiguration: Configuration = _conf

  // TaskAttemptContext
  override def getTaskAttemptID: TaskAttemptID = sys.error("not implemented")
  override def setStatus(msg: String): Unit = sys.error("not implemented")
  override def getStatus = sys.error("not implemented")
  override def getProgress: Float = sys.error("not implemented")
  override def getCounter(counterName: Enum[_]): Counter = sys.error("not implemented")
  override def getCounter(groupName: String, counterName: String): Counter = sys.error("not implemented")

  // Progressable
  override def progress(): Unit = sys.error("not implemented")
} 
Example 28
Source File: InputFormatConf.scala    From flint   with Apache License 2.0
package com.twosigma.flint.hadoop

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{ FileSystem, Path }
import org.apache.hadoop.io.{ LongWritable, Text, Writable }
import org.apache.hadoop.mapreduce.{ InputFormat, InputSplit, Job, RecordReader }
import org.apache.hadoop.mapreduce.lib.input.{ FileInputFormat, FileSplit, TextInputFormat }

import scala.collection.immutable

trait InputFormatConf[K, V] extends Serializable {
  type IF <: InputFormat[K, V]
  type Split <: InputSplit with Writable

  type KExtract <: Extract[K]
  type VExtract <: Extract[V]

  def kExtract: KExtract
  def vExtract: VExtract

  def makeInputFormat(): IF

  // I'm unsure if we should WriSer them for them
  def makeSplits(hadoopConf: Configuration): IndexedSeq[WriSer[Split]]

  // TODO do we want to require typing of the RecordReader as well?
  final def createRecordReader(hadoopConf: Configuration, split: Split,
    inputFormat: IF = makeInputFormat()): RecordReader[K, V] = {
    val tac = ConfOnlyTAC(hadoopConf)
    val recordReader = inputFormat.createRecordReader(split, tac)
    recordReader.initialize(split, tac)
    recordReader
  }
}

case class TextInputFormatConf(file: String, partitions: Int)
  extends InputFormatConf[LongWritable, Text] {
  type IF = TextInputFormat
  type Split = FileSplit

  // TODO now that we figured out what's up, see if we can't eliminate the need for this...
  val internalK = Extract.unit[LongWritable]
  val internalV = Extract.text

  type KExtract = internalK.type
  type VExtract = internalV.type

  override val kExtract: KExtract = internalK
  override val vExtract: VExtract = internalV

  def makeInputFormat() = new TextInputFormat()
  def makeSplits(hadoopConf: Configuration): immutable.IndexedSeq[WriSer[FileSplit]] = {
    val job = Job.getInstance(hadoopConf)
    FileInputFormat.setInputPaths(job, file)
    val path = new Path(file)
    val len = FileSystem.get(hadoopConf).listStatus(path).head.getLen
    val size_per = math.round(len / partitions.toDouble)

    ((0 until partitions - 1).map { p =>
      new FileSplit(path, size_per * p, size_per, null)
    } :+ {
      val fin = size_per * (partitions - 1)
      new FileSplit(path, fin, len - fin, null)
    }).map(WriSer(_))
  }
}

// TODO do we really get much from having this as its own class? consider just making a def csv method in TextInputFormatConf
object CSVInputFormatConf {
  def apply[V](ifc: InputFormatConf[LongWritable, V] { type Split = FileSplit }): InputFormatConf[LongWritable, V] {
    type IF = ifc.IF
    type Split = ifc.Split
    type KExtract = ifc.KExtract
    type VExtract = ifc.VExtract
  } = new InputFormatConf[LongWritable, V] {
    type IF = ifc.IF
    type Split = ifc.Split
    type KExtract = ifc.KExtract
    type VExtract = ifc.VExtract

    override val kExtract: KExtract = ifc.kExtract
    override val vExtract: VExtract = ifc.vExtract

    override def makeInputFormat() = ifc.makeInputFormat()
    override def makeSplits(hadoopConf: Configuration) = {
      val splits = ifc.makeSplits(hadoopConf)
      splits.headOption.fold(IndexedSeq.empty[WriSer[Split]]) {
        case WriSer(head) =>
          val rr = createRecordReader(hadoopConf, head)
          require(rr.nextKeyValue, "csv has no header, first line was empty")
          val afterHeader = rr.getCurrentKey.get
          require(rr.nextKeyValue, "first split is empty")
          WriSer(new FileSplit(head.getPath, afterHeader, head.getLength - afterHeader, null)) +:
            splits.tail
      }
    }
  }
} 
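A hypothetical usage sketch (the HDFS path is a placeholder): build a TextInputFormatConf, materialize its splits, and open a record reader over the first split using the WriSer extractor shown above.

val hadoopConf = new Configuration()
val ifc = TextInputFormatConf("hdfs:///tmp/input.txt", partitions = 4)   // placeholder path
val WriSer(firstSplit) = ifc.makeSplits(hadoopConf).head
val reader = ifc.createRecordReader(hadoopConf, firstSplit)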
Example 29
Source File: CarbonCountStar.scala    From carbondata   with Apache License 2.0
package org.apache.spark.sql

import scala.collection.JavaConverters._

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier}
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.execution.LeafExecNode
import org.apache.spark.sql.optimizer.CarbonFilters
import org.apache.spark.sql.types.StringType
import org.apache.spark.unsafe.types.UTF8String

import org.apache.carbondata.core.datastore.impl.FileFactory
import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier
import org.apache.carbondata.core.metadata.schema.table.CarbonTable
import org.apache.carbondata.core.mutate.CarbonUpdateUtil
import org.apache.carbondata.core.statusmanager.StageInputCollector
import org.apache.carbondata.core.util.{CarbonProperties, ThreadLocalSessionInfo}
import org.apache.carbondata.hadoop.api.{CarbonInputFormat, CarbonTableInputFormat}
import org.apache.carbondata.hadoop.util.CarbonInputFormatUtil
import org.apache.carbondata.spark.load.DataLoadProcessBuilderOnSpark

case class CarbonCountStar(
    attributesRaw: Seq[Attribute],
    carbonTable: CarbonTable,
    sparkSession: SparkSession,
    outUnsafeRows: Boolean = true) extends LeafExecNode {

  override def doExecute(): RDD[InternalRow] = {
    ThreadLocalSessionInfo
      .setConfigurationToCurrentThread(sparkSession.sessionState.newHadoopConf())
    val absoluteTableIdentifier = carbonTable.getAbsoluteTableIdentifier
    val (job, tableInputFormat) = createCarbonInputFormat(absoluteTableIdentifier)
    CarbonInputFormat.setQuerySegment(job.getConfiguration, carbonTable)

    // get row count
    var rowCount = CarbonUpdateUtil.getRowCount(
      tableInputFormat.getBlockRowCount(
        job,
        carbonTable,
        CarbonFilters.getPartitions(
          Seq.empty,
          sparkSession,
          TableIdentifier(
            carbonTable.getTableName,
            Some(carbonTable.getDatabaseName))).map(_.asJava).orNull, false),
      carbonTable)

    if (CarbonProperties.isQueryStageInputEnabled) {
      // check for number of row for stage input
      val splits = StageInputCollector.createInputSplits(carbonTable, job.getConfiguration)
      if (!splits.isEmpty) {
        val df = DataLoadProcessBuilderOnSpark.createInputDataFrame(
          sparkSession, carbonTable, splits.asScala)
        rowCount += df.count()
      }
    }

    val valueRaw =
      attributesRaw.head.dataType match {
        case StringType => Seq(UTF8String.fromString(Long.box(rowCount).toString)).toArray
          .asInstanceOf[Array[Any]]
        case _ => Seq(Long.box(rowCount)).toArray.asInstanceOf[Array[Any]]
      }
    val value = new GenericInternalRow(valueRaw)
    val unsafeProjection = UnsafeProjection.create(output.map(_.dataType).toArray)
    val row = if (outUnsafeRows) unsafeProjection(value) else value
    sparkContext.parallelize(Seq(row))
  }

  override def output: Seq[Attribute] = {
    attributesRaw
  }

  private def createCarbonInputFormat(absoluteTableIdentifier: AbsoluteTableIdentifier
  ): (Job, CarbonTableInputFormat[Array[Object]]) = {
    val carbonInputFormat = new CarbonTableInputFormat[Array[Object]]()
    val jobConf: JobConf = new JobConf(FileFactory.getConfiguration)
    SparkHadoopUtil.get.addCredentials(jobConf)
    CarbonInputFormat.setTableInfo(jobConf, carbonTable.getTableInfo)
    val job = new Job(jobConf)
    FileInputFormat.addInputPath(job, new Path(absoluteTableIdentifier.getTablePath))
    CarbonInputFormat
      .setTransactionalTable(job.getConfiguration,
        carbonTable.getTableInfo.isTransactionalTable)
    CarbonInputFormatUtil.setIndexJobIfConfigured(job.getConfiguration)
    (job, carbonInputFormat)
  }
}