org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl Scala Examples

The following examples show how to use org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl. Each example is taken from an open-source project; the source file, project, and license are noted above the code.
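Most of the examples below share one pattern: because no real MapReduce job is running, they wrap a Hadoop Configuration and a synthetic TaskAttemptID in a TaskAttemptContextImpl, and use that context only to initialize a RecordReader. The following is a minimal sketch of that pattern; the object name, configuration, path, and split length are placeholders rather than code from any of the projects below.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.{JobID, TaskAttemptID, TaskID, TaskType}
import org.apache.hadoop.mapreduce.lib.input.{FileSplit, LineRecordReader}
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl

object TaskAttemptContextSketch {
  def main(args: Array[String]): Unit = {
    // A dummy attempt id (job 0, map task 0, attempt 0) is sufficient: the context is only
    // used to carry the configuration into RecordReader.initialize.
    val conf = new Configuration()
    val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
    val attemptContext = new TaskAttemptContextImpl(conf, attemptId)

    // Any mapreduce RecordReader can now be driven outside a MapReduce job, for example a
    // LineRecordReader over a placeholder split.
    val split = new FileSplit(new Path("/tmp/example.txt"), 0L, 1024L, Array.empty[String])
    val reader = new LineRecordReader()
    reader.initialize(split, attemptContext)
    while (reader.nextKeyValue()) {
      println(reader.getCurrentValue)
    }
    reader.close()
  }
}

The same construction appears essentially verbatim in the HadoopFileLinesReader variants below; only the RecordReader and split type change from project to project.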
Example 1
Source File: HadoopFileLinesReader.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import java.io.Closeable
import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.input.{FileSplit, LineRecordReader}
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl


class HadoopFileLinesReader(
    file: PartitionedFile, conf: Configuration) extends Iterator[Text] with Closeable {
  private val iterator = {
    val fileSplit = new FileSplit(
      new Path(new URI(file.filePath)),
      file.start,
      file.length,
      // TODO: Implement Locality
      Array.empty)
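    // A synthetic attempt id is enough here: no real MapReduce job is running, and the
    // TaskAttemptContextImpl only serves to hand the Hadoop configuration to the record reader.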
    val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
    val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId)
    val reader = new LineRecordReader()
    reader.initialize(fileSplit, hadoopAttemptContext)
    new RecordReaderIterator(reader)
  }

  override def hasNext: Boolean = iterator.hasNext

  override def next(): Text = iterator.next()

  override def close(): Unit = iterator.close()
} 
Example 2
Source File: HadoopFileLinesReader.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import java.io.Closeable
import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.input.{FileSplit, LineRecordReader}
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl


class HadoopFileLinesReader(
    file: PartitionedFile,
    lineSeparator: Option[Array[Byte]],
    conf: Configuration) extends Iterator[Text] with Closeable {

  def this(file: PartitionedFile, conf: Configuration) = this(file, None, conf)

  private val iterator = {
    val fileSplit = new FileSplit(
      new Path(new URI(file.filePath)),
      file.start,
      file.length,
      // TODO: Implement Locality
      Array.empty)
    val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
    val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId)

    val reader = lineSeparator match {
      case Some(sep) => new LineRecordReader(sep)
      // If the line separator is `None`, it covers `\r`, `\r\n` and `\n`.
      case _ => new LineRecordReader()
    }

    reader.initialize(fileSplit, hadoopAttemptContext)
    new RecordReaderIterator(reader)
  }

  override def hasNext: Boolean = iterator.hasNext

  override def next(): Text = iterator.next()

  override def close(): Unit = iterator.close()
} 
Example 3
Source File: HadoopFileWholeTextReader.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import java.io.Closeable
import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl

import org.apache.spark.input.WholeTextFileRecordReader


class HadoopFileWholeTextReader(file: PartitionedFile, conf: Configuration)
  extends Iterator[Text] with Closeable {
  private val iterator = {
    val fileSplit = new CombineFileSplit(
      Array(new Path(new URI(file.filePath))),
      Array(file.start),
      Array(file.length),
      // TODO: Implement Locality
      Array.empty[String])
    val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
    val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId)
    val reader = new WholeTextFileRecordReader(fileSplit, hadoopAttemptContext, 0)
    reader.initialize(fileSplit, hadoopAttemptContext)
    new RecordReaderIterator(reader)
  }

  override def hasNext: Boolean = iterator.hasNext

  override def next(): Text = iterator.next()

  override def close(): Unit = iterator.close()
} 
Example 4
Source File: HadoopFileExcelReader.scala    From spark-hadoopoffice-ds   with Apache License 2.0
package org.zuinnote.spark.office.excel

import java.io.Closeable
import java.net.URI

import org.apache.spark.sql.execution.datasources._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.ArrayWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.input.FileSplit
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl

import org.apache.spark.sql.execution.datasources.RecordReaderIterator

import org.zuinnote.hadoop.office.format.mapreduce.ExcelFileInputFormat
import org.zuinnote.hadoop.office.format.mapreduce.ExcelRecordReader

import org.apache.commons.logging.LogFactory
import org.apache.commons.logging.Log


class HadoopFileExcelReader(
  file: PartitionedFile, conf: Configuration) extends Iterator[ArrayWritable] with Closeable {
  val LOG = LogFactory.getLog(classOf[HadoopFileExcelReader])
  private var reader: RecordReader[Text, ArrayWritable] = null
  private val iterator = {
    val fileSplit = new FileSplit(
      new Path(new URI(file.filePath)),
      file.start,
      file.length,
      // TODO: implement locality (replace Array.empty with the locations)
      Array.empty)
    val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
    val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId)
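    // Unlike the line-based readers, the record reader is obtained from the HadoopOffice
    // ExcelFileInputFormat; the attempt context again carries the Hadoop configuration to it.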
    val inputFormat = new ExcelFileInputFormat()
    reader = inputFormat.createRecordReader(fileSplit, hadoopAttemptContext)
    reader.initialize(fileSplit, hadoopAttemptContext)
    new RecordReaderIterator(reader)
  }
  
  def getReader: RecordReader[Text, ArrayWritable] = reader

  override def hasNext: Boolean = iterator.hasNext

  override def next(): ArrayWritable = iterator.next()

  override def close(): Unit = {
    if (reader != null) {
      reader.close()
    }
  }
} 
Example 5
Source File: VectorizedFilePartitionReaderHandler.scala    From OAP   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.v2

import com.intel.sparkColumnarPlugin.datasource.VectorizedParquetArrowReader

import java.net.URI
import java.time.ZoneId

import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
import org.apache.hadoop.fs.Path

import org.apache.spark.sql.connector.read.{InputPartition, PartitionReaderFactory, PartitionReader}
import org.apache.spark.sql.execution.datasources.{FilePartition, PartitionedFile}
import org.apache.spark.sql.execution.datasources.v2.PartitionedFileReader
import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetPartitionReaderFactory
import org.apache.spark.sql.execution.datasources.v2.FilePartitionReader
import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector}

object VectorizedFilePartitionReaderHandler {
  def get(
      inputPartition: InputPartition,
      parquetReaderFactory: ParquetPartitionReaderFactory): FilePartitionReader[ColumnarBatch] = {
    val iter: Iterator[PartitionedFileReader[ColumnarBatch]] =
      inputPartition.asInstanceOf[FilePartition].files.toIterator.map { file =>
      val filePath = new Path(new URI(file.filePath))
      val split = new org.apache.parquet.hadoop.ParquetInputSplit(
        filePath,
        file.start,
        file.start + file.length,
        file.length,
        Array.empty,
        null)
      // val timestampConversion: Boolean = sqlConf.isParquetINT96TimestampConversion

      val capacity = 4096
      // partitionReaderFactory.createColumnarReader(inputPartition)
      val dataSchema = parquetReaderFactory.dataSchema
      val readDataSchema = parquetReaderFactory.readDataSchema

      val conf = parquetReaderFactory.broadcastedConf.value.value
      val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
      val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId)

      val vectorizedReader = new VectorizedParquetArrowReader(
        split.getPath().toString(), null, false, capacity, dataSchema, readDataSchema)
      vectorizedReader.initialize(split, hadoopAttemptContext)
      val partitionReader = new PartitionReader[ColumnarBatch] {
        override def next(): Boolean = vectorizedReader.nextKeyValue()
        override def get(): ColumnarBatch =
          vectorizedReader.getCurrentValue.asInstanceOf[ColumnarBatch]
        override def close(): Unit = vectorizedReader.close()
      }

      PartitionedFileReader(file, partitionReader)
    }
    new FilePartitionReader[ColumnarBatch](iter)
  }
} 
Example 6
Source File: OapIndexCommitProtocolSuite.scala    From OAP   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.oap.index

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.MRJobConfig
import org.apache.hadoop.mapreduce.TaskAttemptID
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.mapreduce.task.JobContextImpl
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl

import org.apache.spark.sql.test.oap.SharedOapContext
import org.apache.spark.util.Utils

class OapIndexCommitProtocolSuite extends SharedOapContext {
  test("newTaskTempFile") {
    val attempt = "attempt_200707121733_0001_m_000000_0"
    val taskID = TaskAttemptID.forName(attempt)
    val jobID = taskID.getJobID.toString
    val outDir = Utils.createTempDir().getAbsolutePath
    val job = Job.getInstance()
    FileOutputFormat.setOutputPath(job, new Path(outDir))
    val conf = job.getConfiguration()
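    // Mirror what MapReduce does at runtime: record the attempt id in the configuration
    // before building the job and task attempt contexts.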
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt)
    val jobContext = new JobContextImpl(conf, taskID.getJobID())
    val taskContext = new TaskAttemptContextImpl(conf, taskID)
    val commitProtocol = new OapIndexCommitProtocol(jobID, outDir)
    // test task temp path
    val pendingDirName = "_temporary_" + jobID
    commitProtocol.setupJob(jobContext)
    commitProtocol.setupTask(taskContext)
    val tempFile = new Path(commitProtocol.newTaskTempFile(taskContext, None, "test"))
    val expectedJobAttemptPath = new Path(new Path(outDir, pendingDirName), "0")
    val expectedTaskWorkPath = new Path(new Path(expectedJobAttemptPath, pendingDirName), attempt)
    assert(tempFile.getParent == expectedTaskWorkPath)
  }
} 
Example 7
Source File: HadoopFileLinesReader.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import java.io.Closeable
import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.input.{FileSplit, LineRecordReader}
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl


class HadoopFileLinesReader(
    file: PartitionedFile, conf: Configuration) extends Iterator[Text] with Closeable {
  private val iterator = {
    val fileSplit = new FileSplit(
      new Path(new URI(file.filePath)),
      file.start,
      file.length,
      // TODO: Implement Locality
      Array.empty)
    val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
    val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId)
    val reader = new LineRecordReader()
    reader.initialize(fileSplit, hadoopAttemptContext)
    new RecordReaderIterator(reader)
  }

  override def hasNext: Boolean = iterator.hasNext

  override def next(): Text = iterator.next()

  override def close(): Unit = iterator.close()
} 
Example 8
Source File: DistributedCountRDD.scala    From carbondata   with Apache License 2.0
package org.apache.carbondata.indexserver

import java.util.concurrent.Executors

import scala.collection.JavaConverters._
import scala.concurrent.{Await, ExecutionContext, ExecutionContextExecutor, Future}
import scala.concurrent.duration.Duration

import org.apache.hadoop.mapred.TaskAttemptID
import org.apache.hadoop.mapreduce.{InputSplit, TaskType}
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
import org.apache.spark.{Partition, SparkEnv, TaskContext}
import org.apache.spark.sql.SparkSession

import org.apache.carbondata.common.logging.LogServiceFactory
import org.apache.carbondata.core.cache.CacheProvider
import org.apache.carbondata.core.datastore.impl.FileFactory
import org.apache.carbondata.core.index.{IndexInputFormat, IndexStoreManager}
import org.apache.carbondata.core.index.dev.expr.IndexInputSplitWrapper
import org.apache.carbondata.core.util.{CarbonProperties, CarbonThreadFactory}
import org.apache.carbondata.spark.rdd.CarbonRDD


class DistributedCountRDD(@transient ss: SparkSession, indexInputFormat: IndexInputFormat)
  extends CarbonRDD[(String, String)](ss, Nil) {

  @transient private val LOGGER = LogServiceFactory.getLogService(classOf[DistributedPruneRDD]
    .getName)

  override protected def getPreferredLocations(split: Partition): Seq[String] = {
    if (split.asInstanceOf[IndexRDDPartition].getLocations != null) {
      split.asInstanceOf[IndexRDDPartition].getLocations.toSeq
    } else {
      Seq()
    }
  }

  override def internalCompute(split: Partition,
      context: TaskContext): Iterator[(String, String)] = {
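    // Derive a Hadoop task attempt id from the tracker id, RDD id and partition index, and
    // wrap the executor-side configuration from FileFactory in an attempt context.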
    val attemptId = new TaskAttemptID(DistributedRDDUtils.generateTrackerId,
      id, TaskType.MAP, split.index, 0)
    val attemptContext = new TaskAttemptContextImpl(FileFactory.getConfiguration, attemptId)
    val inputSplits = split.asInstanceOf[IndexRDDPartition].inputSplit
    val numOfThreads = CarbonProperties.getInstance().getNumOfThreadsForExecutorPruning
    val service = Executors
      .newFixedThreadPool(numOfThreads, new CarbonThreadFactory("IndexPruningPool", true))
    implicit val ec: ExecutionContextExecutor = ExecutionContext
      .fromExecutor(service)
    if (indexInputFormat.ifAsyncCall()) {
      // to clear cache of invalid segments during pre-priming in index server
      IndexStoreManager.getInstance().clearInvalidSegments(indexInputFormat.getCarbonTable,
        indexInputFormat.getInvalidSegments)
    }
    val futures = if (inputSplits.length <= numOfThreads) {
      inputSplits.map {
        split => generateFuture(Seq(split))
      }
    } else {
      DistributedRDDUtils.groupSplits(inputSplits, numOfThreads).map {
        splits => generateFuture(splits)
      }
    }
    // scalastyle:off awaitresult
    val results = Await.result(Future.sequence(futures), Duration.Inf).flatten
    // scalastyle:on awaitresult
    val executorIP = s"${ SparkEnv.get.blockManager.blockManagerId.host }_${
      SparkEnv.get.blockManager.blockManagerId.executorId
    }"
    val cacheSize = if (CacheProvider.getInstance().getCarbonCache != null) {
      CacheProvider.getInstance().getCarbonCache.getCurrentSize
    } else {
      0L
    }
    Iterator((executorIP + "_" + cacheSize.toString, results.map(_._2.toLong).sum.toString))
  }

  override protected def internalGetPartitions: Array[Partition] = {
    new DistributedPruneRDD(ss, indexInputFormat).partitions
  }

  private def generateFuture(split: Seq[InputSplit])
    (implicit executionContext: ExecutionContext) = {
    Future {
      val segments = split.map { inputSplit =>
        val distributable = inputSplit.asInstanceOf[IndexInputSplitWrapper]
        distributable.getDistributable.getSegment
          .setReadCommittedScope(indexInputFormat.getReadCommittedScope)
        distributable.getDistributable.getSegment
      }
      val defaultIndex = IndexStoreManager.getInstance
        .getIndex(indexInputFormat.getCarbonTable, split.head
          .asInstanceOf[IndexInputSplitWrapper].getDistributable.getIndexSchema)
      defaultIndex.getBlockRowCount(defaultIndex, segments.toList.asJava, indexInputFormat
        .getPartitions).asScala
    }
  }

} 
Example 9
Source File: HadoopFileLinesReader.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import java.io.Closeable
import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.input.{FileSplit, LineRecordReader}
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl


class HadoopFileLinesReader(
    file: PartitionedFile, conf: Configuration) extends Iterator[Text] with Closeable {
  private val iterator = {
    val fileSplit = new FileSplit(
      new Path(new URI(file.filePath)),
      file.start,
      file.length,
      // TODO: Implement Locality
      Array.empty)
    val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
    val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId)
    val reader = new LineRecordReader()
    reader.initialize(fileSplit, hadoopAttemptContext)
    new RecordReaderIterator(reader)
  }

  override def hasNext: Boolean = iterator.hasNext

  override def next(): Text = iterator.next()

  override def close(): Unit = iterator.close()
} 
Example 10
Source File: HadoopFileLinesReader.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import java.io.Closeable
import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.input.{FileSplit, LineRecordReader}
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl


class HadoopFileLinesReader(
    file: PartitionedFile, conf: Configuration) extends Iterator[Text] with Closeable {
  private val iterator = {
    val fileSplit = new FileSplit(
      new Path(new URI(file.filePath)),
      file.start,
      file.length,
      // TODO: Implement Locality
      Array.empty)
    val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
    val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId)
    val reader = new LineRecordReader()
    reader.initialize(fileSplit, hadoopAttemptContext)
    new RecordReaderIterator(reader)
  }

  override def hasNext: Boolean = iterator.hasNext

  override def next(): Text = iterator.next()

  override def close(): Unit = iterator.close()
} 
Example 11
Source File: HadoopFileWholeTextReader.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import java.io.Closeable
import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl

import org.apache.spark.input.WholeTextFileRecordReader


class HadoopFileWholeTextReader(file: PartitionedFile, conf: Configuration)
  extends Iterator[Text] with Closeable {
  private val iterator = {
    val fileSplit = new CombineFileSplit(
      Array(new Path(new URI(file.filePath))),
      Array(file.start),
      Array(file.length),
      // TODO: Implement Locality
      Array.empty[String])
    val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
    val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId)
    val reader = new WholeTextFileRecordReader(fileSplit, hadoopAttemptContext, 0)
    reader.initialize(fileSplit, hadoopAttemptContext)
    new RecordReaderIterator(reader)
  }

  override def hasNext: Boolean = iterator.hasNext

  override def next(): Text = iterator.next()

  override def close(): Unit = iterator.close()
} 
Example 12
Source File: HadoopLineIterator.scala    From glow   with Apache License 2.0
package io.projectglow.sql.util

import java.io.Closeable
import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.input.{FileSplit, LineRecordReader}
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
import org.apache.spark.sql.execution.datasources.RecordReaderIterator

import io.projectglow.common.GlowLogging


class HadoopLineIterator(
    path: String,
    start: Long,
    length: Long,
    lineSeparator: Option[Array[Byte]],
    conf: Configuration)
    extends Iterator[Text]
    with Closeable
    with GlowLogging {

  private val iterator = {
    val fileSplit = new FileSplit(
      new Path(new URI(path)),
      start,
      length,
      // TODO: Implement Locality
      Array.empty
    )
    val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
    val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId)

    val reader = lineSeparator match {
      case Some(sep) => new LineRecordReader(sep)
      // If the line separator is `None`, it covers `\r`, `\r\n` and `\n`.
      case _ => new LineRecordReader()
    }

    reader.initialize(fileSplit, hadoopAttemptContext)
    new RecordReaderIterator(reader)
  }

  override def hasNext: Boolean = {
    iterator.hasNext
  }

  override def next(): Text = {
    iterator.next()
  }

  override def close(): Unit = {
    iterator.close()
  }
}