org.apache.spark.rdd.NewHadoopRDD Scala Examples

The following examples show how to use org.apache.spark.rdd.NewHadoopRDD. Each example is drawn from an open-source project; the source file, project name, and license are noted above the code.
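Before the project examples, here is a minimal sketch of constructing a NewHadoopRDD directly from a new-API (org.apache.hadoop.mapreduce) InputFormat. The input path, local master, and configuration key usage are illustrative placeholders, not taken from the examples below.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.SparkContext
import org.apache.spark.rdd.NewHadoopRDD

object NewHadoopRDDQuickStart {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local[*]", "NewHadoopRDD-quickstart")

    // Clone the Hadoop configuration and point the file input format at an input path (placeholder).
    val conf = new Configuration(sc.hadoopConfiguration)
    conf.set("mapreduce.input.fileinputformat.inputdir", "/tmp/input.txt")

    // NewHadoopRDD yields the (key, value) pairs produced by the new-API InputFormat.
    val rdd = new NewHadoopRDD(sc, classOf[TextInputFormat], classOf[LongWritable], classOf[Text], conf)

    rdd.map(_._2.toString).take(5).foreach(println)
    sc.stop()
  }
}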
Example 1
Source File: JavaNewHadoopRDD.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.api.java

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.NewHadoopRDD

@DeveloperApi
class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  /** Maps over a partition, providing the InputSplit that was used as the base of the partition. */
  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, b.asJava).asScala,
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
} 
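For context, a minimal Scala-side sketch of the capability this wrapper exposes to Java users: mapping over each partition together with the InputSplit it was built from. The cast relies on newAPIHadoopFile returning a NewHadoopRDD under the hood; the input path and master are placeholders.

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.{FileSplit, TextInputFormat}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.NewHadoopRDD

val sc = new SparkContext("local[*]", "input-split-example")

val pairs = sc.newAPIHadoopFile[LongWritable, Text, TextInputFormat]("/tmp/input.txt") // placeholder path

val linesWithFile = pairs.asInstanceOf[NewHadoopRDD[LongWritable, Text]]
  .mapPartitionsWithInputSplit { (split, iter) =>
    // For file-based input formats the split is a FileSplit carrying the file path.
    val file = split.asInstanceOf[FileSplit].getPath.toString
    iter.map { case (_, line) => (file, line.toString) }
  }

linesWithFile.take(5).foreach(println)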
Example 2
Source File: JavaNewHadoopRDD.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.api.java

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.NewHadoopRDD

@DeveloperApi
class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  /** Maps over a partition, providing the InputSplit that was used as the base of the partition. */
  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, b.asJava).asScala,
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
} 
Example 3
Source File: JavaNewHadoopRDD.scala    From SparkCore   with Apache License 2.0
package org.apache.spark.api.java

import scala.collection.JavaConversions._
import scala.reflect.ClassTag

import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.NewHadoopRDD

@DeveloperApi
class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  /** Maps over a partition, providing the InputSplit that was used as the base of the partition. */
  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, asJavaIterator(b)),
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
} 
Example 4
Source File: ProtoParquetRDD.scala    From sparksql-protobuf   with Apache License 2.0
package com.github.saurfang.parquet.proto.spark

import com.github.saurfang.parquet.proto.ProtoMessageParquetInputFormat
import com.google.protobuf.AbstractMessage
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapred.{FileInputFormat, JobConf}
import org.apache.parquet.proto.ProtoReadSupport
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.{NewHadoopRDD, RDD}
import org.apache.spark.{Partition, SparkContext, TaskContext}

import scala.reflect.ClassTag

class ProtoParquetRDD[T <: AbstractMessage : ClassTag](
                                                        sc: SparkContext,
                                                        input: String,
                                                        protoClass: Class[T],
                                                        @transient conf: Configuration
                                                        ) extends RDD[T](sc, Nil) {

  def this(sc: SparkContext, input: String, protoClass: Class[T]) = {
    this(sc, input, protoClass, sc.hadoopConfiguration)
  }

  lazy private[this] val rdd = {
    val jconf = new JobConf(conf)
    FileInputFormat.setInputPaths(jconf, input)
    ProtoReadSupport.setProtobufClass(jconf, protoClass.getName)

    new NewHadoopRDD(sc, classOf[ProtoMessageParquetInputFormat[T]], classOf[Void], protoClass, jconf)
  }

  @DeveloperApi
  override def compute(split: Partition, context: TaskContext): Iterator[T] = rdd.compute(split, context).map(_._2)

  override protected def getPartitions: Array[Partition] = rdd.getPartitions
} 
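A hypothetical usage sketch for the RDD above. Person stands in for a protobuf-generated message class compiled into the job, and the Parquet path is a placeholder.

// Hypothetical usage: Person is assumed to be a protobuf-generated class, and the path
// points at Parquet data written with protobuf write support.
import com.github.saurfang.parquet.proto.spark.ProtoParquetRDD
import org.apache.spark.SparkContext

val sc = new SparkContext("local[*]", "proto-parquet-example")
val persons = new ProtoParquetRDD(sc, "/data/persons.parquet", classOf[Person])

persons.take(10).foreach(println)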
Example 5
Source File: JavaNewHadoopRDD.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.api.java

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.NewHadoopRDD

@DeveloperApi
class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  /** Maps over a partition, providing the InputSplit that was used as the base of the partition. */
  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, b.asJava).asScala,
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
} 
Example 6
Source File: JavaNewHadoopRDD.scala    From iolap   with Apache License 2.0
package org.apache.spark.api.java

import scala.collection.JavaConversions._
import scala.reflect.ClassTag

import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.NewHadoopRDD

@DeveloperApi
class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  /** Maps over a partition, providing the InputSplit that was used as the base of the partition. */
  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, asJavaIterator(b)),
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
} 
Example 7
Source File: HBaseSimpleRDD.scala    From spark-hbase-connector   with Apache License 2.0
package it.nerdammer.spark.hbase

import it.nerdammer.spark.hbase.conversion.FieldReader
import org.apache.hadoop.hbase.CellUtil
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.{NewHadoopRDD, RDD}
import org.apache.spark.{Partition, TaskContext}

import scala.reflect.ClassTag

class HBaseSimpleRDD[R: ClassTag](hadoopHBase: NewHadoopRDD[ImmutableBytesWritable, Result], builder: HBaseReaderBuilder[R], saltingLength: Int = 0)
                       (implicit mapper: FieldReader[R], saltingProvider: SaltingProviderFactory[String]) extends RDD[R](hadoopHBase) {

  override def getPartitions: Array[Partition] = firstParent[(ImmutableBytesWritable, Result)].partitions

  override def compute(split: Partition, context: TaskContext) = {
    // val cleanConversion = sc.clean ---> next version
    firstParent[(ImmutableBytesWritable, Result)].iterator(split, context)
      .map(e => conversion(e._1, e._2))
  }

  def conversion(key: ImmutableBytesWritable, row: Result) = {

    val columnNames = HBaseUtils.chosenColumns(builder.columns, mapper.columns)

    val columnNamesFC = HBaseUtils.columnsWithFamily(builder.columnFamily, columnNames)

    val columns = columnNamesFC
      .map(t => (Bytes.toBytes(t._1), Bytes.toBytes(t._2)))
      .map(t => if(row.containsColumn(t._1, t._2)) Some(CellUtil.cloneValue(row.getColumnLatestCell(t._1, t._2)).array) else None)
      .toList

    mapper.map(Some(key.get.drop(saltingLength)) :: columns)
  }
} 
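In practice, instances of this RDD are produced by the connector's reader builder rather than constructed by hand. A rough sketch of that entry point, assuming the builder API documented by spark-hbase-connector; the host, table name, column names, and tuple shape are placeholders (per the conversion above, the row key is prepended to the selected columns).

import it.nerdammer.spark.hbase._
import org.apache.spark.{SparkConf, SparkContext}

val sparkConf = new SparkConf()
  .setAppName("hbase-simple-rdd-example")
  .set("spark.hbase.host", "hbase-host") // placeholder HBase/ZooKeeper host

val sc = new SparkContext(sparkConf)

// hbaseTable returns a reader builder; the connector converts it to an RDD
// (ultimately an HBaseSimpleRDD) when it is consumed as one.
val rows = sc.hbaseTable[(String, String, String)]("my_table")
  .select("column1", "column2")
  .inColumnFamily("cf")

rows.take(10).foreach(println)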
Example 8
Source File: JavaNewHadoopRDD.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.api.java

import scala.collection.JavaConversions._
import scala.reflect.ClassTag

import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.NewHadoopRDD

@DeveloperApi
class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  /** Maps over a partition, providing the InputSplit that was used as the base of the partition. */
  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, asJavaIterator(b)),
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
} 
Example 9
Source File: JavaNewHadoopRDD.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.api.java

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.NewHadoopRDD

@DeveloperApi
class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  /** Maps over a partition, providing the InputSplit that was used as the base of the partition. */
  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, b.asJava).asScala,
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
} 
Example 10
Source File: JavaNewHadoopRDD.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.api.java

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.NewHadoopRDD

@DeveloperApi
class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  /** Maps over a partition, providing the InputSplit that was used as the base of the partition. */
  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, b.asJava).asScala,
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
} 
Example 11
Source File: featureCounts.scala    From bdg-sequila   with Apache License 2.0
import htsjdk.samtools.ValidationStringency
import org.apache.hadoop.io.LongWritable
import org.apache.spark.SparkContext
import org.apache.spark.rdd.NewHadoopRDD
import org.biodatageeks.sequila.rangejoins.IntervalTree.IntervalTreeJoinStrategyOptim
import org.biodatageeks.sequila.rangejoins.common.metrics.MetricsCollector
import org.seqdoop.hadoop_bam.{BAMInputFormat, FileVirtualSplit, SAMRecordWritable}
import org.seqdoop.hadoop_bam.util.SAMHeaderReader

val metricsTable = "granges.metrics"

sc.hadoopConfiguration.set(SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY, ValidationStringency.SILENT.toString)
case class PosRecord(contigName:String,start:Int,end:Int)
spark.experimental.extraStrategies = new IntervalTreeJoinStrategyOptim(spark) :: Nil

val alignments = sc.newAPIHadoopFile[LongWritable, SAMRecordWritable, BAMInputFormat]("/data/granges/NA12878.ga2.exome.maq.recal.bam").map(_._2.get).map(r=>PosRecord(r.getContig,r.getStart,r.getEnd))

val reads=alignments.toDF
reads.createOrReplaceTempView("reads")

val targets = spark.read.parquet("/data/granges/tgp_exome_hg18.adam")
targets.createOrReplaceTempView("targets")

val query="""    SELECT targets.contigName,targets.start,targets.end,count(*) FROM reads JOIN targets
            |         ON (targets.contigName=reads.contigName
            |         AND
            |         CAST(reads.end AS INTEGER)>=CAST(targets.start AS INTEGER)
            |         AND
            |         CAST(reads.start AS INTEGER)<=CAST(targets.end AS INTEGER)
            |         )
            |         GROUP BY targets.contigName,targets.start,targets.end"""


spark.sqlContext.setConf("spark.biodatageeks.rangejoin.maxBroadcastSize", (100 * 1024 * 1024).toString)
val mc = new MetricsCollector(spark, metricsTable)
mc.initMetricsTable
mc.runAndCollectMetrics(
  "q_featurecounts_bam_wes",
  "spark_granges_it_bc_all",
  Array("reads","targets"),
  query,
  true
)

// Repeat the query and metrics collection on the ADAM (Parquet) versions of the same inputs.
val reads = spark.read.parquet("/data/granges/NA12878.ga2.exome.maq.recal.adam")
reads.createOrReplaceTempView("reads")

val targets = spark.read.parquet("/data/granges/tgp_exome_hg18.adam")
targets.createOrReplaceTempView("targets")
val mc = new MetricsCollector(spark, metricsTable)
mc.initMetricsTable
mc.runAndCollectMetrics(
  "q_featurecounts_adam_wes",
  "spark_granges_it_bc_all",
  Array("reads","targets"),
  query,
  true
) 
Example 12
Source File: NewHBaseRDD.scala    From hbase-connectors   with Apache License 2.0
package org.apache.hadoop.hbase.spark

import org.apache.hadoop.conf.Configuration
import org.apache.yetus.audience.InterfaceAudience
import org.apache.hadoop.mapreduce.InputFormat
import org.apache.spark.rdd.NewHadoopRDD
import org.apache.spark.{InterruptibleIterator, Partition, SparkContext, TaskContext}

@InterfaceAudience.Public
class NewHBaseRDD[K,V](@transient val sc : SparkContext,
                       @transient val inputFormatClass: Class[_ <: InputFormat[K, V]],
                       @transient val keyClass: Class[K],
                       @transient val valueClass: Class[V],
                       @transient private val __conf: Configuration,
                       val hBaseContext: HBaseContext) extends NewHadoopRDD(sc, inputFormatClass, keyClass, valueClass, __conf) {

  override def compute(theSplit: Partition, context: TaskContext): InterruptibleIterator[(K, V)] = {
    hBaseContext.applyCreds()
    super.compute(theSplit, context)
  }
}
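A minimal construction sketch for the class above, assuming an HBase table read through TableInputFormat; the table name, host configuration, and local master are placeholders.

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.spark.{HBaseContext, NewHBaseRDD}
import org.apache.spark.SparkContext

val sc = new SparkContext("local[*]", "new-hbase-rdd-example")

// Placeholder HBase configuration; in a real job this must point at the cluster.
val hbaseConf = HBaseConfiguration.create()
hbaseConf.set(TableInputFormat.INPUT_TABLE, "my_table")

// HBaseContext carries connection and credential handling for the RDD's compute().
val hbaseContext = new HBaseContext(sc, hbaseConf)

val hbaseRDD = new NewHBaseRDD(
  sc,
  classOf[TableInputFormat],
  classOf[ImmutableBytesWritable],
  classOf[Result],
  hbaseConf,
  hbaseContext)

println(hbaseRDD.count())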