org.apache.spark.sql.catalyst.expressions.GenericRow Scala Examples

The following examples show how to use org.apache.spark.sql.catalyst.expressions.GenericRow. Each example notes its source file, the open-source project it was taken from, and that project's license.
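As a quick orientation before the project examples, here is a minimal stand-alone sketch (not taken from any of the projects below) of the two classes these examples revolve around: GenericRow wraps an Array[Any] and supports only positional access, while GenericRowWithSchema also carries a StructType and therefore supports access by field name. All names and values in the sketch are illustrative.

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.{GenericRow, GenericRowWithSchema}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

object GenericRowSketch {
  def main(args: Array[String]): Unit = {
    // GenericRow wraps an Array[Any]; values can only be read by position.
    val plain: Row = new GenericRow(Array[Any]("value1", 42))
    println(plain.getString(0)) // value1

    // GenericRowWithSchema additionally carries a StructType, enabling access by field name.
    val schema = StructType(Seq(
      StructField("name", StringType),
      StructField("age", IntegerType)))
    val withSchema: Row = new GenericRowWithSchema(Array[Any]("value1", 42), schema)
    println(withSchema.getAs[Int]("age")) // 42
  }
}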
Example 1
Source File: ConfigurableDataGeneratorMain.scala    From Spark.TableStatsExample   with Apache License 2.0
package com.cloudera.sa.examples.tablestats

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRow
import org.apache.spark.sql.types.{StringType, LongType, StructField, StructType}
import org.apache.spark.{SparkContext, SparkConf}

import scala.collection.mutable
import scala.util.Random



object ConfigurableDataGeneratorMain {
  def main(args: Array[String]): Unit = {

    if (args.length < 4) {
      println("ConfigurableDataGeneratorMain <outputPath> <numberOfColumns> <numberOfRecords> <numberOfPartitions> <local>")
      return
    }

    val outputPath = args(0)
    val numberOfColumns = args(1).toInt
    val numberOfRecords = args(2).toInt
    val numberOfPartitions = args(3).toInt
    val runLocal = (args.length == 5 && args(4).equals("L"))

    var sc: SparkContext = null
    if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      sc = new SparkContext("local", "test", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("ConfigurableDataGeneratorMain")
      sc = new SparkContext(sparkConfig)
    }

    val sqlContext = new org.apache.spark.sql.SQLContext(sc)

    // Part A: create one element per target output partition to seed the generator
    val rowRDD = sc.parallelize( (0 until numberOfPartitions).map( i => i), numberOfPartitions)

    // Part B: in each partition, generate random rows with alternating Long and String column values
    val megaDataRDD = rowRDD.flatMap( r => {
      val random = new Random()

      val dataRange = (0 until numberOfRecords/numberOfPartitions).iterator
      dataRange.map[Row]( x => {
        val values = new mutable.ArrayBuffer[Any]
        for (i <- 0 until numberOfColumns) {
          if (i % 2 == 0) {
            values.+=(random.nextInt(100).toLong)
          } else {
            values.+=(random.nextInt(100).toString)
          }
        }
        new GenericRow(values.toArray)
      })
    })

    // Part C: build the matching schema and write the generated data out as Parquet
    val schema =
      StructType(
        (0 until numberOfColumns).map( i => {
          if (i % 2 == 0) {
            StructField("longColumn_" + i, LongType, true) }
          else {
            StructField("stringColumn_" + i, StringType, true)
          }
        })
      )
    val df = sqlContext.createDataFrame(megaDataRDD, schema)
    df.saveAsParquetFile(outputPath)

    // Part D: stop the SparkContext
    sc.stop()
  }
} 
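To try the generator without packaging it for spark-submit, a direct call to main() along the following lines should work; the argument values are illustrative and follow the usage string the program prints.

// Illustrative invocation: output path, column count, record count, partition count,
// and the "L" flag that makes main() create a local SparkContext.
ConfigurableDataGeneratorMain.main(Array(
  "/tmp/configurable-data-generator-output",
  "10",      // numberOfColumns
  "100000",  // numberOfRecords
  "4",       // numberOfPartitions
  "L"))      // run locally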
Example 2
Source File: HailKryoRegistrator.scala    From hail   with MIT License
package is.hail.kryo

import com.esotericsoftware.kryo.Kryo
import com.esotericsoftware.kryo.serializers.JavaSerializer
import is.hail.annotations.{Region, UnsafeIndexedSeq, UnsafeRow}
import is.hail.utils.{Interval, SerializableHadoopConfiguration}
import is.hail.variant.Locus
import org.apache.spark.serializer.KryoRegistrator
import org.apache.spark.sql.catalyst.expressions.GenericRow

class HailKryoRegistrator extends KryoRegistrator {
  override def registerClasses(kryo: Kryo) {
    kryo.register(classOf[SerializableHadoopConfiguration], new JavaSerializer())
    kryo.register(classOf[UnsafeRow])
    kryo.register(classOf[GenericRow])
    kryo.register(classOf[Locus])
    kryo.register(classOf[Interval])
    kryo.register(classOf[UnsafeIndexedSeq])
    kryo.register(classOf[Region])
  }
} 
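The registrator only takes effect if Spark is configured to use Kryo and pointed at this class. Hail performs that wiring in its own entry points; a minimal hand-rolled equivalent would look roughly like the sketch below (the configuration keys are standard Spark settings, the application name is illustrative).

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

// Enable Kryo serialization and register HailKryoRegistrator for the session.
val conf = new SparkConf()
  .setAppName("hail-kryo-sketch")
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  .set("spark.kryo.registrator", "is.hail.kryo.HailKryoRegistrator")

val spark = SparkSession.builder().config(conf).getOrCreate()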
Example 3
Source File: RowTest.scala    From iolap   with Apache License 2.0
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.expressions.{GenericRow, GenericRowWithSchema}
import org.apache.spark.sql.types._
import org.scalatest.{Matchers, FunSpec}

class RowTest extends FunSpec with Matchers {

  val schema = StructType(
    StructField("col1", StringType) ::
    StructField("col2", StringType) ::
    StructField("col3", IntegerType) :: Nil)
  val values = Array("value1", "value2", 1)

  val sampleRow: Row = new GenericRowWithSchema(values, schema)
  val noSchemaRow: Row = new GenericRow(values)

  describe("Row (without schema)") {
    it("throws an exception when accessing by fieldName") {
      intercept[UnsupportedOperationException] {
        noSchemaRow.fieldIndex("col1")
      }
      intercept[UnsupportedOperationException] {
        noSchemaRow.getAs("col1")
      }
    }
  }

  describe("Row (with schema)") {
    it("fieldIndex(name) returns field index") {
      sampleRow.fieldIndex("col1") shouldBe 0
      sampleRow.fieldIndex("col3") shouldBe 2
    }

    it("getAs[T] retrieves a value by fieldname") {
      sampleRow.getAs[String]("col1") shouldBe "value1"
      sampleRow.getAs[Int]("col3") shouldBe 1
    }

    it("Accessing non existent field throws an exception") {
      intercept[IllegalArgumentException] {
        sampleRow.getAs[String]("non_existent")
      }
    }

    it("getValuesMap() retrieves values of multiple fields as a Map(field -> value)") {
      val expected = Map(
        "col1" -> "value1",
        "col2" -> "value2"
      )
      sampleRow.getValuesMap(List("col1", "col2")) shouldBe expected
    }
  }
} 
Example 4
Source File: RowTest.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{GenericRow, GenericRowWithSchema}
import org.apache.spark.sql.types._
import org.scalatest.{Matchers, FunSpec}

class RowTest extends FunSpec with Matchers {

  val schema = StructType(
    StructField("col1", StringType) ::
    StructField("col2", StringType) ::
    StructField("col3", IntegerType) :: Nil)//列表结尾为Nil
  val values = Array("value1", "value2", 1)

  val sampleRow: Row = new GenericRowWithSchema(values, schema)
  val noSchemaRow: Row = new GenericRow(values)
  // Row (without schema)
  describe("Row (without schema)") {
    // accessing by fieldName throws an exception
    it("throws an exception when accessing by fieldName") {
      intercept[UnsupportedOperationException] {
        noSchemaRow.fieldIndex("col1")
      }
      intercept[UnsupportedOperationException] {
        noSchemaRow.getAs("col1")
      }
    }
  }
  // Row (with schema)
  describe("Row (with schema)") {
    // fieldIndex(name) returns the field index
    it("fieldIndex(name) returns field index") {
      sampleRow.fieldIndex("col1") shouldBe 0
      sampleRow.fieldIndex("col3") shouldBe 2
    }
    // getAs[T] retrieves a value by field name
    it("getAs[T] retrieves a value by fieldname") {
      sampleRow.getAs[String]("col1") shouldBe "value1"
      sampleRow.getAs[Int]("col3") shouldBe 1
    }
    // accessing a non-existent field throws an exception
    it("Accessing non existent field throws an exception") {
      intercept[IllegalArgumentException] {
        sampleRow.getAs[String]("non_existent")
      }
    }
    // getValuesMap() retrieves the values of multiple fields as a Map(field -> value)
    it("getValuesMap() retrieves values of multiple fields as a Map(field -> value)") {
      val expected = Map(
        "col1" -> "value1",
        "col2" -> "value2"
      )
      sampleRow.getValuesMap(List("col1", "col2")) shouldBe expected
    }
  }
  // row equality
  describe("row equals") {
    val externalRow = Row(1, 2)
    val externalRow2 = Row(1, 2)
    val internalRow = InternalRow(1, 2)
    val internalRow2 = InternalRow(1, 2)
    // equality check for external rows
    it("equality check for external rows") {
      externalRow shouldEqual externalRow2
    }
    // equality check for internal rows
    it("equality check for internal rows") {
      internalRow shouldEqual internalRow2
    }
  }
} 
Example 5
Source File: RowTest.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{GenericRow, GenericRowWithSchema}
import org.apache.spark.sql.types._
import org.scalatest.{Matchers, FunSpec}

class RowTest extends FunSpec with Matchers {

  val schema = StructType(
    StructField("col1", StringType) ::
    StructField("col2", StringType) ::
    StructField("col3", IntegerType) :: Nil)
  val values = Array("value1", "value2", 1)
  val valuesWithoutCol3 = Array[Any](null, "value2", null)

  val sampleRow: Row = new GenericRowWithSchema(values, schema)
  val sampleRowWithoutCol3: Row = new GenericRowWithSchema(valuesWithoutCol3, schema)
  val noSchemaRow: Row = new GenericRow(values)

  describe("Row (without schema)") {
    it("throws an exception when accessing by fieldName") {
      intercept[UnsupportedOperationException] {
        noSchemaRow.fieldIndex("col1")
      }
      intercept[UnsupportedOperationException] {
        noSchemaRow.getAs("col1")
      }
    }
  }

  describe("Row (with schema)") {
    it("fieldIndex(name) returns field index") {
      sampleRow.fieldIndex("col1") shouldBe 0
      sampleRow.fieldIndex("col3") shouldBe 2
    }

    it("getAs[T] retrieves a value by fieldname") {
      sampleRow.getAs[String]("col1") shouldBe "value1"
      sampleRow.getAs[Int]("col3") shouldBe 1
    }

    it("Accessing non existent field throws an exception") {
      intercept[IllegalArgumentException] {
        sampleRow.getAs[String]("non_existent")
      }
    }

    it("getValuesMap() retrieves values of multiple fields as a Map(field -> value)") {
      val expected = Map(
        "col1" -> "value1",
        "col2" -> "value2"
      )
      sampleRow.getValuesMap(List("col1", "col2")) shouldBe expected
    }

    it("getValuesMap() retrieves null value on non AnyVal Type") {
      val expected = Map(
        "col1" -> null,
        "col2" -> "value2"
      )
      sampleRowWithoutCol3.getValuesMap[String](List("col1", "col2")) shouldBe expected
    }

    it("getAs() on type extending AnyVal throws an exception when accessing field that is null") {
      intercept[NullPointerException] {
        sampleRowWithoutCol3.getInt(sampleRowWithoutCol3.fieldIndex("col3"))
      }
    }

    it("getAs() on type extending AnyVal does not throw exception when value is null"){
      sampleRowWithoutCol3.getAs[String](sampleRowWithoutCol3.fieldIndex("col1")) shouldBe null
    }
  }

  describe("row equals") {
    val externalRow = Row(1, 2)
    val externalRow2 = Row(1, 2)
    val internalRow = InternalRow(1, 2)
    val internalRow2 = InternalRow(1, 2)

    it("equality check for external rows") {
      externalRow shouldEqual externalRow2
    }

    it("equality check for internal rows") {
      internalRow shouldEqual internalRow2
    }
  }
} 
Example 6
Source File: SparkAvroDecoder.scala    From cloudflow   with Apache License 2.0
package cloudflow.spark.avro

import org.apache.log4j.Logger

import java.io.ByteArrayOutputStream

import scala.reflect.runtime.universe._

import org.apache.avro.generic.{ GenericDatumReader, GenericDatumWriter, GenericRecord }
import org.apache.avro.io.{ DecoderFactory, EncoderFactory }
import org.apache.spark.sql.{ Dataset, Encoder, Row }
import org.apache.spark.sql.catalyst.encoders.{ encoderFor, ExpressionEncoder, RowEncoder }
import org.apache.spark.sql.catalyst.expressions.GenericRow
import org.apache.spark.sql.types.StructType
import org.apache.avro.Schema

import cloudflow.spark.sql.SQLImplicits._

case class EncodedKV(key: String, value: Array[Byte])

case class SparkAvroDecoder[T: Encoder: TypeTag](avroSchema: String) {

  val encoder: Encoder[T]                           = implicitly[Encoder[T]]
  val sqlSchema: StructType                         = encoder.schema
  val encoderForDataColumns: ExpressionEncoder[Row] = RowEncoder(sqlSchema)
  @transient lazy val _avroSchema                   = new Schema.Parser().parse(avroSchema)
  @transient lazy val rowConverter                  = SchemaConverters.createConverterToSQL(_avroSchema, sqlSchema)
  @transient lazy val datumReader                   = new GenericDatumReader[GenericRecord](_avroSchema)
  @transient lazy val decoder                       = DecoderFactory.get
  def decode(bytes: Array[Byte]): Row = {
    val binaryDecoder = decoder.binaryDecoder(bytes, null)
    val record        = datumReader.read(null, binaryDecoder)
    rowConverter(record).asInstanceOf[GenericRow]
  }

}


case class SparkAvroEncoder[T: Encoder: TypeTag](avroSchema: String) {

  @transient lazy val log = Logger.getLogger(getClass.getName)

  val BufferSize = 5 * 1024 // 5 Kb

  val encoder                     = implicitly[Encoder[T]]
  val sqlSchema                   = encoder.schema
  @transient lazy val _avroSchema = new Schema.Parser().parse(avroSchema)

  val recordName                = "topLevelRecord" // ???
  val recordNamespace           = "recordNamespace" // ???
  @transient lazy val converter = AvroConverter.createConverterToAvro(sqlSchema, recordName, recordNamespace)

  // Risk: This process is memory intensive. Might require thread-level buffers to optimize memory usage
  def rowToBytes(row: Row): Array[Byte] = {
    val genRecord = converter(row).asInstanceOf[GenericRecord]
    if (log.isDebugEnabled) log.debug(s"genRecord = $genRecord")
    val datumWriter   = new GenericDatumWriter[GenericRecord](_avroSchema)
    val avroEncoder   = EncoderFactory.get
    val byteArrOS     = new ByteArrayOutputStream(BufferSize)
    val binaryEncoder = avroEncoder.binaryEncoder(byteArrOS, null)
    datumWriter.write(genRecord, binaryEncoder)
    binaryEncoder.flush()
    byteArrOS.toByteArray
  }

  def encode(dataset: Dataset[T]): Dataset[Array[Byte]] =
    dataset.toDF().mapPartitions(rows ⇒ rows.map(rowToBytes)).as[Array[Byte]]

  // Note to self: I'm not sure how heavy this chain of transformations is
  def encodeWithKey(dataset: Dataset[T], keyFun: T ⇒ String): Dataset[EncodedKV] = {
    val encoder             = encoderFor[T]
    implicit val rowEncoder = RowEncoder(encoder.schema).resolveAndBind()
    dataset.map { value ⇒
      val key         = keyFun(value)
      val internalRow = encoder.toRow(value)
      val row         = rowEncoder.fromRow(internalRow)
      val bytes       = rowToBytes(row)
      EncodedKV(key, bytes)
    }
  }

} 
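A rough usage sketch for the two case classes above, assuming they and the SchemaConverters/AvroConverter helpers they reference are on the classpath. The Sensor case class and Avro schema string are illustrative, and the implicit Encoder here comes from spark.implicits._ rather than cloudflow's SQLImplicits.

import org.apache.spark.sql.{Row, SparkSession}

case class Sensor(id: String, value: Double)

object AvroRoundTripSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("avro-roundtrip").getOrCreate()
    import spark.implicits._

    // Avro schema matching the Sensor case class.
    val avroSchemaJson =
      """{"type":"record","name":"Sensor","fields":[
        |  {"name":"id","type":"string"},
        |  {"name":"value","type":"double"}]}""".stripMargin

    val avroEncoder = SparkAvroEncoder[Sensor](avroSchemaJson)
    val avroDecoder = SparkAvroDecoder[Sensor](avroSchemaJson)

    // Encode a Dataset[Sensor] to Avro bytes, then decode one record back into a Row.
    val ds = Seq(Sensor("s-1", 0.5)).toDS()
    val bytes: Array[Byte] = avroEncoder.encode(ds).head()
    val row: Row = avroDecoder.decode(bytes)
    println(row)

    spark.stop()
  }
}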
Example 7
Source File: SPKSQLUtils.scala    From sona   with Apache License 2.0
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.expressions.{GenericRow, GenericRowWithSchema}
import org.apache.spark.sql.types.{StructType, UDTRegistration}

object SPKSQLUtils {
  def append(row: Row, fields: StructType, values: Any*): Row = {
    row match {
      case r: GenericRowWithSchema =>
        val newValues = new Array[Any](r.length + values.length)
        val rLength: Int = r.length
        (0 until rLength).foreach(idx => newValues(idx) = r(idx))
        values.zipWithIndex.foreach { case (value, idx) =>
          newValues(idx + rLength) = value
        }

        val newSchema = if (r.schema != null) {
          // StructType.add returns a new StructType rather than mutating in place,
          // so build the extended schema by concatenating the field arrays.
          StructType(r.schema.fields ++ fields.fields)
        } else {
          null.asInstanceOf[StructType]
        }
        new GenericRowWithSchema(newValues, newSchema)
      case r: GenericRow =>
        val newValues = new Array[Any](r.length + values.length)
        val rLength: Int = r.length
        (0 until rLength).foreach(idx => newValues(idx) = r(idx))
        values.zipWithIndex.foreach { case (value, idx) =>
          newValues(idx + rLength) = value
        }

        new GenericRow(newValues)
      case _ =>
        throw new Exception("Row Error!")
    }
  }

  def registerUDT(): Unit = synchronized{
    UDTRegistration.register("org.apache.spark.linalg.Vector", "org.apache.spark.linalg.VectorUDT")
    UDTRegistration.register("org.apache.spark.linalg.DenseVector", "org.apache.spark.linalg.VectorUDT")
    UDTRegistration.register("org.apache.spark.linalg.SparseVector", "org.apache.spark.linalg.VectorUDT")
    UDTRegistration.register("org.apache.spark.linalg.Matrix", "org.apache.spark.linalg.MatrixUDT")
    UDTRegistration.register("org.apache.spark.linalg.DenseMatrix", "org.apache.spark.linalg.MatrixUDT")
    UDTRegistration.register("org.apache.spark.linalg.SparseMatrix", "org.apache.spark.linalg.MatrixUDT")
  }
} 
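A small illustrative call to the append helper above: start from a GenericRowWithSchema and add one extra field. The column names and values are made up for the example.

import org.apache.spark.sql.{Row, SPKSQLUtils}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{DoubleType, IntegerType, StringType, StructField, StructType}

object AppendSketch {
  def main(args: Array[String]): Unit = {
    val schema = StructType(Seq(
      StructField("id", IntegerType),
      StructField("name", StringType)))
    val row: Row = new GenericRowWithSchema(Array[Any](1, "value1"), schema)

    // Append a single "score" column; the result keeps the original values
    // followed by the new one, with the schema extended to match.
    val extra = StructType(Seq(StructField("score", DoubleType)))
    val appended = SPKSQLUtils.append(row, extra, 0.9)
    println(appended) // [1,value1,0.9]
  }
}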
Example 8
Source File: LocalWriteSuite.scala    From ecosystem   with Apache License 2.0
package org.tensorflow.spark.datasources.tfrecords

import java.nio.file.Files
import java.nio.file.Paths

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRow
import org.apache.spark.sql.types._

import org.apache.commons.io.FileUtils

class LocalWriteSuite extends SharedSparkSessionSuite {

  val testRows: Array[Row] = Array(
    new GenericRow(Array[Any](11, 1, 23L, 10.0F, 14.0, List(1.0, 3.0), "r1")),
    new GenericRow(Array[Any](21, 2, 24L, 12.0F, 15.0, List(2.0, 3.0), "r2")),
    new GenericRow(Array[Any](31, 3, 25L, 14.0F, 16.0, List(3.0, 3.0), "r3")))
  val schema = StructType(List(StructField("id", IntegerType),
    StructField("IntegerTypeLabel", IntegerType),
    StructField("LongTypeLabel", LongType),
    StructField("FloatTypeLabel", FloatType),
    StructField("DoubleTypeLabel", DoubleType),
    StructField("VectorLabel", ArrayType(DoubleType, true)),
    StructField("name", StringType)))


  "Propagate" should {
    "write data locally" in {
      // Create a dataframe with 2 partitions
      val rdd = spark.sparkContext.parallelize(testRows, numSlices = 2)
      val df = spark.createDataFrame(rdd, schema)

      // Write the partitions onto the local hard drive. Since it is going to be the
      // local file system, the partitions will be written in the same directory of the
      // same machine.
      // In a distributed setting though, two different machines would each hold a single
      // partition.
      val localPath = Files.createTempDirectory("spark-connector-propagate").toAbsolutePath.toString
      val savePath = localPath + "/testResult"
      df.write.format("tfrecords")
        .option("recordType", "Example")
        .option("writeLocality", "local")
        .save(savePath)

      // Read again this directory, this time using the Hadoop file readers, it should
      // return the same data.
      // This only works in this test and does not hold in general, because the partitions
      // will be written on the workers. Everything runs locally for tests.
      val df2 = spark.read.format("tfrecords").option("recordType", "Example")
        .load(savePath).sort("id").select("id", "IntegerTypeLabel", "LongTypeLabel",
        "FloatTypeLabel", "DoubleTypeLabel", "VectorLabel", "name") // Correct column order.

      assert(df2.collect().toSeq === testRows.toSeq)
    }
  }
}