org.apache.spark.sql.types.Metadata Scala Examples

The following examples show how to use org.apache.spark.sql.types.Metadata. They are taken from open-source projects; the originating project and source file are noted above each example.
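Before the project examples, here is a minimal, self-contained sketch of the core pattern that recurs throughout them: build an immutable Metadata value with MetadataBuilder, attach it to a StructField, and read it back with the typed getters. The object name, column name, and keys below are illustrative only, not taken from any of the projects.

import org.apache.spark.sql.types.{Metadata, MetadataBuilder, StringType, StructField, StructType}

object MetadataQuickStart extends App {
  // Build an immutable Metadata value with the typed put* methods.
  val columnMetadata: Metadata = new MetadataBuilder()
    .putString("comment", "customer display name")
    .putLong("maxLength", 64L)
    .build()

  // Attach the metadata to a column and embed the column in a schema.
  val schema = StructType(Seq(
    StructField("name", StringType, nullable = true, metadata = columnMetadata)))

  // Read it back with the typed getters; Metadata also round-trips through JSON.
  val restored = schema("name").metadata
  println(restored.getString("comment"))                 // customer display name
  println(restored.getLong("maxLength"))                 // 64
  println(Metadata.fromJson(restored.json) == restored)  // true
}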
Example 1
Source File: FieldPoly2Type.scala    From spark-gdb   with Apache License 2.0
package com.esri.gdb

import java.nio.ByteBuffer

import org.apache.spark.sql.types.{DataType, Metadata}

abstract class FieldPoly2Type[T](name: String,
                                 dataType: DataType,
                                 nullValueAllowed: Boolean,
                                 xOrig: Double,
                                 yOrig: Double,
                                 xyScale: Double,
                                 metadata: Metadata)
  extends FieldBytes(name, dataType, nullValueAllowed, metadata) {

  override def readValue(byteBuffer: ByteBuffer, oid: Int) = {
    val blob = getByteBuffer(byteBuffer)

    val geomType = blob.getVarUInt
    val numPoints = blob.getVarUInt.toInt
    if (numPoints == 0)
      createPolyType(0, 0, 0, 0, Array.empty[Int], Array.empty[Double])
    else {
      val numParts = blob.getVarUInt.toInt

      val xmin = blob.getVarUInt / xyScale + xOrig
      val ymin = blob.getVarUInt / xyScale + yOrig
      val xmax = blob.getVarUInt / xyScale + xmin
      val ymax = blob.getVarUInt / xyScale + ymin

      var dx = 0L
      var dy = 0L

      val xyNum = new Array[Int](numParts)
      val xyArr = new Array[Double](numPoints * 2)

      if (numParts > 1) {
        var i = 0
        var sum = 0
        1 to numParts foreach (partIndex => {
          if (partIndex == numParts) {
            xyNum(i) = numPoints - sum
          } else {
            val numXY = blob.getVarUInt.toInt
            xyNum(i) = numXY
            sum += numXY
            i += 1
          }
        })
        i = 0
        xyNum.foreach(numXY => {
          0 until numXY foreach (n => {
            dx += blob.getVarInt
            dy += blob.getVarInt
            val x = dx / xyScale + xOrig
            val y = dy / xyScale + yOrig
            xyArr(i) = x
            i += 1
            xyArr(i) = y
            i += 1
          })
        })
      }
      else {
        xyNum(0) = numPoints
        var i = 0
        0 until numPoints foreach (n => {
          dx += blob.getVarInt
          dy += blob.getVarInt
          val x = dx / xyScale + xOrig
          val y = dy / xyScale + yOrig
          xyArr(i) = x
          i += 1
          xyArr(i) = y
          i += 1
        })
      }
      createPolyType(xmin, ymin, xmax, ymax, xyNum, xyArr)
    }
  }

  def createPolyType(xmin: Double, ymin: Double, xmax: Double, ymax: Double, xyNum: Array[Int], xyArr: Array[Double]): T
} 
Example 2
Source File: NameAssigner.scala    From pravda-ml   with Apache License 2.0
package org.apache.spark.ml.odkl

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasInputCols
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.{DataFrame, Dataset, functions}
import org.apache.spark.sql.types.{Metadata, StringType, StructField, StructType}


class NameAssigner(override val uid: String) extends Transformer with HasInputCols{

  def setInputCols(column: String*) : this.type = set(inputCols, column.toArray)

  def this() = this(Identifiable.randomUID("NameAssigner"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    $(inputCols)

    $(inputCols).foldLeft(dataset.toDF)((data, column) => {
      val metadata: Metadata = dataset.schema(column).metadata
      val attributes = AttributeGroup.fromStructField(
        StructField(column, new VectorUDT, nullable = false, metadata = metadata))

      val map = attributes.attributes
        .map(arr => arr.filter(_.name.isDefined).map(a => a.index.get -> a.name.get).toMap)
        .getOrElse(Map())

      val func = functions.udf[String, Number](x => if(x == null) {
        null
      } else {
        val i = x.intValue()
        map.getOrElse(i, i.toString)
      })

      data.withColumn(column, func(data(column)).as(column, metadata))
    }).toDF
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(schema.map(f => if ($(inputCols).contains(f.name)) {
      StructField(f.name, StringType, f.nullable, f.metadata)
    } else {
      f
    }))
} 
Example 3
Source File: MetadataSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.catalyst.util

import org.json4s.jackson.JsonMethods.parse

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{Metadata, MetadataBuilder}

class MetadataSuite extends SparkFunSuite {

  val baseMetadata = new MetadataBuilder()
    .putString("purpose", "ml")
    .putBoolean("isBase", true)
    .build()

  val summary = new MetadataBuilder()
    .putLong("numFeatures", 10L)
    .build()

  val age = new MetadataBuilder()
    .putString("name", "age")
    .putLong("index", 1L)
    .putBoolean("categorical", false)
    .putDouble("average", 45.0)
    .build()

  val gender = new MetadataBuilder()
    .putString("name", "gender")
    .putLong("index", 5)
    .putBoolean("categorical", true)
    .putStringArray("categories", Array("male", "female"))
    .build()

  val metadata = new MetadataBuilder()
    .withMetadata(baseMetadata)
    .putBoolean("isBase", false) // overwrite an existing key
    .putMetadata("summary", summary)
    .putLongArray("long[]", Array(0L, 1L))
    .putDoubleArray("double[]", Array(3.0, 4.0))
    .putBooleanArray("boolean[]", Array(true, false))
    .putMetadataArray("features", Array(age, gender))
    .build()

  test("metadata builder and getters") {
    assert(age.contains("summary") === false)
    assert(age.contains("index") === true)
    assert(age.getLong("index") === 1L)
    assert(age.contains("average") === true)
    assert(age.getDouble("average") === 45.0)
    assert(age.contains("categorical") === true)
    assert(age.getBoolean("categorical") === false)
    assert(age.contains("name") === true)
    assert(age.getString("name") === "age")
    assert(metadata.contains("purpose") === true)
    assert(metadata.getString("purpose") === "ml")
    assert(metadata.contains("isBase") === true)
    assert(metadata.getBoolean("isBase") === false)
    assert(metadata.contains("summary") === true)
    assert(metadata.getMetadata("summary") === summary)
    assert(metadata.contains("long[]") === true)
    assert(metadata.getLongArray("long[]").toSeq === Seq(0L, 1L))
    assert(metadata.contains("double[]") === true)
    assert(metadata.getDoubleArray("double[]").toSeq === Seq(3.0, 4.0))
    assert(metadata.contains("boolean[]") === true)
    assert(metadata.getBooleanArray("boolean[]").toSeq === Seq(true, false))
    assert(gender.contains("categories") === true)
    assert(gender.getStringArray("categories").toSeq === Seq("male", "female"))
    assert(metadata.contains("features") === true)
    assert(metadata.getMetadataArray("features").toSeq === Seq(age, gender))
  }

  test("metadata json conversion") {
    val json = metadata.json
    withClue("toJson must produce a valid JSON string") {
      parse(json)
    }
    val parsed = Metadata.fromJson(json)
    assert(parsed === metadata)
    assert(parsed.## === metadata.##)
  }
} 
Example 4
Source File: MetadataSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.sql.catalyst.util

import org.json4s.jackson.JsonMethods.parse

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{MetadataBuilder, Metadata}

class MetadataSuite extends SparkFunSuite {

  val baseMetadata = new MetadataBuilder()
    .putString("purpose", "ml")
    .putBoolean("isBase", true)
    .build()

  val summary = new MetadataBuilder()
    .putLong("numFeatures", 10L)
    .build()

  val age = new MetadataBuilder()
    .putString("name", "age")
    .putLong("index", 1L)
    .putBoolean("categorical", false)
    .putDouble("average", 45.0)
    .build()

  val gender = new MetadataBuilder()
    .putString("name", "gender")
    .putLong("index", 5)
    .putBoolean("categorical", true)
    .putStringArray("categories", Array("male", "female"))
    .build()

  val metadata = new MetadataBuilder()
    .withMetadata(baseMetadata)
    .putBoolean("isBase", false) // overwrite an existing key
    .putMetadata("summary", summary)
    .putLongArray("long[]", Array(0L, 1L))
    .putDoubleArray("double[]", Array(3.0, 4.0))
    .putBooleanArray("boolean[]", Array(true, false))
    .putMetadataArray("features", Array(age, gender))
    .build()

  test("metadata builder and getters") {
    assert(age.contains("summary") === false)
    assert(age.contains("index") === true)
    assert(age.getLong("index") === 1L)
    assert(age.contains("average") === true)
    assert(age.getDouble("average") === 45.0)
    assert(age.contains("categorical") === true)
    assert(age.getBoolean("categorical") === false)
    assert(age.contains("name") === true)
    assert(age.getString("name") === "age")
    assert(metadata.contains("purpose") === true)
    assert(metadata.getString("purpose") === "ml")
    assert(metadata.contains("isBase") === true)
    assert(metadata.getBoolean("isBase") === false)
    assert(metadata.contains("summary") === true)
    assert(metadata.getMetadata("summary") === summary)
    assert(metadata.contains("long[]") === true)
    assert(metadata.getLongArray("long[]").toSeq === Seq(0L, 1L))
    assert(metadata.contains("double[]") === true)
    assert(metadata.getDoubleArray("double[]").toSeq === Seq(3.0, 4.0))
    assert(metadata.contains("boolean[]") === true)
    assert(metadata.getBooleanArray("boolean[]").toSeq === Seq(true, false))
    assert(gender.contains("categories") === true)
    assert(gender.getStringArray("categories").toSeq === Seq("male", "female"))
    assert(metadata.contains("features") === true)
    assert(metadata.getMetadataArray("features").toSeq === Seq(age, gender))
  }

  test("metadata json conversion") {
    val json = metadata.json
    withClue("toJson must produce a valid JSON string") {
      parse(json)
    }
    val parsed = Metadata.fromJson(json)
    assert(parsed === metadata)
    assert(parsed.## === metadata.##)
  }
} 
Example 5
Source File: FieldDateTime.scala    From spark-gdb   with Apache License 2.0
package com.esri.gdb

import java.nio.ByteBuffer
import java.sql.Timestamp

import org.apache.spark.sql.types.{Metadata, TimestampType}


class FieldDateTime(name: String, nullValueAllowed: Boolean, metadata: Metadata)
  extends Field(name, TimestampType, nullValueAllowed, metadata) {

  override def readValue(byteBuffer: ByteBuffer, oid: Int) = {
    val numDays = byteBuffer.getDouble
    // convert days since 12/30/1899 to 1/1/1970
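    // 25569 is the number of days from 1899-12-30 to 1970-01-01 (the Unix epoch)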
    val unixDays = numDays - 25569
    val millis = (unixDays * 1000 * 60 * 60 * 24).ceil.toLong
    new Timestamp(millis)
  }
} 
Example 6
Source File: FieldPointMType.scala    From spark-gdb   with Apache License 2.0
package com.esri.gdb

import java.nio.ByteBuffer

import com.esri.udt.{PointMType, PointMUDT}
import org.apache.spark.sql.types.Metadata


object FieldPointMType extends Serializable {
  def apply(name: String,
            nullValueAllowed: Boolean,
            xOrig: Double,
            yOrig: Double,
            mOrig: Double,
            xyScale: Double,
            mScale: Double,
            metadata: Metadata
           ) = {
    new FieldPointMType(name, nullValueAllowed, xOrig, yOrig, mOrig, xyScale, mScale, metadata)
  }
}

class FieldPointMType(name: String,
                      nullValueAllowed: Boolean,
                      xOrig: Double,
                      yOrig: Double,
                      mOrig: Double,
                      xyScale: Double,
                      mScale: Double,
                      metadata: Metadata)
  extends FieldBytes(name, new PointMUDT(), nullValueAllowed, metadata) {

  override def readValue(byteBuffer: ByteBuffer, oid: Int) = {
    val blob = getByteBuffer(byteBuffer)

    val geomType = blob.getVarUInt()

    val vx = blob.getVarUInt
    val vy = blob.getVarUInt
    val vm = blob.getVarUInt

    val x = (vx - 1.0) / xyScale + xOrig
    val y = (vy - 1.0) / xyScale + yOrig
    val m = (vm - 1.0) / mScale + mOrig

    new PointMType(x, y, m)
  }
} 
Example 7
Source File: FieldPoly.scala    From spark-gdb   with Apache License 2.0
package com.esri.gdb

import java.nio.ByteBuffer

import com.esri.core.geometry.MultiPath
import org.apache.spark.sql.types.{DataType, Metadata}

@deprecated("not used", "0.4")
abstract class FieldPoly(name: String,
                         dataType: DataType,
                         nullValueAllowed: Boolean,
                         xOrig: Double,
                         yOrig: Double,
                         xyScale: Double,
                         metadata: Metadata)
  extends FieldBytes(name, dataType, nullValueAllowed, metadata) {

  protected var dx = 0L
  protected var dy = 0L

  def addPath(byteBuffer: ByteBuffer, numCoordinates: Int, path: MultiPath) = {
    0 until numCoordinates foreach (n => {
      dx += byteBuffer.getVarInt
      dy += byteBuffer.getVarInt
      val x = dx / xyScale + xOrig
      val y = dy / xyScale + yOrig
      n match {
        case 0 => path.startPath(x, y)
        case _ => path.lineTo(x, y)
      }
    })
    path
  }
} 
Example 8
Source File: FieldBytes.scala    From spark-gdb   with Apache License 2.0
package com.esri.gdb

import java.nio.ByteBuffer

import org.apache.spark.sql.types.{DataType, Metadata}


abstract class FieldBytes(name: String,
                          dataType: DataType,
                          nullValueAllowed: Boolean,
                          metadata: Metadata = Metadata.empty
                         )
  extends Field(name, dataType, nullValueAllowed, metadata) {

  protected var m_bytes = new Array[Byte](1024)

  def getByteBuffer(byteBuffer: ByteBuffer) = {
    val numBytes = fillVarBytes(byteBuffer)
    ByteBuffer.wrap(m_bytes, 0, numBytes)
  }

  def fillVarBytes(byteBuffer: ByteBuffer) = {
    val numBytes = byteBuffer.getVarUInt.toInt
    if (numBytes > m_bytes.length) {
      m_bytes = new Array[Byte](numBytes)
    }
    0 until numBytes foreach {
      m_bytes(_) = byteBuffer.get
    }
    numBytes
  }
} 
Example 9
Source File: FieldPolylineMType.scala    From spark-gdb   with Apache License 2.0
package com.esri.gdb

import com.esri.udt.{PolylineMType, PolylineMUDT}
import org.apache.spark.sql.types.Metadata


object FieldPolylineMType extends Serializable {
  def apply(name: String,
            nullValueAllowed: Boolean,
            xOrig: Double,
            yOrig: Double,
            mOrig: Double,
            xyScale: Double,
            mScale: Double,
            metadata: Metadata) = {
    new FieldPolylineMType(name, nullValueAllowed, xOrig, yOrig, mOrig, xyScale, mScale, metadata)
  }
}

class FieldPolylineMType(name: String,
                         nullValueAllowed: Boolean,
                         xOrig: Double,
                         yOrig: Double,
                         mOrig: Double,
                         xyScale: Double,
                         mScale: Double,
                         metadata: Metadata)
  extends FieldPoly3Type[PolylineMType](name, new PolylineMUDT(), nullValueAllowed, xOrig, yOrig, mOrig, xyScale, mScale, metadata) {

  override def createPolyMType(xmin: Double, ymin: Double, xmax: Double, ymax: Double, xyNum: Array[Int], xyArr: Array[Double]) = {
    PolylineMType(xmin, ymin, xmax, ymax, xyNum, xyArr)
  }
} 
Example 10
Source File: FieldPolygon.scala    From spark-gdb   with Apache License 2.0
package com.esri.gdb

import java.nio.ByteBuffer

import com.esri.core.geometry.Polygon
import com.esri.udt.PolygonUDT
import org.apache.spark.sql.types.{DataType, Metadata}

@deprecated("not used", "0.4")
object FieldPolygon {
  def apply(name: String,
            nullValueAllowed: Boolean,
            xOrig: Double,
            yOrig: Double,
            xyScale: Double,
            metadata: Metadata) = {
    new FieldPolygonEsri(name, nullValueAllowed, xOrig, yOrig, xyScale, metadata)
  }
}

@deprecated("not used", "0.4")
abstract class FieldPolygon(name: String,
                            dataType: DataType,
                            nullValueAllowed: Boolean,
                            xOrig: Double,
                            yOrig: Double,
                            xyScale: Double,
                            metadata: Metadata
                           )
  extends FieldPoly(name, dataType, nullValueAllowed, xOrig, yOrig, xyScale, metadata) {

  override def readValue(byteBuffer: ByteBuffer, oid: Int) = {
    val polygon = new Polygon()

    val blob = getByteBuffer(byteBuffer)

    val geomType = blob.getVarUInt

    val numPoints = blob.getVarUInt.toInt
    val numParts = blob.getVarUInt.toInt

    val xmin = blob.getVarUInt / xyScale + xOrig
    val ymin = blob.getVarUInt / xyScale + yOrig
    val xmax = blob.getVarUInt / xyScale + xmin
    val ymax = blob.getVarUInt / xyScale + ymin

    dx = 0L
    dy = 0L

    if (numParts > 1) {
      var sum = 0
      val numCoordSeq = 1 to numParts map (part => {
        val numCoord = if (part == numParts) {
          numPoints - sum
        } else {
          blob.getVarUInt.toInt
        }
        sum += numCoord
        numCoord
      })
      // TODO - fix shells and holes based on https://github.com/rouault/dump_gdbtable/wiki/FGDB-Spec
      numCoordSeq.foreach(numCoord => addPath(blob, numCoord, polygon))
    }
    else {
      addPath(blob, numPoints, polygon)
    }
    polygon
  }
}

@deprecated("not used", "0.4")
class FieldPolygonEsri(name: String,
                       nullValueAllowed: Boolean,
                       xOrig: Double,
                       yOrig: Double,
                       xyScale: Double,
                       metadata: Metadata)
  extends FieldPolygon(name, new PolygonUDT(), nullValueAllowed, xOrig, yOrig, xyScale, metadata) 
Example 11
Source File: MetadataSuite.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.sql.catalyst.util

import org.json4s.jackson.JsonMethods.parse

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{MetadataBuilder, Metadata}

class MetadataSuite extends SparkFunSuite {

  val baseMetadata = new MetadataBuilder()
    .putString("purpose", "ml")
    .putBoolean("isBase", true)
    .build()

  val summary = new MetadataBuilder()
    .putLong("numFeatures", 10L)
    .build()

  val age = new MetadataBuilder()
    .putString("name", "age")
    .putLong("index", 1L)
    .putBoolean("categorical", false)
    .putDouble("average", 45.0)
    .build()

  val gender = new MetadataBuilder()
    .putString("name", "gender")
    .putLong("index", 5)
    .putBoolean("categorical", true)
    .putStringArray("categories", Array("male", "female"))
    .build()

  val metadata = new MetadataBuilder()
    .withMetadata(baseMetadata)
    .putBoolean("isBase", false) // overwrite an existing key
    .putMetadata("summary", summary)
    .putLongArray("long[]", Array(0L, 1L))
    .putDoubleArray("double[]", Array(3.0, 4.0))
    .putBooleanArray("boolean[]", Array(true, false))
    .putMetadataArray("features", Array(age, gender))
    .build()
  // metadata builder and getters
  test("metadata builder and getters") {
    assert(age.contains("summary") === false)
    assert(age.contains("index") === true)
    assert(age.getLong("index") === 1L)
    assert(age.contains("average") === true)
    assert(age.getDouble("average") === 45.0)
    assert(age.contains("categorical") === true)
    assert(age.getBoolean("categorical") === false)
    assert(age.contains("name") === true)
    assert(age.getString("name") === "age")
    assert(metadata.contains("purpose") === true)
    assert(metadata.getString("purpose") === "ml")
    assert(metadata.contains("isBase") === true)
    assert(metadata.getBoolean("isBase") === false)
    assert(metadata.contains("summary") === true)
    assert(metadata.getMetadata("summary") === summary)
    assert(metadata.contains("long[]") === true)
    assert(metadata.getLongArray("long[]").toSeq === Seq(0L, 1L))
    assert(metadata.contains("double[]") === true)
    assert(metadata.getDoubleArray("double[]").toSeq === Seq(3.0, 4.0))
    assert(metadata.contains("boolean[]") === true)
    assert(metadata.getBooleanArray("boolean[]").toSeq === Seq(true, false))
    assert(gender.contains("categories") === true)
    assert(gender.getStringArray("categories").toSeq === Seq("male", "female"))
    assert(metadata.contains("features") === true)
    assert(metadata.getMetadataArray("features").toSeq === Seq(age, gender))
  }
  // metadata JSON conversion
  test("metadata json conversion") {
    val json = metadata.json
    withClue("toJson must produce a valid JSON string") {
      parse(json)
    }
    val parsed = Metadata.fromJson(json)
    assert(parsed === metadata)
    assert(parsed.## === metadata.##)
  }
} 
Example 12
Source File: FieldPolyline.scala    From spark-gdb   with Apache License 2.0
package com.esri.gdb

import java.nio.ByteBuffer

import com.esri.core.geometry.Polyline
import com.esri.udt.PolylineUDT
import org.apache.spark.sql.types.Metadata

@deprecated("not used", "0.4")
object FieldPolyline extends Serializable {
  def apply(name: String,
            nullValueAllowed: Boolean,
            xOrig: Double,
            yOrig: Double,
            xyScale: Double,
            metadata: Metadata) = {
    new FieldPolyline(name, nullValueAllowed, xOrig, yOrig, xyScale, metadata)
  }
}

@deprecated("not used", "0.4")
class FieldPolyline(name: String,
                    nullValueAllowed: Boolean,
                    xOrig: Double,
                    yOrig: Double,
                    xyScale: Double,
                    metadata: Metadata
                   )
  extends FieldPoly(name, new PolylineUDT(), nullValueAllowed, xOrig, yOrig, xyScale, metadata) {

  override def readValue(byteBuffer: ByteBuffer, oid: Int) = {
    val polyline = new Polyline()

    val blob = getByteBuffer(byteBuffer)
    val geomType = blob.getVarUInt

    val numPoints = blob.getVarUInt.toInt
    val numParts = blob.getVarUInt.toInt

    val xmin = blob.getVarUInt / xyScale + xOrig
    val ymin = blob.getVarUInt / xyScale + yOrig
    val xmax = blob.getVarUInt / xyScale + xmin
    val ymax = blob.getVarUInt / xyScale + ymin

    dx = 0L
    dy = 0L

    if (numParts > 1) {
      var sum = 0
      val numCoordSeq = 1 to numParts map (part => {
        val numCoord = if (part == numParts) {
          numPoints - sum
        } else {
          blob.getVarUInt.toInt
        }
        sum += numCoord
        numCoord
      })
      numCoordSeq.foreach(numCoord => addPath(blob, numCoord, polyline))
    }
    else {
      addPath(blob, numPoints, polyline)
    }
    polyline
  }
} 
Example 13
Source File: FieldPointZMType.scala    From spark-gdb   with Apache License 2.0
package com.esri.gdb

import java.nio.ByteBuffer

import com.esri.udt.{PointZMType, PointZMUDT}
import org.apache.spark.sql.types.Metadata


object FieldPointZMType extends Serializable {
  def apply(name: String,
            nullValueAllowed: Boolean,
            xOrig: Double,
            yOrig: Double,
            zOrig: Double,
            mOrig: Double,
            xyScale: Double,
            zScale: Double,
            mScale: Double,
            metadata: Metadata
           ) = {
    new FieldPointZMType(name, nullValueAllowed, xOrig, yOrig, zOrig, mOrig, xyScale, zScale, mScale, metadata)
  }
}

class FieldPointZMType(name: String,
                       nullValueAllowed: Boolean,
                       xOrig: Double,
                       yOrig: Double,
                       zOrig: Double,
                       mOrig: Double,
                       xyScale: Double,
                       zScale: Double,
                       mScale: Double,
                       metadata: Metadata)
  extends FieldBytes(name, new PointZMUDT(), nullValueAllowed, metadata) {

  override def readValue(byteBuffer: ByteBuffer, oid: Int) = {
    val blob = getByteBuffer(byteBuffer)

    val geomType = blob.getVarUInt

    val vx = blob.getVarUInt()
    val vy = blob.getVarUInt()
    val x = (vx - 1.0) / xyScale + xOrig
    val y = (vy - 1.0) / xyScale + yOrig

    geomType match {
      // Point
      case 1 => new PointZMType(x, y)
      // PointZ
      case 9 =>
        val vz = blob.getVarUInt
        val z = (vz - 1.0) / zScale + zOrig
        new PointZMType(x, y, z)
      // PointM
      case 21 =>
        val vm = blob.getVarUInt
        val m = (vm - 1.0) / mScale + mOrig
        new PointZMType(x, y, 0.0, m)
      // PointZM
      case _ =>
        val vz = blob.getVarUInt
        val vm = blob.getVarUInt
        val z = (vz - 1.0) / zScale + zOrig
        val m = (vm - 1.0) / mScale + mOrig
        new PointZMType(x, y, z, m)
    }
  }
} 
Example 14
Source File: FieldPolylineType.scala    From spark-gdb   with Apache License 2.0
package com.esri.gdb

import com.esri.udt.{PolylineType, PolylineUDT}
import org.apache.spark.sql.types.Metadata


object FieldPolylineType extends Serializable {
  def apply(name: String,
            nullValueAllowed: Boolean,
            xOrig: Double,
            yOrig: Double,
            xyScale: Double,
            metadata: Metadata) = {
    new FieldPolylineType(name, nullValueAllowed, xOrig, yOrig, xyScale, metadata)
  }
}

class FieldPolylineType(name: String,
                        nullValueAllowed: Boolean,
                        xOrig: Double,
                        yOrig: Double,
                        xyScale: Double,
                        metadata: Metadata)
  extends FieldPoly2Type[PolylineType](name, new PolylineUDT(), nullValueAllowed, xOrig, yOrig, xyScale, metadata) {

  override def createPolyType(xmin: Double, ymin: Double, xmax: Double, ymax: Double, xyNum: Array[Int], xyArr: Array[Double]): PolylineType = {
    PolylineType(xmin, ymin, xmax, ymax, xyNum, xyArr)
  }
} 
Example 15
Source File: FieldPointZType.scala    From spark-gdb   with Apache License 2.0
package com.esri.gdb

import java.nio.ByteBuffer

import com.esri.udt.{PointZType, PointZUDT}
import org.apache.spark.sql.types.Metadata


object FieldPointZType extends Serializable {
  def apply(name: String,
            nullValueAllowed: Boolean,
            xOrig: Double,
            yOrig: Double,
            zOrig: Double,
            xyScale: Double,
            zScale: Double,
            metadata: Metadata
           ) = {
    new FieldPointZType(name, nullValueAllowed, xOrig, yOrig, zOrig, xyScale, zScale, metadata)
  }
}

class FieldPointZType(name: String,
                      nullValueAllowed: Boolean,
                      xOrig: Double,
                      yOrig: Double,
                      zOrig: Double,
                      xyScale: Double,
                      zScale: Double,
                      metadata: Metadata)
  extends FieldBytes(name, new PointZUDT(), nullValueAllowed, metadata) {

  override def readValue(byteBuffer: ByteBuffer, oid: Int) = {
    val blob = getByteBuffer(byteBuffer)

    val geomType = blob.getVarUInt

    val vx = blob.getVarUInt
    val vy = blob.getVarUInt
    val vz = blob.getVarUInt

    val x = (vx - 1.0) / xyScale + xOrig
    val y = (vy - 1.0) / xyScale + yOrig
    val z = (vz - 1.0) / zScale + zOrig

    new PointZType(x, y, z)
  }
} 
Example 16
Source File: FieldPoly3Type.scala    From spark-gdb   with Apache License 2.0
package com.esri.gdb

import java.nio.ByteBuffer

import org.apache.spark.sql.types.{DataType, Metadata}

abstract class FieldPoly3Type[T](name: String,
                                 dataType: DataType,
                                 nullValueAllowed: Boolean,
                                 xOrig: Double,
                                 yOrig: Double,
                                 nOrig: Double,
                                 xyScale: Double,
                                 nScale: Double,
                                 metadata: Metadata)
  extends FieldBytes(name, dataType, nullValueAllowed, metadata) {

  override def readValue(byteBuffer: ByteBuffer, oid: Int) = {
    val blob = getByteBuffer(byteBuffer)

    val geomType = blob.getVarUInt
    val numPoints = blob.getVarUInt.toInt
    // TODO - Handle zero num points in other geom type.
    if (numPoints == 0) {
      createPolyMType(0, 0, 0, 0, Array.empty[Int], Array.empty[Double])
    }
    else {
      val numParts = blob.getVarUInt.toInt

      val xmin = blob.getVarUInt / xyScale + xOrig
      val ymin = blob.getVarUInt / xyScale + yOrig
      val xmax = blob.getVarUInt / xyScale + xmin
      val ymax = blob.getVarUInt / xyScale + ymin

      var dx = 0L
      var dy = 0L

      val xyNum = new Array[Int](numParts)
      val xyArr = new Array[Double](numPoints * 3)

      var i = 0
      if (numParts > 1) {
        var sum = 0
        1 to numParts foreach (partIndex => {
          if (partIndex == numParts) {
            xyNum(i) = numPoints - sum
          } else {
            val numXY = blob.getVarUInt.toInt
            xyNum(i) = numXY
            sum += numXY
            i += 1
          }
        })
        i = 0
        xyNum.foreach(numXY => {
          0 until numXY foreach (_ => {
            dx += blob.getVarInt
            dy += blob.getVarInt
            val x = dx / xyScale + xOrig
            val y = dy / xyScale + yOrig
            xyArr(i) = x
            i += 1
            xyArr(i) = y
            i += 2
          })
        })
      }
      else {
        xyNum(0) = numPoints
        0 until numPoints foreach (_ => {
          dx += blob.getVarInt
          dy += blob.getVarInt
          xyArr(i) = dx / xyScale + xOrig
          i += 1
          xyArr(i) = dy / xyScale + yOrig
          i += 2
        })
      }
      i = 2
      var dn = 0L
      0 until numPoints foreach (_ => {
        dn += blob.getVarInt
        xyArr(i) = dn / nScale + nOrig
        i += 3
      })
      createPolyMType(xmin, ymin, xmax, ymax, xyNum, xyArr)
    }
  }

  def createPolyMType(xmin: Double, ymin: Double, xmax: Double, ymax: Double, xyNum: Array[Int], xyArr: Array[Double]): T
} 
Example 17
Source File: FieldPolygonType.scala    From spark-gdb   with Apache License 2.0
package com.esri.gdb

import com.esri.udt.{PolygonType, PolygonUDT}
import org.apache.spark.sql.types.Metadata


object FieldPolygonType extends Serializable {
  def apply(name: String,
            nullValueAllowed: Boolean,
            xOrig: Double,
            yOrig: Double,
            xyScale: Double,
            metadata: Metadata) = {
    new FieldPolygonType(name, nullValueAllowed, xOrig, yOrig, xyScale, metadata)
  }
}

class FieldPolygonType(name: String,
                       nullValueAllowed: Boolean,
                       xOrig: Double,
                       yOrig: Double,
                       xyScale: Double,
                       metadata: Metadata)
  extends FieldPoly2Type[PolygonType](name, new PolygonUDT(), nullValueAllowed, xOrig, yOrig, xyScale, metadata) {

  override def createPolyType(xmin: Double, ymin: Double, xmax: Double, ymax: Double, xyNum: Array[Int], xyArr: Array[Double]): PolygonType = {
    PolygonType(xmin, ymin, xmax, ymax, xyNum, xyArr)
  }
} 
Example 18
Source File: FieldPointType.scala    From spark-gdb   with Apache License 2.0
package com.esri.gdb

import java.nio.ByteBuffer

import com.esri.udt.{PointType, PointUDT}
import org.apache.spark.sql.types.Metadata

object FieldPointType extends Serializable {
  def apply(name: String,
            nullValueAllowed: Boolean,
            xOrig: Double,
            yOrig: Double,
            xyScale: Double,
            metadata: Metadata) = {
    new FieldPointType(name, nullValueAllowed, xOrig, yOrig, xyScale, metadata)
  }
}

class FieldPointType(name: String,
                     nullValueAllowed: Boolean,
                     xOrig: Double,
                     yOrig: Double,
                     xyScale: Double,
                     metadata: Metadata)
  extends FieldBytes(name, new PointUDT(), nullValueAllowed, metadata) {

  override def readValue(byteBuffer: ByteBuffer, oid: Int) = {
    val blob = getByteBuffer(byteBuffer)

    blob.getVarUInt() // geomType

    val vx = blob.getVarUInt()
    val vy = blob.getVarUInt()
    val x = (vx - 1.0) / xyScale + xOrig
    val y = (vy - 1.0) / xyScale + yOrig

    new PointType(x, y)
  }
} 
Example 19
Source File: DatasetUtil.scala    From sona   with Apache License 2.0
package org.apache.spark.util

import org.apache.spark.linalg.{VectorUDT, Vectors}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, DoubleType, FloatType, Metadata}
import org.apache.spark.sql.{Column, DataFrame, Dataset}


object DatasetUtil {
  def withColumns[T](ds: Dataset[T],
                     colNames: Seq[String],
                     cols: Seq[Column],
                     metadata: Seq[Metadata]): DataFrame = {
    require(colNames.size == cols.size,
      s"The size of column names: ${colNames.size} isn't equal to " +
        s"the size of columns: ${cols.size}")
    require(colNames.size == metadata.size,
      s"The size of column names: ${colNames.size} isn't equal to " +
        s"the size of metadata elements: ${metadata.size}")

    val sparkSession = ds.sparkSession
    val queryExecution = ds.queryExecution
    val resolver = sparkSession.sessionState.analyzer.resolver
    val output = queryExecution.analyzed.output

    checkColumnNameDuplication(colNames,
      "in given column names",
      sparkSession.sessionState.conf.caseSensitiveAnalysis)

    val columnMap = colNames.zip(cols).zip(metadata).map { case ((colName: String, col: Column), metadata: Metadata) =>
      colName -> col.as(colName, metadata)
    }.toMap

    val replacedAndExistingColumns = output.map { field =>
      columnMap.find { case (colName, _) =>
        resolver(field.name, colName)
      } match {
        case Some((colName: String, col: Column)) => col.as(colName)
        case _ => new Column(field)
      }
    }

    val newColumns = columnMap.filter { case (colName, col) =>
      !output.exists(f => resolver(f.name, colName))
    }.map { case (colName, col) => col.as(colName) }

    ds.select(replacedAndExistingColumns ++ newColumns: _*)
  }

  def withColumn[T](ds: Dataset[T], colName: String, col: Column, metadata: Metadata): DataFrame = {
    withColumns(ds, Seq(colName), Seq(col), Seq(metadata))
  }

  private def checkColumnNameDuplication(columnNames: Seq[String], colType: String,
                                         caseSensitiveAnalysis: Boolean): Unit = {
    val names = if (caseSensitiveAnalysis) columnNames else columnNames.map(_.toLowerCase)
    if (names.distinct.length != names.length) {
      val duplicateColumns = names.groupBy(identity).collect {
        case (x, ys) if ys.length > 1 => s"`$x`"
      }
      throw new Exception(s"Found duplicate column(s) $colType: ${duplicateColumns.mkString(", ")}")
    }
  }

  /**
    * Cast a column in a Dataset to Vector type.
    *
    * The supported data types of the input column are
    * - Vector
    * - float/double type Array.
    *
    * Note: The returned column does not have Metadata.
    *
    * @param dataset input DataFrame
    * @param colName column name.
    * @return Vector column
    */
  def columnToVector(dataset: Dataset[_], colName: String): Column = {
    val columnDataType = dataset.schema(colName).dataType
    columnDataType match {
      case _: VectorUDT => col(colName)
      case fdt: ArrayType =>
        val transferUDF = fdt.elementType match {
          case _: FloatType => udf(f = (vector: Seq[Float]) => {
            val inputArray = Array.fill[Double](vector.size)(0.0)
            vector.indices.foreach(idx => inputArray(idx) = vector(idx).toDouble)
            Vectors.dense(inputArray)
          })
          case _: DoubleType => udf((vector: Seq[Double]) => {
            Vectors.dense(vector.toArray)
          })
          case other =>
            throw new IllegalArgumentException(s"Array[$other] column cannot be cast to Vector")
        }
        transferUDF(col(colName))
      case other =>
        throw new IllegalArgumentException(s"$other column cannot be cast to Vector")
    }
  }

} 
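A hypothetical usage sketch for the two helpers above. It assumes an active SparkSession named spark and this project's DatasetUtil on the classpath; the data and column names are made up for illustration.

import org.apache.spark.sql.types.MetadataBuilder

// Illustrative input: an id column and a double-array feature column.
val df = spark.createDataFrame(Seq(
  (0, Array(1.0, 2.0, 3.0)),
  (1, Array(4.0, 5.0, 6.0))
)).toDF("id", "features")

// Cast the array column to a Vector column (the returned column carries no Metadata).
val featuresAsVector = DatasetUtil.columnToVector(df, "features")

// Re-attach metadata while adding the converted column to the Dataset.
val meta = new MetadataBuilder().putLong("numFeatures", 3L).build()
val withMeta = DatasetUtil.withColumn(df, "featuresVec", featuresAsVector, meta)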
Example 20
Source File: GBTClassifierSmokeTest.scala    From seahorse-workflow-executor   with Apache License 2.0
package io.deepsense.deeplang.doperables.spark.wrappers.estimators

import org.apache.spark.sql.types.{DoubleType, Metadata, StructType}

import io.deepsense.deeplang.doperables.dataframe.DataFrame
import io.deepsense.deeplang.doperables.spark.wrappers.params.common.ClassificationImpurity
import io.deepsense.deeplang.params.ParamPair
import io.deepsense.deeplang.params.selections.NameSingleColumnSelection
import io.deepsense.deeplang.utils.DataFrameUtils

class GBTClassifierSmokeTest
  extends AbstractEstimatorModelWrapperSmokeTest {

  override def className: String = "GBTClassifier"

  override val estimator = new GBTClassifier()

  private val labelColumnName = "myRating"

  import estimator.vanillaGBTClassifier._

  override val estimatorParams: Seq[ParamPair[_]] = Seq(
    featuresColumn -> NameSingleColumnSelection("myFeatures"),
    impurity -> ClassificationImpurity.Entropy(),
    labelColumn -> NameSingleColumnSelection(labelColumnName),
    lossType -> GBTClassifier.Logistic(),
    maxBins -> 2.0,
    maxDepth -> 6.0,
    maxIterations -> 10.0,
    minInfoGain -> 0.0,
    minInstancesPerNode -> 1,
    predictionColumn -> "prediction",
    seed -> 100.0,
    stepSize -> 0.11,
    subsamplingRate -> 0.999
  )

  override def assertTransformedDF(dataFrame: DataFrame): Unit = {
    val possibleValues = DataFrameUtils.collectValues(dataFrame, labelColumnName)
    val actualValues = DataFrameUtils.collectValues(dataFrame, "prediction")

    actualValues.diff(possibleValues) shouldBe empty
  }

  override def assertTransformedSchema(schema: StructType): Unit = {
    val predictionColumn = schema.fields.last
    predictionColumn.name shouldBe "prediction"
    predictionColumn.dataType shouldBe DoubleType
    predictionColumn.metadata shouldBe Metadata.empty
  }
} 
Example 21
Source File: MetricUtils.scala    From mmlspark   with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.core.metrics

import com.microsoft.ml.spark.core.schema.{SchemaConstants, SparkSchema}
import com.microsoft.ml.spark.core.schema.SchemaConstants.MMLTag
import org.apache.spark.sql.types.injections.MetadataUtilities
import org.apache.spark.sql.types.{Metadata, StructField, StructType}


object MetricUtils {

  def isClassificationMetric(metric: String): Boolean = {
    if (MetricConstants.RegressionMetrics.contains(metric)) false
    else if (MetricConstants.ClassificationMetrics.contains(metric)) true
    else throw new Exception("Invalid metric specified")
  }

  def getSchemaInfo(schema: StructType, labelCol: Option[String],
                    evaluationMetric: String): (String, String, String) = {
    val schemaInfo = tryGetSchemaInfo(schema)
    if (schemaInfo.isDefined) {
      schemaInfo.get
    } else {
      if (labelCol.isEmpty) {
        throw new Exception("Please score the model prior to evaluating")
      } else if (evaluationMetric == MetricConstants.AllSparkMetrics) {
        throw new Exception("Please specify whether you are using evaluation for " +
          MetricConstants.ClassificationMetricsName + " or " + MetricConstants.RegressionMetricsName +
          " instead of " + MetricConstants.AllSparkMetrics)
      }
      ("custom model", labelCol.get,
        if (isClassificationMetric(evaluationMetric))
          SchemaConstants.ClassificationKind
        else SchemaConstants.RegressionKind)
    }
  }

  private def tryGetSchemaInfo(schema: StructType): Option[(String, String, String)] = {
    // TODO: evaluate all models; for now, get first model name found
    val firstModelName = schema.collectFirst {
      case StructField(_, _, _, m) if getFirstModelName(m) != null && getFirstModelName(m).isDefined =>
          getFirstModelName(m).get
    }
    if (firstModelName.isEmpty) None
    else {
      val modelName = firstModelName.get
      val labelColumnName = SparkSchema.getLabelColumnName(schema, modelName)
      val scoreValueKind = SparkSchema.getScoreValueKind(schema, modelName, labelColumnName)
      Option((modelName, labelColumnName, scoreValueKind))
    }
  }

  private def getFirstModelName(colMetadata: Metadata): Option[String] = {
    if (!colMetadata.contains(MMLTag)) null
    else {
      val mlTagMetadata = colMetadata.getMetadata(MMLTag)
      val metadataKeys = MetadataUtilities.getMetadataKeys(mlTagMetadata)
      metadataKeys.find(key => key.startsWith(SchemaConstants.ScoreModelPrefix))
    }
  }

} 
Example 22
Source File: SparkWrapper.scala    From tispark   with Apache License 2.0
package com.pingcap.tispark

import org.apache.spark.sql.catalyst.catalog.{CatalogTable, SessionCatalog}
import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, Expression}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias}
import org.apache.spark.sql.types.{DataType, Metadata}

object SparkWrapper {
  def getVersion: String = {
    "SparkWrapper-2.3"
  }

  def newSubqueryAlias(identifier: String, child: LogicalPlan): SubqueryAlias = {
    SubqueryAlias(identifier, child)
  }

  def newAlias(child: Expression, name: String): Alias = {
    Alias(child, name)()
  }

  def newAttributeReference(
      name: String,
      dataType: DataType,
      nullable: Boolean,
      metadata: Metadata): AttributeReference = {
    AttributeReference(name, dataType, nullable, metadata)()
  }

  def callSessionCatalogCreateTable(
      obj: SessionCatalog,
      tableDefinition: CatalogTable,
      ignoreIfExists: Boolean): Unit = {
    obj.createTable(tableDefinition, ignoreIfExists)
  }
} 
Example 23
Source File: SparkWrapper.scala    From tispark   with Apache License 2.0
package com.pingcap.tispark

import org.apache.spark.sql.catalyst.catalog.{CatalogTable, SessionCatalog}
import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, Expression}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias}
import org.apache.spark.sql.types.{DataType, Metadata}

object SparkWrapper {
  def getVersion: String = {
    "SparkWrapper-2.4"
  }

  def newSubqueryAlias(identifier: String, child: LogicalPlan): SubqueryAlias = {
    SubqueryAlias(identifier, child)
  }

  def newAlias(child: Expression, name: String): Alias = {
    Alias(child, name)()
  }

  def newAttributeReference(
      name: String,
      dataType: DataType,
      nullable: Boolean,
      metadata: Metadata): AttributeReference = {
    AttributeReference(name, dataType, nullable, metadata)()
  }

  def callSessionCatalogCreateTable(
      obj: SessionCatalog,
      tableDefinition: CatalogTable,
      ignoreIfExists: Boolean): Unit = {
    obj.createTable(tableDefinition, ignoreIfExists)
  }
} 
Example 24
Source File: MetadataSuite.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.catalyst.util

import org.json4s.jackson.JsonMethods.parse

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{Metadata, MetadataBuilder}

class MetadataSuite extends SparkFunSuite {

  val baseMetadata = new MetadataBuilder()
    .putString("purpose", "ml")
    .putBoolean("isBase", true)
    .build()

  val summary = new MetadataBuilder()
    .putLong("numFeatures", 10L)
    .build()

  val age = new MetadataBuilder()
    .putString("name", "age")
    .putLong("index", 1L)
    .putBoolean("categorical", false)
    .putDouble("average", 45.0)
    .build()

  val gender = new MetadataBuilder()
    .putString("name", "gender")
    .putLong("index", 5)
    .putBoolean("categorical", true)
    .putStringArray("categories", Array("male", "female"))
    .build()

  val metadata = new MetadataBuilder()
    .withMetadata(baseMetadata)
    .putBoolean("isBase", false) // overwrite an existing key
    .putMetadata("summary", summary)
    .putLongArray("long[]", Array(0L, 1L))
    .putDoubleArray("double[]", Array(3.0, 4.0))
    .putBooleanArray("boolean[]", Array(true, false))
    .putMetadataArray("features", Array(age, gender))
    .build()

  test("metadata builder and getters") {
    assert(age.contains("summary") === false)
    assert(age.contains("index") === true)
    assert(age.getLong("index") === 1L)
    assert(age.contains("average") === true)
    assert(age.getDouble("average") === 45.0)
    assert(age.contains("categorical") === true)
    assert(age.getBoolean("categorical") === false)
    assert(age.contains("name") === true)
    assert(age.getString("name") === "age")
    assert(metadata.contains("purpose") === true)
    assert(metadata.getString("purpose") === "ml")
    assert(metadata.contains("isBase") === true)
    assert(metadata.getBoolean("isBase") === false)
    assert(metadata.contains("summary") === true)
    assert(metadata.getMetadata("summary") === summary)
    assert(metadata.contains("long[]") === true)
    assert(metadata.getLongArray("long[]").toSeq === Seq(0L, 1L))
    assert(metadata.contains("double[]") === true)
    assert(metadata.getDoubleArray("double[]").toSeq === Seq(3.0, 4.0))
    assert(metadata.contains("boolean[]") === true)
    assert(metadata.getBooleanArray("boolean[]").toSeq === Seq(true, false))
    assert(gender.contains("categories") === true)
    assert(gender.getStringArray("categories").toSeq === Seq("male", "female"))
    assert(metadata.contains("features") === true)
    assert(metadata.getMetadataArray("features").toSeq === Seq(age, gender))
  }

  test("metadata json conversion") {
    val json = metadata.json
    withClue("toJson must produce a valid JSON string") {
      parse(json)
    }
    val parsed = Metadata.fromJson(json)
    assert(parsed === metadata)
    assert(parsed.## === metadata.##)
  }
} 
Example 25
Source File: TableColumnsParser.scala    From HANAVora-Extensions   with Apache License 2.0
package org.apache.spark.sql.parser

import org.apache.spark.sql.catalyst.AbstractSparkSQLParser
import org.apache.spark.sql.catalyst.util.DataTypeParser
import org.apache.spark.sql.types.{Metadata, MetadataBuilder, StructField}

trait TableColumnsParser
  extends AbstractSparkSQLParser
  with DataTypeParser
  with AnnotationParser {

  protected def commentIndicator: Keyword

  protected lazy val columnName = acceptMatch("column name", {
    case lexical.Identifier(chars) => chars
    case lexical.Keyword(chars) if !sqlReservedWords.contains(chars.toUpperCase) => chars
  })

  
  protected lazy val tableColumns: Parser[Seq[StructField]] =
    "(" ~> repsep(annotatedCol, ",") <~ ")"

  protected lazy val annotatedCol: Parser[StructField] =
    columnName ~ metadata ~ dataType ^^ {
      case name ~ md ~ typ =>
        StructField(name, typ, nullable = true, metadata = toTableMetadata(md))
    } |
    columnName ~ dataType ~ (commentIndicator ~> stringLit).?  ^^ { case name ~ typ ~ cm =>
      val meta = cm match {
        case Some(comment) =>
          new MetadataBuilder().putString(commentIndicator.str.toLowerCase, comment).build()
        case None => Metadata.empty
      }

      StructField(name, typ, nullable = true, meta)
    }
} 
Example 26
Source File: FieldExtractor.scala    From HANAVora-Extensions   with Apache License 2.0
package org.apache.spark.sql.execution.tablefunctions

import org.apache.spark.sql.types.{DataType, Metadata}


object FieldExtractor { // companion object inferred from the factory apply and the closing brace below
  def apply(index: Int,
            tableName: String,
            originalTableName: String,
            name: String,
            originalName: String,
            dataType: DataType,
            metadata: Metadata,
            isNullable: Boolean,
            checkStar: Boolean): FieldExtractor =
    new FieldExtractor(
      index,
      tableName,
      originalTableName,
      name,
      originalName,
      DataTypeExtractor(dataType),
      AnnotationsExtractor(metadata, checkStar),
      isNullable)
} 
Example 27
Source File: AnnotationsExtractor.scala    From HANAVora-Extensions   with Apache License 2.0
package org.apache.spark.sql.execution.tablefunctions

import org.apache.spark.sql.types.{Metadata, MetadataAccessor}


case class AnnotationsExtractor(metadata: Map[String, Any], checkStar: Boolean) { // signature inferred from the companion apply below
  lazy val annotations: Map[String, String] =
    metadata
      .filter {
        case (k, v) if checkStar => k != "*"
        case _ => true
      }
      .mapValues(_.toString)
}

object AnnotationsExtractor {
  def apply(metadata: Metadata, checkStar: Boolean): AnnotationsExtractor =
    AnnotationsExtractor(MetadataAccessor.metadataToMap(metadata), checkStar)
} 
Example 28
Source File: MetadataSuite.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.sql.catalyst.util

import org.json4s.jackson.JsonMethods.parse

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{Metadata, MetadataBuilder}

class MetadataSuite extends SparkFunSuite {

  val baseMetadata = new MetadataBuilder()
    .putString("purpose", "ml")
    .putBoolean("isBase", true)
    .build()

  val summary = new MetadataBuilder()
    .putLong("numFeatures", 10L)
    .build()

  val age = new MetadataBuilder()
    .putString("name", "age")
    .putLong("index", 1L)
    .putBoolean("categorical", false)
    .putDouble("average", 45.0)
    .build()

  val gender = new MetadataBuilder()
    .putString("name", "gender")
    .putLong("index", 5)
    .putBoolean("categorical", true)
    .putStringArray("categories", Array("male", "female"))
    .build()

  val metadata = new MetadataBuilder()
    .withMetadata(baseMetadata)
    .putBoolean("isBase", false) // overwrite an existing key
    .putMetadata("summary", summary)
    .putLongArray("long[]", Array(0L, 1L))
    .putDoubleArray("double[]", Array(3.0, 4.0))
    .putBooleanArray("boolean[]", Array(true, false))
    .putMetadataArray("features", Array(age, gender))
    .build()

  test("metadata builder and getters") {
    assert(age.contains("summary") === false)
    assert(age.contains("index") === true)
    assert(age.getLong("index") === 1L)
    assert(age.contains("average") === true)
    assert(age.getDouble("average") === 45.0)
    assert(age.contains("categorical") === true)
    assert(age.getBoolean("categorical") === false)
    assert(age.contains("name") === true)
    assert(age.getString("name") === "age")
    assert(metadata.contains("purpose") === true)
    assert(metadata.getString("purpose") === "ml")
    assert(metadata.contains("isBase") === true)
    assert(metadata.getBoolean("isBase") === false)
    assert(metadata.contains("summary") === true)
    assert(metadata.getMetadata("summary") === summary)
    assert(metadata.contains("long[]") === true)
    assert(metadata.getLongArray("long[]").toSeq === Seq(0L, 1L))
    assert(metadata.contains("double[]") === true)
    assert(metadata.getDoubleArray("double[]").toSeq === Seq(3.0, 4.0))
    assert(metadata.contains("boolean[]") === true)
    assert(metadata.getBooleanArray("boolean[]").toSeq === Seq(true, false))
    assert(gender.contains("categories") === true)
    assert(gender.getStringArray("categories").toSeq === Seq("male", "female"))
    assert(metadata.contains("features") === true)
    assert(metadata.getMetadataArray("features").toSeq === Seq(age, gender))
  }

  test("metadata json conversion") {
    val json = metadata.json
    withClue("toJson must produce a valid JSON string") {
      parse(json)
    }
    val parsed = Metadata.fromJson(json)
    assert(parsed === metadata)
    assert(parsed.## === metadata.##)
  }
} 
Example 29
Source File: ExpressionHelper.scala    From carbondata   with Apache License 2.0
package org.apache.carbondata.mv.plans.modular

import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, ExprId, Expression, NamedExpression}
import org.apache.spark.sql.types.{DataType, Metadata}

object ExpressionHelper {

  def createReference(
     name: String,
     dataType: DataType,
     nullable: Boolean,
     metadata: Metadata,
     exprId: ExprId,
     qualifier: Option[String],
     attrRef : NamedExpression = null): AttributeReference = {
    AttributeReference(name, dataType, nullable, metadata)(exprId, qualifier)
  }

  def createAlias(
       child: Expression,
       name: String,
       exprId: ExprId = NamedExpression.newExprId,
       qualifier: Option[String] = None,
       explicitMetadata: Option[Metadata] = None,
       namedExpr : Option[NamedExpression] = None ) : Alias = {
    Alias(child, name)(exprId, qualifier, explicitMetadata)
  }

  def getTheLastQualifier(reference: AttributeReference): String = {
    reference.qualifier.head
  }

} 
Example 30
Source File: ExpressionHelper.scala    From carbondata   with Apache License 2.0
package org.apache.carbondata.mv.plans.modular

import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, ExprId, Expression, NamedExpression}
import org.apache.spark.sql.types.{DataType, Metadata}

object ExpressionHelper {

  def createReference(
     name: String,
     dataType: DataType,
     nullable: Boolean,
     metadata: Metadata,
     exprId: ExprId,
     qualifier: Option[String],
     attrRef : NamedExpression = null): AttributeReference = {
    val qf = if (qualifier.nonEmpty) Seq(qualifier.get) else Seq.empty
    AttributeReference(name, dataType, nullable, metadata)(exprId, qf)
  }

  def createAlias(
      child: Expression,
      name: String,
      exprId: ExprId,
      qualifier: Option[String]) : Alias = {
    val qf = if (qualifier.nonEmpty) Seq(qualifier.get) else Seq.empty
    Alias(child, name)(exprId, qf, None)
  }

  def getTheLastQualifier(reference: AttributeReference): String = {
    reference.qualifier.reverse.head
  }

} 
Example 31
Source File: MetadataSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.catalyst.util

import org.json4s.jackson.JsonMethods.parse

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{Metadata, MetadataBuilder}

class MetadataSuite extends SparkFunSuite {

  val baseMetadata = new MetadataBuilder()
    .putString("purpose", "ml")
    .putBoolean("isBase", true)
    .build()

  val summary = new MetadataBuilder()
    .putLong("numFeatures", 10L)
    .build()

  val age = new MetadataBuilder()
    .putString("name", "age")
    .putLong("index", 1L)
    .putBoolean("categorical", false)
    .putDouble("average", 45.0)
    .build()

  val gender = new MetadataBuilder()
    .putString("name", "gender")
    .putLong("index", 5)
    .putBoolean("categorical", true)
    .putStringArray("categories", Array("male", "female"))
    .build()

  val metadata = new MetadataBuilder()
    .withMetadata(baseMetadata)
    .putBoolean("isBase", false) // overwrite an existing key
    .putMetadata("summary", summary)
    .putLongArray("long[]", Array(0L, 1L))
    .putDoubleArray("double[]", Array(3.0, 4.0))
    .putBooleanArray("boolean[]", Array(true, false))
    .putMetadataArray("features", Array(age, gender))
    .build()

  test("metadata builder and getters") {
    assert(age.contains("summary") === false)
    assert(age.contains("index") === true)
    assert(age.getLong("index") === 1L)
    assert(age.contains("average") === true)
    assert(age.getDouble("average") === 45.0)
    assert(age.contains("categorical") === true)
    assert(age.getBoolean("categorical") === false)
    assert(age.contains("name") === true)
    assert(age.getString("name") === "age")
    assert(metadata.contains("purpose") === true)
    assert(metadata.getString("purpose") === "ml")
    assert(metadata.contains("isBase") === true)
    assert(metadata.getBoolean("isBase") === false)
    assert(metadata.contains("summary") === true)
    assert(metadata.getMetadata("summary") === summary)
    assert(metadata.contains("long[]") === true)
    assert(metadata.getLongArray("long[]").toSeq === Seq(0L, 1L))
    assert(metadata.contains("double[]") === true)
    assert(metadata.getDoubleArray("double[]").toSeq === Seq(3.0, 4.0))
    assert(metadata.contains("boolean[]") === true)
    assert(metadata.getBooleanArray("boolean[]").toSeq === Seq(true, false))
    assert(gender.contains("categories") === true)
    assert(gender.getStringArray("categories").toSeq === Seq("male", "female"))
    assert(metadata.contains("features") === true)
    assert(metadata.getMetadataArray("features").toSeq === Seq(age, gender))
  }

  test("metadata json conversion") {
    val json = metadata.json
    withClue("toJson must produce a valid JSON string") {
      parse(json)
    }
    val parsed = Metadata.fromJson(json)
    assert(parsed === metadata)
    assert(parsed.## === metadata.##)
  }
} 
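Beyond the builder round trips exercised above, Metadata typically travels with a DataFrame by being attached to a StructField. A minimal sketch using only the standard Spark API (the field name and keys are illustrative):

import org.apache.spark.sql.types.{DoubleType, MetadataBuilder, StructField, StructType}

val ageMeta = new MetadataBuilder()
  .putString("description", "age in years")
  .putDouble("average", 45.0)
  .build()

// Attach the metadata to a field; it is carried along with the schema.
val schema = StructType(Seq(StructField("age", DoubleType, nullable = true, metadata = ageMeta)))

// Read it back from the schema.
val restored = schema("age").metadata
assert(restored.getString("description") == "age in years")
assert(restored.getDouble("average") == 45.0)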
Example 32
Source File: GBTClassifierSmokeTest.scala    From seahorse   with Apache License 2.0 5 votes vote down vote up
package ai.deepsense.deeplang.doperables.spark.wrappers.estimators

import org.apache.spark.sql.types.{DoubleType, Metadata, StructType}

import ai.deepsense.deeplang.doperables.dataframe.DataFrame
import ai.deepsense.deeplang.doperables.spark.wrappers.params.common.ClassificationImpurity
import ai.deepsense.deeplang.params.ParamPair
import ai.deepsense.deeplang.params.selections.NameSingleColumnSelection
import ai.deepsense.deeplang.utils.DataFrameUtils

class GBTClassifierSmokeTest
  extends AbstractEstimatorModelWrapperSmokeTest {

  override def className: String = "GBTClassifier"

  override val estimator = new GBTClassifier()

  private val labelColumnName = "myRating"

  import estimator.vanillaGBTClassifier._

  override val estimatorParams: Seq[ParamPair[_]] = Seq(
    featuresColumn -> NameSingleColumnSelection("myFeatures"),
    impurity -> ClassificationImpurity.Entropy(),
    labelColumn -> NameSingleColumnSelection(labelColumnName),
    lossType -> GBTClassifier.Logistic(),
    maxBins -> 2.0,
    maxDepth -> 6.0,
    maxIterations -> 10.0,
    minInfoGain -> 0.0,
    minInstancesPerNode -> 1,
    predictionColumn -> "prediction",
    seed -> 100.0,
    stepSize -> 0.11,
    subsamplingRate -> 0.999
  )

  override def assertTransformedDF(dataFrame: DataFrame): Unit = {
    val possibleValues = DataFrameUtils.collectValues(dataFrame, labelColumnName)
    val actualValues = DataFrameUtils.collectValues(dataFrame, "prediction")

    actualValues.diff(possibleValues) shouldBe empty
  }

  override def assertTransformedSchema(schema: StructType): Unit = {
    val predictionColumn = schema.fields.last
    predictionColumn.name shouldBe "prediction"
    predictionColumn.dataType shouldBe DoubleType
    predictionColumn.metadata shouldBe Metadata.empty
  }
} 
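The final assertion relies on StructField defaulting its metadata to Metadata.empty when nothing attaches attributes to the column. A quick illustration with plain Spark types:

import org.apache.spark.sql.types.{DoubleType, Metadata, StructField}

// StructField's metadata parameter defaults to Metadata.empty,
// so a freshly built prediction column compares equal to it.
val prediction = StructField("prediction", DoubleType, nullable = true)
assert(prediction.metadata == Metadata.empty)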
Example 33
Source File: MinVarianceFilterMetadata.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
package com.salesforce.op.stages.impl.preparators

import com.salesforce.op.stages.impl.MetadataLike
import com.salesforce.op.utils.spark.RichMetadata._
import org.apache.spark.sql.types.{Metadata, MetadataBuilder}

import scala.util.{Failure, Success, Try}



// Note: the enclosing object declaration was stripped in this excerpt; it is restored
// here so the closing brace below balances. The full source also defines the
// MinVarianceSummary case class and the statisticsFromMetadata helper referenced below.
object MinVarianceSummary {

  def fromMetadata(meta: Metadata): MinVarianceSummary = {
    val wrapped = meta.wrapped
    Try {
      MinVarianceSummary(
        dropped = wrapped.getArray[String](MinVarianceNames.Dropped).toSeq,
        featuresStatistics = statisticsFromMetadata(wrapped.get[Metadata](MinVarianceNames.FeaturesStatistics)),
        names = wrapped.getArray[String](MinVarianceNames.Names).toSeq
      )
    } match {
      case Success(summary) => summary
      case Failure(_) => throw new IllegalArgumentException(s"failed to parse MinVarianceSummary from $meta")
    }
  }
} 
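The wrapped accessors above come from TransmogrifAI's RichMetadata; with the plain Spark Metadata API the same values would be read back roughly as below. The method and parameter names are illustrative, and the key strings are whatever the MinVarianceNames constants resolve to:

import org.apache.spark.sql.types.Metadata

// Plain-Spark equivalents of the wrapped getters used in fromMetadata.
def droppedColumns(meta: Metadata, droppedKey: String): Seq[String] =
  meta.getStringArray(droppedKey).toSeq

def statisticsMetadata(meta: Metadata, statsKey: String): Metadata =
  meta.getMetadata(statsKey)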
Example 34
Source File: VectorsCombinerTest.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
package com.salesforce.op.stages.impl.feature

import com.salesforce.op._
import com.salesforce.op.features.TransientFeature
import com.salesforce.op.features.types.{Text, _}
import com.salesforce.op.stages.base.sequence.SequenceModel
import com.salesforce.op.test.{OpEstimatorSpec, PassengerSparkFixtureTest, TestFeatureBuilder}
import com.salesforce.op.utils.spark.OpVectorMetadata
import com.salesforce.op.utils.spark.RichMetadata._
import org.apache.spark.ml.attribute.MetadataHelper
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.types.Metadata
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner


@RunWith(classOf[JUnitRunner])
class VectorsCombinerTest
  extends OpEstimatorSpec[OPVector, SequenceModel[OPVector, OPVector], VectorsCombiner]
    with PassengerSparkFixtureTest {

  override def specName: String = classOf[VectorsCombiner].getSimpleName

  val (inputData, f1, f2) = TestFeatureBuilder(Seq(
    Vectors.sparse(4, Array(0, 3), Array(1.0, 1.0)).toOPVector ->
      Vectors.sparse(4, Array(0, 3), Array(2.0, 3.0)).toOPVector,
    Vectors.dense(Array(2.0, 3.0, 4.0)).toOPVector ->
      Vectors.dense(Array(12.0, 13.0, 14.0)).toOPVector,
    // Purposely added some very large sparse vectors to verify the efficiency
    Vectors.sparse(100000000, Array(1), Array(777.0)).toOPVector ->
      Vectors.sparse(500000000, Array(0), Array(888.0)).toOPVector
  ))

  val estimator = new VectorsCombiner().setInput(f1, f2)

  val expectedResult = Seq(
    Vectors.sparse(8, Array(0, 3, 4, 7), Array(1.0, 1.0, 2.0, 3.0)).toOPVector,
    Vectors.dense(Array(2.0, 3.0, 4.0, 12.0, 13.0, 14.0)).toOPVector,
    Vectors.sparse(600000000, Array(1, 100000000), Array(777.0, 888.0)).toOPVector
  )

  it should "combine metadata correctly" in {
    val vector = Seq(height, description, stringMap).transmogrify()
    val inputs = vector.parents
    val outputData = new OpWorkflow().setReader(dataReader)
      .setResultFeatures(vector, inputs(0), inputs(1), inputs(2))
      .train().score()
    val inputMetadata = OpVectorMetadata.flatten(vector.name,
      inputs.map(i => OpVectorMetadata(outputData.schema(i.name))))
    OpVectorMetadata(outputData.schema(vector.name)).columns should contain theSameElementsAs inputMetadata.columns
  }

  it should "create metadata correctly" in {
    val descVect = description.map[Text] { t =>
      Text(t.value match {
        case Some(text) => "this is dumb " + text
        case None => "some STUFF to tokenize"
      })
    }.tokenize().tf(numTerms = 5)
    val vector = Seq(height, stringMap, descVect).transmogrify()
    val Seq(inputs1, inputs2, inputs3) = vector.parents

    val outputData = new OpWorkflow().setReader(dataReader)
      .setResultFeatures(vector, inputs1, inputs2, inputs3)
      .train().score()
    outputData.schema(inputs1.name).metadata.wrapped
      .get[Metadata](MetadataHelper.attributeKeys.ML_ATTR)
      .getLong(MetadataHelper.attributeKeys.NUM_ATTRIBUTES) shouldBe 5

    val inputMetadata = OpVectorMetadata.flatten(vector.name,
      Array(TransientFeature(inputs1).toVectorMetaData(5, Option(inputs1.name)),
        OpVectorMetadata(outputData.schema(inputs2.name)), OpVectorMetadata(outputData.schema(inputs3.name))))
    OpVectorMetadata(outputData.schema(vector.name)).columns should contain theSameElementsAs inputMetadata.columns
  }
} 
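The attribute count check above goes through a TransmogrifAI shim (MetadataHelper); with stock Spark ML the same information can usually be read from a vector column's metadata via AttributeGroup. A small sketch (the helper name is made up):

import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.sql.DataFrame

// Reads the ML attribute group stored in a vector column's metadata, if any.
def numVectorAttributes(df: DataFrame, column: String): Option[Int] =
  AttributeGroup.fromStructField(df.schema(column)).numAttributes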
Example 35
Source File: MinVarianceFilterMetadataTest.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
package com.salesforce.op.stages.impl.preparators

import com.salesforce.op.stages.impl.preparators.MinVarianceSummary.statisticsFromMetadata
import com.salesforce.op.test.TestSparkContext
import com.salesforce.op.utils.spark.RichMetadata._
import org.apache.spark.sql.types.Metadata
import org.junit.runner.RunWith
import org.scalatest.FlatSpec
import org.scalatest.junit.JUnitRunner


@RunWith(classOf[JUnitRunner])
class MinVarianceFilterMetadataTest extends FlatSpec with TestSparkContext {

  val summary = MinVarianceSummary(
    dropped = Seq("f1"),
    featuresStatistics = SummaryStatistics(3, 0.01, Seq(0.1, 0.2, 0.3), Seq(0.1, 0.2, 0.3),
      Seq(0.1, 0.2, 0.3), Seq(0.1, 0.2, 0.3)),
    names = Seq("f1", "f2", "f3")
  )

  Spec[MinVarianceSummary] should "convert to and from metadata correctly" in {
    val meta = summary.toMetadata()
    meta.isInstanceOf[Metadata] shouldBe true

    val retrieved = MinVarianceSummary.fromMetadata(meta)
    retrieved.isInstanceOf[MinVarianceSummary] shouldBe true

    retrieved.dropped should contain theSameElementsAs summary.dropped
    retrieved.featuresStatistics.count shouldBe summary.featuresStatistics.count
    retrieved.featuresStatistics.max should contain theSameElementsAs summary.featuresStatistics.max
    retrieved.featuresStatistics.min should contain theSameElementsAs summary.featuresStatistics.min
    retrieved.featuresStatistics.mean should contain theSameElementsAs summary.featuresStatistics.mean
    retrieved.featuresStatistics.variance should contain theSameElementsAs summary.featuresStatistics.variance
    retrieved.names should contain theSameElementsAs summary.names
  }

  it should "convert to and from JSON and give the same values" in {
    val meta = summary.toMetadata()
    val json = meta.wrapped.prettyJson
    val recovered = Metadata.fromJson(json).wrapped
    val dropped = recovered.getArray[String](MinVarianceNames.Dropped).toSeq
    val stats = statisticsFromMetadata(recovered.get[Metadata](MinVarianceNames.FeaturesStatistics))
    val names = recovered.getArray[String](MinVarianceNames.Names).toSeq

    dropped should contain theSameElementsAs summary.dropped
    stats.count shouldBe summary.featuresStatistics.count
    stats.max should contain theSameElementsAs summary.featuresStatistics.max
    stats.min should contain theSameElementsAs summary.featuresStatistics.min
    stats.mean should contain theSameElementsAs summary.featuresStatistics.mean
    stats.variance should contain theSameElementsAs summary.featuresStatistics.variance
    names should contain theSameElementsAs summary.names
  }

} 
Example 36
Source File: OpPipelineStageReaderWriterTest.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
package com.salesforce.op.stages

import com.salesforce.op.features._
import com.salesforce.op.features.types._
import com.salesforce.op.stages.OpPipelineStageReaderWriter._
import com.salesforce.op.test.PassengerSparkFixtureTest
import com.salesforce.op.utils.reflection.ReflectionUtils
import com.salesforce.op.utils.spark.RichDataset._
import org.apache.spark.ml.{Model, Transformer}
import org.apache.spark.sql.types.{DataType, Metadata, MetadataBuilder}
import org.json4s.JsonAST.JValue
import org.json4s.jackson.JsonMethods.{compact, parse, pretty, render}
import org.json4s.{JArray, JObject}
import org.scalatest.FlatSpec
import org.slf4j.LoggerFactory


// TODO: consider adding a read/write test for a spark wrapped stage as well
private[stages] abstract class OpPipelineStageReaderWriterTest
  extends FlatSpec with PassengerSparkFixtureTest {

  val meta = new MetadataBuilder().putString("foo", "bar").build()
  val expectedFeaturesLength = 1
  def stage: OpPipelineStageBase with Transformer
  val expected: Array[Real]
  val hasOutputName = true

  private val log = LoggerFactory.getLogger(this.getClass)
  private lazy val savePath = tempDir + "/" + this.getClass.getSimpleName + "-" + System.currentTimeMillis()
  private lazy val writer = new OpPipelineStageWriter(stage)
  private lazy val stageJsonString: String = writer.writeToJsonString(savePath)
  private lazy val stageJson: JValue = parse(stageJsonString)
  private lazy val isModel = stage.isInstanceOf[Model[_]]
  private val FN = FieldNames

  Spec(this.getClass) should "write stage uid" in {
    log.info(pretty(stageJson))
    (stageJson \ FN.Uid.entryName).extract[String] shouldBe stage.uid
  }
  it should "write class name" in {
    (stageJson \ FN.Class.entryName).extract[String] shouldBe stage.getClass.getName
  }
  it should "write params map" in {
    val params = extractParams(stageJson).extract[Map[String, Any]]
    if (hasOutputName) {
      params should have size 4
      params.keys shouldBe Set("inputFeatures", "outputMetadata", "inputSchema", "outputFeatureName")
    } else {
      params should have size 3
      params.keys shouldBe Set("inputFeatures", "outputMetadata", "inputSchema")
    }
  }
  it should "write outputMetadata" in {
    val metadataStr = compact(render(extractParams(stageJson) \ "outputMetadata"))
    val metadata = Metadata.fromJson(metadataStr)
    metadata shouldBe stage.getMetadata()
  }
  it should "write inputSchema" in {
    val schemaStr = compact(render(extractParams(stageJson) \ "inputSchema"))
    val schema = DataType.fromJson(schemaStr)
    schema shouldBe stage.getInputSchema()
  }
  it should "write input features" in {
    val jArray = (extractParams(stageJson) \ "inputFeatures").extract[JArray]
    jArray.values should have length expectedFeaturesLength
    val obj = jArray(0).extract[JObject]
    obj.values.keys shouldBe Set("name", "isResponse", "isRaw", "uid", "typeName", "stages", "originFeatures")
  }
  it should "write model ctor args" in {
    if (stage.isInstanceOf[Model[_]]) {
      val ctorArgs = (stageJson \ FN.CtorArgs.entryName).extract[JObject]
      val (_, args) = ReflectionUtils.bestCtorWithArgs(stage)
      ctorArgs.values.keys shouldBe args.map(_._1).toSet
    }
  }
  it should "load stage correctly" in {
    val reader = new OpPipelineStageReader(stage)
    val stageLoaded = reader.loadFromJsonString(stageJsonString, path = savePath)
    stageLoaded shouldBe a[OpPipelineStageBase]
    stageLoaded shouldBe a[Transformer]
    stageLoaded.getOutput() shouldBe a[FeatureLike[_]]
    val _ = stage.asInstanceOf[Transformer].transform(passengersDataSet)
    val transformed = stageLoaded.asInstanceOf[Transformer].transform(passengersDataSet)
    transformed.collect(stageLoaded.getOutput().asInstanceOf[FeatureLike[Real]]) shouldBe expected
    stageLoaded.uid shouldBe stage.uid
    stageLoaded.operationName shouldBe stage.operationName
    stageLoaded.getInputFeatures() shouldBe stage.getInputFeatures()
    stageLoaded.getInputSchema() shouldBe stage.getInputSchema()
  }

  private def extractParams(stageJson: JValue): JValue = {
    val defaultParamsMap = stageJson \ FN.DefaultParamMap.entryName
    val paramsMap = stageJson \ FN.ParamMap.entryName
    defaultParamsMap.merge(paramsMap)
  }

} 
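The outputMetadata and inputSchema checks lean on the fact that both Metadata and StructType round-trip through their JSON form. A minimal sketch with the standard API:

import org.apache.spark.sql.types.{DataType, Metadata, MetadataBuilder, StringType, StructField, StructType}

val meta = new MetadataBuilder().putString("foo", "bar").build()
val schema = StructType(Seq(StructField("name", StringType, nullable = true, metadata = meta)))

// Both types expose a json string and a matching fromJson parser.
assert(Metadata.fromJson(meta.json) == meta)
assert(DataType.fromJson(schema.json).asInstanceOf[StructType] == schema)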
Example 37
Source File: MetadataIteratorSpec.scala    From jgit-spark-connector   with Apache License 2.0 5 votes vote down vote up
package tech.sourced.engine.iterator

import java.nio.file.Paths
import java.util.{Properties, UUID}

import org.apache.commons.io.FileUtils
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.types.{Metadata, StringType, StructType}
import org.scalatest.{BeforeAndAfterAll, FlatSpec, Matchers}
import tech.sourced.engine.{BaseSparkSpec, Schema}

class JDBCQueryIteratorSpec
  extends FlatSpec with Matchers with BeforeAndAfterAll with BaseSparkSpec {
  private val tmpPath = Paths.get(
    System.getProperty("java.io.tmpdir"),
    UUID.randomUUID.toString
  )

  private val dbPath = tmpPath.resolve("test.db")

  override def beforeAll(): Unit = {
    super.beforeAll()
    tmpPath.toFile.mkdir()
    val rdd = ss.sparkContext.parallelize(Seq(
      Row("id1"),
      Row("id2"),
      Row("id3")
    ))

    val properties = new Properties()
    properties.put("driver", "org.sqlite.JDBC")
    val df = ss.createDataFrame(rdd, StructType(Seq(Schema.repositories.head)))
    df.write.jdbc(s"jdbc:sqlite:${dbPath.toString}", "repositories", properties)
  }

  override def afterAll(): Unit = {
    super.afterAll()
    FileUtils.deleteQuietly(tmpPath.toFile)
  }

  "JDBCQueryIterator" should "return all rows for the query" in {
    val iter = new JDBCQueryIterator(
      Seq(attr("id")),
      dbPath.toString,
      "SELECT id FROM repositories ORDER BY id"
    )

    // calling hasNext more than one time does not cause rows to be lost
    iter.hasNext
    iter.hasNext
    val rows = (for (row <- iter) yield row).toArray
    rows.length should be(3)
    rows(0).length should be(1)
    rows(0)(0).toString should be("id1")
    rows(1)(0).toString should be("id2")
    rows(2)(0).toString should be("id3")
  }

  private def attr(name: String): Attribute = AttributeReference(
    name, StringType, nullable = false, Metadata.empty
  )()
}
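For completeness, the same attr helper pattern can carry non-empty field metadata as well; a small illustrative variant (the helper name and metadata key are made up):

import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.types.{Metadata, MetadataBuilder, StringType}

// Same shape as the attr helper above, but with caller-supplied metadata.
def attrWithMeta(name: String, meta: Metadata): Attribute =
  AttributeReference(name, StringType, nullable = false, meta)()

val idAttr = attrWithMeta("id", new MetadataBuilder().putString("source", "repositories").build())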