org.apache.spark.sql.Column Scala Examples

The following examples show how to use org.apache.spark.sql.Column. Each example is an excerpt from an open source project; the source file, project, and license are noted in the header above the code.
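Before diving into the project excerpts, here is a minimal, self-contained sketch of the most common ways to obtain and combine Column values (the data and column names are made up):

import org.apache.spark.sql.{Column, SparkSession}
import org.apache.spark.sql.functions.{col, lit}

object ColumnBasics {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("column-basics").getOrCreate()
    import spark.implicits._

    val df = Seq(("foo", 30), ("bar", 45)).toDF("name", "age")

    // Equivalent ways to reference a column
    val byFunction: Column = col("age")
    val byDataFrame: Column = df("age")
    val byInterpolator: Column = $"age"

    // Columns compose into expressions lazily; nothing runs until an action is called
    val agePlusTen: Column = (byFunction + lit(10)).as("age_plus_ten")

    df.select(df("name"), agePlusTen)
      .filter($"age" > 40)
      .show()

    spark.stop()
  }
}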
Example 1
Source File: Cleaner.scala    From cleanframes   with Apache License 2.0
package cleanframes

import org.apache.spark.sql.{Column, DataFrame, functions}
import shapeless.labelled.FieldType
import shapeless.{::, HList, HNil, LabelledGeneric, Lazy, Witness}

trait Cleaner[A] {
  def clean(frame: DataFrame, name: Option[String], alias: Option[String]): List[Column]
}

object Cleaner {
  def apply[A](frame: DataFrame, name: Option[String], alias: Option[String])(implicit env: Cleaner[A]): DataFrame = {
    frame.select(
      env.clean(frame, name, alias): _*
    )
  }

  def materialize[A](func: (DataFrame, Option[String], Option[String]) => List[Column]): Cleaner[A] = new Cleaner[A] {
    override def clean(frame: DataFrame, name: Option[String], alias: Option[String]): List[Column] = func(frame, name, alias)
  }

  implicit val hnilCleaner: Cleaner[HNil] = materialize((_, _, _) => Nil)

  implicit def genericObjectCleaner[A, H <: HList](implicit
                                                   gen: LabelledGeneric.Aux[A, H],
                                                   hCleaner: Lazy[Cleaner[H]]): Cleaner[A] =
    materialize((frame, name, alias) => {
      val structColumn = functions.struct(
        hCleaner.value.clean(frame, name, alias): _*
      )

      List(
        alias
          .map(structColumn.as)
          .getOrElse(structColumn)
      )
    })

  implicit def hlistObjectCleaner[K <: Symbol, H, T <: HList](implicit
                                                              witness: Witness.Aux[K],
                                                              hCleaner: Lazy[Cleaner[H]],
                                                              tCleaner: Cleaner[T]): Cleaner[FieldType[K, H] :: T] = {
    val fieldName: String = witness.value.name

    materialize { (frame, name, alias) =>

      val columnName = alias match {
        case None |
             Some(`reserved_root_level_alias`) => fieldName
        case Some(alias) => s"$alias.$fieldName"
      }

      val hColumns = hCleaner.value.clean(frame, Some(columnName), alias = Some(fieldName))
      val tColumns = tCleaner.clean(frame, name, alias)
      hColumns ::: tColumns
    }
  }
} 
Example 2
Source File: HiveAcidRelation.scala    From spark-acid   with Apache License 2.0
package com.qubole.spark.hiveacid.datasource

import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Column, DataFrame, Row, SQLContext, SparkSession}
import org.apache.spark.sql.sources.{BaseRelation, Filter, InsertableRelation, PrunedFilteredScan}
import org.apache.spark.sql.types._
import com.qubole.spark.hiveacid.{HiveAcidErrors, HiveAcidTable, SparkAcidConf}
import com.qubole.spark.hiveacid.hive.HiveAcidMetadata
import com.qubole.spark.hiveacid.merge.{MergeWhenClause, MergeWhenNotInsert}
import org.apache.spark.sql.catalyst.AliasIdentifier
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan

import collection.JavaConversions._


case class HiveAcidRelation(sparkSession: SparkSession,
                            fullyQualifiedTableName: String,
                            parameters: Map[String, String])
    extends BaseRelation
    with InsertableRelation
    with PrunedFilteredScan
    with Logging {

  private val hiveAcidMetadata: HiveAcidMetadata = HiveAcidMetadata.fromSparkSession(
    sparkSession,
    fullyQualifiedTableName
  )
  private val hiveAcidTable: HiveAcidTable = new HiveAcidTable(sparkSession,
    hiveAcidMetadata, parameters)

  private val readOptions = SparkAcidConf(sparkSession, parameters)

  override def sqlContext: SQLContext = sparkSession.sqlContext

  override val schema: StructType = if (readOptions.includeRowIds) {
    hiveAcidMetadata.tableSchemaWithRowId
  } else {
    hiveAcidMetadata.tableSchema
  }

  override def insert(data: DataFrame, overwrite: Boolean): Unit = {
   // Handles SQL INSERT INTO and INSERT OVERWRITE
    if (overwrite) {
      hiveAcidTable.insertOverwrite(data)
    } else {
      hiveAcidTable.insertInto(data)
    }
  }

  def update(condition: Option[Column], newValues: Map[String, Column]): Unit = {
    hiveAcidTable.update(condition, newValues)
  }

  def delete(condition: Column): Unit = {
    hiveAcidTable.delete(condition)
  }
  override def sizeInBytes: Long = {
    val compressionFactor = sparkSession.sessionState.conf.fileCompressionFactor
    (sparkSession.sessionState.conf.defaultSizeInBytes * compressionFactor).toLong
  }

  def merge(sourceDf: DataFrame,
            mergeExpression: Expression,
            matchedClause: Seq[MergeWhenClause],
            notMatched: Option[MergeWhenNotInsert],
            sourceAlias: Option[AliasIdentifier],
            targetAlias: Option[AliasIdentifier]): Unit = {
    hiveAcidTable.merge(sourceDf, mergeExpression, matchedClause,
      notMatched, sourceAlias, targetAlias)
  }

  def getHiveAcidTable(): HiveAcidTable = {
    hiveAcidTable
  }

  // FIXME: should this be true or false? The recommendation seems to
  //  be to leave it as true.
  override val needConversion: Boolean = false

  override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = {
    val readOptions = SparkAcidConf(sparkSession, parameters)
    // sql "select *"
    hiveAcidTable.getRdd(requiredColumns, filters, readOptions)
  }
} 
Example 3
Source File: FunctionalDependencyConstraint.scala    From drunken-data-quality   with Apache License 2.0
package de.frosner.ddq.constraints

import org.apache.spark.sql.{Column, DataFrame}

import scala.util.Try

case class FunctionalDependencyConstraint(determinantSet: Seq[String],
                                          dependentSet: Seq[String]) extends Constraint {

  require(determinantSet.nonEmpty, "determinantSet must not be empty")
  require(dependentSet.nonEmpty, "dependentSet must not be empty")

  val fun = (df: DataFrame) => {
    val determinantColumns = determinantSet.map(columnName => new Column(columnName))
    val dependentColumns = dependentSet.map(columnName => new Column(columnName))
    val maybeRelevantSelection = Try(df.select(determinantColumns ++ dependentColumns: _*))

    val maybeDeterminantValueCounts = maybeRelevantSelection.map(_.distinct.groupBy(determinantColumns: _*).count)
    val maybeViolatingDeterminantValuesCount = maybeDeterminantValueCounts.map(_.filter(new Column("count") =!= 1).count)
    FunctionalDependencyConstraintResult(
      constraint = this,
      data = maybeViolatingDeterminantValuesCount.toOption.map(FunctionalDependencyConstraintResultData),
      status = ConstraintUtil.tryToStatus[Long](maybeViolatingDeterminantValuesCount, _ == 0)
    )
  }

}

case class FunctionalDependencyConstraintResult(constraint: FunctionalDependencyConstraint,
                                                data: Option[FunctionalDependencyConstraintResultData],
                                                status: ConstraintStatus) extends ConstraintResult[FunctionalDependencyConstraint] {

  val message: String = {
    val maybeFailedRows = data.map(_.failedRows)
    val maybeRowPluralS = maybeFailedRows.map(failedRows => if (failedRows == 1) "" else "s")
    val dependentSet = constraint.dependentSet
    val determinantString = s"${constraint.determinantSet.mkString(", ")}"
    val dependentString = s"${dependentSet.mkString(", ")}"
    val (columnPluralS, columnVerb) = if (dependentSet.size == 1) ("", "is") else ("s", "are")
    (status, maybeFailedRows, maybeRowPluralS) match {
      case (ConstraintSuccess, Some(0), _) =>
        s"Column$columnPluralS $dependentString $columnVerb functionally dependent on $determinantString."
      case (ConstraintFailure, Some(failedRows), Some(rowPluralS)) =>
        s"Column$columnPluralS $dependentString $columnVerb not functionally dependent on " +
        s"$determinantString ($failedRows violating determinant value$rowPluralS)."
      case (ConstraintError(throwable), None, None) =>
        s"Checking whether column$columnPluralS $dependentString $columnVerb functionally " +
          s"dependent on $determinantString failed: $throwable"
      case default => throw IllegalConstraintResultException(this)
    }
  }

}

case class FunctionalDependencyConstraintResultData(failedRows: Long) 
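The constraint above exposes its check as a plain function from DataFrame to result, so it can be exercised directly. A rough usage sketch (the DataFrame and column names are hypothetical; in the project itself constraints are normally composed through the Check builder, as in the NumberOfRowsConstraint test further below):

import de.frosner.ddq.constraints.FunctionalDependencyConstraint
import org.apache.spark.sql.DataFrame

// Checks whether `city` is functionally determined by `zip_code` in the given DataFrame.
def checkZipDeterminesCity(df: DataFrame): Unit = {
  val constraint = FunctionalDependencyConstraint(Seq("zip_code"), Seq("city"))
  val result = constraint.fun(df) // runs the check defined above
  println(result.message)
}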
Example 4
Source File: UpdateCommand.scala    From spark-acid   with Apache License 2.0
package com.qubole.spark.datasources.hiveacid.sql.catalyst.plans.command

import com.qubole.spark.hiveacid.HiveAcidErrors
import com.qubole.spark.hiveacid.datasource.HiveAcidRelation
import org.apache.spark.sql.{Column, Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.execution.datasources.LogicalRelation

case class UpdateCommand(
    table: LogicalPlan,
    setExpressions: Map[String, Expression],
    condition: Option[Expression])
  extends RunnableCommand {

  override def children: Seq[LogicalPlan] = Seq(table)
  override def output: Seq[Attribute] = Seq.empty
  override lazy val resolved: Boolean = childrenResolved

  override def run(sparkSession: SparkSession): Seq[Row] = {
    if (children.size != 1) {
      throw new IllegalArgumentException("UPDATE command should have one table to update, whereas this has: "
        + children.size)
    }
    children(0) match {
      case LogicalRelation(relation: HiveAcidRelation, _, _ , _) => {
        val setColumns = setExpressions.mapValues(expr => new Column(expr))
        val updateFilterColumn = condition.map(new Column(_))
        relation.update(updateFilterColumn, setColumns)
      }
      case LogicalRelation(_, _, Some(catalogTable), _) =>
        throw HiveAcidErrors.tableNotAcidException(catalogTable.qualifiedName)
      case _ => throw HiveAcidErrors.tableNotAcidException(table.toString())
    }
    Seq.empty[Row]
  }
} 
Example 5
Source File: DeleteCommand.scala    From spark-acid   with Apache License 2.0
package com.qubole.spark.datasources.hiveacid.sql.catalyst.plans.command

import com.qubole.spark.hiveacid.HiveAcidErrors
import com.qubole.spark.hiveacid.datasource.HiveAcidRelation
import org.apache.spark.sql.{Column, Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.execution.datasources.LogicalRelation

case class DeleteCommand(
    table: LogicalPlan,
    condition: Expression)
  extends RunnableCommand {

  // We don't want `table` in children as sometimes we don't want to transform it.
  override def children: Seq[LogicalPlan] = Seq(table)
  override def output: Seq[Attribute] = Seq.empty
  override lazy val resolved: Boolean = childrenResolved
  override def run(sparkSession: SparkSession): Seq[Row] = {
    if (children.size != 1) {
      throw new IllegalArgumentException("DELETE command should specify exactly one table, whereas this has: "
        + children.size)
    }
    children(0) match {
      case LogicalRelation(relation: HiveAcidRelation, _, _ , _) => {
        relation.delete(new Column(condition))
      }
      case _ => throw HiveAcidErrors.tableNotAcidException(table.toString())
    }
    Seq.empty[Row]
  }
} 
Example 6
Source File: SimilarityFunctions.scala    From spark-stringmetric   with MIT License
package com.github.mrpowers.spark.stringmetric

import com.github.mrpowers.spark.stringmetric.expressions.HammingDistance
import org.apache.spark.sql.Column
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.functions._

import java.util.Locale

import org.apache.commons.text.similarity.{
  CosineDistance,
  JaccardSimilarity,
  JaroWinklerDistance,
  FuzzyScore
}


object SimilarityFunctions {
  private def withExpr(expr: Expression): Column = new Column(expr)

  val cosine_distance = udf[Option[Double], String, String](cosineDistanceFun)

  def cosineDistanceFun(s1: String, s2: String): Option[Double] = {
    val str1 = Option(s1).getOrElse(return None)
    val str2 = Option(s2).getOrElse(return None)
    val cd = new CosineDistance()
    Some(cd(str1, str2))
  }

  val fuzzy_score = udf[Option[Integer], String, String](fuzzyScoreFun)

  def fuzzyScoreFun(s1: String, s2: String): Option[Integer] = {
    val str1 = Option(s1).getOrElse(return None)
    val str2 = Option(s2).getOrElse(return None)
    val f = new FuzzyScore(Locale.ENGLISH)
    Some(f.fuzzyScore(str1, str2))
  }

  def hamming(s1: Column, s2: Column): Column = withExpr {
    HammingDistance(s1.expr, s2.expr)
  }

  val jaccard_similarity = udf[Option[Double], String, String](jaccardSimilarityFun)

  def jaccardSimilarityFun(s1: String, s2: String): Option[Double] = {
    val str1 = Option(s1).getOrElse(return None)
    val str2 = Option(s2).getOrElse(return None)
    val j = new JaccardSimilarity()
    Some(j.apply(str1, str2))
  }

  val jaro_winkler = udf[Option[Double], String, String](jaroWinklerFun)

  def jaroWinklerFun(s1: String, s2: String): Option[Double] = {
    val str1 = Option(s1).getOrElse(return None)
    val str2 = Option(s2).getOrElse(return None)
    val j = new JaroWinklerDistance()
    Some(j.apply(str1, str2))
  }

} 
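A hedged usage sketch for the object above (assumes spark-stringmetric and commons-text are on the classpath; the data is made up):

import com.github.mrpowers.spark.stringmetric.SimilarityFunctions
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col

object SimilarityExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("similarity").getOrCreate()
    import spark.implicits._

    val df = Seq(("night", "nacht"), ("context", "contact")).toDF("word1", "word2")

    // The UDFs and the hamming expression all return Column values, so they drop straight into select
    df.select(
        col("word1"),
        col("word2"),
        SimilarityFunctions.jaccard_similarity(col("word1"), col("word2")).as("jaccard"),
        SimilarityFunctions.hamming(col("word1"), col("word2")).as("hamming")
      )
      .show()

    spark.stop()
  }
}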
Example 7
Source File: PostgresIntegrationSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.sql.jdbc

import java.sql.Connection
import java.util.Properties

import org.apache.spark.sql.Column
import org.apache.spark.sql.catalyst.expressions.{Literal, If}
import org.apache.spark.tags.DockerTest

@DockerTest
class PostgresIntegrationSuite extends DockerJDBCIntegrationSuite {
  override val db = new DatabaseOnDocker {
    override val imageName = "postgres:9.4.5"
    override val env = Map(
      "POSTGRES_PASSWORD" -> "rootpass"
    )
    override val jdbcPort = 5432
    override def getJdbcUrl(ip: String, port: Int): String =
      s"jdbc:postgresql://$ip:$port/postgres?user=postgres&password=rootpass"
  }

  override def dataPreparation(conn: Connection): Unit = {
    conn.prepareStatement("CREATE DATABASE foo").executeUpdate()
    conn.setCatalog("foo")
    conn.prepareStatement("CREATE TABLE bar (c0 text, c1 integer, c2 double precision, c3 bigint, "
      + "c4 bit(1), c5 bit(10), c6 bytea, c7 boolean, c8 inet, c9 cidr, "
      + "c10 integer[], c11 text[], c12 real[])").executeUpdate()
    conn.prepareStatement("INSERT INTO bar VALUES ('hello', 42, 1.25, 123456789012345, B'0', "
      + "B'1000100101', E'\\\\xDEADBEEF', true, '172.16.0.42', '192.168.0.0/16', "
      + """'{1, 2}', '{"a", null, "b"}', '{0.11, 0.22}')""").executeUpdate()
  }

  test("Type mapping for various types") {
    val df = sqlContext.read.jdbc(jdbcUrl, "bar", new Properties)
    val rows = df.collect()
    assert(rows.length == 1)
    val types = rows(0).toSeq.map(x => x.getClass)
    assert(types.length == 13)
    assert(classOf[String].isAssignableFrom(types(0)))
    assert(classOf[java.lang.Integer].isAssignableFrom(types(1)))
    assert(classOf[java.lang.Double].isAssignableFrom(types(2)))
    assert(classOf[java.lang.Long].isAssignableFrom(types(3)))
    assert(classOf[java.lang.Boolean].isAssignableFrom(types(4)))
    assert(classOf[Array[Byte]].isAssignableFrom(types(5)))
    assert(classOf[Array[Byte]].isAssignableFrom(types(6)))
    assert(classOf[java.lang.Boolean].isAssignableFrom(types(7)))
    assert(classOf[String].isAssignableFrom(types(8)))
    assert(classOf[String].isAssignableFrom(types(9)))
    assert(classOf[Seq[Int]].isAssignableFrom(types(10)))
    assert(classOf[Seq[String]].isAssignableFrom(types(11)))
    assert(classOf[Seq[Double]].isAssignableFrom(types(12)))
    assert(rows(0).getString(0).equals("hello"))
    assert(rows(0).getInt(1) == 42)
    assert(rows(0).getDouble(2) == 1.25)
    assert(rows(0).getLong(3) == 123456789012345L)
    assert(rows(0).getBoolean(4) == false)
    // BIT(10)'s come back as ASCII strings of ten ASCII 0's and 1's...
    assert(java.util.Arrays.equals(rows(0).getAs[Array[Byte]](5),
      Array[Byte](49, 48, 48, 48, 49, 48, 48, 49, 48, 49)))
    assert(java.util.Arrays.equals(rows(0).getAs[Array[Byte]](6),
      Array[Byte](0xDE.toByte, 0xAD.toByte, 0xBE.toByte, 0xEF.toByte)))
    assert(rows(0).getBoolean(7) == true)
    assert(rows(0).getString(8) == "172.16.0.42")
    assert(rows(0).getString(9) == "192.168.0.0/16")
    assert(rows(0).getSeq(10) == Seq(1, 2))
    assert(rows(0).getSeq(11) == Seq("a", null, "b"))
    assert(rows(0).getSeq(12).toSeq == Seq(0.11f, 0.22f))
  }

  test("Basic write test") {
    val df = sqlContext.read.jdbc(jdbcUrl, "bar", new Properties)
    // Test only that it doesn't crash.
    df.write.jdbc(jdbcUrl, "public.barcopy", new Properties)
    // Test write null values.
    df.select(df.queryExecution.analyzed.output.map { a =>
      Column(Literal.create(null, a.dataType)).as(a.name)
    }: _*).write.jdbc(jdbcUrl, "public.barcopy2", new Properties)
  }
} 
Example 8
Source File: FrequentItems.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.sql.execution.stat

import scala.collection.mutable.{Map => MutableMap}

import org.apache.spark.Logging
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Row, Column, DataFrame}

private[sql] object FrequentItems extends Logging {

  
  private[sql] def singlePassFreqItems(
      df: DataFrame,
      cols: Seq[String],
      support: Double): DataFrame = {
    require(support >= 1e-4, s"support ($support) must be greater than 1e-4.")
    val numCols = cols.length
    // number of max items to keep counts for
    val sizeOfMap = (1 / support).toInt
    val countMaps = Seq.tabulate(numCols)(i => new FreqItemCounter(sizeOfMap))
    val originalSchema = df.schema
    val colInfo: Array[(String, DataType)] = cols.map { name =>
      val index = originalSchema.fieldIndex(name)
      (name, originalSchema.fields(index).dataType)
    }.toArray

    val freqItems = df.select(cols.map(Column(_)) : _*).rdd.aggregate(countMaps)(
      seqOp = (counts, row) => {
        var i = 0
        while (i < numCols) {
          val thisMap = counts(i)
          val key = row.get(i)
          thisMap.add(key, 1L)
          i += 1
        }
        counts
      },
      combOp = (baseCounts, counts) => {
        var i = 0
        while (i < numCols) {
          baseCounts(i).merge(counts(i))
          i += 1
        }
        baseCounts
      }
    )
    val justItems = freqItems.map(m => m.baseMap.keys.toArray)
    val resultRow = Row(justItems : _*)
    // append frequent Items to the column name for easy debugging
    val outputCols = colInfo.map { v =>
      StructField(v._1 + "_freqItems", ArrayType(v._2, false))
    }
    val schema = StructType(outputCols).toAttributes
    new DataFrame(df.sqlContext, LocalRelation.fromExternalRows(schema, Seq(resultRow)))
  }
} 
Example 9
Source File: FrequentItems.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.execution.stat

import scala.collection.mutable.{Map => MutableMap}

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{Column, DataFrame, Dataset, Row}
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
import org.apache.spark.sql.types._

object FrequentItems extends Logging {

  
  def singlePassFreqItems(
      df: DataFrame,
      cols: Seq[String],
      support: Double): DataFrame = {
    require(support >= 1e-4 && support <= 1.0, s"Support must be in [1e-4, 1], but got $support.")
    val numCols = cols.length
    // number of max items to keep counts for
    val sizeOfMap = (1 / support).toInt
    val countMaps = Seq.tabulate(numCols)(i => new FreqItemCounter(sizeOfMap))
    val originalSchema = df.schema
    val colInfo: Array[(String, DataType)] = cols.map { name =>
      val index = originalSchema.fieldIndex(name)
      (name, originalSchema.fields(index).dataType)
    }.toArray

    val freqItems = df.select(cols.map(Column(_)) : _*).rdd.treeAggregate(countMaps)(
      seqOp = (counts, row) => {
        var i = 0
        while (i < numCols) {
          val thisMap = counts(i)
          val key = row.get(i)
          thisMap.add(key, 1L)
          i += 1
        }
        counts
      },
      combOp = (baseCounts, counts) => {
        var i = 0
        while (i < numCols) {
          baseCounts(i).merge(counts(i))
          i += 1
        }
        baseCounts
      }
    )
    val justItems = freqItems.map(m => m.baseMap.keys.toArray)
    val resultRow = Row(justItems : _*)
    // append frequent Items to the column name for easy debugging
    val outputCols = colInfo.map { v =>
      StructField(v._1 + "_freqItems", ArrayType(v._2, false))
    }
    val schema = StructType(outputCols).toAttributes
    Dataset.ofRows(df.sparkSession, LocalRelation.fromExternalRows(schema, Seq(resultRow)))
  }
} 
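This internal helper is what backs the public df.stat.freqItems API, which is the usual way to reach it. A minimal sketch (the data and the 0.4 support threshold are made up):

import org.apache.spark.sql.SparkSession

object FreqItemsExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("freq-items").getOrCreate()
    import spark.implicits._

    val df = Seq(1, 1, 1, 2, 2, 3, 4, 5).toDF("n")

    // Approximate set of values that appear in at least 40% of the rows of column "n"
    df.stat.freqItems(Seq("n"), 0.4).show(truncate = false)

    spark.stop()
  }
}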
Example 10
Source File: UserDefinedFunction.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.expressions

import org.apache.spark.annotation.InterfaceStability
import org.apache.spark.sql.Column
import org.apache.spark.sql.catalyst.expressions.ScalaUDF
import org.apache.spark.sql.types.DataType
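// NOTE: this listing is a partial excerpt of Spark 2.3.1's UserDefinedFunction case class;
// the class declaration, the private `_deterministic` flag, and the `copyAll()` helper used
// below are defined in the omitted portion of the original file.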


  def asNondeterministic(): UserDefinedFunction = {
    if (!_deterministic) {
      this
    } else {
      val udf = copyAll()
      udf._deterministic = false
      udf
    }
  }
} 
Example 11
Source File: Minimum.scala    From deequ   with Apache License 2.0
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric}
import org.apache.spark.sql.{Column, Row}
import org.apache.spark.sql.functions.min
import org.apache.spark.sql.types.{DoubleType, StructType}
import Analyzers._

case class MinState(minValue: Double) extends DoubleValuedState[MinState] {

  override def sum(other: MinState): MinState = {
    MinState(math.min(minValue, other.minValue))
  }

  override def metricValue(): Double = {
    minValue
  }
}

case class Minimum(column: String, where: Option[String] = None)
  extends StandardScanShareableAnalyzer[MinState]("Minimum", column)
  with FilterableAnalyzer {

  override def aggregationFunctions(): Seq[Column] = {
    min(conditionalSelection(column, where)).cast(DoubleType) :: Nil
  }

  override def fromAggregationResult(result: Row, offset: Int): Option[MinState] = {

    ifNoNullsIn(result, offset) { _ =>
      MinState(result.getDouble(offset))
    }
  }

  override protected def additionalPreconditions(): Seq[StructType => Unit] = {
    hasColumn(column) :: isNumeric(column) :: Nil
  }

  override def filterCondition: Option[String] = where
} 
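For context, a rough plain-Spark sketch of the aggregation this analyzer builds (the behavior attributed to conditionalSelection is an assumption based on the code above; the DataFrame, column, and predicate are hypothetical):

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, expr, min, when}
import org.apache.spark.sql.types.DoubleType

// Minimum of `column`, optionally restricted to rows matching a SQL predicate string.
def minimumWithFilter(df: DataFrame, column: String, where: Option[String]): Option[Double] = {
  // Rows that do not satisfy the predicate become null and are ignored by min()
  val selection = where
    .map(predicate => when(expr(predicate), col(column)))
    .getOrElse(col(column))

  val row = df.agg(min(selection).cast(DoubleType)).head()
  if (row.isNullAt(0)) None else Some(row.getDouble(0))
}

Usage might look like minimumWithFilter(df, "price", Some("category = 'books'")).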
Example 12
Source File: CountDistinct.scala    From deequ   with Apache License 2.0
package com.amazon.deequ.analyzers

import com.amazon.deequ.metrics.DoubleMetric
import org.apache.spark.sql.{Column, Row}
import org.apache.spark.sql.functions.count
import Analyzers._

case class CountDistinct(columns: Seq[String])
  extends ScanShareableFrequencyBasedAnalyzer("CountDistinct", columns) {

  override def aggregationFunctions(numRows: Long): Seq[Column] = {
    count("*") :: Nil
  }

  override def fromAggregationResult(result: Row, offset: Int): DoubleMetric = {
    toSuccessMetric(result.getLong(offset).toDouble)
  }
}

object CountDistinct {
  def apply(column: String): CountDistinct = {
    new CountDistinct(column :: Nil)
  }
} 
Example 13
Source File: Distinctness.scala    From deequ   with Apache License 2.0
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Analyzers.COUNT_COL
import org.apache.spark.sql.functions.{col, sum}
import org.apache.spark.sql.types.DoubleType
import org.apache.spark.sql.Column


case class Distinctness(columns: Seq[String], where: Option[String] = None)
  extends ScanShareableFrequencyBasedAnalyzer("Distinctness", columns)
  with FilterableAnalyzer {

  override def aggregationFunctions(numRows: Long): Seq[Column] = {
    (sum(col(COUNT_COL).geq(1).cast(DoubleType)) / numRows) :: Nil
  }

  override def filterCondition: Option[String] = where
}

object Distinctness {
  def apply(column: String): Distinctness = {
    new Distinctness(column :: Nil)
  }
} 
Example 14
Source File: Size.scala    From deequ   with Apache License 2.0
package com.amazon.deequ.analyzers

import com.amazon.deequ.metrics.Entity
import org.apache.spark.sql.{Column, Row}
import Analyzers._

case class NumMatches(numMatches: Long) extends DoubleValuedState[NumMatches] {

  override def sum(other: NumMatches): NumMatches = {
    NumMatches(numMatches + other.numMatches)
  }

  override def metricValue(): Double = {
    numMatches.toDouble
  }

}


case class Size(where: Option[String] = None)
  extends StandardScanShareableAnalyzer[NumMatches]("Size", "*", Entity.Dataset)
  with FilterableAnalyzer {

  override def aggregationFunctions(): Seq[Column] = {
    conditionalCount(where) :: Nil
  }

  override def fromAggregationResult(result: Row, offset: Int): Option[NumMatches] = {
    ifNoNullsIn(result, offset) { _ =>
      NumMatches(result.getLong(offset))
    }
  }

  override def filterCondition: Option[String] = where
} 
Example 15
Source File: MinLength.scala    From deequ   with Apache License 2.0
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Analyzers._
import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isString}
import org.apache.spark.sql.functions.{length, min}
import org.apache.spark.sql.types.{DoubleType, StructType}
import org.apache.spark.sql.{Column, Row}

case class MinLength(column: String, where: Option[String] = None)
  extends StandardScanShareableAnalyzer[MinState]("MinLength", column)
  with FilterableAnalyzer {

  override def aggregationFunctions(): Seq[Column] = {
    min(length(conditionalSelection(column, where))).cast(DoubleType) :: Nil
  }

  override def fromAggregationResult(result: Row, offset: Int): Option[MinState] = {
    ifNoNullsIn(result, offset) { _ =>
      MinState(result.getDouble(offset))
    }
  }

  override protected def additionalPreconditions(): Seq[StructType => Unit] = {
    hasColumn(column) :: isString(column) :: Nil
  }

  override def filterCondition: Option[String] = where
} 
Example 16
Source File: Uniqueness.scala    From deequ   with Apache License 2.0
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Analyzers.COUNT_COL
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions.{col, lit, sum}
import org.apache.spark.sql.types.DoubleType


case class Uniqueness(columns: Seq[String], where: Option[String] = None)
  extends ScanShareableFrequencyBasedAnalyzer("Uniqueness", columns)
  with FilterableAnalyzer {

  override def aggregationFunctions(numRows: Long): Seq[Column] = {
    (sum(col(COUNT_COL).equalTo(lit(1)).cast(DoubleType)) / numRows) :: Nil
  }

  override def filterCondition: Option[String] = where
}

object Uniqueness {
  def apply(column: String): Uniqueness = {
    new Uniqueness(column :: Nil)
  }

  def apply(column: String, where: Option[String]): Uniqueness = {
    new Uniqueness(column :: Nil, where)
  }
} 
Example 17
Source File: Sum.scala    From deequ   with Apache License 2.0
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric}
import org.apache.spark.sql.functions.sum
import org.apache.spark.sql.types.{DoubleType, StructType}
import org.apache.spark.sql.{Column, Row}
import Analyzers._

case class SumState(sum: Double) extends DoubleValuedState[SumState] {

  override def sum(other: SumState): SumState = {
    SumState(sum + other.sum)
  }

  override def metricValue(): Double = {
    sum
  }
}

case class Sum(column: String, where: Option[String] = None)
  extends StandardScanShareableAnalyzer[SumState]("Sum", column)
  with FilterableAnalyzer {

  override def aggregationFunctions(): Seq[Column] = {
    sum(conditionalSelection(column, where)).cast(DoubleType) :: Nil
  }

  override def fromAggregationResult(result: Row, offset: Int): Option[SumState] = {
    ifNoNullsIn(result, offset) { _ =>
      SumState(result.getDouble(offset))
    }
  }

  override protected def additionalPreconditions(): Seq[StructType => Unit] = {
    hasColumn(column) :: isNumeric(column) :: Nil
  }

  override def filterCondition: Option[String] = where
} 
Example 18
Source File: MaxLength.scala    From deequ   with Apache License 2.0
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Analyzers._
import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isString}
import org.apache.spark.sql.functions.{length, max}
import org.apache.spark.sql.types.{DoubleType, StructType}
import org.apache.spark.sql.{Column, Row}

case class MaxLength(column: String, where: Option[String] = None)
  extends StandardScanShareableAnalyzer[MaxState]("MaxLength", column)
  with FilterableAnalyzer {

  override def aggregationFunctions(): Seq[Column] = {
    max(length(conditionalSelection(column, where))).cast(DoubleType) :: Nil
  }

  override def fromAggregationResult(result: Row, offset: Int): Option[MaxState] = {
    ifNoNullsIn(result, offset) { _ =>
      MaxState(result.getDouble(offset))
    }
  }

  override protected def additionalPreconditions(): Seq[StructType => Unit] = {
    hasColumn(column):: isString(column) :: Nil
  }

  override def filterCondition: Option[String] = where
} 
Example 19
Source File: Correlation.scala    From deequ   with Apache License 2.0
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric}
import com.amazon.deequ.metrics.Entity
import org.apache.spark.sql.DeequFunctions.stateful_corr
import org.apache.spark.sql.{Column, Row}
import org.apache.spark.sql.types.StructType
import Analyzers._

case class CorrelationState(
    n: Double,
    xAvg: Double,
    yAvg: Double,
    ck: Double,
    xMk: Double,
    yMk: Double)
  extends DoubleValuedState[CorrelationState] {

  require(n > 0.0, "Correlation undefined for n = 0.")

  override def sum(other: CorrelationState): CorrelationState = {
    val n1 = n
    val n2 = other.n
    val newN = n1 + n2
    val dx = other.xAvg - xAvg
    val dxN = if (newN == 0.0) 0.0 else dx / newN
    val dy = other.yAvg - yAvg
    val dyN = if (newN == 0.0) 0.0 else dy / newN
    val newXAvg = xAvg + dxN * n2
    val newYAvg = yAvg + dyN * n2
    val newCk = ck + other.ck + dx * dyN * n1 * n2
    val newXMk = xMk + other.xMk + dx * dxN * n1 * n2
    val newYMk = yMk + other.yMk + dy * dyN * n1 * n2

    CorrelationState(newN, newXAvg, newYAvg, newCk, newXMk, newYMk)
  }

  override def metricValue(): Double = {
    ck / math.sqrt(xMk * yMk)
  }
}


case class Correlation(
    firstColumn: String,
    secondColumn: String,
    where: Option[String] = None)
  extends StandardScanShareableAnalyzer[CorrelationState]("Correlation",
    s"$firstColumn,$secondColumn", Entity.Mutlicolumn)
  with FilterableAnalyzer {

  override def aggregationFunctions(): Seq[Column] = {

    val firstSelection = conditionalSelection(firstColumn, where)
    val secondSelection = conditionalSelection(secondColumn, where)

    stateful_corr(firstSelection, secondSelection) :: Nil
  }

  override def fromAggregationResult(result: Row, offset: Int): Option[CorrelationState] = {

    if (result.isNullAt(offset)) {
      None
    } else {
      val row = result.getAs[Row](offset)
      val n = row.getDouble(0)
      if (n > 0.0) {
        Some(CorrelationState(
          n,
          row.getDouble(1),
          row.getDouble(2),
          row.getDouble(3),
          row.getDouble(4),
          row.getDouble(5)))
      } else {
        None
      }
    }
  }

  override protected def additionalPreconditions(): Seq[StructType => Unit] = {
    hasColumn(firstColumn) :: isNumeric(firstColumn) :: hasColumn(secondColumn) ::
      isNumeric(secondColumn) :: Nil
  }

  override def filterCondition: Option[String] = where
} 
Example 20
Source File: Entropy.scala    From deequ   with Apache License 2.0
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Analyzers.COUNT_COL
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions.{col, sum, udf}


case class Entropy(column: String, where: Option[String] = None)
  extends ScanShareableFrequencyBasedAnalyzer("Entropy", column :: Nil)
  with FilterableAnalyzer {

  override def aggregationFunctions(numRows: Long): Seq[Column] = {
    val summands = udf { (count: Double) =>
      if (count == 0.0) {
        0.0
      } else {
        -(count / numRows) * math.log(count / numRows)
      }
    }

    sum(summands(col(COUNT_COL))) :: Nil
  }

  override def filterCondition: Option[String] = where
} 
Example 21
Source File: StandardDeviation.scala    From deequ   with Apache License 2.0
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric}
import org.apache.spark.sql.DeequFunctions.stateful_stddev_pop
import org.apache.spark.sql.{Column, Row}
import org.apache.spark.sql.types.StructType
import Analyzers._

case class StandardDeviationState(
    n: Double,
    avg: Double,
    m2: Double)
  extends DoubleValuedState[StandardDeviationState] {

  require(n > 0.0, "Standard deviation is undefined for n = 0.")

  override def metricValue(): Double = {
    math.sqrt(m2 / n)
  }

  override def sum(other: StandardDeviationState): StandardDeviationState = {
    val newN = n + other.n
    val delta = other.avg - avg
    val deltaN = if (newN == 0.0) 0.0 else delta / newN

    StandardDeviationState(newN, avg + deltaN * other.n,
      m2 + other.m2 + delta * deltaN * n * other.n)
  }
}

case class StandardDeviation(column: String, where: Option[String] = None)
  extends StandardScanShareableAnalyzer[StandardDeviationState]("StandardDeviation", column)
  with FilterableAnalyzer {

  override def aggregationFunctions(): Seq[Column] = {
    stateful_stddev_pop(conditionalSelection(column, where)) :: Nil
  }

  override def fromAggregationResult(result: Row, offset: Int): Option[StandardDeviationState] = {

    if (result.isNullAt(offset)) {
      None
    } else {
      val row = result.getAs[Row](offset)
      val n = row.getDouble(0)

      if (n == 0.0) {
        None
      } else {
        Some(StandardDeviationState(n, row.getDouble(1), row.getDouble(2)))
      }
    }
  }

  override protected def additionalPreconditions(): Seq[StructType => Unit] = {
    hasColumn(column) :: isNumeric(column) :: Nil
  }

  override def filterCondition: Option[String] = where
} 
Example 22
Source File: ExactEqualityConstraint.scala    From drunken-data-quality   with Apache License 2.0
package de.frosner.ddq.constraints

import org.apache.spark.sql.{Column, DataFrame}

import scala.util.Try

case class ExactEqualityConstraint(other: DataFrame) extends Constraint {

  val fun = (df: DataFrame) => {
    val tryEquality = Try {
      if (df.schema != other.schema) {
        throw new IllegalArgumentException("Schemas do not match")
      }
      val dfGroupCount = df.groupBy(df.columns.map(new Column(_)):_*).count()
      val otherGroupCount = other.groupBy(df.columns.map(new Column(_)):_*).count()
      val diffCount1 = dfGroupCount.except(otherGroupCount).count()
      val diffCount2 = otherGroupCount.except(dfGroupCount).count()
      (diffCount1, diffCount2)
    }

    ExactEqualityConstraintResult(
      constraint = this,
      data = tryEquality.toOption.map {
        case (leftToRightCount, rightToLeftCount) => ExactEqualityConstraintData(leftToRightCount, rightToLeftCount)
      },
      status = ConstraintUtil.tryToStatus[(Long, Long)](tryEquality, {
        case (leftToRightCount, rightToLeftCount) => leftToRightCount + rightToLeftCount == 0
      })
    )
  }

}

case class ExactEqualityConstraintResult(constraint: ExactEqualityConstraint,
                                         data: Option[ExactEqualityConstraintData],
                                         status: ConstraintStatus) extends ConstraintResult[ExactEqualityConstraint] {
  val message: String = {
    val otherName = constraint.other.toString()
    val maybeNonMatchingRows = data.map(data => (data.numNonMatchingLeftToRight, data.numNonMatchingRightToLeft))
    val maybePluralS = maybeNonMatchingRows.map {
      case (leftToRightCount, rightToLeftCount) => (
        if (leftToRightCount == 1) "" else "s",
        if (rightToLeftCount == 1) "" else "s"
      )
    }
    val maybeVerb = maybeNonMatchingRows.map {
      case (leftToRightCount, rightToLeftCount) => (
        if (leftToRightCount == 1) "is" else "are",
        if (rightToLeftCount == 1) "is" else "are"
      )
    }
    (status, maybeNonMatchingRows, maybePluralS, maybeVerb) match {
      case (ConstraintSuccess, Some(_), Some(_), Some(_)) =>
        s"It is equal to $otherName."
      case (
        ConstraintFailure,
        Some((leftToRightRows, rightToLeftRows)),
        Some((leftToRightPluralS, rightToLeftPluralS)),
        Some((leftToRightVerb, rightToLeftVerb))
        ) =>
          s"It is not equal ($leftToRightRows distinct count row$leftToRightPluralS $leftToRightVerb " +
            s"present in the checked dataframe but not in the other " +
            s"and $rightToLeftRows distinct count row$rightToLeftPluralS $rightToLeftVerb " +
            s"present in the other dataframe but not in the checked one) to $otherName."
      case (ConstraintError(throwable), None, None, None) =>
        s"Checking equality with $otherName failed: $throwable"
      case default => throw IllegalConstraintResultException(this)
    }
  }
}

case class ExactEqualityConstraintData(numNonMatchingLeftToRight: Long, numNonMatchingRightToLeft: Long) 
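The comparison above doubles as a standalone recipe for checking that two DataFrames contain the same rows with the same multiplicities. A plain-Spark sketch of the same idea (names are hypothetical):

import org.apache.spark.sql.{Column, DataFrame}

// Two DataFrames are "exactly equal" when their per-row group counts match in both directions.
def exactlyEqual(left: DataFrame, right: DataFrame): Boolean = {
  require(left.schema == right.schema, "Schemas do not match")
  val groupCols = left.columns.map(new Column(_))
  val leftCounts = left.groupBy(groupCols: _*).count()
  val rightCounts = right.groupBy(groupCols: _*).count()
  leftCounts.except(rightCounts).count() == 0 && rightCounts.except(leftCounts).count() == 0
}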
Example 23
Source File: DataFrameFunctions.scala    From spark-flow   with Apache License 2.0
package com.bloomberg.sparkflow.dc

import org.apache.spark.sql.{Column, Dataset, Row}


class DataFrameFunctions(self: DC[Row]) {

    def join(right: DC[Row]): DC[Row] = {
      val f = (left: Dataset[_], right: Dataset[_]) => {
        left.join(right)
      }
      val hashTarget = Seq("join")
      new MultiDatasetTransformDC(self, right, f, hashTarget)
    }

    def join(right: DC[Row], usingColumn: String): DC[Row] = {
      val f = (left: Dataset[_], right: Dataset[_]) => {
        left.join(right, usingColumn)
      }
      val hashTarget = Seq("join", usingColumn)
      new MultiDatasetTransformDC(self, right, f, hashTarget)
    }

    def join(right: DC[Row], joinExprs: Column): DC[Row] = join(right, joinExprs, "inner")

    def join(right: DC[Row], joinExprs: Column, joinType: String): DC[Row] = {
      val f = (left: Dataset[_], right: Dataset[_]) => {
        left.join(right, joinExprs)
      }
      val hashTarget = Seq("join", joinType, joinExprs.toString())
      new MultiDatasetTransformDC(self, right, f, hashTarget)
    }


} 
Example 24
Source File: StructuredRepartition.scala    From Swallow   with Apache License 2.0
package com.intel.hibench.sparkbench.structuredstreaming.application

import com.intel.hibench.common.streaming.metrics.KafkaReporter
import com.intel.hibench.sparkbench.structuredstreaming.util.SparkBenchConfig
import org.apache.spark.sql.Column
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.ForeachWriter
import org.apache.spark.sql.Row
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

class StructuredRepartition() extends StructuredBenchBase {

  override def process(ds: DataFrame, config: SparkBenchConfig) = {

    // Get the singleton instance of SparkSession
    val spark = SparkSession.builder.appName("structured " + config.benchName).getOrCreate()
    import spark.implicits._

    val results = ds.repartition(config.coreNumber)
    
    val query = results.writeStream
      .foreach(new ForeachWriter[Row] {
        var reporter: KafkaReporter = _

        def open(partitionId: Long, version: Long): Boolean = {
          val reportTopic = config.reporterTopic
          val brokerList = config.brokerList
          reporter = new KafkaReporter(reportTopic, brokerList)
          true
        }

        def close(errorOrNull: Throwable): Unit = {}

        def process(record: Row): Unit = {
          val inTime = record(0).asInstanceOf[String].toLong
          val outTime = System.currentTimeMillis()
          reporter.report(inTime, outTime)
        }
      })
      .start()

    query.awaitTermination()
  }
} 
Example 25
Source File: StructuredIdentity.scala    From Swallow   with Apache License 2.0
package com.intel.hibench.sparkbench.structuredstreaming.application

import com.intel.hibench.common.streaming.metrics.KafkaReporter
import com.intel.hibench.sparkbench.structuredstreaming.util.SparkBenchConfig
import org.apache.spark.sql.Column
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.ForeachWriter
import org.apache.spark.sql.Row
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

class StructuredIdentity() extends StructuredBenchBase {

  override def process(ds: DataFrame, config: SparkBenchConfig) = {

    // Get the singleton instance of SparkSession
    val spark = SparkSession.builder.appName("structured " + config.benchName).getOrCreate()
    import spark.implicits._

    val query = ds.writeStream
      .foreach(new ForeachWriter[Row] {
        var reporter: KafkaReporter = _

        def open(partitionId: Long, version: Long): Boolean = {
          val reportTopic = config.reporterTopic
          val brokerList = config.brokerList
          reporter = new KafkaReporter(reportTopic, brokerList)
          true
        }

        def close(errorOrNull: Throwable): Unit = {}

        def process(record: Row): Unit = {
          val inTime = record(0).asInstanceOf[String].toLong
          val outTime = System.currentTimeMillis()
          reporter.report(inTime, outTime)
        }
      })
      .start()

    query.awaitTermination()
  }
} 
Example 26
Source File: QueryPeopleTest.scala    From apache-spark-test   with Apache License 2.0
package com.github.dnvriend.spark.dataset

import com.github.dnvriend.TestSpec
import org.apache.spark.sql.{ Column, DataFrame }

class QueryPeopleTest extends TestSpec {

  it should "query using DSL" in withSparkSession { spark =>
    import spark.implicits._
    import org.apache.spark.sql.functions._

    val people: DataFrame =
      spark.read.parquet(TestSpec.PeopleParquet).cache() // name, age

    people.select('name).limit(1).as[String].head() shouldBe "foo"
    people.select($"name").limit(1).as[String].head() shouldBe "foo"
    people.select("name").limit(1).as[String].head() shouldBe "foo"

    people.select('age).limit(1).as[Int].head() shouldBe 30
    people.select($"age").limit(1).as[Int].head() shouldBe 30
    people.select("age").limit(1).as[Int].head() shouldBe 30

    // select a column from the Dataset
    val col1: Column = people("name")
    val col2: Column = people.col("name")

    val departments: DataFrame =
      Seq((1, "sales"), (2, "administration"), (3, "human resources"))
        .toDF("department_id", "department_name").cache()

    people
      .withColumn("department_id", lit(1))
      .withColumn("age_plus_ten", people("age") + 10)
      .as[(String, Int, Int, Int)].limit(1).head() shouldBe ("foo", 30, 1, 40)

    people
      .withColumn("department_id", lit(1))
      .withColumn("age_plus_ten", people("age") + 10)
      .as('people_dep_age)
      .join(departments, col("people_dep_age.department_id").equalTo(departments.col("department_id")))
      .select($"people_dep_age.name", col("people_dep_age.age"), departments.col("department_name"))
      .as[(String, Int, String)].limit(1).head() shouldBe ("foo", 30, "sales")

    val peopleDepAge: DataFrame =
      people
        .withColumn("department_id", lit(1))
        .withColumn("age_plus_ten", people("age") + 10)

    peopleDepAge
      .join(departments, peopleDepAge("department_id") === departments("department_id"))
      .select(peopleDepAge("name"), peopleDepAge("age"), departments("department_name"))
      .as[(String, Int, String)].limit(1).head() shouldBe ("foo", 30, "sales")

    peopleDepAge.filter($"age" > 30)
      .join(departments, peopleDepAge("department_id") === departments("department_id"))
      .agg(avg($"age"), max($"age")).limit(1)
      .as[(Double, Int)].head() shouldBe (45.0, 50)
  }
} 
Example 27
Source File: DatasetUtil.scala    From sona   with Apache License 2.0
package org.apache.spark.util

import org.apache.spark.linalg.{VectorUDT, Vectors}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, DoubleType, FloatType, Metadata}
import org.apache.spark.sql.{Column, DataFrame, Dataset}


object DatasetUtil {
  def withColumns[T](ds: Dataset[T],
                     colNames: Seq[String],
                     cols: Seq[Column],
                     metadata: Seq[Metadata]): DataFrame = {
    require(colNames.size == cols.size,
      s"The size of column names: ${colNames.size} isn't equal to " +
        s"the size of columns: ${cols.size}")
    require(colNames.size == metadata.size,
      s"The size of column names: ${colNames.size} isn't equal to " +
        s"the size of metadata elements: ${metadata.size}")

    val sparkSession = ds.sparkSession
    val queryExecution = ds.queryExecution
    val resolver = sparkSession.sessionState.analyzer.resolver
    val output = queryExecution.analyzed.output

    checkColumnNameDuplication(colNames,
      "in given column names",
      sparkSession.sessionState.conf.caseSensitiveAnalysis)

    val columnMap = colNames.zip(cols).zip(metadata).map { case ((colName: String, col: Column), metadata: Metadata) =>
      colName -> col.as(colName, metadata)
    }.toMap

    val replacedAndExistingColumns = output.map { field =>
      columnMap.find { case (colName, _) =>
        resolver(field.name, colName)
      } match {
        case Some((colName: String, col: Column)) => col.as(colName)
        case _ => new Column(field)
      }
    }

    val newColumns = columnMap.filter { case (colName, col) =>
      !output.exists(f => resolver(f.name, colName))
    }.map { case (colName, col) => col.as(colName) }

    ds.select(replacedAndExistingColumns ++ newColumns: _*)
  }

  def withColumn[T](ds: Dataset[T], colName: String, col: Column, metadata: Metadata): DataFrame = {
    withColumns(ds, Seq(colName), Seq(col), Seq(metadata))
  }

  private def checkColumnNameDuplication(columnNames: Seq[String], colType: String,
                                         caseSensitiveAnalysis: Boolean): Unit = {
    val names = if (caseSensitiveAnalysis) columnNames else columnNames.map(_.toLowerCase)
    if (names.distinct.length != names.length) {
      val duplicateColumns = names.groupBy(identity).collect {
        case (x, ys) if ys.length > 1 => s"`$x`"
      }
      throw new Exception(s"Found duplicate column(s) $colType: ${duplicateColumns.mkString(", ")}")
    }
  }

  /**
    * Cast a column in a Dataset to Vector type.
    *
    * The supported data types of the input column are
    * - Vector
    * - float/double type Array.
    *
    * Note: The returned column does not have Metadata.
    *
    * @param dataset input DataFrame
    * @param colName column name.
    * @return Vector column
    */
  def columnToVector(dataset: Dataset[_], colName: String): Column = {
    val columnDataType = dataset.schema(colName).dataType
    columnDataType match {
      case _: VectorUDT => col(colName)
      case fdt: ArrayType =>
        val transferUDF = fdt.elementType match {
          case _: FloatType => udf(f = (vector: Seq[Float]) => {
            val inputArray = Array.fill[Double](vector.size)(0.0)
            vector.indices.foreach(idx => inputArray(idx) = vector(idx).toDouble)
            Vectors.dense(inputArray)
          })
          case _: DoubleType => udf((vector: Seq[Double]) => {
            Vectors.dense(vector.toArray)
          })
          case other =>
            throw new IllegalArgumentException(s"Array[$other] column cannot be cast to Vector")
        }
        transferUDF(col(colName))
      case other =>
        throw new IllegalArgumentException(s"$other column cannot be cast to Vector")
    }
  }

} 
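A hedged usage sketch for columnToVector (assumes the sona project's classes are on the classpath; the data is made up):

import org.apache.spark.sql.SparkSession
import org.apache.spark.util.DatasetUtil

object DatasetUtilExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("dataset-util").getOrCreate()
    import spark.implicits._

    // A double-array column that ML code typically expects as a Vector
    val df = Seq((1L, Array(0.1, 0.2, 0.3)), (2L, Array(0.4, 0.5, 0.6))).toDF("id", "features")

    df.withColumn("features_vec", DatasetUtil.columnToVector(df, "features"))
      .show(truncate = false)

    spark.stop()
  }
}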
Example 28
Source File: NumberOfRowsConstraintTest.scala    From drunken-data-quality   with Apache License 2.0
package de.frosner.ddq.constraints

import de.frosner.ddq.core.Check
import de.frosner.ddq.testutils.{SparkContexts, TestData}
import org.apache.spark.sql.Column
import org.scalatest.{FlatSpec, Matchers}

class NumberOfRowsConstraintTest extends FlatSpec with Matchers with SparkContexts {

  "A NumberOfRowsConstraint" should "succeed if the actual number of rows is equal to the expected" in {
    val check = Check(TestData.makeIntegerDf(spark, List(1, 2, 3))).hasNumRows(_ === 3)
    val constraint = check.constraints.head
    val result = NumberOfRowsConstraintResult(
      constraint = NumberOfRowsConstraint(new Column(NumberOfRowsConstraint.countKey) === 3),
      actual = 3L,
      status = ConstraintSuccess
    )
    check.run().constraintResults shouldBe Map(constraint -> result)
  }

  it should "fail if the number of rows is not in the expected range" in {
    val check = Check(TestData.makeIntegerDf(spark, List(1, 2, 3))).hasNumRows(
      numRows => numRows < 3 || numRows > 3
    )
    val constraint = check.constraints.head
    val numRowsColumn = new Column(NumberOfRowsConstraint.countKey)
    val result = NumberOfRowsConstraintResult(
      constraint = NumberOfRowsConstraint(numRowsColumn < 3 || numRowsColumn > 3),
      actual = 3L,
      status = ConstraintFailure
    )
    check.run().constraintResults shouldBe Map(constraint -> result)
  }

  "A NumberOfRowsConstraintResult" should "have the correct success message" in {
    val constraint = NumberOfRowsConstraint(new Column("count") > 5L)
    val result = NumberOfRowsConstraintResult(
      constraint = constraint,
      actual = 5L,
      status = ConstraintSuccess
    )
    result.message shouldBe "The number of rows satisfies (count > 5)."
  }

  it should "have the correct failure message" in {
    val constraint = NumberOfRowsConstraint(new Column("count") === 5L)
    val result = NumberOfRowsConstraintResult(
      constraint = constraint,
      actual = 4L,
      status = ConstraintFailure
    )
    result.message shouldBe "The actual number of rows 4 does not satisfy (count = 5)."
  }

  it should "throw an exception if it is created with an illegal combination of fields" in {
    intercept[IllegalConstraintResultException] {
      NumberOfRowsConstraintResult(
        constraint = NumberOfRowsConstraint(new Column("count") === 5L),
        status = ConstraintError(new IllegalArgumentException("error")),
        actual = 4L
      )
    }
  }

  "NumberOfRowsConstraint.greaterThan" should "create a correct NumberOfRowsConstraint" in {
    val expected = 10
    val constraint = NumberOfRowsConstraint.greaterThan(expected)
    constraint shouldBe NumberOfRowsConstraint(new Column("count") > expected)
  }

  "NumberOfRowsConstraint.lessThan" should "create a correct NumberOfRowsConstraint" in {
    val expected = 10
    val constraint = NumberOfRowsConstraint.lessThan(expected)
    constraint shouldBe NumberOfRowsConstraint(new Column("count") < expected)
  }

  "NumberOfRowsConstraint.equalTo" should "create a correct NumberOfRowsConstraint" in {
    val expected = 10
    val constraint = NumberOfRowsConstraint.equalTo(expected)
    constraint shouldBe NumberOfRowsConstraint(new Column("count") === expected)
  }

} 
Example 29
Source File: RegexConstraint.scala    From drunken-data-quality   with Apache License 2.0
package de.frosner.ddq.constraints

import java.util.regex.Pattern

import org.apache.spark.sql.functions._
import org.apache.spark.sql.{Column, DataFrame}

import scala.util.Try

case class RegexConstraint(columnName: String, regex: String) extends Constraint {

  val fun = (df: DataFrame) => {
    val pattern = Pattern.compile(regex)
    val doesNotMatch = udf((column: String) => column != null && !pattern.matcher(column).find())
    val maybeDoesNotMatchCount = Try(df.filter(doesNotMatch(new Column(columnName))).count)
    RegexConstraintResult(
      constraint = this,
      data = maybeDoesNotMatchCount.toOption.map(RegexConstraintResultData),
      status = ConstraintUtil.tryToStatus[Long](maybeDoesNotMatchCount, _ == 0)
    )
  }

}

case class RegexConstraintResult(constraint: RegexConstraint,
                                 data: Option[RegexConstraintResultData],
                                 status: ConstraintStatus) extends ConstraintResult[RegexConstraint] {

  val message: String = {
    val columnName = constraint.columnName
    val regex = constraint.regex
    val maybeFailedRows = data.map(_.failedRows)
    val maybePluralSAndVerb = maybeFailedRows.map(failedRows => if (failedRows == 1) ("", "does") else ("s", "do"))
    (status, maybeFailedRows, maybePluralSAndVerb) match {
      case (ConstraintSuccess, Some(0), _) =>
        s"Column $columnName matches $regex"
      case (ConstraintFailure, Some(failedRows), Some((pluralS, verb))) =>
        s"Column $columnName contains $failedRows row$pluralS that $verb not match $regex"
      case (ConstraintError(throwable), None, None) =>
        s"Checking whether column $columnName matches $regex failed: $throwable"
      case default => throw IllegalConstraintResultException(this)
    }
  }

}

case class RegexConstraintResultData(failedRows: Long) 
Example 30
Source File: UniqueKeyConstraint.scala    From drunken-data-quality   with Apache License 2.0
package de.frosner.ddq.constraints

import org.apache.spark.sql.{Column, DataFrame}

import scala.util.Try

case class UniqueKeyConstraint(columnNames: Seq[String]) extends Constraint {

  require(columnNames.nonEmpty)

  val fun = (df: DataFrame) => {
    val columns = columnNames.map(name => new Column(name))
    val maybeNonUniqueRows = Try(df.groupBy(columns: _*).count.filter(new Column("count") > 1).count)
    UniqueKeyConstraintResult(
      constraint = this,
      data = maybeNonUniqueRows.toOption.map(UniqueKeyConstraintResultData),
      status = ConstraintUtil.tryToStatus[Long](maybeNonUniqueRows, _ == 0)
    )
  }

}

case class UniqueKeyConstraintResult(constraint: UniqueKeyConstraint,
                                     data: Option[UniqueKeyConstraintResultData],
                                     status: ConstraintStatus) extends ConstraintResult[UniqueKeyConstraint] {

  val message: String = {
    val columnNames = constraint.columnNames
    val columnsString = columnNames.mkString(", ")
    val isPlural = columnNames.length > 1
    val columnNoun = "Column" + (if (isPlural) "s" else "")
    val columnVerb = if (isPlural) "are" else "is"
    val maybeNumNonUniqueTuples = data.map(_.numNonUniqueTuples)
    val maybePluralS = maybeNumNonUniqueTuples.map(numNonUniqueTuples => if (numNonUniqueTuples != 1) "s" else "")
    (status, maybeNumNonUniqueTuples, maybePluralS) match {
      case (ConstraintSuccess, Some(0), _) =>
        s"$columnNoun $columnsString $columnVerb a key."
      case (ConstraintFailure, Some(numNonUniqueTuples), Some(pluralS)) =>
        s"$columnNoun $columnsString $columnVerb not a key ($numNonUniqueTuples non-unique tuple$pluralS)."
      case (ConstraintError(throwable), None, None) =>
        s"Checking whether ${columnNoun.toLowerCase()} $columnsString $columnVerb a key failed: $throwable"
      case default => throw IllegalConstraintResultException(this)
    }
  }

}

case class UniqueKeyConstraintResultData(numNonUniqueTuples: Long) 
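The duplicate detection above is just a groupBy over the candidate key followed by a count filter. The same check outside the constraint framework, as a sketch assuming a hypothetical DataFrame df with key columns id and name:

import org.apache.spark.sql.functions.col

// Key combinations that occur more than once; (id, name) is a key iff this count is 0.
val nonUniqueTuples = df
  .groupBy(col("id"), col("name"))
  .count()
  .filter(col("count") > 1)
  .count()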
Example 31
Source File: NeverNullConstraint.scala    From drunken-data-quality   with Apache License 2.0 5 votes vote down vote up
package de.frosner.ddq.constraints

import org.apache.spark.sql.{Column, DataFrame}

import scala.util.Try

case class NeverNullConstraint(columnName: String) extends Constraint {

  val fun = (df: DataFrame) => {
    val tryNullCount = Try(df.filter(new Column(columnName).isNull).count)
    NeverNullConstraintResult(
      constraint = this,
      data = tryNullCount.toOption.map(NeverNullConstraintResultData),
      status = ConstraintUtil.tryToStatus[Long](tryNullCount, _ == 0)
    )
  }

}

case class NeverNullConstraintResult(constraint: NeverNullConstraint,
                                     data: Option[NeverNullConstraintResultData],
                                     status: ConstraintStatus) extends ConstraintResult[NeverNullConstraint] {
  val message: String = {
    val columnName = constraint.columnName
    val maybeNullRows = data.map(_.nullRows)
    val maybePluralS = maybeNullRows.map(nullRows => if (nullRows == 1) "" else "s")
    val maybeVerb = maybeNullRows.map(nullRows => if (nullRows == 1) "is" else "are")
    (status, maybeNullRows, maybePluralS, maybeVerb) match {
      case (ConstraintSuccess, Some(0), Some(pluralS), Some(verb)) =>
        s"Column $columnName is never null."
      case (ConstraintFailure, Some(nullRows), Some(pluralS), Some(verb)) =>
        s"Column $columnName contains $nullRows row$pluralS that $verb null (should never be null)."
      case (ConstraintError(throwable), None, None, None) =>
        s"Checking column $columnName for being never null failed: $throwable"
      case default => throw IllegalConstraintResultException(this)
    }
  }
}

case class NeverNullConstraintResultData(nullRows: Long) 
Example 32
Source File: ApproxCountDistinct.scala    From deequ   with Apache License 2.0 5 votes vote down vote up
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Preconditions.hasColumn
import org.apache.spark.sql.DeequFunctions.stateful_approx_count_distinct
import org.apache.spark.sql.catalyst.expressions.aggregate.DeequHyperLogLogPlusPlusUtils
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{Column, Row}
import Analyzers._

case class ApproxCountDistinctState(words: Array[Long])
  extends DoubleValuedState[ApproxCountDistinctState] {

  override def sum(other: ApproxCountDistinctState): ApproxCountDistinctState = {
    ApproxCountDistinctState(DeequHyperLogLogPlusPlusUtils.merge(words, other.words))
  }

  override def metricValue(): Double = {
    DeequHyperLogLogPlusPlusUtils.count(words)
  }

  override def toString: String = {
    s"ApproxCountDistinctState(${words.mkString(",")})"
  }
}


case class ApproxCountDistinct(column: String, where: Option[String] = None)
  extends StandardScanShareableAnalyzer[ApproxCountDistinctState]("ApproxCountDistinct", column)
  with FilterableAnalyzer {

  override def aggregationFunctions(): Seq[Column] = {
    stateful_approx_count_distinct(conditionalSelection(column, where)) :: Nil
  }

  override def fromAggregationResult(result: Row, offset: Int): Option[ApproxCountDistinctState] = {

    ifNoNullsIn(result, offset) { _ =>
      DeequHyperLogLogPlusPlusUtils.wordsFromBytes(result.getAs[Array[Byte]](offset))
    }
  }

  override protected def additionalPreconditions(): Seq[StructType => Unit] = {
    hasColumn(column) :: Nil
  }

  override def filterCondition: Option[String] = where
} 
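Deequ plugs its own stateful HyperLogLog++ aggregate into the scan above so that intermediate states can be merged across partitions and runs. For a one-off query, Spark's built-in approximation gives a comparable number; a sketch assuming a hypothetical column user_id:

import org.apache.spark.sql.functions.approx_count_distinct

// Approximate distinct count via Spark's built-in HyperLogLog++ aggregate.
val approxDistinct = df.agg(approx_count_distinct("user_id")).head.getLong(0)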
Example 33
Source File: ForeignKeyConstraint.scala    From drunken-data-quality   with Apache License 2.0 5 votes vote down vote up
package de.frosner.ddq.constraints

import org.apache.spark.sql.{Column, DataFrame}

import scala.util.Try

case class ForeignKeyConstraint(columnNames: Seq[(String, String)], referenceTable: DataFrame) extends Constraint {

  val fun = (df: DataFrame) => {
    val renamedColumns = columnNames.map{ case (baseColumn, refColumn) => ("b_" + baseColumn, "r_" + refColumn)}
    val (baseColumns, refColumns) = columnNames.unzip
    val (renamedBaseColumns, renamedRefColumns) = renamedColumns.unzip

    // check if foreign key is a key in reference table
    val maybeNonUniqueRows = Try(
      referenceTable.groupBy(refColumns.map(new Column(_)):_*).count.filter(new Column("count") > 1).count
    )
    if (maybeNonUniqueRows.toOption.exists(_ > 0)) {
      ForeignKeyConstraintResult(
        constraint = this,
        data = Some(ForeignKeyConstraintResultData(numNonMatchingRefs = None)),
        status = ConstraintFailure
      )
    } else {
      // rename all columns to avoid ambiguous column references
      val maybeRenamedDfAndRef = maybeNonUniqueRows.map(_ => {
        val renamedDf = df.select(baseColumns.zip(renamedBaseColumns).map {
          case (original, renamed) => new Column(original).as(renamed)
        }: _*)
        val renamedRef = referenceTable.select(refColumns.zip(renamedRefColumns).map {
          case (original, renamed) => new Column(original).as(renamed)
        }: _*)
        (renamedDf, renamedRef)
      })

      // check whether the outer join leaves null values on the reference side
      val maybeLeftOuterJoin = maybeRenamedDfAndRef.map { case (renamedDf, renamedRef) =>
        val joinCondition = renamedColumns.map {
          case (baseColumn, refColumn) => new Column(baseColumn) === new Column(refColumn)
        }.reduce(_ && _)
        renamedDf.distinct.join(renamedRef, joinCondition, "outer")
      }

      val maybeNotMatchingRefs = maybeLeftOuterJoin.map(_.filter(renamedRefColumns.map(new Column(_).isNull).reduce(_ && _)).count)

      ForeignKeyConstraintResult(
        constraint = this,
        data = maybeNotMatchingRefs.toOption.map(Some(_)).map(ForeignKeyConstraintResultData),
        status = ConstraintUtil.tryToStatus[Long](maybeNotMatchingRefs, _ == 0)
      )
    }
  }

}

case class ForeignKeyConstraintResult(constraint: ForeignKeyConstraint,
                                      data: Option[ForeignKeyConstraintResultData],
                                      status: ConstraintStatus) extends ConstraintResult[ForeignKeyConstraint] {

  val message: String = {
    val referenceTable = constraint.referenceTable
    val columnNames = constraint.columnNames
    val columnsString = columnNames.map { case (baseCol, refCol) => baseCol + "->" + refCol }.mkString(", ")
    val isPlural = columnNames.length > 1
    val (columnDo, columnDefine, columnIs, columnPluralS) =
      if (isPlural) ("do", "define", "are", "s") else ("does", "defines", "is", "")
    val columnNoun = "Column" + columnPluralS
    val maybeNumNonMatchingRefs = data.map(_.numNonMatchingRefs)
    (status, maybeNumNonMatchingRefs) match {
      case (ConstraintSuccess, Some(Some(0))) =>
        s"$columnNoun $columnsString $columnDefine a foreign key " +
        s"pointing to the reference table $referenceTable."
      case (ConstraintFailure, Some(None)) =>
        s"$columnNoun $columnsString $columnIs not a key in the reference table."
      case (ConstraintFailure, Some(Some(nonMatching))) =>
        val (rowsNoun, rowsDo) = if (nonMatching != 1) ("rows", "do") else ("row", "does")
        s"$columnNoun $columnsString $columnDo not define a foreign key " +
          s"pointing to $referenceTable. $nonMatching $rowsNoun $rowsDo not match."
      case (ConstraintError(throwable), None) =>
        s"Checking whether ${columnNoun.toLowerCase} $columnsString $columnDefine a foreign key failed: $throwable"
      case default => throw IllegalConstraintResultException(this)
    }
  }

}

case class ForeignKeyConstraintResultData(numNonMatchingRefs: Option[Long]) 
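The constraint above renames both sides and inspects an outer join for null reference keys. A shorter way to count base keys without a match is a left anti join; a sketch assuming hypothetical DataFrames df and ref that share a customer_id column:

import org.apache.spark.sql.functions.col

// Distinct base keys that have no counterpart in the reference table.
val nonMatchingRefs = df
  .select(col("customer_id"))
  .distinct()
  .join(ref, Seq("customer_id"), "left_anti")
  .count()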
Example 34
Source File: AnyOfConstraint.scala    From drunken-data-quality   with Apache License 2.0 5 votes vote down vote up
package de.frosner.ddq.constraints

import org.apache.spark.sql.{Column, DataFrame}

import scala.util.Try

case class AnyOfConstraint(columnName: String, allowedValues: Set[Any]) extends Constraint {

  val fun = (df: DataFrame) => {
    val maybeError = Try(df.select(new Column(columnName))) // check if column is not ambiguous
    val maybeColumnIndex = maybeError.map(_ => df.columns.indexOf(columnName))
    val maybeNotAllowedCount = maybeColumnIndex.map(columnIndex => df.rdd.filter(row => !row.isNullAt(columnIndex) &&
      !allowedValues.contains(row.get(columnIndex))).count)
    AnyOfConstraintResult(
      constraint = this,
      data = maybeNotAllowedCount.toOption.map(AnyOfConstraintResultData),
      status = ConstraintUtil.tryToStatus[Long](maybeNotAllowedCount, _ == 0)
    )
  }

}

case class AnyOfConstraintResult(constraint: AnyOfConstraint,
                                 data: Option[AnyOfConstraintResultData],
                                 status: ConstraintStatus) extends ConstraintResult[AnyOfConstraint] {
  val message: String = {
    val allowed = constraint.allowedValues
    val columnName = constraint.columnName
    val maybeFailedRows = data.map(_.failedRows)
    val maybePluralSAndVerb = maybeFailedRows.map(failedRows => if (failedRows == 1) ("", "is") else ("s", "are"))
    (status, maybeFailedRows, maybePluralSAndVerb) match {
      case (ConstraintSuccess, Some(0), Some((pluralS, verb))) =>
        s"Column $columnName contains only values in $allowed."
      case (ConstraintFailure, Some(failedRows), Some((pluralS, verb))) =>
        s"Column $columnName contains $failedRows row$pluralS that $verb not in $allowed."
      case (ConstraintError(throwable), None, None) =>
        s"Checking whether column $columnName contains only values in $allowed failed: $throwable"
      case default => throw IllegalConstraintResultException(this)
    }
  }
}

case class AnyOfConstraintResultData(failedRows: Long) 
Example 35
Source File: JoinableConstraint.scala    From drunken-data-quality   with Apache License 2.0 5 votes vote down vote up
package de.frosner.ddq.constraints

import org.apache.spark.sql.{Column, DataFrame}

import scala.util.Try

case class JoinableConstraint(columnNames: Seq[(String, String)], referenceTable: DataFrame) extends Constraint {

  val fun = (df: DataFrame) => {
    val columnsMap = columnNames.toMap
    val renamedColumns = columnNames.map{ case (baseColumn, refColumn) => ("b_" + baseColumn, "r_" + refColumn)}
    val (baseColumns, refColumns) = columnNames.unzip
    val (renamedBaseColumns, renamedRefColumns) = renamedColumns.unzip

    val maybeNonUniqueRows = Try(
      referenceTable.groupBy(refColumns.map(new Column(_)):_*).count.filter(new Column("count") > 1).count
    )

    // rename all columns to avoid ambiguous column references
    val maybeRenamedDfAndRef = maybeNonUniqueRows.map(_ => {
      val renamedDf = df.select(baseColumns.zip(renamedBaseColumns).map {
        case (original, renamed) => new Column(original).as(renamed)
      }: _*)
      val renamedRef = referenceTable.select(refColumns.zip(renamedRefColumns).map {
        case (original, renamed) => new Column(original).as(renamed)
      }: _*)
      (renamedDf, renamedRef)
    })

    // check if join yields some values
    val maybeDistinctBeforeAndMatchingRows = maybeRenamedDfAndRef.map { case (renamedDf, renamedRef) =>
      val renamedDfDistinct = renamedDf.distinct
      val distinctBefore = renamedDfDistinct.count
      val joinCondition = renamedColumns.map{
        case (baseColumn, refColumn) => new Column(baseColumn) === new Column(refColumn)
      }.reduce(_ && _)
      val join = renamedDfDistinct.join(renamedRef, joinCondition)
      val matchingRows = join.distinct.count
      (distinctBefore, matchingRows)
    }

    JoinableConstraintResult(
      constraint = this,
      data = maybeDistinctBeforeAndMatchingRows.toOption.map{ case (distinctBefore, matchingRows) =>
        JoinableConstraintResultData(
          distinctBefore = distinctBefore,
          matchingKeys = matchingRows
        )
      },
      status = ConstraintUtil.tryToStatus[Long](maybeDistinctBeforeAndMatchingRows.map{
        case (distinctBefore, matchingRows) => matchingRows
      }, _ > 0)
    )
  }

}

case class JoinableConstraintResult(constraint: JoinableConstraint,
                                    data: Option[JoinableConstraintResultData],
                                    status: ConstraintStatus) extends ConstraintResult[JoinableConstraint] {

  val maybeMatchRatio: Option[Double] = data.map(d => d.matchingKeys.toDouble / d.distinctBefore)

  val message: String = {
    val columnNames = constraint.columnNames
    val columnsString = columnNames.map{ case (baseCol, refCol) => baseCol + "->" + refCol }.mkString(", ")
    val maybeMatchPercentage = maybeMatchRatio.map(_ * 100.0)
    (status, data, maybeMatchPercentage) match {
      case (ConstraintSuccess, Some(JoinableConstraintResultData(distinctBefore, matchingKeys)), Some(matchPercentage)) =>
        s"Key $columnsString can be used for joining. " +
        s"Join columns cardinality in base table: $distinctBefore. " +
        s"Join columns cardinality after joining: $matchingKeys (${"%.2f".format(matchPercentage)}" + "%)."
      case (ConstraintFailure, Some(_), Some(_)) => s"Key $columnsString cannot be used for joining (no result)."
      case (ConstraintError(throwable), None, None) =>
        s"Checking whether $columnsString can be used for joining failed: $throwable"
      case default => throw IllegalConstraintResultException(this)
    }
  }

}

case class JoinableConstraintResultData(distinctBefore: Long, matchingKeys: Long) 
Example 36
Source File: ColumnColumnConstraint.scala    From drunken-data-quality   with Apache License 2.0 5 votes vote down vote up
package de.frosner.ddq.constraints

import org.apache.spark.sql.{Column, DataFrame}

import scala.util.Try

case class ColumnColumnConstraint(constraintColumn: Column) extends Constraint {

  val fun = (df: DataFrame) => {
    val maybeFailingRows = Try {
      val succeedingRows = df.filter(constraintColumn).count
      df.count - succeedingRows
    }
    ColumnColumnConstraintResult(
      constraint = this,
      data = maybeFailingRows.toOption.map(ColumnColumnConstraintResultData),
      status = ConstraintUtil.tryToStatus[Long](maybeFailingRows, _ == 0)
    )
  }

}

case class ColumnColumnConstraintResult(constraint: ColumnColumnConstraint,
                                        data: Option[ColumnColumnConstraintResultData],
                                        status: ConstraintStatus) extends ConstraintResult[ColumnColumnConstraint] {

  val message: String = ColumnConstraintUtil.createColumnConstraintMessage(
    status = status,
    constraintResult = this,
    constraintString = constraint.constraintColumn.toString,
    maybeViolatingRows = data.map(_.failedRows)
  )

}

case class ColumnColumnConstraintResultData(failedRows: Long) 
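The pattern above — counting the rows that satisfy an arbitrary Column predicate and subtracting from the total — works for any boolean expression; note that rows where the predicate evaluates to null count as failing, because filter drops them. A one-line sketch with a hypothetical numeric column amount:

import org.apache.spark.sql.functions.col

val failingRows = df.count() - df.filter(col("amount") >= 0).count()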
Example 37
Source File: ConditionalColumnConstraint.scala    From drunken-data-quality   with Apache License 2.0 5 votes vote down vote up
package de.frosner.ddq.constraints

import org.apache.spark.sql.{Column, DataFrame}

import scala.util.Try

case class ConditionalColumnConstraint(statement: Column, implication: Column) extends Constraint {

  val fun = (df: DataFrame) => {
    val maybeFailingRows = Try {
      val succeedingRows = df.filter(!statement || implication).count
      df.count - succeedingRows
    }
    ConditionalColumnConstraintResult(
      constraint = this,
      data = maybeFailingRows.toOption.map(ConditionalColumnConstraintResultData),
      status = ConstraintUtil.tryToStatus[Long](maybeFailingRows, _ == 0)
    )
  }

}

case class ConditionalColumnConstraintResult(constraint: ConditionalColumnConstraint,
                                             data: Option[ConditionalColumnConstraintResultData],
                                             status: ConstraintStatus) extends ConstraintResult[ConditionalColumnConstraint] {

  val message: String = ColumnConstraintUtil.createColumnConstraintMessage(
    status = status,
    constraintResult = this,
    constraintString = s"${constraint.statement} -> ${constraint.implication}",
    maybeViolatingRows = data.map(_.failedRows)
  )

}

case class ConditionalColumnConstraintResultData(failedRows: Long) 
Example 38
Source File: TypeConversionConstraint.scala    From drunken-data-quality   with Apache License 2.0 5 votes vote down vote up
package de.frosner.ddq.constraints

import org.apache.spark.sql.types.DataType
import org.apache.spark.sql.{Column, DataFrame}

import scala.util.Try

case class TypeConversionConstraint(columnName: String,
                                    convertedType: DataType) extends Constraint {

  val fun = (df: DataFrame) => {
    val originalColumn = new Column(columnName)
    val castedColumnName = columnName + "_casted"
    val maybeCasted = Try(df.select(originalColumn, originalColumn.cast(convertedType).as(castedColumnName)))
    val maybeFailedCastsAndOriginalType = maybeCasted.map(casted => {
      val failedCastsCount = casted.filter(new Column(castedColumnName).isNull && originalColumn.isNotNull).count
      val originalType = df.schema.find(_.name == columnName).get.dataType
      (failedCastsCount, originalType)
    })
    TypeConversionConstraintResult(
      constraint = this,
      data = maybeFailedCastsAndOriginalType.toOption.map{ case (failedCastsCount, originalType) =>
        TypeConversionConstraintResultData(
          originalType = originalType,
          failedRows = failedCastsCount
        )
      },
      status = ConstraintUtil.tryToStatus[Long](maybeFailedCastsAndOriginalType.map{
        case (failedCastsCount, originalType) => failedCastsCount
      }, _ == 0)
    )
  }

}

case class TypeConversionConstraintResult(constraint: TypeConversionConstraint,
                                          data: Option[TypeConversionConstraintResultData],
                                          status: ConstraintStatus) extends ConstraintResult[TypeConversionConstraint] {

  val message: String = {
    val convertedType = constraint.convertedType
    val columnName = constraint.columnName
    val maybePluralSVerb = data.map(data => if (data.failedRows == 1) ("", "is") else ("s", "are"))
    (status, data, maybePluralSVerb) match {
      case (ConstraintSuccess, Some(TypeConversionConstraintResultData(originalType, 0)), _) =>
        s"Column $columnName can be converted from $originalType to $convertedType."
      case (ConstraintFailure, Some(TypeConversionConstraintResultData(originalType, failedRows)), Some((pluralS, verb))) =>
        s"Column $columnName cannot be converted from $originalType to $convertedType. " +
        s"$failedRows row$pluralS could not be converted."
      case (ConstraintError(throwable), None, None) =>
        s"Checking whether column $columnName can be converted to $convertedType failed: $throwable"
      case default => throw IllegalConstraintResultException(this)
    }
  }

}

case class TypeConversionConstraintResultData(originalType: DataType, failedRows: Long) 
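The failed-cast count relies on cast returning null where a conversion is impossible while the original value is present. The same check as a standalone sketch, assuming a hypothetical string column price cast to DoubleType:

import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.DoubleType

val original = col("price")
val casted = original.cast(DoubleType)

// Values that are present but cannot be represented as a double.
val failedCasts = df.filter(casted.isNull && original.isNotNull).count()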
Example 39
Source File: DateFormatConstraint.scala    From drunken-data-quality   with Apache License 2.0 5 votes vote down vote up
package de.frosner.ddq.constraints

import java.text.SimpleDateFormat

import org.apache.spark.sql.functions._
import org.apache.spark.sql.{Column, DataFrame}

import scala.util.Try

case class DateFormatConstraint(columnName: String,
                                formatString: String) extends Constraint {

  val fun = (df: DataFrame) => {
    val cannotBeDate = udf((column: String) =>
      column != null && Try {
        val format = new SimpleDateFormat(formatString)
        format.setLenient(false)
        format.parse(column)
      }.isFailure)
    val maybeCannotBeDateCount = Try(df.filter(cannotBeDate(new Column(columnName))).count)
    DateFormatConstraintResult(
      this,
      data = maybeCannotBeDateCount.toOption.map(DateFormatConstraintResultData),
      status = ConstraintUtil.tryToStatus[Long](maybeCannotBeDateCount, _ == 0)
    )
  }

}

case class DateFormatConstraintResult(constraint: DateFormatConstraint,
                                      data: Option[DateFormatConstraintResultData],
                                      status: ConstraintStatus) extends ConstraintResult[DateFormatConstraint] {

  val message: String = {
    val format = constraint.formatString
    val columnName = constraint.columnName
    val maybeFailedRows = data.map(_.failedRows)
    val maybePluralS = maybeFailedRows.map(failedRows => if (failedRows == 1) "" else "s")
    val maybeVerb = maybeFailedRows.map(failedRows => if (failedRows == 1) "is" else "are")
    (status, maybeFailedRows, maybePluralS, maybeVerb) match {
      case (ConstraintSuccess, Some(0), _, _) =>
        s"Column $columnName is formatted by $format."
      case (ConstraintFailure, Some(failedRows), Some(pluralS), Some(verb)) =>
        s"Column $columnName contains $failedRows row$pluralS that $verb not formatted by $format."
      case (ConstraintError(throwable), None, None, None) =>
        s"Checking whether column $columnName is formatted by $format failed: $throwable"
      case default => throw IllegalConstraintResultException(this)
    }

  }

}

case class DateFormatConstraintResultData(failedRows: Long) 
Example 40
Source File: AlwaysNullConstraint.scala    From drunken-data-quality   with Apache License 2.0 5 votes vote down vote up
package de.frosner.ddq.constraints

import org.apache.spark.sql.{Column, DataFrame}

import scala.util.Try

case class AlwaysNullConstraint(columnName: String) extends Constraint {

  override val fun = (df: DataFrame) => {
    val tryNotNullCount = Try(df.filter(new Column(columnName).isNotNull).count)
    AlwaysNullConstraintResult(
      constraint = this,
      status = ConstraintUtil.tryToStatus[Long](tryNotNullCount, _ == 0),
      data = tryNotNullCount.toOption.map(AlwaysNullConstraintResultData)
    )
  }

}

case class AlwaysNullConstraintResult(constraint: AlwaysNullConstraint,
                                      status: ConstraintStatus,
                                      data: Option[AlwaysNullConstraintResultData]
                                     ) extends ConstraintResult[AlwaysNullConstraint] {

  val message: String = {
    val columnName = constraint.columnName
    val maybeNonNullRows = data.map(_.nonNullRows)
    val maybePluralS = maybeNonNullRows.map(n => if (n == 1) "" else "s")
    (status, maybeNonNullRows, maybePluralS) match {
      case (ConstraintError(throwable), None, None) =>
        s"Checking column $columnName for being always null failed: $throwable"
      case (ConstraintSuccess, Some(0), Some(pluralS)) =>
        s"Column $columnName is always null."
      case (ConstraintFailure, Some(nonNullRows), Some(pluralS)) =>
        s"Column $columnName contains $nonNullRows non-null row$pluralS (should always be null)."
      case default => throw IllegalConstraintResultException(this)
    }
  }

}

case class AlwaysNullConstraintResultData(nonNullRows: Long) 
Example 41
Source File: NumberOfRowsConstraint.scala    From drunken-data-quality   with Apache License 2.0 5 votes vote down vote up
package de.frosner.ddq.constraints

import org.apache.spark.sql.functions.count
import org.apache.spark.sql.{Column, DataFrame}

case class NumberOfRowsConstraint private[ddq] (expected: Column) extends Constraint {

  val fun = (df: DataFrame) => {
    val countDf = df.agg(count(new Column("*")).as(NumberOfRowsConstraint.countKey))
    val actual = countDf.collect().map(_.getLong(0)).apply(0)
    val satisfied = countDf.select(expected).collect().map(_.getBoolean(0)).apply(0)
    NumberOfRowsConstraintResult(
      constraint = this,
      actual = actual,
      status = if (satisfied) ConstraintSuccess else ConstraintFailure
    )
  }

}

object NumberOfRowsConstraint {

  private[constraints] val countKey: String = "count"

  def apply(expected: Column => Column): NumberOfRowsConstraint = {
    new NumberOfRowsConstraint(expected(new Column(countKey)))
  }

  def greaterThan(expected: Int): NumberOfRowsConstraint = {
    NumberOfRowsConstraint(_ > expected)
  }

  def lessThan(expected: Int): NumberOfRowsConstraint = {
    NumberOfRowsConstraint(_ < expected)
  }

  def equalTo(expected: Int): NumberOfRowsConstraint = {
    NumberOfRowsConstraint(_ === expected)
  }

}

case class NumberOfRowsConstraintResult(constraint: NumberOfRowsConstraint,
                                        actual: Long,
                                        status: ConstraintStatus) extends ConstraintResult[NumberOfRowsConstraint] {

  val message: String = {
    val expected = constraint.expected
    status match {
      case ConstraintSuccess => s"The number of rows satisfies $expected."
      case ConstraintFailure => s"The actual number of rows $actual does not satisfy $expected."
      case default => throw IllegalConstraintResultException(this)
    }
  }

} 
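Because the companion object builds the expected predicate against the internal count column, the factory methods shown above are the usual entry point. A usage sketch, assuming the DDQ constraint classes from these examples are on the classpath and df is a DataFrame:

// Require more than 1000 rows; fun runs the aggregation and yields the result object.
val constraint = NumberOfRowsConstraint.greaterThan(1000)
val result = constraint.fun(df)
result.status match {
  case ConstraintSuccess => println(result.message)
  case _                 => println(s"violated: ${result.message}")
}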
Example 42
Source File: HivemallUtils.scala    From hivemall-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hive

import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors}
import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Column, DataFrame, Row, UserDefinedFunction}

object HivemallUtils {

  // # of maximum dimensions for feature vectors
  val maxDims = 100000000

  
  def funcVectorizer(dense: Boolean = false, dims: Int = maxDims)
    : UserDefinedFunction = {
    udf(funcVectorizerImpl(dense, dims))
  }

  private def funcVectorizerImpl(dense: Boolean, dims: Int)
    : Seq[String] => Vector = {
    if (dense) {
      // Dense features
      i: Seq[String] => {
        val features = new Array[Double](dims)
        i.map { ft =>
          val s = ft.split(":").ensuring(_.size == 2)
          features(s(0).toInt) = s(1).toDouble
        }
        Vectors.dense(features)
      }
    } else {
      // Sparse features
      i: Seq[String] => {
        val features = i.map { ft =>
          // val s = ft.split(":").ensuring(_.size == 2)
          val s = ft.split(":")
          (s(0).toInt, s(1).toDouble)
        }
        Vectors.sparse(dims, features)
      }
    }
  }
} 
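funcVectorizer expects each row to carry its features as a sequence of "index:value" strings. A usage sketch, assuming a hypothetical DataFrame df with an array<string> column named features:

import org.apache.spark.sql.functions.col
import org.apache.spark.sql.hive.HivemallUtils

// Sparse vectors by default; pass dense = true (and optionally dims) for dense output.
val vectorize = HivemallUtils.funcVectorizer()
val vectorized = df.withColumn("feature_vector", vectorize(col("features")))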
Example 43
Source File: udfs.scala    From mmlspark   with MIT License 5 votes vote down vote up
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.Column
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.DoubleType

import scala.collection.mutable

//scalastyle:off
object udfs {

  def get_value_at(colName: String, i: Int): Column = {
    udf({
      vec: org.apache.spark.ml.linalg.Vector => vec(i)
    }, DoubleType)(col(colName))
  }

  val to_vector: UserDefinedFunction = udf({
    arr: Seq[Double] => Vectors.dense(arr.toArray)
  }, VectorType)

  def to_vector(colName: String): Column = to_vector(col(colName))

} 
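Both helpers return ordinary Columns, so they compose with select and withColumn. A usage sketch with hypothetical columns probability (an ML vector) and scores (array<double>):

import com.microsoft.ml.spark.stages.udfs

val enriched = df
  .withColumn("p1", udfs.get_value_at("probability", 1))   // second element of the vector
  .withColumn("score_vec", udfs.to_vector("scores"))        // array<double> -> ML vector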
Example 44
Source File: UDFTransformer.scala    From mmlspark   with MIT License 5 votes vote down vote up
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.contracts.{HasInputCol, HasInputCols, HasOutputCol, Wrappable}
import com.microsoft.ml.spark.core.env.InternalWrapper
import com.microsoft.ml.spark.core.serialize.ComplexParam
import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer}
import org.apache.spark.ml.param.{ParamMap, UDFParam, UDPyFParam}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.execution.python.UserDefinedPythonFunction
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.types.{DataType, StructField, StructType}
import org.apache.spark.sql.{Column, DataFrame, Dataset}
import org.apache.spark.sql.functions.col

object UDFTransformer extends ComplexParamsReadable[UDFTransformer]


  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    if (isSet(inputCol)) {
      dataset.withColumn(getOutputCol, applyUDF(dataset.col(getInputCol)))
    } else {
      dataset.withColumn(getOutputCol, applyUDFOnCols(getInputCols.map(col): _*))
    }
  }

  def validateAndTransformSchema(schema: StructType): StructType = {
    if (isSet(inputCol)) schema(getInputCol) else schema(Set(getInputCols: _*))
    schema.add(StructField(getOutputCol, getDataType))
  }

  def transformSchema(schema: StructType): StructType = validateAndTransformSchema(schema)

  def copy(extra: ParamMap): UDFTransformer = defaultCopy(extra)

} 
Example 45
Source File: ServingUDFs.scala    From mmlspark   with MIT License 5 votes vote down vote up
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package org.apache.spark.sql.execution.streaming

import com.microsoft.ml.spark.io.http.HTTPResponseData
import com.microsoft.ml.spark.io.http.HTTPSchema.{binary_to_response, empty_response, string_to_response}
import org.apache.spark.sql.execution.streaming.continuous.HTTPSourceStateHolder
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.{lit, struct, to_json, udf}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Column, Row}

import scala.util.Try

object ServingUDFs {

  private def jsonReply(c: Column) = string_to_response(to_json(c))

  def makeReplyUDF(data: Column, dt: DataType, code: Column = lit(200), reason: Column = lit("Success")): Column = {
    dt match {
      case NullType => empty_response(code, reason)
      case StringType => string_to_response(data, code, reason)
      case BinaryType => binary_to_response(data)
      case _: StructType => jsonReply(data)
      case _: MapType => jsonReply(data)
      case at: ArrayType => at.elementType match {
        case _: StructType => jsonReply(data)
        case _: MapType => jsonReply(data)
        case _ => jsonReply(struct(data))
      }
      case _ => jsonReply(struct(data))
    }
  }

  private def sendReplyHelper(mapper: Row => HTTPResponseData)(serviceName: String, reply: Row, id: Row): Boolean = {
    if (Option(reply).isEmpty || Option(id).isEmpty) {
      null.asInstanceOf[Boolean] //scalastyle:ignore null
    } else {
      Try(HTTPSourceStateHolder.getServer(serviceName).replyTo(id.getString(0), id.getString(1), mapper(reply)))
        .toOption.isDefined
    }
  }

  def sendReplyUDF: UserDefinedFunction = {
    val toData = HTTPResponseData.makeFromRowConverter
    udf(sendReplyHelper(toData) _, BooleanType)
  }

} 
Example 46
Source File: DeltaTableOperations.scala    From delta   with Apache License 2.0 5 votes vote down vote up
package io.delta.tables.execution

import scala.collection.Map

import org.apache.spark.sql.delta.{DeltaErrors, DeltaHistoryManager, DeltaLog, PreprocessTableUpdate}
import org.apache.spark.sql.delta.commands.{DeleteCommand, DeltaGenerateCommand, VacuumCommand}
import org.apache.spark.sql.delta.util.AnalysisHelper
import io.delta.tables.DeltaTable

import org.apache.spark.sql.{functions, Column, DataFrame, Dataset}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
import org.apache.spark.sql.catalyst.expressions.{Expression, SubqueryExpression}
import org.apache.spark.sql.catalyst.plans.logical._


trait DeltaTableOperations extends AnalysisHelper { self: DeltaTable =>

  protected def executeDelete(condition: Option[Expression]): Unit = improveUnsupportedOpError {
    val delete = DeleteFromTable(self.toDF.queryExecution.analyzed, condition)
    toDataset(sparkSession, delete)
  }

  protected def executeHistory(deltaLog: DeltaLog, limit: Option[Int]): DataFrame = {
    val history = new DeltaHistoryManager(deltaLog)
    val spark = self.toDF.sparkSession
    spark.createDataFrame(history.getHistory(limit))
  }

  protected def executeGenerate(tblIdentifier: String, mode: String): Unit = {
    val tableId: TableIdentifier = sparkSession
      .sessionState
      .sqlParser
      .parseTableIdentifier(tblIdentifier)
    val generate = DeltaGenerateCommand(mode, tableId)
    generate.run(sparkSession)
  }

  protected def executeUpdate(
      set: Map[String, Column],
      condition: Option[Column]): Unit = improveUnsupportedOpError {
    val assignments = set.map { case (targetColName, column) =>
      Assignment(UnresolvedAttribute.quotedString(targetColName), column.expr)
    }.toSeq
    val update = UpdateTable(self.toDF.queryExecution.analyzed, assignments, condition.map(_.expr))
    toDataset(sparkSession, update)
  }

  protected def executeVacuum(
      deltaLog: DeltaLog,
      retentionHours: Option[Double]): DataFrame = {
    VacuumCommand.gc(sparkSession, deltaLog, false, retentionHours)
    sparkSession.emptyDataFrame
  }

  protected def toStrColumnMap(map: Map[String, String]): Map[String, Column] = {
    map.toSeq.map { case (k, v) => k -> functions.expr(v) }.toMap
  }

  protected def sparkSession = self.toDF.sparkSession
} 
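These helpers back the public io.delta.tables.DeltaTable methods, which accept either Columns or SQL expression strings. A usage sketch against a Delta table at a hypothetical path /data/events:

import io.delta.tables.DeltaTable
import org.apache.spark.sql.functions.{col, lit}

val table = DeltaTable.forPath(spark, "/data/events")

// Column-based update and delete; executeUpdate/executeDelete above do the underlying work.
table.update(col("status") === "pending", Map("status" -> lit("expired")))
table.delete(col("eventTime") < "2019-01-01")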
Example 47
Source File: HasEmbeddingsProperties.scala    From spark-nlp   with Apache License 2.0 5 votes vote down vote up
package com.johnsnowlabs.nlp.embeddings

import com.johnsnowlabs.nlp.AnnotatorType
import org.apache.spark.ml.param.{BooleanParam, IntParam, Params}
import org.apache.spark.sql.Column
import org.apache.spark.sql.types.MetadataBuilder

trait HasEmbeddingsProperties extends Params {

  val dimension = new IntParam(this, "dimension", "Number of embedding dimensions")

  def setDimension(value: Int): this.type = set(this.dimension, value)
  def getDimension: Int = $(dimension)

  protected def wrapEmbeddingsMetadata(col: Column, embeddingsDim: Int, embeddingsRef: Option[String] = None): Column = {
    val metadataBuilder: MetadataBuilder = new MetadataBuilder()
    metadataBuilder.putString("annotatorType", AnnotatorType.WORD_EMBEDDINGS)
    metadataBuilder.putLong("dimension", embeddingsDim.toLong)
    embeddingsRef.foreach(ref => metadataBuilder.putString("ref", ref))
    col.as(col.toString, metadataBuilder.build)
  }

  protected def wrapSentenceEmbeddingsMetadata(col: Column, embeddingsDim: Int, embeddingsRef: Option[String] = None): Column = {
    val metadataBuilder: MetadataBuilder = new MetadataBuilder()
    metadataBuilder.putString("annotatorType", AnnotatorType.SENTENCE_EMBEDDINGS)
    metadataBuilder.putLong("dimension", embeddingsDim.toLong)
    embeddingsRef.foreach(ref => metadataBuilder.putString("ref", ref))
    col.as(col.toString, metadataBuilder.build)
  }

} 
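The wrapping above attaches annotator metadata through Column.as(alias, metadata), so the information travels with the schema. The mechanism in isolation, as a sketch with hypothetical column names:

import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.MetadataBuilder

val meta = new MetadataBuilder().putLong("dimension", 300L).build()
val withMeta = df.select(col("text"), col("embeddings").as("embeddings", meta))

// The metadata is now part of the schema of the embeddings field.
println(withMeta.schema("embeddings").metadata.json)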
Example 48
Source File: PartitionHelpers.scala    From m3d-engine   with Apache License 2.0 5 votes vote down vote up
package com.adidas.analytics.algo.core

import org.apache.spark.sql.functions.col
import org.apache.spark.sql.{Column, DataFrame, Dataset, Row}


trait PartitionHelpers {

  protected def getDistinctPartitions(outputDataFrame: DataFrame, targetPartitions: Seq[String]): Dataset[Row] = {
    val targetPartitionsColumns: Seq[Column] = targetPartitions.map(partitionString => col(partitionString))

    outputDataFrame.select(targetPartitionsColumns: _*).distinct
  }

  protected def getParameterValue(row: Row, partitionString: String): String =
    createParameterValue(row.get(row.fieldIndex(partitionString)))

  protected def createParameterValue(partitionRawValue: Any): String =
    partitionRawValue match {
      case value: java.lang.Short => value.toString
      case value: java.lang.Integer => value.toString
      case value: scala.Predef.String => "'" + value + "'"
      case null => throw new Exception("Partition Value is null. No support for null partitions!")
      case value => throw new Exception("Unsupported partition DataType: " + value.getClass)
    }
} 
Example 49
Source File: FrequentItems.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.stat

import scala.collection.mutable.{Map => MutableMap}

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{Column, DataFrame, Dataset, Row}
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
import org.apache.spark.sql.types._

object FrequentItems extends Logging {

  
  def singlePassFreqItems(
      df: DataFrame,
      cols: Seq[String],
      support: Double): DataFrame = {
    require(support >= 1e-4 && support <= 1.0, s"Support must be in [1e-4, 1], but got $support.")
    val numCols = cols.length
    // number of max items to keep counts for
    val sizeOfMap = (1 / support).toInt
    val countMaps = Seq.tabulate(numCols)(i => new FreqItemCounter(sizeOfMap))
    val originalSchema = df.schema
    val colInfo: Array[(String, DataType)] = cols.map { name =>
      val index = originalSchema.fieldIndex(name)
      (name, originalSchema.fields(index).dataType)
    }.toArray

    val freqItems = df.select(cols.map(Column(_)) : _*).rdd.aggregate(countMaps)(
      seqOp = (counts, row) => {
        var i = 0
        while (i < numCols) {
          val thisMap = counts(i)
          val key = row.get(i)
          thisMap.add(key, 1L)
          i += 1
        }
        counts
      },
      combOp = (baseCounts, counts) => {
        var i = 0
        while (i < numCols) {
          baseCounts(i).merge(counts(i))
          i += 1
        }
        baseCounts
      }
    )
    val justItems = freqItems.map(m => m.baseMap.keys.toArray)
    val resultRow = Row(justItems : _*)
    // append frequent Items to the column name for easy debugging
    val outputCols = colInfo.map { v =>
      StructField(v._1 + "_freqItems", ArrayType(v._2, false))
    }
    val schema = StructType(outputCols).toAttributes
    Dataset.ofRows(df.sparkSession, LocalRelation.fromExternalRows(schema, Seq(resultRow)))
  }
} 
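This object sits behind the public DataFrameStatFunctions API, so callers normally reach it through df.stat rather than invoking singlePassFreqItems directly. A usage sketch with hypothetical columns a and b:

// Items occurring in at least 40% of rows per column (approximate; may contain false positives).
val freq = df.stat.freqItems(Seq("a", "b"), 0.4)
freq.show(false)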
Example 50
Source File: DataFrameModifierHelper.scala    From sparta   with Apache License 2.0 5 votes vote down vote up
package com.stratio.sparta.driver.writer

import com.stratio.sparta.sdk.pipeline.autoCalculations.AutoCalculatedField
import com.stratio.sparta.sdk.pipeline.output.Output
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.sql.{Column, DataFrame}

object DataFrameModifierHelper {

  def applyAutoCalculateFields(dataFrame: DataFrame,
                               autoCalculateFields: Seq[AutoCalculatedField],
                               auxSchema: StructType): DataFrame =
    autoCalculateFields.headOption match {
      case Some(firstAutoCalculate) =>
        applyAutoCalculateFields(
          addColumnToDataFrame(dataFrame, firstAutoCalculate, auxSchema), autoCalculateFields.drop(1), auxSchema)
      case None =>
        dataFrame
    }

  private[driver] def addColumnToDataFrame(dataFrame: DataFrame,
                                   autoCalculateField: AutoCalculatedField,
                                   auxSchema: StructType): DataFrame = {
    (autoCalculateField.fromNotNullFields,
      autoCalculateField.fromPkFields,
      autoCalculateField.fromFields,
      autoCalculateField.fromFixedValue) match {
      case (Some(fromNotNullFields), _, _, _) =>
        val fields = fieldsWithAuxMetadata(dataFrame.schema.fields, auxSchema.fields).flatMap(field =>
          if (!field.nullable) Some(col(field.name)) else None).toSeq
        addField(fromNotNullFields.field.name, fromNotNullFields.field.outputType, dataFrame, fields)
      case (None, Some(fromPkFields), _, _) =>
        val fields = fieldsWithAuxMetadata(dataFrame.schema.fields, auxSchema.fields).flatMap(field =>
          if (field.metadata.contains(Output.PrimaryKeyMetadataKey)) Some(col(field.name)) else None).toSeq
        addField(fromPkFields.field.name, fromPkFields.field.outputType, dataFrame, fields)
      case (None, None, Some(fromFields), _) =>
        val fields = autoCalculateField.fromFields.get.fromFields.map(field => col(field))
        addField(fromFields.field.name, fromFields.field.outputType, dataFrame, fields)
      case (None, None, None, Some(fromFixedValue)) =>
        addLiteral(fromFixedValue.field.name, fromFixedValue.field.outputType, dataFrame, fromFixedValue.value)
      case _ => dataFrame
    }
  }

  private[driver] def addField(name: String, outputType: String, dataFrame: DataFrame, fields: Seq[Column]): DataFrame =
    outputType match {
      case "string" => dataFrame.withColumn(name, concat_ws(Output.Separator, fields: _*))
      case "array" => dataFrame.withColumn(name, array(fields: _*))
      case "map" => dataFrame.withColumn(name, struct(fields: _*))
      case _ => dataFrame
    }

  private[driver] def addLiteral(name: String, outputType: String, dataFrame: DataFrame, literal: String): DataFrame =
    outputType match {
      case "string" => dataFrame.withColumn(name, lit(literal))
      case "array" => dataFrame.withColumn(name, array(lit(literal)))
      case "map" => dataFrame.withColumn(name, struct(lit(literal)))
      case _ => dataFrame
    }

  private[driver] def fieldsWithAuxMetadata(dataFrameFields: Array[StructField], auxFields: Array[StructField]) =
    dataFrameFields.map(field => {
      auxFields.find(auxField => auxField.name == field.name) match {
        case Some(auxFounded) => field.copy(metadata = auxFounded.metadata)
        case None => field
      }
    })
} 
Example 51
Source File: ShortestPaths.scala    From graphframes   with Apache License 2.0 5 votes vote down vote up
package org.graphframes.lib

import java.util

import scala.collection.JavaConverters._

import org.apache.spark.graphx.{lib => graphxlib}
import org.apache.spark.sql.{Column, DataFrame, Row}
import org.apache.spark.sql.api.java.UDF1
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{IntegerType, MapType}

import org.graphframes.GraphFrame


  def landmarks(value: util.ArrayList[Any]): this.type = {
    landmarks(value.asScala)
  }

  def run(): DataFrame = {
    ShortestPaths.run(graph, check(lmarks, "landmarks"))
  }
}

private object ShortestPaths {

  private def run(graph: GraphFrame, landmarks: Seq[Any]): DataFrame = {
    val idType = graph.vertices.schema(GraphFrame.ID).dataType
    val longIdToLandmark = landmarks.map(l => GraphXConversions.integralId(graph, l) -> l).toMap
    val gx = graphxlib.ShortestPaths.run(
      graph.cachedTopologyGraphX,
      longIdToLandmark.keys.toSeq.sorted).mapVertices { case (_, m) => m.toSeq }
    val g = GraphXConversions.fromGraphX(graph, gx, vertexNames = Seq(DISTANCE_ID))
    val distanceCol: Column = if (graph.hasIntegralIdType) {
      // It seems there is no easy way to convert a sequence of pairs into a map
      val mapToLandmark = udf { distances: Seq[Row] =>
        distances.map { case Row(k: Long, v: Int) =>
          k -> v
        }.toMap
      }
      mapToLandmark(g.vertices(DISTANCE_ID))
    } else {
      val func = new UDF1[Seq[Row], Map[Any, Int]] {
        override def call(t1: Seq[Row]): Map[Any, Int] = {
          t1.map { case Row(k: Long, v: Int) =>
              longIdToLandmark(k) -> v
          }.toMap
        }
      }
      val mapToLandmark = udf(func, MapType(idType, IntegerType, false))
      mapToLandmark(col(DISTANCE_ID))
    }
    val cols = graph.vertices.columns.map(col) :+ distanceCol.as(DISTANCE_ID)
    g.vertices.select(cols: _*)
  }

  private val DISTANCE_ID = "distances"

} 
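Callers reach this code through the GraphFrame API rather than the private run method. A usage sketch, assuming a GraphFrame g built elsewhere and hypothetical landmark vertex ids "a" and "d":

import org.graphframes.GraphFrame

// One row per vertex, with a distances map keyed by landmark id.
val results = g.shortestPaths.landmarks(Seq("a", "d")).run()
results.select("id", "distances").show(false)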
Example 52
Source File: VerifyVowpalWabbitRegressorFuzzing.scala    From mmlspark   with MIT License 5 votes vote down vote up
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.vw

import com.microsoft.ml.spark.core.test.benchmarks.DatasetUtils
import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject}
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.{Column, DataFrame}

class VerifyVowpalWabbitRegressorFuzzing extends EstimatorFuzzing[VowpalWabbitRegressor] {
  val numPartitions = 2

  
  def readCSV(fileName: String, fileLocation: String): DataFrame = {
    session.read
      .option("header", "true").option("inferSchema", "true")
      .option("treatEmptyValuesAsNulls", "false")
      .option("delimiter", if (fileName.endsWith(".csv")) "," else "\t")
      .csv(fileLocation)
  }

  override def reader: MLReadable[_] = VowpalWabbitRegressor

  override def modelReader: MLReadable[_] = VowpalWabbitRegressionModel

  override def testObjects(): Seq[TestObject[VowpalWabbitRegressor]] = {
    val fileName = "energyefficiency2012_data.train.csv"
    val columnsFilter = Some("X1,X2,X3,X4,X5,X6,X7,X8,Y1,Y2")
    val labelCol = "Y1"

    val fileLocation = DatasetUtils.regressionTrainFile(fileName).toString
    val readDataset = readCSV(fileName, fileLocation).repartition(numPartitions)
    val dataset =
      if (columnsFilter.isDefined) {
        readDataset.select(columnsFilter.get.split(",").map(new Column(_)): _*)
      } else {
        readDataset
      }

    val featuresColumn = "features"

    val featurizer = new VowpalWabbitFeaturizer()
      .setInputCols(dataset.columns.filter(col => col != labelCol))
      .setOutputCol("features")

    val vw = new VowpalWabbitRegressor()
    val predCol = "pred"
    val trainData = featurizer.transform(dataset)
    val model = vw.setLabelCol(labelCol)
      .setFeaturesCol("features")
      .setPredictionCol(predCol)
      .fit(trainData)

    Seq(new TestObject(
      vw,
      trainData))
  }
} 
Example 53
Source File: FrequentItems.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.stat

import scala.collection.mutable.{Map => MutableMap}

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{Column, DataFrame, Dataset, Row}
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
import org.apache.spark.sql.types._

object FrequentItems extends Logging {

  
  def singlePassFreqItems(
      df: DataFrame,
      cols: Seq[String],
      support: Double): DataFrame = {
    require(support >= 1e-4 && support <= 1.0, s"Support must be in [1e-4, 1], but got $support.")
    val numCols = cols.length
    // number of max items to keep counts for
    val sizeOfMap = (1 / support).toInt
    val countMaps = Seq.tabulate(numCols)(i => new FreqItemCounter(sizeOfMap))
    val originalSchema = df.schema
    val colInfo: Array[(String, DataType)] = cols.map { name =>
      val index = originalSchema.fieldIndex(name)
      (name, originalSchema.fields(index).dataType)
    }.toArray

    val freqItems = df.select(cols.map(Column(_)) : _*).rdd.treeAggregate(countMaps)(
      seqOp = (counts, row) => {
        var i = 0
        while (i < numCols) {
          val thisMap = counts(i)
          val key = row.get(i)
          thisMap.add(key, 1L)
          i += 1
        }
        counts
      },
      combOp = (baseCounts, counts) => {
        var i = 0
        while (i < numCols) {
          baseCounts(i).merge(counts(i))
          i += 1
        }
        baseCounts
      }
    )
    val justItems = freqItems.map(m => m.baseMap.keys.toArray)
    val resultRow = Row(justItems : _*)
    // append frequent Items to the column name for easy debugging
    val outputCols = colInfo.map { v =>
      StructField(v._1 + "_freqItems", ArrayType(v._2, false))
    }
    val schema = StructType(outputCols).toAttributes
    Dataset.ofRows(df.sparkSession, LocalRelation.fromExternalRows(schema, Seq(resultRow)))
  }
} 
Example 54
Source File: UserDefinedFunction.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.expressions

import org.apache.spark.annotation.InterfaceStability
import org.apache.spark.sql.Column
import org.apache.spark.sql.catalyst.ScalaReflection
import org.apache.spark.sql.catalyst.expressions.ScalaUDF
import org.apache.spark.sql.types.DataType


  def asNondeterministic(): UserDefinedFunction = {
    if (!_deterministic) {
      this
    } else {
      val udf = copyAll()
      udf._deterministic = false
      udf
    }
  }
}

// We have to use a name different from `UserDefinedFunction` here, to avoid breaking the binary
// compatibility of the auto-generated UserDefinedFunction object.
private[sql] object SparkUserDefinedFunction {

  def create(
      f: AnyRef,
      dataType: DataType,
      inputSchemas: Seq[Option[ScalaReflection.Schema]]): UserDefinedFunction = {
    val inputTypes = if (inputSchemas.contains(None)) {
      None
    } else {
      Some(inputSchemas.map(_.get.dataType))
    }
    val udf = new UserDefinedFunction(f, dataType, inputTypes)
    udf.nullableTypes = Some(inputSchemas.map(_.map(_.nullable).getOrElse(true)))
    udf
  }
} 
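From user code this surfaces as UserDefinedFunction.asNondeterministic, which tells the optimizer not to assume that repeated invocations return the same value, and therefore not to deduplicate or reorder the calls freely. A sketch:

import org.apache.spark.sql.functions.udf

// Without asNondeterministic, Spark may collapse repeated calls into a single evaluation.
val randomToken = udf(() => scala.util.Random.nextLong()).asNondeterministic()
val withToken = df.withColumn("row_token", randomToken())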
Example 55
Source File: GroupSortedDataset.scala    From spark-sorted   with Apache License 2.0 5 votes vote down vote up
package com.tresata.spark.sorted.sql

import scala.reflect.ClassTag

import org.apache.spark.sql.{ Column, Dataset, Encoder }
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.catalyst.encoders.{ encoderFor, ExpressionEncoder }

import com.tresata.spark.sorted.{ mapStreamIterator, mapStreamIteratorWithContext, newWCreate }

object GroupSortedDataset {
  private[sql] def apply[K: Encoder, V](dataset: Dataset[(K, V)], numPartitions: Option[Int], reverse: Boolean, sortBy: Column => Column): GroupSortedDataset[K, V] = {
    val key = col(dataset.columns.head)
    val valueSort = {
      val sort = sortBy(col(dataset.columns.last))
      if (reverse) sort.desc else sort.asc
    }
    new GroupSortedDataset(numPartitions.map(dataset.repartition(_, key)).getOrElse(dataset.repartition(key)).sortWithinPartitions(key, valueSort))
  }
}

class GroupSortedDataset[K: Encoder, V] private (dataset: Dataset[(K, V)]) extends Serializable {
  def toDS: Dataset[(K, V)] = dataset

  def mapStreamByKey[W: Encoder, C](c: () => C)(f: (C, Iterator[V]) => TraversableOnce[W]): Dataset[(K, W)] = {
    implicit val kwEncoder: Encoder[(K, W)] = ExpressionEncoder.tuple(encoderFor[K], encoderFor[W])
    dataset.mapPartitions(mapStreamIteratorWithContext(_)(c, f))
  }

  def mapStreamByKey[W: Encoder](f: Iterator[V] => TraversableOnce[W]): Dataset[(K, W)] = {
    implicit val kwEncoder: Encoder[(K, W)] = ExpressionEncoder.tuple(encoderFor[K], encoderFor[W])
    dataset.mapPartitions(mapStreamIterator(_)(f))
  }

  def foldLeftByKey[W: ClassTag: Encoder](w: W)(f: (W, V) => W): Dataset[(K, W)] = {
    val wCreate = newWCreate(w)
    mapStreamByKey(iter => Iterator(iter.foldLeft(wCreate())(f)))
  }

  def reduceLeftByKey[W >: V: Encoder](f: (W, V) => W): Dataset[(K, W)] =
    mapStreamByKey(iter => Iterator(iter.reduceLeft(f)))

  def scanLeftByKey[W: ClassTag: Encoder](w: W)(f: (W, V) => W): Dataset[(K, W)] = {
    val wCreate = newWCreate(w)
    mapStreamByKey(_.scanLeft(wCreate())(f))
  }
} 
Example 56
Source File: Filter.scala    From piflow   with BSD 2-Clause "Simplified" License 5 votes vote down vote up
package cn.piflow.bundle.common

import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.sql.{Column, DataFrame}

class Filter extends ConfigurableStop{
  override val authorEmail: String = "[email protected]"
  override val description: String = "Filter by condition"
  override val inportList: List[String] = List(Port.DefaultPort)
  override val outportList: List[String] = List(Port.DefaultPort)

  var condition: String = _

  override def setProperties(map: Map[String, Any]): Unit = {
    condition = MapUtil.get(map,"condition").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor : List[PropertyDescriptor] = List()
    val condition = new PropertyDescriptor().name("condition").
      displayName("condition")
      .description("The condition you want to filter")
      .defaultValue("name=='zhangsan'")
      .required(true)
      .example("name=='zhangsan'")
    descriptor = condition :: descriptor
    descriptor

  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/common/SelectField.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.CommonGroup)
  }

  override def initialize(ctx: ProcessContext): Unit = {

  }

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {

    val df = in.read()

    var filterDF : DataFrame = df.filter(condition)

    out.write(filterDF)
  }
} 
Example 57
Source File: SelectField.scala    From piflow   with BSD 2-Clause "Simplified" License 5 votes vote down vote up
package cn.piflow.bundle.common

import cn.piflow._
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.sql.{Column, DataFrame}

import scala.beans.BeanProperty


class SelectField extends ConfigurableStop {

  val authorEmail: String = "[email protected]"
  val description: String = "Select data column"
  val inportList: List[String] = List(Port.DefaultPort)
  val outportList: List[String] = List(Port.DefaultPort)

  var columnNames:String = _

  def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val df = in.read()

    val field = columnNames.split(",").map(x => x.trim)
    val columnArray : Array[Column] = new Array[Column](field.size)
    for(i <- 0 to field.size - 1){
      columnArray(i) = new Column(field(i))
    }

    var finalFieldDF : DataFrame = df.select(columnArray:_*)
    out.write(finalFieldDF)
  }

  def initialize(ctx: ProcessContext): Unit = {

  }

  def setProperties(map : Map[String, Any]): Unit = {
    columnNames = MapUtil.get(map,"columnNames").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor : List[PropertyDescriptor] = List()
    val inports = new PropertyDescriptor()
      .name("columnNames")
      .displayName("ColumnNames")
      .description("Select the column you want,multiple columns separated by commas")
      .defaultValue("")
      .required(true)
      .example("id,name")
    descriptor = inports :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/common/SelectField.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.CommonGroup)
  }

} 
Example 58
Source File: Join.scala    From piflow   with BSD 2-Clause "Simplified" License 5 votes vote down vote up
package cn.piflow.bundle.common

import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.spark.sql.{Column, DataFrame}

class Join extends ConfigurableStop{
  override val authorEmail: String = "[email protected]"
  override val description: String = "Table joins include full join, left join, right join and inner join"
  override val inportList: List[String] =List(Port.LeftPort,Port.RightPort)
  override val outportList: List[String] = List(Port.DefaultPort)

  var joinMode:String=_
  var correlationColumn:String=_

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {

    val leftDF =  in.read(Port.LeftPort)
    val rightDF = in.read(Port.RightPort)

    var seq: Seq[String]= Seq()
    correlationColumn.split(",").foreach(x=>{
      seq = seq .++(Seq(x.trim.toString))
    })

    var df: DataFrame = null
    joinMode match {
      case "inner" =>df = leftDF.join(rightDF, seq)
      case "left" => df = leftDF.join(rightDF,seq,"left_outer")
      case "right" => df = leftDF.join(rightDF,seq,"right_outer")
      case "full_outer" => df = leftDF.join(rightDF,seq,"outer")
    }
    out.write(df)
  }

  override def setProperties(map: Map[String, Any]): Unit = {
    joinMode = MapUtil.get(map,"joinMode").asInstanceOf[String]
    correlationColumn = MapUtil.get(map,"correlationColumn").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor : List[PropertyDescriptor] = List()

    val joinMode = new PropertyDescriptor()
      .name("joinMode")
      .displayName("JoinMode")
      .description("For table associations, you can choose inner,left,right,full")
      .allowableValues(Set("inner","left","right","full_outer"))
      .defaultValue("inner")
      .required(true)
      .example("left")
    descriptor = joinMode :: descriptor

    val correlationColumn = new PropertyDescriptor()
      .name("correlationColumn")
      .displayName("CorrelationColumn")
      .description("Columns associated with tables,if multiple are separated by commas")
      .defaultValue("")
      .required(true)
      .example("id,name")
    descriptor = correlationColumn :: descriptor

    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/common/Join.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.CommonGroup)
  }


  override def initialize(ctx: ProcessContext): Unit = {

  }

} 
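For reference, a short sketch of what the stop computes for joinMode = "left" and correlationColumn = "id,name", written directly against the Spark API; leftDF and rightDF are assumed to be already-loaded DataFrames.

val joinColumns = Seq("id", "name")
// Joining on column names is a USING join, so the join columns appear only once in the result.
val joined = leftDF.join(rightDF, joinColumns, "left_outer")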
Example 59
Source File: JsonUtil.scala    From piflow   with BSD 2-Clause "Simplified" License 5 votes vote down vote up
package cn.piflow.bundle.util

import org.apache.spark.sql.functions.explode
import org.apache.spark.sql.{Column, DataFrame, SQLContext, SparkSession}

import scala.collection.mutable.ArrayBuffer

object JsonUtil extends Serializable{


  // The tag is a comma-separated list of the fields to keep.
  // To open (explode) an array field, write it as MasterField_ChildField, e.g. links_name
  // keeps the "name" attribute of each element of the "links" array.
  def ParserJsonDF(df: DataFrame, tag: String): DataFrame = {

    var openArrField:String=""
    var ArrSchame:String=""

    var tagARR: Array[String] = tag.split(",")
    var tagNew:String=""


    for(tt<-tagARR){

      if(tt.indexOf("_")> -1){
        //包含“.”
        val openField: Array[String] = tt.split("_")
        openArrField=openField(0)

        ArrSchame+=(openField(1)+",")
      }else{
        tagNew+=(tt+",")
      }
    }
    tagNew+=openArrField
    // drop the trailing comma (only when an array field was actually requested)
    if (ArrSchame.nonEmpty) ArrSchame = ArrSchame.substring(0, ArrSchame.length - 1)

    tagARR = tagNew.split(",")
    var FinalDF:DataFrame=df

    // if the user specified fields to return, project onto those columns only
    val selectedColumns: Seq[Column] = tagNew.split(",").toSeq.map(p => new Column(p))

    if(tag.length>0){
      FinalDF = FinalDF.select(selectedColumns : _*)
    }

    // if the user chose an array field to open and supplied its child schema
    if(openArrField.length>0&&ArrSchame.length>0){

      val schames: Array[String] = ArrSchame.split(",")

      var selARR: ArrayBuffer[String] = ArrayBuffer() // field names to select once the array field has been opened
      // iterate over the fields and wrap each one in a Column object
      var coARR: ArrayBuffer[Column] = ArrayBuffer() // columns passed to select() when opening the field
      val sss = tagNew.split(",") // column names passed to toDF() after the field is opened
      var co: Column = null
      for(each<-tagARR){
        if(each==openArrField){
          co = explode(FinalDF(openArrField))
          for(x<-schames){

            selARR+=(openArrField+"."+x)
          }
        }else{
          selARR+=each
          co=FinalDF(each)
        }
        coARR+=co
      }
      println("###################")
      selARR.foreach(println(_))
      val selSEQ: Seq[Column] = selARR.toSeq.map(q => new Column(q))

      // explode the array field, rename the resulting columns, then select the flattened fields
      val openedDF: DataFrame = FinalDF.select(coARR : _*).toDF(sss: _*)
      FinalDF = openedDF.select(selSEQ : _*)

    }

    FinalDF

  }
} 
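A hedged usage sketch of ParserJsonDF; the input path, its fields and the tag value are made up. Writing links_name in the tag asks the method to explode the links array and keep its name attribute, while the other entries are kept as plain columns. The SparkSession spark is assumed.

// Hypothetical input: records with id, title and an array field "links" of structs carrying a "name" attribute.
val raw = spark.read.json("/tmp/records.json")

val parsed = JsonUtil.ParserJsonDF(raw, "id,title,links_name")
parsed.show(false)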
Example 60
Source File: FieldPointer.scala    From ArchiveSpark   with MIT License 5 votes vote down vote up
package org.archive.archivespark.model.pointers

import org.apache.spark.sql
import org.apache.spark.sql.Column
import org.archive.archivespark.model._
import org.archive.archivespark.util.SelectorUtil

trait GenericFieldPointer[+R <: EnrichRoot, +T] extends Serializable { this: FieldPointer[_, _] =>
}

trait FieldPointer[Root <: EnrichRoot, T] extends GenericFieldPointer[Root, T] {
  def path[R <: Root](root: EnrichRootCompanion[R]): Seq[String]

  def get(root: Root): Option[T] = enrichable(root).map(_.get)

  def exists(root: Root): Boolean = root[T](path(root)).isDefined

  def enrichable(root: Root): Option[TypedEnrichable[T]] = {
    val initialized = init(root, excludeFromOutput = false)
    initialized[T](path(initialized))
  }

  def multi: MultiFieldPointer[Root, T] = new SingleToMultiFieldPointer[Root, T](this)

  def init[R <: Root](root: R, excludeFromOutput: Boolean): R = root

  def pathTo[R <: Root](root: EnrichRootCompanion[R], field: String): Seq[String] = path(root) ++ SelectorUtil.parse(field)

  def col(root: EnrichRootCompanion[Root]): Column = sql.functions.col(SelectorUtil.toString(path(root).filter(f => f != "*" && !f.startsWith("["))))

  def parent[A]: FieldPointer[Root, A] = new RelativeFieldPointer(this, 1, Seq.empty)

  def child[A](field: String): FieldPointer[Root, A] = new RelativeFieldPointer(this, 0, Seq(field))

  def sibling[A](field: String): FieldPointer[Root, A] = new RelativeFieldPointer(this, 1, Seq(field))

  def mapEnrichable[A](field: String)(f: TypedEnrichable[T] => A): EnrichFunc[Root, T, A] = {
    val sourcePointer = this
    new EnrichFunc[Root, T, A] {
      override def source: FieldPointer[Root, T] = sourcePointer
      override def fields: Seq[String] = Seq(field)
      override def derive(source: TypedEnrichable[T], derivatives: Derivatives): Unit = {
        derivatives << f(source)
      }
    }
  }

  def map[A](field: String)(f: T => A): EnrichFunc[Root, T, A] = mapEnrichable(field)(e => f(e.get))

  def mapMultiEnrichable[A](field: String)(f: TypedEnrichable[T] => Seq[A]): MultiEnrichFunc[Root, T, A] = {
    val sourcePointer = this
    new MultiEnrichFunc[Root, T, A] {
      override def source: FieldPointer[Root, T] = sourcePointer
      override def fields: Seq[String] = Seq(field)
      override def derive(source: TypedEnrichable[T], derivatives: Derivatives): Unit = {
        derivatives.setNext(MultiValueEnrichable(f(source)))
      }
    }
  }

  def mapMulti[A](field: String)(f: T => Seq[A]): MultiEnrichFunc[Root, T, A] = mapMultiEnrichable(field)(e => f(e.get))

  def mapIdentity(field: String): EnrichFunc[Root, T, T] = {
    val sourcePointer = this
    new EnrichFunc[Root, T, T] {
      override def source: FieldPointer[Root, T] = sourcePointer
      override def fields: Seq[String] = Seq(field)
      override def derive(source: TypedEnrichable[T], derivatives: Derivatives): Unit = {
        derivatives.setNext(IdentityField[T])
      }
    }
  }
}

object FieldPointer {
  def apply[Root <: EnrichRoot, T](path: String): FieldPointer[Root, T] = apply(SelectorUtil.parse(path))
  def apply[Root <: EnrichRoot, T](path: Seq[String]): FieldPointer[Root, T] = new PathFieldPointer(path)

  def multi[Root <: EnrichRoot, T](path: String): MultiFieldPointer[Root, T] = multi(SelectorUtil.parse(path))
  def multi[Root <: EnrichRoot, T](path: Seq[String]): MultiFieldPointer[Root, T] = apply(path).multi

  def root[Root <: TypedEnrichRoot[T], T]: FieldPointer[Root, T] = new PathFieldPointer(Seq.empty)
} 
Example 61
Source File: FrequentItems.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.stat

import scala.collection.mutable.{Map => MutableMap}

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{Column, DataFrame, Dataset, Row}
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
import org.apache.spark.sql.types._

object FrequentItems extends Logging {

  
  def singlePassFreqItems(
      df: DataFrame,
      cols: Seq[String],
      support: Double): DataFrame = {
    require(support >= 1e-4 && support <= 1.0, s"Support must be in [1e-4, 1], but got $support.")
    val numCols = cols.length
    // number of max items to keep counts for
    val sizeOfMap = (1 / support).toInt
    val countMaps = Seq.tabulate(numCols)(i => new FreqItemCounter(sizeOfMap))
    val originalSchema = df.schema
    val colInfo: Array[(String, DataType)] = cols.map { name =>
      val index = originalSchema.fieldIndex(name)
      (name, originalSchema.fields(index).dataType)
    }.toArray

    val freqItems = df.select(cols.map(Column(_)) : _*).rdd.aggregate(countMaps)(
      seqOp = (counts, row) => {
        var i = 0
        while (i < numCols) {
          val thisMap = counts(i)
          val key = row.get(i)
          thisMap.add(key, 1L)
          i += 1
        }
        counts
      },
      combOp = (baseCounts, counts) => {
        var i = 0
        while (i < numCols) {
          baseCounts(i).merge(counts(i))
          i += 1
        }
        baseCounts
      }
    )
    val justItems = freqItems.map(m => m.baseMap.keys.toArray)
    val resultRow = Row(justItems : _*)
    // append frequent Items to the column name for easy debugging
    val outputCols = colInfo.map { v =>
      StructField(v._1 + "_freqItems", ArrayType(v._2, false))
    }
    val schema = StructType(outputCols).toAttributes
    Dataset.ofRows(df.sparkSession, LocalRelation.fromExternalRows(schema, Seq(resultRow)))
  }
} 
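singlePassFreqItems is normally reached through the public DataFrameStatFunctions API rather than called directly. A brief sketch, with illustrative column names and an assumed DataFrame df:

// df.stat.freqItems delegates to FrequentItems.singlePassFreqItems.
val frequent = df.stat.freqItems(Seq("city", "device"), 0.01)
frequent.show(false)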
Example 62
Source File: SchemaColumnSelection.scala    From data-faker   with MIT License 5 votes vote down vote up
package com.dunnhumby.datafaker.schema.table.columns

import scala.reflect.runtime.universe.TypeTag
import java.sql.{Date, Timestamp}
import com.dunnhumby.datafaker.YamlParser.YamlParserProtocol
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions.{rand, udf}

case class SchemaColumnSelection[T](override val name: String, values: List[T])(implicit tag: TypeTag[T]) extends SchemaColumn {
  override def column(rowID: Option[Column] = None): Column = {
    val intToSelectionUDF = udf((index: Int) => {
      values(index)
    })

    intToSelectionUDF(rand() * values.length % values.length)
  }
}

object SchemaColumnSelectionProtocol extends SchemaColumnSelectionProtocol
trait SchemaColumnSelectionProtocol extends YamlParserProtocol {

  import net.jcazevedo.moultingyaml._

  implicit object SchemaColumnSelectionFormat extends YamlFormat[SchemaColumnSelection[_]] {

    override def read(yaml: YamlValue): SchemaColumnSelection[_] = {
      val fields = yaml.asYamlObject.fields
      val YamlString(dataType) = fields.getOrElse(YamlString("data_type"), deserializationError("data_type not set"))
      val YamlString(name) = fields.getOrElse(YamlString("name"), deserializationError("name not set"))
      val values = fields.getOrElse(YamlString("values"), deserializationError("selection values not set"))

      dataType match {
        case SchemaColumnDataType.Int => SchemaColumnSelection(name, values.convertTo[List[Int]])
        case SchemaColumnDataType.Long => SchemaColumnSelection(name, values.convertTo[List[Long]])
        case SchemaColumnDataType.Float => SchemaColumnSelection(name, values.convertTo[List[Float]])
        case SchemaColumnDataType.Double => SchemaColumnSelection(name, values.convertTo[List[Double]])
        case SchemaColumnDataType.Date => SchemaColumnSelection(name, values.convertTo[List[Date]])
        case SchemaColumnDataType.Timestamp => SchemaColumnSelection(name, values.convertTo[List[Timestamp]])
        case SchemaColumnDataType.String => SchemaColumnSelection(name, values.convertTo[List[String]])
        case _ => deserializationError(s"unsupported data_type: $dataType for ${SchemaColumnType.Selection}")
      }

    }

    override def write(obj: SchemaColumnSelection[_]): YamlValue = ???

  }

} 
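A small sketch of the selection column in isolation, bypassing the YAML protocol; the column name, the value list and the SparkSession spark are assumptions.

val country = SchemaColumnSelection("country", List("DE", "FR", "UK"))

// Each generated row receives one of the listed values, chosen pseudo-randomly.
val faked = spark.range(5).toDF("row_id")
  .withColumn(country.name, country.column())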
Example 63
Source File: UniqueValueRatio.scala    From deequ   with Apache License 2.0 5 votes vote down vote up
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Analyzers.COUNT_COL
import com.amazon.deequ.metrics.DoubleMetric
import org.apache.spark.sql.{Column, Row}
import org.apache.spark.sql.functions.{col, count, lit, sum}
import org.apache.spark.sql.types.DoubleType

case class UniqueValueRatio(columns: Seq[String], where: Option[String] = None)
  extends ScanShareableFrequencyBasedAnalyzer("UniqueValueRatio", columns)
  with FilterableAnalyzer {

  override def aggregationFunctions(numRows: Long): Seq[Column] = {
    sum(col(COUNT_COL).equalTo(lit(1)).cast(DoubleType)) :: count("*") :: Nil
  }

  override def fromAggregationResult(result: Row, offset: Int): DoubleMetric = {
    val numUniqueValues = result.getDouble(offset)
    val numDistinctValues = result.getLong(offset + 1).toDouble

    toSuccessMetric(numUniqueValues / numDistinctValues)
  }

  override def filterCondition: Option[String] = where
}

object UniqueValueRatio {
  def apply(column: String): UniqueValueRatio = {
    new UniqueValueRatio(column :: Nil)
  }

  def apply(column: String, where: Option[String]): UniqueValueRatio = {
    new UniqueValueRatio(column :: Nil, where)
  }
} 
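A hedged sketch of running the analyzer on its own; the DataFrame df and the column name are illustrative, and the metric value comes back wrapped in a Try.

// Compute the unique-value ratio of a single column directly.
val metric = UniqueValueRatio("customer_id").calculate(df)
println(metric.value) // Success(ratio) or Failure(exception)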
Example 64
Source File: Mean.scala    From deequ   with Apache License 2.0 5 votes vote down vote up
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric}
import org.apache.spark.sql.{Column, Row}
import org.apache.spark.sql.functions.{count, sum}
import org.apache.spark.sql.types.{DoubleType, StructType, LongType}
import Analyzers._

case class MeanState(sum: Double, count: Long) extends DoubleValuedState[MeanState] {

  override def sum(other: MeanState): MeanState = {
    MeanState(sum + other.sum, count + other.count)
  }

  override def metricValue(): Double = {
    if (count == 0L) Double.NaN else sum / count
  }
}

case class Mean(column: String, where: Option[String] = None)
  extends StandardScanShareableAnalyzer[MeanState]("Mean", column)
  with FilterableAnalyzer {

  override def aggregationFunctions(): Seq[Column] = {
    sum(conditionalSelection(column, where)).cast(DoubleType) ::
      count(conditionalSelection(column, where)).cast(LongType) :: Nil
  }

  override def fromAggregationResult(result: Row, offset: Int): Option[MeanState] = {

    ifNoNullsIn(result, offset, howMany = 2) { _ =>
      MeanState(result.getDouble(offset), result.getLong(offset + 1))
    }
  }

  override protected def additionalPreconditions(): Seq[StructType => Unit] = {
    hasColumn(column) :: isNumeric(column) :: Nil
  }

  override def filterCondition: Option[String] = where
} 
Example 65
Source File: ArrangePostprocessor.scala    From DataQuality   with GNU Lesser General Public License v3.0 5 votes vote down vote up
package it.agilelab.bigdata.DataQuality.postprocessors

import com.typesafe.config.Config
import it.agilelab.bigdata.DataQuality.checks.CheckResult
import it.agilelab.bigdata.DataQuality.metrics.MetricResult
import it.agilelab.bigdata.DataQuality.sources.HdfsFile
import it.agilelab.bigdata.DataQuality.targets.HdfsTargetConfig
import it.agilelab.bigdata.DataQuality.utils
import it.agilelab.bigdata.DataQuality.utils.DQSettings
import it.agilelab.bigdata.DataQuality.utils.io.{HdfsReader, HdfsWriter}
import org.apache.hadoop.fs.FileSystem
import org.apache.spark.sql.types.{DoubleType, IntegerType, LongType, NumericType}
import org.apache.spark.sql.{Column, DataFrame, SQLContext}

import scala.collection.JavaConversions._

final class ArrangePostprocessor(config: Config, settings: DQSettings)
    extends BasicPostprocessor(config, settings) {

  private case class ColumnSelector(name: String, tipo: Option[String] = None, format: Option[String] = None, precision: Option[Integer] = None) {
    def toColumn()(implicit df: DataFrame): Column = {

      val dataType: Option[NumericType with Product with Serializable] =
        tipo.getOrElse("").toUpperCase match {
          case "DOUBLE" => Some(DoubleType)
          case "INT"    => Some(IntegerType)
          case "LONG"   => Some(LongType)
          case _        => None
        }

      import org.apache.spark.sql.functions.format_number
      import org.apache.spark.sql.functions.format_string

      (dataType, precision, format) match {
        case (Some(dt), None, None)    => df(name).cast(dt)
        case (Some(dt), None, Some(f)) => format_string(f, df(name).cast(dt)).alias(name)
        case (Some(dt), Some(p), None) => format_number(df(name).cast(dt), p).alias(name)
        case (None, Some(p), None)     => format_number(df(name), p).alias(name)
        case (None, None, Some(f))     => format_string(f, df(name)).alias(name)
        case _                         => df(name)
      }
    }
  }

  private val vs = config.getString("source")
  private val target: HdfsTargetConfig = {
    val conf = config.getConfig("saveTo")
    utils.parseTargetConfig(conf)(settings).get
  }

  private val columns: Seq[ColumnSelector] =
    config.getAnyRefList("columnOrder").map {
      case x: String => ColumnSelector(x)
      case x: java.util.HashMap[_, String] => {
        val (name, v) = x.head.asInstanceOf[String Tuple2 _]

        v match {
          case v: String =>
            ColumnSelector(name, Option(v))
          case v: java.util.HashMap[String, _] => {
            val k = v.head._1
            val f = v.head._2

            f match {
              case f: Integer =>
                ColumnSelector(name, Option(k), None, Option(f))
              case f: String =>
                ColumnSelector(name, Option(k), Option(f))
            }
          }
        }
      }
    }

  override def process(vsRef: Set[HdfsFile],
                       metRes: Seq[MetricResult],
                       chkRes: Seq[CheckResult])(
      implicit fs: FileSystem,
      sqlContext: SQLContext,
      settings: DQSettings): HdfsFile = {

    val reqVS: HdfsFile = vsRef.filter(vr => vr.id == vs).head
    implicit val df: DataFrame = HdfsReader.load(reqVS, settings.ref_date).head

    val arrangeDF = df.select(columns.map(_.toColumn): _*)

    HdfsWriter.saveVirtualSource(arrangeDF, target, settings.refDateString)(
      fs,
      sqlContext.sparkContext)

    new HdfsFile(target)
  }
} 
Example 66
Source File: FramelessSyntax.scala    From frameless   with Apache License 2.0 5 votes vote down vote up
package frameless

import org.apache.spark.sql.{Column, DataFrame, Dataset}

trait FramelessSyntax {
  implicit class ColumnSyntax(self: Column) {
    def typedColumn[T, U: TypedEncoder]: TypedColumn[T, U] = new TypedColumn[T, U](self)
    def typedAggregate[T, U: TypedEncoder]: TypedAggregate[T, U] = new TypedAggregate[T, U](self)
  }

  implicit class DatasetSyntax[T: TypedEncoder](self: Dataset[T]) {
    def typed: TypedDataset[T] = TypedDataset.create[T](self)
  }

  implicit class DataframeSyntax(self: DataFrame){
    def unsafeTyped[T: TypedEncoder]: TypedDataset[T] = TypedDataset.createUnsafe(self)
  }
} 
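A brief sketch of the syntax in use; the case class is made up, an object mixing in the trait stands in for however the project exposes it, and frameless is assumed to be able to derive a TypedEncoder for the case class.

object MySyntax extends FramelessSyntax
import MySyntax._
import spark.implicits._

case class Apartment(city: String, surface: Int)

val ds = Seq(Apartment("Paris", 50), Apartment("Lyon", 45)).toDS()

// DatasetSyntax lifts a vanilla Dataset into a frameless TypedDataset.
val typedApartments: TypedDataset[Apartment] = ds.typed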
Example 67
Source File: basics.scala    From odsc-west-streaming-trends   with GNU General Public License v3.0 5 votes vote down vote up
//spark-shell -i basics.scala 
import org.apache.spark.sql.types._
import spark.implicits._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset, Column, Row}

case class Coffee(
  name: String,
  roast:Int,
  region:String,
  bean: String,
  acidity:Int = 1,
  bitterness:Int = 1,
  flavors: Seq[String]
  )

case class CoffeeRating(
  coffeeName: String,
  score: Int,
  notes: Option[String] = None
  )

val availableCoffee = Seq(
  Coffee(name="folgers", roast=2, region="US", bean="robusta", acidity=7, bitterness=10, flavors=Seq("nutty")),
  Coffee(name="yuban", roast=2, region="Mexico", bean="robusta", acidity=6, bitterness=7, flavors=Seq("nutty")),
  Coffee(name="nespresso", roast=2, region="Cuba", bean="arabica", acidity=5, bitterness=3, flavors=Seq("nutty", "chocolate")),
  Coffee(name="ritual", roast=1, region="Brazil", bean="arabica", acidity=2, bitterness=1, flavors=Seq("fruity", "floral", "chocolate")),
  Coffee(name="four barrel", roast=1, region="Columbia", bean="arabica", flavors=Seq("nutty", "fruity"))
  )

val rawCoffeeRatings = Seq(
  CoffeeRating("folgers",1,Some("terrible")),
  CoffeeRating("folgers",2,Some("meh")),
  CoffeeRating("yuban",3,Some("worth the money")),
  CoffeeRating("nespresso",2,Some("it's coffee")),
  CoffeeRating("ritual",5,Some("fantastic")),
  CoffeeRating("four barrel",3),
  CoffeeRating("four barrel",5,Some("my fav")),
  CoffeeRating("ritual",4)
  )

def expandArray(df: DataFrame, col: Column): DataFrame = {
  val colName = col.toString()
  val values = df
    .selectExpr(s"explode($colName) as $colName")
    .select(col).distinct()
    .map { _.getString(0) }
    .collect().toSeq
  val expandedRows = values.foldLeft[DataFrame](df)( (d, v) =>
    d.withColumn(v, when(array_contains(col, v), 1).otherwise(0))
  )
  expandedRows
}



// take the available coffee and add it to the stand
val coffeeStand = spark.createDataset(availableCoffee)
val coffeeRatings = spark.createDataset(rawCoffeeRatings)
val coffeeWithRatings = coffeeStand.join(coffeeRatings, coffeeStand("name") === coffeeRatings("coffeeName")).drop("coffeeName")

val sparkWay = coffeeWithRatings.groupBy("name").agg(avg("score") as "rating").sort(desc("rating"))

// create memory sql table
coffeeWithRatings.createOrReplaceTempView("coffee_ratings")
val sqlWay = spark.sql("select name, avg(score) as rating from coffee_ratings GROUP BY name ORDER BY rating DESC")

sparkWay.explain(true)
sparkWay.show(10, false)

sqlWay.explain(true)
sqlWay.show(10, false) 
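expandArray above is defined but never exercised in the script; a short follow-up (a sketch, not part of the original) showing how it could one-hot encode the flavors column into 0/1 flag columns:

val coffeeWithFlavorFlags = expandArray(coffeeWithRatings.toDF, col("flavors"))
coffeeWithFlavorFlags.show(10, false)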
Example 68
Source File: FrequentItems.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.stat

import scala.collection.mutable.{Map => MutableMap}

import org.apache.spark.Logging
import org.apache.spark.sql.{Column, DataFrame, Row}
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
import org.apache.spark.sql.types.{ArrayType, StructField, StructType}

private[sql] object FrequentItems extends Logging {

  
  private[sql] def singlePassFreqItems(
      df: DataFrame,
      cols: Seq[String],
      support: Double): DataFrame = {
    require(support >= 1e-4, s"support ($support) must be at least 1e-4.")
    val numCols = cols.length
    // number of max items to keep counts for
    val sizeOfMap = (1 / support).toInt
    val countMaps = Seq.tabulate(numCols)(i => new FreqItemCounter(sizeOfMap))
    val originalSchema = df.schema
    val colInfo = cols.map { name =>
      val index = originalSchema.fieldIndex(name)
      (name, originalSchema.fields(index).dataType)
    }

    val freqItems = df.select(cols.map(Column(_)) : _*).rdd.aggregate(countMaps)(
      seqOp = (counts, row) => {
        var i = 0
        while (i < numCols) {
          val thisMap = counts(i)
          val key = row.get(i)
          thisMap.add(key, 1L)
          i += 1
        }
        counts
      },
      combOp = (baseCounts, counts) => {
        var i = 0
        while (i < numCols) {
          baseCounts(i).merge(counts(i))
          i += 1
        }
        baseCounts
      }
    )
    val justItems = freqItems.map(m => m.baseMap.keys.toSeq)
    val resultRow = Row(justItems : _*)
    // append frequent Items to the column name for easy debugging
    val outputCols = colInfo.map { v =>
      StructField(v._1 + "_freqItems", ArrayType(v._2, false))
    }
    val schema = StructType(outputCols).toAttributes
    new DataFrame(df.sqlContext, LocalRelation(schema, Seq(resultRow)))
  }
} 
Example 69
Source File: SchemaColumnFixed.scala    From data-faker   with MIT License 5 votes vote down vote up
package com.dunnhumby.datafaker.schema.table.columns

import java.sql.{Date, Timestamp}
import com.dunnhumby.datafaker.YamlParser.YamlParserProtocol
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions.lit

case class SchemaColumnFixed[T](override val name: String, value: T) extends SchemaColumn {
  override def column(rowID: Option[Column] = None): Column = lit(value)
}

object SchemaColumnFixedProtocol extends SchemaColumnFixedProtocol
trait SchemaColumnFixedProtocol extends YamlParserProtocol {

  import net.jcazevedo.moultingyaml._

  implicit object SchemaColumnFixedFormat extends YamlFormat[SchemaColumnFixed[_]] {

    override def read(yaml: YamlValue): SchemaColumnFixed[_] = {
      val fields = yaml.asYamlObject.fields
      val YamlString(name) = fields.getOrElse(YamlString("name"), deserializationError("name not set"))
      val YamlString(dataType) = fields.getOrElse(YamlString("data_type"), deserializationError(s"data_type not set for $name"))
      val value = fields.getOrElse(YamlString("value"), deserializationError(s"value not set for $name"))

      dataType match {
        case SchemaColumnDataType.Int => SchemaColumnFixed(name, value.convertTo[Int])
        case SchemaColumnDataType.Long => SchemaColumnFixed(name, value.convertTo[Long])
        case SchemaColumnDataType.Float => SchemaColumnFixed(name, value.convertTo[Float])
        case SchemaColumnDataType.Double => SchemaColumnFixed(name, value.convertTo[Double])
        case SchemaColumnDataType.Date => SchemaColumnFixed(name, value.convertTo[Date])
        case SchemaColumnDataType.Timestamp => SchemaColumnFixed(name, value.convertTo[Timestamp])
        case SchemaColumnDataType.String => SchemaColumnFixed(name, value.convertTo[String])
        case SchemaColumnDataType.Boolean => SchemaColumnFixed(name, value.convertTo[Boolean])
        case _ => deserializationError(s"unsupported data_type: $dataType for ${SchemaColumnType.Fixed}")
      }

    }

    override def write(obj: SchemaColumnFixed[_]): YamlValue = ???

  }

} 
Example 70
Source File: SchemaColumnSequential.scala    From data-faker   with MIT License 5 votes vote down vote up
package com.dunnhumby.datafaker.schema.table.columns

import java.sql.{Date, Timestamp}
import com.dunnhumby.datafaker.YamlParser.YamlParserProtocol
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions.{to_utc_timestamp, from_unixtime, monotonically_increasing_id, to_date}

trait SchemaColumnSequential[T] extends SchemaColumn

object SchemaColumnSequential {
  def apply(name: String, start: Int, step: Int): SchemaColumn = SchemaColumnSequentialNumeric(name, start, step)
  def apply(name: String, start: Long, step: Long): SchemaColumn = SchemaColumnSequentialNumeric(name, start, step)
  def apply(name: String, start: Float, step: Float): SchemaColumn = SchemaColumnSequentialNumeric(name, start, step)
  def apply(name: String, start: Double, step: Double): SchemaColumn = SchemaColumnSequentialNumeric(name, start, step)
  def apply(name: String, start: Date, step: Int): SchemaColumn = SchemaColumnSequentialDate(name, start, step)
  def apply(name: String, start: Timestamp, step: Int): SchemaColumn = SchemaColumnSequentialTimestamp(name, start, step)
}

private case class SchemaColumnSequentialNumeric[T: Numeric](override val name: String, start: T, step: T) extends SchemaColumnSequential[T] {
  override def column(rowID: Option[Column] = Some(monotonically_increasing_id)): Column = (rowID.get * step) + start
}

private case class SchemaColumnSequentialTimestamp(override val name: String, start: Timestamp, stepSeconds: Int) extends SchemaColumnSequential[Timestamp] {
  override def column(rowID: Option[Column] = Some(monotonically_increasing_id)): Column = {
    val startTime = start.getTime / 1000
    to_utc_timestamp(from_unixtime(rowID.get * stepSeconds + startTime), "UTC")
  }
}

private case class SchemaColumnSequentialDate(override val name: String, start: Date, stepDays: Int) extends SchemaColumnSequential[Date] {
  val timestamp = SchemaColumnSequentialTimestamp(name, new Timestamp(start.getTime), stepDays * 86400)

  override def column(rowID: Option[Column]): Column = to_date(timestamp.column())
}

object SchemaColumnSequentialProtocol extends SchemaColumnSequentialProtocol
trait SchemaColumnSequentialProtocol extends YamlParserProtocol {

  import net.jcazevedo.moultingyaml._

  implicit object SchemaColumnSequentialFormat extends YamlFormat[SchemaColumnSequential[_]] {

    override def read(yaml: YamlValue): SchemaColumnSequential[_] = {
      val fields = yaml.asYamlObject.fields
      val YamlString(dataType) = fields.getOrElse(YamlString("data_type"), deserializationError("data_type not set"))
      val YamlString(name) = fields.getOrElse(YamlString("name"), deserializationError("name not set"))
      val start = fields.getOrElse(YamlString("start"), deserializationError("start not set"))
      val step = fields.getOrElse(YamlString("step"), deserializationError("step not set"))

      dataType match {
        case "Int" => SchemaColumnSequentialNumeric(name, start.convertTo[Int], step.convertTo[Int])
        case "Long" => SchemaColumnSequentialNumeric(name, start.convertTo[Long], step.convertTo[Long])
        case "Float" => SchemaColumnSequentialNumeric(name, start.convertTo[Float], step.convertTo[Float])
        case "Double" => SchemaColumnSequentialNumeric(name, start.convertTo[Double], step.convertTo[Double])
        case "Date" => SchemaColumnSequentialDate(name, start.convertTo[Date], step.convertTo[Int])
        case "Timestamp" => SchemaColumnSequentialTimestamp(name, start.convertTo[Timestamp], step.convertTo[Int])
        case _ => deserializationError(s"unsupported data_type: $dataType for ${SchemaColumnType.Sequential}")
      }

    }

    override def write(obj: SchemaColumnSequential[_]): YamlValue = ???

  }

} 
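A brief sketch of the numeric sequential column in isolation; the column name, start/step values and the single-partition DataFrame are illustrative, and the row id is passed explicitly as monotonically_increasing_id (which is gap-free only within a single partition).

import org.apache.spark.sql.functions.monotonically_increasing_id

val orderId = SchemaColumnSequential("order_id", 100, 10)

// Each row gets start + rowIndex * step; with one partition this yields 100, 110, 120, ...
val sequentialDF = spark.range(5).repartition(1).toDF("n")
  .withColumn(orderId.name, orderId.column(Some(monotonically_increasing_id())))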
Example 71
Source File: SchemaColumnRandom.scala    From data-faker   with MIT License 5 votes vote down vote up
package com.dunnhumby.datafaker.schema.table.columns

import java.sql.{Date, Timestamp}
import com.dunnhumby.datafaker.YamlParser.YamlParserProtocol
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions.{to_utc_timestamp, round, rand, from_unixtime, to_date}
import org.apache.spark.sql.types.{IntegerType, LongType}

trait SchemaColumnRandom[T] extends SchemaColumn

object SchemaColumnRandom {
  val FloatDP = 3
  val DoubleDP = 3

  def apply(name: String, min: Int, max: Int): SchemaColumn = SchemaColumnRandomNumeric(name, min, max)
  def apply(name: String, min: Long, max: Long): SchemaColumn = SchemaColumnRandomNumeric(name, min, max)
  def apply(name: String, min: Float, max: Float): SchemaColumn = SchemaColumnRandomNumeric(name, min, max)
  def apply(name: String, min: Double, max: Double): SchemaColumn = SchemaColumnRandomNumeric(name, min, max)
  def apply(name: String, min: Date, max: Date): SchemaColumn = SchemaColumnRandomDate(name, min, max)
  def apply(name: String, min: Timestamp, max: Timestamp): SchemaColumn = SchemaColumnRandomTimestamp(name, min, max)
  def apply(name: String): SchemaColumn = SchemaColumnRandomBoolean(name)
}

private case class SchemaColumnRandomNumeric[T: Numeric](override val name: String, min: T, max: T) extends SchemaColumnRandom[T] {
  override def column(rowID: Option[Column] = None): Column = {
    import Numeric.Implicits._

    (min, max) match {
      case (_: Int, _: Int) => round(rand() * (max - min) + min, 0).cast(IntegerType)
      case (_: Long, _: Long) => round(rand() * (max - min) + min, 0).cast(LongType)
      case (_: Float, _: Float) => round(rand() * (max - min) + min, SchemaColumnRandom.FloatDP)
      case (_: Double, _: Double) => round(rand() * (max - min) + min, SchemaColumnRandom.DoubleDP)
    }
  }
}

private case class SchemaColumnRandomTimestamp(override val name: String, min: Timestamp, max: Timestamp) extends SchemaColumnRandom[Timestamp] {
  override def column(rowID: Option[Column] = None): Column = {
    val minTime = min.getTime / 1000
    val maxTime = max.getTime / 1000
    to_utc_timestamp(from_unixtime(rand() * (maxTime - minTime) + minTime), "UTC")
  }
}

private case class SchemaColumnRandomDate(override val name: String, min: Date, max: Date) extends SchemaColumnRandom[Date] {
  val timestamp = SchemaColumnRandomTimestamp(name, new Timestamp(min.getTime), new Timestamp(max.getTime + 86400000))

  override def column(rowID: Option[Column] = None): Column = to_date(timestamp.column())
}

private case class SchemaColumnRandomBoolean(override val name: String) extends SchemaColumnRandom[Boolean] {
  override def column(rowID: Option[Column] = None): Column = rand() < 0.5f
}

object SchemaColumnRandomProtocol extends SchemaColumnRandomProtocol
trait SchemaColumnRandomProtocol extends YamlParserProtocol {

  import net.jcazevedo.moultingyaml._

  implicit object SchemaColumnRandomFormat extends YamlFormat[SchemaColumnRandom[_]] {

    override def read(yaml: YamlValue): SchemaColumnRandom[_] = {
      val fields = yaml.asYamlObject.fields
      val YamlString(name) = fields.getOrElse(YamlString("name"), deserializationError("name not set"))
      val YamlString(dataType) = fields.getOrElse(YamlString("data_type"), deserializationError(s"data_type not set for $name"))

      if (dataType == SchemaColumnDataType.Boolean) {
        SchemaColumnRandomBoolean(name)
      }
      else {
        val min = fields.getOrElse(YamlString("min"), deserializationError(s"min not set for $name"))
        val max = fields.getOrElse(YamlString("max"), deserializationError(s"max not set for $name"))

        dataType match {
          case SchemaColumnDataType.Int => SchemaColumnRandomNumeric(name, min.convertTo[Int], max.convertTo[Int])
          case SchemaColumnDataType.Long => SchemaColumnRandomNumeric(name, min.convertTo[Long], max.convertTo[Long])
          case SchemaColumnDataType.Float => SchemaColumnRandomNumeric(name, min.convertTo[Float], max.convertTo[Float])
          case SchemaColumnDataType.Double => SchemaColumnRandomNumeric(name, min.convertTo[Double], max.convertTo[Double])
          case SchemaColumnDataType.Date => SchemaColumnRandomDate(name, min.convertTo[Date], max.convertTo[Date])
          case SchemaColumnDataType.Timestamp => SchemaColumnRandomTimestamp(name, min.convertTo[Timestamp], max.convertTo[Timestamp])
          case _ => deserializationError(s"unsupported data_type: $dataType for ${SchemaColumnType.Random}")
        }
      }

    }

    override def write(obj: SchemaColumnRandom[_]): YamlValue = ???

  }

} 
Example 72
Source File: Maximum.scala    From deequ   with Apache License 2.0 5 votes vote down vote up
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric}
import org.apache.spark.sql.{Column, Row}
import org.apache.spark.sql.functions.max
import org.apache.spark.sql.types.{DoubleType, StructType}
import Analyzers._

case class MaxState(maxValue: Double) extends DoubleValuedState[MaxState] {

  override def sum(other: MaxState): MaxState = {
    MaxState(math.max(maxValue, other.maxValue))
  }

  override def metricValue(): Double = {
    maxValue
  }
}

case class Maximum(column: String, where: Option[String] = None)
  extends StandardScanShareableAnalyzer[MaxState]("Maximum", column)
  with FilterableAnalyzer {

  override def aggregationFunctions(): Seq[Column] = {
    max(conditionalSelection(column, where)).cast(DoubleType) :: Nil
  }

  override def fromAggregationResult(result: Row, offset: Int): Option[MaxState] = {

    ifNoNullsIn(result, offset) { _ =>
      MaxState(result.getDouble(offset))
    }
  }

  override protected def additionalPreconditions(): Seq[StructType => Unit] = {
    hasColumn(column) :: isNumeric(column) :: Nil
  }

  override def filterCondition: Option[String] = where
} 
Example 73
Source File: SchemaColumn.scala    From data-faker   with MIT License 5 votes vote down vote up
package com.dunnhumby.datafaker.schema.table.columns

import com.dunnhumby.datafaker.YamlParser.YamlParserProtocol
import org.apache.spark.sql.Column

abstract class SchemaColumn {
  def name: String

  def column(rowID: Option[Column] = None): Column
}

object SchemaColumnDataType {
  val Int = "Int"
  val Long = "Long"
  val Float = "Float"
  val Double = "Double"
  val Date = "Date"
  val Timestamp = "Timestamp"
  val String = "String"
  val Boolean = "Boolean"
}

object SchemaColumnType {
  val Fixed = "Fixed"
  val Random = "Random"
  val Selection = "Selection"
  val Sequential = "Sequential"
  val Expression = "Expression"
}

object SchemaColumnProtocol extends YamlParserProtocol
  with SchemaColumnFixedProtocol
  with SchemaColumnRandomProtocol
  with SchemaColumnSelectionProtocol
  with SchemaColumnSequentialProtocol
  with SchemaColumnExpressionProtocol {

  import net.jcazevedo.moultingyaml._

  implicit object SchemaColumnFormat extends YamlFormat[SchemaColumn] {

    override def read(yaml: YamlValue): SchemaColumn = {
      val fields = yaml.asYamlObject.fields
      val YamlString(name) = fields.getOrElse(YamlString("name"), deserializationError("name not set"))
      val YamlString(columnType) = fields.getOrElse(YamlString("column_type"), deserializationError(s"column_type not set for $name"))

      columnType match {
        case SchemaColumnType.Fixed => yaml.convertTo[SchemaColumnFixed[_]]
        case SchemaColumnType.Random => yaml.convertTo[SchemaColumnRandom[_]]
        case SchemaColumnType.Selection => yaml.convertTo[SchemaColumnSelection[_]]
        case SchemaColumnType.Sequential => yaml.convertTo[SchemaColumnSequential[_]]
        case SchemaColumnType.Expression => yaml.convertTo[SchemaColumnExpression]
        case _ => deserializationError(s"unsupported column_type: $columnType")
      }

    }

    override def write(obj: SchemaColumn): YamlValue = ???

  }

} 
Example 74
Source File: FrequentItems.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.stat

import scala.collection.mutable.{Map => MutableMap}

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{Column, DataFrame, Dataset, Row}
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
import org.apache.spark.sql.types._

object FrequentItems extends Logging {

  
  def singlePassFreqItems(
      df: DataFrame,
      cols: Seq[String],
      support: Double): DataFrame = {
    require(support >= 1e-4 && support <= 1.0, s"Support must be in [1e-4, 1], but got $support.")
    val numCols = cols.length
    // number of max items to keep counts for
    val sizeOfMap = (1 / support).toInt
    val countMaps = Seq.tabulate(numCols)(i => new FreqItemCounter(sizeOfMap))
    val originalSchema = df.schema
    val colInfo: Array[(String, DataType)] = cols.map { name =>
      val index = originalSchema.fieldIndex(name)
      (name, originalSchema.fields(index).dataType)
    }.toArray

    val freqItems = df.select(cols.map(Column(_)) : _*).rdd.aggregate(countMaps)(
      seqOp = (counts, row) => {
        var i = 0
        while (i < numCols) {
          val thisMap = counts(i)
          val key = row.get(i)
          thisMap.add(key, 1L)
          i += 1
        }
        counts
      },
      combOp = (baseCounts, counts) => {
        var i = 0
        while (i < numCols) {
          baseCounts(i).merge(counts(i))
          i += 1
        }
        baseCounts
      }
    )
    val justItems = freqItems.map(m => m.baseMap.keys.toArray)
    val resultRow = Row(justItems : _*)
    // append frequent Items to the column name for easy debugging
    val outputCols = colInfo.map { v =>
      StructField(v._1 + "_freqItems", ArrayType(v._2, false))
    }
    val schema = StructType(outputCols).toAttributes
    Dataset.ofRows(df.sparkSession, LocalRelation.fromExternalRows(schema, Seq(resultRow)))
  }
} 
Example 75
Source File: YelpHelpers.scala    From morpheus   with Apache License 2.0 5 votes vote down vote up
package org.opencypher.morpheus.integration.yelp

import org.apache.spark.sql.types.{ArrayType, DateType, IntegerType, LongType}
import org.apache.spark.sql.{Column, DataFrame, SparkSession, functions}
import org.opencypher.morpheus.api.io.GraphElement.sourceIdKey
import org.opencypher.morpheus.api.io.Relationship.{sourceEndNodeKey, sourceStartNodeKey}
import org.opencypher.morpheus.impl.table.SparkTable._
import org.opencypher.morpheus.integration.yelp.YelpConstants._

object YelpHelpers {

  case class YelpTables(
    userDf: DataFrame,
    businessDf: DataFrame,
    reviewDf: DataFrame
  )

  def loadYelpTables(inputPath: String)(implicit spark: SparkSession): YelpTables = {
    import spark.implicits._

    log("read business.json", 2)
    val rawBusinessDf = spark.read.json(s"$inputPath/business.json")
    log("read review.json", 2)
    val rawReviewDf = spark.read.json(s"$inputPath/review.json")
    log("read user.json", 2)
    val rawUserDf = spark.read.json(s"$inputPath/user.json")

    val businessDf = rawBusinessDf.select($"business_id".as(sourceIdKey), $"business_id", $"name", $"address", $"city", $"state")
    val reviewDf = rawReviewDf.select($"review_id".as(sourceIdKey), $"user_id".as(sourceStartNodeKey), $"business_id".as(sourceEndNodeKey), $"stars", $"date".cast(DateType))
    val userDf = rawUserDf.select(
      $"user_id".as(sourceIdKey),
      $"name",
      $"yelping_since".cast(DateType),
      functions.split($"elite", ",").cast(ArrayType(LongType)).as("elite"))

    YelpTables(userDf, businessDf, reviewDf)
  }

  def printYelpStats(inputPath: String)(implicit spark: SparkSession): Unit = {
    val rawBusinessDf = spark.read.json(s"$inputPath/business.json")
    val rawReviewDf = spark.read.json(s"$inputPath/review.json")

    import spark.implicits._

    rawBusinessDf.select($"city", $"state").distinct().show()
    rawBusinessDf.withColumnRenamed("business_id", "id")
      .join(rawReviewDf, $"id" === $"business_id")
      .groupBy($"city", $"state")
      .count().as("count")
      .orderBy($"count".desc, $"state".asc)
      .show(100)
  }

  def extractYelpCitySubset(inputPath: String, outputPath: String, city: String)(implicit spark: SparkSession): Unit = {
    import spark.implicits._

    def emailColumn(userId: String): Column = functions.concat($"$userId", functions.lit("@yelp.com"))

    val rawUserDf = spark.read.json(s"$inputPath/user.json")
    val rawReviewDf = spark.read.json(s"$inputPath/review.json")
    val rawBusinessDf = spark.read.json(s"$inputPath/business.json")

    val businessDf = rawBusinessDf.filter($"city" === city)
    val reviewDf = rawReviewDf
      .join(businessDf, Seq("business_id"), "left_semi")
      .withColumn("user_email", emailColumn("user_id"))
      .withColumnRenamed("stars", "stars_tmp")
      .withColumn("stars", $"stars_tmp".cast(IntegerType))
      .drop("stars_tmp")
    val userDf = rawUserDf
      .join(reviewDf, Seq("user_id"), "left_semi")
      .withColumn("email", emailColumn("user_id"))
    val friendDf = userDf
      .select($"email".as("user1_email"), functions.explode(functions.split($"friends", ", ")).as("user2_id"))
      .withColumn("user2_email", emailColumn("user2_id"))
      .select(s"user1_email", s"user2_email")

    businessDf.write.json(s"$outputPath/$cityGraphName/$yelpDB/business.json")
    reviewDf.write.json(s"$outputPath/$cityGraphName/$yelpDB/review.json")
    userDf.write.json(s"$outputPath/$cityGraphName/$yelpDB/user.json")
    friendDf.write.json(s"$outputPath/$cityGraphName/$yelpBookDB/friend.json")
  }

  implicit class DataFrameOps(df: DataFrame) {
    def prependIdColumn(idColumn: String, prefix: String): DataFrame =
      df.transformColumns(idColumn)(column => functions.concat(functions.lit(prefix), column).as(idColumn))
  }
} 
Example 76
Source File: EncodeLong.scala    From morpheus   with Apache License 2.0 5 votes vote down vote up
package org.opencypher.morpheus.impl.expressions

import org.apache.spark.sql.Column
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, NullIntolerant, UnaryExpression}
import org.apache.spark.sql.types.{BinaryType, DataType, LongType}
import org.opencypher.morpheus.api.value.MorpheusElement._


case class EncodeLong(child: Expression) extends UnaryExpression with NullIntolerant with ExpectsInputTypes {

  override val dataType: DataType = BinaryType

  override val inputTypes: Seq[LongType] = Seq(LongType)

  override protected def nullSafeEval(input: Any): Any =
    EncodeLong.encodeLong(input.asInstanceOf[Long])

  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode =
    defineCodeGen(ctx, ev, c => s"(byte[])(${EncodeLong.getClass.getName.dropRight(1)}.encodeLong($c))")
}

object EncodeLong {

  private final val moreBytesBitMask: Long = Integer.parseInt("10000000", 2)
  private final val varLength7BitMask: Long = Integer.parseInt("01111111", 2)
  private final val otherBitsMask = ~varLength7BitMask
  private final val maxBytesForLongVarEncoding = 10

  // Same encoding as Base 128 Varints @ https://developers.google.com/protocol-buffers/docs/encoding
  @inline
  final def encodeLong(l: Long): Array[Byte] = {
    val tempResult = new Array[Byte](maxBytesForLongVarEncoding)

    var remainder = l
    var index = 0

    while ((remainder & otherBitsMask) != 0) {
      tempResult(index) = ((remainder & varLength7BitMask) | moreBytesBitMask).toByte
      remainder >>>= 7
      index += 1
    }
    tempResult(index) = remainder.toByte

    val result = new Array[Byte](index + 1)
    System.arraycopy(tempResult, 0, result, 0, index + 1)
    result
  }

  // Same encoding as Base 128 Varints @ https://developers.google.com/protocol-buffers/docs/encoding
  @inline
  final def decodeLong(input: Array[Byte]): Long = {
    assert(input.nonEmpty, "`decodeLong` requires a non-empty array as its input")
    var index = 0
    var currentByte = input(index)
    var decoded = currentByte & varLength7BitMask
    var nextLeftShift = 7

    while ((currentByte & moreBytesBitMask) != 0) {
      index += 1
      currentByte = input(index)
      decoded |= (currentByte & varLength7BitMask) << nextLeftShift
      nextLeftShift += 7
    }
    assert(index == input.length - 1,
      s"`decodeLong` received an input array ${input.toSeq.toHex} with extra bytes that could not be decoded.")
    decoded
  }

  implicit class ColumnLongOps(val c: Column) extends AnyVal {

    def encodeLongAsMorpheusId(name: String): Column = encodeLongAsMorpheusId.as(name)

    def encodeLongAsMorpheusId: Column = new Column(EncodeLong(c.expr))

  }

} 
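A short round-trip sketch of the varint helpers and the column syntax above; the SparkSession spark is assumed.

import org.apache.spark.sql.functions.col
import org.opencypher.morpheus.impl.expressions.EncodeLong._

// 300 encodes to the two bytes 0xAC 0x02 (low 7 bits first, continuation bit set), as in protobuf varints.
val bytes = encodeLong(300L)
assert(decodeLong(bytes) == 300L)

// ColumnLongOps lifts the same encoding onto a LongType column.
val withId = spark.range(3).toDF("id")
  .withColumn("morpheus_id", col("id").encodeLongAsMorpheusId)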
Example 77
Source File: DatasetFilter.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
package cmwell.analytics.util

import org.apache.spark.sql.{Column, DataFrame, Dataset}
import org.apache.spark.sql.functions.substring

case class DatasetFilter(lastModifiedGte: Option[java.sql.Timestamp] = None,
                         current: Option[Boolean] = None,
                         pathPrefix: Option[String] = None) {

  def applyFilter[T](ds: Dataset[T],
                     forAnalysis: Boolean): Dataset[T] = {

    // Apply filters if a filter value was supplied, and the corresponding column exists in ds

    val lastModifiedOptionalPredicate: Option[Column] =
      if (ds.columns.contains("lastModified"))
        lastModifiedGte.map(ds("lastModified") >= _)
      else
        None

    val currentOptionalPredicate =
      if (ds.columns.contains("current"))
        current.map(ds("current") > _)
      else
        None

    val pathOptionalPredicate =
      if (ds.columns.contains("path"))
        pathPrefix.map { pathPrefix => substring(ds("path"), 0, pathPrefix.length) === pathPrefix }
      else
        None

    val temporalOptionalPredicate = (lastModifiedOptionalPredicate, currentOptionalPredicate) match {
      case (Some(lastModifiedPredicate), Some(currentPredicate)) if forAnalysis =>
        Some(lastModifiedPredicate && currentPredicate)
      case (Some(lastModifiedPredicate), Some(currentPredicate)) if !forAnalysis =>
        Some(lastModifiedPredicate || currentPredicate)
      case (Some(lastModifiedPredicate), _) =>
        Some(lastModifiedPredicate)
      case (_, Some(currentPredicate)) =>
        Some(currentPredicate)
      case _ =>
        None
    }

    val overallOptionalPredicate = (pathOptionalPredicate, temporalOptionalPredicate) match {
      case (Some(pathPredicate), Some(currentPredicate)) =>
        Some(pathPredicate && currentPredicate)
      case (Some(pathPredicate), _) =>
        Some(pathPredicate)
      case (_, Some(temporalPredicate)) =>
        Some(temporalPredicate)
      case _ =>
        None
    }

    overallOptionalPredicate.fold(ds)(ds.filter)
  }
} 
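A small hedged sketch of applying the filter; the Dataset infotons and the cut-off values are illustrative.

val filter = DatasetFilter(
  lastModifiedGte = Some(java.sql.Timestamp.valueOf("2019-01-01 00:00:00")),
  current = Some(true),
  pathPrefix = Some("/example.org/"))

// With forAnalysis = true the lastModified and current predicates are AND-ed; otherwise they are OR-ed.
val filtered = filter.applyFilter(infotons, forAnalysis = true)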
Example 78
Source File: PathWithKeyFields.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
package cmwell.analytics.data

import cmwell.analytics.util.{CassandraSystem, DatasetFilter, KeyFields}
import com.datastax.spark.connector._
import com.datastax.spark.connector.rdd.CassandraTableScanRDD
import org.apache.spark.sql.{Column, DataFrame, Dataset, SparkSession}

object PathWithKeyFields extends EstimateDatasetSize {

  private val BytesPerRow = 8 + (3 * 8) + (16 + 8 + 32) // bit mask, fixed, variable

  override def estimateDatasetSize(implicit spark: SparkSession): Long =
    CassandraSystem.rowCount(table = "path") * BytesPerRow


  case class Columns(path: Column,
                     lastModified: Column,
                     uuid: Column) {

    def this(dataset: DataFrame, prefix: String = "") = this(
      path = dataset(prefix + "path"),
      lastModified = dataset(prefix + "lastModified"),
      uuid = dataset(prefix + "uuid"))
  }

  def isWellFormed(dataset: DataFrame, prefix: String = ""): Column = {

    val columns = new Columns(dataset, prefix)

    Constraints.isPathWellFormed(columns.path) &&
      Constraints.isLastModifiedCasWellFormed(columns.lastModified) &&
      Constraints.isUuidWellFormed(columns.uuid)
  }

  def apply(datasetFilter: Option[DatasetFilter] = None)
           (implicit spark: SparkSession): Dataset[KeyFields] = {

    // We can push filters on last_modified down to Cassandra.
    // CQL doesn't support filtering on path prefix.
    def pushDownDatasetFilter(scan: CassandraTableScanRDD[CassandraRow]): CassandraTableScanRDD[CassandraRow] =
      datasetFilter.fold(scan)(_.lastModifiedGte.fold(scan)(scan.where("last_modified >= ?", _)))

    val infotonRdd = pushDownDatasetFilter(spark.sparkContext.cassandraTable("data2", "path"))
      .select("path", "last_modified", "uuid")

    val objectRDD = infotonRdd.map { cassandraRow =>

      KeyFields(
        path = cassandraRow.getString("path"),
        lastModified = new java.sql.Timestamp(cassandraRow.getDateTime("last_modified").getMillis),
        uuid = cassandraRow.getString("uuid"))
    }

    import spark.implicits._
    val ds = spark.createDataset(objectRDD)

    datasetFilter.fold(ds)(_.applyFilter(ds, forAnalysis = false))
  }
} 
Example 79
Source File: InfotonType.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
package cmwell.analytics.data

import org.apache.spark.sql.Column

object InfotonType {

  // In Elasticsearch indexes, the infoton type string is a full word in Pascal casing.
  private val ElasticsearchRepresentation = Seq(
    "ObjectInfoton",
    "FileInfoton",
    "LinkInfoton",
    "DeletedInfoton",
    "CompoundInfoton",
    "GhostInfoton")

  // In the infoton table, the type field is the first letter of the infoton type name in lower case.
  private val CassandraRepresentation: Seq[String] = ElasticsearchRepresentation.map(_.substring(0, 1).toLowerCase)

  def isWellFormedCas(column: Column): Column =
    column.isin(CassandraRepresentation: _*)

  def isWellFormedEs(column: Column): Column =
    column.isin(ElasticsearchRepresentation: _*)
} 
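A one-line hedged sketch of using the predicate; the DataFrame esIndexDF and the column name are assumptions.

import org.apache.spark.sql.functions.col

// Keep only documents whose infoton type is one of the known Pascal-cased names.
val wellFormed = esIndexDF.filter(InfotonType.isWellFormedEs(col("type")))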
Example 80
Source File: AnalyzeInconsistenciesResult.scala    From CM-Well   with Apache License 2.0 5 votes vote down vote up
package cmwell.analytics.main

import java.io.File
import java.nio.charset.StandardCharsets.UTF_8

import cmwell.analytics.data.InfotonAndIndexWithSystemFields
import cmwell.analytics.util.Connector
import org.apache.commons.io.FileUtils
import org.apache.log4j.LogManager
import org.apache.spark.sql.{Column, DataFrame, Row}
import org.rogach.scallop.{ScallopConf, ScallopOption}

import scala.collection.breakOut

object AnalyzeInconsistenciesResult {

  def main(args: Array[String]): Unit = {

    val logger = LogManager.getLogger(AnalyzeInconsistenciesResult.getClass)

    try {

      object Opts extends ScallopConf(args) {

        val in: ScallopOption[String] = opt[String]("in", short = 'i', descr = "The path to read the (parquet) inconsistencies dataset from", required = true)
        val out: ScallopOption[String] = opt[String]("out", short = 'o', descr = "The path to save the (csv) output to", required = true)
        val shell: ScallopOption[Boolean] = opt[Boolean]("spark-shell", short = 's', descr = "Run a Spark shell", required = false, default = Some(false))

        verify()
      }

      Connector(
        appName = "Analyze InfotonAndIndexWithSystemFields Output",
        sparkShell = Opts.shell()
      ).withSparkSessionDo { spark =>

        val ds: DataFrame = spark.read.parquet(Opts.in())

        import org.apache.spark.sql.functions._

        // A column expression that counts the number of failures for each constraint.
        // This will also include null counts, needed to interpret the results.
        val constraints: Seq[(String, Column)] = InfotonAndIndexWithSystemFields.constraints(ds).map { case (name, predicate) =>
          name -> sum(when(predicate, 0L).otherwise(1L)).as(name)
        }(breakOut)

        // Compute the failure counts
        val failureCounts: Row = ds.agg(constraints.head._2, constraints.tail.map(_._2): _*).head

        val results = for {
          i <- constraints.indices
          constraintName = constraints(i)._1
          failureCount = if (failureCounts.isNullAt(i)) 0 else failureCounts.getAs[Long](i)
        } yield s"$constraintName,$failureCount"

        FileUtils.write(new File(Opts.out()), "constraint,failures\n" + results.mkString("\n"), UTF_8)
      }
    }
    catch {
      case ex: Throwable =>
        logger.error(ex.getMessage, ex)
        System.exit(1)
    }
  }
}