org.apache.spark.api.java.JavaRDD Scala Examples

The following examples show how to use org.apache.spark.api.java.JavaRDD. Each example is drawn from an open-source project; the source file and the project it comes from are listed above the code.
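Nearly every example below moves between Scala's RDD and the Java-friendly JavaRDD wrapper, either by unwrapping with .rdd or by wrapping with JavaRDD.fromRDD / toJavaRDD(). As a quick orientation, here is a minimal, self-contained sketch of that interop pattern; the object name and the local Spark setup are illustrative and not taken from any of the projects listed below.

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

object JavaRDDInteropSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("javardd-sketch").getOrCreate()
    val sc = spark.sparkContext

    // A plain Scala RDD.
    val rdd: RDD[Int] = sc.parallelize(1 to 10)

    // Wrap it for Java callers, via the factory method or the convenience method on RDD.
    val wrapped: JavaRDD[Int] = JavaRDD.fromRDD(rdd)
    val alsoWrapped: JavaRDD[Int] = rdd.toJavaRDD()

    // Unwrap a JavaRDD back to the underlying Scala RDD, as the wrappers below do with .rdd.
    val unwrapped: RDD[Int] = wrapped.rdd
    println(unwrapped.count())
    println(alsoWrapped.count())

    spark.stop()
  }
}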
Example 1
Source File: Word2VecModelWrapper.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.mllib.api.python

import java.util.{List => JList, Map => JMap}

import scala.collection.JavaConverters._

import org.apache.spark.SparkContext
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.feature.Word2VecModel
import org.apache.spark.mllib.linalg.{Vector, Vectors}


// Java-friendly wrapper around Word2VecModel, used by the Python API.
private[python] class Word2VecModelWrapper(model: Word2VecModel) {

  def findSynonyms(vector: Vector, num: Int): JList[Object] = {
    prepareResult(model.findSynonyms(vector, num))
  }

  private def prepareResult(result: Array[(String, Double)]) = {
    val similarity = Vectors.dense(result.map(_._2))
    val words = result.map(_._1)
    List(words, similarity).map(_.asInstanceOf[Object]).asJava
  }


  def getVectors: JMap[String, JList[Float]] = {
    model.getVectors.map { case (k, v) =>
      (k, v.toList.asJava)
    }.asJava
  }

  def save(sc: SparkContext, path: String): Unit = model.save(sc, path)
} 
Example 2
Source File: MatrixFactorizationModelWrapper.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.mllib.api.python

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD


private[python] class MatrixFactorizationModelWrapper(model: MatrixFactorizationModel)
  extends MatrixFactorizationModel(model.rank, model.userFeatures, model.productFeatures) {

  def predict(userAndProducts: JavaRDD[Array[Any]]): RDD[Rating] =
    predict(SerDe.asTupleRDD(userAndProducts.rdd))

  def getUserFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(userFeatures.map {
      case (user, feature) => (user, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def getProductFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(productFeatures.map {
      case (product, feature) => (product, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def wrappedRecommendProductsForUsers(num: Int): RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(recommendProductsForUsers(num).asInstanceOf[RDD[(Any, Any)]])
  }

  def wrappedRecommendUsersForProducts(num: Int): RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(recommendUsersForProducts(num).asInstanceOf[RDD[(Any, Any)]])
  }
} 
Example 3
Source File: KernelDensity.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.mllib.stat

import com.github.fommil.netlib.BLAS.{getInstance => blas}

import org.apache.spark.annotation.Since
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.rdd.RDD


private object KernelDensity {

  // Evaluates the density of a normal distribution at x; the caller precomputes
  // log(standardDeviation) + 0.5 * log(2 * Pi) and passes it in.
  def normPdf(
      mean: Double,
      standardDeviation: Double,
      logStandardDeviationPlusHalfLog2Pi: Double,
      x: Double): Double = {
    val x0 = x - mean
    val x1 = x0 / standardDeviation
    val logDensity = -0.5 * x1 * x1 - logStandardDeviationPlusHalfLog2Pi
    math.exp(logDensity)
  }
} 
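The normPdf helper above evaluates a Gaussian density, with the caller precomputing log(standardDeviation) + 0.5 * log(2 * Pi). A small standalone check of that formula (not part of the Spark sources; names are illustrative):

object NormPdfCheck {
  // Same formula as the snippet above.
  def normPdf(mean: Double, sd: Double, logSdPlusHalfLog2Pi: Double, x: Double): Double = {
    val z = (x - mean) / sd
    math.exp(-0.5 * z * z - logSdPlusHalfLog2Pi)
  }

  def main(args: Array[String]): Unit = {
    val sd = 2.0
    val precomputed = math.log(sd) + 0.5 * math.log(2 * math.Pi)
    // At x == mean the density should equal 1 / (sd * sqrt(2 * Pi)), roughly 0.1995 here.
    println(normPdf(0.0, sd, precomputed, 0.0))
    println(1.0 / (sd * math.sqrt(2 * math.Pi)))
  }
}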
Example 4
Source File: RRDD.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.api.r

import java.util.{Map => JMap}

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext}
import org.apache.spark.api.python.PythonRDD
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD

private abstract class BaseRRDD[T: ClassTag, U: ClassTag](
    parent: RDD[T],
    numPartitions: Int,
    func: Array[Byte],
    deserializer: String,
    serializer: String,
    packageNames: Array[Byte],
    broadcastVars: Array[Broadcast[Object]])
  extends RDD[U](parent) with Logging {
  override def getPartitions: Array[Partition] = parent.partitions

  override def compute(partition: Partition, context: TaskContext): Iterator[U] = {
    val runner = new RRunner[U](
      func, deserializer, serializer, packageNames, broadcastVars, numPartitions)

    // The parent may be also an RRDD, so we should launch it first.
    val parentIterator = firstParent[T].iterator(partition, context)

    runner.compute(parentIterator, partition.index)
  }
}


private[r] object RRDD {

  // Creates a JavaRDD of serialized byte arrays from a file written by SparkR.
  def createRDDFromFile(jsc: JavaSparkContext, fileName: String, parallelism: Int):
  JavaRDD[Array[Byte]] = {
    PythonRDD.readRDDFromFile(jsc, fileName, parallelism)
  }
} 
Example 5
Source File: MatrixFactorizationModelWrapper.scala    From iolap   with Apache License 2.0
package org.apache.spark.mllib.api.python

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD


private[python] class MatrixFactorizationModelWrapper(model: MatrixFactorizationModel)
  extends MatrixFactorizationModel(model.rank, model.userFeatures, model.productFeatures) {

  def predict(userAndProducts: JavaRDD[Array[Any]]): RDD[Rating] =
    predict(SerDe.asTupleRDD(userAndProducts.rdd))

  def getUserFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(userFeatures.map {
      case (user, feature) => (user, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def getProductFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(productFeatures.map {
      case (product, feature) => (product, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }
} 
Example 6
Source File: KernelDensity.scala    From iolap   with Apache License 2.0
package org.apache.spark.mllib.stat

import com.github.fommil.netlib.BLAS.{getInstance => blas}

import org.apache.spark.annotation.Experimental
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.rdd.RDD


private object KernelDensity {

  // Evaluates the density of a normal distribution at x; the caller precomputes
  // log(standardDeviation) + 0.5 * log(2 * Pi) and passes it in.
  def normPdf(
      mean: Double,
      standardDeviation: Double,
      logStandardDeviationPlusHalfLog2Pi: Double,
      x: Double): Double = {
    val x0 = x - mean
    val x1 = x0 / standardDeviation
    val logDensity = -0.5 * x1 * x1 - logStandardDeviationPlusHalfLog2Pi
    math.exp(logDensity)
  }
} 
Example 7
Source File: Word2VecModelWrapper.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.mllib.api.python

import java.util.{ArrayList => JArrayList, List => JList, Map => JMap}
import scala.collection.JavaConverters._

import org.apache.spark.SparkContext
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.feature.Word2VecModel
import org.apache.spark.mllib.linalg.{Vector, Vectors}


// Java-friendly wrapper around Word2VecModel, used by the Python API.
private[python] class Word2VecModelWrapper(model: Word2VecModel) {

  def transform(rdd: JavaRDD[String]): JavaRDD[Vector] = {
    rdd.rdd.map(model.transform)
  }

  // Transforms a single word into its vector representation.
  def transform(word: String): Vector = model.transform(word)

  def findSynonyms(word: String, num: Int): JList[Object] = {
    val vec = transform(word)
    findSynonyms(vec, num)
  }

  def findSynonyms(vector: Vector, num: Int): JList[Object] = {
    val result = model.findSynonyms(vector, num)
    val similarity = Vectors.dense(result.map(_._2))
    val words = result.map(_._1)
    List(words, similarity).map(_.asInstanceOf[Object]).asJava
  }

  def getVectors: JMap[String, JList[Float]] = {
    model.getVectors.map({case (k, v) => (k, v.toList.asJava)}).asJava
  }

  def save(sc: SparkContext, path: String): Unit = model.save(sc, path)
} 
Example 8
Source File: MatrixFactorizationModelWrapper.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.mllib.api.python

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD


private[python] class MatrixFactorizationModelWrapper(model: MatrixFactorizationModel)
  extends MatrixFactorizationModel(model.rank, model.userFeatures, model.productFeatures) {

  def predict(userAndProducts: JavaRDD[Array[Any]]): RDD[Rating] =
    predict(SerDe.asTupleRDD(userAndProducts.rdd))

  def getUserFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(userFeatures.map {
      case (user, feature) => (user, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def getProductFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(productFeatures.map {
      case (product, feature) => (product, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }
} 
Example 9
Source File: KernelDensity.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.mllib.stat

import com.github.fommil.netlib.BLAS.{getInstance => blas}

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.rdd.RDD


private object KernelDensity {

  // Evaluates the density of a normal distribution at x; the caller precomputes
  // log(standardDeviation) + 0.5 * log(2 * Pi) and passes it in.
  def normPdf(
      mean: Double,
      standardDeviation: Double,
      logStandardDeviationPlusHalfLog2Pi: Double,
      x: Double): Double = {
    val x0 = x - mean
    val x1 = x0 / standardDeviation
    val logDensity = -0.5 * x1 * x1 - logStandardDeviationPlusHalfLog2Pi
    math.exp(logDensity)
  }
} 
Example 10
Source File: Word2VecModelWrapper.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.mllib.api.python

import java.util.{List => JList, Map => JMap}

import scala.collection.JavaConverters._

import org.apache.spark.SparkContext
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.feature.Word2VecModel
import org.apache.spark.mllib.linalg.{Vector, Vectors}


// Java-friendly wrapper around Word2VecModel, used by the Python API.
private[python] class Word2VecModelWrapper(model: Word2VecModel) {

  def findSynonyms(vector: Vector, num: Int): JList[Object] = {
    prepareResult(model.findSynonyms(vector, num))
  }

  private def prepareResult(result: Array[(String, Double)]) = {
    val similarity = Vectors.dense(result.map(_._2))
    val words = result.map(_._1)
    List(words, similarity).map(_.asInstanceOf[Object]).asJava
  }


  def getVectors: JMap[String, JList[Float]] = {
    model.getVectors.map { case (k, v) =>
      (k, v.toList.asJava)
    }.asJava
  }

  def save(sc: SparkContext, path: String): Unit = model.save(sc, path)
} 
Example 11
Source File: MatrixFactorizationModelWrapper.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.mllib.api.python

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD


private[python] class MatrixFactorizationModelWrapper(model: MatrixFactorizationModel)
  extends MatrixFactorizationModel(model.rank, model.userFeatures, model.productFeatures) {

  def predict(userAndProducts: JavaRDD[Array[Any]]): RDD[Rating] =
    predict(SerDe.asTupleRDD(userAndProducts.rdd))

  def getUserFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(userFeatures.map {
      case (user, feature) => (user, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def getProductFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(productFeatures.map {
      case (product, feature) => (product, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def wrappedRecommendProductsForUsers(num: Int): RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(recommendProductsForUsers(num).asInstanceOf[RDD[(Any, Any)]])
  }

  def wrappedRecommendUsersForProducts(num: Int): RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(recommendUsersForProducts(num).asInstanceOf[RDD[(Any, Any)]])
  }
} 
Example 12
Source File: KernelDensity.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.mllib.stat

import com.github.fommil.netlib.BLAS.{getInstance => blas}

import org.apache.spark.annotation.Since
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.rdd.RDD


private object KernelDensity {

  // Evaluates the density of a normal distribution at x; the caller precomputes
  // log(standardDeviation) + 0.5 * log(2 * Pi) and passes it in.
  def normPdf(
      mean: Double,
      standardDeviation: Double,
      logStandardDeviationPlusHalfLog2Pi: Double,
      x: Double): Double = {
    val x0 = x - mean
    val x1 = x0 / standardDeviation
    val logDensity = -0.5 * x1 * x1 - logStandardDeviationPlusHalfLog2Pi
    math.exp(logDensity)
  }
} 
Example 13
Source File: PythonSQLUtils.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.api.python

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.sql.catalyst.analysis.FunctionRegistry
import org.apache.spark.sql.catalyst.expressions.ExpressionInfo
import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
import org.apache.spark.sql.execution.arrow.ArrowConverters
import org.apache.spark.sql.types.DataType

private[sql] object PythonSQLUtils {
  def parseDataType(typeText: String): DataType = CatalystSqlParser.parseDataType(typeText)

  // This is needed when generating SQL documentation for built-in functions.
  def listBuiltinFunctionInfos(): Array[ExpressionInfo] = {
    FunctionRegistry.functionSet.flatMap(f => FunctionRegistry.builtin.lookupFunction(f)).toArray
  }

  
  def arrowPayloadToDataFrame(
      payloadRDD: JavaRDD[Array[Byte]],
      schemaString: String,
      sqlContext: SQLContext): DataFrame = {
    ArrowConverters.toDataFrame(payloadRDD, schemaString, sqlContext)
  }
} 
Example 14
Source File: Word2VecModelWrapper.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.mllib.api.python

import java.util.{ArrayList => JArrayList, List => JList, Map => JMap}
import scala.collection.JavaConverters._

import org.apache.spark.SparkContext
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.feature.Word2VecModel
import org.apache.spark.mllib.linalg.{Vector, Vectors}


// Java-friendly wrapper around Word2VecModel, used by the Python API.
private[python] class Word2VecModelWrapper(model: Word2VecModel) {

  def transform(rdd: JavaRDD[String]): JavaRDD[Vector] = {
    rdd.rdd.map(model.transform)
  }

  // Transforms a single word into its vector representation.
  def transform(word: String): Vector = model.transform(word)

  def findSynonyms(word: String, num: Int): JList[Object] = {
    val vec = transform(word)
    findSynonyms(vec, num)
  }

  def findSynonyms(vector: Vector, num: Int): JList[Object] = {
    val result = model.findSynonyms(vector, num)
    val similarity = Vectors.dense(result.map(_._2))
    val words = result.map(_._1)
    List(words, similarity).map(_.asInstanceOf[Object]).asJava
  }

  def getVectors: JMap[String, JList[Float]] = {
    model.getVectors.map({case (k, v) => (k, v.toList.asJava)}).asJava
  }

  def save(sc: SparkContext, path: String): Unit = model.save(sc, path)
} 
Example 15
Source File: MatrixFactorizationModelWrapper.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.mllib.api.python

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD


private[python] class MatrixFactorizationModelWrapper(model: MatrixFactorizationModel)
  extends MatrixFactorizationModel(model.rank, model.userFeatures, model.productFeatures) {

  def predict(userAndProducts: JavaRDD[Array[Any]]): RDD[Rating] =
    predict(SerDe.asTupleRDD(userAndProducts.rdd))

  def getUserFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(userFeatures.map {
      case (user, feature) => (user, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def getProductFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(productFeatures.map {
      case (product, feature) => (product, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def wrappedRecommendProductsForUsers(num: Int): RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(recommendProductsForUsers(num).asInstanceOf[RDD[(Any, Any)]])
  }

  def wrappedRecommendUsersForProducts(num: Int): RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(recommendUsersForProducts(num).asInstanceOf[RDD[(Any, Any)]])
  }
} 
Example 16
Source File: KernelDensity.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.mllib.stat

import com.github.fommil.netlib.BLAS.{getInstance => blas}

import org.apache.spark.annotation.Since
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.rdd.RDD


private object KernelDensity {

  // Evaluates the density of a normal distribution at x; the caller precomputes
  // log(standardDeviation) + 0.5 * log(2 * Pi) and passes it in.
  def normPdf(
      mean: Double,
      standardDeviation: Double,
      logStandardDeviationPlusHalfLog2Pi: Double,
      x: Double): Double = {
    val x0 = x - mean
    val x1 = x0 / standardDeviation
    val logDensity = -0.5 * x1 * x1 - logStandardDeviationPlusHalfLog2Pi
    math.exp(logDensity)
  }
} 
Example 17
Source File: Neo4jJavaIntegration.scala    From neo4j-spark-connector   with Apache License 2.0
package org.neo4j.spark

import java.util

import org.apache.spark.SparkContext
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.neo4j.spark.dataframe.Neo4jDataFrame
import org.neo4j.spark.rdd.{Neo4jRowRDD, Neo4jTupleRDD}

import scala.collection.JavaConverters._


object Neo4jJavaIntegration {
  def rowRDD(sc: SparkContext, query: String, parameters: java.util.Map[String, AnyRef]) =
    new Neo4jRowRDD(sc, query, if (parameters == null) Seq.empty else parameters.asScala.toSeq).toJavaRDD()

  def tupleRDD(sc: SparkContext, query: String, parameters: java.util.Map[String, AnyRef]): JavaRDD[util.Map[String, AnyRef]] = {
    val params = if (parameters == null) Seq.empty else parameters.asScala.toSeq
    Neo4jTupleRDD(sc, query, params)
      .map((t) => new util.LinkedHashMap[String, AnyRef](t.toMap.asJava).asInstanceOf[util.Map[String, AnyRef]])
      .toJavaRDD()
  }

  def dataFrame(sqlContext: SQLContext, query: String, parameters: java.util.Map[String, AnyRef], schemaInfo: util.Map[String, String]) = {
    Neo4jDataFrame(sqlContext, query, parameters.asScala.toSeq, schemaInfo.asScala.toSeq: _*)
  }
} 
Example 18
Source File: HashingTF.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.mllib.feature

import java.lang.{Iterable => JavaIterable}

import scala.collection.JavaConverters._
import scala.collection.mutable

import org.apache.spark.SparkException
import org.apache.spark.annotation.Since
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD
import org.apache.spark.unsafe.hash.Murmur3_x86_32._
import org.apache.spark.unsafe.types.UTF8String
import org.apache.spark.util.Utils


object HashingTF {

  private val seed = 42

  // Calculates a hash code for the term using a MurmurHash 3 implementation.
  private[spark] def murmur3Hash(term: Any): Int = {
    term match {
      case null => seed
      case b: Boolean => hashInt(if (b) 1 else 0, seed)
      case b: Byte => hashInt(b, seed)
      case s: Short => hashInt(s, seed)
      case i: Int => hashInt(i, seed)
      case l: Long => hashLong(l, seed)
      case f: Float => hashInt(java.lang.Float.floatToIntBits(f), seed)
      case d: Double => hashLong(java.lang.Double.doubleToLongBits(d), seed)
      case s: String =>
        val utf8 = UTF8String.fromString(s)
        hashUnsafeBytes(utf8.getBaseObject, utf8.getBaseOffset, utf8.numBytes(), seed)
      case _ => throw new SparkException("HashingTF with murmur3 algorithm does not " +
        s"support type ${term.getClass.getCanonicalName} of input data.")
    }
  }
} 
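HashingTF turns a hash value such as murmur3Hash(term) into a column index by taking a non-negative modulo against the number of features. The sketch below spells out that indexing step with a local helper; it mirrors the behavior of Spark's internal utility rather than calling it, and the term and feature count are illustrative.

object TermIndexSketch {
  // Maps a possibly negative hash code into the range [0, mod).
  def nonNegativeMod(x: Int, mod: Int): Int = {
    val rawMod = x % mod
    rawMod + (if (rawMod < 0) mod else 0)
  }

  def main(args: Array[String]): Unit = {
    val numFeatures = 1 << 18
    val hash = "spark".##  // stand-in for murmur3Hash("spark")
    println(nonNegativeMod(hash, numFeatures))
  }
}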
Example 19
Source File: RRDD.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.api.r

import java.util.{Map => JMap}

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext}
import org.apache.spark.api.python.PythonRDD
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD

private abstract class BaseRRDD[T: ClassTag, U: ClassTag](
    parent: RDD[T],
    numPartitions: Int,
    func: Array[Byte],
    deserializer: String,
    serializer: String,
    packageNames: Array[Byte],
    broadcastVars: Array[Broadcast[Object]])
  extends RDD[U](parent) with Logging {
  override def getPartitions: Array[Partition] = parent.partitions

  override def compute(partition: Partition, context: TaskContext): Iterator[U] = {
    val runner = new RRunner[U](
      func, deserializer, serializer, packageNames, broadcastVars, numPartitions)

    // The parent may be also an RRDD, so we should launch it first.
    val parentIterator = firstParent[T].iterator(partition, context)

    runner.compute(parentIterator, partition.index)
  }
}


private[r] object RRDD {

  // Creates a JavaRDD of serialized byte arrays from a file written by SparkR.
  def createRDDFromFile(jsc: JavaSparkContext, fileName: String, parallelism: Int):
  JavaRDD[Array[Byte]] = {
    PythonRDD.readRDDFromFile(jsc, fileName, parallelism)
  }
} 
Example 20
Source File: KernelDensity.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.mllib.stat

import com.github.fommil.netlib.BLAS.{getInstance => blas}

import org.apache.spark.annotation.Since
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.rdd.RDD


private object KernelDensity {

  // Evaluates the density of a normal distribution at x; the caller precomputes
  // log(standardDeviation) + 0.5 * log(2 * Pi) and passes it in.
  def normPdf(
      mean: Double,
      standardDeviation: Double,
      logStandardDeviationPlusHalfLog2Pi: Double,
      x: Double): Double = {
    val x0 = x - mean
    val x1 = x0 / standardDeviation
    val logDensity = -0.5 * x1 * x1 - logStandardDeviationPlusHalfLog2Pi
    math.exp(logDensity)
  }
} 
Example 21
Source File: MatrixFactorizationModelWrapper.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.mllib.api.python

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD


private[python] class MatrixFactorizationModelWrapper(model: MatrixFactorizationModel)
  extends MatrixFactorizationModel(model.rank, model.userFeatures, model.productFeatures) {

  def predict(userAndProducts: JavaRDD[Array[Any]]): RDD[Rating] =
    predict(SerDe.asTupleRDD(userAndProducts.rdd))

  def getUserFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(userFeatures.map {
      case (user, feature) => (user, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def getProductFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(productFeatures.map {
      case (product, feature) => (product, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def wrappedRecommendProductsForUsers(num: Int): RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(recommendProductsForUsers(num).asInstanceOf[RDD[(Any, Any)]])
  }

  def wrappedRecommendUsersForProducts(num: Int): RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(recommendUsersForProducts(num).asInstanceOf[RDD[(Any, Any)]])
  }
} 
Example 22
Source File: Word2VecModelWrapper.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.mllib.api.python

import java.util.{List => JList, Map => JMap}

import scala.collection.JavaConverters._

import org.apache.spark.SparkContext
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.feature.Word2VecModel
import org.apache.spark.mllib.linalg.{Vector, Vectors}


// Java-friendly wrapper around Word2VecModel, used by the Python API.
private[python] class Word2VecModelWrapper(model: Word2VecModel) {

  def findSynonyms(vector: Vector, num: Int): JList[Object] = {
    prepareResult(model.findSynonyms(vector, num))
  }

  private def prepareResult(result: Array[(String, Double)]) = {
    val similarity = Vectors.dense(result.map(_._2))
    val words = result.map(_._1)
    List(words, similarity).map(_.asInstanceOf[Object]).asJava
  }


  def getVectors: JMap[String, JList[Float]] = {
    model.getVectors.map { case (k, v) =>
      (k, v.toList.asJava)
    }.asJava
  }

  def save(sc: SparkContext, path: String): Unit = model.save(sc, path)
} 
Example 23
Source File: PythonSQLUtils.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.api.python

import java.io.InputStream
import java.nio.channels.Channels

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.api.python.PythonRDDServer
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.sql.catalyst.analysis.FunctionRegistry
import org.apache.spark.sql.catalyst.expressions.ExpressionInfo
import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
import org.apache.spark.sql.execution.arrow.ArrowConverters
import org.apache.spark.sql.types.DataType

private[sql] object PythonSQLUtils {
  def parseDataType(typeText: String): DataType = CatalystSqlParser.parseDataType(typeText)

  // This is needed when generating SQL documentation for built-in functions.
  def listBuiltinFunctionInfos(): Array[ExpressionInfo] = {
    FunctionRegistry.functionSet.flatMap(f => FunctionRegistry.builtin.lookupFunction(f)).toArray
  }
}

  
private[sql] class ArrowRDDServer(sqlContext: SQLContext) extends PythonRDDServer {

  override protected def streamToRDD(input: InputStream): RDD[Array[Byte]] = {
    // Create array to consume iterator so that we can safely close the inputStream
    val batches = ArrowConverters.getBatchesFromStream(Channels.newChannel(input)).toArray
    // Parallelize the record batches to create an RDD
    JavaRDD.fromRDD(sqlContext.sparkContext.parallelize(batches, batches.length))
  }

} 
Example 24
Source File: SparkInflux.scala    From reactiveinflux-spark   with Apache License 2.0
package com.pygmalios.reactiveinflux.spark.jawa

import com.pygmalios.reactiveinflux.ReactiveInfluxDbName
import com.pygmalios.reactiveinflux.jawa.{Conversions, PointNoTime}
import com.pygmalios.reactiveinflux.spark._
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.streaming.api.java.JavaDStream

import scala.concurrent.duration._

class SparkInflux(val dbName: String,
                  val awaitAtMostMillis: Long) {
  private implicit val reactiveInfluxDbName = ReactiveInfluxDbName(dbName)
  private implicit val awaitAtMost = awaitAtMostMillis.millis

  def saveToInflux[T <: PointNoTime](javaRdd: JavaRDD[T]): Unit = {
    javaRdd.rdd.map(Conversions.toScala).saveToInflux()
  }

  def saveToInflux[T <: PointNoTime](javaDStream: JavaDStream[T]): Unit = {
    javaDStream.dstream.map(Conversions.toScala).saveToInflux()
  }
} 
Example 25
Source File: BisectingKMeansModel.scala    From bisecting-kmeans   with Apache License 2.0
package org.apache.spark.mllib.bisectingkmeans

import breeze.linalg.{Vector => BV, norm => breezeNorm}

import org.apache.spark.Logging
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD


  def toJavaLinkageMatrix: java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = {
    val javaList = new java.util.ArrayList[java.util.ArrayList[java.lang.Double]]()
    this.node.toLinkageMatrix.foreach {x =>
      val row = new java.util.ArrayList[java.lang.Double]()
      row.add(x._1.toDouble)
      row.add(x._2.toDouble)
      row.add(x._3.toDouble)
      row.add(x._4.toDouble)
      javaList.add(row)
    }
    javaList
  }
} 
Example 26
Source File: RiakPythonHelper.scala    From spark-riak-connector   with Apache License 2.0
package com.basho.riak.spark.util.python

import com.basho.riak.spark._
import com.basho.riak.spark.rdd.RiakRDD
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.api.java.JavaRDD
import com.basho.riak.spark.writer.WriteConf
import org.apache.spark.rdd.RDD
import java.util.ArrayList
import scala.collection.JavaConversions._

class RiakPythonHelper {
  implicit val pickling = new PicklingUtils()
  def riakBucket(jsc: JavaSparkContext, bucketName: String, bucketType: String): RiakRDD[(String, Any)] = {
    jsc.sc.riakBucket(bucketName, bucketType)
  }

  def saveToRiak(jrdd: JavaRDD[Array[Byte]], bucketName: String, bucketType: String) = {
    jrdd.rdd.unpickle().saveToRiak(bucketName, bucketType, WriteConf())
  }

  def query2iKeys[K](jsc: JavaSparkContext, bucketName: String, bucketType: String, index: String, keys: ArrayList[K]) =
    jsc.sc.riakBucket(bucketName, bucketType).query2iKeys(index, keys: _*)

  def queryBucketKeys(jsc: JavaSparkContext, bucketName: String, bucketType: String, keys: ArrayList[String]) =
    jsc.sc.riakBucket(bucketName, bucketType).queryBucketKeys(keys: _*)

  def partitionBy2iRanges[K](jsc: JavaSparkContext, bucketName: String, bucketType: String, index: String, ranges: ArrayList[ArrayList[K]]) = {
    val r = ranges.map(x => (x(0),  x(1)))
    jsc.sc.riakBucket(bucketName, bucketType).partitionBy2iRanges(index, r: _*)
  }

  def partitionBy2iKeys[K](jsc: JavaSparkContext, bucketName: String, bucketType: String, index: String, keys: ArrayList[K]) =
    jsc.sc.riakBucket(bucketName, bucketType).partitionBy2iKeys(index, keys: _*)

  def pickleRows(rdd: RDD[_]): RDD[Array[Byte]] = rdd.pickle()

  def javaRDD(rdd: RDD[_]) = JavaRDD.fromRDD(rdd)
} 
Example 27
Source File: Subtract.scala    From piflow   with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.common

import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.ImageUtil
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

class Subtract extends ConfigurableStop{
  override val authorEmail: String = "[email protected]"
  override val description: String = "Delete the existing data in the right table from the left table"
  override val inportList: List[String] =List(Port.LeftPort,Port.RightPort)
  override val outportList: List[String] = List(Port.DefaultPort)

  override def setProperties(map: Map[String, Any]): Unit = {
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor : List[PropertyDescriptor] = List()

    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/common/Subtract.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.CommonGroup)
  }

  override def initialize(ctx: ProcessContext): Unit = {

  }

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val spark = pec.get[SparkSession]()

    val leftDF =  in.read(Port.LeftPort)
    val rightDF = in.read(Port.RightPort)

    val outDF = leftDF.except(rightDF)

    out.write(outDF)
  }
} 
Example 28
Source File: RRDD.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.api.r

import java.util.{Map => JMap}

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext}
import org.apache.spark.api.python.PythonRDD
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD

private abstract class BaseRRDD[T: ClassTag, U: ClassTag](
    parent: RDD[T],
    numPartitions: Int,
    func: Array[Byte],
    deserializer: String,
    serializer: String,
    packageNames: Array[Byte],
    broadcastVars: Array[Broadcast[Object]])
  extends RDD[U](parent) with Logging {
  override def getPartitions: Array[Partition] = parent.partitions

  override def compute(partition: Partition, context: TaskContext): Iterator[U] = {
    val runner = new RRunner[U](
      func, deserializer, serializer, packageNames, broadcastVars, numPartitions)

    // The parent may be also an RRDD, so we should launch it first.
    val parentIterator = firstParent[T].iterator(partition, context)

    runner.compute(parentIterator, partition.index)
  }
}


private[r] object RRDD {

  // Creates a JavaRDD of serialized byte arrays from a file written by SparkR.
  def createRDDFromFile(jsc: JavaSparkContext, fileName: String, parallelism: Int):
  JavaRDD[Array[Byte]] = {
    PythonRDD.readRDDFromFile(jsc, fileName, parallelism)
  }
} 
Example 29
Source File: KernelDensity.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.mllib.stat

import com.github.fommil.netlib.BLAS.{getInstance => blas}

import org.apache.spark.annotation.Since
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.rdd.RDD


private object KernelDensity {

  // Evaluates the density of a normal distribution at x; the caller precomputes
  // log(standardDeviation) + 0.5 * log(2 * Pi) and passes it in.
  def normPdf(
      mean: Double,
      standardDeviation: Double,
      logStandardDeviationPlusHalfLog2Pi: Double,
      x: Double): Double = {
    val x0 = x - mean
    val x1 = x0 / standardDeviation
    val logDensity = -0.5 * x1 * x1 - logStandardDeviationPlusHalfLog2Pi
    math.exp(logDensity)
  }
} 
Example 30
Source File: AssociationRules.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.mllib.fpm

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.spark.annotation.Since
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.api.java.JavaSparkContext.fakeClassTag
import org.apache.spark.internal.Logging
import org.apache.spark.mllib.fpm.AssociationRules.Rule
import org.apache.spark.mllib.fpm.FPGrowth.FreqItemset
import org.apache.spark.rdd.RDD


object AssociationRules {

  // An association rule whose antecedent items imply the consequent items.
  @Since("1.5.0")
  class Rule[Item] private[fpm] (
      @Since("1.5.0") val antecedent: Array[Item],
      @Since("1.5.0") val consequent: Array[Item],
      freqUnion: Double,
      freqAntecedent: Double) extends Serializable {

    // Confidence of the rule: frequency of the union over frequency of the antecedent.
    @Since("1.5.0")
    def confidence: Double = freqUnion / freqAntecedent

    @Since("1.5.0")
    def javaAntecedent: java.util.List[Item] = {
      antecedent.toList.asJava
    }

    @Since("1.5.0")
    def javaConsequent: java.util.List[Item] = {
      consequent.toList.asJava
    }

    override def toString: String = {
      s"${antecedent.mkString("{", ",", "}")} => " +
        s"${consequent.mkString("{", ",", "}")}: ${confidence}"
    }
  }
} 
Example 31
Source File: MatrixFactorizationModelWrapper.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.mllib.api.python

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD


private[python] class MatrixFactorizationModelWrapper(model: MatrixFactorizationModel)
  extends MatrixFactorizationModel(model.rank, model.userFeatures, model.productFeatures) {

  def predict(userAndProducts: JavaRDD[Array[Any]]): RDD[Rating] =
    predict(SerDe.asTupleRDD(userAndProducts.rdd))

  def getUserFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(userFeatures.map {
      case (user, feature) => (user, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def getProductFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(productFeatures.map {
      case (product, feature) => (product, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def wrappedRecommendProductsForUsers(num: Int): RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(recommendProductsForUsers(num).asInstanceOf[RDD[(Any, Any)]])
  }

  def wrappedRecommendUsersForProducts(num: Int): RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(recommendUsersForProducts(num).asInstanceOf[RDD[(Any, Any)]])
  }
} 
Example 32
Source File: Word2VecModelWrapper.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.mllib.api.python

import java.util.{List => JList, Map => JMap}

import scala.collection.JavaConverters._

import org.apache.spark.SparkContext
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.feature.Word2VecModel
import org.apache.spark.mllib.linalg.{Vector, Vectors}


// Java-friendly wrapper around Word2VecModel, used by the Python API.
private[python] class Word2VecModelWrapper(model: Word2VecModel) {

  def findSynonyms(vector: Vector, num: Int): JList[Object] = {
    prepareResult(model.findSynonyms(vector, num))
  }

  private def prepareResult(result: Array[(String, Double)]) = {
    val similarity = Vectors.dense(result.map(_._2))
    val words = result.map(_._1)
    List(words, similarity).map(_.asInstanceOf[Object]).asJava
  }


  def getVectors: JMap[String, JList[Float]] = {
    model.getVectors.map { case (k, v) =>
      (k, v.toList.asJava)
    }.asJava
  }

  def save(sc: SparkContext, path: String): Unit = model.save(sc, path)
} 
Example 33
Source File: KMeansModel.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.mllib.clustering

import scala.collection.JavaConverters._

import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.SparkContext
import org.apache.spark.annotation.Since
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.pmml.PMMLExportable
import org.apache.spark.mllib.util.{Loader, Saveable}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SparkSession}


// A clustering model for K-means: each point belongs to the cluster with the closest center.
@Since("0.8.0")
class KMeansModel @Since("1.1.0") (@Since("1.0.0") val clusterCenters: Array[Vector])
  extends Saveable with Serializable with PMMLExportable {

  // Total number of clusters.
  @Since("0.8.0")
  def k: Int = clusterCenters.length

  @Since("0.8.0")
  def computeCost(data: RDD[Vector]): Double = {
    val centersWithNorm = clusterCentersWithNorm
    val bcCentersWithNorm = data.context.broadcast(centersWithNorm)
    data.map(p => KMeans.pointCost(bcCentersWithNorm.value, new VectorWithNorm(p))).sum()
  }

  private def clusterCentersWithNorm: Iterable[VectorWithNorm] =
    clusterCenters.map(new VectorWithNorm(_))

  @Since("1.4.0")
  override def save(sc: SparkContext, path: String): Unit = {
    KMeansModel.SaveLoadV1_0.save(sc, this, path)
  }

  override protected def formatVersion: String = "1.0"
}

@Since("1.4.0")
object KMeansModel extends Loader[KMeansModel] {

  @Since("1.4.0")
  override def load(sc: SparkContext, path: String): KMeansModel = {
    KMeansModel.SaveLoadV1_0.load(sc, path)
  }

  private case class Cluster(id: Int, point: Vector)

  private object Cluster {
    def apply(r: Row): Cluster = {
      Cluster(r.getInt(0), r.getAs[Vector](1))
    }
  }

  private[clustering]
  object SaveLoadV1_0 {

    private val thisFormatVersion = "1.0"

    private[clustering]
    val thisClassName = "org.apache.spark.mllib.clustering.KMeansModel"

    def save(sc: SparkContext, model: KMeansModel, path: String): Unit = {
      val spark = SparkSession.builder().sparkContext(sc).getOrCreate()
      val metadata = compact(render(
        ("class" -> thisClassName) ~ ("version" -> thisFormatVersion) ~ ("k" -> model.k)))
      sc.parallelize(Seq(metadata), 1).saveAsTextFile(Loader.metadataPath(path))
      val dataRDD = sc.parallelize(model.clusterCenters.zipWithIndex).map { case (point, id) =>
        Cluster(id, point)
      }
      spark.createDataFrame(dataRDD).write.parquet(Loader.dataPath(path))
    }

    def load(sc: SparkContext, path: String): KMeansModel = {
      implicit val formats = DefaultFormats
      val spark = SparkSession.builder().sparkContext(sc).getOrCreate()
      val (className, formatVersion, metadata) = Loader.loadMetadata(sc, path)
      assert(className == thisClassName)
      assert(formatVersion == thisFormatVersion)
      val k = (metadata \ "k").extract[Int]
      val centroids = spark.read.parquet(Loader.dataPath(path))
      Loader.checkSchema[Cluster](centroids.schema)
      val localCentroids = centroids.rdd.map(Cluster.apply).collect()
      assert(k == localCentroids.length)
      new KMeansModel(localCentroids.sortBy(_.id).map(_.point))
    }
  }
}
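The save/load pair above persists a KMeansModel as JSON metadata plus a Parquet file of cluster centers. A minimal usage sketch of that round trip, assuming a local Spark session; the path and parameters are illustrative:

import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.SparkSession

object KMeansSaveLoadSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("kmeans-sketch").getOrCreate()
    val sc = spark.sparkContext

    val data = sc.parallelize(Seq(
      Vectors.dense(0.0, 0.0), Vectors.dense(0.1, 0.1),
      Vectors.dense(9.0, 9.0), Vectors.dense(9.1, 9.1)))

    // Train, persist, and reload; the cost should match before and after the round trip.
    val model = KMeans.train(data, 2, 20)
    val path = "/tmp/kmeans-model-sketch"
    model.save(sc, path)
    val reloaded = KMeansModel.load(sc, path)

    println(model.computeCost(data))
    println(reloaded.computeCost(data))

    spark.stop()
  }
}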