Example 1
Source File: IntegrationTest.scala    From kmq   with Apache License 2.0
package com.softwaremill.kmq.redelivery

import java.time.Duration
import java.util.Random

import akka.kafka.scaladsl.{Consumer, Producer}
import akka.kafka.{ConsumerSettings, ProducerMessage, ProducerSettings, Subscriptions}
import akka.testkit.TestKit
import com.softwaremill.kmq._
import com.softwaremill.kmq.redelivery.infrastructure.KafkaSpec
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.clients.producer.{ProducerConfig, ProducerRecord}
import org.apache.kafka.common.serialization.StringDeserializer
import org.scalatest.concurrent.Eventually
import org.scalatest.time.{Seconds, Span}
import org.scalatest.{BeforeAndAfterAll, FlatSpecLike, Matchers}

import scala.collection.mutable.ArrayBuffer

class IntegrationTest extends TestKit(ActorSystem("test-system")) with FlatSpecLike with KafkaSpec with BeforeAndAfterAll with Eventually with Matchers {

  implicit val materializer = ActorMaterializer()
  import system.dispatcher

  "KMQ" should "resend message if not committed" in {
    val bootstrapServer = s"localhost:${testKafkaConfig.kafkaPort}"
    val kmqConfig = new KmqConfig("queue", "markers", "kmq_client", "kmq_redelivery", Duration.ofSeconds(1).toMillis,

    val consumerSettings = ConsumerSettings(system, new StringDeserializer, new StringDeserializer)
      .withProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest")

    val markerProducerSettings = ProducerSettings(system,
      new MarkerKey.MarkerKeySerializer(), new MarkerValue.MarkerValueSerializer())
      .withProperty(ProducerConfig.PARTITIONER_CLASS_CONFIG, classOf[ParititionFromMarkerKey].getName)
    val markerProducer = markerProducerSettings.createKafkaProducer()

    val random = new Random()

    lazy val processedMessages = ArrayBuffer[String]()
    lazy val receivedMessages = ArrayBuffer[String]()

    val control = Consumer.committableSource(consumerSettings, Subscriptions.topics(kmqConfig.getMsgTopic)) // 1. get messages from topic
      .map { msg =>
        new ProducerRecord[MarkerKey, MarkerValue](kmqConfig.getMarkerTopic, MarkerKey.fromRecord(msg.record), new StartMarker(kmqConfig.getMsgTimeoutMs)), msg)
      .via(Producer.flow(markerProducerSettings, markerProducer)) // 2. write the "start" marker
      .mapAsync(1) { msg =>
        msg.committableOffset.commitScaladsl().map(_ => msg.record) // this should be batched
      .map { msg =>
        receivedMessages += msg.value
      .filter(_ => random.nextInt(5) != 0)
      .map { processedMessage =>
        processedMessages += processedMessage.value
        new ProducerRecord[MarkerKey, MarkerValue](kmqConfig.getMarkerTopic, MarkerKey.fromRecord(processedMessage), EndMarker.INSTANCE)
      .to(Producer.plainSink(markerProducerSettings, markerProducer)) // 5. write "end" markers

    val redeliveryHook = RedeliveryTracker.start(new KafkaClients(bootstrapServer), kmqConfig)

    val messages = (0 to 20).map(_.toString)
    messages.foreach(msg => sendToKafka(kmqConfig.getMsgTopic,msg))

    eventually {
      receivedMessages.size should be > processedMessages.size
      processedMessages.sortBy(_.toInt).distinct shouldBe messages
    }(PatienceConfig(timeout = Span(15, Seconds)), implicitly)


  override def afterAll(): Unit = {
Example 2
Source File: package.scala    From iotchain   with MIT License
package jbok

import java.nio.charset.StandardCharsets
import java.util.Random

import jbok.crypto.hash._
import scodec.bits.ByteVector
import jbok.crypto.signature.SignatureInstances

trait StringSyntax {
  implicit final def stringSyntax(a: String): StringOps = new StringOps(a)

final class StringOps(val a : String) extends AnyVal {
  def utf8bytes: ByteVector = ByteVector(a.getBytes(StandardCharsets.UTF_8))

trait CryptoSyntax extends CryptoHasherSyntax with StringSyntax
trait CryptoInstances extends CryptoHasherInstances with SignatureInstances

package object crypto extends CryptoSyntax with CryptoInstances {
  /** Wraps the byte array returned by `randomByteArray(random, length)` in a [[ByteVector]].
    *
    * @param random source of randomness handed to `randomByteArray`
    * @param length number of bytes requested
    */
  def randomByteString(random: Random, length: Int): ByteVector = {
    val bytes = randomByteArray(random, length)
    ByteVector(bytes)
  }

  def randomByteArray(random: Random, length: Int): Array[Byte] = {
    val bytes = Array.ofDim[Byte](length)
Example 3
Source File: SignaturePlatform.scala    From iotchain   with MIT License
package jbok.crypto.signature

import java.math.BigInteger
import java.util.Random

import cats.effect.Sync
import jbok.crypto.facade.{BN, EC, SignatureEC}

import scala.scalajs.js.JSConverters._
import scala.scalajs.js.typedarray.Uint8Array

trait SignaturePlatform {
  val ecdsa: Signature[ECDSA] = ECDSAPlatform

private object ECDSAPlatform extends Signature[ECDSA] {
  import ECDSACommon._
  val secp256k1 = new EC("secp256k1")

  // Builds a secp256k1 key pair inside F.delay and returns it as a KeyPair(public, secret).
  // NOTE(review): the `random` argument is never read in this body — key generation is left
  // entirely to `secp256k1.genKeyPair()`. Confirm that ignoring the caller's RNG is intended.
  override def generateKeyPair[F[_]](random: Option[Random])(implicit F: Sync[F]): F[KeyPair] = F.delay {
    val keyPair = secp256k1.genKeyPair()
    // private key is requested from the facade in "hex" form
    val secret  = KeyPair.Secret(keyPair.getPrivate("hex"))
    // drop uncompressed indicator, make it 64-bytes
    val pubkey = KeyPair.Public(keyPair.getPublic(false, "hex").drop(2))
    KeyPair(pubkey, secret)

  override def generatePublicKey[F[_]](secret: KeyPair.Secret)(implicit F: Sync[F]): F[KeyPair.Public] = F.delay {
    val keyPair = secp256k1.keyFromPrivate(secret.bytes.toHex, "hex")
    // drop uncompressed indicator, make it 64-bytes
    KeyPair.Public(keyPair.getPublic(false, "hex").drop(2))

  override def sign[F[_]](hash: Array[Byte], keyPair: KeyPair, chainId: BigInt)(implicit F: Sync[F]): F[CryptoSignature] = F.delay {
    val kp  = secp256k1.keyFromPrivate(keyPair.secret.bytes.toHex, "hex")
    val sig = secp256k1.sign(new Uint8Array(hash.toJSArray), kp)
    val r   = new BigInteger(sig.r.toString)
    val s   = new BigInteger(sig.s.toString)
    val pointSign = calculatePointSign(r, toCanonicalS(s), keyPair, hash, chainId) match {
      case Some(recId) => recId
      case None        => throw new Exception("unexpected error")
    val rid: BigInt = getRecoveryId(chainId, pointSign).getOrElse(pointSign)
    CryptoSignature(r, toCanonicalS(s), rid)

  override def verify[F[_]](hash: Array[Byte], sig: CryptoSignature, public: KeyPair.Public, chainId: BigInt)(implicit F: Sync[F]): F[Boolean] = F.delay {
    getPointSign(chainId, sig.v).exists { bigInt =>
      val signatureEC = convert(sig.copy(v = bigInt))
      val key         = secp256k1.keyFromPublic(UNCOMPRESSED_INDICATOR_STRING + public.bytes.toHex, "hex")
      secp256k1.verify(new Uint8Array(hash.toJSArray), signatureEC, key)

  override def recoverPublic(hash: Array[Byte], sig: CryptoSignature, chainId: BigInt): Option[KeyPair.Public] =
    getPointSign(chainId, sig.v).map { bigInt =>
      val signatureEC = convert(sig.copy(v = bigInt))
      val msg         = new Uint8Array(hash.toJSArray)
      val recId       = secp256k1.getKeyRecoveryParam(msg, signatureEC)
      val point       = secp256k1.recoverPubKey(new Uint8Array(hash.toJSArray), signatureEC, recId)
      KeyPair.Public(point.encode("hex", false).drop(2))

  private def convert(sig: CryptoSignature) = {
    val r = new BN(sig.r.toString(16), 16)
    val s = new BN(sig.s.toString(16), 16)
    SignatureEC(r, s, recoveryParam = (sig.v - NEGATIVE_POINT_SIGN).toInt)

  private def calculatePointSign(r: BigInt, s: BigInt, keyPair: KeyPair, hash: Array[Byte], chainId: BigInt): Option[BigInt] =
      v =>
        recoverPublic(hash, CryptoSignature(r, s, getRecoveryId(chainId, v).getOrElse(v)), chainId)
Example 4
Source File: LoggerSimulation.scala    From BigData-News   with Apache License 2.0
package com.vita.spark.utils


class LoggerSimulation {


object LoggerSimulation {

  var numIndex = 0

  /**
    * Generates a single letter.
    * @param index the position of the letter
    * @return the generated letter
    */
  def gennerateContent(index: Int): String = {
    import scala.collection.mutable.ListBuffer
    val charList = ListBuffer[Char]();
    for (i <- 65 to 90) {
      charList += i.toChar
    val charArray = charList.toArray

  /** Produces the text payload for one message.
    *
    * Always yields the same comma-separated string; the counter-based variant
    * remains disabled in the commented-out lines below.
    */
  def gennerateNumber(): String = {
    //    numIndex += 1
    //    return numIndex.toString
    val payload = "a,b,c,d,e,f"
    payload
  }

  /**
    * Generates a random index.
    * @return an index
    */
  def index = {
    import java.util.Random
    val rdm = new Random()

  /**
    * Entry point: creates a ServerSocket and uses it to send messages.
    * @param args the port and the sending interval
    */
  def main(args: Array[String]): Unit = {
    if (args.length != 2) {

    val listener = new ServerSocket(args(0).toInt)
    while (true) {
      val socket = listener.accept()
      new Thread() {
        override def run(): Unit = {
          println("Got client connected from:" + socket.getInetAddress)
          val out = new PrintWriter(socket.getOutputStream, true)
          while (true) {
            //            val content = gennerateContent(index)
            val content = gennerateNumber()
            out.write(content + "\n")
Example 5
Source File: SimpleSkewedGroupByTest.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.sql.SparkSession

object SimpleSkewedGroupByTest {
  def main(args: Array[String]) {
    val spark = SparkSession

    val numMappers = if (args.length > 0) args(0).toInt else 2
    val numKVPairs = if (args.length > 1) args(1).toInt else 1000
    val valSize = if (args.length > 2) args(2).toInt else 1000
    val numReducers = if (args.length > 3) args(3).toInt else numMappers
    // Skew factor: key 0 gets `ratio` times the chance of any other key (see its use below).
    // Parsed with toDouble because the value is used as a Double; toInt threw a
    // NumberFormatException on fractional input such as "2.5". Integer arguments
    // still parse to the same value as before.
    val ratio: Double = if (args.length > 4) args(4).toDouble else 5.0

    val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random
      val result = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        val offset = ranGen.nextInt(1000) * numReducers
        if (ranGen.nextDouble < ratio / (numReducers + ratio - 1)) {
          // give ratio times higher chance of generating key 0 (for reducer 0)
          result(i) = (offset, byteArr)
        } else {
          // generate a key for one of the other reducers
          val key = 1 + ranGen.nextInt(numReducers-1) + offset
          result(i) = (key, byteArr)
    // Enforce that everything has been calculated and in cache

    println("RESULT: " + pairs1.groupByKey(numReducers).count)
    // Print how many keys each reducer got (for debugging)
    // println("RESULT: " + pairs1.groupByKey(numReducers)
    //                           .map{case (k,v) => (k, v.size)}
    //                           .collectAsMap)

// scalastyle:on println 
Example 6
Source File: SkewedGroupByTest.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.sql.SparkSession

object SkewedGroupByTest {
  def main(args: Array[String]) {
    val spark = SparkSession
      .appName("GroupBy Test")

    val numMappers = if (args.length > 0) args(0).toInt else 2
    var numKVPairs = if (args.length > 1) args(1).toInt else 1000
    val valSize = if (args.length > 2) args(2).toInt else 1000
    val numReducers = if (args.length > 3) args(3).toInt else numMappers

    val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random

      // map output sizes linearly increase from the 1st to the last
      numKVPairs = (1.0 * (p + 1) / numMappers * numKVPairs).toInt

      val arr1 = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
    // Enforce that everything has been calculated and in cache


// scalastyle:on println 
Example 7
Source File: SparkHdfsLR.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{DenseVector, Vector}

import org.apache.spark.sql.SparkSession

object SparkHdfsLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  /** Parses one space-separated line into a [[DataPoint]].
    *
    * The first token is read as the label `y`; the following `D` tokens are read,
    * in order, as the feature values `x`.
    *
    * @param line text line holding at least `D + 1` numeric tokens
    */
  def parsePoint(line: String): DataPoint = {
    val tokens = new java.util.StringTokenizer(line, " ")
    val label = tokens.nextToken.toDouble
    // Array.fill evaluates its element expression once per slot, so the D tokens
    // are consumed left to right exactly as an index loop would.
    val features = Array.fill(D)(tokens.nextToken.toDouble)
    DataPoint(new DenseVector(features), label)
  }

  def showWarning() {
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use
        |for more conventional use.

  def main(args: Array[String]) {

    if (args.length < 2) {
      System.err.println("Usage: SparkHdfsLR <file> <iters>")


    val spark = SparkSession

    val inputPath = args(0)
    val lines =

    val points =
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D) {2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = { p =>
        p.x * (1 / (1 + exp(-p.y * ( - 1) * p.y
      }.reduce(_ + _)
      w -= gradient

    println("Final w: " + w)
// scalastyle:on println 
Example 8
Source File: LocalLR.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import breeze.linalg.{DenseVector, Vector}

object LocalLR {
  val N = 10000  // Number of data points
  val D = 10   // Number of dimensions
  val R = 0.7  // Scaling factor
  val ITERATIONS = 5
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def generateData: Array[DataPoint] = {
    def generatePoint(i: Int): DataPoint = {
      val y = if (i % 2 == 0) -1 else 1
      val x = DenseVector.fill(D) {rand.nextGaussian + y * R}
      DataPoint(x, y)

  def showWarning() {
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use
        |for more conventional use.

  def main(args: Array[String]) {


    val data = generateData
    // Initialize w to a random value
    var w = DenseVector.fill(D) {2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      var gradient = DenseVector.zeros[Double](D)
      for (p <- data) {
        val scale = (1 / (1 + math.exp(-p.y * ( - 1) * p.y
        gradient +=  p.x * scale
      w -= gradient

    println("Final w: " + w)
// scalastyle:on println 
Example 9
Source File: GroupByTest.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.sql.SparkSession

object GroupByTest {
  def main(args: Array[String]) {
    val spark = SparkSession
      .appName("GroupBy Test")

    val numMappers = if (args.length > 0) args(0).toInt else 2
    val numKVPairs = if (args.length > 1) args(1).toInt else 1000
    val valSize = if (args.length > 2) args(2).toInt else 1000
    val numReducers = if (args.length > 3) args(3).toInt else numMappers

    val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random
      val arr1 = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
    // Enforce that everything has been calculated and in cache


// scalastyle:on println 
Example 10
Source File: LocalFileLR.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import breeze.linalg.{DenseVector, Vector}

object LocalFileLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  /** Parses one space-separated line of numbers into a [[DataPoint]].
    *
    * The first number becomes the label `y`; the numbers at positions 1 to `D`
    * become the feature vector `x`.
    */
  def parsePoint(line: String): DataPoint = {
    val values = line.split(' ').map(field => field.toDouble)
    val features = values.slice(1, D + 1)
    val label = values(0)
    DataPoint(new DenseVector(features), label)
  }

  def showWarning() {
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use
        |for more conventional use.

  def main(args: Array[String]) {


    val lines =
    val points = _)
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D) {2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      var gradient = DenseVector.zeros[Double](D)
      for (p <- points) {
        val scale = (1 / (1 + math.exp(-p.y * ( - 1) * p.y
        gradient += p.x * scale
      w -= gradient

    println("Final w: " + w)
// scalastyle:on println 
Example 11
Source File: PageViewGenerator.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.streaming.clickstream

import java.util.Random

// scalastyle:on
object PageViewGenerator {
  // Page -> probability table used when generating click events.
  // NOTE(review): all three keys are the same empty string, so the resulting Map keeps
  // only one entry ("" -> .1) and the weights no longer add up to 1.0. The keys look
  // like they were meant to be three distinct page identifiers — TODO confirm and restore.
  val pages = Map("" -> .7,
                  "" -> 0.2,
                  "" -> .1)
  val httpStatus = Map(200 -> .95,
                       404 -> .05)
  val userZipCode = Map(94709 -> .5,
                        94117 -> .5)
  val userID = Map((1 to 100).map(_ -> .01): _*)

  /** Draws one key from `inputMap`, treating each value as that key's probability.
    *
    * A uniform random number in [0, 1) is compared against the running total of the
    * probabilities, accumulated in the map's iteration order; the first key whose
    * running total exceeds the number is selected.
    *
    * @param inputMap keys to choose from, mapped to their probabilities
    * @tparam T the key type
    * @return the selected key, or the map's first key if no running total exceeds the draw
    * @throws NoSuchElementException if `inputMap` is empty
    */
  def pickFromDistribution[T](inputMap: Map[T, Double]): T = {
    val rand = new Random().nextDouble()
    val entries = inputMap.toSeq
    // Running totals, summed in the same order the original loop used.
    val runningTotals = entries.scanLeft(0.0)((acc, entry) => acc + entry._2).tail
    // collectFirst replaces the `return` that used to sit inside the for-loop closure,
    // which the compiler implements as a non-local return via a thrown control exception.
    entries
      .zip(runningTotals)
      .collectFirst { case ((item, _), total) if total > rand => item }
      .getOrElse(inputMap.head._1) // Shouldn't get here if probabilities add up to 1.0
  }

  /** Builds the text form of one randomly generated page view.
    *
    * The user id, page, HTTP status and zip code are each drawn through
    * `pickFromDistribution` from their own table, then combined into a `PageView`.
    */
  def getNextClickEvent(): String = {
    val userId = pickFromDistribution(userID)
    val url = pickFromDistribution(pages)
    val statusCode = pickFromDistribution(httpStatus)
    val zip = pickFromDistribution(userZipCode)
    val view = new PageView(url, statusCode, zip, userId)
    view.toString()
  }

  def main(args: Array[String]) {
    if (args.length != 2) {
      System.err.println("Usage: PageViewGenerator <port> <viewsPerSecond>")
    val port = args(0).toInt
    val viewsPerSecond = args(1).toFloat
    val sleepDelayMs = (1000.0 / viewsPerSecond).toInt
    val listener = new ServerSocket(port)
    println("Listening on port: " + port)

    while (true) {
      val socket = listener.accept()
      new Thread() {
        override def run(): Unit = {
          println("Got client connected from: " + socket.getInetAddress)
          val out = new PrintWriter(socket.getOutputStream(), true)

          while (true) {
// scalastyle:on println 
Example 12
Source File: SparkLR.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{DenseVector, Vector}

import org.apache.spark.sql.SparkSession

object SparkLR {
  val N = 10000  // Number of data points
  val D = 10   // Number of dimensions
  val R = 0.7  // Scaling factor
  val ITERATIONS = 5
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def generateData: Array[DataPoint] = {
    def generatePoint(i: Int): DataPoint = {
      val y = if (i % 2 == 0) -1 else 1
      val x = DenseVector.fill(D) {rand.nextGaussian + y * R}
      DataPoint(x, y)

  def showWarning() {
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use
        |for more conventional use.

  def main(args: Array[String]) {


    val spark = SparkSession

    val numSlices = if (args.length > 0) args(0).toInt else 2
    val points = spark.sparkContext.parallelize(generateData, numSlices).cache()

    // Initialize w to a random value
    var w = DenseVector.fill(D) {2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = { p =>
        p.x * (1 / (1 + exp(-p.y * ( - 1) * p.y
      }.reduce(_ + _)
      w -= gradient

    println("Final w: " + w)

// scalastyle:on println 
Example 13
Source File: LocalKMeans.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.collection.mutable.HashMap
import scala.collection.mutable.HashSet

import breeze.linalg.{squaredDistance, DenseVector, Vector}

object LocalKMeans {
  val N = 1000
  val R = 1000    // Scaling factor
  val D = 10
  val K = 10
  val convergeDist = 0.001
  val rand = new Random(42)

  def generateData: Array[DenseVector[Double]] = {
    def generatePoint(i: Int): DenseVector[Double] = {
      DenseVector.fill(D) {rand.nextDouble * R}

  def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = {
    var index = 0
    var bestIndex = 0
    var closest = Double.PositiveInfinity

    for (i <- 1 to centers.size) {
      val vCurr = centers.get(i).get
      val tempDist = squaredDistance(p, vCurr)
      if (tempDist < closest) {
        closest = tempDist
        bestIndex = i


  def showWarning() {
      """WARN: This is a naive implementation of KMeans Clustering and is given as an example!
        |Please use
        |for more conventional use.

  def main(args: Array[String]) {


    val data = generateData
    var points = new HashSet[Vector[Double]]
    var kPoints = new HashMap[Int, Vector[Double]]
    var tempDist = 1.0

    while (points.size < K) {

    val iter = points.iterator
    for (i <- 1 to points.size) {

    println("Initial centers: " + kPoints)

    while(tempDist > convergeDist) {
      var closest = (p => (closestPoint(p, kPoints), (p, 1)))

      var mappings = closest.groupBy[Int] (x => x._1)

      var pointStats = { pair =>
        pair._2.reduceLeft [(Int, (Vector[Double], Int))] {
          case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2))

      var newPoints = {mapping =>
        (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))}

      tempDist = 0.0
      for (mapping <- newPoints) {
        tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2)

      for (newP <- newPoints) {
        kPoints.put(newP._1, newP._2)

    println("Final centers: " + kPoints)
// scalastyle:on println 
Example 14
Source File: StopwatchSuite.scala    From drizzle-spark   with Apache License 2.0

import java.util.Random

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class StopwatchSuite extends SparkFunSuite with MLlibTestSparkContext {

  import StopwatchSuite._

  private def testStopwatchOnDriver(sw: Stopwatch): Unit = {
    assert( === "sw")
    assert(sw.elapsed() === 0L)
    intercept[AssertionError] {
    val duration = checkStopwatch(sw)
    val elapsed = sw.elapsed()
    assert(elapsed === duration)
    val duration2 = checkStopwatch(sw)
    val elapsed2 = sw.elapsed()
    assert(elapsed2 === duration + duration2)
    assert(sw.toString === s"sw: ${elapsed2}ms")
    intercept[AssertionError] {

  test("LocalStopwatch") {
    val sw = new LocalStopwatch("sw")

  test("DistributedStopwatch on driver") {
    val sw = new DistributedStopwatch(sc, "sw")

  test("DistributedStopwatch on executors") {
    val sw = new DistributedStopwatch(sc, "sw")
    val rdd = sc.parallelize(0 until 4, 4)
    val acc = sc.longAccumulator
    rdd.foreach { i =>
    val elapsed = sw.elapsed()
    assert(elapsed === acc.value)

  test("MultiStopwatch") {
    val sw = new MultiStopwatch(sc)
    assert(sw("local").name === "local")
    assert(sw("spark").name === "spark")
    intercept[NoSuchElementException] {
    assert(sw.toString === "{\n  local: 0ms,\n  spark: 0ms\n}")
    val localDuration = checkStopwatch(sw("local"))
    val sparkDuration = checkStopwatch(sw("spark"))
    val localElapsed = sw("local").elapsed()
    val sparkElapsed = sw("spark").elapsed()
    assert(localElapsed === localDuration)
    assert(sparkElapsed === sparkDuration)
    assert(sw.toString ===
      s"{\n  local: ${localElapsed}ms,\n  spark: ${sparkElapsed}ms\n}")
    val rdd = sc.parallelize(0 until 4, 4)
    val acc = sc.longAccumulator
    rdd.foreach { i =>
      val duration = checkStopwatch(sw("spark"))
    val localElapsed2 = sw("local").elapsed()
    assert(localElapsed2 === localElapsed)
    val sparkElapsed2 = sw("spark").elapsed()
    assert(sparkElapsed2 === sparkElapsed + acc.value)

private object StopwatchSuite extends SparkFunSuite {

  private def now: Long = System.currentTimeMillis()
Example 15
Source File: PartitionwiseSampledRDD.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.random.RandomSampler
import org.apache.spark.util.Utils

class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index

private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[T],
    sampler: RandomSampler[T, U],
    preservesPartitioning: Boolean,
    @transient private val seed: Long = Utils.random.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T] => new PartitionwiseSampledRDDPartition(x, random.nextLong()))

  override def getPreferredLocations(split: Partition): Seq[String] =

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
Example 16
Source File: CsvKafkaPublisher.scala    From Taxi360   with Apache License 2.0
package com.hadooparchitecturebook.taxi360.common

import java.util.Random

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}


object CsvKafkaPublisher {

  var counter = 0
  var salts = 0

  def main(args:Array[String]): Unit = {
    if (args.length == 0) {
      println("<brokerList> " +
        "<topicName> " +
        "<dataFolderOrFile> " +
        "<sleepPerRecord> " +
        "<acks> " +
        "<> " +
        "<producer.type> " +
        "<batch.size> " +

    val kafkaBrokerList = args(0)
    val kafkaTopicName = args(1)
    val nyTaxiDataFolder = args(2)
    val sleepPerRecord = args(3).toInt
    val acks = args(4).toInt
    val lingerMs = args(5).toInt
    val producerType = args(6) //"async"
    val batchSize = args(7).toInt
    salts = args(8).toInt

    val kafkaProducer = KafkaProducerUntil.getNewProducer(kafkaBrokerList, acks, lingerMs, producerType, batchSize)

    println("--Input:" + nyTaxiDataFolder)

    val dataFolder = new File(nyTaxiDataFolder)
    if (dataFolder.isDirectory) {
      val files = dataFolder.listFiles().iterator
      files.foreach(f => {
        println("--Input:" + f)
        processFile(f, kafkaTopicName, kafkaProducer, sleepPerRecord)
    } else {
      println("--Input:" + dataFolder)
      processFile(dataFolder, kafkaTopicName, kafkaProducer, sleepPerRecord)

  def processFile(file:File, kafkaTopicName:String,
                  kafkaProducer: KafkaProducer[String, String], sleepPerRecord:Int): Unit = {
    var counter = 0
    val r = new Random()

    println("-Starting Reading")
    Source.fromFile(file).getLines().foreach(l => {
      counter += 1
      if (counter % 10000 == 0) {
        println("{Sent:" + counter + "}")
      if (counter % 100 == 0) {

      val saltedVender = r.nextInt(salts) + l

      if (counter > 2) {
        publishTaxiRecord(saltedVender, kafkaTopicName, kafkaProducer)

  def publishTaxiRecord(line:String, kafkaTopicName:String, kafkaProducer: KafkaProducer[String, String]): Unit = {

    if (line.startsWith("vendor_name") || line.length < 10) {
    } else {
      val message = new ProducerRecord[String, String](kafkaTopicName, line.hashCode.toString, line)

Example 17
Source File: ExtremeSummarizerSpec.scala    From flint   with Apache License 2.0
package com.twosigma.flint.timeseries.summarize.summarizer

import com.twosigma.flint.rdd.function.summarize.summarizer.Summarizer
import com.twosigma.flint.timeseries.row.Schema
import com.twosigma.flint.timeseries.summarize.{ SummarizerFactory, SummarizerSuite }
import com.twosigma.flint.timeseries.{ CSV, Summarizers, TimeSeriesRDD, TimeSeriesSuite }
import org.apache.spark.sql.types.{ DataType, DoubleType, FloatType, IntegerType, LongType, StructType }
import java.util.Random

import org.apache.spark.sql.Row

class ExtremeSummarizerSpec extends SummarizerSuite {

  override val defaultResourceDir: String = "/timeseries/summarize/summarizer/meansummarizer"

  private def test[T](
    dataType: DataType,
    randValue: Row => Any,
    summarizer: String => SummarizerFactory,
    reduceFn: (T, T) => T,
    inputColumn: String,
    outputColumn: String
  ): Unit = {
    val priceTSRdd = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType)).addColumns(
      inputColumn -> dataType -> randValue

    val data = priceTSRdd.collect().map{ row => row.getAs[T](inputColumn) }

    val trueExtreme = data.reduceLeft[T]{ case (x, y) => reduceFn(x, y) }

    val result = priceTSRdd.summarize(summarizer(inputColumn))

    val extreme = result.first().getAs[T](outputColumn)
    val outputType = result.schema(outputColumn).dataType

    assert(outputType == dataType, s"$outputType")
    assert(trueExtreme === extreme, s"extreme: $extreme, trueExtreme: $trueExtreme, data: ${data.toSeq}")

  "MaxSummarizer" should "compute double max correctly" in {
    val rand = new Random()
    test[Double](DoubleType, { _: Row => rand.nextDouble() }, Summarizers.max, math.max, "x", "x_max")

  it should "compute long max correctly" in {
    val rand = new Random()
    test[Long](LongType, { _: Row => rand.nextLong() }, Summarizers.max, math.max, "x", "x_max")

  it should "compute float max correctly" in {
    val rand = new Random()
    test[Float](FloatType, { _: Row => rand.nextFloat() }, Summarizers.max, math.max, "x", "x_max")

  it should "compute int max correctly" in {
    val rand = new Random()
    test[Int](IntegerType, { _: Row => rand.nextInt() }, Summarizers.max, math.max, "x", "x_max")

  "MinSummarizer" should "compute double min correctly" in {
    val rand = new Random()
    test[Double](DoubleType, { _: Row => rand.nextDouble() }, Summarizers.min, math.min, "x", "x_min")

  it should "compute long min correctly" in {
    val rand = new Random()
    test[Long](LongType, { _: Row => rand.nextLong() }, Summarizers.min, math.min, "x", "x_min")

  it should "compute float min correctly" in {
    val rand = new Random()
    test[Float](FloatType, { _: Row => rand.nextFloat() }, Summarizers.min, math.min, "x", "x_min")

  it should "compute int min correctly" in {
    val rand = new Random()
    test[Int](IntegerType, { _: Row => rand.nextInt() }, Summarizers.min, math.min, "x", "x_min")

  Example 18

  it should "ignore null values" in {
    val input = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType))
    val inputWithNull = insertNullRows(input, "price")

Example 18
Source File: WithdrawalEpochCertificateFixture.scala    From Sidechains-SDK   with MIT License
package com.horizen.block

import java.util.Random

trait WithdrawalEpochCertificateFixture {
  private def getBytes(len: Int = 32, rnd: Random = new Random()): Array[Byte] = {
    val bytes = new Array[Byte](len)

  def generateWithdrawalEpochCertificate(previousMcBlockHashOpt: Option[Array[Byte]] = None, rnd: Random = new Random()): WithdrawalEpochCertificate = {
Example 19
Source File: GenerationRules.scala    From Sidechains-SDK   with MIT License
package com.horizen.fixtures.sidechainblock.generation

import java.util.Random

import scorex.util.ModifierId

case class GenerationRules(forgingBoxesToAdd: Set[SidechainForgingData] = Set(),
                           forgingBoxesToSpent: Set[SidechainForgingData] = Set(),
                           mcReferenceIsPresent: Option[Boolean] = None,
                           corruption: CorruptedGenerationRules = CorruptedGenerationRules.emptyCorruptedGenerationRules,
                           forcedParentId: Option[ModifierId] = None,
                           forcedTimestamp: Option[Long] = None
                         ) {
  /**
   * Reports whether these rules carry any corruption.
   *
   * `corruption` defaults to `CorruptedGenerationRules.emptyCorruptedGenerationRules`, so the rules
   * are corrupted exactly when `corruption` differs from that empty value. The previous
   * implementation compared with `==` and therefore answered `true` for uncorrupted rules.
   */
  def isCorrupted: Boolean = corruption != CorruptedGenerationRules.emptyCorruptedGenerationRules

object GenerationRules {
  // Builds a GenerationRules value with default (empty) corruption: chooses forging data to add
  // and forging data to spend, and requires that everything to spend is in allNotSpentForgerData.
  def generateCorrectGenerationRules(rnd: Random, allNotSpentForgerData: Set[SidechainForgingData]): GenerationRules = {
    // One freshly generated entry when more than 100 unspent entries exist, otherwise two.
    // rnd.nextInt(1000000) is already in [0, 1000000), so Math.abs does not change the value.
    val addForgingData: Set[SidechainForgingData] =
      if (allNotSpentForgerData.size > 100) {
        Set(SidechainForgingData.generate(rnd, Math.abs(rnd.nextInt(1000000))))
      else {
        Set(SidechainForgingData.generate(rnd, Math.abs(rnd.nextInt(1000000))), SidechainForgingData.generate(rnd, Math.abs(rnd.nextInt(1000000))))

    // NOTE(review): the bodies of both branches are not visible in this excerpt; only the
    // deleteSize choice (10 above 100 unspent entries, else 1) can be seen.
    val removedForgingData: Set[SidechainForgingData] =
      if (rnd.nextBoolean()) {
      else {
        val deleteSize = if (allNotSpentForgerData.size > 100) 10 else 1

    // Fails with IllegalArgumentException if anything to spend is not currently unspent.
    require((removedForgingData -- allNotSpentForgerData).isEmpty)

    GenerationRules(forgingBoxesToAdd = addForgingData, forgingBoxesToSpent = removedForgingData)
Example 20
Source File: SidechainForgingData.scala    From Sidechains-SDK   with MIT License
package com.horizen.fixtures.sidechainblock.generation

import java.util.Random

import com.horizen.consensus._
import com.horizen.proof.VrfProof
import com.horizen.proposition.VrfPublicKey
import com.horizen.secret.{PrivateKey25519, PrivateKey25519Creator, VrfKeyGenerator, VrfSecretKey}
import com.horizen.vrf.VrfOutput

case class SidechainForgingData(key: PrivateKey25519, forgerBox: ForgerBox, vrfSecret: VrfSecretKey) {
  // Proves vrfMessage with this forger's VRF secret and returns Some((proof, output)) only when
  // the stake check, post-processed by additionalCheck, yields true; otherwise None.
  def canBeForger(vrfMessage: VrfMessage, totalStake: Long, additionalCheck: Boolean => Boolean): Option[(VrfProof, VrfOutput)] = {
    val vrfProofAndHash = vrfSecret.prove(vrfMessage)
    val vrfProof = vrfProofAndHash.getKey
    val vrfOutput = vrfProofAndHash.getValue

    // additionalCheck receives the Boolean produced by stakeCheck and may override it.
    val checker = (stakeCheck _).tupled.andThen(additionalCheck)
    // The pattern variables shadow the outer vrfProof / vrfOutput; they hold the same values.
    Some((vrfProof, vrfOutput)).filter{case (vrfProof, vrfOutput) => checker(vrfOutput, totalStake)}

  // Delegates to vrfProofCheckAgainstStake with this forger box's value as the stake argument.
  private def stakeCheck(vrfOutput: VrfOutput, totalStake: Long): Boolean = {
    vrfProofCheckAgainstStake(vrfOutput, forgerBox.value(), totalStake)

  val forgerId: Array[Byte] =

  override def toString: String = {
    s"id - ${key.hashCode()}, value - ${forgerBox.value()}"

  // Two SidechainForgingData are equal when key, forgerBox and vrfSecret are all equal according
  // to their own equals methods.
  // NOTE(review): no matching hashCode override is visible in this excerpt — confirm one exists.
  override def equals(obj: Any): Boolean = {
    obj match {
      case that: SidechainForgingData => {
        val keyEquals = this.key.equals(that.key)
        val forgerBoxEquals = this.forgerBox.equals(that.forgerBox)
        val vrfSecretEquals = this.vrfSecret.equals(that.vrfSecret)

        keyEquals && forgerBoxEquals && vrfSecretEquals
      // Result for any other type is not visible in this excerpt.
      case _ =>

object SidechainForgingData {
  // Creates forging data holding `value`. Three longs are drawn from rnd, in this order:
  // the seed for the PrivateKey25519, the seed for the VRF secret, and the getBox argument.
  // Both seeds are the bytes of the long's decimal string.
  def generate(rnd: Random, value: Long): SidechainForgingData = {
    val key: PrivateKey25519 = PrivateKey25519Creator.getInstance().generateSecret(rnd.nextLong().toString.getBytes)
    val vrfSecretKey = VrfKeyGenerator.getInstance().generateSecret(rnd.nextLong().toString.getBytes())
    val vrfPublicKey: VrfPublicKey = vrfSecretKey.publicImage();
    // key.publicImage() is passed as both the first and the third ForgerBoxData argument.
    val forgerBox = new ForgerBoxData(key.publicImage(), value, key.publicImage(), vrfPublicKey).getBox(rnd.nextLong())

    SidechainForgingData(key, forgerBox, vrfSecretKey)
Example 21
Source File: ForgerBoxFixture.scala    From Sidechains-SDK   with MIT License
package com.horizen.fixtures

import java.util.Random

import com.horizen.proposition.VrfPublicKey
import com.horizen.secret.{PrivateKey25519, VrfKeyGenerator, VrfSecretKey}
import com.horizen.utils
import com.horizen.utils.Ed25519

// Secrets used while generating a forger box: the proposition secret, the block-signing secret
// and the VRF secret. Returned alongside the box by ForgerBoxFixture.generateForgerBox.
case class ForgerBoxGenerationMetadata(propositionSecret: PrivateKey25519, blockSignSecret: PrivateKey25519, vrfSecret: VrfSecretKey)

object ForgerBoxFixture {
  // Convenience overload: generates a forger box from `seed` without caller-supplied VRF keys.
  def generateForgerBox(seed: Long): (ForgerBox, ForgerBoxGenerationMetadata) = generateForgerBox(seed, None)

  // Generates a forger box and the secrets behind it. `seed` initialises the Random that supplies
  // the box value and nonce. When vrfKeysOpt is empty, the VRF key pair is derived from the
  // owner key bytes.
  def generateForgerBox(seed: Long,
                        vrfKeysOpt: Option[(VrfSecretKey, VrfPublicKey)]): (ForgerBox, ForgerBoxGenerationMetadata) = {
    val randomGenerator = new Random(seed)
    // NOTE(review): no visible line fills byteSeed before it is used; a line may be missing from
    // this excerpt — confirm against the full source.
    val byteSeed = new Array[Byte](32)
    val propositionKeyPair: utils.Pair[Array[Byte], Array[Byte]] = Ed25519.createKeyPair(byteSeed)
    val ownerKeys: PrivateKey25519 = new PrivateKey25519(propositionKeyPair.getKey, propositionKeyPair.getValue)
    // nextLong covers the full Long range, so the value may be negative.
    val value: Long = randomGenerator.nextLong
    val (vrfSecret, vrfPubKey) = vrfKeysOpt.getOrElse{
      val secretKey = VrfKeyGenerator.getInstance().generateSecret(ownerKeys.bytes())
      val publicKey = secretKey.publicImage()
        (secretKey, publicKey)
    val proposition = ownerKeys.publicImage()

    // The same proposition is passed as both the first and the third ForgerBoxData argument.
    val forgerBoxData = new ForgerBoxData(proposition, value, proposition, vrfPubKey)
    val nonce: Long = randomGenerator.nextLong

    val forgerBox = forgerBoxData.getBox(nonce)
    // ownerKeys serves as both propositionSecret and blockSignSecret in the metadata.
    (forgerBox, ForgerBoxGenerationMetadata(ownerKeys, ownerKeys, vrfSecret))
Example 22
Source File: HistoryConsensusCheckerTest.scala    From Sidechains-SDK   with MIT License
package com.horizen.consensus

import java.util.Random

import com.horizen.SidechainHistory
import com.horizen.fixtures.sidechainblock.generation._
import com.horizen.params.{NetworkParams, TestNetParams}
import org.junit.Test
import org.scalatest.junit.JUnitSuite

import scala.collection.mutable
import scala.util.{Failure, Success, Try}

class HistoryConsensusCheckerTest extends JUnitSuite with HistoryConsensusChecker {

  // Starts a sidechain from testSeed, then runs 50 fold steps. Each step picks a generator,
  // derives correct generation rules, first tries to add incorrect blocks and then a correct one,
  // threading the updated history and generator list through the fold.
  def testWithSeed(testSeed: Int): Unit = {
    //val testSeed = 234
    val rnd: Random = new Random(testSeed)

    val initialParams = TestNetParams(consensusSlotsInEpoch = 10, sidechainGenesisBlockTimestamp = 1333344452L)
    val (params, genesisBlock, genesisGenerator, genesisForgingData, genesisEndEpochInfo) = SidechainBlocksGenerator.startSidechain(10000000000L, testSeed, initialParams)
    val history: SidechainHistory = createHistory(params, genesisBlock, genesisEndEpochInfo)
    val nonce = history.calculateNonceForEpoch(blockIdToEpochId(
    val stake = genesisEndEpochInfo.stakeConsensusEpochInfo
    history.applyFullConsensusInfo(, FullConsensusEpochInfo(stake, nonce))
    println(s"//////////////// Genesis epoch ${} had been ended ////////////////")

    val generators = mutable.IndexedSeq(genesisGenerator)

    (1 to 50)
      .foldLeft[(SidechainHistory, mutable.IndexedSeq[SidechainBlocksGenerator])]((history, generators)) { (acc, index) =>
        val currentHistory: SidechainHistory = acc._1
        val currentGenerators: mutable.IndexedSeq[SidechainBlocksGenerator] =  acc._2

        val nextGenerator: SidechainBlocksGenerator = generatorSelection(rnd, currentGenerators)
        val nextCorrectGenerationRules: GenerationRules = GenerationRules.generateCorrectGenerationRules(rnd, nextGenerator.getNotSpentBoxes)

        println("try to add incorrect block(s)")
        tryToAddIncorrectBlocks(params, currentHistory, nextGenerator, nextCorrectGenerationRules, rnd)
        println("try to add correct block")
        // NOTE(review): generateBlock receives the outer `history`, not the fold's
        // `currentHistory` — confirm this is intended.
        val correctRes = Try(generateBlock(nextCorrectGenerationRules, nextGenerator, history)) match {
          case Success((gens, generatedBlock)) =>
            val updatedHistory = historyUpdateShallBeSuccessful(currentHistory, generatedBlock)
            val updatedGenerators = currentGenerators ++ gens
            (updatedHistory, updatedGenerators)

          // Generation can legitimately run out; the value returned here is not visible in this excerpt.
          case Failure(ex: GenerationIsNoLongerPossible) =>
            println("Finishing block generation")

          // Any other failure is rethrown after logging.
          case Failure(ex) =>
            println("Error during block generation")
            throw ex


  // Builds incorrectBlocksCount (default 2) corrupted variants of correctGenerationRules and
  // expects the history update to fail for each resulting block.
  // NOTE(review): the body is wrapped in Try and the method returns Unit, so as visible here any
  // exception raised inside would be captured and discarded — confirm against the full source.
  private def tryToAddIncorrectBlocks(params: NetworkParams,
                                      currentHistory: SidechainHistory,
                                      currentGenerator: SidechainBlocksGenerator,
                                      correctGenerationRules: GenerationRules,
                                      rnd: Random,
                                      incorrectBlocksCount: Int = 2): Unit = Try {
    (1 to incorrectBlocksCount)
      .foreach{ _ =>
        val incorrectGenerationRules: GenerationRules = CorruptedGenerationRules.corruptGenerationRules(rnd, params, currentGenerator, correctGenerationRules)
        //println(s"Generated corruption rules are: ${incorrectGenerationRules}")
          .map(generationInfo => historyUpdateShallBeFailed(currentHistory,generationInfo.block, incorrectGenerationRules))

  // Runs testWithSeed for each index in the range, offset by the base seed. The range
  // (50 to 50) has a single element, so only seed 9084 + 50 = 9134 is exercised; widen the
  // range to cover more seeds.
  def testManySeeds(): Unit = {
    val seed = 9084

    (50 to 50).foreach{index =>
      println(s"SEED IS ${index}")
      testWithSeed(index + seed)

Example 23
Source File: DescriptiveStatsSuite.scala    From HANAVora-Extensions   with Apache License 2.0

import java.util.Random

import org.scalatest.FunSuite

// scalastyle:off magic.number
class DescriptiveStatsSuite extends FunSuite {
  val SEED = 123
  val SignificantPosCorrelation = 0.9

  test("mean") {
    val samples0 = Seq(1, 1)
    val samples1 = Seq(1, 2, 3, 4)
    val samples2 = Seq(1.1, 0.9, 1.0)
    val samples3 = Seq.empty[Int]
    val samples3mean = DescriptiveStats.mean(samples3)

  // Checks DescriptiveStats.stdev on 1000 zeros followed by 1000 twos (mean 1, every deviation
  // exactly 1). The accepted window (1.0, 1.001) excludes exactly 1.0, so the test presumably
  // expects n-1 normalisation, i.e. sqrt(2000/1999) ≈ 1.00025 — confirm against DescriptiveStats.
  // samples0 and samples1 have no visible assertions in this excerpt.
  test("stdev") {
    val samples0 = Seq(0, 2)
    val samples1 = Seq.empty[Double]
    val samples2 = Seq.fill(1000)(0) ++ Seq.fill(1000)(2)
    val samples2stdev = DescriptiveStats.stdev(samples2)
    assert(samples2stdev > 1.0 && samples2stdev < 1.001)

  // Checks that Pearson correlation is close to zero (|r| < 0.01) for 100000 pairs drawn from
  // a Random seeded with SEED, which makes the sample reproducible.
  // samples1, samples2 and samples4 have no visible assertions in this excerpt.
  test("pearson") {
    val rand = new Random(SEED)
    val samples1 = Seq((1, 1), (2, 2), (3, 3))
    val samples2 = Seq((3.0, 1), (2.0, 2), (1.0, 3))
    val samples3 = (1 to 100000).map { i => (rand.nextDouble(), rand.nextDouble()) }
    val samples4 = Seq.empty[(Double, Double)]
    assert(math.abs(DescriptiveStats.pearson(samples3)) < 0.01)

  // Checks that Spearman correlation is close to zero (|rho| < 0.01) for 100000 pairs drawn
  // from a Random seeded with SEED.
  // samples1, samples2 and samples4 have no visible assertions in this excerpt.
  test("spearman") {
    val rand = new Random(SEED)
    val samples1 = Seq((1, 1), (2, 2), (3, 3))
    val samples2 = Seq((3.0, 1), (2.0, 2), (1.0, 3))
    val samples3 = (1 to 100000).map { i => (rand.nextDouble(), rand.nextDouble()) }
    val samples4 = Seq.empty[(Double, Double)]
    assert(math.abs(DescriptiveStats.spearman(samples3)) < 0.01)

  // Contrasts the two coefficients on hand-written data. samples1 is an upward trend with
  // alternating noise; samples2 is monotone except for one large outlier (7 -> 2000.0).
  // The threshold is SignificantPosCorrelation (0.9).
  test("spearman & pearson w/ noise & outliers") {
    val samples1 = Seq((1, 300.0), (2, 250.0), (3, 400.0), (4, 350.0), (5, 500.0),
                       (6, 450.0), (7, 600.0), (8, 550.0), (9, 700.0), (10, 650.0))
    val samples2 = Seq((1, 300.0), (2, 350.0), (3, 400.0), (4, 450.0), (5, 500.0),
                       (6, 550.0), (7, 2000.0), (8, 700.0), (9, 750.0), (10, 800.0))
    // Both coefficients must exceed the threshold on the noisy trend.
    assert(DescriptiveStats.pearson(samples1) > SignificantPosCorrelation)
    assert(DescriptiveStats.spearman(samples1) > SignificantPosCorrelation)
    // pearson is less robust, does not detect dependency
    assert(DescriptiveStats.pearson(samples2) < SignificantPosCorrelation)
    // spearman detects dependency
    assert(DescriptiveStats.spearman(samples2) > SignificantPosCorrelation)

  test("spearman & pearson w/ real data") {

    val measure1 = Seq(379, 379, 382, 360, 378, 374, 364, 371, 360, 365, 364, 363, 369, 375, 365,
                       369, 358, 372, 370, 363, 363, 369, 361, 362, 367, 357, 365, 364, 363, 368,
                       360, 361, 360, 363, 359, 357, 365, 367, 364, 363)
    val measure2 = Seq(411, 379, 380, 382, 387, 404, 410, 431, 430, 444, 468, 489, 519, 573, 571,
                       620, 643, 657, 694, 711, 752, 783, 807, 841, 856, 891, 912, 962,1042, 982,
    val samples1 = measure1.zipWithIndex
    val samples2 = measure2.zipWithIndex

    assert(DescriptiveStats.pearson(samples1) < SignificantPosCorrelation)
    assert(DescriptiveStats.spearman(samples1) < SignificantPosCorrelation)
    assert(DescriptiveStats.pearson(samples2) > SignificantPosCorrelation)
    assert(DescriptiveStats.spearman(samples2) > SignificantPosCorrelation)

Example 24
Source File: VLBFGS1.scala    From spark-vl-bfgs   with Apache License 2.0

import java.util.Random

import scala.language.implicitConversions

import org.apache.hadoop.fs.{FileSystem, Path}

import org.apache.spark.{SparkConf, SparkContext}
import{Oracle, VectorSpace}
import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors}
import org.apache.spark.mllib.random.RandomRDDs
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.{RDD, UnionRDD}

  // For every (block of labelled points, vector x) pair in the cartesian product, accumulates
  // g += err * a over the block's points, starting from a zero vector of x's size.
  // NOTE(review): the expression defining `err` is garbled in this excerpt (only "x) - b"
  // survives); presumably a dot product of a and x minus the label b — confirm.
  private def gradient(data: RDD[Array[LabeledPoint]], dx: RDD[Vector]): RDD[Vector] = {
    data.cartesian(dx).map { case (points, x) =>
      val g = Vectors.zeros(x.size)
      points.foreach { case LabeledPoint(b, a) =>
        val err =, x) - b
        BLAS.axpy(err, a, g)

  // Demo entry point: builds a synthetic regression problem in a local Spark context, solves it
  // with `solve` and prints the exact and the recovered coefficient vectors. `args` is not used
  // in the visible lines.
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("VLBFGS").setMaster("local[*]")
    val sc = new SparkContext(conf)
    // n and p are passed below as the numRows and numCols arguments of normalVectorRDD.
    val n = 1000
    val p = 100
    // Fixed seed, so xExact is reproducible.
    val random = new Random(0L)
    val xExact = Vectors.dense(Array.fill(p)(random.nextDouble()))
    // Standard-normal vectors in 4 partitions (RDD seed 11L). Each partition gets its own
    // Random seeded with 100 + partition index, used below for the 0.1 * Gaussian noise term.
    // NOTE(review): the lines building `target` are garbled in this excerpt.
    val data = RandomRDDs.normalVectorRDD(sc, n, p, 4, 11L).mapPartitionsWithIndex { (idx, part) =>
      val random = new Random(100 + idx) { v =>
        val target =, xExact) + 0.1 * random.nextGaussian()
        LabeledPoint(target, v)

    val x = solve(data).first()

    println(s"x_exact = $xExact")
    println(s"x_vlbfgs = $x")

Example 25
Source File: ProjectionsTest.scala    From spark-tda   with Apache License 2.0
package org.apache.spark.mllib.linalg

import java.util.Random
import org.scalacheck.Gen
import org.scalacheck.Prop.forAllNoShrink
import org.scalatest.Matchers
import org.scalatest.prop.GeneratorDrivenPropertyChecks
import org.scalatest.prop.Checkers.check

class ProjectionsTest
    extends LinalgPropSpec
    with GeneratorDrivenPropertyChecks
    with Matchers {
  import org.scalactic.Tolerance._
  val dimGen = for {
    srcDim <- Gen.choose(100, 200)
    dstDim <- Gen.choose(100, 200)
  } yield (srcDim, dstDim)

  // For generated (srcDim, dstDim) pairs in [100, 200], builds a Gaussian random projection from
  // an unseeded Random and compares its mean to 0.0 and its stddev to 1.0, each within 0.01.
  // NOTE(review): as visible here the `===` comparisons are bare Boolean expressions, so the
  // first result is discarded and neither is wrapped in assert/should/check — confirm they are
  // actually enforced in the full source.
  property("gaussian random projection have good statistical properties") {
    forAllNoShrink(dimGen) {
      case (srcDim, dstDim) =>
        val projection =
          GaussianRandomProjection(srcDim, dstDim, new Random())
        projection.mean === 0.0 +- 0.01
        projection.stddev === 1.0 +- 0.01

  // For generated (srcDim, dstDim) pairs in [100, 200], builds a Cauchy random projection and
  // compares its median to 0.0 within 0.01. Unlike the Gaussian case, no Random is passed here.
  // NOTE(review): the `===` comparison is a bare Boolean expression — confirm it is enforced.
  property("cauchy random projection have good statistical properties") {
    forAllNoShrink(dimGen) {
      case (srcDim, dstDim) =>
        val projection =
          CauchyRandomProjection(srcDim, dstDim)
        projection.median === 0.0 +- 0.01
Example 26
package com.stefansavev.similaritysearch.implementation

import java.util
import java.util.Random

import com.stefansavev.randomprojections.utils.RandomUtils
import com.stefansavev.similaritysearch.{SimilaritySearchIndex, SimilaritySearchResult, SimilaritySearchResultBuilder, SimilaritySearchResults}

object FuzzySearchEvaluationUtilsWrapper {

  // Samples up to numQueries item ids from the index (RandomUtils.sample over 0 until
  // itemNames.length) and registers one result entry per sampled item name in a
  // SimilaritySearchResultBuilder.
  // NOTE(review): the line defining itemNames is garbled, and in the visible lines queryVector
  // is unused and queryResults is still empty when added — lines may be missing from this excerpt.
  def generateRandomTestSet(rnd: Random, numQueries: Int, index: SimilaritySearchIndex): SimilaritySearchResults = {
    // scala.collection.JavaConversions (implicit conversions) is deprecated in newer Scala
    // versions in favour of explicit .asScala / .asJava converters.
    import scala.collection.JavaConversions._
    val itemNames =

    val sampleIds = RandomUtils.sample(rnd, numQueries, Array.range(0, itemNames.length))
    val builder = new SimilaritySearchResultBuilder()
    for (id <- sampleIds) {
      val queryId = itemNames(id)
      val queryVector = index.getItemByName(queryId).getVector
      val queryResults = new util.ArrayList[SimilaritySearchResult]()
      builder.addResult(queryId, queryResults)

Example 27
Example 27
package com.stefansavev.randomprojections.datarepr.dense

import java.util.Random

import com.stefansavev.randomprojections.datarepr.sparse.SparseVector
import com.stefansavev.randomprojections.implementation.{Signatures, PointSignatures}

// Thin wrapper around an Array[Int] of point indexes.
class PointIndexes(val indexes: Array[Int]){
  // Tuple form used with PointIndexes.fromTuple; the first component is always the dummy 0.
  def toTuple: PointIndexes.TupleType = (0, indexes)
  // Number of wrapped indexes.
  def size = indexes.length
  // The i-th wrapped index; out-of-range i fails like the underlying array access.
  def apply(i: Int): Int = indexes(i)

// Companion: factory, extractor and tuple conversion for PointIndexes.
object PointIndexes{
  type TupleType = (Int, Array[Int]) //the first is dummy because I need to add a tuple1
  // Wraps the given array without copying it.
  def apply(indexes: Array[Int]): PointIndexes = new PointIndexes(indexes)
  // Always succeeds, exposing the wrapped array for pattern matching.
  def unapply(pntIndexes: PointIndexes): Option[Array[Int]] = Some(pntIndexes.indexes)
  // Rebuilds an instance from the tuple form, ignoring the dummy first component.
  def fromTuple(t: TupleType): PointIndexes = new PointIndexes(t._2)

class DataFrameView(val indexes: PointIndexes, val rowStoredView: RowStoredMatrixView) {
  def toTuple:DataFrameView.TupleType = (indexes, rowStoredView)

  var pointSignatures: PointSignatures = null //work around until the concept is validated
  def numRows: Int = indexes.size
  def numCols: Int = rowStoredView.numCols

  def setPointSignatures(pointSignatures: PointSignatures): Unit = {
    this.pointSignatures = pointSignatures

  def getRowIdByName(name: String): Int = {

  // Computes point signatures for this view via Signatures.computePointSignatures.
  // Throws IllegalStateException when pointSignatures is already set (non-null).
  // NOTE(review): the line that stores the computed signatures is not visible in this excerpt.
  def buildSetSignatures(numSignatures: Int, rnd: Random): Unit = {
    if (pointSignatures != null){
      throw new IllegalStateException("Signatures cannot be overwritten")
    val (signatureVecs, signatures) = Signatures.computePointSignatures(numSignatures, rnd, this)

  def getPointSignatures(): PointSignatures = {

  def getAllRowNames(): Array[String] = {

  def getPointAsDenseVector(pntId: Int): Array[Double] = {

  def getPointAsDenseVector(pntId: Int, columnIds: Array[Int], vec: Array[Double]): Unit = {
    rowStoredView.getPointAsDenseVector(pntId, columnIds, vec)

  def multiplyRowComponentWiseBySparseVector(pntId: Int, sv: SparseVector, output: Array[Double]): Unit = {
    rowStoredView.multiplyRowComponentWiseBySparseVector(pntId, sv, output)

  def getUnderlyingIndexes(): PointIndexes = indexes

  def childView(newIndexes: PointIndexes): DataFrameView = {
    new DataFrameView(newIndexes, rowStoredView)

  def getLabel(rowId: Int): Int = rowStoredView.getLabel(rowId)

  def getAllLabels(): Array[Int] = rowStoredView.getAllLabels()

  def getName(rowId: Int): String = {

  //def dist(id1: Int, id2: Int): Double = rowStoredView.dist(id1, id2)

  def cosineForNormalizedData(query: Array[Double], id: Int): Double = rowStoredView.cosineForNormalizedData(query, id)

  override def toString = s"DataFrameView($numRows, $numCols)"

object DataFrameView{
  type TupleType = (PointIndexes, RowStoredMatrixView)
  def fromTuple(t: TupleType) = new DataFrameView(t._1, t._2)
Example 28
Source File: RandomUtils.scala    From random-projections-at-berlinbuzzwords   with Apache License 2.0
package com.stefansavev.randomprojections.utils

import java.util.Random

import com.stefansavev.randomprojections.buffers.IntArrayBuffer
import com.stefansavev.randomprojections.datarepr.sparse.SparseVector

object RandomUtils {
  def shuffleInts(rnd: Random, arr: Array[Int]): Array[Int] = {
    val values = => (v, rnd.nextDouble())).sortBy(_._2).map(_._1)

  def shuffleDoubles(rnd: Random, arr: Array[Double]): Array[Double] = {
    val values = => (v, rnd.nextDouble())).sortBy(_._2).map(_._1)

  // Returns 1.0 when rnd.nextDouble() is strictly greater than 0.5, otherwise -1.0.
  // Note: generateRandomVector in this object uses >= 0.5 for the same kind of draw.
  def sign(rnd: Random): Double = {
    if (rnd.nextDouble() > 0.5) 1.0 else -1.0

  // Builds a SparseVector over columnIds with random +1.0/-1.0 entries scaled to unit Euclidean
  // norm. NOTE(review): the start of the `signs` expression is garbled in this excerpt;
  // presumably one sign is drawn per column id — confirm.
  def generateRandomVector(rnd: Random, numCols: Int, columnIds: Array[Int]): SparseVector = {
    val signs = => (if (rnd.nextDouble() >= 0.5) 1.0 else -1.0))
    // First pass: sum of squares, then its square root gives the Euclidean norm.
    var sum = 0.0
    var i = 0
    while (i < signs.length) {
      val v = signs(i)
      sum += v * v
      i += 1
    sum = Math.sqrt(sum)

    // Second pass: divide every entry by the norm, in place. An empty `signs` skips both loops.
    i = 0
    while (i < signs.length) {
      signs(i) /= sum
      i += 1

    val sparseVec = new SparseVector(numCols, columnIds, signs)

  def generateRandomVector(rnd: Random, numCols: Int): SparseVector = {
    generateRandomVector(rnd, numCols, Array.range(0, numCols))

  //TODO: use a version of reservoir sampling together with random shuffle
  // Draws up to k positions from arr without repeating a position, and without modifying arr.
  // Each step picks a random position below currentLength, records its effective value,
  // redirects that position to the value at the current last position (via the overWrites map)
  // and shrinks the range by one. Stops after k draws or when the range is exhausted.
  def sample(rnd: Random, k: Int, arr: Array[Int]): Array[Int] = {
    // Effective value at `index`: the overWrites entry if present, otherwise (presumably) the
    // array element — both branch bodies are missing from this excerpt.
    def getValue(arr: Array[Int], overWrites: scala.collection.mutable.HashMap[Int, Int], index: Int): Int = {
      if (overWrites.contains(index)) {
      } else {
    var currentLength = arr.length
    val buffer = new IntArrayBuffer()
    val overWrites = new scala.collection.mutable.HashMap[Int, Int]()
    var i = 0
    while (i < k && currentLength > 0) {
      val nextPos = rnd.nextInt(currentLength)
      val sampledValue = getValue(arr, overWrites, nextPos)
      buffer += sampledValue
      // No redirect is needed when the last position itself was picked.
      if (nextPos < currentLength - 1) {
        val lastValue = getValue(arr, overWrites, currentLength - 1)
        overWrites += ((nextPos, lastValue))
      currentLength -= 1
      i += 1
Example 29
Source File: SplitIntoKProjection.scala    From random-projections-at-berlinbuzzwords   with Apache License 2.0
package com.stefansavev.randomprojections.implementation

class SplitIntoKProjection {


import java.util.Random

import com.stefansavev.randomprojections.datarepr.dense.DataFrameView
import com.stefansavev.randomprojections.datarepr.sparse.SparseVector
import com.stefansavev.randomprojections.utils.RandomUtils

import scala.collection.mutable.ArrayBuffer

case class SplitIntoKProjectionStrategy(rnd: Random, numCols: Int, k: Int) extends ProjectionStrategy {

  def chooseKPoints(k: Int, pointIds: Array[Int], view: DataFrameView): Array[Int] = {
    RandomUtils.shuffleInts(rnd, pointIds).take(k)

  // Picks k distinct column ids out of 0 until numCols (fewer when numCols < k) by shuffling all
  // column ids with the strategy's rnd and taking the first k; the result is sorted ascending.
  def chooseKDimensions(k: Int): Array[Int] = {
    val columns = Array.range(0, numCols)
    RandomUtils.shuffleInts(rnd, columns).take(k).sorted

  def generateRandomVector(columnIds: Array[Int]): SparseVector = {
    val signs = => (if (rnd.nextDouble() >= 0.5) 1.0 else -1.0))

    var sum = 0.0
    var i = 0
    while (i < signs.length) {
      val v = signs(i)
      sum += v * v
      i += 1
    sum = Math.sqrt(sum)

    i = 0
    while (i < signs.length) {
      signs(i) /= sum
      i += 1

    val sparseVec = new SparseVector(numCols, columnIds, signs)

  def generateKRandomVectors(num: Int, columnIds: Array[Int]): Array[SparseVector] = {
    val buff = new ArrayBuffer[SparseVector]()
    for (i <- 0 until num) {
      buff += generateRandomVector(columnIds)

  // Builds a HadamardProjectionVector over a random sign vector restricted to `useK` randomly
  // chosen columns, where useK = HadamardUtils.largestPowerOf2(k).
  // depth, view and projectionVector are not used in the visible lines.
  def nextRandomProjection(depth: Int, view: DataFrameView, projectionVector: AbstractProjectionVector): AbstractProjectionVector = {
    val useK = HadamardUtils.largestPowerOf2(k)
    val chosenDim = chooseKDimensions(useK)
    val randomVector = generateRandomVector(chosenDim)
    val proj = new HadamardProjectionVector(randomVector)

case class SplitIntoKProjectionSettings(k: Int)

class SplitIntoKProjectionBuilder(builderSettings: SplitIntoKProjectionSettings) extends ProjectionStrategyBuilder {
  type T = SplitIntoKProjectionStrategy
  val splitStrategy: DatasetSplitStrategy = new HadamardProjectionSplitStrategy()

  def build(settings: IndexSettings, rnd: Random, dataFrameView: DataFrameView): T = SplitIntoKProjectionStrategy(rnd, dataFrameView.numCols, builderSettings.k)

  def datasetSplitStrategy: DatasetSplitStrategy = splitStrategy
Example 30
Source File: ValuesStoreTest.scala    From random-projections-at-berlinbuzzwords   with Apache License 2.0
package com.stefansavev

import java.util.Random

import com.typesafe.scalalogging.StrictLogging
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{FlatSpec, Matchers}

class TestSingleByteEncodingSpec extends FlatSpec with Matchers {
  // Round-trips 100 pseudo-random floats (fixed seed 481861) through FloatToSingleByteEncoder
  // and requires the absolute decode error to stay below 0.01.
  "Error after encoding double to float" should "be small" in {
    val minV = -1.0f
    val maxV = 2.0f
    val rnd = new Random(481861)
    for (i <- 0 until 100) {
      //we encode a float (which is 4 bytes) with a single byte
      //therefore the loss of precision
      // nextFloat() is in [0, 1), so value lies in [-1.0, 2.0) and matches minV / maxV above.
      val value = rnd.nextFloat() * 3.0f - 1.0f
      val enc = FloatToSingleByteEncoder.encodeValue(minV, maxV, value)
      val dec = FloatToSingleByteEncoder.decodeValue(minV, maxV, enc)
      val error = Math.abs(value - dec)
      error should be < (0.01)

class TestValueStores extends FlatSpec with Matchers {

  case class BuilderTypeWithErrorPredicate(builderType: StoreBuilderType, pred: Double => Boolean)

  "ValueStore" should "return store the data with small error" in {

    val tests = List(
      BuilderTypeWithErrorPredicate(StoreBuilderAsDoubleType, error => (error <= 0.0)),
      BuilderTypeWithErrorPredicate(StoreBuilderAsBytesType, error => (error <= 0.01)),
      BuilderTypeWithErrorPredicate(StoreBuilderAsSingleByteType, error => (error <= 0.01))

    for (test <- tests) {

    def testBuilder(builderWithPred: BuilderTypeWithErrorPredicate): Unit = {
      val dataGenSettings = RandomBitStrings.RandomBitSettings(
        numGroups = 1000,
        numRowsPerGroup = 2,
        numCols = 256,
        per1sInPrototype = 0.5,
        perNoise = 0.2)

      val debug = false
      val randomBitStringsDataset = RandomBitStrings.genRandomData(58585, dataGenSettings, debug, true)
      val builder = builderWithPred.builderType.getBuilder(randomBitStringsDataset.numCols)

      def addValues(): Unit = {
        var i = 0
        while (i < randomBitStringsDataset.numRows) {
          val values = randomBitStringsDataset.getPointAsDenseVector(i)
          i += 1


      val valueStore =

      def verifyStoredValues(expected: Array[Double], stored: Array[Double]): Unit = {
        for (i <- 0 until expected.length) {
          val error = Math.abs(expected(i) - stored(i))
          val passed = builderWithPred.pred(error)
          passed should be (true)

      def testValues(): Unit = {
        var i = 0
        while (i < randomBitStringsDataset.numRows) {
          val values = randomBitStringsDataset.getPointAsDenseVector(i)
          val output = Array.ofDim[Double](randomBitStringsDataset.numCols)
          valueStore.fillRow(i, output, true)
          verifyStoredValues(values, output)
          i += 1

object Test extends StrictLogging {
  def main(args: Array[String]) {"hello")
Example 31
Source File: SpeedSimulator.scala    From random-projections-at-berlinbuzzwords   with Apache License 2.0
package com.stefansavev.tuning

import java.util.Random

case class SpeedSimulatorParams(numberOfQueries: Int,
  numberOfTrees: Int,
  requiredPointsPerTree: Int,
  deviationOfRequiredPointsPerTree: Int = 0){

object SpeedSimulator {
  val rnd = new Random(11144)

  // Simulates the memory-access cost of one query: for every tree and every required point it
  // increments one pseudo-randomly chosen slot of `buffer`.
  // `p` is not used in the visible lines. buffer must be non-empty (rnd.nextInt(len) rejects
  // len <= 0) and params.requiredPointsPerTree must be non-zero (integer division below).
  // NOTE(review): the return expression is missing from this excerpt, and with braces stripped
  // it is unclear which loop `totalOperations += 1` belongs to.
  def testPoint(p: Int, params: SpeedSimulatorParams, buffer: Array[Int]): Int = {
    val rnd = this.rnd
    val len = buffer.length
    var i = 0
    val numTrees = params.numberOfTrees
    val numPointsPerTree = params.requiredPointsPerTree
    val offset = rnd.nextInt(len)
    val stride = offset/params.requiredPointsPerTree  + rnd.nextInt(50)
    var totalOperations = 0
    while(i < numTrees){
      var j = 0
      while(j < numPointsPerTree){
        val k = (offset + stride*j + i) % len //some random formula
        buffer(k) += 1
        j += 1
      i += 1
      totalOperations += 1

  // Runs testPoint once per query against a shared counter buffer and prints the total
  // wall-clock time in seconds and the average time per query in milliseconds.
  // The buffer has one slot per query (bufflen = numberOfQueries).
  def simulate(params: SpeedSimulatorParams): Unit = {
    val numQueries = params.numberOfQueries
    val bufflen = params.numberOfQueries
    val buffer = Array.ofDim[Int](bufflen)
    val start = System.currentTimeMillis()
    var i = 0
    while(i < numQueries){
      // The buffer is zeroed before each query (presumably inside the outer loop — the braces
      // are stripped in this excerpt), so the reset is included in the measured time.
      var j = 0
      while(j < bufflen){
        buffer(j) = 0
        j += 1
      // Body of this every-5000-queries branch is not visible in this excerpt.
      if (i % 5000 == 0){
      testPoint(i, params, buffer)
      i += 1
    val result = System.currentTimeMillis() - start
    println("Simulation in secs: " + result/1000.0 + "    ; per point in ms " + result.toDouble/numQueries.toDouble)

  def main (args: Array[String]): Unit = {
    val params = SpeedSimulatorParams(numberOfQueries = 42000, numberOfTrees = 10, requiredPointsPerTree = 1200)
Example 32
Source File: RandomBitStrings.scala    From random-projections-at-berlinbuzzwords   with Apache License 2.0
package com.stefansavev

import java.util.Random

import com.stefansavev.randomprojections.datarepr.dense.{ColumnHeaderBuilder, DataFrameView, PointIndexes, RowStoredMatrixViewBuilderFactory}

object RandomBitStrings {

  case class RandomBitSettings(numGroups: Int, numRowsPerGroup: Int, numCols: Int, per1sInPrototype: Double, perNoise: Double)

  // Builds a prototype vector of length dim. Each entry has magnitude |rnd.nextGaussian()| and
  // is positive with probability perValue, otherwise negative. Two draws are made per entry:
  // one Gaussian and one uniform.
  def generatePrototype(rnd: Random, dim: Int, perValue: Double): Array[Double] = {
    val arr = Array.ofDim[Double](dim)
    for (i <- 0 until dim) {
      val gaussian = Math.abs(rnd.nextGaussian())
      // Start negative; flip to positive when the uniform draw falls below perValue.
      arr(i) = -1.0 * gaussian
      if (rnd.nextDouble() < perValue) {
        arr(i) = 1.0 * gaussian

  // Copies `input` into a new array, negating each entry independently with probability
  // perNoise. `input` itself is not modified.
  def corrupt(rnd: Random, input: Array[Double], perNoise: Double): Array[Double] = {
    val arr = Array.ofDim[Double](input.length)
    for (i <- 0 until input.length) {
      if (rnd.nextDouble() < perNoise) {
        arr(i) = -input(i)
      else {
        arr(i) = input(i)

  //todo: put in utils
  // Writes `input` divided by its Euclidean norm into a new array.
  // An all-zero input has norm 0.0, so every entry becomes NaN (0.0 / 0.0); no guard is visible.
  def normalize(input: Array[Double]): Array[Double] = {
    val arr = Array.ofDim[Double](input.length)
    var norm = 0.0
    for (i <- 0 until input.length) {
      norm += input(i) * input(i)
    norm = Math.sqrt(norm)
    for (i <- 0 until input.length) {
      arr(i) = input(i) / norm

  // Generates a labelled synthetic dataset: numGroups prototypes, each contributing
  // numRowsPerGroup noisy, L2-normalised copies. Rows are named by their index and labelled with
  // their group id. All randomness comes from one Random seeded with `seed`.
  // `debug` and `dense` are not used in the visible lines.
  def genRandomData(seed: Int, settings: RandomBitSettings, debug: Boolean, dense: Boolean): DataFrameView = {

    val (numGroups, numRowsPerGroup, numCols: Int, per1sInPrototype: Double, perNoise: Double) =
      (settings.numGroups, settings.numRowsPerGroup, settings.numCols, settings.per1sInPrototype, settings.perNoise)

    val numRows = numGroups * numRowsPerGroup

    val labels = Array.ofDim[Int](numRows)
    val rnd = new Random(seed)
    // Running row index across all groups.
    var i = 0

    val columnNames = Array.range(0, numCols).map((i: Int) => ("feature" + i, i))
    val rowNames = Array.range(0, numRows).map(_.toString)
    // NOTE(review): the call producing `header` is garbled in this excerpt.
    val header ="label", columnNames, true)

    val builder = RowStoredMatrixViewBuilderFactory.createDense(header)

    for (g <- 0 until numGroups) {
      val prototype = generatePrototype(rnd, numCols, per1sInPrototype)
      for (r <- 0 until numRowsPerGroup) {
        val noisyProt = corrupt(rnd, prototype, perNoise)
        labels(i) = g
        // Rows must be appended in order: the builder's next row id has to equal i.
        if (i != builder.currentRowId) {
          throw new IllegalStateException("Cannot skip rows")

        builder.addRow(i.toString, g, Array.range(0, numCols), normalize(noisyProt))
        i += 1
    val indexes = PointIndexes(Array.range(0, numRows))
    new DataFrameView(indexes,

Example 33
Source File: RandomBitStrings.scala    From random-projections-at-berlinbuzzwords   with Apache License 2.0
package com.stefansavev.fuzzysearchtest

import java.util.Random

import com.stefansavev.randomprojections.datarepr.dense.{ColumnHeaderBuilder, DataFrameView, PointIndexes, RowStoredMatrixViewBuilderFactory}

object RandomBitStrings {

  case class RandomBitSettings(numGroups: Int, numRowsPerGroup: Int, numCols: Int, per1sInPrototype: Double, perNoise: Double)

  // Builds a prototype vector of length dim whose entries are 1.0 with probability perValue and
  // -1.0 otherwise (one uniform draw per entry).
  def generatePrototype(rnd: Random, dim: Int, perValue: Double): Array[Double] = {
    val arr = Array.ofDim[Double](dim)
    for (i <- 0 until dim) {
      arr(i) = -1.0
      if (rnd.nextDouble() < perValue) {
        arr(i) = 1.0

  // Returns a copy of `input` in which each entry is sign-flipped independently with
  // probability perNoise; the input array is left untouched.
  def corrupt(rnd: Random, input: Array[Double], perNoise: Double): Array[Double] = {
    val arr = Array.ofDim[Double](input.length)
    for (i <- 0 until input.length) {
      if (rnd.nextDouble() < perNoise) {
        arr(i) = -input(i)
      else {
        arr(i) = input(i)

  def genRandomData(seed: Int, settings: RandomBitSettings, debug: Boolean, dense: Boolean): DataFrameView = {

    val (numGroups, numRowsPerGroup, numCols: Int, per1sInPrototype: Double, perNoise: Double) =
      (settings.numGroups, settings.numRowsPerGroup, settings.numCols, settings.per1sInPrototype, settings.perNoise)

    val numRows = numGroups * numRowsPerGroup

    val labels = Array.ofDim[Int](numRows)
    val rnd = new Random(seed)
    var i = 0

    val columnNames = Array.range(0, numCols).map((i: Int) => ("feature" + i, i))
    val header ="label", columnNames, false)

    val builder = RowStoredMatrixViewBuilderFactory.createDense(header)

    for (g <- 0 until numGroups) {
      val prototype = generatePrototype(rnd, numCols, per1sInPrototype)
      for (r <- 0 until numRowsPerGroup) {
        val noisyProt = corrupt(rnd, prototype, perNoise)
        labels(i) = g
        if (i != builder.currentRowId) {
          throw new IllegalStateException("Cannot skip rows")
        builder.addRow(g, Array.range(0, numCols), noisyProt)
        i += 1
    val indexes = PointIndexes(Array.range(0, numRows))
    new DataFrameView(indexes,

Example 34
Source File: TestOnRandomData.scala    From random-projections-at-berlinbuzzwords   with Apache License 2.0
package com.stefansavev.fuzzysearchtest

import java.util.Random

import com.stefansavev.randomprojections.actors.Application
import com.stefansavev.randomprojections.implementation._
import com.stefansavev.randomprojections.utils.Utils
import com.stefansavev.similaritysearch.SimilaritySearchEvaluationUtils
import com.stefansavev.similaritysearch.VectorType.StorageSize
import com.stefansavev.similaritysearch.implementation.FuzzySearchIndexBuilderWrapper
import com.typesafe.scalalogging.StrictLogging

/**
 * Driver that generates a random bit-string data set, builds a fuzzy-search index from it
 * on disk and compares the index against brute-force search.
 *
 * NOTE(review): this listing is truncated -- the `IndexSettings(` call is never closed, the
 * index-build block has lost its tail, and closing braces are missing.
 */
object TestOnRandomData extends StrictLogging {
  // Makes the logger available as an implicit value.
  implicit val _ = logger

  def main(args: Array[String]): Unit = {
    // 100000 groups x 2 rows per group = 200000 rows of 256 columns
    // (genRandomData creates numGroups * numRowsPerGroup rows).
    val dataGenSettings = RandomBitStrings.RandomBitSettings(
      numGroups = 100000,
      numRowsPerGroup = 2,
      numCols = 256,
      per1sInPrototype = 0.5,
      perNoise = 0.1)

    val debug = false
    // Fixed seed (58585) so the generated data is the same on every run.
    val randomBitStringsDataset = RandomBitStrings.genRandomData(58585, dataGenSettings, debug, true)

    // NOTE(review): `randomTreeSettings` is not referenced in the visible code.
    val randomTreeSettings = IndexSettings(
      maxPntsPerBucket = 50,
      numTrees = 50,
      maxDepth = None,
      projectionStrategyBuilder = ProjectionStrategies.splitIntoKRandomProjection(),
      reportingDistanceEvaluator = ReportingDistanceEvaluators.cosineOnOriginalData(),
      randomSeed = 39393

    println("Number of Rows: " + randomBitStringsDataset.numRows)
    // Hard-coded, Windows-style output location.
    val diskLocation = "D:/tmp/randomfile"
    val trees = Utils.timed("Build Index", {
      // NOTE(review): the third argument (50) and the 0 passed to addItem are opaque here;
      // presumably tree count and a label -- verify against FuzzySearchIndexBuilderWrapper.
      val wrapper = new FuzzySearchIndexBuilderWrapper(diskLocation, randomBitStringsDataset.numCols, 50, StorageSize.Double)
      var i = 0
      // Adds every row, using the row index as the item name.
      while (i < randomBitStringsDataset.numRows) {
        wrapper.addItem(i.toString, 0, randomBitStringsDataset.getPointAsDenseVector(i))
        i += 1

    // Seeded Random keeps the evaluation repeatable; meaning of 1000 and 50 is defined by
    // compareWithBruteForce (not visible here).
    SimilaritySearchEvaluationUtils.compareWithBruteForce(diskLocation, new Random(481868), 1000, 50)


Example 35
Source File: RenderParticle.scala    From Electrodynamics   with GNU Lesser General Public License v3.0
package com.calclavia.edx.quantum.machine.accelerator

import java.util.Random

import cpw.mods.fml.relauncher.{Side, SideOnly}
import net.minecraft.client.renderer.entity.Render
import net.minecraft.client.renderer.{RenderHelper, Tessellator}
import net.minecraft.entity.Entity
import net.minecraft.util.ResourceLocation
import org.lwjgl.opengl.GL11

/**
 * Client-side renderer that draws an entity as a burst of randomly rotated triangles
 * through the Tessellator.
 *
 * NOTE(review): every brace has been stripped from this listing, so the bodies of the
 * `while`, `if` and `for` below are indicated only by indentation, and statements may be
 * missing (for example, no tessellator start/draw call is visible).
 */
@SideOnly(Side.CLIENT) class RenderParticle extends Render
  def doRender(entity: Entity, x: Double, y: Double, z: Double, var8: Float, var9: Float)
    val tessellator: Tessellator = Tessellator.instance
    // Animation phase derived from the entity's age; reduced in steps of 100 until <= 200.
    var par2: Float = (entity.ticksExisted)
    while (par2 > 200)
      par2 -= 100
    val var41: Float = (5 + par2) / 200.0F
    // var51 stays 0 until var41 exceeds 0.8, then grows linearly with var41.
    var var51: Float = 0.0F
    if (var41 > 0.8F)
      var51 = (var41 - 0.8F) / 0.2F
    // Fixed seed: the same sequence of random rotations and sizes is produced on every call.
    val rand: Random = new Random(432L)
    GL11.glTranslatef(x.asInstanceOf[Float], y.asInstanceOf[Float], z.asInstanceOf[Float])
    GL11.glScalef(0.15f, 0.15f, 0.15f)
    GL11.glBlendFunc(GL11.GL_SRC_ALPHA, GL11.GL_ONE)
    GL11.glTranslatef(0.0F, -1.0F, -2.0F)

    // The number of iterations grows with var41 (inclusive upper bound).
    for (i1 <- 0 to ((var41 + var41 * var41) / 2.0F * 60.0F).asInstanceOf[Int])
      GL11.glRotatef(rand.nextFloat * 360.0F, 1.0F, 0.0F, 0.0F)
      GL11.glRotatef(rand.nextFloat * 360.0F, 0.0F, 1.0F, 0.0F)
      GL11.glRotatef(rand.nextFloat * 360.0F, 0.0F, 0.0F, 1.0F)
      GL11.glRotatef(rand.nextFloat * 360.0F, 1.0F, 0.0F, 0.0F)
      GL11.glRotatef(rand.nextFloat * 360.0F, 0.0F, 1.0F, 0.0F)
      GL11.glRotatef(rand.nextFloat * 360.0F + var41 * 90.0F, 0.0F, 0.0F, 1.0F)
      // var81 is used as the y coordinate and var91 scales x/z of the outer vertices.
      val var81: Float = rand.nextFloat * 20.0F + 5.0F + var51 * 10.0F
      val var91: Float = rand.nextFloat * 2.0F + 1.0F + var51 * 2.0F
      // Vertex at the origin: colour 16777215 (0xFFFFFF) with alpha shrinking as var51 grows.
      tessellator.setColorRGBA_I(16777215, (255.0F * (1.0F - var51)).asInstanceOf[Int])
      tessellator.addVertex(0.0D, 0.0D, 0.0D)
      // Outer vertices: colour 0 with alpha 0.
      tessellator.setColorRGBA_I(0, 0)
      tessellator.addVertex(-0.866D * var91, var81, -0.5F * var91)
      tessellator.addVertex(0.866D * var91, var81, -0.5F * var91)
      tessellator.addVertex(0.0D, var81, 1.0F * var91)
      tessellator.addVertex(-0.866D * var91, var81, -0.5F * var91)
    GL11.glColor4f(1.0F, 1.0F, 1.0F, 1.0F)

  // Always returns null: this renderer supplies no texture.
  protected def getEntityTexture(entity: Entity): ResourceLocation =
    return null
Example 36
Source File: BlockRadioactive.scala    From Electrodynamics   with GNU Lesser General Public License v3.0
package com.calclavia.edx.quantum.blocks

import java.util.{List, Random}

import cpw.mods.fml.relauncher.{Side, SideOnly}
import net.minecraft.block.Block
import net.minecraft.block.material.Material
import net.minecraft.client.Minecraft
import net.minecraft.client.particle.EntitySmokeFX
import net.minecraft.client.renderer.texture.IIconRegister
import net.minecraft.entity.{Entity, EntityLiving, EntityLivingBase}
import net.minecraft.init.Blocks
import net.minecraft.util.{AxisAlignedBB, IIcon}
import resonantengine.lib.potion.PoisonRadiation
import resonantengine.lib.transform.vector.Vector3

import scala.collection.JavaConversions._

/**
 * Block that poisons living entities walking on it and emits smoke particles.
 *
 * NOTE(review): braces have been stripped from this listing and some statements appear to
 * be missing (nothing visibly adds the created particle to the world, `World` has no
 * visible import, and several of the flags below are not read in the visible code).
 */
class BlockRadioactive(material: Material) extends Block(material)
  var canSpread: Boolean = true
  var radius: Float = 5
  var amplifier: Int = 2
  // When true, onEntityWalking poisons EntityLiving instances.
  var canWalkPoison: Boolean = true
  var isRandomlyRadioactive: Boolean = true
  // When true, randomDisplayTick may spawn smoke particles.
  var spawnParticle: Boolean = true
  private var iconTop: IIcon = null
  private var iconBottom: IIcon = null


  // Side 1 uses the top icon, side 0 the bottom icon, every other side the default blockIcon.
  override def getIcon(side: Int, metadata: Int): IIcon =
    return if (side == 1) this.iconTop else (if (side == 0) this.iconBottom else this.blockIcon)

  // Registers "<name>_top" and "<name>_bottom", where <name> is the unlocalized name with
  // "tile." removed.
  @SideOnly(Side.CLIENT) override def registerBlockIcons(iconRegister: IIconRegister)
    this.iconTop = iconRegister.registerIcon(this.getUnlocalizedName.replace("tile.", "") + "_top")
    this.iconBottom = iconRegister.registerIcon(this.getUnlocalizedName.replace("tile.", "") + "_bottom")

  // Poisons the walking entity at the block position, but only EntityLiving instances and
  // only while canWalkPoison is set.
  override def onEntityWalking(par1World: World, x: Int, y: Int, z: Int, par5Entity: Entity)
    if (par5Entity.isInstanceOf[EntityLiving] && this.canWalkPoison)
      PoisonRadiation.INSTANCE.poisonEntity(new Vector3(x, y, z), par5Entity.asInstanceOf[EntityLiving])

  // The block never drops items.
  override def quantityDropped(par1Random: Random): Int =
    return 0

  // Creates three smoke particles near the block when particles are enabled and the
  // client's particleSetting is 0.
  @SideOnly(Side.CLIENT) override def randomDisplayTick(world: World, x: Int, y: Int, z: Int, par5Random: Random)
    if (this.spawnParticle)
      if (Minecraft.getMinecraft.gameSettings.particleSetting == 0)
        // Local Int that shadows the Float field `radius`.
        val radius: Int = 3
        for (i <- 0 to 2)
          val pos: Vector3 = new Vector3(x, y, z)
          // NOTE(review): `radius / 2` is integer division (3 / 2 == 1), so each offset lies
          // in [-1, 2) rather than being centred on the block -- confirm this is intended.
          // The result of `add` is discarded; presumably it mutates `pos` -- TODO confirm.
          pos.add(Math.random * radius - radius / 2, Math.random * radius - radius / 2, Math.random * radius - radius / 2)
          val fx: EntitySmokeFX = new EntitySmokeFX(world, pos.x, pos.y, pos.z, (Math.random - 0.5) / 2, (Math.random - 0.5) / 2, (Math.random - 0.5) / 2)
          fx.setRBGColorF(0.2f, 0.8f, 0)


Example 37
Source File: BlockToxicWaste.scala    From Electrodynamics   with GNU Lesser General Public License v3.0
package com.calclavia.edx.quantum.blocks

import java.util.Random

import com.calclavia.edx.quantum.QuantumContent
import QuantumContent
import net.minecraft.block.material.Material
import net.minecraft.entity.{Entity, EntityLivingBase}
import net.minecraft.util.DamageSource
import net.minecraftforge.fluids.BlockFluidClassic
import resonantengine.lib.potion.PoisonRadiation
import resonantengine.lib.transform.vector.Vector3

/**
 * Fluid block for toxic waste: occasionally emits a particle and a sound, and damages and
 * poisons living entities that touch it.
 *
 * NOTE(review): braces have been stripped from this listing (bodies are indicated only by
 * indentation) and `World` has no visible import.
 */
class BlockToxicWaste extends BlockFluidClassic(QuantumContent.getFluidToxicWaste, Material.water)

  override def randomDisplayTick(par1World: World, x: Int, y: Int, z: Int, par5Random: Random)
    super.randomDisplayTick(par1World, x, y, z, par5Random)
    // 1-in-100 chance per tick: spawn a "suspended" particle at a random x/z offset within
    // the block, at height y + maxY.
    if (par5Random.nextInt(100) == 0)
      val d5: Double = x + par5Random.nextFloat
      val d7: Double = y + this.maxY
      val d6: Double = z + par5Random.nextFloat
      par1World.spawnParticle("suspended", d5, d7, d6, 0.0D, 0.0D, 0.0D)
    // 1-in-200 chance per tick: play "liquid.lava" with the 4th argument in [0.2, 0.4) and
    // the 5th in [0.9, 1.05) (presumably volume and pitch -- TODO confirm).
    if (par5Random.nextInt(200) == 0)
      par1World.playSound(x, y, z, "liquid.lava", 0.2F + par5Random.nextFloat * 0.2F, 0.9F + par5Random.nextFloat * 0.15F, false)

  // Only EntityLivingBase instances are affected: 3 points of wither damage plus radiation
  // poisoning (last argument 4, presumably the amplifier -- TODO confirm).
  override def onEntityCollidedWithBlock(par1World: World, x: Int, y: Int, z: Int, entity: Entity)
    if (entity.isInstanceOf[EntityLivingBase])
      entity.attackEntityFrom(DamageSource.wither, 3)
      PoisonRadiation.INSTANCE.poisonEntity(new Vector3(x, y, z), entity.asInstanceOf[EntityLivingBase], 4)
Example 38
Source File: DirectDataInjector.scala    From SparkOnKudu   with Apache License 2.0
package org.kududb.spark.demo.gamer.cdc

import java.text.SimpleDateFormat
import java.util.Random

import org.kududb.client.{PartialRow, Operation, KuduClient}
import org.kududb.spark.demo.gamer.aggregates.GamerDataGenerator

/**
 * Command-line tool that writes generated gamer records to a Kudu table, copying a
 * previously stored row (with `eff_to` set to the current date) before inserting the new
 * version with an empty `eff_to`.
 *
 * NOTE(review): this listing is truncated -- `val oldRow =` has lost its right-hand side,
 * the else branch is empty, no statement visibly applies the operations to `session`, and
 * closing braces are missing.
 * NOTE(review): this is declared as a `class`, so `main` is an instance method rather than
 * a JVM entry point -- confirm whether `object` was intended.
 */
class DirectDataInjector {
  // Pattern used for the eff_from / eff_to columns.
  val simpleDateFormat = new SimpleDateFormat("MM,dd,yyyy")
  // NOTE(review): not referenced in the visible code.
  val random = new Random
  def main(args:Array[String]): Unit = {

    // Prints the usage line when no arguments are given.
    // NOTE(review): no early exit is visible, so execution would continue to args(0).
    if (args.length == 0) {
      println("<kuduMaster> <tableName> <numberOfRecords>")

    val kuduMaster = args(0)
    val tableName = args(1)
    val numberOfRecords = args(2).toInt

    val kuduClient = new KuduClient.KuduClientBuilder(kuduMaster).build()
    val table = kuduClient.openTable(tableName)
    val session = kuduClient.newSession()

    // NOTE(review): `0 to n` is inclusive, so this runs numberOfRecords + 1 times.
    for (i <- 0 to numberOfRecords) {
      val record = GamerDataGenerator.makeNewGamerRecord(100000)

      val pr = new PartialRow(table.getSchema)
      // NOTE(review): this stores the literal text "record.gamerId", not the record's id,
      // and `pr` is not passed to the scanner below (lowerBound(null)) -- confirm intent.
      pr.addString(0, "record.gamerId")
      pr.addString(1, "")
      // Fetches at most one existing row.
      val scannerRows = kuduClient.newScannerBuilder(table).lowerBound(null).limit(1).build().nextRows()
      val op:Operation = if (scannerRows.hasNext) {
        val oldRow =

        val oldRecordUpdateOp = table.newInsert()

        // Copy of the old row with eff_to set to today's date.
        val row = oldRecordUpdateOp.getRow
        row.addString("gamer_id", oldRow.getString("gamer_id"))
        row.addString("eff_to", simpleDateFormat.format(System.currentTimeMillis()))
        row.addString("eff_from", oldRow.getString("eff_from"))
        row.addLong("last_time_played", oldRow.getLong("last_time_played"))
        row.addInt("games_played", oldRow.getInt("games_played"))
        row.addInt("games_won", oldRow.getInt("games_won"))
        row.addInt("oks", oldRow.getInt("oks"))
        row.addInt("deaths", oldRow.getInt("deaths"))
        row.addInt("damage_given", oldRow.getInt("damage_given"))
        row.addInt("damage_taken", oldRow.getInt("damage_taken"))
        row.addInt("max_oks_in_one_game", oldRow.getInt("max_oks_in_one_game"))
        row.addInt("max_deaths_in_one_game", oldRow.getInt("max_deaths_in_one_game"))

      } else {

      // New version of the record: empty eff_to, eff_from set to today's date.
      val row = op.getRow
      row.addString("gamer_id", record.gamerId)
      row.addString("eff_to", "")
      row.addString("eff_from", simpleDateFormat.format(System.currentTimeMillis()))
      row.addLong("last_time_played", record.lastTimePlayed)
      row.addInt("games_played", record.gamesPlayed)
      row.addInt("games_won", record.gamesWon)
      row.addInt("oks", record.oks)
      row.addInt("deaths", record.deaths)
      row.addInt("damage_given", record.damageGiven)
      row.addInt("damage_taken", record.damageTaken)
      row.addInt("max_oks_in_one_game", record.maxOksInOneGame)
      row.addInt("max_deaths_in_one_game", record.maxDeathsInOneGame)



Example 39
Source File: DirectDataMultiThreadedInjector.scala    From SparkOnKudu   with Apache License 2.0
package org.kududb.spark.demo.gamer.cdc

import java.text.SimpleDateFormat
import java.util.Random
import java.util.concurrent.atomic.AtomicInteger
import java.util.concurrent.{TimeUnit, Executors}

import org.kududb.client.{Operation, PartialRow, KuduClient}
import org.kududb.spark.demo.gamer.aggregates.GamerDataGenerator

/**
 * Command-line tool that submits one `ApplyNewRecordRunnable` per generated gamer record
 * to a fixed-size thread pool and then waits for the pool to terminate.
 *
 * NOTE(review): this listing is truncated -- closing braces are missing and no call that
 * shuts the executor down is visible before `awaitTermination`.
 */
object DirectDataMultiThreadedInjector {
  // NOTE(review): java.text.SimpleDateFormat is documented as not thread-safe; confirm the
  // worker runnables do not share this instance without synchronization.
  val simpleDateFormat = new SimpleDateFormat("MM,dd,yyyy")
  val random = new Random
  def main(args:Array[String]): Unit = {

    // NOTE(review): the usage line lists four arguments, but args(4) and args(5) are also
    // read below; no early exit is visible after printing it.
    if (args.length == 0) {
      println("<kuduMaster> <tableName> <numberOfRecords> <numberOfThreads>")

    val kuduMaster = args(0)
    val tableName = args(1)
    val numberOfRecords = args(2).toInt
    // args(3) is the thread-pool size.
    val executor = Executors.newFixedThreadPool(args(3).toInt)
    val numberOfGamers = args(4).toInt
    // NOTE(review): `sleepTime` is not used in the visible code.
    val sleepTime = args(5).toInt

    val kuduClient = new KuduClient.KuduClientBuilder(kuduMaster).build()
    // Shared counter handed to every runnable; printed while waiting below.
    val leftToRun = new AtomicInteger()

    // NOTE(review): `0 to n` is inclusive, so numberOfRecords + 1 tasks are submitted.
    for (i <- 0 to numberOfRecords) {
      executor.execute(new ApplyNewRecordRunnable(GamerDataGenerator.makeNewGamerRecord(numberOfGamers),
      kuduClient, tableName, leftToRun))
      println("Summited:" + i)


    val startTime = System.currentTimeMillis()
    // Re-checks every 10000 seconds, printing elapsed milliseconds and the counter.
    while (!executor.awaitTermination(10000, TimeUnit.SECONDS)) {
      val newTime = System.currentTimeMillis()
      println("> Still Waiting: {Time:" + (newTime - startTime) + ", LeftToRun:" + leftToRun + "}" )


Example 40
Source File: DirectDataInjector.scala    From SparkOnKudu   with Apache License 2.0
package org.kududb.spark.demo.gamer.aggregates

import java.util.Random

import org.kududb.client.KuduClient

/**
 * Command-line tool that inserts generated gamer records into a Kudu table, one insert
 * operation per record.
 *
 * NOTE(review): this listing is truncated -- no statement visibly applies `op` to
 * `session`, nothing closes the session or client, and closing braces are missing.
 */
object DirectDataInjector {

  // NOTE(review): not referenced in the visible code.
  val random = new Random
  def main(args:Array[String]): Unit = {

    // Prints the usage line when no arguments are given.
    // NOTE(review): no early exit is visible, so execution would continue to args(0).
    if (args.length == 0) {
      println("<kuduMaster> <tableName> <numberOfRecords>")

    val kuduMaster = args(0)
    val tableName = args(1)
    val numberOfRecords = args(2).toInt

    val kuduClient = new KuduClient.KuduClientBuilder(kuduMaster).build()
    val table = kuduClient.openTable(tableName)
    val session = kuduClient.newSession()


    // NOTE(review): `0 to n` is inclusive, so this runs numberOfRecords + 1 times.
    for (i <- 0 to numberOfRecords) {
      // 100000 is passed as the number of gamers to draw ids from.
      val record = GamerDataGenerator.makeNewGamerRecord(100000)
      val op = table.newInsert()

      // Copies every field of the generated record into the insert's row.
      val row = op.getRow
      row.addString("gamer_id", record.gamerId)
      row.addLong("last_time_played", record.lastTimePlayed)
      row.addInt("games_played", record.gamesPlayed)
      row.addInt("games_won", record.gamesWon)
      row.addInt("oks", record.oks)
      row.addInt("deaths", record.deaths)
      row.addInt("damage_given", record.damageGiven)
      row.addInt("damage_taken", record.damageTaken)
      row.addInt("max_oks_in_one_game", record.maxOksInOneGame)
      row.addInt("max_deaths_in_one_game", record.maxDeathsInOneGame)



Example 41
Source File: GamerDataGenerator.scala    From SparkOnKudu   with Apache License 2.0
package org.kududb.spark.demo.gamer.aggregates

import java.util.{Date, Random}

import org.kududb.spark.demo.gamer.GamerEvent

/**
 * Produces random `GamerEvent` records for three tiers of player, chosen by a random
 * percentile.
 *
 * Holds mutable state (`date`) that advances on every call and is not synchronized.
 *
 * NOTE(review): this listing is truncated -- each `new GamerEvent(` call has lost most of
 * its arguments and closing braces are missing.
 */
object GamerDataGenerator {

  val random = new Random()
  // Percentile thresholds: selection < 40 is the first tier, < 80 the second, else the third.
  val averagePlayerPercentage = 40
  val advancedPlayerPercentage = 80
  // NOTE(review): not referenced in the visible code.
  val superStarPlayerPercentage = 100
  // Simulated clock in epoch milliseconds, starting at object initialization time.
  var date = System.currentTimeMillis()

  // NOTE(review): java.util.Random.nextInt(bound) throws IllegalArgumentException when the
  // bound is not positive, so numOfGamers below 100 fails in every branch.
  def makeNewGamerRecord(numOfGamers:Int): GamerEvent = {
    println("date" + new Date(date))
    // Advance the simulated clock by 6 hours (60000 ms * 60 * 6).
    date += 60000 * 60 * 6
    val playerSelection = random.nextInt(100)
    if (playerSelection < averagePlayerPercentage) {

      // Id = random multiple of 100 plus the percentile, so the last two digits of an id
      // always fall in the same tier.
      val gamerId = random.nextInt(numOfGamers/100) * 100 + playerSelection

      // Second argument is 1 when nextInt(10) is 8 or 9 (2 in 10), else 0.
      new GamerEvent(gamerId.toString,
        if (random.nextInt(10) > 7) 1 else 0,
    } else if (playerSelection < advancedPlayerPercentage) {
      val gamerId = random.nextInt(numOfGamers/100) * 100 + playerSelection

      // Second argument is 1 when nextInt(10) is 6..9 (4 in 10), else 0.
      new GamerEvent(gamerId.toString,
        if (random.nextInt(10) > 5) 1 else 0,
    } else {
      val gamerId = random.nextInt(numOfGamers/100) * 100 + playerSelection

      // Second argument is 1 when nextInt(10) is 4..9 (6 in 10), else 0.
      new GamerEvent(gamerId.toString,
        if (random.nextInt(10) > 3) 1 else 0,
Example 42
Example 42
package org.kududb.spark.demo.basic

import java.util.Random

import org.kududb.client.{PartialRow, KuduClient}

/**
 * Command-line tool that builds an insert for a single row keyed by `rowKey`, then scans
 * the key range and finally the whole table, printing timings for each step.
 *
 * NOTE(review): this listing is truncated -- both `val row =` assignments inside the scan
 * loops have lost their right-hand side, no statement visibly applies the insert to
 * `session`, and closing braces are missing.
 */
object AddSingleRecord {
  def main(args:Array[String]): Unit = {
    // Prints the usage line when no arguments are given.
    // NOTE(review): no early exit is visible, so execution would continue to args(0).
    if (args.length == 0) {
      println("<kuduMaster> <tableName> <rowKey>")

    val kuduMaster = args(0)
    val tableName = args(1)
    val rowKey = args(2)

    val kuduClient = new KuduClient.KuduClientBuilder(kuduMaster).build()
    val table = kuduClient.openTable(tableName)
    val session = kuduClient.newSession()

    // Scan bounds: from rowKey (lower) up to rowKey + "_" (exclusive upper).
    val lowerBound = new PartialRow(table.getSchema)
    lowerBound.addString(0, rowKey)
    val upperBound = new PartialRow(table.getSchema)
    upperBound.addString(0, rowKey + "_")

    var startTime = System.currentTimeMillis()
    val random = new Random()

    startTime = System.currentTimeMillis()
    val update = table.newInsert()
    val row = update.getRow
    row.addString(0, rowKey)
    val columns = table.getSchema.getColumns
    // Every column after the first gets a random Int in [0, 100000).
    for (c <- 1 until columns.size()) {
      println(columns.get(c).getName + " " + columns.get(c).getType)
      row.addInt(columns.get(c).getName, random.nextInt(100000))
    println("new key: " + rowKey)
    println(" new key time spent: " + (System.currentTimeMillis() - startTime))

    startTime = System.currentTimeMillis()
    // Range scan limited to the bounds built above.
    val scanner2 = kuduClient.newScannerBuilder(table).lowerBound(lowerBound).exclusiveUpperBound(upperBound).build()

    while (scanner2.hasMoreRows) {
      val rows = scanner2.nextRows()
      while (rows.hasNext) {
        val row =
        println("NewValue: " + rowKey + " " + row.rowToString())
    println(" scan time spent: " + (System.currentTimeMillis() - startTime))

    // Unbounded scan printing every row of the table.
    val scannerX = kuduClient.newScannerBuilder(table).build()
    while (scannerX.hasMoreRows) {
      val rows = scannerX.nextRows()
      while (rows.hasNext) {
        val row =
        println("Full Scan: " + row.rowToString())
Example 43
Example 43
package org.kududb.spark.demo.basic

import java.util.Random

import scala.collection.mutable

/**
 * Generates distinct row keys of the form `<name>_<n>` by pairing a randomly chosen first
 * name with that name's own running counter.
 *
 * Not thread-safe: the per-name counters are plain `var`s updated without synchronization.
 */
object NameGenerator {

  val random = new Random()

  /** Pool of names to draw from; each entry carries its own usage counter. */
  val listOfNames = new mutable.MutableList[NameAndCounter]
  listOfNames ++= Seq(
    "Katlyn", "Laurena", "Jenise", "Vida", "Delphine",
    "Tiffanie", "Carroll", "Steve", "Nu", "Robbin",
    "Mahalia", "Norah", "Selina", "Cornelius", "Bennie",
    "Kemberly", "Johnie", "Jenee", "Napoleon", "Brenton",
    "Roxana", "Kalyn", "Jeana", "Tennie", "Tasia",
    "Ashely", "Hester", "Zita", "Evalyn", "Anderson",
    "Elaina", "Benny", "Heidi", "Mammie", "Alisa",
    "Billie", "Wan", "Dionna", "Julene", "Chasidy",
    "Vennie", "Cara", "Charissa", "Russell", "Daniela",
    "Kindra", "Eduardo", "Marci", "Gustavo",
    // The original literal ended in a stray tab, which would have leaked into generated keys.
    "Dianna"
  ).map(name => new NameAndCounter(name))

  /**
   * Picks a name uniformly at random, increments its counter and returns `name_counter`.
   *
   * Because each name's counter only ever increases, no value is returned twice.
   *
   * @return a key such as `"Steve_3"`
   */
  def getName(): String = {
    // nextInt's bound is exclusive, so the full length is required to reach the last entry.
    val entry = listOfNames(random.nextInt(listOfNames.length))
    entry.counter += 1
    s"${entry.name}_${entry.counter}"
  }
}

/**
 * A name together with the number of times it has been handed out.
 *
 * @param name    the base name
 * @param counter how many keys have been generated from this name so far
 */
class NameAndCounter(val name: String = "N/A", var counter: Int = 0)

Example 44
Source File: BasicExample.scala    From SparkOnKudu   with Apache License 2.0
package org.kududb.spark.demo.basic

import java.util
import java.util.Random

import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
import org.kududb.ColumnSchema.ColumnSchemaBuilder
import org.kududb.client.KuduClient
import org.kududb.{ColumnSchema, Schema, Type}

/**
 * End-to-end demo: creates a Kudu table named "foobar", builds ten insert rows for it,
 * lists the tables known to the client, then queries the table through Spark SQL.
 *
 * NOTE(review): this listing is truncated -- the body of the `tableExists` check, the
 * statements applying the inserts, the bodies of both `finally` blocks and the closing
 * braces are missing.
 */
object BasicExample {
  def main(args: Array[String]): Unit = {

    // Hard-coded Kudu master address.
    val kuduMaster = "quickstart.cloudera"

    println(" -- Starting ")
    val kuduClient = new KuduClient.KuduClientBuilder(kuduMaster).build()
    try {
      println(" -- ")

      // Schema: one string key column plus three non-key string columns.
      val columnList = new util.ArrayList[ColumnSchema]()
      columnList.add(new ColumnSchemaBuilder("KEY_ID", Type.STRING).key(true).build())
      columnList.add(new ColumnSchemaBuilder("COL_A", Type.STRING).key(false).build())
      columnList.add(new ColumnSchemaBuilder("COL_B", Type.STRING).key(false).build())
      columnList.add(new ColumnSchemaBuilder("COL_C", Type.STRING).key(false).build())
      val schema = new Schema(columnList)

      // NOTE(review): the body of this check is missing; presumably it drops the existing
      // table before it is re-created -- TODO confirm.
      if (kuduClient.tableExists("foobar")) {
      kuduClient.createTable("foobar", schema)

      val session = kuduClient.newSession()
      val table = kuduClient.openTable("foobar")

      try {
        val random = new Random()
        // Ten rows keyed "0".."9"; the last column gets "Cat" plus a Gaussian sample.
        for (i <- 0 until 10) {
          val insert = table.newInsert()
          val row = insert.getRow()
          row.addString(0, i.toString)
          row.addString(1, "value " + i)
          row.addString(2, "42:" + i)
          row.addString(3, "Cat" + random.nextGaussian())
      } finally {

      // Prints every table name reported by the client.
      val tableList = kuduClient.getTablesList.getTablesList
      for (i <- 0 until tableList.size()) {
        println("Table " + i + ":" + tableList.get(i))

      // Local two-thread Spark context with broadcast/shuffle compression switched off.
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      val sc = new SparkContext("local[2]", "SparkSQL on Kudu", sparkConfig)

      val sqlContext = new SQLContext(sc)

      // NOTE(review): `df` is not used in the visible code; presumably it was registered as
      // the temp table "foobar" queried below -- TODO confirm.
      val df = sqlContext.load("org.kududb.spark",
        Map("kudu.table" -> "foobar", "kudu.master" -> kuduMaster))


      sqlContext.sql("SELECT * FROM foobar").foreach(r => {
        println("Row: " + r)
    } finally {
    println("-- finished")
Example 45
Example 45
package org.kududb.spark.demo.basic

import java.util
import java.util.Random

import org.kududb.{Schema, Type, ColumnSchema}
import org.kududb.ColumnSchema.ColumnSchemaBuilder
import org.kududb.client.{AsyncKuduClient, KuduClient}

/**
 * Command-line tool that (re)creates a Kudu table with one string key column and
 * `numberOfColumns` INT32 columns, builds `numberOfRows` random rows for it, then scans and
 * prints the table. Uses the asynchronous client and blocks on each call with `join()`.
 *
 * NOTE(review): this listing is truncated -- the body of the `tableExists` check, the
 * statements applying the inserts, the right-hand side of `val row =` in the scan loop, the
 * body of `finally` and the closing braces are missing.
 */
object InitialDataPopulation {
  def main(args:Array[String]): Unit = {
    // Prints the usage line when no arguments are given.
    // NOTE(review): no early exit is visible, so execution would continue to args(0).
    if (args.length == 0) {
      println("<kuduMaster> <TableName> <numberOfColumns> <numberOfRows>")


    val kuduMaster = args(0)
    val tableName = args(1)
    val numOfColumns = args(2).toInt
    val numOfRows = args(3).toInt

    val kuduClient = new AsyncKuduClient.AsyncKuduClientBuilder(kuduMaster).build()
    try {
      // Delete the table if it already exists (the deleting statement is missing here).
      if (kuduClient.tableExists(tableName).join()) {

      // Create the schema: "key_id" string key followed by col_0 .. col_<n-1> INT32 columns.
      val columnList = new util.ArrayList[ColumnSchema]()
      columnList.add(new ColumnSchemaBuilder("key_id", Type.STRING).key(true).build())
      for (c <- 0 until numOfColumns) {
        columnList.add(new ColumnSchemaBuilder("col_" + c, Type.INT32).key(false).build())
      val schema = new Schema(columnList)

      // Create the table and wait for completion.
      kuduClient.createTable(tableName, schema).join()

      // Populate the table.
      val random = new Random
      val table = kuduClient.openTable(tableName).join()
      val asyncSession = kuduClient.newSession()

      for (r <- 0 until numOfRows) {
        val insert = table.newInsert()
        val row = insert.getRow()
        // The key comes from NameGenerator; every other column gets a random Int in [0, 100000).
        row.addString(0, NameGenerator.getName())
        val columns = table.getSchema.getColumns
        for (c <- 1 until columns.size()) {
          row.addInt(columns.get(c).getName, random.nextInt(100000))

        // Progress message every 1000 rows.
        if (r % 1000 == 0) {
          println("Inserted: " + r)

      // Full scan printing every row.
      val scannerX = kuduClient.newScannerBuilder(table).build()
      while (scannerX.hasMoreRows) {
        val rows = scannerX.nextRows().join()
        while (rows.hasNext) {
          val row =
          println(" - " + row.rowToString())


    } finally {
Example 46
Example 46
package org.apache.gearpump.streaming.examples.sol

import java.time.Instant
import java.util.Random

import org.apache.gearpump.Message
import org.apache.gearpump.cluster.UserConfig
import org.apache.gearpump.streaming.examples.sol.SOLStreamProducer._
import org.apache.gearpump.streaming.source.Watermark
import org.apache.gearpump.streaming.task.{Task, TaskContext}

/**
 * Source task that, for every message it receives, emits one randomly chosen string from
 * a pool of pre-generated messages and then sends itself a `Watermark`.
 *
 * NOTE(review): this listing is truncated -- both `Watermark(` calls have lost their
 * arguments, the body of the `foldLeft` that fills each message is missing, no visible
 * statement calls `prepareRandomMessage`, and closing braces are missing.
 */
class SOLStreamProducer(taskContext: TaskContext, conf: UserConfig)
  extends Task(taskContext, conf) {

  import taskContext.output

  // Message size read from the user config under the BYTES_PER_MESSAGE key.
  private val sizeInBytes = conf.getInt(SOLStreamProducer.BYTES_PER_MESSAGE)
  // Both are assigned in prepareRandomMessage and remain null until it runs.
  private var messages: Array[String] = null
  private var rand: Random = null
  // Number of messages emitted so far.
  private var messageCount: Long = 0

  override def onStart(startTime: Instant): Unit = {
    self ! Watermark(

  // Creates the Random and an array of 100 message slots, then fills each slot from a
  // StringBuilder with initial capacity sizeInBytes.
  private def prepareRandomMessage = {
    rand = new Random()
    val differentMessages = 100
    messages = new Array(differentMessages)

    0.until(differentMessages).map { index =>
      val sb = new StringBuilder(sizeInBytes)
      // Even though java encodes strings in UCS2, the serialized version sent by the tuples
      // is UTF8, so it should be a single byte
      0.until(sizeInBytes).foldLeft(sb) { (sb, j) =>
      messages(index) = sb.toString()

  // Emits a random pre-generated message stamped with the current wall-clock time, counts
  // it, and sends a Watermark to itself.
  override def onNext(msg: Message): Unit = {
    val message = messages(rand.nextInt(messages.length))
    output(Message(message, System.currentTimeMillis()))
    messageCount = messageCount + 1L
    self ! Watermark(


/** Configuration keys understood by the `SOLStreamProducer` task. */
object SOLStreamProducer {
  /** User-config key holding the size, in bytes, of each generated message. */
  val BYTES_PER_MESSAGE: String = "bytesPerMessage"
}
Example 47
Source File: ScalaClientTestUtils.scala    From incubator-livy   with Apache License 2.0
package org.apache.livy.scalaapi

import java.util.Random
import java.util.concurrent.{CountDownLatch, TimeUnit}

import scala.collection.mutable.ArrayBuffer
import scala.concurrent.{Await, Future}
import scala.concurrent.duration._

import org.scalatest.FunSuite

import org.apache.livy.LivyBaseUnitTestSuite

/**
 * Shared helpers for the Scala client tests: small sample jobs plus assertion helpers
 * that wait on latches and futures.
 *
 * NOTE(review): this listing is truncated -- `simpleSparkJob` has lost the call that
 * precedes `}, partitions).count()` and closing braces are missing.
 */
object ScalaClientTestUtils extends FunSuite with LivyBaseUnitTestSuite {

  // Wait limit, in seconds, used by both assertion helpers below.
  val Timeout = 40

  // Job that ignores its context and returns "hello".
  def helloJob(context: ScalaJobContext): String = "hello"

  // Job that always fails with CustomTestFailureException.
  def throwExceptionJob(context: ScalaJobContext): Unit = throw new CustomTestFailureException

  // Fills a buffer with `count` (5) random Ints and counts them using between 1 and 5
  // partitions (min(nextInt(10) + 1, count)).
  def simpleSparkJob(context: ScalaJobContext): Long = {
    val r = new Random
    val count = 5
    val partitions = Math.min(r.nextInt(10) + 1, count)
    val buffer = new ArrayBuffer[Int]()
    for (a <- 1 to count) {
      buffer += r.nextInt()
    }, partitions).count()

  // Fails unless the latch reaches zero within Timeout seconds.
  def assertAwait(lock: CountDownLatch): Unit = {
    assert(lock.await(Timeout, TimeUnit.SECONDS) == true)

  // Blocks for up to Timeout seconds on the future and asserts its value equals expectedValue.
  def assertTestPassed[T](future: Future[T], expectedValue: T): Unit = {
    val result = Await.result(future, Timeout second)
    assert(result === expectedValue)
Example 48
Source File: SimpleSkewedGroupByTest.scala    From sparkoscope   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.sql.SparkSession

/**
 * Spark example that generates key/value pairs whose keys are skewed towards reducer 0 and
 * prints the number of groups produced by `groupByKey`.
 *
 * Arguments (all optional): numMappers (2), numKVPairs (1000), valSize (1000),
 * numReducers (numMappers), ratio (5.0).
 *
 * NOTE(review): this listing is truncated -- the SparkSession builder chain, the end of
 * the flatMap closure, the statement forcing evaluation and the closing braces are missing.
 */
object SimpleSkewedGroupByTest {
  def main(args: Array[String]) {
    val spark = SparkSession

    val numMappers = if (args.length > 0) args(0).toInt else 2
    val numKVPairs = if (args.length > 1) args(1).toInt else 1000
    val valSize = if (args.length > 2) args(2).toInt else 1000
    val numReducers = if (args.length > 3) args(3).toInt else numMappers
    // NOTE(review): parsed with toInt, so a fractional ratio on the command line is
    // rejected, although the default is the Double 5.0.
    val ratio = if (args.length > 4) args(4).toInt else 5.0

    // One partition per mapper; each produces numKVPairs pairs with zero-filled values.
    val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random
      val result = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        // A multiple of numReducers, i.e. a key congruent to 0 modulo numReducers.
        val offset = ranGen.nextInt(1000) * numReducers
        if (ranGen.nextDouble < ratio / (numReducers + ratio - 1)) {
          // give ratio times higher chance of generating key 0 (for reducer 0)
          result(i) = (offset, byteArr)
        } else {
          // generate a key for one of the other reducers
          val key = 1 + ranGen.nextInt(numReducers-1) + offset
          result(i) = (key, byteArr)
    // Enforce that everything has been calculated and in cache

    println("RESULT: " + pairs1.groupByKey(numReducers).count)
    // Print how many keys each reducer got (for debugging)
    // println("RESULT: " + pairs1.groupByKey(numReducers)
    //                           .map{case (k,v) => (k, v.size)}
    //                           .collectAsMap)

// scalastyle:on println 
Example 49
Source File: SkewedGroupByTest.scala    From sparkoscope   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.sql.SparkSession

/**
 * Spark example in which each mapper produces a different number of random key/value
 * pairs, growing linearly with the mapper index.
 *
 * Arguments (all optional): numMappers (2), numKVPairs (1000), valSize (1000),
 * numReducers (numMappers).
 *
 * NOTE(review): this listing is truncated -- most of the SparkSession builder chain, the
 * end of the flatMap closure, every statement after it and the closing braces are missing.
 */
object SkewedGroupByTest {
  def main(args: Array[String]) {
    val spark = SparkSession
      .appName("GroupBy Test")

    val numMappers = if (args.length > 0) args(0).toInt else 2
    // Declared as a var because the closure below reassigns it.
    var numKVPairs = if (args.length > 1) args(1).toInt else 1000
    val valSize = if (args.length > 2) args(2).toInt else 1000
    val numReducers = if (args.length > 3) args(3).toInt else numMappers

    val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random

      // map output sizes linearly increase from the 1st to the last
      // NOTE(review): this reassigns the captured var from inside the closure; confirm each
      // task sees the intended starting value.
      numKVPairs = (1.0 * (p + 1) / numMappers * numKVPairs).toInt

      // Random non-negative keys, zero-filled values of valSize bytes.
      val arr1 = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
    // Enforce that everything has been calculated and in cache


// scalastyle:on println 
Example 50
Source File: SparkHdfsLR.scala    From sparkoscope   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{DenseVector, Vector}

import org.apache.spark.sql.SparkSession

/**
 * Logistic-regression example that reads its data points from a text file.
 *
 * `main` expects `<file> <iters>`: it prints a usage line to stderr when fewer than two
 * arguments are given, initializes the weight vector `w` with D values of the form
 * `2 * rand.nextDouble - 1`, and for each iteration subtracts a reduced gradient from `w`,
 * printing `w` before the loop and after it. `showWarning` carries a multi-line notice
 * that this is a naive implementation.
 *
 * NOTE(review): this listing is truncated -- the gradient expression is garbled
 * (`( - 1) * p.y`), the right-hand sides of `lines` and `points` are missing, the
 * SparkSession builder chain and the call wrapping the warning text are missing, the
 * triple-quoted warning string is never terminated, and closing braces are missing.
 */
object SparkHdfsLR {
  val D = 10   // Number of dimensions
  // Fixed seed so the initial weights are the same on every run.
  val rand = new Random(42)

  // A feature vector `x` with its target value `y`.
  case class DataPoint(x: Vector[Double], y: Double)

  // Parses one space-separated line: the first token is y, the next D tokens are x.
  // Any tokens beyond the first D + 1 are ignored.
  def parsePoint(line: String): DataPoint = {
    val tok = new java.util.StringTokenizer(line, " ")
    var y = tok.nextToken.toDouble
    var x = new Array[Double](D)
    var i = 0
    while (i < D) {
      x(i) = tok.nextToken.toDouble; i += 1
    DataPoint(new DenseVector(x), y)

  def showWarning() {
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use
        |for more conventional use.

  def main(args: Array[String]) {

    if (args.length < 2) {
      System.err.println("Usage: SparkHdfsLR <file> <iters>")


    val spark = SparkSession

    val inputPath = args(0)
    val lines =

    val points =
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D) {2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = { p =>
        p.x * (1 / (1 + exp(-p.y * ( - 1) * p.y
      }.reduce(_ + _)
      w -= gradient

    println("Final w: " + w)
// scalastyle:on println 
Example 51
Source File: LocalLR.scala    From sparkoscope   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import breeze.linalg.{DenseVector, Vector}

object LocalLR {
  val N = 10000  // Number of data points
  val D = 10   // Number of dimensions
  val R = 0.7  // Scaling factor
  val ITERATIONS = 5
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def generateData: Array[DataPoint] = {
    def generatePoint(i: Int): DataPoint = {
      val y = if (i % 2 == 0) -1 else 1
      val x = DenseVector.fill(D) {rand.nextGaussian + y * R}
      DataPoint(x, y)

  def showWarning() {
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use
        |for more conventional use.

  def main(args: Array[String]) {


    val data = generateData
    // Initialize w to a random value
    var w = DenseVector.fill(D) {2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      var gradient = DenseVector.zeros[Double](D)
      for (p <- data) {
        val scale = (1 / (1 + math.exp(-p.y * ( - 1) * p.y
        gradient +=  p.x * scale
      w -= gradient

    println("Final w: " + w)
// scalastyle:on println 
Example 52
Source File: GroupByTest.scala    From sparkoscope   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.sql.SparkSession

object GroupByTest {
  def main(args: Array[String]) {
    val spark = SparkSession
      .appName("GroupBy Test")

    val numMappers = if (args.length > 0) args(0).toInt else 2
    val numKVPairs = if (args.length > 1) args(1).toInt else 1000
    val valSize = if (args.length > 2) args(2).toInt else 1000
    val numReducers = if (args.length > 3) args(3).toInt else numMappers

    val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random
      val arr1 = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
    // Enforce that everything has been calculated and in cache


// scalastyle:on println 
Example 53
Source File: LocalFileLR.scala    From sparkoscope   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import breeze.linalg.{DenseVector, Vector}

object LocalFileLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  /**
   * Turns one space-separated line of numbers into a [[DataPoint]]: field 0 is the
   * label, fields 1 to D (inclusive) form the feature vector.
   */
  def parsePoint(line: String): DataPoint = {
    val fields = line.split(' ').map(_.toDouble)
    val features = fields.slice(1, D + 1)
    val label = fields(0)
    DataPoint(new DenseVector(features), label)
  }

  def showWarning() {
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use
        |for more conventional use.

  def main(args: Array[String]) {


    val lines =
    val points = _)
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D) {2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      var gradient = DenseVector.zeros[Double](D)
      for (p <- points) {
        val scale = (1 / (1 + math.exp(-p.y * ( - 1) * p.y
        gradient += p.x * scale
      w -= gradient

    println("Final w: " + w)
// scalastyle:on println 
Example 54
Source File: PageViewGenerator.scala    From sparkoscope   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.streaming.clickstream

import java.util.Random

// scalastyle:on
object PageViewGenerator {
  val pages = Map("" -> .7,
                  "" -> 0.2,
                  "" -> .1)
  val httpStatus = Map(200 -> .95,
                       404 -> .05)
  val userZipCode = Map(94709 -> .5,
                        94117 -> .5)
  val userID = Map((1 to 100).map(_ -> .01): _*)

  /**
   * Picks one key of `inputMap` at random, treating each value as that key's probability.
   *
   * A uniform draw in [0, 1) is compared against the running sum of probabilities taken
   * in the map's iteration order; the first key whose cumulative sum exceeds the draw is
   * returned. If no cumulative sum exceeds the draw (probabilities add up to less than
   * the draw), the first key of the map is returned.
   *
   * @param inputMap items mapped to their probabilities; must be non-empty
   * @tparam T the item type
   * @return the selected item
   * @throws NoSuchElementException if `inputMap` is empty
   */
  def pickFromDistribution[T](inputMap: Map[T, Double]): T = {
    val threshold = new Random().nextDouble()
    // Lazily pair every item with the cumulative probability up to and including it.
    // The seed element carries no item and is skipped by the pattern below.
    val cumulative = inputMap.iterator.scanLeft((Option.empty[T], 0.0)) {
      case ((_, total), (item, prob)) => (Some(item), total + prob)
    }
    cumulative
      .collectFirst { case (Some(item), total) if total > threshold => item }
      .getOrElse(inputMap.head._1) // Shouldn't get here if probabilities add up to 1.0
  }

  /**
   * Builds the string form of one random page view by drawing, in this order, a user id,
   * a page, an HTTP status and a zip code from their respective distributions.
   */
  def getNextClickEvent(): String = {
    val userId = pickFromDistribution(userID)
    val url = pickFromDistribution(pages)
    val statusCode = pickFromDistribution(httpStatus)
    val zip = pickFromDistribution(userZipCode)
    val view = new PageView(url, statusCode, zip, userId)
    view.toString()
  }

  def main(args: Array[String]) {
    if (args.length != 2) {
      System.err.println("Usage: PageViewGenerator <port> <viewsPerSecond>")
    val port = args(0).toInt
    val viewsPerSecond = args(1).toFloat
    val sleepDelayMs = (1000.0 / viewsPerSecond).toInt
    val listener = new ServerSocket(port)
    println("Listening on port: " + port)

    while (true) {
      val socket = listener.accept()
      new Thread() {
        override def run(): Unit = {
          println("Got client connected from: " + socket.getInetAddress)
          val out = new PrintWriter(socket.getOutputStream(), true)

          while (true) {
// scalastyle:on println 
Example 55
Source File: SparkLR.scala    From sparkoscope   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{DenseVector, Vector}

import org.apache.spark.sql.SparkSession

object SparkLR {
  val N = 10000  // Number of data points
  val D = 10   // Number of dimensions
  val R = 0.7  // Scaling factor
  val ITERATIONS = 5
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def generateData: Array[DataPoint] = {
    def generatePoint(i: Int): DataPoint = {
      val y = if (i % 2 == 0) -1 else 1
      val x = DenseVector.fill(D) {rand.nextGaussian + y * R}
      DataPoint(x, y)

  def showWarning() {
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use
        |for more conventional use.

  def main(args: Array[String]) {


    val spark = SparkSession

    val numSlices = if (args.length > 0) args(0).toInt else 2
    val points = spark.sparkContext.parallelize(generateData, numSlices).cache()

    // Initialize w to a random value
    var w = DenseVector.fill(D) {2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = { p =>
        p.x * (1 / (1 + exp(-p.y * ( - 1) * p.y
      }.reduce(_ + _)
      w -= gradient

    println("Final w: " + w)

// scalastyle:on println 
Example 56
Source File: LocalKMeans.scala    From sparkoscope   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.collection.mutable.HashMap
import scala.collection.mutable.HashSet

import breeze.linalg.{squaredDistance, DenseVector, Vector}

object LocalKMeans {
  val N = 1000
  val R = 1000    // Scaling factor
  val D = 10
  val K = 10
  val convergeDist = 0.001
  val rand = new Random(42)

  def generateData: Array[DenseVector[Double]] = {
    def generatePoint(i: Int): DenseVector[Double] = {
      DenseVector.fill(D) {rand.nextDouble * R}

  def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = {
    var index = 0
    var bestIndex = 0
    var closest = Double.PositiveInfinity

    for (i <- 1 to centers.size) {
      val vCurr = centers.get(i).get
      val tempDist = squaredDistance(p, vCurr)
      if (tempDist < closest) {
        closest = tempDist
        bestIndex = i


  def showWarning() {
      """WARN: This is a naive implementation of KMeans Clustering and is given as an example!
        |Please use
        |for more conventional use.

  def main(args: Array[String]) {


    val data = generateData
    var points = new HashSet[Vector[Double]]
    var kPoints = new HashMap[Int, Vector[Double]]
    var tempDist = 1.0

    while (points.size < K) {

    val iter = points.iterator
    for (i <- 1 to points.size) {

    println("Initial centers: " + kPoints)

    while(tempDist > convergeDist) {
      var closest = (p => (closestPoint(p, kPoints), (p, 1)))

      var mappings = closest.groupBy[Int] (x => x._1)

      var pointStats = { pair =>
        pair._2.reduceLeft [(Int, (Vector[Double], Int))] {
          case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2))

      var newPoints = {mapping =>
        (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))}

      tempDist = 0.0
      for (mapping <- newPoints) {
        tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2)

      for (newP <- newPoints) {
        kPoints.put(newP._1, newP._2)

    println("Final centers: " + kPoints)
// scalastyle:on println 
Example 57
Source File: StopwatchSuite.scala    From sparkoscope   with Apache License 2.0

import java.util.Random

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class StopwatchSuite extends SparkFunSuite with MLlibTestSparkContext {

  import StopwatchSuite._

  private def testStopwatchOnDriver(sw: Stopwatch): Unit = {
    assert( === "sw")
    assert(sw.elapsed() === 0L)
    intercept[AssertionError] {
    val duration = checkStopwatch(sw)
    val elapsed = sw.elapsed()
    assert(elapsed === duration)
    val duration2 = checkStopwatch(sw)
    val elapsed2 = sw.elapsed()
    assert(elapsed2 === duration + duration2)
    assert(sw.toString === s"sw: ${elapsed2}ms")
    intercept[AssertionError] {

  test("LocalStopwatch") {
    val sw = new LocalStopwatch("sw")

  test("DistributedStopwatch on driver") {
    val sw = new DistributedStopwatch(sc, "sw")

  test("DistributedStopwatch on executors") {
    val sw = new DistributedStopwatch(sc, "sw")
    val rdd = sc.parallelize(0 until 4, 4)
    val acc = sc.longAccumulator
    rdd.foreach { i =>
    val elapsed = sw.elapsed()
    assert(elapsed === acc.value)

  test("MultiStopwatch") {
    val sw = new MultiStopwatch(sc)
    assert(sw("local").name === "local")
    assert(sw("spark").name === "spark")
    intercept[NoSuchElementException] {
    assert(sw.toString === "{\n  local: 0ms,\n  spark: 0ms\n}")
    val localDuration = checkStopwatch(sw("local"))
    val sparkDuration = checkStopwatch(sw("spark"))
    val localElapsed = sw("local").elapsed()
    val sparkElapsed = sw("spark").elapsed()
    assert(localElapsed === localDuration)
    assert(sparkElapsed === sparkDuration)
    assert(sw.toString ===
      s"{\n  local: ${localElapsed}ms,\n  spark: ${sparkElapsed}ms\n}")
    val rdd = sc.parallelize(0 until 4, 4)
    val acc = sc.longAccumulator
    rdd.foreach { i =>
      val duration = checkStopwatch(sw("spark"))
    val localElapsed2 = sw("local").elapsed()
    assert(localElapsed2 === localElapsed)
    val sparkElapsed2 = sw("spark").elapsed()
    assert(sparkElapsed2 === sparkElapsed + acc.value)

private object StopwatchSuite extends SparkFunSuite {

  private def now: Long = System.currentTimeMillis()
Example 58
Source File: PartitionwiseSampledRDD.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.random.RandomSampler
import org.apache.spark.util.Utils

/**
 * A partition that wraps a parent partition together with a per-partition seed.
 *
 * @param prev the wrapped parent partition
 * @param seed the seed value carried alongside this partition
 */
class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  // Reuses the index of the wrapped partition.
  override val index: Int = prev.index

private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[T],
    sampler: RandomSampler[T, U],
    preservesPartitioning: Boolean,
    @transient private val seed: Long = Utils.random.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T] => new PartitionwiseSampledRDDPartition(x, random.nextLong()))

  override def getPreferredLocations(split: Partition): Seq[String] =

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
Example 59
Source File: SampledRDD.scala    From SparkCore   with Apache License 2.0
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.PoissonDistribution

import org.apache.spark.{Partition, TaskContext}

@deprecated("Replaced by PartitionwiseSampledRDDPartition", "1.0.0")
class SampledRDDPartition(val prev: Partition, val seed: Int) extends Partition with Serializable {
  override val index: Int = prev.index

@deprecated("Replaced by PartitionwiseSampledRDD", "1.0.0")
private[spark] class SampledRDD[T: ClassTag](
    prev: RDD[T],
    withReplacement: Boolean,
    frac: Double,
    seed: Int)
  extends RDD[T](prev) {

  override def getPartitions: Array[Partition] = {
    val rg = new Random(seed)
    firstParent[T] => new SampledRDDPartition(x, rg.nextInt))

  override def getPreferredLocations(split: Partition): Seq[String] =

  override def compute(splitIn: Partition, context: TaskContext): Iterator[T] = {
    val split = splitIn.asInstanceOf[SampledRDDPartition]
    if (withReplacement) {
      // For large datasets, the expected number of occurrences of each element in a sample with
      // replacement is Poisson(frac). We use that to get a count for each element.
      val poisson = new PoissonDistribution(frac)

      firstParent[T].iterator(split.prev, context).flatMap { element =>
        val count = poisson.sample()
        if (count == 0) {
          Iterator.empty  // Avoid object allocation when we return 0 items, which is quite often
        } else {
    } else { // Sampling without replacement
      val rand = new Random(split.seed)
      firstParent[T].iterator(split.prev, context).filter(x => (rand.nextDouble <= frac))
Example 60
Source File: PartitionwiseSampledRDD.scala    From SparkCore   with Apache License 2.0
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.random.RandomSampler
import org.apache.spark.util.Utils

/**
 * A partition that wraps a parent partition together with a per-partition seed.
 *
 * @param prev the wrapped parent partition
 * @param seed the seed value carried alongside this partition
 */
class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  // Reuses the index of the wrapped partition.
  override val index: Int = prev.index

private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[T],
    sampler: RandomSampler[T, U],
    @transient preservesPartitioning: Boolean,
    @transient seed: Long = Utils.random.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T] => new PartitionwiseSampledRDDPartition(x, random.nextLong()))

  override def getPreferredLocations(split: Partition): Seq[String] =

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
Example 61
Source File: RandomProjection.scala    From spark-neighbors   with MIT License
package com.github.karlhigley.spark.neighbors.linalg

import java.util.Random

import breeze.stats.distributions.CauchyDistribution
import org.apache.spark.mllib.linalg.{ DenseMatrix, Matrices }
import org.apache.spark.mllib.linalg.{ DenseVector, Vector }

  def generateGaussian(originalDim: Int, projectedDim: Int, random: Random): RandomProjection = {
    val localMatrix = DenseMatrix.randn(projectedDim, originalDim, random)
    new RandomProjection(localMatrix)

  def generateCauchy(originalDim: Int, projectedDim: Int, random: Random): RandomProjection = {
    def randc(numRows: Int, numCols: Int): DenseMatrix = {
        numRows.toLong * numCols <= Int.MaxValue,
        s"$numRows x $numCols dense matrix is too large to allocate"
      val cauchyDistribution = new CauchyDistribution(0, 1)
      new DenseMatrix(numRows, numCols, cauchyDistribution.drawMany(numRows * numCols))

    val localMatrix = randc(projectedDim, originalDim)
    new RandomProjection(localMatrix)
Example 62
Source File: BitSamplingFunction.scala    From spark-neighbors   with MIT License
package com.github.karlhigley.spark.neighbors.lsh

import java.util.Random
import scala.collection.immutable.BitSet

import org.apache.spark.mllib.linalg.SparseVector

  def generate(
    originalDim: Int,
    signatureLength: Int,
    random: Random = new Random
  ): BitSamplingFunction = {
    val indices = Array.fill(signatureLength) {

    new BitSamplingFunction(indices)
Example 63
Source File: MinhashFunction.scala    From spark-neighbors   with MIT License
package com.github.karlhigley.spark.neighbors.lsh

import java.util.Random

import org.apache.spark.mllib.linalg.SparseVector

  /**
   * Creates a [[MinhashFunction]] backed by `signatureLength` permutation functions,
   * each obtained from `PermutationFunction.random` with the same `dimensions`, `prime`
   * and `random` source, drawn in slot order.
   */
  def generate(
    dimensions: Int,
    signatureLength: Int,
    prime: Int,
    random: Random = new Random
  ): MinhashFunction = {
    val perms = new Array[PermutationFunction](signatureLength)
    for (slot <- perms.indices) {
      perms(slot) = PermutationFunction.random(dimensions, prime, random)
    }

    new MinhashFunction(perms)
  }
Example 64
Source File: TestLoadDataWithJunkChars.scala    From carbondata   with Apache License 2.0
package org.apache.carbondata.integration.spark.testsuite.dataload

import{BufferedWriter, File, FileWriter}
import java.util.Random

import org.apache.spark.sql.Row
import org.apache.spark.sql.test.util.QueryTest
import org.scalatest.BeforeAndAfterAll

class TestLoadDataWithJunkChars extends QueryTest with BeforeAndAfterAll {
  var filePath = ""
  val junkchars = "ǍǎǏǐǑǒǓǔǕǖǗǘǙǚǛǜǝǞǟǠǡǢǣǤǥǦǧǨǩǪǫǬǭǮǯǰ"

  def buildTestData() = {
    filePath = s"$integrationPath/spark/target/junkcharsdata.csv"
    val file = new File(filePath)
    val writer = new BufferedWriter(new FileWriter(file))
    val random = new Random
    for (i <- 1 until 1000) {
      writer.write("a" + i + "," + junkchars + "\n")
      if ( i % 100 == 0) {
    writer.write("a1000000," + junkchars)

  test("[bug]fix bug of duplicate rows in UnivocityCsvParser #877") {
    sql("drop table if exists junkcharsdata")
    sql("""create table if not exists junkcharsdata
             (c1 string, c2 string)
             STORED AS carbondata""")
    sql(s"LOAD DATA LOCAL INPATH '$filePath' into table junkcharsdata")
    checkAnswer(sql("select count(*) from junkcharsdata"), Seq(Row(1000)))
    sql("drop table if exists junkcharsdata")
    new File(filePath).delete()
Source File: DoubleDataTypeTestCase.scala    From carbondata   with Apache License 2.0
package org.apache.carbondata.integration.spark.testsuite.primitiveTypes

import java.util.Random

import org.apache.spark.sql.test.util.QueryTest
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Row, SaveMode}
import org.scalatest.BeforeAndAfterAll

class DoubleDataTypeTestCase extends QueryTest with BeforeAndAfterAll {

  lazy val df: DataFrame = generateDataFrame

  private def generateDataFrame(): DataFrame = {
    val r = new Random()
    val rdd = sqlContext.sparkContext
      .parallelize(1 to 10, 2)
      .map { x =>
        Row(x, "London" + (x % 2), x.toDouble / 13, x.toDouble / 11)

    val schema = StructType(
        StructField("id", IntegerType, nullable = false),
        StructField("city", StringType, nullable = false),
        StructField("m1", DoubleType, nullable = false),
        StructField("m2", DoubleType, nullable = false)

    sqlContext.createDataFrame(rdd, schema)

  override def beforeAll {
    sql("drop table if exists uniq_carbon")
    sql("drop table if exists uniq_hive")
    sql("drop table if exists doubleTypeCarbonTable")
    sql("drop table if exists doubleTypeHiveTable")

      .option("tableName", "doubleTypeCarbonTable")
      .option("tempCSV", "false")
      .option("table_blocksize", "32")



  test("detail query") {
    checkAnswer(sql("select * from doubleTypeCarbonTable order by id"),
      sql("select * from doubleTypeHiveTable order by id"))


  test("duplicate values") {
    sql("create table uniq_carbon(name string, double_column double) STORED AS carbondata ")
    sql(s"load data inpath '$resourcesPath/uniq.csv' into table uniq_carbon")
    sql("create table uniq_hive(name string, double_column double) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','")
    sql(s"load data local inpath '$resourcesPath/uniqwithoutheader.csv' into table uniq_hive")
    checkAnswer(sql("select * from uniq_carbon where double_column>=11"),
      sql("select * from uniq_hive where double_column>=11"))

//  test("agg query") {
//    checkAnswer(sql("select city, sum(m1), avg(m1), count(m1), max(m1), min(m1) from doubleTypeCarbonTable group by city"),
//      sql("select city, sum(m1), avg(m1), count(m1), max(m1), min(m1) from doubleTypeHiveTable group by city"))
//    checkAnswer(sql("select city, sum(m2), avg(m2), count(m2), max(m2), min(m2) from doubleTypeCarbonTable group by city"),
//      sql("select city, sum(m2), avg(m2), count(m2), max(m2), min(m2) from doubleTypeHiveTable group by city"))
//  }

  override def afterAll {
    sql("drop table if exists uniq_carbon")
    sql("drop table if exists uniq_hive")
    sql("drop table if exists doubleTypeCarbonTable")
    sql("drop table if exists doubleTypeHiveTable")
Source File: TestSource.scala    From carbondata   with Apache License 2.0
package org.apache.carbon.flink

import java.util.Random

import org.apache.flink.api.common.state.{ListState, ListStateDescriptor}
import org.apache.flink.runtime.state.{FunctionInitializationContext, FunctionSnapshotContext}
import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction
import org.apache.flink.streaming.api.functions.source.SourceFunction

abstract class TestSource(val dataCount: Int) extends SourceFunction[Array[AnyRef]] with CheckpointedFunction {
  private var dataIndex = 0
  private var dataIndexState: ListState[Integer] = _
  private var running = false

  def get(index: Int): Array[AnyRef]

  def onFinish(): Unit = {
    // to do nothing.

  override def run(sourceContext: SourceFunction.SourceContext[Array[AnyRef]]): Unit = {
    this.running = true
    while ( {
      this.running && this.dataIndex < this.dataCount
    }) {
      sourceContext.collectWithTimestamp(this.get(this.dataIndex), System.currentTimeMillis)
      this.dataIndex += 1

  override def cancel(): Unit = {
    this.running = false

  override def snapshotState(context: FunctionSnapshotContext): Unit = {

  /**
   * Obtains the operator list state named "dataIndex" and, when the context reports a
   * restore, replays the stored entries into `dataIndex`.
   *
   * Every stored entry is assigned in turn, so the last entry wins.
   *
   * @param context the initialization context supplying the operator state store
   */
  override def initializeState(context: FunctionInitializationContext): Unit = {
    this.dataIndexState = context.getOperatorStateStore.getListState(new ListStateDescriptor[Integer]("dataIndex", classOf[Integer]))
    if (context.isRestored) {
      // Explicit converter instead of the implicit JavaConversions, which is deprecated
      // and no longer exists in Scala 2.13.
      import scala.collection.JavaConverters._
      for (restoredIndex <- this.dataIndexState.get.asScala) {
        this.dataIndex = restoredIndex
      }
    }
  }

object TestSource {

  val randomCache = new ThreadLocal[Random] {
    override def initialValue(): Random = new Random()

Source File: AppleCustomPartitioner.scala    From spark_training   with Apache License 2.0

import java.util.Random

import org.apache.spark.Partitioner

/**
 * Partitioner for `(String, Long)` keys that partitions by the hash of the ticker
 * (the first tuple element).
 *
 * Keys whose ticker is "apple" are salted with a random suffix in 0..8 before hashing,
 * so they are spread over several partitions instead of all landing on one.
 *
 * @param numOfParts the number of partitions to distribute keys over
 */
class AppleCustomPartitioner(numOfParts:Int) extends Partitioner {
  override def numPartitions: Int = numOfParts

  // One shared generator; the original created a fresh Random on every access.
  val random: Random = new Random()

  /**
   * Returns the partition for `key`, which must be a `(String, Long)` tuple.
   *
   * @throws ClassCastException if `key` is not a tuple
   */
  override def getPartition(key: Any): Int = {
    val k = key.asInstanceOf[(String, Long)]
    val ticker = k._1
    val routingKey = if (ticker == "apple") ticker + random.nextInt(9) else ticker
    nonNegativeMod(routingKey.hashCode)
  }

  /**
   * Maps a hash code to a partition index in `[0, numPartitions)`.
   *
   * `Math.abs(Int.MinValue)` is still `Int.MinValue`, so a plain `abs % n` can produce
   * a negative index; that single value is mapped to partition 0. Every other hash
   * keeps the `abs % n` mapping.
   */
  private def nonNegativeMod(hash: Int): Int = {
    if (hash == Int.MinValue) 0 else Math.abs(hash) % numPartitions
  }
}
Source File: SaltedExample.scala    From spark_training   with Apache License 2.0

import java.util.Random

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession

object SaltedExample {

  def main(args:Array[String]): Unit = {

    val jsonPath = args(0)

    val sparkSession = SparkSession.builder
      .config("spark.some.config.option", "config-value")

    val jsonDfLeft =

    val saltedLeft = jsonDfLeft.rdd.flatMap(r => {
      val group = r.getAs[String]("group")
      val value = r.getAs[Long]("value")

      Seq((group + "_" + 0, value),(group + "_" + 1, value))

    val jsonDfRight =

    val saltedRight = jsonDfRight.rdd.mapPartitions(it => {

      val random = new Random() => {
        val group = r.getAs[String]("group")
        val value = r.getAs[Long]("value")

        (group + "_" + random.nextInt(2), value)

    jsonDfLeft.join(jsonDfRight).collect().foreach(r => {
      println("Normal.result:" + r)
    saltedLeft.join(saltedRight).collect().foreach(r => {
      println("Salted.result:" + r)
Source File: SessionDataFileHDFSWriter.scala    From spark_training   with Apache License 2.0

import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import java.util.Random

object SessionDataFileHDFSWriter {
  val eol = System.getProperty("line.separator");  
  def main(args: Array[String]) {
    if (args.length == 0) {
        println("SessionDataFileWriter {tempDir} {distDir} {numberOfFiles} {numberOfEventsPerFile} {waitBetweenFiles}");
    val conf = new Configuration
    conf.addResource(new Path("/etc/hadoop/conf/core-site.xml"))
    conf.addResource(new Path("/etc/hadoop/conf/mapred-site.xml"))
    conf.addResource(new Path("/etc/hadoop/conf/hdfs-site.xml"))
    val fs = FileSystem.get(new Configuration)
    val rootTempDir = args(0)
    val rootDistDir = args(1)
    val files = args(2).toInt
    val loops = args(3).toInt
    val waitBetweenFiles = args(4).toInt
    val r = new Random
    for (f <- 1 to files) {
      val rootName = "/weblog." + System.currentTimeMillis()
      val tmpPath = new Path(rootTempDir + rootName + ".tmp")
      val writer = new BufferedWriter(new OutputStreamWriter(fs.create(tmpPath)))
      print(f + ": [")
      val randomLoops = loops + r.nextInt(loops)
      for (i <- 1 to randomLoops) {
        writer.write(SessionDataGenerator.getNextEvent + eol)
        if (i%100 == 0) {
      val distPath = new Path(rootDistDir + rootName + ".dat")
      fs.rename(tmpPath, distPath)
Source File: RandSampleData.scala    From SparkMLlibDeepLearn   with Apache License 2.0
package util

import java.util.Random
import breeze.linalg.{
  Matrix => BM,
  CSCMatrix => BSM,
  DenseMatrix => BDM,
  Vector => BV,
  DenseVector => BDV,
  SparseVector => BSV,
  axpy => brzAxpy,
  svd => brzSvd
import breeze.numerics.{
  exp => Bexp,
  cos => Bcos,
  tanh => Btanh
import scala.math.Pi

object RandSampleData extends Serializable {
  // Rosenbrock:
  //   Σ(100*(x(i+1)-x(i)^2)^2 + (x(i)-1)^2)
  // Rastrigin:
  //   Σ(x(i)^2 - 10*cos(2*Pi*x(i)) + 10)
  // Sphere:
  //   Σ(x(i)^2)
  def RandM(
    n1: Int,
    n2: Int,
    b1: Double,
    b2: Double,
    function: String): BDM[Double] = {
    //    val n1 = 2
    //    val n2 = 3
    //    val b1 = -30
    //    val b2 = 30
    val bdm1 = BDM.rand(n1, n2) * (b2 - b1).toDouble + b1.toDouble
    val bdm_y = function match {
      case "rosenbrock" =>
        val xi0 = bdm1(::, 0 to (bdm1.cols - 2))
        val xi1 = bdm1(::, 1 to (bdm1.cols - 1))
        val xi2 = (xi0 :* xi0)
        val m1 = ((xi1 - xi2) :* (xi1 - xi2)) * 100.0 + ((xi0 - 1.0) :* (xi0 - 1.0))
        val m2 = m1 * BDM.ones[Double](m1.cols, 1)
      case "rastrigin" =>
        val xi0 = bdm1
        val xi2 = (xi0 :* xi0)
        val sicos = Bcos(xi0 * 2.0 * Pi) * 10.0
        val m1 = xi2 - sicos + 10.0
        val m2 = m1 * BDM.ones[Double](m1.cols, 1)
      case "sphere" =>
        val xi0 = bdm1
        val xi2 = (xi0 :* xi0)
        val m1 = xi2
        val m2 = m1 * BDM.ones[Double](m1.cols, 1)
    val randm = BDM.horzcat(bdm_y, bdm1)
Source File: RconConnector.scala    From chatoverflow   with Eclipse Public License 2.0
package org.codeoverflow.chatoverflow.requirement.service.rcon

import{DataInputStream, IOException, InputStream, OutputStream}
import{Socket, SocketException}
import java.nio.{ByteBuffer, ByteOrder}
import java.util.Random

import org.codeoverflow.chatoverflow.WithLogger
import org.codeoverflow.chatoverflow.connector.Connector

class RconConnector(override val sourceIdentifier: String) extends Connector(sourceIdentifier) with WithLogger {
  override protected var requiredCredentialKeys: List[String] = List("password", "address")
  override protected var optionalCredentialKeys: List[String] = List("port")

  private var socket: Socket = _
  private var outputStream: OutputStream = _
  private var inputStream: InputStream = _
  private var requestId: Int = 0

  def sendCommand(command: String): String = {
    logger debug s"Sending $command to RCON"
    requestId += 1
    if (write(2, command.getBytes("ASCII"))) {
      return read()

  override def stop(): Boolean = {
    logger info s"Stopped RCON connector to ${credentials.get.getValue("address").get}!"
Example 72
package com.cloudera.sparkts.models

import java.util.Random

import com.cloudera.sparkts.MatrixUtil.toBreeze

import org.apache.spark.mllib.linalg._
import org.apache.commons.math3.random.MersenneTwister
import org.scalatest.FunSuite

class AutoregressionSuite extends FunSuite {
  test("fit AR(1) model") {
    val model = new ARModel(1.5, Array(.2))
    val ts = model.sample(5000, new MersenneTwister(10L))
    val fittedModel = Autoregression.fitModel(ts, 1)
    assert(fittedModel.coefficients.length == 1)
    assert(math.abs(fittedModel.c - 1.5) < .07)
    assert(math.abs(fittedModel.coefficients(0) - .2) < .03)

  test("fit AR(2) model") {
    val model = new ARModel(1.5, Array(.2, .3))
    val ts = model.sample(5000, new MersenneTwister(10L))
    val fittedModel = Autoregression.fitModel(ts, 2)
    assert(fittedModel.coefficients.length == 2)
    assert(math.abs(fittedModel.c - 1.5) < .15)
    assert(math.abs(fittedModel.coefficients(0) - .2) < .03)
    assert(math.abs(fittedModel.coefficients(1) - .3) < .03)

  test("add and remove time dependent effects") {
    val rand = new Random()
    val ts = new DenseVector(Array.fill(1000)(rand.nextDouble()))
    val model = new ARModel(1.5, Array(.2, .3))
    val added = model.addTimeDependentEffects(ts, Vectors.zeros(ts.size))
    val removed = model.removeTimeDependentEffects(added, Vectors.zeros(ts.size))
    assert((toBreeze(ts) - toBreeze(removed)).toArray.forall(math.abs(_) < .001))
Source File: utils.scala    From scalabpe   with Apache License 2.0
package scalabpe.flow

import scalabpe.core._
import java.util.Random

/**
 * Hooks named `init` and `close`; each only prints a trace line.
 * NOTE(review): presumably invoked by the framework at startup/shutdown — confirm.
 */
object Global {
    /** Prints "init called". */
    def init(): Unit = {
        println("init called")
    }

    /** Prints "close called". */
    def close(): Unit = {
        println("close called")
    }
}

object FlowHelper { 

    val random = new Random()
    val jobStatusCache = new java.util.concurrent.ConcurrentHashMap[String,String]()

    // Looks up config key `s` through Flow.router, passing `defaultValue` (empty string by default) along.
    def getConfig(s:String,defaultValue:String="") = Flow.router.getConfig(s,defaultValue)

    /**
      * Tests whether a request field is empty (null or zero-length).
      *
      * @param req  request whose string fields are read via `req.s`
      * @param name a single field name, or a comma-separated list of names; with a list
      *             the result is true as soon as any listed field is empty
      */
    def isEmpty(req:Request,name:String):Boolean={
        if( name.indexOf(",") >= 0 ) isEmptyForAny(req,name)
        else isEmpty(req.s(name))
    }

    /**
      * Returns true when at least one of the comma-separated field names in `names`
      * refers to an empty request field, false when none does.
      */
    private def isEmptyForAny(req:Request,names:String):Boolean={
        names.split(",").exists( fieldName => isEmpty(req.s(fieldName)) )
    }

    /**
      * Tests whether a request field holds an integer.
      *
      * @param req  request whose string fields are read via `req.s`
      * @param name a single field name, or a comma-separated list of names; with a list
      *             the result is true as soon as any listed field is an integer
      */
    def isInt(req:Request,name:String):Boolean={
        if( name.indexOf(",") >= 0 ) isIntForAny(req,name)
        else isInt(req.s(name))
    }
    /**
      * Returns true when at least one of the comma-separated field names in `names`
      * refers to a request field that is an integer, false when none does.
      */
    private def isIntForAny(req:Request,names:String):Boolean={
        names.split(",").exists( fieldName => isInt(req.s(fieldName)) )
    }

    /** Returns true when `str` is null or has length zero. */
    def isEmpty(str:String):Boolean={
        str == null || str.isEmpty
    }

    /**
      * Returns true when `n` parses as a 32-bit signed decimal integer.
      *
      * null, empty, non-numeric and out-of-range strings all yield false.
      */
    def isInt(n:String):Boolean={
        try {
            // The parse is what decides the answer; its value is discarded.
            Integer.parseInt(n)
            true
        } catch {
            // parseInt reports every rejection, including null input, with this exception.
            case _: NumberFormatException => false
        }
    }

    /**
      * Tests whether `s` is one of the items of the delimited list `ss`.
      *
      * @param ss delimited list, e.g. "a,b,c"; null or empty gives false
      * @param s  item to look for; null or empty gives true (when `ss` is non-empty)
      * @param t  delimiter, "," by default
      */
    def checkInclude(ss:String,s:String,t:String=","):Boolean={
        if( ss == null || ss == "" ) false
        else if( s == null || s == "" ) true
        // Wrapping both sides in delimiters makes the substring test match whole items only.
        else (t+ss+t).indexOf(t+s+t) >= 0
    }

    /** Returns a random UUID as 32 hexadecimal characters, with the dashes removed. */
    def uuid(): String = {
        // Literal replace: no regex is needed to strip a fixed character.
        java.util.UUID.randomUUID().toString.replace("-", "")
    }

    def generateSeed():String = {
    // Returns a followed by b. NOTE(review): the name looks like a typo for "concat";
    // renaming it would change the public interface.
    def contact(a:String,b:String):String = a + b
Example 74
package coursier.cli.util

import{ByteArrayInputStream, ByteArrayOutputStream}
import java.util.Random
import{Deflater, ZipEntry, ZipInputStream, ZipOutputStream}

import coursier.launcher.internal.Zip
import org.junit.runner.RunWith
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatestplus.junit.JUnitRunner

/** Checks that Zip.zipEntries can read an archive written with a non-default deflater. */
class ZipTests extends AnyFlatSpec {

  "zipEntries" should "be fine with custom deflaters" in {

    // Inspired by

    val baos = new ByteArrayOutputStream
    // Anonymous subclass so the stream's `def` deflater can be replaced with one
    // configured as NO_COMPRESSION with nowrap = true.
    val output = new ZipOutputStream(baos) {
      `def` = new Deflater(Deflater.NO_COMPRESSION, true)
    // 1 MiB of random bytes as the payload of the single entry.
    val data = Array.ofDim[Byte](1024 * 1024)
    new Random().nextBytes(data)
    val entry = new ZipEntry("entry.dat")

    val result = baos.toByteArray

    val zos = new ZipOutputStream(new ByteArrayOutputStream)
    // Read the archive bytes back through Zip.zipEntries.
    val entryNames = Zip.zipEntries(new ZipInputStream(new ByteArrayInputStream(result)))
      .map {
        case (ent, content) =>
          val name = ent.getName
    // The archive must expose exactly the one entry created above.
    assert(entryNames == Vector("entry.dat"))

Source File: SimpleSkewedGroupByTest.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.sql.SparkSession

/**
 * Example job: generates key/value pairs whose keys are skewed towards reducer 0 and
 * prints the number of groups produced by groupByKey.
 */
object SimpleSkewedGroupByTest {
  def main(args: Array[String]) {
    val spark = SparkSession

    // Optional positional arguments; each falls back to the default shown when absent.
    val numMappers = if (args.length > 0) args(0).toInt else 2
    val numKVPairs = if (args.length > 1) args(1).toInt else 1000
    val valSize = if (args.length > 2) args(2).toInt else 1000
    val numReducers = if (args.length > 3) args(3).toInt else numMappers
    // NOTE(review): the override is parsed with toInt while the default is 5.0, so a
    // fractional ratio on the command line throws NumberFormatException.
    val ratio = if (args.length > 4) args(4).toInt else 5.0

    // One partition per mapper; each builds numKVPairs (key, zero-filled byte array) pairs.
    val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random
      val result = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        // offset is always a multiple of numReducers.
        val offset = ranGen.nextInt(1000) * numReducers
        if (ranGen.nextDouble < ratio / (numReducers + ratio - 1)) {
          // give ratio times higher chance of generating key 0 (for reducer 0)
          result(i) = (offset, byteArr)
        } else {
          // generate a key for one of the other reducers
          val key = 1 + ranGen.nextInt(numReducers-1) + offset
          result(i) = (key, byteArr)
    // Enforce that everything has been calculated and in cache

    println("RESULT: " + pairs1.groupByKey(numReducers).count)
    // Print how many keys each reducer got (for debugging)
    // println("RESULT: " + pairs1.groupByKey(numReducers)
    //                           .map{case (k,v) => (k, v.size)}
    //                           .collectAsMap)

// scalastyle:on println 
Source File: SkewedGroupByTest.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.sql.SparkSession

/**
 * Example job: builds random-keyed pairs where later mappers produce more pairs than
 * earlier ones.
 */
object SkewedGroupByTest {
  def main(args: Array[String]) {
    val spark = SparkSession
      .appName("GroupBy Test")

    // Optional positional arguments; each falls back to the default shown when absent.
    val numMappers = if (args.length > 0) args(0).toInt else 2
    var numKVPairs = if (args.length > 1) args(1).toInt else 1000
    val valSize = if (args.length > 2) args(2).toInt else 1000
    val numReducers = if (args.length > 3) args(3).toInt else numMappers

    val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random

      // map output sizes linearly increase from the 1st to the last
      // NOTE(review): this reassigns a var captured by the closure; presumably each task
      // works on its own deserialized copy — confirm before relying on the sizes.
      numKVPairs = (1.0 * (p + 1) / numMappers * numKVPairs).toInt

      // Keys are uniform random non-negative ints; values are zero-filled byte arrays.
      val arr1 = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
    // Enforce that everything has been calculated and in cache


Example 77
Source File: SparkHdfsLR.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{DenseVector, Vector}

import org.apache.spark.sql.SparkSession

object SparkHdfsLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  /**
   * Parses one space-separated line into a DataPoint.
   *
   * The first token is the label `y`; the next `D` tokens are the feature values.
   * A line with fewer than `D + 1` tokens fails with NoSuchElementException and a
   * non-numeric token with NumberFormatException.
   */
  def parsePoint(line: String): DataPoint = {
    val tok = new java.util.StringTokenizer(line, " ")
    val y = tok.nextToken.toDouble
    // Array.fill evaluates its element expression once per slot, in order,
    // so the tokens are consumed left to right.
    val x = Array.fill(D)(tok.nextToken.toDouble)
    DataPoint(new DenseVector(x), y)
  }

  def showWarning() {
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use
        |for more conventional use.

  def main(args: Array[String]) {

    if (args.length < 2) {
      System.err.println("Usage: SparkHdfsLR <file> <iters>")


    val spark = SparkSession

    val inputPath = args(0)
    val lines =

    val points =
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D) {2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = { p =>
        p.x * (1 / (1 + exp(-p.y * ( - 1) * p.y
      }.reduce(_ + _)
      w -= gradient

    println("Final w: " + w)
// scalastyle:on println 
Source File: LocalLR.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import breeze.linalg.{DenseVector, Vector}

object LocalLR {
  val N = 10000  // Number of data points
  val D = 10   // Number of dimensions
  val R = 0.7  // Scaling factor
  val ITERATIONS = 5
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  // Builds the synthetic data set. Only the per-point generator is visible here;
  // the expression that produces the array is not shown in this excerpt.
  def generateData: Array[DataPoint] = {
    // Label alternates with the parity of i: -1 for even i, +1 for odd i. Each of the
    // D features is a standard Gaussian draw shifted by y * R.
    def generatePoint(i: Int): DataPoint = {
      val y = if (i % 2 == 0) -1 else 1
      val x = DenseVector.fill(D) {rand.nextGaussian + y * R}
      DataPoint(x, y)

  def showWarning() {
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use
        |for more conventional use.

  def main(args: Array[String]) {


    val data = generateData
    // Initialize w to a random value
    var w = DenseVector.fill(D) {2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      var gradient = DenseVector.zeros[Double](D)
      for (p <- data) {
        val scale = (1 / (1 + math.exp(-p.y * ( - 1) * p.y
        gradient +=  p.x * scale
      w -= gradient

    println("Final w: " + w)
Example 79
Source File: GroupByTest.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.sql.SparkSession

/** Example job: every mapper builds numKVPairs random-keyed pairs with zero-filled values. */
object GroupByTest {
  def main(args: Array[String]) {
    val spark = SparkSession
      .appName("GroupBy Test")

    // Optional positional arguments; each falls back to the default shown when absent.
    val numMappers = if (args.length > 0) args(0).toInt else 2
    val numKVPairs = if (args.length > 1) args(1).toInt else 1000
    val valSize = if (args.length > 2) args(2).toInt else 1000
    val numReducers = if (args.length > 3) args(3).toInt else numMappers

    // One partition per mapper; keys are uniform random non-negative ints.
    val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random
      val arr1 = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
    // Enforce that everything has been calculated and in cache


Example 80
Source File: LocalFileLR.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import breeze.linalg.{DenseVector, Vector}

object LocalFileLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  /**
   * Parses one space-separated line of numbers into a DataPoint.
   *
   * Field 0 is the label; fields 1 to D are the features. `slice` does not fail on short
   * input, so a line with fewer than `D + 1` fields yields a shorter feature vector.
   * An empty or non-numeric field throws NumberFormatException.
   */
  def parsePoint(line: String): DataPoint = {
    val nums = line.split(' ').map(_.toDouble)
    DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0))
  }

  def showWarning() {
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use
        |for more conventional use.

  def main(args: Array[String]) {


    val lines =
    val points = _)
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D) {2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      var gradient = DenseVector.zeros[Double](D)
      for (p <- points) {
        val scale = (1 / (1 + math.exp(-p.y * ( - 1) * p.y
        gradient += p.x * scale
      w -= gradient

    println("Final w: " + w)
Example 81
Source File: PageViewGenerator.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.streaming.clickstream

import java.util.Random

// scalastyle:on
object PageViewGenerator {
  val pages = Map("" -> .7,
                  "" -> 0.2,
                  "" -> .1)
  val httpStatus = Map(200 -> .95,
                       404 -> .05)
  val userZipCode = Map(94709 -> .5,
                        94117 -> .5)
  val userID = Map((1 to 100).map(_ -> .01): _*)

  /**
   * Samples one key of `inputMap`, treating each value as that key's probability.
   *
   * A uniform draw in [0, 1) is compared against the running total of the probabilities
   * in the map's iteration order; the first key whose running total exceeds the draw wins.
   *
   * @param inputMap key -> probability; the probabilities are expected to sum to 1.0
   * @return the sampled key, or the map's first key if no running total exceeds the draw
   * @throws NoSuchElementException if `inputMap` is empty
   */
  def pickFromDistribution[T](inputMap: Map[T, Double]): T = {
    val threshold = new Random().nextDouble()
    val entries = inputMap.toSeq
    // cumulative(i) = p(0) + ... + p(i), accumulated in iteration order.
    val cumulative = entries.scanLeft(0.0)(_ + _._2).tail
    val picked = entries.zip(cumulative).collectFirst {
      case ((item, _), total) if total > threshold => item
    }
    // Shouldn't fall back if probabilities add up to 1.0
    picked.getOrElse(inputMap.head._1)
  }

  // Builds one click event: user id, page, HTTP status and zip code are each sampled
  // independently from their distribution map, wrapped in a PageView and rendered
  // with toString.
  def getNextClickEvent(): String = {
    val id = pickFromDistribution(userID)
    val page = pickFromDistribution(pages)
    val status = pickFromDistribution(httpStatus)
    val zipCode = pickFromDistribution(userZipCode)
    new PageView(page, status, zipCode, id).toString()

    if (args.length != 2) {
      System.err.println("Usage: PageViewGenerator <port> <viewsPerSecond>")
    val port = args(0).toInt
    val viewsPerSecond = args(1).toFloat
    val sleepDelayMs = (1000.0 / viewsPerSecond).toInt
    val listener = new ServerSocket(port)
    println("Listening on port: " + port)

    while (true) {
      val socket = listener.accept()
      new Thread() {
        override def run(): Unit = {
          println("Got client connected from: " + socket.getInetAddress)
          val out = new PrintWriter(socket.getOutputStream(), true)

          while (true) {
Example 82
Source File: SparkLR.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{DenseVector, Vector}

import org.apache.spark.sql.SparkSession

object SparkLR {
  val N = 10000  // Number of data points
  val D = 10   // Number of dimensions
  val R = 0.7  // Scaling factor
  val ITERATIONS = 5
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  // Builds the synthetic data set. Only the per-point generator is visible here;
  // the expression that produces the array is not shown in this excerpt.
  def generateData: Array[DataPoint] = {
    // Label alternates with the parity of i: -1 for even i, +1 for odd i. Each of the
    // D features is a standard Gaussian draw shifted by y * R.
    def generatePoint(i: Int): DataPoint = {
      val y = if (i % 2 == 0) -1 else 1
      val x = DenseVector.fill(D) {rand.nextGaussian + y * R}
      DataPoint(x, y)

  def showWarning() {
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use
        |for more conventional use.

  def main(args: Array[String]) {


    val spark = SparkSession

    val numSlices = if (args.length > 0) args(0).toInt else 2
    val points = spark.sparkContext.parallelize(generateData, numSlices).cache()

    // Initialize w to a random value
    var w = DenseVector.fill(D) {2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = { p =>
        p.x * (1 / (1 + exp(-p.y * ( - 1) * p.y
      }.reduce(_ + _)
      w -= gradient

    println("Final w: " + w)

Example 83
Source File: LocalKMeans.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.collection.mutable.HashMap
import scala.collection.mutable.HashSet

import breeze.linalg.{squaredDistance, DenseVector, Vector}

object LocalKMeans {
  val N = 1000
  val R = 1000    // Scaling factor
  val D = 10
  val K = 10
  val convergeDist = 0.001
  val rand = new Random(42)

  // Builds the synthetic points. Only the per-point generator is visible here;
  // the expression that produces the array is not shown in this excerpt.
  def generateData: Array[DenseVector[Double]] = {
    // Each of the D coordinates is uniform in [0, R); the index i is not used.
    def generatePoint(i: Int): DenseVector[Double] = {
      DenseVector.fill(D) {rand.nextDouble * R}

  /**
   * Finds the center nearest to `p` by squared Euclidean distance.
   *
   * @param p       the point to classify
   * @param centers centers keyed by the consecutive ids 1..centers.size
   * @return the id of the nearest center (the lowest id wins a tie), or 0 when
   *         `centers` is empty
   * @throws NoSuchElementException if an id in 1..centers.size is missing from `centers`
   */
  def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = {
    var bestIndex = 0
    var closest = Double.PositiveInfinity

    for (i <- 1 to centers.size) {
      val tempDist = squaredDistance(p, centers(i))
      // Strict comparison keeps the earliest center on equal distances.
      if (tempDist < closest) {
        closest = tempDist
        bestIndex = i
      }
    }

    bestIndex
  }


  def showWarning() {
      """WARN: This is a naive implementation of KMeans Clustering and is given as an example!
        |Please use
        |for more conventional use.

  def main(args: Array[String]) {


    val data = generateData
    var points = new HashSet[Vector[Double]]
    var kPoints = new HashMap[Int, Vector[Double]]
    var tempDist = 1.0

    while (points.size < K) {

    val iter = points.iterator
    for (i <- 1 to points.size) {

    println("Initial centers: " + kPoints)

    while(tempDist > convergeDist) {
      var closest = (p => (closestPoint(p, kPoints), (p, 1)))

      var mappings = closest.groupBy[Int] (x => x._1)

      var pointStats = { pair =>
        pair._2.reduceLeft [(Int, (Vector[Double], Int))] {
          case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2))

      var newPoints = {mapping =>
        (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))}

      tempDist = 0.0
      for (mapping <- newPoints) {
        tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2)

      for (newP <- newPoints) {
        kPoints.put(newP._1, newP._2)

    println("Final centers: " + kPoints)
Example 84
Source File: StopwatchSuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up

import java.util.Random

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

/**
 * Tests for the Stopwatch implementations (local, distributed and multi).
 * NOTE(review): several statements in this excerpt are missing (e.g. the bodies of the
 * intercept blocks), so the comments below describe only what is visible.
 */
class StopwatchSuite extends SparkFunSuite with MLlibTestSparkContext {

  import StopwatchSuite._

  // Shared checks for a stopwatch used on the driver: elapsed starts at 0, then
  // accumulates the durations reported by two consecutive checkStopwatch runs, and
  // toString renders as "sw: <elapsed>ms".
  private def testStopwatchOnDriver(sw: Stopwatch): Unit = {
    assert( === "sw")
    assert(sw.elapsed() === 0L)
    intercept[AssertionError] {
    val duration = checkStopwatch(sw)
    val elapsed = sw.elapsed()
    assert(elapsed === duration)
    val duration2 = checkStopwatch(sw)
    val elapsed2 = sw.elapsed()
    assert(elapsed2 === duration + duration2)
    assert(sw.toString === s"sw: ${elapsed2}ms")
    intercept[AssertionError] {

  test("LocalStopwatch") {
    val sw = new LocalStopwatch("sw")

  test("DistributedStopwatch on driver") {
    val sw = new DistributedStopwatch(sc, "sw")

  // Runs over a 4-partition RDD and expects the stopwatch's elapsed time to equal
  // the value collected in the long accumulator.
  test("DistributedStopwatch on executors") {
    val sw = new DistributedStopwatch(sc, "sw")
    val rdd = sc.parallelize(0 until 4, 4)
    val acc = sc.longAccumulator
    rdd.foreach { i =>
    val elapsed = sw.elapsed()
    assert(elapsed === acc.value)

  // A MultiStopwatch exposes named stopwatches ("local", "spark") and renders them
  // together in toString; the local one must be unaffected by work timed on "spark".
  test("MultiStopwatch") {
    val sw = new MultiStopwatch(sc)
    assert(sw("local").name === "local")
    assert(sw("spark").name === "spark")
    intercept[NoSuchElementException] {
    assert(sw.toString === "{\n  local: 0ms,\n  spark: 0ms\n}")
    val localDuration = checkStopwatch(sw("local"))
    val sparkDuration = checkStopwatch(sw("spark"))
    val localElapsed = sw("local").elapsed()
    val sparkElapsed = sw("spark").elapsed()
    assert(localElapsed === localDuration)
    assert(sparkElapsed === sparkDuration)
    assert(sw.toString ===
      s"{\n  local: ${localElapsed}ms,\n  spark: ${sparkElapsed}ms\n}")
    val acc = sc.longAccumulator
    rdd.foreach { i =>
      val duration = checkStopwatch(sw("spark"))
    val localElapsed2 = sw("local").elapsed()
    assert(localElapsed2 === localElapsed)
    val sparkElapsed2 = sw("spark").elapsed()
    assert(sparkElapsed2 === sparkElapsed + acc.value)

private object StopwatchSuite extends SparkFunSuite {

  private def now: Long = System.currentTimeMillis()
Example 85
Source File: PartitionwiseSampledRDD.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.random.RandomSampler
import org.apache.spark.util.Utils

/**
 * A partition that wraps a parent partition together with a seed.
 *
 * @param prev the wrapped parent partition; its index is reused as this partition's index
 * @param seed seed value carried with this partition
 */
class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index

/**
 * An RDD that passes each partition of its parent through a RandomSampler.
 *
 * @param prev RDD to be sampled
 * @param sampler sampler applied to each partition; it is cloned per call to `compute`
 * @param preservesPartitioning when true, the parent's partitioner is kept; otherwise None
 * @param seed seed for the Random that generates the per-partition seeds; defaults to
 *             `Utils.random.nextLong`
 * @tparam T element type of the parent RDD
 * @tparam U element type of this RDD
 */
private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[T],
    sampler: RandomSampler[T, U],
    preservesPartitioning: Boolean,
    @transient private val seed: Long = Utils.random.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  // Each new partition is given its own seed, drawn from one Random seeded with `seed`.
  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T] => new PartitionwiseSampledRDDPartition(x, random.nextLong()))

  override def getPreferredLocations(split: Partition): Seq[String] =

  // Samples the parent partition wrapped by `splitIn` using a clone of the sampler.
  // NOTE(review): no use of split.seed is visible in this excerpt — confirm the clone
  // is seeded per partition.
  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
Source File: SimpleSkewedGroupByTest.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._

/**
 * Example job: generates key/value pairs whose keys are skewed towards reducer 0 and
 * prints the number of groups produced by groupByKey.
 */
object SimpleSkewedGroupByTest {
  def main(args: Array[String]) {

    val sparkConf = new SparkConf().setAppName("SimpleSkewedGroupByTest")
    // Optional positional arguments; each falls back to the default shown when absent.
    var numMappers = if (args.length > 0) args(0).toInt else 2
    var numKVPairs = if (args.length > 1) args(1).toInt else 1000
    var valSize = if (args.length > 2) args(2).toInt else 1000
    var numReducers = if (args.length > 3) args(3).toInt else numMappers
    // NOTE(review): the override is parsed with toInt while the default is 5.0, so a
    // fractional ratio on the command line throws NumberFormatException.
    var ratio = if (args.length > 4) args(4).toInt else 5.0

    val sc = new SparkContext(sparkConf)

    // One partition per mapper; each builds numKVPairs (key, zero-filled byte array) pairs.
    val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random
      var result = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        // offset is always a multiple of numReducers.
        val offset = ranGen.nextInt(1000) * numReducers
        if (ranGen.nextDouble < ratio / (numReducers + ratio - 1)) {
          // give ratio times higher chance of generating key 0 (for reducer 0)
          result(i) = (offset, byteArr)
        } else {
          // generate a key for one of the other reducers
          val key = 1 + ranGen.nextInt(numReducers-1) + offset
          result(i) = (key, byteArr)
    // Enforce that everything has been calculated and in cache

    println("RESULT: " + pairs1.groupByKey(numReducers).count)
    // Print how many keys each reducer got (for debugging)
    // println("RESULT: " + pairs1.groupByKey(numReducers)
    //                           .map{case (k,v) => (k, v.size)}
    //                           .collectAsMap)

Source File: SparkTachyonHdfsLR.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{Vector, DenseVector}
import org.apache.hadoop.conf.Configuration

import org.apache.spark._
import org.apache.spark.scheduler.InputFormatInfo

object SparkTachyonHdfsLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)  // Fixed seed, so the generated sequence is repeatable

  def showWarning() {
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |for more conventional use.

  case class DataPoint(x: Vector[Double], y: Double)

  /**
   * Parses one space-separated line into a DataPoint.
   *
   * The first token is the label `y`; the next `D` tokens are the feature values.
   * A line with fewer than `D + 1` tokens fails with NoSuchElementException and a
   * non-numeric token with NumberFormatException.
   */
  def parsePoint(line: String): DataPoint = {
    val tok = new java.util.StringTokenizer(line, " ")
    val y = tok.nextToken.toDouble
    // Array.fill evaluates its element expression once per slot, in order,
    // so the tokens are consumed left to right.
    val x = Array.fill(D)(tok.nextToken.toDouble)
    DataPoint(new DenseVector(x), y)
  }

  def main(args: Array[String]) {


    val inputPath = args(0)
    val sparkConf = new SparkConf().setAppName("SparkTachyonHdfsLR")
    val conf = new Configuration()
    val sc = new SparkContext(sparkConf,
        Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath))
    val lines = sc.textFile(inputPath)
    val points = _).persist(StorageLevel.OFF_HEAP)
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = { p =>
        p.x * (1 / (1 + exp(-p.y * ( - 1) * p.y
      }.reduce(_ + _)
      w -= gradient

    println("Final w: " + w)
Source File: SkewedGroupByTest.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._

/**
 * Example job: builds random-keyed pairs where later mappers produce more pairs than
 * earlier ones.
 */
object SkewedGroupByTest {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("GroupBy Test")
    // Optional positional arguments; each falls back to the default shown when absent.
    var numMappers = if (args.length > 0) args(0).toInt else 2
    var numKVPairs = if (args.length > 1) args(1).toInt else 1000
    var valSize = if (args.length > 2) args(2).toInt else 1000
    var numReducers = if (args.length > 3) args(3).toInt else numMappers

    val sc = new SparkContext(sparkConf)

    val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random

      // map output sizes linearly increase from the 1st to the last
      // NOTE(review): this reassigns a var captured by the closure; presumably each task
      // works on its own deserialized copy — confirm before relying on the sizes.
      numKVPairs = (1.0 * (p + 1) / numMappers * numKVPairs).toInt

      // Keys are uniform random non-negative ints; values are zero-filled byte arrays.
      var arr1 = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
    // Enforce that everything has been calculated and in cache


Source File: SparkHdfsLR.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{Vector, DenseVector}
import org.apache.hadoop.conf.Configuration

import org.apache.spark._
import org.apache.spark.scheduler.InputFormatInfo

object SparkHdfsLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)  // Fixed seed, so the generated sequence is repeatable

  case class DataPoint(x: Vector[Double], y: Double)

  /**
   * Parses one space-separated line into a DataPoint.
   *
   * The first token is the label `y`; the next `D` tokens are the feature values.
   * A line with fewer than `D + 1` tokens fails with NoSuchElementException and a
   * non-numeric token with NumberFormatException.
   */
  def parsePoint(line: String): DataPoint = {
    val tok = new java.util.StringTokenizer(line, " ")
    val y = tok.nextToken.toDouble
    // Array.fill evaluates its element expression once per slot, in order,
    // so the tokens are consumed left to right.
    val x = Array.fill(D)(tok.nextToken.toDouble)
    DataPoint(new DenseVector(x), y)
  }

  def showWarning() {
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |for more conventional use.

  def main(args: Array[String]) {

    if (args.length < 2) {
      System.err.println("Usage: SparkHdfsLR <file> <iters>")


    val sparkConf = new SparkConf().setAppName("SparkHdfsLR")
    val inputPath = args(0)
    val conf = new Configuration()
    val sc = new SparkContext(sparkConf,
        Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath))
    val lines = sc.textFile(inputPath)
    val points = _).cache()
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = { p =>
        p.x * (1 / (1 + exp(-p.y * ( - 1) * p.y
      }.reduce(_ + _)
      w -= gradient

    println("Final w: " + w)
Source File: LocalLR.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import breeze.linalg.{Vector, DenseVector}

object LocalLR {
  val N = 10000  // Number of data points
  val D = 10   // Number of dimensions
  val R = 0.7  // Scaling factor
  val ITERATIONS = 5
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  // Builds the synthetic data set. Only the per-point generator is visible here;
  // the expression that produces the array is not shown in this excerpt.
  def generateData: Array[DataPoint] = {
    // Label alternates with the parity of i: -1 for even i, +1 for odd i. Each of the
    // D features is a standard Gaussian draw shifted by y * R.
    def generatePoint(i: Int): DataPoint = {
      val y = if (i % 2 == 0) -1 else 1
      val x = DenseVector.fill(D){rand.nextGaussian + y * R}
      DataPoint(x, y)

  def showWarning() {
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |for more conventional use.

  def main(args: Array[String]) {


    val data = generateData
    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      var gradient = DenseVector.zeros[Double](D)
      for (p <- data) {
        val scale = (1 / (1 + math.exp(-p.y * ( - 1) * p.y
        gradient +=  p.x * scale
      w -= gradient

Example 91
Source File: GroupByTest.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._

/** Example job: every mapper builds numKVPairs random-keyed pairs with zero-filled values. */
object GroupByTest {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("GroupBy Test")
    // Optional positional arguments; each falls back to the default shown when absent.
    var numMappers = if (args.length > 0) args(0).toInt else 2
    var numKVPairs = if (args.length > 1) args(1).toInt else 1000
    var valSize = if (args.length > 2) args(2).toInt else 1000
    var numReducers = if (args.length > 3) args(3).toInt else numMappers

    val sc = new SparkContext(sparkConf)

    // One partition per mapper; keys are uniform random non-negative ints.
    val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random
      var arr1 = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
    // Enforce that everything has been calculated and in cache


Source File: LocalFileLR.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import breeze.linalg.{Vector, DenseVector}

object LocalFileLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)  // Fixed seed, so the generated sequence is repeatable

  case class DataPoint(x: Vector[Double], y: Double)

  /**
   * Parses one space-separated line of numbers into a DataPoint.
   *
   * Field 0 is the label; fields 1 to D are the features. `slice` does not fail on short
   * input, so a line with fewer than `D + 1` fields yields a shorter feature vector.
   * An empty or non-numeric field throws NumberFormatException.
   */
  def parsePoint(line: String): DataPoint = {
    val nums = line.split(' ').map(_.toDouble)
    DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0))
  }

  def showWarning() {
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |for more conventional use.

  def main(args: Array[String]) {


    val lines =
    val points = _)
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      var gradient = DenseVector.zeros[Double](D)
      for (p <- points) {
        val scale = (1 / (1 + math.exp(-p.y * ( - 1) * p.y
        gradient += p.x * scale
      w -= gradient

Example 93
Source File: SparkLR.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{Vector, DenseVector}

import org.apache.spark._

object SparkLR {
  val N = 10000  // Number of data points
  val D = 10   // Number of dimensions
  val R = 0.7  // Scaling factor
  val ITERATIONS = 5
  val rand = new Random(42)  // Fixed seed, so the generated sequence is repeatable

  case class DataPoint(x: Vector[Double], y: Double)

  // Builds the synthetic data set. Only the per-point generator is visible here;
  // the expression that produces the array is not shown in this excerpt.
  def generateData: Array[DataPoint] = {
    // Label alternates with the parity of i: -1 for even i, +1 for odd i. Each of the
    // D features is a standard Gaussian draw shifted by y * R.
    def generatePoint(i: Int): DataPoint = {
      val y = if (i % 2 == 0) -1 else 1
      val x = DenseVector.fill(D){rand.nextGaussian + y * R}
      DataPoint(x, y)

  def showWarning() {
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |for more conventional use.

  def main(args: Array[String]) {


    val sparkConf = new SparkConf().setAppName("SparkLR")
    val sc = new SparkContext(sparkConf)
    val numSlices = if (args.length > 0) args(0).toInt else 2
    val points = sc.parallelize(generateData, numSlices).cache()

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = { p =>
        p.x * (1 / (1 + exp(-p.y * ( - 1) * p.y
      }.reduce(_ + _)
      w -= gradient

Example 94
Source File: LocalKMeans.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import scala.collection.mutable.HashMap
import scala.collection.mutable.HashSet

import breeze.linalg.{Vector, DenseVector, squaredDistance}

import org.apache.spark.SparkContext._

object LocalKMeans {
  val N = 1000
  val R = 1000    // Scaling factor
  val D = 10
  val K = 10
  val convergeDist = 0.001
  val rand = new Random(42)

  def generateData: Array[DenseVector[Double]] = {
    def generatePoint(i: Int): DenseVector[Double] = {
      DenseVector.fill(D){rand.nextDouble * R}

  /**
   * Finds the key of the center nearest to `p`.
   *
   * Centers are looked up under the keys `1` to `centers.size`; a missing key in that
   * range raises `NoSuchElementException`.
   *
   * @param p       the point to classify
   * @param centers current cluster centers, keyed from 1
   * @return the key of the center with the smallest squared distance to `p`; the first
   *         such key on ties, or 0 when `centers` is empty
   */
  def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = {
    var bestIndex = 0
    var closest = Double.PositiveInfinity

    for (i <- 1 to centers.size) {
      val tempDist = squaredDistance(p, centers(i))
      // Strict comparison keeps the earliest center when distances tie.
      if (tempDist < closest) {
        closest = tempDist
        bestIndex = i
      }
    }

    bestIndex
  }


  def showWarning() {
      """WARN: This is a naive implementation of KMeans Clustering and is given as an example!
        |Please use the KMeans method found in org.apache.spark.mllib.clustering
        |for more conventional use.

  def main(args: Array[String]) {


    val data = generateData
    var points = new HashSet[Vector[Double]]
    var kPoints = new HashMap[Int, Vector[Double]]
    var tempDist = 1.0

    while (points.size < K) {

    val iter = points.iterator
    for (i <- 1 to points.size) {

    println("Initial centers: " + kPoints)

    while(tempDist > convergeDist) {
      var closest = (p => (closestPoint(p, kPoints), (p, 1)))

      var mappings = closest.groupBy[Int] (x => x._1)

      var pointStats = { pair =>
        pair._2.reduceLeft [(Int, (Vector[Double], Int))] {
          case ((id1, (x1, y1)), (id2, (x2, y2))) => (id1, (x1 + x2, y1 + y2))

      var newPoints = {mapping =>
        (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))}

      tempDist = 0.0
      for (mapping <- newPoints) {
        tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2)

      for (newP <- newPoints) {
        kPoints.put(newP._1, newP._2)

Example 95
Source File: SampledRDD.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.PoissonDistribution

import org.apache.spark.{Partition, TaskContext}

@deprecated("Replaced by PartitionwiseSampledRDDPartition", "1.0.0")
class SampledRDDPartition(val prev: Partition, val seed: Int) extends Partition with Serializable {
  override val index: Int = prev.index

@deprecated("Replaced by PartitionwiseSampledRDD", "1.0.0")
private[spark] class SampledRDD[T: ClassTag](
    prev: RDD[T],
    withReplacement: Boolean,
    frac: Double,
    seed: Int)
  extends RDD[T](prev) {

  override def getPartitions: Array[Partition] = {
    val rg = new Random(seed)
    firstParent[T] => new SampledRDDPartition(x, rg.nextInt))

  override def getPreferredLocations(split: Partition): Seq[String] =

  override def compute(splitIn: Partition, context: TaskContext): Iterator[T] = {
    val split = splitIn.asInstanceOf[SampledRDDPartition]
    if (withReplacement) {
      // For large datasets, the expected number of occurrences of each element in a sample with
      // replacement is Poisson(frac). We use that to get a count for each element.
      val poisson = new PoissonDistribution(frac)

      firstParent[T].iterator(split.prev, context).flatMap { element =>
        val count = poisson.sample()
        if (count == 0) {
          Iterator.empty  // Avoid object allocation when we return 0 items, which is quite often
        } else {
    } else { // Sampling without replacement
      val rand = new Random(split.seed)
      firstParent[T].iterator(split.prev, context).filter(x => (rand.nextDouble <= frac))
Source File: PartitionwiseSampledRDD.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.random.RandomSampler
import org.apache.spark.util.Utils

class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index

private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[T],
    sampler: RandomSampler[T, U],
    @transient preservesPartitioning: Boolean,
    @transient seed: Long = Utils.random.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T] => new PartitionwiseSampledRDDPartition(x, random.nextLong()))

  override def getPreferredLocations(split: Partition): Seq[String] =

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
Source File: SparkTachyonHdfsLR.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{Vector, DenseVector}
import org.apache.hadoop.conf.Configuration

import org.apache.spark._
import org.apache.spark.scheduler.InputFormatInfo

object SparkTachyonHdfsLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  def showWarning() {
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD(SGD随机梯度下降) or
        |for more conventional use.

  case class DataPoint(x: Vector[Double], y: Double)

  /**
   * Parses one space-separated input line into a [[DataPoint]].
   *
   * The first token is the label `y`; the next `D` tokens are the feature values.
   * Tokens beyond the first `D + 1` are ignored.
   *
   * @param line a line holding at least `D + 1` numeric tokens
   * @return the parsed data point
   * @throws java.util.NoSuchElementException if the line has fewer than `D + 1` tokens
   * @throws NumberFormatException            if a token is not a valid double
   */
  def parsePoint(line: String): DataPoint = {
    val tok = new java.util.StringTokenizer(line, " ")
    val y = tok.nextToken.toDouble
    // Array.fill evaluates its element expression once per slot, in order,
    // so the tokens are consumed left to right.
    val x = Array.fill(D)(tok.nextToken.toDouble)
    DataPoint(new DenseVector(x), y)
  }

  def main(args: Array[String]) {


    val inputPath = args(0)
    val sparkConf = new SparkConf().setAppName("SparkTachyonHdfsLR")
    val conf = new Configuration()
    val sc = new SparkContext(sparkConf,
        Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath))
    val lines = sc.textFile(inputPath)
    val points = _).persist(StorageLevel.OFF_HEAP)
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = { p =>
        p.x * (1 / (1 + exp(-p.y * ( - 1) * p.y
      }.reduce(_ + _)
      w -= gradient

    println("Final w: " + w)
Source File: SkewedGroupByTest.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._

object SkewedGroupByTest {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("GroupBy Test")
    var numMappers = if (args.length > 0) args(0).toInt else 2
    var numKVPairs = if (args.length > 1) args(1).toInt else 1000
    var valSize = if (args.length > 2) args(2).toInt else 1000
    var numReducers = if (args.length > 3) args(3).toInt else numMappers

    val sc = new SparkContext(sparkConf)

    val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random

      // map output sizes linearly increase from the 1st to the last
      numKVPairs = (1.0 * (p + 1) / numMappers * numKVPairs).toInt

      var arr1 = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
    // Enforce that everything has been calculated and in cache


Source File: SparkHdfsLR.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{Vector, DenseVector}
import org.apache.hadoop.conf.Configuration

import org.apache.spark._
import org.apache.spark.scheduler.InputFormatInfo


    val sparkConf = new SparkConf().setAppName("SparkHdfsLR").setMaster("local[2]")
    val inputPath = "D:\\spark\\spark-1.5.0-hadoop2.6\\data\\mllib\\lr_data.txt"//args(0)
    val conf = new Configuration()
    val sc = new SparkContext(sparkConf,
        Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath))
    val lines = sc.textFile(inputPath)
    val points = _).cache() // cache
    val ITERATIONS = 6 // args(1).toInt; number of iterations

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = { p =>
        // p represents a DataPoint vector
        p.x * (1 / (1 + exp(-p.y * ( - 1) * p.y
      }.reduce(_ + _)
      w -= gradient

    println("Final w: " + w)
Source File: LocalFileLR.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import breeze.linalg.{Vector, DenseVector}

object LocalFileLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)
  def parsePoint(line: String): DataPoint = {
    val nums = line.split(' ').map(_.toDouble)
    DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0))

  def showWarning() {
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |for more conventional use.

  def main(args: Array[String]) {

    val lines =
    val points = _)
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      var gradient = DenseVector.zeros[Double](D)
      for (p <- points) {
        val scale = (1 / (1 + math.exp(-p.y * ( - 1) * p.y
        gradient += p.x * scale
      w -= gradient

    println("Final w: " + w)
Source File: PeopleInfoFileGenerator.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
import java.util.Random

/**
 * Generates a sample text file of 10000 records of the form `<id> <gender> <height>`,
 * one record per line.
 */
object PeopleInfoFileGenerator {
  /**
   * Writes the sample file to a fixed, hard-coded path, overwriting any existing file.
   *
   * @param args unused
   */
  def main(args: Array[String]) {
    val writer = new FileWriter(new File("D:\\eclipse44_64\\workspace\\spark1.5\\examples\\sample_people_info.txt"), false)
    try {
      val rand = new Random()
      for (i <- 1 to 10000) {
        // Raw draw is 0..219; values below 50 are lifted into 50..99.
        var height = rand.nextInt(220)
        if (height < 50) {
          height = height + 50
        }
        // Draw the gender once so the height adjustment and the written record agree.
        val gender = getRandomGender()
        if (height < 100 && gender == "M")
          height = height + 100
        if (height < 100 && gender == "F")
          height = height + 50
        writer.write(i + " " + gender + " " + height)
        // Terminate each record so the file holds one person per line.
        writer.write(System.getProperty("line.separator"))
      }
      writer.flush()
    } finally {
      // Release the file handle even if a write fails.
      writer.close()
    }
    println("People Information File generated successfully.")
  }

  /**
   * Returns `"M"` or `"F"` with equal probability.
   *
   * @return a randomly chosen gender code
   */
  def getRandomGender(): String = {
    val rand = new Random()
    val randNum = rand.nextInt(2) + 1
    if (randNum % 2 == 0) {
      "M"
    } else {
      "F"
    }
  }
}
// scalastyle:off println
import java.util.Random

import scala.math.exp

import breeze.linalg.{Vector, DenseVector}

import org.apache.spark._

object SparkLR {
  val N = 10000  // Number of data points
  val D = 10   // Number of dimensions
  val R = 0.7  // Scaling factor
  val ITERATIONS = 5
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def generateData: Array[DataPoint] = {
    def generatePoint(i: Int): DataPoint = {
      val y = if (i % 2 == 0) -1 else 1
      val x = DenseVector.fill(D){rand.nextGaussian + y * R}
      DataPoint(x, y)

  def showWarning() {
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD(SGD随机梯度下降) or
        |for more conventional use.
    //String.stripMargin 移除每行字符串开头的空格和第一个遇到的垂直分割符|

  def main(args: Array[String]) {


    val sparkConf = new SparkConf().setAppName("SparkLR").setMaster("local")
    val sc = new SparkContext(sparkConf)
    val numSlices = if (args.length > 0) args(0).toInt else 2
    val points = sc.parallelize(generateData, numSlices).cache()

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = { p =>
        p.x * (1 / (1 + exp(-p.y * ( - 1) * p.y
      }.reduce(_ + _)
      w -= gradient

    println("Final w: " + w)

Source File: FileWrite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
import{File, FileWriter}
import java.util.Random

import org.apache.spark.util.Utils

object FileWrite {
  def main(args: Array[String]) {

    val outFile = File.createTempFile("test-load-spark-properties", "test")
    Files.write("spark.test.fileNameLoadA true\n" +
      "spark.test.fileNameLoadB 1\n", outFile, UTF_8)

    val writer = new FileWriter(new File("D:\\eclipse44_64\\workspace\\spark1.5\\examples\\sample_age_data.txt"), false)
    val rand = new Random()
    for (i <- 1 to 10000) {
      writer.write(i + " " + rand.nextInt(100))
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.PoissonDistribution

import org.apache.spark.{Partition, TaskContext}

@deprecated("Replaced by PartitionwiseSampledRDDPartition", "1.0.0")
class SampledRDDPartition(val prev: Partition, val seed: Int) extends Partition with Serializable {
  override val index: Int = prev.index

@deprecated("Replaced by PartitionwiseSampledRDD", "1.0.0")
private[spark] class SampledRDD[T: ClassTag](
    prev: RDD[T],
    withReplacement: Boolean,
    frac: Double,
    seed: Int)
  extends RDD[T](prev) {

  override def getPartitions: Array[Partition] = {
    val rg = new Random(seed)
    firstParent[T] => new SampledRDDPartition(x, rg.nextInt))

  override def getPreferredLocations(split: Partition): Seq[String] =

  override def compute(splitIn: Partition, context: TaskContext): Iterator[T] = {
    val split = splitIn.asInstanceOf[SampledRDDPartition]
    if (withReplacement) {
      // For large datasets, the expected number of occurrences of each element in a sample with
      // replacement is Poisson(frac). We use that to get a count for each element.
      val poisson = new PoissonDistribution(frac)

      firstParent[T].iterator(split.prev, context).flatMap { element =>
        val count = poisson.sample()
        if (count == 0) {
          Iterator.empty  // Avoid object allocation when we return 0 items, which is quite often
        } else {
    } else { // Sampling without replacement
      val rand = new Random(split.seed)
      firstParent[T].iterator(split.prev, context).filter(x => (rand.nextDouble <= frac))
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.random.RandomSampler
import org.apache.spark.util.Utils

class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index

private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[T],
    sampler: RandomSampler[T, U],
    @transient preservesPartitioning: Boolean,
    @transient seed: Long = Utils.random.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T] => new PartitionwiseSampledRDDPartition(x, random.nextLong()))

  override def getPreferredLocations(split: Partition): Seq[String] =

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
package com.tmalaska.flinktraining.example.session

import java.util.{Properties, Random}

import net.liftweb.json.DefaultFormats
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import net.liftweb.json.Serialization.write

/**
 * Command-line driver that builds heartbeat messages for a Kafka topic.
 *
 * Positional arguments: 0 = Kafka host, 1 = Kafka port, 2 = topic, 3 = number of entities,
 * 4 = messages per entity, 5 = wait time between batches, 6 = chances of missing.
 *
 * NOTE(review): this excerpt appears truncated. No closing braces, no call that sends
 * `producerRecord`, no use of `waitTimeBetweenMessageBatch` and no producer shutdown are
 * visible here. Confirm against the full file.
 */
object SessionKafkaProducer {
  def main(args:Array[String]): Unit = {

    // Implicit JSON formats picked up by `write` below.
    implicit val formats = DefaultFormats

    val kafkaServerURL = args(0)
    val kafkaServerPort = args(1)
    val topic = args(2)
    val numberOfEntities = args(3).toInt
    val numberOfMessagesPerEntity = args(4).toInt
    val waitTimeBetweenMessageBatch = args(5).toInt
    val chancesOfMissing = args(6).toInt

    val props = new Properties()
    props.put("bootstrap.servers", kafkaServerURL + ":" + kafkaServerPort)
    props.put("acks", "all")
    props.put("retries", "0")
    props.put("batch.size", "16384")
    // NOTE(review): the property key is an empty string, so this value cannot configure
    // any named producer setting; presumably the key was lost. Confirm the intended key.
    props.put("", "1")
    props.put("buffer.memory", "33554432")
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")

    val producer = new KafkaProducer[String, String](props)

    val r = new Random()
    var sentCount = 0

    println("About to send to " + topic)
    // `0 to n` is inclusive: the outer loop runs numberOfMessagesPerEntity + 1 times and the
    // inner loop numberOfEntities + 1 times. NOTE(review): confirm the extra pass is intended.
    for (j <- 0 to numberOfMessagesPerEntity) {
      for (i <- 0 to numberOfEntities) {
        // A draw of 0 skips this entity for the batch (about 1 in chancesOfMissing).
        // Random.nextInt throws IllegalArgumentException when chancesOfMissing <= 0.
        if (r.nextInt(chancesOfMissing) != 0) {
          val message = write(HeartBeat(i.toString, System.currentTimeMillis()))
          val producerRecord = new ProducerRecord[String,String](topic, message)
          sentCount += 1
      println("Sent Count:" + sentCount)

Example 107
Source File: RandomProjectionsHasher.scala    From pravda-ml   with Apache License 2.0 5 votes vote down vote up

import java.util.Random

import org.apache.spark.annotation.DeveloperApi
import{HasInputCol, HasOutputCol, HasSeed}
import{Identifiable, SchemaUtils}
import{Matrices, SparseMatrix, Vector}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{LongType, StructType}

  def setDim(value: Long): this.type = set(dim, value)

  def this() = this(Identifiable.randomUID("randomProjectionsHasher"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val dimensity = {
      if (!isSet(dim)) {//If dimensions is not set - will search  AttributeGroup in metadata as it comes from OdklCountVectorizer
        val vectorsIndex = dataset.schema.fieldIndex($(inputCol))
      } else {
    val projectionMatrix = dataset.sqlContext.sparkContext.broadcast(
      Matrices.sprandn($(basisSize).toInt, dimensity, $(sparsity), new Random($(seed))).asInstanceOf[SparseMatrix])
  // the matrix of random vectors used to construct the hash

    val binHashSparseVectorColumn = udf((vector: Vector) => {
        .map(f =>  if (f>0) 1L else 0L)
        .foldLeft(0L) {case  (acc,(v, i)) => acc | (v << i) }

    dataset.withColumn($(outputCol), binHashSparseVectorColumn(dataset.col($(inputCol))))

  override def copy(extra: ParamMap): Transformer = {

  override def transformSchema(schema: StructType): StructType = {
    SchemaUtils.appendColumn(schema, $(outputCol), LongType)

Example 108
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.sql.SparkSession

object SimpleSkewedGroupByTest {
  def main(args: Array[String]) {
    val spark = SparkSession

    val numMappers = if (args.length > 0) args(0).toInt else 2
    val numKVPairs = if (args.length > 1) args(1).toInt else 1000
    val valSize = if (args.length > 2) args(2).toInt else 1000
    val numReducers = if (args.length > 3) args(3).toInt else numMappers
    val ratio = if (args.length > 4) args(4).toInt else 5.0

    val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random
      val result = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        val offset = ranGen.nextInt(1000) * numReducers
        if (ranGen.nextDouble < ratio / (numReducers + ratio - 1)) {
          // give ratio times higher chance of generating key 0 (for reducer 0)
          result(i) = (offset, byteArr)
        } else {
          // generate a key for one of the other reducers
          val key = 1 + ranGen.nextInt(numReducers-1) + offset
          result(i) = (key, byteArr)
    // Enforce that everything has been calculated and in cache

    println(s"RESULT: ${pairs1.groupByKey(numReducers).count}")

Source File: SkewedGroupByTest.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.sql.SparkSession

object SkewedGroupByTest {
  def main(args: Array[String]) {
    val spark = SparkSession
      .appName("GroupBy Test")

    val numMappers = if (args.length > 0) args(0).toInt else 2
    var numKVPairs = if (args.length > 1) args(1).toInt else 1000
    val valSize = if (args.length > 2) args(2).toInt else 1000
    val numReducers = if (args.length > 3) args(3).toInt else numMappers

    val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random

      // map output sizes linearly increase from the 1st to the last
      numKVPairs = (1.0 * (p + 1) / numMappers * numKVPairs).toInt

      val arr1 = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
    // Enforce that everything has been calculated and in cache


Source File: SparkHdfsLR.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{DenseVector, Vector}

import org.apache.spark.sql.SparkSession

object SparkHdfsLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def parsePoint(line: String): DataPoint = {
    val tok = new java.util.StringTokenizer(line, " ")
    val y = tok.nextToken.toDouble
    val x = new Array[Double](D)
    var i = 0
    while (i < D) {
      x(i) = tok.nextToken.toDouble; i += 1
    DataPoint(new DenseVector(x), y)

  def showWarning() {
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use
        |for more conventional use.

  def main(args: Array[String]) {

    if (args.length < 2) {
      System.err.println("Usage: SparkHdfsLR <file> <iters>")


    val spark = SparkSession

    val inputPath = args(0)
    val lines =

    val points =
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    val w = DenseVector.fill(D) {2 * rand.nextDouble - 1}
    println(s"Initial w: $w")

    for (i <- 1 to ITERATIONS) {
      println(s"On iteration $i")
      val gradient = { p =>
        p.x * (1 / (1 + exp(-p.y * ( - 1) * p.y
      }.reduce(_ + _)
      w -= gradient

    println(s"Final w: $w")
Source File: LocalLR.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import breeze.linalg.{DenseVector, Vector}

/**
 * Logistic-regression classification on locally generated data, trained by a fixed
 * number of full-batch gradient steps.
 */
object LocalLR {
  val N = 10000  // Number of data points
  val D = 10   // Number of dimensions
  val R = 0.7  // Scaling factor
  val ITERATIONS = 5
  val rand = new Random(42)

  /** A labelled sample: feature vector `x` and label `y`. */
  case class DataPoint(x: Vector[Double], y: Double)

  /**
   * Generates `N` synthetic points. Even indices get label -1 and odd indices label 1;
   * each of the `D` features is a Gaussian draw shifted by `y * R`.
   *
   * @return the generated points, in index order
   */
  def generateData: Array[DataPoint] = {
    def generatePoint(i: Int): DataPoint = {
      val y = if (i % 2 == 0) -1 else 1
      val x = DenseVector.fill(D) {rand.nextGaussian + y * R}
      DataPoint(x, y)
    }
    Array.tabulate(N)(generatePoint)
  }

  /** Prints a notice to stderr that this implementation is for illustration only. */
  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use org.apache.spark.ml.classification.LogisticRegression
        |for more conventional use.
      """.stripMargin)
  }

  /**
   * Generates the data, runs `ITERATIONS` gradient steps from a random starting weight
   * vector, and prints the weights before and after training.
   *
   * @param args unused
   */
  def main(args: Array[String]) {

    showWarning()

    val data = generateData
    // Initialize w to a random value
    val w = DenseVector.fill(D) {2 * rand.nextDouble - 1}
    println(s"Initial w: $w")

    for (i <- 1 to ITERATIONS) {
      println(s"On iteration $i")
      val gradient = DenseVector.zeros[Double](D)
      for (p <- data) {
        // Logistic-loss gradient factor for one sample: (sigmoid(y * w.x) - 1) * y
        val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y
        gradient +=  p.x * scale
      }
      // w is updated in place; the DenseVector itself is mutable.
      w -= gradient
    }

    println(s"Final w: $w")
  }
}
Source File: GroupByTest.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.sql.SparkSession

object GroupByTest {
  def main(args: Array[String]) {
    val spark = SparkSession
      .appName("GroupBy Test")

    val numMappers = if (args.length > 0) args(0).toInt else 2
    val numKVPairs = if (args.length > 1) args(1).toInt else 1000
    val valSize = if (args.length > 2) args(2).toInt else 1000
    val numReducers = if (args.length > 3) args(3).toInt else numMappers

    val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random
      val arr1 = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)

    // Enforce that everything has been calculated and in cache

    implicit val caseInsensitiveOrdering = new Ordering[(Int, String)] {

      override def compare(a: (Int, String), b: (Int, String)): Int = a._1.compareTo( b._1)



Source File: LocalFileLR.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import breeze.linalg.{DenseVector, Vector}

object LocalFileLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def parsePoint(line: String): DataPoint = {
    val nums = line.split(' ').map(_.toDouble)
    DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0))

  def showWarning() {
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use
        |for more conventional use.

  def main(args: Array[String]) {


    val fileSrc =
    val lines = fileSrc.getLines().toArray
    val points =
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    val w = DenseVector.fill(D) {2 * rand.nextDouble - 1}
    println(s"Initial w: $w")

    for (i <- 1 to ITERATIONS) {
      println(s"On iteration $i")
      val gradient = DenseVector.zeros[Double](D)
      for (p <- points) {
        val scale = (1 / (1 + math.exp(-p.y * ( - 1) * p.y
        gradient += p.x * scale
      w -= gradient

    println(s"Final w: $w")
Source File: PageViewGenerator.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples.streaming.clickstream

import java.util.Random

// scalastyle:on
object PageViewGenerator {
  val pages = Map("" -> .7,
                  "" -> 0.2,
                  "" -> .1)
  val httpStatus = Map(200 -> .95,
                       404 -> .05)
  val userZipCode = Map(94709 -> .5,
                        94117 -> .5)
  val userID = Map((1 to 100).map(_ -> .01): _*)

  /**
   * Picks one key of `inputMap` at random, weighted by the probability stored as its value.
   *
   * A single uniform draw in [0, 1) is compared against the running total of the
   * probabilities, taken in the map's iteration order; the first key whose running total
   * exceeds the draw is returned.
   *
   * @param inputMap items mapped to their probabilities (expected to sum to 1.0)
   * @tparam T the item type
   * @return the selected item, or the map's first key if no running total exceeds the draw
   * @throws NoSuchElementException if `inputMap` is empty
   */
  def pickFromDistribution[T](inputMap: Map[T, Double]): T = {
    val rand = new Random().nextDouble()
    // Lazily computed running totals, aligned with the map's iteration order.
    val totals = inputMap.iterator.scanLeft(0.0)(_ + _._2).drop(1)
    val picked = inputMap.iterator.zip(totals).collectFirst {
      case ((item, _), total) if total > rand => item
    }
    // Shouldn't need the fallback if probabilities add up to 1.0
    picked.getOrElse(inputMap.head._1)
  }

  /**
   * Builds one click event by drawing a user id, page, HTTP status and zip code
   * independently from their distributions, and returns the `toString()` of the resulting
   * `PageView`.
   *
   * NOTE(review): the closing brace of this method is not visible in this excerpt.
   *
   * @return the string form of the generated page view
   */
  def getNextClickEvent(): String = {
    val id = pickFromDistribution(userID)
    val page = pickFromDistribution(pages)
    val status = pickFromDistribution(httpStatus)
    val zipCode = pickFromDistribution(userZipCode)
    // Constructor argument order differs from the draw order above.
    new PageView(page, status, zipCode, id).toString()

  def main(args: Array[String]) {
    if (args.length != 2) {
      System.err.println("Usage: PageViewGenerator <port> <viewsPerSecond>")
    val port = args(0).toInt
    val viewsPerSecond = args(1).toFloat
    val sleepDelayMs = (1000.0 / viewsPerSecond).toInt
    val listener = new ServerSocket(port)
    println(s"Listening on port: $port")

    while (true) {
      val socket = listener.accept()
      new Thread() {
        override def run(): Unit = {
          println(s"Got client connected from: ${socket.getInetAddress}")
          val out = new PrintWriter(socket.getOutputStream(), true)

          while (true) {
Source File: SparkLR.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{DenseVector, Vector}

import org.apache.spark.sql.SparkSession

object SparkLR {
  val N = 10000  // Number of data points
  val D = 10   // Number of dimensions
  val R = 0.7  // Scaling factor
  val ITERATIONS = 5
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def generateData: Array[DataPoint] = {
    def generatePoint(i: Int): DataPoint = {
      val y = if (i % 2 == 0) -1 else 1
      val x = DenseVector.fill(D) {rand.nextGaussian + y * R}
      DataPoint(x, y)

  def showWarning() {
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use
        |for more conventional use.

  def main(args: Array[String]) {


    val spark = SparkSession

    val numSlices = if (args.length > 0) args(0).toInt else 2
    val points = spark.sparkContext.parallelize(generateData, numSlices).cache()

    // Initialize w to a random value
    val w = DenseVector.fill(D) {2 * rand.nextDouble - 1}
    println(s"Initial w: $w")

    for (i <- 1 to ITERATIONS) {
      println(s"On iteration $i")
      val gradient = { p =>
        p.x * (1 / (1 + exp(-p.y * ( - 1) * p.y
      }.reduce(_ + _)
      w -= gradient

    println(s"Final w: $w")

Source File: LocalKMeans.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import scala.collection.mutable.HashMap
import scala.collection.mutable.HashSet

import breeze.linalg.{squaredDistance, DenseVector, Vector}

object LocalKMeans {
  val N = 1000
  val R = 1000    // Scaling factor
  val D = 10
  val K = 10
  val convergeDist = 0.001
  val rand = new Random(42)

  def generateData: Array[DenseVector[Double]] = {
    def generatePoint(i: Int): DenseVector[Double] = {
      DenseVector.fill(D) {rand.nextDouble * R}

  /**
   * Finds the key of the center nearest to `p`.
   *
   * Centers are looked up under the keys `1` to `centers.size`; a missing key in that
   * range raises `NoSuchElementException`.
   *
   * @param p       the point to classify
   * @param centers current cluster centers, keyed from 1
   * @return the key of the center with the smallest squared distance to `p`; the first
   *         such key on ties, or 0 when `centers` is empty
   */
  def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = {
    var bestIndex = 0
    var closest = Double.PositiveInfinity

    for (i <- 1 to centers.size) {
      val vCurr = centers(i)
      val tempDist = squaredDistance(p, vCurr)
      // Strict comparison keeps the earliest center when distances tie.
      if (tempDist < closest) {
        closest = tempDist
        bestIndex = i
      }
    }

    bestIndex
  }


  def showWarning() {
      """WARN: This is a naive implementation of KMeans Clustering and is given as an example!
        |Please use
        |for more conventional use.

  def main(args: Array[String]) {


    val data = generateData
    val points = new HashSet[Vector[Double]]
    val kPoints = new HashMap[Int, Vector[Double]]
    var tempDist = 1.0

    while (points.size < K) {

    val iter = points.iterator
    for (i <- 1 to points.size) {

    println(s"Initial centers: $kPoints")

    while(tempDist > convergeDist) {
      val closest = (p => (closestPoint(p, kPoints), (p, 1)))

      val mappings = closest.groupBy[Int] (x => x._1)

      val pointStats = { pair =>
        pair._2.reduceLeft [(Int, (Vector[Double], Int))] {
          case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2))

      var newPoints = {mapping =>
        (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))}

      tempDist = 0.0
      for (mapping <- newPoints) {
        tempDist += squaredDistance(kPoints(mapping._1), mapping._2)

      for (newP <- newPoints) {
        kPoints.put(newP._1, newP._2)

    println(s"Final centers: $kPoints")
Source File: ChiSquareTestSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up

import java.util.Random

import org.apache.spark.{SparkException, SparkFunSuite}
import{Vector, Vectors}
import org.apache.spark.mllib.stat.test.ChiSqTest
import org.apache.spark.mllib.util.MLlibTestSparkContext

/**
 * Tests `ChiSquareTest.test` on DataFrames built from `LabeledPoint` rows.
 *
 * NOTE(review): this copy of the suite has lost text — the tuple-extraction statements have
 * nothing between `=` and the column names, and the closing braces of the tests and loops
 * are absent. Restore them from the upstream file before compiling.
 */
class ChiSquareTestSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  // Checks p-values, degrees of freedom and statistics for a small fixed data set, repeating
  // the check with the data split into 2, 4, 6 and 8 partitions.
  test("test DataFrame of labeled points") {
    // labels: 1.0 (2 / 6), 0.0 (4 / 6)
    // feature1: 0.5 (1 / 6), 1.5 (2 / 6), 3.5 (3 / 6)
    // feature2: 10.0 (1 / 6), 20.0 (1 / 6), 30.0 (2 / 6), 40.0 (2 / 6)
    val data = Seq(
      LabeledPoint(0.0, Vectors.dense(0.5, 10.0)),
      LabeledPoint(0.0, Vectors.dense(1.5, 20.0)),
      LabeledPoint(1.0, Vectors.dense(1.5, 30.0)),
      LabeledPoint(0.0, Vectors.dense(3.5, 30.0)),
      LabeledPoint(0.0, Vectors.dense(3.5, 40.0)),
      LabeledPoint(1.0, Vectors.dense(3.5, 40.0)))
    for (numParts <- List(2, 4, 6, 8)) {
      val df = spark.createDataFrame(sc.parallelize(data, numParts))
      val chi = ChiSquareTest.test(df, "features", "label")
      // NOTE(review): the expression that reads these three columns from `chi` is missing.
      val (pValues: Vector, degreesOfFreedom: Array[Int], statistics: Vector) ="pValues", "degreesOfFreedom", "statistics")
          .as[(Vector, Array[Int], Vector)].head()
      assert(pValues ~== Vectors.dense(0.6873, 0.6823) relTol 1e-4)
      assert(degreesOfFreedom === Array(2, 3))
      assert(statistics ~== Vectors.dense(0.75, 1.5) relTol 1e-4)

  // Uses sparse vectors with 1001 columns and checks that every result has 1001 entries.
  test("large number of features (SPARK-3087)") {
    // Test that the right number of results is returned
    val numCols = 1001
    val sparseData = Array(
      LabeledPoint(0.0, Vectors.sparse(numCols, Seq((100, 2.0)))),
      LabeledPoint(0.1, Vectors.sparse(numCols, Seq((200, 1.0)))))
    val df = spark.createDataFrame(sparseData)
    val chi = ChiSquareTest.test(df, "features", "label")
    // NOTE(review): the expression that reads these three columns from `chi` is missing.
    val (pValues: Vector, degreesOfFreedom: Array[Int], statistics: Vector) ="pValues", "degreesOfFreedom", "statistics")
        .as[(Vector, Array[Int], Vector)].head()
    assert(pValues.size === numCols)
    assert(degreesOfFreedom.length === numCols)
    assert(statistics.size === numCols)
    assert(pValues(1000) !== null)  // SPARK-3087

  // Feeds the test 100000 rows whose label (first case) or feature (second case) is a random
  // double and expects a SparkException each time.
  test("fail on continuous features or labels") {
    val tooManyCategories: Int = 100000
    assert(tooManyCategories > ChiSqTest.maxCategories, "This unit test requires that " +
      "tooManyCategories be large enough to cause ChiSqTest to throw an exception.")

    // Fixed seed for the generated rows.
    val random = new Random(11L)
    val continuousLabel = Seq.fill(tooManyCategories)(
      LabeledPoint(random.nextDouble(), Vectors.dense(random.nextInt(2))))
    withClue("ChiSquare should throw an exception when given a continuous-valued label") {
      intercept[SparkException] {
        val df = spark.createDataFrame(continuousLabel)
        ChiSquareTest.test(df, "features", "label")
    val continuousFeature = Seq.fill(tooManyCategories)(
      LabeledPoint(random.nextInt(2), Vectors.dense(random.nextDouble())))
    withClue("ChiSquare should throw an exception when given continuous-valued features") {
      intercept[SparkException] {
        val df = spark.createDataFrame(continuousFeature)
        ChiSquareTest.test(df, "features", "label")
Example 118
Source File: StopwatchSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up

import java.util.Random

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

/**
 * Tests for `LocalStopwatch`, `DistributedStopwatch` and `MultiStopwatch`.
 *
 * NOTE(review): this copy has lost lines — the `intercept` blocks and `rdd.foreach` bodies
 * are empty or cut short, one assertion has no left-hand side, and closing braces are
 * absent. `checkStopwatch` is called here but its definition is not visible in this copy.
 */
class StopwatchSuite extends SparkFunSuite with MLlibTestSparkContext {

  import StopwatchSuite._

  // Shared driver-side checks: a fresh stopwatch reports 0 elapsed, and elapsed time
  // accumulates across two measured runs.
  private def testStopwatchOnDriver(sw: Stopwatch): Unit = {
    // NOTE(review): the left-hand side of this comparison is missing in this copy.
    assert( === "sw")
    assert(sw.elapsed() === 0L)
    intercept[AssertionError] {
    val duration = checkStopwatch(sw)
    val elapsed = sw.elapsed()
    assert(elapsed === duration)
    val duration2 = checkStopwatch(sw)
    val elapsed2 = sw.elapsed()
    assert(elapsed2 === duration + duration2)
    assert(sw.toString === s"sw: ${elapsed2}ms")
    intercept[AssertionError] {

  test("LocalStopwatch") {
    val sw = new LocalStopwatch("sw")

  test("DistributedStopwatch on driver") {
    val sw = new DistributedStopwatch(sc, "sw")

  // Compares the stopwatch's elapsed time with a long accumulator after a 4-partition job.
  test("DistributedStopwatch on executors") {
    val sw = new DistributedStopwatch(sc, "sw")
    val rdd = sc.parallelize(0 until 4, 4)
    val acc = sc.longAccumulator
    rdd.foreach { i =>
    val elapsed = sw.elapsed()
    assert(elapsed === acc.value)

  // Checks the "local" and "spark" entries of a MultiStopwatch by name, by string form and
  // by elapsed time, first on the driver and then after a 4-partition job.
  test("MultiStopwatch") {
    val sw = new MultiStopwatch(sc)
    assert(sw("local").name === "local")
    assert(sw("spark").name === "spark")
    intercept[NoSuchElementException] {
    assert(sw.toString === "{\n  local: 0ms,\n  spark: 0ms\n}")
    val localDuration = checkStopwatch(sw("local"))
    val sparkDuration = checkStopwatch(sw("spark"))
    val localElapsed = sw("local").elapsed()
    val sparkElapsed = sw("spark").elapsed()
    assert(localElapsed === localDuration)
    assert(sparkElapsed === sparkDuration)
    assert(sw.toString ===
      s"{\n  local: ${localElapsed}ms,\n  spark: ${sparkElapsed}ms\n}")
    val rdd = sc.parallelize(0 until 4, 4)
    val acc = sc.longAccumulator
    rdd.foreach { i =>
      val duration = checkStopwatch(sw("spark"))
    // The local stopwatch must be unchanged by the job; the spark one grows by acc.value.
    val localElapsed2 = sw("local").elapsed()
    assert(localElapsed2 === localElapsed)
    val sparkElapsed2 = sw("spark").elapsed()
    assert(sparkElapsed2 === sparkElapsed + acc.value)

// Companion helpers for the suite.
// NOTE(review): only `now` is visible in this copy; the class above also calls
// `checkStopwatch`, which is presumably defined here in the full file.
private object StopwatchSuite extends SparkFunSuite {

  // Current wall-clock time in milliseconds.
  private def now: Long = System.currentTimeMillis()
Source File: PartitionwiseSampledRDD.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.Utils
import org.apache.spark.util.random.RandomSampler

/**
 * Partition that pairs a parent partition with a random seed.
 *
 * @param prev the wrapped parent partition
 * @param seed seed value carried along with the partition
 */
class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  // Same index as the wrapped parent partition.
  override val index: Int = prev.index

/**
 * RDD that runs a [[org.apache.spark.util.random.RandomSampler]] over each partition of its
 * parent.
 *
 * NOTE(review): this copy has lost text in `getPartitions`, the whole body of
 * `getPreferredLocations`, and the closing braces.
 *
 * @param prev                  parent RDD
 * @param sampler               sampler that is cloned for every `compute` call
 * @param preservesPartitioning when true the parent's partitioner is reused, otherwise None
 * @param seed                  seed of the generator that produces the per-partition seeds
 * @tparam T element type of the parent RDD
 * @tparam U element type produced by the sampler
 */
private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[T],
    sampler: RandomSampler[T, U],
    preservesPartitioning: Boolean,
    @transient private val seed: Long = Utils.random.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  // Wraps parent partitions, drawing one seed per partition from a generator seeded with `seed`.
  // NOTE(review): the expression between `firstParent[T]` and `=>` is missing in this copy.
  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T] => new PartitionwiseSampledRDDPartition(x, random.nextLong()))

  // NOTE(review): the body is missing in this copy.
  override def getPreferredLocations(split: Partition): Seq[String] =

  // Samples the parent's iterator for the wrapped partition with a clone of the sampler.
  // NOTE(review): presumably the clone is seeded with `split.seed` first; no such line is
  // visible here.
  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
Example 120
Source File: SimpleSkewedGroupByTest.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._

/**
 * Spark job that generates key/value pairs whose keys are skewed towards reducer 0 and
 * prints the number of groups produced by `groupByKey`.
 *
 * Arguments (all optional, positional): numMappers (default 2), numKVPairs (1000),
 * valSize (1000), numReducers (defaults to numMappers), ratio (5.0).
 *
 * NOTE(review): closing braces and the statements that finish the `flatMap` and force the
 * RDD to be computed are missing from this copy.
 */
object SimpleSkewedGroupByTest {
  def main(args: Array[String]) {

    val sparkConf = new SparkConf().setAppName("SimpleSkewedGroupByTest")
    var numMappers = if (args.length > 0) args(0).toInt else 2
    var numKVPairs = if (args.length > 1) args(1).toInt else 1000
    var valSize = if (args.length > 2) args(2).toInt else 1000
    var numReducers = if (args.length > 3) args(3).toInt else numMappers
    // NOTE(review): parsed with toInt although the default is 5.0, so a fractional value on
    // the command line throws NumberFormatException — confirm whether toDouble was intended.
    var ratio = if (args.length > 4) args(4).toInt else 5.0

    val sc = new SparkContext(sparkConf)

    val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random
      var result = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        // A multiple of numReducers, so adding it does not change `key % numReducers`.
        val offset = ranGen.nextInt(1000) * numReducers
        if (ranGen.nextDouble < ratio / (numReducers + ratio - 1)) {
          // give ratio times higher chance of generating key 0 (for reducer 0)
          result(i) = (offset, byteArr)
        } else {
          // generate a key for one of the other reducers
          val key = 1 + ranGen.nextInt(numReducers-1) + offset
          result(i) = (key, byteArr)
    // Enforce that everything has been calculated and in cache

    println("RESULT: " + pairs1.groupByKey(numReducers).count)
    // Print how many keys each reducer got (for debugging)
    // println("RESULT: " + pairs1.groupByKey(numReducers)
    //                           .map{case (k,v) => (k, v.size)}
    //                           .collectAsMap)

// scalastyle:on println 
Example 121
Source File: SparkTachyonHdfsLR.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{Vector, DenseVector}
import org.apache.hadoop.conf.Configuration

import org.apache.spark._
import org.apache.spark.scheduler.InputFormatInfo

// Logistic regression example; main() persists the parsed points with StorageLevel.OFF_HEAP.
// NOTE(review): method bodies in this object are truncated in this copy of the file.
object SparkTachyonHdfsLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)  // Seeded with a fixed value

  /**
   * Holds the warning text telling users that this logistic regression is a naive example.
   *
   * NOTE(review): the call that emits the text, the second recommended class name and the
   * closing quotes of the string literal are missing from this copy.
   */
  def showWarning() {
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |for more conventional use.

  /** One training example: feature vector `x` and value `y`. */
  case class DataPoint(x: Vector[Double], y: Double)

  /**
   * Parses one space-separated line: the first token becomes `y`, the following D tokens
   * become the entries of `x`.
   *
   * `nextToken` throws NoSuchElementException when the line has fewer than D + 1 tokens, and
   * `toDouble` throws NumberFormatException on a non-numeric token.
   * NOTE(review): the closing braces of the loop and of the method are missing in this copy.
   *
   * @param line input line with at least D + 1 numeric tokens
   * @return the parsed data point
   */
  def parsePoint(line: String): DataPoint = {
    val tok = new java.util.StringTokenizer(line, " ")
    var y = tok.nextToken.toDouble
    var x = new Array[Double](D)
    var i = 0
    while (i < D) {
      x(i) = tok.nextToken.toDouble; i += 1
    DataPoint(new DenseVector(x), y)

  /**
   * Reads points from a text file and updates the weight vector `w` once per iteration by
   * subtracting a gradient summed over the points.
   *
   * NOTE(review): this copy has lost text — the receivers of `_).persist(...)` and of the
   * gradient function literal, part of the gradient formula and the closing braces.
   *
   * @param args args(0) is the input path, args(1) the number of iterations; no length check
   *             is visible in this copy
   */
  def main(args: Array[String]) {


    val inputPath = args(0)
    val sparkConf = new SparkConf().setAppName("SparkTachyonHdfsLR")
    val conf = new Configuration()
    val sc = new SparkContext(sparkConf,
        Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath))
    val lines = sc.textFile(inputPath)
    // NOTE(review): presumably the lines mapped through parsePoint; the receiver is missing.
    val points = _).persist(StorageLevel.OFF_HEAP)
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      // Per-point gradient terms, summed with reduce.
      val gradient = { p =>
        p.x * (1 / (1 + exp(-p.y * ( - 1) * p.y
      }.reduce(_ + _)
      w -= gradient

    println("Final w: " + w)
Source File: SkewedGroupByTest.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._

/**
 * Spark job that generates random key/value pairs where later mappers produce more pairs
 * than earlier ones.
 *
 * Arguments (all optional, positional): numMappers (default 2), numKVPairs (1000),
 * valSize (1000), numReducers (defaults to numMappers).
 *
 * NOTE(review): closing braces and the statements after the generation loop are missing
 * from this copy.
 */
object SkewedGroupByTest {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("GroupBy Test")
    var numMappers = if (args.length > 0) args(0).toInt else 2
    var numKVPairs = if (args.length > 1) args(1).toInt else 1000
    var valSize = if (args.length > 2) args(2).toInt else 1000
    var numReducers = if (args.length > 3) args(3).toInt else numMappers

    val sc = new SparkContext(sparkConf)

    val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random

      // map output sizes linearly increase from the 1st to the last
      // NOTE(review): this reassigns a captured `var` inside the flatMap closure — confirm
      // that every task is meant to start from the original numKVPairs value.
      numKVPairs = (1.0 * (p + 1) / numMappers * numKVPairs).toInt

      var arr1 = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
    // Enforce that everything has been calculated and in cache


Source File: SparkHdfsLR.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{Vector, DenseVector}
import org.apache.hadoop.conf.Configuration

import org.apache.spark._
import org.apache.spark.scheduler.InputFormatInfo

// Logistic regression example; main() reads its points from a text file and caches them.
// NOTE(review): method bodies in this object are truncated in this copy of the file.
object SparkHdfsLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)  // Seeded with a fixed value

  /** One training example: feature vector `x` and value `y`. */
  case class DataPoint(x: Vector[Double], y: Double)

  /**
   * Parses one space-separated line: the first token becomes `y`, the following D tokens
   * become the entries of `x`.
   *
   * `nextToken` throws NoSuchElementException when the line has fewer than D + 1 tokens, and
   * `toDouble` throws NumberFormatException on a non-numeric token.
   * NOTE(review): the closing braces of the loop and of the method are missing in this copy.
   *
   * @param line input line with at least D + 1 numeric tokens
   * @return the parsed data point
   */
  def parsePoint(line: String): DataPoint = {
    val tok = new java.util.StringTokenizer(line, " ")
    var y = tok.nextToken.toDouble
    var x = new Array[Double](D)
    var i = 0
    while (i < D) {
      x(i) = tok.nextToken.toDouble; i += 1
    DataPoint(new DenseVector(x), y)

  /**
   * Holds the warning text telling users that this logistic regression is a naive example.
   *
   * NOTE(review): the call that emits the text, the second recommended class name and the
   * closing quotes of the string literal are missing from this copy.
   */
  def showWarning() {
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |for more conventional use.

  /**
   * Reads points from a text file and updates the weight vector `w` once per iteration by
   * subtracting a gradient summed over the points.
   *
   * NOTE(review): this copy has lost text — whatever follows the usage message, the
   * receivers of `_).cache()` and of the gradient function literal, part of the gradient
   * formula and the closing braces.
   *
   * @param args args(0) is the input path, args(1) the number of iterations
   */
  def main(args: Array[String]) {

    // Prints the usage line when fewer than two arguments are given.
    // NOTE(review): presumably exits afterwards; that statement is not visible here.
    if (args.length < 2) {
      System.err.println("Usage: SparkHdfsLR <file> <iters>")


    val sparkConf = new SparkConf().setAppName("SparkHdfsLR")
    val inputPath = args(0)
    val conf = new Configuration()
    val sc = new SparkContext(sparkConf,
        Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath))
    val lines = sc.textFile(inputPath)
    // NOTE(review): presumably the lines mapped through parsePoint; the receiver is missing.
    val points = _).cache()
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      // Per-point gradient terms, summed with reduce.
      val gradient = { p =>
        p.x * (1 / (1 + exp(-p.y * ( - 1) * p.y
      }.reduce(_ + _)
      w -= gradient

    println("Final w: " + w)
Source File: LocalLR.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import breeze.linalg.{Vector, DenseVector}

// Logistic regression example that runs on locally generated data.
// NOTE(review): method bodies in this object are truncated in this copy of the file.
object LocalLR {
  val N = 10000  // Number of data points
  val D = 10   // Number of dimensions
  val R = 0.7  // Scaling factor
  val ITERATIONS = 5  // Number of gradient steps taken by main()
  val rand = new Random(42)  // Seeded with a fixed value

  /** One training example: feature vector `x` and value `y`. */
  case class DataPoint(x: Vector[Double], y: Double)

  /**
   * Produces the training set.
   *
   * The visible helper gives even indices y = -1 and odd indices y = 1, and fills a vector
   * of length D with `rand.nextGaussian + y * R`.
   * NOTE(review): the expression that assembles the returned array, and the closing braces,
   * are missing from this copy.
   */
  def generateData: Array[DataPoint] = {
    def generatePoint(i: Int): DataPoint = {
      val y = if (i % 2 == 0) -1 else 1
      val x = DenseVector.fill(D){rand.nextGaussian + y * R}
      DataPoint(x, y)

  /**
   * Holds the warning text telling users that this logistic regression is a naive example.
   *
   * NOTE(review): the call that emits the text, the second recommended class name and the
   * closing quotes of the string literal are missing from this copy.
   */
  def showWarning() {
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |for more conventional use.

  /**
   * Generates the data, then performs ITERATIONS updates of the weight vector `w`, each one
   * subtracting a gradient accumulated over all data points.
   *
   * NOTE(review): part of the `scale` formula and all closing braces are missing from this
   * copy.
   *
   * @param args command-line arguments; not read by the visible code
   */
  def main(args: Array[String]) {


    val data = generateData
    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      // Accumulator for this iteration's gradient, starting from the zero vector.
      var gradient = DenseVector.zeros[Double](D)
      for (p <- data) {
        val scale = (1 / (1 + math.exp(-p.y * ( - 1) * p.y
        gradient +=  p.x * scale
      w -= gradient

    println("Final w: " + w)
Source File: GroupByTest.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._

/**
 * Spark job that generates random key/value pairs on every mapper.
 *
 * Arguments (all optional, positional): numMappers (default 2), numKVPairs (1000),
 * valSize (1000), numReducers (defaults to numMappers).
 *
 * NOTE(review): closing braces and the statements after the generation loop are missing
 * from this copy.
 */
object GroupByTest {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("GroupBy Test")
    var numMappers = if (args.length > 0) args(0).toInt else 2
    var numKVPairs = if (args.length > 1) args(1).toInt else 1000
    var valSize = if (args.length > 2) args(2).toInt else 1000
    var numReducers = if (args.length > 3) args(3).toInt else numMappers

    val sc = new SparkContext(sparkConf)

    val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random
      var arr1 = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        // Each pair gets a random non-negative Int key and a zero-filled value of valSize bytes.
        val byteArr = new Array[Byte](valSize)
        arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
    // Enforce that everything has been calculated and in cache


Source File: Utils.scala    From zen   with Apache License 2.0 5 votes vote down vote up

import java.util.Random

object Utils {
  val random = new Random()
  /**
   * Computes `log(1 + exp(x))` without overflowing for large `x`.
   *
   * For positive `x` the identity `log(1 + e^x) = x + log(1 + e^-x)` is used so that `exp`
   * is only ever called on a non-positive argument.
   *
   * @param x any real number
   * @return `log(1 + exp(x))`
   */
  def log1pExp(x: Double): Double = {
    if (x > 0) {
      x + math.log1p(math.exp(-x))
    } else {
      // exp(x) <= 1 here, so the direct form cannot overflow.
      math.log1p(math.exp(x))
    }
  }
Example 127
Source File: CustomActivationExample.scala    From dl4scala   with MIT License 5 votes vote down vote up
package org.dl4scala.examples.misc.activationfunctions

import java.util.{Collections, Random}

import org.deeplearning4j.datasets.iterator.impl.ListDataSetIterator
import org.deeplearning4j.nn.api.OptimizationAlgorithm
import org.deeplearning4j.nn.conf.layers.{DenseLayer, OutputLayer}
import org.deeplearning4j.nn.conf.{NeuralNetConfiguration, Updater}
import org.deeplearning4j.nn.multilayer.MultiLayerNetwork
import org.deeplearning4j.nn.weights.WeightInit
import org.deeplearning4j.optimize.listeners.ScoreIterationListener
import org.nd4j.linalg.activations.Activation
import org.nd4j.linalg.api.ndarray.INDArray
import org.nd4j.linalg.dataset.DataSet
import org.nd4j.linalg.dataset.api.iterator.DataSetIterator
import org.nd4j.linalg.factory.Nd4j
import org.nd4j.linalg.lossfunctions.LossFunctions

/**
 * Trains a small two-layer network that uses a custom activation function to add two
 * numbers.
 *
 * NOTE(review): this copy has lost lines — most of the network builder chain, the body of
 * the training loop, the output statement and the closing braces are missing.
 */
object CustomActivationExample {
  val seed = 12345
  val iterations = 1
  val nEpochs = 500      // Number of passes made by the loop in main()
  val nSamples = 1000    // Number of generated training examples
  val batchSize = 100
  val learningRate = 0.001
  var MIN_RANGE = 0      // Lower bound of the generated inputs
  var MAX_RANGE = 3      // Upper bound of the generated inputs

  val rng = new Random(seed)

  def main(args: Array[String]): Unit = {
    // Create the network
    val numInput = 2
    val numOutputs = 1
    val nHidden = 10

    val net = new MultiLayerNetwork(new NeuralNetConfiguration.Builder()
      //Refer to CustomActivation class for more details on implementation
      .layer(0, new DenseLayer.Builder().nIn(numInput).nOut(nHidden)
      .activation(new CustomActivation())
      .layer(1, new OutputLayer.Builder(LossFunctions.LossFunction.MSE)

    net.setListeners(new ScoreIterationListener(100))

    // NOTE(review): the per-epoch training body is missing in this copy.
    (0 until nEpochs).foreach{_ =>

    // Test the addition of 2 numbers (Try different numbers here)
    val input: INDArray = Nd4j.create(Array[Double](0.111111, 0.3333333333333), Array[Int](1, 2))
    val out: INDArray = net.output(input, false)

  /**
   * Builds the training set: nSamples pairs of random inputs in [MIN_RANGE, MAX_RANGE) with
   * their sum as the target, shuffled and wrapped in a batching iterator.
   *
   * @param batchSize batch size handed to the returned iterator
   * @param rand      generator used for the input values
   * @return iterator over the shuffled examples
   */
  private def getTrainingData(batchSize: Int, rand: Random): DataSetIterator = {
    val sum = new Array[Double](nSamples)
    val input1 = new Array[Double](nSamples)
    val input2 = new Array[Double](nSamples)

    (0 until nSamples).foreach{i =>
      input1(i) = MIN_RANGE + (MAX_RANGE - MIN_RANGE) * rand.nextDouble
      input2(i) = MIN_RANGE + (MAX_RANGE - MIN_RANGE) * rand.nextDouble
      sum(i) = input1(i) + input2(i)

    // Two nSamples x 1 columns stacked side by side.
    val inputNDArray1 = Nd4j.create(input1, Array[Int](nSamples, 1))
    val inputNDArray2 = Nd4j.create(input2, Array[Int](nSamples, 1))
    val inputNDArray = Nd4j.hstack(inputNDArray1, inputNDArray2)
    val outPut = Nd4j.create(sum, Array[Int](nSamples, 1))
    val dataSet = new DataSet(inputNDArray, outPut)
    val listDs = dataSet.asList
    // NOTE(review): shuffles with the object-level `rng`, not the `rand` parameter — confirm
    // that this is intended.
    Collections.shuffle(listDs, rng)
    new ListDataSetIterator(listDs, batchSize)
Example 128
Source File: ToxCoreTestBase.scala    From jvm-toxcore-c   with GNU General Public License v3.0 5 votes vote down vote up
package im.tox.tox4j

import{ InetAddress, Socket }
import java.util.Random

import org.jetbrains.annotations.NotNull
import org.scalatest.Assertions

// Shared helpers for tox4j tests: candidate DHT nodes, key formatting and parsing, and
// network reachability checks.
// NOTE(review): method bodies in this object are truncated in this copy of the file.
object ToxCoreTestBase extends Assertions {

  // Candidate nodes; each entry ends with a port and a 64-character hex public key.
  // NOTE(review): the address strings are empty and the closing parenthesis of `Seq(` is
  // missing in this copy.
  private[tox4j] val nodeCandidates = Seq(
    new DhtNode("", "", 33445, "3F0A45A268367C1BEA652F258C85F4A66DA76BCAA667A49E770BCC4917AB6A25"),
    new DhtNode("", null, 33445, "1C5293AEF2114717547B39DA8EA6F1E331E5E358B35F9B6B5F19317911C5F976")

  /**
   * Fills a new array of `length` bytes using a fresh `java.util.Random`.
   *
   * NOTE(review): the result expression (presumably `array`) and the closing brace are
   * missing from this copy.
   *
   * @param length number of bytes to generate
   */
  @NotNull def randomBytes(length: Int): Array[Byte] = {
    val array = new Array[Byte](length)
    new Random().nextBytes(array)

  /**
   * Formats every byte of `id` with `%02X` (upper-case hex) and appends it to a builder.
   *
   * NOTE(review): the result expression (presumably `str.toString`) and the closing brace
   * are missing from this copy.
   *
   * @param id bytes to format
   */
  def readablePublicKey(@NotNull id: Array[Byte]): String = {
    val str = new StringBuilder
    id foreach { c => str.append(f"$c%02X") }

  /**
   * Decodes a hex string into bytes, two characters per byte with the high digit first.
   *
   * The array length is `id.length / 2`, so a trailing odd character is ignored.
   * NOTE(review): the result expression (presumably `publicKey`) and the closing braces are
   * missing from this copy.
   *
   * @param id hex text to decode
   */
  def parsePublicKey(@NotNull id: String): Array[Byte] = {
    val publicKey = new Array[Byte](id.length / 2)
    publicKey.indices foreach { i =>
      publicKey(i) =
        ((fromHexDigit(id.charAt(i * 2)) << 4) +
          fromHexDigit(id.charAt(i * 2 + 1))).toByte

  /**
   * Converts one hexadecimal digit character to its numeric value.
   *
   * @param c a character in `0-9`, `A-F` or `a-f`
   * @return the value of the digit (0 to 15)
   * @throws IllegalArgumentException if `c` is not a hexadecimal digit
   */
  private def fromHexDigit(c: Char): Byte = {
    val digit = c match {
      case d if '0' <= d && d <= '9' => d - '0'
      case d if 'A' <= d && d <= 'F' => d - 'A' + 10
      case d if 'a' <= d && d <= 'f' => d - 'a' + 10
      case _ => throw new IllegalArgumentException(s"Non-hex digit character: $c")
    }
    // Char arithmetic yields Int; the value is 0..15, so narrowing to Byte is lossless.
    digit.toByte
  }

  /**
   * Tries to open a TCP connection to `ip:port` and reports why it failed.
   *
   * NOTE(review): the success branch (presumably `None`), the body of the `finally` guard
   * (presumably closing the socket) and the closing braces are missing from this copy.
   *
   * @param ip   host name or address to resolve and connect to
   * @param port TCP port
   * @return `Some(message)` describing the failure in the branches visible here
   */
  private def hasConnection(ip: String, port: Int): Option[String] = {
    // Stays null when the constructor throws, which the `finally` guard checks for.
    var socket: Socket = null
    try {
      socket = new Socket(InetAddress.getByName(ip), port)
      if (socket.getInputStream == null) {
        Some("Socket input stream is null")
      } else {
    } catch {
      case e: IOException =>
        Some(s"A network connection can't be established to $ip:$port: ${e.getMessage}")
    } finally {
      if (socket != null) {

  // Reachability check on port 53.
  // NOTE(review): the address literal is empty in this copy.
  def checkIPv4: Option[String] = {
    hasConnection("", 53)

  // Reachability check against 2001:4860:4860::8888 on port 53.
  def checkIPv6: Option[String] = {
    hasConnection("2001:4860:4860::8888", 53)

  // NOTE(review): body missing in this copy; presumably turns the checkIPv4 result into a
  // test assumption.
  protected[tox4j] def assumeIPv4(): Unit = {

  // NOTE(review): body missing in this copy; presumably turns the checkIPv6 result into a
  // test assumption.
  protected[tox4j] def assumeIPv6(): Unit = {

Example 129
Source File: PartitionwiseSampledRDD.scala    From sona   with Apache License 2.0 5 votes vote down vote up

import java.util.Random

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.util.random.RandomSampler

import scala.reflect.ClassTag

/**
 * Partition that pairs a parent partition with a random seed.
 *
 * @param prev the wrapped parent partition
 * @param seed seed value carried along with the partition
 */
private[sona] class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  // Same index as the wrapped parent partition.
  override val index: Int = prev.index

/**
  * An RDD sampled from its parent RDD partition-wise. For each partition of the parent RDD,
  * a user-specified [[org.apache.spark.util.random.RandomSampler]] instance is used to obtain
  * a random sample of the records in the partition. The random seeds assigned to the samplers
  * are guaranteed to have different values.
  * @param prev                  RDD to be sampled
  * @param sampler               a random sampler
  * @param preservesPartitioning whether the sampler preserves the partitioner of the parent RDD
  * @param seed                  random seed
  * @tparam T input RDD item type
  * @tparam U sampled RDD item type
  */
// NOTE(review): this copy has lost text in `getPartitions`, the whole body of
// `getPreferredLocations`, and the closing braces.
private[sona] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
                                                                       prev: RDD[T],
                                                                       sampler: RandomSampler[T, U],
                                                                       preservesPartitioning: Boolean,
                                                                       @transient private val seed: Long = (new Random).nextLong)
  extends RDD[U](prev) {

  // Reuses the parent's partitioner only when the caller says partitioning is preserved.
  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  // Wraps parent partitions, drawing one seed per partition from a generator seeded with `seed`.
  // NOTE(review): the expression between `firstParent[T]` and `=>` is missing in this copy.
  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T] => new PartitionwiseSampledRDDPartition(x, random.nextLong()))

  // NOTE(review): the body is missing in this copy.
  override def getPreferredLocations(split: Partition): Seq[String] =

  // Samples the parent's iterator for the wrapped partition with a clone of the sampler.
  // NOTE(review): presumably the clone is seeded with `split.seed` first; no such line is
  // visible here.
  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
Example 130
Source File: WeightedRandomSampler.scala    From sona   with Apache License 2.0 5 votes vote down vote up
package com.tencent.angel.sona.graph.utils
import java.util.Random

import org.apache.spark.util.SparkUtil
import org.apache.spark.util.random.RandomSampler

import scala.reflect.ClassTag

/**
 * Sampler over `(item, weight)` pairs that keeps a pair when the weight-based `sample`
 * returns a positive count.
 *
 * NOTE(review): closing braces and the opening of the check in `setFraction` are missing
 * from this copy.
 *
 * @tparam T item type
 * @tparam U output element type; the kept pairs are cast to it without conversion
 */
abstract class WeightedRandomSampler[T: ClassTag, U: ClassTag] extends RandomSampler[(T, Float), U] {

  // Sampling fraction; 0.0 until setFraction is called.
  protected var fraction = 0.0

  // Keeps the pairs for which sample(weight) > 0. The cast is unchecked, so U must really
  // be the pair type.
  override def sample(items: Iterator[(T, Float)]): Iterator[U] = {
    items.filter(x => sample(x._2) > 0).asInstanceOf[Iterator[U]]

  // Decides, for one weight, how many times the item is taken.
  def sample(weight: Float): Int

  // Not implemented: calling the weight-less variant throws NotImplementedError.
  override def sample(): Int = ???

  // Stores the sampling fraction.
  // NOTE(review): the three lines below look like the arguments of a `require(` call whose
  // opening is missing; they allow a tolerance of 1e-6 around [0, 1].
  def setFraction(fraction: Double): Unit = {
      fraction >= (0.0 - 1e-6)
        && fraction <= (1.0 + 1e-6),
      s"Sampling fraction ($fraction) must be on interval [0, 1]")
    this.fraction = fraction

  // Not implemented here; throws NotImplementedError unless overridden.
  override def clone: WeightedRandomSampler[T, U] = ???

/**
 * Weighted sampler that compares a random draw with `fraction * weight` for each item.
 *
 * NOTE(review): the values returned by the three branches of `sample` and the closing
 * braces are missing from this copy.
 */
class NaiveWeightedBernoulliSampler[T: ClassTag] extends WeightedRandomSampler[T, (T, Float)] {

  // Seeded from the clock at construction; setSeed replaces the seed.
  private val rng: Random = SparkUtil.getXORShiftRandom(System.nanoTime)

  override def setSeed(seed: Long): Unit = rng.setSeed(seed)

  // No random number is drawn when fraction <= 0.0; otherwise one draw is compared with
  // fraction * weight.
  def sample(weight: Float): Int = {
    if (fraction <= 0.0) {
    } else {
      if (rng.nextDouble() <= fraction * weight) {
      } else {

  // Returns a brand-new sampler; the visible code copies neither `fraction` nor the seed.
  override def clone: NaiveWeightedBernoulliSampler[T] = new NaiveWeightedBernoulliSampler[T]
Example 131
Source File: PartitionwiseWeightedSampledRDD.scala    From sona   with Apache License 2.0 5 votes vote down vote up
package com.tencent.angel.sona.graph.utils
import java.util.Random

import org.apache.spark.rdd.RDD
import org.apache.spark.{Partition, Partitioner, TaskContext}

import scala.reflect.ClassTag
import scala.util.{Random => ScalaRandom}

/**
 * Partition that pairs a parent partition with a random seed and a sampling fraction.
 *
 * @param prev     the wrapped parent partition
 * @param seed     seed value carried along with the partition
 * @param fraction sampling fraction carried along with the partition
 */
class PartitionwiseWeightedSampledRDDPartition(val prev: Partition, val seed: Long, val fraction: Double)
  extends Partition with Serializable {
  // Same index as the wrapped parent partition.
  override val index: Int = prev.index

/**
 * RDD that runs a [[WeightedRandomSampler]] over each partition of an RDD of
 * `(item, weight)` pairs, with a sampling fraction looked up per partition index.
 *
 * NOTE(review): this copy has lost text in `getPartitions` and `getPreferredLocations`,
 * and the closing braces.
 *
 * @param prev                  parent RDD of weighted items
 * @param sampler               sampler that is cloned for every `compute` call
 * @param fractions             sampling fraction per partition index; missing indices get 0.0
 * @param preservesPartitioning when true the parent's partitioner is reused, otherwise None
 * @param seed                  seed of the generator that produces the per-partition seeds
 */
class PartitionwiseWeightedSampledRDD[T: ClassTag, U: ClassTag](
                                                                 prev: RDD[(T, Float)],
                                                                 sampler: WeightedRandomSampler[T, U],
                                                                 fractions: Map[Int, Double],
                                                                 preservesPartitioning: Boolean,
                                                                 @transient private val seed: Long = ScalaRandom.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner: Option[Partitioner] = {
    if (preservesPartitioning) prev.partitioner else None

  // Wraps parent partitions with a fresh seed and the fraction registered for their index.
  // NOTE(review): the expression between `firstParent[(T, Float)]` and `{ x =>` is missing.
  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[(T, Float)] { x =>
      new PartitionwiseWeightedSampledRDDPartition(x, random.nextLong(), fractions.getOrElse(x.index, 0.0))

  // NOTE(review): the argument of `preferredLocations(` is missing in this copy.
  override def getPreferredLocations(split: Partition): Seq[String] = {
    firstParent[(T, Float)].preferredLocations(

  // Samples the parent's iterator for the wrapped partition with a clone of the sampler.
  // NOTE(review): presumably the clone receives `split.seed` and `split.fraction` first; no
  // such lines are visible here.
  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseWeightedSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.sample(firstParent[(T, Float)].iterator(split.prev, context))
Example 132
Source File: CompositeSampler.scala    From zen   with Apache License 2.0 5 votes vote down vote up

import java.util.Random

import spire.math.{Numeric => spNum}

/**
 * Sampler made of several component samplers that are consulted in sequence.
 *
 * NOTE(review): this copy has lost text — the bodies of `apply` and `norm`, the right-hand
 * sides of the `curSampler` assignments, the result of `resetComponents` and the closing
 * braces.
 */
class CompositeSampler(implicit ev: spNum[Double])
  extends Sampler[Double] {
  // Component samplers; unset until resetComponents is called.
  private var samplers: Seq[Sampler[_]] = _

  protected def numer: spNum[Double] = ev

  // NOTE(review): body missing in this copy.
  def apply(state: Int): Double =

  // NOTE(review): body missing in this copy.
  def norm: Double =

  // Walks the components, subtracting each component's norm from `base` until the
  // remainder is smaller than the current component's norm, then samples from that
  // component with the remainder.
  def sampleFrom(base: Double, gen: Random): Int = {
    val sampIter = samplers.iterator
    var curSampler =
    var subNorm = curSampler.normDouble
    var remain = base
    while (remain >= subNorm) {
      remain -= subNorm
      curSampler =
      subNorm = curSampler.normDouble
    curSampler.sampleFromDouble(remain, gen)

  // Replaces the component samplers.
  // NOTE(review): the result expression (presumably `this`) is missing in this copy.
  def resetComponents(samplers: Sampler[_]*): CompositeSampler = {
    this.samplers = samplers
Example 133
Source File: MetropolisHastings.scala    From zen   with Apache License 2.0 5 votes vote down vote up

import java.util.Random

import spire.math.{Numeric => spNum}

/**
 * Sampler that draws candidates from a proposal sampler and accepts or rejects each one
 * based on the ratio computed by `acceptRate`.
 *
 * NOTE(review): the result expressions of `sampleFrom` and of both `resetProb` overloads,
 * and the closing braces, are missing from this copy.
 */
class MetropolisHastings(implicit ev: spNum[Double])
  extends Sampler[Double] {
  // Maps a state to its (unnormalised) weight.
  type TransProb = Int => Double

  // All three fields are unset until one of the resetProb overloads is called.
  private var origFunc: TransProb = _
  private var proposal: Sampler[Double] = _
  private var state: Int = _

  protected def numer: spNum[Double] = ev

  def apply(state: Int): Double = origFunc(state)

  // The norm is taken from the proposal sampler.
  def norm: Double = proposal.norm

  // Draws a candidate from the proposal. A candidate different from the current state
  // replaces it when the acceptance ratio is at least 1 or a uniform draw falls below it.
  // NOTE(review): the result expression (presumably `state`) is missing in this copy.
  def sampleFrom(base: Double, gen: Random): Int = {
    val newState = proposal.sampleFrom(base, gen)
    if (newState != state) {
      val ar = acceptRate(newState)
      // `ar >= 1.0` short-circuits, so no random number is drawn for a certain acceptance.
      if (ar >= 1.0 || gen.nextDouble() < ar) {
        state = newState

  // origFunc(new) * proposal(old) / (origFunc(old) * proposal(new))
  private def acceptRate(newState:Int): Double = {
    origFunc(newState) * proposal(state) /
      (origFunc(state) * proposal(newState))

  // Installs a new target function and proposal, starting from the given state.
  def resetProb(origFunc: TransProb,
    proposal: Sampler[Double],
    initState: Int): MetropolisHastings = {
    this.origFunc = origFunc
    this.proposal = proposal
    this.state = initState

  // Same as above, but the starting state is drawn from the proposal.
  def resetProb(origFunc: TransProb,
    proposal: Sampler[Double],
    gen: Random): MetropolisHastings = {
    this.origFunc = origFunc
    this.proposal = proposal
    this.state = proposal.sampleRandom(gen)
Example 134
Source File: DiscreteSampler.scala    From zen   with Apache License 2.0 5 votes vote down vote up

import java.util.Random
import scala.annotation.tailrec

import spire.math.{Numeric => spNum}

/**
 * Sampler over a discrete state space with updatable weights, plus helpers that redraw
 * when a sample repeats a given state.
 *
 * NOTE(review): both branches of `resampleRandom`, the `else` branch of `resampleFrom` and
 * the closing braces are missing from this copy.
 */
trait DiscreteSampler[@specialized(Double, Int, Float, Long) T] extends Sampler[T] {
  def length: Int
  def used: Int
  def update(state: Int, value: => T): Unit
  def deltaUpdate(state: Int, delta: => T): Unit
  def resetDist(probs: Array[T], space: Array[Int], psize: Int): DiscreteSampler[T]
  def resetDist(distIter: Iterator[(Int, T)], psize: Int): DiscreteSampler[T]
  def reset(newSize: Int): DiscreteSampler[T]

  /**
   * Draws a state with `sampleRandom`. The visible condition selects a retry when the draw
   * equals `state`, `numResampling` is still non-negative, more than one state is in use,
   * and either `residualRate >= 1.0` or a uniform draw falls below `residualRate`.
   *
   * NOTE(review): by analogy with `resampleFrom`, the first branch presumably recurses with
   * `numResampling - 1` and the second returns `newState`.
   */
  @tailrec final def resampleRandom(gen: Random,
    state: Int,
    residualRate: Double,
    numResampling: Int = 2)(implicit ev: spNum[T]): Int = {
    val newState = sampleRandom(gen)
    if (newState == state && numResampling >= 0 && used > 1 &&
      (residualRate >= 1.0 || gen.nextDouble() < residualRate)) {
    } else {

  /**
   * Draws a state with `sampleFrom(base, gen)`. Under the same retry condition as
   * `resampleRandom`, it picks a new base uniformly in `[0, norm)` and recurses with one
   * retry fewer.
   *
   * NOTE(review): the `else` branch (presumably `newState`) is missing in this copy.
   */
  @tailrec final def resampleFrom(base: T,
    gen: Random,
    state: Int,
    residualRate: Double,
    numResampling: Int = 2)(implicit ev: spNum[T]): Int = {
    val newState = sampleFrom(base, gen)
    if (newState == state && numResampling >= 0 && used > 1 &&
      (residualRate >= 1.0 || gen.nextDouble() < residualRate)) {
      val newBase = ev.fromDouble(gen.nextDouble() * ev.toDouble(norm))
      resampleFrom(newBase, gen, state, residualRate, numResampling - 1)
    } else {
Example 135
Source File: FlowerDataSetIterator.scala    From dl4scala   with MIT License 5 votes vote down vote up
package org.dl4scala.examples.transferlearning.vgg16.dataHelpers

import{File, IOException}

import org.datavec.api.split.{FileSplit, InputSplit}
import org.datavec.image.loader.BaseImageLoader
import org.nd4j.linalg.dataset.api.iterator.DataSetIterator
import java.util
import java.util.Random

import org.datavec.api.util.ArchiveUtils
import org.datavec.image.recordreader.ImageRecordReader
import org.deeplearning4j.datasets.datavec.RecordReaderDataSetIterator
import org.deeplearning4j.nn.modelimport.keras.trainedmodels.TrainedModels

/**
 * Prepares train and test iterators over the flower photo data set stored under the user's
 * home directory.
 *
 * NOTE(review): this copy has lost text — the `try` that belongs to the `catch` in `setup`,
 * the end of `makeIterator`, the logger call in `downloadAndUntar`, the download URL and
 * the closing braces.
 */
object FlowerDataSetIterator {
  private val log = org.slf4j.LoggerFactory.getLogger(FlowerDataSetIterator.getClass)

  // File + String concatenation, so DATA_DIR and FLOWER_DIR are Strings.
  private val DATA_DIR = new File(System.getProperty("user.home")) + "/dl4jDataDir"
  private val DATA_URL = ""
  private val FLOWER_DIR = DATA_DIR + "/flower_photos"

  private val allowedExtensions = BaseImageLoader.ALLOWED_FORMATS
  // Fixed seed, shared by the file split and the path filter in setup().
  private val rng = new Random(13)

  // Image dimensions and class count passed to the record reader and iterator.
  private val height = 224
  private val width = 224
  private val channels = 3
  private val numClasses = 5

  private val labelMaker = new ParentPathLabelGenerator
  // Unset until setup() is called.
  private var trainData: InputSplit = _
  private var testData: InputSplit = _
  private var batchSize = 0

  def trainIterator: DataSetIterator = makeIterator(trainData)

  def testIterator: DataSetIterator = makeIterator(testData)

  /**
   * Splits the flower images into a training and a test part and stores the batch size.
   *
   * @param batchSizeArg batch size used by iterators created afterwards
   * @param trainPerc    share of the data used for training; must be below 100
   * @throws IllegalArgumentException when `trainPerc` is 100 or more
   */
  def setup(batchSizeArg: Int, trainPerc: Int): Unit = {
    // NOTE(review): the `try` block this `catch` belongs to is missing in this copy;
    // presumably it calls downloadAndUntar().
    catch {
      case e: IOException =>
        log.error("IOException : ", e)

    batchSize = batchSizeArg
    val parentDir = new File(FLOWER_DIR)
    val filesInDir = new FileSplit(parentDir, allowedExtensions, rng)
    val pathFilter = new BalancedPathFilter(rng, allowedExtensions, labelMaker)
    if (trainPerc >= 100)
      throw new IllegalArgumentException("Percentage of data set aside for training has to be less than 100%." +
        " Test percentage = 100 - training percentage, has to be greater than 0")
    // Weighted split: index 0 is the training part, index 1 the test part.
    val filesInDirSplit = filesInDir.sample(pathFilter, trainPerc, 100 - trainPerc)
    trainData = filesInDirSplit(0)
    testData = filesInDirSplit(1)

  // Builds an image record reader and a data set iterator.
  // NOTE(review): `split` is not used by the visible lines and the result expression is
  // missing in this copy.
  private def makeIterator(split: InputSplit) = {
    val recordReader = new ImageRecordReader(height, width, channels, labelMaker)
    val iter = new RecordReaderDataSetIterator(recordReader, batchSize, 1, numClasses)

  // Downloads the archive to DATA_DIR when it is not already there, then unpacks it.
  def downloadAndUntar(): Unit = {
    val rootFile = new File(DATA_DIR)
    // File.mkdir creates a single directory level only.
    if (!rootFile.exists) rootFile.mkdir
    val tarFile = new File(DATA_DIR, "flower_photos.tgz")
    // NOTE(review): the logger call that wraps this message is missing in this copy.
    if (!tarFile.isFile) {"Downloading the flower dataset from " + DATA_URL + "...")
      FileUtils.copyURLToFile(new URL(DATA_URL), tarFile)
    ArchiveUtils.unzipFileTo(tarFile.getAbsolutePath, rootFile.getAbsolutePath)
Example 136
Source File: LDADataGenerator.scala    From Swallow   with Apache License 2.0 5 votes vote down vote up


import java.util.Random

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import scala.collection.mutable.{HashMap => MHashMap}
import org.apache.spark.rdd.RDD

  /**
   * Builds an RDD of (document index, sparse term-count vector) pairs.
   *
   * NOTE(review): this listing looks truncated by extraction -- the `map` over `part`,
   * several closing braces and the expression that builds `iter` are missing, and `data`
   * is never returned in the visible text. Restore from the upstream source before compiling.
   *
   * @param sc        Spark context used to parallelize the document indices
   * @param numDocs   number of documents; indices run over `0L until numDocs`
   * @param numVocab  vocabulary size; word indices are drawn with `rng.nextInt(numVocab)`
   * @param docLenMin lower bound (inclusive) of the drawn document length
   * @param docLenMax upper bound (inclusive) of the drawn document length
   * @param numParts  number of partitions passed to `parallelize`
   * @param seed      base seed; XOR-ed with the partition index for each partition's Random
   */
  def generateLDARDD(
    sc: SparkContext,
    numDocs: Long,
    numVocab: Int,
    docLenMin: Int,
    docLenMax: Int,
    numParts: Int = 3,
    seed: Long = System.currentTimeMillis()): RDD[(Long, Vector)] = {
    val data = sc.parallelize(0L until numDocs, numParts).mapPartitionsWithIndex { 
      (idx, part) =>
        val rng = new Random(seed ^ idx) { case docIndex =>
          var currentSize = 0
          // word index -> occurrence count for the current document
          val entries = MHashMap[Int, Int]()
          val docLength = rng.nextInt(docLenMax - docLenMin + 1) + docLenMin
          // Draw docLength words, incrementing the count of each drawn index.
          while (currentSize < docLength) {
            val index = rng.nextInt(numVocab)
            entries(index) = entries.getOrElse(index, 0) + 1
            currentSize += 1

          val iter = => (v._1, v._2.toDouble))
          (docIndex, Vectors.sparse(numVocab, iter))

  /**
   * Entry point: creates the SparkContext, reads five positional arguments
   * (output path, document count, vocabulary size, min and max document length)
   * and calls `generateLDARDD`.
   *
   * NOTE(review): the body of the `else` branch and the code that consumes `data` are
   * missing from this listing -- confirm against the upstream source.
   */
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("LDADataGenerator")
    val sc = new SparkContext(conf)

    // Defaults, overwritten below only when exactly five arguments are supplied.
    var outputPath = ""
    var numDocs: Long = 500L
    var numVocab: Int = 1000
    var docLenMin: Int = 50
    var docLenMax: Int = 10000
    val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism)
    // Partition count comes from the HiBench property, falling back to half of `parallel`.
    val numPartitions = IOCommon.getProperty("hibench.default.shuffle.parallelism")
      .getOrElse((parallel / 2).toString).toInt

    if (args.length == 5) {
      outputPath = args(0)
      // NOTE(review): numDocs is a Long but is parsed with toInt, so values above
      // Int.MaxValue fail to parse -- presumably toLong was intended; confirm.
      numDocs = args(1).toInt
      numVocab = args(2).toInt
      docLenMin = args(3).toInt
      docLenMax = args(4).toInt
      println(s"Output Path: $outputPath")
      println(s"Num of Documents: $numDocs")
      println(s"Vocabulary size: $numVocab")
    } else {

    val data = generateLDARDD(sc, numDocs, numVocab, docLenMin, docLenMax, numPartitions)


Example 137
Source File: Bagging.scala    From streamDM   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.streamdm.classifiers.meta

import java.util.Random
import com.github.javacliparser.{ClassOption, IntOption}
import org.apache.spark.streamdm.classifiers.Classifier
import org.apache.spark.streamdm.classifiers.model._
import org.apache.spark.streamdm.core._
import org.apache.spark.streaming.dstream._
import org.apache.spark.streamdm.utils.Utils
import org.apache.spark.streamdm.core.specification.ExampleSpecification

  /**
   * Collects one prediction per ensemble member for `example` and combines them with
   * `Utils.majorityVote`.
   *
   * NOTE(review): the closing braces of the loop and the method are missing from this listing.
   *
   * @param example the example to classify
   * @return the value returned by `Utils.majorityVote` for the collected predictions
   */
  def ensemblePredict(example: Example): Double = {
    val sizeEnsemble = ensembleSizeOption.getValue
    val predictions: Array[Double] = new Array(sizeEnsemble)
    for (i <- 0 until sizeEnsemble) {
      // Each member's model is cast to ClassificationModel before predicting.
      predictions(i) = classifiers(i).getModel.asInstanceOf[ClassificationModel].predict(example)
    Utils.majorityVote(predictions, numberClasses)

  /**
   * Returns the range of the first output feature of `exampleLearnerSpecification`,
   * or 2 when no specification has been set (null).
   */
  def numberClasses(): Integer = {
    if (exampleLearnerSpecification == null) 2
    else exampleLearnerSpecification.out(0).range
Example 138
Source File: DiscreteAliasSamplerSpec.scala    From bidirectional-random-walk   with MIT License 5 votes vote down vote up
package soal.util

import org.scalatest.FlatSpec
import org.scalatest.Matchers
import java.util.Random

/**
 * ScalaTest spec for DiscreteAliasSampler: checks empirical sampling frequencies and
 * the `expectation` method against values computed from the normalized probabilities.
 *
 * NOTE(review): closing braces of the loops, the helper and the test body are missing
 * from this listing.
 */
class DiscreteAliasSamplerSpec  extends FlatSpec with Matchers  {
  // Fixed seed shared by every sampler built in this spec.
  val random = new Random(1)
  /**
   * Draws `nSamples` values from a sampler built over `values` / `unnormalizedProbabilities`
   * and asserts each empirical frequency is within `4 / sqrt(nSamples)` of its probability.
   */
  def testDistribution(unnormalizedProbabilities: Array[Float],
                       values: Seq[Int],
                       nSamples: Int = 10000
      ): Unit = {
    val probabilities = unnormalizedProbabilities map { _ / unnormalizedProbabilities.sum }
    val n = unnormalizedProbabilities.size
    val valueToIndex = (values zip (0 until n)).toMap
    val sampler = new DiscreteAliasSampler(values, unnormalizedProbabilities, random)
    val sampleCounts = Array.fill(n)(0)
    val tol = 4.0f / math.sqrt(nSamples).toFloat
    for (i <- 0 until nSamples) {
      val v = sampler.sample()
      sampleCounts(valueToIndex(v)) += 1
    for (i <- 0 until n) {
      sampleCounts(i).toFloat / nSamples should equal (probabilities(i) +- tol)

    def f(v: Int): Float = v.toFloat * v.toFloat // compute expectation of v => v^2
    val trueExpectation = ((probabilities zip values) map { case (p, v) => p * v * v }).sum
    // NOTE(review): the tolerance `trueExpectation * 1.00001f` exceeds the expected value
    // itself, so this check is very loose -- presumably a small relative tolerance was
    // intended; confirm.
    sampler.expectation(f) shouldEqual (trueExpectation +- trueExpectation * 1.00001f)

  "A Discrete Distribution" should "support sampling" in {
    testDistribution(Array(575.6355f, 89.733475f, 86.90718f, 721.26416f), Array(2, 3, 5, 7))
    testDistribution(Array(2.0f, 5.0f, 3.0f), Array(17, 11, 13))
    testDistribution(Array(1.0f, 1.0f, 1.0f, 1.0f), Array(-2, 3, -5, 7))
    testDistribution(Array(0.9f, 0.1f), Array(19, 17))
    // One value but two probabilities: construction is expected to be rejected.
    an[IllegalArgumentException] should be thrownBy {
      new DiscreteAliasSampler(Array(1), Array(1.0f, 2.0f))
Example 139
Source File: BidirectionalPPREstimatorSpec.scala    From bidirectional-random-walk   with MIT License 5 votes vote down vote up
package soal.ppr

import java.util.Random

import co.teapot.graph.ConcurrentHashMapDynamicGraph
import org.scalatest.{FlatSpec, Matchers}

import scala.collection.mutable

/**
 * ScalaTest spec comparing BidirectionalPPREstimator results on a test graph with the
 * reference values loaded by `BidirectionalPPREstimatorSpec.testGraphTruePPRs`.
 *
 * NOTE(review): closing braces of the loops, clues and test bodies are missing from
 * this listing.
 */
class BidirectionalPPREstimatorSpec extends FlatSpec with Matchers {
  val graph = ConcurrentHashMapDynamicGraph.readGraph("src/test/resources/test_graph.txt")
  val teleportProb = 0.2f
  val random = new Random(2) // Seed for consistent tests
  val estimator = new BidirectionalPPREstimator(graph, teleportProb, random)
  val truePPRs = BidirectionalPPREstimatorSpec.testGraphTruePPRs

  "BidirectionalPPRSearcher.estimateInversePPR" should "be correct on the test graph" in {
    val pprErrorTolerance = 2.0e-6f
    // For every reference (source, target) pair, the estimate to target t read at s
    // must lie within the absolute tolerance of the reference value.
    for (((s, t), truePPR) <- truePPRs) {
      val inversePPRs = estimator.estimatePPRToTarget(t, pprErrorTolerance)
      withClue (s"Testing Pair ($s, $t)") {
        inversePPRs(s) should equal (truePPR +- pprErrorTolerance)

  "BidirectionalPPRSearcher.estimatePPR" should "be correct on the test graph" in {
    val relativeError = 0.01f
    val stPairs = Array(0 -> 1, 2 -> 3, 5 -> 9, 0 -> 0)

    for ((s, t) <- stPairs) {
      withClue (s"Testing Pair ($s, $t)") {
        // Accepts up to twice the requested relative error around the reference value.
        estimator.estimatePPRSingleSource(s, t, 0.03f, relativeError) should equal (
          truePPRs((s, t)) +- truePPRs((s, t)) * relativeError * 2)

/** Companion holding the loader for the reference PPR values used by the spec. */
object BidirectionalPPREstimatorSpec {
  /**
   * Reads tab-separated lines `start<TAB>target<TAB>value` from
   * src/test/resources/test_graph_true_pprs.txt into a map keyed by (start, target);
   * keys that are absent default to 0.0f.
   *
   * NOTE(review): the closing braces and the expression returning `pprMap` are missing
   * from this listing, and no import of `Source` is visible -- confirm against upstream.
   */
  def testGraphTruePPRs: collection.Map[(Int, Int), Float] = {
    val pprMap = new mutable.HashMap[(Int, Int), Float] {
      override def default(key: (Int, Int)) = 0.0f
    for (line <- Source.fromFile("src/test/resources/test_graph_true_pprs.txt").getLines()) {
      val pieces = line.split("\t")
      val (startId, targetId, truePPR) = (pieces(0).toInt, pieces(1).toInt, pieces(2).toFloat)
      pprMap((startId, targetId)) = truePPR
Example 140
Source File: CsvKafkaPublisher.scala    From Taxi360   with Apache License 2.0 5 votes vote down vote up

import java.util.Random

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}


/**
 * Command-line tool that reads lines from a file (or every file in a folder) and hands
 * them, prefixed with a random salt, to a Kafka producer.
 *
 * NOTE(review): many closing braces and some statements (the end of the usage message,
 * the sleep branch, the actual send call) are missing from this listing; the usage
 * entry `"<> "` also looks garbled. Restore from the upstream source before compiling.
 */
object CsvKafkaPublisher {

  // Not referenced in the visible code: processFile declares its own local `counter`.
  var counter = 0
  // Upper bound (exclusive) for the random salt; assigned from args(8) in main.
  var salts = 0

  /**
   * Parses nine positional arguments, creates the producer and processes the input path.
   *
   * NOTE(review): the usage check only fires for an empty argument list, yet args(0)
   * through args(8) are read unconditionally -- fewer than nine arguments will fail
   * with an index error.
   */
  def main(args:Array[String]): Unit = {
    if (args.length == 0) {
      println("<brokerList> " +
        "<topicName> " +
        "<dataFolderOrFile> " +
        "<sleepPerRecord> " +
        "<acks> " +
        "<> " +
        "<producer.type> " +
        "<batch.size> " +

    val kafkaBrokerList = args(0)
    val kafkaTopicName = args(1)
    val nyTaxiDataFolder = args(2)
    val sleepPerRecord = args(3).toInt
    val acks = args(4).toInt
    val lingerMs = args(5).toInt
    val producerType = args(6) //"async"
    val batchSize = args(7).toInt
    salts = args(8).toInt

    val kafkaProducer = KafkaProducerUntil.getNewProducer(kafkaBrokerList, acks, lingerMs, producerType, batchSize)

    println("--Input:" + nyTaxiDataFolder)

    val dataFolder = new File(nyTaxiDataFolder)
    // A directory is processed file by file; anything else is treated as a single file.
    if (dataFolder.isDirectory) {
      val files = dataFolder.listFiles().iterator
      files.foreach(f => {
        println("--Input:" + f)
        processFile(f, kafkaTopicName, kafkaProducer, sleepPerRecord)
    } else {
      println("--Input:" + dataFolder)
      processFile(dataFolder, kafkaTopicName, kafkaProducer, sleepPerRecord)

  /**
   * Reads `file` line by line, prefixes each line with a random integer below `salts`
   * and passes it to `publishTaxiRecord` once more than two lines have been read.
   */
  def processFile(file:File, kafkaTopicName:String,
                  kafkaProducer: KafkaProducer[String, String], sleepPerRecord:Int): Unit = {
    // Local line counter; shadows the object-level `counter`.
    var counter = 0
    val r = new Random()

    println("-Starting Reading")
    Source.fromFile(file).getLines().foreach(l => {
      counter += 1
      if (counter % 10000 == 0) {
        println("{Sent:" + counter + "}")
      if (counter % 100 == 0) {

      // Int + String concatenation: the salt is prepended to the line text.
      // java.util.Random.nextInt(bound) requires a positive bound, so salts must be > 0.
      val saltedVender = r.nextInt(salts) + l

      // The first two lines of each file are not published.
      if (counter > 2) {
        publishTaxiRecord(saltedVender, kafkaTopicName, kafkaProducer)

  /**
   * Builds a ProducerRecord keyed by the line's hash code, skipping lines that start
   * with "vendor_name" or are shorter than 10 characters.
   */
  def publishTaxiRecord(line:String, kafkaTopicName:String, kafkaProducer: KafkaProducer[String, String]): Unit = {

    if (line.startsWith("vendor_name") || line.length < 10) {
    } else {
      val message = new ProducerRecord[String, String](kafkaTopicName, line.hashCode.toString, line)

Example 141
Source File: RNG.scala    From Scalaprof   with GNU General Public License v2.0 5 votes vote down vote up

import java.util.Random

/**
 * A source of values of type `A`: `value` is the current element and `next` yields the
 * following generator state.
 *
 * @tparam A the type of value produced (covariant)
 */
trait RNG[+A] {
  // The generator state that follows this one.
  def next: RNG[A]
  // The value carried by this state.
  def value: A

/**
 * Base class for generators whose state is a Long seed `n`; successive states are
 * produced by feeding `nextSeed` to `newRNG`.
 *
 * @param n the seed that constitutes this generator's state
 */
abstract class RNG_Java[+A](n: Long) extends RNG[A] { 
  // must be overridden by sub-classes
  def value: A
  def newRNG(n: Long): RNG[A]
  // may be overridden (if you want to define your own pseudo-random sequence)
  def nextSeed: Long = RNG_Java.nextSeed(n)
  // base method -- not normally overridden
  def next: RNG[A] = newRNG(nextSeed)
  // Exposes the seed held by this state.
  def state = n

object RNG_Java {
  /** Derives the successor seed: the first `nextLong` drawn from a `java.util.Random` seeded with `n`. */
  def nextSeed(n: Long): Long = {
    val generator = new Random(n)
    generator.nextLong()
  }

/** Generator whose value is the seed itself. */
case class LongRNG(n: Long) extends RNG_Java[Long](n) {
  def newRNG(n: Long): RNG[Long] = LongRNG(n) 
  def value = n 

/** Generator whose value is the seed divided by `Long.MaxValue`, i.e. roughly within [-1, 1]. */
case class DoubleRNG(n: Long) extends RNG_Java[Double](n) {
  def newRNG(n: Long) = DoubleRNG(n) 
  def value = n.toDouble/Long.MaxValue
  override def toString = s"DoubleRNG: $n->$value"

/**
 * Value class wrapping a Double `x`; the companion's `create` restricts `x` to 0..1.
 *
 * NOTE(review): the body of `compare` and the closing brace are missing from this listing.
 */
case class UniformDouble(x: Double) extends AnyVal with Ordered[UniformDouble] {
    // Adds a plain Double to the wrapped value; the result is a Double, not a UniformDouble.
    def + (y: Double) = x + y
    def compare(that: UniformDouble): Int =

/** Companion of DoubleRNG providing a time-seeded factory. */
object DoubleRNG {
  // Parameterless factory: seeds a DoubleRNG with the current time in milliseconds.
  def apply: RNG[Double] = DoubleRNG(System.currentTimeMillis())

/**
 * Factory for a time-seeded UniformDoubleRNG.
 *
 * NOTE(review): the UniformDoubleRNG class itself is not visible in this listing.
 */
object UniformDoubleRNG {
  def apply: RNG[UniformDouble] = UniformDoubleRNG(System.currentTimeMillis())
  // NOTE(review): the right-hand side is the `Unit` companion object rather than the
  // unit value `()` -- presumably `()` was intended; confirm.
  implicit val u: Unit = Unit

/**
 * Factory for a time-seeded GaussianRNG producing pairs of Doubles.
 *
 * NOTE(review): the GaussianRNG class itself is not visible in this listing.
 */
object GaussianRNG {
  def apply: RNG[(Double,Double)] = GaussianRNG(System.currentTimeMillis())

/** Companion of UniformDouble with a range-checked constructor and a reversed-operand `+`. */
object UniformDouble {
  // Builds a UniformDouble when 0 <= x <= 1, otherwise throws IllegalArgumentException.
  // The implicit Unit parameter must be in scope at the call site.
  def create(x: Double)(implicit y: Unit) = if (x>=0 && x<=1) new UniformDouble(x) else throw new IllegalArgumentException(s"$x is not in range 0..1")
  // Addition with the Double on the left; delegates to UniformDouble.+.
  def + (x: Double, y: UniformDouble) = y+x
Example 142
Source File: RNG.scala    From Scalaprof   with GNU General Public License v2.0 5 votes vote down vote up
package edu.neu.coe.csye._7200

package rng

import java.util.Random

/**
 * A source of values of type `A`: `value` is the current element and `next` yields the
 * following generator state.
 *
 * @tparam A the type of value produced (covariant)
 */
trait RNG[+A] {
  // The generator state that follows this one.
  def next: RNG[A]
  // The value carried by this state.
  def value: A

/**
 * Base class for generators whose state is a Long seed `n`; successive states are
 * produced by feeding `nextSeed` to `newRNG`. Here `next` and `newRNG` are typed as
 * `RNG_Java[A]` rather than `RNG[A]`.
 *
 * @param n the seed that constitutes this generator's state
 */
abstract class RNG_Java[+A](n: Long) extends RNG[A] { 
  // must be overridden by sub-classes
  def value: A
  def newRNG(n: Long): RNG_Java[A]
  // may be overridden (if you want to define your own pseudo-random sequence)
  def nextSeed: Long = RNG_Java.nextSeed(n)
  // base method -- not normally overridden
  def next: RNG_Java[A] = newRNG(nextSeed)
  // Exposes the seed held by this state.
  def state = n

object RNG_Java {
  /** Derives the successor seed: the first `nextLong` drawn from a `java.util.Random` seeded with `n`. */
  def nextSeed(n: Long): Long = {
    val generator = new Random(n)
    generator.nextLong()
  }

/** Long-valued generator; both members are `???` stubs that throw NotImplementedError when called. */
case class LongRNG(n: Long) extends RNG_Java[Long](n) {
  def newRNG(n: Long) = ???
  def value = ???

/**
 * Double-valued generator; `newRNG` and `value` are `???` stubs that throw
 * NotImplementedError when called (so `toString`, which reads `value`, throws as well).
 */
case class DoubleRNG(n: Long) extends RNG_Java[Double](n) {
  def newRNG(n: Long) = ???
  def value = ???
  override def toString = s"DoubleRNG: $n->$value"

/** Wrapper around a Double `x`; the companion's `apply(x, y)` restricts `x` to 0..1. */
case class UniformDouble(x: Double) {
    // Adds a plain Double to the wrapped value; the result is a Double, not a UniformDouble.
    def + (y: Double) = x + y

/**
 * Factory for a time-seeded UniformDoubleRNG.
 *
 * NOTE(review): the UniformDoubleRNG class itself is not visible in this listing.
 */
object UniformDoubleRNG {
  def apply: RNG[UniformDouble] = UniformDoubleRNG(System.currentTimeMillis())

/**
 * Factory for a time-seeded GaussianRNG producing pairs of Doubles.
 *
 * NOTE(review): the GaussianRNG class itself is not visible in this listing.
 */
object GaussianRNG {
  def apply: RNG[(Double,Double)] = GaussianRNG(System.currentTimeMillis())

/** Companion of UniformDouble with a range-checked factory and a reversed-operand `+`. */
object UniformDouble {
  // Builds a UniformDouble when 0 <= x <= 1, otherwise throws IllegalArgumentException.
  // The extra Unit parameter distinguishes this overload from the case-class apply(x).
  def apply(x: Double, y: Unit): UniformDouble = if (x>=0 && x<=1) new UniformDouble(x) else throw new IllegalArgumentException(s"$x is not in range 0..1")
  // Addition with the Double on the left; delegates to UniformDouble.+.
  def + (x: Double, y: UniformDouble) = y+x
Example 143
Source File: ProbabilityDistributionTest.scala    From ScalphaGoZero   with Apache License 2.0 5 votes vote down vote up
package org.deeplearning4j.scalphagozero.agents

import java.util.Random
import org.scalatest.funspec.AnyFunSpec

/**
 * Spec for ProbabilityDistribution.selectRandomIdx. Every distribution is built by
 * `createPDist` with a `Random` seeded with 1, and each test pins exact indices --
 * presumably the values that seed produces; confirm against ProbabilityDistribution.
 *
 * NOTE(review): the closing braces of the `it` and `describe` blocks are missing from
 * this listing.
 */
class ProbabilityDistributionTest extends AnyFunSpec {

  describe("Select from a distribution") {

    it("should be low index if distribution skewed low") {
      val dist = createPDist(Array(0.9, 0.8, 0.5, 0.3, 0.2, 0.1, 0.01, 0.001))
      // Three consecutive draws from the same distribution instance.
      assert(dist.selectRandomIdx() == 2)
      assert(dist.selectRandomIdx() == 1)
      assert(dist.selectRandomIdx() == 0)

    it("should be high index if distribution skewed high") {
      val dist = createPDist(Array(0.001, 0.01, 0.1, 0.3, 0.8, 0.5, 0.8, 0.9))
      assert(dist.selectRandomIdx() == 6)

    it("should be highest index if distribution skewed very high") {
      val dist = createPDist(Array(0.001, 0.01, 0.01, 0.01, 0.01, 0.1, 0.9))
      assert(dist.selectRandomIdx() == 6)

    it("should be near middle if gaussian distribution") {
      val dist = createPDist(Array(0.001, 0.01, 0.1, 0.3, 0.6, 0.8, 0.9, 0.9, 0.8, 0.55, 0.4, 0.2, 0.05, 0.01))
      assert(dist.selectRandomIdx() == 8)

    it("random if uniform distribution") {
      val dist = createPDist(Array(0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2))
      assert(dist.selectRandomIdx() == 5)

    it("should be 0 index if distribution has only 1 0 value") {
      val dist = createPDist(Array(0.0))
      assert(dist.selectRandomIdx() == 0)

  // Builds the distribution under test with a freshly seeded Random(1) on every call.
  private def createPDist(a: Array[Double]) = ProbabilityDistribution(a, new Random(1))
Example 144
Source File: SparkHdfsLR.scala    From learning-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{Vector, DenseVector}
import org.apache.hadoop.conf.Configuration

import org.apache.spark._
import org.apache.spark.scheduler.InputFormatInfo

/**
 * Logistic-regression example that reads its points from a text file through Spark.
 *
 * `main` reads the input path from args(0) and the iteration count from args(1),
 * initializes `w` with D values in [-1, 1) and subtracts a gradient from `w` on each
 * iteration, printing the initial and final `w`.
 *
 * NOTE(review): this listing is truncated -- closing braces, the body and terminator of
 * the warning string, the expressions that build `points` and `gradient`, and parts of
 * the gradient formula are missing. Restore from the upstream source before compiling.
 */
object SparkHdfsLR {
  val D = 10   // Number of dimensions
  // Fixed seed used for the initial weights drawn in main.
  val rand = new Random(42)

  // One sample: feature vector x and label y.
  case class DataPoint(x: Vector[Double], y: Double)

  // Parses a space-separated line: the first token is the label y and the next D
  // tokens fill the feature array x.
  def parsePoint(line: String): DataPoint = {
    val tok = new java.util.StringTokenizer(line, " ")
    var y = tok.nextToken.toDouble
    var x = new Array[Double](D)
    var i = 0
    while (i < D) {
      x(i) = tok.nextToken.toDouble; i += 1
    DataPoint(new DenseVector(x), y)

  def showWarning() {
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |for more conventional use.

  def main(args: Array[String]) {

    if (args.length < 2) {
      System.err.println("Usage: SparkHdfsLR <file> <iters>")


    val sparkConf = new SparkConf().setAppName("SparkHdfsLR")
    val inputPath = args(0)
    val conf = new Configuration()
    val sc = new SparkContext(sparkConf,
        Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath))
    val lines = sc.textFile(inputPath)
    val points = _).cache()
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = { p =>
        p.x * (1 / (1 + exp(-p.y * ( - 1) * p.y
      }.reduce(_ + _)
      w -= gradient

    println("Final w: " + w)
Example 145
Source File: SparkLR.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{Vector, DenseVector}

import org.apache.spark._

/**
 * Logistic-regression example over synthetic data distributed with Spark.
 *
 * `main` parallelizes `generateData` into `numSlices` partitions (args(0), default 2),
 * initializes `w` with D values in [-1, 1) and subtracts a gradient from `w` on each
 * of the ITERATIONS iterations, printing the initial and final `w`.
 *
 * NOTE(review): this listing is truncated -- closing braces, the array construction in
 * `generateData`, the body and terminator of the warning string, and parts of the
 * gradient expression are missing. Restore from the upstream source before compiling.
 */
object SparkLR {
  val N = 10000  // Number of data points
  val D = 10   // Number of dimensions
  val R = 0.7  // Scaling factor
  val ITERATIONS = 5
  // Fixed seed shared by data generation and weight initialization.
  val rand = new Random(42)

  // One sample: feature vector x and label y.
  case class DataPoint(x: Vector[Double], y: Double)

  def generateData: Array[DataPoint] = {
    // Even indices get label -1, odd indices +1; each of the D features is a
    // Gaussian draw shifted by y * R.
    def generatePoint(i: Int): DataPoint = {
      val y = if (i % 2 == 0) -1 else 1
      val x = DenseVector.fill(D){rand.nextGaussian + y * R}
      DataPoint(x, y)

  def showWarning() {
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |for more conventional use.

  def main(args: Array[String]) {


    val sparkConf = new SparkConf().setAppName("SparkLR")
    val sc = new SparkContext(sparkConf)
    val numSlices = if (args.length > 0) args(0).toInt else 2
    val points = sc.parallelize(generateData, numSlices).cache()

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = { p =>
        p.x * (1 / (1 + exp(-p.y * ( - 1) * p.y
      }.reduce(_ + _)
      w -= gradient

    println("Final w: " + w)

Source File: LocalKMeans.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import scala.collection.mutable.HashMap
import scala.collection.mutable.HashSet

import breeze.linalg.{Vector, DenseVector, squaredDistance}

import org.apache.spark.SparkContext._

/**
 * Single-process K-means example over randomly generated points.
 *
 * `main` iterates while the summed squared movement of the centers (`tempDist`)
 * exceeds `convergeDist`: points are assigned to their closest center, per-center sums
 * and counts are reduced, new centers are the averages, and `kPoints` is updated.
 *
 * NOTE(review): this listing is truncated -- closing braces, the array construction in
 * `generateData`, the return of `bestIndex`, the warning string terminator, the
 * center-initialization loops and the receivers of several `map` calls are missing.
 * Restore from the upstream source before compiling.
 */
object LocalKMeans {
  val N = 1000
  val R = 1000    // Scaling factor
  val D = 10
  val K = 10
  // Loop threshold compared against tempDist in main.
  val convergeDist = 0.001
  val rand = new Random(42)

  def generateData: Array[DenseVector[Double]] = {
    // Each point has D coordinates drawn uniformly from [0, R).
    def generatePoint(i: Int): DenseVector[Double] = {
      DenseVector.fill(D){rand.nextDouble * R}

  // Scans center keys 1..centers.size and tracks the key with the smallest squared
  // distance to p. NOTE(review): `index` is never used in the visible lines.
  def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = {
    var index = 0
    var bestIndex = 0
    var closest = Double.PositiveInfinity

    for (i <- 1 to centers.size) {
      val vCurr = centers.get(i).get
      val tempDist = squaredDistance(p, vCurr)
      if (tempDist < closest) {
        closest = tempDist
        bestIndex = i


  def showWarning() {
      """WARN: This is a naive implementation of KMeans Clustering and is given as an example!
        |Please use the KMeans method found in org.apache.spark.mllib.clustering
        |for more conventional use.

  def main(args: Array[String]) {


    val data = generateData
    var points = new HashSet[Vector[Double]]
    var kPoints = new HashMap[Int, Vector[Double]]
    var tempDist = 1.0

    while (points.size < K) {

    val iter = points.iterator
    for (i <- 1 to points.size) {

    println("Initial centers: " + kPoints)

    while(tempDist > convergeDist) {
      var closest = (p => (closestPoint(p, kPoints), (p, 1)))

      var mappings = closest.groupBy[Int] (x => x._1)

      var pointStats = { pair =>
        pair._2.reduceLeft [(Int, (Vector[Double], Int))] {
          case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2))

      var newPoints = {mapping =>
        (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))}

      tempDist = 0.0
      for (mapping <- newPoints) {
        tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2)

      for (newP <- newPoints) {
        kPoints.put(newP._1, newP._2)

    println("Final centers: " + kPoints)
Source File: StopwatchSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up

import java.util.Random

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

/**
 * Tests for LocalStopwatch, DistributedStopwatch and MultiStopwatch.
 *
 * NOTE(review): this listing is truncated -- closing braces, the bodies of the
 * `intercept` blocks, the calls to `testStopwatchOnDriver` and the left-hand side of
 * the first assertion are missing. Restore from the upstream source before compiling.
 */
class StopwatchSuite extends SparkFunSuite with MLlibTestSparkContext {

  import StopwatchSuite._

  // Shared driver-side checks: elapsed time starts at 0 and accumulates the durations
  // reported by checkStopwatch across two runs; toString has the form "sw: <elapsed>ms".
  private def testStopwatchOnDriver(sw: Stopwatch): Unit = {
    assert( === "sw")
    assert(sw.elapsed() === 0L)
    intercept[AssertionError] {
    val duration = checkStopwatch(sw)
    val elapsed = sw.elapsed()
    assert(elapsed === duration)
    val duration2 = checkStopwatch(sw)
    val elapsed2 = sw.elapsed()
    assert(elapsed2 === duration + duration2)
    assert(sw.toString === s"sw: ${elapsed2}ms")
    intercept[AssertionError] {

  test("LocalStopwatch") {
    val sw = new LocalStopwatch("sw")

  test("DistributedStopwatch on driver") {
    val sw = new DistributedStopwatch(sc, "sw")

  test("DistributedStopwatch on executors") {
    val sw = new DistributedStopwatch(sc, "sw")
    val rdd = sc.parallelize(0 until 4, 4)
    // Sums the durations measured inside the tasks and compares with sw.elapsed().
    val acc = sc.accumulator(0L)
    rdd.foreach { i =>
      acc += checkStopwatch(sw)
    val elapsed = sw.elapsed()
    assert(elapsed === acc.value)

  test("MultiStopwatch") {
    val sw = new MultiStopwatch(sc)
    assert(sw("local").name === "local")
    assert(sw("spark").name === "spark")
    intercept[NoSuchElementException] {
    assert(sw.toString === "{\n  local: 0ms,\n  spark: 0ms\n}")
    val localDuration = checkStopwatch(sw("local"))
    val sparkDuration = checkStopwatch(sw("spark"))
    val localElapsed = sw("local").elapsed()
    val sparkElapsed = sw("spark").elapsed()
    assert(localElapsed === localDuration)
    assert(sparkElapsed === sparkDuration)
    assert(sw.toString ===
      s"{\n  local: ${localElapsed}ms,\n  spark: ${sparkElapsed}ms\n}")
    val rdd = sc.parallelize(0 until 4, 4)
    val acc = sc.accumulator(0L)
    rdd.foreach { i =>
      val duration = checkStopwatch(sw("spark"))
      acc += duration
    // Only the "spark" stopwatch ran inside the tasks: "local" must be unchanged and
    // "spark" must have grown by the accumulated task durations.
    val localElapsed2 = sw("local").elapsed()
    assert(localElapsed2 === localElapsed)
    val sparkElapsed2 = sw("spark").elapsed()
    assert(sparkElapsed2 === sparkElapsed + acc.value)

/**
 * Helpers shared by the StopwatchSuite tests.
 *
 * NOTE(review): `checkStopwatch`, which the suite calls, is not visible in this
 * listing; the object appears truncated.
 */
private object StopwatchSuite extends SparkFunSuite {

  // Current wall-clock time in milliseconds.
  private def now: Long = System.currentTimeMillis()
Example 148
Source File: SampledRDD.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.PoissonDistribution

import org.apache.spark.{Partition, TaskContext}

/**
 * Partition of a SampledRDD: wraps the parent partition `prev` and carries the seed
 * used when sampling it; the index is taken from `prev`.
 */
@deprecated("Replaced by PartitionwiseSampledRDDPartition", "1.0.0")
class SampledRDDPartition(val prev: Partition, val seed: Int) extends Partition with Serializable {
  override val index: Int = prev.index

/**
 * RDD that samples the elements of `prev`, either with replacement (a Poisson(frac)
 * count per element) or without (keeping an element when a uniform draw is <= frac).
 *
 * NOTE(review): this listing is truncated -- closing braces, the mapping over the
 * parent partitions, the body of `getPreferredLocations` and the non-zero-count branch
 * are missing. Restore from the upstream source before compiling.
 *
 * @param prev            parent RDD
 * @param withReplacement selects the Poisson-count branch when true
 * @param frac            sampling fraction / Poisson mean
 * @param seed            seed of the Random that produces one seed per partition
 */
@deprecated("Replaced by PartitionwiseSampledRDD", "1.0.0")
private[spark] class SampledRDD[T: ClassTag](
    prev: RDD[T],
    withReplacement: Boolean,
    frac: Double,
    seed: Int)
  extends RDD[T](prev) {

  // Derives each partition's seed from a single Random seeded with `seed`.
  override def getPartitions: Array[Partition] = {
    val rg = new Random(seed)
    firstParent[T] => new SampledRDDPartition(x, rg.nextInt))

  override def getPreferredLocations(split: Partition): Seq[String] =

  override def compute(splitIn: Partition, context: TaskContext): Iterator[T] = {
    val split = splitIn.asInstanceOf[SampledRDDPartition]
    if (withReplacement) {
      // For large datasets, the expected number of occurrences of each element in a sample with
      // replacement is Poisson(frac). We use that to get a count for each element.
      val poisson = new PoissonDistribution(frac)

      firstParent[T].iterator(split.prev, context).flatMap { element =>
        val count = poisson.sample()
        if (count == 0) {
          Iterator.empty  // Avoid object allocation when we return 0 items, which is quite often
        } else {
    } else { // Sampling without replacement
      val rand = new Random(split.seed)
      firstParent[T].iterator(split.prev, context).filter(x => (rand.nextDouble <= frac))
Example 149
Source File: PartitionwiseSampledRDD.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.random.RandomSampler
import org.apache.spark.util.Utils

/**
 * Partition of a PartitionwiseSampledRDD: wraps the parent partition `prev` and carries
 * a per-partition Long seed; the index is taken from `prev`.
 */
class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index

/**
 * RDD that applies a clone of `sampler` to each partition of `prev`.
 *
 * NOTE(review): this listing is truncated -- closing braces, the mapping over the
 * parent partitions, the body of `getPreferredLocations` and (presumably) the call that
 * seeds `thisSampler` with `split.seed` are missing. Restore from the upstream source.
 *
 * @param prev                  parent RDD
 * @param sampler               sampler cloned once per computed partition
 * @param preservesPartitioning when true the parent's partitioner is kept, else None
 * @param seed                  seed of the Random that produces one seed per partition
 */
private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[T],
    sampler: RandomSampler[T, U],
    preservesPartitioning: Boolean,
    @transient private val seed: Long = Utils.random.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  // Derives each partition's seed from a single Random seeded with `seed`.
  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T] => new PartitionwiseSampledRDDPartition(x, random.nextLong()))

  override def getPreferredLocations(split: Partition): Seq[String] =

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    // Work on a clone so the shared sampler instance is not used directly.
    val thisSampler = sampler.clone
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
Source File: SimpleSkewedGroupByTest.scala    From learning-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._

/**
 * Spark example that generates key/value pairs with a skew towards keys that are
 * multiples of `numReducers` and counts the groups produced by `groupByKey`.
 *
 * NOTE(review): this listing is truncated -- closing braces, the value returned from
 * the flatMap closure and the caching/count step are missing. Restore from the
 * upstream source before compiling.
 */
object SimpleSkewedGroupByTest {
  def main(args: Array[String]) {

    val sparkConf = new SparkConf().setAppName("SimpleSkewedGroupByTest")
    // Positional arguments with defaults: mappers, pairs per mapper, value size,
    // reducers, skew ratio.
    var numMappers = if (args.length > 0) args(0).toInt else 2
    var numKVPairs = if (args.length > 1) args(1).toInt else 1000
    var valSize = if (args.length > 2) args(2).toInt else 1000
    var numReducers = if (args.length > 3) args(3).toInt else numMappers
    // The Int branch is widened to Double because the default is 5.0.
    var ratio = if (args.length > 4) args(4).toInt else 5.0

    val sc = new SparkContext(sparkConf)

    val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random
      var result = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        // A multiple of numReducers, so `offset % numReducers == 0`.
        val offset = ranGen.nextInt(1000) * numReducers
        if (ranGen.nextDouble < ratio / (numReducers + ratio - 1)) {
          // give ratio times higher chance of generating key 0 (for reducer 0)
          result(i) = (offset, byteArr)
        } else {
          // generate a key for one of the other reducers
          val key = 1 + ranGen.nextInt(numReducers-1) + offset
          result(i) = (key, byteArr)
    // Enforce that everything has been calculated and in cache

    println("RESULT: " + pairs1.groupByKey(numReducers).count)
    // Print how many keys each reducer got (for debugging)
    // println("RESULT: " + pairs1.groupByKey(numReducers)
    //                           .map{case (k,v) => (k, v.size)}
Source File: SparkTachyonHdfsLR.scala    From learning-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{Vector, DenseVector}
import org.apache.hadoop.conf.Configuration

import org.apache.spark._
import org.apache.spark.scheduler.InputFormatInfo

/**
 * Logistic-regression example that reads its points from a text file through Spark and
 * persists them with StorageLevel.OFF_HEAP.
 *
 * `parsePoint` reads a space-separated line: the first token is the label y and the
 * next D tokens fill the feature array x. `main` reads the input path from args(0) and
 * the iteration count from args(1), initializes `w` with D values in [-1, 1) and
 * subtracts a gradient from `w` on each iteration, printing the initial and final `w`.
 *
 * NOTE(review): this listing is truncated -- closing braces, the body and terminator of
 * the warning string, the expressions that build `points` and `gradient`, and parts of
 * the gradient formula are missing. Restore from the upstream source before compiling.
 */
object SparkTachyonHdfsLR {
  val D = 10   // Number of dimensions
  // Fixed seed used for the initial weights drawn in main.
  val rand = new Random(42)

  def showWarning() {
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |for more conventional use.

  case class DataPoint(x: Vector[Double], y: Double)

  def parsePoint(line: String): DataPoint = {
    val tok = new java.util.StringTokenizer(line, " ")
    var y = tok.nextToken.toDouble
    var x = new Array[Double](D)
    var i = 0
    while (i < D) {
      x(i) = tok.nextToken.toDouble; i += 1
    DataPoint(new DenseVector(x), y)

  def main(args: Array[String]) {


    val inputPath = args(0)
    val sparkConf = new SparkConf().setAppName("SparkTachyonHdfsLR")
    val conf = new Configuration()
    val sc = new SparkContext(sparkConf,
        Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath))
    val lines = sc.textFile(inputPath)
    val points = _).persist(StorageLevel.OFF_HEAP)
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = { p =>
        p.x * (1 / (1 + exp(-p.y * ( - 1) * p.y
      }.reduce(_ + _)
      w -= gradient

    println("Final w: " + w)
Example 152
Source File: SkewedGroupByTest.scala    From learning-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._

/**
 * Spark example that generates random-keyed byte arrays with a per-mapper output size
 * that grows with the mapper index.
 *
 * NOTE(review): this listing is truncated -- closing braces, the value returned from
 * the flatMap closure and the caching/grouping step are missing. Restore from the
 * upstream source before compiling.
 */
object SkewedGroupByTest {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("GroupBy Test")
    // Positional arguments with defaults: mappers, pairs per mapper, value size, reducers.
    var numMappers = if (args.length > 0) args(0).toInt else 2
    var numKVPairs = if (args.length > 1) args(1).toInt else 1000
    var valSize = if (args.length > 2) args(2).toInt else 1000
    var numReducers = if (args.length > 3) args(3).toInt else numMappers

    val sc = new SparkContext(sparkConf)

    val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random

      // map output sizes linearly increase from the 1st to the last
      // NOTE(review): this reassigns the `var` declared in main from inside the closure.
      numKVPairs = (1.0 * (p + 1) / numMappers * numKVPairs).toInt

      var arr1 = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
    // Enforce that everything has been calculated and in cache


Source File: LocalFileLR.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import breeze.linalg.{Vector, DenseVector}

/**
 * Single-process logistic-regression example that reads its points from a file.
 *
 * `main` reads the iteration count from args(1), initializes `w` with D values in
 * [-1, 1), and on each iteration accumulates `p.x * scale` over all points into a
 * gradient that is subtracted from `w`; the initial and final `w` are printed.
 *
 * NOTE(review): this listing is truncated -- closing braces, the body and terminator of
 * the warning string, the expressions that build `lines` and `points`, and parts of the
 * `scale` formula are missing. Restore from the upstream source before compiling.
 */
object LocalFileLR {
  val D = 10   // Number of dimensions
  // Fixed seed used for the initial weights drawn in main.
  val rand = new Random(42)

  // One sample: feature vector x and label y.
  case class DataPoint(x: Vector[Double], y: Double)

  // Parses a space-separated line of doubles: element 0 is the label y and
  // elements 1..D form the feature vector x.
  def parsePoint(line: String): DataPoint = {
    val nums = line.split(' ').map(_.toDouble)
    DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0))

  def showWarning() {
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |for more conventional use.

  def main(args: Array[String]) {


    val lines =
    val points = _)
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      var gradient = DenseVector.zeros[Double](D)
      for (p <- points) {
        val scale = (1 / (1 + math.exp(-p.y * ( - 1) * p.y
        gradient += p.x * scale
      w -= gradient

    println("Final w: " + w)
Source File: LocalLR.scala    From learning-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import breeze.linalg.{Vector, DenseVector}

/**
 * Single-process logistic-regression example over synthetic data.
 *
 * `main` generates the data, initializes `w` with D values in [-1, 1), and on each of
 * the ITERATIONS iterations accumulates `p.x * scale` over all points into a gradient
 * that is subtracted from `w`; the initial and final `w` are printed.
 *
 * NOTE(review): this listing is truncated -- closing braces, the array construction in
 * `generateData`, the body and terminator of the warning string, and parts of the
 * `scale` formula are missing. Restore from the upstream source before compiling.
 */
object LocalLR {
  val N = 10000  // Number of data points
  val D = 10   // Number of dimensions
  val R = 0.7  // Scaling factor
  val ITERATIONS = 5
  // Fixed seed shared by data generation and weight initialization.
  val rand = new Random(42)

  // One sample: feature vector x and label y.
  case class DataPoint(x: Vector[Double], y: Double)

  def generateData = {
    // Even indices get label -1, odd indices +1; each of the D features is a
    // Gaussian draw shifted by y * R.
    def generatePoint(i: Int) = {
      val y = if(i % 2 == 0) -1 else 1
      val x = DenseVector.fill(D){rand.nextGaussian + y * R}
      DataPoint(x, y)

  def showWarning() {
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |for more conventional use.

  def main(args: Array[String]) {


    val data = generateData
    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      var gradient = DenseVector.zeros[Double](D)
      for (p <- data) {
        val scale = (1 / (1 + math.exp(-p.y * ( - 1) * p.y
        gradient +=  p.x * scale
      w -= gradient

    println("Final w: " + w)
Source File: GroupByTest.scala    From learning-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._

/**
 * Spark example that generates random key/value pairs on several mappers and
 * counts the groups produced by `groupByKey`.
 *
 * Usage: GroupByTest [numMappers] [numKVPairs] [valSize] [numReducers]
 */
object GroupByTest {
  /**
   * @param args optional positional arguments: number of mappers (default 2), pairs per
   *             mapper (default 1000), value size in bytes (default 1000) and number of
   *             reducers (defaults to the number of mappers).
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("GroupBy Test")
    val numMappers = if (args.length > 0) args(0).toInt else 2
    val numKVPairs = if (args.length > 1) args(1).toInt else 1000
    val valSize = if (args.length > 2) args(2).toInt else 1000
    val numReducers = if (args.length > 3) args(3).toInt else numMappers

    val sc = new SparkContext(sparkConf)

    val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random
      val arr1 = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        ranGen.nextBytes(byteArr)
        arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
      }
      arr1
    }.cache()
    // Enforce that everything has been calculated and in cache
    pairs1.count()

    println(pairs1.groupByKey(numReducers).count())

    sc.stop()
  }
}

Source File: LocalFileLR.scala    From learning-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import breeze.linalg.{Vector, DenseVector}

/**
 * Logistic-regression classification on points read from a local text file.
 *
 * This is a teaching example of batch gradient descent, not a production implementation.
 */
object LocalFileLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  /** A labelled sample: feature vector `x` and label `y`. */
  case class DataPoint(x: Vector[Double], y: Double)

  /**
   * Parses one space-separated line of numbers: the first number is the label and
   * the next `D` numbers are the features.
   *
   * @throws NumberFormatException if a token is not a valid double
   */
  def parsePoint(line: String): DataPoint = {
    val nums = line.split(' ').map(_.toDouble)
    DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0))
  }

  /** Prints a notice on stderr pointing users at the MLlib implementations. */
  def showWarning(): Unit = {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
        |for more conventional use.
      """.stripMargin)
  }

  /**
   * Entry point: fits a weight vector by batch gradient descent.
   *
   * @param args `args(0)` is the input file path; `args(1)` is the iteration count.
   */
  def main(args: Array[String]): Unit = {

    showWarning()

    // Read everything eagerly so the underlying file handle can be released right away.
    val source = scala.io.Source.fromFile(args(0))
    val lines = try source.getLines().toArray finally source.close()
    val points = lines.map(parsePoint)
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      var gradient = DenseVector.zeros[Double](D)
      for (p <- points) {
        // Per-point logistic-loss gradient factor: (sigmoid(y * w.x) - 1) * y
        val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y
        gradient += p.x * scale
      }
      w -= gradient
    }

    println("Final w: " + w)
  }
}
Source File: SparkLR.scala    From learning-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{Vector, DenseVector}

import org.apache.spark._

/**
 * Logistic-regression classification on synthetic data, with the gradient computed
 * over a cached Spark RDD.
 *
 * Usage: SparkLR [slices]
 *
 * This is a teaching example of batch gradient descent, not a production implementation.
 */
object SparkLR {
  val N = 10000  // Number of data points
  val D = 10   // Number of dimensions
  val R = 0.7  // Scaling factor
  val ITERATIONS = 5
  val rand = new Random(42)

  /** A labelled sample: feature vector `x` and label `y`. */
  case class DataPoint(x: Vector[Double], y: Double)

  /**
   * Generates `N` synthetic points. Labels alternate between -1 and +1 and every
   * feature is Gaussian noise shifted by `y * R`.
   */
  def generateData: Array[DataPoint] = {
    def generatePoint(i: Int): DataPoint = {
      val y = if (i % 2 == 0) -1 else 1
      val x = DenseVector.fill(D){rand.nextGaussian + y * R}
      DataPoint(x, y)
    }
    Array.tabulate(N)(generatePoint)
  }

  /** Prints a notice on stderr pointing users at the MLlib implementations. */
  def showWarning(): Unit = {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
        |for more conventional use.
      """.stripMargin)
  }

  /**
   * Runs `ITERATIONS` rounds of gradient descent over the distributed data set.
   *
   * @param args `args(0)`, if present, is the number of slices (partitions); default 2.
   */
  def main(args: Array[String]): Unit = {

    showWarning()

    val sparkConf = new SparkConf().setAppName("SparkLR")
    val sc = new SparkContext(sparkConf)
    val numSlices = if (args.length > 0) args(0).toInt else 2
    val points = sc.parallelize(generateData, numSlices).cache()

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      // Each point contributes x * (sigmoid(y * w.x) - 1) * y; contributions are summed.
      val gradient = points.map { p =>
        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
      }.reduce(_ + _)
      w -= gradient
    }

    println("Final w: " + w)

    sc.stop()
  }
}

Source File: LocalKMeans.scala    From learning-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import scala.collection.mutable.HashMap
import scala.collection.mutable.HashSet

import breeze.linalg.{Vector, DenseVector, squaredDistance}

import org.apache.spark.SparkContext._

/**
 * K-means clustering on locally generated random points.
 *
 * This is a teaching example, not a production implementation.
 */
object LocalKMeans {
  val N = 1000
  val R = 1000    // Scaling factor
  val D = 10
  val K = 10
  val convergeDist = 0.001
  val rand = new Random(42)

  /** Generates `N` random `D`-dimensional points with coordinates in `[0, R)`. */
  def generateData: Array[DenseVector[Double]] = {
    def generatePoint(i: Int): DenseVector[Double] = {
      DenseVector.fill(D){rand.nextDouble * R}
    }
    Array.tabulate(N)(generatePoint)
  }

  /**
   * Returns the key of the center nearest to `p` by squared Euclidean distance.
   *
   * @param p       the point to classify
   * @param centers current centers, keyed `1..centers.size`
   * @return the key of the closest center, or 0 when `centers` is empty
   */
  def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = {
    var bestIndex = 0
    var closest = Double.PositiveInfinity

    for (i <- 1 to centers.size) {
      val vCurr = centers(i)
      val tempDist = squaredDistance(p, vCurr)
      if (tempDist < closest) {
        closest = tempDist
        bestIndex = i
      }
    }

    bestIndex
  }

  /** Prints a notice on stderr pointing users at the MLlib implementation. */
  def showWarning(): Unit = {
    System.err.println(
      """WARN: This is a naive implementation of KMeans Clustering and is given as an example!
        |Please use the KMeans method found in org.apache.spark.mllib.clustering
        |for more conventional use.
      """.stripMargin)
  }

  /**
   * Picks `K` distinct random points as initial centers, then repeats the
   * assign/recompute cycle until the centers move less than `convergeDist` in total.
   *
   * @param args unused
   */
  def main(args: Array[String]): Unit = {

    showWarning()

    val data = generateData
    val points = new HashSet[Vector[Double]]
    val kPoints = new HashMap[Int, Vector[Double]]
    var tempDist = 1.0

    // The set deduplicates, so this loops until K distinct points were drawn.
    while (points.size < K) {
      points.add(data(rand.nextInt(N)))
    }

    val iter = points.iterator
    for (i <- 1 to points.size) {
      kPoints.put(i, iter.next())
    }

    println("Initial centers: " + kPoints)

    while (tempDist > convergeDist) {
      // (centerId, (point, 1)) for every data point
      val closest = data.map(p => (closestPoint(p, kPoints), (p, 1)))

      val mappings = closest.groupBy[Int](x => x._1)

      // Per center: the sum of its assigned points and how many there are.
      val pointStats = mappings.map { pair =>
        pair._2.reduceLeft[(Int, (Vector[Double], Int))] {
          case ((id1, (x1, y1)), (id2, (x2, y2))) => (id1, (x1 + x2, y1 + y2))
        }
      }

      // New center = mean of the assigned points.
      val newPoints = pointStats.map { mapping =>
        (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))
      }

      tempDist = 0.0
      for (mapping <- newPoints) {
        tempDist += squaredDistance(kPoints(mapping._1), mapping._2)
      }

      for (newP <- newPoints) {
        kPoints.put(newP._1, newP._2)
      }
    }

    println("Final centers: " + kPoints)
  }
}
Source File: GroupByKey.scala    From learning-spark   with Apache License 2.0 5 votes vote down vote up
package com.javachen.spark.examples.rdd

import java.util.Random

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._

// Spark example that builds an RDD of (Int, Array[Byte]) pairs on `numMappers`
// partitions and groups it by key into `numReducers` partitions.
// NOTE(review): this listing looks truncated. The `for` loop, the `flatMap` closure,
// `main` and the object are never closed, and nothing is returned from the closure.
// Restore the missing lines before compiling.
object GroupByKey {
  def main(args: Array[String]) {
    // Runs with a local master using two threads.
    val sparkConf = new SparkConf().setAppName("GroupBy Test").setMaster("local[2]")
    // Fixed workload sizes; none of these is reassigned in the visible code.
    var numMappers = 10
    var numKVPairs = 100
    var valSize = 100
    var numReducers = 3

    val sc = new SparkContext(sparkConf)

    val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random
      var arr1 = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        // Keys are drawn from 0..9, so many pairs share a key.
        arr1(i) = (ranGen.nextInt(10), byteArr)
    // Enforce that everything has been calculated and in cache

    val result = pairs1.groupByKey(numReducers)

Example 160
Source File: UserRepositoryInMemoryInterpreter.scala    From scala-pet-store   with Apache License 2.0 5 votes vote down vote up
package io.github.pauljamescleary.petstore
package infrastructure.repository.inmemory

import java.util.Random

import cats.implicits._
import cats.Applicative
import domain.users.{User, UserRepositoryAlgebra}
import tsec.authentication.IdentityStore

import scala.collection.concurrent.TrieMap

// In-memory user repository backed by a concurrent TrieMap keyed by user id.
// NOTE(review): this listing looks truncated. `create`, `update` and `deleteByUserName`
// are not closed, `get` and `delete` have no body, and no import of `OptionT` is
// visible. Restore the missing lines before compiling.
class UserRepositoryInMemoryInterpreter[F[_]: Applicative]
    extends UserRepositoryAlgebra[F]
    with IdentityStore[F, Long, User] {
  private val cache = new TrieMap[Long, User]

  private val random = new Random

  // Assigns a random Long id, stores a copy of the user carrying that id.
  // NOTE(review): the expression producing the F[User] result is missing.
  def create(user: User): F[User] = {
    val id = random.nextLong
    val toSave = user.copy(id = id.some)
    cache += (id -> toSave)

  // Overwrites the cached entry for an id.
  // NOTE(review): the source of `id` and the result expression are missing.
  def update(user: User): OptionT[F, User] = OptionT { { id =>
      cache.update(id, user)

  // NOTE(review): body missing.
  def get(id: Long): OptionT[F, User] =

  // NOTE(review): body missing.
  def delete(id: Long): OptionT[F, User] =

  // Linear scan over the cached users for the first one with a matching user name.
  def findByUserName(userName: String): OptionT[F, User] =
    OptionT.fromOption(cache.values.find(u => u.userName == userName))

  // Returns one page of users ordered by last name; `offset` is an element index.
  def list(pageSize: Int, offset: Int): F[List[User]] =
    cache.values.toList.sortBy(_.lastName).slice(offset, offset + pageSize).pure[F]

  // Finds a user by name and removes it from the cache.
  // NOTE(review): the argument of `cache.remove(` and the enclosing call are missing.
  def deleteByUserName(userName: String): OptionT[F, User] =
      for {
        user <- cache.values.find(u => u.userName == userName)
        removed <- cache.remove(
      } yield removed,

/** Companion providing a factory for [[UserRepositoryInMemoryInterpreter]]. */
object UserRepositoryInMemoryInterpreter {

  /** Creates an empty in-memory repository for the effect type `F`. */
  def apply[F[_]: Applicative](): UserRepositoryInMemoryInterpreter[F] =
    new UserRepositoryInMemoryInterpreter[F]
}
Example 161
Source File: PigFuncs.scala    From piglet   with Apache License 2.0 5 votes vote down vote up
package dbis.piglet.backends.flink

import java.util.Random

import dbis.piglet.CommonPigFuncs
import dbis.piglet.backends._
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.scala._

import scala.reflect.ClassTag

/**
 * Adds a `sample` operation to a Flink [[DataSet]].
 *
 * @param dataSet the data set to sample from
 */
class CustomSampler[T <: SchemaClass: ClassTag: TypeInformation](dataSet: DataSet[T]) {

  /**
   * Samples the data set partition by partition using `SampleWithFraction`.
   *
   * @param withReplacement passed through to `SampleWithFraction`
   * @param fraction        passed through to `SampleWithFraction`
   * @param seed            random seed; defaults to a fresh random Long per call
   */
  def sample(withReplacement: Boolean, fraction: Double, seed: Long = new Random().nextLong()) = {
    dataSet.mapPartition(new SampleWithFraction[T](withReplacement, fraction, seed))
  }
}


/** Implicit enrichment that makes `sample` available on any suitable [[DataSet]]. */
object Sampler {

  /** Wraps `dataSet` in a [[CustomSampler]]. */
  implicit def addSampler[T <: SchemaClass: ClassTag: TypeInformation](dataSet: DataSet[T]): CustomSampler[T] = {
    new CustomSampler(dataSet)
  }
}

// Flink-backend object extending CommonPigFuncs.
// NOTE(review): only the header is present in this listing; the body and the closing
// brace are missing.
object PigFuncs extends CommonPigFuncs {
Example 162
Source File: StreamingPredictionsSpec.scala    From odsc-east-realish-predictions   with Apache License 2.0 4 votes vote down vote up

import java.sql.Timestamp
import java.time.Instant
import java.util.{Random, UUID}

import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoders, SQLContext, SparkSession}
import org.scalatest.{FunSuite, Matchers}
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.{OutputMode, Trigger}

import scala.concurrent.duration._

// Test suite that feeds generated Metric rows through a MemoryStream and aggregates
// them with a windowed, watermarked streaming query.
// NOTE(review): this listing looks truncated. Several definitions are unterminated,
// some expressions have no right-hand side, and two config keys are empty strings.
// Restore the missing lines before compiling.
class StreamingPredictionsSpec extends FunSuite with Matchers with SharedSparkSql {

  // Spark configuration for the suite: UI disabled, session time zone UTC.
  // NOTE(review): two `.set` calls have empty keys; the key names appear to be lost.
  override def conf: SparkConf = {
    new SparkConf()
      .set("spark.ui.enabled", "false")
      .set("", appID)
      .set("", "localhost")
      .set("spark.sql.session.timeZone", "UTC")

  // NOTE(review): the remainder of this initializer (seeding and result) is missing.
  final val notRandomRandom = {
    val generator = new Random

  test("should stream in some mock data for fun") {
    implicit val spark: SparkSession = sparkSql
    import spark.implicits._
    implicit val sqlContext: SQLContext = spark.sqlContext

    implicit val metricEncoder = Encoders.product[Metric]
    val metricData = MemoryStream[Metric]

    // NOTE(review): right-hand side missing.
    val startingInstant =

    // Builds 10000 records alternating between the two metric names by offset parity.
    // NOTE(review): the constructor call wrapping the named arguments below is missing.
    val backingData = (1 to 10000).map(offset => {
      val metric = if (offset % 2 == 0) "loss_percentage" else "connect_duration"
      val nextLoss = notRandomRandom.nextDouble() * notRandomRandom.nextInt(100)
        value = if (metric == "loss_percentage") nextLoss else notRandomRandom.nextDouble() * notRandomRandom.nextInt(240),
        countryCode = if (offset % 8 == 0) "US" else "BR",
        callDirection = if (metric == "loss_percentage") "inbound" else "outbound"
    val processingTimeTrigger = Trigger.ProcessingTime(2.seconds)

    // Groups by metric, country code and a 5-minute window on `timestamp`.
    // NOTE(review): the aggregation call around min/avg/max/count and the sink
    // configuration are missing.
    val streamingQuery = metricData.toDF()
      .withWatermark("timestamp", "2 hours")
      .groupBy(col("metric"), col("countryCode"), window($"timestamp", "5 minutes"))
        min("value") as "min",
        avg("value") as "mean",
        max("value") as "max",
        count("*") as "total"



    // Shows up to 20 rows of the `datastream` table without truncating columns.
    spark.sql("select * from datastream").show(20, false)

    // NOTE(review): the grouping/aggregation call around sum/avg is missing.
    val checkChange = spark.sql("select * from datastream")
        sum("total") as "total",
        avg("mean") as "mean"
      ), false)

    // now can do interesting things with minor back tracking...


