java.util.Random Scala Examples

The following examples show how to use java.util.Random in Scala. Each example is taken from an open-source project; the source file, project name, and license are noted above the code.
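Before the project examples, here is a minimal, self-contained sketch of the java.util.Random calls that recur throughout them: bounded ints, doubles, Gaussians, and byte arrays. The seed value and the printed output are arbitrary.

import java.util.Random

object RandomBasics {
  def main(args: Array[String]): Unit = {
    val seeded = new Random(42L)   // seeded: same sequence on every run
    val rnd = new Random()         // unseeded: seeded from system entropy

    val i = seeded.nextInt(10)     // uniform Int in [0, 10)
    val d = seeded.nextDouble()    // uniform Double in [0.0, 1.0)
    val g = seeded.nextGaussian()  // standard normal (mean 0, stddev 1)
    val bytes = new Array[Byte](16)
    seeded.nextBytes(bytes)        // fill the array with random bytes

    println(s"int=$i double=$d gaussian=$g firstByte=${bytes(0)}")
    println(s"unseeded int=${rnd.nextInt(100)}")
  }
}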
Example 1
Source File: IntegrationTest.scala    From kmq   with Apache License 2.0
package com.softwaremill.kmq.redelivery

import java.time.Duration
import java.util.Random

import akka.actor.ActorSystem
import akka.kafka.scaladsl.{Consumer, Producer}
import akka.kafka.{ConsumerSettings, ProducerMessage, ProducerSettings, Subscriptions}
import akka.stream.ActorMaterializer
import akka.testkit.TestKit
import com.softwaremill.kmq._
import com.softwaremill.kmq.redelivery.infrastructure.KafkaSpec
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.clients.producer.{ProducerConfig, ProducerRecord}
import org.apache.kafka.common.serialization.StringDeserializer
import org.scalatest.concurrent.Eventually
import org.scalatest.time.{Seconds, Span}
import org.scalatest.{BeforeAndAfterAll, FlatSpecLike, Matchers}

import scala.collection.mutable.ArrayBuffer

class IntegrationTest extends TestKit(ActorSystem("test-system")) with FlatSpecLike with KafkaSpec with BeforeAndAfterAll with Eventually with Matchers {

  implicit val materializer = ActorMaterializer()
  import system.dispatcher

  "KMQ" should "resend message if not committed" in {
    val bootstrapServer = s"localhost:${testKafkaConfig.kafkaPort}"
    val kmqConfig = new KmqConfig("queue", "markers", "kmq_client", "kmq_redelivery", Duration.ofSeconds(1).toMillis,
      1000)

    val consumerSettings = ConsumerSettings(system, new StringDeserializer, new StringDeserializer)
      .withBootstrapServers(bootstrapServer)
      .withGroupId(kmqConfig.getMsgConsumerGroupId)
      .withProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest")

    val markerProducerSettings = ProducerSettings(system,
      new MarkerKey.MarkerKeySerializer(), new MarkerValue.MarkerValueSerializer())
      .withBootstrapServers(bootstrapServer)
      .withProperty(ProducerConfig.PARTITIONER_CLASS_CONFIG, classOf[ParititionFromMarkerKey].getName)
    val markerProducer = markerProducerSettings.createKafkaProducer()

    val random = new Random()

    lazy val processedMessages = ArrayBuffer[String]()
    lazy val receivedMessages = ArrayBuffer[String]()

    val control = Consumer.committableSource(consumerSettings, Subscriptions.topics(kmqConfig.getMsgTopic)) // 1. get messages from topic
      .map { msg =>
        ProducerMessage.Message(
          new ProducerRecord[MarkerKey, MarkerValue](kmqConfig.getMarkerTopic, MarkerKey.fromRecord(msg.record), new StartMarker(kmqConfig.getMsgTimeoutMs)), msg)
      }
      .via(Producer.flow(markerProducerSettings, markerProducer)) // 2. write the "start" marker
      .map(_.message.passThrough)
      .mapAsync(1) { msg =>
        msg.committableOffset.commitScaladsl().map(_ => msg.record) // 3. commit the offset (this should be batched)
      }
      .map { msg =>
        receivedMessages += msg.value
        msg
      }
      .filter(_ => random.nextInt(5) != 0) // 4. "process" the message; ~1 in 5 is dropped so no end marker is written and it is redelivered
      .map { processedMessage =>
        processedMessages += processedMessage.value
        new ProducerRecord[MarkerKey, MarkerValue](kmqConfig.getMarkerTopic, MarkerKey.fromRecord(processedMessage), EndMarker.INSTANCE)
      }
      .to(Producer.plainSink(markerProducerSettings, markerProducer)) // 5. write "end" markers
      .run()

    val redeliveryHook = RedeliveryTracker.start(new KafkaClients(bootstrapServer), kmqConfig)

    val messages = (0 to 20).map(_.toString)
    messages.foreach(msg => sendToKafka(kmqConfig.getMsgTopic,msg))

    eventually {
      receivedMessages.size should be > processedMessages.size
      processedMessages.sortBy(_.toInt).distinct shouldBe messages
    }(PatienceConfig(timeout = Span(15, Seconds)), implicitly)

    redeliveryHook.close()
    control.shutdown()
  }

  override def afterAll(): Unit = {
    super.afterAll()
    TestKit.shutdownActorSystem(system)
  }
} 
Example 2
Source File: package.scala    From iotchain   with MIT License
package jbok

import java.nio.charset.StandardCharsets
import java.util.Random

import jbok.crypto.hash._
import scodec.bits.ByteVector
import jbok.crypto.signature.SignatureInstances

trait StringSyntax {
  implicit final def stringSyntax(a: String): StringOps = new StringOps(a)
}

final class StringOps(val a : String) extends AnyVal {
  def utf8bytes: ByteVector = ByteVector(a.getBytes(StandardCharsets.UTF_8))
}

trait CryptoSyntax extends CryptoHasherSyntax with StringSyntax
trait CryptoInstances extends CryptoHasherInstances with SignatureInstances

package object crypto extends CryptoSyntax with CryptoInstances {
  def randomByteString(random: Random, length: Int): ByteVector =
    ByteVector(randomByteArray(random, length))

  def randomByteArray(random: Random, length: Int): Array[Byte] = {
    val bytes = Array.ofDim[Byte](length)
    random.nextBytes(bytes)
    bytes
  }
} 
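A brief usage sketch of the two helpers defined above, assuming the jbok crypto package object and scodec-bits are on the classpath:

import java.util.Random
import jbok.crypto._

object RandomBytesDemo {
  def main(args: Array[String]): Unit = {
    val rnd = new Random()
    val arr = randomByteArray(rnd, 16)   // 16 random bytes as an Array[Byte]
    val bv  = randomByteString(rnd, 16)  // the same idea wrapped in a scodec ByteVector
    println(s"array length=${arr.length}, byteVector size=${bv.size}")
  }
}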
Example 3
Source File: SignaturePlatform.scala    From iotchain   with MIT License
package jbok.crypto.signature

import java.math.BigInteger
import java.util.Random

import cats.effect.Sync
import jbok.crypto.facade.{BN, EC, SignatureEC}

import scala.scalajs.js.JSConverters._
import scala.scalajs.js.typedarray.Uint8Array

trait SignaturePlatform {
  val ecdsa: Signature[ECDSA] = ECDSAPlatform
}

private object ECDSAPlatform extends Signature[ECDSA] {
  import ECDSACommon._
  val secp256k1 = new EC("secp256k1")

  override def generateKeyPair[F[_]](random: Option[Random])(implicit F: Sync[F]): F[KeyPair] = F.delay {
    val keyPair = secp256k1.genKeyPair()
    val secret  = KeyPair.Secret(keyPair.getPrivate("hex"))
    // drop uncompressed indicator, make it 64-bytes
    val pubkey = KeyPair.Public(keyPair.getPublic(false, "hex").drop(2))
    KeyPair(pubkey, secret)
  }

  override def generatePublicKey[F[_]](secret: KeyPair.Secret)(implicit F: Sync[F]): F[KeyPair.Public] = F.delay {
    val keyPair = secp256k1.keyFromPrivate(secret.bytes.toHex, "hex")
    // drop uncompressed indicator, make it 64-bytes
    KeyPair.Public(keyPair.getPublic(false, "hex").drop(2))
  }

  override def sign[F[_]](hash: Array[Byte], keyPair: KeyPair, chainId: BigInt)(implicit F: Sync[F]): F[CryptoSignature] = F.delay {
    val kp  = secp256k1.keyFromPrivate(keyPair.secret.bytes.toHex, "hex")
    val sig = secp256k1.sign(new Uint8Array(hash.toJSArray), kp)
    val r   = new BigInteger(sig.r.toString)
    val s   = new BigInteger(sig.s.toString)
    val pointSign = calculatePointSign(r, toCanonicalS(s), keyPair, hash, chainId) match {
      case Some(recId) => recId
      case None        => throw new Exception("unexpected error")
    }
    val rid: BigInt = getRecoveryId(chainId, pointSign).getOrElse(pointSign)
    CryptoSignature(r, toCanonicalS(s), rid)
  }

  override def verify[F[_]](hash: Array[Byte], sig: CryptoSignature, public: KeyPair.Public, chainId: BigInt)(implicit F: Sync[F]): F[Boolean] = F.delay {
    getPointSign(chainId, sig.v).exists { bigInt =>
      val signatureEC = convert(sig.copy(v = bigInt))
      val key         = secp256k1.keyFromPublic(UNCOMPRESSED_INDICATOR_STRING + public.bytes.toHex, "hex")
      secp256k1.verify(new Uint8Array(hash.toJSArray), signatureEC, key)
    }
  }

  override def recoverPublic(hash: Array[Byte], sig: CryptoSignature, chainId: BigInt): Option[KeyPair.Public] =
    getPointSign(chainId, sig.v).map { bigInt =>
      val signatureEC = convert(sig.copy(v = bigInt))
      val msg         = new Uint8Array(hash.toJSArray)
      val recId       = secp256k1.getKeyRecoveryParam(msg, signatureEC)
      val point       = secp256k1.recoverPubKey(new Uint8Array(hash.toJSArray), signatureEC, recId)
      KeyPair.Public(point.encode("hex", false).drop(2))
    }

  private def convert(sig: CryptoSignature) = {
    val r = new BN(sig.r.toString(16), 16)
    val s = new BN(sig.s.toString(16), 16)
    SignatureEC(r, s, recoveryParam = (sig.v - NEGATIVE_POINT_SIGN).toInt)
  }

  private def calculatePointSign(r: BigInt, s: BigInt, keyPair: KeyPair, hash: Array[Byte], chainId: BigInt): Option[BigInt] =
    allowedPointSigns.find(
      v =>
        recoverPublic(hash, CryptoSignature(r, s, getRecoveryId(chainId, v).getOrElse(v)), chainId)
          .contains(keyPair.public))
} 
Example 4
Source File: LoggerSimulation.scala    From BigData-News   with Apache License 2.0
package com.vita.spark.utils

import java.io.PrintWriter
import java.net.ServerSocket

class LoggerSimulation {

}

object LoggerSimulation {

  var numIndex = 0

  /**
    * Generates a single uppercase letter.
    *
    * @param index the index of the letter (0 = 'A')
    * @return the generated letter
    */
  def gennerateContent(index: Int): String = {
    import scala.collection.mutable.ListBuffer
    val charList = ListBuffer[Char]();
    for (i <- 65 to 90) {
      charList += i.toChar
    }
    val charArray = charList.toArray
    charArray(index).toString();
  }

  def gennerateNumber(): String = {
    //    numIndex += 1
    //    return numIndex.toString
    return "a,b,c,d,e,f"
  }

  /**
    * Generates a random index.
    *
    * @return a random index
    */
  def index = {
    import java.util.Random
    val rdm = new Random()
    rdm.nextInt(7)
  }

  /**
    * Main entry point: creates a ServerSocket and sends messages to connected clients.
    *
    * @param args the port and the send interval in milliseconds
    */
  def main(args: Array[String]): Unit = {
    if (args.length != 2) {
      System.err.println("Usage:<port><millisecond>")
      System.exit(1);
    }

    val listener = new ServerSocket(args(0).toInt)
    println("已经做好连接的准备-------")
    while (true) {
      val socket = listener.accept()
      new Thread() {
        override def run(): Unit = {
          println("Got client connected from:" + socket.getInetAddress)
          val out = new PrintWriter(socket.getOutputStream, true)
          while (true) {
            Thread.sleep(args(1).toLong)
            //            val content = gennerateContent(index)
            val content = gennerateNumber()
            println(content)
            out.write(content + "\n")
            out.flush()
          }
          socket.close()
        }
      }.start()
    }
  }
} 
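The commented-out path above pairs the random index from rdm.nextInt(7) with gennerateContent to emit a random capital letter. A standalone sketch of that pattern, using the full A to Z range of 26 letters rather than the bound of 7 used above:

import java.util.Random

object RandomLetter {
  private val rnd = new Random()

  // pick a random uppercase letter in A..Z
  def randomLetter(): Char = ('A' + rnd.nextInt(26)).toChar

  def main(args: Array[String]): Unit =
    println((1 to 10).map(_ => randomLetter()).mkString(" "))
}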
Example 5
Source File: SimpleSkewedGroupByTest.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.sql.SparkSession


object SimpleSkewedGroupByTest {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("SimpleSkewedGroupByTest")
      .getOrCreate()

    val numMappers = if (args.length > 0) args(0).toInt else 2
    val numKVPairs = if (args.length > 1) args(1).toInt else 1000
    val valSize = if (args.length > 2) args(2).toInt else 1000
    val numReducers = if (args.length > 3) args(3).toInt else numMappers
    val ratio = if (args.length > 4) args(4).toInt else 5.0

    val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random
      val result = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        ranGen.nextBytes(byteArr)
        val offset = ranGen.nextInt(1000) * numReducers
        if (ranGen.nextDouble < ratio / (numReducers + ratio - 1)) {
          // give ratio times higher chance of generating key 0 (for reducer 0)
          result(i) = (offset, byteArr)
        } else {
          // generate a key for one of the other reducers
          val key = 1 + ranGen.nextInt(numReducers-1) + offset
          result(i) = (key, byteArr)
        }
      }
      result
    }.cache
    // Enforce that everything has been calculated and in cache
    pairs1.count

    println("RESULT: " + pairs1.groupByKey(numReducers).count)
    // Print how many keys each reducer got (for debugging)
    // println("RESULT: " + pairs1.groupByKey(numReducers)
    //                           .map{case (k,v) => (k, v.size)}
    //                           .collectAsMap)

    spark.stop()
  }
}
// scalastyle:on println 
Example 6
Source File: SkewedGroupByTest.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.sql.SparkSession


object SkewedGroupByTest {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("GroupBy Test")
      .getOrCreate()

    val numMappers = if (args.length > 0) args(0).toInt else 2
    var numKVPairs = if (args.length > 1) args(1).toInt else 1000
    val valSize = if (args.length > 2) args(2).toInt else 1000
    val numReducers = if (args.length > 3) args(3).toInt else numMappers

    val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random

      // map output sizes linearly increase from the 1st to the last
      numKVPairs = (1.0 * (p + 1) / numMappers * numKVPairs).toInt

      val arr1 = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        ranGen.nextBytes(byteArr)
        arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
      }
      arr1
    }.cache()
    // Enforce that everything has been calculated and in cache
    pairs1.count()

    println(pairs1.groupByKey(numReducers).count())

    spark.stop()
  }
}
// scalastyle:on println 
Example 7
Source File: SparkHdfsLR.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{DenseVector, Vector}

import org.apache.spark.sql.SparkSession


object SparkHdfsLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def parsePoint(line: String): DataPoint = {
    val tok = new java.util.StringTokenizer(line, " ")
    var y = tok.nextToken.toDouble
    var x = new Array[Double](D)
    var i = 0
    while (i < D) {
      x(i) = tok.nextToken.toDouble; i += 1
    }
    DataPoint(new DenseVector(x), y)
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use org.apache.spark.ml.classification.LogisticRegression
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    if (args.length < 2) {
      System.err.println("Usage: SparkHdfsLR <file> <iters>")
      System.exit(1)
    }

    showWarning()

    val spark = SparkSession
      .builder
      .appName("SparkHdfsLR")
      .getOrCreate()

    val inputPath = args(0)
    val lines = spark.read.textFile(inputPath).rdd

    val points = lines.map(parsePoint).cache()
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D) {2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = points.map { p =>
        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
      }.reduce(_ + _)
      w -= gradient
    }

    println("Final w: " + w)
    spark.stop()
  }
}
// scalastyle:on println 
Example 8
Source File: LocalLR.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import breeze.linalg.{DenseVector, Vector}


object LocalLR {
  val N = 10000  // Number of data points
  val D = 10   // Number of dimensions
  val R = 0.7  // Scaling factor
  val ITERATIONS = 5
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def generateData: Array[DataPoint] = {
    def generatePoint(i: Int): DataPoint = {
      val y = if (i % 2 == 0) -1 else 1
      val x = DenseVector.fill(D) {rand.nextGaussian + y * R}
      DataPoint(x, y)
    }
    Array.tabulate(N)(generatePoint)
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use org.apache.spark.ml.classification.LogisticRegression
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    val data = generateData
    // Initialize w to a random value
    var w = DenseVector.fill(D) {2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      var gradient = DenseVector.zeros[Double](D)
      for (p <- data) {
        val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y
        gradient +=  p.x * scale
      }
      w -= gradient
    }

    println("Final w: " + w)
  }
}
// scalastyle:on println 
Example 9
Source File: GroupByTest.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.sql.SparkSession


object GroupByTest {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("GroupBy Test")
      .getOrCreate()

    val numMappers = if (args.length > 0) args(0).toInt else 2
    val numKVPairs = if (args.length > 1) args(1).toInt else 1000
    val valSize = if (args.length > 2) args(2).toInt else 1000
    val numReducers = if (args.length > 3) args(3).toInt else numMappers

    val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random
      val arr1 = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        ranGen.nextBytes(byteArr)
        arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
      }
      arr1
    }.cache()
    // Enforce that everything has been calculated and in cache
    pairs1.count()

    println(pairs1.groupByKey(numReducers).count())

    spark.stop()
  }
}
// scalastyle:on println 
Example 10
Source File: LocalFileLR.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import breeze.linalg.{DenseVector, Vector}


object LocalFileLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def parsePoint(line: String): DataPoint = {
    val nums = line.split(' ').map(_.toDouble)
    DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0))
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use org.apache.spark.ml.classification.LogisticRegression
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    val lines = scala.io.Source.fromFile(args(0)).getLines().toArray
    val points = lines.map(parsePoint _)
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D) {2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      var gradient = DenseVector.zeros[Double](D)
      for (p <- points) {
        val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y
        gradient += p.x * scale
      }
      w -= gradient
    }

    println("Final w: " + w)
  }
}
// scalastyle:on println 
Example 11
Source File: PageViewGenerator.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.streaming.clickstream

import java.io.PrintWriter
import java.net.ServerSocket
import java.util.Random


// scalastyle:on
object PageViewGenerator {
  val pages = Map("http://foo.com/" -> .7,
                  "http://foo.com/news" -> 0.2,
                  "http://foo.com/contact" -> .1)
  val httpStatus = Map(200 -> .95,
                       404 -> .05)
  val userZipCode = Map(94709 -> .5,
                        94117 -> .5)
  val userID = Map((1 to 100).map(_ -> .01): _*)

  def pickFromDistribution[T](inputMap: Map[T, Double]): T = {
    val rand = new Random().nextDouble()
    var total = 0.0
    for ((item, prob) <- inputMap) {
      total = total + prob
      if (total > rand) {
        return item
      }
    }
    inputMap.take(1).head._1 // Shouldn't get here if probabilities add up to 1.0
  }

  def getNextClickEvent(): String = {
    val id = pickFromDistribution(userID)
    val page = pickFromDistribution(pages)
    val status = pickFromDistribution(httpStatus)
    val zipCode = pickFromDistribution(userZipCode)
    new PageView(page, status, zipCode, id).toString()
  }

  def main(args: Array[String]) {
    if (args.length != 2) {
      System.err.println("Usage: PageViewGenerator <port> <viewsPerSecond>")
      System.exit(1)
    }
    val port = args(0).toInt
    val viewsPerSecond = args(1).toFloat
    val sleepDelayMs = (1000.0 / viewsPerSecond).toInt
    val listener = new ServerSocket(port)
    println("Listening on port: " + port)

    while (true) {
      val socket = listener.accept()
      new Thread() {
        override def run(): Unit = {
          println("Got client connected from: " + socket.getInetAddress)
          val out = new PrintWriter(socket.getOutputStream(), true)

          while (true) {
            Thread.sleep(sleepDelayMs)
            out.write(getNextClickEvent())
            out.flush()
          }
          socket.close()
        }
      }.start()
    }
  }
}
// scalastyle:on println 
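pickFromDistribution above samples from a discrete distribution with a single nextDouble() draw, walking the cumulative probabilities until the running total exceeds the draw. A minimal sketch of the same technique with a hypothetical weight map (the weights here do not have to sum to 1 because the draw is scaled by their sum):

import java.util.Random

object WeightedPick {
  private val rnd = new Random()

  // sample a key with probability proportional to its weight
  def pick[T](weights: Map[T, Double]): T = {
    val r = rnd.nextDouble() * weights.values.sum
    var total = 0.0
    for ((item, w) <- weights) {
      total += w
      if (total > r) return item
    }
    weights.head._1 // fallback for floating-point rounding at the upper edge
  }

  def main(args: Array[String]): Unit = {
    val draws = (1 to 10000).map(_ => pick(Map("home" -> 0.7, "news" -> 0.2, "contact" -> 0.1)))
    val counts = draws.groupBy(identity).map { case (k, v) => k -> v.size }
    println(counts) // roughly 7000 home, 2000 news, 1000 contact
  }
}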
Example 12
Source File: SparkLR.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{DenseVector, Vector}

import org.apache.spark.sql.SparkSession


object SparkLR {
  val N = 10000  // Number of data points
  val D = 10   // Number of dimensions
  val R = 0.7  // Scaling factor
  val ITERATIONS = 5
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def generateData: Array[DataPoint] = {
    def generatePoint(i: Int): DataPoint = {
      val y = if (i % 2 == 0) -1 else 1
      val x = DenseVector.fill(D) {rand.nextGaussian + y * R}
      DataPoint(x, y)
    }
    Array.tabulate(N)(generatePoint)
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use org.apache.spark.ml.classification.LogisticRegression
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    val spark = SparkSession
      .builder
      .appName("SparkLR")
      .getOrCreate()

    val numSlices = if (args.length > 0) args(0).toInt else 2
    val points = spark.sparkContext.parallelize(generateData, numSlices).cache()

    // Initialize w to a random value
    var w = DenseVector.fill(D) {2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = points.map { p =>
        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
      }.reduce(_ + _)
      w -= gradient
    }

    println("Final w: " + w)

    spark.stop()
  }
}
// scalastyle:on println 
Example 13
Source File: LocalKMeans.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.collection.mutable.HashMap
import scala.collection.mutable.HashSet

import breeze.linalg.{squaredDistance, DenseVector, Vector}


object LocalKMeans {
  val N = 1000
  val R = 1000    // Scaling factor
  val D = 10
  val K = 10
  val convergeDist = 0.001
  val rand = new Random(42)

  def generateData: Array[DenseVector[Double]] = {
    def generatePoint(i: Int): DenseVector[Double] = {
      DenseVector.fill(D) {rand.nextDouble * R}
    }
    Array.tabulate(N)(generatePoint)
  }

  def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = {
    var index = 0
    var bestIndex = 0
    var closest = Double.PositiveInfinity

    for (i <- 1 to centers.size) {
      val vCurr = centers.get(i).get
      val tempDist = squaredDistance(p, vCurr)
      if (tempDist < closest) {
        closest = tempDist
        bestIndex = i
      }
    }

    bestIndex
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of KMeans Clustering and is given as an example!
        |Please use org.apache.spark.ml.clustering.KMeans
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    val data = generateData
    var points = new HashSet[Vector[Double]]
    var kPoints = new HashMap[Int, Vector[Double]]
    var tempDist = 1.0

    while (points.size < K) {
      points.add(data(rand.nextInt(N)))
    }

    val iter = points.iterator
    for (i <- 1 to points.size) {
      kPoints.put(i, iter.next())
    }

    println("Initial centers: " + kPoints)

    while(tempDist > convergeDist) {
      var closest = data.map (p => (closestPoint(p, kPoints), (p, 1)))

      var mappings = closest.groupBy[Int] (x => x._1)

      var pointStats = mappings.map { pair =>
        pair._2.reduceLeft [(Int, (Vector[Double], Int))] {
          case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2))
        }
      }

      var newPoints = pointStats.map {mapping =>
        (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))}

      tempDist = 0.0
      for (mapping <- newPoints) {
        tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2)
      }

      for (newP <- newPoints) {
        kPoints.put(newP._1, newP._2)
      }
    }

    println("Final centers: " + kPoints)
  }
}
// scalastyle:on println 
Example 14
Source File: StopwatchSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml.util

import java.util.Random

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class StopwatchSuite extends SparkFunSuite with MLlibTestSparkContext {

  import StopwatchSuite._

  private def testStopwatchOnDriver(sw: Stopwatch): Unit = {
    assert(sw.name === "sw")
    assert(sw.elapsed() === 0L)
    assert(!sw.isRunning)
    intercept[AssertionError] {
      sw.stop()
    }
    val duration = checkStopwatch(sw)
    val elapsed = sw.elapsed()
    assert(elapsed === duration)
    val duration2 = checkStopwatch(sw)
    val elapsed2 = sw.elapsed()
    assert(elapsed2 === duration + duration2)
    assert(sw.toString === s"sw: ${elapsed2}ms")
    sw.start()
    assert(sw.isRunning)
    intercept[AssertionError] {
      sw.start()
    }
  }

  test("LocalStopwatch") {
    val sw = new LocalStopwatch("sw")
    testStopwatchOnDriver(sw)
  }

  test("DistributedStopwatch on driver") {
    val sw = new DistributedStopwatch(sc, "sw")
    testStopwatchOnDriver(sw)
  }

  test("DistributedStopwatch on executors") {
    val sw = new DistributedStopwatch(sc, "sw")
    val rdd = sc.parallelize(0 until 4, 4)
    val acc = sc.longAccumulator
    rdd.foreach { i =>
      acc.add(checkStopwatch(sw))
    }
    assert(!sw.isRunning)
    val elapsed = sw.elapsed()
    assert(elapsed === acc.value)
  }

  test("MultiStopwatch") {
    val sw = new MultiStopwatch(sc)
      .addLocal("local")
      .addDistributed("spark")
    assert(sw("local").name === "local")
    assert(sw("spark").name === "spark")
    intercept[NoSuchElementException] {
      sw("some")
    }
    assert(sw.toString === "{\n  local: 0ms,\n  spark: 0ms\n}")
    val localDuration = checkStopwatch(sw("local"))
    val sparkDuration = checkStopwatch(sw("spark"))
    val localElapsed = sw("local").elapsed()
    val sparkElapsed = sw("spark").elapsed()
    assert(localElapsed === localDuration)
    assert(sparkElapsed === sparkDuration)
    assert(sw.toString ===
      s"{\n  local: ${localElapsed}ms,\n  spark: ${sparkElapsed}ms\n}")
    val rdd = sc.parallelize(0 until 4, 4)
    val acc = sc.longAccumulator
    rdd.foreach { i =>
      sw("local").start()
      val duration = checkStopwatch(sw("spark"))
      sw("local").stop()
      acc.add(duration)
    }
    val localElapsed2 = sw("local").elapsed()
    assert(localElapsed2 === localElapsed)
    val sparkElapsed2 = sw("spark").elapsed()
    assert(sparkElapsed2 === sparkElapsed + acc.value)
  }
}

private object StopwatchSuite extends SparkFunSuite {

  
  // checkStopwatch is referenced by the tests above; this body is a minimal
  // reconstruction: run a task of random length (under 10ms) on the stopwatch
  // and return the duration it reports, checking it against lower/upper bounds.
  def checkStopwatch(sw: Stopwatch): Long = {
    val ubStart = now
    sw.start()
    val lbStart = now
    Thread.sleep(new Random().nextInt(10))
    val lb = now - lbStart
    val ub = now - ubStart
    val duration = sw.stop()
    assert(duration >= lb && duration <= ub)
    duration
  }

  private def now: Long = System.currentTimeMillis()
} 
Example 15
Source File: PartitionwiseSampledRDD.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.random.RandomSampler
import org.apache.spark.util.Utils

private[spark]
class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}


private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[T],
    sampler: RandomSampler[T, U],
    preservesPartitioning: Boolean,
    @transient private val seed: Long = Utils.random.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong()))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.setSeed(split.seed)
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
  }
} 
Example 16
Source File: CsvKafkaPublisher.scala    From Taxi360   with Apache License 2.0
package com.hadooparchitecturebook.taxi360.common

import java.io.File
import java.util.Random

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

import scala.io.Source

object CsvKafkaPublisher {

  var counter = 0
  var salts = 0

  def main(args:Array[String]): Unit = {
    if (args.length == 0) {
      println("<brokerList> " +
        "<topicName> " +
        "<dataFolderOrFile> " +
        "<sleepPerRecord> " +
        "<acks> " +
        "<linger.ms> " +
        "<producer.type> " +
        "<batch.size> " +
        "<salts>")
      return
    }

    val kafkaBrokerList = args(0)
    val kafkaTopicName = args(1)
    val nyTaxiDataFolder = args(2)
    val sleepPerRecord = args(3).toInt
    val acks = args(4).toInt
    val lingerMs = args(5).toInt
    val producerType = args(6) //"async"
    val batchSize = args(7).toInt
    salts = args(8).toInt

    val kafkaProducer = KafkaProducerUntil.getNewProducer(kafkaBrokerList, acks, lingerMs, producerType, batchSize)

    println("--Input:" + nyTaxiDataFolder)

    val dataFolder = new File(nyTaxiDataFolder)
    if (dataFolder.isDirectory) {
      val files = dataFolder.listFiles().iterator
      files.foreach(f => {
        println("--Input:" + f)
        processFile(f, kafkaTopicName, kafkaProducer, sleepPerRecord)
      })
    } else {
      println("--Input:" + dataFolder)
      processFile(dataFolder, kafkaTopicName, kafkaProducer, sleepPerRecord)
    }
    println("---Done")
  }

  def processFile(file:File, kafkaTopicName:String,
                  kafkaProducer: KafkaProducer[String, String], sleepPerRecord:Int): Unit = {
    var counter = 0
    val r = new Random()

    println("-Starting Reading")
    Source.fromFile(file).getLines().foreach(l => {
      counter += 1
      if (counter % 10000 == 0) {
        println("{Sent:" + counter + "}")
      }
      if (counter % 100 == 0) {
        print(".")
      }
      Thread.sleep(sleepPerRecord)

      val saltedVender = r.nextInt(salts) + l

      if (counter > 2) {
        publishTaxiRecord(saltedVender, kafkaTopicName, kafkaProducer)
      }
    })
  }

  def publishTaxiRecord(line:String, kafkaTopicName:String, kafkaProducer: KafkaProducer[String, String]): Unit = {

    if (line.startsWith("vendor_name") || line.length < 10) {
      println("skip")
    } else {
      val message = new ProducerRecord[String, String](kafkaTopicName, line.hashCode.toString, line)
      kafkaProducer.send(message)
    }
  }


} 
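The publisher above prepends r.nextInt(salts) to each CSV line before sending it, so that records with identical content no longer share a key hash and are spread over several Kafka partitions. A minimal sketch of that salting idea in isolation (the salt count and the record are made up):

import java.util.Random

object KeySalting {
  def main(args: Array[String]): Unit = {
    val r = new Random()
    val salts = 4
    val line = "VTS,2013-01-01 00:00:00,1,2.5"  // a made-up taxi-style CSV record

    // the random prefix changes the key's hash, so the same record can now
    // land on up to `salts` different partitions instead of one hot partition
    val salted = r.nextInt(salts) + line
    println(salted)
  }
}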
Example 17
Source File: ExtremeSummarizerSpec.scala    From flint   with Apache License 2.0
package com.twosigma.flint.timeseries.summarize.summarizer

import com.twosigma.flint.rdd.function.summarize.summarizer.Summarizer
import com.twosigma.flint.timeseries.row.Schema
import com.twosigma.flint.timeseries.summarize.{ SummarizerFactory, SummarizerSuite }
import com.twosigma.flint.timeseries.{ CSV, Summarizers, TimeSeriesRDD, TimeSeriesSuite }
import org.apache.spark.sql.types.{ DataType, DoubleType, FloatType, IntegerType, LongType, StructType }
import java.util.Random

import org.apache.spark.sql.Row

class ExtremeSummarizerSpec extends SummarizerSuite {

  override val defaultResourceDir: String = "/timeseries/summarize/summarizer/meansummarizer"

  private def test[T](
    dataType: DataType,
    randValue: Row => Any,
    summarizer: String => SummarizerFactory,
    reduceFn: (T, T) => T,
    inputColumn: String,
    outputColumn: String
  ): Unit = {
    val priceTSRdd = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType)).addColumns(
      inputColumn -> dataType -> randValue
    )

    val data = priceTSRdd.collect().map{ row => row.getAs[T](inputColumn) }

    val trueExtreme = data.reduceLeft[T]{ case (x, y) => reduceFn(x, y) }

    val result = priceTSRdd.summarize(summarizer(inputColumn))

    val extreme = result.first().getAs[T](outputColumn)
    val outputType = result.schema(outputColumn).dataType

    assert(outputType == dataType, s"$outputType")
    assert(trueExtreme === extreme, s"extreme: $extreme, trueExtreme: $trueExtreme, data: ${data.toSeq}")
  }

  "MaxSummarizer" should "compute double max correctly" in {
    val rand = new Random()
    test[Double](DoubleType, { _: Row => rand.nextDouble() }, Summarizers.max, math.max, "x", "x_max")
  }

  it should "compute long max correctly" in {
    val rand = new Random()
    test[Long](LongType, { _: Row => rand.nextLong() }, Summarizers.max, math.max, "x", "x_max")
  }

  it should "compute float max correctly" in {
    val rand = new Random()
    test[Float](FloatType, { _: Row => rand.nextFloat() }, Summarizers.max, math.max, "x", "x_max")
  }

  it should "compute int max correctly" in {
    val rand = new Random()
    test[Int](IntegerType, { _: Row => rand.nextInt() }, Summarizers.max, math.max, "x", "x_max")
  }

  "MinSummarizer" should "compute double min correctly" in {
    val rand = new Random()
    test[Double](DoubleType, { _: Row => rand.nextDouble() }, Summarizers.min, math.min, "x", "x_min")
  }

  it should "compute long min correctly" in {
    val rand = new Random()
    test[Long](LongType, { _: Row => rand.nextLong() }, Summarizers.min, math.min, "x", "x_min")
  }

  it should "compute float min correctly" in {
    val rand = new Random()
    test[Float](FloatType, { _: Row => rand.nextFloat() }, Summarizers.min, math.min, "x", "x_min")
  }

  it should "compute int min correctly" in {
    val rand = new Random()
    test[Int](IntegerType, { _: Row => rand.nextInt() }, Summarizers.min, math.min, "x", "x_min")
  }

  it should "pass summarizer property test" in {
    summarizerPropertyTest(AllProperties)(Summarizers.max("x1"))
    summarizerPropertyTest(AllProperties)(Summarizers.min("x2"))
  }

  it should "ignore null values" in {
    val input = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType))
    val inputWithNull = insertNullRows(input, "price")

    assertEquals(
      input.summarize(Summarizers.min("price")),
      inputWithNull.summarize(Summarizers.min("price"))
    )
  }
} 
Example 18
Source File: WithdrawalEpochCertificateFixture.scala    From Sidechains-SDK   with MIT License
package com.horizen.block

import java.util.Random

trait WithdrawalEpochCertificateFixture {
  private def getBytes(len: Int = 32, rnd: Random = new Random()): Array[Byte] = {
    val bytes = new Array[Byte](len)
    rnd.nextBytes(bytes)
    bytes
  }

  def generateWithdrawalEpochCertificate(previousMcBlockHashOpt: Option[Array[Byte]] = None, rnd: Random = new Random()): WithdrawalEpochCertificate = {
    WithdrawalEpochCertificate(
      getBytes(),
      rnd.nextInt,
      getBytes(),
      rnd.nextInt(),
      rnd.nextLong(),
      previousMcBlockHashOpt.getOrElse(getBytes()),
      getBytes(),
      Seq(),
      Seq(),
      Seq())
  }
} 
Example 19
Source File: GenerationRules.scala    From Sidechains-SDK   with MIT License
package com.horizen.fixtures.sidechainblock.generation

import java.util.Random

import scorex.util.ModifierId

case class GenerationRules(forgingBoxesToAdd: Set[SidechainForgingData] = Set(),
                           forgingBoxesToSpent: Set[SidechainForgingData] = Set(),
                           mcReferenceIsPresent: Option[Boolean] = None,
                           corruption: CorruptedGenerationRules = CorruptedGenerationRules.emptyCorruptedGenerationRules,
                           forcedParentId: Option[ModifierId] = None,
                           forcedTimestamp: Option[Long] = None
                         ) {
  def isCorrupted: Boolean = corruption == CorruptedGenerationRules.emptyCorruptedGenerationRules
}

object GenerationRules {
  def generateCorrectGenerationRules(rnd: Random, allNotSpentForgerData: Set[SidechainForgingData]): GenerationRules = {
    val addForgingData: Set[SidechainForgingData] =
      if (allNotSpentForgerData.size > 100) {
        Set(SidechainForgingData.generate(rnd, Math.abs(rnd.nextInt(1000000))))
      }
      else {
        Set(SidechainForgingData.generate(rnd, Math.abs(rnd.nextInt(1000000))), SidechainForgingData.generate(rnd, Math.abs(rnd.nextInt(1000000))))
      }

    val removedForgingData: Set[SidechainForgingData] =
      if (rnd.nextBoolean()) {
        Set(allNotSpentForgerData.toSeq(rnd.nextInt(allNotSpentForgerData.size)))
      }
      else {
        val deleteSize = if (allNotSpentForgerData.size > 100) 10 else 1
        allNotSpentForgerData.toSeq.sortBy(_.forgerBox.value())(Ordering[Long]).take(deleteSize).toSet
      }

    require((removedForgingData -- allNotSpentForgerData).isEmpty)

    GenerationRules(forgingBoxesToAdd = addForgingData, forgingBoxesToSpent = removedForgingData)
  }
} 
Example 20
Source File: SidechainForgingData.scala    From Sidechains-SDK   with MIT License
package com.horizen.fixtures.sidechainblock.generation

import java.util.Random

import com.horizen.box.ForgerBox
import com.horizen.box.data.ForgerBoxData
import com.horizen.consensus._
import com.horizen.proof.VrfProof
import com.horizen.proposition.VrfPublicKey
import com.horizen.secret.{PrivateKey25519, PrivateKey25519Creator, VrfKeyGenerator, VrfSecretKey}
import com.horizen.vrf.VrfOutput


case class SidechainForgingData(key: PrivateKey25519, forgerBox: ForgerBox, vrfSecret: VrfSecretKey) {
  
  def canBeForger(vrfMessage: VrfMessage, totalStake: Long, additionalCheck: Boolean => Boolean): Option[(VrfProof, VrfOutput)] = {
    val vrfProofAndHash = vrfSecret.prove(vrfMessage)
    val vrfProof = vrfProofAndHash.getKey
    val vrfOutput = vrfProofAndHash.getValue

    val checker = (stakeCheck _).tupled.andThen(additionalCheck)
    Some((vrfProof, vrfOutput)).filter{case (vrfProof, vrfOutput) => checker(vrfOutput, totalStake)}
  }

  private def stakeCheck(vrfOutput: VrfOutput, totalStake: Long): Boolean = {
    vrfProofCheckAgainstStake(vrfOutput, forgerBox.value(), totalStake)
  }

  val forgerId: Array[Byte] = forgerBox.id()

  override def toString: String = {
    s"id - ${key.hashCode()}, value - ${forgerBox.value()}"
  }

  override def equals(obj: Any): Boolean = {
    obj match {
      case that: SidechainForgingData => {
        val keyEquals = this.key.equals(that.key)
        val forgerBoxEquals = this.forgerBox.equals(that.forgerBox)
        val vrfSecretEquals = this.vrfSecret.equals(that.vrfSecret)

        keyEquals && forgerBoxEquals && vrfSecretEquals
      }
      case _ =>
        false
    }
  }
}

object SidechainForgingData {
  def generate(rnd: Random, value: Long): SidechainForgingData = {
    val key: PrivateKey25519 = PrivateKey25519Creator.getInstance().generateSecret(rnd.nextLong().toString.getBytes)
    val vrfSecretKey = VrfKeyGenerator.getInstance().generateSecret(rnd.nextLong().toString.getBytes())
    val vrfPublicKey: VrfPublicKey = vrfSecretKey.publicImage();
    val forgerBox = new ForgerBoxData(key.publicImage(), value, key.publicImage(), vrfPublicKey).getBox(rnd.nextLong())

    SidechainForgingData(key, forgerBox, vrfSecretKey)
  }
} 
Example 21
Source File: ForgerBoxFixture.scala    From Sidechains-SDK   with MIT License
package com.horizen.fixtures

import java.util.Random

import com.horizen.box.ForgerBox
import com.horizen.box.data.ForgerBoxData
import com.horizen.proposition.VrfPublicKey
import com.horizen.secret.{PrivateKey25519, VrfKeyGenerator, VrfSecretKey}
import com.horizen.utils
import com.horizen.utils.Ed25519

case class ForgerBoxGenerationMetadata(propositionSecret: PrivateKey25519, blockSignSecret: PrivateKey25519, vrfSecret: VrfSecretKey)

object ForgerBoxFixture {
  def generateForgerBox(seed: Long): (ForgerBox, ForgerBoxGenerationMetadata) = generateForgerBox(seed, None)

  def generateForgerBox(seed: Long,
                        vrfKeysOpt: Option[(VrfSecretKey, VrfPublicKey)]): (ForgerBox, ForgerBoxGenerationMetadata) = {
    val randomGenerator = new Random(seed)
    val byteSeed = new Array[Byte](32)
    randomGenerator.nextBytes(byteSeed)
    val propositionKeyPair: utils.Pair[Array[Byte], Array[Byte]] = Ed25519.createKeyPair(byteSeed)
    val ownerKeys: PrivateKey25519 = new PrivateKey25519(propositionKeyPair.getKey, propositionKeyPair.getValue)
    val value: Long = randomGenerator.nextLong
    val (vrfSecret, vrfPubKey) = vrfKeysOpt.getOrElse{
      val secretKey = VrfKeyGenerator.getInstance().generateSecret(ownerKeys.bytes())
      val publicKey = secretKey.publicImage()
        (secretKey, publicKey)
    }
    val proposition = ownerKeys.publicImage()

    val forgerBoxData = new ForgerBoxData(proposition, value, proposition, vrfPubKey)
    val nonce: Long = randomGenerator.nextLong

    val forgerBox = forgerBoxData.getBox(nonce)
    (forgerBox, ForgerBoxGenerationMetadata(ownerKeys, ownerKeys, vrfSecret))
  }
} 
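The fixture above derives its key pair, value, and nonce entirely from new Random(seed), so two calls to generateForgerBox with the same seed should produce identical boxes. A minimal sketch of that reproducibility property of a seeded java.util.Random:

import java.util.Random

object SeededReproducibility {
  def main(args: Array[String]): Unit = {
    def draws(seed: Long): (Array[Byte], Long) = {
      val rnd = new Random(seed)
      val bytes = new Array[Byte](32)
      rnd.nextBytes(bytes)
      (bytes, rnd.nextLong())
    }

    val (bytes1, value1) = draws(42L)
    val (bytes2, value2) = draws(42L)
    // identical seeds yield identical byte seeds and values
    assert(bytes1.sameElements(bytes2) && value1 == value2)
    println(s"value=$value1 seedBytes=${bytes1.take(4).mkString(",")}")
  }
}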
Example 22
Source File: HistoryConsensusCheckerTest.scala    From Sidechains-SDK   with MIT License
package com.horizen.consensus

import java.util.Random

import com.horizen.SidechainHistory
import com.horizen.fixtures.sidechainblock.generation._
import com.horizen.params.{NetworkParams, TestNetParams}
import org.junit.Test
import org.scalatest.junit.JUnitSuite

import scala.collection.mutable
import scala.util.{Failure, Success, Try}


class HistoryConsensusCheckerTest extends JUnitSuite with HistoryConsensusChecker {

  def testWithSeed(testSeed: Int): Unit = {
    //val testSeed = 234
    val rnd: Random = new Random(testSeed)

    val initialParams = TestNetParams(consensusSlotsInEpoch = 10, sidechainGenesisBlockTimestamp = 1333344452L)
    val (params, genesisBlock, genesisGenerator, genesisForgingData, genesisEndEpochInfo) = SidechainBlocksGenerator.startSidechain(10000000000L, testSeed, initialParams)
    val history: SidechainHistory = createHistory(params, genesisBlock, genesisEndEpochInfo)
    val nonce = history.calculateNonceForEpoch(blockIdToEpochId(genesisBlock.id))
    val stake = genesisEndEpochInfo.stakeConsensusEpochInfo
    history.applyFullConsensusInfo(genesisBlock.id, FullConsensusEpochInfo(stake, nonce))
    println(s"//////////////// Genesis epoch ${genesisBlock.id} had been ended ////////////////")

    val generators = mutable.IndexedSeq(genesisGenerator)

    (1 to 50)
      .foldLeft[(SidechainHistory, mutable.IndexedSeq[SidechainBlocksGenerator])]((history, generators)) { (acc, index) =>
        val currentHistory: SidechainHistory = acc._1
        val currentGenerators: mutable.IndexedSeq[SidechainBlocksGenerator] =  acc._2

        val nextGenerator: SidechainBlocksGenerator = generatorSelection(rnd, currentGenerators)
        val nextCorrectGenerationRules: GenerationRules = GenerationRules.generateCorrectGenerationRules(rnd, nextGenerator.getNotSpentBoxes)

        println("try to add incorrect block(s)")
        tryToAddIncorrectBlocks(params, currentHistory, nextGenerator, nextCorrectGenerationRules, rnd)
        println("try to add correct block")
        val correctRes = Try(generateBlock(nextCorrectGenerationRules, nextGenerator, history)) match {
          case Success((gens, generatedBlock)) =>
            val updatedHistory = historyUpdateShallBeSuccessful(currentHistory, generatedBlock)
            val updatedGenerators = currentGenerators ++ gens
            (updatedHistory, updatedGenerators)

          case Failure(ex: GenerationIsNoLongerPossible) =>
            println("Finishing block generation")
            return

          case Failure(ex) =>
            println("Error during block generation")
            throw ex
        }

        correctRes
      }
  }

  private def tryToAddIncorrectBlocks(params: NetworkParams,
                                      currentHistory: SidechainHistory,
                                      currentGenerator: SidechainBlocksGenerator,
                                      correctGenerationRules: GenerationRules,
                                      rnd: Random,
                                      incorrectBlocksCount: Int = 2): Unit = Try {
    (1 to incorrectBlocksCount)
      .foreach{ _ =>
        val incorrectGenerationRules: GenerationRules = CorruptedGenerationRules.corruptGenerationRules(rnd, params, currentGenerator, correctGenerationRules)
        //println(s"Generated corruption rules are: ${incorrectGenerationRules}")
        currentGenerator
          .tryToGenerateBlockForCurrentSlot(incorrectGenerationRules)
          .map(generationInfo => historyUpdateShallBeFailed(currentHistory,generationInfo.block, incorrectGenerationRules))
    }
  }

  @Test
  def testManySeeds(): Unit = {
    val seed = 9084

    (50 to 50).foreach{index =>
      println(s"SEED IS ${index}")
      testWithSeed(index + seed)
    }
  }

} 
Example 23
Source File: DescriptiveStatsSuite.scala    From HANAVora-Extensions   with Apache License 2.0
package com.sap.commons

import java.util.Random

import org.scalatest.FunSuite

// scalastyle:off magic.number
class DescriptiveStatsSuite extends FunSuite {
  val SEED = 123
  val SignificantPosCorrelation = 0.9

  test("mean") {
    val samples0 = Seq(1, 1)
    val samples1 = Seq(1, 2, 3, 4)
    val samples2 = Seq(1.1, 0.9, 1.0)
    val samples3 = Seq.empty[Int]
    assertResult(1.0)(DescriptiveStats.mean(samples0))
    assertResult(2.5)(DescriptiveStats.mean(samples1))
    assertResult(1.0)(DescriptiveStats.mean(samples2))
    val samples3mean = DescriptiveStats.mean(samples3)
    assert(samples3mean.isNaN)
  }

  test("stdev") {
    val samples0 = Seq(0, 2)
    val samples1 = Seq.empty[Double]
    val samples2 = Seq.fill(1000)(0) ++ Seq.fill(1000)(2)
    assertResult(math.sqrt(2))(DescriptiveStats.stdev(samples0))
    assert(DescriptiveStats.stdev(samples1).isNaN)
    val samples2stdev = DescriptiveStats.stdev(samples2)
    assert(samples2stdev > 1.0 && samples2stdev < 1.001)
  }

  test("pearson") {
    val rand = new Random(SEED)
    val samples1 = Seq((1, 1), (2, 2), (3, 3))
    val samples2 = Seq((3.0, 1), (2.0, 2), (1.0, 3))
    val samples3 = (1 to 100000).map { i => (rand.nextDouble(), rand.nextDouble()) }
    val samples4 = Seq.empty[(Double, Double)]
    assertResult(1.0)(DescriptiveStats.pearson(samples1))
    assertResult(-1.0)(DescriptiveStats.pearson(samples2))
    assert(math.abs(DescriptiveStats.pearson(samples3)) < 0.01)
    assert(DescriptiveStats.pearson(samples4).isNaN)
  }

  test("spearman") {
    val rand = new Random(SEED)
    val samples1 = Seq((1, 1), (2, 2), (3, 3))
    val samples2 = Seq((3.0, 1), (2.0, 2), (1.0, 3))
    val samples3 = (1 to 100000).map { i => (rand.nextDouble(), rand.nextDouble()) }
    val samples4 = Seq.empty[(Double, Double)]
    assertResult(1.0)(DescriptiveStats.spearman(samples1))
    assertResult(-1.0)(DescriptiveStats.spearman(samples2))
    assert(math.abs(DescriptiveStats.spearman(samples3)) < 0.01)
    assert(DescriptiveStats.spearman(samples4).isNaN)
  }

  test("spearman & pearson w/ noise & outliers") {
    val samples1 = Seq((1, 300.0), (2, 250.0), (3, 400.0), (4, 350.0), (5, 500.0),
                       (6, 450.0), (7, 600.0), (8, 550.0), (9, 700.0), (10, 650.0))
    val samples2 = Seq((1, 300.0), (2, 350.0), (3, 400.0), (4, 450.0), (5, 500.0),
                       (6, 550.0), (7, 2000.0), (8, 700.0), (9, 750.0), (10, 800.0))
    assert(DescriptiveStats.pearson(samples1) > SignificantPosCorrelation)
    assert(DescriptiveStats.spearman(samples1) > SignificantPosCorrelation)
    // pearson is less robust, does not detect dependency
    assert(DescriptiveStats.pearson(samples2) < SignificantPosCorrelation)
    // spearman detects dependency
    assert(DescriptiveStats.spearman(samples2) > SignificantPosCorrelation)
  }

  test("spearman & pearson w/ real data") {

    val measure1 = Seq(379, 379, 382, 360, 378, 374, 364, 371, 360, 365, 364, 363, 369, 375, 365,
                       369, 358, 372, 370, 363, 363, 369, 361, 362, 367, 357, 365, 364, 363, 368,
                       360, 361, 360, 363, 359, 357, 365, 367, 364, 363)
    val measure2 = Seq(411, 379, 380, 382, 387, 404, 410, 431, 430, 444, 468, 489, 519, 573, 571,
                       620, 643, 657, 694, 711, 752, 783, 807, 841, 856, 891, 912, 962,1042, 982,
                       1076,1056,1092,1145,1128,1186,1221,1245,1284,1307)
    val samples1 = measure1.zipWithIndex
    val samples2 = measure2.zipWithIndex

    assert(DescriptiveStats.pearson(samples1) < SignificantPosCorrelation)
    assert(DescriptiveStats.spearman(samples1) < SignificantPosCorrelation)
    assert(DescriptiveStats.pearson(samples2) > SignificantPosCorrelation)
    assert(DescriptiveStats.spearman(samples2) > SignificantPosCorrelation)

  }
} 
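The suite above fixes SEED = 123 so that the correlation checks on randomly generated samples behave the same on every run; with an unseeded Random, a rare failure would not be reproducible. A minimal sketch of the same idea, computing a Pearson correlation of two independent uniform samples that should be close to zero (the seed, sample size, and formula layout here are my own):

import java.util.Random

object SeededCorrelationSketch {
  def main(args: Array[String]): Unit = {
    val rand = new Random(123)
    val n = 100000
    val xs = Array.fill(n)(rand.nextDouble())
    val ys = Array.fill(n)(rand.nextDouble())

    val meanX = xs.sum / n
    val meanY = ys.sum / n
    val cov  = xs.zip(ys).map { case (x, y) => (x - meanX) * (y - meanY) }.sum / n
    val stdX = math.sqrt(xs.map(x => (x - meanX) * (x - meanX)).sum / n)
    val stdY = math.sqrt(ys.map(y => (y - meanY) * (y - meanY)).sum / n)

    val pearson = cov / (stdX * stdY)
    println(f"pearson=$pearson%.4f") // expected to be near 0.0 for independent samples
  }
}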
Example 24
Source File: VLBFGS1.scala    From spark-vl-bfgs   with Apache License 2.0
package org.apache.spark.ml.optim

import java.util.Random

import scala.language.implicitConversions

import org.apache.hadoop.fs.{FileSystem, Path}

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.optim.VectorFreeLBFGS.{Oracle, VectorSpace}
import org.apache.spark.ml.optim.VectorRDDFunctions._
import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors}
import org.apache.spark.mllib.random.RandomRDDs
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.{RDD, UnionRDD}
import org.apache.spark.storage.StorageLevel


  private def gradient(data: RDD[Array[LabeledPoint]], dx: RDD[Vector]): RDD[Vector] = {
    data.cartesian(dx).map { case (points, x) =>
      val g = Vectors.zeros(x.size)
      points.foreach { case LabeledPoint(b, a) =>
        val err = BLAS.dot(a, x) - b
        BLAS.axpy(err, a, g)
      }
      g
    }.treeSum()
  }

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("VLBFGS").setMaster("local[*]")
    val sc = new SparkContext(conf)
    sc.setCheckpointDir("/tmp/checkpoint")
    val n = 1000
    val p = 100
    val random = new Random(0L)
    val xExact = Vectors.dense(Array.fill(p)(random.nextDouble()))
    val data = RandomRDDs.normalVectorRDD(sc, n, p, 4, 11L).mapPartitionsWithIndex { (idx, part) =>
      val random = new Random(100 + idx)
      part.map { v =>
        val target = BLAS.dot(v, xExact) + 0.1 * random.nextGaussian()
        LabeledPoint(target, v)
      }
    }.glom()
    .cache()

    val x = solve(data).first()

    println(s"x_exact = $xExact")
    println(s"x_vlbfgs = $x")

    sc.stop()
  }
} 
Example 25
Source File: ProjectionsTest.scala    From spark-tda   with Apache License 2.0
package org.apache.spark.mllib.linalg

import java.util.Random
import org.scalacheck.Gen
import org.scalacheck.Prop.forAllNoShrink
import org.scalatest.Matchers
import org.scalatest.prop.GeneratorDrivenPropertyChecks
import org.scalatest.prop.Checkers.check


class ProjectionsTest
    extends LinalgPropSpec
    with GeneratorDrivenPropertyChecks
    with Matchers {
  import org.scalactic.Tolerance._
  val dimGen = for {
    srcDim <- Gen.choose(100, 200)
    dstDim <- Gen.choose(100, 200)
  } yield (srcDim, dstDim)

  property("gaussian random projection have good statistical properties") {
    forAllNoShrink(dimGen) {
      case (srcDim, dstDim) =>
        val projection =
          GaussianRandomProjection(srcDim, dstDim, new Random())
        projection.mean === 0.0 +- 0.01
        projection.stddev === 1.0 +- 0.01
    }
  }

  property("cauchy random projection have good statistical properties") {
    forAllNoShrink(dimGen) {
      case (srcDim, dstDim) =>
        val projection =
          CauchyRandomProjection(srcDim, dstDim)
        projection.median === 0.0 +- 0.01
    }
  }
} 
Example 26
package com.stefansavev.similaritysearch.implementation

import java.util
import java.util.Random

import com.stefansavev.randomprojections.utils.RandomUtils
import com.stefansavev.similaritysearch.{SimilaritySearchIndex, SimilaritySearchResult, SimilaritySearchResultBuilder, SimilaritySearchResults}

object FuzzySearchEvaluationUtilsWrapper {

  def generateRandomTestSet(rnd: Random, numQueries: Int, index: SimilaritySearchIndex): SimilaritySearchResults = {
    import scala.collection.JavaConversions._
    val itemNames = index.getItems.toIterator.map(_.getName).toArray

    val sampleIds = RandomUtils.sample(rnd, numQueries, Array.range(0, itemNames.length))
    val builder = new SimilaritySearchResultBuilder()
    for (id <- sampleIds) {
      val queryId = itemNames(id)
      val queryVector = index.getItemByName(queryId).getVector
      val queryResults = new util.ArrayList[SimilaritySearchResult]()
      builder.addResult(queryId, queryResults)
    }
    return builder.build()
  }

} 
Example 27
Source File: DataFrameView.scala    From random-projections-at-berlinbuzzwords   with Apache License 2.0
package com.stefansavev.randomprojections.datarepr.dense

import java.util.Random

import com.stefansavev.randomprojections.datarepr.sparse.SparseVector
import com.stefansavev.randomprojections.implementation.{Signatures, PointSignatures}

class PointIndexes(val indexes: Array[Int]){
  def toTuple: PointIndexes.TupleType = (0, indexes)
  def size = indexes.length
  def apply(i: Int): Int = indexes(i)
}

object PointIndexes{
  type TupleType = (Int, Array[Int]) //the first is dummy because I need to add a tuple1
  def apply(indexes: Array[Int]): PointIndexes = new PointIndexes(indexes)
  def unapply(pntIndexes: PointIndexes): Option[Array[Int]] = Some(pntIndexes.indexes)
  def fromTuple(t: TupleType): PointIndexes = new PointIndexes(t._2)
}

class DataFrameView(val indexes: PointIndexes, val rowStoredView: RowStoredMatrixView) {
  def toTuple:DataFrameView.TupleType = (indexes, rowStoredView)

  var pointSignatures: PointSignatures = null //work around until the concept is validated
  def numRows: Int = indexes.size
  def numCols: Int = rowStoredView.numCols

  def setPointSignatures(pointSignatures: PointSignatures): Unit = {
    this.pointSignatures = pointSignatures
  }

  def getRowIdByName(name: String): Int = {
    rowStoredView.getRowIdByName(name)
  }

  def buildSetSignatures(numSignatures: Int, rnd: Random): Unit = {
    if (pointSignatures != null){
      throw new IllegalStateException("Signatures cannot be overwritten")
    }
    val (signatureVecs, signatures) = Signatures.computePointSignatures(numSignatures, rnd, this)
    this.setPointSignatures(signatures)
  }

  def getPointSignatures(): PointSignatures = {
    this.pointSignatures
  }

  def getAllRowNames(): Array[String] = {
    rowStoredView.getAllRowNames()
  }

  def getPointAsDenseVector(pntId: Int): Array[Double] = {
    rowStoredView.getPointAsDenseVector(pntId)
  }

  def getPointAsDenseVector(pntId: Int, columnIds: Array[Int], vec: Array[Double]): Unit = {
    rowStoredView.getPointAsDenseVector(pntId, columnIds, vec)
  }

  def multiplyRowComponentWiseBySparseVector(pntId: Int, sv: SparseVector, output: Array[Double]): Unit = {
    rowStoredView.multiplyRowComponentWiseBySparseVector(pntId, sv, output)
  }

  def getUnderlyingIndexes(): PointIndexes = indexes

  def childView(newIndexes: PointIndexes): DataFrameView = {
    new DataFrameView(newIndexes, rowStoredView)
  }

  def getLabel(rowId: Int): Int = rowStoredView.getLabel(rowId)

  def getAllLabels(): Array[Int] = rowStoredView.getAllLabels()

  def getName(rowId: Int): String = {
    rowStoredView.getName(rowId)
  }

  //def dist(id1: Int, id2: Int): Double = rowStoredView.dist(id1, id2)

  def cosineForNormalizedData(query: Array[Double], id: Int): Double = rowStoredView.cosineForNormalizedData(query, id)

  override def toString = s"DataFrameView($numRows, $numCols)"
}

object DataFrameView{
  type TupleType = (PointIndexes, RowStoredMatrixView)
  def fromTuple(t: TupleType) = new DataFrameView(t._1, t._2)
} 
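A minimal usage sketch for the classes above (hypothetical object and value names; it assumes an existing DataFrameView, for example one produced by RandomBitStrings.genRandomData later in these examples): a child view reuses the underlying row store while narrowing the visible rows, and PointIndexes round-trips through its tuple form.

object DataFrameViewSketch {
  // Keep only the first half of the rows; the underlying RowStoredMatrixView is shared.
  def firstHalf(view: DataFrameView): DataFrameView =
    view.childView(PointIndexes(Array.range(0, view.numRows / 2)))

  // PointIndexes -> (dummy Int, Array[Int]) -> PointIndexes
  def roundTrip(indexes: Array[Int]): PointIndexes =
    PointIndexes.fromTuple(PointIndexes(indexes).toTuple)
}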
Example 28
Source File: RandomUtils.scala    From random-projections-at-berlinbuzzwords   with Apache License 2.0 5 votes vote down vote up
package com.stefansavev.randomprojections.utils

import java.util.Random

import com.stefansavev.randomprojections.buffers.IntArrayBuffer
import com.stefansavev.randomprojections.datarepr.sparse.SparseVector

object RandomUtils {
  def shuffleInts(rnd: Random, arr: Array[Int]): Array[Int] = {
    val values = arr.map(v => (v, rnd.nextDouble())).sortBy(_._2).map(_._1)
    values
  }

  def shuffleDoubles(rnd: Random, arr: Array[Double]): Array[Double] = {
    val values = arr.map(v => (v, rnd.nextDouble())).sortBy(_._2).map(_._1)
    values
  }

  def sign(rnd: Random): Double = {
    if (rnd.nextDouble() > 0.5) 1.0 else -1.0
  }

  def generateRandomVector(rnd: Random, numCols: Int, columnIds: Array[Int]): SparseVector = {
    val signs = columnIds.map(_ => (if (rnd.nextDouble() >= 0.5) 1.0 else -1.0))
    var sum = 0.0
    var i = 0
    while (i < signs.length) {
      val v = signs(i)
      sum += v * v
      i += 1
    }
    sum = Math.sqrt(sum)

    i = 0
    while (i < signs.length) {
      signs(i) /= sum
      i += 1
    }

    val sparseVec = new SparseVector(numCols, columnIds, signs)
    sparseVec
  }

  def generateRandomVector(rnd: Random, numCols: Int): SparseVector = {
    generateRandomVector(rnd, numCols, Array.range(0, numCols))
  }

  //TODO: use a version of reservoir sampling together with random shuffle
  def sample(rnd: Random, k: Int, arr: Array[Int]): Array[Int] = {
    def getValue(arr: Array[Int], overWrites: scala.collection.mutable.HashMap[Int, Int], index: Int): Int = {
      if (overWrites.contains(index)) {
        overWrites(index)
      } else {
        arr(index)
      }
    }
    var currentLength = arr.length
    val buffer = new IntArrayBuffer()
    val overWrites = new scala.collection.mutable.HashMap[Int, Int]()
    var i = 0
    while (i < k && currentLength > 0) {
      val nextPos = rnd.nextInt(currentLength)
      val sampledValue = getValue(arr, overWrites, nextPos)
      buffer += sampledValue
      if (nextPos < currentLength - 1) {
        val lastValue = getValue(arr, overWrites, currentLength - 1)
        overWrites += ((nextPos, lastValue))
      }
      currentLength -= 1
      i += 1
    }
    buffer.toArray()
  }
} 
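A short usage sketch for RandomUtils (hypothetical object name; only the object above and a seeded java.util.Random are assumed): shuffle an index range, draw a small sample without replacement, and build a unit-norm random-sign vector.

import java.util.Random

import com.stefansavev.randomprojections.utils.RandomUtils

object RandomUtilsSketch {
  def main(args: Array[String]): Unit = {
    val rnd = new Random(42)
    val shuffled = RandomUtils.shuffleInts(rnd, Array.range(0, 10)) // a permutation of 0..9
    val sampled = RandomUtils.sample(rnd, 3, Array.range(0, 10))    // 3 distinct values from 0..9
    val vec = RandomUtils.generateRandomVector(rnd, 8)              // unit-norm vector of random signs
    println(shuffled.mkString(",") + " | " + sampled.mkString(","))
  }
}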
Example 29
Source File: SplitIntoKProjection.scala    From random-projections-at-berlinbuzzwords   with Apache License 2.0 5 votes vote down vote up
package com.stefansavev.randomprojections.implementation

class SplitIntoKProjection {

}

import java.util.Random

import com.stefansavev.randomprojections.datarepr.dense.DataFrameView
import com.stefansavev.randomprojections.datarepr.sparse.SparseVector
import com.stefansavev.randomprojections.utils.RandomUtils

import scala.collection.mutable.ArrayBuffer

case class SplitIntoKProjectionStrategy(rnd: Random, numCols: Int, k: Int) extends ProjectionStrategy {

  def chooseKPoints(k: Int, pointIds: Array[Int], view: DataFrameView): Array[Int] = {
    RandomUtils.shuffleInts(rnd, pointIds).take(k)
  }

  def chooseKDimensions(k: Int): Array[Int] = {
    val columns = Array.range(0, numCols)
    RandomUtils.shuffleInts(rnd, columns).take(k).sorted
  }

  def generateRandomVector(columnIds: Array[Int]): SparseVector = {
    val signs = columnIds.map(_ => (if (rnd.nextDouble() >= 0.5) 1.0 else -1.0))

    var sum = 0.0
    var i = 0
    while (i < signs.length) {
      val v = signs(i)
      sum += v * v
      i += 1
    }
    sum = Math.sqrt(sum)

    i = 0
    while (i < signs.length) {
      signs(i) /= sum
      i += 1
    }

    val sparseVec = new SparseVector(numCols, columnIds, signs)
    sparseVec
  }

  def generateKRandomVectors(num: Int, columnIds: Array[Int]): Array[SparseVector] = {
    val buff = new ArrayBuffer[SparseVector]()
    for (i <- 0 until num) {
      buff += generateRandomVector(columnIds)
    }
    buff.toArray
  }

  def nextRandomProjection(depth: Int, view: DataFrameView, projectionVector: AbstractProjectionVector): AbstractProjectionVector = {
    val useK = HadamardUtils.largestPowerOf2(k)
    val chosenDim = chooseKDimensions(useK)
    val randomVector = generateRandomVector(chosenDim)
    val proj = new HadamardProjectionVector(randomVector)
    proj
  }
}

case class SplitIntoKProjectionSettings(k: Int)

class SplitIntoKProjectionBuilder(builderSettings: SplitIntoKProjectionSettings) extends ProjectionStrategyBuilder {
  type T = SplitIntoKProjectionStrategy
  val splitStrategy: DatasetSplitStrategy = new HadamardProjectionSplitStrategy()

  def build(settings: IndexSettings, rnd: Random, dataFrameView: DataFrameView): T = SplitIntoKProjectionStrategy(rnd, dataFrameView.numCols, builderSettings.k)

  def datasetSplitStrategy: DatasetSplitStrategy = splitStrategy
} 
Example 30
Source File: ValuesStoreTest.scala    From random-projections-at-berlinbuzzwords   with Apache License 2.0 5 votes vote down vote up
package com.stefansavev

import java.util.Random

import com.stefansavev.randomprojections.datarepr.dense.store._
import com.typesafe.scalalogging.StrictLogging
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{FlatSpec, Matchers}

@RunWith(classOf[JUnitRunner])
class TestSingleByteEncodingSpec extends FlatSpec with Matchers {
  "Error after encoding double to float" should "be small" in {
    val minV = -1.0f
    val maxV = 2.0f
    val rnd = new Random(481861)
    for (i <- 0 until 100) {
      // we encode a float (which is 4 bytes) with a single byte,
      // hence some loss of precision is expected
      val value = rnd.nextFloat() * 3.0f - 1.0f
      val enc = FloatToSingleByteEncoder.encodeValue(minV, maxV, value)
      val dec = FloatToSingleByteEncoder.decodeValue(minV, maxV, enc)
      val error = Math.abs(value - dec)
      error should be < (0.01)
    }
  }
}

@RunWith(classOf[JUnitRunner])
class TestValueStores extends FlatSpec with Matchers {

  case class BuilderTypeWithErrorPredicate(builderType: StoreBuilderType, pred: Double => Boolean)

  "ValueStore" should "return store the data with small error" in {

    val tests = List(
      BuilderTypeWithErrorPredicate(StoreBuilderAsDoubleType, error => (error <= 0.0)),
      BuilderTypeWithErrorPredicate(StoreBuilderAsBytesType, error => (error <= 0.01)),
      BuilderTypeWithErrorPredicate(StoreBuilderAsSingleByteType, error => (error <= 0.01))
    )

    for (test <- tests) {
      testBuilder(test)
    }

    def testBuilder(builderWithPred: BuilderTypeWithErrorPredicate): Unit = {
      val dataGenSettings = RandomBitStrings.RandomBitSettings(
        numGroups = 1000,
        numRowsPerGroup = 2,
        numCols = 256,
        per1sInPrototype = 0.5,
        perNoise = 0.2)

      val debug = false
      val randomBitStringsDataset = RandomBitStrings.genRandomData(58585, dataGenSettings, debug, true)
      val builder = builderWithPred.builderType.getBuilder(randomBitStringsDataset.numCols)

      def addValues(): Unit = {
        var i = 0
        while (i < randomBitStringsDataset.numRows) {
          val values = randomBitStringsDataset.getPointAsDenseVector(i)
          builder.addValues(values)
          i += 1
        }
      }

      addValues()

      val valueStore = builder.build()

      def verifyStoredValues(expected: Array[Double], stored: Array[Double]): Unit = {
        for (i <- 0 until expected.length) {
          val error = Math.abs(expected(i) - stored(i))
          val passed = builderWithPred.pred(error)
          passed should be (true)
        }
      }

      def testValues(): Unit = {
        var i = 0
        while (i < randomBitStringsDataset.numRows) {
          val values = randomBitStringsDataset.getPointAsDenseVector(i)
          val output = Array.ofDim[Double](randomBitStringsDataset.numCols)
          valueStore.fillRow(i, output, true)
          verifyStoredValues(values, output)
          i += 1
        }
      }
      testValues()
    }
  }
}


object Test extends StrictLogging {
  def main(args: Array[String]) {
    logger.info("hello")
  }
} 
Example 31
Source File: SpeedSimulator.scala    From random-projections-at-berlinbuzzwords   with Apache License 2.0 5 votes vote down vote up
package com.stefansavev.tuning

import java.util.Random

case class SpeedSimulatorParams(numberOfQueries: Int,
  numberOfTrees: Int,
  requiredPointsPerTree: Int,
  deviationOfRequiredPointsPerTree: Int = 0){
}

object SpeedSimulator {
  val rnd = new Random(11144)

  def testPoint(p: Int, params: SpeedSimulatorParams, buffer: Array[Int]): Int = {
    val rnd = this.rnd
    val len = buffer.length
    var i = 0
    val numTrees = params.numberOfTrees
    val numPointsPerTree = params.requiredPointsPerTree
    val offset = rnd.nextInt(len)
    val stride = offset/params.requiredPointsPerTree  + rnd.nextInt(50)
    var totalOperations = 0
    while(i < numTrees){
      var j = 0
      while(j < numPointsPerTree){
        val k = (offset + stride*j + i) % len //some random formula
        buffer(k) += 1
        j += 1
      }
      i += 1
      totalOperations += 1
    }
    totalOperations
  }

  def simulate(params: SpeedSimulatorParams): Unit = {
    val numQueries = params.numberOfQueries
    val bufflen = params.numberOfQueries
    val buffer = Array.ofDim[Int](bufflen)
    val start = System.currentTimeMillis()
    var i = 0
    while(i < numQueries){
      var j = 0
      while(j < bufflen){
        buffer(j) = 0
        j += 1
      }
      if (i % 5000 == 0){
        println(".")
      }
      testPoint(i, params, buffer)
      i += 1
    }
    val result = System.currentTimeMillis() - start
    println("Simulation in secs: " + result/1000.0 + "    ; per point in ms " + result.toDouble/numQueries.toDouble)
  }

  def main (args: Array[String]): Unit = {
    val params = SpeedSimulatorParams(numberOfQueries = 42000, numberOfTrees = 10, requiredPointsPerTree = 1200)
    simulate(params)
  }
} 
Example 32
Source File: RandomBitStrings.scala    From random-projections-at-berlinbuzzwords   with Apache License 2.0 5 votes vote down vote up
package com.stefansavev

import java.util.Random

import com.stefansavev.randomprojections.datarepr.dense.{ColumnHeaderBuilder, DataFrameView, PointIndexes, RowStoredMatrixViewBuilderFactory}

object RandomBitStrings {

  case class RandomBitSettings(numGroups: Int, numRowsPerGroup: Int, numCols: Int, per1sInPrototype: Double, perNoise: Double)

  def generatePrototype(rnd: Random, dim: Int, perValue: Double): Array[Double] = {
    val arr = Array.ofDim[Double](dim)
    for (i <- 0 until dim) {
      val gaussian = Math.abs(rnd.nextGaussian())
      arr(i) = -1.0 * gaussian
      if (rnd.nextDouble() < perValue) {
        arr(i) = 1.0 * gaussian
      }
    }
    arr
  }

  def corrupt(rnd: Random, input: Array[Double], perNoise: Double): Array[Double] = {
    val arr = Array.ofDim[Double](input.length)
    for (i <- 0 until input.length) {
      if (rnd.nextDouble() < perNoise) {
        arr(i) = -input(i)
      }
      else {
        arr(i) = input(i)
      }
    }
    arr
  }

  //todo: put in utils
  def normalize(input: Array[Double]): Array[Double] = {
    val arr = Array.ofDim[Double](input.length)
    var norm = 0.0
    for (i <- 0 until input.length) {
      norm += input(i) * input(i)
    }
    norm = Math.sqrt(norm)
    for (i <- 0 until input.length) {
      arr(i) = input(i) / norm
    }
    arr
  }

  def genRandomData(seed: Int, settings: RandomBitSettings, debug: Boolean, dense: Boolean): DataFrameView = {

    val (numGroups, numRowsPerGroup, numCols: Int, per1sInPrototype: Double, perNoise: Double) =
      (settings.numGroups, settings.numRowsPerGroup, settings.numCols, settings.per1sInPrototype, settings.perNoise)

    val numRows = numGroups * numRowsPerGroup

    val labels = Array.ofDim[Int](numRows)
    val rnd = new Random(seed)
    var i = 0

    val columnNames = Array.range(0, numCols).map((i: Int) => ("feature" + i, i))
    val rowNames = Array.range(0, numRows).map(_.toString)
    val header = ColumnHeaderBuilder.build("label", columnNames, true)

    val builder = RowStoredMatrixViewBuilderFactory.createDense(header)

    for (g <- 0 until numGroups) {
      val prototype = generatePrototype(rnd, numCols, per1sInPrototype)
      for (r <- 0 until numRowsPerGroup) {
        val noisyProt = corrupt(rnd, prototype, perNoise)
        labels(i) = g
        if (i != builder.currentRowId) {
          throw new IllegalStateException("Cannot skip rows")
        }

        builder.addRow(i.toString, g, Array.range(0, numCols), normalize(noisyProt))
        i += 1
      }
    }
    val indexes = PointIndexes(Array.range(0, numRows))
    new DataFrameView(indexes, builder.build())
  }

} 
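A small standalone sketch of the generator pipeline above (hypothetical object name; only the RandomBitStrings helpers above are assumed): build a prototype, corrupt a copy, normalize it, and confirm the result has unit norm.

import java.util.Random

import com.stefansavev.RandomBitStrings

object RandomBitStringsSketch {
  def main(args: Array[String]): Unit = {
    val rnd = new Random(58585)
    val prototype = RandomBitStrings.generatePrototype(rnd, dim = 16, perValue = 0.5)
    val noisy = RandomBitStrings.corrupt(rnd, prototype, perNoise = 0.2) // flips roughly 20% of the signs
    val normalized = RandomBitStrings.normalize(noisy)
    val norm = Math.sqrt(normalized.map(v => v * v).sum)
    println(f"norm after normalize: $norm%.6f") // expected to print ~1.000000
  }
}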
Example 33
Source File: RandomBitStrings.scala    From random-projections-at-berlinbuzzwords   with Apache License 2.0 5 votes vote down vote up
package com.stefansavev.fuzzysearchtest

import java.util.Random

import com.stefansavev.randomprojections.datarepr.dense.{ColumnHeaderBuilder, DataFrameView, PointIndexes, RowStoredMatrixViewBuilderFactory}

object RandomBitStrings {

  case class RandomBitSettings(numGroups: Int, numRowsPerGroup: Int, numCols: Int, per1sInPrototype: Double, perNoise: Double)

  def generatePrototype(rnd: Random, dim: Int, perValue: Double): Array[Double] = {
    val arr = Array.ofDim[Double](dim)
    for (i <- 0 until dim) {
      arr(i) = -1.0
      if (rnd.nextDouble() < perValue) {
        arr(i) = 1.0
      }
    }
    arr
  }

  def corrupt(rnd: Random, input: Array[Double], perNoise: Double): Array[Double] = {
    val arr = Array.ofDim[Double](input.length)
    for (i <- 0 until input.length) {
      if (rnd.nextDouble() < perNoise) {
        arr(i) = -input(i)
      }
      else {
        arr(i) = input(i)
      }
    }
    arr
  }

  def genRandomData(seed: Int, settings: RandomBitSettings, debug: Boolean, dense: Boolean): DataFrameView = {

    val (numGroups, numRowsPerGroup, numCols: Int, per1sInPrototype: Double, perNoise: Double) =
      (settings.numGroups, settings.numRowsPerGroup, settings.numCols, settings.per1sInPrototype, settings.perNoise)

    val numRows = numGroups * numRowsPerGroup

    val labels = Array.ofDim[Int](numRows)
    val rnd = new Random(seed)
    var i = 0

    val columnNames = Array.range(0, numCols).map((i: Int) => ("feature" + i, i))
    val header = ColumnHeaderBuilder.build("label", columnNames, false)

    val builder = RowStoredMatrixViewBuilderFactory.createDense(header)

    for (g <- 0 until numGroups) {
      val prototype = generatePrototype(rnd, numCols, per1sInPrototype)
      for (r <- 0 until numRowsPerGroup) {
        val noisyProt = corrupt(rnd, prototype, perNoise)
        labels(i) = g
        if (i != builder.currentRowId) {
          throw new IllegalStateException("Cannot skip rows")
        }
        builder.addRow(g, Array.range(0, numCols), noisyProt)
        i += 1
      }
    }
    val indexes = PointIndexes(Array.range(0, numRows))
    new DataFrameView(indexes, builder.build())
  }

} 
Example 34
Source File: TestOnRandomData.scala    From random-projections-at-berlinbuzzwords   with Apache License 2.0 5 votes vote down vote up
package com.stefansavev.fuzzysearchtest

import java.util.Random

import com.stefansavev.randomprojections.actors.Application
import com.stefansavev.randomprojections.implementation._
import com.stefansavev.randomprojections.utils.Utils
import com.stefansavev.similaritysearch.SimilaritySearchEvaluationUtils
import com.stefansavev.similaritysearch.VectorType.StorageSize
import com.stefansavev.similaritysearch.implementation.FuzzySearchIndexBuilderWrapper
import com.typesafe.scalalogging.StrictLogging


object TestOnRandomData extends StrictLogging {
  implicit val _ = logger

  def main(args: Array[String]): Unit = {
    val dataGenSettings = RandomBitStrings.RandomBitSettings(
      numGroups = 100000,
      numRowsPerGroup = 2,
      numCols = 256,
      per1sInPrototype = 0.5,
      perNoise = 0.1)

    val debug = false
    val randomBitStringsDataset = RandomBitStrings.genRandomData(58585, dataGenSettings, debug, true)

    val randomTreeSettings = IndexSettings(
      maxPntsPerBucket = 50,
      numTrees = 50,
      maxDepth = None,
      projectionStrategyBuilder = ProjectionStrategies.splitIntoKRandomProjection(),
      reportingDistanceEvaluator = ReportingDistanceEvaluators.cosineOnOriginalData(),
      randomSeed = 39393
    )

    println("Number of Rows: " + randomBitStringsDataset.numRows)
    val diskLocation = "D:/tmp/randomfile"
    val trees = Utils.timed("Build Index", {
      val wrapper = new FuzzySearchIndexBuilderWrapper(diskLocation, randomBitStringsDataset.numCols, 50, StorageSize.Double)
      var i = 0
      while (i < randomBitStringsDataset.numRows) {
        wrapper.addItem(i.toString, 0, randomBitStringsDataset.getPointAsDenseVector(i))
        i += 1
      }
      wrapper.build()
      //SimilaritySearchIndex.open(diskLocation)
      ()
    }).result

    SimilaritySearchEvaluationUtils.compareWithBruteForce(diskLocation, new Random(481868), 1000, 50)

    Application.shutdown()
  }
} 
Example 35
Source File: RenderParticle.scala    From Electrodynamics   with GNU Lesser General Public License v3.0 5 votes vote down vote up
package com.calclavia.edx.quantum.machine.accelerator

import java.util.Random

import cpw.mods.fml.relauncher.{Side, SideOnly}
import net.minecraft.client.renderer.entity.Render
import net.minecraft.client.renderer.{RenderHelper, Tessellator}
import net.minecraft.entity.Entity
import net.minecraft.util.ResourceLocation
import org.lwjgl.opengl.GL11

@SideOnly(Side.CLIENT) class RenderParticle extends Render
{
  def doRender(entity: Entity, x: Double, y: Double, z: Double, var8: Float, var9: Float)
  {
    val tessellator: Tessellator = Tessellator.instance
    var par2: Float = (entity.ticksExisted)
    while (par2 > 200)
    {
      par2 -= 100
    }
    RenderHelper.disableStandardItemLighting
    val var41: Float = (5 + par2) / 200.0F
    var var51: Float = 0.0F
    if (var41 > 0.8F)
    {
      var51 = (var41 - 0.8F) / 0.2F
    }
    val rand: Random = new Random(432L)
    GL11.glPushMatrix
    GL11.glTranslatef(x.asInstanceOf[Float], y.asInstanceOf[Float], z.asInstanceOf[Float])
    GL11.glScalef(0.15f, 0.15f, 0.15f)
    GL11.glDisable(GL11.GL_TEXTURE_2D)
    GL11.glShadeModel(GL11.GL_SMOOTH)
    GL11.glEnable(GL11.GL_BLEND)
    GL11.glBlendFunc(GL11.GL_SRC_ALPHA, GL11.GL_ONE)
    GL11.glDisable(GL11.GL_ALPHA_TEST)
    GL11.glEnable(GL11.GL_CULL_FACE)
    GL11.glDepthMask(false)
    GL11.glPushMatrix
    GL11.glTranslatef(0.0F, -1.0F, -2.0F)

    for (i1 <- 0 to ((var41 + var41 * var41) / 2.0F * 60.0F).asInstanceOf[Int])
    {
      GL11.glRotatef(rand.nextFloat * 360.0F, 1.0F, 0.0F, 0.0F)
      GL11.glRotatef(rand.nextFloat * 360.0F, 0.0F, 1.0F, 0.0F)
      GL11.glRotatef(rand.nextFloat * 360.0F, 0.0F, 0.0F, 1.0F)
      GL11.glRotatef(rand.nextFloat * 360.0F, 1.0F, 0.0F, 0.0F)
      GL11.glRotatef(rand.nextFloat * 360.0F, 0.0F, 1.0F, 0.0F)
      GL11.glRotatef(rand.nextFloat * 360.0F + var41 * 90.0F, 0.0F, 0.0F, 1.0F)
      tessellator.startDrawing(6)
      val var81: Float = rand.nextFloat * 20.0F + 5.0F + var51 * 10.0F
      val var91: Float = rand.nextFloat * 2.0F + 1.0F + var51 * 2.0F
      tessellator.setColorRGBA_I(16777215, (255.0F * (1.0F - var51)).asInstanceOf[Int])
      tessellator.addVertex(0.0D, 0.0D, 0.0D)
      tessellator.setColorRGBA_I(0, 0)
      tessellator.addVertex(-0.866D * var91, var81, -0.5F * var91)
      tessellator.addVertex(0.866D * var91, var81, -0.5F * var91)
      tessellator.addVertex(0.0D, var81, 1.0F * var91)
      tessellator.addVertex(-0.866D * var91, var81, -0.5F * var91)
      tessellator.draw
    }
    GL11.glPopMatrix
    GL11.glDepthMask(true)
    GL11.glDisable(GL11.GL_CULL_FACE)
    GL11.glDisable(GL11.GL_BLEND)
    GL11.glShadeModel(GL11.GL_FLAT)
    GL11.glColor4f(1.0F, 1.0F, 1.0F, 1.0F)
    GL11.glEnable(GL11.GL_TEXTURE_2D)
    GL11.glEnable(GL11.GL_ALPHA_TEST)
    RenderHelper.enableStandardItemLighting
    GL11.glPopMatrix
  }

  protected def getEntityTexture(entity: Entity): ResourceLocation =
  {
    return null
  }
} 
Example 36
Source File: BlockRadioactive.scala    From Electrodynamics   with GNU Lesser General Public License v3.0 5 votes vote down vote up
package com.calclavia.edx.quantum.blocks

import java.util.{List, Random}

import cpw.mods.fml.relauncher.{Side, SideOnly}
import net.minecraft.block.Block
import net.minecraft.block.material.Material
import net.minecraft.client.Minecraft
import net.minecraft.client.particle.EntitySmokeFX
import net.minecraft.client.renderer.texture.IIconRegister
import net.minecraft.entity.{Entity, EntityLiving, EntityLivingBase}
import net.minecraft.init.Blocks
import net.minecraft.util.{AxisAlignedBB, IIcon}
import net.minecraft.world.World
import resonantengine.lib.potion.PoisonRadiation
import resonantengine.lib.transform.vector.Vector3

import scala.collection.JavaConversions._

class BlockRadioactive(material: Material) extends Block(material)
{
  var canSpread: Boolean = true
  var radius: Float = 5
  var amplifier: Int = 2
  var canWalkPoison: Boolean = true
  var isRandomlyRadioactive: Boolean = true
  var spawnParticle: Boolean = true
  private var iconTop: IIcon = null
  private var iconBottom: IIcon = null

  //Constructor
  this.setTickRandomly(true)
  this.setHardness(0.2F)

  override def getIcon(side: Int, metadata: Int): IIcon =
  {
    return if (side == 1) this.iconTop else (if (side == 0) this.iconBottom else this.blockIcon)
  }

  @SideOnly(Side.CLIENT) override def registerBlockIcons(iconRegister: IIconRegister)
  {
    super.registerBlockIcons(iconRegister)
    this.iconTop = iconRegister.registerIcon(this.getUnlocalizedName.replace("tile.", "") + "_top")
    this.iconBottom = iconRegister.registerIcon(this.getUnlocalizedName.replace("tile.", "") + "_bottom")
  }

  override def onEntityWalking(par1World: World, x: Int, y: Int, z: Int, par5Entity: Entity)
  {
    if (par5Entity.isInstanceOf[EntityLiving] && this.canWalkPoison)
    {
      PoisonRadiation.INSTANCE.poisonEntity(new Vector3(x, y, z), par5Entity.asInstanceOf[EntityLiving])
    }
  }

  override def quantityDropped(par1Random: Random): Int =
  {
    return 0
  }

  @SideOnly(Side.CLIENT) override def randomDisplayTick(world: World, x: Int, y: Int, z: Int, par5Random: Random)
  {
    if (this.spawnParticle)
    {
      if (Minecraft.getMinecraft.gameSettings.particleSetting == 0)
      {
        val radius: Int = 3
        for (i <- 0 to 2)
        {
          val pos: Vector3 = new Vector3(x, y, z)
          pos.add(Math.random * radius - radius / 2, Math.random * radius - radius / 2, Math.random * radius - radius / 2)
          val fx: EntitySmokeFX = new EntitySmokeFX(world, pos.x, pos.y, pos.z, (Math.random - 0.5) / 2, (Math.random - 0.5) / 2, (Math.random - 0.5) / 2)
          fx.setRBGColorF(0.2f, 0.8f, 0)
          Minecraft.getMinecraft.effectRenderer.addEffect(fx)

        }
      }
    }
  }

} 
Example 37
Source File: BlockToxicWaste.scala    From Electrodynamics   with GNU Lesser General Public License v3.0 5 votes vote down vote up
package com.calclavia.edx.quantum.blocks

import java.util.Random

import com.calclavia.edx.quantum.QuantumContent
import net.minecraft.block.material.Material
import net.minecraft.entity.{Entity, EntityLivingBase}
import net.minecraft.util.DamageSource
import net.minecraft.world.World
import net.minecraftforge.fluids.BlockFluidClassic
import resonantengine.lib.potion.PoisonRadiation
import resonantengine.lib.transform.vector.Vector3

class BlockToxicWaste extends BlockFluidClassic(QuantumContent.getFluidToxicWaste, Material.water)
{
  //Constructor
  setTickRate(20)

  override def randomDisplayTick(par1World: World, x: Int, y: Int, z: Int, par5Random: Random)
  {
    super.randomDisplayTick(par1World, x, y, z, par5Random)
    if (par5Random.nextInt(100) == 0)
    {
      val d5: Double = x + par5Random.nextFloat
      val d7: Double = y + this.maxY
      val d6: Double = z + par5Random.nextFloat
      par1World.spawnParticle("suspended", d5, d7, d6, 0.0D, 0.0D, 0.0D)
    }
    if (par5Random.nextInt(200) == 0)
    {
      par1World.playSound(x, y, z, "liquid.lava", 0.2F + par5Random.nextFloat * 0.2F, 0.9F + par5Random.nextFloat * 0.15F, false)
    }
  }

  override def onEntityCollidedWithBlock(par1World: World, x: Int, y: Int, z: Int, entity: Entity)
  {
    if (entity.isInstanceOf[EntityLivingBase])
    {
      entity.attackEntityFrom(DamageSource.wither, 3)
      PoisonRadiation.INSTANCE.poisonEntity(new Vector3(x, y, z), entity.asInstanceOf[EntityLivingBase], 4)
    }
  }
} 
Example 38
Source File: DirectDataInjector.scala    From SparkOnKudu   with Apache License 2.0 5 votes vote down vote up
package org.kududb.spark.demo.gamer.cdc

import java.text.SimpleDateFormat
import java.util.Random

import org.kududb.client.{PartialRow, Operation, KuduClient}
import org.kududb.spark.demo.gamer.aggregates.GamerDataGenerator


object DirectDataInjector {
  val simpleDateFormat = new SimpleDateFormat("MM,dd,yyyy")
  val random = new Random
  def main(args:Array[String]): Unit = {

    if (args.length == 0) {
      println("<kuduMaster> <tableName> <numberOfRecords>")
      return
    }

    val kuduMaster = args(0)
    val tableName = args(1)
    val numberOfRecords = args(2).toInt


    val kuduClient = new KuduClient.KuduClientBuilder(kuduMaster).build()
    val table = kuduClient.openTable(tableName)
    val session = kuduClient.newSession()

    for (i <- 0 to numberOfRecords) {
      val record = GamerDataGenerator.makeNewGamerRecord(100000)

      // Look up the current (active) row for this gamer: the key prefix is (gamer_id, eff_to = "")
      val pr = new PartialRow(table.getSchema)
      pr.addString(0, record.gamerId)
      pr.addString(1, "")
      val scannerRows = kuduClient.newScannerBuilder(table).lowerBound(pr).limit(1).build().nextRows()
      val op:Operation = if (scannerRows.hasNext) {
        val oldRow = scannerRows.next()

        val oldRecordUpdateOp = table.newInsert()

        val row = oldRecordUpdateOp.getRow
        row.addString("gamer_id", oldRow.getString("gamer_id"))
        row.addString("eff_to", simpleDateFormat.format(System.currentTimeMillis()))
        row.addString("eff_from", oldRow.getString("eff_from"))
        row.addLong("last_time_played", oldRow.getLong("last_time_played"))
        row.addInt("games_played", oldRow.getInt("games_played"))
        row.addInt("games_won", oldRow.getInt("games_won"))
        row.addInt("oks", oldRow.getInt("oks"))
        row.addInt("deaths", oldRow.getInt("deaths"))
        row.addInt("damage_given", oldRow.getInt("damage_given"))
        row.addInt("damage_taken", oldRow.getInt("damage_taken"))
        row.addInt("max_oks_in_one_game", oldRow.getInt("max_oks_in_one_game"))
        row.addInt("max_deaths_in_one_game", oldRow.getInt("max_deaths_in_one_game"))

        session.apply(oldRecordUpdateOp)
        table.newUpdate()
      } else {
        table.newInsert()
      }

      val row = op.getRow
      row.addString("gamer_id", record.gamerId)
      row.addString("eff_to", "")
      row.addString("eff_from", simpleDateFormat.format(System.currentTimeMillis()))
      row.addLong("last_time_played", record.lastTimePlayed)
      row.addInt("games_played", record.gamesPlayed)
      row.addInt("games_won", record.gamesWon)
      row.addInt("oks", record.oks)
      row.addInt("deaths", record.deaths)
      row.addInt("damage_given", record.damageGiven)
      row.addInt("damage_taken", record.damageTaken)
      row.addInt("max_oks_in_one_game", record.maxOksInOneGame)
      row.addInt("max_deaths_in_one_game", record.maxDeathsInOneGame)

      session.apply(op)
    }
    session.flush()

    kuduClient.close()


  }
} 
Example 39
Source File: DirectDataMultiThreadedInjector.scala    From SparkOnKudu   with Apache License 2.0 5 votes vote down vote up
package org.kududb.spark.demo.gamer.cdc

import java.text.SimpleDateFormat
import java.util.Random
import java.util.concurrent.atomic.AtomicInteger
import java.util.concurrent.{TimeUnit, Executors}

import org.kududb.client.{Operation, PartialRow, KuduClient}
import org.kududb.spark.demo.gamer.aggregates.GamerDataGenerator

object DirectDataMultiThreadedInjector {
  val simpleDateFormat = new SimpleDateFormat("MM,dd,yyyy")
  val random = new Random
  def main(args:Array[String]): Unit = {

    if (args.length == 0) {
      println("<kuduMaster> <tableName> <numberOfRecords> <numberOfThreads>")
      return
    }

    val kuduMaster = args(0)
    val tableName = args(1)
    val numberOfRecords = args(2).toInt
    val executor = Executors.newFixedThreadPool(args(3).toInt)
    val numberOfGamers = args(4).toInt
    val sleepTime = args(5).toInt

    val kuduClient = new KuduClient.KuduClientBuilder(kuduMaster).build()
    val leftToRun = new AtomicInteger()

    for (i <- 0 to numberOfRecords) {
      leftToRun.incrementAndGet()
      executor.execute(new ApplyNewRecordRunnable(GamerDataGenerator.makeNewGamerRecord(numberOfGamers),
      kuduClient, tableName, leftToRun))
      println("Summited:" + i)

      Thread.sleep(sleepTime)
    }

    // Shut down the pool so the awaitTermination loop below can return once the queued work completes
    executor.shutdown()

    val startTime = System.currentTimeMillis()
    while (!executor.awaitTermination(10000, TimeUnit.SECONDS)) {
      val newTime = System.currentTimeMillis()
      println("> Still Waiting: {Time:" + (newTime - startTime) + ", LeftToRun:" + leftToRun + "}" )
    }


    kuduClient.close()


  }
} 
Example 40
Source File: DirectDataInjector.scala    From SparkOnKudu   with Apache License 2.0 5 votes vote down vote up
package org.kududb.spark.demo.gamer.aggregates

import java.util.Random

import org.kududb.client.KuduClient

object DirectDataInjector {

  val random = new Random
  def main(args:Array[String]): Unit = {

    if (args.length == 0) {
      println("<kuduMaster> <tableName> <numberOfRecords>")
      return
    }

    val kuduMaster = args(0)
    val tableName = args(1)
    val numberOfRecords = args(2).toInt

    val kuduClient = new KuduClient.KuduClientBuilder(kuduMaster).build()
    val table = kuduClient.openTable(tableName)
    val session = kuduClient.newSession()

    table.newInsert()

    for (i <- 0 to numberOfRecords) {
      val record = GamerDataGenerator.makeNewGamerRecord(100000)
      val op = table.newInsert()

      val row = op.getRow
      row.addString("gamer_id", record.gamerId)
      row.addLong("last_time_played", record.lastTimePlayed)
      row.addInt("games_played", record.gamesPlayed)
      row.addInt("games_won", record.gamesWon)
      row.addInt("oks", record.oks)
      row.addInt("deaths", record.deaths)
      row.addInt("damage_given", record.damageGiven)
      row.addInt("damage_taken", record.damageTaken)
      row.addInt("max_oks_in_one_game", record.maxOksInOneGame)
      row.addInt("max_deaths_in_one_game", record.maxDeathsInOneGame)

      session.apply(op)
    }
    session.flush()

    kuduClient.close()


  }
} 
Example 41
Source File: GamerDataGenerator.scala    From SparkOnKudu   with Apache License 2.0 5 votes vote down vote up
package org.kududb.spark.demo.gamer.aggregates

import java.util.{Date, Random}

import org.kududb.spark.demo.gamer.GamerEvent

object GamerDataGenerator {

  val random = new Random()
  val averagePlayerPercentage = 40
  val advancedPlayerPercentage = 80
  val superStarPlayerPercentage = 100
  var date = System.currentTimeMillis()

  def makeNewGamerRecord(numOfGamers:Int): GamerEvent = {
    println("date" + new Date(date))
    date += 60000 * 60 * 6
    val playerSelection = random.nextInt(100)
    if (playerSelection < averagePlayerPercentage) {

      val gamerId = random.nextInt(numOfGamers/100) * 100 + playerSelection

      new GamerEvent(gamerId.toString,
        date,
        1,
        if (random.nextInt(10) > 7) 1 else 0,
        random.nextInt(10),
        random.nextInt(20),
        random.nextInt(1000),
        random.nextInt(2000))
    } else if (playerSelection < advancedPlayerPercentage) {
      val gamerId = random.nextInt(numOfGamers/100) * 100 + playerSelection

      new GamerEvent(gamerId.toString,
        date,
        1,
        if (random.nextInt(10) > 5) 1 else 0,
        random.nextInt(20),
        random.nextInt(18),
        random.nextInt(2000),
        random.nextInt(2000))
    } else {
      val gamerId = random.nextInt(numOfGamers/100) * 100 + playerSelection

      new GamerEvent(gamerId.toString,
        date,
        1,
        if (random.nextInt(10) > 3) 1 else 0,
        random.nextInt(20),
        random.nextInt(10),
        random.nextInt(4000),
        random.nextInt(1500))
    }
  }
} 
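A quick driver sketch for the generator above (hypothetical object name; the GamerEvent fields used here, gamerId, gamesWon, oks and deaths, are the ones read by the injector examples): pull a few records and tally how many report a win.

object GamerDataGeneratorSketch {
  def main(args: Array[String]): Unit = {
    val records = (0 until 10).map(_ => GamerDataGenerator.makeNewGamerRecord(100000))
    val wins = records.count(_.gamesWon > 0)
    records.foreach(r => println(r.gamerId + " oks=" + r.oks + " deaths=" + r.deaths))
    println(s"$wins of ${records.size} sampled records recorded a win")
  }
}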
Example 42
Source File: AddSingleRecord.scala    From SparkOnKudu   with Apache License 2.0 5 votes vote down vote up
package org.kududb.spark.demo.basic

import java.util.Random

import org.kududb.client.{PartialRow, KuduClient}

object AddSingleRecord {
  def main(args:Array[String]): Unit = {
    if (args.length == 0) {
      println("<kuduMaster> <tableName> <rowKey>")
      return
    }

    val kuduMaster = args(0)
    val tableName = args(1)
    val rowKey = args(2)

    val kuduClient = new KuduClient.KuduClientBuilder(kuduMaster).build()
    val table = kuduClient.openTable(tableName)
    val session = kuduClient.newSession()

    val lowerBound = new PartialRow(table.getSchema)
    lowerBound.addString(0, rowKey)
    val upperBound = new PartialRow(table.getSchema)
    upperBound.addString(0, rowKey + "_")

    var startTime = System.currentTimeMillis()
    val random = new Random()

    startTime = System.currentTimeMillis()
    val update = table.newInsert()
    val row = update.getRow
    row.addString(0, rowKey)
    val columns = table.getSchema.getColumns
    for (c <- 1 until columns.size()) {
      println(columns.get(c).getName + " " + columns.get(c).getType)
      row.addInt(columns.get(c).getName, random.nextInt(100000))
    }
    session.apply(update)
    println("new key: " + rowKey)
    println(" new key time spent: " + (System.currentTimeMillis() - startTime))

    startTime = System.currentTimeMillis()
    val scanner2 = kuduClient.newScannerBuilder(table).lowerBound(lowerBound).exclusiveUpperBound(upperBound).build()

    while (scanner2.hasMoreRows) {
      val rows = scanner2.nextRows()
      while (rows.hasNext) {
        val row = rows.next()
        println("NewValue: " + rowKey + " " + row.rowToString())
      }
    }
    scanner2.close()
    println(" scan time spent: " + (System.currentTimeMillis() - startTime))

    val scannerX = kuduClient.newScannerBuilder(table).build()
    while (scannerX.hasMoreRows) {
      val rows = scannerX.nextRows()
      while (rows.hasNext) {
        val row = rows.next()
        println("Full Scan: " + row.rowToString())
      }
    }
    println("done")
    kuduClient.shutdown()
  }
} 
Example 43
Source File: NameGenerator.scala    From SparkOnKudu   with Apache License 2.0 5 votes vote down vote up
package org.kududb.spark.demo.basic

import java.util.Random

import scala.collection.mutable

object NameGenerator {

  val random = new Random()
  val listOfNames = new mutable.MutableList[NameAndCounter]
  listOfNames += new NameAndCounter("Katlyn")
  listOfNames += new NameAndCounter("Laurena")
  listOfNames += new NameAndCounter("Jenise")
  listOfNames += new NameAndCounter("Vida")
  listOfNames += new NameAndCounter("Delphine")
  listOfNames += new NameAndCounter("Tiffanie")
  listOfNames += new NameAndCounter("Carroll")
  listOfNames += new NameAndCounter("Steve")
  listOfNames += new NameAndCounter("Nu")
  listOfNames += new NameAndCounter("Robbin")
  listOfNames += new NameAndCounter("Mahalia")
  listOfNames += new NameAndCounter("Norah")
  listOfNames += new NameAndCounter("Selina")
  listOfNames += new NameAndCounter("Cornelius")
  listOfNames += new NameAndCounter("Bennie")
  listOfNames += new NameAndCounter("Kemberly")
  listOfNames += new NameAndCounter("Johnie")
  listOfNames += new NameAndCounter("Jenee")
  listOfNames += new NameAndCounter("Napoleon")
  listOfNames += new NameAndCounter("Brenton")
  listOfNames += new NameAndCounter("Roxana")
  listOfNames += new NameAndCounter("Kalyn")
  listOfNames += new NameAndCounter("Jeana")
  listOfNames += new NameAndCounter("Tennie")
  listOfNames += new NameAndCounter("Tasia")
  listOfNames += new NameAndCounter("Ashely")
  listOfNames += new NameAndCounter("Hester")
  listOfNames += new NameAndCounter("Zita")
  listOfNames += new NameAndCounter("Evalyn")
  listOfNames += new NameAndCounter("Anderson")
  listOfNames += new NameAndCounter("Elaina")
  listOfNames += new NameAndCounter("Benny")
  listOfNames += new NameAndCounter("Heidi")
  listOfNames += new NameAndCounter("Mammie")
  listOfNames += new NameAndCounter("Alisa")
  listOfNames += new NameAndCounter("Billie")
  listOfNames += new NameAndCounter("Wan")
  listOfNames += new NameAndCounter("Dionna")
  listOfNames += new NameAndCounter("Julene")
  listOfNames += new NameAndCounter("Chasidy")
  listOfNames += new NameAndCounter("Vennie")
  listOfNames += new NameAndCounter("Cara")
  listOfNames += new NameAndCounter("Charissa")
  listOfNames += new NameAndCounter("Russell")
  listOfNames += new NameAndCounter("Daniela")
  listOfNames += new NameAndCounter("Kindra")
  listOfNames += new NameAndCounter("Eduardo")
  listOfNames += new NameAndCounter("Marci")
  listOfNames += new NameAndCounter("Gustavo")
  listOfNames += new NameAndCounter("Dianna	")

  def getName(): String = {
    val nameAndCounter = listOfNames.get(random.nextInt(listOfNames.length)).get
    nameAndCounter.counter += 1
    nameAndCounter.name + "_" + nameAndCounter.counter
  }
}

class NameAndCounter(val name:String = "N/A", var counter:Int = 0) {

} 
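A tiny usage sketch (hypothetical object name; only the NameGenerator object above is assumed). Each call bumps a per-name counter, so repeated draws of the same base name stay unique.

object NameGeneratorSketch {
  def main(args: Array[String]): Unit = {
    // Prints names such as "Steve_1", "Vida_1", "Steve_2", ...
    for (_ <- 0 until 5) println(NameGenerator.getName())
  }
}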
Example 44
Source File: BasicExample.scala    From SparkOnKudu   with Apache License 2.0 5 votes vote down vote up
package org.kududb.spark.demo.basic

import java.util
import java.util.Random

import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
import org.kududb.ColumnSchema.ColumnSchemaBuilder
import org.kududb.client.KuduClient
import org.kududb.{ColumnSchema, Schema, Type}

object BasicExample {
  def main(args: Array[String]): Unit = {

    val kuduMaster = "quickstart.cloudera"

    println(" -- Starting ")
    val kuduClient = new KuduClient.KuduClientBuilder(kuduMaster).build()
    try {
      println(" -- ")

      val columnList = new util.ArrayList[ColumnSchema]()
      columnList.add(new ColumnSchemaBuilder("KEY_ID", Type.STRING).key(true).build())
      columnList.add(new ColumnSchemaBuilder("COL_A", Type.STRING).key(false).build())
      columnList.add(new ColumnSchemaBuilder("COL_B", Type.STRING).key(false).build())
      columnList.add(new ColumnSchemaBuilder("COL_C", Type.STRING).key(false).build())
      val schema = new Schema(columnList)

      if (kuduClient.tableExists("foobar")) {
        kuduClient.deleteTable("foobar")
      }
      kuduClient.createTable("foobar", schema)

      val session = kuduClient.newSession()
      val table = kuduClient.openTable("foobar")

      try {
        val random = new Random()
        for (i <- 0 until 10) {
          val insert = table.newInsert()
          val row = insert.getRow()
          row.addString(0, i.toString)
          row.addString(1, "value " + i)
          row.addString(2, "42:" + i)
          row.addString(3, "Cat" + random.nextGaussian())
          session.apply(insert)
        }
        session.flush()
      } finally {
        session.close()
      }

      val tableList = kuduClient.getTablesList.getTablesList
      for (i <- 0 until tableList.size()) {
        println("Table " + i + ":" + tableList.get(i))
      }

      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      val sc = new SparkContext("local[2]", "SparkSQL on Kudu", sparkConfig)

      val sqlContext = new SQLContext(sc)

      val df = sqlContext.load("org.kududb.spark",
        Map("kudu.table" -> "foobar", "kudu.master" -> kuduMaster))

      df.registerTempTable("foobar")

      sqlContext.sql("SELECT * FROM foobar").foreach(r => {
        println("Row: " + r)
      })
    } finally {
      kuduClient.shutdown()
    }
    println("-- finished")
  }
} 
Example 45
Source File: InitialDataPopulation.scala    From SparkOnKudu   with Apache License 2.0 5 votes vote down vote up
package org.kududb.spark.demo.basic

import java.util
import java.util.Random

import org.kududb.{Schema, Type, ColumnSchema}
import org.kududb.ColumnSchema.ColumnSchemaBuilder
import org.kududb.client.{AsyncKuduClient, KuduClient}


object InitialDataPopulation {
  def main(args:Array[String]): Unit = {
    if (args.length == 0) {
      println("<kuduMaster> <TableName> <numberOfColumns> <numberOfRows>")

      //"quickstart.cloudera"

      return
    }
    val kuduMaster = args(0)
    val tableName = args(1)
    val numOfColumns = args(2).toInt
    val numOfRows = args(3).toInt

    val kuduClient = new AsyncKuduClient.AsyncKuduClientBuilder(kuduMaster).build()
    try {
      //Delete table if exist
      if (kuduClient.tableExists(tableName).join()) {
        kuduClient.deleteTable(tableName).join()
      }

      //Create Schema
      val columnList = new util.ArrayList[ColumnSchema]()
      columnList.add(new ColumnSchemaBuilder("key_id", Type.STRING).key(true).build())
      for (c <- 0 until numOfColumns) {
        columnList.add(new ColumnSchemaBuilder("col_" + c, Type.INT32).key(false).build())
      }
      val schema = new Schema(columnList)

      //Create table
      kuduClient.createTable(tableName, schema).join()

      //Populate table
      val random = new Random
      val table = kuduClient.openTable(tableName).join()
      val asyncSession = kuduClient.newSession()

      for (r <- 0 until numOfRows) {
        val insert = table.newInsert()
        val row = insert.getRow()
        row.addString(0, NameGenerator.getName())
        val columns = table.getSchema.getColumns
        for (c <- 1 until columns.size()) {
          row.addInt(columns.get(c).getName, random.nextInt(100000))
        }
        asyncSession.apply(insert)

        if (r % 1000 == 0) {
          println("Inserted: " + r)
        }
      }
      asyncSession.flush()

      val scannerX = kuduClient.newScannerBuilder(table).build()
      while (scannerX.hasMoreRows) {
        val rows = scannerX.nextRows().join()
        while (rows.hasNext) {
          val row = rows.next()
          println(" - " + row.rowToString())
        }
      }

      asyncSession.close()

    } finally {
      kuduClient.shutdown()
    }
  }
} 
Example 46
Source File: SOLStreamProducer.scala    From incubator-retired-gearpump   with Apache License 2.0 5 votes vote down vote up
package org.apache.gearpump.streaming.examples.sol

import java.time.Instant
import java.util.Random

import org.apache.gearpump.Message
import org.apache.gearpump.cluster.UserConfig
import org.apache.gearpump.streaming.examples.sol.SOLStreamProducer._
import org.apache.gearpump.streaming.source.Watermark
import org.apache.gearpump.streaming.task.{Task, TaskContext}

class SOLStreamProducer(taskContext: TaskContext, conf: UserConfig)
  extends Task(taskContext, conf) {

  import taskContext.output

  private val sizeInBytes = conf.getInt(SOLStreamProducer.BYTES_PER_MESSAGE)
    .getOrElse(DEFAULT_MESSAGE_SIZE)
  private var messages: Array[String] = null
  private var rand: Random = null
  private var messageCount: Long = 0

  override def onStart(startTime: Instant): Unit = {
    prepareRandomMessage
    self ! Watermark(Instant.now)
  }

  private def prepareRandomMessage = {
    rand = new Random()
    val differentMessages = 100
    messages = new Array(differentMessages)

    0.until(differentMessages).map { index =>
      val sb = new StringBuilder(sizeInBytes)
      // Even though Java encodes strings in UCS-2 internally, the serialized version sent
      // downstream is UTF-8, so each digit should be a single byte
      0.until(sizeInBytes).foldLeft(sb) { (sb, j) =>
        sb.append(rand.nextInt(9))
      }
      messages(index) = sb.toString()
    }
  }

  override def onNext(msg: Message): Unit = {
    val message = messages(rand.nextInt(messages.length))
    output(Message(message, System.currentTimeMillis()))
    messageCount = messageCount + 1L
    self ! Watermark(Instant.now)
  }

}

object SOLStreamProducer {
  val DEFAULT_MESSAGE_SIZE = 100 // Bytes
  val BYTES_PER_MESSAGE = "bytesPerMessage"
} 
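The message pool above is just a fixed set of random digit strings. Here is a standalone sketch of that step (hypothetical names; only java.util.Random is used), handy for checking that every message has the requested byte size.

import java.util.Random

object MessagePoolSketch {
  // Pre-build `differentMessages` random digit strings of `sizeInBytes` characters,
  // mirroring prepareRandomMessage above (decimal digits are single-byte in UTF-8).
  def buildMessages(sizeInBytes: Int, differentMessages: Int, rand: Random): Array[String] =
    Array.fill(differentMessages) {
      val sb = new StringBuilder(sizeInBytes)
      (0 until sizeInBytes).foreach(_ => sb.append(rand.nextInt(9)))
      sb.toString()
    }

  def main(args: Array[String]): Unit = {
    val messages = buildMessages(sizeInBytes = 100, differentMessages = 100, rand = new Random(7))
    println(messages.forall(_.getBytes("UTF-8").length == 100)) // true
  }
}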
Example 47
Source File: ScalaClientTestUtils.scala    From incubator-livy   with Apache License 2.0 5 votes vote down vote up
package org.apache.livy.scalaapi

import java.util.Random
import java.util.concurrent.{CountDownLatch, TimeUnit}

import scala.collection.mutable.ArrayBuffer
import scala.concurrent.{Await, Future}
import scala.concurrent.duration._

import org.scalatest.FunSuite

import org.apache.livy.LivyBaseUnitTestSuite

object ScalaClientTestUtils extends FunSuite with LivyBaseUnitTestSuite {

  val Timeout = 40

  def helloJob(context: ScalaJobContext): String = "hello"

  def throwExceptionJob(context: ScalaJobContext): Unit = throw new CustomTestFailureException

  def simpleSparkJob(context: ScalaJobContext): Long = {
    val r = new Random
    val count = 5
    val partitions = Math.min(r.nextInt(10) + 1, count)
    val buffer = new ArrayBuffer[Int]()
    for (a <- 1 to count) {
      buffer += r.nextInt()
    }
    context.sc.parallelize(buffer, partitions).count()
  }

  def assertAwait(lock: CountDownLatch): Unit = {
    assert(lock.await(Timeout, TimeUnit.SECONDS) == true)
  }

  def assertTestPassed[T](future: Future[T], expectedValue: T): Unit = {
    val result = Await.result(future, Timeout second)
    assert(result === expectedValue)
  }
} 
Example 48
Source File: SimpleSkewedGroupByTest.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.sql.SparkSession


object SimpleSkewedGroupByTest {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("SimpleSkewedGroupByTest")
      .getOrCreate()

    val numMappers = if (args.length > 0) args(0).toInt else 2
    val numKVPairs = if (args.length > 1) args(1).toInt else 1000
    val valSize = if (args.length > 2) args(2).toInt else 1000
    val numReducers = if (args.length > 3) args(3).toInt else numMappers
    val ratio = if (args.length > 4) args(4).toInt else 5.0

    val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random
      val result = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        ranGen.nextBytes(byteArr)
        val offset = ranGen.nextInt(1000) * numReducers
        if (ranGen.nextDouble < ratio / (numReducers + ratio - 1)) {
          // give ratio times higher chance of generating key 0 (for reducer 0)
          result(i) = (offset, byteArr)
        } else {
          // generate a key for one of the other reducers
          val key = 1 + ranGen.nextInt(numReducers-1) + offset
          result(i) = (key, byteArr)
        }
      }
      result
    }.cache
    // Enforce that everything has been calculated and in cache
    pairs1.count

    println("RESULT: " + pairs1.groupByKey(numReducers).count)
    // Print how many keys each reducer got (for debugging)
    // println("RESULT: " + pairs1.groupByKey(numReducers)
    //                           .map{case (k,v) => (k, v.size)}
    //                           .collectAsMap)

    spark.stop()
  }
}
// scalastyle:on println 
Example 49
Source File: SkewedGroupByTest.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.sql.SparkSession


object SkewedGroupByTest {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("GroupBy Test")
      .getOrCreate()

    val numMappers = if (args.length > 0) args(0).toInt else 2
    var numKVPairs = if (args.length > 1) args(1).toInt else 1000
    val valSize = if (args.length > 2) args(2).toInt else 1000
    val numReducers = if (args.length > 3) args(3).toInt else numMappers

    val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random

      // map output sizes linearly increase from the 1st to the last
      numKVPairs = (1.0 * (p + 1) / numMappers * numKVPairs).toInt

      val arr1 = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        ranGen.nextBytes(byteArr)
        arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
      }
      arr1
    }.cache()
    // Enforce that everything has been calculated and in cache
    pairs1.count()

    println(pairs1.groupByKey(numReducers).count())

    spark.stop()
  }
}
// scalastyle:on println 
Example 50
Source File: SparkHdfsLR.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{DenseVector, Vector}

import org.apache.spark.sql.SparkSession


object SparkHdfsLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def parsePoint(line: String): DataPoint = {
    val tok = new java.util.StringTokenizer(line, " ")
    var y = tok.nextToken.toDouble
    var x = new Array[Double](D)
    var i = 0
    while (i < D) {
      x(i) = tok.nextToken.toDouble; i += 1
    }
    DataPoint(new DenseVector(x), y)
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use org.apache.spark.ml.classification.LogisticRegression
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    if (args.length < 2) {
      System.err.println("Usage: SparkHdfsLR <file> <iters>")
      System.exit(1)
    }

    showWarning()

    val spark = SparkSession
      .builder
      .appName("SparkHdfsLR")
      .getOrCreate()

    val inputPath = args(0)
    val lines = spark.read.textFile(inputPath).rdd

    val points = lines.map(parsePoint).cache()
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D) {2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = points.map { p =>
        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
      }.reduce(_ + _)
      w -= gradient
    }

    println("Final w: " + w)
    spark.stop()
  }
}
// scalastyle:on println 
Example 51
Source File: LocalLR.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import breeze.linalg.{DenseVector, Vector}


object LocalLR {
  val N = 10000  // Number of data points
  val D = 10   // Number of dimensions
  val R = 0.7  // Scaling factor
  val ITERATIONS = 5
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def generateData: Array[DataPoint] = {
    def generatePoint(i: Int): DataPoint = {
      val y = if (i % 2 == 0) -1 else 1
      val x = DenseVector.fill(D) {rand.nextGaussian + y * R}
      DataPoint(x, y)
    }
    Array.tabulate(N)(generatePoint)
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use org.apache.spark.ml.classification.LogisticRegression
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    val data = generateData
    // Initialize w to a random value
    var w = DenseVector.fill(D) {2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      var gradient = DenseVector.zeros[Double](D)
      for (p <- data) {
        val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y
        gradient +=  p.x * scale
      }
      w -= gradient
    }

    println("Final w: " + w)
  }
}
// scalastyle:on println 
Example 52
Source File: GroupByTest.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.sql.SparkSession


object GroupByTest {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("GroupBy Test")
      .getOrCreate()

    val numMappers = if (args.length > 0) args(0).toInt else 2
    val numKVPairs = if (args.length > 1) args(1).toInt else 1000
    val valSize = if (args.length > 2) args(2).toInt else 1000
    val numReducers = if (args.length > 3) args(3).toInt else numMappers

    val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random
      val arr1 = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        ranGen.nextBytes(byteArr)
        arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
      }
      arr1
    }.cache()
    // Enforce that everything has been calculated and in cache
    pairs1.count()

    println(pairs1.groupByKey(numReducers).count())

    spark.stop()
  }
}
// scalastyle:on println 
Example 53
Source File: LocalFileLR.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import breeze.linalg.{DenseVector, Vector}


object LocalFileLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def parsePoint(line: String): DataPoint = {
    val nums = line.split(' ').map(_.toDouble)
    DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0))
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use org.apache.spark.ml.classification.LogisticRegression
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    val lines = scala.io.Source.fromFile(args(0)).getLines().toArray
    val points = lines.map(parsePoint _)
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D) {2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      var gradient = DenseVector.zeros[Double](D)
      for (p <- points) {
        val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y
        gradient += p.x * scale
      }
      w -= gradient
    }

    println("Final w: " + w)
  }
}
// scalastyle:on println 
Example 54
Source File: PageViewGenerator.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.streaming.clickstream

import java.io.PrintWriter
import java.net.ServerSocket
import java.util.Random


// scalastyle:on
object PageViewGenerator {
  val pages = Map("http://foo.com/" -> .7,
                  "http://foo.com/news" -> 0.2,
                  "http://foo.com/contact" -> .1)
  val httpStatus = Map(200 -> .95,
                       404 -> .05)
  val userZipCode = Map(94709 -> .5,
                        94117 -> .5)
  val userID = Map((1 to 100).map(_ -> .01): _*)

  def pickFromDistribution[T](inputMap: Map[T, Double]): T = {
    val rand = new Random().nextDouble()
    var total = 0.0
    for ((item, prob) <- inputMap) {
      total = total + prob
      if (total > rand) {
        return item
      }
    }
    inputMap.take(1).head._1 // Shouldn't get here if probabilities add up to 1.0
  }

  def getNextClickEvent(): String = {
    val id = pickFromDistribution(userID)
    val page = pickFromDistribution(pages)
    val status = pickFromDistribution(httpStatus)
    val zipCode = pickFromDistribution(userZipCode)
    new PageView(page, status, zipCode, id).toString()
  }

  def main(args: Array[String]) {
    if (args.length != 2) {
      System.err.println("Usage: PageViewGenerator <port> <viewsPerSecond>")
      System.exit(1)
    }
    val port = args(0).toInt
    val viewsPerSecond = args(1).toFloat
    val sleepDelayMs = (1000.0 / viewsPerSecond).toInt
    val listener = new ServerSocket(port)
    println("Listening on port: " + port)

    while (true) {
      val socket = listener.accept()
      new Thread() {
        override def run(): Unit = {
          println("Got client connected from: " + socket.getInetAddress)
          val out = new PrintWriter(socket.getOutputStream(), true)

          while (true) {
            Thread.sleep(sleepDelayMs)
            out.write(getNextClickEvent())
            out.flush()
          }
          socket.close()
        }
      }.start()
    }
  }
}
// scalastyle:on println 
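
PageViewGenerator pushes one serialized event per connection every `sleepDelayMs`, so any line-oriented TCP client can consume it. A minimal reader sketch follows; the default port 44444 is an assumption, and it presumes each serialized PageView ends with a newline:

import java.io.{BufferedReader, InputStreamReader}
import java.net.Socket

object PageViewClient {
  def main(args: Array[String]): Unit = {
    val port = if (args.length > 0) args(0).toInt else 44444 // assumed default
    val socket = new Socket("localhost", port)
    val in = new BufferedReader(new InputStreamReader(socket.getInputStream))
    var line = in.readLine()
    while (line != null) {
      println("Received page view: " + line)
      line = in.readLine()
    }
    socket.close()
  }
}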
Example 55
Source File: SparkLR.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{DenseVector, Vector}

import org.apache.spark.sql.SparkSession


object SparkLR {
  val N = 10000  // Number of data points
  val D = 10   // Number of dimensions
  val R = 0.7  // Scaling factor
  val ITERATIONS = 5
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def generateData: Array[DataPoint] = {
    def generatePoint(i: Int): DataPoint = {
      val y = if (i % 2 == 0) -1 else 1
      val x = DenseVector.fill(D) {rand.nextGaussian + y * R}
      DataPoint(x, y)
    }
    Array.tabulate(N)(generatePoint)
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use org.apache.spark.ml.classification.LogisticRegression
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    val spark = SparkSession
      .builder
      .appName("SparkLR")
      .getOrCreate()

    val numSlices = if (args.length > 0) args(0).toInt else 2
    val points = spark.sparkContext.parallelize(generateData, numSlices).cache()

    // Initialize w to a random value
    var w = DenseVector.fill(D) {2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = points.map { p =>
        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
      }.reduce(_ + _)
      w -= gradient
    }

    println("Final w: " + w)

    spark.stop()
  }
}
// scalastyle:on println 
Example 56
Source File: LocalKMeans.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.collection.mutable.HashMap
import scala.collection.mutable.HashSet

import breeze.linalg.{squaredDistance, DenseVector, Vector}


object LocalKMeans {
  val N = 1000
  val R = 1000    // Scaling factor
  val D = 10
  val K = 10
  val convergeDist = 0.001
  val rand = new Random(42)

  def generateData: Array[DenseVector[Double]] = {
    def generatePoint(i: Int): DenseVector[Double] = {
      DenseVector.fill(D) {rand.nextDouble * R}
    }
    Array.tabulate(N)(generatePoint)
  }

  def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = {
    var index = 0
    var bestIndex = 0
    var closest = Double.PositiveInfinity

    for (i <- 1 to centers.size) {
      val vCurr = centers.get(i).get
      val tempDist = squaredDistance(p, vCurr)
      if (tempDist < closest) {
        closest = tempDist
        bestIndex = i
      }
    }

    bestIndex
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of KMeans Clustering and is given as an example!
        |Please use org.apache.spark.ml.clustering.KMeans
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    val data = generateData
    var points = new HashSet[Vector[Double]]
    var kPoints = new HashMap[Int, Vector[Double]]
    var tempDist = 1.0

    while (points.size < K) {
      points.add(data(rand.nextInt(N)))
    }

    val iter = points.iterator
    for (i <- 1 to points.size) {
      kPoints.put(i, iter.next())
    }

    println("Initial centers: " + kPoints)

    while(tempDist > convergeDist) {
      var closest = data.map (p => (closestPoint(p, kPoints), (p, 1)))

      var mappings = closest.groupBy[Int] (x => x._1)

      var pointStats = mappings.map { pair =>
        pair._2.reduceLeft [(Int, (Vector[Double], Int))] {
          case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2))
        }
      }

      var newPoints = pointStats.map {mapping =>
        (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))}

      tempDist = 0.0
      for (mapping <- newPoints) {
        tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2)
      }

      for (newP <- newPoints) {
        kPoints.put(newP._1, newP._2)
      }
    }

    println("Final centers: " + kPoints)
  }
}
// scalastyle:on println 
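
`closestPoint` scans the 1-to-K indexed `kPoints` map and returns the key of the nearest center by squared Euclidean distance. A tiny standalone check of that idea, with two made-up centers, might look like this:

import breeze.linalg.{squaredDistance, DenseVector, Vector}

import scala.collection.mutable.HashMap

object ClosestPointCheck {
  def main(args: Array[String]): Unit = {
    val centers = HashMap[Int, Vector[Double]](
      1 -> DenseVector(0.0, 0.0),
      2 -> DenseVector(10.0, 10.0))
    val p: Vector[Double] = DenseVector(9.0, 8.0)
    // Equivalent to LocalKMeans.closestPoint: pick the index of the nearest center.
    val best = centers.minBy { case (_, c) => squaredDistance(p, c) }._1
    println(s"Closest center index: $best") // expected: 2
  }
}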
Example 57
Source File: StopwatchSuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.util

import java.util.Random

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class StopwatchSuite extends SparkFunSuite with MLlibTestSparkContext {

  import StopwatchSuite._

  private def testStopwatchOnDriver(sw: Stopwatch): Unit = {
    assert(sw.name === "sw")
    assert(sw.elapsed() === 0L)
    assert(!sw.isRunning)
    intercept[AssertionError] {
      sw.stop()
    }
    val duration = checkStopwatch(sw)
    val elapsed = sw.elapsed()
    assert(elapsed === duration)
    val duration2 = checkStopwatch(sw)
    val elapsed2 = sw.elapsed()
    assert(elapsed2 === duration + duration2)
    assert(sw.toString === s"sw: ${elapsed2}ms")
    sw.start()
    assert(sw.isRunning)
    intercept[AssertionError] {
      sw.start()
    }
  }

  test("LocalStopwatch") {
    val sw = new LocalStopwatch("sw")
    testStopwatchOnDriver(sw)
  }

  test("DistributedStopwatch on driver") {
    val sw = new DistributedStopwatch(sc, "sw")
    testStopwatchOnDriver(sw)
  }

  test("DistributedStopwatch on executors") {
    val sw = new DistributedStopwatch(sc, "sw")
    val rdd = sc.parallelize(0 until 4, 4)
    val acc = sc.longAccumulator
    rdd.foreach { i =>
      acc.add(checkStopwatch(sw))
    }
    assert(!sw.isRunning)
    val elapsed = sw.elapsed()
    assert(elapsed === acc.value)
  }

  test("MultiStopwatch") {
    val sw = new MultiStopwatch(sc)
      .addLocal("local")
      .addDistributed("spark")
    assert(sw("local").name === "local")
    assert(sw("spark").name === "spark")
    intercept[NoSuchElementException] {
      sw("some")
    }
    assert(sw.toString === "{\n  local: 0ms,\n  spark: 0ms\n}")
    val localDuration = checkStopwatch(sw("local"))
    val sparkDuration = checkStopwatch(sw("spark"))
    val localElapsed = sw("local").elapsed()
    val sparkElapsed = sw("spark").elapsed()
    assert(localElapsed === localDuration)
    assert(sparkElapsed === sparkDuration)
    assert(sw.toString ===
      s"{\n  local: ${localElapsed}ms,\n  spark: ${sparkElapsed}ms\n}")
    val rdd = sc.parallelize(0 until 4, 4)
    val acc = sc.longAccumulator
    rdd.foreach { i =>
      sw("local").start()
      val duration = checkStopwatch(sw("spark"))
      sw("local").stop()
      acc.add(duration)
    }
    val localElapsed2 = sw("local").elapsed()
    assert(localElapsed2 === localElapsed)
    val sparkElapsed2 = sw("spark").elapsed()
    assert(sparkElapsed2 === sparkElapsed + acc.value)
  }
}

private object StopwatchSuite extends SparkFunSuite {

  
  private def now: Long = System.currentTimeMillis()
} 
Example 58
Source File: PartitionwiseSampledRDD.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.random.RandomSampler
import org.apache.spark.util.Utils

private[spark]
class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}


private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[T],
    sampler: RandomSampler[T, U],
    preservesPartitioning: Boolean,
    @transient private val seed: Long = Utils.random.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong()))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.setSeed(split.seed)
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
  }
} 
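
PartitionwiseSampledRDD is `private[spark]`, so it is not constructed directly; the user-facing path is `RDD.sample`, which (in this Spark lineage) wires up a per-partition sampler and derived seeds much like the code above. A hedged usage sketch:

import org.apache.spark.sql.SparkSession

object SampleUsage {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("SampleUsage").master("local[*]").getOrCreate()
    val rdd = spark.sparkContext.parallelize(1 to 1000, 4)
    // Roughly 10% of the elements, sampled independently within each partition, fixed seed.
    val sampled = rdd.sample(withReplacement = false, fraction = 0.1, seed = 42L)
    println("Sampled count: " + sampled.count())
    spark.stop()
  }
}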
Example 59
Source File: SampledRDD.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.PoissonDistribution

import org.apache.spark.{Partition, TaskContext}

@deprecated("Replaced by PartitionwiseSampledRDDPartition", "1.0.0")
private[spark]
class SampledRDDPartition(val prev: Partition, val seed: Int) extends Partition with Serializable {
  override val index: Int = prev.index
}

@deprecated("Replaced by PartitionwiseSampledRDD", "1.0.0")
private[spark] class SampledRDD[T: ClassTag](
    prev: RDD[T],
    withReplacement: Boolean,
    frac: Double,
    seed: Int)
  extends RDD[T](prev) {

  override def getPartitions: Array[Partition] = {
    val rg = new Random(seed)
    firstParent[T].partitions.map(x => new SampledRDDPartition(x, rg.nextInt))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[SampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[T] = {
    val split = splitIn.asInstanceOf[SampledRDDPartition]
    if (withReplacement) {
      // For large datasets, the expected number of occurrences of each element in a sample with
      // replacement is Poisson(frac). We use that to get a count for each element.
      val poisson = new PoissonDistribution(frac)
      poisson.reseedRandomGenerator(split.seed)

      firstParent[T].iterator(split.prev, context).flatMap { element =>
        val count = poisson.sample()
        if (count == 0) {
          Iterator.empty  // Avoid object allocation when we return 0 items, which is quite often
        } else {
          Iterator.fill(count)(element)
        }
      }
    } else { // Sampling without replacement
      val rand = new Random(split.seed)
      firstParent[T].iterator(split.prev, context).filter(x => (rand.nextDouble <= frac))
    }
  }
} 
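
The with-replacement branch above uses the fact that, for a sample of fraction `frac` drawn with replacement, the number of occurrences of each element is approximately Poisson(frac). A self-contained sketch of that trick with commons-math3, outside Spark:

import java.util.Random

import org.apache.commons.math3.distribution.PoissonDistribution

object PoissonSampleSketch {
  def main(args: Array[String]): Unit = {
    val frac = 0.3
    val poisson = new PoissonDistribution(frac)
    poisson.reseedRandomGenerator(new Random(42).nextLong())
    // Each element is emitted poisson.sample() times, mimicking sampling with replacement.
    val sampled = (1 to 20).flatMap(x => Seq.fill(poisson.sample())(x))
    println(sampled.mkString(", "))
  }
}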
Example 60
Source File: PartitionwiseSampledRDD.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.random.RandomSampler
import org.apache.spark.util.Utils

private[spark]
class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}


private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[T],
    sampler: RandomSampler[T, U],
    @transient preservesPartitioning: Boolean,
    @transient seed: Long = Utils.random.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong()))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.setSeed(split.seed)
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
  }
} 
Example 61
Source File: RandomProjection.scala    From spark-neighbors   with MIT License 5 votes vote down vote up
package com.github.karlhigley.spark.neighbors.linalg

import java.util.Random

import breeze.stats.distributions.CauchyDistribution
import org.apache.spark.mllib.linalg.{ DenseMatrix, Matrices }
import org.apache.spark.mllib.linalg.{ DenseVector, Vector }

object RandomProjection {
  def generateGaussian(originalDim: Int, projectedDim: Int, random: Random): RandomProjection = {
    val localMatrix = DenseMatrix.randn(projectedDim, originalDim, random)
    new RandomProjection(localMatrix)
  }

  def generateCauchy(originalDim: Int, projectedDim: Int, random: Random): RandomProjection = {
    def randc(numRows: Int, numCols: Int): DenseMatrix = {
      require(
        numRows.toLong * numCols <= Int.MaxValue,
        s"$numRows x $numCols dense matrix is too large to allocate"
      )
      val cauchyDistribution = new CauchyDistribution(0, 1)
      new DenseMatrix(numRows, numCols, cauchyDistribution.drawMany(numRows * numCols))
    }

    val localMatrix = randc(projectedDim, originalDim)
    new RandomProjection(localMatrix)
  }
} 
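
Only the factory methods survive in the listing above; the RandomProjection class that holds the matrix is omitted. Assuming the projection itself is just a matrix-vector product, as the Gaussian construction suggests, a hedged sketch of generating and applying such a matrix with Spark's mllib linear algebra types:

import java.util.Random

import org.apache.spark.mllib.linalg.{DenseMatrix, DenseVector}

object RandomProjectionSketch {
  def main(args: Array[String]): Unit = {
    val originalDim = 8
    val projectedDim = 3
    val random = new Random(42)
    // Same construction as generateGaussian above: a (projectedDim x originalDim) Gaussian matrix.
    val matrix = DenseMatrix.randn(projectedDim, originalDim, random)
    val vector = new DenseVector(Array.fill(originalDim)(random.nextGaussian()))
    // Assumed projection step: multiply the matrix by the input vector.
    val projected = matrix.multiply(vector)
    println(projected)
  }
}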
Example 62
Source File: BitSamplingFunction.scala    From spark-neighbors   with MIT License 5 votes vote down vote up
package com.github.karlhigley.spark.neighbors.lsh

import java.util.Random
import scala.collection.immutable.BitSet

import org.apache.spark.mllib.linalg.SparseVector

object BitSamplingFunction {
  def generate(
    originalDim: Int,
    signatureLength: Int,
    random: Random = new Random
  ): BitSamplingFunction = {
    val indices = Array.fill(signatureLength) {
      random.nextInt(originalDim)
    }

    new BitSamplingFunction(indices)
  }
} 
Example 63
Source File: MinhashFunction.scala    From spark-neighbors   with MIT License 5 votes vote down vote up
package com.github.karlhigley.spark.neighbors.lsh

import java.util.Random

import org.apache.spark.mllib.linalg.SparseVector

object MinhashFunction {
  def generate(
    dimensions: Int,
    signatureLength: Int,
    prime: Int,
    random: Random = new Random
  ): MinhashFunction = {

    val perms = new Array[PermutationFunction](signatureLength)
    var i = 0
    while (i < signatureLength) {
      perms(i) = PermutationFunction.random(dimensions, prime, random)
      i += 1
    }

    new MinhashFunction(perms)
  }
} 
Example 64
Source File: TestLoadDataWithJunkChars.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.carbondata.integration.spark.testsuite.dataload

import java.io.{BufferedWriter, File, FileWriter}
import java.util.Random

import org.apache.spark.sql.Row
import org.apache.spark.sql.test.util.QueryTest
import org.scalatest.BeforeAndAfterAll

class TestLoadDataWithJunkChars extends QueryTest with BeforeAndAfterAll {
  var filePath = ""
  val junkchars = "ǍǎǏǐǑǒǓǔǕǖǗǘǙǚǛǜǝǞǟǠǡǢǣǤǥǦǧǨǩǪǫǬǭǮǯǰ"

  def buildTestData() = {
    filePath = s"$integrationPath/spark/target/junkcharsdata.csv"
    val file = new File(filePath)
    val writer = new BufferedWriter(new FileWriter(file))
    writer.write("c1,c2\n")
    val random = new Random
    for (i <- 1 until 1000) {
      writer.write("a" + i + "," + junkchars + "\n")
      if ( i % 100 == 0) {
        writer.flush()
      }
    }
    writer.write("a1000000," + junkchars)
    writer.close
  }

  test("[bug]fix bug of duplicate rows in UnivocityCsvParser #877") {
    buildTestData()
    sql("drop table if exists junkcharsdata")
    sql("""create table if not exists junkcharsdata
             (c1 string, c2 string)
             STORED AS carbondata""")
    sql(s"LOAD DATA LOCAL INPATH '$filePath' into table junkcharsdata")
    checkAnswer(sql("select count(*) from junkcharsdata"), Seq(Row(1000)))
    sql("drop table if exists junkcharsdata")
    new File(filePath).delete()
  }
} 
Example 65
Source File: DoubleDataTypeTestCase.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.carbondata.integration.spark.testsuite.primitiveTypes

import java.util.Random

import org.apache.spark.sql.test.util.QueryTest
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Row, SaveMode}
import org.scalatest.BeforeAndAfterAll


class DoubleDataTypeTestCase extends QueryTest with BeforeAndAfterAll {

  lazy val df: DataFrame = generateDataFrame

  private def generateDataFrame(): DataFrame = {
    val r = new Random()
    val rdd = sqlContext.sparkContext
      .parallelize(1 to 10, 2)
      .map { x =>
        Row(x, "London" + (x % 2), x.toDouble / 13, x.toDouble / 11)
      }

    val schema = StructType(
      Seq(
        StructField("id", IntegerType, nullable = false),
        StructField("city", StringType, nullable = false),
        StructField("m1", DoubleType, nullable = false),
        StructField("m2", DoubleType, nullable = false)
      )
    )

    sqlContext.createDataFrame(rdd, schema)
  }

  override def beforeAll {
    sql("drop table if exists uniq_carbon")
    sql("drop table if exists uniq_hive")
    sql("drop table if exists doubleTypeCarbonTable")
    sql("drop table if exists doubleTypeHiveTable")

    df.write
      .format("carbondata")
      .option("tableName", "doubleTypeCarbonTable")
      .option("tempCSV", "false")
      .option("table_blocksize", "32")
      .mode(SaveMode.Overwrite)
      .save()

    df.write
      .mode(SaveMode.Overwrite)
      .saveAsTable("doubleTypeHiveTable")

  }

  test("detail query") {
    checkAnswer(sql("select * from doubleTypeCarbonTable order by id"),
      sql("select * from doubleTypeHiveTable order by id"))

  }

  test("duplicate values") {
    sql("create table uniq_carbon(name string, double_column double) STORED AS carbondata ")
    sql(s"load data inpath '$resourcesPath/uniq.csv' into table uniq_carbon")
    sql("create table uniq_hive(name string, double_column double) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','")
    sql(s"load data local inpath '$resourcesPath/uniqwithoutheader.csv' into table uniq_hive")
    checkAnswer(sql("select * from uniq_carbon where double_column>=11"),
      sql("select * from uniq_hive where double_column>=11"))
  }

//  test("agg query") {
//    checkAnswer(sql("select city, sum(m1), avg(m1), count(m1), max(m1), min(m1) from doubleTypeCarbonTable group by city"),
//      sql("select city, sum(m1), avg(m1), count(m1), max(m1), min(m1) from doubleTypeHiveTable group by city"))
//
//    checkAnswer(sql("select city, sum(m2), avg(m2), count(m2), max(m2), min(m2) from doubleTypeCarbonTable group by city"),
//      sql("select city, sum(m2), avg(m2), count(m2), max(m2), min(m2) from doubleTypeHiveTable group by city"))
//  }

  override def afterAll {
    sql("drop table if exists uniq_carbon")
    sql("drop table if exists uniq_hive")
    sql("drop table if exists doubleTypeCarbonTable")
    sql("drop table if exists doubleTypeHiveTable")
  }
} 
Example 66
Source File: TestSource.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.carbon.flink

import java.util.Random

import org.apache.flink.api.common.state.{ListState, ListStateDescriptor}
import org.apache.flink.runtime.state.{FunctionInitializationContext, FunctionSnapshotContext}
import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction
import org.apache.flink.streaming.api.functions.source.SourceFunction

abstract class TestSource(val dataCount: Int) extends SourceFunction[Array[AnyRef]] with CheckpointedFunction {
  private var dataIndex = 0
  private var dataIndexState: ListState[Integer] = _
  private var running = false

  @throws[Exception]
  def get(index: Int): Array[AnyRef]

  @throws[Exception]
  def onFinish(): Unit = {
    // to do nothing.
  }

  @throws[Exception]
  override def run(sourceContext: SourceFunction.SourceContext[Array[AnyRef]]): Unit = {
    this.running = true
    while ( {
      this.running && this.dataIndex < this.dataCount
    }) {
      sourceContext.collectWithTimestamp(this.get(this.dataIndex), System.currentTimeMillis)
      this.dataIndex += 1
    }
    this.onFinish()
  }

  override def cancel(): Unit = {
    this.running = false
  }

  @throws[Exception]
  override def snapshotState(context: FunctionSnapshotContext): Unit = {
    this.dataIndexState.clear()
    this.dataIndexState.add(this.dataIndex)
  }

  @throws[Exception]
  override def initializeState(context: FunctionInitializationContext): Unit = {
    this.dataIndexState = context.getOperatorStateStore.getListState(new ListStateDescriptor[Integer]("dataIndex", classOf[Integer]))
    if (!context.isRestored) return
    import scala.collection.JavaConversions._
    for (dataIndex <- this.dataIndexState.get) {
      this.dataIndex = dataIndex
    }
  }
}

object TestSource {

  val randomCache = new ThreadLocal[Random] {
    override def initialValue(): Random = new Random()
  }

} 
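
TestSource leaves only `get(index)` abstract, so a concrete source just produces one `Array[AnyRef]` per index. A hypothetical subclass emitting an id plus a random measurement (the class name and field layout are assumptions, not taken from the original test suite) could sit next to TestSource like this:

package org.apache.carbon.flink

// Hypothetical concrete source for illustration only.
class RandomPairTestSource(count: Int) extends TestSource(count) {

  @throws[Exception]
  override def get(index: Int): Array[AnyRef] = {
    val random = TestSource.randomCache.get()
    Array[AnyRef](
      Integer.valueOf(index),                        // running id
      java.lang.Double.valueOf(random.nextDouble())  // random measurement
    )
  }
}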
Example 67
Source File: AppleCustomPartitioner.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.partitioning

import java.util.Random

import org.apache.spark.Partitioner


class AppleCustomPartitioner(numOfParts:Int) extends Partitioner {
  override def numPartitions: Int = numOfParts
  def random = new Random()

  override def getPartition(key: Any): Int = {
    val k = key.asInstanceOf[(String, Long)]
    val ticker = k._1
    if (ticker.equals("apple")) {
      val saltedTicker = ticker + random.nextInt(9)
      Math.abs(saltedTicker.hashCode) % numPartitions
    } else {
      Math.abs(ticker.hashCode) % numPartitions
    }
  }
} 
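
The partitioner above expects keys of type `(String, Long)` (ticker, timestamp) and salts only the hot "apple" ticker across partitions. A hedged usage sketch with `partitionBy` on a pair RDD (the sample records are made up):

import com.malaska.spark.training.partitioning.AppleCustomPartitioner
import org.apache.spark.sql.SparkSession

object AppleCustomPartitionerUsage {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("partitioner-demo").master("local[*]").getOrCreate()
    val sc = spark.sparkContext

    // Made-up (ticker, timestamp) -> price records; "apple" is the skewed key.
    val records = sc.parallelize(Seq(
      (("apple", 1L), 100.0),
      (("apple", 2L), 101.0),
      (("orange", 1L), 5.0),
      (("pear", 1L), 3.0)))

    val partitioned = records.partitionBy(new AppleCustomPartitioner(4))
    println("Number of partitions: " + partitioned.getNumPartitions)

    spark.stop()
  }
}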
Example 68
Source File: SaltedExample.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.salted

import java.util.Random

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession

object SaltedExample {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {

    val jsonPath = args(0)

    val sparkSession = SparkSession.builder
      .master("local")
      .appName("my-spark-app")
      .config("spark.some.config.option", "config-value")
      .getOrCreate()

    val jsonDfLeft = sparkSession.read.json(jsonPath)

    val saltedLeft = jsonDfLeft.rdd.flatMap(r => {
      val group = r.getAs[String]("group")
      val value = r.getAs[Long]("value")

      Seq((group + "_" + 0, value),(group + "_" + 1, value))
    })

    val jsonDfRight = sparkSession.read.json(jsonPath)

    val saltedRight = jsonDfRight.rdd.mapPartitions(it => {

      val random = new Random()

      it.map(r => {
        val group = r.getAs[String]("group")
        val value = r.getAs[Long]("value")

        (group + "_" + random.nextInt(2), value)
      })
    })

    jsonDfLeft.join(jsonDfRight).collect().foreach(r => {
      println("Normal.result:" + r)
    })
    println("----")
    saltedLeft.join(saltedRight).collect().foreach(r => {
      println("Salted.result:" + r)
    })
  }
} 
Example 69
Source File: SessionDataFileHDFSWriter.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.streaming.dstream.sessionization

import java.io.BufferedWriter
import java.io.FileWriter
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.conf.Configuration
import java.io.OutputStreamWriter
import org.apache.hadoop.fs.Path
import java.util.Random

object SessionDataFileHDFSWriter {
  
  val eol = System.getProperty("line.separator");  
  
  def main(args: Array[String]) {
    if (args.length == 0) {
        println("SessionDataFileWriter {tempDir} {distDir} {numberOfFiles} {numberOfEventsPerFile} {waitBetweenFiles}");
        return;
    }
    val conf = new Configuration
    conf.addResource(new Path("/etc/hadoop/conf/core-site.xml"))
    conf.addResource(new Path("/etc/hadoop/conf/mapred-site.xml"))
    conf.addResource(new Path("/etc/hadoop/conf/hdfs-site.xml"))
    
    val fs = FileSystem.get(new Configuration)
    val rootTempDir = args(0)
    val rootDistDir = args(1)
    val files = args(2).toInt
    val loops = args(3).toInt
    val waitBetweenFiles = args(4).toInt
    val r = new Random
    for (f <- 1 to files) {
      val rootName = "/weblog." + System.currentTimeMillis()
      val tmpPath = new Path(rootTempDir + rootName + ".tmp")
      val writer = new BufferedWriter(new OutputStreamWriter(fs.create(tmpPath)))
      
      print(f + ": [")
      
      val randomLoops = loops + r.nextInt(loops)
      
      for (i <- 1 to randomLoops) {
        writer.write(SessionDataGenerator.getNextEvent + eol)
        if (i%100 == 0) {
          print(".")
        }
      }
      println("]")
      writer.close
      
      val distPath = new Path(rootDistDir + rootName + ".dat")
      
      fs.rename(tmpPath, distPath)
      Thread.sleep(waitBetweenFiles)
    }
    println("Done")
  }
} 
Example 70
Source File: RandSampleData.scala    From SparkMLlibDeepLearn   with Apache License 2.0 5 votes vote down vote up
package util

import java.util.Random
import breeze.linalg.{
  Matrix => BM,
  CSCMatrix => BSM,
  DenseMatrix => BDM,
  Vector => BV,
  DenseVector => BDV,
  SparseVector => BSV,
  axpy => brzAxpy,
  svd => brzSvd
}
import breeze.numerics.{
  exp => Bexp,
  cos => Bcos,
  tanh => Btanh
}
import scala.math.Pi

object RandSampleData extends Serializable {
  // Rosenbrock: sum_i( 100 * (x(i+1) - x(i)^2)^2 + (x(i) - 1)^2 )
  // Rastrigin:  sum_i( x(i)^2 - 10 * cos(2 * Pi * x(i)) + 10 )
  // Sphere:     sum_i( x(i)^2 )
  
  def RandM(
    n1: Int,
    n2: Int,
    b1: Double,
    b2: Double,
    function: String): BDM[Double] = {
    //    val n1 = 2
    //    val n2 = 3
    //    val b1 = -30
    //    val b2 = 30
    val bdm1 = BDM.rand(n1, n2) * (b2 - b1).toDouble + b1.toDouble
    val bdm_y = function match {
      case "rosenbrock" =>
        val xi0 = bdm1(::, 0 to (bdm1.cols - 2))
        val xi1 = bdm1(::, 1 to (bdm1.cols - 1))
        val xi2 = (xi0 :* xi0)
        val m1 = ((xi1 - xi2) :* (xi1 - xi2)) * 100.0 + ((xi0 - 1.0) :* (xi0 - 1.0))
        val m2 = m1 * BDM.ones[Double](m1.cols, 1)
        m2
      case "rastrigin" =>
        val xi0 = bdm1
        val xi2 = (xi0 :* xi0)
        val sicos = Bcos(xi0 * 2.0 * Pi) * 10.0
        val m1 = xi2 - sicos + 10.0
        val m2 = m1 * BDM.ones[Double](m1.cols, 1)
        m2
      case "sphere" =>
        val xi0 = bdm1
        val xi2 = (xi0 :* xi0)
        val m1 = xi2
        val m2 = m1 * BDM.ones[Double](m1.cols, 1)
        m2
    }
    val randm = BDM.horzcat(bdm_y, bdm1)
    randm
  }
} 
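
`RandM` returns an n1 x (n2 + 1) breeze matrix: the first column is the benchmark value (Rosenbrock, Rastrigin, or Sphere) and the remaining n2 columns are the inputs drawn uniformly from [b1, b2]. A small usage sketch, assuming the `util` package above is on the classpath:

import _root_.util.RandSampleData // the package from the listing above
import breeze.linalg.DenseMatrix

object RandSampleDataDemo {
  def main(args: Array[String]): Unit = {
    // 100 samples of a 5-dimensional Rosenbrock evaluation, inputs drawn from [-30, 30].
    val samples: DenseMatrix[Double] = RandSampleData.RandM(100, 5, -30.0, 30.0, "rosenbrock")
    println(s"rows = ${samples.rows}, cols = ${samples.cols}") // 100 x 6: y plus 5 inputs
    println("first y value: " + samples(0, 0))
  }
}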
Example 71
Source File: RconConnector.scala    From chatoverflow   with Eclipse Public License 2.0 5 votes vote down vote up
package org.codeoverflow.chatoverflow.requirement.service.rcon

import java.io.{DataInputStream, IOException, InputStream, OutputStream}
import java.net.{Socket, SocketException}
import java.nio.{ByteBuffer, ByteOrder}
import java.util.Random

import org.codeoverflow.chatoverflow.WithLogger
import org.codeoverflow.chatoverflow.connector.Connector

class RconConnector(override val sourceIdentifier: String) extends Connector(sourceIdentifier) with WithLogger {
  override protected var requiredCredentialKeys: List[String] = List("password", "address")
  override protected var optionalCredentialKeys: List[String] = List("port")

  private var socket: Socket = _
  private var outputStream: OutputStream = _
  private var inputStream: InputStream = _
  private var requestId: Int = 0

  def sendCommand(command: String): String = {
    logger debug s"Sending $command to RCON"
    requestId += 1
    if (write(2, command.getBytes("ASCII"))) {
      return read()
    }
    null
  }


  
  override def stop(): Boolean = {
    logger info s"Stopped RCON connector to ${credentials.get.getValue("address").get}!"
    socket.close()
    true
  }
} 
Example 72
Source File: AutoregressionSuite.scala    From spark-timeseries   with Apache License 2.0 5 votes vote down vote up
package com.cloudera.sparkts.models

import java.util.Random

import com.cloudera.sparkts.MatrixUtil.toBreeze

import org.apache.spark.mllib.linalg._
import org.apache.commons.math3.random.MersenneTwister
import org.scalatest.FunSuite

class AutoregressionSuite extends FunSuite {
  test("fit AR(1) model") {
    val model = new ARModel(1.5, Array(.2))
    val ts = model.sample(5000, new MersenneTwister(10L))
    val fittedModel = Autoregression.fitModel(ts, 1)
    assert(fittedModel.coefficients.length == 1)
    assert(math.abs(fittedModel.c - 1.5) < .07)
    assert(math.abs(fittedModel.coefficients(0) - .2) < .03)
  }

  test("fit AR(2) model") {
    val model = new ARModel(1.5, Array(.2, .3))
    val ts = model.sample(5000, new MersenneTwister(10L))
    val fittedModel = Autoregression.fitModel(ts, 2)
    assert(fittedModel.coefficients.length == 2)
    assert(math.abs(fittedModel.c - 1.5) < .15)
    assert(math.abs(fittedModel.coefficients(0) - .2) < .03)
    assert(math.abs(fittedModel.coefficients(1) - .3) < .03)
  }

  test("add and remove time dependent effects") {
    val rand = new Random()
    val ts = new DenseVector(Array.fill(1000)(rand.nextDouble()))
    val model = new ARModel(1.5, Array(.2, .3))
    val added = model.addTimeDependentEffects(ts, Vectors.zeros(ts.size))
    val removed = model.removeTimeDependentEffects(added, Vectors.zeros(ts.size))
    assert((toBreeze(ts) - toBreeze(removed)).toArray.forall(math.abs(_) < .001))
  }
} 
Example 73
Source File: utils.scala    From scalabpe   with Apache License 2.0 5 votes vote down vote up
package scalabpe.flow

import scalabpe.core._
import java.util.Random

object Global {
    def init() {
        println("init called")
    }
    def close() {
        println("close called")
    }
}

object FlowHelper { 

    val random = new Random()
	
    val jobStatusCache = new java.util.concurrent.ConcurrentHashMap[String,String]()

    def getConfig(s:String,defaultValue:String="") = Flow.router.getConfig(s,defaultValue)

    def isEmpty(req:Request,name:String):Boolean={
        if( name.indexOf(",") >= 0 ) return isEmptyForAny(req,name)
        return isEmpty(req.s(name))
    }

    private def isEmptyForAny(req:Request,names:String):Boolean={
        val ss = names.split(",")
        var i = 0
        while( i < ss.length ) {
            if( isEmpty(req.s(ss(i)) )) return true
            i += 1
        }
        false
    }

    def isInt(req:Request,name:String):Boolean={
        if( name.indexOf(",") >= 0 ) return isIntForAny(req,name)
        return isInt(req.s(name))
    }    
	
    private def isIntForAny(req:Request,names:String):Boolean={
        val ss = names.split(",")
        var i = 0
        while( i < ss.length ) {
            if( isInt(req.s(ss(i)) )) return true
            i += 1
        }
        false
    }

    def isEmpty(str:String):Boolean={
        return str == null || str.length() == 0
    }

    def isInt(n:String):Boolean={
        try {
            Integer.parseInt(n)
            return true
        } catch {
            case e: Throwable =>
             return false
        }
    }

    def checkInclude(ss:String,s:String,t:String=","):Boolean={
        if( ss == null || ss == "" ) return false
        if( s == null || s == "" ) return true
        return (t+ss+t).indexOf(t+s+t) >= 0 
    }

    def uuid(): String = {
        return java.util.UUID.randomUUID().toString().replaceAll("-", "")
    }

    def generateSeed():String = {
        "%08d".format(Math.abs(random.nextInt())%100000000)
    }
 
    def contact(a:String,b:String):String = a + b
} 
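
Most of these helpers are plain string guards, so their behaviour is easy to pin down with a few direct calls. A throwaway sketch; it only assumes the scalabpe classpath needed to compile FlowHelper itself:

import scalabpe.flow.FlowHelper

object FlowHelperDemo {
  def main(args: Array[String]): Unit = {
    println(FlowHelper.isEmpty(""))                // true
    println(FlowHelper.isInt("123"))               // true
    println(FlowHelper.isInt("12a"))               // false
    println(FlowHelper.checkInclude("a,b,c", "b")) // true
    println(FlowHelper.checkInclude("a,b,c", "d")) // false
    println(FlowHelper.generateSeed())             // an 8-digit, zero-padded number
    println(FlowHelper.uuid())                     // 32 hex characters, no dashes
  }
}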
Example 74
Source File: ZipTests.scala    From coursier   with Apache License 2.0 5 votes vote down vote up
package coursier.cli.util

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
import java.util.Random
import java.util.zip.{Deflater, ZipEntry, ZipInputStream, ZipOutputStream}

import coursier.launcher.internal.Zip
import org.junit.runner.RunWith
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatestplus.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class ZipTests extends AnyFlatSpec {

  "zipEntries" should "be fine with custom deflaters" in {

    // Inspired by https://github.com/spring-projects/spring-boot/commit/a50646b7cc3ad941e748dfb450077e3a73706205#diff-2297c301250b25e3b80301c58daf3ea0R621

    val baos = new ByteArrayOutputStream
    val output = new ZipOutputStream(baos) {
      `def` = new Deflater(Deflater.NO_COMPRESSION, true)
    }
    val data = Array.ofDim[Byte](1024 * 1024)
    new Random().nextBytes(data)
    val entry = new ZipEntry("entry.dat")
    output.putNextEntry(entry)
    output.write(data)
    output.closeEntry()
    output.close()

    val result = baos.toByteArray

    val zos = new ZipOutputStream(new ByteArrayOutputStream)
    val entryNames = Zip.zipEntries(new ZipInputStream(new ByteArrayInputStream(result)))
      .map {
        case (ent, content) =>
          println(ent.getCompressedSize)
          val name = ent.getName
          zos.putNextEntry(ent)
          zos.write(content)
          zos.closeEntry()
          name
      }
      .toVector
    zos.close()
    assert(entryNames == Vector("entry.dat"))
  }

} 
Example 75
Source File: SimpleSkewedGroupByTest.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.sql.SparkSession


object SimpleSkewedGroupByTest {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("SimpleSkewedGroupByTest")
      .getOrCreate()

    val numMappers = if (args.length > 0) args(0).toInt else 2
    val numKVPairs = if (args.length > 1) args(1).toInt else 1000
    val valSize = if (args.length > 2) args(2).toInt else 1000
    val numReducers = if (args.length > 3) args(3).toInt else numMappers
    val ratio = if (args.length > 4) args(4).toInt else 5.0

    val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random
      val result = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        ranGen.nextBytes(byteArr)
        val offset = ranGen.nextInt(1000) * numReducers
        if (ranGen.nextDouble < ratio / (numReducers + ratio - 1)) {
          // give ratio times higher chance of generating key 0 (for reducer 0)
          result(i) = (offset, byteArr)
        } else {
          // generate a key for one of the other reducers
          val key = 1 + ranGen.nextInt(numReducers-1) + offset
          result(i) = (key, byteArr)
        }
      }
      result
    }.cache
    // Enforce that everything has been calculated and in cache
    pairs1.count

    println("RESULT: " + pairs1.groupByKey(numReducers).count)
    // Print how many keys each reducer got (for debugging)
    // println("RESULT: " + pairs1.groupByKey(numReducers)
    //                           .map{case (k,v) => (k, v.size)}
    //                           .collectAsMap)

    spark.stop()
  }
}
// scalastyle:on println 
Example 76
Source File: SkewedGroupByTest.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.sql.SparkSession


object SkewedGroupByTest {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("GroupBy Test")
      .getOrCreate()

    val numMappers = if (args.length > 0) args(0).toInt else 2
    var numKVPairs = if (args.length > 1) args(1).toInt else 1000
    val valSize = if (args.length > 2) args(2).toInt else 1000
    val numReducers = if (args.length > 3) args(3).toInt else numMappers

    val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random

      // map output sizes linearly increase from the 1st to the last
      numKVPairs = (1.0 * (p + 1) / numMappers * numKVPairs).toInt

      val arr1 = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        ranGen.nextBytes(byteArr)
        arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
      }
      arr1
    }.cache()
    // Enforce that everything has been calculated and in cache
    pairs1.count()

    println(pairs1.groupByKey(numReducers).count())

    spark.stop()
  }
}
// scalastyle:on println 
Example 77
Source File: SparkHdfsLR.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{DenseVector, Vector}

import org.apache.spark.sql.SparkSession


object SparkHdfsLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def parsePoint(line: String): DataPoint = {
    val tok = new java.util.StringTokenizer(line, " ")
    var y = tok.nextToken.toDouble
    var x = new Array[Double](D)
    var i = 0
    while (i < D) {
      x(i) = tok.nextToken.toDouble; i += 1
    }
    DataPoint(new DenseVector(x), y)
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use org.apache.spark.ml.classification.LogisticRegression
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    if (args.length < 2) {
      System.err.println("Usage: SparkHdfsLR <file> <iters>")
      System.exit(1)
    }

    showWarning()

    val spark = SparkSession
      .builder
      .appName("SparkHdfsLR")
      .getOrCreate()

    val inputPath = args(0)
    val lines = spark.read.textFile(inputPath).rdd

    val points = lines.map(parsePoint).cache()
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D) {2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = points.map { p =>
        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
      }.reduce(_ + _)
      w -= gradient
    }

    println("Final w: " + w)
    spark.stop()
  }
}
// scalastyle:on println 
Example 78
Source File: LocalLR.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import breeze.linalg.{DenseVector, Vector}


object LocalLR {
  val N = 10000  // Number of data points
  val D = 10   // Number of dimensions
  val R = 0.7  // Scaling factor
  val ITERATIONS = 5
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def generateData: Array[DataPoint] = {
    def generatePoint(i: Int): DataPoint = {
      val y = if (i % 2 == 0) -1 else 1
      val x = DenseVector.fill(D) {rand.nextGaussian + y * R}
      DataPoint(x, y)
    }
    Array.tabulate(N)(generatePoint)
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use org.apache.spark.ml.classification.LogisticRegression
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    val data = generateData
    // Initialize w to a random value
    var w = DenseVector.fill(D) {2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      var gradient = DenseVector.zeros[Double](D)
      for (p <- data) {
        val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y
        gradient +=  p.x * scale
      }
      w -= gradient
    }

    println("Final w: " + w)
  }
}
// scalastyle:on println 
Example 79
Source File: GroupByTest.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.sql.SparkSession


object GroupByTest {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("GroupBy Test")
      .getOrCreate()

    val numMappers = if (args.length > 0) args(0).toInt else 2
    val numKVPairs = if (args.length > 1) args(1).toInt else 1000
    val valSize = if (args.length > 2) args(2).toInt else 1000
    val numReducers = if (args.length > 3) args(3).toInt else numMappers

    val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random
      val arr1 = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        ranGen.nextBytes(byteArr)
        arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
      }
      arr1
    }.cache()
    // Enforce that everything has been calculated and in cache
    pairs1.count()

    println(pairs1.groupByKey(numReducers).count())

    spark.stop()
  }
}
// scalastyle:on println 
Example 80
Source File: LocalFileLR.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import breeze.linalg.{DenseVector, Vector}


object LocalFileLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def parsePoint(line: String): DataPoint = {
    val nums = line.split(' ').map(_.toDouble)
    DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0))
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use org.apache.spark.ml.classification.LogisticRegression
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    val lines = scala.io.Source.fromFile(args(0)).getLines().toArray
    val points = lines.map(parsePoint _)
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D) {2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      var gradient = DenseVector.zeros[Double](D)
      for (p <- points) {
        val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y
        gradient += p.x * scale
      }
      w -= gradient
    }

    println("Final w: " + w)
  }
}
// scalastyle:on println 
Example 81
Source File: PageViewGenerator.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.streaming.clickstream

import java.io.PrintWriter
import java.net.ServerSocket
import java.util.Random


// scalastyle:on
object PageViewGenerator {
  val pages = Map("http://foo.com/" -> .7,
                  "http://foo.com/news" -> 0.2,
                  "http://foo.com/contact" -> .1)
  val httpStatus = Map(200 -> .95,
                       404 -> .05)
  val userZipCode = Map(94709 -> .5,
                        94117 -> .5)
  val userID = Map((1 to 100).map(_ -> .01): _*)

  def pickFromDistribution[T](inputMap: Map[T, Double]): T = {
    val rand = new Random().nextDouble()
    var total = 0.0
    for ((item, prob) <- inputMap) {
      total = total + prob
      if (total > rand) {
        return item
      }
    }
    inputMap.take(1).head._1 // Shouldn't get here if probabilities add up to 1.0
  }

  def getNextClickEvent(): String = {
    val id = pickFromDistribution(userID)
    val page = pickFromDistribution(pages)
    val status = pickFromDistribution(httpStatus)
    val zipCode = pickFromDistribution(userZipCode)
    new PageView(page, status, zipCode, id).toString()
  }

  def main(args: Array[String]) {
    if (args.length != 2) {
      System.err.println("Usage: PageViewGenerator <port> <viewsPerSecond>")
      System.exit(1)
    }
    val port = args(0).toInt
    val viewsPerSecond = args(1).toFloat
    val sleepDelayMs = (1000.0 / viewsPerSecond).toInt
    val listener = new ServerSocket(port)
    println("Listening on port: " + port)

    while (true) {
      val socket = listener.accept()
      new Thread() {
        override def run(): Unit = {
          println("Got client connected from: " + socket.getInetAddress)
          val out = new PrintWriter(socket.getOutputStream(), true)

          while (true) {
            Thread.sleep(sleepDelayMs)
            out.write(getNextClickEvent())
            out.flush()
          }
          socket.close()
        }
      }.start()
    }
  }
}
// scalastyle:on println 
Example 82
Source File: SparkLR.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{DenseVector, Vector}

import org.apache.spark.sql.SparkSession


object SparkLR {
  val N = 10000  // Number of data points
  val D = 10   // Number of dimensions
  val R = 0.7  // Scaling factor
  val ITERATIONS = 5
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def generateData: Array[DataPoint] = {
    def generatePoint(i: Int): DataPoint = {
      val y = if (i % 2 == 0) -1 else 1
      val x = DenseVector.fill(D) {rand.nextGaussian + y * R}
      DataPoint(x, y)
    }
    Array.tabulate(N)(generatePoint)
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use org.apache.spark.ml.classification.LogisticRegression
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    val spark = SparkSession
      .builder
      .appName("SparkLR")
      .getOrCreate()

    val numSlices = if (args.length > 0) args(0).toInt else 2
    val points = spark.sparkContext.parallelize(generateData, numSlices).cache()

    // Initialize w to a random value
    var w = DenseVector.fill(D) {2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = points.map { p =>
        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
      }.reduce(_ + _)
      w -= gradient
    }

    println("Final w: " + w)

    spark.stop()
  }
}
// scalastyle:on println 
Example 83
Source File: LocalKMeans.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.collection.mutable.HashMap
import scala.collection.mutable.HashSet

import breeze.linalg.{squaredDistance, DenseVector, Vector}


object LocalKMeans {
  val N = 1000
  val R = 1000    // Scaling factor
  val D = 10
  val K = 10
  val convergeDist = 0.001
  val rand = new Random(42)

  def generateData: Array[DenseVector[Double]] = {
    def generatePoint(i: Int): DenseVector[Double] = {
      DenseVector.fill(D) {rand.nextDouble * R}
    }
    Array.tabulate(N)(generatePoint)
  }

  def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = {
    var index = 0
    var bestIndex = 0
    var closest = Double.PositiveInfinity

    for (i <- 1 to centers.size) {
      val vCurr = centers.get(i).get
      val tempDist = squaredDistance(p, vCurr)
      if (tempDist < closest) {
        closest = tempDist
        bestIndex = i
      }
    }

    bestIndex
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of KMeans Clustering and is given as an example!
        |Please use org.apache.spark.ml.clustering.KMeans
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    val data = generateData
    var points = new HashSet[Vector[Double]]
    var kPoints = new HashMap[Int, Vector[Double]]
    var tempDist = 1.0

    while (points.size < K) {
      points.add(data(rand.nextInt(N)))
    }

    val iter = points.iterator
    for (i <- 1 to points.size) {
      kPoints.put(i, iter.next())
    }

    println("Initial centers: " + kPoints)

    while(tempDist > convergeDist) {
      var closest = data.map (p => (closestPoint(p, kPoints), (p, 1)))

      var mappings = closest.groupBy[Int] (x => x._1)

      var pointStats = mappings.map { pair =>
        pair._2.reduceLeft [(Int, (Vector[Double], Int))] {
          case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2))
        }
      }

      var newPoints = pointStats.map {mapping =>
        (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))}

      tempDist = 0.0
      for (mapping <- newPoints) {
        tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2)
      }

      for (newP <- newPoints) {
        kPoints.put(newP._1, newP._2)
      }
    }

    println("Final centers: " + kPoints)
  }
}
// scalastyle:on println 
Example 84
Source File: StopwatchSuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.util

import java.util.Random

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class StopwatchSuite extends SparkFunSuite with MLlibTestSparkContext {

  import StopwatchSuite._

  private def testStopwatchOnDriver(sw: Stopwatch): Unit = {
    assert(sw.name === "sw")
    assert(sw.elapsed() === 0L)
    assert(!sw.isRunning)
    intercept[AssertionError] {
      sw.stop()
    }
    val duration = checkStopwatch(sw)
    val elapsed = sw.elapsed()
    assert(elapsed === duration)
    val duration2 = checkStopwatch(sw)
    val elapsed2 = sw.elapsed()
    assert(elapsed2 === duration + duration2)
    assert(sw.toString === s"sw: ${elapsed2}ms")
    sw.start()
    assert(sw.isRunning)
    intercept[AssertionError] {
      sw.start()
    }
  }

  test("LocalStopwatch") {
    val sw = new LocalStopwatch("sw")
    testStopwatchOnDriver(sw)
  }

  test("DistributedStopwatch on driver") {
    val sw = new DistributedStopwatch(sc, "sw")
    testStopwatchOnDriver(sw)
  }

  test("DistributedStopwatch on executors") {
    val sw = new DistributedStopwatch(sc, "sw")
    val rdd = sc.parallelize(0 until 4, 4)
    val acc = sc.longAccumulator
    rdd.foreach { i =>
      acc.add(checkStopwatch(sw))
    }
    assert(!sw.isRunning)
    val elapsed = sw.elapsed()
    assert(elapsed === acc.value)
  }

  test("MultiStopwatch") {
    val sw = new MultiStopwatch(sc)
      .addLocal("local")
      .addDistributed("spark")
    assert(sw("local").name === "local")
    assert(sw("spark").name === "spark")
    intercept[NoSuchElementException] {
      sw("some")
    }
    assert(sw.toString === "{\n  local: 0ms,\n  spark: 0ms\n}")
    val localDuration = checkStopwatch(sw("local"))
    val sparkDuration = checkStopwatch(sw("spark"))
    val localElapsed = sw("local").elapsed()
    val sparkElapsed = sw("spark").elapsed()
    assert(localElapsed === localDuration)
    assert(sparkElapsed === sparkDuration)
    assert(sw.toString ===
      s"{\n  local: ${localElapsed}ms,\n  spark: ${sparkElapsed}ms\n}")
    val rdd = sc.parallelize(0 until 4, 4)
    val acc = sc.longAccumulator
    rdd.foreach { i =>
      sw("local").start()
      val duration = checkStopwatch(sw("spark"))
      sw("local").stop()
      acc.add(duration)
    }
    val localElapsed2 = sw("local").elapsed()
    assert(localElapsed2 === localElapsed)
    val sparkElapsed2 = sw("spark").elapsed()
    assert(sparkElapsed2 === sparkElapsed + acc.value)
  }
}

private object StopwatchSuite extends SparkFunSuite {

  
  private def now: Long = System.currentTimeMillis()
} 
Example 85
Source File: PartitionwiseSampledRDD.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.random.RandomSampler
import org.apache.spark.util.Utils

private[spark]
class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}


private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[T],
    sampler: RandomSampler[T, U],
    preservesPartitioning: Boolean,
    @transient private val seed: Long = Utils.random.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong()))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.setSeed(split.seed)
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
  }
} 
Example 86
Source File: SimpleSkewedGroupByTest.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._


object SimpleSkewedGroupByTest {
  def main(args: Array[String]) {

    val sparkConf = new SparkConf().setAppName("SimpleSkewedGroupByTest")
    var numMappers = if (args.length > 0) args(0).toInt else 2
    var numKVPairs = if (args.length > 1) args(1).toInt else 1000
    var valSize = if (args.length > 2) args(2).toInt else 1000
    var numReducers = if (args.length > 3) args(3).toInt else numMappers
    var ratio = if (args.length > 4) args(4).toInt else 5.0

    val sc = new SparkContext(sparkConf)

    val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random
      var result = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        ranGen.nextBytes(byteArr)
        val offset = ranGen.nextInt(1000) * numReducers
        if (ranGen.nextDouble < ratio / (numReducers + ratio - 1)) {
          // give ratio times higher chance of generating key 0 (for reducer 0)
          result(i) = (offset, byteArr)
        } else {
          // generate a key for one of the other reducers
          val key = 1 + ranGen.nextInt(numReducers-1) + offset
          result(i) = (key, byteArr)
        }
      }
      result
    }.cache
    // Enforce that everything has been calculated and in cache
    pairs1.count

    println("RESULT: " + pairs1.groupByKey(numReducers).count)
    // Print how many keys each reducer got (for debugging)
    // println("RESULT: " + pairs1.groupByKey(numReducers)
    //                           .map{case (k,v) => (k, v.size)}
    //                           .collectAsMap)

    sc.stop()
  }
} 
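The skew comes entirely from the `ranGen.nextDouble < ratio / (numReducers + ratio - 1)` test, which routes roughly ratio out of (numReducers + ratio - 1) keys to reducer 0. A small standalone check of that probability, with assumed values for numReducers and ratio:

import java.util.Random

object SkewRatioCheck {
  def main(args: Array[String]): Unit = {
    val ranGen = new Random()
    val numReducers = 4  // assumed
    val ratio = 5.0      // assumed
    val trials = 100000
    val toReducerZero = (1 to trials).count(_ => ranGen.nextDouble < ratio / (numReducers + ratio - 1))
    // With ratio = 5 and 4 reducers, about 5/8 of the keys should target reducer 0.
    println(s"fraction routed to reducer 0: ${toReducerZero.toDouble / trials}")
  }
}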
Example 87
Source File: SparkTachyonHdfsLR.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{Vector, DenseVector}
import org.apache.hadoop.conf.Configuration

import org.apache.spark._
import org.apache.spark.scheduler.InputFormatInfo
import org.apache.spark.storage.StorageLevel



object SparkTachyonHdfsLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
        |for more conventional use.
      """.stripMargin)
  }

  case class DataPoint(x: Vector[Double], y: Double)

  def parsePoint(line: String): DataPoint = {
    val tok = new java.util.StringTokenizer(line, " ")
    var y = tok.nextToken.toDouble
    var x = new Array[Double](D)
    var i = 0
    while (i < D) {
      x(i) = tok.nextToken.toDouble; i += 1
    }
    DataPoint(new DenseVector(x), y)
  }

  def main(args: Array[String]) {

    showWarning()

    val inputPath = args(0)
    val sparkConf = new SparkConf().setAppName("SparkTachyonHdfsLR")
    val conf = new Configuration()
    val sc = new SparkContext(sparkConf,
      InputFormatInfo.computePreferredLocations(
        Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath))
      ))
    val lines = sc.textFile(inputPath)
    val points = lines.map(parsePoint _).persist(StorageLevel.OFF_HEAP)
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = points.map { p =>
        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
      }.reduce(_ + _)
      w -= gradient
    }

    println("Final w: " + w)
    sc.stop()
  }
} 
Example 88
Source File: SkewedGroupByTest.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._


object SkewedGroupByTest {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("GroupBy Test")
    var numMappers = if (args.length > 0) args(0).toInt else 2
    var numKVPairs = if (args.length > 1) args(1).toInt else 1000
    var valSize = if (args.length > 2) args(2).toInt else 1000
    var numReducers = if (args.length > 3) args(3).toInt else numMappers

    val sc = new SparkContext(sparkConf)

    val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random

      // map output sizes linearly increase from the 1st to the last
      numKVPairs = (1.0 * (p + 1) / numMappers * numKVPairs).toInt

      var arr1 = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        ranGen.nextBytes(byteArr)
        arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
      }
      arr1
    }.cache()
    // Enforce that everything has been calculated and in cache
    pairs1.count()

    println(pairs1.groupByKey(numReducers).count())

    sc.stop()
  }
} 
Example 89
Source File: SparkHdfsLR.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{Vector, DenseVector}
import org.apache.hadoop.conf.Configuration

import org.apache.spark._
import org.apache.spark.scheduler.InputFormatInfo



object SparkHdfsLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def parsePoint(line: String): DataPoint = {
    val tok = new java.util.StringTokenizer(line, " ")
    var y = tok.nextToken.toDouble
    var x = new Array[Double](D)
    var i = 0
    while (i < D) {
      x(i) = tok.nextToken.toDouble; i += 1
    }
    DataPoint(new DenseVector(x), y)
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    if (args.length < 2) {
      System.err.println("Usage: SparkHdfsLR <file> <iters>")
      System.exit(1)
    }

    showWarning()

    val sparkConf = new SparkConf().setAppName("SparkHdfsLR")
    val inputPath = args(0)
    val conf = new Configuration()
    val sc = new SparkContext(sparkConf,
      InputFormatInfo.computePreferredLocations(
        Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath))
      ))
    val lines = sc.textFile(inputPath)
    val points = lines.map(parsePoint _).cache()
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = points.map { p =>
        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
      }.reduce(_ + _)
      w -= gradient
    }

    println("Final w: " + w)
    sc.stop()
  }
} 
Example 90
Source File: LocalLR.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import breeze.linalg.{Vector, DenseVector}


object LocalLR {
  val N = 10000  // Number of data points
  val D = 10   // Number of dimensions
  val R = 0.7  // Scaling factor
  val ITERATIONS = 5
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def generateData: Array[DataPoint] = {
    def generatePoint(i: Int): DataPoint = {
      val y = if (i % 2 == 0) -1 else 1
      val x = DenseVector.fill(D){rand.nextGaussian + y * R}
      DataPoint(x, y)
    }
    Array.tabulate(N)(generatePoint)
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    val data = generateData
    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      var gradient = DenseVector.zeros[Double](D)
      for (p <- data) {
        val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y
        gradient +=  p.x * scale
      }
      w -= gradient
    }

    println("Final w: " + w)
  }
} 
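Note that `rand` is built with the fixed seed 42, so `generateData` yields the same points on every run, which keeps the printed weights comparable across runs. A tiny sketch of that determinism, using plain doubles rather than DataPoints:

import java.util.Random

object SeededDeterminism {
  def main(args: Array[String]): Unit = {
    def firstGaussians(seed: Long, n: Int): Seq[Double] = {
      val rand = new Random(seed)
      Seq.fill(n)(rand.nextGaussian())
    }
    // Two generators built with the same seed produce identical sequences,
    // which is why the example above always generates the same data set.
    assert(firstGaussians(42L, 5) == firstGaussians(42L, 5))
    println(firstGaussians(42L, 5))
  }
}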
Example 91
Source File: GroupByTest.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._


object GroupByTest {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("GroupBy Test")
    var numMappers = if (args.length > 0) args(0).toInt else 2
    var numKVPairs = if (args.length > 1) args(1).toInt else 1000
    var valSize = if (args.length > 2) args(2).toInt else 1000
    var numReducers = if (args.length > 3) args(3).toInt else numMappers

    val sc = new SparkContext(sparkConf)

    val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random
      var arr1 = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        ranGen.nextBytes(byteArr)
        arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
      }
      arr1
    }.cache()
    // Enforce that everything has been calculated and in cache
    pairs1.count()

    println(pairs1.groupByKey(numReducers).count())

    sc.stop()
  }
} 
Example 92
Source File: LocalFileLR.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import breeze.linalg.{Vector, DenseVector}


object LocalFileLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def parsePoint(line: String): DataPoint = {
    val nums = line.split(' ').map(_.toDouble)
    DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0))
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    val lines = scala.io.Source.fromFile(args(0)).getLines().toArray
    val points = lines.map(parsePoint _)
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      var gradient = DenseVector.zeros[Double](D)
      for (p <- points) {
        val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y
        gradient += p.x * scale
      }
      w -= gradient
    }

    println("Final w: " + w)
  }
} 
Example 93
Source File: SparkLR.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{Vector, DenseVector}

import org.apache.spark._


object SparkLR {
  val N = 10000  // Number of data points
  val D = 10   // Number of dimensions
  val R = 0.7  // Scaling factor
  val ITERATIONS = 5
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def generateData: Array[DataPoint] = {
    def generatePoint(i: Int): DataPoint = {
      val y = if (i % 2 == 0) -1 else 1
      val x = DenseVector.fill(D){rand.nextGaussian + y * R}
      DataPoint(x, y)
    }
    Array.tabulate(N)(generatePoint)
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    val sparkConf = new SparkConf().setAppName("SparkLR")
    val sc = new SparkContext(sparkConf)
    val numSlices = if (args.length > 0) args(0).toInt else 2
    val points = sc.parallelize(generateData, numSlices).cache()

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = points.map { p =>
        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
      }.reduce(_ + _)
      w -= gradient
    }

    println("Final w: " + w)

    sc.stop()
  }
} 
Example 94
Source File: LocalKMeans.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import scala.collection.mutable.HashMap
import scala.collection.mutable.HashSet

import breeze.linalg.{Vector, DenseVector, squaredDistance}

import org.apache.spark.SparkContext._


object LocalKMeans {
  val N = 1000
  val R = 1000    // Scaling factor
  val D = 10
  val K = 10
  val convergeDist = 0.001
  val rand = new Random(42)

  def generateData: Array[DenseVector[Double]] = {
    def generatePoint(i: Int): DenseVector[Double] = {
      DenseVector.fill(D){rand.nextDouble * R}
    }
    Array.tabulate(N)(generatePoint)
  }

  def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = {
    var index = 0
    var bestIndex = 0
    var closest = Double.PositiveInfinity

    for (i <- 1 to centers.size) {
      val vCurr = centers.get(i).get
      val tempDist = squaredDistance(p, vCurr)
      if (tempDist < closest) {
        closest = tempDist
        bestIndex = i
      }
    }

    bestIndex
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of KMeans Clustering and is given as an example!
        |Please use the KMeans method found in org.apache.spark.mllib.clustering
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    val data = generateData
    var points = new HashSet[Vector[Double]]
    var kPoints = new HashMap[Int, Vector[Double]]
    var tempDist = 1.0

    while (points.size < K) {
      points.add(data(rand.nextInt(N)))
    }

    val iter = points.iterator
    for (i <- 1 to points.size) {
      kPoints.put(i, iter.next())
    }

    println("Initial centers: " + kPoints)

    while(tempDist > convergeDist) {
      var closest = data.map (p => (closestPoint(p, kPoints), (p, 1)))

      var mappings = closest.groupBy[Int] (x => x._1)

      var pointStats = mappings.map { pair =>
        pair._2.reduceLeft [(Int, (Vector[Double], Int))] {
          case ((id1, (x1, y1)), (id2, (x2, y2))) => (id1, (x1 + x2, y1 + y2))
        }
      }

      var newPoints = pointStats.map {mapping =>
        (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))}

      tempDist = 0.0
      for (mapping <- newPoints) {
        tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2)
      }

      for (newP <- newPoints) {
        kPoints.put(newP._1, newP._2)
      }
    }

    println("Final centers: " + kPoints)
  }
} 
Example 95
Source File: SampledRDD.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.PoissonDistribution

import org.apache.spark.{Partition, TaskContext}

@deprecated("Replaced by PartitionwiseSampledRDDPartition", "1.0.0")
private[spark]
class SampledRDDPartition(val prev: Partition, val seed: Int) extends Partition with Serializable {
  override val index: Int = prev.index
}

@deprecated("Replaced by PartitionwiseSampledRDD", "1.0.0")
private[spark] class SampledRDD[T: ClassTag](
    prev: RDD[T],
    withReplacement: Boolean,
    frac: Double,
    seed: Int)
  extends RDD[T](prev) {

  override def getPartitions: Array[Partition] = {
    val rg = new Random(seed)
    firstParent[T].partitions.map(x => new SampledRDDPartition(x, rg.nextInt))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[SampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[T] = {
    val split = splitIn.asInstanceOf[SampledRDDPartition]
    if (withReplacement) {
      // For large datasets, the expected number of occurrences of each element in a sample with
      // replacement is Poisson(frac). We use that to get a count for each element.
      val poisson = new PoissonDistribution(frac)
      poisson.reseedRandomGenerator(split.seed)

      firstParent[T].iterator(split.prev, context).flatMap { element =>
        val count = poisson.sample()
        if (count == 0) {
          Iterator.empty  // Avoid object allocation when we return 0 items, which is quite often
        } else {
          Iterator.fill(count)(element)
        }
      }
    } else { // Sampling without replacement
      val rand = new Random(split.seed)
      firstParent[T].iterator(split.prev, context).filter(x => (rand.nextDouble <= frac))
    }
  }
} 
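The with-replacement branch leans on the approximation that each element's replication count in a bootstrap sample of a large dataset is Poisson(frac). A minimal standalone sketch of that counting step, assuming commons-math3 is on the classpath and using an illustrative frac and element range:

import org.apache.commons.math3.distribution.PoissonDistribution

object PoissonSampleSketch {
  def main(args: Array[String]): Unit = {
    val frac = 0.5                              // assumed sampling fraction
    val poisson = new PoissonDistribution(frac) // mean = frac
    poisson.reseedRandomGenerator(42L)          // fixed seed for a reproducible run
    val sampled = (1 to 10).iterator.flatMap { element =>
      val count = poisson.sample()              // how many copies of this element to emit
      if (count == 0) Iterator.empty else Iterator.fill(count)(element)
    }
    println(sampled.toList)
  }
}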
Example 96
Source File: PartitionwiseSampledRDD.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.random.RandomSampler
import org.apache.spark.util.Utils

private[spark]
class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}


private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[T],
    sampler: RandomSampler[T, U],
    @transient preservesPartitioning: Boolean,
    @transient seed: Long = Utils.random.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong()))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.setSeed(split.seed)
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
  }
} 
Example 97
Source File: SparkTachyonHdfsLR.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{Vector, DenseVector}
import org.apache.hadoop.conf.Configuration

import org.apache.spark._
import org.apache.spark.scheduler.InputFormatInfo
import org.apache.spark.storage.StorageLevel



object SparkTachyonHdfsLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD (stochastic gradient descent) or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS (a limited-memory quasi-Newton method)
        |for more conventional use.
      """.stripMargin)
  }

  case class DataPoint(x: Vector[Double], y: Double)

  def parsePoint(line: String): DataPoint = {
    val tok = new java.util.StringTokenizer(line, " ")
    var y = tok.nextToken.toDouble
    var x = new Array[Double](D)
    var i = 0
    while (i < D) {
      x(i) = tok.nextToken.toDouble; i += 1
    }
    DataPoint(new DenseVector(x), y)
  }

  def main(args: Array[String]) {

    showWarning()

    val inputPath = args(0)
    val sparkConf = new SparkConf().setAppName("SparkTachyonHdfsLR")
    val conf = new Configuration()
    val sc = new SparkContext(sparkConf,
      InputFormatInfo.computePreferredLocations(
        Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath))
      ))
    val lines = sc.textFile(inputPath)
    val points = lines.map(parsePoint _).persist(StorageLevel.OFF_HEAP)
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = points.map { p =>
        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
      }.reduce(_ + _)
      w -= gradient
    }

    println("Final w: " + w)
    sc.stop()
  }
}
// scalastyle:on println 
Example 98
Source File: SkewedGroupByTest.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._


object SkewedGroupByTest {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("GroupBy Test")
    var numMappers = if (args.length > 0) args(0).toInt else 2
    var numKVPairs = if (args.length > 1) args(1).toInt else 1000
    var valSize = if (args.length > 2) args(2).toInt else 1000
    var numReducers = if (args.length > 3) args(3).toInt else numMappers

    val sc = new SparkContext(sparkConf)

    val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random

      // map output sizes linearly increase from the 1st to the last
      numKVPairs = (1.0 * (p + 1) / numMappers * numKVPairs).toInt

      var arr1 = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        ranGen.nextBytes(byteArr)
        arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
      }
      arr1
    }.cache()
    // Enforce that everything has been calculated and in cache
    pairs1.count()

    println(pairs1.groupByKey(numReducers).count())

    sc.stop()
  }
}
// scalastyle:on println 
Example 99
Source File: SparkHdfsLR.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{Vector, DenseVector}
import org.apache.hadoop.conf.Configuration

import org.apache.spark._
import org.apache.spark.scheduler.InputFormatInfo



object SparkHdfsLR {
  // (The object header and helper definitions below were missing from this truncated
  // listing; they are restored to match the full SparkHdfsLR example shown earlier.)
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def parsePoint(line: String): DataPoint = {
    val tok = new java.util.StringTokenizer(line, " ")
    val y = tok.nextToken.toDouble
    val x = new Array[Double](D)
    var i = 0
    while (i < D) {
      x(i) = tok.nextToken.toDouble; i += 1
    }
    DataPoint(new DenseVector(x), y)
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    val sparkConf = new SparkConf().setAppName("SparkHdfsLR").setMaster("local[2]")
    val inputPath = "D:\\spark\\spark-1.5.0-hadoop2.6\\data\\mllib\\lr_data.txt"//args(0)
    val conf = new Configuration()
    val sc = new SparkContext(sparkConf,
      InputFormatInfo.computePreferredLocations(
        Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath))
      ))
    val lines = sc.textFile(inputPath)
    val points = lines.map(parsePoint _).cache()  // cache the parsed points
    val ITERATIONS = 6  // number of iterations (hard-coded here instead of args(1).toInt)

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = points.map { p =>
        // p is a DataPoint; p.x is its feature vector
        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
      }.reduce(_ + _)
      w -= gradient
    }

    println("Final w: " + w)
    sc.stop()
  }
}
// scalastyle:on println 
Example 100
Source File: LocalFileLR.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import breeze.linalg.{Vector, DenseVector}


object LocalFileLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)
  // Parse one line of input into a DataPoint
  def parsePoint(line: String): DataPoint = {
    val nums = line.split(' ').map(_.toDouble)
    DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0))
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS (a limited-memory quasi-Newton method)
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()
    // Read the input file into an Array[String] of lines
    val lines = scala.io.Source.fromFile(args(0)).getLines().toArray
    // Parse each line into a DataPoint with parsePoint
    val points = lines.map(parsePoint _)
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      var gradient = DenseVector.zeros[Double](D)
      for (p <- points) {
        val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y
        gradient += p.x * scale
      }
      w -= gradient
    }

    println("Final w: " + w)
  }
}
// scalastyle:on println 
Example 101
Source File: PeopleInfoFileGenerator.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples.demoIBM

import java.io.File
import java.util.Random
import java.io.FileWriter

object PeopleInfoFileGenerator {
  def main(args: Array[String]) {
    val writer = new FileWriter(new File("D:\\eclipse44_64\\workspace\\spark1.5\\examples\\sample_people_info.txt"), false)
    val rand = new Random()
    for (i <- 1 to 10000) {
      var height = rand.nextInt(220)
      if (height < 50) {
        height = height + 50
      }
      var gender = getRandomGender
      if (height < 100 && gender == "M")
        height = height + 100
      if (height < 100 && gender == "F")
        height = height + 50
      writer.write(i + " " + gender + " " + height)  // write the same gender used for the height adjustment
      writer.write(System.getProperty("line.separator"))
    }
    writer.flush()
    writer.close()
    println("People Information File generated successfully.")
  }

  def getRandomGender(): String = {
    val rand = new Random()
    val randNum = rand.nextInt(2) + 1
    if (randNum % 2 == 0) {
      "M"
    } else {
      "F"
    }
  }
} 
Example 102
Source File: SparkLR.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{Vector, DenseVector}

import org.apache.spark._


object SparkLR {
  val N = 10000  // Number of data points
  val D = 10   // Number of dimensions
  val R = 0.7  // Scaling factor
  val ITERATIONS = 5
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def generateData: Array[DataPoint] = {
    def generatePoint(i: Int): DataPoint = {
      val y = if (i % 2 == 0) -1 else 1
      val x = DenseVector.fill(D){rand.nextGaussian + y * R}
      DataPoint(x, y)
    }
    Array.tabulate(N)(generatePoint)
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD (stochastic gradient descent) or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS (a limited-memory quasi-Newton method)
        |for more conventional use.
      """.stripMargin)
    // String.stripMargin strips each line's leading whitespace up to and including the first '|' delimiter
  }

  def main(args: Array[String]) {

    showWarning()

    val sparkConf = new SparkConf().setAppName("SparkLR").setMaster("local")
    val sc = new SparkContext(sparkConf)
    val numSlices = if (args.length > 0) args(0).toInt else 2
    val points = sc.parallelize(generateData, numSlices).cache()

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = points.map { p =>
        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
      }.reduce(_ + _)
      w -= gradient
    }

    println("Final w: " + w)

    sc.stop()
  }
}
// scalastyle:on println 
Example 103
Source File: FileWrite.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package scalaDemo

import java.io.{File, FileWriter}
import java.util.Random

import com.google.common.base.Charsets.UTF_8
import com.google.common.io.Files
import org.apache.spark.util.Utils

object FileWrite {
  def main(args: Array[String]) {


    val outFile = File.createTempFile("test-load-spark-properties", "test")
    Files.write("spark.test.fileNameLoadA true\n" +
      "spark.test.fileNameLoadB 1\n", outFile, UTF_8)


    val writer = new FileWriter(new File("D:\\eclipse44_64\\workspace\\spark1.5\\examples\\sample_age_data.txt"), false)
    val rand = new Random()
    for (i <- 1 to 10000) {
      writer.write(i + " " + rand.nextInt(100))
      writer.write(System.getProperty("line.separator"))
    }
    writer.flush()
    writer.close()
  }
} 
Example 104
Source File: SampledRDD.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.PoissonDistribution

import org.apache.spark.{Partition, TaskContext}

@deprecated("Replaced by PartitionwiseSampledRDDPartition", "1.0.0")
private[spark]
class SampledRDDPartition(val prev: Partition, val seed: Int) extends Partition with Serializable {
  override val index: Int = prev.index
}

@deprecated("Replaced by PartitionwiseSampledRDD", "1.0.0")
private[spark] class SampledRDD[T: ClassTag](
    prev: RDD[T],
    withReplacement: Boolean,
    frac: Double,
    seed: Int)
  extends RDD[T](prev) {

  override def getPartitions: Array[Partition] = {
    val rg = new Random(seed)
    firstParent[T].partitions.map(x => new SampledRDDPartition(x, rg.nextInt))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[SampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[T] = {
    val split = splitIn.asInstanceOf[SampledRDDPartition]
    if (withReplacement) {
      // For large datasets, the expected number of occurrences of each element in a sample with
      // replacement is Poisson(frac). We use that to get a count for each element.
      val poisson = new PoissonDistribution(frac)
      poisson.reseedRandomGenerator(split.seed)

      firstParent[T].iterator(split.prev, context).flatMap { element =>
        val count = poisson.sample()
        if (count == 0) {
          Iterator.empty  // Avoid object allocation when we return 0 items, which is quite often
        } else {
          Iterator.fill(count)(element)
        }
      }
    } else { // Sampling without replacement
      val rand = new Random(split.seed)
      firstParent[T].iterator(split.prev, context).filter(x => (rand.nextDouble <= frac))
    }
  }
} 
Example 105
Source File: PartitionwiseSampledRDD.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.random.RandomSampler
import org.apache.spark.util.Utils

private[spark]
class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}


private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[T],
    sampler: RandomSampler[T, U],
    @transient preservesPartitioning: Boolean,
    @transient seed: Long = Utils.random.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong()))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.setSeed(split.seed)
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
  }
} 
Example 106
Source File: SessionKafkaProducer.scala    From flink_training   with Apache License 2.0 5 votes vote down vote up
package com.tmalaska.flinktraining.example.session

import java.util.{Properties, Random}

import net.liftweb.json.DefaultFormats
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import net.liftweb.json.Serialization.write

object SessionKafkaProducer {
  def main(args:Array[String]): Unit = {

    implicit val formats = DefaultFormats

    val kafkaServerURL = args(0)
    val kafkaServerPort = args(1)
    val topic = args(2)
    val numberOfEntities = args(3).toInt
    val numberOfMessagesPerEntity = args(4).toInt
    val waitTimeBetweenMessageBatch = args(5).toInt
    val chancesOfMissing = args(6).toInt

    val props = new Properties()
    props.put("bootstrap.servers", kafkaServerURL + ":" + kafkaServerPort)
    props.put("acks", "all")
    props.put("retries", "0")
    props.put("batch.size", "16384")
    props.put("linger.ms", "1")
    props.put("buffer.memory", "33554432")
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")

    val producer = new KafkaProducer[String, String](props)

    val r = new Random()
    var sentCount = 0

    println("About to send to " + topic)
    for (j <- 0 to numberOfMessagesPerEntity) {
      for (i <- 0 to numberOfEntities) {
        if (r.nextInt(chancesOfMissing) != 0) {
          val message = write(HeartBeat(i.toString, System.currentTimeMillis()))
          val producerRecord = new ProducerRecord[String,String](topic, message)
          producer.send(producerRecord)
          sentCount += 1
        }
      }
      println("Sent Count:" + sentCount)
      Thread.sleep(waitTimeBetweenMessageBatch)
    }

    producer.close()
  }
} 
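The `r.nextInt(chancesOfMissing) != 0` guard skips roughly one heartbeat in every `chancesOfMissing` sends, which is what gives the downstream session logic something to time out on. A quick standalone check of that drop rate (the value 10 is only an assumed example):

import java.util.Random

object DropRateCheck {
  def main(args: Array[String]): Unit = {
    val r = new Random()
    val chancesOfMissing = 10 // assumed: roughly 1 in 10 heartbeats is skipped
    val trials = 100000
    val dropped = (1 to trials).count(_ => r.nextInt(chancesOfMissing) == 0)
    println(s"observed drop rate: ${dropped.toDouble / trials}") // expected around 0.1
  }
}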
Example 107
Source File: RandomProjectionsHasher.scala    From pravda-ml   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.odkl.texts

import java.util.Random

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol, HasSeed}
import org.apache.spark.ml.param._
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.ml.linalg.{Matrices, SparseMatrix, Vector}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{LongType, StructType}


  def setDim(value: Long): this.type = set(dim, value)


  def this() = this(Identifiable.randomUID("randomProjectionsHasher"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val dimensity = {
      if (!isSet(dim)) { // If dim is not set, infer it from the AttributeGroup metadata produced by OdklCountVectorizer
        val vectorsIndex = dataset.schema.fieldIndex($(inputCol))
        AttributeGroup.fromStructField(dataset.schema.fields(vectorsIndex)).size
      } else {
        $(dim).toInt
      }
    }
    val projectionMatrix = dataset.sqlContext.sparkContext.broadcast(
      Matrices.sprandn($(basisSize).toInt, dimensity, $(sparsity), new Random($(seed))).asInstanceOf[SparseMatrix])
    // the matrix of random vectors used to construct the hash

    val binHashSparseVectorColumn = udf((vector: Vector) => {
      projectionMatrix.value.multiply(vector).values
        .map(f =>  if (f>0) 1L else 0L)
        .view.zipWithIndex
        .foldLeft(0L) {case  (acc,(v, i)) => acc | (v << i) }

    })
    dataset.withColumn($(outputCol), binHashSparseVectorColumn(dataset.col($(inputCol))))
  }

  override def copy(extra: ParamMap): Transformer = {
    defaultCopy(extra)
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    SchemaUtils.appendColumn(schema, $(outputCol), LongType)
  }

} 
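The hashing above is sign random projection: each of the `basisSize` random rows contributes one bit (1 if the dot product with the input is positive), and the bits are OR-ed into a single Long. A Spark-free sketch of that bit-packing with plain arrays and java.util.Random (the dimension and basis size are assumed illustrative values):

import java.util.Random

object SignProjectionHash {
  def main(args: Array[String]): Unit = {
    val rand = new Random(7L)
    val dim = 8        // assumed input dimension
    val basisSize = 16 // assumed number of random hyperplanes, i.e. hash bits
    // One random Gaussian row per hash bit, plus one input vector to hash.
    val basis = Array.fill(basisSize, dim)(rand.nextGaussian())
    val input = Array.fill(dim)(rand.nextDouble())

    val hash = basis.zipWithIndex.foldLeft(0L) { case (acc, (row, i)) =>
      val dot = row.zip(input).map { case (a, b) => a * b }.sum
      val bit = if (dot > 0) 1L else 0L
      acc | (bit << i) // same bit-packing as binHashSparseVectorColumn above
    }
    println(s"hash = $hash")
  }
}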
Example 108
Source File: SimpleSkewedGroupByTest.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.sql.SparkSession


object SimpleSkewedGroupByTest {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("SimpleSkewedGroupByTest")
      .getOrCreate()

    val numMappers = if (args.length > 0) args(0).toInt else 2
    val numKVPairs = if (args.length > 1) args(1).toInt else 1000
    val valSize = if (args.length > 2) args(2).toInt else 1000
    val numReducers = if (args.length > 3) args(3).toInt else numMappers
    val ratio = if (args.length > 4) args(4).toInt else 5.0

    val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random
      val result = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        ranGen.nextBytes(byteArr)
        val offset = ranGen.nextInt(1000) * numReducers
        if (ranGen.nextDouble < ratio / (numReducers + ratio - 1)) {
          // give ratio times higher chance of generating key 0 (for reducer 0)
          result(i) = (offset, byteArr)
        } else {
          // generate a key for one of the other reducers
          val key = 1 + ranGen.nextInt(numReducers-1) + offset
          result(i) = (key, byteArr)
        }
      }
      result
    }.cache
    // Enforce that everything has been calculated and in cache
    pairs1.count

    println(s"RESULT: ${pairs1.groupByKey(numReducers).count}")

    spark.stop()
  }
}
// scalastyle:on println 
Example 109
Source File: SkewedGroupByTest.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.sql.SparkSession


object SkewedGroupByTest {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("GroupBy Test")
      .getOrCreate()

    val numMappers = if (args.length > 0) args(0).toInt else 2
    var numKVPairs = if (args.length > 1) args(1).toInt else 1000
    val valSize = if (args.length > 2) args(2).toInt else 1000
    val numReducers = if (args.length > 3) args(3).toInt else numMappers

    val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random

      // map output sizes linearly increase from the 1st to the last
      numKVPairs = (1.0 * (p + 1) / numMappers * numKVPairs).toInt

      val arr1 = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        ranGen.nextBytes(byteArr)
        arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
      }
      arr1
    }.cache()
    // Enforce that everything has been calculated and in cache
    pairs1.count()

    println(pairs1.groupByKey(numReducers).count())

    spark.stop()
  }
}
// scalastyle:on println 
Example 110
Source File: SparkHdfsLR.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{DenseVector, Vector}

import org.apache.spark.sql.SparkSession


object SparkHdfsLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def parsePoint(line: String): DataPoint = {
    val tok = new java.util.StringTokenizer(line, " ")
    val y = tok.nextToken.toDouble
    val x = new Array[Double](D)
    var i = 0
    while (i < D) {
      x(i) = tok.nextToken.toDouble; i += 1
    }
    DataPoint(new DenseVector(x), y)
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use org.apache.spark.ml.classification.LogisticRegression
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    if (args.length < 2) {
      System.err.println("Usage: SparkHdfsLR <file> <iters>")
      System.exit(1)
    }

    showWarning()

    val spark = SparkSession
      .builder
      .appName("SparkHdfsLR")
      .getOrCreate()

    val inputPath = args(0)
    val lines = spark.read.textFile(inputPath).rdd

    lines.cache()
    val points = lines.map(parsePoint).cache()
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    val w = DenseVector.fill(D) {2 * rand.nextDouble - 1}
    println(s"Initial w: $w")

    for (i <- 1 to ITERATIONS) {
      println(s"On iteration $i")
      val gradient = points.map { p =>
        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
      }.reduce(_ + _)
      w -= gradient
    }

    println(s"Final w: $w")
    spark.stop()
  }
}
// scalastyle:on println 
Example 111
Source File: LocalLR.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import breeze.linalg.{DenseVector, Vector}


object LocalLR {
  val N = 10000  // Number of data points
  val D = 10   // Number of dimensions
  val R = 0.7  // Scaling factor
  val ITERATIONS = 5
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def generateData: Array[DataPoint] = {
    def generatePoint(i: Int): DataPoint = {
      val y = if (i % 2 == 0) -1 else 1
      val x = DenseVector.fill(D) {rand.nextGaussian + y * R}
      DataPoint(x, y)
    }
    Array.tabulate(N)(generatePoint)
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use org.apache.spark.ml.classification.LogisticRegression
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    val data = generateData
    // Initialize w to a random value
    val w = DenseVector.fill(D) {2 * rand.nextDouble - 1}
    println(s"Initial w: $w")

    for (i <- 1 to ITERATIONS) {
      println(s"On iteration $i")
      val gradient = DenseVector.zeros[Double](D)
      for (p <- data) {
        val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y
        gradient +=  p.x * scale
      }
      w -= gradient
    }

    println(s"Final w: $w")
  }
}
// scalastyle:on println 
Example 112
Source File: GroupByTest.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.sql.SparkSession


object GroupByTest {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("GroupBy Test")
      .getOrCreate()

    val numMappers = if (args.length > 0) args(0).toInt else 2
    val numKVPairs = if (args.length > 1) args(1).toInt else 1000
    val valSize = if (args.length > 2) args(2).toInt else 1000
    val numReducers = if (args.length > 3) args(3).toInt else numMappers

    val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random
      val arr1 = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        ranGen.nextBytes(byteArr)
        arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
      }
      arr1
    }.cache()

    pairs1.repartition(1)
    // Enforce that everything has been calculated and in cache
    pairs1.count()

    implicit val caseInsensitiveOrdering = new Ordering[(Int, String)] {

      override def compare(a: (Int, String), b: (Int, String)): Int = a._1.compareTo( b._1)

    }

    println(pairs1.groupByKey(numReducers).count())

    spark.stop()
  }
}
// scalastyle:on println 
Example 113
Source File: LocalFileLR.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import breeze.linalg.{DenseVector, Vector}


object LocalFileLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def parsePoint(line: String): DataPoint = {
    val nums = line.split(' ').map(_.toDouble)
    DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0))
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use org.apache.spark.ml.classification.LogisticRegression
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    val fileSrc = scala.io.Source.fromFile(args(0))
    val lines = fileSrc.getLines().toArray
    val points = lines.map(parsePoint)
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    val w = DenseVector.fill(D) {2 * rand.nextDouble - 1}
    println(s"Initial w: $w")

    for (i <- 1 to ITERATIONS) {
      println(s"On iteration $i")
      val gradient = DenseVector.zeros[Double](D)
      for (p <- points) {
        val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y
        gradient += p.x * scale
      }
      w -= gradient
    }

    fileSrc.close()
    println(s"Final w: $w")
  }
}
// scalastyle:on println 
Example 114
Source File: PageViewGenerator.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.streaming.clickstream

import java.io.PrintWriter
import java.net.ServerSocket
import java.util.Random


object PageViewGenerator {
  val pages = Map("http://foo.com/" -> .7,
                  "http://foo.com/news" -> 0.2,
                  "http://foo.com/contact" -> .1)
  val httpStatus = Map(200 -> .95,
                       404 -> .05)
  val userZipCode = Map(94709 -> .5,
                        94117 -> .5)
  val userID = Map((1 to 100).map(_ -> .01): _*)

  def pickFromDistribution[T](inputMap: Map[T, Double]): T = {
    val rand = new Random().nextDouble()
    var total = 0.0
    for ((item, prob) <- inputMap) {
      total = total + prob
      if (total > rand) {
        return item
      }
    }
    inputMap.take(1).head._1 // Shouldn't get here if probabilities add up to 1.0
  }

  def getNextClickEvent(): String = {
    val id = pickFromDistribution(userID)
    val page = pickFromDistribution(pages)
    val status = pickFromDistribution(httpStatus)
    val zipCode = pickFromDistribution(userZipCode)
    new PageView(page, status, zipCode, id).toString()
  }

  def main(args: Array[String]) {
    if (args.length != 2) {
      System.err.println("Usage: PageViewGenerator <port> <viewsPerSecond>")
      System.exit(1)
    }
    val port = args(0).toInt
    val viewsPerSecond = args(1).toFloat
    val sleepDelayMs = (1000.0 / viewsPerSecond).toInt
    val listener = new ServerSocket(port)
    println(s"Listening on port: $port")

    while (true) {
      val socket = listener.accept()
      new Thread() {
        override def run(): Unit = {
          println(s"Got client connected from: ${socket.getInetAddress}")
          val out = new PrintWriter(socket.getOutputStream(), true)

          while (true) {
            Thread.sleep(sleepDelayMs)
            out.write(getNextClickEvent())
            out.flush()
          }
          socket.close()
        }
      }.start()
    }
  }
}
// scalastyle:on println 
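The `pickFromDistribution` helper above is a generic weighted pick over a probability map and is easy to reuse on its own. A self-contained version with the same logic, here sharing one seeded Random across calls (the seed is an illustrative assumption):

import java.util.Random

object WeightedPick {
  def pickFromDistribution[T](inputMap: Map[T, Double], rand: Random): T = {
    val r = rand.nextDouble()
    var total = 0.0
    for ((item, prob) <- inputMap) {
      total += prob
      if (total > r) {
        return item
      }
    }
    inputMap.head._1 // fallback if the probabilities do not sum to exactly 1.0
  }

  def main(args: Array[String]): Unit = {
    val pages = Map(
      "http://foo.com/" -> 0.7,
      "http://foo.com/news" -> 0.2,
      "http://foo.com/contact" -> 0.1)
    val rand = new Random(42L) // assumed seed, only to make the demo repeatable
    (1 to 5).foreach(_ => println(pickFromDistribution(pages, rand)))
  }
}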
Example 115
Source File: SparkLR.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{DenseVector, Vector}

import org.apache.spark.sql.SparkSession


object SparkLR {
  val N = 10000  // Number of data points
  val D = 10   // Number of dimensions
  val R = 0.7  // Scaling factor
  val ITERATIONS = 5
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def generateData: Array[DataPoint] = {
    def generatePoint(i: Int): DataPoint = {
      val y = if (i % 2 == 0) -1 else 1
      val x = DenseVector.fill(D) {rand.nextGaussian + y * R}
      DataPoint(x, y)
    }
    Array.tabulate(N)(generatePoint)
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use org.apache.spark.ml.classification.LogisticRegression
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    val spark = SparkSession
      .builder
      .appName("SparkLR")
      .getOrCreate()

    val numSlices = if (args.length > 0) args(0).toInt else 2
    val points = spark.sparkContext.parallelize(generateData, numSlices).cache()

    // Initialize w to a random value
    val w = DenseVector.fill(D) {2 * rand.nextDouble - 1}
    println(s"Initial w: $w")

    for (i <- 1 to ITERATIONS) {
      println(s"On iteration $i")
      val gradient = points.map { p =>
        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
      }.reduce(_ + _)
      w -= gradient
    }

    println(s"Final w: $w")

    spark.stop()
  }
}
// scalastyle:on println 
Example 116
Source File: LocalKMeans.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.collection.mutable.HashMap
import scala.collection.mutable.HashSet

import breeze.linalg.{squaredDistance, DenseVector, Vector}


object LocalKMeans {
  val N = 1000
  val R = 1000    // Scaling factor
  val D = 10
  val K = 10
  val convergeDist = 0.001
  val rand = new Random(42)

  def generateData: Array[DenseVector[Double]] = {
    def generatePoint(i: Int): DenseVector[Double] = {
      DenseVector.fill(D) {rand.nextDouble * R}
    }
    Array.tabulate(N)(generatePoint)
  }

  def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = {
    var bestIndex = 0
    var closest = Double.PositiveInfinity

    for (i <- 1 to centers.size) {
      val vCurr = centers(i)
      val tempDist = squaredDistance(p, vCurr)
      if (tempDist < closest) {
        closest = tempDist
        bestIndex = i
      }
    }

    bestIndex
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of KMeans Clustering and is given as an example!
        |Please use org.apache.spark.ml.clustering.KMeans
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    val data = generateData
    val points = new HashSet[Vector[Double]]
    val kPoints = new HashMap[Int, Vector[Double]]
    var tempDist = 1.0

    while (points.size < K) {
      points.add(data(rand.nextInt(N)))
    }

    val iter = points.iterator
    for (i <- 1 to points.size) {
      kPoints.put(i, iter.next())
    }

    println(s"Initial centers: $kPoints")

    while(tempDist > convergeDist) {
      val closest = data.map (p => (closestPoint(p, kPoints), (p, 1)))

      val mappings = closest.groupBy[Int] (x => x._1)

      val pointStats = mappings.map { pair =>
        pair._2.reduceLeft [(Int, (Vector[Double], Int))] {
          case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2))
        }
      }

      var newPoints = pointStats.map {mapping =>
        (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))}

      tempDist = 0.0
      for (mapping <- newPoints) {
        tempDist += squaredDistance(kPoints(mapping._1), mapping._2)
      }

      for (newP <- newPoints) {
        kPoints.put(newP._1, newP._2)
      }
    }

    println(s"Final centers: $kPoints")
  }
}
// scalastyle:on println 
Example 117
Source File: ChiSquareTestSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.stat

import java.util.Random

import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.stat.test.ChiSqTest
import org.apache.spark.mllib.util.MLlibTestSparkContext

class ChiSquareTestSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  test("test DataFrame of labeled points") {
    // labels: 1.0 (2 / 6), 0.0 (4 / 6)
    // feature1: 0.5 (1 / 6), 1.5 (2 / 6), 3.5 (3 / 6)
    // feature2: 10.0 (1 / 6), 20.0 (1 / 6), 30.0 (2 / 6), 40.0 (2 / 6)
    val data = Seq(
      LabeledPoint(0.0, Vectors.dense(0.5, 10.0)),
      LabeledPoint(0.0, Vectors.dense(1.5, 20.0)),
      LabeledPoint(1.0, Vectors.dense(1.5, 30.0)),
      LabeledPoint(0.0, Vectors.dense(3.5, 30.0)),
      LabeledPoint(0.0, Vectors.dense(3.5, 40.0)),
      LabeledPoint(1.0, Vectors.dense(3.5, 40.0)))
    for (numParts <- List(2, 4, 6, 8)) {
      val df = spark.createDataFrame(sc.parallelize(data, numParts))
      val chi = ChiSquareTest.test(df, "features", "label")
      val (pValues: Vector, degreesOfFreedom: Array[Int], statistics: Vector) =
        chi.select("pValues", "degreesOfFreedom", "statistics")
          .as[(Vector, Array[Int], Vector)].head()
      assert(pValues ~== Vectors.dense(0.6873, 0.6823) relTol 1e-4)
      assert(degreesOfFreedom === Array(2, 3))
      assert(statistics ~== Vectors.dense(0.75, 1.5) relTol 1e-4)
    }
  }

  test("large number of features (SPARK-3087)") {
    // Test that the right number of results is returned
    val numCols = 1001
    val sparseData = Array(
      LabeledPoint(0.0, Vectors.sparse(numCols, Seq((100, 2.0)))),
      LabeledPoint(0.1, Vectors.sparse(numCols, Seq((200, 1.0)))))
    val df = spark.createDataFrame(sparseData)
    val chi = ChiSquareTest.test(df, "features", "label")
    val (pValues: Vector, degreesOfFreedom: Array[Int], statistics: Vector) =
      chi.select("pValues", "degreesOfFreedom", "statistics")
        .as[(Vector, Array[Int], Vector)].head()
    assert(pValues.size === numCols)
    assert(degreesOfFreedom.length === numCols)
    assert(statistics.size === numCols)
    assert(pValues(1000) !== null)  // SPARK-3087
  }

  test("fail on continuous features or labels") {
    val tooManyCategories: Int = 100000
    assert(tooManyCategories > ChiSqTest.maxCategories, "This unit test requires that " +
      "tooManyCategories be large enough to cause ChiSqTest to throw an exception.")

    val random = new Random(11L)
    val continuousLabel = Seq.fill(tooManyCategories)(
      LabeledPoint(random.nextDouble(), Vectors.dense(random.nextInt(2))))
    withClue("ChiSquare should throw an exception when given a continuous-valued label") {
      intercept[SparkException] {
        val df = spark.createDataFrame(continuousLabel)
        ChiSquareTest.test(df, "features", "label")
      }
    }
    val continuousFeature = Seq.fill(tooManyCategories)(
      LabeledPoint(random.nextInt(2), Vectors.dense(random.nextDouble())))
    withClue("ChiSquare should throw an exception when given continuous-valued features") {
      intercept[SparkException] {
        val df = spark.createDataFrame(continuousFeature)
        ChiSquareTest.test(df, "features", "label")
      }
    }
  }
} 
Example 118
Source File: StopwatchSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.util

import java.util.Random

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class StopwatchSuite extends SparkFunSuite with MLlibTestSparkContext {

  import StopwatchSuite._

  private def testStopwatchOnDriver(sw: Stopwatch): Unit = {
    assert(sw.name === "sw")
    assert(sw.elapsed() === 0L)
    assert(!sw.isRunning)
    intercept[AssertionError] {
      sw.stop()
    }
    val duration = checkStopwatch(sw)
    val elapsed = sw.elapsed()
    assert(elapsed === duration)
    val duration2 = checkStopwatch(sw)
    val elapsed2 = sw.elapsed()
    assert(elapsed2 === duration + duration2)
    assert(sw.toString === s"sw: ${elapsed2}ms")
    sw.start()
    assert(sw.isRunning)
    intercept[AssertionError] {
      sw.start()
    }
  }

  test("LocalStopwatch") {
    val sw = new LocalStopwatch("sw")
    testStopwatchOnDriver(sw)
  }

  test("DistributedStopwatch on driver") {
    val sw = new DistributedStopwatch(sc, "sw")
    testStopwatchOnDriver(sw)
  }

  test("DistributedStopwatch on executors") {
    val sw = new DistributedStopwatch(sc, "sw")
    val rdd = sc.parallelize(0 until 4, 4)
    val acc = sc.longAccumulator
    rdd.foreach { i =>
      acc.add(checkStopwatch(sw))
    }
    assert(!sw.isRunning)
    val elapsed = sw.elapsed()
    assert(elapsed === acc.value)
  }

  test("MultiStopwatch") {
    val sw = new MultiStopwatch(sc)
      .addLocal("local")
      .addDistributed("spark")
    assert(sw("local").name === "local")
    assert(sw("spark").name === "spark")
    intercept[NoSuchElementException] {
      sw("some")
    }
    assert(sw.toString === "{\n  local: 0ms,\n  spark: 0ms\n}")
    val localDuration = checkStopwatch(sw("local"))
    val sparkDuration = checkStopwatch(sw("spark"))
    val localElapsed = sw("local").elapsed()
    val sparkElapsed = sw("spark").elapsed()
    assert(localElapsed === localDuration)
    assert(sparkElapsed === sparkDuration)
    assert(sw.toString ===
      s"{\n  local: ${localElapsed}ms,\n  spark: ${sparkElapsed}ms\n}")
    val rdd = sc.parallelize(0 until 4, 4)
    val acc = sc.longAccumulator
    rdd.foreach { i =>
      sw("local").start()
      val duration = checkStopwatch(sw("spark"))
      sw("local").stop()
      acc.add(duration)
    }
    val localElapsed2 = sw("local").elapsed()
    assert(localElapsed2 === localElapsed)
    val sparkElapsed2 = sw("spark").elapsed()
    assert(sparkElapsed2 === sparkElapsed + acc.value)
  }
}

private object StopwatchSuite extends SparkFunSuite {
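  // The `checkStopwatch` helper used throughout the suite above is elided in
  // this listing. The following is a hedged reconstruction (an assumption, not
  // the original source): run the stopwatch across a short sleep, check the
  // reported duration against wall-clock lower/upper bounds, and return it.
  // It assumes Stopwatch.stop() returns the duration of the run it just ended,
  // which the accumulation assertions above rely on.
  def checkStopwatch(sw: Stopwatch): Long = {
    val ubStart = now
    sw.start()
    val lbStart = now
    Thread.sleep(50)
    val lb = now - lbStart
    val duration = sw.stop()
    val ub = now - ubStart
    assert(duration >= lb && duration <= ub)
    duration
  }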

  
  private def now: Long = System.currentTimeMillis()
} 
Example 119
Source File: PartitionwiseSampledRDD.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.Utils
import org.apache.spark.util.random.RandomSampler

private[spark]
class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}


private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[T],
    sampler: RandomSampler[T, U],
    preservesPartitioning: Boolean,
    @transient private val seed: Long = Utils.random.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong()))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.setSeed(split.seed)
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
  }
} 
Example 120
Source File: SimpleSkewedGroupByTest.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._


object SimpleSkewedGroupByTest {
  def main(args: Array[String]) {

    val sparkConf = new SparkConf().setAppName("SimpleSkewedGroupByTest")
    var numMappers = if (args.length > 0) args(0).toInt else 2
    var numKVPairs = if (args.length > 1) args(1).toInt else 1000
    var valSize = if (args.length > 2) args(2).toInt else 1000
    var numReducers = if (args.length > 3) args(3).toInt else numMappers
    var ratio = if (args.length > 4) args(4).toInt else 5.0

    val sc = new SparkContext(sparkConf)

    val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random
      var result = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        ranGen.nextBytes(byteArr)
        val offset = ranGen.nextInt(1000) * numReducers
        if (ranGen.nextDouble < ratio / (numReducers + ratio - 1)) {
          // give ratio times higher chance of generating key 0 (for reducer 0)
          result(i) = (offset, byteArr)
        } else {
          // generate a key for one of the other reducers
          val key = 1 + ranGen.nextInt(numReducers-1) + offset
          result(i) = (key, byteArr)
        }
      }
      result
    }.cache
    // Enforce that everything has been calculated and is in cache
    pairs1.count

    println("RESULT: " + pairs1.groupByKey(numReducers).count)
    // Print how many keys each reducer got (for debugging)
    // println("RESULT: " + pairs1.groupByKey(numReducers)
    //                           .map{case (k,v) => (k, v.size)}
    //                           .collectAsMap)

    sc.stop()
  }
}
// scalastyle:on println 
Example 121
Source File: SparkTachyonHdfsLR.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{Vector, DenseVector}
import org.apache.hadoop.conf.Configuration

import org.apache.spark._
import org.apache.spark.scheduler.InputFormatInfo
import org.apache.spark.storage.StorageLevel



object SparkTachyonHdfsLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
        |for more conventional use.
      """.stripMargin)
  }

  case class DataPoint(x: Vector[Double], y: Double)

  def parsePoint(line: String): DataPoint = {
    val tok = new java.util.StringTokenizer(line, " ")
    var y = tok.nextToken.toDouble
    var x = new Array[Double](D)
    var i = 0
    while (i < D) {
      x(i) = tok.nextToken.toDouble; i += 1
    }
    DataPoint(new DenseVector(x), y)
  }

  def main(args: Array[String]) {

    showWarning()

    val inputPath = args(0)
    val sparkConf = new SparkConf().setAppName("SparkTachyonHdfsLR")
    val conf = new Configuration()
    val sc = new SparkContext(sparkConf,
      InputFormatInfo.computePreferredLocations(
        Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath))
      ))
    val lines = sc.textFile(inputPath)
    val points = lines.map(parsePoint _).persist(StorageLevel.OFF_HEAP)
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = points.map { p =>
        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
      }.reduce(_ + _)
      w -= gradient
    }

    println("Final w: " + w)
    sc.stop()
  }
}
// scalastyle:on println 
Example 122
Source File: SkewedGroupByTest.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._


object SkewedGroupByTest {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("GroupBy Test")
    var numMappers = if (args.length > 0) args(0).toInt else 2
    var numKVPairs = if (args.length > 1) args(1).toInt else 1000
    var valSize = if (args.length > 2) args(2).toInt else 1000
    var numReducers = if (args.length > 3) args(3).toInt else numMappers

    val sc = new SparkContext(sparkConf)

    val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random

      // map output sizes linearly increase from the 1st to the last
      numKVPairs = (1.0 * (p + 1) / numMappers * numKVPairs).toInt

      var arr1 = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        ranGen.nextBytes(byteArr)
        arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
      }
      arr1
    }.cache()
    // Enforce that everything has been calculated and is in cache
    pairs1.count()

    println(pairs1.groupByKey(numReducers).count())

    sc.stop()
  }
}
// scalastyle:on println 
Example 123
Source File: SparkHdfsLR.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{Vector, DenseVector}
import org.apache.hadoop.conf.Configuration

import org.apache.spark._
import org.apache.spark.scheduler.InputFormatInfo



object SparkHdfsLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def parsePoint(line: String): DataPoint = {
    val tok = new java.util.StringTokenizer(line, " ")
    var y = tok.nextToken.toDouble
    var x = new Array[Double](D)
    var i = 0
    while (i < D) {
      x(i) = tok.nextToken.toDouble; i += 1
    }
    DataPoint(new DenseVector(x), y)
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    if (args.length < 2) {
      System.err.println("Usage: SparkHdfsLR <file> <iters>")
      System.exit(1)
    }

    showWarning()

    val sparkConf = new SparkConf().setAppName("SparkHdfsLR")
    val inputPath = args(0)
    val conf = new Configuration()
    val sc = new SparkContext(sparkConf,
      InputFormatInfo.computePreferredLocations(
        Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath))
      ))
    val lines = sc.textFile(inputPath)
    val points = lines.map(parsePoint _).cache()
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = points.map { p =>
        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
      }.reduce(_ + _)
      w -= gradient
    }

    println("Final w: " + w)
    sc.stop()
  }
}
// scalastyle:on println 
Example 124
Source File: LocalLR.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import breeze.linalg.{Vector, DenseVector}


object LocalLR {
  val N = 10000  // Number of data points
  val D = 10   // Number of dimensions
  val R = 0.7  // Scaling factor
  val ITERATIONS = 5
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def generateData: Array[DataPoint] = {
    def generatePoint(i: Int): DataPoint = {
      val y = if (i % 2 == 0) -1 else 1
      val x = DenseVector.fill(D){rand.nextGaussian + y * R}
      DataPoint(x, y)
    }
    Array.tabulate(N)(generatePoint)
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    val data = generateData
    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      var gradient = DenseVector.zeros[Double](D)
      for (p <- data) {
        val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y
        gradient +=  p.x * scale
      }
      w -= gradient
    }

    println("Final w: " + w)
  }
}
// scalastyle:on println 
Example 125
Source File: GroupByTest.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._


object GroupByTest {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("GroupBy Test")
    var numMappers = if (args.length > 0) args(0).toInt else 2
    var numKVPairs = if (args.length > 1) args(1).toInt else 1000
    var valSize = if (args.length > 2) args(2).toInt else 1000
    var numReducers = if (args.length > 3) args(3).toInt else numMappers

    val sc = new SparkContext(sparkConf)

    val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random
      var arr1 = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        ranGen.nextBytes(byteArr)
        arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
      }
      arr1
    }.cache()
    // Enforce that everything has been calculated and is in cache
    pairs1.count()

    println(pairs1.groupByKey(numReducers).count())

    sc.stop()
  }
}
// scalastyle:on println 
Example 126
Source File: Utils.scala    From zen   with Apache License 2.0 5 votes vote down vote up
package com.github.cloudml.zen.ml.util

import java.util.Random

object Utils {
  val random = new Random()
  def log1pExp(x: Double): Double = {
    if (x > 0) {
      x + math.log1p(math.exp(-x))
    } else {
      math.log1p(math.exp(x))
    }
  }
} 
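The log1pExp helper above is the standard numerically stable form of log(1 + e^x): for large positive x the naive expression overflows, while splitting off x first keeps the result finite. A small illustrative check (a hypothetical demo object, not part of the zen project) is:

package com.github.cloudml.zen.ml.util

object Log1pExpDemo {
  def naive(x: Double): Double = math.log1p(math.exp(x))

  def main(args: Array[String]): Unit = {
    println(naive(1000.0))           // Infinity: math.exp(1000.0) overflows
    println(Utils.log1pExp(1000.0))  // ~1000.0, the correct value
    println(Utils.log1pExp(-1000.0)) // ~0.0
  }
}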
Example 127
Source File: CustomActivationExample.scala    From dl4scala   with MIT License 5 votes vote down vote up
package org.dl4scala.examples.misc.activationfunctions

import java.util.{Collections, Random}

import org.deeplearning4j.datasets.iterator.impl.ListDataSetIterator
import org.deeplearning4j.nn.api.OptimizationAlgorithm
import org.deeplearning4j.nn.conf.layers.{DenseLayer, OutputLayer}
import org.deeplearning4j.nn.conf.{NeuralNetConfiguration, Updater}
import org.deeplearning4j.nn.multilayer.MultiLayerNetwork
import org.deeplearning4j.nn.weights.WeightInit
import org.deeplearning4j.optimize.listeners.ScoreIterationListener
import org.nd4j.linalg.activations.Activation
import org.nd4j.linalg.api.ndarray.INDArray
import org.nd4j.linalg.dataset.DataSet
import org.nd4j.linalg.dataset.api.iterator.DataSetIterator
import org.nd4j.linalg.factory.Nd4j
import org.nd4j.linalg.lossfunctions.LossFunctions


object CustomActivationExample {
  val seed = 12345
  val iterations = 1
  val nEpochs = 500
  val nSamples = 1000
  val batchSize = 100
  val learningRate = 0.001
  var MIN_RANGE = 0
  var MAX_RANGE = 3

  val rng = new Random(seed)

  def main(args: Array[String]): Unit = {
    val iterator = getTrainingData(batchSize, rng)

    // Create the network
    val numInput = 2
    val numOutputs = 1
    val nHidden = 10

    val net = new MultiLayerNetwork(new NeuralNetConfiguration.Builder()
      .seed(seed)
      .iterations(iterations)
      .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT)
      .learningRate(learningRate)
      .weightInit(WeightInit.XAVIER)
      .updater(Updater.NESTEROVS)
      .list()
      //INSTANTIATING CUSTOM ACTIVATION FUNCTION here as follows
      //Refer to CustomActivation class for more details on implementation
      .layer(0, new DenseLayer.Builder().nIn(numInput).nOut(nHidden)
      .activation(new CustomActivation())
      .build())
      .layer(1, new OutputLayer.Builder(LossFunctions.LossFunction.MSE)
        .activation(Activation.IDENTITY)
        .nIn(nHidden).nOut(numOutputs).build())
      .pretrain(false).backprop(true).build()
    )

    net.init()
    net.setListeners(new ScoreIterationListener(100))

    (0 until nEpochs).foreach{_ =>
      iterator.reset()
      net.fit(iterator)
    }

    // Test the addition of 2 numbers (Try different numbers here)
    val input: INDArray = Nd4j.create(Array[Double](0.111111, 0.3333333333333), Array[Int](1, 2))
    val out: INDArray = net.output(input, false)
    System.out.println(out)
  }


  private def getTrainingData(batchSize: Int, rand: Random): DataSetIterator = {
    val sum = new Array[Double](nSamples)
    val input1 = new Array[Double](nSamples)
    val input2 = new Array[Double](nSamples)

    (0 until nSamples).foreach{i =>
      input1(i) = MIN_RANGE + (MAX_RANGE - MIN_RANGE) * rand.nextDouble
      input2(i) = MIN_RANGE + (MAX_RANGE - MIN_RANGE) * rand.nextDouble
      sum(i) = input1(i) + input2(i)
    }

    val inputNDArray1 = Nd4j.create(input1, Array[Int](nSamples, 1))
    val inputNDArray2 = Nd4j.create(input2, Array[Int](nSamples, 1))
    val inputNDArray = Nd4j.hstack(inputNDArray1, inputNDArray2)
    val outPut = Nd4j.create(sum, Array[Int](nSamples, 1))
    val dataSet = new DataSet(inputNDArray, outPut)
    val listDs = dataSet.asList
    Collections.shuffle(listDs, rng)
    new ListDataSetIterator(listDs, batchSize)
  }
} 
Example 128
Source File: ToxCoreTestBase.scala    From jvm-toxcore-c   with GNU General Public License v3.0 5 votes vote down vote up
package im.tox.tox4j

import java.io.IOException
import java.net.{ InetAddress, Socket }
import java.util.Random

import org.jetbrains.annotations.NotNull
import org.scalatest.Assertions

object ToxCoreTestBase extends Assertions {

  private[tox4j] val nodeCandidates = Seq(
    new DhtNode("tox.initramfs.io", "tox.initramfs.io", 33445, "3F0A45A268367C1BEA652F258C85F4A66DA76BCAA667A49E770BCC4917AB6A25"),
    new DhtNode("tox.verdict.gg", null, 33445, "1C5293AEF2114717547B39DA8EA6F1E331E5E358B35F9B6B5F19317911C5F976")
  )

  @NotNull def randomBytes(length: Int): Array[Byte] = {
    val array = new Array[Byte](length)
    new Random().nextBytes(array)
    array
  }

  @NotNull
  def readablePublicKey(@NotNull id: Array[Byte]): String = {
    val str = new StringBuilder
    id foreach { c => str.append(f"$c%02X") }
    str.toString()
  }

  @NotNull
  def parsePublicKey(@NotNull id: String): Array[Byte] = {
    val publicKey = new Array[Byte](id.length / 2)
    publicKey.indices foreach { i =>
      publicKey(i) =
        ((fromHexDigit(id.charAt(i * 2)) << 4) +
          fromHexDigit(id.charAt(i * 2 + 1))).toByte
    }
    publicKey
  }

  private def fromHexDigit(c: Char): Byte = {
    val digit =
      if (false) { 0 }
      else if ('0' to '9' contains c) { c - '0' }
      else if ('A' to 'F' contains c) { c - 'A' + 10 }
      else if ('a' to 'f' contains c) { c - 'a' + 10 }
      else { throw new IllegalArgumentException(s"Non-hex digit character: $c") }
    digit.toByte
  }

  @SuppressWarnings(Array("org.wartremover.warts.Equals"))
  private def hasConnection(ip: String, port: Int): Option[String] = {
    var socket: Socket = null
    try {
      socket = new Socket(InetAddress.getByName(ip), port)
      if (socket.getInputStream == null) {
        Some("Socket input stream is null")
      } else {
        None
      }
    } catch {
      case e: IOException =>
        Some(s"A network connection can't be established to $ip:$port: ${e.getMessage}")
    } finally {
      if (socket != null) {
        socket.close()
      }
    }
  }

  def checkIPv4: Option[String] = {
    hasConnection("8.8.8.8", 53)
  }

  def checkIPv6: Option[String] = {
    hasConnection("2001:4860:4860::8888", 53)
  }

  protected[tox4j] def assumeIPv4(): Unit = {
    assume(checkIPv4.isEmpty)
  }

  protected[tox4j] def assumeIPv6(): Unit = {
    assume(checkIPv6.isEmpty)
  }

} 
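The readablePublicKey and parsePublicKey helpers above are inverses of each other: each byte is rendered as two uppercase hex digits, and the digits are parsed back pairwise. A quick round-trip check (a hypothetical snippet, not part of the original test suite) could look like this:

import im.tox.tox4j.ToxCoreTestBase

object PublicKeyRoundTripDemo {
  def main(args: Array[String]): Unit = {
    val key = ToxCoreTestBase.randomBytes(32)
    val hex = ToxCoreTestBase.readablePublicKey(key)
    val back = ToxCoreTestBase.parsePublicKey(hex)
    assert(java.util.Arrays.equals(key, back), "hex round-trip should be lossless")
    println(hex)
  }
}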
Example 129
Source File: PartitionwiseSampledRDD.scala    From sona   with Apache License 2.0 5 votes vote down vote up
package com.tencent.angel.sona.ml.rdd

import java.util.Random

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.util.random.RandomSampler

import scala.reflect.ClassTag

private[sona] class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}

/**
  * An RDD sampled from its parent RDD partition-wise. For each partition of the parent RDD,
  * a user-specified [[org.apache.spark.util.random.RandomSampler]] instance is used to obtain
  * a random sample of the records in the partition. The random seeds assigned to the samplers
  * are guaranteed to have different values.
  *
  * @param prev                  RDD to be sampled
  * @param sampler               a random sampler
  * @param preservesPartitioning whether the sampler preserves the partitioner of the parent RDD
  * @param seed                  random seed
  * @tparam T input RDD item type
  * @tparam U sampled RDD item type
  */
private[sona] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
                                                                       prev: RDD[T],
                                                                       sampler: RandomSampler[T, U],
                                                                       preservesPartitioning: Boolean,
                                                                       @transient private val seed: Long = (new Random).nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong()))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.setSeed(split.seed)
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
  }
} 
Example 130
Source File: WeightedRandomSampler.scala    From sona   with Apache License 2.0 5 votes vote down vote up
package com.tencent.angel.sona.graph.utils
import java.util.Random

import org.apache.spark.util.SparkUtil
import org.apache.spark.util.random.RandomSampler

import scala.reflect.ClassTag

abstract class WeightedRandomSampler[T: ClassTag, U: ClassTag] extends RandomSampler[(T, Float), U] {

  protected var fraction = 0.0

  override def sample(items: Iterator[(T, Float)]): Iterator[U] = {
    items.filter(x => sample(x._2) > 0).asInstanceOf[Iterator[U]]
  }

  def sample(weight: Float): Int

  override def sample(): Int = ???

  def setFraction(fraction: Double): Unit = {
    require(
      fraction >= (0.0 - 1e-6)
        && fraction <= (1.0 + 1e-6),
      s"Sampling fraction ($fraction) must be on interval [0, 1]")
    this.fraction = fraction
  }

  override def clone: WeightedRandomSampler[T, U] = ???
}

class NaiveWeightedBernoulliSampler[T: ClassTag] extends WeightedRandomSampler[T, (T, Float)] {

  private val rng: Random = SparkUtil.getXORShiftRandom(System.nanoTime)

  override def setSeed(seed: Long): Unit = rng.setSeed(seed)

  def sample(weight: Float): Int = {
    if (fraction <= 0.0) {
      0
    } else if (fraction >= 1.0) {
      1
    } else {
      if (rng.nextDouble() <= fraction * weight) {
        1
      } else {
        0
      }
    }
  }

  override def clone: NaiveWeightedBernoulliSampler[T] = new NaiveWeightedBernoulliSampler[T]
} 
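A minimal usage sketch for the sampler above (hypothetical, not part of the sona project): each (item, weight) pair is kept with probability roughly fraction * weight, so heavier items survive more often.

import com.tencent.angel.sona.graph.utils.NaiveWeightedBernoulliSampler

object WeightedSamplerDemo {
  def main(args: Array[String]): Unit = {
    val sampler = new NaiveWeightedBernoulliSampler[String]
    sampler.setSeed(42L)
    sampler.setFraction(0.5)
    val items = Iterator("a" -> 0.1f, "b" -> 0.9f, "c" -> 1.0f, "d" -> 0.4f)
    sampler.sample(items).foreach { case (item, w) => println(s"kept $item (weight $w)") }
  }
}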
Example 131
Source File: PartitionwiseWeightedSampledRDD.scala    From sona   with Apache License 2.0 5 votes vote down vote up
package com.tencent.angel.sona.graph.utils
import java.util.Random

import org.apache.spark.rdd.RDD
import org.apache.spark.{Partition, Partitioner, TaskContext}

import scala.reflect.ClassTag
import scala.util.{Random => ScalaRandom}

class PartitionwiseWeightedSampledRDDPartition(val prev: Partition, val seed: Long, val fraction: Double)
  extends Partition with Serializable {
  override val index: Int = prev.index
}


class PartitionwiseWeightedSampledRDD[T: ClassTag, U: ClassTag](
                                                                 prev: RDD[(T, Float)],
                                                                 sampler: WeightedRandomSampler[T, U],
                                                                 fractions: Map[Int, Double],
                                                                 preservesPartitioning: Boolean,
                                                                 @transient private val seed: Long = ScalaRandom.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner: Option[Partitioner] = {
    if (preservesPartitioning) prev.partitioner else None
  }

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[(T, Float)].partitions.map { x =>
      new PartitionwiseWeightedSampledRDDPartition(x, random.nextLong(), fractions.getOrElse(x.index, 0.0))
    }
  }

  override def getPreferredLocations(split: Partition): Seq[String] = {
    firstParent[(T, Float)].preferredLocations(
      split.asInstanceOf[PartitionwiseWeightedSampledRDDPartition].prev
    )
  }

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseWeightedSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.setSeed(split.seed)
    thisSampler.setFraction(split.fraction)
    thisSampler.sample(firstParent[(T, Float)].iterator(split.prev, context))
  }
} 
Example 132
Source File: CompositeSampler.scala    From zen   with Apache License 2.0 5 votes vote down vote up
package com.github.cloudml.zen.ml.sampler

import java.util.Random

import spire.math.{Numeric => spNum}


class CompositeSampler(implicit ev: spNum[Double])
  extends Sampler[Double] {
  private var samplers: Seq[Sampler[_]] = _

  protected def numer: spNum[Double] = ev

  def apply(state: Int): Double = samplers.iterator.map(_.applyDouble(state)).sum

  def norm: Double = samplers.iterator.map(_.normDouble).sum

  def sampleFrom(base: Double, gen: Random): Int = {
    val sampIter = samplers.iterator
    var curSampler = sampIter.next()
    var subNorm = curSampler.normDouble
    var remain = base
    while (remain >= subNorm) {
      remain -= subNorm
      curSampler = sampIter.next()
      subNorm = curSampler.normDouble
    }
    curSampler.sampleFromDouble(remain, gen)
  }

  def resetComponents(samplers: Sampler[_]*): CompositeSampler = {
    this.samplers = samplers
    this
  }
} 
Example 133
Source File: MetropolisHastings.scala    From zen   with Apache License 2.0 5 votes vote down vote up
package com.github.cloudml.zen.ml.sampler

import java.util.Random

import spire.math.{Numeric => spNum}


class MetropolisHastings(implicit ev: spNum[Double])
  extends Sampler[Double] {
  type TransProb = Int => Double

  private var origFunc: TransProb = _
  private var proposal: Sampler[Double] = _
  private var state: Int = _

  protected def numer: spNum[Double] = ev

  def apply(state: Int): Double = origFunc(state)

  def norm: Double = proposal.norm

  def sampleFrom(base: Double, gen: Random): Int = {
    val newState = proposal.sampleFrom(base, gen)
    if (newState != state) {
      val ar = acceptRate(newState)
      if (ar >= 1.0 || gen.nextDouble() < ar) {
        state = newState
      }
    }
    state
  }

  private def acceptRate(newState: Int): Double = {
    origFunc(newState) * proposal(state) /
      (origFunc(state) * proposal(newState))
  }

  def resetProb(origFunc: TransProb,
    proposal: Sampler[Double],
    initState: Int): MetropolisHastings = {
    this.origFunc = origFunc
    this.proposal = proposal
    this.state = initState
    this
  }

  def resetProb(origFunc: TransProb,
    proposal: Sampler[Double],
    gen: Random): MetropolisHastings = {
    this.origFunc = origFunc
    this.proposal = proposal
    this.state = proposal.sampleRandom(gen)
    this
  }
} 
Example 134
Source File: DiscreteSampler.scala    From zen   with Apache License 2.0 5 votes vote down vote up
package com.github.cloudml.zen.ml.sampler

import java.util.Random
import scala.annotation.tailrec

import spire.math.{Numeric => spNum}


trait DiscreteSampler[@specialized(Double, Int, Float, Long) T] extends Sampler[T] {
  def length: Int
  def used: Int
  def update(state: Int, value: => T): Unit
  def deltaUpdate(state: Int, delta: => T): Unit
  def resetDist(probs: Array[T], space: Array[Int], psize: Int): DiscreteSampler[T]
  def resetDist(distIter: Iterator[(Int, T)], psize: Int): DiscreteSampler[T]
  def reset(newSize: Int): DiscreteSampler[T]

  @tailrec final def resampleRandom(gen: Random,
    state: Int,
    residualRate: Double,
    numResampling: Int = 2)(implicit ev: spNum[T]): Int = {
    val newState = sampleRandom(gen)
    if (newState == state && numResampling >= 0 && used > 1 &&
      (residualRate >= 1.0 || gen.nextDouble() < residualRate)) {
      resampleRandom(gen, state, residualRate, numResampling - 1)
    } else {
      newState
    }
  }

  @tailrec final def resampleFrom(base: T,
    gen: Random,
    state: Int,
    residualRate: Double,
    numResampling: Int = 2)(implicit ev: spNum[T]): Int = {
    val newState = sampleFrom(base, gen)
    if (newState == state && numResampling >= 0 && used > 1 &&
      (residualRate >= 1.0 || gen.nextDouble() < residualRate)) {
      val newBase = ev.fromDouble(gen.nextDouble() * ev.toDouble(norm))
      resampleFrom(newBase, gen, state, residualRate, numResampling - 1)
    } else {
      newState
    }
  }
} 
Example 135
Source File: FlowerDataSetIterator.scala    From dl4scala   with MIT License 5 votes vote down vote up
package org.dl4scala.examples.transferlearning.vgg16.dataHelpers

import java.io.{File, IOException}
import java.net.URL

import org.datavec.api.io.filters.BalancedPathFilter
import org.datavec.api.io.labels.ParentPathLabelGenerator
import org.datavec.api.split.{FileSplit, InputSplit}
import org.datavec.image.loader.BaseImageLoader
import org.nd4j.linalg.dataset.api.iterator.DataSetIterator
import java.util
import java.util.Random

import org.apache.commons.io.FileUtils
import org.datavec.api.util.ArchiveUtils
import org.datavec.image.recordreader.ImageRecordReader
import org.deeplearning4j.datasets.datavec.RecordReaderDataSetIterator
import org.deeplearning4j.nn.modelimport.keras.trainedmodels.TrainedModels


object FlowerDataSetIterator {
  private val log = org.slf4j.LoggerFactory.getLogger(FlowerDataSetIterator.getClass)

  private val DATA_DIR = new File(System.getProperty("user.home")) + "/dl4jDataDir"
  private val DATA_URL = "http://download.tensorflow.org/example_images/flower_photos.tgz"
  private val FLOWER_DIR = DATA_DIR + "/flower_photos"

  private val allowedExtensions = BaseImageLoader.ALLOWED_FORMATS
  private val rng = new Random(13)

  private val height = 224
  private val width = 224
  private val channels = 3
  private val numClasses = 5

  private val labelMaker = new ParentPathLabelGenerator
  private var trainData: InputSplit = _
  private var testData: InputSplit = _
  private var batchSize = 0

  @throws(classOf[IOException])
  def trainIterator: DataSetIterator = makeIterator(trainData)

  @throws(classOf[IOException])
  def testIterator: DataSetIterator = makeIterator(testData)

  @throws(classOf[IOException])
  def setup(batchSizeArg: Int, trainPerc: Int): Unit = {
    try
      downloadAndUntar()
    catch {
      case e: IOException =>
        e.printStackTrace()
        log.error("IOException : ", e)
    }

    batchSize = batchSizeArg
    val parentDir = new File(FLOWER_DIR)
    val filesInDir = new FileSplit(parentDir, allowedExtensions, rng)
    val pathFilter = new BalancedPathFilter(rng, allowedExtensions, labelMaker)
    if (trainPerc >= 100)
      throw new IllegalArgumentException("Percentage of data set aside for training has to be less than 100%." +
        " Test percentage = 100 - training percentage, has to be greater than 0")
    val filesInDirSplit = filesInDir.sample(pathFilter, trainPerc, 100 - trainPerc)
    trainData = filesInDirSplit(0)
    testData = filesInDirSplit(1)
  }

  @throws(classOf[IOException])
  private def makeIterator(split: InputSplit) = {
    val recordReader = new ImageRecordReader(height, width, channels, labelMaker)
    recordReader.initialize(split)
    val iter = new RecordReaderDataSetIterator(recordReader, batchSize, 1, numClasses)
    iter.setPreProcessor(TrainedModels.VGG16.getPreProcessor)
    iter
  }

  @throws(classOf[IOException])
  def downloadAndUntar(): Unit = {
    val rootFile = new File(DATA_DIR)
    if (!rootFile.exists) rootFile.mkdir
    val tarFile = new File(DATA_DIR, "flower_photos.tgz")
    if (!tarFile.isFile) {
      log.info("Downloading the flower dataset from " + DATA_URL + "...")
      FileUtils.copyURLToFile(new URL(DATA_URL), tarFile)
    }
    ArchiveUtils.unzipFileTo(tarFile.getAbsolutePath, rootFile.getAbsolutePath)
  }
} 
Example 136
Source File: LDADataGenerator.scala    From Swallow   with Apache License 2.0 5 votes vote down vote up
package com.intel.hibench.sparkbench.ml

import com.intel.hibench.sparkbench.common.IOCommon

import java.util.Random

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import scala.collection.mutable.{HashMap => MHashMap}
import org.apache.spark.rdd.RDD


// Note: the enclosing object declaration and its scaladoc are elided in this
// listing; `object LDADataGenerator` is assumed (restored here so the code
// below compiles), based on the file name and the usage message in main().
object LDADataGenerator {

  def generateLDARDD(
    sc: SparkContext,
    numDocs: Long,
    numVocab: Int,
    docLenMin: Int,
    docLenMax: Int,
    numParts: Int = 3,
    seed: Long = System.currentTimeMillis()): RDD[(Long, Vector)] = {
    val data = sc.parallelize(0L until numDocs, numParts).mapPartitionsWithIndex { 
      (idx, part) =>
        val rng = new Random(seed ^ idx)
        part.map { case docIndex =>
          var currentSize = 0
          val entries = MHashMap[Int, Int]()
          val docLength = rng.nextInt(docLenMax - docLenMin + 1) + docLenMin
          while (currentSize < docLength) {
            val index = rng.nextInt(numVocab)
            entries(index) = entries.getOrElse(index, 0) + 1
            currentSize += 1
          }

          val iter = entries.toSeq.map(v => (v._1, v._2.toDouble))
          (docIndex, Vectors.sparse(numVocab, iter))
       }
    }
    data
  }

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("LDADataGenerator")
    val sc = new SparkContext(conf)

    var outputPath = ""
    var numDocs: Long = 500L
    var numVocab: Int = 1000
    var docLenMin: Int = 50
    var docLenMax: Int = 10000
    val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism)
    val numPartitions = IOCommon.getProperty("hibench.default.shuffle.parallelism")
      .getOrElse((parallel / 2).toString).toInt

    if (args.length == 5) {
      outputPath = args(0)
      numDocs = args(1).toInt
      numVocab = args(2).toInt
      docLenMin = args(3).toInt
      docLenMax = args(4).toInt
      println(s"Output Path: $outputPath")
      println(s"Num of Documents: $numDocs")
      println(s"Vocabulary size: $numVocab")
    } else {
      System.err.println(
        s"Usage: $LDADataGenerator <OUTPUT_PATH> <NUM_DOCUMENTS> <VOCABULARY_SIZE>"
      )
      System.exit(1)
    }

    val data = generateLDARDD(sc, numDocs, numVocab, docLenMin, docLenMax, numPartitions)

    data.saveAsObjectFile(outputPath)

    sc.stop()
  }
} 
Example 137
Source File: Bagging.scala    From streamDM   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.streamdm.classifiers.meta

import java.util.Random
import com.github.javacliparser.{ClassOption, IntOption}
import org.apache.spark.streamdm.classifiers.Classifier
import org.apache.spark.streamdm.classifiers.model._
import org.apache.spark.streamdm.core._
import org.apache.spark.streaming.dstream._
import org.apache.spark.streamdm.utils.Utils
import org.apache.spark.streamdm.core.specification.ExampleSpecification
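
// Note: the enclosing class declaration (a bagging ensemble Classifier), its
// configuration options such as ensembleSizeOption, and the initialization of
// `classifiers` and `exampleLearnerSpecification` are elided in this listing;
// only the prediction helpers are shown below.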


  def ensemblePredict(example: Example): Double = {
    val sizeEnsemble = ensembleSizeOption.getValue
    val predictions: Array[Double] = new Array(sizeEnsemble)
    for (i <- 0 until sizeEnsemble) {
      predictions(i) = classifiers(i).getModel.asInstanceOf[ClassificationModel].predict(example)
    }
    Utils.majorityVote(predictions, numberClasses)
  }

  def numberClasses(): Integer = {
    if (exampleLearnerSpecification == null) 2
    else exampleLearnerSpecification.out(0).range
  }
} 
Example 138
Source File: DiscreteAliasSamplerSpec.scala    From bidirectional-random-walk   with MIT License 5 votes vote down vote up
package soal.util

import org.scalatest.FlatSpec
import org.scalatest.Matchers
import java.util.Random



class DiscreteAliasSamplerSpec  extends FlatSpec with Matchers  {
  val random = new Random(1)
  def testDistribution(unnormalizedProbabilities: Array[Float],
                       values: Seq[Int],
                       nSamples: Int = 10000
      ): Unit = {
    val probabilities = unnormalizedProbabilities map { _ / unnormalizedProbabilities.sum }
    val n = unnormalizedProbabilities.size
    val valueToIndex = (values zip (0 until n)).toMap
    val sampler = new DiscreteAliasSampler(values, unnormalizedProbabilities, random)
    val sampleCounts = Array.fill(n)(0)
    val tol = 4.0f / math.sqrt(nSamples).toFloat
    for (i <- 0 until nSamples) {
      val v = sampler.sample()
      sampleCounts(valueToIndex(v)) += 1
    }
    for (i <- 0 until n) {
      sampleCounts(i).toFloat / nSamples should equal (probabilities(i) +- tol)
    }

    def f(v: Int): Float = v.toFloat * v.toFloat // compute expectation of v => v^2
    val trueExpectation = ((probabilities zip values) map { case (p, v) => p * v * v }).sum
    sampler.expectation(f) shouldEqual (trueExpectation +- trueExpectation * 1.00001f)
  }

  "A Discrete Distribution" should "support sampling" in {
    testDistribution(Array(575.6355f, 89.733475f, 86.90718f, 721.26416f), Array(2, 3, 5, 7))
    testDistribution(Array(2.0f, 5.0f, 3.0f), Array(17, 11, 13))
    testDistribution(Array(1.0f, 1.0f, 1.0f, 1.0f), Array(-2, 3, -5, 7))
    testDistribution(Array(0.9f, 0.1f), Array(19, 17))
    an[IllegalArgumentException] should be thrownBy {
      new DiscreteAliasSampler(Array(1), Array(1.0f, 2.0f))
    }
  }
} 
Example 139
Source File: BidirectionalPPREstimatorSpec.scala    From bidirectional-random-walk   with MIT License 5 votes vote down vote up
package soal.ppr

import java.util.Random

import co.teapot.graph.ConcurrentHashMapDynamicGraph
import org.scalatest.{FlatSpec, Matchers}

import scala.collection.mutable
import scala.io.Source

class BidirectionalPPREstimatorSpec extends FlatSpec with Matchers {
  val graph = ConcurrentHashMapDynamicGraph.readGraph("src/test/resources/test_graph.txt")
  val teleportProb = 0.2f
  val random = new Random(2) // Seed for consistent tests
  val estimator = new BidirectionalPPREstimator(graph, teleportProb, random)
  val truePPRs = BidirectionalPPREstimatorSpec.testGraphTruePPRs

  "BidirectionalPPRSearcher.estimateInversePPR" should "be correct on the test graph" in {
    val pprErrorTolerance = 2.0e-6f
    for (((s, t), truePPR) <- truePPRs) {
      val inversePPRs = estimator.estimatePPRToTarget(t, pprErrorTolerance)
      withClue (s"Testing Pair ($s, $t)") {
        inversePPRs(s) should equal (truePPR +- pprErrorTolerance)
      }
    }
  }

  "BidirectionalPPRSearcher.estimatePPR" should "be correct on the test graph" in {
    val relativeError = 0.01f
    val stPairs = Array(0 -> 1, 2 -> 3, 5 -> 9, 0 -> 0)

    for ((s, t) <- stPairs) {
      withClue (s"Testing Pair ($s, $t)") {
        estimator.estimatePPRSingleSource(s, t, 0.03f, relativeError) should equal (
          truePPRs((s, t)) +- truePPRs((s, t)) * relativeError * 2)
      }
    }
  }
}

object BidirectionalPPREstimatorSpec {
  def testGraphTruePPRs: collection.Map[(Int, Int), Float] = {
    val pprMap = new mutable.HashMap[(Int, Int), Float] {
      override def default(key: (Int, Int)) = 0.0f
    }
    for (line <- Source.fromFile("src/test/resources/test_graph_true_pprs.txt").getLines()) {
      val pieces = line.split("\t")
      val (startId, targetId, truePPR) = (pieces(0).toInt, pieces(1).toInt, pieces(2).toFloat)
      pprMap((startId, targetId)) = truePPR
    }
    pprMap
  }
} 
Example 140
Source File: CsvKafkaPublisher.scala    From Taxi360   with Apache License 2.0 5 votes vote down vote up
package com.cloudera.sa.taxi360.common

import java.io.File
import java.util.Random

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

import scala.io.Source

object CsvKafkaPublisher {

  var counter = 0
  var salts = 0

  def main(args:Array[String]): Unit = {
    if (args.length == 0) {
      println("<brokerList> " +
        "<topicName> " +
        "<dataFolderOrFile> " +
        "<sleepPerRecord> " +
        "<acks> " +
        "<linger.ms> " +
        "<producer.type> " +
        "<batch.size> " +
        "<salts>")
      return
    }

    val kafkaBrokerList = args(0)
    val kafkaTopicName = args(1)
    val nyTaxiDataFolder = args(2)
    val sleepPerRecord = args(3).toInt
    val acks = args(4).toInt
    val lingerMs = args(5).toInt
    val producerType = args(6) //"async"
    val batchSize = args(7).toInt
    salts = args(8).toInt

    val kafkaProducer = KafkaProducerUntil.getNewProducer(kafkaBrokerList, acks, lingerMs, producerType, batchSize)

    println("--Input:" + nyTaxiDataFolder)

    val dataFolder = new File(nyTaxiDataFolder)
    if (dataFolder.isDirectory) {
      val files = dataFolder.listFiles().iterator
      files.foreach(f => {
        println("--Input:" + f)
        processFile(f, kafkaTopicName, kafkaProducer, sleepPerRecord)
      })
    } else {
      println("--Input:" + dataFolder)
      processFile(dataFolder, kafkaTopicName, kafkaProducer, sleepPerRecord)
    }
    println("---Done")
  }

  def processFile(file:File, kafkaTopicName:String,
                  kafkaProducer: KafkaProducer[String, String], sleepPerRecord:Int): Unit = {
    var counter = 0
    val r = new Random()

    println("-Starting Reading")
    Source.fromFile(file).getLines().foreach(l => {
      counter += 1
      if (counter % 10000 == 0) {
        println("{Sent:" + counter + "}")
      }
      if (counter % 100 == 0) {
        print(".")
      }
      Thread.sleep(sleepPerRecord)

      val saltedVender = r.nextInt(salts) + l

      if (counter > 2) {
        publishTaxiRecord(saltedVender, kafkaTopicName, kafkaProducer)
      }
    })
  }

  def publishTaxiRecord(line:String, kafkaTopicName:String, kafkaProducer: KafkaProducer[String, String]): Unit = {

    if (line.startsWith("vendor_name") || line.length < 10) {
      println("skip")
    } else {
      val message = new ProducerRecord[String, String](kafkaTopicName, line.hashCode.toString, line)
      kafkaProducer.send(message)
    }
  }


} 
Example 141
Source File: RNG.scala    From Scalaprof   with GNU General Public License v2.0 5 votes vote down vote up
package edu.neu.coe.csye._7200.ga

import java.util.Random

trait RNG[+A] {
  def next: RNG[A]
  def value: A
}

abstract class RNG_Java[+A](n: Long) extends RNG[A] { 
  // must be overridden by sub-classes
  def value: A
  def newRNG(n: Long): RNG[A]
  // may be overridden (if you want to define your own pseudo-random sequence)
  def nextSeed: Long = RNG_Java.nextSeed(n)
  // base method -- not normally overridden
  def next: RNG[A] = newRNG(nextSeed)
  def state = n
}

object RNG_Java {
  def nextSeed(n: Long): Long = new Random(n).nextLong
}

case class LongRNG(n: Long) extends RNG_Java[Long](n) {
  def newRNG(n: Long): RNG[Long] = LongRNG(n) 
  def value = n 
}

case class DoubleRNG(n: Long) extends RNG_Java[Double](n) {
  def newRNG(n: Long) = DoubleRNG(n) 
  def value = n.toDouble/Long.MaxValue
  override def toString = s"DoubleRNG: $n->$value"
}


case class UniformDouble(x: Double) extends AnyVal with Ordered[UniformDouble] {
    def + (y: Double) = x + y
    def compare(that: UniformDouble): Int = x.compare(that.x)
}

object DoubleRNG {
  def apply: RNG[Double] = DoubleRNG(System.currentTimeMillis())
}

object UniformDoubleRNG {
  def apply: RNG[UniformDouble] = UniformDoubleRNG(System.currentTimeMillis())
  implicit val u: Unit = Unit
}

object GaussianRNG {
  def apply: RNG[(Double,Double)] = GaussianRNG(System.currentTimeMillis())
}

object UniformDouble {
  def create(x: Double)(implicit y: Unit) = if (x>=0 && x<=1) new UniformDouble(x) else throw new IllegalArgumentException(s"$x is not in range 0..1")
  def + (x: Double, y: UniformDouble) = y+x
} 
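A short usage sketch for the RNG trait above (hypothetical, not part of the original file): each call to next produces a new immutable generator, so a stream of values can be unfolded from a seed.

import edu.neu.coe.csye._7200.ga.{DoubleRNG, RNG}

object RNGDemo {
  def main(args: Array[String]): Unit = {
    val rngs = Iterator.iterate[RNG[Double]](DoubleRNG(0L))(_.next)
    rngs.take(3).foreach(r => println(r.value))  // the value for seed 0 and its two successors
  }
}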
Example 142
Source File: RNG.scala    From Scalaprof   with GNU General Public License v2.0 5 votes vote down vote up
package edu.neu.coe.csye._7200

package rng

import java.util.Random

trait RNG[+A] {
  def next: RNG[A]
  def value: A
}

abstract class RNG_Java[+A](n: Long) extends RNG[A] { 
  // must be overridden by sub-classes
  def value: A
  def newRNG(n: Long): RNG_Java[A]
  // may be overridden (if you want to define your own pseudo-random sequence)
  def nextSeed: Long = RNG_Java.nextSeed(n)
  // base method -- not normally overridden
  def next: RNG_Java[A] = newRNG(nextSeed)
  def state = n
}

object RNG_Java {
  def nextSeed(n: Long): Long = new Random(n).nextLong
}

case class LongRNG(n: Long) extends RNG_Java[Long](n) {
  def newRNG(n: Long) = ???
  def value = ???
}

case class DoubleRNG(n: Long) extends RNG_Java[Double](n) {
  def newRNG(n: Long) = ???
  def value = ???
  override def toString = s"DoubleRNG: $n->$value"
}


case class UniformDouble(x: Double) {
    def + (y: Double) = x + y
}

object UniformDoubleRNG {
  def apply: RNG[UniformDouble] = UniformDoubleRNG(System.currentTimeMillis())
}

object GaussianRNG {
  def apply: RNG[(Double,Double)] = GaussianRNG(System.currentTimeMillis())
}

object UniformDouble {
  def apply(x: Double, y: Unit): UniformDouble = if (x>=0 && x<=1) new UniformDouble(x) else throw new IllegalArgumentException(s"$x is not in range 0..1")
  def + (x: Double, y: UniformDouble) = y+x
} 
Example 143
Source File: ProbabilityDistributionTest.scala    From ScalphaGoZero   with Apache License 2.0 5 votes vote down vote up
package org.deeplearning4j.scalphagozero.agents

import java.util.Random
import org.scalatest.funspec.AnyFunSpec

class ProbabilityDistributionTest extends AnyFunSpec {

  describe("Select from a distribution") {

    it("should be low index if distribution skewed low") {
      val dist = createPDist(Array(0.9, 0.8, 0.5, 0.3, 0.2, 0.1, 0.01, 0.001))
      assert(dist.selectRandomIdx() == 2)
      assert(dist.selectRandomIdx() == 1)
      assert(dist.selectRandomIdx() == 0)
    }

    it("should be high index if distribution skewed high") {
      val dist = createPDist(Array(0.001, 0.01, 0.1, 0.3, 0.8, 0.5, 0.8, 0.9))
      assert(dist.selectRandomIdx() == 6)
    }

    it("should be highest index if distribution skewed very high") {
      val dist = createPDist(Array(0.001, 0.01, 0.01, 0.01, 0.01, 0.1, 0.9))
      assert(dist.selectRandomIdx() == 6)
    }

    it("should be near middle if gaussian distribution") {
      val dist = createPDist(Array(0.001, 0.01, 0.1, 0.3, 0.6, 0.8, 0.9, 0.9, 0.8, 0.55, 0.4, 0.2, 0.05, 0.01))
      assert(dist.selectRandomIdx() == 8)
    }

    it("random if uniform distribution") {
      val dist = createPDist(Array(0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2))
      assert(dist.selectRandomIdx() == 5)
    }

    it("should be 0 index if distribution has only 1 0 value") {
      val dist = createPDist(Array(0.0))
      assert(dist.selectRandomIdx() == 0)
    }
  }

  private def createPDist(a: Array[Double]) = ProbabilityDistribution(a, new Random(1))
} 
Example 144
Source File: SparkHdfsLR.scala    From learning-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{Vector, DenseVector}
import org.apache.hadoop.conf.Configuration

import org.apache.spark._
import org.apache.spark.scheduler.InputFormatInfo



object SparkHdfsLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def parsePoint(line: String): DataPoint = {
    val tok = new java.util.StringTokenizer(line, " ")
    var y = tok.nextToken.toDouble
    var x = new Array[Double](D)
    var i = 0
    while (i < D) {
      x(i) = tok.nextToken.toDouble; i += 1
    }
    DataPoint(new DenseVector(x), y)
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    if (args.length < 2) {
      System.err.println("Usage: SparkHdfsLR <file> <iters>")
      System.exit(1)
    }

    showWarning()

    val sparkConf = new SparkConf().setAppName("SparkHdfsLR")
    val inputPath = args(0)
    val conf = new Configuration()
    val sc = new SparkContext(sparkConf,
      InputFormatInfo.computePreferredLocations(
        Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath))
      ))
    val lines = sc.textFile(inputPath)
    val points = lines.map(parsePoint _).cache()
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = points.map { p =>
        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
      }.reduce(_ + _)
      w -= gradient
    }

    println("Final w: " + w)
    sc.stop()
  }
} 
Example 145
Source File: SparkLR.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{Vector, DenseVector}

import org.apache.spark._


object SparkLR {
  val N = 10000  // Number of data points
  val D = 10   // Number of dimensions
  val R = 0.7  // Scaling factor
  val ITERATIONS = 5
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def generateData: Array[DataPoint] = {
    def generatePoint(i: Int): DataPoint = {
      val y = if (i % 2 == 0) -1 else 1
      val x = DenseVector.fill(D){rand.nextGaussian + y * R}
      DataPoint(x, y)
    }
    Array.tabulate(N)(generatePoint)
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    val sparkConf = new SparkConf().setAppName("SparkLR")
    val sc = new SparkContext(sparkConf)
    val numSlices = if (args.length > 0) args(0).toInt else 2
    val points = sc.parallelize(generateData, numSlices).cache()

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = points.map { p =>
        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
      }.reduce(_ + _)
      w -= gradient
    }

    println("Final w: " + w)

    sc.stop()
  }
}
// scalastyle:on println 
Example 146
Source File: LocalKMeans.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.collection.mutable.HashMap
import scala.collection.mutable.HashSet

import breeze.linalg.{Vector, DenseVector, squaredDistance}

import org.apache.spark.SparkContext._


object LocalKMeans {
  val N = 1000
  val R = 1000    // Scaling factor
  val D = 10
  val K = 10
  val convergeDist = 0.001
  val rand = new Random(42)

  def generateData: Array[DenseVector[Double]] = {
    def generatePoint(i: Int): DenseVector[Double] = {
      DenseVector.fill(D){rand.nextDouble * R}
    }
    Array.tabulate(N)(generatePoint)
  }

  def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = {
    var index = 0
    var bestIndex = 0
    var closest = Double.PositiveInfinity

    for (i <- 1 to centers.size) {
      val vCurr = centers.get(i).get
      val tempDist = squaredDistance(p, vCurr)
      if (tempDist < closest) {
        closest = tempDist
        bestIndex = i
      }
    }

    bestIndex
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of KMeans Clustering and is given as an example!
        |Please use the KMeans method found in org.apache.spark.mllib.clustering
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    val data = generateData
    var points = new HashSet[Vector[Double]]
    var kPoints = new HashMap[Int, Vector[Double]]
    var tempDist = 1.0

    while (points.size < K) {
      points.add(data(rand.nextInt(N)))
    }

    val iter = points.iterator
    for (i <- 1 to points.size) {
      kPoints.put(i, iter.next())
    }

    println("Initial centers: " + kPoints)

    while(tempDist > convergeDist) {
      var closest = data.map (p => (closestPoint(p, kPoints), (p, 1)))

      var mappings = closest.groupBy[Int] (x => x._1)

      var pointStats = mappings.map { pair =>
        pair._2.reduceLeft [(Int, (Vector[Double], Int))] {
          case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2))
        }
      }

      var newPoints = pointStats.map {mapping =>
        (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))}

      tempDist = 0.0
      for (mapping <- newPoints) {
        tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2)
      }

      for (newP <- newPoints) {
        kPoints.put(newP._1, newP._2)
      }
    }

    println("Final centers: " + kPoints)
  }
}
// scalastyle:on println 
Example 147
Source File: StopwatchSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.util

import java.util.Random

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class StopwatchSuite extends SparkFunSuite with MLlibTestSparkContext {

  import StopwatchSuite._

  private def testStopwatchOnDriver(sw: Stopwatch): Unit = {
    assert(sw.name === "sw")
    assert(sw.elapsed() === 0L)
    assert(!sw.isRunning)
    intercept[AssertionError] {
      sw.stop()
    }
    val duration = checkStopwatch(sw)
    val elapsed = sw.elapsed()
    assert(elapsed === duration)
    val duration2 = checkStopwatch(sw)
    val elapsed2 = sw.elapsed()
    assert(elapsed2 === duration + duration2)
    assert(sw.toString === s"sw: ${elapsed2}ms")
    sw.start()
    assert(sw.isRunning)
    intercept[AssertionError] {
      sw.start()
    }
  }

  test("LocalStopwatch") {
    val sw = new LocalStopwatch("sw")
    testStopwatchOnDriver(sw)
  }

  test("DistributedStopwatch on driver") {
    val sw = new DistributedStopwatch(sc, "sw")
    testStopwatchOnDriver(sw)
  }

  test("DistributedStopwatch on executors") {
    val sw = new DistributedStopwatch(sc, "sw")
    val rdd = sc.parallelize(0 until 4, 4)
    val acc = sc.accumulator(0L)
    rdd.foreach { i =>
      acc += checkStopwatch(sw)
    }
    assert(!sw.isRunning)
    val elapsed = sw.elapsed()
    assert(elapsed === acc.value)
  }

  test("MultiStopwatch") {
    val sw = new MultiStopwatch(sc)
      .addLocal("local")
      .addDistributed("spark")
    assert(sw("local").name === "local")
    assert(sw("spark").name === "spark")
    intercept[NoSuchElementException] {
      sw("some")
    }
    assert(sw.toString === "{\n  local: 0ms,\n  spark: 0ms\n}")
    val localDuration = checkStopwatch(sw("local"))
    val sparkDuration = checkStopwatch(sw("spark"))
    val localElapsed = sw("local").elapsed()
    val sparkElapsed = sw("spark").elapsed()
    assert(localElapsed === localDuration)
    assert(sparkElapsed === sparkDuration)
    assert(sw.toString ===
      s"{\n  local: ${localElapsed}ms,\n  spark: ${sparkElapsed}ms\n}")
    val rdd = sc.parallelize(0 until 4, 4)
    val acc = sc.accumulator(0L)
    rdd.foreach { i =>
      sw("local").start()
      val duration = checkStopwatch(sw("spark"))
      sw("local").stop()
      acc += duration
    }
    val localElapsed2 = sw("local").elapsed()
    assert(localElapsed2 === localElapsed)
    val sparkElapsed2 = sw("spark").elapsed()
    assert(sparkElapsed2 === sparkElapsed + acc.value)
  }
}

private object StopwatchSuite extends SparkFunSuite {

  // The `checkStopwatch` helper used throughout the suite was elided from this
  // snippet; the following is a minimal reconstruction, not the original code:
  // run the stopwatch around a short random sleep and return the duration it reports.
  def checkStopwatch(sw: Stopwatch): Long = {
    val start = now
    sw.start()
    Thread.sleep(new Random().nextInt(10))
    val duration = sw.stop()
    assert(duration >= 0 && duration <= now - start + 1)
    duration
  }

  private def now: Long = System.currentTimeMillis()
} 
Example 148
Source File: SampledRDD.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.PoissonDistribution

import org.apache.spark.{Partition, TaskContext}

@deprecated("Replaced by PartitionwiseSampledRDDPartition", "1.0.0")
private[spark]
class SampledRDDPartition(val prev: Partition, val seed: Int) extends Partition with Serializable {
  override val index: Int = prev.index
}

@deprecated("Replaced by PartitionwiseSampledRDD", "1.0.0")
private[spark] class SampledRDD[T: ClassTag](
    prev: RDD[T],
    withReplacement: Boolean,
    frac: Double,
    seed: Int)
  extends RDD[T](prev) {

  override def getPartitions: Array[Partition] = {
    val rg = new Random(seed)
    firstParent[T].partitions.map(x => new SampledRDDPartition(x, rg.nextInt))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[SampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[T] = {
    val split = splitIn.asInstanceOf[SampledRDDPartition]
    if (withReplacement) {
      // For large datasets, the expected number of occurrences of each element in a sample with
      // replacement is Poisson(frac). We use that to get a count for each element.
      val poisson = new PoissonDistribution(frac)
      poisson.reseedRandomGenerator(split.seed)

      firstParent[T].iterator(split.prev, context).flatMap { element =>
        val count = poisson.sample()
        if (count == 0) {
          Iterator.empty  // Avoid object allocation when we return 0 items, which is quite often
        } else {
          Iterator.fill(count)(element)
        }
      }
    } else { // Sampling without replacement
      val rand = new Random(split.seed)
      firstParent[T].iterator(split.prev, context).filter(x => (rand.nextDouble <= frac))
    }
  }
} 
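
The with-replacement branch uses the comment's observation directly: for a large dataset sampled at rate frac, the number of times each element appears in the sample is approximately Poisson(frac), so drawing one Poisson count per element gives a single-pass sampler. A small standalone sketch of that idea (the helper and the Seq input are ours, not Spark's):

import org.apache.commons.math3.distribution.PoissonDistribution

// Replicate each element a Poisson(frac)-distributed number of times,
// mirroring the with-replacement branch of SampledRDD.compute above.
def sampleWithReplacement[T](elements: Seq[T], frac: Double, seed: Long): Seq[T] = {
  val poisson = new PoissonDistribution(frac)
  poisson.reseedRandomGenerator(seed)
  elements.flatMap(element => Seq.fill(poisson.sample())(element))
}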
Example 149
Source File: PartitionwiseSampledRDD.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.random.RandomSampler
import org.apache.spark.util.Utils

private[spark]
class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}


private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[T],
    sampler: RandomSampler[T, U],
    preservesPartitioning: Boolean,
    @transient private val seed: Long = Utils.random.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong()))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.setSeed(split.seed)
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
  }
} 
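
Reproducibility here comes from the seeding scheme: getPartitions feeds the single master seed into one Random and hands each partition its own long seed, so partitions sample independently but deterministically. A minimal sketch of that scheme (the helper is ours):

import java.util.Random

// One master seed deterministically yields an independent seed per partition index.
def partitionSeeds(masterSeed: Long, numPartitions: Int): Array[Long] = {
  val random = new Random(masterSeed)
  Array.fill(numPartitions)(random.nextLong())
}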
Example 150
Source File: SimpleSkewedGroupByTest.scala    From learning-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._


object SimpleSkewedGroupByTest {
  def main(args: Array[String]) {

    val sparkConf = new SparkConf().setAppName("SimpleSkewedGroupByTest")
    var numMappers = if (args.length > 0) args(0).toInt else 2
    var numKVPairs = if (args.length > 1) args(1).toInt else 1000
    var valSize = if (args.length > 2) args(2).toInt else 1000
    var numReducers = if (args.length > 3) args(3).toInt else numMappers
    var ratio = if (args.length > 4) args(4).toInt else 5.0

    val sc = new SparkContext(sparkConf)

    val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random
      var result = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        ranGen.nextBytes(byteArr)
        val offset = ranGen.nextInt(1000) * numReducers
        if (ranGen.nextDouble < ratio / (numReducers + ratio - 1)) {
          // give ratio times higher chance of generating key 0 (for reducer 0)
          result(i) = (offset, byteArr)
        } else {
          // generate a key for one of the other reducers
          val key = 1 + ranGen.nextInt(numReducers-1) + offset
          result(i) = (key, byteArr)
        }
      }
      result
    }.cache
    // Enforce that everything has been calculated and in cache
    pairs1.count

    println("RESULT: " + pairs1.groupByKey(numReducers).count)
    // Print how many keys each reducer got (for debugging)
    // println("RESULT: " + pairs1.groupByKey(numReducers)
    //                           .map{case (k,v) => (k, v.size)}
    //                           .collectAsMap)

    sc.stop()
  }
} 
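
The skew comes entirely from the key-generation branch: with probability ratio / (numReducers + ratio - 1) a pair gets a key that is a multiple of numReducers (and so lands on reducer 0), otherwise a key for one of the remaining reducers, so reducer 0 receives roughly ratio times more pairs than each of the others. A small sketch of just that choice (the function name is ours):

import java.util.Random

// Pick a key so that reducer 0 is roughly `ratio` times more likely than each
// of the other reducers, matching the branch inside the flatMap above.
def skewedKey(ranGen: Random, numReducers: Int, ratio: Double): Int = {
  val offset = ranGen.nextInt(1000) * numReducers
  if (ranGen.nextDouble < ratio / (numReducers + ratio - 1)) offset
  else 1 + ranGen.nextInt(numReducers - 1) + offset
}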
Example 151
Source File: SparkTachyonHdfsLR.scala    From learning-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{Vector, DenseVector}
import org.apache.hadoop.conf.Configuration

import org.apache.spark._
import org.apache.spark.scheduler.InputFormatInfo
import org.apache.spark.storage.StorageLevel



object SparkTachyonHdfsLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
        |for more conventional use.
      """.stripMargin)
  }

  case class DataPoint(x: Vector[Double], y: Double)

  def parsePoint(line: String): DataPoint = {
    val tok = new java.util.StringTokenizer(line, " ")
    var y = tok.nextToken.toDouble
    var x = new Array[Double](D)
    var i = 0
    while (i < D) {
      x(i) = tok.nextToken.toDouble; i += 1
    }
    DataPoint(new DenseVector(x), y)
  }

  def main(args: Array[String]) {

    showWarning()

    val inputPath = args(0)
    val sparkConf = new SparkConf().setAppName("SparkTachyonHdfsLR")
    val conf = new Configuration()
    val sc = new SparkContext(sparkConf,
      InputFormatInfo.computePreferredLocations(
        Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath))
      ))
    val lines = sc.textFile(inputPath)
    val points = lines.map(parsePoint _).persist(StorageLevel.OFF_HEAP)
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = points.map { p =>
        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
      }.reduce(_ + _)
      w -= gradient
    }

    println("Final w: " + w)
    sc.stop()
  }
} 
Example 152
Source File: SkewedGroupByTest.scala    From learning-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._


object SkewedGroupByTest {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("GroupBy Test")
    var numMappers = if (args.length > 0) args(0).toInt else 2
    var numKVPairs = if (args.length > 1) args(1).toInt else 1000
    var valSize = if (args.length > 2) args(2).toInt else 1000
    var numReducers = if (args.length > 3) args(3).toInt else numMappers

    val sc = new SparkContext(sparkConf)

    val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random

      // map output sizes linearly increase from the 1st to the last
      numKVPairs = (1.0 * (p + 1) / numMappers * numKVPairs).toInt

      var arr1 = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        ranGen.nextBytes(byteArr)
        arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
      }
      arr1
    }.cache()
    // Enforce that everything has been calculated and in cache
    pairs1.count()

    println(pairs1.groupByKey(numReducers).count())

    sc.stop()
  }
} 
Example 153
Source File: LocalFileLR.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import breeze.linalg.{Vector, DenseVector}


object LocalFileLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def parsePoint(line: String): DataPoint = {
    val nums = line.split(' ').map(_.toDouble)
    DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0))
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    val lines = scala.io.Source.fromFile(args(0)).getLines().toArray
    val points = lines.map(parsePoint _)
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      var gradient = DenseVector.zeros[Double](D)
      for (p <- points) {
        val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y
        gradient += p.x * scale
      }
      w -= gradient
    }

    println("Final w: " + w)
  }
}
// scalastyle:on println 
Example 154
Source File: LocalLR.scala    From learning-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import breeze.linalg.{Vector, DenseVector}


object LocalLR {
  val N = 10000  // Number of data points
  val D = 10   // Number of dimensions
  val R = 0.7  // Scaling factor
  val ITERATIONS = 5
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def generateData = {
    def generatePoint(i: Int) = {
      val y = if(i % 2 == 0) -1 else 1
      val x = DenseVector.fill(D){rand.nextGaussian + y * R}
      DataPoint(x, y)
    }
    Array.tabulate(N)(generatePoint)
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    val data = generateData
    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      var gradient = DenseVector.zeros[Double](D)
      for (p <- data) {
        val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y
        gradient += p.x * scale
      }
      w -= gradient
    }

    println("Final w: " + w)
  }
} 
Example 155
Source File: GroupByTest.scala    From learning-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._


object GroupByTest {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("GroupBy Test")
    var numMappers = if (args.length > 0) args(0).toInt else 2
    var numKVPairs = if (args.length > 1) args(1).toInt else 1000
    var valSize = if (args.length > 2) args(2).toInt else 1000
    var numReducers = if (args.length > 3) args(3).toInt else numMappers

    val sc = new SparkContext(sparkConf)

    val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random
      var arr1 = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        ranGen.nextBytes(byteArr)
        arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
      }
      arr1
    }.cache()
    // Enforce that everything has been calculated and in cache
    pairs1.count()

    println(pairs1.groupByKey(numReducers).count())

    sc.stop()
  }
} 
Example 156
Source File: LocalFileLR.scala    From learning-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import breeze.linalg.{Vector, DenseVector}


object LocalFileLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def parsePoint(line: String): DataPoint = {
    val nums = line.split(' ').map(_.toDouble)
    DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0))
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    val lines = scala.io.Source.fromFile(args(0)).getLines().toArray
    val points = lines.map(parsePoint _)
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      var gradient = DenseVector.zeros[Double](D)
      for (p <- points) {
        val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y
        gradient += p.x * scale
      }
      w -= gradient
    }

    println("Final w: " + w)
  }
} 
Example 157
Source File: SparkLR.scala    From learning-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{Vector, DenseVector}

import org.apache.spark._


object SparkLR {
  val N = 10000  // Number of data points
  val D = 10   // Number of dimensions
  val R = 0.7  // Scaling factor
  val ITERATIONS = 5
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def generateData = {
    def generatePoint(i: Int) = {
      val y = if(i % 2 == 0) -1 else 1
      val x = DenseVector.fill(D){rand.nextGaussian + y * R}
      DataPoint(x, y)
    }
    Array.tabulate(N)(generatePoint)
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    val sparkConf = new SparkConf().setAppName("SparkLR")
    val sc = new SparkContext(sparkConf)
    val numSlices = if (args.length > 0) args(0).toInt else 2
    val points = sc.parallelize(generateData, numSlices).cache()

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = points.map { p =>
        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
      }.reduce(_ + _)
      w -= gradient
    }

    println("Final w: " + w)

    sc.stop()
  }
} 
Example 158
Source File: LocalKMeans.scala    From learning-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.examples

import java.util.Random

import scala.collection.mutable.HashMap
import scala.collection.mutable.HashSet

import breeze.linalg.{Vector, DenseVector, squaredDistance}

import org.apache.spark.SparkContext._


object LocalKMeans {
  val N = 1000
  val R = 1000    // Scaling factor
  val D = 10
  val K = 10
  val convergeDist = 0.001
  val rand = new Random(42)

  def generateData = {
    def generatePoint(i: Int) = {
      DenseVector.fill(D){rand.nextDouble * R}
    }
    Array.tabulate(N)(generatePoint)
  }

  def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = {
    var index = 0
    var bestIndex = 0
    var closest = Double.PositiveInfinity

    for (i <- 1 to centers.size) {
      val vCurr = centers.get(i).get
      val tempDist = squaredDistance(p, vCurr)
      if (tempDist < closest) {
        closest = tempDist
        bestIndex = i
      }
    }

    bestIndex
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of KMeans Clustering and is given as an example!
        |Please use the KMeans method found in org.apache.spark.mllib.clustering
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    val data = generateData
    var points = new HashSet[Vector[Double]]
    var kPoints = new HashMap[Int, Vector[Double]]
    var tempDist = 1.0

    while (points.size < K) {
      points.add(data(rand.nextInt(N)))
    }

    val iter = points.iterator
    for (i <- 1 to points.size) {
      kPoints.put(i, iter.next())
    }

    println("Initial centers: " + kPoints)

    while(tempDist > convergeDist) {
      var closest = data.map (p => (closestPoint(p, kPoints), (p, 1)))

      var mappings = closest.groupBy[Int] (x => x._1)

      var pointStats = mappings.map { pair =>
        pair._2.reduceLeft [(Int, (Vector[Double], Int))] {
          case ((id1, (x1, y1)), (id2, (x2, y2))) => (id1, (x1 + x2, y1 + y2))
        }
      }

      var newPoints = pointStats.map {mapping =>
        (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))}

      tempDist = 0.0
      for (mapping <- newPoints) {
        tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2)
      }

      for (newP <- newPoints) {
        kPoints.put(newP._1, newP._2)
      }
    }

    println("Final centers: " + kPoints)
  }
} 
Example 159
Source File: GroupByKey.scala    From learning-spark   with Apache License 2.0 5 votes vote down vote up
package com.javachen.spark.examples.rdd

import java.util.Random

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._


object GroupByKey {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("GroupBy Test").setMaster("local[2]")
    var numMappers = 10
    var numKVPairs = 100
    var valSize = 100
    var numReducers = 3

    val sc = new SparkContext(sparkConf)

    val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random
      var arr1 = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        ranGen.nextBytes(byteArr)
        arr1(i) = (ranGen.nextInt(10), byteArr)
      }
      arr1
    }.cache
    // Enforce that everything has been calculated and in cache
    pairs1.count

    val result = pairs1.groupByKey(numReducers)
    println(result.count)
    println(result.toDebugString)

    sc.stop()
  }
} 
Example 160
Source File: UserRepositoryInMemoryInterpreter.scala    From scala-pet-store   with Apache License 2.0 5 votes vote down vote up
package io.github.pauljamescleary.petstore
package infrastructure.repository.inmemory

import java.util.Random

import cats.implicits._
import cats.Applicative
import cats.data.OptionT
import domain.users.{User, UserRepositoryAlgebra}
import tsec.authentication.IdentityStore

import scala.collection.concurrent.TrieMap

class UserRepositoryInMemoryInterpreter[F[_]: Applicative]
    extends UserRepositoryAlgebra[F]
    with IdentityStore[F, Long, User] {
  private val cache = new TrieMap[Long, User]

  private val random = new Random

  def create(user: User): F[User] = {
    val id = random.nextLong
    val toSave = user.copy(id = id.some)
    cache += (id -> toSave)
    toSave.pure[F]
  }

  def update(user: User): OptionT[F, User] = OptionT {
    user.id.traverse { id =>
      cache.update(id, user)
      user.pure[F]
    }
  }

  def get(id: Long): OptionT[F, User] =
    OptionT.fromOption(cache.get(id))

  def delete(id: Long): OptionT[F, User] =
    OptionT.fromOption(cache.remove(id))

  def findByUserName(userName: String): OptionT[F, User] =
    OptionT.fromOption(cache.values.find(u => u.userName == userName))

  def list(pageSize: Int, offset: Int): F[List[User]] =
    cache.values.toList.sortBy(_.lastName).slice(offset, offset + pageSize).pure[F]

  def deleteByUserName(userName: String): OptionT[F, User] =
    OptionT.fromOption(
      for {
        user <- cache.values.find(u => u.userName == userName)
        removed <- cache.remove(user.id.get)
      } yield removed,
    )
}

object UserRepositoryInMemoryInterpreter {
  def apply[F[_]: Applicative]() =
    new UserRepositoryInMemoryInterpreter[F]
} 
Example 161
Source File: PigFuncs.scala    From piglet   with Apache License 2.0 5 votes vote down vote up
package dbis.piglet.backends.flink

import java.util.Random

import dbis.piglet.CommonPigFuncs
import dbis.piglet.backends._
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.functions._
import org.apache.flink.api.scala._

import scala.reflect.ClassTag

class CustomSampler[T <: SchemaClass: ClassTag: TypeInformation](dataSet: DataSet[T]) {
  def sample(withReplacement: Boolean, fraction: Double, seed: Long = new Random().nextLong()) = {
    dataSet.mapPartition(new SampleWithFraction[T](withReplacement, fraction, seed))
  }

}

object Sampler {
  implicit def addSampler[T <: SchemaClass: ClassTag: TypeInformation](dataSet: DataSet[T]) = {
    new CustomSampler(dataSet)
  }
}

object PigFuncs extends CommonPigFuncs {
} 
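
Sampler uses the standard Scala enrichment pattern: an implicit conversion wraps a DataSet in CustomSampler, so sample(...) reads as if it were a method on DataSet itself. A generic, Flink-free sketch of the same pattern (all names below are illustrative, not part of piglet):

import java.util.Random
import scala.language.implicitConversions

// Enrichment via implicit conversion: once the implicit is in scope,
// `sampleBy` is available on any Seq[T], the same way Sampler adds
// `sample` to DataSet above.
class RichSeq[T](xs: Seq[T]) {
  def sampleBy(fraction: Double, seed: Long = new Random().nextLong()): Seq[T] = {
    val rnd = new Random(seed)
    xs.filter(_ => rnd.nextDouble() < fraction)
  }
}

object RichSeq {
  implicit def toRichSeq[T](xs: Seq[T]): RichSeq[T] = new RichSeq(xs)
}

With import RichSeq._ in scope, Seq(1, 2, 3).sampleBy(0.5) compiles the same way dataSet.sample(false, 0.1) does through Sampler.addSampler.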
Example 162
Source File: StreamingPredictionsSpec.scala    From odsc-east-realish-predictions   with Apache License 2.0 4 votes vote down vote up
package com.twilio.open.odsc.realish

import java.sql.Timestamp
import java.time.Instant
import java.util.{Random, UUID}

import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoders, SQLContext, SparkSession}
import org.scalatest.{FunSuite, Matchers}
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.{OutputMode, Trigger}

import scala.concurrent.duration._

class StreamingPredictionsSpec extends FunSuite with Matchers with SharedSparkSql {

  override def conf: SparkConf = {
    new SparkConf()
      .setMaster("local[*]")
      .setAppName("odsc-spark-utils")
      .set("spark.ui.enabled", "false")
      .set("spark.app.id", appID)
      .set("spark.driver.host", "localhost")
      .set("spark.sql.session.timeZone", "UTC")
  }

  final val notRandomRandom = {
    val generator = new Random
    generator.setSeed(100L)
    generator
  }

  test("should stream in some mock data for fun") {
    implicit val spark: SparkSession = sparkSql
    import spark.implicits._
    implicit val sqlContext: SQLContext = spark.sqlContext

    implicit val metricEncoder = Encoders.product[Metric]
    val metricData = MemoryStream[Metric]

    val startingInstant = Instant.now()

    val backingData = (1 to 10000).map(offset => {
      val metric = if (offset % 2 == 0) "loss_percentage" else "connect_duration"
      val nextLoss = notRandomRandom.nextDouble() * notRandomRandom.nextInt(100)
      Metric(
        Timestamp.from(startingInstant.minusSeconds(offset)),
        UUID.randomUUID().toString,
        metric,
        value = if (metric == "loss_percentage") nextLoss else notRandomRandom.nextDouble() * notRandomRandom.nextInt(240),
        countryCode = if (offset % 8 == 0) "US" else "BR",
        callDirection = if (metric == "loss_percentage") "inbound" else "outbound"
      )
    })
    val processingTimeTrigger = Trigger.ProcessingTime(2.seconds)


    val streamingQuery = metricData.toDF()
      .withWatermark("timestamp", "2 hours")
      .groupBy(col("metric"), col("countryCode"), window($"timestamp", "5 minutes"))
      .agg(
        min("value") as "min",
        avg("value") as "mean",
        max("value") as "max",
        count("*") as "total"
      )
      .writeStream
      .format("memory")
      .queryName("datastream")
      .outputMode(OutputMode.Append())
      .trigger(processingTimeTrigger)
      .start()

    metricData.addData(backingData)

    streamingQuery.processAllAvailable()

    spark.sql("select * from datastream").show(20, false)

    val checkChange = spark.sql("select * from datastream")
      .groupBy("metric","countryCode")
      .agg(
        sum("total") as "total",
        avg("mean") as "mean"
      )

    checkChange.show(20, false)

    // now can do interesting things with minor back tracking...

    streamingQuery.stop()

  }

}
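
This spec compiles against a Metric case class and a SharedSparkSql test helper defined elsewhere in the project and not shown here. Judging only from how Metric is constructed and queried above, it needs roughly the following shape; the name of the second, unreferenced String field is our guess:

import java.sql.Timestamp

// Shape inferred from the constructor calls and groupBy columns in the spec above;
// `callId` is a guessed name, everything else is taken directly from usage.
case class Metric(
  timestamp: Timestamp,
  callId: String,
  metric: String,
  value: Double,
  countryCode: String,
  callDirection: String
)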