java.util.Random Scala Examples
The following examples show how to use java.util.Random.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
Example 1
Source File: IntegrationTest.scala From kmq with Apache License 2.0 | 6 votes |
package com.softwaremill.kmq.redelivery import java.time.Duration import java.util.Random import akka.actor.ActorSystem import akka.kafka.scaladsl.{Consumer, Producer} import akka.kafka.{ConsumerSettings, ProducerMessage, ProducerSettings, Subscriptions} import akka.stream.ActorMaterializer import akka.testkit.TestKit import com.softwaremill.kmq._ import com.softwaremill.kmq.redelivery.infrastructure.KafkaSpec import org.apache.kafka.clients.consumer.ConsumerConfig import org.apache.kafka.clients.producer.{ProducerConfig, ProducerRecord} import org.apache.kafka.common.serialization.StringDeserializer import org.scalatest.concurrent.Eventually import org.scalatest.time.{Seconds, Span} import org.scalatest.{BeforeAndAfterAll, FlatSpecLike, Matchers} import scala.collection.mutable.ArrayBuffer class IntegrationTest extends TestKit(ActorSystem("test-system")) with FlatSpecLike with KafkaSpec with BeforeAndAfterAll with Eventually with Matchers { implicit val materializer = ActorMaterializer() import system.dispatcher "KMQ" should "resend message if not committed" in { val bootstrapServer = s"localhost:${testKafkaConfig.kafkaPort}" val kmqConfig = new KmqConfig("queue", "markers", "kmq_client", "kmq_redelivery", Duration.ofSeconds(1).toMillis, 1000) val consumerSettings = ConsumerSettings(system, new StringDeserializer, new StringDeserializer) .withBootstrapServers(bootstrapServer) .withGroupId(kmqConfig.getMsgConsumerGroupId) .withProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest") val markerProducerSettings = ProducerSettings(system, new MarkerKey.MarkerKeySerializer(), new MarkerValue.MarkerValueSerializer()) .withBootstrapServers(bootstrapServer) .withProperty(ProducerConfig.PARTITIONER_CLASS_CONFIG, classOf[ParititionFromMarkerKey].getName) val markerProducer = markerProducerSettings.createKafkaProducer() val random = new Random() lazy val processedMessages = ArrayBuffer[String]() lazy val receivedMessages = ArrayBuffer[String]() val control = Consumer.committableSource(consumerSettings, Subscriptions.topics(kmqConfig.getMsgTopic)) // 1. get messages from topic .map { msg => ProducerMessage.Message( new ProducerRecord[MarkerKey, MarkerValue](kmqConfig.getMarkerTopic, MarkerKey.fromRecord(msg.record), new StartMarker(kmqConfig.getMsgTimeoutMs)), msg) } .via(Producer.flow(markerProducerSettings, markerProducer)) // 2. write the "start" marker .map(_.message.passThrough) .mapAsync(1) { msg => msg.committableOffset.commitScaladsl().map(_ => msg.record) // this should be batched } .map { msg => receivedMessages += msg.value msg } .filter(_ => random.nextInt(5) != 0) .map { processedMessage => processedMessages += processedMessage.value new ProducerRecord[MarkerKey, MarkerValue](kmqConfig.getMarkerTopic, MarkerKey.fromRecord(processedMessage), EndMarker.INSTANCE) } .to(Producer.plainSink(markerProducerSettings, markerProducer)) // 5. write "end" markers .run() val redeliveryHook = RedeliveryTracker.start(new KafkaClients(bootstrapServer), kmqConfig) val messages = (0 to 20).map(_.toString) messages.foreach(msg => sendToKafka(kmqConfig.getMsgTopic,msg)) eventually { receivedMessages.size should be > processedMessages.size processedMessages.sortBy(_.toInt).distinct shouldBe messages }(PatienceConfig(timeout = Span(15, Seconds)), implicitly) redeliveryHook.close() control.shutdown() } override def afterAll(): Unit = { super.afterAll() TestKit.shutdownActorSystem(system) } }
Example 2
Source File: package.scala From iotchain with MIT License | 5 votes |
package jbok import java.nio.charset.StandardCharsets import java.util.Random import jbok.crypto.hash._ import scodec.bits.ByteVector import jbok.crypto.signature.SignatureInstances trait StringSyntax { implicit final def stringSyntax(a: String): StringOps = new StringOps(a) } final class StringOps(val a : String) extends AnyVal { def utf8bytes: ByteVector = ByteVector(a.getBytes(StandardCharsets.UTF_8)) } trait CryptoSyntax extends CryptoHasherSyntax with StringSyntax trait CryptoInstances extends CryptoHasherInstances with SignatureInstances package object crypto extends CryptoSyntax with CryptoInstances { def randomByteString(random: Random, length: Int): ByteVector = ByteVector(randomByteArray(random, length)) def randomByteArray(random: Random, length: Int): Array[Byte] = { val bytes = Array.ofDim[Byte](length) random.nextBytes(bytes) bytes } }
Example 3
Source File: SignaturePlatform.scala From iotchain with MIT License | 5 votes |
package jbok.crypto.signature import java.math.BigInteger import java.util.Random import cats.effect.Sync import jbok.crypto.facade.{BN, EC, SignatureEC} import scala.scalajs.js.JSConverters._ import scala.scalajs.js.typedarray.Uint8Array trait SignaturePlatform { val ecdsa: Signature[ECDSA] = ECDSAPlatform } private object ECDSAPlatform extends Signature[ECDSA] { import ECDSACommon._ val secp256k1 = new EC("secp256k1") override def generateKeyPair[F[_]](random: Option[Random])(implicit F: Sync[F]): F[KeyPair] = F.delay { val keyPair = secp256k1.genKeyPair() val secret = KeyPair.Secret(keyPair.getPrivate("hex")) // drop uncompressed indicator, make it 64-bytes val pubkey = KeyPair.Public(keyPair.getPublic(false, "hex").drop(2)) KeyPair(pubkey, secret) } override def generatePublicKey[F[_]](secret: KeyPair.Secret)(implicit F: Sync[F]): F[KeyPair.Public] = F.delay { val keyPair = secp256k1.keyFromPrivate(secret.bytes.toHex, "hex") // drop uncompressed indicator, make it 64-bytes KeyPair.Public(keyPair.getPublic(false, "hex").drop(2)) } override def sign[F[_]](hash: Array[Byte], keyPair: KeyPair, chainId: BigInt)(implicit F: Sync[F]): F[CryptoSignature] = F.delay { val kp = secp256k1.keyFromPrivate(keyPair.secret.bytes.toHex, "hex") val sig = secp256k1.sign(new Uint8Array(hash.toJSArray), kp) val r = new BigInteger(sig.r.toString) val s = new BigInteger(sig.s.toString) val pointSign = calculatePointSign(r, toCanonicalS(s), keyPair, hash, chainId) match { case Some(recId) => recId case None => throw new Exception("unexpected error") } val rid: BigInt = getRecoveryId(chainId, pointSign).getOrElse(pointSign) CryptoSignature(r, toCanonicalS(s), rid) } override def verify[F[_]](hash: Array[Byte], sig: CryptoSignature, public: KeyPair.Public, chainId: BigInt)(implicit F: Sync[F]): F[Boolean] = F.delay { getPointSign(chainId, sig.v).exists { bigInt => val signatureEC = convert(sig.copy(v = bigInt)) val key = secp256k1.keyFromPublic(UNCOMPRESSED_INDICATOR_STRING + public.bytes.toHex, "hex") secp256k1.verify(new Uint8Array(hash.toJSArray), signatureEC, key) } } override def recoverPublic(hash: Array[Byte], sig: CryptoSignature, chainId: BigInt): Option[KeyPair.Public] = getPointSign(chainId, sig.v).map { bigInt => val signatureEC = convert(sig.copy(v = bigInt)) val msg = new Uint8Array(hash.toJSArray) val recId = secp256k1.getKeyRecoveryParam(msg, signatureEC) val point = secp256k1.recoverPubKey(new Uint8Array(hash.toJSArray), signatureEC, recId) KeyPair.Public(point.encode("hex", false).drop(2)) } private def convert(sig: CryptoSignature) = { val r = new BN(sig.r.toString(16), 16) val s = new BN(sig.s.toString(16), 16) SignatureEC(r, s, recoveryParam = (sig.v - NEGATIVE_POINT_SIGN).toInt) } private def calculatePointSign(r: BigInt, s: BigInt, keyPair: KeyPair, hash: Array[Byte], chainId: BigInt): Option[BigInt] = allowedPointSigns.find( v => recoverPublic(hash, CryptoSignature(r, s, getRecoveryId(chainId, v).getOrElse(v)), chainId) .contains(keyPair.public)) }
Example 4
Source File: LoggerSimulation.scala From BigData-News with Apache License 2.0 | 5 votes |
package com.vita.spark.utils import java.io.PrintWriter import java.net.ServerSocket class LoggerSimulation { } object LoggerSimulation { var numIndex = 0 /** * 生成一个字母 * * @param 字母的下标 * @return 生成的字母 */ def gennerateContent(index: Int): String = { import scala.collection.mutable.ListBuffer val charList = ListBuffer[Char](); for (i <- 65 to 90) { charList += i.toChar } val charArray = charList.toArray charArray(index).toString(); } def gennerateNumber(): String = { // numIndex += 1 // return numIndex.toString return "a,b,c,d,e,f" } /** * 生成随机下标 * * @return 返回一个下标 */ def index = { import java.util.Random val rdm = new Random() rdm.nextInt(7) } /** * 启动一个main方法来创建一个serversockt发送消息 * * @param args 端口,发送的时间间隔 */ def main(args: Array[String]): Unit = { if (args.length != 2) { System.err.println("Usage:<port><millisecond>") System.exit(1); } val listener = new ServerSocket(args(0).toInt) println("已经做好连接的准备-------") while (true) { val socket = listener.accept() new Thread() { override def run(): Unit = { println("Got client connected from:" + socket.getInetAddress) val out = new PrintWriter(socket.getOutputStream, true) while (true) { Thread.sleep(args(1).toLong) // val content = gennerateContent(index) val content = gennerateNumber() println(content) out.write(content + "\n") out.flush() } socket.close() } }.start() } } }
Example 5
Source File: SimpleSkewedGroupByTest.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import org.apache.spark.sql.SparkSession object SimpleSkewedGroupByTest { def main(args: Array[String]) { val spark = SparkSession .builder .appName("SimpleSkewedGroupByTest") .getOrCreate() val numMappers = if (args.length > 0) args(0).toInt else 2 val numKVPairs = if (args.length > 1) args(1).toInt else 1000 val valSize = if (args.length > 2) args(2).toInt else 1000 val numReducers = if (args.length > 3) args(3).toInt else numMappers val ratio = if (args.length > 4) args(4).toInt else 5.0 val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p => val ranGen = new Random val result = new Array[(Int, Array[Byte])](numKVPairs) for (i <- 0 until numKVPairs) { val byteArr = new Array[Byte](valSize) ranGen.nextBytes(byteArr) val offset = ranGen.nextInt(1000) * numReducers if (ranGen.nextDouble < ratio / (numReducers + ratio - 1)) { // give ratio times higher chance of generating key 0 (for reducer 0) result(i) = (offset, byteArr) } else { // generate a key for one of the other reducers val key = 1 + ranGen.nextInt(numReducers-1) + offset result(i) = (key, byteArr) } } result }.cache // Enforce that everything has been calculated and in cache pairs1.count println("RESULT: " + pairs1.groupByKey(numReducers).count) // Print how many keys each reducer got (for debugging) // println("RESULT: " + pairs1.groupByKey(numReducers) // .map{case (k,v) => (k, v.size)} // .collectAsMap) spark.stop() } } // scalastyle:on println
Example 6
Source File: SkewedGroupByTest.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import org.apache.spark.sql.SparkSession object SkewedGroupByTest { def main(args: Array[String]) { val spark = SparkSession .builder .appName("GroupBy Test") .getOrCreate() val numMappers = if (args.length > 0) args(0).toInt else 2 var numKVPairs = if (args.length > 1) args(1).toInt else 1000 val valSize = if (args.length > 2) args(2).toInt else 1000 val numReducers = if (args.length > 3) args(3).toInt else numMappers val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p => val ranGen = new Random // map output sizes linearly increase from the 1st to the last numKVPairs = (1.0 * (p + 1) / numMappers * numKVPairs).toInt val arr1 = new Array[(Int, Array[Byte])](numKVPairs) for (i <- 0 until numKVPairs) { val byteArr = new Array[Byte](valSize) ranGen.nextBytes(byteArr) arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr) } arr1 }.cache() // Enforce that everything has been calculated and in cache pairs1.count() println(pairs1.groupByKey(numReducers).count()) spark.stop() } } // scalastyle:on println
Example 7
Source File: SparkHdfsLR.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{DenseVector, Vector} import org.apache.spark.sql.SparkSession object SparkHdfsLR { val D = 10 // Number of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val tok = new java.util.StringTokenizer(line, " ") var y = tok.nextToken.toDouble var x = new Array[Double](D) var i = 0 while (i < D) { x(i) = tok.nextToken.toDouble; i += 1 } DataPoint(new DenseVector(x), y) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { if (args.length < 2) { System.err.println("Usage: SparkHdfsLR <file> <iters>") System.exit(1) } showWarning() val spark = SparkSession .builder .appName("SparkHdfsLR") .getOrCreate() val inputPath = args(0) val lines = spark.read.textFile(inputPath).rdd val points = lines.map(parsePoint).cache() val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) spark.stop() } } // scalastyle:on println
Example 8
Source File: LocalLR.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import breeze.linalg.{DenseVector, Vector} object LocalLR { val N = 10000 // Number of data points val D = 10 // Number of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData: Array[DataPoint] = { def generatePoint(i: Int): DataPoint = { val y = if (i % 2 == 0) -1 else 1 val x = DenseVector.fill(D) {rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData // Initialize w to a random value var w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) var gradient = DenseVector.zeros[Double](D) for (p <- data) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println("Final w: " + w) } } // scalastyle:on println
Example 9
Source File: GroupByTest.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import org.apache.spark.sql.SparkSession object GroupByTest { def main(args: Array[String]) { val spark = SparkSession .builder .appName("GroupBy Test") .getOrCreate() val numMappers = if (args.length > 0) args(0).toInt else 2 val numKVPairs = if (args.length > 1) args(1).toInt else 1000 val valSize = if (args.length > 2) args(2).toInt else 1000 val numReducers = if (args.length > 3) args(3).toInt else numMappers val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p => val ranGen = new Random val arr1 = new Array[(Int, Array[Byte])](numKVPairs) for (i <- 0 until numKVPairs) { val byteArr = new Array[Byte](valSize) ranGen.nextBytes(byteArr) arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr) } arr1 }.cache() // Enforce that everything has been calculated and in cache pairs1.count() println(pairs1.groupByKey(numReducers).count()) spark.stop() } } // scalastyle:on println
Example 10
Source File: LocalFileLR.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import breeze.linalg.{DenseVector, Vector} object LocalFileLR { val D = 10 // Number of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val nums = line.split(' ').map(_.toDouble) DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0)) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val lines = scala.io.Source.fromFile(args(0)).getLines().toArray val points = lines.map(parsePoint _) val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) var gradient = DenseVector.zeros[Double](D) for (p <- points) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println("Final w: " + w) } } // scalastyle:on println
Example 11
Source File: PageViewGenerator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming.clickstream import java.io.PrintWriter import java.net.ServerSocket import java.util.Random // scalastyle:on object PageViewGenerator { val pages = Map("http://foo.com/" -> .7, "http://foo.com/news" -> 0.2, "http://foo.com/contact" -> .1) val httpStatus = Map(200 -> .95, 404 -> .05) val userZipCode = Map(94709 -> .5, 94117 -> .5) val userID = Map((1 to 100).map(_ -> .01): _*) def pickFromDistribution[T](inputMap: Map[T, Double]): T = { val rand = new Random().nextDouble() var total = 0.0 for ((item, prob) <- inputMap) { total = total + prob if (total > rand) { return item } } inputMap.take(1).head._1 // Shouldn't get here if probabilities add up to 1.0 } def getNextClickEvent(): String = { val id = pickFromDistribution(userID) val page = pickFromDistribution(pages) val status = pickFromDistribution(httpStatus) val zipCode = pickFromDistribution(userZipCode) new PageView(page, status, zipCode, id).toString() } def main(args: Array[String]) { if (args.length != 2) { System.err.println("Usage: PageViewGenerator <port> <viewsPerSecond>") System.exit(1) } val port = args(0).toInt val viewsPerSecond = args(1).toFloat val sleepDelayMs = (1000.0 / viewsPerSecond).toInt val listener = new ServerSocket(port) println("Listening on port: " + port) while (true) { val socket = listener.accept() new Thread() { override def run(): Unit = { println("Got client connected from: " + socket.getInetAddress) val out = new PrintWriter(socket.getOutputStream(), true) while (true) { Thread.sleep(sleepDelayMs) out.write(getNextClickEvent()) out.flush() } socket.close() } }.start() } } } // scalastyle:on println
Example 12
Source File: SparkLR.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{DenseVector, Vector} import org.apache.spark.sql.SparkSession object SparkLR { val N = 10000 // Number of data points val D = 10 // Number of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData: Array[DataPoint] = { def generatePoint(i: Int): DataPoint = { val y = if (i % 2 == 0) -1 else 1 val x = DenseVector.fill(D) {rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val spark = SparkSession .builder .appName("SparkLR") .getOrCreate() val numSlices = if (args.length > 0) args(0).toInt else 2 val points = spark.sparkContext.parallelize(generateData, numSlices).cache() // Initialize w to a random value var w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) spark.stop() } } // scalastyle:on println
Example 13
Source File: LocalKMeans.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{squaredDistance, DenseVector, Vector} object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData: Array[DenseVector[Double]] = { def generatePoint(i: Int): DenseVector[Double] = { DenseVector.fill(D) {rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var index = 0 var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers.get(i).get val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use org.apache.spark.ml.clustering.KMeans |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData var points = new HashSet[Vector[Double]] var kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println("Initial centers: " + kPoints) while(tempDist > convergeDist) { var closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) var mappings = closest.groupBy[Int] (x => x._1) var pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println("Final centers: " + kPoints) } } // scalastyle:on println
Example 14
Source File: StopwatchSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.util import java.util.Random import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext class StopwatchSuite extends SparkFunSuite with MLlibTestSparkContext { import StopwatchSuite._ private def testStopwatchOnDriver(sw: Stopwatch): Unit = { assert(sw.name === "sw") assert(sw.elapsed() === 0L) assert(!sw.isRunning) intercept[AssertionError] { sw.stop() } val duration = checkStopwatch(sw) val elapsed = sw.elapsed() assert(elapsed === duration) val duration2 = checkStopwatch(sw) val elapsed2 = sw.elapsed() assert(elapsed2 === duration + duration2) assert(sw.toString === s"sw: ${elapsed2}ms") sw.start() assert(sw.isRunning) intercept[AssertionError] { sw.start() } } test("LocalStopwatch") { val sw = new LocalStopwatch("sw") testStopwatchOnDriver(sw) } test("DistributedStopwatch on driver") { val sw = new DistributedStopwatch(sc, "sw") testStopwatchOnDriver(sw) } test("DistributedStopwatch on executors") { val sw = new DistributedStopwatch(sc, "sw") val rdd = sc.parallelize(0 until 4, 4) val acc = sc.longAccumulator rdd.foreach { i => acc.add(checkStopwatch(sw)) } assert(!sw.isRunning) val elapsed = sw.elapsed() assert(elapsed === acc.value) } test("MultiStopwatch") { val sw = new MultiStopwatch(sc) .addLocal("local") .addDistributed("spark") assert(sw("local").name === "local") assert(sw("spark").name === "spark") intercept[NoSuchElementException] { sw("some") } assert(sw.toString === "{\n local: 0ms,\n spark: 0ms\n}") val localDuration = checkStopwatch(sw("local")) val sparkDuration = checkStopwatch(sw("spark")) val localElapsed = sw("local").elapsed() val sparkElapsed = sw("spark").elapsed() assert(localElapsed === localDuration) assert(sparkElapsed === sparkDuration) assert(sw.toString === s"{\n local: ${localElapsed}ms,\n spark: ${sparkElapsed}ms\n}") val rdd = sc.parallelize(0 until 4, 4) val acc = sc.longAccumulator rdd.foreach { i => sw("local").start() val duration = checkStopwatch(sw("spark")) sw("local").stop() acc.add(duration) } val localElapsed2 = sw("local").elapsed() assert(localElapsed2 === localElapsed) val sparkElapsed2 = sw("spark").elapsed() assert(sparkElapsed2 === sparkElapsed + acc.value) } } private object StopwatchSuite extends SparkFunSuite { private def now: Long = System.currentTimeMillis() }
Example 15
Source File: PartitionwiseSampledRDD.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.util.Random import scala.reflect.ClassTag import org.apache.spark.{Partition, TaskContext} import org.apache.spark.util.random.RandomSampler import org.apache.spark.util.Utils private[spark] class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long) extends Partition with Serializable { override val index: Int = prev.index } private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag]( prev: RDD[T], sampler: RandomSampler[T, U], preservesPartitioning: Boolean, @transient private val seed: Long = Utils.random.nextLong) extends RDD[U](prev) { @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None override def getPartitions: Array[Partition] = { val random = new Random(seed) firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong())) } override def getPreferredLocations(split: Partition): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev) override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = { val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition] val thisSampler = sampler.clone thisSampler.setSeed(split.seed) thisSampler.sample(firstParent[T].iterator(split.prev, context)) } }
Example 16
Source File: CsvKafkaPublisher.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.common import java.io.File import java.util.Random import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} import scala.io.Source object CsvKafkaPublisher { var counter = 0 var salts = 0 def main(args:Array[String]): Unit = { if (args.length == 0) { println("<brokerList> " + "<topicName> " + "<dataFolderOrFile> " + "<sleepPerRecord> " + "<acks> " + "<linger.ms> " + "<producer.type> " + "<batch.size> " + "<salts>") return } val kafkaBrokerList = args(0) val kafkaTopicName = args(1) val nyTaxiDataFolder = args(2) val sleepPerRecord = args(3).toInt val acks = args(4).toInt val lingerMs = args(5).toInt val producerType = args(6) //"async" val batchSize = args(7).toInt salts = args(8).toInt val kafkaProducer = KafkaProducerUntil.getNewProducer(kafkaBrokerList, acks, lingerMs, producerType, batchSize) println("--Input:" + nyTaxiDataFolder) val dataFolder = new File(nyTaxiDataFolder) if (dataFolder.isDirectory) { val files = dataFolder.listFiles().iterator files.foreach(f => { println("--Input:" + f) processFile(f, kafkaTopicName, kafkaProducer, sleepPerRecord) }) } else { println("--Input:" + dataFolder) processFile(dataFolder, kafkaTopicName, kafkaProducer, sleepPerRecord) } println("---Done") } def processFile(file:File, kafkaTopicName:String, kafkaProducer: KafkaProducer[String, String], sleepPerRecord:Int): Unit = { var counter = 0 val r = new Random() println("-Starting Reading") Source.fromFile(file).getLines().foreach(l => { counter += 1 if (counter % 10000 == 0) { println("{Sent:" + counter + "}") } if (counter % 100 == 0) { print(".") } Thread.sleep(sleepPerRecord) val saltedVender = r.nextInt(salts) + l if (counter > 2) { publishTaxiRecord(saltedVender, kafkaTopicName, kafkaProducer) } }) } def publishTaxiRecord(line:String, kafkaTopicName:String, kafkaProducer: KafkaProducer[String, String]): Unit = { if (line.startsWith("vendor_name") || line.length < 10) { println("skip") } else { val message = new ProducerRecord[String, String](kafkaTopicName, line.hashCode.toString, line) kafkaProducer.send(message) } } }
Example 17
Source File: ExtremeSummarizerSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries.summarize.summarizer import com.twosigma.flint.rdd.function.summarize.summarizer.Summarizer import com.twosigma.flint.timeseries.row.Schema import com.twosigma.flint.timeseries.summarize.{ SummarizerFactory, SummarizerSuite } import com.twosigma.flint.timeseries.{ CSV, Summarizers, TimeSeriesRDD, TimeSeriesSuite } import org.apache.spark.sql.types.{ DataType, DoubleType, FloatType, IntegerType, LongType, StructType } import java.util.Random import org.apache.spark.sql.Row class ExtremeSummarizerSpec extends SummarizerSuite { override val defaultResourceDir: String = "/timeseries/summarize/summarizer/meansummarizer" private def test[T]( dataType: DataType, randValue: Row => Any, summarizer: String => SummarizerFactory, reduceFn: (T, T) => T, inputColumn: String, outputColumn: String ): Unit = { val priceTSRdd = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType)).addColumns( inputColumn -> dataType -> randValue ) val data = priceTSRdd.collect().map{ row => row.getAs[T](inputColumn) } val trueExtreme = data.reduceLeft[T]{ case (x, y) => reduceFn(x, y) } val result = priceTSRdd.summarize(summarizer(inputColumn)) val extreme = result.first().getAs[T](outputColumn) val outputType = result.schema(outputColumn).dataType assert(outputType == dataType, s"$outputType") assert(trueExtreme === extreme, s"extreme: $extreme, trueExtreme: $trueExtreme, data: ${data.toSeq}") } "MaxSummarizer" should "compute double max correctly" in { val rand = new Random() test[Double](DoubleType, { _: Row => rand.nextDouble() }, Summarizers.max, math.max, "x", "x_max") } it should "compute long max correctly" in { val rand = new Random() test[Long](LongType, { _: Row => rand.nextLong() }, Summarizers.max, math.max, "x", "x_max") } it should "compute float max correctly" in { val rand = new Random() test[Float](FloatType, { _: Row => rand.nextFloat() }, Summarizers.max, math.max, "x", "x_max") } it should "compute int max correctly" in { val rand = new Random() test[Int](IntegerType, { _: Row => rand.nextInt() }, Summarizers.max, math.max, "x", "x_max") } "MinSummarizer" should "compute double min correctly" in { val rand = new Random() test[Double](DoubleType, { _: Row => rand.nextDouble() }, Summarizers.min, math.min, "x", "x_min") } it should "compute long min correctly" in { val rand = new Random() test[Long](LongType, { _: Row => rand.nextLong() }, Summarizers.min, math.min, "x", "x_min") } it should "compute float min correctly" in { val rand = new Random() test[Float](FloatType, { _: Row => rand.nextFloat() }, Summarizers.min, math.min, "x", "x_min") } it should "compute int min correctly" in { val rand = new Random() test[Int](IntegerType, { _: Row => rand.nextInt() }, Summarizers.min, math.min, "x", "x_min") } it should "pass summarizer property test" in { summarizerPropertyTest(AllProperties)(Summarizers.max("x1")) summarizerPropertyTest(AllProperties)(Summarizers.min("x2")) } it should "ignore null values" in { val input = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType)) val inputWithNull = insertNullRows(input, "price") assertEquals( input.summarize(Summarizers.min("price")), inputWithNull.summarize(Summarizers.min("price")) ) } }
Example 18
Source File: WithdrawalEpochCertificateFixture.scala From Sidechains-SDK with MIT License | 5 votes |
package com.horizen.block import java.util.Random trait WithdrawalEpochCertificateFixture { private def getBytes(len: Int = 32, rnd: Random = new Random()): Array[Byte] = { val bytes = new Array[Byte](len) rnd.nextBytes(bytes) bytes } def generateWithdrawalEpochCertificate(previousMcBlockHashOpt: Option[Array[Byte]] = None, rnd: Random = new Random()): WithdrawalEpochCertificate = { WithdrawalEpochCertificate( getBytes(), rnd.nextInt, getBytes(), rnd.nextInt(), rnd.nextLong(), previousMcBlockHashOpt.getOrElse(getBytes()), getBytes(), Seq(), Seq(), Seq()) } }
Example 19
Source File: GenerationRules.scala From Sidechains-SDK with MIT License | 5 votes |
package com.horizen.fixtures.sidechainblock.generation import java.util.Random import scorex.util.ModifierId case class GenerationRules(forgingBoxesToAdd: Set[SidechainForgingData] = Set(), forgingBoxesToSpent: Set[SidechainForgingData] = Set(), mcReferenceIsPresent: Option[Boolean] = None, corruption: CorruptedGenerationRules = CorruptedGenerationRules.emptyCorruptedGenerationRules, forcedParentId: Option[ModifierId] = None, forcedTimestamp: Option[Long] = None ) { def isCorrupted: Boolean = corruption == CorruptedGenerationRules.emptyCorruptedGenerationRules } object GenerationRules { def generateCorrectGenerationRules(rnd: Random, allNotSpentForgerData: Set[SidechainForgingData]): GenerationRules = { val addForgingData: Set[SidechainForgingData] = if (allNotSpentForgerData.size > 100) { Set(SidechainForgingData.generate(rnd, Math.abs(rnd.nextInt(1000000)))) } else { Set(SidechainForgingData.generate(rnd, Math.abs(rnd.nextInt(1000000))), SidechainForgingData.generate(rnd, Math.abs(rnd.nextInt(1000000)))) } val removedForgingData: Set[SidechainForgingData] = if (rnd.nextBoolean()) { Set(allNotSpentForgerData.toSeq(rnd.nextInt(allNotSpentForgerData.size))) } else { val deleteSize = if (allNotSpentForgerData.size > 100) 10 else 1 allNotSpentForgerData.toSeq.sortBy(_.forgerBox.value())(Ordering[Long]).take(deleteSize).toSet } require((removedForgingData -- allNotSpentForgerData).isEmpty) GenerationRules(forgingBoxesToAdd = addForgingData, forgingBoxesToSpent = removedForgingData) } }
Example 20
Source File: SidechainForgingData.scala From Sidechains-SDK with MIT License | 5 votes |
package com.horizen.fixtures.sidechainblock.generation import java.util.Random import com.horizen.box.ForgerBox import com.horizen.box.data.ForgerBoxData import com.horizen.consensus._ import com.horizen.proof.VrfProof import com.horizen.proposition.VrfPublicKey import com.horizen.secret.{PrivateKey25519, PrivateKey25519Creator, VrfKeyGenerator, VrfSecretKey} import com.horizen.vrf.VrfOutput case class SidechainForgingData(key: PrivateKey25519, forgerBox: ForgerBox, vrfSecret: VrfSecretKey) { def canBeForger(vrfMessage: VrfMessage, totalStake: Long, additionalCheck: Boolean => Boolean): Option[(VrfProof, VrfOutput)] = { val vrfProofAndHash = vrfSecret.prove(vrfMessage) val vrfProof = vrfProofAndHash.getKey val vrfOutput = vrfProofAndHash.getValue val checker = (stakeCheck _).tupled.andThen(additionalCheck) Some((vrfProof, vrfOutput)).filter{case (vrfProof, vrfOutput) => checker(vrfOutput, totalStake)} } private def stakeCheck(vrfOutput: VrfOutput, totalStake: Long): Boolean = { vrfProofCheckAgainstStake(vrfOutput, forgerBox.value(), totalStake) } val forgerId: Array[Byte] = forgerBox.id() override def toString: String = { s"id - ${key.hashCode()}, value - ${forgerBox.value()}" } override def equals(obj: Any): Boolean = { obj match { case that: SidechainForgingData => { val keyEquals = this.key.equals(that.key) val forgerBoxEquals = this.forgerBox.equals(that.forgerBox) val vrfSecretEquals = this.vrfSecret.equals(that.vrfSecret) keyEquals && forgerBoxEquals && vrfSecretEquals } case _ => false } } } object SidechainForgingData { def generate(rnd: Random, value: Long): SidechainForgingData = { val key: PrivateKey25519 = PrivateKey25519Creator.getInstance().generateSecret(rnd.nextLong().toString.getBytes) val vrfSecretKey = VrfKeyGenerator.getInstance().generateSecret(rnd.nextLong().toString.getBytes()) val vrfPublicKey: VrfPublicKey = vrfSecretKey.publicImage(); val forgerBox = new ForgerBoxData(key.publicImage(), value, key.publicImage(), vrfPublicKey).getBox(rnd.nextLong()) SidechainForgingData(key, forgerBox, vrfSecretKey) } }
Example 21
Source File: ForgerBoxFixture.scala From Sidechains-SDK with MIT License | 5 votes |
package com.horizen.fixtures import java.util.Random import com.horizen.box.ForgerBox import com.horizen.box.data.ForgerBoxData import com.horizen.proposition.VrfPublicKey import com.horizen.secret.{PrivateKey25519, VrfKeyGenerator, VrfSecretKey} import com.horizen.utils import com.horizen.utils.Ed25519 case class ForgerBoxGenerationMetadata(propositionSecret: PrivateKey25519, blockSignSecret: PrivateKey25519, vrfSecret: VrfSecretKey) object ForgerBoxFixture { def generateForgerBox(seed: Long): (ForgerBox, ForgerBoxGenerationMetadata) = generateForgerBox(seed, None) def generateForgerBox(seed: Long, vrfKeysOpt: Option[(VrfSecretKey, VrfPublicKey)]): (ForgerBox, ForgerBoxGenerationMetadata) = { val randomGenerator = new Random(seed) val byteSeed = new Array[Byte](32) randomGenerator.nextBytes(byteSeed) val propositionKeyPair: utils.Pair[Array[Byte], Array[Byte]] = Ed25519.createKeyPair(byteSeed) val ownerKeys: PrivateKey25519 = new PrivateKey25519(propositionKeyPair.getKey, propositionKeyPair.getValue) val value: Long = randomGenerator.nextLong val (vrfSecret, vrfPubKey) = vrfKeysOpt.getOrElse{ val secretKey = VrfKeyGenerator.getInstance().generateSecret(ownerKeys.bytes()) val publicKey = secretKey.publicImage() (secretKey, publicKey) } val proposition = ownerKeys.publicImage() val forgerBoxData = new ForgerBoxData(proposition, value, proposition, vrfPubKey) val nonce: Long = randomGenerator.nextLong val forgerBox = forgerBoxData.getBox(nonce) (forgerBox, ForgerBoxGenerationMetadata(ownerKeys, ownerKeys, vrfSecret)) } }
Example 22
Source File: HistoryConsensusCheckerTest.scala From Sidechains-SDK with MIT License | 5 votes |
package com.horizen.consensus import java.util.Random import com.horizen.SidechainHistory import com.horizen.fixtures.sidechainblock.generation._ import com.horizen.params.{NetworkParams, TestNetParams} import org.junit.Test import org.scalatest.junit.JUnitSuite import scala.collection.mutable import scala.util.{Failure, Success, Try} class HistoryConsensusCheckerTest extends JUnitSuite with HistoryConsensusChecker { def testWithSeed(testSeed: Int): Unit = { //val testSeed = 234 val rnd: Random = new Random(testSeed) val initialParams = TestNetParams(consensusSlotsInEpoch = 10, sidechainGenesisBlockTimestamp = 1333344452L) val (params, genesisBlock, genesisGenerator, genesisForgingData, genesisEndEpochInfo) = SidechainBlocksGenerator.startSidechain(10000000000L, testSeed, initialParams) val history: SidechainHistory = createHistory(params, genesisBlock, genesisEndEpochInfo) val nonce = history.calculateNonceForEpoch(blockIdToEpochId(genesisBlock.id)) val stake = genesisEndEpochInfo.stakeConsensusEpochInfo history.applyFullConsensusInfo(genesisBlock.id, FullConsensusEpochInfo(stake, nonce)) println(s"//////////////// Genesis epoch ${genesisBlock.id} had been ended ////////////////") val generators = mutable.IndexedSeq(genesisGenerator) (1 to 50) .foldLeft[(SidechainHistory, mutable.IndexedSeq[SidechainBlocksGenerator])]((history, generators)) { (acc, index) => val currentHistory: SidechainHistory = acc._1 val currentGenerators: mutable.IndexedSeq[SidechainBlocksGenerator] = acc._2 val nextGenerator: SidechainBlocksGenerator = generatorSelection(rnd, currentGenerators) val nextCorrectGenerationRules: GenerationRules = GenerationRules.generateCorrectGenerationRules(rnd, nextGenerator.getNotSpentBoxes) println("try to add incorrect block(s)") tryToAddIncorrectBlocks(params, currentHistory, nextGenerator, nextCorrectGenerationRules, rnd) println("try to add correct block") val correctRes = Try(generateBlock(nextCorrectGenerationRules, nextGenerator, history)) match { case Success((gens, generatedBlock)) => val updatedHistory = historyUpdateShallBeSuccessful(currentHistory, generatedBlock) val updatedGenerators = currentGenerators ++ gens (updatedHistory, updatedGenerators) case Failure(ex: GenerationIsNoLongerPossible) => println("Finishing block generation") return case Failure(ex) => println("Error during block generation") throw ex } correctRes } } private def tryToAddIncorrectBlocks(params: NetworkParams, currentHistory: SidechainHistory, currentGenerator: SidechainBlocksGenerator, correctGenerationRules: GenerationRules, rnd: Random, incorrectBlocksCount: Int = 2): Unit = Try { (1 to incorrectBlocksCount) .foreach{ _ => val incorrectGenerationRules: GenerationRules = CorruptedGenerationRules.corruptGenerationRules(rnd, params, currentGenerator, correctGenerationRules) //println(s"Generated corruption rules are: ${incorrectGenerationRules}") currentGenerator .tryToGenerateBlockForCurrentSlot(incorrectGenerationRules) .map(generationInfo => historyUpdateShallBeFailed(currentHistory,generationInfo.block, incorrectGenerationRules)) } } @Test def testManySeeds(): Unit = { val seed = 9084 (50 to 50).foreach{index => println(s"SEED IS ${index}") testWithSeed(index + seed) } } }
Example 23
Source File: DescriptiveStatsSuite.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package com.sap.commons import java.util.Random import org.scalatest.FunSuite // scalastyle:off magic.number class DescriptiveStatsSuite extends FunSuite { val SEED = 123 val SignificantPosCorrelation = 0.9 test("mean") { val samples0 = Seq(1, 1) val samples1 = Seq(1, 2, 3, 4) val samples2 = Seq(1.1, 0.9, 1.0) val samples3 = Seq.empty[Int] assertResult(1.0)(DescriptiveStats.mean(samples0)) assertResult(2.5)(DescriptiveStats.mean(samples1)) assertResult(1.0)(DescriptiveStats.mean(samples2)) val samples3mean = DescriptiveStats.mean(samples3) assert(samples3mean.isNaN) } test("stdev") { val samples0 = Seq(0, 2) val samples1 = Seq.empty[Double] val samples2 = Seq.fill(1000)(0) ++ Seq.fill(1000)(2) assertResult(math.sqrt(2))(DescriptiveStats.stdev(samples0)) assert(DescriptiveStats.stdev(samples1).isNaN) val samples2stdev = DescriptiveStats.stdev(samples2) assert(samples2stdev > 1.0 && samples2stdev < 1.001) } test("pearson") { val rand = new Random(SEED) val samples1 = Seq((1, 1), (2, 2), (3, 3)) val samples2 = Seq((3.0, 1), (2.0, 2), (1.0, 3)) val samples3 = (1 to 100000).map { i => (rand.nextDouble(), rand.nextDouble()) } val samples4 = Seq.empty[(Double, Double)] assertResult(1.0)(DescriptiveStats.pearson(samples1)) assertResult(-1.0)(DescriptiveStats.pearson(samples2)) assert(math.abs(DescriptiveStats.pearson(samples3)) < 0.01) assert(DescriptiveStats.pearson(samples4).isNaN) } test("spearman") { val rand = new Random(SEED) val samples1 = Seq((1, 1), (2, 2), (3, 3)) val samples2 = Seq((3.0, 1), (2.0, 2), (1.0, 3)) val samples3 = (1 to 100000).map { i => (rand.nextDouble(), rand.nextDouble()) } val samples4 = Seq.empty[(Double, Double)] assertResult(1.0)(DescriptiveStats.spearman(samples1)) assertResult(-1.0)(DescriptiveStats.spearman(samples2)) assert(math.abs(DescriptiveStats.spearman(samples3)) < 0.01) assert(DescriptiveStats.spearman(samples4).isNaN) } test("spearman & pearson w/ noise & outliers") { val samples1 = Seq((1, 300.0), (2, 250.0), (3, 400.0), (4, 350.0), (5, 500.0), (6, 450.0), (7, 600.0), (8, 550.0), (9, 700.0), (10, 650.0)) val samples2 = Seq((1, 300.0), (2, 350.0), (3, 400.0), (4, 450.0), (5, 500.0), (6, 550.0), (7, 2000.0), (8, 700.0), (9, 750.0), (10, 800.0)) assert(DescriptiveStats.pearson(samples1) > SignificantPosCorrelation) assert(DescriptiveStats.spearman(samples1) > SignificantPosCorrelation) // pearson is less robust, does not detect dependency assert(DescriptiveStats.pearson(samples2) < SignificantPosCorrelation) // spearman detects dependency assert(DescriptiveStats.spearman(samples2) > SignificantPosCorrelation) } test("spearman & pearson w/ real data") { val measure1 = Seq(379, 379, 382, 360, 378, 374, 364, 371, 360, 365, 364, 363, 369, 375, 365, 369, 358, 372, 370, 363, 363, 369, 361, 362, 367, 357, 365, 364, 363, 368, 360, 361, 360, 363, 359, 357, 365, 367, 364, 363) val measure2 = Seq(411, 379, 380, 382, 387, 404, 410, 431, 430, 444, 468, 489, 519, 573, 571, 620, 643, 657, 694, 711, 752, 783, 807, 841, 856, 891, 912, 962,1042, 982, 1076,1056,1092,1145,1128,1186,1221,1245,1284,1307) val samples1 = measure1.zipWithIndex val samples2 = measure2.zipWithIndex assert(DescriptiveStats.pearson(samples1) < SignificantPosCorrelation) assert(DescriptiveStats.spearman(samples1) < SignificantPosCorrelation) assert(DescriptiveStats.pearson(samples2) > SignificantPosCorrelation) assert(DescriptiveStats.spearman(samples2) > SignificantPosCorrelation) } }
Example 24
Source File: VLBFGS1.scala From spark-vl-bfgs with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.optim import java.util.Random import scala.language.implicitConversions import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.ml.optim.VectorFreeLBFGS.{Oracle, VectorSpace} import org.apache.spark.ml.optim.VectorRDDFunctions._ import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors} import org.apache.spark.mllib.random.RandomRDDs import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.{RDD, UnionRDD} import org.apache.spark.storage.StorageLevel private def gradient(data: RDD[Array[LabeledPoint]], dx: RDD[Vector]): RDD[Vector] = { data.cartesian(dx).map { case (points, x) => val g = Vectors.zeros(x.size) points.foreach { case LabeledPoint(b, a) => val err = BLAS.dot(a, x) - b BLAS.axpy(err, a, g) } g }.treeSum() } def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("VLBFGS").setMaster("local[*]") val sc = new SparkContext(conf) sc.setCheckpointDir("/tmp/checkpoint") val n = 1000 val p = 100 val random = new Random(0L) val xExact = Vectors.dense(Array.fill(p)(random.nextDouble())) val data = RandomRDDs.normalVectorRDD(sc, n, p, 4, 11L).mapPartitionsWithIndex { (idx, part) => val random = new Random(100 + idx) part.map { v => val target = BLAS.dot(v, xExact) + 0.1 * random.nextGaussian() LabeledPoint(target, v) } }.glom() .cache() val x = solve(data).first() println(s"x_exact = $xExact") println(s"x_vlbfgs = $x") sc.stop() } }
Example 25
Source File: ProjectionsTest.scala From spark-tda with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.linalg import java.util.Random import org.scalacheck.Gen import org.scalacheck.Prop.forAllNoShrink import org.scalatest.Matchers import org.scalatest.prop.GeneratorDrivenPropertyChecks import org.scalatest.prop.Checkers.check class ProjectionsTest extends LinalgPropSpec with GeneratorDrivenPropertyChecks with Matchers { import org.scalactic.Tolerance._ val dimGen = for { srcDim <- Gen.choose(100, 200) dstDim <- Gen.choose(100, 200) } yield (srcDim, dstDim) property("gaussian random projection have good statistical properties") { forAllNoShrink(dimGen) { case (srcDim, dstDim) => val projection = GaussianRandomProjection(srcDim, dstDim, new Random()) projection.mean === 0.0 +- 0.01 projection.stddev === 1.0 +- 0.01 } } property("cauchy random projection have good statistical properties") { forAllNoShrink(dimGen) { case (srcDim, dstDim) => val projection = CauchyRandomProjection(srcDim, dstDim) projection.median === 0.0 +- 0.01 } } }
Example 26
Source File: FuzzySearchEvaluationUtilsWrapper.scala From random-projections-at-berlinbuzzwords with Apache License 2.0 | 5 votes |
package com.stefansavev.similaritysearch.implementation import java.util import java.util.Random import com.stefansavev.randomprojections.utils.RandomUtils import com.stefansavev.similaritysearch.{SimilaritySearchIndex, SimilaritySearchResult, SimilaritySearchResultBuilder, SimilaritySearchResults} object FuzzySearchEvaluationUtilsWrapper { def generateRandomTestSet(rnd: Random, numQueries: Int, index: SimilaritySearchIndex): SimilaritySearchResults = { import scala.collection.JavaConversions._ val itemNames = index.getItems.toIterator.map(_.getName).toArray val sampleIds = RandomUtils.sample(rnd, numQueries, Array.range(0, itemNames.length)) val builder = new SimilaritySearchResultBuilder() for (id <- sampleIds) { val queryId = itemNames(id) val queryVector = index.getItemByName(queryId).getVector val queryResults = new util.ArrayList[SimilaritySearchResult]() builder.addResult(queryId, queryResults) } return builder.build() } }
Example 27
Source File: DataFrameView.scala From random-projections-at-berlinbuzzwords with Apache License 2.0 | 5 votes |
package com.stefansavev.randomprojections.datarepr.dense import java.util.Random import com.stefansavev.randomprojections.datarepr.sparse.SparseVector import com.stefansavev.randomprojections.implementation.{Signatures, PointSignatures} class PointIndexes(val indexes: Array[Int]){ def toTuple: PointIndexes.TupleType = (0, indexes) def size = indexes.length def apply(i: Int): Int = indexes(i) } object PointIndexes{ type TupleType = (Int, Array[Int]) //the first is dummy because I need to add a tuple1 def apply(indexes: Array[Int]): PointIndexes = new PointIndexes(indexes) def unapply(pntIndexes: PointIndexes): Option[Array[Int]] = Some(pntIndexes.indexes) def fromTuple(t: TupleType): PointIndexes = new PointIndexes(t._2) } class DataFrameView(val indexes: PointIndexes, val rowStoredView: RowStoredMatrixView) { def toTuple:DataFrameView.TupleType = (indexes, rowStoredView) var pointSignatures: PointSignatures = null //work around until the concept is validated def numRows: Int = indexes.size def numCols: Int = rowStoredView.numCols def setPointSignatures(pointSignatures: PointSignatures): Unit = { this.pointSignatures = pointSignatures } def getRowIdByName(name: String): Int = { rowStoredView.getRowIdByName(name) } def buildSetSignatures(numSignatures: Int, rnd: Random): Unit = { if (pointSignatures != null){ throw new IllegalStateException("Signatures cannot be overwritten") } val (signatureVecs, signatures) = Signatures.computePointSignatures(numSignatures, rnd, this) this.setPointSignatures(signatures) } def getPointSignatures(): PointSignatures = { this.pointSignatures } def getAllRowNames(): Array[String] = { rowStoredView.getAllRowNames() } def getPointAsDenseVector(pntId: Int): Array[Double] = { rowStoredView.getPointAsDenseVector(pntId) } def getPointAsDenseVector(pntId: Int, columnIds: Array[Int], vec: Array[Double]): Unit = { rowStoredView.getPointAsDenseVector(pntId, columnIds, vec) } def multiplyRowComponentWiseBySparseVector(pntId: Int, sv: SparseVector, output: Array[Double]): Unit = { rowStoredView.multiplyRowComponentWiseBySparseVector(pntId, sv, output) } def getUnderlyingIndexes(): PointIndexes = indexes def childView(newIndexes: PointIndexes): DataFrameView = { new DataFrameView(newIndexes, rowStoredView) } def getLabel(rowId: Int): Int = rowStoredView.getLabel(rowId) def getAllLabels(): Array[Int] = rowStoredView.getAllLabels() def getName(rowId: Int): String = { rowStoredView.getName(rowId) } //def dist(id1: Int, id2: Int): Double = rowStoredView.dist(id1, id2) def cosineForNormalizedData(query: Array[Double], id: Int): Double = rowStoredView.cosineForNormalizedData(query, id) override def toString = s"DataFrameView($numRows, $numCols)" } object DataFrameView{ type TupleType = (PointIndexes, RowStoredMatrixView) def fromTuple(t: TupleType) = new DataFrameView(t._1, t._2) }
Example 28
Source File: RandomUtils.scala From random-projections-at-berlinbuzzwords with Apache License 2.0 | 5 votes |
package com.stefansavev.randomprojections.utils import java.util.Random import com.stefansavev.randomprojections.buffers.IntArrayBuffer import com.stefansavev.randomprojections.datarepr.sparse.SparseVector object RandomUtils { def shuffleInts(rnd: Random, arr: Array[Int]): Array[Int] = { val values = arr.map(v => (v, rnd.nextDouble())).sortBy(_._2).map(_._1) values } def shuffleDoubles(rnd: Random, arr: Array[Double]): Array[Double] = { val values = arr.map(v => (v, rnd.nextDouble())).sortBy(_._2).map(_._1) values } def sign(rnd: Random): Double = { if (rnd.nextDouble() > 0.5) 1.0 else -1.0 } def generateRandomVector(rnd: Random, numCols: Int, columnIds: Array[Int]): SparseVector = { val signs = columnIds.map(_ => (if (rnd.nextDouble() >= 0.5) 1.0 else -1.0)) var sum = 0.0 var i = 0 while (i < signs.length) { val v = signs(i) sum += v * v i += 1 } sum = Math.sqrt(sum) i = 0 while (i < signs.length) { signs(i) /= sum i += 1 } val sparseVec = new SparseVector(numCols, columnIds, signs) sparseVec } def generateRandomVector(rnd: Random, numCols: Int): SparseVector = { generateRandomVector(rnd, numCols, Array.range(0, numCols)) } //TODO: use a version of reservoir sampling together with random shuffle def sample(rnd: Random, k: Int, arr: Array[Int]): Array[Int] = { def getValue(arr: Array[Int], overWrites: scala.collection.mutable.HashMap[Int, Int], index: Int): Int = { if (overWrites.contains(index)) { overWrites(index) } else { arr(index) } } var currentLength = arr.length val buffer = new IntArrayBuffer() val overWrites = new scala.collection.mutable.HashMap[Int, Int]() var i = 0 while (i < k && currentLength > 0) { val nextPos = rnd.nextInt(currentLength) val sampledValue = getValue(arr, overWrites, nextPos) buffer += sampledValue if (nextPos < currentLength - 1) { val lastValue = getValue(arr, overWrites, currentLength - 1) overWrites += ((nextPos, lastValue)) } currentLength -= 1 i += 1 } buffer.toArray() } }
Example 29
Source File: SplitIntoKProjection.scala From random-projections-at-berlinbuzzwords with Apache License 2.0 | 5 votes |
package com.stefansavev.randomprojections.implementation class SplitIntoKProjection { } import java.util.Random import com.stefansavev.randomprojections.datarepr.dense.DataFrameView import com.stefansavev.randomprojections.datarepr.sparse.SparseVector import com.stefansavev.randomprojections.utils.RandomUtils import scala.collection.mutable.ArrayBuffer case class SplitIntoKProjectionStrategy(rnd: Random, numCols: Int, k: Int) extends ProjectionStrategy { def chooseKPoints(k: Int, pointIds: Array[Int], view: DataFrameView): Array[Int] = { RandomUtils.shuffleInts(rnd, pointIds).take(k) } def chooseKDimensions(k: Int): Array[Int] = { val columns = Array.range(0, numCols) RandomUtils.shuffleInts(rnd, columns).take(k).sorted } def generateRandomVector(columnIds: Array[Int]): SparseVector = { val signs = columnIds.map(_ => (if (rnd.nextDouble() >= 0.5) 1.0 else -1.0)) var sum = 0.0 var i = 0 while (i < signs.length) { val v = signs(i) sum += v * v i += 1 } sum = Math.sqrt(sum) i = 0 while (i < signs.length) { signs(i) /= sum i += 1 } val sparseVec = new SparseVector(numCols, columnIds, signs) sparseVec } def generateKRandomVectors(num: Int, columnIds: Array[Int]): Array[SparseVector] = { val buff = new ArrayBuffer[SparseVector]() for (i <- 0 until num) { buff += generateRandomVector(columnIds) } buff.toArray } def nextRandomProjection(depth: Int, view: DataFrameView, projectionVector: AbstractProjectionVector): AbstractProjectionVector = { val useK = HadamardUtils.largestPowerOf2(k) val chosenDim = chooseKDimensions(useK) val randomVector = generateRandomVector(chosenDim) val proj = new HadamardProjectionVector(randomVector) proj } } case class SplitIntoKProjectionSettings(k: Int) class SplitIntoKProjectionBuilder(builderSettings: SplitIntoKProjectionSettings) extends ProjectionStrategyBuilder { type T = SplitIntoKProjectionStrategy val splitStrategy: DatasetSplitStrategy = new HadamardProjectionSplitStrategy() def build(settings: IndexSettings, rnd: Random, dataFrameView: DataFrameView): T = SplitIntoKProjectionStrategy(rnd, dataFrameView.numCols, builderSettings.k) def datasetSplitStrategy: DatasetSplitStrategy = splitStrategy }
Example 30
Source File: ValuesStoreTest.scala From random-projections-at-berlinbuzzwords with Apache License 2.0 | 5 votes |
package com.stefansavev import java.util.Random import com.stefansavev.randomprojections.datarepr.dense.store._ import com.typesafe.scalalogging.StrictLogging import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner import org.scalatest.{FlatSpec, Matchers} @RunWith(classOf[JUnitRunner]) class TestSingleByteEncodingSpec extends FlatSpec with Matchers { "Error after encoding double to float" should "be small" in { val minV = -1.0f val maxV = 2.0f val rnd = new Random(481861) for (i <- 0 until 100) { //we encode a float (which is 4 bytes) with a single byte //therefore the loss of precision val value = rnd.nextFloat() * 3.0f - 1.0f val enc = FloatToSingleByteEncoder.encodeValue(minV, maxV, value) val dec = FloatToSingleByteEncoder.decodeValue(minV, maxV, enc) val error = Math.abs(value - dec) error should be < (0.01) } } } @RunWith(classOf[JUnitRunner]) class TestValueStores extends FlatSpec with Matchers { case class BuilderTypeWithErrorPredicate(builderType: StoreBuilderType, pred: Double => Boolean) "ValueStore" should "return store the data with small error" in { val tests = List( BuilderTypeWithErrorPredicate(StoreBuilderAsDoubleType, error => (error <= 0.0)), BuilderTypeWithErrorPredicate(StoreBuilderAsBytesType, error => (error <= 0.01)), BuilderTypeWithErrorPredicate(StoreBuilderAsSingleByteType, error => (error <= 0.01)) ) for (test <- tests) { testBuilder(test) } def testBuilder(builderWithPred: BuilderTypeWithErrorPredicate): Unit = { val dataGenSettings = RandomBitStrings.RandomBitSettings( numGroups = 1000, numRowsPerGroup = 2, numCols = 256, per1sInPrototype = 0.5, perNoise = 0.2) val debug = false val randomBitStringsDataset = RandomBitStrings.genRandomData(58585, dataGenSettings, debug, true) val builder = builderWithPred.builderType.getBuilder(randomBitStringsDataset.numCols) def addValues(): Unit = { var i = 0 while (i < randomBitStringsDataset.numRows) { val values = randomBitStringsDataset.getPointAsDenseVector(i) builder.addValues(values) i += 1 } } addValues() val valueStore = builder.build() def verifyStoredValues(expected: Array[Double], stored: Array[Double]): Unit = { for (i <- 0 until expected.length) { val error = Math.abs(expected(i) - stored(i)) val passed = builderWithPred.pred(error) passed should be (true) } } def testValues(): Unit = { var i = 0 while (i < randomBitStringsDataset.numRows) { val values = randomBitStringsDataset.getPointAsDenseVector(i) val output = Array.ofDim[Double](randomBitStringsDataset.numCols) valueStore.fillRow(i, output, true) verifyStoredValues(values, output) i += 1 } } testValues() } } } object Test extends StrictLogging { def main(args: Array[String]) { logger.info("hello") } }
Example 31
Source File: SpeedSimulator.scala From random-projections-at-berlinbuzzwords with Apache License 2.0 | 5 votes |
package com.stefansavev.tuning import java.util.Random case class SpeedSimulatorParams(numberOfQueries: Int, numberOfTrees: Int, requiredPointsPerTree: Int, deviationOfRequiredPointsPerTree: Int = 0){ } object SpeedSimulator { val rnd = new Random(11144) def testPoint(p: Int, params: SpeedSimulatorParams, buffer: Array[Int]): Int = { val rnd = this.rnd val len = buffer.length var i = 0 val numTrees = params.numberOfTrees val numPointsPerTree = params.requiredPointsPerTree val offset = rnd.nextInt(len) val stride = offset/params.requiredPointsPerTree + rnd.nextInt(50) var totalOperations = 0 while(i < numTrees){ var j = 0 while(j < numPointsPerTree){ val k = (offset + stride*j + i) % len //some random formula buffer(k) += 1 j += 1 } i += 1 totalOperations += 1 } totalOperations } def simulate(params: SpeedSimulatorParams): Unit = { val numQueries = params.numberOfQueries val bufflen = params.numberOfQueries val buffer = Array.ofDim[Int](bufflen) val start = System.currentTimeMillis() var i = 0 while(i < numQueries){ var j = 0 while(j < bufflen){ buffer(j) = 0 j += 1 } if (i % 5000 == 0){ println(".") } testPoint(i, params, buffer) i += 1 } val result = System.currentTimeMillis() - start println("Simulation in secs: " + result/1000.0 + " ; per point in ms " + result.toDouble/numQueries.toDouble) } def main (args: Array[String]): Unit = { val params = SpeedSimulatorParams(numberOfQueries = 42000, numberOfTrees = 10, requiredPointsPerTree = 1200) simulate(params) } }
Example 32
Source File: RandomBitStrings.scala From random-projections-at-berlinbuzzwords with Apache License 2.0 | 5 votes |
package com.stefansavev import java.util.Random import com.stefansavev.randomprojections.datarepr.dense.{ColumnHeaderBuilder, DataFrameView, PointIndexes, RowStoredMatrixViewBuilderFactory} object RandomBitStrings { case class RandomBitSettings(numGroups: Int, numRowsPerGroup: Int, numCols: Int, per1sInPrototype: Double, perNoise: Double) def generatePrototype(rnd: Random, dim: Int, perValue: Double): Array[Double] = { val arr = Array.ofDim[Double](dim) for (i <- 0 until dim) { val gaussian = Math.abs(rnd.nextGaussian()) arr(i) = -1.0 * gaussian if (rnd.nextDouble() < perValue) { arr(i) = 1.0 * gaussian } } arr } def corrupt(rnd: Random, input: Array[Double], perNoise: Double): Array[Double] = { val arr = Array.ofDim[Double](input.length) for (i <- 0 until input.length) { if (rnd.nextDouble() < perNoise) { arr(i) = -input(i) } else { arr(i) = input(i) } } arr } //todo: put in utils def normalize(input: Array[Double]): Array[Double] = { val arr = Array.ofDim[Double](input.length) var norm = 0.0 for (i <- 0 until input.length) { norm += input(i) * input(i) } norm = Math.sqrt(norm) for (i <- 0 until input.length) { arr(i) = input(i) / norm } arr } def genRandomData(seed: Int, settings: RandomBitSettings, debug: Boolean, dense: Boolean): DataFrameView = { val (numGroups, numRowsPerGroup, numCols: Int, per1sInPrototype: Double, perNoise: Double) = (settings.numGroups, settings.numRowsPerGroup, settings.numCols, settings.per1sInPrototype, settings.perNoise) val numRows = numGroups * numRowsPerGroup val labels = Array.ofDim[Int](numRows) val rnd = new Random(seed) var i = 0 val columnNames = Array.range(0, numCols).map((i: Int) => ("feature" + i, i)) val rowNames = Array.range(0, numRows).map(_.toString) val header = ColumnHeaderBuilder.build("label", columnNames, true) val builder = RowStoredMatrixViewBuilderFactory.createDense(header) for (g <- 0 until numGroups) { val prototype = generatePrototype(rnd, numCols, per1sInPrototype) for (r <- 0 until numRowsPerGroup) { val noisyProt = corrupt(rnd, prototype, perNoise) labels(i) = g if (i != builder.currentRowId) { throw new IllegalStateException("Cannot skip rows") } builder.addRow(i.toString, g, Array.range(0, numCols), normalize(noisyProt)) i += 1 } } val indexes = PointIndexes(Array.range(0, numRows)) new DataFrameView(indexes, builder.build()) } }
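Each group shares one prototype; every row in the group is a sign-flipped, noisy copy of it that is then L2-normalized. Reduced to plain arrays (the DataFrameView builder and labels are omitted), the generation logic can be sketched as:

import java.util.Random

object BitStringSketch {
  // Prototype: each coordinate is a Gaussian magnitude with a random sign,
  // biased towards the positive side with probability perValue.
  def prototype(rnd: Random, dim: Int, perValue: Double): Array[Double] =
    Array.fill(dim) {
      val g = Math.abs(rnd.nextGaussian())
      if (rnd.nextDouble() < perValue) g else -g
    }

  // Corrupt: flip the sign of each coordinate independently with probability perNoise.
  def corrupt(rnd: Random, v: Array[Double], perNoise: Double): Array[Double] =
    v.map(x => if (rnd.nextDouble() < perNoise) -x else x)

  // Normalize to unit L2 length.
  def normalize(v: Array[Double]): Array[Double] = {
    val norm = Math.sqrt(v.map(x => x * x).sum)
    v.map(_ / norm)
  }

  def main(args: Array[String]): Unit = {
    val rnd = new Random(58585)
    val proto = prototype(rnd, dim = 8, perValue = 0.5)
    val noisy = normalize(corrupt(rnd, proto, perNoise = 0.2))
    println(noisy.mkString(", "))
  }
}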
Example 33
Source File: RandomBitStrings.scala From random-projections-at-berlinbuzzwords with Apache License 2.0 | 5 votes |
package com.stefansavev.fuzzysearchtest import java.util.Random import com.stefansavev.randomprojections.datarepr.dense.{ColumnHeaderBuilder, DataFrameView, PointIndexes, RowStoredMatrixViewBuilderFactory} object RandomBitStrings { case class RandomBitSettings(numGroups: Int, numRowsPerGroup: Int, numCols: Int, per1sInPrototype: Double, perNoise: Double) def generatePrototype(rnd: Random, dim: Int, perValue: Double): Array[Double] = { val arr = Array.ofDim[Double](dim) for (i <- 0 until dim) { arr(i) = -1.0 if (rnd.nextDouble() < perValue) { arr(i) = 1.0 } } arr } def corrupt(rnd: Random, input: Array[Double], perNoise: Double): Array[Double] = { val arr = Array.ofDim[Double](input.length) for (i <- 0 until input.length) { if (rnd.nextDouble() < perNoise) { arr(i) = -input(i) } else { arr(i) = input(i) } } arr } def genRandomData(seed: Int, settings: RandomBitSettings, debug: Boolean, dense: Boolean): DataFrameView = { val (numGroups, numRowsPerGroup, numCols: Int, per1sInPrototype: Double, perNoise: Double) = (settings.numGroups, settings.numRowsPerGroup, settings.numCols, settings.per1sInPrototype, settings.perNoise) val numRows = numGroups * numRowsPerGroup val labels = Array.ofDim[Int](numRows) val rnd = new Random(seed) var i = 0 val columnNames = Array.range(0, numCols).map((i: Int) => ("feature" + i, i)) val header = ColumnHeaderBuilder.build("label", columnNames, false) val builder = RowStoredMatrixViewBuilderFactory.createDense(header) for (g <- 0 until numGroups) { val prototype = generatePrototype(rnd, numCols, per1sInPrototype) for (r <- 0 until numRowsPerGroup) { val noisyProt = corrupt(rnd, prototype, perNoise) labels(i) = g if (i != builder.currentRowId) { throw new IllegalStateException("Cannot skip rows") } builder.addRow(g, Array.range(0, numCols), noisyProt) i += 1 } } val indexes = PointIndexes(Array.range(0, numRows)) new DataFrameView(indexes, builder.build()) } }
Example 34
Source File: TestOnRandomData.scala From random-projections-at-berlinbuzzwords with Apache License 2.0 | 5 votes |
package com.stefansavev.fuzzysearchtest import java.util.Random import com.stefansavev.randomprojections.actors.Application import com.stefansavev.randomprojections.implementation._ import com.stefansavev.randomprojections.utils.Utils import com.stefansavev.similaritysearch.SimilaritySearchEvaluationUtils import com.stefansavev.similaritysearch.VectorType.StorageSize import com.stefansavev.similaritysearch.implementation.FuzzySearchIndexBuilderWrapper import com.typesafe.scalalogging.StrictLogging object TestOnRandomData extends StrictLogging { implicit val _ = logger def main(args: Array[String]): Unit = { val dataGenSettings = RandomBitStrings.RandomBitSettings( numGroups = 100000, numRowsPerGroup = 2, numCols = 256, per1sInPrototype = 0.5, perNoise = 0.1) val debug = false val randomBitStringsDataset = RandomBitStrings.genRandomData(58585, dataGenSettings, debug, true) val randomTreeSettings = IndexSettings( maxPntsPerBucket = 50, numTrees = 50, maxDepth = None, projectionStrategyBuilder = ProjectionStrategies.splitIntoKRandomProjection(), reportingDistanceEvaluator = ReportingDistanceEvaluators.cosineOnOriginalData(), randomSeed = 39393 ) println("Number of Rows: " + randomBitStringsDataset.numRows) val diskLocation = "D:/tmp/randomfile" val trees = Utils.timed("Build Index", { val wrapper = new FuzzySearchIndexBuilderWrapper(diskLocation, randomBitStringsDataset.numCols, 50, StorageSize.Double) var i = 0 while (i < randomBitStringsDataset.numRows) { wrapper.addItem(i.toString, 0, randomBitStringsDataset.getPointAsDenseVector(i)) i += 1 } wrapper.build() //SimilaritySearchIndex.open(diskLocation) () }).result SimilaritySearchEvaluationUtils.compareWithBruteForce(diskLocation, new Random(481868), 1000, 50) Application.shutdown() } }
Example 35
Source File: RenderParticle.scala From Electrodynamics with GNU Lesser General Public License v3.0 | 5 votes |
package com.calclavia.edx.quantum.machine.accelerator import java.util.Random import cpw.mods.fml.relauncher.{Side, SideOnly} import net.minecraft.client.renderer.entity.Render import net.minecraft.client.renderer.{RenderHelper, Tessellator} import net.minecraft.entity.Entity import net.minecraft.util.ResourceLocation import org.lwjgl.opengl.GL11 @SideOnly(Side.CLIENT) class RenderParticle extends Render { def doRender(entity: Entity, x: Double, y: Double, z: Double, var8: Float, var9: Float) { val tessellator: Tessellator = Tessellator.instance var par2: Float = (entity.ticksExisted) while (par2 > 200) { par2 -= 100 } RenderHelper.disableStandardItemLighting val var41: Float = (5 + par2) / 200.0F var var51: Float = 0.0F if (var41 > 0.8F) { var51 = (var41 - 0.8F) / 0.2F } val rand: Random = new Random(432L) GL11.glPushMatrix GL11.glTranslatef(x.asInstanceOf[Float], y.asInstanceOf[Float], z.asInstanceOf[Float]) GL11.glScalef(0.15f, 0.15f, 0.15f) GL11.glDisable(GL11.GL_TEXTURE_2D) GL11.glShadeModel(GL11.GL_SMOOTH) GL11.glEnable(GL11.GL_BLEND) GL11.glBlendFunc(GL11.GL_SRC_ALPHA, GL11.GL_ONE) GL11.glDisable(GL11.GL_ALPHA_TEST) GL11.glEnable(GL11.GL_CULL_FACE) GL11.glDepthMask(false) GL11.glPushMatrix GL11.glTranslatef(0.0F, -1.0F, -2.0F) for (i1 <- 0 to ((var41 + var41 * var41) / 2.0F * 60.0F).asInstanceOf[Int]) { GL11.glRotatef(rand.nextFloat * 360.0F, 1.0F, 0.0F, 0.0F) GL11.glRotatef(rand.nextFloat * 360.0F, 0.0F, 1.0F, 0.0F) GL11.glRotatef(rand.nextFloat * 360.0F, 0.0F, 0.0F, 1.0F) GL11.glRotatef(rand.nextFloat * 360.0F, 1.0F, 0.0F, 0.0F) GL11.glRotatef(rand.nextFloat * 360.0F, 0.0F, 1.0F, 0.0F) GL11.glRotatef(rand.nextFloat * 360.0F + var41 * 90.0F, 0.0F, 0.0F, 1.0F) tessellator.startDrawing(6) val var81: Float = rand.nextFloat * 20.0F + 5.0F + var51 * 10.0F val var91: Float = rand.nextFloat * 2.0F + 1.0F + var51 * 2.0F tessellator.setColorRGBA_I(16777215, (255.0F * (1.0F - var51)).asInstanceOf[Int]) tessellator.addVertex(0.0D, 0.0D, 0.0D) tessellator.setColorRGBA_I(0, 0) tessellator.addVertex(-0.866D * var91, var81, -0.5F * var91) tessellator.addVertex(0.866D * var91, var81, -0.5F * var91) tessellator.addVertex(0.0D, var81, 1.0F * var91) tessellator.addVertex(-0.866D * var91, var81, -0.5F * var91) tessellator.draw } GL11.glPopMatrix GL11.glDepthMask(true) GL11.glDisable(GL11.GL_CULL_FACE) GL11.glDisable(GL11.GL_BLEND) GL11.glShadeModel(GL11.GL_FLAT) GL11.glColor4f(1.0F, 1.0F, 1.0F, 1.0F) GL11.glEnable(GL11.GL_TEXTURE_2D) GL11.glEnable(GL11.GL_ALPHA_TEST) RenderHelper.enableStandardItemLighting GL11.glPopMatrix } protected def getEntityTexture(entity: Entity): ResourceLocation = { return null } }
Example 36
Source File: BlockRadioactive.scala From Electrodynamics with GNU Lesser General Public License v3.0 | 5 votes |
package com.calclavia.edx.quantum.blocks import java.util.{List, Random} import cpw.mods.fml.relauncher.{Side, SideOnly} import net.minecraft.block.Block import net.minecraft.block.material.Material import net.minecraft.client.Minecraft import net.minecraft.client.particle.EntitySmokeFX import net.minecraft.client.renderer.texture.IIconRegister import net.minecraft.entity.{Entity, EntityLiving, EntityLivingBase} import net.minecraft.init.Blocks import net.minecraft.util.{AxisAlignedBB, IIcon} import net.minecraft.world.World import resonantengine.lib.potion.PoisonRadiation import resonantengine.lib.transform.vector.Vector3 import scala.collection.JavaConversions._ class BlockRadioactive(material: Material) extends Block(material) { var canSpread: Boolean = true var radius: Float = 5 var amplifier: Int = 2 var canWalkPoison: Boolean = true var isRandomlyRadioactive: Boolean = true var spawnParticle: Boolean = true private var iconTop: IIcon = null private var iconBottom: IIcon = null //Constructor this.setTickRandomly(true) this.setHardness(0.2F) override def getIcon(side: Int, metadata: Int): IIcon = { return if (side == 1) this.iconTop else (if (side == 0) this.iconBottom else this.blockIcon) } @SideOnly(Side.CLIENT) override def registerBlockIcons(iconRegister: IIconRegister) { super.registerBlockIcons(iconRegister) this.iconTop = iconRegister.registerIcon(this.getUnlocalizedName.replace("tile.", "") + "_top") this.iconBottom = iconRegister.registerIcon(this.getUnlocalizedName.replace("tile.", "") + "_bottom") } override def onEntityWalking(par1World: World, x: Int, y: Int, z: Int, par5Entity: Entity) { if (par5Entity.isInstanceOf[EntityLiving] && this.canWalkPoison) { PoisonRadiation.INSTANCE.poisonEntity(new Vector3(x, y, z), par5Entity.asInstanceOf[EntityLiving]) } } override def quantityDropped(par1Random: Random): Int = { return 0 } @SideOnly(Side.CLIENT) override def randomDisplayTick(world: World, x: Int, y: Int, z: Int, par5Random: Random) { if (this.spawnParticle) { if (Minecraft.getMinecraft.gameSettings.particleSetting == 0) { val radius: Int = 3 for (i <- 0 to 2) { val pos: Vector3 = new Vector3(x, y, z) pos.add(Math.random * radius - radius / 2, Math.random * radius - radius / 2, Math.random * radius - radius / 2) val fx: EntitySmokeFX = new EntitySmokeFX(world, pos.x, pos.y, pos.z, (Math.random - 0.5) / 2, (Math.random - 0.5) / 2, (Math.random - 0.5) / 2) fx.setRBGColorF(0.2f, 0.8f, 0) Minecraft.getMinecraft.effectRenderer.addEffect(fx) } } } } }
Example 37
Source File: BlockToxicWaste.scala From Electrodynamics with GNU Lesser General Public License v3.0 | 5 votes |
package com.calclavia.edx.quantum.blocks import java.util.Random import com.calclavia.edx.quantum.QuantumContent import QuantumContent import net.minecraft.block.material.Material import net.minecraft.entity.{Entity, EntityLivingBase} import net.minecraft.util.DamageSource import net.minecraft.world.World import net.minecraftforge.fluids.BlockFluidClassic import resonantengine.lib.potion.PoisonRadiation import resonantengine.lib.transform.vector.Vector3 class BlockToxicWaste extends BlockFluidClassic(QuantumContent.getFluidToxicWaste, Material.water) { //Constructor setTickRate(20) override def randomDisplayTick(par1World: World, x: Int, y: Int, z: Int, par5Random: Random) { super.randomDisplayTick(par1World, x, y, z, par5Random) if (par5Random.nextInt(100) == 0) { val d5: Double = x + par5Random.nextFloat val d7: Double = y + this.maxY val d6: Double = z + par5Random.nextFloat par1World.spawnParticle("suspended", d5, d7, d6, 0.0D, 0.0D, 0.0D) } if (par5Random.nextInt(200) == 0) { par1World.playSound(x, y, z, "liquid.lava", 0.2F + par5Random.nextFloat * 0.2F, 0.9F + par5Random.nextFloat * 0.15F, false) } } override def onEntityCollidedWithBlock(par1World: World, x: Int, y: Int, z: Int, entity: Entity) { if (entity.isInstanceOf[EntityLivingBase]) { entity.attackEntityFrom(DamageSource.wither, 3) PoisonRadiation.INSTANCE.poisonEntity(new Vector3(x, y, z), entity.asInstanceOf[EntityLivingBase], 4) } } }
Example 38
Source File: DirectDataInjector.scala From SparkOnKudu with Apache License 2.0 | 5 votes |
package org.kududb.spark.demo.gamer.cdc import java.text.SimpleDateFormat import java.util.Random import org.kududb.client.{PartialRow, Operation, KuduClient} import org.kududb.spark.demo.gamer.aggregates.GamerDataGenerator class DirectDataInjector { val simpleDateFormat = new SimpleDateFormat("MM,dd,yyyy") val random = new Random def main(args:Array[String]): Unit = { if (args.length == 0) { println("<kuduMaster> <tableName> <numberOfRecords>") return } val kuduMaster = args(0) val tableName = args(1) val numberOfRecords = args(2).toInt val kuduClient = new KuduClient.KuduClientBuilder(kuduMaster).build() val table = kuduClient.openTable(tableName) val session = kuduClient.newSession() for (i <- 0 to numberOfRecords) { val record = GamerDataGenerator.makeNewGamerRecord(100000) val pr = new PartialRow(table.getSchema) pr.addString(0, "record.gamerId") pr.addString(1, "") val scannerRows = kuduClient.newScannerBuilder(table).lowerBound(null).limit(1).build().nextRows() val op:Operation = if (scannerRows.hasNext) { val oldRow = scannerRows.next() val oldRecordUpdateOp = table.newInsert() val row = oldRecordUpdateOp.getRow row.addString("gamer_id", oldRow.getString("gamer_id")) row.addString("eff_to", simpleDateFormat.format(System.currentTimeMillis())) row.addString("eff_from", oldRow.getString("eff_from")) row.addLong("last_time_played", oldRow.getLong("last_time_played")) row.addInt("games_played", oldRow.getInt("games_played")) row.addInt("games_won", oldRow.getInt("games_won")) row.addInt("oks", oldRow.getInt("oks")) row.addInt("deaths", oldRow.getInt("deaths")) row.addInt("damage_given", oldRow.getInt("damage_given")) row.addInt("damage_taken", oldRow.getInt("damage_taken")) row.addInt("max_oks_in_one_game", oldRow.getInt("max_oks_in_one_game")) row.addInt("max_deaths_in_one_game", oldRow.getInt("max_deaths_in_one_game")) session.apply(oldRecordUpdateOp) table.newUpdate() } else { table.newInsert() } val row = op.getRow row.addString("gamer_id", record.gamerId) row.addString("eff_to", "") row.addString("eff_from", simpleDateFormat.format(System.currentTimeMillis())) row.addLong("last_time_played", record.lastTimePlayed) row.addInt("games_played", record.gamesPlayed) row.addInt("games_won", record.gamesWon) row.addInt("oks", record.oks) row.addInt("deaths", record.deaths) row.addInt("damage_given", record.damageGiven) row.addInt("damage_taken", record.damageTaken) row.addInt("max_oks_in_one_game", record.maxOksInOneGame) row.addInt("max_deaths_in_one_game", record.maxDeathsInOneGame) session.apply(op) } session.flush() kuduClient.close() } }
Example 39
Source File: DirectDataMultiThreadedInjector.scala From SparkOnKudu with Apache License 2.0 | 5 votes |
package org.kududb.spark.demo.gamer.cdc import java.text.SimpleDateFormat import java.util.Random import java.util.concurrent.atomic.AtomicInteger import java.util.concurrent.{TimeUnit, Executors} import org.kududb.client.{Operation, PartialRow, KuduClient} import org.kududb.spark.demo.gamer.aggregates.GamerDataGenerator object DirectDataMultiThreadedInjector { val simpleDateFormat = new SimpleDateFormat("MM,dd,yyyy") val random = new Random def main(args:Array[String]): Unit = { if (args.length == 0) { println("<kuduMaster> <tableName> <numberOfRecords> <numberOfThreads>") return } val kuduMaster = args(0) val tableName = args(1) val numberOfRecords = args(2).toInt val executor = Executors.newFixedThreadPool(args(3).toInt) val numberOfGamers = args(4).toInt val sleepTime = args(5).toInt val kuduClient = new KuduClient.KuduClientBuilder(kuduMaster).build() val leftToRun = new AtomicInteger() for (i <- 0 to numberOfRecords) { leftToRun.incrementAndGet() executor.execute(new ApplyNewRecordRunnable(GamerDataGenerator.makeNewGamerRecord(numberOfGamers), kuduClient, tableName, leftToRun)) println("Summited:" + i) Thread.sleep(sleepTime) } val startTime = System.currentTimeMillis() while (!executor.awaitTermination(10000, TimeUnit.SECONDS)) { val newTime = System.currentTimeMillis() println("> Still Waiting: {Time:" + (newTime - startTime) + ", LeftToRun:" + leftToRun + "}" ) } kuduClient.close() } }
Example 40
Source File: DirectDataInjector.scala From SparkOnKudu with Apache License 2.0 | 5 votes |
package org.kududb.spark.demo.gamer.aggregates import java.util.Random import org.kududb.client.KuduClient object DirectDataInjector { val random = new Random def main(args:Array[String]): Unit = { if (args.length == 0) { println("<kuduMaster> <tableName> <numberOfRecords>") return } val kuduMaster = args(0) val tableName = args(1) val numberOfRecords = args(2).toInt val kuduClient = new KuduClient.KuduClientBuilder(kuduMaster).build() val table = kuduClient.openTable(tableName) val session = kuduClient.newSession() table.newInsert() for (i <- 0 to numberOfRecords) { val record = GamerDataGenerator.makeNewGamerRecord(100000) val op = table.newInsert() val row = op.getRow row.addString("gamer_id", record.gamerId) row.addLong("last_time_played", record.lastTimePlayed) row.addInt("games_played", record.gamesPlayed) row.addInt("games_won", record.gamesWon) row.addInt("oks", record.oks) row.addInt("deaths", record.deaths) row.addInt("damage_given", record.damageGiven) row.addInt("damage_taken", record.damageTaken) row.addInt("max_oks_in_one_game", record.maxOksInOneGame) row.addInt("max_deaths_in_one_game", record.maxDeathsInOneGame) session.apply(op) } session.flush() kuduClient.close() } }
Example 41
Source File: GamerDataGenerator.scala From SparkOnKudu with Apache License 2.0 | 5 votes |
package org.kududb.spark.demo.gamer.aggregates import java.util.{Date, Random} import org.kududb.spark.demo.gamer.GamerEvent object GamerDataGenerator { val random = new Random() val averagePlayerPercentage = 40 val advancedPlayerPercentage = 80 val superStarPlayerPercentage = 100 var date = System.currentTimeMillis() def makeNewGamerRecord(numOfGamers:Int): GamerEvent = { println("date" + new Date(date)) date += 60000 * 60 * 6 val playerSelection = random.nextInt(100) if (playerSelection < averagePlayerPercentage) { val gamerId = random.nextInt(numOfGamers/100) * 100 + playerSelection new GamerEvent(gamerId.toString, date, 1, if (random.nextInt(10) > 7) 1 else 0, random.nextInt(10), random.nextInt(20), random.nextInt(1000), random.nextInt(2000)) } else if (playerSelection < advancedPlayerPercentage) { val gamerId = random.nextInt(numOfGamers/100) * 100 + playerSelection new GamerEvent(gamerId.toString, date, 1, if (random.nextInt(10) > 5) 1 else 0, random.nextInt(20), random.nextInt(18), random.nextInt(2000), random.nextInt(2000)) } else { val gamerId = random.nextInt(numOfGamers/100) * 100 + playerSelection new GamerEvent(gamerId.toString, date, 1, if (random.nextInt(10) > 3) 1 else 0, random.nextInt(20), random.nextInt(10), random.nextInt(4000), random.nextInt(1500)) } } }
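The tier of each generated record is decided by a single draw of random.nextInt(100) against the cumulative percentages 40 / 80 / 100, so roughly 40% of records are average players, 40% advanced and 20% super stars. A standalone sketch of that selection:

import java.util.Random

object PlayerTierSketch {
  // Draw a value in [0, 100) and map it onto the three tiers:
  // 0-39 -> average (40%), 40-79 -> advanced (40%), 80-99 -> super star (20%).
  def pickTier(rnd: Random): String = {
    val roll = rnd.nextInt(100)
    if (roll < 40) "average"
    else if (roll < 80) "advanced"
    else "superstar"
  }

  def main(args: Array[String]): Unit = {
    val rnd = new Random(42)
    val counts = (1 to 10000).map(_ => pickTier(rnd)).groupBy(identity).map { case (tier, xs) => tier -> xs.size }
    println(counts) // roughly 4000 / 4000 / 2000
  }
}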
Example 42
Source File: AddSingleRecord.scala From SparkOnKudu with Apache License 2.0 | 5 votes |
package org.kududb.spark.demo.basic import java.util.Random import org.kududb.client.{PartialRow, KuduClient} object AddSingleRecord { def main(args:Array[String]): Unit = { if (args.length == 0) { println("<kuduMaster> <tableName> <rowKey>") return } val kuduMaster = args(0) val tableName = args(1) val rowKey = args(2) val kuduClient = new KuduClient.KuduClientBuilder(kuduMaster).build() val table = kuduClient.openTable(tableName) val session = kuduClient.newSession() val lowerBound = new PartialRow(table.getSchema) lowerBound.addString(0, rowKey) val upperBound = new PartialRow(table.getSchema) upperBound.addString(0, rowKey + "_") var startTime = System.currentTimeMillis() val random = new Random() startTime = System.currentTimeMillis() val update = table.newInsert() val row = update.getRow row.addString(0, rowKey) val columns = table.getSchema.getColumns for (c <- 1 until columns.size()) { println(columns.get(c).getName + " " + columns.get(c).getType) row.addInt(columns.get(c).getName, random.nextInt(100000)) } session.apply(update) println("new key: " + rowKey) println(" new key time spent: " + (System.currentTimeMillis() - startTime)) startTime = System.currentTimeMillis() val scanner2 = kuduClient.newScannerBuilder(table).lowerBound(lowerBound).exclusiveUpperBound(upperBound).build() while (scanner2.hasMoreRows) { val rows = scanner2.nextRows() while (rows.hasNext) { val row = rows.next() println("NewValue: " + rowKey + " " + row.rowToString()) } } scanner2.close() println(" scan time spent: " + (System.currentTimeMillis() - startTime)) val scannerX = kuduClient.newScannerBuilder(table).build() while (scannerX.hasMoreRows) { val rows = scannerX.nextRows() while (rows.hasNext) { val row = rows.next() println("Full Scan: " + row.rowToString()) } } println("done") kuduClient.shutdown() } }
Example 43
Source File: NameGenerator.scala From SparkOnKudu with Apache License 2.0 | 5 votes |
package org.kududb.spark.demo.basic import java.util.Random import scala.collection.mutable object NameGenerator { val random = new Random() val listOfNames = new mutable.MutableList[NameAndCounter] listOfNames += new NameAndCounter("Katlyn") listOfNames += new NameAndCounter("Laurena") listOfNames += new NameAndCounter("Jenise") listOfNames += new NameAndCounter("Vida") listOfNames += new NameAndCounter("Delphine") listOfNames += new NameAndCounter("Tiffanie") listOfNames += new NameAndCounter("Carroll") listOfNames += new NameAndCounter("Steve") listOfNames += new NameAndCounter("Nu") listOfNames += new NameAndCounter("Robbin") listOfNames += new NameAndCounter("Mahalia") listOfNames += new NameAndCounter("Norah") listOfNames += new NameAndCounter("Selina") listOfNames += new NameAndCounter("Cornelius") listOfNames += new NameAndCounter("Bennie") listOfNames += new NameAndCounter("Kemberly") listOfNames += new NameAndCounter("Johnie") listOfNames += new NameAndCounter("Jenee") listOfNames += new NameAndCounter("Napoleon") listOfNames += new NameAndCounter("Brenton") listOfNames += new NameAndCounter("Roxana") listOfNames += new NameAndCounter("Kalyn") listOfNames += new NameAndCounter("Jeana") listOfNames += new NameAndCounter("Tennie") listOfNames += new NameAndCounter("Tasia") listOfNames += new NameAndCounter("Ashely") listOfNames += new NameAndCounter("Hester") listOfNames += new NameAndCounter("Zita") listOfNames += new NameAndCounter("Evalyn") listOfNames += new NameAndCounter("Anderson") listOfNames += new NameAndCounter("Elaina") listOfNames += new NameAndCounter("Benny") listOfNames += new NameAndCounter("Heidi") listOfNames += new NameAndCounter("Mammie") listOfNames += new NameAndCounter("Alisa") listOfNames += new NameAndCounter("Billie") listOfNames += new NameAndCounter("Wan") listOfNames += new NameAndCounter("Dionna") listOfNames += new NameAndCounter("Julene") listOfNames += new NameAndCounter("Chasidy") listOfNames += new NameAndCounter("Vennie") listOfNames += new NameAndCounter("Cara") listOfNames += new NameAndCounter("Charissa") listOfNames += new NameAndCounter("Russell") listOfNames += new NameAndCounter("Daniela") listOfNames += new NameAndCounter("Kindra") listOfNames += new NameAndCounter("Eduardo") listOfNames += new NameAndCounter("Marci") listOfNames += new NameAndCounter("Gustavo") listOfNames += new NameAndCounter("Dianna ") def getName(): String = { val nameAndCounter = listOfNames.get(random.nextInt(listOfNames.length - 1)).get nameAndCounter.counter += 1 nameAndCounter.name + "_" + nameAndCounter.counter } } class NameAndCounter(val name:String = "N/A", var counter:Int = 0) { }
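Note that Random.nextInt(bound) returns values in [0, bound), so random.nextInt(listOfNames.length - 1) above can never pick the last entry of the list. A uniform pick over every element would use the full length as the bound, as in this small sketch:

import java.util.Random

object UniformPickSketch {
  def pick[T](rnd: Random, items: IndexedSeq[T]): T =
    items(rnd.nextInt(items.length)) // the bound is exclusive, so every index is reachable

  def main(args: Array[String]): Unit = {
    val rnd = new Random()
    println(pick(rnd, Vector("Katlyn", "Laurena", "Jenise")))
  }
}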
Example 44
Source File: BasicExample.scala From SparkOnKudu with Apache License 2.0 | 5 votes |
package org.kududb.spark.demo.basic import java.util import java.util.Random import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} import org.kududb.ColumnSchema.ColumnSchemaBuilder import org.kududb.client.KuduClient import org.kududb.{ColumnSchema, Schema, Type} object BasicExample { def main(args: Array[String]): Unit = { val kuduMaster = "quickstart.cloudera" println(" -- Starting ") val kuduClient = new KuduClient.KuduClientBuilder(kuduMaster).build() try { println(" -- ") val columnList = new util.ArrayList[ColumnSchema]() columnList.add(new ColumnSchemaBuilder("KEY_ID", Type.STRING).key(true).build()) columnList.add(new ColumnSchemaBuilder("COL_A", Type.STRING).key(false).build()) columnList.add(new ColumnSchemaBuilder("COL_B", Type.STRING).key(false).build()) columnList.add(new ColumnSchemaBuilder("COL_C", Type.STRING).key(false).build()) val schema = new Schema(columnList) if (kuduClient.tableExists("foobar")) { kuduClient.deleteTable("foobar") } kuduClient.createTable("foobar", schema) val session = kuduClient.newSession() val table = kuduClient.openTable("foobar") try { val random = new Random() for (i <- 0 until 10) { val insert = table.newInsert() val row = insert.getRow() row.addString(0, i.toString) row.addString(1, "value " + i) row.addString(2, "42:" + i) row.addString(3, "Cat" + random.nextGaussian()) session.apply(insert) } session.flush() } finally { session.close() } val tableList = kuduClient.getTablesList.getTablesList for (i <- 0 until tableList.size()) { println("Table " + i + ":" + tableList.get(i)) } val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") val sc = new SparkContext("local[2]", "SparkSQL on Kudu", sparkConfig) val sqlContext = new SQLContext(sc) val df = sqlContext.load("org.kududb.spark", Map("kudu.table" -> "foobar", "kudu.master" -> kuduMaster)) df.registerTempTable("foobar") sqlContext.sql("SELECT * FROM foobar").foreach(r => { println("Row: " + r) }) } finally { kuduClient.shutdown() } println("-- finished") } }
Example 45
Source File: InitialDataPopulation.scala From SparkOnKudu with Apache License 2.0 | 5 votes |
package org.kududb.spark.demo.basic import java.util import java.util.Random import org.kududb.{Schema, Type, ColumnSchema} import org.kududb.ColumnSchema.ColumnSchemaBuilder import org.kududb.client.{AsyncKuduClient, KuduClient} object InitialDataPopulation { def main(args:Array[String]): Unit = { if (args.length == 0) { println("<kuduMaster> <TableName> <numberOfColumns> <numberOfRows>") //"quickstart.cloudera" return } val kuduMaster = args(0) val tableName = args(1) val numOfColumns = args(2).toInt val numOfRows = args(3).toInt val kuduClient = new AsyncKuduClient.AsyncKuduClientBuilder(kuduMaster).build() try { //Delete table if exist if (kuduClient.tableExists(tableName).join()) { kuduClient.deleteTable(tableName).join() } //Create Schema val columnList = new util.ArrayList[ColumnSchema]() columnList.add(new ColumnSchemaBuilder("key_id", Type.STRING).key(true).build()) for (c <- 0 until numOfColumns) { columnList.add(new ColumnSchemaBuilder("col_" + c, Type.INT32).key(false).build()) } val schema = new Schema(columnList) //Create table kuduClient.createTable(tableName, schema).join() //Populate table val random = new Random val table = kuduClient.openTable(tableName).join() val asyncSession = kuduClient.newSession() for (r <- 0 until numOfRows) { val insert = table.newInsert() val row = insert.getRow() row.addString(0, NameGenerator.getName()) val columns = table.getSchema.getColumns for (c <- 1 until columns.size()) { row.addInt(columns.get(c).getName, random.nextInt(100000)) } asyncSession.apply(insert) if (r % 1000 == 0) { println("Inserted: " + r) } } asyncSession.flush() val scannerX = kuduClient.newScannerBuilder(table).build() while (scannerX.hasMoreRows) { val rows = scannerX.nextRows().join() while (rows.hasNext) { val row = rows.next() println(" - " + row.rowToString()) } } asyncSession.close() } finally { kuduClient.shutdown() } } }
Example 46
Source File: SOLStreamProducer.scala From incubator-retired-gearpump with Apache License 2.0 | 5 votes |
package org.apache.gearpump.streaming.examples.sol import java.time.Instant import java.util.Random import org.apache.gearpump.Message import org.apache.gearpump.cluster.UserConfig import org.apache.gearpump.streaming.examples.sol.SOLStreamProducer._ import org.apache.gearpump.streaming.source.Watermark import org.apache.gearpump.streaming.task.{Task, TaskContext} class SOLStreamProducer(taskContext: TaskContext, conf: UserConfig) extends Task(taskContext, conf) { import taskContext.output private val sizeInBytes = conf.getInt(SOLStreamProducer.BYTES_PER_MESSAGE) .getOrElse(DEFAULT_MESSAGE_SIZE) private var messages: Array[String] = null private var rand: Random = null private var messageCount: Long = 0 override def onStart(startTime: Instant): Unit = { prepareRandomMessage self ! Watermark(Instant.now) } private def prepareRandomMessage = { rand = new Random() val differentMessages = 100 messages = new Array(differentMessages) 0.until(differentMessages).map { index => val sb = new StringBuilder(sizeInBytes) // Even though java encodes strings in UCS2, the serialized version sent by the tuples // is UTF8, so it should be a single byte 0.until(sizeInBytes).foldLeft(sb) { (sb, j) => sb.append(rand.nextInt(9)) } messages(index) = sb.toString() } } override def onNext(msg: Message): Unit = { val message = messages(rand.nextInt(messages.length)) output(Message(message, System.currentTimeMillis())) messageCount = messageCount + 1L self ! Watermark(Instant.now) } } object SOLStreamProducer { val DEFAULT_MESSAGE_SIZE = 100 // Bytes val BYTES_PER_MESSAGE = "bytesPerMessage" }
Example 47
Source File: ScalaClientTestUtils.scala From incubator-livy with Apache License 2.0 | 5 votes |
package org.apache.livy.scalaapi import java.util.Random import java.util.concurrent.{CountDownLatch, TimeUnit} import scala.collection.mutable.ArrayBuffer import scala.concurrent.{Await, Future} import scala.concurrent.duration._ import org.scalatest.FunSuite import org.apache.livy.LivyBaseUnitTestSuite object ScalaClientTestUtils extends FunSuite with LivyBaseUnitTestSuite { val Timeout = 40 def helloJob(context: ScalaJobContext): String = "hello" def throwExceptionJob(context: ScalaJobContext): Unit = throw new CustomTestFailureException def simpleSparkJob(context: ScalaJobContext): Long = { val r = new Random val count = 5 val partitions = Math.min(r.nextInt(10) + 1, count) val buffer = new ArrayBuffer[Int]() for (a <- 1 to count) { buffer += r.nextInt() } context.sc.parallelize(buffer, partitions).count() } def assertAwait(lock: CountDownLatch): Unit = { assert(lock.await(Timeout, TimeUnit.SECONDS) == true) } def assertTestPassed[T](future: Future[T], expectedValue: T): Unit = { val result = Await.result(future, Timeout second) assert(result === expectedValue) } }
Example 48
Source File: SimpleSkewedGroupByTest.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import org.apache.spark.sql.SparkSession object SimpleSkewedGroupByTest { def main(args: Array[String]) { val spark = SparkSession .builder .appName("SimpleSkewedGroupByTest") .getOrCreate() val numMappers = if (args.length > 0) args(0).toInt else 2 val numKVPairs = if (args.length > 1) args(1).toInt else 1000 val valSize = if (args.length > 2) args(2).toInt else 1000 val numReducers = if (args.length > 3) args(3).toInt else numMappers val ratio = if (args.length > 4) args(4).toInt else 5.0 val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p => val ranGen = new Random val result = new Array[(Int, Array[Byte])](numKVPairs) for (i <- 0 until numKVPairs) { val byteArr = new Array[Byte](valSize) ranGen.nextBytes(byteArr) val offset = ranGen.nextInt(1000) * numReducers if (ranGen.nextDouble < ratio / (numReducers + ratio - 1)) { // give ratio times higher chance of generating key 0 (for reducer 0) result(i) = (offset, byteArr) } else { // generate a key for one of the other reducers val key = 1 + ranGen.nextInt(numReducers-1) + offset result(i) = (key, byteArr) } } result }.cache // Enforce that everything has been calculated and in cache pairs1.count println("RESULT: " + pairs1.groupByKey(numReducers).count) // Print how many keys each reducer got (for debugging) // println("RESULT: " + pairs1.groupByKey(numReducers) // .map{case (k,v) => (k, v.size)} // .collectAsMap) spark.stop() } } // scalastyle:on println
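The skew comes from the branch on ranGen.nextDouble: reducer 0 is chosen with probability ratio / (numReducers + ratio - 1), which is exactly ratio times the probability 1 / (numReducers + ratio - 1) of any other reducer. A small standalone sketch (no Spark required) makes the resulting distribution visible:

import java.util.Random

object SkewRatioSketch {
  def main(args: Array[String]): Unit = {
    val ranGen = new Random
    val numReducers = 4
    val ratio = 5.0
    // Reducer 0 is picked with probability ratio / (numReducers + ratio - 1),
    // every other reducer with probability 1 / (numReducers + ratio - 1).
    val counts = Array.ofDim[Int](numReducers)
    for (_ <- 0 until 100000) {
      val reducer =
        if (ranGen.nextDouble < ratio / (numReducers + ratio - 1)) 0
        else 1 + ranGen.nextInt(numReducers - 1)
      counts(reducer) += 1
    }
    println(counts.mkString(", ")) // roughly 62500, 12500, 12500, 12500
  }
}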
Example 49
Source File: SkewedGroupByTest.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import org.apache.spark.sql.SparkSession object SkewedGroupByTest { def main(args: Array[String]) { val spark = SparkSession .builder .appName("GroupBy Test") .getOrCreate() val numMappers = if (args.length > 0) args(0).toInt else 2 var numKVPairs = if (args.length > 1) args(1).toInt else 1000 val valSize = if (args.length > 2) args(2).toInt else 1000 val numReducers = if (args.length > 3) args(3).toInt else numMappers val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p => val ranGen = new Random // map output sizes linearly increase from the 1st to the last numKVPairs = (1.0 * (p + 1) / numMappers * numKVPairs).toInt val arr1 = new Array[(Int, Array[Byte])](numKVPairs) for (i <- 0 until numKVPairs) { val byteArr = new Array[Byte](valSize) ranGen.nextBytes(byteArr) arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr) } arr1 }.cache() // Enforce that everything has been calculated and in cache pairs1.count() println(pairs1.groupByKey(numReducers).count()) spark.stop() } } // scalastyle:on println
Example 50
Source File: SparkHdfsLR.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{DenseVector, Vector} import org.apache.spark.sql.SparkSession object SparkHdfsLR { val D = 10 // Number of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val tok = new java.util.StringTokenizer(line, " ") var y = tok.nextToken.toDouble var x = new Array[Double](D) var i = 0 while (i < D) { x(i) = tok.nextToken.toDouble; i += 1 } DataPoint(new DenseVector(x), y) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { if (args.length < 2) { System.err.println("Usage: SparkHdfsLR <file> <iters>") System.exit(1) } showWarning() val spark = SparkSession .builder .appName("SparkHdfsLR") .getOrCreate() val inputPath = args(0) val lines = spark.read.textFile(inputPath).rdd val points = lines.map(parsePoint).cache() val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) spark.stop() } } // scalastyle:on println
Example 51
Source File: LocalLR.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import breeze.linalg.{DenseVector, Vector} object LocalLR { val N = 10000 // Number of data points val D = 10 // Number of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData: Array[DataPoint] = { def generatePoint(i: Int): DataPoint = { val y = if (i % 2 == 0) -1 else 1 val x = DenseVector.fill(D) {rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData // Initialize w to a random value var w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) var gradient = DenseVector.zeros[Double](D) for (p <- data) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println("Final w: " + w) } } // scalastyle:on println
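Both the synthetic data and the starting weights come from the same seeded Random: each feature is a Gaussian shifted by y * R, and every initial weight is drawn uniformly from [-1, 1) via 2 * rand.nextDouble - 1. Stripped of the gradient loop, the initialization can be sketched as:

import java.util.Random

object LrInitSketch {
  val D = 10   // number of dimensions
  val R = 0.7  // scaling factor
  val rand = new Random(42)

  // Same idea as generatePoint above: alternate the label by index,
  // then shift each Gaussian feature by y * R.
  def generatePoint(i: Int): (Array[Double], Int) = {
    val y = if (i % 2 == 0) -1 else 1
    val x = Array.fill(D)(rand.nextGaussian + y * R)
    (x, y)
  }

  def main(args: Array[String]): Unit = {
    val (x, y) = generatePoint(0)
    // Initial weights drawn uniformly from [-1, 1): 2 * nextDouble - 1.
    val w = Array.fill(D)(2 * rand.nextDouble - 1)
    println(s"y = $y, x = ${x.mkString(", ")}")
    println(s"initial w = ${w.mkString(", ")}")
  }
}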
Example 52
Source File: GroupByTest.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import org.apache.spark.sql.SparkSession object GroupByTest { def main(args: Array[String]) { val spark = SparkSession .builder .appName("GroupBy Test") .getOrCreate() val numMappers = if (args.length > 0) args(0).toInt else 2 val numKVPairs = if (args.length > 1) args(1).toInt else 1000 val valSize = if (args.length > 2) args(2).toInt else 1000 val numReducers = if (args.length > 3) args(3).toInt else numMappers val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p => val ranGen = new Random val arr1 = new Array[(Int, Array[Byte])](numKVPairs) for (i <- 0 until numKVPairs) { val byteArr = new Array[Byte](valSize) ranGen.nextBytes(byteArr) arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr) } arr1 }.cache() // Enforce that everything has been calculated and in cache pairs1.count() println(pairs1.groupByKey(numReducers).count()) spark.stop() } } // scalastyle:on println
Example 53
Source File: LocalFileLR.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import breeze.linalg.{DenseVector, Vector} object LocalFileLR { val D = 10 // Number of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val nums = line.split(' ').map(_.toDouble) DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0)) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val lines = scala.io.Source.fromFile(args(0)).getLines().toArray val points = lines.map(parsePoint _) val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) var gradient = DenseVector.zeros[Double](D) for (p <- points) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println("Final w: " + w) } } // scalastyle:on println
Example 54
Source File: PageViewGenerator.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming.clickstream import java.io.PrintWriter import java.net.ServerSocket import java.util.Random // scalastyle:on object PageViewGenerator { val pages = Map("http://foo.com/" -> .7, "http://foo.com/news" -> 0.2, "http://foo.com/contact" -> .1) val httpStatus = Map(200 -> .95, 404 -> .05) val userZipCode = Map(94709 -> .5, 94117 -> .5) val userID = Map((1 to 100).map(_ -> .01): _*) def pickFromDistribution[T](inputMap: Map[T, Double]): T = { val rand = new Random().nextDouble() var total = 0.0 for ((item, prob) <- inputMap) { total = total + prob if (total > rand) { return item } } inputMap.take(1).head._1 // Shouldn't get here if probabilities add up to 1.0 } def getNextClickEvent(): String = { val id = pickFromDistribution(userID) val page = pickFromDistribution(pages) val status = pickFromDistribution(httpStatus) val zipCode = pickFromDistribution(userZipCode) new PageView(page, status, zipCode, id).toString() } def main(args: Array[String]) { if (args.length != 2) { System.err.println("Usage: PageViewGenerator <port> <viewsPerSecond>") System.exit(1) } val port = args(0).toInt val viewsPerSecond = args(1).toFloat val sleepDelayMs = (1000.0 / viewsPerSecond).toInt val listener = new ServerSocket(port) println("Listening on port: " + port) while (true) { val socket = listener.accept() new Thread() { override def run(): Unit = { println("Got client connected from: " + socket.getInetAddress) val out = new PrintWriter(socket.getOutputStream(), true) while (true) { Thread.sleep(sleepDelayMs) out.write(getNextClickEvent()) out.flush() } socket.close() } }.start() } } } // scalastyle:on println
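pickFromDistribution is a plain cumulative-probability walk: draw one uniform value, then add up the probabilities until the running total exceeds it. The same technique, as a self-contained sketch:

import java.util.Random

object WeightedPickSketch {
  // Draw one uniform value, then walk the (item, probability) pairs
  // accumulating mass until the draw is exceeded.
  def pickFromDistribution[T](rnd: Random, dist: Map[T, Double]): T = {
    val roll = rnd.nextDouble()
    var total = 0.0
    for ((item, prob) <- dist) {
      total += prob
      if (total > roll) return item
    }
    dist.head._1 // only reached if the probabilities sum to less than 1.0
  }

  def main(args: Array[String]): Unit = {
    val rnd = new Random()
    val httpStatus = Map(200 -> 0.95, 404 -> 0.05)
    val sample = (1 to 10000).map(_ => pickFromDistribution(rnd, httpStatus))
    println(sample.groupBy(identity).map { case (k, v) => k -> v.size }) // about 9500 vs 500
  }
}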
Example 55
Source File: SparkLR.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{DenseVector, Vector} import org.apache.spark.sql.SparkSession object SparkLR { val N = 10000 // Number of data points val D = 10 // Number of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData: Array[DataPoint] = { def generatePoint(i: Int): DataPoint = { val y = if (i % 2 == 0) -1 else 1 val x = DenseVector.fill(D) {rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val spark = SparkSession .builder .appName("SparkLR") .getOrCreate() val numSlices = if (args.length > 0) args(0).toInt else 2 val points = spark.sparkContext.parallelize(generateData, numSlices).cache() // Initialize w to a random value var w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) spark.stop() } } // scalastyle:on println
Example 56
Source File: LocalKMeans.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{squaredDistance, DenseVector, Vector} object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData: Array[DenseVector[Double]] = { def generatePoint(i: Int): DenseVector[Double] = { DenseVector.fill(D) {rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var index = 0 var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers.get(i).get val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use org.apache.spark.ml.clustering.KMeans |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData var points = new HashSet[Vector[Double]] var kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println("Initial centers: " + kPoints) while(tempDist > convergeDist) { var closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) var mappings = closest.groupBy[Int] (x => x._1) var pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println("Final centers: " + kPoints) } } // scalastyle:on println
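The initial centers are chosen by repeatedly drawing random rows until K distinct points have been collected; the set silently drops duplicates. A reduced sketch of that initialization (plain arrays converted to Seq for value-based equality, instead of breeze vectors):

import java.util.Random
import scala.collection.mutable

object InitialCentersSketch {
  val N = 1000
  val D = 10
  val K = 10
  val rand = new Random(42)

  def main(args: Array[String]): Unit = {
    val data = Array.fill(N)(Array.fill(D)(rand.nextDouble * 1000))
    // Keep drawing random rows until K distinct points have been collected.
    // Keying the set on the point's values (toSeq) rather than array identity
    // is what makes duplicate draws collapse into one entry.
    val centers = mutable.Set.empty[Seq[Double]]
    while (centers.size < K) {
      centers.add(data(rand.nextInt(N)).toSeq)
    }
    println(s"picked ${centers.size} distinct initial centers")
  }
}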
Example 57
Source File: StopwatchSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.util import java.util.Random import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext class StopwatchSuite extends SparkFunSuite with MLlibTestSparkContext { import StopwatchSuite._ private def testStopwatchOnDriver(sw: Stopwatch): Unit = { assert(sw.name === "sw") assert(sw.elapsed() === 0L) assert(!sw.isRunning) intercept[AssertionError] { sw.stop() } val duration = checkStopwatch(sw) val elapsed = sw.elapsed() assert(elapsed === duration) val duration2 = checkStopwatch(sw) val elapsed2 = sw.elapsed() assert(elapsed2 === duration + duration2) assert(sw.toString === s"sw: ${elapsed2}ms") sw.start() assert(sw.isRunning) intercept[AssertionError] { sw.start() } } test("LocalStopwatch") { val sw = new LocalStopwatch("sw") testStopwatchOnDriver(sw) } test("DistributedStopwatch on driver") { val sw = new DistributedStopwatch(sc, "sw") testStopwatchOnDriver(sw) } test("DistributedStopwatch on executors") { val sw = new DistributedStopwatch(sc, "sw") val rdd = sc.parallelize(0 until 4, 4) val acc = sc.longAccumulator rdd.foreach { i => acc.add(checkStopwatch(sw)) } assert(!sw.isRunning) val elapsed = sw.elapsed() assert(elapsed === acc.value) } test("MultiStopwatch") { val sw = new MultiStopwatch(sc) .addLocal("local") .addDistributed("spark") assert(sw("local").name === "local") assert(sw("spark").name === "spark") intercept[NoSuchElementException] { sw("some") } assert(sw.toString === "{\n local: 0ms,\n spark: 0ms\n}") val localDuration = checkStopwatch(sw("local")) val sparkDuration = checkStopwatch(sw("spark")) val localElapsed = sw("local").elapsed() val sparkElapsed = sw("spark").elapsed() assert(localElapsed === localDuration) assert(sparkElapsed === sparkDuration) assert(sw.toString === s"{\n local: ${localElapsed}ms,\n spark: ${sparkElapsed}ms\n}") val rdd = sc.parallelize(0 until 4, 4) val acc = sc.longAccumulator rdd.foreach { i => sw("local").start() val duration = checkStopwatch(sw("spark")) sw("local").stop() acc.add(duration) } val localElapsed2 = sw("local").elapsed() assert(localElapsed2 === localElapsed) val sparkElapsed2 = sw("spark").elapsed() assert(sparkElapsed2 === sparkElapsed + acc.value) } } private object StopwatchSuite extends SparkFunSuite { private def now: Long = System.currentTimeMillis() }
Example 58
Source File: PartitionwiseSampledRDD.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.util.Random import scala.reflect.ClassTag import org.apache.spark.{Partition, TaskContext} import org.apache.spark.util.random.RandomSampler import org.apache.spark.util.Utils private[spark] class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long) extends Partition with Serializable { override val index: Int = prev.index } private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag]( prev: RDD[T], sampler: RandomSampler[T, U], preservesPartitioning: Boolean, @transient private val seed: Long = Utils.random.nextLong) extends RDD[U](prev) { @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None override def getPartitions: Array[Partition] = { val random = new Random(seed) firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong())) } override def getPreferredLocations(split: Partition): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev) override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = { val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition] val thisSampler = sampler.clone thisSampler.setSeed(split.seed) thisSampler.sample(firstParent[T].iterator(split.prev, context)) } }
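The per-partition seeds are derived deterministically from the single master seed: a Random seeded with it hands out one nextLong() per partition, so the same master seed always reproduces the same sample. A minimal sketch of that derivation:

import java.util.Random

object PartitionSeedSketch {
  // One deterministic seed per partition, derived from a single master seed,
  // mirroring what getPartitions does above.
  def partitionSeeds(masterSeed: Long, numPartitions: Int): Array[Long] = {
    val random = new Random(masterSeed)
    Array.fill(numPartitions)(random.nextLong())
  }

  def main(args: Array[String]): Unit = {
    val a = partitionSeeds(1234L, 4)
    val b = partitionSeeds(1234L, 4)
    println(a.mkString(", "))
    println(a.sameElements(b)) // true: reproducible across runs
  }
}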
Example 59
Source File: SampledRDD.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.util.Random import scala.reflect.ClassTag import org.apache.commons.math3.distribution.PoissonDistribution import org.apache.spark.{Partition, TaskContext} @deprecated("Replaced by PartitionwiseSampledRDDPartition", "1.0.0") private[spark] class SampledRDDPartition(val prev: Partition, val seed: Int) extends Partition with Serializable { override val index: Int = prev.index } @deprecated("Replaced by PartitionwiseSampledRDD", "1.0.0") private[spark] class SampledRDD[T: ClassTag]( prev: RDD[T], withReplacement: Boolean, frac: Double, seed: Int) extends RDD[T](prev) { override def getPartitions: Array[Partition] = { val rg = new Random(seed) firstParent[T].partitions.map(x => new SampledRDDPartition(x, rg.nextInt)) } override def getPreferredLocations(split: Partition): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[SampledRDDPartition].prev) override def compute(splitIn: Partition, context: TaskContext): Iterator[T] = { val split = splitIn.asInstanceOf[SampledRDDPartition] if (withReplacement) { // For large datasets, the expected number of occurrences of each element in a sample with // replacement is Poisson(frac). We use that to get a count for each element. val poisson = new PoissonDistribution(frac) poisson.reseedRandomGenerator(split.seed) firstParent[T].iterator(split.prev, context).flatMap { element => val count = poisson.sample() if (count == 0) { Iterator.empty // Avoid object allocation when we return 0 items, which is quite often } else { Iterator.fill(count)(element) } } } else { // Sampling without replacement val rand = new Random(split.seed) firstParent[T].iterator(split.prev, context).filter(x => (rand.nextDouble <= frac)) } } }
Example 60
Source File: PartitionwiseSampledRDD.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.util.Random import scala.reflect.ClassTag import org.apache.spark.{Partition, TaskContext} import org.apache.spark.util.random.RandomSampler import org.apache.spark.util.Utils private[spark] class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long) extends Partition with Serializable { override val index: Int = prev.index } private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag]( prev: RDD[T], sampler: RandomSampler[T, U], @transient preservesPartitioning: Boolean, @transient seed: Long = Utils.random.nextLong) extends RDD[U](prev) { @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None override def getPartitions: Array[Partition] = { val random = new Random(seed) firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong())) } override def getPreferredLocations(split: Partition): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev) override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = { val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition] val thisSampler = sampler.clone thisSampler.setSeed(split.seed) thisSampler.sample(firstParent[T].iterator(split.prev, context)) } }
Example 61
Source File: RandomProjection.scala From spark-neighbors with MIT License | 5 votes |
package com.github.karlhigley.spark.neighbors.linalg import java.util.Random import breeze.stats.distributions.CauchyDistribution import org.apache.spark.mllib.linalg.{ DenseMatrix, Matrices } import org.apache.spark.mllib.linalg.{ DenseVector, Vector } object RandomProjection { def generateGaussian(originalDim: Int, projectedDim: Int, random: Random): RandomProjection = { val localMatrix = DenseMatrix.randn(projectedDim, originalDim, random) new RandomProjection(localMatrix) } def generateCauchy(originalDim: Int, projectedDim: Int, random: Random): RandomProjection = { def randc(numRows: Int, numCols: Int): DenseMatrix = { require( numRows.toLong * numCols <= Int.MaxValue, s"$numRows x $numCols dense matrix is too large to allocate" ) val cauchyDistribution = new CauchyDistribution(0, 1) new DenseMatrix(numRows, numCols, cauchyDistribution.drawMany(numRows * numCols)) } val localMatrix = randc(projectedDim, originalDim) new RandomProjection(localMatrix) } }
Example 62
Source File: BitSamplingFunction.scala From spark-neighbors with MIT License | 5 votes |
package com.github.karlhigley.spark.neighbors.lsh import java.util.Random import scala.collection.immutable.BitSet import org.apache.spark.mllib.linalg.SparseVector object BitSamplingFunction { def generate( originalDim: Int, signatureLength: Int, random: Random = new Random ): BitSamplingFunction = { val indices = Array.fill(signatureLength) { random.nextInt(originalDim) } new BitSamplingFunction(indices) } }
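generate simply samples signatureLength random bit positions (with replacement) from the original dimensionality; a signature is then determined by which of those sampled positions are set in a vector. The signature computation itself is not part of this snippet, so the helper below is an illustrative sketch, not the project's API:

import java.util.Random
import scala.collection.immutable.BitSet

object BitSamplingSketch {
  // Pick signatureLength random bit positions once (with replacement).
  def sampleIndices(rnd: Random, originalDim: Int, signatureLength: Int): Array[Int] =
    Array.fill(signatureLength)(rnd.nextInt(originalDim))

  // Hypothetical signature: keep the positions (among the sampled ones) that are set.
  def signature(setBits: BitSet, indices: Array[Int]): BitSet =
    BitSet(indices.zipWithIndex.collect { case (bit, i) if setBits.contains(bit) => i }: _*)

  def main(args: Array[String]): Unit = {
    val rnd = new Random(1)
    val indices = sampleIndices(rnd, originalDim = 64, signatureLength = 16)
    val vector = BitSet(3, 7, 15, 42, 63) // indices of the non-zero components
    println(signature(vector, indices))
  }
}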
Example 63
Source File: MinhashFunction.scala From spark-neighbors with MIT License | 5 votes |
package com.github.karlhigley.spark.neighbors.lsh import java.util.Random import org.apache.spark.mllib.linalg.SparseVector object MinhashFunction { def generate( dimensions: Int, signatureLength: Int, prime: Int, random: Random = new Random ): MinhashFunction = { val perms = new Array[PermutationFunction](signatureLength) var i = 0 while (i < signatureLength) { perms(i) = PermutationFunction.random(dimensions, prime, random) i += 1 } new MinhashFunction(perms) } }
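Each signature element comes from one random permutation of the dimension indices. The PermutationFunction type is not shown in this snippet; a common choice, used here purely as an illustration, is the universal hash h(x) = (a * x + b) mod prime, with the minhash being the minimum permuted value over a set's elements:

import java.util.Random

object MinhashPermutationSketch {
  // Illustrative permutation: h(x) = (a * x + b) mod prime, with a in [1, prime) and b in [0, prime).
  final case class Permutation(a: Int, b: Int, prime: Int) {
    def apply(x: Int): Int = ((a.toLong * x + b) % prime).toInt
  }

  def randomPermutation(prime: Int, rnd: Random): Permutation =
    Permutation(1 + rnd.nextInt(prime - 1), rnd.nextInt(prime), prime)

  // One minhash signature element: the minimum permuted value over the set.
  def minhash(perm: Permutation, elements: Seq[Int]): Int =
    elements.map(perm.apply).min

  def main(args: Array[String]): Unit = {
    val rnd = new Random(7)
    val perms = Array.fill(4)(randomPermutation(1009, rnd)) // 1009 is a prime above the dimensionality
    val setA = Seq(1, 5, 9, 42)
    println(perms.map(p => minhash(p, setA)).mkString(", "))
  }
}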
Example 64
Source File: TestLoadDataWithJunkChars.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.integration.spark.testsuite.dataload import java.io.{BufferedWriter, File, FileWriter} import java.util.Random import org.apache.spark.sql.Row import org.apache.spark.sql.test.util.QueryTest import org.scalatest.BeforeAndAfterAll class TestLoadDataWithJunkChars extends QueryTest with BeforeAndAfterAll { var filePath = "" val junkchars = "ǍǎǏǐǑǒǓǔǕǖǗǘǙǚǛǜǝǞǟǠǡǢǣǤǥǦǧǨǩǪǫǬǭǮǯǰ" def buildTestData() = { filePath = s"$integrationPath/spark/target/junkcharsdata.csv" val file = new File(filePath) val writer = new BufferedWriter(new FileWriter(file)) writer.write("c1,c2\n") val random = new Random for (i <- 1 until 1000) { writer.write("a" + i + "," + junkchars + "\n") if ( i % 100 == 0) { writer.flush() } } writer.write("a1000000," + junkchars) writer.close } test("[bug]fix bug of duplicate rows in UnivocityCsvParser #877") { buildTestData() sql("drop table if exists junkcharsdata") sql("""create table if not exists junkcharsdata (c1 string, c2 string) STORED AS carbondata""") sql(s"LOAD DATA LOCAL INPATH '$filePath' into table junkcharsdata") checkAnswer(sql("select count(*) from junkcharsdata"), Seq(Row(1000))) sql("drop table if exists junkcharsdata") new File(filePath).delete() } }
Example 65
Source File: DoubleDataTypeTestCase.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.integration.spark.testsuite.primitiveTypes import java.util.Random import org.apache.spark.sql.test.util.QueryTest import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Row, SaveMode} import org.scalatest.BeforeAndAfterAll class DoubleDataTypeTestCase extends QueryTest with BeforeAndAfterAll { lazy val df: DataFrame = generateDataFrame private def generateDataFrame(): DataFrame = { val r = new Random() val rdd = sqlContext.sparkContext .parallelize(1 to 10, 2) .map { x => Row(x, "London" + (x % 2), x.toDouble / 13, x.toDouble / 11) } val schema = StructType( Seq( StructField("id", IntegerType, nullable = false), StructField("city", StringType, nullable = false), StructField("m1", DoubleType, nullable = false), StructField("m2", DoubleType, nullable = false) ) ) sqlContext.createDataFrame(rdd, schema) } override def beforeAll { sql("drop table if exists uniq_carbon") sql("drop table if exists uniq_hive") sql("drop table if exists doubleTypeCarbonTable") sql("drop table if exists doubleTypeHiveTable") df.write .format("carbondata") .option("tableName", "doubleTypeCarbonTable") .option("tempCSV", "false") .option("table_blocksize", "32") .mode(SaveMode.Overwrite) .save() df.write .mode(SaveMode.Overwrite) .saveAsTable("doubleTypeHiveTable") } test("detail query") { checkAnswer(sql("select * from doubleTypeCarbonTable order by id"), sql("select * from doubleTypeHiveTable order by id")) } test("duplicate values") { sql("create table uniq_carbon(name string, double_column double) STORED AS carbondata ") sql(s"load data inpath '$resourcesPath/uniq.csv' into table uniq_carbon") sql("create table uniq_hive(name string, double_column double) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','") sql(s"load data local inpath '$resourcesPath/uniqwithoutheader.csv' into table uniq_hive") checkAnswer(sql("select * from uniq_carbon where double_column>=11"), sql("select * from uniq_hive where double_column>=11")) } // test("agg query") { // checkAnswer(sql("select city, sum(m1), avg(m1), count(m1), max(m1), min(m1) from doubleTypeCarbonTable group by city"), // sql("select city, sum(m1), avg(m1), count(m1), max(m1), min(m1) from doubleTypeHiveTable group by city")) // // checkAnswer(sql("select city, sum(m2), avg(m2), count(m2), max(m2), min(m2) from doubleTypeCarbonTable group by city"), // sql("select city, sum(m2), avg(m2), count(m2), max(m2), min(m2) from doubleTypeHiveTable group by city")) // } override def afterAll { sql("drop table if exists uniq_carbon") sql("drop table if exists uniq_hive") sql("drop table if exists doubleTypeCarbonTable") sql("drop table if exists doubleTypeHiveTable") } }
Example 66
Source File: TestSource.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbon.flink import java.util.Random import org.apache.flink.api.common.state.{ListState, ListStateDescriptor} import org.apache.flink.runtime.state.{FunctionInitializationContext, FunctionSnapshotContext} import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction import org.apache.flink.streaming.api.functions.source.SourceFunction abstract class TestSource(val dataCount: Int) extends SourceFunction[Array[AnyRef]] with CheckpointedFunction { private var dataIndex = 0 private var dataIndexState: ListState[Integer] = _ private var running = false @throws[Exception] def get(index: Int): Array[AnyRef] @throws[Exception] def onFinish(): Unit = { // to do nothing. } @throws[Exception] override def run(sourceContext: SourceFunction.SourceContext[Array[AnyRef]]): Unit = { this.running = true while ( { this.running && this.dataIndex < this.dataCount }) { sourceContext.collectWithTimestamp(this.get(this.dataIndex), System.currentTimeMillis) this.dataIndex += 1 } this.onFinish() } override def cancel(): Unit = { this.running = false } @throws[Exception] override def snapshotState(context: FunctionSnapshotContext): Unit = { this.dataIndexState.clear() this.dataIndexState.add(this.dataIndex) } @throws[Exception] override def initializeState(context: FunctionInitializationContext): Unit = { this.dataIndexState = context.getOperatorStateStore.getListState(new ListStateDescriptor[Integer]("dataIndex", classOf[Integer])) if (!context.isRestored) return import scala.collection.JavaConversions._ for (dataIndex <- this.dataIndexState.get) { this.dataIndex = dataIndex } } } object TestSource { val randomCache = new ThreadLocal[Random] { override def initialValue(): Random = new Random() } }
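The randomCache companion above hands every task thread its own Random instance, so source subclasses running on different threads neither contend on nor correlate through a shared generator. The same pattern outside Flink, as a small sketch (thread count and names are illustrative):

import java.util.Random

object ThreadLocalRandomSketch {
  // One Random per thread, created lazily on first access from that thread.
  private val randomCache = new ThreadLocal[Random] {
    override def initialValue(): Random = new Random()
  }

  def main(args: Array[String]): Unit = {
    val threads = (1 to 4).map { id =>
      new Thread(new Runnable {
        override def run(): Unit = {
          val r = randomCache.get() // each thread sees its own instance
          println(s"thread $id drew ${r.nextInt(100)}")
        }
      })
    }
    threads.foreach(_.start())
    threads.foreach(_.join())
  }
}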
Example 67
Source File: AppleCustomPartitioner.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.partitioning import java.util.Random import org.apache.spark.Partitioner class AppleCustomPartitioner(numOfParts:Int) extends Partitioner { override def numPartitions: Int = numOfParts def random = new Random() override def getPartition(key: Any): Int = { val k = key.asInstanceOf[(String, Long)] val ticker = k._1 if (ticker.equals("apple")) { val saltedTicker = ticker + random.nextInt(9) Math.abs(saltedTicker.hashCode) % numPartitions } else { Math.abs(ticker.hashCode) % numPartitions } } }
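The partitioner above salts only the hot ticker "apple" with a random digit, so its records spread over several partitions instead of all landing on one. A quick way to see the effect without running a Spark job is to compute the partition assignment by hand (a sketch; the keys and partition count are made up):

import java.util.Random

object SaltedPartitionDemo {
  def main(args: Array[String]): Unit = {
    val numPartitions = 8
    val random = new Random()
    val keys = Seq.fill(10)(("apple", 1L)) ++ Seq(("orange", 2L), ("pear", 3L))
    keys.foreach { case (ticker, ts) =>
      // Same rule as getPartition: salt the hot key, hash everything else as-is.
      val effectiveKey = if (ticker == "apple") ticker + random.nextInt(9) else ticker
      val partition = Math.abs(effectiveKey.hashCode) % numPartitions
      println(s"($ticker, $ts) -> partition $partition")
    }
  }
}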
Example 68
Source File: SaltedExample.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.salted import java.util.Random import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SparkSession object SaltedExample { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args:Array[String]): Unit = { val jsonPath = args(0) val sparkSession = SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .getOrCreate() val jsonDfLeft = sparkSession.read.json(jsonPath) val saltedLeft = jsonDfLeft.rdd.flatMap(r => { val group = r.getAs[String]("group") val value = r.getAs[Long]("value") Seq((group + "_" + 0, value),(group + "_" + 1, value)) }) val jsonDfRight = sparkSession.read.json(jsonPath) val saltedRight = jsonDfRight.rdd.mapPartitions(it => { val random = new Random() it.map(r => { val group = r.getAs[String]("group") val value = r.getAs[Long]("value") (group + "_" + random.nextInt(2), value) }) }) jsonDfLeft.join(jsonDfRight).collect().foreach(r => { println("Normal.result:" + r) }) println("----") saltedLeft.join(saltedRight).collect().foreach(r => { println("Salted.result:" + r) }) } }
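The essence of the salted join above is a key rewrite: every left-side record is replicated into each of N salt buckets and every right-side record picks one bucket at random, so a skewed key is processed by N tasks instead of one. A Spark-free sketch of just that key transformation, with N = 2 as in the example (the data and names are illustrative):

import java.util.Random

object SaltedJoinKeysSketch {
  def main(args: Array[String]): Unit = {
    val salts = 2
    val random = new Random()
    val left = Seq(("hot", 1L), ("cold", 2L))
    val right = Seq(("hot", 10L), ("hot", 11L), ("cold", 20L))
    // Left side: replicate each record into every salt bucket.
    val saltedLeft = left.flatMap { case (k, v) => (0 until salts).map(s => (s"${k}_$s", v)) }
    // Right side: each record lands in one randomly chosen bucket.
    val saltedRight = right.map { case (k, v) => (s"${k}_${random.nextInt(salts)}", v) }
    // Joining on the salted keys yields the same pairs as joining on the raw keys.
    val joined = for {
      (lk, lv) <- saltedLeft
      (rk, rv) <- saltedRight
      if lk == rk
    } yield (lk, (lv, rv))
    joined.foreach(println)
  }
}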
Example 69
Source File: SessionDataFileHDFSWriter.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.streaming.dstream.sessionization import java.io.BufferedWriter import java.io.FileWriter import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.conf.Configuration import java.io.OutputStreamWriter import org.apache.hadoop.fs.Path import java.util.Random object SessionDataFileHDFSWriter { val eol = System.getProperty("line.separator"); def main(args: Array[String]) { if (args.length == 0) { println("SessionDataFileWriter {tempDir} {distDir} {numberOfFiles} {numberOfEventsPerFile} {waitBetweenFiles}"); return; } val conf = new Configuration conf.addResource(new Path("/etc/hadoop/conf/core-site.xml")) conf.addResource(new Path("/etc/hadoop/conf/mapred-site.xml")) conf.addResource(new Path("/etc/hadoop/conf/hdfs-site.xml")) val fs = FileSystem.get(new Configuration) val rootTempDir = args(0) val rootDistDir = args(1) val files = args(2).toInt val loops = args(3).toInt val waitBetweenFiles = args(4).toInt val r = new Random for (f <- 1 to files) { val rootName = "/weblog." + System.currentTimeMillis() val tmpPath = new Path(rootTempDir + rootName + ".tmp") val writer = new BufferedWriter(new OutputStreamWriter(fs.create(tmpPath))) print(f + ": [") val randomLoops = loops + r.nextInt(loops) for (i <- 1 to randomLoops) { writer.write(SessionDataGenerator.getNextEvent + eol) if (i%100 == 0) { print(".") } } println("]") writer.close val distPath = new Path(rootDistDir + rootName + ".dat") fs.rename(tmpPath, distPath) Thread.sleep(waitBetweenFiles) } println("Done") } }
Example 70
Source File: RandSampleData.scala From SparkMLlibDeepLearn with Apache License 2.0 | 5 votes |
package util import java.util.Random import breeze.linalg.{ Matrix => BM, CSCMatrix => BSM, DenseMatrix => BDM, Vector => BV, DenseVector => BDV, SparseVector => BSV, axpy => brzAxpy, svd => brzSvd } import breeze.numerics.{ exp => Bexp, cos => Bcos, tanh => Btanh } import scala.math.Pi object RandSampleData extends Serializable { // Rosenbrock: Σ(100*(x(i+1)-x(i)^2)^2 + (x(i)-1)^2) // Rastrigin: Σ(x(i)^2 - 10*cos(2*3.14*x(i)) + 10) // Sphere: Σ(x(i)^2) def RandM( n1: Int, n2: Int, b1: Double, b2: Double, function: String): BDM[Double] = { // val n1 = 2 // val n2 = 3 // val b1 = -30 // val b2 = 30 val bdm1 = BDM.rand(n1, n2) * (b2 - b1).toDouble + b1.toDouble val bdm_y = function match { case "rosenbrock" => val xi0 = bdm1(::, 0 to (bdm1.cols - 2)) val xi1 = bdm1(::, 1 to (bdm1.cols - 1)) val xi2 = (xi0 :* xi0) val m1 = ((xi1 - xi2) :* (xi1 - xi2)) * 100.0 + ((xi0 - 1.0) :* (xi0 - 1.0)) val m2 = m1 * BDM.ones[Double](m1.cols, 1) m2 case "rastrigin" => val xi0 = bdm1 val xi2 = (xi0 :* xi0) val sicos = Bcos(xi0 * 2.0 * Pi) * 10.0 val m1 = xi2 - sicos + 10.0 val m2 = m1 * BDM.ones[Double](m1.cols, 1) m2 case "sphere" => val xi0 = bdm1 val xi2 = (xi0 :* xi0) val m1 = xi2 val m2 = m1 * BDM.ones[Double](m1.cols, 1) m2 } val randm = BDM.horzcat(bdm_y, bdm1) randm } }
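(No example change here; the benchmark functions in the comments above are the standard Rosenbrock, Rastrigin, and Sphere test functions evaluated row-wise over a uniformly sampled matrix in [b1, b2].)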
Example 71
Source File: RconConnector.scala From chatoverflow with Eclipse Public License 2.0 | 5 votes |
package org.codeoverflow.chatoverflow.requirement.service.rcon import java.io.{DataInputStream, IOException, InputStream, OutputStream} import java.net.{Socket, SocketException} import java.nio.{ByteBuffer, ByteOrder} import java.util.Random import org.codeoverflow.chatoverflow.WithLogger import org.codeoverflow.chatoverflow.connector.Connector class RconConnector(override val sourceIdentifier: String) extends Connector(sourceIdentifier) with WithLogger { override protected var requiredCredentialKeys: List[String] = List("password", "address") override protected var optionalCredentialKeys: List[String] = List("port") private var socket: Socket = _ private var outputStream: OutputStream = _ private var inputStream: InputStream = _ private var requestId: Int = 0 def sendCommand(command: String): String = { logger debug s"Sending $command to RCON" requestId += 1 if (write(2, command.getBytes("ASCII"))) { return read() } null } override def stop(): Boolean = { logger info s"Stopped RCON connector to ${credentials.get.getValue("address").get}!" socket.close() true } }
Example 72
Source File: AutoregressionSuite.scala From spark-timeseries with Apache License 2.0 | 5 votes |
package com.cloudera.sparkts.models import java.util.Random import com.cloudera.sparkts.MatrixUtil.toBreeze import org.apache.spark.mllib.linalg._ import org.apache.commons.math3.random.MersenneTwister import org.scalatest.FunSuite class AutoregressionSuite extends FunSuite { test("fit AR(1) model") { val model = new ARModel(1.5, Array(.2)) val ts = model.sample(5000, new MersenneTwister(10L)) val fittedModel = Autoregression.fitModel(ts, 1) assert(fittedModel.coefficients.length == 1) assert(math.abs(fittedModel.c - 1.5) < .07) assert(math.abs(fittedModel.coefficients(0) - .2) < .03) } test("fit AR(2) model") { val model = new ARModel(1.5, Array(.2, .3)) val ts = model.sample(5000, new MersenneTwister(10L)) val fittedModel = Autoregression.fitModel(ts, 2) assert(fittedModel.coefficients.length == 2) assert(math.abs(fittedModel.c - 1.5) < .15) assert(math.abs(fittedModel.coefficients(0) - .2) < .03) assert(math.abs(fittedModel.coefficients(1) - .3) < .03) } test("add and remove time dependent effects") { val rand = new Random() val ts = new DenseVector(Array.fill(1000)(rand.nextDouble())) val model = new ARModel(1.5, Array(.2, .3)) val added = model.addTimeDependentEffects(ts, Vectors.zeros(ts.size)) val removed = model.removeTimeDependentEffects(added, Vectors.zeros(ts.size)) assert((toBreeze(ts) - toBreeze(removed)).toArray.forall(math.abs(_) < .001)) } }
Example 73
Source File: utils.scala From scalabpe with Apache License 2.0 | 5 votes |
package scalabpe.flow import scalabpe.core._ import java.util.Random object Global { def init() { println("init called") } def close() { println("close called") } } object FlowHelper { val random = new Random() val jobStatusCache = new java.util.concurrent.ConcurrentHashMap[String,String]() def getConfig(s:String,defaultValue:String="") = Flow.router.getConfig(s,defaultValue) def isEmpty(req:Request,name:String):Boolean={ if( name.indexOf(",") >= 0 ) return isEmptyForAny(req,name) return isEmpty(req.s(name)) } private def isEmptyForAny(req:Request,names:String):Boolean={ val ss = names.split(",") var i = 0 while( i < ss.length ) { if( isEmpty(req.s(ss(i)) )) return true i += 1 } false } def isInt(req:Request,name:String):Boolean={ if( name.indexOf(",") >= 0 ) return isIntForAny(req,name) return isInt(req.s(name)) } private def isIntForAny(req:Request,names:String):Boolean={ val ss = names.split(",") var i = 0 while( i < ss.length ) { if( isInt(req.s(ss(i)) )) return true i += 1 } false } def isEmpty(str:String):Boolean={ return str == null || str.length() == 0 } def isInt(n:String):Boolean={ try { Integer.parseInt(n) return true } catch { case e: Throwable => return false } } def checkInclude(ss:String,s:String,t:String=","):Boolean={ if( ss == null || ss == "" ) return false if( s == null || s == "" ) return true return (t+ss+t).indexOf(t+s+t) >= 0 } def uuid(): String = { return java.util.UUID.randomUUID().toString().replaceAll("-", "") } def generateSeed():String = { "%08d".format(Math.abs(random.nextInt())%100000000) } def contact(a:String,b:String):String = a + b }
Example 74
Source File: ZipTests.scala From coursier with Apache License 2.0 | 5 votes |
package coursier.cli.util import java.io.{ByteArrayInputStream, ByteArrayOutputStream} import java.util.Random import java.util.zip.{Deflater, ZipEntry, ZipInputStream, ZipOutputStream} import coursier.launcher.internal.Zip import org.junit.runner.RunWith import org.scalatest.flatspec.AnyFlatSpec import org.scalatestplus.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class ZipTests extends AnyFlatSpec { "zipEntries" should "be fine with custom deflaters" in { // Inspired by https://github.com/spring-projects/spring-boot/commit/a50646b7cc3ad941e748dfb450077e3a73706205#diff-2297c301250b25e3b80301c58daf3ea0R621 val baos = new ByteArrayOutputStream val output = new ZipOutputStream(baos) { `def` = new Deflater(Deflater.NO_COMPRESSION, true) } val data = Array.ofDim[Byte](1024 * 1024) new Random().nextBytes(data) val entry = new ZipEntry("entry.dat") output.putNextEntry(entry) output.write(data) output.closeEntry() output.close() val result = baos.toByteArray val zos = new ZipOutputStream(new ByteArrayOutputStream) val entryNames = Zip.zipEntries(new ZipInputStream(new ByteArrayInputStream(result))) .map { case (ent, content) => println(ent.getCompressedSize) val name = ent.getName zos.putNextEntry(ent) zos.write(content) zos.closeEntry() name } .toVector zos.close() assert(entryNames == Vector("entry.dat")) } }
Example 75
Source File: SimpleSkewedGroupByTest.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import org.apache.spark.sql.SparkSession object SimpleSkewedGroupByTest { def main(args: Array[String]) { val spark = SparkSession .builder .appName("SimpleSkewedGroupByTest") .getOrCreate() val numMappers = if (args.length > 0) args(0).toInt else 2 val numKVPairs = if (args.length > 1) args(1).toInt else 1000 val valSize = if (args.length > 2) args(2).toInt else 1000 val numReducers = if (args.length > 3) args(3).toInt else numMappers val ratio = if (args.length > 4) args(4).toInt else 5.0 val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p => val ranGen = new Random val result = new Array[(Int, Array[Byte])](numKVPairs) for (i <- 0 until numKVPairs) { val byteArr = new Array[Byte](valSize) ranGen.nextBytes(byteArr) val offset = ranGen.nextInt(1000) * numReducers if (ranGen.nextDouble < ratio / (numReducers + ratio - 1)) { // give ratio times higher chance of generating key 0 (for reducer 0) result(i) = (offset, byteArr) } else { // generate a key for one of the other reducers val key = 1 + ranGen.nextInt(numReducers-1) + offset result(i) = (key, byteArr) } } result }.cache // Enforce that everything has been calculated and in cache pairs1.count println("RESULT: " + pairs1.groupByKey(numReducers).count) // Print how many keys each reducer got (for debugging) // println("RESULT: " + pairs1.groupByKey(numReducers) // .map{case (k,v) => (k, v.size)} // .collectAsMap) spark.stop() } } // scalastyle:on println
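The interesting part of this benchmark is the branch ranGen.nextDouble < ratio / (numReducers + ratio - 1), which routes roughly a ratio-times-larger share of pairs to reducer 0 than to any other reducer. A tiny sketch that just measures that share, using the example's default values (the object name is illustrative):

import java.util.Random

object SkewRatioSketch {
  def main(args: Array[String]): Unit = {
    val numReducers = 2
    val ratio = 5.0
    val trials = 100000
    val ranGen = new Random(1)
    // Count how often a pair would be sent to the "hot" reducer 0.
    val hot = (1 to trials).count(_ => ranGen.nextDouble < ratio / (numReducers + ratio - 1))
    println(f"share of pairs routed to reducer 0: ${hot.toDouble / trials}%.3f")
  }
}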
Example 76
Source File: SkewedGroupByTest.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import org.apache.spark.sql.SparkSession object SkewedGroupByTest { def main(args: Array[String]) { val spark = SparkSession .builder .appName("GroupBy Test") .getOrCreate() val numMappers = if (args.length > 0) args(0).toInt else 2 var numKVPairs = if (args.length > 1) args(1).toInt else 1000 val valSize = if (args.length > 2) args(2).toInt else 1000 val numReducers = if (args.length > 3) args(3).toInt else numMappers val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p => val ranGen = new Random // map output sizes linearly increase from the 1st to the last numKVPairs = (1.0 * (p + 1) / numMappers * numKVPairs).toInt val arr1 = new Array[(Int, Array[Byte])](numKVPairs) for (i <- 0 until numKVPairs) { val byteArr = new Array[Byte](valSize) ranGen.nextBytes(byteArr) arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr) } arr1 }.cache() // Enforce that everything has been calculated and in cache pairs1.count() println(pairs1.groupByKey(numReducers).count()) spark.stop() } } // scalastyle:on println
Example 77
Source File: SparkHdfsLR.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{DenseVector, Vector} import org.apache.spark.sql.SparkSession object SparkHdfsLR { val D = 10 // Number of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val tok = new java.util.StringTokenizer(line, " ") var y = tok.nextToken.toDouble var x = new Array[Double](D) var i = 0 while (i < D) { x(i) = tok.nextToken.toDouble; i += 1 } DataPoint(new DenseVector(x), y) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { if (args.length < 2) { System.err.println("Usage: SparkHdfsLR <file> <iters>") System.exit(1) } showWarning() val spark = SparkSession .builder .appName("SparkHdfsLR") .getOrCreate() val inputPath = args(0) val lines = spark.read.textFile(inputPath).rdd val points = lines.map(parsePoint).cache() val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) spark.stop() } } // scalastyle:on println
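The map/reduce in main computes the batch gradient of the logistic loss over all points and applies one update per iteration. The same update on a local array, using only breeze and a synthetic data set (a sketch; the data generation is an assumption added so the snippet runs on its own):

import java.util.Random

import scala.math.exp

import breeze.linalg.DenseVector

object LocalGradientStepSketch {
  def main(args: Array[String]): Unit = {
    val D = 3
    val rand = new Random(42)
    // Tiny synthetic data set of (features, label) pairs with labels in {-1, +1}.
    val points = Array.fill(20) {
      val y = if (rand.nextBoolean()) 1.0 else -1.0
      (DenseVector.fill(D)(rand.nextGaussian() + y), y)
    }
    var w = DenseVector.fill(D)(2 * rand.nextDouble() - 1)
    println("Initial w: " + w)
    // One batch gradient step, mirroring the points.map { ... }.reduce(_ + _) above.
    val gradient = points.map { case (x, y) =>
      x * ((1.0 / (1.0 + exp(-y * (w dot x))) - 1.0) * y)
    }.reduce(_ + _)
    w -= gradient
    println("w after one step: " + w)
  }
}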
Example 78
Source File: LocalLR.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import breeze.linalg.{DenseVector, Vector} object LocalLR { val N = 10000 // Number of data points val D = 10 // Number of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData: Array[DataPoint] = { def generatePoint(i: Int): DataPoint = { val y = if (i % 2 == 0) -1 else 1 val x = DenseVector.fill(D) {rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData // Initialize w to a random value var w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) var gradient = DenseVector.zeros[Double](D) for (p <- data) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println("Final w: " + w) } } // scalastyle:on println
Example 79
Source File: GroupByTest.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import org.apache.spark.sql.SparkSession object GroupByTest { def main(args: Array[String]) { val spark = SparkSession .builder .appName("GroupBy Test") .getOrCreate() val numMappers = if (args.length > 0) args(0).toInt else 2 val numKVPairs = if (args.length > 1) args(1).toInt else 1000 val valSize = if (args.length > 2) args(2).toInt else 1000 val numReducers = if (args.length > 3) args(3).toInt else numMappers val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p => val ranGen = new Random val arr1 = new Array[(Int, Array[Byte])](numKVPairs) for (i <- 0 until numKVPairs) { val byteArr = new Array[Byte](valSize) ranGen.nextBytes(byteArr) arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr) } arr1 }.cache() // Enforce that everything has been calculated and in cache pairs1.count() println(pairs1.groupByKey(numReducers).count()) spark.stop() } } // scalastyle:on println
Example 80
Source File: LocalFileLR.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import breeze.linalg.{DenseVector, Vector} object LocalFileLR { val D = 10 // Number of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val nums = line.split(' ').map(_.toDouble) DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0)) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val lines = scala.io.Source.fromFile(args(0)).getLines().toArray val points = lines.map(parsePoint _) val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) var gradient = DenseVector.zeros[Double](D) for (p <- points) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println("Final w: " + w) } } // scalastyle:on println
Example 81
Source File: PageViewGenerator.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming.clickstream import java.io.PrintWriter import java.net.ServerSocket import java.util.Random // scalastyle:on object PageViewGenerator { val pages = Map("http://foo.com/" -> .7, "http://foo.com/news" -> 0.2, "http://foo.com/contact" -> .1) val httpStatus = Map(200 -> .95, 404 -> .05) val userZipCode = Map(94709 -> .5, 94117 -> .5) val userID = Map((1 to 100).map(_ -> .01): _*) def pickFromDistribution[T](inputMap: Map[T, Double]): T = { val rand = new Random().nextDouble() var total = 0.0 for ((item, prob) <- inputMap) { total = total + prob if (total > rand) { return item } } inputMap.take(1).head._1 // Shouldn't get here if probabilities add up to 1.0 } def getNextClickEvent(): String = { val id = pickFromDistribution(userID) val page = pickFromDistribution(pages) val status = pickFromDistribution(httpStatus) val zipCode = pickFromDistribution(userZipCode) new PageView(page, status, zipCode, id).toString() } def main(args: Array[String]) { if (args.length != 2) { System.err.println("Usage: PageViewGenerator <port> <viewsPerSecond>") System.exit(1) } val port = args(0).toInt val viewsPerSecond = args(1).toFloat val sleepDelayMs = (1000.0 / viewsPerSecond).toInt val listener = new ServerSocket(port) println("Listening on port: " + port) while (true) { val socket = listener.accept() new Thread() { override def run(): Unit = { println("Got client connected from: " + socket.getInetAddress) val out = new PrintWriter(socket.getOutputStream(), true) while (true) { Thread.sleep(sleepDelayMs) out.write(getNextClickEvent()) out.flush() } socket.close() } }.start() } } } // scalastyle:on println
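pickFromDistribution walks the cumulative probability mass until it exceeds a uniform draw, which is the standard way to sample from a small discrete distribution. The routine on its own, as a runnable sketch (the page map is taken from the example; the object name is illustrative):

import java.util.Random

object DiscreteSamplingSketch {
  // Sample one item according to its probability; assumes the weights sum to roughly 1.0.
  def pick[T](dist: Map[T, Double], rand: Random): T = {
    val r = rand.nextDouble()
    var total = 0.0
    for ((item, prob) <- dist) {
      total += prob
      if (total > r) return item
    }
    dist.head._1 // only reached if the probabilities sum to less than 1.0
  }

  def main(args: Array[String]): Unit = {
    val pages = Map("http://foo.com/" -> 0.7, "http://foo.com/news" -> 0.2, "http://foo.com/contact" -> 0.1)
    val rand = new Random(3)
    val counts = (1 to 10000).map(_ => pick(pages, rand)).groupBy(identity).map { case (k, v) => (k, v.size) }
    counts.foreach(println)
  }
}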
Example 82
Source File: SparkLR.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{DenseVector, Vector} import org.apache.spark.sql.SparkSession object SparkLR { val N = 10000 // Number of data points val D = 10 // Number of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData: Array[DataPoint] = { def generatePoint(i: Int): DataPoint = { val y = if (i % 2 == 0) -1 else 1 val x = DenseVector.fill(D) {rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val spark = SparkSession .builder .appName("SparkLR") .getOrCreate() val numSlices = if (args.length > 0) args(0).toInt else 2 val points = spark.sparkContext.parallelize(generateData, numSlices).cache() // Initialize w to a random value var w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) spark.stop() } } // scalastyle:on println
Example 83
Source File: LocalKMeans.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{squaredDistance, DenseVector, Vector} object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData: Array[DenseVector[Double]] = { def generatePoint(i: Int): DenseVector[Double] = { DenseVector.fill(D) {rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var index = 0 var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers.get(i).get val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use org.apache.spark.ml.clustering.KMeans |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData var points = new HashSet[Vector[Double]] var kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println("Initial centers: " + kPoints) while(tempDist > convergeDist) { var closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) var mappings = closest.groupBy[Int] (x => x._1) var pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println("Final centers: " + kPoints) } } // scalastyle:on println
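Worth noting in the example above: the initialization loop keeps drawing random data points until K distinct centers have been collected, relying on the fact that a set silently drops duplicates. That step in isolation, as a sketch over one-dimensional data (sizes are illustrative):

import java.util.Random
import scala.collection.mutable

object KMeansInitSketch {
  def main(args: Array[String]): Unit = {
    val N = 100
    val K = 5
    val rand = new Random(42)
    val data = Array.fill(N)(rand.nextDouble() * 1000)
    // Keep sampling until K distinct points have been chosen as initial centers.
    val centers = mutable.LinkedHashSet.empty[Double]
    while (centers.size < K) {
      centers.add(data(rand.nextInt(N)))
    }
    println("Initial centers: " + centers.mkString(", "))
  }
}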
Example 84
Source File: StopwatchSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.util import java.util.Random import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext class StopwatchSuite extends SparkFunSuite with MLlibTestSparkContext { import StopwatchSuite._ private def testStopwatchOnDriver(sw: Stopwatch): Unit = { assert(sw.name === "sw") assert(sw.elapsed() === 0L) assert(!sw.isRunning) intercept[AssertionError] { sw.stop() } val duration = checkStopwatch(sw) val elapsed = sw.elapsed() assert(elapsed === duration) val duration2 = checkStopwatch(sw) val elapsed2 = sw.elapsed() assert(elapsed2 === duration + duration2) assert(sw.toString === s"sw: ${elapsed2}ms") sw.start() assert(sw.isRunning) intercept[AssertionError] { sw.start() } } test("LocalStopwatch") { val sw = new LocalStopwatch("sw") testStopwatchOnDriver(sw) } test("DistributedStopwatch on driver") { val sw = new DistributedStopwatch(sc, "sw") testStopwatchOnDriver(sw) } test("DistributedStopwatch on executors") { val sw = new DistributedStopwatch(sc, "sw") val rdd = sc.parallelize(0 until 4, 4) val acc = sc.longAccumulator rdd.foreach { i => acc.add(checkStopwatch(sw)) } assert(!sw.isRunning) val elapsed = sw.elapsed() assert(elapsed === acc.value) } test("MultiStopwatch") { val sw = new MultiStopwatch(sc) .addLocal("local") .addDistributed("spark") assert(sw("local").name === "local") assert(sw("spark").name === "spark") intercept[NoSuchElementException] { sw("some") } assert(sw.toString === "{\n local: 0ms,\n spark: 0ms\n}") val localDuration = checkStopwatch(sw("local")) val sparkDuration = checkStopwatch(sw("spark")) val localElapsed = sw("local").elapsed() val sparkElapsed = sw("spark").elapsed() assert(localElapsed === localDuration) assert(sparkElapsed === sparkDuration) assert(sw.toString === s"{\n local: ${localElapsed}ms,\n spark: ${sparkElapsed}ms\n}") val rdd = sc.parallelize(0 until 4, 4) val acc = sc.longAccumulator rdd.foreach { i => sw("local").start() val duration = checkStopwatch(sw("spark")) sw("local").stop() acc.add(duration) } val localElapsed2 = sw("local").elapsed() assert(localElapsed2 === localElapsed) val sparkElapsed2 = sw("spark").elapsed() assert(sparkElapsed2 === sparkElapsed + acc.value) } } private object StopwatchSuite extends SparkFunSuite { private def now: Long = System.currentTimeMillis() }
Example 85
Source File: PartitionwiseSampledRDD.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.util.Random import scala.reflect.ClassTag import org.apache.spark.{Partition, TaskContext} import org.apache.spark.util.random.RandomSampler import org.apache.spark.util.Utils private[spark] class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long) extends Partition with Serializable { override val index: Int = prev.index } private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag]( prev: RDD[T], sampler: RandomSampler[T, U], preservesPartitioning: Boolean, @transient private val seed: Long = Utils.random.nextLong) extends RDD[U](prev) { @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None override def getPartitions: Array[Partition] = { val random = new Random(seed) firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong())) } override def getPreferredLocations(split: Partition): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev) override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = { val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition] val thisSampler = sampler.clone thisSampler.setSeed(split.seed) thisSampler.sample(firstParent[T].iterator(split.prev, context)) } }
Example 86
Source File: SimpleSkewedGroupByTest.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples import java.util.Random import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.SparkContext._ object SimpleSkewedGroupByTest { def main(args: Array[String]) { val sparkConf = new SparkConf().setAppName("SimpleSkewedGroupByTest") var numMappers = if (args.length > 0) args(0).toInt else 2 var numKVPairs = if (args.length > 1) args(1).toInt else 1000 var valSize = if (args.length > 2) args(2).toInt else 1000 var numReducers = if (args.length > 3) args(3).toInt else numMappers var ratio = if (args.length > 4) args(4).toInt else 5.0 val sc = new SparkContext(sparkConf) val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p => val ranGen = new Random var result = new Array[(Int, Array[Byte])](numKVPairs) for (i <- 0 until numKVPairs) { val byteArr = new Array[Byte](valSize) ranGen.nextBytes(byteArr) val offset = ranGen.nextInt(1000) * numReducers if (ranGen.nextDouble < ratio / (numReducers + ratio - 1)) { // give ratio times higher chance of generating key 0 (for reducer 0) result(i) = (offset, byteArr) } else { // generate a key for one of the other reducers val key = 1 + ranGen.nextInt(numReducers-1) + offset result(i) = (key, byteArr) } } result }.cache // Enforce that everything has been calculated and in cache pairs1.count println("RESULT: " + pairs1.groupByKey(numReducers).count) // Print how many keys each reducer got (for debugging) // println("RESULT: " + pairs1.groupByKey(numReducers) // .map{case (k,v) => (k, v.size)} // .collectAsMap) sc.stop() } }
Example 87
Source File: SparkTachyonHdfsLR.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{Vector, DenseVector} import org.apache.hadoop.conf.Configuration import org.apache.spark._ import org.apache.spark.scheduler.InputFormatInfo import org.apache.spark.storage.StorageLevel object SparkTachyonHdfsLR { val D = 10 // Numer of dimensions val rand = new Random(42) def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val tok = new java.util.StringTokenizer(line, " ") var y = tok.nextToken.toDouble var x = new Array[Double](D) var i = 0 while (i < D) { x(i) = tok.nextToken.toDouble; i += 1 } DataPoint(new DenseVector(x), y) } def main(args: Array[String]) { showWarning() val inputPath = args(0) val sparkConf = new SparkConf().setAppName("SparkTachyonHdfsLR") val conf = new Configuration() val sc = new SparkContext(sparkConf, InputFormatInfo.computePreferredLocations( Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath)) )) val lines = sc.textFile(inputPath) val points = lines.map(parsePoint _).persist(StorageLevel.OFF_HEAP) val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) sc.stop() } }
Example 88
Source File: SkewedGroupByTest.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples import java.util.Random import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.SparkContext._ object SkewedGroupByTest { def main(args: Array[String]) { val sparkConf = new SparkConf().setAppName("GroupBy Test") var numMappers = if (args.length > 0) args(0).toInt else 2 var numKVPairs = if (args.length > 1) args(1).toInt else 1000 var valSize = if (args.length > 2) args(2).toInt else 1000 var numReducers = if (args.length > 3) args(3).toInt else numMappers val sc = new SparkContext(sparkConf) val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p => val ranGen = new Random // map output sizes lineraly increase from the 1st to the last numKVPairs = (1.0 * (p + 1) / numMappers * numKVPairs).toInt var arr1 = new Array[(Int, Array[Byte])](numKVPairs) for (i <- 0 until numKVPairs) { val byteArr = new Array[Byte](valSize) ranGen.nextBytes(byteArr) arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr) } arr1 }.cache() // Enforce that everything has been calculated and in cache pairs1.count() println(pairs1.groupByKey(numReducers).count()) sc.stop() } }
Example 89
Source File: SparkHdfsLR.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{Vector, DenseVector} import org.apache.hadoop.conf.Configuration import org.apache.spark._ import org.apache.spark.scheduler.InputFormatInfo object SparkHdfsLR { val D = 10 // Numer of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val tok = new java.util.StringTokenizer(line, " ") var y = tok.nextToken.toDouble var x = new Array[Double](D) var i = 0 while (i < D) { x(i) = tok.nextToken.toDouble; i += 1 } DataPoint(new DenseVector(x), y) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } def main(args: Array[String]) { if (args.length < 2) { System.err.println("Usage: SparkHdfsLR <file> <iters>") System.exit(1) } showWarning() val sparkConf = new SparkConf().setAppName("SparkHdfsLR") val inputPath = args(0) val conf = new Configuration() val sc = new SparkContext(sparkConf, InputFormatInfo.computePreferredLocations( Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath)) )) val lines = sc.textFile(inputPath) val points = lines.map(parsePoint _).cache() val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) sc.stop() } }
Example 90
Source File: LocalLR.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples import java.util.Random import breeze.linalg.{Vector, DenseVector} object LocalLR { val N = 10000 // Number of data points val D = 10 // Number of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData: Array[DataPoint] = { def generatePoint(i: Int): DataPoint = { val y = if (i % 2 == 0) -1 else 1 val x = DenseVector.fill(D){rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) var gradient = DenseVector.zeros[Double](D) for (p <- data) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println("Final w: " + w) } }
Example 91
Source File: GroupByTest.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples import java.util.Random import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.SparkContext._ object GroupByTest { def main(args: Array[String]) { val sparkConf = new SparkConf().setAppName("GroupBy Test") var numMappers = if (args.length > 0) args(0).toInt else 2 var numKVPairs = if (args.length > 1) args(1).toInt else 1000 var valSize = if (args.length > 2) args(2).toInt else 1000 var numReducers = if (args.length > 3) args(3).toInt else numMappers val sc = new SparkContext(sparkConf) val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p => val ranGen = new Random var arr1 = new Array[(Int, Array[Byte])](numKVPairs) for (i <- 0 until numKVPairs) { val byteArr = new Array[Byte](valSize) ranGen.nextBytes(byteArr) arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr) } arr1 }.cache() // Enforce that everything has been calculated and in cache pairs1.count() println(pairs1.groupByKey(numReducers).count()) sc.stop() } }
Example 92
Source File: LocalFileLR.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples import java.util.Random import breeze.linalg.{Vector, DenseVector} object LocalFileLR { val D = 10 // Numer of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val nums = line.split(' ').map(_.toDouble) DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0)) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val lines = scala.io.Source.fromFile(args(0)).getLines().toArray val points = lines.map(parsePoint _) val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) var gradient = DenseVector.zeros[Double](D) for (p <- points) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println("Final w: " + w) } }
Example 93
Source File: SparkLR.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{Vector, DenseVector} import org.apache.spark._ object SparkLR { val N = 10000 // Number of data points val D = 10 // Numer of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData: Array[DataPoint] = { def generatePoint(i: Int): DataPoint = { val y = if (i % 2 == 0) -1 else 1 val x = DenseVector.fill(D){rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val sparkConf = new SparkConf().setAppName("SparkLR") val sc = new SparkContext(sparkConf) val numSlices = if (args.length > 0) args(0).toInt else 2 val points = sc.parallelize(generateData, numSlices).cache() // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) sc.stop() } }
Example 94
Source File: LocalKMeans.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{Vector, DenseVector, squaredDistance} import org.apache.spark.SparkContext._ object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData: Array[DenseVector[Double]] = { def generatePoint(i: Int): DenseVector[Double] = { DenseVector.fill(D){rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var index = 0 var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers.get(i).get val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use the KMeans method found in org.apache.spark.mllib.clustering |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData var points = new HashSet[Vector[Double]] var kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println("Initial centers: " + kPoints) while(tempDist > convergeDist) { var closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) var mappings = closest.groupBy[Int] (x => x._1) var pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (x1, y1)), (id2, (x2, y2))) => (id1, (x1 + x2, y1 + y2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println("Final centers: " + kPoints) } }
Example 95
Source File: SampledRDD.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.util.Random import scala.reflect.ClassTag import org.apache.commons.math3.distribution.PoissonDistribution import org.apache.spark.{Partition, TaskContext} @deprecated("Replaced by PartitionwiseSampledRDDPartition", "1.0.0") private[spark] class SampledRDDPartition(val prev: Partition, val seed: Int) extends Partition with Serializable { override val index: Int = prev.index } @deprecated("Replaced by PartitionwiseSampledRDD", "1.0.0") private[spark] class SampledRDD[T: ClassTag]( prev: RDD[T], withReplacement: Boolean, frac: Double, seed: Int) extends RDD[T](prev) { override def getPartitions: Array[Partition] = { val rg = new Random(seed) firstParent[T].partitions.map(x => new SampledRDDPartition(x, rg.nextInt)) } override def getPreferredLocations(split: Partition): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[SampledRDDPartition].prev) override def compute(splitIn: Partition, context: TaskContext): Iterator[T] = { val split = splitIn.asInstanceOf[SampledRDDPartition] if (withReplacement) { // For large datasets, the expected number of occurrences of each element in a sample with // replacement is Poisson(frac). We use that to get a count for each element. val poisson = new PoissonDistribution(frac) poisson.reseedRandomGenerator(split.seed) firstParent[T].iterator(split.prev, context).flatMap { element => val count = poisson.sample() if (count == 0) { Iterator.empty // Avoid object allocation when we return 0 items, which is quite often } else { Iterator.fill(count)(element) } } } else { // Sampling without replacement val rand = new Random(split.seed) firstParent[T].iterator(split.prev, context).filter(x => (rand.nextDouble <= frac)) } } }
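The compute method above illustrates the two classic sampling strategies: with replacement, each element is emitted a Poisson(frac)-distributed number of times; without replacement, each element is kept independently with probability frac. Both branches on a plain collection, as a sketch (same commons-math3 class as the example; the data is made up):

import java.util.Random

import org.apache.commons.math3.distribution.PoissonDistribution

object SamplingStrategiesSketch {
  def main(args: Array[String]): Unit = {
    val frac = 0.5
    val seed = 42L
    val data = 1 to 10
    // With replacement: emit each element count times, where count ~ Poisson(frac).
    val poisson = new PoissonDistribution(frac)
    poisson.reseedRandomGenerator(seed)
    val withReplacement = data.flatMap(element => Seq.fill(poisson.sample())(element))
    // Without replacement: keep each element independently with probability frac.
    val rand = new Random(seed)
    val withoutReplacement = data.filter(_ => rand.nextDouble() <= frac)
    println("with replacement:    " + withReplacement.mkString(", "))
    println("without replacement: " + withoutReplacement.mkString(", "))
  }
}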
Example 96
Source File: PartitionwiseSampledRDD.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.util.Random import scala.reflect.ClassTag import org.apache.spark.{Partition, TaskContext} import org.apache.spark.util.random.RandomSampler import org.apache.spark.util.Utils private[spark] class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long) extends Partition with Serializable { override val index: Int = prev.index } private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag]( prev: RDD[T], sampler: RandomSampler[T, U], @transient preservesPartitioning: Boolean, @transient seed: Long = Utils.random.nextLong) extends RDD[U](prev) { @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None override def getPartitions: Array[Partition] = { val random = new Random(seed) firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong())) } override def getPreferredLocations(split: Partition): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev) override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = { val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition] val thisSampler = sampler.clone thisSampler.setSeed(split.seed) thisSampler.sample(firstParent[T].iterator(split.prev, context)) } }
Example 97
Source File: SparkTachyonHdfsLR.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{Vector, DenseVector} import org.apache.hadoop.conf.Configuration import org.apache.spark._ import org.apache.spark.scheduler.InputFormatInfo import org.apache.spark.storage.StorageLevel object SparkTachyonHdfsLR { val D = 10 // Number of dimensions val rand = new Random(42) def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD (stochastic gradient descent) or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS (a rank-2 quasi-Newton method) |for more conventional use. """.stripMargin) } case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val tok = new java.util.StringTokenizer(line, " ") var y = tok.nextToken.toDouble var x = new Array[Double](D) var i = 0 while (i < D) { x(i) = tok.nextToken.toDouble; i += 1 } DataPoint(new DenseVector(x), y) } def main(args: Array[String]) { showWarning() val inputPath = args(0) val sparkConf = new SparkConf().setAppName("SparkTachyonHdfsLR") val conf = new Configuration() val sc = new SparkContext(sparkConf, InputFormatInfo.computePreferredLocations( Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath)) )) val lines = sc.textFile(inputPath) val points = lines.map(parsePoint _).persist(StorageLevel.OFF_HEAP) val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) sc.stop() } } // scalastyle:on println
Example 98
Source File: SkewedGroupByTest.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.SparkContext._ object SkewedGroupByTest { def main(args: Array[String]) { val sparkConf = new SparkConf().setAppName("GroupBy Test") var numMappers = if (args.length > 0) args(0).toInt else 2 var numKVPairs = if (args.length > 1) args(1).toInt else 1000 var valSize = if (args.length > 2) args(2).toInt else 1000 var numReducers = if (args.length > 3) args(3).toInt else numMappers val sc = new SparkContext(sparkConf) val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p => val ranGen = new Random // map output sizes linearly increase from the 1st to the last numKVPairs = (1.0 * (p + 1) / numMappers * numKVPairs).toInt var arr1 = new Array[(Int, Array[Byte])](numKVPairs) for (i <- 0 until numKVPairs) { val byteArr = new Array[Byte](valSize) ranGen.nextBytes(byteArr) arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr) } arr1 }.cache() // Enforce that everything has been calculated and in cache pairs1.count() println(pairs1.groupByKey(numReducers).count()) sc.stop() } } // scalastyle:on println
Example 99
Source File: SparkHdfsLR.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{Vector, DenseVector} import org.apache.hadoop.conf.Configuration import org.apache.spark._ import org.apache.spark.scheduler.InputFormatInfo object SparkHdfsLR { val D = 10 // Number of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val tok = new java.util.StringTokenizer(line, " ") var y = tok.nextToken.toDouble var x = new Array[Double](D) var i = 0 while (i < D) { x(i) = tok.nextToken.toDouble; i += 1 } DataPoint(new DenseVector(x), y) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val sparkConf = new SparkConf().setAppName("SparkHdfsLR").setMaster("local[2]") val inputPath = "D:\\spark\\spark-1.5.0-hadoop2.6\\data\\mllib\\lr_data.txt" // args(0) val conf = new Configuration() val sc = new SparkContext(sparkConf, InputFormatInfo.computePreferredLocations( Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath)) )) val lines = sc.textFile(inputPath) val points = lines.map(parsePoint _).cache() // cache the parsed points val ITERATIONS = 6 // args(1).toInt (number of iterations) // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => // p is a DataPoint p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) sc.stop() } } // scalastyle:on println
Example 100
Source File: LocalFileLR.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import breeze.linalg.{Vector, DenseVector} object LocalFileLR { val D = 10 // Number of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) // Parse each line into a DataPoint def parsePoint(line: String): DataPoint = { val nums = line.split(' ').map(_.toDouble) DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0)) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS (a rank-2 quasi-Newton method) |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() // Read the file and convert it to an Array[String] val lines = scala.io.Source.fromFile(args(0)).getLines().toArray // Parse each line with parsePoint val points = lines.map(parsePoint _) val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) var gradient = DenseVector.zeros[Double](D) for (p <- points) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println("Final w: " + w) } } // scalastyle:on println
Example 101
Source File: PeopleInfoFileGenerator.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.demoIBM

import java.io.File
import java.util.Random
import java.io.FileWriter

object PeopleInfoFileGenerator {
  def main(args: Array[String]) {
    val writer = new FileWriter(
      new File("D:\\eclipse44_64\\workspace\\spark1.5\\examples\\sample_people_info.txt"), false)
    val rand = new Random()
    for (i <- 1 to 10000) {
      var height = rand.nextInt(220)
      if (height < 50) {
        height = height + 50
      }
      var gender = getRandomGender
      if (height < 100 && gender == "M")
        height = height + 100
      if (height < 100 && gender == "F")
        height = height + 50
      // Write the same gender that was used to adjust the height
      // (calling getRandomGender again here could record a different gender).
      writer.write(i + " " + gender + " " + height)
      writer.write(System.getProperty("line.separator"))
    }
    writer.flush()
    writer.close()
    println("People Information File generated successfully.")
  }

  def getRandomGender(): String = {
    val rand = new Random()
    val randNum = rand.nextInt(2) + 1
    if (randNum % 2 == 0) {
      "M"
    } else {
      "F"
    }
  }
}
Example 102
Source File: SparkLR.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{Vector, DenseVector}

import org.apache.spark._

object SparkLR {
  val N = 10000 // Number of data points
  val D = 10    // Number of dimensions
  val R = 0.7   // Scaling factor
  val ITERATIONS = 5
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def generateData: Array[DataPoint] = {
    def generatePoint(i: Int): DataPoint = {
      val y = if (i % 2 == 0) -1 else 1
      val x = DenseVector.fill(D){rand.nextGaussian + y * R}
      DataPoint(x, y)
    }
    Array.tabulate(N)(generatePoint)
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD (SGD, stochastic gradient descent) or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS (L-BFGS, a quasi-Newton method)
        |for more conventional use.
      """.stripMargin)
    // stripMargin removes the leading whitespace and the first '|' at the start of each line
  }

  def main(args: Array[String]) {
    showWarning()

    val sparkConf = new SparkConf().setAppName("SparkLR").setMaster("local")
    val sc = new SparkContext(sparkConf)

    val numSlices = if (args.length > 0) args(0).toInt else 2
    val points = sc.parallelize(generateData, numSlices).cache()

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = points.map { p =>
        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
      }.reduce(_ + _)
      w -= gradient
    }

    println("Final w: " + w)
    sc.stop()
  }
}
// scalastyle:on println
Example 103
Source File: FileWrite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package scalaDemo import java.io.{File, FileWriter} import java.util.Random import com.google.common.base.Charsets.UTF_8 import com.google.common.io.Files import org.apache.spark.util.Utils object FileWrite { def main(args: Array[String]) { val outFile = File.createTempFile("test-load-spark-properties", "test") Files.write("spark.test.fileNameLoadA true\n" + "spark.test.fileNameLoadB 1\n", outFile, UTF_8) val writer = new FileWriter(new File("D:\\eclipse44_64\\workspace\\spark1.5\\examples\\sample_age_data.txt"), false) val rand = new Random() for (i <- 1 to 10000) { writer.write(i + " " + rand.nextInt(100)) writer.write(System.getProperty("line.separator")) } writer.flush() writer.close() } }
Example 104
Source File: SampledRDD.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.PoissonDistribution

import org.apache.spark.{Partition, TaskContext}

@deprecated("Replaced by PartitionwiseSampledRDDPartition", "1.0.0")
private[spark] class SampledRDDPartition(val prev: Partition, val seed: Int) extends Partition
  with Serializable {
  override val index: Int = prev.index
}

@deprecated("Replaced by PartitionwiseSampledRDD", "1.0.0")
private[spark] class SampledRDD[T: ClassTag](
    prev: RDD[T],
    withReplacement: Boolean,
    frac: Double,
    seed: Int)
  extends RDD[T](prev) {

  override def getPartitions: Array[Partition] = {
    val rg = new Random(seed)
    firstParent[T].partitions.map(x => new SampledRDDPartition(x, rg.nextInt))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[SampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[T] = {
    val split = splitIn.asInstanceOf[SampledRDDPartition]
    if (withReplacement) {
      // For large datasets, the expected number of occurrences of each element in a sample with
      // replacement is Poisson(frac). We use that to get a count for each element.
      val poisson = new PoissonDistribution(frac)
      poisson.reseedRandomGenerator(split.seed)

      firstParent[T].iterator(split.prev, context).flatMap { element =>
        val count = poisson.sample()
        if (count == 0) {
          Iterator.empty // Avoid object allocation when we return 0 items, which is quite often
        } else {
          Iterator.fill(count)(element)
        }
      }
    } else { // Sampling without replacement
      val rand = new Random(split.seed)
      firstParent[T].iterator(split.prev, context).filter(x => (rand.nextDouble <= frac))
    }
  }
}
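The with-replacement branch above draws a Poisson count for each element instead of resampling the partition repeatedly. As a quick illustration of that trick outside Spark, here is a minimal, hedged sketch applying the same commons-math3 class to a plain Scala iterator; the object and method names are illustrative and not part of the original file.

// Sketch only: Poisson-based sampling with replacement over an iterator.
import org.apache.commons.math3.distribution.PoissonDistribution

object PoissonSampleSketch {
  def sampleWithReplacement[T](items: Iterator[T], frac: Double, seed: Long): Iterator[T] = {
    val poisson = new PoissonDistribution(frac)
    poisson.reseedRandomGenerator(seed)
    items.flatMap { element =>
      val count = poisson.sample()            // expected number of occurrences of this element
      if (count == 0) Iterator.empty else Iterator.fill(count)(element)
    }
  }

  def main(args: Array[String]): Unit = {
    val sampled = sampleWithReplacement((1 to 20).iterator, frac = 0.5, seed = 42L)
    println(sampled.toList)                   // about 10 elements, some possibly repeated
  }
}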
Example 105
Source File: PartitionwiseSampledRDD.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.util.Random import scala.reflect.ClassTag import org.apache.spark.{Partition, TaskContext} import org.apache.spark.util.random.RandomSampler import org.apache.spark.util.Utils private[spark] class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long) extends Partition with Serializable { override val index: Int = prev.index } private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag]( prev: RDD[T], sampler: RandomSampler[T, U], @transient preservesPartitioning: Boolean, @transient seed: Long = Utils.random.nextLong) extends RDD[U](prev) { @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None override def getPartitions: Array[Partition] = { val random = new Random(seed) firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong())) } override def getPreferredLocations(split: Partition): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev) override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = { val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition] val thisSampler = sampler.clone thisSampler.setSeed(split.seed) thisSampler.sample(firstParent[T].iterator(split.prev, context)) } }
Example 106
Source File: SessionKafkaProducer.scala From flink_training with Apache License 2.0 | 5 votes |
package com.tmalaska.flinktraining.example.session import java.util.{Properties, Random} import net.liftweb.json.DefaultFormats import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} import net.liftweb.json.Serialization.write object SessionKafkaProducer { def main(args:Array[String]): Unit = { implicit val formats = DefaultFormats val kafkaServerURL = args(0) val kafkaServerPort = args(1) val topic = args(2) val numberOfEntities = args(3).toInt val numberOfMessagesPerEntity = args(4).toInt val waitTimeBetweenMessageBatch = args(5).toInt val chancesOfMissing = args(6).toInt val props = new Properties() props.put("bootstrap.servers", kafkaServerURL + ":" + kafkaServerPort) props.put("acks", "all") props.put("retries", "0") props.put("batch.size", "16384") props.put("linger.ms", "1") props.put("buffer.memory", "33554432") props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer") props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer") val producer = new KafkaProducer[String, String](props) val r = new Random() var sentCount = 0 println("About to send to " + topic) for (j <- 0 to numberOfMessagesPerEntity) { for (i <- 0 to numberOfEntities) { if (r.nextInt(chancesOfMissing) != 0) { val message = write(HeartBeat(i.toString, System.currentTimeMillis())) val producerRecord = new ProducerRecord[String,String](topic, message) producer.send(producerRecord) sentCount += 1 } } println("Sent Count:" + sentCount) Thread.sleep(waitTimeBetweenMessageBatch) } producer.close() } }
Example 107
Source File: RandomProjectionsHasher.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl.texts

import java.util.Random

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol, HasSeed}
import org.apache.spark.ml.param._
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.ml.linalg.{Matrices, SparseMatrix, Vector}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{LongType, StructType}

def setDim(value: Long): this.type = set(dim, value)

def this() = this(Identifiable.randomUID("randomProjectionsHasher"))

override def transform(dataset: Dataset[_]): DataFrame = {
  val dimensity = {
    if (!isSet(dim)) { // If dim is not set, look up the AttributeGroup in metadata as it comes from OdklCountVectorizer
      val vectorsIndex = dataset.schema.fieldIndex($(inputCol))
      AttributeGroup.fromStructField(dataset.schema.fields(vectorsIndex)).size
    } else {
      $(dim).toInt
    }
  }
  val projectionMatrix = dataset.sqlContext.sparkContext.broadcast(
    Matrices.sprandn($(basisSize).toInt, dimensity, $(sparsity), new Random($(seed)))
      .asInstanceOf[SparseMatrix]) // the matrix of random vectors used to construct the hash
  val binHashSparseVectorColumn = udf((vector: Vector) => {
    projectionMatrix.value.multiply(vector).values
      .map(f => if (f > 0) 1L else 0L)
      .view.zipWithIndex
      .foldLeft(0L) { case (acc, (v, i)) => acc | (v << i) }
  })
  dataset.withColumn($(outputCol), binHashSparseVectorColumn(dataset.col($(inputCol))))
}

override def copy(extra: ParamMap): Transformer = {
  defaultCopy(extra)
}

@DeveloperApi
override def transformSchema(schema: StructType): StructType = {
  SchemaUtils.appendColumn(schema, $(outputCol), LongType)
}
}
Example 108
Source File: SimpleSkewedGroupByTest.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import org.apache.spark.sql.SparkSession object SimpleSkewedGroupByTest { def main(args: Array[String]) { val spark = SparkSession .builder .appName("SimpleSkewedGroupByTest") .getOrCreate() val numMappers = if (args.length > 0) args(0).toInt else 2 val numKVPairs = if (args.length > 1) args(1).toInt else 1000 val valSize = if (args.length > 2) args(2).toInt else 1000 val numReducers = if (args.length > 3) args(3).toInt else numMappers val ratio = if (args.length > 4) args(4).toInt else 5.0 val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p => val ranGen = new Random val result = new Array[(Int, Array[Byte])](numKVPairs) for (i <- 0 until numKVPairs) { val byteArr = new Array[Byte](valSize) ranGen.nextBytes(byteArr) val offset = ranGen.nextInt(1000) * numReducers if (ranGen.nextDouble < ratio / (numReducers + ratio - 1)) { // give ratio times higher chance of generating key 0 (for reducer 0) result(i) = (offset, byteArr) } else { // generate a key for one of the other reducers val key = 1 + ranGen.nextInt(numReducers-1) + offset result(i) = (key, byteArr) } } result }.cache // Enforce that everything has been calculated and in cache pairs1.count println(s"RESULT: ${pairs1.groupByKey(numReducers).count}") spark.stop() } } // scalastyle:on println
Example 109
Source File: SkewedGroupByTest.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import org.apache.spark.sql.SparkSession object SkewedGroupByTest { def main(args: Array[String]) { val spark = SparkSession .builder .appName("GroupBy Test") .getOrCreate() val numMappers = if (args.length > 0) args(0).toInt else 2 var numKVPairs = if (args.length > 1) args(1).toInt else 1000 val valSize = if (args.length > 2) args(2).toInt else 1000 val numReducers = if (args.length > 3) args(3).toInt else numMappers val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p => val ranGen = new Random // map output sizes linearly increase from the 1st to the last numKVPairs = (1.0 * (p + 1) / numMappers * numKVPairs).toInt val arr1 = new Array[(Int, Array[Byte])](numKVPairs) for (i <- 0 until numKVPairs) { val byteArr = new Array[Byte](valSize) ranGen.nextBytes(byteArr) arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr) } arr1 }.cache() // Enforce that everything has been calculated and in cache pairs1.count() println(pairs1.groupByKey(numReducers).count()) spark.stop() } } // scalastyle:on println
Example 110
Source File: SparkHdfsLR.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{DenseVector, Vector} import org.apache.spark.sql.SparkSession object SparkHdfsLR { val D = 10 // Number of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val tok = new java.util.StringTokenizer(line, " ") val y = tok.nextToken.toDouble val x = new Array[Double](D) var i = 0 while (i < D) { x(i) = tok.nextToken.toDouble; i += 1 } DataPoint(new DenseVector(x), y) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { if (args.length < 2) { System.err.println("Usage: SparkHdfsLR <file> <iters>") System.exit(1) } showWarning() val spark = SparkSession .builder .appName("SparkHdfsLR") .getOrCreate() val inputPath = args(0) val lines = spark.read.textFile(inputPath).rdd lines.cache() val points = lines.map(parsePoint).cache() val ITERATIONS = args(1).toInt // Initialize w to a random value val w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println(s"Initial w: $w") for (i <- 1 to ITERATIONS) { println(s"On iteration $i") val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println(s"Final w: $w") spark.stop() } } // scalastyle:on println
Example 111
Source File: LocalLR.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import breeze.linalg.{DenseVector, Vector} object LocalLR { val N = 10000 // Number of data points val D = 10 // Number of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData: Array[DataPoint] = { def generatePoint(i: Int): DataPoint = { val y = if (i % 2 == 0) -1 else 1 val x = DenseVector.fill(D) {rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData // Initialize w to a random value val w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println(s"Initial w: $w") for (i <- 1 to ITERATIONS) { println(s"On iteration $i") val gradient = DenseVector.zeros[Double](D) for (p <- data) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println(s"Final w: $w") } } // scalastyle:on println
Example 112
Source File: GroupByTest.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import org.apache.spark.sql.SparkSession object GroupByTest { def main(args: Array[String]) { val spark = SparkSession .builder .appName("GroupBy Test") .getOrCreate() val numMappers = if (args.length > 0) args(0).toInt else 2 val numKVPairs = if (args.length > 1) args(1).toInt else 1000 val valSize = if (args.length > 2) args(2).toInt else 1000 val numReducers = if (args.length > 3) args(3).toInt else numMappers val pairs1 = spark.sparkContext.parallelize(0 until numMappers, numMappers).flatMap { p => val ranGen = new Random val arr1 = new Array[(Int, Array[Byte])](numKVPairs) for (i <- 0 until numKVPairs) { val byteArr = new Array[Byte](valSize) ranGen.nextBytes(byteArr) arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr) } arr1 }.cache() pairs1.repartition(1) // Enforce that everything has been calculated and in cache pairs1.count() implicit val caseInsensitiveOrdering = new Ordering[(Int, String)] { override def compare(a: (Int, String), b: (Int, String)): Int = a._1.compareTo( b._1) } println(pairs1.groupByKey(numReducers).count()) spark.stop() } } // scalastyle:on println
Example 113
Source File: LocalFileLR.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import breeze.linalg.{DenseVector, Vector} object LocalFileLR { val D = 10 // Number of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val nums = line.split(' ').map(_.toDouble) DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0)) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val fileSrc = scala.io.Source.fromFile(args(0)) val lines = fileSrc.getLines().toArray val points = lines.map(parsePoint) val ITERATIONS = args(1).toInt // Initialize w to a random value val w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println(s"Initial w: $w") for (i <- 1 to ITERATIONS) { println(s"On iteration $i") val gradient = DenseVector.zeros[Double](D) for (p <- points) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } fileSrc.close() println(s"Final w: $w") } } // scalastyle:on println
Example 114
Source File: PageViewGenerator.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming.clickstream import java.io.PrintWriter import java.net.ServerSocket import java.util.Random // scalastyle:on object PageViewGenerator { val pages = Map("http://foo.com/" -> .7, "http://foo.com/news" -> 0.2, "http://foo.com/contact" -> .1) val httpStatus = Map(200 -> .95, 404 -> .05) val userZipCode = Map(94709 -> .5, 94117 -> .5) val userID = Map((1 to 100).map(_ -> .01): _*) def pickFromDistribution[T](inputMap: Map[T, Double]): T = { val rand = new Random().nextDouble() var total = 0.0 for ((item, prob) <- inputMap) { total = total + prob if (total > rand) { return item } } inputMap.take(1).head._1 // Shouldn't get here if probabilities add up to 1.0 } def getNextClickEvent(): String = { val id = pickFromDistribution(userID) val page = pickFromDistribution(pages) val status = pickFromDistribution(httpStatus) val zipCode = pickFromDistribution(userZipCode) new PageView(page, status, zipCode, id).toString() } def main(args: Array[String]) { if (args.length != 2) { System.err.println("Usage: PageViewGenerator <port> <viewsPerSecond>") System.exit(1) } val port = args(0).toInt val viewsPerSecond = args(1).toFloat val sleepDelayMs = (1000.0 / viewsPerSecond).toInt val listener = new ServerSocket(port) println(s"Listening on port: $port") while (true) { val socket = listener.accept() new Thread() { override def run(): Unit = { println(s"Got client connected from: ${socket.getInetAddress}") val out = new PrintWriter(socket.getOutputStream(), true) while (true) { Thread.sleep(sleepDelayMs) out.write(getNextClickEvent()) out.flush() } socket.close() } }.start() } } } // scalastyle:on println
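pickFromDistribution above walks the map while accumulating probabilities until the running total exceeds a uniform draw, i.e. inverse-CDF sampling over a discrete distribution. A small hedged sketch of the same idea in isolation, with illustrative names, followed by a frequency check:

// Sketch only: the cumulative-probability pick used by PageViewGenerator, standalone.
import java.util.Random

object PickFromDistributionDemo {
  def pick[T](dist: Map[T, Double], rand: Random): T = {
    val r = rand.nextDouble()
    var total = 0.0
    for ((item, prob) <- dist) {
      total += prob
      if (total > r) return item
    }
    dist.head._1 // only reached if the probabilities do not quite sum to 1.0
  }

  def main(args: Array[String]): Unit = {
    val pages = Map("/" -> 0.7, "/news" -> 0.2, "/contact" -> 0.1)
    val rand = new Random(7)
    val counts = Iterator.fill(10000)(pick(pages, rand))
      .foldLeft(Map.empty[String, Int].withDefaultValue(0)) { (m, p) => m.updated(p, m(p) + 1) }
    println(counts) // roughly 7000 / 2000 / 1000 draws per page
  }
}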
Example 115
Source File: SparkLR.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{DenseVector, Vector} import org.apache.spark.sql.SparkSession object SparkLR { val N = 10000 // Number of data points val D = 10 // Number of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData: Array[DataPoint] = { def generatePoint(i: Int): DataPoint = { val y = if (i % 2 == 0) -1 else 1 val x = DenseVector.fill(D) {rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val spark = SparkSession .builder .appName("SparkLR") .getOrCreate() val numSlices = if (args.length > 0) args(0).toInt else 2 val points = spark.sparkContext.parallelize(generateData, numSlices).cache() // Initialize w to a random value val w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println(s"Initial w: $w") for (i <- 1 to ITERATIONS) { println(s"On iteration $i") val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println(s"Final w: $w") spark.stop() } } // scalastyle:on println
Example 116
Source File: LocalKMeans.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{squaredDistance, DenseVector, Vector} object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData: Array[DenseVector[Double]] = { def generatePoint(i: Int): DenseVector[Double] = { DenseVector.fill(D) {rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers(i) val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use org.apache.spark.ml.clustering.KMeans |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData val points = new HashSet[Vector[Double]] val kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println(s"Initial centers: $kPoints") while(tempDist > convergeDist) { val closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) val mappings = closest.groupBy[Int] (x => x._1) val pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints(mapping._1), mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println(s"Final centers: $kPoints") } } // scalastyle:on println
Example 117
Source File: ChiSquareTestSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.stat import java.util.Random import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.stat.test.ChiSqTest import org.apache.spark.mllib.util.MLlibTestSparkContext class ChiSquareTestSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("test DataFrame of labeled points") { // labels: 1.0 (2 / 6), 0.0 (4 / 6) // feature1: 0.5 (1 / 6), 1.5 (2 / 6), 3.5 (3 / 6) // feature2: 10.0 (1 / 6), 20.0 (1 / 6), 30.0 (2 / 6), 40.0 (2 / 6) val data = Seq( LabeledPoint(0.0, Vectors.dense(0.5, 10.0)), LabeledPoint(0.0, Vectors.dense(1.5, 20.0)), LabeledPoint(1.0, Vectors.dense(1.5, 30.0)), LabeledPoint(0.0, Vectors.dense(3.5, 30.0)), LabeledPoint(0.0, Vectors.dense(3.5, 40.0)), LabeledPoint(1.0, Vectors.dense(3.5, 40.0))) for (numParts <- List(2, 4, 6, 8)) { val df = spark.createDataFrame(sc.parallelize(data, numParts)) val chi = ChiSquareTest.test(df, "features", "label") val (pValues: Vector, degreesOfFreedom: Array[Int], statistics: Vector) = chi.select("pValues", "degreesOfFreedom", "statistics") .as[(Vector, Array[Int], Vector)].head() assert(pValues ~== Vectors.dense(0.6873, 0.6823) relTol 1e-4) assert(degreesOfFreedom === Array(2, 3)) assert(statistics ~== Vectors.dense(0.75, 1.5) relTol 1e-4) } } test("large number of features (SPARK-3087)") { // Test that the right number of results is returned val numCols = 1001 val sparseData = Array( LabeledPoint(0.0, Vectors.sparse(numCols, Seq((100, 2.0)))), LabeledPoint(0.1, Vectors.sparse(numCols, Seq((200, 1.0))))) val df = spark.createDataFrame(sparseData) val chi = ChiSquareTest.test(df, "features", "label") val (pValues: Vector, degreesOfFreedom: Array[Int], statistics: Vector) = chi.select("pValues", "degreesOfFreedom", "statistics") .as[(Vector, Array[Int], Vector)].head() assert(pValues.size === numCols) assert(degreesOfFreedom.length === numCols) assert(statistics.size === numCols) assert(pValues(1000) !== null) // SPARK-3087 } test("fail on continuous features or labels") { val tooManyCategories: Int = 100000 assert(tooManyCategories > ChiSqTest.maxCategories, "This unit test requires that " + "tooManyCategories be large enough to cause ChiSqTest to throw an exception.") val random = new Random(11L) val continuousLabel = Seq.fill(tooManyCategories)( LabeledPoint(random.nextDouble(), Vectors.dense(random.nextInt(2)))) withClue("ChiSquare should throw an exception when given a continuous-valued label") { intercept[SparkException] { val df = spark.createDataFrame(continuousLabel) ChiSquareTest.test(df, "features", "label") } } val continuousFeature = Seq.fill(tooManyCategories)( LabeledPoint(random.nextInt(2), Vectors.dense(random.nextDouble()))) withClue("ChiSquare should throw an exception when given continuous-valued features") { intercept[SparkException] { val df = spark.createDataFrame(continuousFeature) ChiSquareTest.test(df, "features", "label") } } } }
Example 118
Source File: StopwatchSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.util import java.util.Random import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext class StopwatchSuite extends SparkFunSuite with MLlibTestSparkContext { import StopwatchSuite._ private def testStopwatchOnDriver(sw: Stopwatch): Unit = { assert(sw.name === "sw") assert(sw.elapsed() === 0L) assert(!sw.isRunning) intercept[AssertionError] { sw.stop() } val duration = checkStopwatch(sw) val elapsed = sw.elapsed() assert(elapsed === duration) val duration2 = checkStopwatch(sw) val elapsed2 = sw.elapsed() assert(elapsed2 === duration + duration2) assert(sw.toString === s"sw: ${elapsed2}ms") sw.start() assert(sw.isRunning) intercept[AssertionError] { sw.start() } } test("LocalStopwatch") { val sw = new LocalStopwatch("sw") testStopwatchOnDriver(sw) } test("DistributedStopwatch on driver") { val sw = new DistributedStopwatch(sc, "sw") testStopwatchOnDriver(sw) } test("DistributedStopwatch on executors") { val sw = new DistributedStopwatch(sc, "sw") val rdd = sc.parallelize(0 until 4, 4) val acc = sc.longAccumulator rdd.foreach { i => acc.add(checkStopwatch(sw)) } assert(!sw.isRunning) val elapsed = sw.elapsed() assert(elapsed === acc.value) } test("MultiStopwatch") { val sw = new MultiStopwatch(sc) .addLocal("local") .addDistributed("spark") assert(sw("local").name === "local") assert(sw("spark").name === "spark") intercept[NoSuchElementException] { sw("some") } assert(sw.toString === "{\n local: 0ms,\n spark: 0ms\n}") val localDuration = checkStopwatch(sw("local")) val sparkDuration = checkStopwatch(sw("spark")) val localElapsed = sw("local").elapsed() val sparkElapsed = sw("spark").elapsed() assert(localElapsed === localDuration) assert(sparkElapsed === sparkDuration) assert(sw.toString === s"{\n local: ${localElapsed}ms,\n spark: ${sparkElapsed}ms\n}") val rdd = sc.parallelize(0 until 4, 4) val acc = sc.longAccumulator rdd.foreach { i => sw("local").start() val duration = checkStopwatch(sw("spark")) sw("local").stop() acc.add(duration) } val localElapsed2 = sw("local").elapsed() assert(localElapsed2 === localElapsed) val sparkElapsed2 = sw("spark").elapsed() assert(sparkElapsed2 === sparkElapsed + acc.value) } } private object StopwatchSuite extends SparkFunSuite { private def now: Long = System.currentTimeMillis() }
Example 119
Source File: PartitionwiseSampledRDD.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.util.Random import scala.reflect.ClassTag import org.apache.spark.{Partition, TaskContext} import org.apache.spark.util.Utils import org.apache.spark.util.random.RandomSampler private[spark] class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long) extends Partition with Serializable { override val index: Int = prev.index } private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag]( prev: RDD[T], sampler: RandomSampler[T, U], preservesPartitioning: Boolean, @transient private val seed: Long = Utils.random.nextLong) extends RDD[U](prev) { @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None override def getPartitions: Array[Partition] = { val random = new Random(seed) firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong())) } override def getPreferredLocations(split: Partition): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev) override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = { val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition] val thisSampler = sampler.clone thisSampler.setSeed(split.seed) thisSampler.sample(firstParent[T].iterator(split.prev, context)) } }
Example 120
Source File: SimpleSkewedGroupByTest.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.SparkContext._ object SimpleSkewedGroupByTest { def main(args: Array[String]) { val sparkConf = new SparkConf().setAppName("SimpleSkewedGroupByTest") var numMappers = if (args.length > 0) args(0).toInt else 2 var numKVPairs = if (args.length > 1) args(1).toInt else 1000 var valSize = if (args.length > 2) args(2).toInt else 1000 var numReducers = if (args.length > 3) args(3).toInt else numMappers var ratio = if (args.length > 4) args(4).toInt else 5.0 val sc = new SparkContext(sparkConf) val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p => val ranGen = new Random var result = new Array[(Int, Array[Byte])](numKVPairs) for (i <- 0 until numKVPairs) { val byteArr = new Array[Byte](valSize) ranGen.nextBytes(byteArr) val offset = ranGen.nextInt(1000) * numReducers if (ranGen.nextDouble < ratio / (numReducers + ratio - 1)) { // give ratio times higher chance of generating key 0 (for reducer 0) result(i) = (offset, byteArr) } else { // generate a key for one of the other reducers val key = 1 + ranGen.nextInt(numReducers-1) + offset result(i) = (key, byteArr) } } result }.cache // Enforce that everything has been calculated and in cache pairs1.count println("RESULT: " + pairs1.groupByKey(numReducers).count) // Print how many keys each reducer got (for debugging) // println("RESULT: " + pairs1.groupByKey(numReducers) // .map{case (k,v) => (k, v.size)} // .collectAsMap) sc.stop() } } // scalastyle:on println
Example 121
Source File: SparkTachyonHdfsLR.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{Vector, DenseVector}
import org.apache.hadoop.conf.Configuration

import org.apache.spark._
import org.apache.spark.scheduler.InputFormatInfo
import org.apache.spark.storage.StorageLevel

object SparkTachyonHdfsLR {
  val D = 10 // Number of dimensions
  val rand = new Random(42)

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
        |for more conventional use.
      """.stripMargin)
  }

  case class DataPoint(x: Vector[Double], y: Double)

  def parsePoint(line: String): DataPoint = {
    val tok = new java.util.StringTokenizer(line, " ")
    var y = tok.nextToken.toDouble
    var x = new Array[Double](D)
    var i = 0
    while (i < D) {
      x(i) = tok.nextToken.toDouble; i += 1
    }
    DataPoint(new DenseVector(x), y)
  }

  def main(args: Array[String]) {
    showWarning()

    val inputPath = args(0)
    val sparkConf = new SparkConf().setAppName("SparkTachyonHdfsLR")
    val conf = new Configuration()
    val sc = new SparkContext(sparkConf,
      InputFormatInfo.computePreferredLocations(
        Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath))
      ))
    val lines = sc.textFile(inputPath)
    val points = lines.map(parsePoint _).persist(StorageLevel.OFF_HEAP)
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = points.map { p =>
        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
      }.reduce(_ + _)
      w -= gradient
    }

    println("Final w: " + w)
    sc.stop()
  }
}
// scalastyle:on println
Example 122
Source File: SkewedGroupByTest.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._

object SkewedGroupByTest {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("GroupBy Test")
    var numMappers = if (args.length > 0) args(0).toInt else 2
    var numKVPairs = if (args.length > 1) args(1).toInt else 1000
    var valSize = if (args.length > 2) args(2).toInt else 1000
    var numReducers = if (args.length > 3) args(3).toInt else numMappers

    val sc = new SparkContext(sparkConf)

    val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random
      // map output sizes linearly increase from the 1st to the last
      numKVPairs = (1.0 * (p + 1) / numMappers * numKVPairs).toInt
      var arr1 = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        ranGen.nextBytes(byteArr)
        arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
      }
      arr1
    }.cache()
    // Enforce that everything has been calculated and in cache
    pairs1.count()

    println(pairs1.groupByKey(numReducers).count())

    sc.stop()
  }
}
// scalastyle:on println
Example 123
Source File: SparkHdfsLR.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{Vector, DenseVector}
import org.apache.hadoop.conf.Configuration

import org.apache.spark._
import org.apache.spark.scheduler.InputFormatInfo

object SparkHdfsLR {
  val D = 10 // Number of dimensions
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def parsePoint(line: String): DataPoint = {
    val tok = new java.util.StringTokenizer(line, " ")
    var y = tok.nextToken.toDouble
    var x = new Array[Double](D)
    var i = 0
    while (i < D) {
      x(i) = tok.nextToken.toDouble; i += 1
    }
    DataPoint(new DenseVector(x), y)
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    if (args.length < 2) {
      System.err.println("Usage: SparkHdfsLR <file> <iters>")
      System.exit(1)
    }

    showWarning()

    val sparkConf = new SparkConf().setAppName("SparkHdfsLR")
    val inputPath = args(0)
    val conf = new Configuration()
    val sc = new SparkContext(sparkConf,
      InputFormatInfo.computePreferredLocations(
        Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath))
      ))
    val lines = sc.textFile(inputPath)
    val points = lines.map(parsePoint _).cache()
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = points.map { p =>
        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
      }.reduce(_ + _)
      w -= gradient
    }

    println("Final w: " + w)
    sc.stop()
  }
}
// scalastyle:on println
Example 124
Source File: LocalLR.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import breeze.linalg.{Vector, DenseVector} object LocalLR { val N = 10000 // Number of data points val D = 10 // Number of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData: Array[DataPoint] = { def generatePoint(i: Int): DataPoint = { val y = if (i % 2 == 0) -1 else 1 val x = DenseVector.fill(D){rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) var gradient = DenseVector.zeros[Double](D) for (p <- data) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println("Final w: " + w) } } // scalastyle:on println
Example 125
Source File: GroupByTest.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.SparkContext._ object GroupByTest { def main(args: Array[String]) { val sparkConf = new SparkConf().setAppName("GroupBy Test") var numMappers = if (args.length > 0) args(0).toInt else 2 var numKVPairs = if (args.length > 1) args(1).toInt else 1000 var valSize = if (args.length > 2) args(2).toInt else 1000 var numReducers = if (args.length > 3) args(3).toInt else numMappers val sc = new SparkContext(sparkConf) val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p => val ranGen = new Random var arr1 = new Array[(Int, Array[Byte])](numKVPairs) for (i <- 0 until numKVPairs) { val byteArr = new Array[Byte](valSize) ranGen.nextBytes(byteArr) arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr) } arr1 }.cache() // Enforce that everything has been calculated and in cache pairs1.count() println(pairs1.groupByKey(numReducers).count()) sc.stop() } } // scalastyle:on println
Example 126
Source File: Utils.scala From zen with Apache License 2.0 | 5 votes |
package com.github.cloudml.zen.ml.util import java.util.Random object Utils { val random = new Random() def log1pExp(x: Double): Double = { if (x > 0) { x + math.log1p(math.exp(-x)) } else { math.log1p(math.exp(x)) } } }
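log1pExp computes log(1 + e^x) while avoiding overflow for large positive x and precision loss for large negative x. A tiny hedged usage sketch; the demo object below is illustrative and not part of the project.

// Sketch only: comparing log1pExp with the naive formula.
object Log1pExpDemo {
  import com.github.cloudml.zen.ml.util.Utils

  def main(args: Array[String]): Unit = {
    // For large x, math.log1p(math.exp(x)) overflows to Infinity,
    // while Utils.log1pExp stays finite and is approximately x.
    println(Utils.log1pExp(1000.0))        // ~1000.0
    println(math.log1p(math.exp(1000.0)))  // Infinity (naive form overflows)
    println(Utils.log1pExp(-20.0))         // ~2.06e-9, matches math.log1p(math.exp(-20.0))
  }
}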
Example 127
Source File: CustomActivationExample.scala From dl4scala with MIT License | 5 votes |
package org.dl4scala.examples.misc.activationfunctions import java.util.{Collections, Random} import org.deeplearning4j.datasets.iterator.impl.ListDataSetIterator import org.deeplearning4j.nn.api.OptimizationAlgorithm import org.deeplearning4j.nn.conf.layers.{DenseLayer, OutputLayer} import org.deeplearning4j.nn.conf.{NeuralNetConfiguration, Updater} import org.deeplearning4j.nn.multilayer.MultiLayerNetwork import org.deeplearning4j.nn.weights.WeightInit import org.deeplearning4j.optimize.listeners.ScoreIterationListener import org.nd4j.linalg.activations.Activation import org.nd4j.linalg.api.ndarray.INDArray import org.nd4j.linalg.dataset.DataSet import org.nd4j.linalg.dataset.api.iterator.DataSetIterator import org.nd4j.linalg.factory.Nd4j import org.nd4j.linalg.lossfunctions.LossFunctions object CustomActivationExample { val seed = 12345 val iterations = 1 val nEpochs = 500 val nSamples = 1000 val batchSize = 100 val learningRate = 0.001 var MIN_RANGE = 0 var MAX_RANGE = 3 val rng = new Random(seed) def main(args: Array[String]): Unit = { val iterator = getTrainingData(batchSize, rng) // Create the network val numInput = 2 val numOutputs = 1 val nHidden = 10 val net = new MultiLayerNetwork(new NeuralNetConfiguration.Builder() .seed(seed) .iterations(iterations) .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT) .learningRate(learningRate) .weightInit(WeightInit.XAVIER) .updater(Updater.NESTEROVS) .list() //INSTANTIATING CUSTOM ACTIVATION FUNCTION here as follows //Refer to CustomActivation class for more details on implementation .layer(0, new DenseLayer.Builder().nIn(numInput).nOut(nHidden) .activation(new CustomActivation()) .build()) .layer(1, new OutputLayer.Builder(LossFunctions.LossFunction.MSE) .activation(Activation.IDENTITY) .nIn(nHidden).nOut(numOutputs).build()) .pretrain(false).backprop(true).build() ) net.init() net.setListeners(new ScoreIterationListener(100)) (0 until nEpochs).foreach{_ => iterator.reset() net.fit(iterator) } // Test the addition of 2 numbers (Try different numbers here) val input: INDArray = Nd4j.create(Array[Double](0.111111, 0.3333333333333), Array[Int](1, 2)) val out: INDArray = net.output(input, false) System.out.println(out) } private def getTrainingData(batchSize: Int, rand: Random): DataSetIterator = { val sum = new Array[Double](nSamples) val input1 = new Array[Double](nSamples) val input2 = new Array[Double](nSamples) (0 until nSamples).foreach{i => input1(i) = MIN_RANGE + (MAX_RANGE - MIN_RANGE) * rand.nextDouble input2(i) = MIN_RANGE + (MAX_RANGE - MIN_RANGE) * rand.nextDouble sum(i) = input1(i) + input2(i) } val inputNDArray1 = Nd4j.create(input1, Array[Int](nSamples, 1)) val inputNDArray2 = Nd4j.create(input2, Array[Int](nSamples, 1)) val inputNDArray = Nd4j.hstack(inputNDArray1, inputNDArray2) val outPut = Nd4j.create(sum, Array[Int](nSamples, 1)) val dataSet = new DataSet(inputNDArray, outPut) val listDs = dataSet.asList Collections.shuffle(listDs, rng) new ListDataSetIterator(listDs, batchSize) } }
Example 128
Source File: ToxCoreTestBase.scala From jvm-toxcore-c with GNU General Public License v3.0 | 5 votes |
package im.tox.tox4j import java.io.IOException import java.net.{ InetAddress, Socket } import java.util.Random import org.jetbrains.annotations.NotNull import org.scalatest.Assertions object ToxCoreTestBase extends Assertions { private[tox4j] val nodeCandidates = Seq( new DhtNode("tox.initramfs.io", "tox.initramfs.io", 33445, "3F0A45A268367C1BEA652F258C85F4A66DA76BCAA667A49E770BCC4917AB6A25"), new DhtNode("tox.verdict.gg", null, 33445, "1C5293AEF2114717547B39DA8EA6F1E331E5E358B35F9B6B5F19317911C5F976") ) @NotNull def randomBytes(length: Int): Array[Byte] = { val array = new Array[Byte](length) new Random().nextBytes(array) array } @NotNull def readablePublicKey(@NotNull id: Array[Byte]): String = { val str = new StringBuilder id foreach { c => str.append(f"$c%02X") } str.toString() } @NotNull def parsePublicKey(@NotNull id: String): Array[Byte] = { val publicKey = new Array[Byte](id.length / 2) publicKey.indices foreach { i => publicKey(i) = ((fromHexDigit(id.charAt(i * 2)) << 4) + fromHexDigit(id.charAt(i * 2 + 1))).toByte } publicKey } private def fromHexDigit(c: Char): Byte = { val digit = if (false) { 0 } else if ('0' to '9' contains c) { c - '0' } else if ('A' to 'F' contains c) { c - 'A' + 10 } else if ('a' to 'f' contains c) { c - 'a' + 10 } else { throw new IllegalArgumentException(s"Non-hex digit character: $c") } digit.toByte } @SuppressWarnings(Array("org.wartremover.warts.Equals")) private def hasConnection(ip: String, port: Int): Option[String] = { var socket: Socket = null try { socket = new Socket(InetAddress.getByName(ip), port) if (socket.getInputStream == null) { Some("Socket input stream is null") } else { None } } catch { case e: IOException => Some(s"A network connection can't be established to $ip:$port: ${e.getMessage}") } finally { if (socket != null) { socket.close() } } } def checkIPv4: Option[String] = { hasConnection("8.8.8.8", 53) } def checkIPv6: Option[String] = { hasConnection("2001:4860:4860::8888", 53) } protected[tox4j] def assumeIPv4(): Unit = { assume(checkIPv4.isEmpty) } protected[tox4j] def assumeIPv6(): Unit = { assume(checkIPv6.isEmpty) } }
Example 129
Source File: PartitionwiseSampledRDD.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml.rdd import java.util.Random import org.apache.spark.{Partition, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.util.random.RandomSampler import scala.reflect.ClassTag private[sona] class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long) extends Partition with Serializable { override val index: Int = prev.index } /** * An RDD sampled from its parent RDD partition-wise. For each partition of the parent RDD, * a user-specified [[org.apache.spark.util.random.RandomSampler]] instance is used to obtain * a random sample of the records in the partition. The random seeds assigned to the samplers * are guaranteed to have different values. * * @param prev RDD to be sampled * @param sampler a random sampler * @param preservesPartitioning whether the sampler preserves the partitioner of the parent RDD * @param seed random seed * @tparam T input RDD item type * @tparam U sampled RDD item type */ private[sona] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag]( prev: RDD[T], sampler: RandomSampler[T, U], preservesPartitioning: Boolean, @transient private val seed: Long = (new Random).nextLong) extends RDD[U](prev) { @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None override def getPartitions: Array[Partition] = { val random = new Random(seed) firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong())) } override def getPreferredLocations(split: Partition): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev) override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = { val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition] val thisSampler = sampler.clone thisSampler.setSeed(split.seed) thisSampler.sample(firstParent[T].iterator(split.prev, context)) } }
Example 130
Source File: WeightedRandomSampler.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.graph.utils import java.util.Random import org.apache.spark.util.SparkUtil import org.apache.spark.util.random.RandomSampler import scala.reflect.ClassTag abstract class WeightedRandomSampler[T: ClassTag, U: ClassTag] extends RandomSampler[(T, Float), U] { protected var fraction = 0.0 override def sample(items: Iterator[(T, Float)]): Iterator[U] = { items.filter(x => sample(x._2) > 0).asInstanceOf[Iterator[U]] } def sample(weight: Float): Int override def sample(): Int = ??? def setFraction(fraction: Double): Unit = { require( fraction >= (0.0 - 1e-6) && fraction <= (1.0 + 1e-6), s"Sampling fraction ($fraction) must be on interval [0, 1]") this.fraction = fraction } override def clone: WeightedRandomSampler[T, U] = ??? } class NaiveWeightedBernoulliSampler[T: ClassTag] extends WeightedRandomSampler[T, (T, Float)] { private val rng: Random = SparkUtil.getXORShiftRandom(System.nanoTime) override def setSeed(seed: Long): Unit = rng.setSeed(seed) def sample(weight: Float): Int = { if (fraction <= 0.0) { 0 } else if (fraction >= 1.0) { 1 } else { if (rng.nextDouble() <= fraction * weight) { 1 } else { 0 } } } override def clone: NaiveWeightedBernoulliSampler[T] = new NaiveWeightedBernoulliSampler[T] }
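The sampler above keeps an (item, weight) pair with probability fraction × weight. A hedged usage sketch follows; the demo object, values, and seed are illustrative, and it only assumes the class is on the classpath under the package shown above.

// Sketch only: weighted Bernoulli sampling over (item, weight) pairs.
import com.tencent.angel.sona.graph.utils.NaiveWeightedBernoulliSampler

object WeightedSamplingDemo {
  def main(args: Array[String]): Unit = {
    // Items paired with weights in [0, 1]; a higher weight means a higher chance of being kept.
    val weighted: Iterator[(String, Float)] =
      Iterator(("a", 1.0f), ("b", 0.5f), ("c", 0.1f), ("d", 0.9f))

    val sampler = new NaiveWeightedBernoulliSampler[String]
    sampler.setSeed(42L)
    sampler.setFraction(0.5) // overall sampling fraction, scaled per item by its weight

    // sample() keeps the (item, weight) pairs whose Bernoulli draw succeeds.
    sampler.sample(weighted).foreach { case (item, w) => println(s"kept $item (weight $w)") }
  }
}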
Example 131
Source File: PartitionwiseWeightedSampledRDD.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.graph.utils import java.util.Random import org.apache.spark.rdd.RDD import org.apache.spark.{Partition, Partitioner, TaskContext} import scala.reflect.ClassTag import scala.util.{Random => ScalaRandom} class PartitionwiseWeightedSampledRDDPartition(val prev: Partition, val seed: Long, val fraction: Double) extends Partition with Serializable { override val index: Int = prev.index } class PartitionwiseWeightedSampledRDD[T: ClassTag, U: ClassTag]( prev: RDD[(T, Float)], sampler: WeightedRandomSampler[T, U], fractions: Map[Int, Double], preservesPartitioning: Boolean, @transient private val seed: Long = ScalaRandom.nextLong) extends RDD[U](prev) { @transient override val partitioner: Option[Partitioner] = { if (preservesPartitioning) prev.partitioner else None } override def getPartitions: Array[Partition] = { val random = new Random(seed) firstParent[(T, Float)].partitions.map { x => new PartitionwiseWeightedSampledRDDPartition(x, random.nextLong(), fractions.getOrElse(x.index, 0.0)) } } override def getPreferredLocations(split: Partition): Seq[String] = { firstParent[(T, Float)].preferredLocations( split.asInstanceOf[PartitionwiseWeightedSampledRDDPartition].prev ) } override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = { val split = splitIn.asInstanceOf[PartitionwiseWeightedSampledRDDPartition] val thisSampler = sampler.clone thisSampler.setSeed(split.seed) thisSampler.setFraction(split.fraction) thisSampler.sample(firstParent[(T, Float)].iterator(split.prev, context)) } }
Example 132
Source File: CompositeSampler.scala From zen with Apache License 2.0 | 5 votes |
package com.github.cloudml.zen.ml.sampler import java.util.Random import spire.math.{Numeric => spNum} class CompositeSampler(implicit ev: spNum[Double]) extends Sampler[Double] { private var samplers: Seq[Sampler[_]] = _ protected def numer: spNum[Double] = ev def apply(state: Int): Double = samplers.iterator.map(_.applyDouble(state)).sum def norm: Double = samplers.iterator.map(_.normDouble).sum def sampleFrom(base: Double, gen: Random): Int = { val sampIter = samplers.iterator var curSampler = sampIter.next() var subNorm = curSampler.normDouble var remain = base while (remain >= subNorm) { remain -= subNorm curSampler = sampIter.next() subNorm = curSampler.normDouble } curSampler.sampleFromDouble(remain, gen) } def resetComponents(samplers: Sampler[_]*): CompositeSampler = { this.samplers = samplers this } }
Example 133
Source File: MetropolisHastings.scala From zen with Apache License 2.0 | 5 votes |
package com.github.cloudml.zen.ml.sampler import java.util.Random import spire.math.{Numeric => spNum} class MetropolisHastings(implicit ev: spNum[Double]) extends Sampler[Double] { type TransProb = Int => Double private var origFunc: TransProb = _ private var proposal: Sampler[Double] = _ private var state: Int = _ protected def numer: spNum[Double] = ev def apply(state: Int): Double = origFunc(state) def norm: Double = proposal.norm def sampleFrom(base: Double, gen: Random): Int = { val newState = proposal.sampleFrom(base, gen) if (newState != state) { val ar = acceptRate(newState) if (ar >= 1.0 || gen.nextDouble() < ar) { state = newState } } state } private def acceptRate(newState:Int): Double = { origFunc(newState) * proposal(state) / (origFunc(state) * proposal(newState)) } def resetProb(origFunc: TransProb, proposal: Sampler[Double], initState: Int): MetropolisHastings = { this.origFunc = origFunc this.proposal = proposal this.state = initState this } def resetProb(origFunc: TransProb, proposal: Sampler[Double], gen: Random): MetropolisHastings = { this.origFunc = origFunc this.proposal = proposal this.state = proposal.sampleRandom(gen) this } }
Example 134
Source File: DiscreteSampler.scala From zen with Apache License 2.0 | 5 votes |
package com.github.cloudml.zen.ml.sampler import java.util.Random import scala.annotation.tailrec import spire.math.{Numeric => spNum} trait DiscreteSampler[@specialized(Double, Int, Float, Long) T] extends Sampler[T] { def length: Int def used: Int def update(state: Int, value: => T): Unit def deltaUpdate(state: Int, delta: => T): Unit def resetDist(probs: Array[T], space: Array[Int], psize: Int): DiscreteSampler[T] def resetDist(distIter: Iterator[(Int, T)], psize: Int): DiscreteSampler[T] def reset(newSize: Int): DiscreteSampler[T] @tailrec final def resampleRandom(gen: Random, state: Int, residualRate: Double, numResampling: Int = 2)(implicit ev: spNum[T]): Int = { val newState = sampleRandom(gen) if (newState == state && numResampling >= 0 && used > 1 && (residualRate >= 1.0 || gen.nextDouble() < residualRate)) { resampleRandom(gen, state, residualRate, numResampling - 1) } else { newState } } @tailrec final def resampleFrom(base: T, gen: Random, state: Int, residualRate: Double, numResampling: Int = 2)(implicit ev: spNum[T]): Int = { val newState = sampleFrom(base, gen) if (newState == state && numResampling >= 0 && used > 1 && (residualRate >= 1.0 || gen.nextDouble() < residualRate)) { val newBase = ev.fromDouble(gen.nextDouble() * ev.toDouble(norm)) resampleFrom(newBase, gen, state, residualRate, numResampling - 1) } else { newState } } }
Example 135
Source File: FlowerDataSetIterator.scala From dl4scala with MIT License | 5 votes |
package org.dl4scala.examples.transferlearning.vgg16.dataHelpers import java.io.{File, IOException} import java.net.URL import org.datavec.api.io.filters.BalancedPathFilter import org.datavec.api.io.labels.ParentPathLabelGenerator import org.datavec.api.split.{FileSplit, InputSplit} import org.datavec.image.loader.BaseImageLoader import org.nd4j.linalg.dataset.api.iterator.DataSetIterator import java.util import java.util.Random import org.apache.commons.io.FileUtils import org.datavec.api.util.ArchiveUtils import org.datavec.image.recordreader.ImageRecordReader import org.deeplearning4j.datasets.datavec.RecordReaderDataSetIterator import org.deeplearning4j.nn.modelimport.keras.trainedmodels.TrainedModels object FlowerDataSetIterator { private val log = org.slf4j.LoggerFactory.getLogger(FlowerDataSetIterator.getClass) private val DATA_DIR = new File(System.getProperty("user.home")) + "/dl4jDataDir" private val DATA_URL = "http://download.tensorflow.org/example_images/flower_photos.tgz" private val FLOWER_DIR = DATA_DIR + "/flower_photos" private val allowedExtensions = BaseImageLoader.ALLOWED_FORMATS private val rng = new Random(13) private val height = 224 private val width = 224 private val channels = 3 private val numClasses = 5 private val labelMaker = new ParentPathLabelGenerator private var trainData: InputSplit = _ private var testData: InputSplit = _ private var batchSize = 0 @throws(classOf[IOException]) def trainIterator: DataSetIterator = makeIterator(trainData) @throws(classOf[IOException]) def testIterator: DataSetIterator = makeIterator(testData) @throws(classOf[IOException]) def setup(batchSizeArg: Int, trainPerc: Int): Unit = { try downloadAndUntar() catch { case e: IOException => e.printStackTrace() log.error("IOException : ", e) } batchSize = batchSizeArg val parentDir = new File(FLOWER_DIR) val filesInDir = new FileSplit(parentDir, allowedExtensions, rng) val pathFilter = new BalancedPathFilter(rng, allowedExtensions, labelMaker) if (trainPerc >= 100) throw new IllegalArgumentException("Percentage of data set aside for training has to be less than 100%." + " Test percentage = 100 - training percentage, has to be greater than 0") val filesInDirSplit = filesInDir.sample(pathFilter, trainPerc, 100 - trainPerc) trainData = filesInDirSplit(0) testData = filesInDirSplit(1) } @throws(classOf[IOException]) private def makeIterator(split: InputSplit) = { val recordReader = new ImageRecordReader(height, width, channels, labelMaker) recordReader.initialize(split) val iter = new RecordReaderDataSetIterator(recordReader, batchSize, 1, numClasses) iter.setPreProcessor(TrainedModels.VGG16.getPreProcessor) iter } @throws(classOf[IOException]) def downloadAndUntar(): Unit = { val rootFile = new File(DATA_DIR) if (!rootFile.exists) rootFile.mkdir val tarFile = new File(DATA_DIR, "flower_photos.tgz") if (!tarFile.isFile) { log.info("Downloading the flower dataset from " + DATA_URL + "...") FileUtils.copyURLToFile(new URL(DATA_URL), tarFile) } ArchiveUtils.unzipFileTo(tarFile.getAbsolutePath, rootFile.getAbsolutePath) } }
Example 136
Source File: LDADataGenerator.scala From Swallow with Apache License 2.0 | 5 votes |
package com.intel.hibench.sparkbench.ml import com.intel.hibench.sparkbench.common.IOCommon import java.util.Random import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.{Vector, Vectors} import scala.collection.mutable.{HashMap => MHashMap} import org.apache.spark.rdd.RDD def generateLDARDD( sc: SparkContext, numDocs: Long, numVocab: Int, docLenMin: Int, docLenMax: Int, numParts: Int = 3, seed: Long = System.currentTimeMillis()): RDD[(Long, Vector)] = { val data = sc.parallelize(0L until numDocs, numParts).mapPartitionsWithIndex { (idx, part) => val rng = new Random(seed ^ idx) part.map { case docIndex => var currentSize = 0 val entries = MHashMap[Int, Int]() val docLength = rng.nextInt(docLenMax - docLenMin + 1) + docLenMin while (currentSize < docLength) { val index = rng.nextInt(numVocab) entries(index) = entries.getOrElse(index, 0) + 1 currentSize += 1 } val iter = entries.toSeq.map(v => (v._1, v._2.toDouble)) (docIndex, Vectors.sparse(numVocab, iter)) } } data } def main(args: Array[String]) { val conf = new SparkConf().setAppName("LDADataGenerator") val sc = new SparkContext(conf) var outputPath = "" var numDocs: Long = 500L var numVocab: Int = 1000 var docLenMin: Int = 50 var docLenMax: Int = 10000 val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism) val numPartitions = IOCommon.getProperty("hibench.default.shuffle.parallelism") .getOrElse((parallel / 2).toString).toInt if (args.length == 5) { outputPath = args(0) numDocs = args(1).toLong numVocab = args(2).toInt docLenMin = args(3).toInt docLenMax = args(4).toInt println(s"Output Path: $outputPath") println(s"Num of Documents: $numDocs") println(s"Vocabulary size: $numVocab") } else { System.err.println( "Usage: LDADataGenerator <OUTPUT_PATH> <NUM_DOCUMENTS> <VOCABULARY_SIZE> <DOC_LEN_MIN> <DOC_LEN_MAX>" ) System.exit(1) } val data = generateLDARDD(sc, numDocs, numVocab, docLenMin, docLenMax, numPartitions) data.saveAsObjectFile(outputPath) sc.stop() } }
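The generator derives each partition's random source from seed ^ idx, so runs are reproducible while partitions still produce different documents. A tiny Spark-free sketch of that seeding scheme (illustrative; the names are my own):

import java.util.Random

object PartitionSeedingSketch {
  def main(args: Array[String]): Unit = {
    val seed = 12345L
    // One deterministic generator per "partition", all derived from a common seed.
    val perPartition = (0 until 3).map(idx => new Random(seed ^ idx))
    perPartition.zipWithIndex.foreach { case (rng, idx) =>
      val sample = Seq.fill(5)(rng.nextInt(1000))
      println(s"partition $idx -> $sample")
    }
  }
}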
Example 137
Source File: Bagging.scala From streamDM with Apache License 2.0 | 5 votes |
package org.apache.spark.streamdm.classifiers.meta import java.util.Random import com.github.javacliparser.{ClassOption, IntOption} import org.apache.spark.streamdm.classifiers.Classifier import org.apache.spark.streamdm.classifiers.model._ import org.apache.spark.streamdm.core._ import org.apache.spark.streaming.dstream._ import org.apache.spark.streamdm.utils.Utils import org.apache.spark.streamdm.core.specification.ExampleSpecification def ensemblePredict(example: Example): Double = { val sizeEnsemble = ensembleSizeOption.getValue val predictions: Array[Double] = new Array(sizeEnsemble) for (i <- 0 until sizeEnsemble) { predictions(i) = classifiers(i).getModel.asInstanceOf[ClassificationModel].predict(example) } Utils.majorityVote(predictions, numberClasses) } def numberClasses(): Integer = { if (exampleLearnerSpecification == null) 2 else exampleLearnerSpecification.out(0).range } }
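Utils.majorityVote is not part of this excerpt; a hypothetical stand-in, shown only to make the prediction path concrete (an assumption about its semantics, not the streamDM implementation), would count votes per class index and return the most frequent one:

object MajorityVoteSketch {
  // Return the class index (0 until numClasses) that received the most votes.
  def majorityVote(predictions: Array[Double], numClasses: Int): Double = {
    val counts = new Array[Int](numClasses)
    predictions.foreach(p => counts(p.toInt) += 1)
    counts.indices.maxBy(i => counts(i)).toDouble
  }

  def main(args: Array[String]): Unit =
    println(majorityVote(Array(1.0, 0.0, 1.0, 2.0, 1.0), numClasses = 3)) // prints 1.0
}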
Example 138
Source File: DiscreteAliasSamplerSpec.scala From bidirectional-random-walk with MIT License | 5 votes |
package soal.util import org.scalatest.FlatSpec import org.scalatest.Matchers import java.util.Random class DiscreteAliasSamplerSpec extends FlatSpec with Matchers { val random = new Random(1) def testDistribution(unnormalizedProbabilities: Array[Float], values: Seq[Int], nSamples: Int = 10000 ): Unit = { val probabilities = unnormalizedProbabilities map { _ / unnormalizedProbabilities.sum } val n = unnormalizedProbabilities.size val valueToIndex = (values zip (0 until n)).toMap val sampler = new DiscreteAliasSampler(values, unnormalizedProbabilities, random) val sampleCounts = Array.fill(n)(0) val tol = 4.0f / math.sqrt(nSamples).toFloat for (i <- 0 until nSamples) { val v = sampler.sample() sampleCounts(valueToIndex(v)) += 1 } for (i <- 0 until n) { sampleCounts(i).toFloat / nSamples should equal (probabilities(i) +- tol) } def f(v: Int): Float = v.toFloat * v.toFloat // compute expectation of v => v^2 val trueExpectation = ((probabilities zip values) map { case (p, v) => p * v * v }).sum sampler.expectation(f) shouldEqual (trueExpectation +- trueExpectation * 1.00001f) } "A Discrete Distribution" should "support sampling" in { testDistribution(Array(575.6355f, 89.733475f, 86.90718f, 721.26416f), Array(2, 3, 5, 7)) testDistribution(Array(2.0f, 5.0f, 3.0f), Array(17, 11, 13)) testDistribution(Array(1.0f, 1.0f, 1.0f, 1.0f), Array(-2, 3, -5, 7)) testDistribution(Array(0.9f, 0.1f), Array(19, 17)) an[IllegalArgumentException] should be thrownBy { new DiscreteAliasSampler(Array(1), Array(1.0f, 2.0f)) } } }
Example 139
Source File: BidirectionalPPREstimatorSpec.scala From bidirectional-random-walk with MIT License | 5 votes |
package soal.ppr import java.util.Random import co.teapot.graph.ConcurrentHashMapDynamicGraph import org.scalatest.{FlatSpec, Matchers} import scala.collection.mutable import scala.io.Source class BidirectionalPPREstimatorSpec extends FlatSpec with Matchers { val graph = ConcurrentHashMapDynamicGraph.readGraph("src/test/resources/test_graph.txt") val teleportProb = 0.2f val random = new Random(2) // Seed for consistent tests val estimator = new BidirectionalPPREstimator(graph, teleportProb, random) val truePPRs = BidirectionalPPREstimatorSpec.testGraphTruePPRs "BidirectionalPPRSearcher.estimateInversePPR" should "be correct on the test graph" in { val pprErrorTolerance = 2.0e-6f for (((s, t), truePPR) <- truePPRs) { val inversePPRs = estimator.estimatePPRToTarget(t, pprErrorTolerance) withClue (s"Testing Pair ($s, $t)") { inversePPRs(s) should equal (truePPR +- pprErrorTolerance) } } } "BidirectionalPPRSearcher.estimatePPR" should "be correct on the test graph" in { val relativeError = 0.01f val stPairs = Array(0 -> 1, 2 -> 3, 5 -> 9, 0 -> 0) for ((s, t) <- stPairs) { withClue (s"Testing Pair ($s, $t)") { estimator.estimatePPRSingleSource(s, t, 0.03f, relativeError) should equal ( truePPRs((s, t)) +- truePPRs((s, t)) * relativeError * 2) } } } } object BidirectionalPPREstimatorSpec { def testGraphTruePPRs: collection.Map[(Int, Int), Float] = { val pprMap = new mutable.HashMap[(Int, Int), Float] { override def default(key: (Int, Int)) = 0.0f } for (line <- Source.fromFile("src/test/resources/test_graph_true_pprs.txt").getLines()) { val pieces = line.split("\t") val (startId, targetId, truePPR) = (pieces(0).toInt, pieces(1).toInt, pieces(2).toFloat) pprMap((startId, targetId)) = truePPR } pprMap } }
Example 140
Source File: CsvKafkaPublisher.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.cloudera.sa.taxi360.common import java.io.File import java.util.Random import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} import scala.io.Source object CsvKafkaPublisher { var counter = 0 var salts = 0 def main(args:Array[String]): Unit = { if (args.length == 0) { println("<brokerList> " + "<topicName> " + "<dataFolderOrFile> " + "<sleepPerRecord> " + "<acks> " + "<linger.ms> " + "<producer.type> " + "<batch.size> " + "<salts>") return } val kafkaBrokerList = args(0) val kafkaTopicName = args(1) val nyTaxiDataFolder = args(2) val sleepPerRecord = args(3).toInt val acks = args(4).toInt val lingerMs = args(5).toInt val producerType = args(6) //"async" val batchSize = args(7).toInt salts = args(8).toInt val kafkaProducer = KafkaProducerUntil.getNewProducer(kafkaBrokerList, acks, lingerMs, producerType, batchSize) println("--Input:" + nyTaxiDataFolder) val dataFolder = new File(nyTaxiDataFolder) if (dataFolder.isDirectory) { val files = dataFolder.listFiles().iterator files.foreach(f => { println("--Input:" + f) processFile(f, kafkaTopicName, kafkaProducer, sleepPerRecord) }) } else { println("--Input:" + dataFolder) processFile(dataFolder, kafkaTopicName, kafkaProducer, sleepPerRecord) } println("---Done") } def processFile(file:File, kafkaTopicName:String, kafkaProducer: KafkaProducer[String, String], sleepPerRecord:Int): Unit = { var counter = 0 val r = new Random() println("-Starting Reading") Source.fromFile(file).getLines().foreach(l => { counter += 1 if (counter % 10000 == 0) { println("{Sent:" + counter + "}") } if (counter % 100 == 0) { print(".") } Thread.sleep(sleepPerRecord) val saltedVender = r.nextInt(salts) + l if (counter > 2) { publishTaxiRecord(saltedVender, kafkaTopicName, kafkaProducer) } }) } def publishTaxiRecord(line:String, kafkaTopicName:String, kafkaProducer: KafkaProducer[String, String]): Unit = { if (line.startsWith("vendor_name") || line.length < 10) { println("skip") } else { val message = new ProducerRecord[String, String](kafkaTopicName, line.hashCode.toString, line) kafkaProducer.send(message) } } }
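KafkaProducerUntil.getNewProducer is not included in this excerpt; a hedged guess at a minimal equivalent, building a String/String producer from the same parameters (an assumption, not the project's actual helper — only standard Kafka producer config keys are used):

import java.util.Properties
import org.apache.kafka.clients.producer.KafkaProducer

object KafkaProducerSketch {
  def getNewProducer(brokerList: String,
                     acks: Int,
                     lingerMs: Int,
                     producerType: String,
                     batchSize: Int): KafkaProducer[String, String] = {
    val props = new Properties()
    props.put("bootstrap.servers", brokerList)
    props.put("acks", acks.toString)
    props.put("linger.ms", lingerMs.toString)
    props.put("batch.size", batchSize.toString)
    // producerType ("sync"/"async") was an old-producer setting; kept only for signature parity.
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    new KafkaProducer[String, String](props)
  }
}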
Example 141
package edu.neu.coe.csye._7200.ga import java.util.Random trait RNG[+A] { def next: RNG[A] def value: A } abstract class RNG_Java[+A](n: Long) extends RNG[A] { // must be overridden by sub-classes def value: A def newRNG(n: Long): RNG[A] // may be overridden (if you want to define your own pseudo-random sequence) def nextSeed: Long = RNG_Java.nextSeed(n) // base method -- not normally overridden def next: RNG[A] = newRNG(nextSeed) def state = n } object RNG_Java { def nextSeed(n: Long): Long = new Random(n).nextLong } case class LongRNG(n: Long) extends RNG_Java[Long](n) { def newRNG(n: Long): RNG[Long] = LongRNG(n) def value = n } case class DoubleRNG(n: Long) extends RNG_Java[Double](n) { def newRNG(n: Long) = DoubleRNG(n) def value = n.toDouble/Long.MaxValue override def toString = s"DoubleRNG: $n->$value" } case class UniformDouble(x: Double) extends AnyVal with Ordered[UniformDouble] { def + (y: Double) = x + y def compare(that: UniformDouble): Int = x.compare(that.x) } object DoubleRNG { def apply: RNG[Double] = DoubleRNG(System.currentTimeMillis()) } object UniformDoubleRNG { def apply: RNG[UniformDouble] = UniformDoubleRNG(System.currentTimeMillis()) implicit val u: Unit = Unit } object GaussianRNG { def apply: RNG[(Double,Double)] = GaussianRNG(System.currentTimeMillis()) } object UniformDouble { def create(x: Double)(implicit y: Unit) = if (x>=0 && x<=1) new UniformDouble(x) else throw new IllegalArgumentException(s"$x is not in range 0..1") def + (x: Double, y: UniformDouble) = y+x }
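A brief usage sketch of the RNG family above: next returns a new immutable generator derived from the previous seed, so the same starting seed reproduces the same sequence (uses only types defined in this example):

object RNGDemo {
  def main(args: Array[String]): Unit = {
    val r0: RNG[Long] = LongRNG(42L)
    val r1 = r0.next
    val r2 = r1.next
    // Deterministic: rerunning with seed 42 prints the same three values.
    println(Seq(r0.value, r1.value, r2.value))
  }
}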
Example 142
package edu.neu.coe.csye._7200 package rng import java.util.Random trait RNG[+A] { def next: RNG[A] def value: A } abstract class RNG_Java[+A](n: Long) extends RNG[A] { // must be overridden by sub-classes def value: A def newRNG(n: Long): RNG_Java[A] // may be overridden (if you want to define your own pseudo-random sequence) def nextSeed: Long = RNG_Java.nextSeed(n) // base method -- not normally overridden def next: RNG_Java[A] = newRNG(nextSeed) def state = n } object RNG_Java { def nextSeed(n: Long): Long = new Random(n).nextLong } case class LongRNG(n: Long) extends RNG_Java[Long](n) { def newRNG(n: Long) = ??? def value = ??? } case class DoubleRNG(n: Long) extends RNG_Java[Double](n) { def newRNG(n: Long) = ??? def value = ??? override def toString = s"DoubleRNG: $n->$value" } case class UniformDouble(x: Double) { def + (y: Double) = x + y } object UniformDoubleRNG { def apply: RNG[UniformDouble] = UniformDoubleRNG(System.currentTimeMillis()) } object GaussianRNG { def apply: RNG[(Double,Double)] = GaussianRNG(System.currentTimeMillis()) } object UniformDouble { def apply(x: Double, y: Unit): UniformDouble = if (x>=0 && x<=1) new UniformDouble(x) else throw new IllegalArgumentException(s"$x is not in range 0..1") def + (x: Double, y: UniformDouble) = y+x }
Example 143
Source File: ProbabilityDistributionTest.scala From ScalphaGoZero with Apache License 2.0 | 5 votes |
package org.deeplearning4j.scalphagozero.agents import java.util.Random import org.scalatest.funspec.AnyFunSpec class ProbabilityDistributionTest extends AnyFunSpec { describe("Select from a distribution") { it("should be low index if distribution skewed low") { val dist = createPDist(Array(0.9, 0.8, 0.5, 0.3, 0.2, 0.1, 0.01, 0.001)) assert(dist.selectRandomIdx() == 2) assert(dist.selectRandomIdx() == 1) assert(dist.selectRandomIdx() == 0) } it("should be high index if distribution skewed high") { val dist = createPDist(Array(0.001, 0.01, 0.1, 0.3, 0.8, 0.5, 0.8, 0.9)) assert(dist.selectRandomIdx() == 6) } it("should be highest index if distribution skewed very high") { val dist = createPDist(Array(0.001, 0.01, 0.01, 0.01, 0.01, 0.1, 0.9)) assert(dist.selectRandomIdx() == 6) } it("should be near middle if gaussian distribution") { val dist = createPDist(Array(0.001, 0.01, 0.1, 0.3, 0.6, 0.8, 0.9, 0.9, 0.8, 0.55, 0.4, 0.2, 0.05, 0.01)) assert(dist.selectRandomIdx() == 8) } it("random if uniform distribution") { val dist = createPDist(Array(0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2)) assert(dist.selectRandomIdx() == 5) } it("should be 0 index if distribution has only 1 0 value") { val dist = createPDist(Array(0.0)) assert(dist.selectRandomIdx() == 0) } } private def createPDist(a: Array[Double]) = ProbabilityDistribution(a, new Random(1)) }
Example 144
Source File: SparkHdfsLR.scala From learning-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{Vector, DenseVector} import org.apache.hadoop.conf.Configuration import org.apache.spark._ import org.apache.spark.scheduler.InputFormatInfo object SparkHdfsLR { val D = 10 // Numer of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val tok = new java.util.StringTokenizer(line, " ") var y = tok.nextToken.toDouble var x = new Array[Double](D) var i = 0 while (i < D) { x(i) = tok.nextToken.toDouble; i += 1 } DataPoint(new DenseVector(x), y) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } def main(args: Array[String]) { if (args.length < 2) { System.err.println("Usage: SparkHdfsLR <file> <iters>") System.exit(1) } showWarning() val sparkConf = new SparkConf().setAppName("SparkHdfsLR") val inputPath = args(0) val conf = new Configuration() val sc = new SparkContext(sparkConf, InputFormatInfo.computePreferredLocations( Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath)) )) val lines = sc.textFile(inputPath) val points = lines.map(parsePoint _).cache() val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) sc.stop() } }
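The map/reduce body above is the usual logistic-regression gradient, x * (1 / (1 + e^(-y * w.x)) - 1) * y, summed over points. A small local sketch of one gradient step using the same Breeze operations (no HDFS or Spark; the two data points are made up for illustration):

import java.util.Random
import scala.math.exp
import breeze.linalg.{DenseVector, Vector}

object LocalGradientStepSketch {
  case class DataPoint(x: Vector[Double], y: Double)

  def main(args: Array[String]): Unit = {
    val rand = new Random(42)
    val D = 3
    val points = Seq(
      DataPoint(DenseVector(1.0, 0.5, -0.2), 1.0),
      DataPoint(DenseVector(-0.3, 0.8, 0.1), -1.0))
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    // One gradient-descent step, same formula as the Spark reduce above.
    val gradient = points.map { p =>
      p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
    }.reduce(_ + _)
    w -= gradient
    println("w after one step: " + w)
  }
}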
Example 145
Source File: SparkLR.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{Vector, DenseVector} import org.apache.spark._ object SparkLR { val N = 10000 // Number of data points val D = 10 // Numer of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData: Array[DataPoint] = { def generatePoint(i: Int): DataPoint = { val y = if (i % 2 == 0) -1 else 1 val x = DenseVector.fill(D){rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val sparkConf = new SparkConf().setAppName("SparkLR") val sc = new SparkContext(sparkConf) val numSlices = if (args.length > 0) args(0).toInt else 2 val points = sc.parallelize(generateData, numSlices).cache() // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) sc.stop() } } // scalastyle:on println
Example 146
Source File: LocalKMeans.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{Vector, DenseVector, squaredDistance} import org.apache.spark.SparkContext._ object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData: Array[DenseVector[Double]] = { def generatePoint(i: Int): DenseVector[Double] = { DenseVector.fill(D){rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var index = 0 var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers.get(i).get val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use the KMeans method found in org.apache.spark.mllib.clustering |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData var points = new HashSet[Vector[Double]] var kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println("Initial centers: " + kPoints) while(tempDist > convergeDist) { var closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) var mappings = closest.groupBy[Int] (x => x._1) var pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println("Final centers: " + kPoints) } } // scalastyle:on println
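closestPoint above scans the centers map (keyed 1..K) and returns the key of the nearest center by squared Euclidean distance. A compact standalone check of that helper's logic (illustrative only):

import breeze.linalg.{DenseVector, squaredDistance}
import scala.collection.mutable.HashMap

object ClosestPointSketch {
  def main(args: Array[String]): Unit = {
    val centers = HashMap[Int, DenseVector[Double]](
      1 -> DenseVector(0.0, 0.0),
      2 -> DenseVector(5.0, 5.0))
    val p = DenseVector(4.0, 4.5)
    // Same criterion as closestPoint: smallest squared distance wins.
    val best = centers.minBy { case (_, c) => squaredDistance(p, c) }._1
    println(s"closest center index: $best") // expected: 2
  }
}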
Example 147
Source File: StopwatchSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.util import java.util.Random import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.util.MLlibTestSparkContext class StopwatchSuite extends SparkFunSuite with MLlibTestSparkContext { import StopwatchSuite._ private def testStopwatchOnDriver(sw: Stopwatch): Unit = { assert(sw.name === "sw") assert(sw.elapsed() === 0L) assert(!sw.isRunning) intercept[AssertionError] { sw.stop() } val duration = checkStopwatch(sw) val elapsed = sw.elapsed() assert(elapsed === duration) val duration2 = checkStopwatch(sw) val elapsed2 = sw.elapsed() assert(elapsed2 === duration + duration2) assert(sw.toString === s"sw: ${elapsed2}ms") sw.start() assert(sw.isRunning) intercept[AssertionError] { sw.start() } } test("LocalStopwatch") { val sw = new LocalStopwatch("sw") testStopwatchOnDriver(sw) } test("DistributedStopwatch on driver") { val sw = new DistributedStopwatch(sc, "sw") testStopwatchOnDriver(sw) } test("DistributedStopwatch on executors") { val sw = new DistributedStopwatch(sc, "sw") val rdd = sc.parallelize(0 until 4, 4) val acc = sc.accumulator(0L) rdd.foreach { i => acc += checkStopwatch(sw) } assert(!sw.isRunning) val elapsed = sw.elapsed() assert(elapsed === acc.value) } test("MultiStopwatch") { val sw = new MultiStopwatch(sc) .addLocal("local") .addDistributed("spark") assert(sw("local").name === "local") assert(sw("spark").name === "spark") intercept[NoSuchElementException] { sw("some") } assert(sw.toString === "{\n local: 0ms,\n spark: 0ms\n}") val localDuration = checkStopwatch(sw("local")) val sparkDuration = checkStopwatch(sw("spark")) val localElapsed = sw("local").elapsed() val sparkElapsed = sw("spark").elapsed() assert(localElapsed === localDuration) assert(sparkElapsed === sparkDuration) assert(sw.toString === s"{\n local: ${localElapsed}ms,\n spark: ${sparkElapsed}ms\n}") val rdd = sc.parallelize(0 until 4, 4) val acc = sc.accumulator(0L) rdd.foreach { i => sw("local").start() val duration = checkStopwatch(sw("spark")) sw("local").stop() acc += duration } val localElapsed2 = sw("local").elapsed() assert(localElapsed2 === localElapsed) val sparkElapsed2 = sw("spark").elapsed() assert(sparkElapsed2 === sparkElapsed + acc.value) } } private object StopwatchSuite extends SparkFunSuite { private def now: Long = System.currentTimeMillis() }
Example 148
Source File: SampledRDD.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.util.Random import scala.reflect.ClassTag import org.apache.commons.math3.distribution.PoissonDistribution import org.apache.spark.{Partition, TaskContext} @deprecated("Replaced by PartitionwiseSampledRDDPartition", "1.0.0") private[spark] class SampledRDDPartition(val prev: Partition, val seed: Int) extends Partition with Serializable { override val index: Int = prev.index } @deprecated("Replaced by PartitionwiseSampledRDD", "1.0.0") private[spark] class SampledRDD[T: ClassTag]( prev: RDD[T], withReplacement: Boolean, frac: Double, seed: Int) extends RDD[T](prev) { override def getPartitions: Array[Partition] = { val rg = new Random(seed) firstParent[T].partitions.map(x => new SampledRDDPartition(x, rg.nextInt)) } override def getPreferredLocations(split: Partition): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[SampledRDDPartition].prev) override def compute(splitIn: Partition, context: TaskContext): Iterator[T] = { val split = splitIn.asInstanceOf[SampledRDDPartition] if (withReplacement) { // For large datasets, the expected number of occurrences of each element in a sample with // replacement is Poisson(frac). We use that to get a count for each element. val poisson = new PoissonDistribution(frac) poisson.reseedRandomGenerator(split.seed) firstParent[T].iterator(split.prev, context).flatMap { element => val count = poisson.sample() if (count == 0) { Iterator.empty // Avoid object allocation when we return 0 items, which is quite often } else { Iterator.fill(count)(element) } } } else { // Sampling without replacement val rand = new Random(split.seed) firstParent[T].iterator(split.prev, context).filter(x => (rand.nextDouble <= frac)) } } }
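The with-replacement branch above turns sampling into drawing a Poisson(frac) occurrence count per element. A minimal illustration of that trick with commons-math3, outside of Spark:

import org.apache.commons.math3.distribution.PoissonDistribution

object PoissonSampleSketch {
  def main(args: Array[String]): Unit = {
    val frac = 0.5
    val poisson = new PoissonDistribution(frac)
    poisson.reseedRandomGenerator(42L)
    val elements = Seq("a", "b", "c", "d")
    // Each element appears 0, 1, 2, ... times according to its sampled count.
    val sampled = elements.flatMap { e =>
      val count = poisson.sample()
      if (count == 0) Iterator.empty else Iterator.fill(count)(e)
    }
    println(sampled)
  }
}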
Example 149
Source File: PartitionwiseSampledRDD.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.util.Random import scala.reflect.ClassTag import org.apache.spark.{Partition, TaskContext} import org.apache.spark.util.random.RandomSampler import org.apache.spark.util.Utils private[spark] class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long) extends Partition with Serializable { override val index: Int = prev.index } private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag]( prev: RDD[T], sampler: RandomSampler[T, U], preservesPartitioning: Boolean, @transient private val seed: Long = Utils.random.nextLong) extends RDD[U](prev) { @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None override def getPartitions: Array[Partition] = { val random = new Random(seed) firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong())) } override def getPreferredLocations(split: Partition): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev) override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = { val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition] val thisSampler = sampler.clone thisSampler.setSeed(split.seed) thisSampler.sample(firstParent[T].iterator(split.prev, context)) } }
Example 150
Source File: SimpleSkewedGroupByTest.scala From learning-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.examples import java.util.Random import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.SparkContext._ object SimpleSkewedGroupByTest { def main(args: Array[String]) { val sparkConf = new SparkConf().setAppName("SimpleSkewedGroupByTest") var numMappers = if (args.length > 0) args(0).toInt else 2 var numKVPairs = if (args.length > 1) args(1).toInt else 1000 var valSize = if (args.length > 2) args(2).toInt else 1000 var numReducers = if (args.length > 3) args(3).toInt else numMappers var ratio = if (args.length > 4) args(4).toInt else 5.0 val sc = new SparkContext(sparkConf) val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p => val ranGen = new Random var result = new Array[(Int, Array[Byte])](numKVPairs) for (i <- 0 until numKVPairs) { val byteArr = new Array[Byte](valSize) ranGen.nextBytes(byteArr) val offset = ranGen.nextInt(1000) * numReducers if (ranGen.nextDouble < ratio / (numReducers + ratio - 1)) { // give ratio times higher chance of generating key 0 (for reducer 0) result(i) = (offset, byteArr) } else { // generate a key for one of the other reducers val key = 1 + ranGen.nextInt(numReducers-1) + offset result(i) = (key, byteArr) } } result }.cache // Enforce that everything has been calculated and in cache pairs1.count println("RESULT: " + pairs1.groupByKey(numReducers).count) // Print how many keys each reducer got (for debugging) // println("RESULT: " + pairs1.groupByKey(numReducers) // .map{case (k,v) => (k, v.size)} // .collectAsMap) sc.stop() } }
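The key-generation branch above gives reducer 0 a ratio-times-higher share of keys than any other reducer. A small standalone check of that probability (illustrative, no Spark):

import java.util.Random

object SkewCheck {
  def main(args: Array[String]): Unit = {
    val ranGen = new Random(7)
    val numReducers = 4
    val ratio = 5.0
    val draws = 100000
    val keyZeroHits = (1 to draws).count { _ =>
      ranGen.nextDouble < ratio / (numReducers + ratio - 1)
    }
    // Expect roughly ratio / (numReducers + ratio - 1) = 5/8 of draws to target reducer 0.
    println(s"fraction routed to reducer 0: ${keyZeroHits.toDouble / draws}")
  }
}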
Example 151
Source File: SparkTachyonHdfsLR.scala From learning-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{Vector, DenseVector} import org.apache.hadoop.conf.Configuration import org.apache.spark._ import org.apache.spark.scheduler.InputFormatInfo import org.apache.spark.storage.StorageLevel object SparkTachyonHdfsLR { val D = 10 // Numer of dimensions val rand = new Random(42) def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val tok = new java.util.StringTokenizer(line, " ") var y = tok.nextToken.toDouble var x = new Array[Double](D) var i = 0 while (i < D) { x(i) = tok.nextToken.toDouble; i += 1 } DataPoint(new DenseVector(x), y) } def main(args: Array[String]) { showWarning() val inputPath = args(0) val sparkConf = new SparkConf().setAppName("SparkTachyonHdfsLR") val conf = new Configuration() val sc = new SparkContext(sparkConf, InputFormatInfo.computePreferredLocations( Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath)) )) val lines = sc.textFile(inputPath) val points = lines.map(parsePoint _).persist(StorageLevel.OFF_HEAP) val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) sc.stop() } }
Example 152
Source File: SkewedGroupByTest.scala From learning-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.examples import java.util.Random import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.SparkContext._ object SkewedGroupByTest { def main(args: Array[String]) { val sparkConf = new SparkConf().setAppName("GroupBy Test") var numMappers = if (args.length > 0) args(0).toInt else 2 var numKVPairs = if (args.length > 1) args(1).toInt else 1000 var valSize = if (args.length > 2) args(2).toInt else 1000 var numReducers = if (args.length > 3) args(3).toInt else numMappers val sc = new SparkContext(sparkConf) val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p => val ranGen = new Random // map output sizes linearly increase from the 1st to the last numKVPairs = (1.0 * (p + 1) / numMappers * numKVPairs).toInt var arr1 = new Array[(Int, Array[Byte])](numKVPairs) for (i <- 0 until numKVPairs) { val byteArr = new Array[Byte](valSize) ranGen.nextBytes(byteArr) arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr) } arr1 }.cache() // Enforce that everything has been calculated and in cache pairs1.count() println(pairs1.groupByKey(numReducers).count()) sc.stop() } }
Example 153
Source File: LocalFileLR.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import breeze.linalg.{Vector, DenseVector} object LocalFileLR { val D = 10 // Numer of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val nums = line.split(' ').map(_.toDouble) DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0)) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val lines = scala.io.Source.fromFile(args(0)).getLines().toArray val points = lines.map(parsePoint _) val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) var gradient = DenseVector.zeros[Double](D) for (p <- points) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println("Final w: " + w) } } // scalastyle:on println
Example 154
Source File: LocalLR.scala From learning-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.examples import java.util.Random import breeze.linalg.{Vector, DenseVector} object LocalLR { val N = 10000 // Number of data points val D = 10 // Number of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData = { def generatePoint(i: Int) = { val y = if(i % 2 == 0) -1 else 1 val x = DenseVector.fill(D){rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) var gradient = DenseVector.zeros[Double](D) for (p <- data) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println("Final w: " + w) } }
Example 155
Source File: GroupByTest.scala From learning-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.examples import java.util.Random import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.SparkContext._ object GroupByTest { def main(args: Array[String]) { val sparkConf = new SparkConf().setAppName("GroupBy Test") var numMappers = if (args.length > 0) args(0).toInt else 2 var numKVPairs = if (args.length > 1) args(1).toInt else 1000 var valSize = if (args.length > 2) args(2).toInt else 1000 var numReducers = if (args.length > 3) args(3).toInt else numMappers val sc = new SparkContext(sparkConf) val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p => val ranGen = new Random var arr1 = new Array[(Int, Array[Byte])](numKVPairs) for (i <- 0 until numKVPairs) { val byteArr = new Array[Byte](valSize) ranGen.nextBytes(byteArr) arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr) } arr1 }.cache() // Enforce that everything has been calculated and in cache pairs1.count() println(pairs1.groupByKey(numReducers).count()) sc.stop() } }
Example 156
Source File: LocalFileLR.scala From learning-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.examples import java.util.Random import breeze.linalg.{Vector, DenseVector} object LocalFileLR { val D = 10 // Numer of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val nums = line.split(' ').map(_.toDouble) DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0)) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val lines = scala.io.Source.fromFile(args(0)).getLines().toArray val points = lines.map(parsePoint _) val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) var gradient = DenseVector.zeros[Double](D) for (p <- points) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println("Final w: " + w) } }
Example 157
Source File: SparkLR.scala From learning-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{Vector, DenseVector} import org.apache.spark._ object SparkLR { val N = 10000 // Number of data points val D = 10 // Numer of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData = { def generatePoint(i: Int) = { val y = if(i % 2 == 0) -1 else 1 val x = DenseVector.fill(D){rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val sparkConf = new SparkConf().setAppName("SparkLR") val sc = new SparkContext(sparkConf) val numSlices = if (args.length > 0) args(0).toInt else 2 val points = sc.parallelize(generateData, numSlices).cache() // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) sc.stop() } }
Example 158
Source File: LocalKMeans.scala From learning-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{Vector, DenseVector, squaredDistance} import org.apache.spark.SparkContext._ object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData = { def generatePoint(i: Int) = { DenseVector.fill(D){rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var index = 0 var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers.get(i).get val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use the KMeans method found in org.apache.spark.mllib.clustering |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData var points = new HashSet[Vector[Double]] var kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println("Initial centers: " + kPoints) while(tempDist > convergeDist) { var closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) var mappings = closest.groupBy[Int] (x => x._1) var pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (x1, y1)), (id2, (x2, y2))) => (id1, (x1 + x2, y1 + y2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println("Final centers: " + kPoints) } }
Example 159
Source File: GroupByKey.scala From learning-spark with Apache License 2.0 | 5 votes |
package com.javachen.spark.examples.rdd import java.util.Random import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.SparkContext._ object GroupByKey { def main(args: Array[String]) { val sparkConf = new SparkConf().setAppName("GroupBy Test").setMaster("local[2]") var numMappers = 10 var numKVPairs = 100 var valSize = 100 var numReducers = 3 val sc = new SparkContext(sparkConf) val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p => val ranGen = new Random var arr1 = new Array[(Int, Array[Byte])](numKVPairs) for (i <- 0 until numKVPairs) { val byteArr = new Array[Byte](valSize) ranGen.nextBytes(byteArr) arr1(i) = (ranGen.nextInt(10), byteArr) } arr1 }.cache // Enforce that everything has been calculated and in cache pairs1.count val result = pairs1.groupByKey(numReducers) println(result.count) println(result.toDebugString) sc.stop() } }
Example 160
Source File: UserRepositoryInMemoryInterpreter.scala From scala-pet-store with Apache License 2.0 | 5 votes |
package io.github.pauljamescleary.petstore package infrastructure.repository.inmemory import java.util.Random import cats.implicits._ import cats.Applicative import cats.data.OptionT import domain.users.{User, UserRepositoryAlgebra} import tsec.authentication.IdentityStore import scala.collection.concurrent.TrieMap class UserRepositoryInMemoryInterpreter[F[_]: Applicative] extends UserRepositoryAlgebra[F] with IdentityStore[F, Long, User] { private val cache = new TrieMap[Long, User] private val random = new Random def create(user: User): F[User] = { val id = random.nextLong val toSave = user.copy(id = id.some) cache += (id -> toSave) toSave.pure[F] } def update(user: User): OptionT[F, User] = OptionT { user.id.traverse { id => cache.update(id, user) user.pure[F] } } def get(id: Long): OptionT[F, User] = OptionT.fromOption(cache.get(id)) def delete(id: Long): OptionT[F, User] = OptionT.fromOption(cache.remove(id)) def findByUserName(userName: String): OptionT[F, User] = OptionT.fromOption(cache.values.find(u => u.userName == userName)) def list(pageSize: Int, offset: Int): F[List[User]] = cache.values.toList.sortBy(_.lastName).slice(offset, offset + pageSize).pure[F] def deleteByUserName(userName: String): OptionT[F, User] = OptionT.fromOption( for { user <- cache.values.find(u => u.userName == userName) removed <- cache.remove(user.id.get) } yield removed, ) } object UserRepositoryInMemoryInterpreter { def apply[F[_]: Applicative]() = new UserRepositoryInMemoryInterpreter[F] }
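A hedged usage sketch: with cats' Id as the effect type the algebra's methods return plain values. This assumes the Applicative[Id] instance that cats provides in implicit scope, and calls only methods defined above:

import cats.Id

object InMemoryRepoDemo {
  def main(args: Array[String]): Unit = {
    val repo = UserRepositoryInMemoryInterpreter[Id]()
    // With F = Id, list returns a plain List[User].
    val firstPage = repo.list(pageSize = 10, offset = 0)
    println(s"users currently stored: ${firstPage.size}")
  }
}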
Example 161
Source File: PigFuncs.scala From piglet with Apache License 2.0 | 5 votes |
package dbis.piglet.backends.flink import java.util.Random import dbis.piglet.CommonPigFuncs import dbis.piglet.backends._ import org.apache.flink.api.common.typeinfo.TypeInformation import org.apache.flink.api.java.functions._ import org.apache.flink.api.scala._ import scala.reflect.ClassTag class CustomSampler[T <: SchemaClass: ClassTag: TypeInformation](dataSet: DataSet[T]) { def sample(withReplacement: Boolean, fraction: Double, seed: Long = new Random().nextLong()) = { dataSet.mapPartition(new SampleWithFraction[T](withReplacement, fraction, seed)) } } object Sampler { implicit def addSampler[T <: SchemaClass: ClassTag: TypeInformation](dataSet: DataSet[T]) = { new CustomSampler(dataSet) } } object PigFuncs extends CommonPigFuncs { }
Example 162
Source File: StreamingPredictionsSpec.scala From odsc-east-realish-predictions with Apache License 2.0 | 4 votes |
package com.twilio.open.odsc.realish import java.sql.Timestamp import java.time.Instant import java.util.{Random, UUID} import org.apache.spark.SparkConf import org.apache.spark.sql.{Encoders, SQLContext, SparkSession} import org.scalatest.{FunSuite, Matchers} import org.apache.spark.sql.execution.streaming.MemoryStream import org.apache.spark.sql.functions._ import org.apache.spark.sql.streaming.{OutputMode, Trigger} import scala.concurrent.duration._ class StreamingPredictionsSpec extends FunSuite with Matchers with SharedSparkSql { override def conf: SparkConf = { new SparkConf() .setMaster("local[*]") .setAppName("odsc-spark-utils") .set("spark.ui.enabled", "false") .set("spark.app.id", appID) .set("spark.driver.host", "localhost") .set("spark.sql.session.timeZone", "UTC") } final val notRandomRandom = { val generator = new Random generator.setSeed(100L) generator } test("should stream in some mock data for fun") { implicit val spark: SparkSession = sparkSql import spark.implicits._ implicit val sqlContext: SQLContext = spark.sqlContext implicit val metricEncoder = Encoders.product[Metric] val metricData = MemoryStream[Metric] val startingInstant = Instant.now() val backingData = (1 to 10000).map(offset => { val metric = if (offset % 2 == 0) "loss_percentage" else "connect_duration" val nextLoss = notRandomRandom.nextDouble() * notRandomRandom.nextInt(100) Metric( Timestamp.from(startingInstant.minusSeconds(offset)), UUID.randomUUID().toString, metric, value = if (metric == "loss_percentage") nextLoss else notRandomRandom.nextDouble() * notRandomRandom.nextInt(240), countryCode = if (offset % 8 == 0) "US" else "BR", callDirection = if (metric == "loss_percentage") "inbound" else "outbound" ) }) val processingTimeTrigger = Trigger.ProcessingTime(2.seconds) val streamingQuery = metricData.toDF() .withWatermark("timestamp", "2 hours") .groupBy(col("metric"), col("countryCode"), window($"timestamp", "5 minutes")) .agg( min("value") as "min", avg("value") as "mean", max("value") as "max", count("*") as "total" ) .writeStream .format("memory") .queryName("datastream") .outputMode(OutputMode.Append()) .trigger(processingTimeTrigger) .start() metricData.addData(backingData) streamingQuery.processAllAvailable() spark.sql("select * from datastream").show(20, false) val checkChange = spark.sql("select * from datastream") .groupBy("metric","countryCode") .agg( sum("total") as "total", avg("mean") as "mean" ) checkChange.show(20, false) // now can do interesting things with minor back tracking... streamingQuery.stop() } }