org.apache.spark.mllib.linalg.distributed.IndexedRow Scala Examples

The following examples show how to use org.apache.spark.mllib.linalg.distributed.IndexedRow. Each example is drawn from an open source project; the source file and license are noted above each listing.
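All of the examples share the same core pattern: an IndexedRow pairs a Long row index with a local Vector, and an RDD of IndexedRows backs an IndexedRowMatrix. A minimal sketch of that pattern, assuming an active SparkContext named sc (the values below are illustrative, not taken from any of the projects):

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.{IndexedRow, IndexedRowMatrix}

// an IndexedRow is simply (index: Long, vector: Vector)
val rows = sc.parallelize(Seq(
  IndexedRow(0L, Vectors.dense(1.0, 2.0)),
  IndexedRow(1L, Vectors.dense(3.0, 4.0))))
val matrix = new IndexedRowMatrix(rows) // distributed matrix that keeps row indices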
Example 1
Source File: T9-4DataTypes.scala    From prosparkstreaming   with Apache License 2.0
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Matrices
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix
import org.apache.spark.mllib.linalg.distributed.IndexedRow
import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix
import org.apache.spark.mllib.linalg.distributed.MatrixEntry
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

object DataTypesApp {

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: DataTypesApp <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val substream = ssc.socketTextStream(hostname, port.toInt)
      .filter(!_.contains("NaN"))
      .map(_.split(" "))
      .filter(f => f(1) != "0")
      .map(_.map(_.toDouble))

    val denseV = substream.map(f => Vectors.dense(f.slice(1, 5)))
    denseV.print()
    // note: the vector dimension is the slice length (4), not the count of non-zero entries
    val sparseV = substream.map(f => f.slice(1, 5).toList).map(f => f.zipWithIndex.map { case (s, i) => (i, s) })
      .map(f => f.filter(v => v._2 != 0)).map(l => Vectors.sparse(4, l))
    sparseV.print()
    val labeledP = substream.map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5))))
    labeledP.print()
    val denseM = substream.map(f => Matrices.dense(3, 16, f.slice(3, 19) ++ f.slice(20, 36) ++ f.slice(37, 53)))
    denseM.print()
    denseV.foreachRDD(rdd => {
      val rowM = new RowMatrix(rdd)
      println(rowM)
    })
    denseV.foreachRDD(rdd => {
      val iRdd = rdd.zipWithIndex.map(v => new IndexedRow(v._2, v._1))
      val iRowM = new IndexedRowMatrix(iRdd)
      println(iRowM)
    })
    substream.foreachRDD(rdd => {
      // each record yields three 16-column rows (slices at offsets 3, 20 and 37), so an
      // entry's coordinates are (recordIndex * 3 + sliceIndex, positionInSlice)
      val entries = rdd.zipWithIndex.flatMap { case (values, recordIdx) =>
        List(3, 20, 37).zipWithIndex.flatMap { case (offset, sliceIdx) =>
          values.slice(offset, offset + 16).zipWithIndex.map { case (d, colIdx) =>
            new MatrixEntry(recordIdx * 3 + sliceIdx, colIdx.toLong, d)
          }
        }
      }
      val cRowM = new CoordinateMatrix(entries)
      println(cRowM)
    })
    substream.foreachRDD(rdd => {
      val entries = rdd.zipWithIndex.flatMap { case (values, recordIdx) =>
        List(3, 20, 37).zipWithIndex.flatMap { case (offset, sliceIdx) =>
          values.slice(offset, offset + 16).zipWithIndex.map { case (d, colIdx) =>
            new MatrixEntry(recordIdx * 3 + sliceIdx, colIdx.toLong, d)
          }
        }
      }
      val blockM = new CoordinateMatrix(entries).toBlockMatrix
      println(blockM)
    })

    ssc.start()
    ssc.awaitTermination()
  }

} 
Example 2
Source File: SparkMatrix.scala    From Machine-Learning-with-Spark-Second-Edition   with MIT License
package linalg.matrix

import org.apache.spark.ml.linalg.Matrix
import org.apache.spark.ml.linalg.Matrices
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix
import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.distributed.IndexedRow
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.distributed.MatrixEntry

object SparkMatrix {

  def main(args: Array[String]) {

    val dMatrix: Matrix = Matrices.dense(2, 2, Array(1.0, 2.0, 3.0, 4.0))
    println("dMatrix: \n" + dMatrix)

    val sMatrixOne: Matrix = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 2, 1), Array(5, 6, 7))
    println("sMatrixOne: \n" + sMatrixOne)

    val sMatrixTwo: Matrix = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 1, 2), Array(5, 6, 7))
    println("sMatrixTwo: \n" + sMatrixTwo)

    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp")
    val sc = new SparkContext(spConfig)
    val denseData = Seq(
      Vectors.dense(0.0, 1.0, 2.1),
      Vectors.dense(3.0, 2.0, 4.0),
      Vectors.dense(5.0, 7.0, 8.0),
      Vectors.dense(9.0, 0.0, 1.1)
    )
    val sparseData = Seq(
      Vectors.sparse(3, Seq((1, 1.0), (2, 2.1))),
      Vectors.sparse(3, Seq((0, 3.0), (1, 2.0), (2, 4.0))),
      Vectors.sparse(3, Seq((0, 5.0), (1, 7.0), (2, 8.0))),
      Vectors.sparse(3, Seq((0, 9.0), (2, 1.0)))
    )

    val denseMat = new RowMatrix(sc.parallelize(denseData, 2))
    val sparseMat = new RowMatrix(sc.parallelize(sparseData, 2))

    println("Dense Matrix - Num of Rows :" + denseMat.numRows())
    println("Dense Matrix - Num of Cols:" + denseMat.numCols())
    println("Sparse Matrix - Num of Rows :" + sparseMat.numRows())
    println("Sparse Matrix - Num of Cols:" + sparseMat.numCols())

    val data = Seq(
      (0L, Vectors.dense(0.0, 1.0, 2.0)),
      (1L, Vectors.dense(3.0, 4.0, 5.0)),
      (3L, Vectors.dense(9.0, 0.0, 1.0))
    ).map(x => IndexedRow(x._1, x._2))
    val indexedRows: RDD[IndexedRow] = sc.parallelize(data, 2)
    val indexedRowsMat = new IndexedRowMatrix(indexedRows)
    println("Indexed Row Matrix - No of Rows: " + indexedRowsMat.numRows())
    println("Indexed Row Matrix - No of Cols: " + indexedRowsMat.numCols())

    val entries = sc.parallelize(Seq(
      (0, 0, 1.0),
      (0, 1, 2.0),
      (1, 1, 3.0),
      (1, 2, 4.0),
      (2, 2, 5.0),
      (2, 3, 6.0),
      (3, 0, 7.0),
      (3, 3, 8.0),
      (4, 1, 9.0)), 3).map { case (i, j, value) =>
      MatrixEntry(i, j, value)
    }
    val coordinateMat = new CoordinateMatrix(entries)
    println("Coordinate Matrix - No of Rows: " + coordinateMat.numRows())
    println("Coordinate Matrix - No of Cols: " + coordinateMat.numCols())

    sc.stop()

  }

} 
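One subtlety in this example: row index 2L is absent from the data, yet numRows() reports 4, because IndexedRowMatrix infers its row count as the maximum row index plus one. From there the matrix converts freely to the other distributed representations; a short sketch (illustrative, not part of the original file):

// assumes indexedRowsMat from the example above is in scope
val asRowMatrix = indexedRowsMat.toRowMatrix()         // drops the row indices
val asCoordinate = indexedRowsMat.toCoordinateMatrix() // RDD of MatrixEntry(i, j, value)
val asBlocks = indexedRowsMat.toBlockMatrix()          // grid of local sub-matrices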
Example 3
Source File: RichIndexedRowMatrixSuite.scala    From hail   with MIT License
package is.hail.utils

import breeze.linalg.{DenseMatrix => BDM, _}
import is.hail.{HailSuite, TestUtils}
import is.hail.linalg.BlockMatrix
import is.hail.linalg.BlockMatrix.ops._
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.{DistributedMatrix, IndexedRow, IndexedRowMatrix}
import org.apache.spark.rdd.RDD
import org.testng.annotations.Test


class RichIndexedRowMatrixSuite extends HailSuite {

  // DistributedMatrix#toBreeze is private[mllib], so invoke it via reflection
  private def convertDistributedMatrixToBreeze(sparkMatrix: DistributedMatrix): Matrix[Double] = {
    val breezeConverter = sparkMatrix.getClass.getMethod("toBreeze")
    breezeConverter.invoke(sparkMatrix).asInstanceOf[Matrix[Double]]
  }

  @Test def testToBlockMatrixDense() {
    val nRows = 9L
    val nCols = 6L
    val data = Seq(
      (0L, Vectors.dense(0.0, 1.0, 2.0, 1.0, 3.0, 4.0)),
      (1L, Vectors.dense(3.0, 4.0, 5.0, 1.0, 1.0, 1.0)),
      (3L, Vectors.dense(9.0, 0.0, 1.0, 1.0, 1.0, 1.0)),
      (4L, Vectors.dense(9.0, 0.0, 1.0, 1.0, 1.0, 1.0)),
      (5L, Vectors.dense(9.0, 0.0, 1.0, 1.0, 1.0, 1.0)),
      (6L, Vectors.dense(1.0, 2.0, 3.0, 1.0, 1.0, 1.0)),
      (7L, Vectors.dense(4.0, 5.0, 6.0, 1.0, 1.0, 1.0)),
      (8L, Vectors.dense(7.0, 8.0, 9.0, 1.0, 1.0, 1.0))
    ).map(IndexedRow.tupled)
    val indexedRows: RDD[IndexedRow] = sc.parallelize(data)

    val irm = new IndexedRowMatrix(indexedRows)

    for {
      blockSize <- Seq(1, 2, 3, 4, 6, 7, 9, 10)
    } {
      val blockMat = irm.toHailBlockMatrix(blockSize)
      assert(blockMat.nRows === nRows)
      assert(blockMat.nCols === nCols)
      assert(blockMat.toBreezeMatrix() === convertDistributedMatrixToBreeze(irm))
    }

    intercept[IllegalArgumentException] {
      irm.toHailBlockMatrix(-1)
    }
    intercept[IllegalArgumentException] {
      irm.toHailBlockMatrix(0)
    }
  }

  @Test def emptyBlocks() {
    val nRows = 9
    val nCols = 2
    val data = Seq(
      (3L, Vectors.dense(1.0, 2.0)),
      (4L, Vectors.dense(1.0, 2.0)),
      (5L, Vectors.dense(1.0, 2.0)),
      (8L, Vectors.dense(1.0, 2.0))
    ).map(IndexedRow.tupled)

    val irm = new IndexedRowMatrix(sc.parallelize(data))

    val m = irm.toHailBlockMatrix(2)
    assert(m.nRows == nRows)
    assert(m.nCols == nCols)
    assert(m.toBreezeMatrix() == convertDistributedMatrixToBreeze(irm))
    assert(m.blocks.count() == 5)

    (m.dot(m.T)).toBreezeMatrix() // assert no exception

    assert(m.mapWithIndex { case (i, j, v) => i + 10 * j + v }.toBreezeMatrix() ===
      new BDM[Double](nRows, nCols, Array[Double](
        0.0, 1.0, 2.0, 4.0, 5.0, 6.0, 6.0, 7.0, 9.0,
        10.0, 11.0, 12.0, 15.0, 16.0, 17.0, 16.0, 17.0, 20.0
      )))
  }
} 
Example 4
Source File: IndexRowMatrixDemo.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.examples.mllib

import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix
import org.apache.spark.SparkConf
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.distributed.IndexedRow
import org.apache.spark.mllib.linalg.Vectors

object IndexRowMatrixDemo {
  def main(args: Array[String]) {

    val conf = new SparkConf().setAppName("test").setMaster("local")
    val sc = new SparkContext(conf)
    // implicit conversion so the leading Double can be used as the Long row index
    implicit def double2long(x: Double): Long = x.toLong
    // the first element of each record becomes the IndexedRow index and the rest map to the vector;
    // f.take(1)(0) extracts that first element, which the implicit conversion turns into a Long
    val rdd1 = sc.parallelize(
      Array(
        Array(1.0, 2.0, 3.0, 4.0),
        Array(2.0, 3.0, 4.0, 5.0),
        Array(3.0, 4.0, 5.0, 6.0))).map(f => IndexedRow(f.take(1)(0), Vectors.dense(f.drop(1))))
    // an IndexedRowMatrix is distributed by row and keeps row indices: it is backed by an RDD of
    // indexed rows, so each row is represented by a Long index plus a local vector
    val indexRowMatrix = new IndexedRowMatrix(rdd1)
    // compute the Gramian matrix (A^T * A)
    var gramianMatrix: Matrix = indexRowMatrix.computeGramianMatrix()
    // convert to a RowMatrix, dropping the row indices
    var rowMatrix: RowMatrix = indexRowMatrix.toRowMatrix()
    // other methods, e.g. computeSVD (singular value decomposition) and multiply (matrix
    // multiplication), are used in the same way as on RowMatrix
  }
} 
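The closing comment mentions computeSVD and multiply. A minimal sketch of the SVD call on the matrix built above, assuming indexRowMatrix is still in scope (the choice of k = 2 is illustrative):

// top-2 singular values; computeU = true also materialises U
val svd = indexRowMatrix.computeSVD(2, computeU = true)
val u = svd.U // IndexedRowMatrix of left singular vectors
val s = svd.s // Vector of singular values in descending order
val v = svd.V // local Matrix of right singular vectors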
Example 5
Source File: Main.scala    From cosine-lsh-join-spark   with MIT License
package com.soundcloud.lsh

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.{IndexedRow, IndexedRowMatrix}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._

object Main {

  def main(args: Array[String]) {

    // init spark context
    val numPartitions = 8
    val input = "data/example.tsv"
    val conf = new SparkConf()
      .setAppName("LSH-Cosine")
      .setMaster("local[4]")
    val storageLevel = StorageLevel.MEMORY_AND_DISK
    val sc = new SparkContext(conf)

    // read in an example data set of word embeddings
    val data = sc.textFile(input, numPartitions).map {
      line =>
        val split = line.split(" ")
        val word = split.head
        val features = split.tail.map(_.toDouble)
        (word, features)
    }


    // create a unique id for each word by zipping with the RDD index
    val indexed = data.zipWithIndex.persist(storageLevel)

    // create indexed row matrix where every row represents one word
    val rows = indexed.map {
      case ((word, features), index) =>
        IndexedRow(index, Vectors.dense(features))
    }

    // store index for later re-mapping (index to word)
    val index = indexed.map {
      case ((word, features), index) =>
        (index, word)
    }.persist(storageLevel)

    // create an input matrix from all rows and run lsh on it
    val matrix = new IndexedRowMatrix(rows)
    val lsh = new Lsh(
      minCosineSimilarity = 0.5,
      dimensions = 20,
      numNeighbours = 200,
      numPermutations = 10,
      partitions = numPartitions,
      storageLevel = storageLevel
    )
    val similarityMatrix = lsh.join(matrix)

    // remap both ids back to words
    val remapFirst = similarityMatrix.entries.keyBy(_.i).join(index).values
    val remapSecond = remapFirst.keyBy { case (entry, word1) => entry.j }.join(index).values.map {
      case ((entry, word1), word2) =>
        (word1, word2, entry.value)
    }

    // group by neighbours to get a list of similar words and then take top k
    val result = remapSecond.groupBy(_._1).map {
      case (word1, similarWords) =>
        // sort by score desc. and take top 10 entries
        val similar = similarWords.toSeq.sortBy(-1 * _._3).take(10).map(_._2).mkString(",")
        s"$word1 --> $similar"
    }

    // print out the results for the first 20 words
    result.take(20).foreach(println)

    sc.stop()

  }
} 
Example 6
Source File: QueryNearestNeighbours.scala    From cosine-lsh-join-spark   with MIT License
package com.soundcloud.lsh

import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, IndexedRow, IndexedRowMatrix, MatrixEntry}


class QueryNearestNeighbours(
                              distance: VectorDistance,
                              threshold: Double,
                              queryFraction: Double,
                              catalogFraction: Double
                            ) extends QueryJoiner with Serializable {

  def join(queryMatrix: IndexedRowMatrix, catalogMatrix: IndexedRowMatrix): CoordinateMatrix = {
    val sampledQueries = queryMatrix.rows.sample(false, queryFraction)
    val sampledCatalog = catalogMatrix.rows.sample(false, catalogFraction)

    val joined = sampledQueries.cartesian(sampledCatalog)

    val neighbours = joined.map { case ((query: IndexedRow), (catalogEntry: IndexedRow)) =>
      new MatrixEntry(query.index, catalogEntry.index, distance(query.vector, catalogEntry.vector))
    }.filter(_.value >= threshold)

    new CoordinateMatrix(neighbours)
  }
} 
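The VectorDistance this joiner receives is just a serializable function from two vectors to a similarity score; its definition is not shown on this page. A minimal sketch of a cosine-similarity implementation compatible with the usage above (an assumption for illustration, not the project's actual source):

import org.apache.spark.mllib.linalg.Vector

trait VectorDistance extends Serializable {
  def apply(vecA: Vector, vecB: Vector): Double
}

object Cosine extends VectorDistance {
  def apply(vecA: Vector, vecB: Vector): Double = {
    val a = vecA.toArray
    val b = vecB.toArray
    val dot = a.zip(b).map { case (x, y) => x * y }.sum
    val norms = math.sqrt(a.map(x => x * x).sum) * math.sqrt(b.map(x => x * x).sum)
    if (norms == 0.0) 0.0 else dot / norms // cosine similarity, 0 for zero-length vectors
  }
}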
Example 7
Source File: NearestNeighbours.scala    From cosine-lsh-join-spark   with MIT License
package com.soundcloud.lsh

import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, IndexedRow, IndexedRowMatrix, MatrixEntry}


class NearestNeighbours(
                         distance: VectorDistance,
                         threshold: Double,
                         fraction: Double) extends Joiner with Serializable {

  def join(inputMatrix: IndexedRowMatrix): CoordinateMatrix = {
    val rows = inputMatrix.rows
    val sampledRows = rows.sample(false, fraction)
    sampledRows.cache()

    val joined = sampledRows.cartesian(rows)

    val similarity = joined.map {
      case ((rowA: IndexedRow), (rowB: IndexedRow)) =>
        ((rowA.index, rowB.index), distance(rowA.vector, rowB.vector))
    }

    val neighbours = similarity.filter {
      case ((indexA: Long, indexB: Long), similarity) =>
        similarity >= threshold &&
          indexA < indexB // make upper triangular and remove self similarities
    }

    val resultRows = neighbours.map {
      case ((indexA: Long, indexB: Long), similarity) =>
        MatrixEntry(indexA, indexB, similarity)
    }

    new CoordinateMatrix(resultRows)
  }
} 
Example 8
Source File: NearestNeighboursTest.scala    From cosine-lsh-join-spark   with MIT License
package com.soundcloud.lsh

import org.apache.spark.mllib.linalg.distributed.{IndexedRow, IndexedRowMatrix, MatrixEntry}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.scalatest.{FunSuite, Matchers}

class NearestNeighboursTest
  extends FunSuite
  with SparkLocalContext
  with Matchers {

  def denseVector(input: Double*): Vector = {
    Vectors.dense(input.toArray)
  }

  test("nearest neighbours cosine") {
    val vecA = denseVector(1.0, 0.0)
    val vecB = denseVector(0.0, 1.0)
    val vecC = denseVector(-1.0, 0.0)
    val vecD = denseVector(1.0, 0.0)

    val rows = Seq(
      IndexedRow(0, vecA),
      IndexedRow(1, vecB),
      IndexedRow(2, vecC),
      IndexedRow(3, vecD)
    )
    val indexedMatrix = new IndexedRowMatrix(sc.parallelize(rows))

    val nearestNeighbour = new NearestNeighbours(Cosine, 0.0, 1.0)
    val got = nearestNeighbour.join(indexedMatrix)

    val expected = Seq(
      MatrixEntry(0, 1, 0.0),
      MatrixEntry(0, 3, 1.0),
      MatrixEntry(1, 2, 0.0),
      MatrixEntry(1, 3, 0.0)
    )
    val gotEntries = got.entries.collect().toSeq
    gotEntries should be(expected)
  }


} 
Example 9
Source File: QueryHammingTest.scala    From cosine-lsh-join-spark   with MIT License
package com.soundcloud.lsh

import com.soundcloud.TestHelper
import org.apache.spark.mllib.linalg.distributed.{IndexedRow, IndexedRowMatrix, MatrixEntry}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.scalatest.{FunSuite, Matchers}

class QueryHammingTest
  extends FunSuite
    with SparkLocalContext
    with Matchers
    with TestHelper {

  def denseVector(input: Double*): Vector = {
    Vectors.dense(input.toArray)
  }

  val queryVectorA = denseVector(1.0, 1.0)
  val queryVectorB = denseVector(-1.0, 1.0)

  val catalogVectorA = denseVector(1.0, 1.0)
  val catalogVectorB = denseVector(-1.0, 1.0)
  val catalogVectorC = denseVector(-1.0, 0.5)
  val catalogVectorD = denseVector(1.0, 0.5)

  val queryRows = Seq(
    IndexedRow(0, queryVectorA),
    IndexedRow(1, queryVectorB)
  )

  val catalogRows = Seq(
    IndexedRow(0, catalogVectorA),
    IndexedRow(1, catalogVectorB),
    IndexedRow(2, catalogVectorC),
    IndexedRow(3, catalogVectorD)
  )

  val expected = Array(
    MatrixEntry(0, 0, Cosine(queryVectorA, catalogVectorA)),
    MatrixEntry(0, 3, Cosine(queryVectorA, catalogVectorD)),
    MatrixEntry(1, 1, Cosine(queryVectorB, catalogVectorB)),
    MatrixEntry(1, 2, Cosine(queryVectorB, catalogVectorC))
  )

  test("broadcast catalog") {
    val queryMatrix = new IndexedRowMatrix(sc.parallelize(queryRows))
    val catalogMatrix = new IndexedRowMatrix(sc.parallelize(catalogRows))

    val queryNearestNeighbour = new QueryHamming(0.1, 10000, 2, true)
    val got = queryNearestNeighbour.join(queryMatrix, catalogMatrix).entries.collect

    implicit val equality = new MatrixEquality(0.02)
    got.sortBy(t => (t.i, t.j)) should equal(expected)
  }

  test("broadcast query") {
    val queryMatrix = new IndexedRowMatrix(sc.parallelize(queryRows))
    val catalogMatrix = new IndexedRowMatrix(sc.parallelize(catalogRows))

    val queryNearestNeighbour = new QueryHamming(0.1, 10000, 2, false)
    val got = queryNearestNeighbour.join(queryMatrix, catalogMatrix).entries.collect

    implicit val equality = new MatrixEquality(0.02)
    got.sortBy(t => (t.i, t.j)) should equal(expected)
  }

} 
Example 10
Source File: QueryNearestNeighboursTest.scala    From cosine-lsh-join-spark   with MIT License
package com.soundcloud.lsh

import org.apache.spark.mllib.linalg.distributed.{IndexedRow, IndexedRowMatrix, MatrixEntry}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.scalatest.{FunSuite, Matchers}

class QueryNearestNeighboursTest
  extends FunSuite
    with SparkLocalContext
    with Matchers {

  def denseVector(input: Double*): Vector = {
    Vectors.dense(input.toArray)
  }

  test("nearest neighbours cosine") {
    val queryVectorA = denseVector(1.0, 1.0)
    val queryVectorB = denseVector(-1.0, 1.0)

    val catalogVectorA = denseVector(1.0, 1.0)
    val catalogVectorB = denseVector(-1.0, 1.0)
    val catalogVectorC = denseVector(-1.0, 0.5)
    val catalogVectorD = denseVector(1.0, 0.5)

    val queryRows = Seq(
      IndexedRow(0, queryVectorA),
      IndexedRow(1, queryVectorB)
    )

    val catalogRows = Seq(
      IndexedRow(0, catalogVectorA),
      IndexedRow(1, catalogVectorB),
      IndexedRow(2, catalogVectorC),
      IndexedRow(3, catalogVectorD)
    )

    val queryMatrix = new IndexedRowMatrix(sc.parallelize(queryRows))
    val catalogMatrix = new IndexedRowMatrix(sc.parallelize(catalogRows))

    val queryNearestNeighbour = new QueryNearestNeighbours(Cosine, 0.4, 1.0, 1.0)
    val expected = Seq(
      MatrixEntry(0, 0, Cosine(queryVectorA, catalogVectorA)),
      MatrixEntry(0, 3, Cosine(queryVectorA, catalogVectorD)),
      MatrixEntry(1, 1, Cosine(queryVectorB, catalogVectorB)),
      MatrixEntry(1, 2, Cosine(queryVectorB, catalogVectorC))
    )

    val got = queryNearestNeighbour.join(queryMatrix, catalogMatrix).entries.collect
    got should be(expected)
  }
}