org.apache.hadoop.hbase.util.Bytes Scala Examples

The following examples show how to use org.apache.hadoop.hbase.util.Bytes. You can go to the original project or source file by following the link above each example.
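Before the project examples, here is a minimal sketch of the round-trip conversions that Bytes provides. The methods shown (toBytes, toString, toLong, compareTo) are part of the HBase client API; the object and variable names are only illustrative.

import org.apache.hadoop.hbase.util.Bytes

object BytesRoundTripSketch {
  def main(args: Array[String]): Unit = {
    // Encode common types into the byte[] form that HBase stores
    val rowKey: Array[Byte] = Bytes.toBytes("row-1")
    val counter: Array[Byte] = Bytes.toBytes(42L)

    // Decode them back
    println(Bytes.toString(rowKey))  // row-1
    println(Bytes.toLong(counter))   // 42

    // Lexicographic comparison, the same ordering HBase applies to row keys
    println(Bytes.compareTo(Bytes.toBytes("a"), Bytes.toBytes("b")) < 0)  // true
  }
}
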
Example 1
Source File: HBaseLocalClient.scala    From gimel   with Apache License 2.0
package com.paypal.gimel.hbase.utilities

import java.io.File

import scala.collection.mutable.ArrayBuffer

import com.google.common.io.Files
import org.apache.hadoop.hbase.{HBaseTestingUtility, TableName}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.execution.QueryExecution
import org.apache.spark.sql.execution.datasources.hbase.SparkHBaseConf
import org.apache.spark.sql.util._
import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers}

import com.paypal.gimel.common.catalog.Field
import com.paypal.gimel.hbase.DataSet

class HBaseLocalClient extends FunSuite with Matchers with BeforeAndAfterAll {

  var sparkSession : SparkSession = _
  var dataSet: DataSet = _
  val hbaseTestingUtility = new HBaseTestingUtility()
  val tableName = "test_table"
  val cfs = Array("personal", "professional")
  val columns = Array("id", "name", "age", "address", "company", "designation", "salary")
  val fields = columns.map(col => new Field(col))

  val metrics = ArrayBuffer.empty[(String, QueryExecution, Long)]

  protected override def beforeAll(): Unit = {
    val tempDir: File = Files.createTempDir
    tempDir.deleteOnExit
    hbaseTestingUtility.startMiniCluster()
    SparkHBaseConf.conf = hbaseTestingUtility.getConfiguration
    createTable(tableName, cfs)
    val conf = new SparkConf
    conf.set(SparkHBaseConf.testConf, "true")
    sparkSession = SparkSession.builder()
      .master("local")
      .appName("HBase Test")
      .config(conf)
      .getOrCreate()

    val listener = new QueryExecutionListener {
      // Only test successful case here, so no need to implement `onFailure`
      override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = {}
      override def onSuccess(funcName: String, qe: QueryExecution, duration: Long): Unit = {
        metrics += ((funcName, qe, duration))
      }
    }
    sparkSession.listenerManager.register(listener)
    sparkSession.sparkContext.setLogLevel("ERROR")
    dataSet = new DataSet(sparkSession)
  }

  protected override def afterAll(): Unit = {
    hbaseTestingUtility.shutdownMiniCluster()
    sparkSession.close()
  }

  def createTable(name: String, cfs: Array[String]) {
    val tName = Bytes.toBytes(name)
    val bcfs = cfs.map(Bytes.toBytes(_))
    try {
      hbaseTestingUtility.deleteTable(TableName.valueOf(tName))
    } catch {
      case _ : Throwable =>
        println("No table = " + name + " found")
    }
    hbaseTestingUtility.createMultiRegionTable(TableName.valueOf(tName), bcfs)
  }

  // Mocks data for testing
  def mockDataInDataFrame(numberOfRows: Int): DataFrame = {
    def stringed(n: Int) = s"""{"id": "$n","name": "MAC-$n", "address": "MAC-${n + 1}", "age": "${n + 1}", "company": "MAC-$n", "designation": "MAC-$n", "salary": "${n * 10000}" }"""
    val texts: Seq[String] = (1 to numberOfRows).map { x => stringed(x) }
    val rdd: RDD[String] = sparkSession.sparkContext.parallelize(texts)
    val dataFrame: DataFrame = sparkSession.read.json(rdd)
    dataFrame
  }
} 
Example 2
Source File: HBaseBulkGetExample.scala    From hbase-connectors   with Apache License 2.0
package org.apache.hadoop.hbase.spark.example.rdd

import org.apache.hadoop.hbase.client.Get
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.CellUtil
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience


@InterfaceAudience.Private
object HBaseBulkGetExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseBulkGetExample {tableName} is missing an argument")
      return
    }

    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseBulkGetExample " + tableName)
    val sc = new SparkContext(sparkConf)

    try {

      //[(Array[Byte])]
      val rdd = sc.parallelize(Array(
        Bytes.toBytes("1"),
        Bytes.toBytes("2"),
        Bytes.toBytes("3"),
        Bytes.toBytes("4"),
        Bytes.toBytes("5"),
        Bytes.toBytes("6"),
        Bytes.toBytes("7")))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      val getRdd = rdd.hbaseBulkGet[String](hbaseContext, TableName.valueOf(tableName), 2,
        record => {
          System.out.println("making Get")
          new Get(record)
        },
        (result: Result) => {

          val it = result.listCells().iterator()
          val b = new StringBuilder

          b.append(Bytes.toString(result.getRow) + ":")

          while (it.hasNext) {
            val cell = it.next()
            val q = Bytes.toString(CellUtil.cloneQualifier(cell))
            if (q.equals("counter")) {
              b.append("(" + q + "," + Bytes.toLong(CellUtil.cloneValue(cell)) + ")")
            } else {
              b.append("(" + q + "," + Bytes.toString(CellUtil.cloneValue(cell)) + ")")
            }
          }
          b.toString()
        })

      getRdd.collect().foreach(v => println(v))

    } finally {
      sc.stop()
    }
  }
} 
Example 3
Source File: HBaseForeachPartitionExample.scala    From hbase-connectors   with Apache License 2.0
package org.apache.hadoop.hbase.spark.example.rdd

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience


@InterfaceAudience.Private
object HBaseForeachPartitionExample {
  def main(args: Array[String]) {
    if (args.length < 2) {
      println("HBaseForeachPartitionExample {tableName} {columnFamily} are missing arguments")
      return
    }

    val tableName = args(0)
    val columnFamily = args(1)

    val sparkConf = new SparkConf().setAppName("HBaseForeachPartitionExample " +
      tableName + " " + columnFamily)
    val sc = new SparkContext(sparkConf)

    try {
      //[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])]
      val rdd = sc.parallelize(Array(
        (Bytes.toBytes("1"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))),
        (Bytes.toBytes("2"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))),
        (Bytes.toBytes("3"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))),
        (Bytes.toBytes("4"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))),
        (Bytes.toBytes("5"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5"))))
      ))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)


      rdd.hbaseForeachPartition(hbaseContext,
        (it, connection) => {
          val m = connection.getBufferedMutator(TableName.valueOf(tableName))

          it.foreach(r => {
            val put = new Put(r._1)
            r._2.foreach((putValue) =>
              put.addColumn(putValue._1, putValue._2, putValue._3))
            m.mutate(put)
          })
          m.flush()
          m.close()
        })

    } finally {
      sc.stop()
    }
  }
} 
Example 4
Source File: HBaseBulkDeleteExample.scala    From hbase-connectors   with Apache License 2.0
package org.apache.hadoop.hbase.spark.example.rdd

import org.apache.hadoop.hbase.client.Delete
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience


@InterfaceAudience.Private
object HBaseBulkDeleteExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseBulkDeleteExample {tableName} is missing an argument")
      return
    }

    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseBulkDeleteExample " + tableName)
    val sc = new SparkContext(sparkConf)
    try {
      //[Array[Byte]]
      val rdd = sc.parallelize(Array(
        Bytes.toBytes("1"),
        Bytes.toBytes("2"),
        Bytes.toBytes("3"),
        Bytes.toBytes("4"),
        Bytes.toBytes("5")
      ))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      rdd.hbaseBulkDelete(hbaseContext, TableName.valueOf(tableName),
        putRecord => new Delete(putRecord),
        4)

    } finally {
      sc.stop()
    }
  }
} 
Example 5
Source File: HBaseBulkPutExample.scala    From hbase-connectors   with Apache License 2.0
package org.apache.hadoop.hbase.spark.example.rdd

import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience


@InterfaceAudience.Private
object HBaseBulkPutExample {
   def main(args: Array[String]) {
     if (args.length < 2) {
       println("HBaseBulkPutExample {tableName} {columnFamily} are missing arguments")
       return
     }

     val tableName = args(0)
     val columnFamily = args(1)

     val sparkConf = new SparkConf().setAppName("HBaseBulkPutExample " +
       tableName + " " + columnFamily)
     val sc = new SparkContext(sparkConf)

     try {
       //[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])]
       val rdd = sc.parallelize(Array(
         (Bytes.toBytes("1"),
           Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))),
         (Bytes.toBytes("2"),
           Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))),
         (Bytes.toBytes("3"),
           Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))),
         (Bytes.toBytes("4"),
           Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))),
         (Bytes.toBytes("5"),
           Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5"))))
       ))

       val conf = HBaseConfiguration.create()

       val hbaseContext = new HBaseContext(sc, conf)

       rdd.hbaseBulkPut(hbaseContext, TableName.valueOf(tableName),
         (putRecord) => {
           val put = new Put(putRecord._1)
           putRecord._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2,
             putValue._3))
           put
         })

     } finally {
       sc.stop()
     }
   }
 } 
Example 6
Source File: HBaseBulkGetExample.scala    From hbase-connectors   with Apache License 2.0
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.client.Get
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.CellUtil
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience


@InterfaceAudience.Private
object HBaseBulkGetExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseBulkGetExample {tableName} is missing an argument")
      return
    }

    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseBulkGetExample " + tableName)
    val sc = new SparkContext(sparkConf)

    try {

      //[(Array[Byte])]
      val rdd = sc.parallelize(Array(
        Bytes.toBytes("1"),
        Bytes.toBytes("2"),
        Bytes.toBytes("3"),
        Bytes.toBytes("4"),
        Bytes.toBytes("5"),
        Bytes.toBytes("6"),
        Bytes.toBytes("7")))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      val getRdd = hbaseContext.bulkGet[Array[Byte], String](
        TableName.valueOf(tableName),
        2,
        rdd,
        record => {
          System.out.println("making Get")
          new Get(record)
        },
        (result: Result) => {

          val it = result.listCells().iterator()
          val b = new StringBuilder

          b.append(Bytes.toString(result.getRow) + ":")

          while (it.hasNext) {
            val cell = it.next()
            val q = Bytes.toString(CellUtil.cloneQualifier(cell))
            if (q.equals("counter")) {
              b.append("(" + q + "," + Bytes.toLong(CellUtil.cloneValue(cell)) + ")")
            } else {
              b.append("(" + q + "," + Bytes.toString(CellUtil.cloneValue(cell)) + ")")
            }
          }
          b.toString()
        })

      getRdd.collect().foreach(v => println(v))

    } finally {
      sc.stop()
    }
  }
} 
Example 7
Source File: HBaseStreamingBulkPutExample.scala    From hbase-connectors   with Apache License 2.0
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.yetus.audience.InterfaceAudience


@InterfaceAudience.Private
object HBaseStreamingBulkPutExample {
  def main(args: Array[String]) {
    if (args.length < 4) {
      println("HBaseStreamingBulkPutExample " +
        "{host} {port} {tableName} {columnFamily} are missing arguments")
      return
    }

    val host = args(0)
    val port = args(1)
    val tableName = args(2)
    val columnFamily = args(3)

    val sparkConf = new SparkConf().setAppName("HBaseStreamingBulkPutExample " +
      tableName + " " + columnFamily)
    val sc = new SparkContext(sparkConf)
    try {
      val ssc = new StreamingContext(sc, Seconds(1))

      val lines = ssc.socketTextStream(host, port.toInt)

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      hbaseContext.streamBulkPut[String](lines,
        TableName.valueOf(tableName),
        (putRecord) => {
          if (putRecord.length() > 0) {
            val put = new Put(Bytes.toBytes(putRecord))
            put.addColumn(Bytes.toBytes("c"), Bytes.toBytes("foo"), Bytes.toBytes("bar"))
            put
          } else {
            null
          }
        })
      ssc.start()
      ssc.awaitTerminationOrTimeout(60000)
    } finally {
      sc.stop()
    }
  }
} 
Example 8
Source File: HBaseBulkPutExampleFromFile.scala    From hbase-connectors   with Apache License 2.0
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience


@InterfaceAudience.Private
object HBaseBulkPutExampleFromFile {
  def main(args: Array[String]) {
    if (args.length < 3) {
      println("HBaseBulkPutExampleFromFile {tableName} {columnFamily} {inputFile} are missing arguments")
      return
    }

    val tableName = args(0)
    val columnFamily = args(1)
    val inputFile = args(2)

    val sparkConf = new SparkConf().setAppName("HBaseBulkPutExampleFromFile " +
      tableName + " " + columnFamily + " " + inputFile)
    val sc = new SparkContext(sparkConf)

    try {
      var rdd = sc.hadoopFile(
        inputFile,
        classOf[TextInputFormat],
        classOf[LongWritable],
        classOf[Text]).map(v => {
        System.out.println("reading-" + v._2.toString)
        v._2.toString
      })

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)
      hbaseContext.bulkPut[String](rdd,
        TableName.valueOf(tableName),
        (putRecord) => {
          System.out.println("hbase-" + putRecord)
          val put = new Put(Bytes.toBytes("Value- " + putRecord))
          put.addColumn(Bytes.toBytes("c"), Bytes.toBytes("1"),
            Bytes.toBytes(putRecord.length()))
          put
        });
    } finally {
      sc.stop()
    }
  }
} 
Example 9
Source File: HBaseBulkDeleteExample.scala    From hbase-connectors   with Apache License 2.0
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.client.Delete
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience


@InterfaceAudience.Private
object HBaseBulkDeleteExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseBulkDeleteExample {tableName} is missing an argument")
      return
    }

    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseBulkDeleteExample " + tableName)
    val sc = new SparkContext(sparkConf)
    try {
      //[Array[Byte]]
      val rdd = sc.parallelize(Array(
        Bytes.toBytes("1"),
        Bytes.toBytes("2"),
        Bytes.toBytes("3"),
        Bytes.toBytes("4"),
        Bytes.toBytes("5")
      ))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)
      hbaseContext.bulkDelete[Array[Byte]](rdd,
        TableName.valueOf(tableName),
        putRecord => new Delete(putRecord),
        4)
    } finally {
      sc.stop()
    }
  }
} 
Example 10
Source File: HBaseBulkPutExample.scala    From hbase-connectors   with Apache License 2.0
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience


@InterfaceAudience.Private
object HBaseBulkPutExample {
  def main(args: Array[String]) {
    if (args.length < 2) {
      println("HBaseBulkPutExample {tableName} {columnFamily} are missing arguments")
      return
    }

    val tableName = args(0)
    val columnFamily = args(1)

    val sparkConf = new SparkConf().setAppName("HBaseBulkPutExample " +
      tableName + " " + columnFamily)
    val sc = new SparkContext(sparkConf)

    try {
      //[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])]
      val rdd = sc.parallelize(Array(
        (Bytes.toBytes("1"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))),
        (Bytes.toBytes("2"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))),
        (Bytes.toBytes("3"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))),
        (Bytes.toBytes("4"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))),
        (Bytes.toBytes("5"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5"))))
      ))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)
      hbaseContext.bulkPut[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])](rdd,
        TableName.valueOf(tableName),
        (putRecord) => {
          val put = new Put(putRecord._1)
          putRecord._2.foreach((putValue) =>
            put.addColumn(putValue._1, putValue._2, putValue._3))
          put
        });
    } finally {
      sc.stop()
    }
  }
} 
Example 11
Source File: HBaseDistributedScanExample.scala    From hbase-connectors   with Apache License 2.0
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.client.Scan
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience

@InterfaceAudience.Private
object HBaseDistributedScanExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseDistributedScanExample {tableName} is missing an argument")
      return
    }

    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseDistributedScanExample " + tableName )
    val sc = new SparkContext(sparkConf)

    try {
      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      val scan = new Scan()
      scan.setCaching(100)

      val getRdd = hbaseContext.hbaseRDD(TableName.valueOf(tableName), scan)

      getRdd.foreach(v => println(Bytes.toString(v._1.get())))

      println("Length: " + getRdd.map(r => r._1.copyBytes()).collect().length);
    } finally {
      sc.stop()
    }
  }

} 
Example 12
Source File: HBaseBulkPutTimestampExample.scala    From hbase-connectors   with Apache License 2.0
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.spark.SparkContext
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Put
import org.apache.spark.SparkConf
import org.apache.yetus.audience.InterfaceAudience


@InterfaceAudience.Private
object HBaseBulkPutTimestampExample {
  def main(args: Array[String]) {
    if (args.length < 2) {
      System.out.println("HBaseBulkPutTimestampExample {tableName} {columnFamily} are missing arguments")
      return
    }

    val tableName = args(0)
    val columnFamily = args(1)

    val sparkConf = new SparkConf().setAppName("HBaseBulkPutTimestampExample " +
      tableName + " " + columnFamily)
    val sc = new SparkContext(sparkConf)

    try {

      val rdd = sc.parallelize(Array(
        (Bytes.toBytes("6"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))),
        (Bytes.toBytes("7"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))),
        (Bytes.toBytes("8"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))),
        (Bytes.toBytes("9"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))),
        (Bytes.toBytes("10"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5"))))))

      val conf = HBaseConfiguration.create()

      val timeStamp = System.currentTimeMillis()

      val hbaseContext = new HBaseContext(sc, conf)
      hbaseContext.bulkPut[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])](rdd,
        TableName.valueOf(tableName),
        (putRecord) => {
          val put = new Put(putRecord._1)
          putRecord._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2,
            timeStamp, putValue._3))
          put
        })
    } finally {
      sc.stop()
    }
  }
} 
Example 13
Source File: ReadFilter.scala    From hbase-rdd-examples   with Apache License 2.0
package unicredit.example

import org.apache.hadoop.hbase.filter.PrefixFilter
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}
import unicredit.spark.hbase._


object ReadFilter extends App {
  val name = "Example of read from HBase table"

  lazy val sparkConf = new SparkConf().setAppName(name)
  lazy val sc = new SparkContext(sparkConf)
  implicit val config = HBaseConfig() // Assumes hbase-site.xml is on classpath

  val columns = Map(
    "cf1" -> Set("col1", "col2"),
    "cf2" -> Set("col3")
  )

  val filter = new PrefixFilter(Bytes.toBytes("abc"))

  sc.hbase[String]("test-table", columns, filter)
    .map({ case (k, v) =>
      val cf1 = v("cf1")
      val col1 = cf1("col1")
      val col2 = cf1("col2")
      val col3 = v("cf2")("col3")

      List(k, col1, col2, col3) mkString "\t"
    })
    .saveAsTextFile("test-output")
} 
Example 14
Source File: HBaseMapPartitionExample.scala    From hbase-connectors   with Apache License 2.0
package org.apache.hadoop.hbase.spark.example.rdd

import org.apache.hadoop.hbase.client.Get
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.CellUtil
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.yetus.audience.InterfaceAudience


@InterfaceAudience.Private
object HBaseMapPartitionExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseMapPartitionExample {tableName} is missing an argument")
      return
    }

    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseMapPartitionExample " + tableName)
    val sc = new SparkContext(sparkConf)

    try {

      //[(Array[Byte])]
      val rdd = sc.parallelize(Array(
        Bytes.toBytes("1"),
        Bytes.toBytes("2"),
        Bytes.toBytes("3"),
        Bytes.toBytes("4"),
        Bytes.toBytes("5"),
        Bytes.toBytes("6"),
        Bytes.toBytes("7")))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      val getRdd = rdd.hbaseMapPartitions[String](hbaseContext, (it, connection) => {
        val table = connection.getTable(TableName.valueOf(tableName))
        it.map{r =>
          //batching would be faster.  This is just an example
          val result = table.get(new Get(r))

          val it = result.listCells().iterator()
          val b = new StringBuilder

          b.append(Bytes.toString(result.getRow) + ":")

          while (it.hasNext) {
            val cell = it.next()
            // clone the qualifier and value instead of reading the cell's backing arrays directly
            val q = Bytes.toString(CellUtil.cloneQualifier(cell))
            if (q.equals("counter")) {
              b.append("(" + q + "," + Bytes.toLong(CellUtil.cloneValue(cell)) + ")")
            } else {
              b.append("(" + q + "," + Bytes.toString(CellUtil.cloneValue(cell)) + ")")
            }
            }
          }
          b.toString()
        }
      })

      getRdd.collect().foreach(v => println(v))

    } finally {
      sc.stop()
    }
  }
} 
Example 15
Source File: HBasePut.scala    From gimel   with Apache License 2.0
package com.paypal.gimel.hbase.utilities

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.{DataFrame, SparkSession}

import com.paypal.gimel.hbase.conf.{HbaseClientConfiguration, HbaseConfigs}
import com.paypal.gimel.logger.Logger

object HBasePut {

  def apply(sparkSession: SparkSession): HBasePut = new HBasePut(sparkSession)

}

class HBasePut(sparkSession: SparkSession) {
  val logger = Logger()
  lazy val hbaseUtilities = HBaseUtilities(sparkSession)

  
  // Writes each DataFrame row to the HBase table as a Put, using rowKeyColumn as the row key
  // and cfColsMap to resolve the column family for each column.
  def putRows(hbaseTable: String, dataFrame: DataFrame, rowKeyColumn: String, columns: Array[String], cfColsMap: Map[String, String]) {
    try {
      // Configure And Connect
      val conf = HBaseConfiguration.create()
      val cnxn = ConnectionFactory.createConnection(conf)
      // Create Connection to HBase table
      val tbl = cnxn.getTable(TableName.valueOf(hbaseTable))
      val rows = dataFrame.rdd.map { row =>
        (row.getAs(rowKeyColumn).toString,
          columns.map(eachCol => (cfColsMap.getOrElse(eachCol, ""), eachCol, row.getAs(eachCol).asInstanceOf[String]))
        )
      }.collect()
      // Performing put operation on each row of dataframe
      rows.foreach { row =>
        val putRow: Put = new Put(Bytes.toBytes(row._1.asInstanceOf[String]))
        row._2.foreach(x => if (x._2 != rowKeyColumn) putRow.addColumn(Bytes.toBytes(x._1), Bytes.toBytes(x._2), Bytes.toBytes(x._3)))
        tbl.put(putRow)
      }
      tbl.close()
    } catch {
      case ex: Throwable =>
        ex.printStackTrace()
        throw ex
    }
  }
} 
Example 16
Source File: HogConfig.scala    From hogzilla   with GNU General Public License v2.0
package org.hogzilla.util

import java.security.MessageDigest
import org.apache.hadoop.hbase.util.Bytes
import javax.xml.bind.DatatypeConverter
import math._
import com.typesafe.config.Config
import scala.collection.mutable.HashSet



object HogConfig {
  
   
  
  def get(config:Config,key:String,valueType:String,default:Any):Any =
  {
    if(config==null)
      return default
    
    
      try {
        
        val value = config.getString(key)
        
        if(value.isEmpty())
          return default // Return default value
        
        println(f"Configuration: $key => $value")
       
        if(valueType.equals("Int"))
          value.toInt 
        else if(valueType.equals("Double"))
          value.toDouble 
        else if(valueType.equals("Long"))
          value.toLong 
        else if(valueType.equals("Set(Int)"))
        {
          val patternSet="Set\\(".r
          val patternSetEnd="\\)".r
          
          if(value.equals("Set()"))
            return Set()
          
          return (patternSetEnd replaceAllIn((patternSet replaceAllIn(value, "")),""))
                  .split(",").map({x => x.toInt}).toSet
        }
        else if(valueType.equals("Set(String)"))
        {
          val patternSet="Set\\(".r
          val patternSetEnd="\\)".r
          
          if(value.equals("Set()"))
            return Set()
          
          return (patternSetEnd replaceAllIn((patternSet replaceAllIn(value, "")),""))
                  .split(",").map({x => println(x.toString.trim()) ; x.toString.trim()}).toSet
        }
        else
          default // Create type first
          
      } catch {
        case t: Throwable => t.printStackTrace() 
        println(f"Problem parsing $key . Check if it is ok. Using default value")
        
        return default
      } 
  
  }
  
  def getInt(config:Config,key:String,default:Any):Int =
  {
    get(config,key,"Int",default).asInstanceOf[Int]
  }
  
  def getLong(config:Config,key:String,default:Any):Long =
  {
    get(config,key,"Long",default).asInstanceOf[Long]
  }
  
  def getDouble(config:Config,key:String,default:Any):Double =
  {
    get(config,key,"Double",default).asInstanceOf[Double]
  }
  
  def getSetInt(config:Config,key:String,default:Any):Set[Int] =
  {
    get(config,key,"Set(Int)",default).asInstanceOf[Set[Int]]
  }
  
  def getSetString(config:Config,key:String,default:Any):Set[String] =
  {
    get(config,key,"Set(String)",default).asInstanceOf[Set[String]]
  }
   

} 
Example 17
Source File: HogGeograph.scala    From hogzilla   with GNU General Public License v2.0
package org.hogzilla.util

import java.security.MessageDigest
import org.apache.hadoop.hbase.util.Bytes
import javax.xml.bind.DatatypeConverter
import math._



object HogGeograph {
  
   val R = 6372.8 //radius in km
  
  def haversineDistance(lat1:Double, lon1:Double, lat2:Double, lon2:Double):Double =
  {
      val dLat=(lat2 - lat1).toRadians
      val dLon=(lon2 - lon1).toRadians

      val a = pow(sin(dLat/2),2) + pow(sin(dLon/2),2) * cos(lat1.toRadians) * cos(lat2.toRadians)
      val c = 2 * asin(sqrt(a))
      R * c    
  }
   
   
   def haversineDistanceFromStrings(coords1:String, coords2:String):Double =
	   {
		   try {
			   val coordsDouble1 = coords1.split(",").map({ x => x.toDouble })
				 val coordsDouble2 = coords2.split(",").map({ x => x.toDouble })

				 haversineDistance(coordsDouble1(0),coordsDouble1(1),coordsDouble2(0),coordsDouble2(1))
		   } catch {
		        case t: Throwable => // t.printStackTrace() 
             // Return a large distance 
             999999999D
		   }
	   }

} 
Example 18
Source File: HogEvent.scala    From hogzilla   with GNU General Public License v2.0
package org.hogzilla.event

import java.util.HashMap
import java.util.Map
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.util.Bytes
import org.hogzilla.hbase.HogHBaseRDD
import org.hogzilla.util.HogFlow
import java.net.InetAddress


class HogEvent(flow:HogFlow) 
{
	var sensorid:Int=0
	var signature_id:Double=0
	var priorityid:Int=0
	var text:String=""
	var data:Map[String,String]=new HashMap()
  var ports:String=""
  var title:String=""
  var username:String=""
  var coords:String=""
 
  
  def formatIPtoBytes(ip:String):Array[Byte] =
  {
    try {
       // Eca! Snorby doesn't support IPv6 yet. See https://github.com/Snorby/snorby/issues/65
    if(ip.contains(":"))
      InetAddress.getByName("255.255.6.6").getAddress
    else  
      InetAddress.getByName(ip).getAddress
    } catch {
      case t: Throwable => 
        // Bogus address!
        InetAddress.getByName("255.255.1.1").getAddress
    }   
   
  }

  
   def alert()
   {
	   val put = new Put(Bytes.toBytes(flow.get("flow:id")))
     put.add(Bytes.toBytes("event"), Bytes.toBytes("note"), Bytes.toBytes(text))
     put.add(Bytes.toBytes("event"), Bytes.toBytes("lower_ip"), formatIPtoBytes(flow.lower_ip))
     put.add(Bytes.toBytes("event"), Bytes.toBytes("upper_ip"), formatIPtoBytes(flow.upper_ip))
     put.add(Bytes.toBytes("event"), Bytes.toBytes("lower_ip_str"), Bytes.toBytes(flow.lower_ip))
     put.add(Bytes.toBytes("event"), Bytes.toBytes("upper_ip_str"), Bytes.toBytes(flow.upper_ip))
     put.add(Bytes.toBytes("event"), Bytes.toBytes("signature_id"), Bytes.toBytes("%.0f".format(signature_id)))
     put.add(Bytes.toBytes("event"), Bytes.toBytes("time"), Bytes.toBytes(System.currentTimeMillis))
     put.add(Bytes.toBytes("event"), Bytes.toBytes("ports"), Bytes.toBytes(ports))
     put.add(Bytes.toBytes("event"), Bytes.toBytes("title"), Bytes.toBytes(title))
     
     if(!username.equals(""))
       put.add(Bytes.toBytes("event"), Bytes.toBytes("username"), Bytes.toBytes(username))
     if(!coords.equals(""))
       put.add(Bytes.toBytes("event"), Bytes.toBytes("coords"), Bytes.toBytes(coords))
     
     HogHBaseRDD.hogzilla_events.put(put)

     //println(f"ALERT: $text%100s\n\n@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
   }
} 
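The alert() method above uses the older Put.add(family, qualifier, value) call, which newer HBase client versions replace with addColumn. A hypothetical adaptation of part of the same write, not taken from the hogzilla project, could look like this:

import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.util.Bytes

object AlertPutSketch {
  // Builds a Put equivalent to part of HogEvent.alert(), using the non-deprecated addColumn API
  def buildAlertPut(flowId: String, text: String): Put = {
    val put = new Put(Bytes.toBytes(flowId))
    put.addColumn(Bytes.toBytes("event"), Bytes.toBytes("note"), Bytes.toBytes(text))
    put.addColumn(Bytes.toBytes("event"), Bytes.toBytes("time"), Bytes.toBytes(System.currentTimeMillis))
    put
  }
}
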
Example 19
Source File: HogSignature.scala    From hogzilla   with GNU General Public License v2.0
package org.hogzilla.event

import org.hogzilla.hbase.HogHBaseRDD
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Get
import org.apache.hadoop.hbase.client.Put




case class HogSignature(signature_class:Int, signature_name:String, signature_priority:Int, signature_revision:Int, signature_id:Double,signature_group_id:Int) {
  //Example: 3,"HZ: Suspicious DNS flow identified by K-Means clustering",2,1,826000001,826
  
  def saveHBase():HogSignature =
  {
    val get = new Get(Bytes.toBytes("%.0f".format(signature_id)))
    
    if(!HogHBaseRDD.hogzilla_sensor.exists(get))
    {
      val put = new Put(Bytes.toBytes("%.0f".format(signature_id)))
      put.add(Bytes.toBytes("signature"), Bytes.toBytes("id"), Bytes.toBytes("%.0f".format(signature_id)))
      put.add(Bytes.toBytes("signature"), Bytes.toBytes("class"), Bytes.toBytes(signature_class.toString()))
      put.add(Bytes.toBytes("signature"), Bytes.toBytes("name"), Bytes.toBytes(signature_name))
      put.add(Bytes.toBytes("signature"), Bytes.toBytes("priority"), Bytes.toBytes(signature_priority.toString()))
      put.add(Bytes.toBytes("signature"), Bytes.toBytes("revision"), Bytes.toBytes(signature_revision.toString()))
      put.add(Bytes.toBytes("signature"), Bytes.toBytes("group_id"), Bytes.toBytes(signature_group_id.toString()))
      HogHBaseRDD.hogzilla_signatures.put(put)
    }
    
    this
  }
} 
Example 20
Source File: HogHBaseReputation.scala    From hogzilla   with GNU General Public License v2.0
package org.hogzilla.hbase




import scala.math.random
import java.lang.Math
import org.apache.spark._
import org.apache.hadoop.hbase.client.HBaseAdmin
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.spark.mllib.regression.{LabeledPoint,LinearRegressionModel,LinearRegressionWithSGD}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.rdd.RDD
import org.apache.hadoop.hbase.client.HTable
import org.apache.hadoop.hbase.filter.SingleColumnValueFilter
import org.apache.hadoop.hbase.filter.BinaryComparator
import org.apache.hadoop.hbase.filter.FilterList
import org.apache.hadoop.hbase.filter.CompareFilter
import java.util.ArrayList
import org.apache.hadoop.hbase.client.Scan
import org.apache.hadoop.hbase.filter.Filter
import scala.collection.mutable.HashSet
import org.apache.hadoop.hbase.client.Put


object HogHBaseReputation {

  // Ex: MX, whitelist
	def getReputationList(listName:String, listType:String):Set[String] =
	{
		val list =  new HashSet[String]


	  val filters: ArrayList[Filter] = new ArrayList();

		val colValFilter1 = new SingleColumnValueFilter(Bytes.toBytes("rep"), Bytes.toBytes("list_type"),
				CompareFilter.CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes(listType)))
		colValFilter1.setFilterIfMissing(false);

		val colValFilter2 = new SingleColumnValueFilter(Bytes.toBytes("rep"), Bytes.toBytes("list"),
				CompareFilter.CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes(listName)))
		colValFilter2.setFilterIfMissing(false);

		filters.add(colValFilter1);
		filters.add(colValFilter2);

		val filterList = new FilterList( FilterList.Operator.MUST_PASS_ALL, filters);
		val scan = new Scan()
		scan.setFilter(filterList)
    
		val it = HogHBaseRDD.hogzilla_reputation.getScanner(scan).iterator()
		
    while(it.hasNext())
		{
      list.add( Bytes.toString(it.next().getValue(Bytes.toBytes("rep"),Bytes.toBytes("ip"))) )
		}
    
    list.toSet

	}
  
 def saveReputationList(listName:String, listType:String, ip:String) =
 {
     val put = new Put(Bytes.toBytes(ip+"-"+listName+"-"+listType))
     put.add(Bytes.toBytes("rep"), Bytes.toBytes("list_type"), Bytes.toBytes(listType))
     put.add(Bytes.toBytes("rep"), Bytes.toBytes("list"), Bytes.toBytes(listName))
     put.add(Bytes.toBytes("rep"), Bytes.toBytes("ip"), Bytes.toBytes(ip))
     
     HogHBaseRDD.hogzilla_reputation.put(put)
 }

} 
Example 21
Source File: HogHBaseCluster.scala    From hogzilla   with GNU General Public License v2.0
package org.hogzilla.hbase

import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.Vector
import org.apache.hadoop.hbase.client.Get
import org.apache.hadoop.hbase.client.Delete
import org.hogzilla.cluster.HogClusterMember


object HogHBaseCluster {

 def formatClusterTitle(clusterCentroid: List[(Long,Double)], clusterIdx:Int):String =
 {
   val mainTitle = 
   "Group "+clusterIdx.toString+" - "+
   clusterCentroid
   .filter({case (port,rate) =>
            rate > 4.999
          })
   .map({case (port,rate) =>
            port.toString()+":"+"%.0f".format(rate)+"%"
        }).mkString(", ")
        
   val onePercentList=
   clusterCentroid
   .filter({case (port,rate) =>
            .9999 < rate & rate < 5
          })
          
   if(onePercentList.size>0)
   {
     mainTitle+", "+
     onePercentList.map({case (port,rate) =>
            port.toString()
        }).mkString("(",", ",")"+"> 1%")
     
   }else
   {
     mainTitle
   }
 }
 
 def deleteCluster(clusterIdx:Int)=
 {
     val del = new Delete(Bytes.toBytes(clusterIdx.toString))
     HogHBaseRDD.hogzilla_clusters.delete(del)
 }
 
 
 def deleteClusterMember(memberIP:String)=
 {
     val del = new Delete(Bytes.toBytes(memberIP))
     HogHBaseRDD.hogzilla_cluster_members.delete(del)
 }
 
 def saveCluster(clusterIdx:Int, clusterCentroid:List[(Long,Double)], clusterSize: Long, members:Array[String]) = {
   
     val memberString = members.mkString(",")
   
     val put = new Put(Bytes.toBytes(clusterIdx.toString))
     put.add(Bytes.toBytes("info"), Bytes.toBytes("title"), Bytes.toBytes(formatClusterTitle(clusterCentroid,clusterIdx)))
     put.add(Bytes.toBytes("info"), Bytes.toBytes("size"), Bytes.toBytes(clusterSize.toString))
     put.add(Bytes.toBytes("info"), Bytes.toBytes("centroid"), Bytes.toBytes(clusterCentroid.mkString("[",",","]")))
     put.add(Bytes.toBytes("info"), Bytes.toBytes("members"), Bytes.toBytes(memberString))
     
     HogHBaseRDD.hogzilla_clusters.put(put)
  }
 
 def saveClusterMember(clusterMember:HogClusterMember) = {
   
     val put = new Put(Bytes.toBytes(clusterMember.memberIP.toString))
     put.add(Bytes.toBytes("info"),   Bytes.toBytes("title"),      Bytes.toBytes(clusterMember.formatTitle))
     put.add(Bytes.toBytes("cluster"),Bytes.toBytes("size"),       Bytes.toBytes(clusterMember.clusterSize.toString))
     put.add(Bytes.toBytes("cluster"),Bytes.toBytes("centroid"),   Bytes.toBytes(clusterMember.centroid.mkString("[",",","]")))
     put.add(Bytes.toBytes("cluster"),Bytes.toBytes("idx"),        Bytes.toBytes(clusterMember.clusterIdx.toString))
     put.add(Bytes.toBytes("cluster"),Bytes.toBytes("description"),Bytes.toBytes(formatClusterTitle(clusterMember.centroid,clusterMember.clusterIdx)))
     put.add(Bytes.toBytes("member"), Bytes.toBytes("ports"),      Bytes.toBytes("TCP: "+clusterMember.ports.mkString(""," ","")))
     put.add(Bytes.toBytes("member"), Bytes.toBytes("frequencies"),Bytes.toBytes("TCP: "+
                                                                           clusterMember.frequency_vector
                                                                           .filter({case (port,freq) => clusterMember.ports.contains(port)})
                                                                           .map({case (port,freq) => port.toString+"="+
                                                                                                     "%.0f".format(freq)+"%"
                                                                                })
                                                                           .mkString(""," ","")
                                                                          ))
     put.add(Bytes.toBytes("member"), Bytes.toBytes("ip"),         Bytes.toBytes(clusterMember.memberIP))
     put.add(Bytes.toBytes("member"), Bytes.toBytes("distance"),   Bytes.toBytes("%.2f".format(clusterMember.distance)))
     
     
     HogHBaseRDD.hogzilla_cluster_members.put(put)
  }
  

} 
Example 22
Source File: BytesUtilsSuite.scala    From Spark-SQL-on-HBase   with Apache License 2.0
package org.apache.spark.sql.hbase

import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.Logging
import org.apache.spark.sql.hbase.types.HBaseBytesType
import org.apache.spark.sql.hbase.util.BinaryBytesUtils
import org.apache.spark.sql.types._
import org.scalatest.{BeforeAndAfterAll, FunSuite}

class BytesUtilsSuite extends FunSuite with BeforeAndAfterAll with Logging {
  test("Bytes Ordering Test") {
    val s = Seq(-257, -256, -255, -129, -128, -127, -64, -16, -4, -1,
      0, 1, 4, 16, 64, 127, 128, 129, 255, 256, 257)
    val result = s.map(i => (i, BinaryBytesUtils.create(IntegerType).toBytes(i)))
      .sortWith((f, s) =>
      HBaseBytesType.ordering.gt(
        f._2.asInstanceOf[HBaseBytesType.InternalType],
        s._2.asInstanceOf[HBaseBytesType.InternalType]))
    assert(result.map(a => a._1) == s.sorted.reverse)
  }

  def compare(a: Array[Byte], b: Array[Byte]): Int = {
    val length = Math.min(a.length, b.length)
    for (i <- 0 until length) {
      // compare bytes as unsigned values; the first differing byte decides the ordering
      val diff: Int = (a(i) & 0xff) - (b(i) & 0xff)
      if (diff != 0) {
        return diff
      }
    }
    // if one array is a prefix of the other, the shorter one sorts first
    a.length - b.length
  }

  test("Bytes Utility Test") {
    assert(BinaryBytesUtils.toBoolean(BinaryBytesUtils.create(BooleanType)
      .toBytes(input = true), 0) === true)
    assert(BinaryBytesUtils.toBoolean(BinaryBytesUtils.create(BooleanType)
      .toBytes(input = false), 0) === false)

    assert(BinaryBytesUtils.toDouble(BinaryBytesUtils.create(DoubleType).toBytes(12.34d), 0)
      === 12.34d)
    assert(BinaryBytesUtils.toDouble(BinaryBytesUtils.create(DoubleType).toBytes(-12.34d), 0)
      === -12.34d)

    assert(BinaryBytesUtils.toFloat(BinaryBytesUtils.create(FloatType).toBytes(12.34f), 0)
      === 12.34f)
    assert(BinaryBytesUtils.toFloat(BinaryBytesUtils.create(FloatType).toBytes(-12.34f), 0)
      === -12.34f)

    assert(BinaryBytesUtils.toInt(BinaryBytesUtils.create(IntegerType).toBytes(12), 0)
      === 12)
    assert(BinaryBytesUtils.toInt(BinaryBytesUtils.create(IntegerType).toBytes(-12), 0)
      === -12)

    assert(BinaryBytesUtils.toLong(BinaryBytesUtils.create(LongType).toBytes(1234l), 0)
      === 1234l)
    assert(BinaryBytesUtils.toLong(BinaryBytesUtils.create(LongType).toBytes(-1234l), 0)
      === -1234l)

    assert(BinaryBytesUtils.toShort(BinaryBytesUtils.create(ShortType)
      .toBytes(12.asInstanceOf[Short]), 0) === 12)
    assert(BinaryBytesUtils.toShort(BinaryBytesUtils.create(ShortType)
      .toBytes(-12.asInstanceOf[Short]), 0) === -12)

    assert(BinaryBytesUtils.toUTF8String(BinaryBytesUtils.create(StringType).toBytes("abc"), 0, 3)
      === UTF8String("abc"))
    assert(BinaryBytesUtils.toUTF8String(BinaryBytesUtils.create(StringType).toBytes(""), 0, 0) === UTF8String(""))

    assert(BinaryBytesUtils.toByte(BinaryBytesUtils.create(ByteType)
      .toBytes(5.asInstanceOf[Byte]), 0) === 5)
    assert(BinaryBytesUtils.toByte(BinaryBytesUtils.create(ByteType)
      .toBytes(-5.asInstanceOf[Byte]), 0) === -5)

    assert(compare(BinaryBytesUtils.create(IntegerType).toBytes(128),
      BinaryBytesUtils.create(IntegerType).toBytes(-128)) > 0)
  }

  test("byte array plus one") {
    var byteArray =  Array[Byte](0x01.toByte, 127.toByte)
    assert(Bytes.compareTo(BinaryBytesUtils.addOne(byteArray),  Array[Byte](0x01.toByte, 0x80.toByte)) == 0)

    byteArray =  Array[Byte](0xff.toByte, 0xff.toByte)
    assert(BinaryBytesUtils.addOne(byteArray) == null)

    byteArray =  Array[Byte](0x02.toByte, 0xff.toByte)
    assert(Bytes.compareTo(BinaryBytesUtils.addOne(byteArray),  Array[Byte](0x03.toByte, 0x00.toByte)) == 0)
  }

  test("float comparison") {
    val f1 = BinaryBytesUtils.create(FloatType).toBytes(-1.23f)
    val f2 = BinaryBytesUtils.create(FloatType).toBytes(100f)
    assert(Bytes.compareTo(f1, f2) < 0)
  }
} 
Example 23
Source File: HBasePartitioner.scala    From Spark-SQL-on-HBase   with Apache License 2.0
package org.apache.spark.sql.hbase

import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.Partitioner
import org.apache.spark.util.CollectionsUtils

object HBasePartitioner {
  implicit object HBaseRawOrdering extends Ordering[HBaseRawType] {
    def compare(a: HBaseRawType, b: HBaseRawType) = Bytes.compareTo(a, b)
  }
}

class HBasePartitioner (var splitKeys: Array[HBaseRawType]) extends Partitioner {
  import HBasePartitioner.HBaseRawOrdering

  type t = HBaseRawType

  lazy private val len = splitKeys.length

  // For pre-split table splitKeys(0) = bytes[0], to remove it,
  // otherwise partition 0 always be empty and
  // we will miss the last region's date when bulk load
  lazy private val realSplitKeys = if (splitKeys.isEmpty) splitKeys else splitKeys.tail

  def numPartitions = if (len == 0) 1 else len

  @transient private lazy val binarySearch: ((Array[t], t) => Int) = CollectionsUtils.makeBinarySearch[t]

  def getPartition(key: Any): Int = {
    val k = key.asInstanceOf[t]
    var partition = 0
    if (len <= 128 && len > 0) {
      // If we have less than 128 partitions naive search
      val ordering = implicitly[Ordering[t]]
      while (partition < realSplitKeys.length && ordering.gt(k, realSplitKeys(partition))) {
        partition += 1
      }
    } else {
      // Determine which binary search method to use only once.
      partition = binarySearch(realSplitKeys, k)
      // binarySearch either returns the match location or -[insertion point]-1
      if (partition < 0) {
        partition = -partition - 1
      }
      if (partition > realSplitKeys.length) {
        partition = realSplitKeys.length
      }
    }
    partition
  }

  override def equals(other: Any): Boolean = other match {
    case r: HBasePartitioner =>
      r.splitKeys.sameElements(splitKeys)
    case _ =>
      false
  }

  override def hashCode(): Int = {
    val prime = 31
    var result = 1
    var i = 0
    while (i < splitKeys.length) {
      result = prime * result + splitKeys(i).hashCode
      i += 1
    }
    result = prime * result
    result
  }
} 
Example 24
Source File: HBasePartitioner.scala    From Heracles   with Apache License 2.0
package org.apache.spark.sql.hbase

import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.Partitioner
import org.apache.spark.util.CollectionsUtils

object HBasePartitioner {
  implicit object HBaseRawOrdering extends Ordering[HBaseRawType] {
    def compare(a: HBaseRawType, b: HBaseRawType) = Bytes.compareTo(a, b)
  }
}

class HBasePartitioner (val splitKeys: Array[HBaseRawType]) extends Partitioner {
  import HBasePartitioner.HBaseRawOrdering

  type t = HBaseRawType

  lazy private val len = splitKeys.length

  // For pre-split table splitKeys(0) = bytes[0], to remove it,
  // otherwise partition 0 always be empty and
  // we will miss the last region's date when bulk load
  lazy private val realSplitKeys = if (splitKeys.isEmpty) splitKeys else splitKeys.tail

  override def numPartitions = if (len == 0) 1 else len

  @transient private lazy val binarySearch: ((Array[t], t) => Int) = CollectionsUtils.makeBinarySearch[t]

  override def getPartition(key: Any): Int = {
    val k = key.asInstanceOf[t]
    var partition = 0
    if (len <= 128 && len > 0) {
      // If we have less than 128 partitions naive search
      val ordering = implicitly[Ordering[t]]
      while (partition < realSplitKeys.length && ordering.gt(k, realSplitKeys(partition))) {
        partition += 1
      }
    } else {
      // Determine which binary search method to use only once.
      partition = binarySearch(realSplitKeys, k)
      // binarySearch either returns the match location or -[insertion point]-1
      if (partition < 0) {
        partition = -partition - 1
      }
      if (partition > realSplitKeys.length) {
        partition = realSplitKeys.length
      }
    }
    partition
  }

  override def equals(other: Any): Boolean = other match {
    case r: HBasePartitioner =>
      r.splitKeys.sameElements(splitKeys)
    case _ =>
      false
  }

  override def hashCode(): Int = {
    val prime = 31
    var result = 1
    var i = 0
    while (i < splitKeys.length) {
      result = prime * result + splitKeys(i).hashCode
      i += 1
    }
    result = prime * result
    result
  }
} 
Example 25
Source File: ByteArrayComparable.scala    From SparkOnHBase   with Apache License 2.0
package org.apache.hadoop.hbase.spark

import org.apache.hadoop.hbase.util.Bytes

class ByteArrayComparable(val bytes:Array[Byte], val offset:Int = 0, var length:Int = -1)
  extends Comparable[ByteArrayComparable] {

  if (length == -1) {
    length = bytes.length
  }

  override def compareTo(o: ByteArrayComparable): Int = {
    Bytes.compareTo(bytes, offset, length, o.bytes, o.offset, o.length)
  }

  override def hashCode(): Int = {
    Bytes.hashCode(bytes, offset, length)
  }

  override def equals (obj: Any): Boolean = {
    obj match {
      case b: ByteArrayComparable =>
        Bytes.equals(bytes, offset, length, b.bytes, b.offset, b.length)
      case _ =>
        false
    }
  }
} 
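A minimal usage sketch for the class above, assuming it is on the classpath under its package org.apache.hadoop.hbase.spark (the object name is illustrative):

import org.apache.hadoop.hbase.spark.ByteArrayComparable
import org.apache.hadoop.hbase.util.Bytes

object ByteArrayComparableUsage {
  def main(args: Array[String]): Unit = {
    // Wrap two row keys and compare them with the same byte ordering HBase uses
    val a = new ByteArrayComparable(Bytes.toBytes("row-001"))
    val b = new ByteArrayComparable(Bytes.toBytes("row-002"))
    println(a.compareTo(b) < 0)                                           // true
    println(a.equals(new ByteArrayComparable(Bytes.toBytes("row-001"))))  // true
  }
}
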
Example 26
Source File: HbaseReaderHelper.scala    From stream-reactor   with Apache License 2.0
package com.datamountaineer.streamreactor.connect.hbase.writers

import com.datamountaineer.streamreactor.connect.hbase.BytesHelper._
import com.datamountaineer.streamreactor.connect.hbase.HbaseHelper
import org.apache.hadoop.hbase.client.{Connection, ConnectionFactory, Scan}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{CellUtil, HBaseConfiguration, TableName}

import scala.collection.JavaConverters._

object HbaseReaderHelper {
  def createConnection: Connection = {
    ConnectionFactory.createConnection(HBaseConfiguration.create())
  }

  def getAllRecords(tableName: String, columnFamily: String)(implicit connection: Connection): List[HbaseRowData] = {
    HbaseHelper.withTable(TableName.valueOf(tableName)) { tbl =>
      val scan = new Scan()
      scan.addFamily(columnFamily.fromString())
      val scanner = tbl.getScanner(scan)
      scanner.asScala.map { rs =>
        val cells = rs.rawCells().map { cell =>
          Bytes.toString(CellUtil.cloneQualifier(cell)) -> CellUtil.cloneValue(cell)
        }.toMap
        HbaseRowData(rs.getRow, cells)
      }.toList
    }
  }

}

case class HbaseRowData(key: Array[Byte], cells: Map[String, Array[Byte]]) 
Example 27
Source File: HBaseServiceLayer.scala    From Taxi360   with Apache License 2.0
package com.cloudera.sa.taxi360.server.hbase

import javax.ws.rs._
import javax.ws.rs.core.MediaType

import com.cloudera.sa.taxi360.model.NyTaxiYellowTrip
import com.cloudera.sa.taxi360.streaming.ingestion.hbase.TaxiTripHBaseHelper
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Scan}
import org.apache.hadoop.hbase.util.Bytes

import scala.collection.mutable

@Path("rest")
class HBaseServiceLayer {

  @GET
  @Path("hello")
  @Produces(Array(MediaType.TEXT_PLAIN))
  def hello(): String = {
    "Hello World"
  }

  @GET
  @Path("vender/{venderId}/timeline")
  @Produces(Array(MediaType.APPLICATION_JSON))
  def getTripTimeLine (@PathParam("venderId") venderId:String,
                          @QueryParam("startTime") startTime:String = Long.MinValue.toString,
                          @QueryParam("endTime")  endTime:String = Long.MaxValue.toString): Array[NyTaxiYellowTrip] = {

    val table = HBaseGlobalValues.connection.getTable(TableName.valueOf(HBaseGlobalValues.appEventTableName))

    val st = if (startTime == null) {
      Long.MinValue.toString
    } else {
      startTime
    }
    val et = if (endTime == null) {
      Long.MaxValue.toString
    } else {
      endTime
    }

    val scan = new Scan()
    val startRowKey = TaxiTripHBaseHelper.generateRowKey(venderId, st.toLong, HBaseGlobalValues.numberOfSalts)
    println("startRowKey:" + Bytes.toString(startRowKey))
    scan.setStartRow(startRowKey)
    val endRowKey = TaxiTripHBaseHelper.generateRowKey(venderId, et.toLong, HBaseGlobalValues.numberOfSalts)
    println("endRowKey:" + Bytes.toString(endRowKey))
    scan.setStopRow(endRowKey)

    val scannerIt = table.getScanner(scan).iterator()

    val tripList = new mutable.MutableList[NyTaxiYellowTrip]

    while(scannerIt.hasNext) {
      val result = scannerIt.next()
      tripList += TaxiTripHBaseHelper.convertToTaxiTrip(result)
      println("Found a trip:" + TaxiTripHBaseHelper.convertToTaxiTrip(result))
    }

    println("tripList.size:" + tripList.size)

    tripList.toArray
  }

} 
Example 28
Source File: CreateSaltedTable.scala    From Taxi360   with Apache License 2.0 5 votes vote down vote up
package com.cloudera.sa.taxi360.setup.hbase

import java.io.File

import org.apache.commons.lang.StringUtils
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.io.compress.Compression
import org.apache.hadoop.hbase.regionserver.{BloomType, ConstantSizeRegionSplitPolicy}
import org.apache.hadoop.hbase.util.Bytes

import scala.collection.mutable


object CreateSaltedTable {
  def main(args:Array[String]): Unit = {

    if (args.length == 0) {
      println("<tableName> <columnFamily> <regionCount> <numOfSalts> <hbaseConfigFolder>")
    }
    val tableName = args(0)
    val columnFamilyName = args(1)
    val regionCount = args(2).toInt
    val numOfSalts = args(3).toInt
    val hbaseConfigFolder = args(4)

    val conf = HBaseConfiguration.create()

    conf.addResource(new File(hbaseConfigFolder + "hbase-site.xml").toURI.toURL)

    val connection = ConnectionFactory.createConnection(conf)

    val admin = connection.getAdmin

    val tableDescriptor = new HTableDescriptor(TableName.valueOf(tableName))

    val columnDescriptor = new HColumnDescriptor(columnFamilyName)

    columnDescriptor.setCompressionType(Compression.Algorithm.SNAPPY)
    columnDescriptor.setBlocksize(64 * 1024)
    columnDescriptor.setBloomFilterType(BloomType.ROW)

    tableDescriptor.addFamily(columnDescriptor)

    tableDescriptor.setMaxFileSize(Long.MaxValue)
    tableDescriptor.setRegionSplitPolicyClassName(classOf[ConstantSizeRegionSplitPolicy].getName)

    val splitKeys = new mutable.MutableList[Array[Byte]]
    for (i <- 0 to regionCount) {
      val regionSplitStr = StringUtils.leftPad((i*(numOfSalts/regionCount)).toString, 4, "0")
      splitKeys += Bytes.toBytes(regionSplitStr)
    }
    admin.createTable(tableDescriptor, splitKeys.toArray)
  }
} 
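The split keys above are simply zero-padded salt prefixes. A standalone sketch of that computation (no cluster needed); the 10 regions and 100 salt buckets are assumed values, not from the original project:

import org.apache.commons.lang.StringUtils

object SplitKeyPreview {
  def main(args: Array[String]): Unit = {
    val regionCount = 10   // illustrative values
    val numOfSalts = 100
    val splitKeys = (0 to regionCount).map { i =>
      StringUtils.leftPad((i * (numOfSalts / regionCount)).toString, 4, "0")
    }
    println(splitKeys.mkString(", ")) // 0000, 0010, 0020, ..., 0100
  }
}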
Example 29
Source File: Test.scala    From shc   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql

import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.execution.datasources.hbase.HBaseTableCatalog
import org.apache.spark.sql.types.BinaryType

object Test {
  def main(args: Array[String]) {
    val a: Array[Byte] = Array.fill(10)(Byte.MinValue)
    val b = Bytes.toBytes ("row003")
    System.arraycopy(b, 0, a, 0, b.length)
    val c = Bytes.toBytes(Int.MinValue)
    System.arraycopy(c, 0, a, b.length, c.length)
    val len = a.indexOf(HBaseTableCatalog.delimiter, 0)
    val s1 = Bytes.toString(a, 0, 6)
    val s2 = Bytes.toString(a, 0, len)

    val l = Array.fill(8)(Byte.MaxValue)
    Bytes.putDouble(l, 0, Double.MinValue)
    val m = Array.fill(8)(Byte.MaxValue)
    Bytes.putDouble(m, 0, -20.0)
    val n = Array.fill(8)(Byte.MaxValue)
    Bytes.putDouble(n, 0, 0.0)
    val o = Array.fill(8)(Byte.MaxValue)
    Bytes.putDouble(o,  0, 20.0)
    val p = Array.fill(8)(Byte.MaxValue)
    Bytes.putDouble(p, 0, Double.MaxValue)

    val c1 = BinaryType.ordering.compare(l, m)
    val c2 = BinaryType.ordering.compare(m, n)
    val c3 = BinaryType.ordering.compare(n, o)
    val c4 = BinaryType.ordering.compare(o, p)

    val p1 = Array.fill(10)(0: Byte)
    Bytes.putBytes(p1, 0, Bytes.toBytes("row010"), 0, 6)

    val p2 = Array.fill(10)(-1: Byte)
    Bytes.putBytes(p2, 0, Bytes.toBytes("row010"), 0, 6)

    val p3 = Array.fill(10)(Byte.MaxValue)
    Bytes.putBytes(p3, 0, Bytes.toBytes("row010"), 0, 6)
    Bytes.putInt(p3, 6, 10)

    val p4 = Bytes.compareTo(p1, p3)
    val p5 = Bytes.compareTo(p2, p3)



    val z = Array.fill(4)(Byte.MinValue)
    Bytes.putInt(z, 0, -1)
    val z1 = Array.fill(4)(Byte.MinValue)
    Bytes.putInt(z1, 0, -2147483648)

    val z2 = Bytes.compareTo(z, z1)


    val t = Array.fill(4)(-1: Byte)
    println(Bytes.toInt(t))

    val s = Bytes.toBytes(1.4.asInstanceOf[Float])
    println(Bytes.toInt(s))
    println(Bytes.toFloat(s))
    val w =  Bytes.toBytes(-1.4.asInstanceOf[Float])
    println(Bytes.toInt(w))
    println(Bytes.toFloat(w))

    val buffer1 = Bytes.toBytes(-1.0f)
    val b1 = Bytes.toInt(buffer1)
    var buffer = Array.fill(4)(-1: Byte)
    var buffer2 = Bytes.toBytes(-1.0f)

    var buffer3 = java.lang.Float.floatToIntBits(-1.0f)
    val b3 = Bytes.toBytes(buffer3)
    val out = Bytes.toInt(buffer1) ^ Integer.MIN_VALUE
    buffer2 = Bytes.toBytes(out)
    var i: Int = java.lang.Float.floatToIntBits(-1.0f)
    i = (i ^ ((i >> Integer.SIZE - 1) | Integer.MIN_VALUE)) + 1
    Bytes.putInt(buffer, 0, i)


    val mn = Bytes.toBytes(-0.0f)
    println(Bytes.toFloat(mn))
    println(Float.MinPositiveValue)



    println(s"a")
  }
} 
Example 30
Source File: SHC.scala    From shc   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql

import java.io.File

import com.google.common.io.Files
import org.apache.hadoop.hbase.{HColumnDescriptor, HTableDescriptor, TableName, HBaseTestingUtility}
import org.apache.hadoop.hbase.client.{Scan, Put, ConnectionFactory, Table}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.execution.datasources.hbase.SparkHBaseConf
import org.apache.spark.sql.types.UTF8String
import org.apache.spark.{SparkContext, SparkConf, Logging}
import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite}
import scala.collection.JavaConverters._

class SHC  extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll  with Logging {
  implicit class StringToColumn(val sc: StringContext) {
    def $(args: Any*): ColumnName = {
      new ColumnName(sc.s(args: _*))
    }
  }


  private[spark] var htu = HBaseTestingUtility.createLocalHTU()
  private[spark] def tableName = "table1"

  private[spark] def columnFamilies: Array[String] = Array.tabulate(9){ x=> s"cf$x"}
  var table: Table = null
  val conf = new SparkConf
  conf.set(SparkHBaseConf.testConf, "true")
  SparkHBaseConf.conf = htu.getConfiguration
  // private[spark] var columnFamilyStr = Bytes.toString(columnFamily)

  def catalog = s"""{
            |"table":{"namespace":"default", "name":"table1"},
            |"rowkey":"key",
            |"columns":{
              |"col0":{"cf":"rowkey", "col":"key", "type":"string"},
              |"col1":{"cf":"cf1", "col":"col1", "type":"boolean"},
              |"col2":{"cf":"cf2", "col":"col2", "type":"double"},
              |"col3":{"cf":"cf3", "col":"col3", "type":"float"},
              |"col4":{"cf":"cf4", "col":"col4", "type":"int"},
              |"col5":{"cf":"cf5", "col":"col5", "type":"bigint"},
              |"col6":{"cf":"cf6", "col":"col6", "type":"smallint"},
              |"col7":{"cf":"cf7", "col":"col7", "type":"string"},
              |"col8":{"cf":"cf8", "col":"col8", "type":"tinyint"}
            |}
          |}""".stripMargin

  override def beforeAll() {
    val tempDir: File = Files.createTempDir
    tempDir.deleteOnExit
    htu.cleanupTestDir
    htu.startMiniZKCluster
    htu.startMiniHBaseCluster(1, 4)
    logInfo(" - minicluster started")
    println(" - minicluster started")

  }

  override def afterAll() {
    try {
      table.close()
      println("shutdown")
      htu.deleteTable(TableName.valueOf(tableName))
      logInfo("shuting down minicluster")
      htu.shutdownMiniHBaseCluster
      htu.shutdownMiniZKCluster
      logInfo(" - minicluster shut down")
      htu.cleanupTestDir
    } catch {
      case _ : Throwable => logError("teardown error")
    }
  }

  def createTable(name: String, cfs: Array[String]) {
    val tName = Bytes.toBytes(name)
    val bcfs = cfs.map(Bytes.toBytes(_))
    try {
      htu.deleteTable(TableName.valueOf(tName))
    } catch {
      case _ : Throwable =>
        logInfo(" - no table " + name + " found")
    }
    htu.createMultiRegionTable(TableName.valueOf(tName), bcfs)
  }


  def createTable(name: Array[Byte], cfs: Array[Array[Byte]]) {
    try {
      htu.deleteTable(TableName.valueOf(name))
    } catch {
      case _ : Throwable =>
        logInfo(" - no table " + Bytes.toString(name) + " found")
    }
    htu.createMultiRegionTable(TableName.valueOf(name), cfs)
  }
} 
Example 31
Source File: Utils.scala    From shc   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.hbase

import java.util
import java.util.Comparator

import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.catalyst.expressions.MutableRow
import org.apache.spark.sql.catalyst.util._
import org.apache.spark.sql.execution.SparkSqlSerializer
import org.apache.spark.sql.types._

import scala.collection.mutable.ArrayBuffer
import scala.math.Ordering

object Utils {

  def setRowCol(
      row: MutableRow,
      field: (Field, Int),
      src: HBaseType,
      offset: Int,
      length: Int): Unit = {
    val index = field._2
    val f = field._1
    if (f.sedes.isDefined) {
      // If a sedes is already defined, use it.
      val m = f.sedes.get.deserialize(src, offset, length)
      row.update(index, m)
    } else if (f.exeSchema.isDefined) {
      // println("avro schema is defined to do deserialization")
      // If an Avro schema is defined, use it to get the record, then convert it to a Catalyst data type
      val m = AvroSedes.deserialize(src, f.exeSchema.get)
      // println(m)
      val n = f.avroToCatalyst.map(_(m))
      row.update(index, n.get)
    } else  {
      // Fall back to atomic type
      f.dt match {
        case BooleanType => row.setBoolean(index, toBoolean(src, offset))
        case ByteType => row.setByte(index, src(offset))
        case DoubleType => row.setDouble(index, Bytes.toDouble(src, offset))
        case FloatType => row.setFloat(index, Bytes.toFloat(src, offset))
        case IntegerType => row.setInt(index, Bytes.toInt(src, offset))
        case LongType => row.setLong(index, Bytes.toLong(src, offset))
        case ShortType => row.setShort(index, Bytes.toShort(src, offset))
        case StringType => row.update(index, toUTF8String(src, offset, length))
        case BinaryType =>
          val newArray = new Array[Byte](length)
          System.arraycopy(src, offset, newArray, 0, length)
          row.update(index, newArray)
        case _ => row.update(index, SparkSqlSerializer.deserialize[Any](src)) //TODO
      }
    }
  }

  // convert input to data type
  def toBytes(input: Any, field: Field): Array[Byte] = {
    if (field.sedes.isDefined) {
      field.sedes.get.serialize(input)
    } else if (field.schema.isDefined) {
      // Here we assume the top level type is structType
      val record = field.catalystToAvro(input)
      AvroSedes.serialize(record, field.schema.get)
    } else {
      input match {
        case data: Boolean => Bytes.toBytes(data)
        case data: Byte => Array(data)
        case data: Array[Byte] => data
        case data: Double => Bytes.toBytes(data)
        case data: Float => Bytes.toBytes(data)
        case data: Int => Bytes.toBytes(data)
        case data: Long => Bytes.toBytes(data)
        case data: Short => Bytes.toBytes(data)
        case data: UTF8String => data.getBytes
        case data: String => Bytes.toBytes(data)
          //Bytes.toBytes(input.asInstanceOf[String])//input.asInstanceOf[UTF8String].getBytes
        case _ => throw new Exception(s"unsupported data type ${field.dt}") //TODO
      }
    }
  }

  def toBoolean(input: HBaseType, offset: Int): Boolean = {
    input(offset) != 0
  }

  def toUTF8String(input: HBaseType, offset: Int, length: Int): UTF8String = {
    UTF8String(input.slice(offset, offset + length))
  }
} 
Example 32
Source File: Sedes.scala    From shc   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.hbase

import java.io.ByteArrayInputStream

import org.apache.avro.Schema
import org.apache.avro.Schema.Type._
import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io._
import org.apache.commons.io.output.ByteArrayOutputStream
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.types._

trait Sedes {
  def serialize(value: Any): Array[Byte]
  def deserialize(bytes: Array[Byte], start: Int, end: Int): Any
}

class DoubleSedes extends Sedes {
  override def serialize(value: Any): Array[Byte] = Bytes.toBytes(value.asInstanceOf[Double])
  override def deserialize(bytes: Array[Byte], start: Int, end: Int): Any = {
    Bytes.toDouble(bytes, start)
  }
} 
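A small round-trip sketch for the serde above (a sanity check, not part of the original source):

import org.apache.spark.sql.execution.datasources.hbase.DoubleSedes

object DoubleSedesRoundTrip {
  def main(args: Array[String]): Unit = {
    val sedes = new DoubleSedes
    val bytes = sedes.serialize(3.14)
    println(sedes.deserialize(bytes, 0, bytes.length)) // 3.14
  }
}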
Example 33
Source File: HBaseTestSuite.scala    From shc   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql

import java.io.File

import scala.collection.JavaConverters._

import com.google.common.io.Files
import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{TableName, HBaseTestingUtility}
import org.apache.spark.sql.execution.datasources.hbase.Logging
import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite}

class HBaseTestSuite extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll  with Logging {
  private[spark] var htu = HBaseTestingUtility.createLocalHTU()
  private[spark] var tableName: Array[Byte] = Bytes.toBytes("t1")
  private[spark] var columnFamily: Array[Byte] = Bytes.toBytes("cf0")
  private[spark] var columnFamilies: Array[Array[Byte]] =
    Array(Bytes.toBytes("cf0"), Bytes.toBytes("cf1"), Bytes.toBytes("cf2"), Bytes.toBytes("cf3"), Bytes.toBytes("cf4"))
  var table: Table = null
  // private[spark] var columnFamilyStr = Bytes.toString(columnFamily)

  override def beforeAll() {
    val tempDir: File = Files.createTempDir
    tempDir.deleteOnExit
    htu.cleanupTestDir
    htu.startMiniZKCluster
    htu.startMiniHBaseCluster(1, 4)
    logInfo(" - minicluster started")
    println(" - minicluster started")
    try {
      htu.deleteTable(TableName.valueOf(tableName))

      //htu.createTable(TableName.valueOf(tableName), columnFamily, 2, Bytes.toBytes("abc"), Bytes.toBytes("xyz"), 2)
    } catch {
      case _ : Throwable =>
        logInfo(" - no table " + Bytes.toString(tableName) + " found")
    }
    setupTable()
  }



  override def afterAll() {
    try {
      table.close()
      println("shutdown")
      htu.deleteTable(TableName.valueOf(tableName))
      logInfo("shuting down minicluster")
      htu.shutdownMiniHBaseCluster
      htu.shutdownMiniZKCluster
      logInfo(" - minicluster shut down")
      htu.cleanupTestDir
    } catch {
      case _ : Throwable => logError("teardown error")
    }
  }

  def setupTable() {
    val config = htu.getConfiguration
    htu.createMultiRegionTable(TableName.valueOf(tableName), columnFamilies)
    println("create htable t1")
    val connection = ConnectionFactory.createConnection(config)
    val r = connection.getRegionLocator(TableName.valueOf("t1"))
    table = connection.getTable(TableName.valueOf("t1"))

    val regionLocations = r.getAllRegionLocations.asScala.toSeq
    println(s"$regionLocations size: ${regionLocations.size}")
    (0 until 100).foreach { x =>
      var put = new Put(Bytes.toBytes(s"row$x"))
      (0 until 5).foreach { y =>
        put.addColumn(columnFamilies(y), Bytes.toBytes(s"c$y"), Bytes.toBytes(s"value $x $y"))
      }
      table.put(put)
    }
  }
} 
Example 34
Source File: Test.scala    From shc   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql

import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.execution.datasources.hbase.HBaseTableCatalog
import org.apache.spark.sql.types.BinaryType

object Test {
  def main(args: Array[String]) {
    val a: Array[Byte] = Array.fill(10)(Byte.MinValue)
    val b = Bytes.toBytes ("row003")
    System.arraycopy(b, 0, a, 0, b.length)
    val c = Bytes.toBytes(Int.MinValue)
    System.arraycopy(c, 0, a, b.length, c.length)
    val len = a.indexOf(HBaseTableCatalog.delimiter, 0)
    val s1 = Bytes.toString(a, 0, 6)
    val s2 = Bytes.toString(a, 0, len)

    
    val l = Array.fill(8)(Byte.MaxValue)
    Bytes.putDouble(l, 0, Double.MinValue)
    val m = Array.fill(8)(Byte.MaxValue)
    Bytes.putDouble(m, 0, -20.0)
    val n = Array.fill(8)(Byte.MaxValue)
    Bytes.putDouble(n, 0, 0.0)
    val o = Array.fill(8)(Byte.MaxValue)
    Bytes.putDouble(o,  0, 20.0)
    val p = Array.fill(8)(Byte.MaxValue)
    Bytes.putDouble(p, 0, Double.MaxValue)

    val c1 = BinaryType.ordering.compare(l, m)
    val c2 = BinaryType.ordering.compare(m, n)
    val c3 = BinaryType.ordering.compare(n, o)
    val c4 = BinaryType.ordering.compare(o, p)

    val p1 = Array.fill(10)(0: Byte)
    Bytes.putBytes(p1, 0, Bytes.toBytes("row010"), 0, 6)

    val p2 = Array.fill(10)(-1: Byte)
    Bytes.putBytes(p2, 0, Bytes.toBytes("row010"), 0, 6)

    val p3 = Array.fill(10)(Byte.MaxValue)
    Bytes.putBytes(p3, 0, Bytes.toBytes("row010"), 0, 6)
    Bytes.putInt(p3, 6, 10)

    val p4 = Bytes.compareTo(p1, p3)
    val p5 = Bytes.compareTo(p2, p3)

    val z = Array.fill(4)(Byte.MinValue)
    Bytes.putInt(z, 0, -1)
    val z1 = Array.fill(4)(Byte.MinValue)
    Bytes.putInt(z1, 0, -2147483648)

    val z2 = Bytes.compareTo(z, z1)

    val t = Array.fill(4)(-1: Byte)
    println(Bytes.toInt(t))

    val s = Bytes.toBytes(1.4.asInstanceOf[Float])
    println(Bytes.toInt(s))
    println(Bytes.toFloat(s))
    val w =  Bytes.toBytes(-1.4.asInstanceOf[Float])
    println(Bytes.toInt(w))
    println(Bytes.toFloat(w))

    val buffer1 = Bytes.toBytes(-1.0f)
    val b1 = Bytes.toInt(buffer1)
    var buffer = Array.fill(4)(-1: Byte)
    var buffer2 = Bytes.toBytes(-1.0f)

    var buffer3 = java.lang.Float.floatToIntBits(-1.0f)
    val b3 = Bytes.toBytes(buffer3)
    val out = Bytes.toInt(buffer1) ^ Integer.MIN_VALUE
    buffer2 = Bytes.toBytes(out)
    var i: Int = java.lang.Float.floatToIntBits(-1.0f)
    i = (i ^ ((i >> Integer.SIZE - 1) | Integer.MIN_VALUE)) + 1
    Bytes.putInt(buffer, 0, i)

    val mn = Bytes.toBytes(-0.0f)
    println(Bytes.toFloat(mn))
    println(Float.MinPositiveValue)

    println(s"a")
  }
} 
Example 35
Source File: SHC.scala    From shc   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql

import org.apache.spark.sql.execution.datasources.hbase.Logging

import java.io.File

import com.google.common.io.Files
import org.apache.hadoop.hbase.client.Table
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseTestingUtility, TableName}
import org.apache.spark.sql.execution.datasources.hbase.SparkHBaseConf
import org.apache.spark.{SparkContext, SparkConf}
import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite}

class SHC  extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll  with Logging {
  implicit class StringToColumn(val sc: StringContext) {
    def $(args: Any*): ColumnName = {
      new ColumnName(sc.s(args: _*))
    }
  }

  var spark: SparkSession = null
  var sc: SparkContext = null
  var sqlContext: SQLContext = null
  var df: DataFrame = null

  private[spark] var htu = new HBaseTestingUtility
  private[spark] def tableName = "table1"

  private[spark] def columnFamilies: Array[String] = Array.tabulate(9){ x=> s"cf$x"}
  var table: Table = null
  val conf = new SparkConf
  conf.set(SparkHBaseConf.testConf, "true")
  // private[spark] var columnFamilyStr = Bytes.toString(columnFamily)

  def defineCatalog(tName: String) = s"""{
                                         |"table":{"namespace":"default", "name":"$tName"},
                                         |"rowkey":"key",
                                         |"columns":{
                                              |"col0":{"cf":"rowkey", "col":"key", "type":"string"},
                                              |"col1":{"cf":"cf1", "col":"col1", "type":"boolean"},
                                              |"col2":{"cf":"cf2", "col":"col2", "type":"double"},
                                              |"col3":{"cf":"cf3", "col":"col3", "type":"float"},
                                              |"col4":{"cf":"cf4", "col":"col4", "type":"int"},
                                              |"col5":{"cf":"cf5", "col":"col5", "type":"bigint"},
                                              |"col6":{"cf":"cf6", "col":"col6", "type":"smallint"},
                                              |"col7":{"cf":"cf7", "col":"col7", "type":"string"},
                                              |"col8":{"cf":"cf8", "col":"col8", "type":"tinyint"}
                                            |}
                                         |}""".stripMargin

  @deprecated(since = "04.12.2017(dd/mm/year)", message = "use `defineCatalog` instead")
  def catalog = defineCatalog(tableName)

  override def beforeAll() {
    val tempDir: File = Files.createTempDir
    tempDir.deleteOnExit
    htu.startMiniCluster
    SparkHBaseConf.conf = htu.getConfiguration
    logInfo(" - minicluster started")
    println(" - minicluster started")

    spark = SparkSession.builder()
      .master("local")
      .appName("HBaseTest")
      .config(conf)
      .getOrCreate()

    sqlContext = spark.sqlContext
    sc = spark.sparkContext
  }

  override def afterAll() {
    htu.shutdownMiniCluster()
    spark.stop()
  }

  def createTable(name: String, cfs: Array[String]) {
    val tName = Bytes.toBytes(name)
    val bcfs = cfs.map(Bytes.toBytes(_))
    try {
      htu.deleteTable(TableName.valueOf(tName))
    } catch {
      case _ : Throwable =>
        logInfo(" - no table " + name + " found")
    }
    htu.createMultiRegionTable(TableName.valueOf(tName), bcfs)
  }


  def createTable(name: Array[Byte], cfs: Array[Array[Byte]]) {
    try {
      htu.deleteTable(TableName.valueOf(name))
    } catch {
      case _ : Throwable =>
        logInfo(" - no table " + Bytes.toString(name) + " found")
    }
    htu.createMultiRegionTable(TableName.valueOf(name), cfs)
  }
} 
Example 36
Source File: AvroRecordRowKeyBuilderTest.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.hbase

import com.datamountaineer.streamreactor.connect.hbase.BytesHelper._
import com.datamountaineer.streamreactor.connect.hbase.avro.AvroRecordFieldExtractorMapFn
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.hbase.util.Bytes
import org.apache.kafka.connect.sink.SinkRecord
import org.mockito.MockitoSugar
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class AvroRecordRowKeyBuilderTest extends AnyWordSpec with Matchers with MockitoSugar {
  val schema: Schema = new Schema.Parser().parse(PersonAvroSchema.schema)

  "AvroRecordRowKeyBuilder" should {
    "extract the values from the avro record and create the key" in {
      val keys = Seq("firstName", "lastName", "age")
      val rowKeyBuilder = new AvroRecordRowKeyBuilderBytes(AvroRecordFieldExtractorMapFn(schema, keys), keys)

      val sinkRecord = mock[SinkRecord]
      val firstName = "Jack"
      val lastName = "Smith"
      val age = 29

      val record = new GenericRecord {

        val values: Map[String, AnyRef] = Map("firstName" -> firstName, "lastName" -> lastName, "age" -> Int.box(age))

        override def get(key: String): AnyRef = values(key)

        override def put(key: String, v: scala.Any): Unit = sys.error("not supported")

        override def get(i: Int): AnyRef = sys.error("not supported")


        override def put(i: Int, v: scala.Any): Unit = sys.error("not supported")


        override def getSchema: Schema = sys.error("not supported")
      }

      val expectedValue = Bytes.add(
        Array(
          firstName.fromString(),
          rowKeyBuilder.delimBytes,
          lastName.fromString(),
          rowKeyBuilder.delimBytes,
          age.fromInt()))
      rowKeyBuilder.build(sinkRecord, record) shouldBe expectedValue
    }
  }
} 
Example 37
Source File: StructFieldsRowKeyBuilderTest.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.hbase

import com.datamountaineer.streamreactor.connect.hbase.BytesHelper._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.kafka.connect.sink.SinkRecord
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class StructFieldsRowKeyBuilderTest extends AnyWordSpec with Matchers {
  "StructFieldsRowKeyBuilder" should {
    "raise an exception if the field is not present in the struct" in {
      intercept[IllegalArgumentException] {
        val schema = SchemaBuilder.struct().name("com.example.Person")
          .field("firstName", Schema.STRING_SCHEMA)
          .field("age", Schema.INT32_SCHEMA)
          .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

        val struct = new Struct(schema).put("firstName", "Alex").put("age", 30)

        val sinkRecord = new SinkRecord("sometopic", 1, null, null, schema, struct, 1)
        //val field = Field("threshold", "threshold", false)

        StructFieldsRowKeyBuilderBytes(List("threshold")).build(sinkRecord, null)
      }
    }

    "create the row key based on one single field in the struct" in {
      val schema = SchemaBuilder.struct().name("com.example.Person")
        .field("firstName", Schema.STRING_SCHEMA)
        .field("age", Schema.INT32_SCHEMA)
        .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

      val struct = new Struct(schema).put("firstName", "Alex").put("age", 30)

      //val field = Field("firstName", "firstName", true)
      val sinkRecord = new SinkRecord("sometopic", 1, null, null, schema, struct, 1)
      StructFieldsRowKeyBuilderBytes(List("firstName")).build(sinkRecord, null) shouldBe "Alex".fromString
    }

    "create the row key based on more thant one field in the struct" in {
      val schema = SchemaBuilder.struct().name("com.example.Person")
        .field("firstName", Schema.STRING_SCHEMA)
        .field("age", Schema.INT32_SCHEMA)
        .field("threshold", Schema.OPTIONAL_FLOAT64_SCHEMA).build()

      val struct = new Struct(schema).put("firstName", "Alex").put("age", 30)

      //val field = Field("firstName", "firstName", true)
      //val field2 = Field("age", "age", true)
      val sinkRecord = new SinkRecord("sometopic", 1, null, null, schema, struct, 1)
      StructFieldsRowKeyBuilderBytes(List("firstName", "age")).build(sinkRecord, null) shouldBe
        Bytes.add("Alex".fromString(), "\n".fromString(), 30.fromInt())
    }
  }
} 
Example 38
Source File: ColumnFamilyQualifierMapKeyWrapper.scala    From SparkOnHBase   with Apache License 2.0 5 votes vote down vote up
package org.apache.hadoop.hbase.spark

import org.apache.hadoop.hbase.util.Bytes


class ColumnFamilyQualifierMapKeyWrapper(val columnFamily:Array[Byte],
                                         val columnFamilyOffSet:Int,
                                         val columnFamilyLength:Int,
                                         val qualifier:Array[Byte],
                                         val qualifierOffSet:Int,
                                         val qualifierLength:Int)
  extends Serializable{

  override def equals(other:Any): Boolean = {
    val otherWrapper = other.asInstanceOf[ColumnFamilyQualifierMapKeyWrapper]

    Bytes.compareTo(columnFamily,
      columnFamilyOffSet,
      columnFamilyLength,
      otherWrapper.columnFamily,
      otherWrapper.columnFamilyOffSet,
      otherWrapper.columnFamilyLength) == 0 && Bytes.compareTo(qualifier,
        qualifierOffSet,
        qualifierLength,
        otherWrapper.qualifier,
        otherWrapper.qualifierOffSet,
        otherWrapper.qualifierLength) == 0
  }

  override def hashCode():Int = {
    Bytes.hashCode(columnFamily, columnFamilyOffSet, columnFamilyLength) +
      Bytes.hashCode(qualifier, qualifierOffSet, qualifierLength)
  }

  def cloneColumnFamily():Array[Byte] = {
    val resultArray = new Array[Byte](columnFamilyLength)
    System.arraycopy(columnFamily, columnFamilyOffSet, resultArray, 0, columnFamilyLength)
    resultArray
  }

  def cloneQualifier():Array[Byte] = {
    val resultArray = new Array[Byte](qualifierLength)
    System.arraycopy(qualifier, qualifierOffSet, resultArray, 0, qualifierLength)
    resultArray
  }
} 
Example 39
Source File: GenericRowKeyBuilderTest.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.hbase

import com.datamountaineer.streamreactor.connect.hbase.BytesHelper._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.kafka.connect.data.Schema
import org.apache.kafka.connect.sink.SinkRecord
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class GenericRowKeyBuilderTest extends AnyWordSpec with Matchers {
  "GenericRowKeyBuilder" should {
    "use the topic, partition and offset to make the key" in {

      val topic = "sometopic"
      val partition = 2
      val offset = 1243L
      val sinkRecord = new SinkRecord(topic, partition, Schema.INT32_SCHEMA, 345, Schema.STRING_SCHEMA, "", offset)

      val keyBuilder = new GenericRowKeyBuilderBytes()
      val expected = Bytes.add(Array(topic.fromString(), keyBuilder.delimiterBytes, partition.fromString(),
        keyBuilder.delimiterBytes, offset.fromString()))
      keyBuilder.build(sinkRecord, Nil) shouldBe expected
    }
  }
} 
Example 40
Source File: AvroRecordFieldExtractorMapFnTest.scala    From stream-reactor   with Apache License 2.0 5 votes vote down vote up
package com.datamountaineer.streamreactor.connect.hbase.avro

import java.nio.file.Paths

import org.apache.avro.Schema
import org.apache.hadoop.hbase.util.Bytes
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class AvroRecordFieldExtractorMapFnTest extends AnyWordSpec with Matchers {

  val schema: Schema = new Schema.Parser().parse(Paths.get(getClass.getResource("/person.avsc").toURI).toFile)

  "AvroRecordFieldExtractorMapFn" should {
    "raise an exception if the given field does not exist in the schema" in {
      intercept[IllegalArgumentException] {
        AvroRecordFieldExtractorMapFn(schema, Seq("wrongField"))
      }
    }

    "raise an exception if the given field is not a primitive" in {
      intercept[IllegalArgumentException] {
        AvroRecordFieldExtractorMapFn(schema, Seq("address"))
      }
    }

    "create the mappings for all the given fields" in {
      val mappings = AvroRecordFieldExtractorMapFn(schema, Seq("firstName", "age"))

      val fnFirstName = mappings("firstName")
      val firstName = "Beaky"
      fnFirstName(firstName) shouldBe Bytes.toBytes(firstName)

      val fnAge = mappings("age")
      val age = 31
      fnAge(age) shouldBe Bytes.toBytes(age)
      intercept[ClassCastException] {
        fnAge(12.4)
      }
    }
  }
} 
Example 41
Source File: HBaseCatalogSuite.scala    From hbase-connectors   with Apache License 2.0 5 votes vote down vote up
package org.apache.hadoop.hbase.spark

import org.apache.hadoop.hbase.spark.datasources.{DataTypeParserWrapper, DoubleSerDes, HBaseTableCatalog}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.types._
import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite}

class HBaseCatalogSuite extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll  with Logging {

  val map = s"""MAP<int, struct<varchar:string>>"""
  val array = s"""array<struct<tinYint:tinyint>>"""
  val arrayMap = s"""MAp<int, ARRAY<double>>"""
  val catalog = s"""{
                    |"table":{"namespace":"default", "name":"htable"},
                    |"rowkey":"key1:key2",
                    |"columns":{
                    |"col1":{"cf":"rowkey", "col":"key1", "type":"string"},
                    |"col2":{"cf":"rowkey", "col":"key2", "type":"double"},
                    |"col3":{"cf":"cf1", "col":"col2", "type":"binary"},
                    |"col4":{"cf":"cf1", "col":"col3", "type":"timestamp"},
                    |"col5":{"cf":"cf1", "col":"col4", "type":"double", "serdes":"${classOf[DoubleSerDes].getName}"},
                    |"col6":{"cf":"cf1", "col":"col5", "type":"$map"},
                    |"col7":{"cf":"cf1", "col":"col6", "type":"$array"},
                    |"col8":{"cf":"cf1", "col":"col7", "type":"$arrayMap"},
                    |"col9":{"cf":"cf1", "col":"col8", "type":"date"},
                    |"col10":{"cf":"cf1", "col":"col9", "type":"timestamp"}
                    |}
                    |}""".stripMargin
  val parameters = Map(HBaseTableCatalog.tableCatalog->catalog)
  val t = HBaseTableCatalog(parameters)

  def checkDataType(dataTypeString: String, expectedDataType: DataType): Unit = {
    test(s"parse ${dataTypeString.replace("\n", "")}") {
      assert(DataTypeParserWrapper.parse(dataTypeString) === expectedDataType)
    }
  }
  test("basic") {
    assert(t.getField("col1").isRowKey == true)
    assert(t.getPrimaryKey == "key1")
    assert(t.getField("col3").dt == BinaryType)
    assert(t.getField("col4").dt == TimestampType)
    assert(t.getField("col5").dt == DoubleType)
    assert(t.getField("col5").serdes != None)
    assert(t.getField("col4").serdes == None)
    assert(t.getField("col1").isRowKey)
    assert(t.getField("col2").isRowKey)
    assert(!t.getField("col3").isRowKey)
    assert(t.getField("col2").length == Bytes.SIZEOF_DOUBLE)
    assert(t.getField("col1").length == -1)
    assert(t.getField("col8").length == -1)
    assert(t.getField("col9").dt == DateType)
    assert(t.getField("col10").dt == TimestampType)
  }

  checkDataType(
    map,
    t.getField("col6").dt
  )

  checkDataType(
    array,
    t.getField("col7").dt
  )

  checkDataType(
    arrayMap,
    t.getField("col8").dt
  )

  test("convert") {
    val m = Map("hbase.columns.mapping" ->
      "KEY_FIELD STRING :key, A_FIELD STRING c:a, B_FIELD DOUBLE c:b, C_FIELD BINARY c:c,",
      "hbase.table" -> "t1")
    val map = HBaseTableCatalog.convert(m)
    val json = map.get(HBaseTableCatalog.tableCatalog).get
    val parameters = Map(HBaseTableCatalog.tableCatalog->json)
    val t = HBaseTableCatalog(parameters)
    assert(t.getField("KEY_FIELD").isRowKey)
    assert(DataTypeParserWrapper.parse("STRING") === t.getField("A_FIELD").dt)
    assert(!t.getField("A_FIELD").isRowKey)
    assert(DataTypeParserWrapper.parse("DOUBLE") === t.getField("B_FIELD").dt)
    assert(DataTypeParserWrapper.parse("BINARY") === t.getField("C_FIELD").dt)
  }

  test("compatibility") {
    val m = Map("hbase.columns.mapping" ->
      "KEY_FIELD STRING :key, A_FIELD STRING c:a, B_FIELD DOUBLE c:b, C_FIELD BINARY c:c,",
      "hbase.table" -> "t1")
    val t = HBaseTableCatalog(m)
    assert(t.getField("KEY_FIELD").isRowKey)
    assert(DataTypeParserWrapper.parse("STRING") === t.getField("A_FIELD").dt)
    assert(!t.getField("A_FIELD").isRowKey)
    assert(DataTypeParserWrapper.parse("DOUBLE") === t.getField("B_FIELD").dt)
    assert(DataTypeParserWrapper.parse("BINARY") === t.getField("C_FIELD").dt)
  }
} 
Example 42
Source File: ByteArrayComparable.scala    From hbase-connectors   with Apache License 2.0 5 votes vote down vote up
package org.apache.hadoop.hbase.spark

import org.apache.yetus.audience.InterfaceAudience;
import org.apache.hadoop.hbase.util.Bytes

@InterfaceAudience.Public
class ByteArrayComparable(val bytes:Array[Byte], val offset:Int = 0, var length:Int = -1)
  extends Comparable[ByteArrayComparable] {

  if (length == -1) {
    length = bytes.length
  }

  override def compareTo(o: ByteArrayComparable): Int = {
    Bytes.compareTo(bytes, offset, length, o.bytes, o.offset, o.length)
  }

  override def hashCode(): Int = {
    Bytes.hashCode(bytes, offset, length)
  }

  override def equals (obj: Any): Boolean = {
    obj match {
      case b: ByteArrayComparable =>
        Bytes.equals(bytes, offset, length, b.bytes, b.offset, b.length)
      case _ =>
        false
    }
  }
} 
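A short sketch of how the comparable above can wrap a slice of a larger buffer without copying it; the byte values and offsets are illustrative.

import org.apache.hadoop.hbase.spark.ByteArrayComparable
import org.apache.hadoop.hbase.util.Bytes

object ByteArrayComparableUsage {
  def main(args: Array[String]): Unit = {
    val buffer = Bytes.add(Bytes.toBytes("prefix-"), Bytes.toBytes("row1"))
    val slice = new ByteArrayComparable(buffer, offset = 7, length = 4)   // just the "row1" bytes
    val whole = new ByteArrayComparable(Bytes.toBytes("row2"))
    println(slice.compareTo(whole) < 0)                                   // true: "row1" sorts before "row2"
    println(slice.equals(new ByteArrayComparable(Bytes.toBytes("row1")))) // true: same bytes
  }
}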
Example 43
Source File: ColumnFamilyQualifierMapKeyWrapper.scala    From hbase-connectors   with Apache License 2.0 5 votes vote down vote up
package org.apache.hadoop.hbase.spark

import org.apache.yetus.audience.InterfaceAudience;
import org.apache.hadoop.hbase.util.Bytes


@InterfaceAudience.Public
class ColumnFamilyQualifierMapKeyWrapper(val columnFamily:Array[Byte],
                                         val columnFamilyOffSet:Int,
                                         val columnFamilyLength:Int,
                                         val qualifier:Array[Byte],
                                         val qualifierOffSet:Int,
                                         val qualifierLength:Int)
  extends Serializable{

  override def equals(other:Any): Boolean = {
    val otherWrapper = other.asInstanceOf[ColumnFamilyQualifierMapKeyWrapper]

    Bytes.compareTo(columnFamily,
      columnFamilyOffSet,
      columnFamilyLength,
      otherWrapper.columnFamily,
      otherWrapper.columnFamilyOffSet,
      otherWrapper.columnFamilyLength) == 0 && Bytes.compareTo(qualifier,
        qualifierOffSet,
        qualifierLength,
        otherWrapper.qualifier,
        otherWrapper.qualifierOffSet,
        otherWrapper.qualifierLength) == 0
  }

  override def hashCode():Int = {
    Bytes.hashCode(columnFamily, columnFamilyOffSet, columnFamilyLength) +
      Bytes.hashCode(qualifier, qualifierOffSet, qualifierLength)
  }

  def cloneColumnFamily():Array[Byte] = {
    val resultArray = new Array[Byte](columnFamilyLength)
    System.arraycopy(columnFamily, columnFamilyOffSet, resultArray, 0, columnFamilyLength)
    resultArray
  }

  def cloneQualifier():Array[Byte] = {
    val resultArray = new Array[Byte](qualifierLength)
    System.arraycopy(qualifier, qualifierOffSet, resultArray, 0, qualifierLength)
    resultArray
  }
} 
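A minimal sketch of the wrapper above: it references (family, qualifier) slices inside a larger buffer without copying, yet still compares by value. Offsets and names are illustrative.

import org.apache.hadoop.hbase.spark.ColumnFamilyQualifierMapKeyWrapper
import org.apache.hadoop.hbase.util.Bytes

object ColumnFamilyQualifierMapKeyWrapperUsage {
  def main(args: Array[String]): Unit = {
    val buffer = Bytes.add(Bytes.toBytes("cf"), Bytes.toBytes("col1"))   // "cf" followed by "col1"
    val sliced = new ColumnFamilyQualifierMapKeyWrapper(buffer, 0, 2, buffer, 2, 4)
    val copied = new ColumnFamilyQualifierMapKeyWrapper(
      Bytes.toBytes("cf"), 0, 2, Bytes.toBytes("col1"), 0, 4)
    println(sliced == copied)                        // true: compared byte-by-byte, not by reference
    println(Bytes.toString(sliced.cloneQualifier())) // col1
  }
}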
Example 44
Source File: ByteArrayWrapper.scala    From hbase-connectors   with Apache License 2.0 5 votes vote down vote up
package org.apache.hadoop.hbase.spark

import java.io.Serializable

import org.apache.yetus.audience.InterfaceAudience;
import org.apache.hadoop.hbase.util.Bytes


@InterfaceAudience.Public
class ByteArrayWrapper (var value:Array[Byte])
  extends Comparable[ByteArrayWrapper] with Serializable {
  override def compareTo(valueOther: ByteArrayWrapper): Int = {
    Bytes.compareTo(value,valueOther.value)
  }
  override def equals(o2: Any): Boolean = {
    o2 match {
      case wrapper: ByteArrayWrapper =>
        Bytes.equals(value, wrapper.value)
      case _ =>
        false
    }
  }
  override def hashCode():Int = {
    Bytes.hashCode(value)
  }
} 
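A quick sketch of why the wrapper above exists: raw Array[Byte] keys compare by reference, while wrapped keys compare by content, so they behave sensibly as map keys (values illustrative).

import org.apache.hadoop.hbase.spark.ByteArrayWrapper
import org.apache.hadoop.hbase.util.Bytes

object ByteArrayWrapperUsage {
  def main(args: Array[String]): Unit = {
    val k1 = new ByteArrayWrapper(Bytes.toBytes("row1"))
    val k2 = new ByteArrayWrapper(Bytes.toBytes("row1"))
    println(k1 == k2)                    // true: same bytes
    println(Map(k1 -> "v").contains(k2)) // true: hashCode/equals are content-based
  }
}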
Example 45
Source File: BulkLoadPartitioner.scala    From hbase-connectors   with Apache License 2.0 5 votes vote down vote up
package org.apache.hadoop.hbase.spark

import java.util
import java.util.Comparator

import org.apache.yetus.audience.InterfaceAudience;
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.Partitioner


@InterfaceAudience.Public
class BulkLoadPartitioner(startKeys:Array[Array[Byte]])
  extends Partitioner {
  // when the table does not exist, startKeys is empty, so fall back to a single partition
  override def numPartitions: Int = if (startKeys.length == 0) 1 else startKeys.length

  override def getPartition(key: Any): Int = {

    val comparator: Comparator[Array[Byte]] = new Comparator[Array[Byte]] {
      override def compare(o1: Array[Byte], o2: Array[Byte]): Int = {
        Bytes.compareTo(o1, o2)
      }
    }

    val rowKey:Array[Byte] =
      key match {
        case qualifier: KeyFamilyQualifier =>
          qualifier.rowKey
        case wrapper: ByteArrayWrapper =>
          wrapper.value
        case _ =>
          key.asInstanceOf[Array[Byte]]
      }
    var partition = util.Arrays.binarySearch(startKeys, rowKey, comparator)
    if (partition < 0)
      partition = partition * -1 + -2
    if (partition < 0)
      partition = 0
    partition
  }
} 
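A minimal sketch of the partitioner above. The region start keys below are illustrative; each row key is routed to the region whose start key is the greatest one not exceeding it.

import org.apache.hadoop.hbase.spark.BulkLoadPartitioner
import org.apache.hadoop.hbase.util.Bytes

object BulkLoadPartitionerUsage {
  def main(args: Array[String]): Unit = {
    val startKeys: Array[Array[Byte]] = Array(Bytes.toBytes(""), Bytes.toBytes("m"), Bytes.toBytes("t"))
    val partitioner = new BulkLoadPartitioner(startKeys)
    println(partitioner.numPartitions)                        // 3
    println(partitioner.getPartition(Bytes.toBytes("apple"))) // 0
    println(partitioner.getPartition(Bytes.toBytes("mango"))) // 1
    println(partitioner.getPartition(Bytes.toBytes("zebra"))) // 2
  }
}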
Example 46
Source File: package.scala    From hbase-connectors   with Apache License 2.0 5 votes vote down vote up
package org.apache.hadoop.hbase.spark

import org.apache.hadoop.hbase.util.Bytes

import scala.math.Ordering

// TODO: add @InterfaceAudience.Private if https://issues.scala-lang.org/browse/SI-3600 is resolved
package object hbase {
  type HBaseType = Array[Byte]
  def bytesMin = new Array[Byte](0)
  def bytesMax = null
  val ByteMax = -1.asInstanceOf[Byte]
  val ByteMin = 0.asInstanceOf[Byte]
  val ord: Ordering[HBaseType] = new Ordering[HBaseType] {
    def compare(x: Array[Byte], y: Array[Byte]): Int = {
      return Bytes.compareTo(x, y)
    }
  }
  //Do not use BinaryType.ordering
  implicit val order: Ordering[HBaseType] = ord

} 
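A quick sketch of the implicit ordering exported above: raw row keys sort byte-wise, which is lexicographic rather than numeric (keys illustrative).

import org.apache.hadoop.hbase.spark.hbase._
import org.apache.hadoop.hbase.util.Bytes

object HBaseTypeOrderingUsage {
  def main(args: Array[String]): Unit = {
    val keys: Seq[HBaseType] = Seq(Bytes.toBytes("row2"), Bytes.toBytes("row10"), Bytes.toBytes("row1"))
    // `sorted` picks up the implicit `order` from the package object
    keys.sorted.foreach(k => println(Bytes.toString(k))) // row1, row10, row2
  }
}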
Example 47
Source File: Utils.scala    From hbase-connectors   with Apache License 2.0 5 votes vote down vote up
package org.apache.hadoop.hbase.spark.datasources

import java.sql.{Date, Timestamp}

import org.apache.hadoop.hbase.spark.AvroSerdes
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String
import org.apache.yetus.audience.InterfaceAudience;

@InterfaceAudience.Private
object Utils {

  
  def hbaseFieldToScalaType(
      f: Field,
      src: Array[Byte],
      offset: Int,
      length: Int): Any = {
    if (f.exeSchema.isDefined) {
      // If we have avro schema defined, use it to get record, and then convert them to catalyst data type
      val m = AvroSerdes.deserialize(src, f.exeSchema.get)
      val n = f.avroToCatalyst.map(_(m))
      n.get
    } else  {
      // Fall back to atomic type
      f.dt match {
        case BooleanType => src(offset) != 0
        case ByteType => src(offset)
        case ShortType => Bytes.toShort(src, offset)
        case IntegerType => Bytes.toInt(src, offset)
        case LongType => Bytes.toLong(src, offset)
        case FloatType => Bytes.toFloat(src, offset)
        case DoubleType => Bytes.toDouble(src, offset)
        case DateType => new Date(Bytes.toLong(src, offset))
        case TimestampType => new Timestamp(Bytes.toLong(src, offset))
        case StringType => UTF8String.fromBytes(src, offset, length)
        case BinaryType =>
          val newArray = new Array[Byte](length)
          System.arraycopy(src, offset, newArray, 0, length)
          newArray
        // TODO: SparkSqlSerializer.deserialize[Any](src)
        case _ => throw new Exception(s"unsupported data type ${f.dt}")
      }
    }
  }

  // convert input to data type
  def toBytes(input: Any, field: Field): Array[Byte] = {
    if (field.schema.isDefined) {
      // Here we assume the top level type is structType
      val record = field.catalystToAvro(input)
      AvroSerdes.serialize(record, field.schema.get)
    } else {
      field.dt match {
        case BooleanType => Bytes.toBytes(input.asInstanceOf[Boolean])
        case ByteType => Array(input.asInstanceOf[Number].byteValue)
        case ShortType => Bytes.toBytes(input.asInstanceOf[Number].shortValue)
        case IntegerType => Bytes.toBytes(input.asInstanceOf[Number].intValue)
        case LongType => Bytes.toBytes(input.asInstanceOf[Number].longValue)
        case FloatType => Bytes.toBytes(input.asInstanceOf[Number].floatValue)
        case DoubleType => Bytes.toBytes(input.asInstanceOf[Number].doubleValue)
        case DateType | TimestampType => Bytes.toBytes(input.asInstanceOf[java.util.Date].getTime)
        case StringType => Bytes.toBytes(input.toString)
        case BinaryType => input.asInstanceOf[Array[Byte]]
        case _ => throw new Exception(s"unsupported data type ${field.dt}")
      }
    }
  }
} 
Example 48
Source File: SerDes.scala    From hbase-connectors   with Apache License 2.0 5 votes vote down vote up
package org.apache.hadoop.hbase.spark.datasources

import org.apache.hadoop.hbase.util.Bytes
import org.apache.yetus.audience.InterfaceAudience

// TODO: This is not really used in code.
@InterfaceAudience.Public
trait SerDes {
  def serialize(value: Any): Array[Byte]
  def deserialize(bytes: Array[Byte], start: Int, end: Int): Any
}

// TODO: This is not really used in code.
@InterfaceAudience.Private
class DoubleSerDes extends SerDes {
  override def serialize(value: Any): Array[Byte] = Bytes.toBytes(value.asInstanceOf[Double])
  override def deserialize(bytes: Array[Byte], start: Int, end: Int): Any = {
    Bytes.toDouble(bytes, start)
  }
} 
Example 49
Source File: KeyFamilyQualifier.scala    From hbase-connectors   with Apache License 2.0 5 votes vote down vote up
package org.apache.hadoop.hbase.spark

import java.io.Serializable

import org.apache.yetus.audience.InterfaceAudience;
import org.apache.hadoop.hbase.util.Bytes


@InterfaceAudience.Public
class KeyFamilyQualifier(val rowKey:Array[Byte], val family:Array[Byte], val qualifier:Array[Byte])
  extends Comparable[KeyFamilyQualifier] with Serializable {
  override def compareTo(o: KeyFamilyQualifier): Int = {
    var result = Bytes.compareTo(rowKey, o.rowKey)
    if (result == 0) {
      result = Bytes.compareTo(family, o.family)
      if (result == 0) result = Bytes.compareTo(qualifier, o.qualifier)
    }
    result
  }
  override def toString: String = {
    Bytes.toString(rowKey) + ":" + Bytes.toString(family) + ":" + Bytes.toString(qualifier)
  }
} 
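A short sketch of the ordering above: cells sort by row key, then family, then qualifier, which is the order HBase bulk loads expect (values illustrative).

import org.apache.hadoop.hbase.spark.KeyFamilyQualifier
import org.apache.hadoop.hbase.util.Bytes

object KeyFamilyQualifierUsage {
  def main(args: Array[String]): Unit = {
    val a = new KeyFamilyQualifier(Bytes.toBytes("row1"), Bytes.toBytes("cf"), Bytes.toBytes("b"))
    val b = new KeyFamilyQualifier(Bytes.toBytes("row1"), Bytes.toBytes("cf"), Bytes.toBytes("a"))
    val c = new KeyFamilyQualifier(Bytes.toBytes("row0"), Bytes.toBytes("cf"), Bytes.toBytes("z"))
    Seq(a, b, c).sortWith(_.compareTo(_) < 0).foreach(println) // row0:cf:z, row1:cf:a, row1:cf:b
  }
}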
Example 50
Source File: IndexEdgeDeserializable.scala    From incubator-s2graph   with Apache License 2.0 5 votes vote down vote up
package org.apache.s2graph.core.storage.serde.indexedge.wide

import org.apache.hadoop.hbase.util.Bytes
import org.apache.s2graph.core._
import org.apache.s2graph.core.schema.{Label, LabelMeta, ServiceColumn}
import org.apache.s2graph.core.storage.serde.StorageDeserializable._
import org.apache.s2graph.core.storage._
import org.apache.s2graph.core.storage.serde.Deserializable
import org.apache.s2graph.core.types._

class IndexEdgeDeserializable(graph: S2GraphLike,
                              bytesToLongFunc: (Array[Byte], Int) => Long = bytesToLong) extends Deserializable[S2EdgeLike] {

   type QualifierRaw = (Array[(LabelMeta, InnerValLike)], VertexId, Byte, Boolean, Int)
   type ValueRaw = (Array[(LabelMeta, InnerValLike)], Int)
   val builder = graph.elementBuilder

   override def fromKeyValues[T: CanSKeyValue](_kvs: Seq[T],
                                               cacheElementOpt: Option[S2EdgeLike]): Option[S2EdgeLike] = {
     try {
       assert(_kvs.size == 1)

       //     val kvs = _kvs.map { kv => implicitly[CanSKeyValue[T]].toSKeyValue(kv) }

       val kv = implicitly[CanSKeyValue[T]].toSKeyValue(_kvs.head)
       val version = kv.timestamp

       var pos = 0
       val (srcVertexId, srcIdLen) = SourceVertexId.fromBytes(kv.row, pos, kv.row.length, HBaseType.DEFAULT_VERSION)
       pos += srcIdLen
       val labelWithDir = LabelWithDirection(Bytes.toInt(kv.row, pos, 4))
       pos += 4
       val (labelIdxSeq, isInverted) = bytesToLabelIndexSeqWithIsInverted(kv.row, pos)
       pos += 1

       if (isInverted) None
       else {
         val label = Label.findById(labelWithDir.labelId)
         val schemaVer = label.schemaVersion
         val srcVertex = builder.newVertex(srcVertexId, version)
         //TODO:
         var tsVal = version

         if (kv.qualifier.isEmpty) {
           val degreeVal = bytesToLongFunc(kv.value, 0)
           val tgtVertexId = VertexId(ServiceColumn.Default, InnerVal.withStr("0", schemaVer))
           val tgtVertex = builder.newVertex(tgtVertexId, version)
           val edge = builder.newEdge(srcVertex, tgtVertex,
             label, labelWithDir.dir, GraphUtil.defaultOpByte, version, S2Edge.EmptyState)

           edge.propertyInner(LabelMeta.timestamp.name, version, version)
           edge.propertyInner(LabelMeta.degree.name, degreeVal, version)
           edge.tgtVertex = builder.newVertex(tgtVertexId, version)
           edge.setOp(GraphUtil.defaultOpByte)
           edge.setTsInnerValOpt(Option(InnerVal.withLong(tsVal, schemaVer)))

           Option(edge)
         } else {
           pos = 0
           val (idxPropsRaw, endAt) = bytesToProps(kv.qualifier, pos, schemaVer)
           pos = endAt

           val (tgtVertexIdRaw, tgtVertexIdLen) = if (endAt == kv.qualifier.length) {
             (HBaseType.defaultTgtVertexId, 0)
           } else {
             TargetVertexId.fromBytes(kv.qualifier, endAt, kv.qualifier.length, schemaVer)
           }
           pos += tgtVertexIdLen
           val op =
             if (kv.qualifier.length == pos) GraphUtil.defaultOpByte
             else kv.qualifier(kv.qualifier.length-1)

           val tgtVertex = builder.newVertex(tgtVertexIdRaw, version)
           val edge = builder.newEdge(srcVertex, tgtVertex,
             label, labelWithDir.dir, GraphUtil.defaultOpByte, version, S2Edge.EmptyState)

           val index = label.indicesMap.getOrElse(labelIdxSeq, throw new RuntimeException(s"invalid index seq: ${label.id.get}, ${labelIdxSeq}"))

           
           if (edge.checkProperty(LabelMeta.to.name)) {
             val vId = edge.property(LabelMeta.to.name).asInstanceOf[S2Property[_]].innerValWithTs
             val tgtVertex = builder.newVertex(TargetVertexId(ServiceColumn.Default, vId.innerVal), version)
             edge.setTgtVertex(tgtVertex)
           }

           edge.propertyInner(LabelMeta.timestamp.name, tsVal, version)
           edge.setOp(op)
           edge.setTsInnerValOpt(Option(InnerVal.withLong(tsVal, schemaVer)))

           Option(edge)
         }
       }
     } catch {
       case e: Exception => None
     }
   }
 } 
Example 51
Source File: HBasePartitioner.scala    From Backup-Repo   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hbase

import java.io.{IOException, ObjectInputStream, ObjectOutputStream}

import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.serializer.JavaSerializer
import org.apache.spark.util.{CollectionsUtils, Utils}
import org.apache.spark.{Partitioner, SparkEnv}

object HBasePartitioner {
  implicit object HBaseRawOrdering extends Ordering[HBaseRawType] {
    def compare(a: HBaseRawType, b: HBaseRawType) = Bytes.compareTo(a, b)
  }
}

class HBasePartitioner (var splitKeys: Array[HBaseRawType]) extends Partitioner {
  import HBasePartitioner.HBaseRawOrdering

  type t = HBaseRawType

  lazy private val len = splitKeys.length

  // For a pre-split table splitKeys(0) is the empty byte array; drop it (via `tail` below),
  // otherwise partition 0 would always be empty and we would miss the last region's data
  // during bulk load.
  lazy private val realSplitKeys = if (splitKeys.isEmpty) splitKeys else splitKeys.tail

  def numPartitions = if (len == 0) 1 else len

  @transient private val binarySearch: ((Array[t], t) => Int) = CollectionsUtils.makeBinarySearch[t]

  def getPartition(key: Any): Int = {
    val k = key.asInstanceOf[t]
    var partition = 0
    if (len <= 128 && len > 0) {
      // If we have less than 128 partitions naive search
      val ordering = implicitly[Ordering[t]]
      while (partition < realSplitKeys.length && ordering.gt(k, realSplitKeys(partition))) {
        partition += 1
      }
    } else {
      // Determine which binary search method to use only once.
      partition = binarySearch(realSplitKeys, k)
      // binarySearch either returns the match location or -[insertion point]-1
      if (partition < 0) {
        partition = -partition - 1
      }
      if (partition > realSplitKeys.length) {
        partition = realSplitKeys.length
      }
    }
    partition
  }

  override def equals(other: Any): Boolean = other match {
    case r: HBasePartitioner =>
      r.splitKeys.sameElements(splitKeys)
    case _ =>
      false
  }

  override def hashCode(): Int = {
    val prime = 31
    var result = 1
    var i = 0
    while (i < splitKeys.length) {
      result = prime * result + splitKeys(i).hashCode
      i += 1
    }
    result = prime * result
    result
  }
} 
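A minimal sketch of the partitioner above, assuming HBaseRawType is an alias for Array[Byte] as in this package; the split keys are illustrative. The first split key is dropped internally, so three keys still yield three partitions.

import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.hbase.HBasePartitioner

object HBasePartitionerUsage {
  def main(args: Array[String]): Unit = {
    // Region start keys; the first region's start key is always empty.
    val splitKeys: Array[Array[Byte]] = Array(Bytes.toBytes(""), Bytes.toBytes("g"), Bytes.toBytes("p"))
    val partitioner = new HBasePartitioner(splitKeys)
    println(partitioner.numPartitions)                        // 3
    println(partitioner.getPartition(Bytes.toBytes("apple"))) // 0
    println(partitioner.getPartition(Bytes.toBytes("mango"))) // 1
    println(partitioner.getPartition(Bytes.toBytes("tiger"))) // 2
  }
}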
Example 52
Source File: HBaseServiceLayer.scala    From Taxi360   with Apache License 2.0 5 votes vote down vote up
package com.hadooparchitecturebook.taxi360.server.hbase

import javax.ws.rs._
import javax.ws.rs.core.MediaType

import com.hadooparchitecturebook.taxi360.model.NyTaxiYellowTrip
import com.hadooparchitecturebook.taxi360.streaming.ingestion.hbase.TaxiTripHBaseHelper
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Scan}
import org.apache.hadoop.hbase.util.Bytes

import scala.collection.mutable

@Path("rest")
class HBaseServiceLayer {

  @GET
  @Path("hello")
  @Produces(Array(MediaType.TEXT_PLAIN))
  def hello(): String = {
    "Hello World"
  }

  @GET
  @Path("vender/{venderId}/timeline")
  @Produces(Array(MediaType.APPLICATION_JSON))
  def getTripTimeLine (@PathParam("venderId") venderId:String,
                          @QueryParam("startTime") startTime:String = Long.MinValue.toString,
                          @QueryParam("endTime")  endTime:String = Long.MaxValue.toString): Array[NyTaxiYellowTrip] = {

    val table = HBaseGlobalValues.connection.getTable(TableName.valueOf(HBaseGlobalValues.appEventTableName))

    val st = if (startTime == null) {
      Long.MinValue.toString
    } else {
      startTime
    }
    val et = if (endTime == null) {
      Long.MaxValue.toString
    } else {
      endTime
    }

    val scan = new Scan()
    val startRowKey = TaxiTripHBaseHelper.generateRowKey(venderId, st.toLong, HBaseGlobalValues.numberOfSalts)
    println("startRowKey:" + Bytes.toString(startRowKey))
    scan.setStartRow(startRowKey)
    val endRowKey = TaxiTripHBaseHelper.generateRowKey(venderId, et.toLong, HBaseGlobalValues.numberOfSalts)
    println("endRowKey:" + Bytes.toString(endRowKey))
    scan.setStopRow(endRowKey)

    val scannerIt = table.getScanner(scan).iterator()

    val tripList = new mutable.MutableList[NyTaxiYellowTrip]

    while(scannerIt.hasNext) {
      val result = scannerIt.next()
      tripList += TaxiTripHBaseHelper.convertToTaxiTrip(result)
      println("Found a trip:" + TaxiTripHBaseHelper.convertToTaxiTrip(result))
    }

    println("tripList.size:" + tripList.size)

    tripList.toArray
  }

} 
Example 53
Source File: CreateSaltedTable.scala    From Taxi360   with Apache License 2.0 5 votes vote down vote up
package com.hadooparchitecturebook.taxi360.setup.hbase

import java.io.File

import org.apache.commons.lang.StringUtils
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.io.compress.Compression
import org.apache.hadoop.hbase.regionserver.{BloomType, ConstantSizeRegionSplitPolicy}
import org.apache.hadoop.hbase.util.Bytes

import scala.collection.mutable


object CreateSaltedTable {
  def main(args:Array[String]): Unit = {

    if (args.length == 0) {
      println("<tableName> <columnFamily> <regionCount> <numOfSalts> <hbaseConfigFolder>")
    }
    val tableName = args(0)
    val columnFamilyName = args(1)
    val regionCount = args(2).toInt
    val numOfSalts = args(3).toInt
    val hbaseConfigFolder = args(4)

    val conf = HBaseConfiguration.create()

    conf.addResource(new File(hbaseConfigFolder + "hbase-site.xml").toURI.toURL)

    val connection = ConnectionFactory.createConnection(conf)

    val admin = connection.getAdmin

    val tableDescriptor = new HTableDescriptor(TableName.valueOf(tableName))

    val columnDescriptor = new HColumnDescriptor(columnFamilyName)

    columnDescriptor.setCompressionType(Compression.Algorithm.SNAPPY)
    columnDescriptor.setBlocksize(64 * 1024)
    columnDescriptor.setBloomFilterType(BloomType.ROW)

    tableDescriptor.addFamily(columnDescriptor)

    tableDescriptor.setMaxFileSize(Long.MaxValue)
    tableDescriptor.setRegionSplitPolicyClassName(classOf[ConstantSizeRegionSplitPolicy].getName)

    val splitKeys = new mutable.MutableList[Array[Byte]]
    for (i <- 0 to regionCount) {
      val regionSplitStr = StringUtils.leftPad((i*(numOfSalts/regionCount)).toString, 4, "0")
      splitKeys += Bytes.toBytes(regionSplitStr)
    }
    admin.createTable(tableDescriptor, splitKeys.toArray)
  }
} 
Example 54
Source File: BytesUtilV1.scala    From incubator-s2graph   with Apache License 2.0 5 votes vote down vote up
package org.apache.s2graph.counter.core.v1

import org.apache.hadoop.hbase.util.Bytes
import org.apache.s2graph.counter.core.TimedQualifier.IntervalUnit
import org.apache.s2graph.counter.core.{TimedQualifier, ExactQualifier, ExactKeyTrait, BytesUtil}
import org.apache.s2graph.counter.models.Counter.ItemType
import org.apache.s2graph.counter.util.Hashes
import scala.collection.mutable.ArrayBuffer

object BytesUtilV1 extends BytesUtil {
  // ExactKey: [hash(2b)][policy(4b)][item(variable)]
  val BUCKET_BYTE_SIZE = Bytes.SIZEOF_SHORT
  val POLICY_ID_SIZE = Bytes.SIZEOF_INT
  val INTERVAL_SIZE = Bytes.SIZEOF_BYTE
  val TIMESTAMP_SIZE = Bytes.SIZEOF_LONG
  val TIMED_QUALIFIER_SIZE = INTERVAL_SIZE + TIMESTAMP_SIZE

  override def getRowKeyPrefix(id: Int): Array[Byte] = {
    Bytes.toBytes(id)
  }

  override def toBytes(key: ExactKeyTrait): Array[Byte] = {
    val buff = new ArrayBuffer[Byte]
    // hash key (2 byte)
    buff ++= Bytes.toBytes(Hashes.murmur3(key.itemKey)).take(BUCKET_BYTE_SIZE)

    buff ++= getRowKeyPrefix(key.policyId)
    buff ++= {
      key.itemType match {
        case ItemType.INT => Bytes.toBytes(key.itemKey.toInt)
        case ItemType.LONG => Bytes.toBytes(key.itemKey.toLong)
        case ItemType.STRING | ItemType.BLOB => Bytes.toBytes(key.itemKey)
      }
    }
    buff.toArray
  }

  override def toBytes(eq: ExactQualifier): Array[Byte] = {
    toBytes(eq.tq) ++ eq.dimension.getBytes
  }

  override def toBytes(tq: TimedQualifier): Array[Byte] = {
    Bytes.toBytes(tq.q.toString) ++ Bytes.toBytes(tq.ts)
  }

  override def toExactQualifier(bytes: Array[Byte]): ExactQualifier = {
    // qualifier layout: interval, then ts, then dimension
    val tq = toTimedQualifier(bytes)

    val dimension = Bytes.toString(bytes, TIMED_QUALIFIER_SIZE, bytes.length - TIMED_QUALIFIER_SIZE)
    ExactQualifier(tq, dimension)
  }

  override def toTimedQualifier(bytes: Array[Byte]): TimedQualifier = {
    val interval = Bytes.toString(bytes, 0, INTERVAL_SIZE)
    val ts = Bytes.toLong(bytes, INTERVAL_SIZE)

    TimedQualifier(IntervalUnit.withName(interval), ts)
  }
} 
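
Since the row-key prefix written by toBytes(ExactKeyTrait) is fixed-width, it can be read back with the offset overloads of Bytes. A minimal sketch, assuming the key was produced by the object above; decoding the variable-length item part would additionally need the ItemType, so only the prefix is unpacked here.

import org.apache.hadoop.hbase.util.Bytes

object BytesUtilV1PrefixSketch {
  // Layout, per the comment above: [hash(2b)][policy(4b)][item(variable)].
  def policyIdOf(key: Array[Byte]): Int =
    Bytes.toInt(key, BytesUtilV1.BUCKET_BYTE_SIZE)

  def itemBytesOf(key: Array[Byte]): Array[Byte] =
    key.drop(BytesUtilV1.BUCKET_BYTE_SIZE + BytesUtilV1.POLICY_ID_SIZE)
}
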
Example 55
Source File: Hashes.scala    From incubator-s2graph   with Apache License 2.0 5 votes vote down vote up
package org.apache.s2graph.counter.util

import org.apache.hadoop.hbase.util.Bytes

import scala.util.hashing.MurmurHash3

object Hashes {
  def sha1(s: String): String = {
    val md = java.security.MessageDigest.getInstance("SHA-1")
    Bytes.toHex(md.digest(s.getBytes("UTF-8")))
  }
  
  private def positiveHash(h: Int): Int = {
    if (h < 0) -1 * (h + 1) else h
  }

  def murmur3(s: String): Int = {
    val hash = MurmurHash3.stringHash(s)
    positiveHash(hash)
  }
} 
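
A quick usage note for the helpers above: sha1 returns a hex digest built with Bytes.toHex, and murmur3 is folded to a non-negative Int so it can be used directly as a bucket.

// Example usage of the object above.
println(Hashes.sha1("hello"))     // 40-character hex digest, via Bytes.toHex
println(Hashes.murmur3("hello"))  // non-negative Int, convenient as a hash bucket
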
Example 56
Source File: DistributedScanner.scala    From incubator-s2graph   with Apache License 2.0 5 votes vote down vote up
package org.apache.s2graph.counter.helper

import java.util
import java.util.Comparator

import com.google.common.primitives.SignedBytes
import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.util.Bytes

object DistributedScanner {
   val BUCKET_BYTE_SIZE = Bytes.SIZEOF_BYTE

   def getRealRowKey(result: Result): Array[Byte] = {
     result.getRow.drop(BUCKET_BYTE_SIZE)
   }
 }

class DistributedScanner(table: Table, scan: Scan) extends AbstractClientScanner {
   import DistributedScanner._

   private val BYTE_MAX = BigInt(256)

   private[helper] val scanners = {
     for {
       i <- 0 until BYTE_MAX.pow(BUCKET_BYTE_SIZE).toInt
     } yield {
       val bucketBytes: Array[Byte] = Bytes.toBytes(i).takeRight(BUCKET_BYTE_SIZE)
       val newScan = new Scan(scan).setStartRow(bucketBytes ++ scan.getStartRow).setStopRow(bucketBytes ++ scan.getStopRow)
       table.getScanner(newScan)
     }
   }

   val resultCache = new util.TreeMap[Result, java.util.Iterator[Result]](new Comparator[Result] {
     val comparator = SignedBytes.lexicographicalComparator()
     override def compare(o1: Result, o2: Result): Int = {
       comparator.compare(getRealRowKey(o1), getRealRowKey(o2))
     }
   })

   lazy val initialized = {
     val iterators = scanners.map(_.iterator()).filter(_.hasNext)
     iterators.foreach { it =>
       resultCache.put(it.next(), it)
     }
     iterators.nonEmpty
   }

   override def next(): Result = {
     if (initialized) {
       Option(resultCache.pollFirstEntry()).map { entry =>
         val it = entry.getValue
         if (it.hasNext) {
           // fill cache
           resultCache.put(it.next(), it)
         }
         entry.getKey
       }.orNull
     } else {
       null
     }
   }

   override def close(): Unit = {
     for {
       scanner <- scanners
     } {
       scanner.close()
     }
   }

  override def renewLease(): Boolean = true
} 
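
A minimal usage sketch for the scanner above, assuming a table whose row keys carry a one-byte bucket prefix and an hbase-site.xml on the classpath; the table name is illustrative. Because each sub-scan re-adds its bucket byte, getRealRowKey is used to recover the logical row key.

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Scan}
import org.apache.hadoop.hbase.util.Bytes

object DistributedScanExample {
  def main(args: Array[String]): Unit = {
    val connection = ConnectionFactory.createConnection(HBaseConfiguration.create())
    val table = connection.getTable(TableName.valueOf("counter_v1"))  // table name is illustrative
    val scanner = new DistributedScanner(table, new Scan())
    try {
      val it = scanner.iterator()
      while (it.hasNext) {
        val result = it.next()
        // getRealRowKey strips the bucket byte that each sub-scan prefixed to the row
        println(Bytes.toStringBinary(DistributedScanner.getRealRowKey(result)))
      }
    } finally {
      scanner.close()
      table.close()
      connection.close()
    }
  }
}
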
Example 57
Source File: BulkLoadPartitioner.scala    From incubator-s2graph   with Apache License 2.0 5 votes vote down vote up
package org.apache.s2graph.loader.spark

import java.util
import java.util.Comparator

import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.Partitioner


class BulkLoadPartitioner(startKeys:Array[Array[Byte]])
  extends Partitioner {

  override def numPartitions: Int = startKeys.length

  override def getPartition(key: Any): Int = {

    val rowKey:Array[Byte] =
      key match {
        case qualifier: KeyFamilyQualifier =>
          qualifier.rowKey
        case _ =>
          key.asInstanceOf[Array[Byte]]
      }

    val comparator: Comparator[Array[Byte]] = new Comparator[Array[Byte]] {
      override def compare(o1: Array[Byte], o2: Array[Byte]): Int = {
        Bytes.compareTo(o1, o2)
      }
    }
    val partition = util.Arrays.binarySearch(startKeys, rowKey, comparator)
    if (partition < 0) partition * -1 + -2
    else partition
  }
} 
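
In practice the start keys usually come from the target table's RegionLocator, so each row key is routed to the partition of the region that will eventually host it. A minimal sketch with an illustrative table name and row key:

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.util.Bytes

object BulkLoadPartitionerExample {
  def main(args: Array[String]): Unit = {
    val connection = ConnectionFactory.createConnection(HBaseConfiguration.create())
    val regionLocator = connection.getRegionLocator(TableName.valueOf("test_table"))  // name is illustrative
    try {
      val partitioner = new BulkLoadPartitioner(regionLocator.getStartKeys)
      // each row key lands in the partition of the region whose start key precedes it
      println(partitioner.getPartition(Bytes.toBytes("0001|some-row-key")))
    } finally {
      regionLocator.close()
      connection.close()
    }
  }
}
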
Example 58
Source File: KeyFamilyQualifier.scala    From incubator-s2graph   with Apache License 2.0 5 votes vote down vote up
package org.apache.s2graph.loader.spark

import java.io.Serializable

import org.apache.hadoop.hbase.util.Bytes


class KeyFamilyQualifier(val rowKey:Array[Byte], val family:Array[Byte], val qualifier:Array[Byte])
  extends Comparable[KeyFamilyQualifier] with Serializable {
  override def compareTo(o: KeyFamilyQualifier): Int = {
    var result = Bytes.compareTo(rowKey, o.rowKey)
    if (result == 0) {
      result = Bytes.compareTo(family, o.family)
      if (result == 0) result = Bytes.compareTo(qualifier, o.qualifier)
    }
    result
  }
  override def toString: String = {
    Bytes.toString(rowKey) + ":" + Bytes.toString(family) + ":" + Bytes.toString(qualifier)
  }
} 
Example 59
Source File: StorageSerializable.scala    From incubator-s2graph   with Apache License 2.0 5 votes vote down vote up
package org.apache.s2graph.core.storage.serde

import org.apache.hadoop.hbase.util.Bytes
import org.apache.s2graph.core.schema.{ColumnMeta, LabelMeta}
import org.apache.s2graph.core.storage.SKeyValue
import org.apache.s2graph.core.types.{InnerValLike, InnerValLikeWithTs}

object StorageSerializable {
  
  def propsToBytes(props: Seq[(LabelMeta, InnerValLike)]): Array[Byte] = {
    val len = props.length
    assert(len < Byte.MaxValue)
    var bytes = Array.fill(1)(len.toByte)
    for ((_, v) <- props) bytes = Bytes.add(bytes, v.bytes)
    bytes
  }

  def vertexPropsToBytes(props: Seq[(ColumnMeta, Array[Byte])]): Array[Byte] = {
    val len = props.length
    assert(len < Byte.MaxValue)
    var bytes = Array.fill(1)(len.toByte)
    for ((k, v) <- props) bytes = Bytes.add(bytes, Bytes.toBytes(k.seq.toInt), v)
    bytes
  }

  def propsToKeyValues(props: Seq[(LabelMeta, InnerValLike)]): Array[Byte] = {
    val len = props.length
    assert(len < Byte.MaxValue)
    var bytes = Array.fill(1)(len.toByte)
    for ((k, v) <- props) bytes = Bytes.add(bytes, Array.fill(1)(k.seq), v.bytes)
    bytes
  }

  def propsToKeyValuesWithTs(props: Seq[(LabelMeta, InnerValLikeWithTs)]): Array[Byte] = {
    val len = props.length
    assert(len < Byte.MaxValue)
    var bytes = Array.fill(1)(len.toByte)
    for ((k, v) <- props) bytes = Bytes.add(bytes, Array.fill(1)(k.seq), v.bytes)
    bytes
  }

  def labelOrderSeqWithIsInverted(labelOrderSeq: Byte, isInverted: Boolean): Array[Byte] = {
    assert(labelOrderSeq < (1 << 6))
    val byte = labelOrderSeq << 1 | (if (isInverted) 1 else 0)
    Array.fill(1)(byte.toByte)
  }

  def intToBytes(value: Int): Array[Byte] = Bytes.toBytes(value)

  def longToBytes(value: Long): Array[Byte] = Bytes.toBytes(value)
}

trait StorageSerializable[E] {
  val cf = Serializable.edgeCf

  def table: Array[Byte]
  def ts: Long

  def toRowKey: Array[Byte]
  def toQualifier: Array[Byte]
  def toValue: Array[Byte]

  def toKeyValues: Seq[SKeyValue] = {
    val row = toRowKey
    val qualifier = toQualifier
    val value = toValue
    val kv = SKeyValue(table, row, cf, qualifier, value, ts)
//    logger.debug(s"[SER]: ${kv.toLogString}}")
    Seq(kv)
  }
} 
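
labelOrderSeqWithIsInverted packs the label index sequence and the inverted flag into a single byte. The reverse direction, shown only to make the one-byte layout explicit, mirrors the bytesToLabelIndexSeqWithIsInverted helper used by the deserializers later in this listing.

// Sketch: undoing labelOrderSeqWithIsInverted.
def unpackLabelOrderSeq(packed: Byte): (Byte, Boolean) = {
  val isInverted = (packed & 1) != 0        // low bit: "inverted" (snapshot) flag
  val labelOrderSeq = (packed & 0xff) >> 1  // remaining bits: label index sequence
  (labelOrderSeq.toByte, isInverted)
}
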
Example 60
Source File: VertexSerializable.scala    From incubator-s2graph   with Apache License 2.0 5 votes vote down vote up
package org.apache.s2graph.core.storage.serde.vertex.tall

import org.apache.hadoop.hbase.util.Bytes
import org.apache.s2graph.core.{S2Vertex, S2VertexLike}
import org.apache.s2graph.core.storage.SKeyValue
import org.apache.s2graph.core.storage.serde.Serializable
import org.apache.s2graph.core.storage.serde.StorageSerializable._

import scala.collection.JavaConverters._

case class VertexSerializable(vertex: S2VertexLike, intToBytes: Int => Array[Byte] = intToBytes) extends Serializable[S2VertexLike] {

  override val table = vertex.hbaseTableName.getBytes
  override val ts = vertex.ts
  override val cf = Serializable.vertexCf

  override def toRowKey: Array[Byte] = vertex.id.bytes

  override def toQualifier: Array[Byte] = Array.empty[Byte]
  override def toValue: Array[Byte] = {
    val props = (vertex.props.asScala ++ vertex.defaultProps.asScala).toSeq.map { case (_, v) =>
      v.columnMeta -> v.innerVal.bytes
    }
    vertexPropsToBytes(props)
  }

  
  override def toKeyValues: Seq[SKeyValue] = {
//    val row = toRowKey
//    val qualifier = toQualifier
//    val value = toValue
//    Seq(
//      SKeyValue(vertex.hbaseTableName.getBytes, row, cf, qualifier, value, vertex.ts)
//    )
    (vertex.props.asScala ++ vertex.defaultProps.asScala).toSeq.map { case (_, v) =>
        val row = Bytes.add(vertex.id.bytes, Array.fill(1)(v.columnMeta.seq))
        val qualifier = Array.empty[Byte]
        val value = v.innerVal.bytes

        SKeyValue(vertex.hbaseTableName.getBytes, row, cf, qualifier, value, vertex.ts)
    }
  }
} 
Example 61
Source File: IndexEdgeSerializable.scala    From incubator-s2graph   with Apache License 2.0 5 votes vote down vote up
package org.apache.s2graph.core.storage.serde.indexedge.tall

import org.apache.hadoop.hbase.util.Bytes
import org.apache.s2graph.core.schema.LabelMeta
import org.apache.s2graph.core.types.VertexId
import org.apache.s2graph.core.{GraphUtil, IndexEdge}
import org.apache.s2graph.core.storage.serde.StorageSerializable._
import org.apache.s2graph.core.storage.serde.Serializable

class IndexEdgeSerializable(indexEdge: IndexEdge, longToBytes: Long => Array[Byte] = longToBytes) extends Serializable[IndexEdge] {

   override def ts = indexEdge.version
   override def table = indexEdge.label.hbaseTableName.getBytes("UTF-8")

   def idxPropsMap = indexEdge.orders.toMap
   def idxPropsBytes = propsToBytes(indexEdge.orders)

   override def toRowKey: Array[Byte] = {
     val srcIdBytes = VertexId.toSourceVertexId(indexEdge.srcVertex.id).bytes
     val labelWithDirBytes = indexEdge.labelWithDir.bytes
     val labelIndexSeqWithIsInvertedBytes = labelOrderSeqWithIsInverted(indexEdge.labelIndexSeq, isInverted = false)

     val row = Bytes.add(srcIdBytes, labelWithDirBytes, labelIndexSeqWithIsInvertedBytes)
     //    logger.error(s"${row.toList}\n${srcIdBytes.toList}\n${labelWithDirBytes.toList}\n${labelIndexSeqWithIsInvertedBytes.toList}")

     if (indexEdge.degreeEdge) row
     else {
       val qualifier = idxPropsMap.get(LabelMeta.to) match {
           case None => Bytes.add(idxPropsBytes, VertexId.toTargetVertexId(indexEdge.tgtVertex.id).bytes)
           case Some(vId) => idxPropsBytes
         }

       val opByte = if (indexEdge.op == GraphUtil.operations("incrementCount")) indexEdge.op else GraphUtil.defaultOpByte
       Bytes.add(row, qualifier, Array.fill(1)(opByte))
     }
   }

   override def toQualifier: Array[Byte] = Array.empty[Byte]

   override def toValue: Array[Byte] =
     if (indexEdge.degreeEdge)
       longToBytes(indexEdge.property(LabelMeta.degree).innerVal.toString().toLong)
     else if (indexEdge.op == GraphUtil.operations("incrementCount"))
       longToBytes(indexEdge.property(LabelMeta.count).innerVal.toString().toLong)
     else propsToKeyValues(indexEdge.metas.toSeq)

 } 
Example 62
Source File: BytesUtilsSuite.scala    From Backup-Repo   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hbase

import org.apache.spark.Logging
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.types._
import org.apache.spark.sql.hbase.types.HBaseBytesType
import org.apache.spark.sql.hbase.util.BytesUtils
import org.scalatest.{BeforeAndAfterAll, FunSuite}

class BytesUtilsSuite extends FunSuite with BeforeAndAfterAll with Logging {
  test("Bytes Ordering Test") {
    val s = Seq(-257, -256, -255, -129, -128, -127, -64, -16, -4, -1,
      0, 1, 4, 16, 64, 127, 128, 129, 255, 256, 257)
    val result = s.map(i => (i, BytesUtils.create(IntegerType).toBytes(i)))
      .sortWith((f, s) =>
      HBaseBytesType.ordering.gt(
        f._2.asInstanceOf[HBaseBytesType.InternalType],
        s._2.asInstanceOf[HBaseBytesType.InternalType]))
    assert(result.map(a => a._1) == s.sorted.reverse)
  }

  def compare(a: Array[Byte], b: Array[Byte]): Int = {
    // unsigned, lexicographic comparison over the common prefix of a and b:
    // return the difference at the first pair of bytes that differ, else 0
    val length = Math.min(a.length, b.length)
    var result: Int = 0
    var i = 0
    while (i < length && result == 0) {
      result = (a(i) & 0xff) - (b(i) & 0xff)
      i += 1
    }
    result
  }

  test("Bytes Utility Test") {
    assert(BytesUtils.toBoolean(BytesUtils.create(BooleanType)
      .toBytes(input = true), 0) === true)
    assert(BytesUtils.toBoolean(BytesUtils.create(BooleanType)
      .toBytes(input = false), 0) === false)

    assert(BytesUtils.toDouble(BytesUtils.create(DoubleType).toBytes(12.34d), 0)
      === 12.34d)
    assert(BytesUtils.toDouble(BytesUtils.create(DoubleType).toBytes(-12.34d), 0)
      === -12.34d)

    assert(BytesUtils.toFloat(BytesUtils.create(FloatType).toBytes(12.34f), 0)
      === 12.34f)
    assert(BytesUtils.toFloat(BytesUtils.create(FloatType).toBytes(-12.34f), 0)
      === -12.34f)

    assert(BytesUtils.toInt(BytesUtils.create(IntegerType).toBytes(12), 0)
      === 12)
    assert(BytesUtils.toInt(BytesUtils.create(IntegerType).toBytes(-12), 0)
      === -12)

    assert(BytesUtils.toLong(BytesUtils.create(LongType).toBytes(1234l), 0)
      === 1234l)
    assert(BytesUtils.toLong(BytesUtils.create(LongType).toBytes(-1234l), 0)
      === -1234l)

    assert(BytesUtils.toShort(BytesUtils.create(ShortType)
      .toBytes(12.asInstanceOf[Short]), 0) === 12)
    assert(BytesUtils.toShort(BytesUtils.create(ShortType)
      .toBytes(-12.asInstanceOf[Short]), 0) === -12)

    assert(BytesUtils.toUTF8String(BytesUtils.create(StringType).toBytes("abc"), 0, 3)
      === UTF8String("abc"))
    assert(BytesUtils.toUTF8String(BytesUtils.create(StringType).toBytes(""), 0, 0) === UTF8String(""))

    assert(BytesUtils.toByte(BytesUtils.create(ByteType)
      .toBytes(5.asInstanceOf[Byte]), 0) === 5)
    assert(BytesUtils.toByte(BytesUtils.create(ByteType)
      .toBytes(-5.asInstanceOf[Byte]), 0) === -5)

    assert(compare(BytesUtils.create(IntegerType).toBytes(128),
      BytesUtils.create(IntegerType).toBytes(-128)) > 0)
  }

  test("byte array plus one") {
    var byteArray =  Array[Byte](0x01.toByte, 127.toByte)
    assert(Bytes.compareTo(BytesUtils.addOne(byteArray),  Array[Byte](0x01.toByte, 0x80.toByte)) == 0)

    byteArray =  Array[Byte](0xff.toByte, 0xff.toByte)
    assert(BytesUtils.addOne(byteArray) == null)

    byteArray =  Array[Byte](0x02.toByte, 0xff.toByte)
    assert(Bytes.compareTo(BytesUtils.addOne(byteArray),  Array[Byte](0x03.toByte, 0x00.toByte)) == 0)
  }

  test("float comparison") {
    val f1 = BytesUtils.create(FloatType).toBytes(-1.23f)
    val f2 = BytesUtils.create(FloatType).toBytes(100f)
    assert(Bytes.compareTo(f1, f2) < 0)
  }
} 
Example 63
Source File: IndexEdgeSerializable.scala    From incubator-s2graph   with Apache License 2.0 5 votes vote down vote up
package org.apache.s2graph.core.storage.serde.indexedge.wide

import org.apache.hadoop.hbase.util.Bytes
import org.apache.s2graph.core.schema.LabelMeta
import org.apache.s2graph.core.types.VertexId
import org.apache.s2graph.core.{GraphUtil, IndexEdge}
import org.apache.s2graph.core.storage.serde.StorageSerializable._
import org.apache.s2graph.core.storage.serde.Serializable

class IndexEdgeSerializable(indexEdge: IndexEdge, longToBytes: Long => Array[Byte] = longToBytes) extends Serializable[IndexEdge] {

   override def ts = indexEdge.version
   override def table = indexEdge.label.hbaseTableName.getBytes("UTF-8")

   def idxPropsMap = indexEdge.orders.toMap
   def idxPropsBytes = propsToBytes(indexEdge.orders)

   override def toRowKey: Array[Byte] = {
     val srcIdBytes = VertexId.toSourceVertexId(indexEdge.srcVertex.id).bytes
     val labelWithDirBytes = indexEdge.labelWithDir.bytes
     val labelIndexSeqWithIsInvertedBytes = labelOrderSeqWithIsInverted(indexEdge.labelIndexSeq, isInverted = false)

     Bytes.add(srcIdBytes, labelWithDirBytes, labelIndexSeqWithIsInvertedBytes)
   }

   override def toQualifier: Array[Byte] = {
     val tgtIdBytes = VertexId.toTargetVertexId(indexEdge.tgtVertex.id).bytes
     if (indexEdge.degreeEdge) Array.empty[Byte]
     else {
       if (indexEdge.op == GraphUtil.operations("incrementCount")) {
         Bytes.add(idxPropsBytes, tgtIdBytes, Array.fill(1)(indexEdge.op))
       } else {
         idxPropsMap.get(LabelMeta.to) match {
           case None => Bytes.add(idxPropsBytes, tgtIdBytes)
           case Some(vId) => idxPropsBytes
         }
       }
     }
   }

  override def toValue: Array[Byte] =
    if (indexEdge.degreeEdge)
      longToBytes(indexEdge.property(LabelMeta.degree).innerVal.toString().toLong)
    else if (indexEdge.op == GraphUtil.operations("incrementCount"))
      longToBytes(indexEdge.property(LabelMeta.count).innerVal.toString().toLong)
    else propsToKeyValues(indexEdge.metas.toSeq)

 } 
Example 64
Source File: SnapshotEdgeDeserializable.scala    From incubator-s2graph   with Apache License 2.0 5 votes vote down vote up
package org.apache.s2graph.core.storage.serde.snapshotedge.tall

import org.apache.hadoop.hbase.util.Bytes
import org.apache.s2graph.core.schema.{Label, LabelMeta, ServiceColumn}
import org.apache.s2graph.core.storage.serde.StorageDeserializable._
import org.apache.s2graph.core.storage.CanSKeyValue
import org.apache.s2graph.core.types._
import org.apache.s2graph.core._
import org.apache.s2graph.core.storage.serde.Deserializable
import org.apache.s2graph.core.utils.logger

class SnapshotEdgeDeserializable(graph: S2GraphLike) extends Deserializable[SnapshotEdge] {
  val builder = graph.elementBuilder
  def statusCodeWithOp(byte: Byte): (Byte, Byte) = {
    val statusCode = byte >> 4
    val op = byte & ((1 << 4) - 1)
    (statusCode.toByte, op.toByte)
  }

  override def fromKeyValues[T: CanSKeyValue](_kvs: Seq[T],
                                              cacheElementOpt: Option[SnapshotEdge]): Option[SnapshotEdge] = {
    try {
      val kvs = _kvs.map { kv => implicitly[CanSKeyValue[T]].toSKeyValue(kv) }
      assert(kvs.size == 1)

      val kv = kvs.head
      val version = kv.timestamp
      var pos = 0
      val (srcVertexId, srcIdLen) = SourceVertexId.fromBytes(kv.row, pos, kv.row.length, HBaseType.DEFAULT_VERSION)
      pos += srcIdLen

      val isTallSchema = pos + 5 != kv.row.length
      var tgtVertexId = TargetVertexId(ServiceColumn.Default, srcVertexId.innerId)

      if (isTallSchema) {
        val (tgtId, tgtBytesLen) = InnerVal.fromBytes(kv.row, pos, kv.row.length, HBaseType.DEFAULT_VERSION)
        tgtVertexId = TargetVertexId(ServiceColumn.Default, tgtId)
        pos += tgtBytesLen
      }

      val labelWithDir = LabelWithDirection(Bytes.toInt(kv.row, pos, 4))
      pos += 4
      val (labelIdxSeq, isInverted) = bytesToLabelIndexSeqWithIsInverted(kv.row, pos)
      pos += 1

      if (!isInverted) None
      else {
        val label = Label.findById(labelWithDir.labelId)
        val schemaVer = label.schemaVersion
//        val srcVertexId = SourceVertexId(ServiceColumn.Default, srcIdAndTgtId.srcInnerId)
//        val tgtVertexId = SourceVertexId(ServiceColumn.Default, tgtId.tgtInnerId)

        var pos = 0
        val (statusCode, op) = statusCodeWithOp(kv.value(pos))
        pos += 1
        val (props, endAt) = bytesToKeyValuesWithTs(kv.value, pos, schemaVer, label)
        val kvsMap = props.toMap
        val tsInnerVal = kvsMap(LabelMeta.timestamp).innerVal
        val ts = tsInnerVal.toString.toLong
        pos = endAt

        val _pendingEdgeOpt =
          if (pos == kv.value.length) None
          else {
            val (pendingEdgeStatusCode, pendingEdgeOp) = statusCodeWithOp(kv.value(pos))
            pos += 1
            //          val versionNum = Bytes.toLong(kv.value, pos, 8)
            //          pos += 8
            val (pendingEdgeProps, endAt) = bytesToKeyValuesWithTs(kv.value, pos, schemaVer, label)
            pos = endAt
            val lockTs = Option(Bytes.toLong(kv.value, pos, 8))

            val pendingEdge =
              builder.newEdge(
                builder.newVertex(srcVertexId, version),
                builder.newVertex(tgtVertexId, version),
                label, labelWithDir.dir, pendingEdgeOp,
                version, pendingEdgeProps.toMap,
                statusCode = pendingEdgeStatusCode, lockTs = lockTs, tsInnerValOpt = Option(tsInnerVal))

            Option(pendingEdge)
          }

        val snapshotEdge = builder.newSnapshotEdge(
          builder.newVertex(srcVertexId, ts),
          builder.newVertex(tgtVertexId, ts),
          label, labelWithDir.dir, op, version, props.toMap, statusCode = statusCode,
          pendingEdgeOpt = _pendingEdgeOpt, lockTs = None, tsInnerValOpt = Option(tsInnerVal))

        Option(snapshotEdge)
      }
    } catch {
      case e: Exception =>
        logger.error("#" * 100, e)
        None
    }
  }
} 
Example 65
Source File: SnapshotEdgeSerializable.scala    From incubator-s2graph   with Apache License 2.0 5 votes vote down vote up
package org.apache.s2graph.core.storage.serde.snapshotedge.tall

import org.apache.hadoop.hbase.util.Bytes
import org.apache.s2graph.core.{S2Edge, SnapshotEdge}
import org.apache.s2graph.core.schema.LabelIndex
import org.apache.s2graph.core.storage.serde._
import org.apache.s2graph.core.storage.serde.StorageSerializable._
import org.apache.s2graph.core.types.SourceAndTargetVertexIdPair


class SnapshotEdgeSerializable(snapshotEdge: SnapshotEdge) extends Serializable[SnapshotEdge] {

  override def ts = snapshotEdge.version
  override def table = snapshotEdge.label.hbaseTableName.getBytes("UTF-8")

  def statusCodeWithOp(statusCode: Byte, op: Byte): Array[Byte] = {
    val byte = (((statusCode << 4) | op).toByte)
    Array.fill(1)(byte.toByte)
  }
  def valueBytes() = Bytes.add(statusCodeWithOp(snapshotEdge.statusCode, snapshotEdge.op), snapshotEdge.propsToKeyValuesWithTs)

  override def toRowKey: Array[Byte] = {
    val srcIdAndTgtIdBytes = SourceAndTargetVertexIdPair(snapshotEdge.srcVertex.innerId, snapshotEdge.tgtVertex.innerId).bytes
    val labelWithDirBytes = snapshotEdge.labelWithDir.bytes
    val labelIndexSeqWithIsInvertedBytes = labelOrderSeqWithIsInverted(LabelIndex.DefaultSeq, isInverted = true)

    Bytes.add(srcIdAndTgtIdBytes, labelWithDirBytes, labelIndexSeqWithIsInvertedBytes)
  }

  override def toQualifier: Array[Byte] = Array.empty[Byte]

  override def toValue: Array[Byte] =
    snapshotEdge.pendingEdgeOpt match {
      case None => valueBytes()
      case Some(pendingEdge) =>
        val opBytes = statusCodeWithOp(pendingEdge.getStatusCode(), pendingEdge.getOp())
        val versionBytes = Array.empty[Byte]
        val propsBytes = S2Edge.serializePropsWithTs(pendingEdge)
        val lockBytes = Bytes.toBytes(pendingEdge.getLockTs().get)

        Bytes.add(Bytes.add(valueBytes(), opBytes, versionBytes), Bytes.add(propsBytes, lockBytes))
    }

} 
Example 66
Source File: SnapshotEdgeDeserializable.scala    From incubator-s2graph   with Apache License 2.0 5 votes vote down vote up
package org.apache.s2graph.core.storage.serde.snapshotedge.wide

import org.apache.hadoop.hbase.util.Bytes
import org.apache.s2graph.core.schema.{Label, LabelMeta}
import org.apache.s2graph.core.storage.serde.StorageDeserializable._
import org.apache.s2graph.core.storage.CanSKeyValue
import org.apache.s2graph.core.types.{HBaseType, LabelWithDirection, SourceVertexId, TargetVertexId}
import org.apache.s2graph.core._
import org.apache.s2graph.core.storage.serde.Deserializable

class SnapshotEdgeDeserializable(graph: S2GraphLike) extends Deserializable[SnapshotEdge] {
  val builder = graph.elementBuilder
  def statusCodeWithOp(byte: Byte): (Byte, Byte) = {
    val statusCode = byte >> 4
    val op = byte & ((1 << 4) - 1)
    (statusCode.toByte, op.toByte)
  }

  override def fromKeyValues[T: CanSKeyValue](_kvs: Seq[T],
                                              cacheElementOpt: Option[SnapshotEdge]): Option[SnapshotEdge] = {
    try {
      val kvs = _kvs.map { kv => implicitly[CanSKeyValue[T]].toSKeyValue(kv) }
      assert(kvs.size == 1)

      val kv = kvs.head
      val version = kv.timestamp
      var pos = 0
      val (srcVertexId, srcIdLen) = SourceVertexId.fromBytes(kv.row, pos, kv.row.length, HBaseType.DEFAULT_VERSION)
      pos += srcIdLen
      val labelWithDir = LabelWithDirection(Bytes.toInt(kv.row, pos, 4))
      pos += 4
      val (labelIdxSeq, isInverted) = bytesToLabelIndexSeqWithIsInverted(kv.row, pos)
      pos += 1

      if (!isInverted) None
      else {
        val label = Label.findById(labelWithDir.labelId)
        val schemaVer = label.schemaVersion
        val srcVertex = builder.newVertex(srcVertexId, version)

        val (tgtVertexId, _) = TargetVertexId.fromBytes(kv.qualifier, 0, kv.qualifier.length, schemaVer)

        var pos = 0
        val (statusCode, op) = statusCodeWithOp(kv.value(pos))
        pos += 1
        val (props, endAt) = bytesToKeyValuesWithTs(kv.value, pos, schemaVer, label)
        val kvsMap = props.toMap
        val tsInnerVal = kvsMap(LabelMeta.timestamp).innerVal
        val ts = tsInnerVal.toString.toLong
        pos = endAt

        val _pendingEdgeOpt =
          if (pos == kv.value.length) None
          else {
            val (pendingEdgeStatusCode, pendingEdgeOp) = statusCodeWithOp(kv.value(pos))
            pos += 1
            //          val versionNum = Bytes.toLong(kv.value, pos, 8)
            //          pos += 8
            val (pendingEdgeProps, endAt) = bytesToKeyValuesWithTs(kv.value, pos, schemaVer, label)
            pos = endAt
            val lockTs = Option(Bytes.toLong(kv.value, pos, 8))

            val pendingEdge =
              builder.newEdge(
                builder.newVertex(srcVertexId, version),
                builder.newVertex(tgtVertexId, version),
                label, labelWithDir.dir, pendingEdgeOp,
                version, pendingEdgeProps.toMap,
                statusCode = pendingEdgeStatusCode, lockTs = lockTs, tsInnerValOpt = Option(tsInnerVal))
            Option(pendingEdge)
          }

        val snapshotEdge = builder.newSnapshotEdge(
          builder.newVertex(srcVertexId, ts),
          builder.newVertex(tgtVertexId, ts),
          label, labelWithDir.dir, op, version, props.toMap, statusCode = statusCode,
          pendingEdgeOpt = _pendingEdgeOpt, lockTs = None, tsInnerValOpt = Option(tsInnerVal))

        Option(snapshotEdge)
      }
    } catch {
      case e: Exception => None
    }
  }
} 
Example 67
Source File: SnapshotEdgeSerializable.scala    From incubator-s2graph   with Apache License 2.0 5 votes vote down vote up
package org.apache.s2graph.core.storage.serde.snapshotedge.wide

import org.apache.hadoop.hbase.util.Bytes
import org.apache.s2graph.core.{S2Edge, SnapshotEdge}
import org.apache.s2graph.core.schema.LabelIndex
import org.apache.s2graph.core.storage.serde.Serializable
import org.apache.s2graph.core.storage.serde.StorageSerializable._
import org.apache.s2graph.core.types.VertexId




class SnapshotEdgeSerializable(snapshotEdge: SnapshotEdge) extends Serializable[SnapshotEdge] {

  override def ts = snapshotEdge.version
  override def table = snapshotEdge.label.hbaseTableName.getBytes("UTF-8")

  def statusCodeWithOp(statusCode: Byte, op: Byte): Array[Byte] = {
    val byte = (((statusCode << 4) | op).toByte)
    Array.fill(1)(byte.toByte)
  }
  def valueBytes() = Bytes.add(statusCodeWithOp(snapshotEdge.statusCode, snapshotEdge.op), snapshotEdge.propsToKeyValuesWithTs)


  override def toRowKey: Array[Byte] = {
    val srcIdBytes = VertexId.toSourceVertexId(snapshotEdge.srcVertex.id).bytes
    val labelWithDirBytes = snapshotEdge.labelWithDir.bytes
    val labelIndexSeqWithIsInvertedBytes = labelOrderSeqWithIsInverted(LabelIndex.DefaultSeq, isInverted = true)

    Bytes.add(srcIdBytes, labelWithDirBytes, labelIndexSeqWithIsInvertedBytes)
  }

  override def toQualifier: Array[Byte] =
    VertexId.toTargetVertexId(snapshotEdge.tgtVertex.id).bytes

  override def toValue: Array[Byte] =
    snapshotEdge.pendingEdgeOpt match {
      case None => valueBytes()
      case Some(pendingEdge) =>
        val opBytes = statusCodeWithOp(pendingEdge.getStatusCode(), pendingEdge.getOp())
        val versionBytes = Array.empty[Byte]
        val propsBytes = S2Edge.serializePropsWithTs(pendingEdge)
        val lockBytes = Bytes.toBytes(pendingEdge.getLockTs().get)
        Bytes.add(Bytes.add(valueBytes(), opBytes, versionBytes), Bytes.add(propsBytes, lockBytes))
    }

} 
Example 68
Source File: SKeyValue.scala    From incubator-s2graph   with Apache License 2.0 5 votes vote down vote up
package org.apache.s2graph.core.storage

import java.nio.charset.StandardCharsets
import org.apache.hadoop.hbase.util.Bytes
import org.hbase.async.KeyValue


object SKeyValue {
//  val SnapshotEdgeCf = "s".getBytes(StandardCharsets.UTF_8)
  val EdgeCf = "e".getBytes(StandardCharsets.UTF_8)
  val VertexCf = "v".getBytes(StandardCharsets.UTF_8)
  val Put = 1
  val Delete = 2
  val Increment = 3
  val Default = Put
}

case class SKeyValue(table: Array[Byte],
                     row: Array[Byte],
                     cf: Array[Byte],
                     qualifier: Array[Byte],
                     value: Array[Byte],
                     timestamp: Long,
                     operation: Int = SKeyValue.Default,
                     durability: Boolean = true) {
  def toLogString = {
    Map("table" -> Bytes.toString(table), "row" -> row.toList, "cf" -> Bytes.toString(cf),
      "qualifier" -> qualifier.toList, "value" -> value.toList, "timestamp" -> timestamp,
      "operation" -> operation, "durability" -> durability).toString()
  }
  override def toString(): String = toLogString

  def toKeyValue: KeyValue = new KeyValue(row, cf, qualifier, timestamp, value)
}

trait CanSKeyValue[T] {
  def toSKeyValue(from: T): SKeyValue
}

object CanSKeyValue {
  def instance[T](f: T => SKeyValue): CanSKeyValue[T] = new CanSKeyValue[T] {
    override def toSKeyValue(from: T): SKeyValue = f.apply(from)
  }

  // For asyncbase KeyValues
  implicit val asyncKeyValue = instance[KeyValue] { kv =>
    SKeyValue(Array.empty[Byte], kv.key(), kv.family(), kv.qualifier(), kv.value(), kv.timestamp())
  }

  implicit val hbaseKeyValue = instance[org.apache.hadoop.hbase.KeyValue] { kv =>
    SKeyValue(Array.empty[Byte], kv.getRow, kv.getFamily, kv.getQualifier, kv.getValue, kv.getTimestamp)
  }

  // For asyncbase KeyValues
  implicit val sKeyValue = instance[SKeyValue](identity)

  // For hbase KeyValues
} 
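
CanSKeyValue is a small typeclass, so adapters for other cell-like types can be added without touching SKeyValue itself. A sketch with a hypothetical FlatCell type; the field names and the choice of the edge column family are illustrative, and the SKeyValue/CanSKeyValue definitions above are assumed to be in scope.

case class FlatCell(row: Array[Byte], qualifier: Array[Byte], value: Array[Byte], ts: Long)

object FlatCellSupport {
  implicit val flatCellCanSKeyValue: CanSKeyValue[FlatCell] = CanSKeyValue.instance { fc =>
    // no table is known at this point, and the edge column family is an illustrative choice
    SKeyValue(Array.empty[Byte], fc.row, SKeyValue.EdgeCf, fc.qualifier, fc.value, fc.ts)
  }

  // any type with an instance can be converted uniformly
  def toSKeyValue[T: CanSKeyValue](t: T): SKeyValue = implicitly[CanSKeyValue[T]].toSKeyValue(t)
}
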
Example 69
Source File: AsynchbaseEdgeFetcher.scala    From incubator-s2graph   with Apache License 2.0 5 votes vote down vote up
package org.apache.s2graph.core.storage.hbase

import java.util

import com.stumbleupon.async.Deferred
import com.typesafe.config.Config
import org.apache.hadoop.hbase.util.Bytes
import org.apache.s2graph.core._
import org.apache.s2graph.core.schema.Label
import org.apache.s2graph.core.storage.serde.Serializable
import org.apache.s2graph.core.storage.{CanSKeyValue, StorageIO, StorageSerDe}
import org.apache.s2graph.core.types.{HBaseType, VertexId}
import org.apache.s2graph.core.utils.{CanDefer, DeferCache, Extensions, logger}
import org.hbase.async._

import scala.concurrent.{ExecutionContext, Future}

class AsynchbaseEdgeFetcher(val graph: S2GraphLike,
                            val config: Config,
                            val client: HBaseClient,
                            val serDe: StorageSerDe,
                            val io: StorageIO) extends EdgeFetcher {

  import AsynchbaseStorage._
  import CanDefer._
  import Extensions.DeferOps

  import scala.collection.JavaConverters._

  

    // NOTE: the enclosing method was stripped when this example was extracted, so the
    // fragment below does not compile on its own: queryRequest, queryParam, parentEdges,
    // cacheTTL, futureCache, fetchInner and toCacheKeyBytes all come from the elided
    // surrounding code (apparently the per-QueryRequest fetch path). It is kept to show
    // how the interval bytes are folded into the request cache key with Bytes.add.
    val edge = graph.elementBuilder.toRequestEdge(queryRequest, parentEdges)
    val request = buildRequest(client, serDe, queryRequest, edge)

    val (intervalMaxBytes, intervalMinBytes) = queryParam.buildInterval(Option(edge))
    val requestCacheKey = Bytes.add(toCacheKeyBytes(request), intervalMaxBytes, intervalMinBytes)

    if (cacheTTL <= 0) fetchInner(request)
    else {
      val cacheKeyBytes = Bytes.add(queryRequest.query.queryOption.cacheKeyBytes, requestCacheKey)

      //      val cacheKeyBytes = toCacheKeyBytes(request)
      val cacheKey = queryParam.toCacheKey(cacheKeyBytes)
      futureCache.getOrElseUpdate(cacheKey, cacheTTL)(fetchInner(request))
    }
  }
} 
Example 70
Source File: RocksVertexFetcher.scala    From incubator-s2graph   with Apache License 2.0 5 votes vote down vote up
package org.apache.s2graph.core.storage.rocks

import com.typesafe.config.Config
import org.apache.hadoop.hbase.util.Bytes
import org.apache.s2graph.core._
import org.apache.s2graph.core.schema.ServiceColumn
import org.apache.s2graph.core.storage.rocks.RocksStorage.{qualifier, table}
import org.apache.s2graph.core.storage.{SKeyValue, StorageIO, StorageSerDe}
import org.apache.s2graph.core.types.HBaseType
import org.rocksdb.RocksDB

import scala.collection.mutable.ArrayBuffer
import scala.concurrent.{ExecutionContext, Future}

class RocksVertexFetcher(val graph: S2GraphLike,
                         val config: Config,
                         val db: RocksDB,
                         val vdb: RocksDB,
                         val serDe: StorageSerDe,
                         val io: StorageIO) extends VertexFetcher {
  private def fetchKeyValues(queryRequest: QueryRequest, vertex: S2VertexLike)(implicit ec: ExecutionContext): Future[Seq[SKeyValue]] = {
    val rpc = RocksStorage.buildRequest(queryRequest, vertex)

    RocksStorage.fetchKeyValues(vdb, db, rpc)
  }

  override def fetchVertices(vertexQueryParam: VertexQueryParam)(implicit ec: ExecutionContext): Future[Seq[S2VertexLike]] = {
    def fromResult(kvs: Seq[SKeyValue], version: String): Seq[S2VertexLike] = {
      if (kvs.isEmpty) Nil
      else serDe.vertexDeserializer(version).fromKeyValues(kvs, None).toSeq.filter(vertexQueryParam.where.get.filter)
    }
    val vertices = vertexQueryParam.vertexIds.map(vId => graph.elementBuilder.newVertex(vId))

    val futures = vertices.map { vertex =>
      val queryParam = QueryParam.Empty
      val q = Query.toQuery(Seq(vertex), Seq(queryParam))
      val queryRequest = QueryRequest(q, stepIdx = -1, vertex, queryParam)

      fetchKeyValues(queryRequest, vertex).map { kvs =>
        fromResult(kvs, vertex.serviceColumn.schemaVersion)
      } recoverWith {
        case ex: Throwable => Future.successful(Nil)
      }
    }

    Future.sequence(futures).map(_.flatten)
  }

  override def fetchVerticesAll()(implicit ec: ExecutionContext) = {
    import scala.collection.mutable

    val vertices = new ArrayBuffer[S2VertexLike]()
    ServiceColumn.findAll().groupBy(_.service.hTableName).toSeq.foreach { case (hTableName, columns) =>
      val distinctColumns = columns.toSet

      val iter = vdb.newIterator()
      val buffer = mutable.ListBuffer.empty[SKeyValue]
      var oldVertexIdBytes = Array.empty[Byte]
      var minusPos = 0

      try {
        iter.seekToFirst()
        while (iter.isValid) {
          val row = iter.key()
          if (!Bytes.equals(oldVertexIdBytes, 0, oldVertexIdBytes.length - minusPos, row, 0, row.length - 1)) {
            if (buffer.nonEmpty)
              serDe.vertexDeserializer(schemaVer = HBaseType.DEFAULT_VERSION).fromKeyValues(buffer, None)
                .filter(v => distinctColumns(v.serviceColumn))
                .foreach { vertex =>
                  vertices += vertex
                }

            oldVertexIdBytes = row
            minusPos = 1
            buffer.clear()
          }
          val kv = SKeyValue(table, iter.key(), SKeyValue.VertexCf, qualifier, iter.value(), System.currentTimeMillis())
          buffer += kv

          iter.next()
        }
        if (buffer.nonEmpty)
          serDe.vertexDeserializer(schemaVer = HBaseType.DEFAULT_VERSION).fromKeyValues(buffer, None)
            .filter(v => distinctColumns(v.serviceColumn))
            .foreach { vertex =>
              vertices += vertex
            }

      } finally {
        iter.close()
      }
    }

    Future.successful(vertices)
  }
} 
Example 71
Source File: LabelWithDirection.scala    From incubator-s2graph   with Apache License 2.0 5 votes vote down vote up
package org.apache.s2graph.core.types

import org.apache.hadoop.hbase.util.Bytes
import org.apache.s2graph.core.GraphUtil

object LabelWithDirection {

  import HBaseType._

  def apply(compositeInt: Int): LabelWithDirection = {
    //      logger.debug(s"CompositeInt: $compositeInt")

    val dir = compositeInt & ((1 << bitsForDir) - 1)
    val labelId = compositeInt >> bitsForDir
    LabelWithDirection(labelId, dir)
  }

  def labelOrderSeqWithIsInverted(labelOrderSeq: Byte, isInverted: Boolean): Array[Byte] = {
    assert(labelOrderSeq < (1 << 6))
    val byte = labelOrderSeq << 1 | (if (isInverted) 1 else 0)
    Array.fill(1)(byte.toByte)
  }

  def bytesToLabelIndexSeqWithIsInverted(bytes: Array[Byte], offset: Int): (Byte, Boolean) = {
    val byte = bytes(offset)
    val isInverted = if ((byte & 1) != 0) true else false
    val labelOrderSeq = byte >> 1
    (labelOrderSeq.toByte, isInverted)
  }
}

case class LabelWithDirection(labelId: Int, dir: Int) extends HBaseSerializable {

  import HBaseType._

  assert(dir < (1 << bitsForDir))
  assert(labelId < (Int.MaxValue >> bitsForDir))

  lazy val labelBits = labelId << bitsForDir

  lazy val compositeInt = labelBits | dir

  def bytes = {
     Bytes.toBytes(compositeInt)
  }

  lazy val dirToggled = LabelWithDirection(labelId, GraphUtil.toggleDir(dir))

  def updateDir(newDir: Int) = LabelWithDirection(labelId, newDir)

  def isDirected = dir == 0 || dir == 1

  override def hashCode(): Int = compositeInt

  override def equals(other: Any): Boolean = {
    other match {
      case o: LabelWithDirection => hashCode == o.hashCode()
      case _ => false
    }
  }
} 
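
A small round trip makes the packing explicit: bytes is simply Bytes.toBytes(compositeInt), and the companion's apply(compositeInt) splits the label id and direction back out. The labelId and dir values below are arbitrary samples.

import org.apache.hadoop.hbase.util.Bytes

object LabelWithDirectionRoundTrip {
  def main(args: Array[String]): Unit = {
    val original = LabelWithDirection(labelId = 10, dir = 1)
    val encoded: Array[Byte] = original.bytes              // 4 bytes: Bytes.toBytes(compositeInt)
    val decoded = LabelWithDirection(Bytes.toInt(encoded)) // apply(compositeInt) splits the bits again
    assert(decoded == original)
  }
}
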
Example 72
Source File: S2GraphCellReader.scala    From incubator-s2graph   with Apache License 2.0 5 votes vote down vote up
package org.apache.s2graph.s2jobs.serde.reader

import org.apache.hadoop.hbase.Cell
import org.apache.hadoop.hbase.util.Bytes
import org.apache.s2graph.core.storage.SKeyValue
import org.apache.s2graph.core.types.HBaseType
import org.apache.s2graph.core.{GraphElement, S2Graph}
import org.apache.s2graph.s2jobs.serde.GraphElementReadable

class S2GraphCellReader(elementType: String) extends GraphElementReadable[Seq[Cell]]{
  override def read(s2: S2Graph)(cells: Seq[Cell]): Seq[GraphElement] = {
    val schemaVer = HBaseType.DEFAULT_VERSION
    val kvs = cells.map { cell =>
      new SKeyValue(Array.empty[Byte], cell.getRow, cell.getFamily, cell.getQualifier,
        cell.getValue, cell.getTimestamp, SKeyValue.Default)
    }

    elementType.toLowerCase match {
      case "vertex" | "v" =>
        s2.defaultStorage.serDe.vertexDeserializer(schemaVer)
          .fromKeyValues(kvs, None).map(_.asInstanceOf[GraphElement]).toSeq
      case "indexedge" | "ie" =>
        kvs.flatMap { kv =>
          s2.defaultStorage.serDe.indexEdgeDeserializer(schemaVer)
            .fromKeyValues(Seq(kv), None).map(_.asInstanceOf[GraphElement])
        }
      case "snapshotedge" | "se" =>
        kvs.flatMap { kv =>
          s2.defaultStorage.serDe.snapshotEdgeDeserializer(schemaVer)
            .fromKeyValues(Seq(kv), None).map(_.asInstanceOf[GraphElement])
        }
      case _ => throw new IllegalArgumentException(s"$elementType is not supported.")
    }
  }
} 
Example 73
Source File: BulkLoadPartitioner.scala    From incubator-s2graph   with Apache License 2.0 5 votes vote down vote up
package org.apache.s2graph.s2jobs.spark

import java.util
import java.util.Comparator

import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.Partitioner


class BulkLoadPartitioner(startKeys:Array[Array[Byte]])
  extends Partitioner {

  override def numPartitions: Int = startKeys.length

  override def getPartition(key: Any): Int = {

    val rowKey:Array[Byte] =
      key match {
        case qualifier: KeyFamilyQualifier =>
          qualifier.rowKey
        case _ =>
          key.asInstanceOf[Array[Byte]]
      }

    val comparator: Comparator[Array[Byte]] = new Comparator[Array[Byte]] {
      override def compare(o1: Array[Byte], o2: Array[Byte]): Int = {
        Bytes.compareTo(o1, o2)
      }
    }
    val partition = util.Arrays.binarySearch(startKeys, rowKey, comparator)
    if (partition < 0) partition * -1 + -2
    else partition
  }
} 
Example 74
Source File: HBaseBulkDeleteExample.scala    From SparkOnHBase   with Apache License 2.0 5 votes vote down vote up
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.spark.SparkContext
import org.apache.hadoop.hbase.{TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Delete
import org.apache.spark.SparkConf


object HBaseBulkDeleteExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseBulkDeletesExample {tableName} ")
      return
    }

    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseBulkDeleteExample " + tableName)
    val sc = new SparkContext(sparkConf)
    try {
      //[Array[Byte]]
      val rdd = sc.parallelize(Array(
        Bytes.toBytes("1"),
        Bytes.toBytes("2"),
        Bytes.toBytes("3"),
        Bytes.toBytes("4"),
        Bytes.toBytes("5")
      ))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)
      hbaseContext.bulkDelete[Array[Byte]](rdd,
        TableName.valueOf(tableName),
        putRecord => new Delete(putRecord),
        4)
    } finally {
      sc.stop()
    }
  }
} 
Example 75
Source File: ByteArrayWrapper.scala    From SparkOnHBase   with Apache License 2.0 5 votes vote down vote up
package org.apache.hadoop.hbase.spark

import java.io.Serializable

import org.apache.hadoop.hbase.util.Bytes


class ByteArrayWrapper (var value:Array[Byte])
  extends Comparable[ByteArrayWrapper] with Serializable {
  override def compareTo(valueOther: ByteArrayWrapper): Int = {
    Bytes.compareTo(value,valueOther.value)
  }
  override def equals(o2: Any): Boolean = {
    o2 match {
      case wrapper: ByteArrayWrapper =>
        Bytes.equals(value, wrapper.value)
      case _ =>
        false
    }
  }
  override def hashCode():Int = {
    Bytes.hashCode(value)
  }
} 
Example 76
Source File: BulkLoadPartitioner.scala    From SparkOnHBase   with Apache License 2.0 5 votes vote down vote up
package org.apache.hadoop.hbase.spark

import java.util
import java.util.Comparator

import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.Partitioner


class BulkLoadPartitioner(startKeys:Array[Array[Byte]])
  extends Partitioner {

  override def numPartitions: Int = startKeys.length

  override def getPartition(key: Any): Int = {

    val comparator: Comparator[Array[Byte]] = new Comparator[Array[Byte]] {
      override def compare(o1: Array[Byte], o2: Array[Byte]): Int = {
        Bytes.compareTo(o1, o2)
      }
    }

    val rowKey:Array[Byte] =
      key match {
        case qualifier: KeyFamilyQualifier =>
          qualifier.rowKey
        case wrapper: ByteArrayWrapper =>
          wrapper.value
        case _ =>
          key.asInstanceOf[Array[Byte]]
      }
    val partition = util.Arrays.binarySearch(startKeys, rowKey, comparator)
    if (partition < 0) partition * -1 + -2
    else partition
  }
} 
Example 77
Source File: KeyFamilyQualifier.scala    From SparkOnHBase   with Apache License 2.0 5 votes vote down vote up
package org.apache.hadoop.hbase.spark

import java.io.Serializable

import org.apache.hadoop.hbase.util.Bytes


class KeyFamilyQualifier(val rowKey:Array[Byte], val family:Array[Byte], val qualifier:Array[Byte])
  extends Comparable[KeyFamilyQualifier] with Serializable {
  override def compareTo(o: KeyFamilyQualifier): Int = {
    var result = Bytes.compareTo(rowKey, o.rowKey)
    if (result == 0) {
      result = Bytes.compareTo(family, o.family)
      if (result == 0) result = Bytes.compareTo(qualifier, o.qualifier)
    }
    result
  }
  override def toString: String = {
    Bytes.toString(rowKey) + ":" + Bytes.toString(family) + ":" + Bytes.toString(qualifier)
  }
} 
Example 78
Source File: HBaseMapPartitionExample.scala    From SparkOnHBase   with Apache License 2.0 5 votes vote down vote up
package org.apache.hadoop.hbase.spark.example.rdd

import org.apache.hadoop.hbase.client.Get
import org.apache.hadoop.hbase.{CellUtil, HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkContext, SparkConf}


object HBaseMapPartitionExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseBulkGetExample {tableName}")
      return
    }

    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseBulkGetExample " + tableName)
    val sc = new SparkContext(sparkConf)

    try {

      //[(Array[Byte])]
      val rdd = sc.parallelize(Array(
        Bytes.toBytes("1"),
        Bytes.toBytes("2"),
        Bytes.toBytes("3"),
        Bytes.toBytes("4"),
        Bytes.toBytes("5"),
        Bytes.toBytes("6"),
        Bytes.toBytes("7")))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      val getRdd = rdd.hbaseMapPartitions[String](hbaseContext, (it, connection) => {
        val table = connection.getTable(TableName.valueOf(tableName))
        it.map{r =>
          //batching would be faster.  This is just an example
          val result = table.get(new Get(r))
          val cellIt = result.listCells().iterator()
          val b = new StringBuilder
          b.append(Bytes.toString(result.getRow) + ":")
          while (cellIt.hasNext) {
            val cell = cellIt.next()
            // clone the qualifier/value slices; the raw backing arrays hold the whole KeyValue
            val q = Bytes.toString(CellUtil.cloneQualifier(cell))
            if (q.equals("counter")) {
              b.append("(" + q + "," + Bytes.toLong(CellUtil.cloneValue(cell)) + ")")
            } else {
              b.append("(" + q + "," + Bytes.toString(CellUtil.cloneValue(cell)) + ")")
            }
          }
          b.toString()
        }
      })

      getRdd.collect().foreach(v => println(v))

    } finally {
      sc.stop()
    }
  }
} 
Example 79
Source File: HBaseBulkGetExample.scala    From SparkOnHBase   with Apache License 2.0 5 votes vote down vote up
package org.apache.hadoop.hbase.spark.example.rdd

import org.apache.hadoop.hbase.client.{Result, Get}
import org.apache.hadoop.hbase.{CellUtil, TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
import org.apache.spark.{SparkContext, SparkConf}


object HBaseBulkGetExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseBulkGetExample {tableName}")
      return
    }

    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseBulkGetExample " + tableName)
    val sc = new SparkContext(sparkConf)

    try {

      //[(Array[Byte])]
      val rdd = sc.parallelize(Array(
        Bytes.toBytes("1"),
        Bytes.toBytes("2"),
        Bytes.toBytes("3"),
        Bytes.toBytes("4"),
        Bytes.toBytes("5"),
        Bytes.toBytes("6"),
        Bytes.toBytes("7")))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      val getRdd = rdd.hbaseBulkGet[String](hbaseContext, TableName.valueOf(tableName), 2,
        record => {
          System.out.println("making Get")
          new Get(record)
        },
        (result: Result) => {

          val it = result.listCells().iterator()
          val b = new StringBuilder

          b.append(Bytes.toString(result.getRow) + ":")

          while (it.hasNext) {
            val cell = it.next()
            val q = Bytes.toString(CellUtil.cloneQualifier(cell))
            if (q.equals("counter")) {
              b.append("(" + q + "," + Bytes.toLong(CellUtil.cloneValue(cell)) + ")")
            } else {
              b.append("(" + q + "," + Bytes.toString(CellUtil.cloneValue(cell)) + ")")
            }
          }
          b.toString()
        })

      getRdd.collect().foreach(v => println(v))

    } finally {
      sc.stop()
    }
  }
} 
Example 80
Source File: HBaseForeachPartitionExample.scala    From SparkOnHBase   with Apache License 2.0 5 votes vote down vote up
package org.apache.hadoop.hbase.spark.example.rdd

import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.{TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkContext, SparkConf}


object HBaseForeachPartitionExample {
  def main(args: Array[String]) {
    if (args.length < 2) {
      println("HBaseBulkPutExample {tableName} {columnFamily}")
      return
    }

    val tableName = args(0)
    val columnFamily = args(1)

    val sparkConf = new SparkConf().setAppName("HBaseBulkPutExample " +
      tableName + " " + columnFamily)
    val sc = new SparkContext(sparkConf)

    try {
      //[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])]
      val rdd = sc.parallelize(Array(
        (Bytes.toBytes("1"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))),
        (Bytes.toBytes("2"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))),
        (Bytes.toBytes("3"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))),
        (Bytes.toBytes("4"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))),
        (Bytes.toBytes("5"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5"))))
      ))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)


      rdd.hbaseForeachPartition(hbaseContext,
        (it, connection) => {
          val m = connection.getBufferedMutator(TableName.valueOf(tableName))

          it.foreach(r => {
            val put = new Put(r._1)
            r._2.foreach((putValue) =>
              put.addColumn(putValue._1, putValue._2, putValue._3))
            m.mutate(put)
          })
          m.flush()
          m.close()
        })

    } finally {
      sc.stop()
    }
  }
} 
Example 81
Source File: HBaseBulkDeleteExample.scala    From SparkOnHBase   with Apache License 2.0 5 votes vote down vote up
package org.apache.hadoop.hbase.spark.example.rdd

import org.apache.hadoop.hbase.client.Delete
import org.apache.hadoop.hbase.{TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
import org.apache.hadoop.hbase.util.Bytes

import org.apache.spark.{SparkContext, SparkConf}


object HBaseBulkDeleteExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseBulkDeletesExample {tableName} ")
      return
    }

    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseBulkDeleteExample " + tableName)
    val sc = new SparkContext(sparkConf)
    try {
      //[Array[Byte]]
      val rdd = sc.parallelize(Array(
        Bytes.toBytes("1"),
        Bytes.toBytes("2"),
        Bytes.toBytes("3"),
        Bytes.toBytes("4"),
        Bytes.toBytes("5")
      ))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      rdd.hbaseBulkDelete(hbaseContext, TableName.valueOf(tableName),
        putRecord => new Delete(putRecord),
        4)

    } finally {
      sc.stop()
    }
  }
} 
Example 82
Source File: HBaseBulkPutExample.scala    From SparkOnHBase   with Apache License 2.0 5 votes vote down vote up
package org.apache.hadoop.hbase.spark.example.rdd

import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.spark.{SparkConf, SparkContext}


object HBaseBulkPutExample {
   def main(args: Array[String]) {
     if (args.length < 2) {
       println("HBaseBulkPutExample {tableName} {columnFamily}")
       return
     }

     val tableName = args(0)
     val columnFamily = args(1)

     val sparkConf = new SparkConf().setAppName("HBaseBulkPutExample " +
       tableName + " " + columnFamily)
     val sc = new SparkContext(sparkConf)

     try {
       //[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])]
       val rdd = sc.parallelize(Array(
         (Bytes.toBytes("1"),
           Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))),
         (Bytes.toBytes("2"),
           Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))),
         (Bytes.toBytes("3"),
           Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))),
         (Bytes.toBytes("4"),
           Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))),
         (Bytes.toBytes("5"),
           Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5"))))
       ))

       val conf = HBaseConfiguration.create()

       val hbaseContext = new HBaseContext(sc, conf)

       rdd.hbaseBulkPut(hbaseContext, TableName.valueOf(tableName),
         (putRecord) => {
           val put = new Put(putRecord._1)
           putRecord._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2,
             putValue._3))
           put
         })

     } finally {
       sc.stop()
     }
   }
 } 
Example 83
Source File: HBaseBulkGetExample.scala    From SparkOnHBase   with Apache License 2.0 5 votes vote down vote up
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.spark.SparkContext
import org.apache.hadoop.hbase.{CellUtil, TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Get
import org.apache.hadoop.hbase.client.Result
import org.apache.spark.SparkConf


object HBaseBulkGetExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("HBaseBulkGetExample {tableName}")
      return
    }

    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseBulkGetExample " + tableName)
    val sc = new SparkContext(sparkConf)

    try {

      //[(Array[Byte])]
      val rdd = sc.parallelize(Array(
        Bytes.toBytes("1"),
        Bytes.toBytes("2"),
        Bytes.toBytes("3"),
        Bytes.toBytes("4"),
        Bytes.toBytes("5"),
        Bytes.toBytes("6"),
        Bytes.toBytes("7")))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      val getRdd = hbaseContext.bulkGet[Array[Byte], String](
        TableName.valueOf(tableName),
        2,
        rdd,
        record => {
          System.out.println("making Get")
          new Get(record)
        },
        (result: Result) => {

          val it = result.listCells().iterator()
          val b = new StringBuilder

          b.append(Bytes.toString(result.getRow) + ":")

          while (it.hasNext) {
            val cell = it.next()
            val q = Bytes.toString(CellUtil.cloneQualifier(cell))
            if (q.equals("counter")) {
              b.append("(" + q + "," + Bytes.toLong(CellUtil.cloneValue(cell)) + ")")
            } else {
              b.append("(" + q + "," + Bytes.toString(CellUtil.cloneValue(cell)) + ")")
            }
          }
          b.toString()
        })

      getRdd.collect().foreach(v => println(v))

    } finally {
      sc.stop()
    }
  }
} 
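A note on the bulkGet call above: the two type parameters are the element type of the input RDD (the raw row keys) and the element type of the mapped output RDD, while the second argument (2) is the batch size, i.e. how many Gets are grouped into one multi-get round trip. As a minimal sketch reusing the names from the example (not part of SparkOnHBase itself), a variant that only extracts the row keys could look like this:

      // Sketch: same bulkGet API as above, but map each Result straight to its row key.
      val keysOnly = hbaseContext.bulkGet[Array[Byte], String](
        TableName.valueOf(tableName),
        10,                                   // batch size per multi-get round trip (arbitrary here)
        rdd,
        record => new Get(record),            // build the Get from the raw row key
        (result: Result) => Bytes.toString(result.getRow))

      keysOnly.collect().foreach(println)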
Example 84
Source File: HBaseBulkPutExampleFromFile.scala    From SparkOnHBase   with Apache License 2.0 5 votes vote down vote up
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.spark.SparkContext
import org.apache.hadoop.hbase.{TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text
import org.apache.spark.SparkConf


object HBaseBulkPutExampleFromFile {
  def main(args: Array[String]) {
    if (args.length < 3) {
      println("HBaseBulkPutExampleFromFile {tableName} {columnFamily} {inputFile}")
      return
    }

    val tableName = args(0)
    val columnFamily = args(1)
    val inputFile = args(2)

    val sparkConf = new SparkConf().setAppName("HBaseBulkPutExampleFromFile " +
      tableName + " " + columnFamily + " " + inputFile)
    val sc = new SparkContext(sparkConf)

    try {
      val rdd = sc.hadoopFile(
        inputFile,
        classOf[TextInputFormat],
        classOf[LongWritable],
        classOf[Text]).map(v => {
        System.out.println("reading-" + v._2.toString)
        v._2.toString
      })

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)
      hbaseContext.bulkPut[String](rdd,
        TableName.valueOf(tableName),
        (putRecord) => {
          System.out.println("hbase-" + putRecord)
          val put = new Put(Bytes.toBytes("Value- " + putRecord))
          put.addColumn(Bytes.toBytes("c"), Bytes.toBytes("1"),
            Bytes.toBytes(putRecord.length()))
          put
        });
    } finally {
      sc.stop()
    }
  }
} 
Example 85
Source File: KeyFamilyQualifier.scala    From incubator-s2graph   with Apache License 2.0 5 votes vote down vote up
package org.apache.s2graph.s2jobs.spark

import java.io.Serializable

import org.apache.hadoop.hbase.util.Bytes


class KeyFamilyQualifier(val rowKey:Array[Byte], val family:Array[Byte], val qualifier:Array[Byte])
  extends Comparable[KeyFamilyQualifier] with Serializable {
  override def compareTo(o: KeyFamilyQualifier): Int = {
    var result = Bytes.compareTo(rowKey, o.rowKey)
    if (result == 0) {
      result = Bytes.compareTo(family, o.family)
      if (result == 0) result = Bytes.compareTo(qualifier, o.qualifier)
    }
    result
  }
  override def toString: String = {
    Bytes.toString(rowKey) + ":" + Bytes.toString(family) + ":" + Bytes.toString(qualifier)
  }
} 
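This comparator is the kind of key used when preparing data for HFile bulk loads, where cells must be sorted by row key, then column family, then qualifier. As a small, hypothetical sketch (the keys below are made up), the Comparable implementation lets a plain collection be sorted directly:

  // Hypothetical usage: sort a few keys with the compareTo defined above.
  val keys = Seq(
    new KeyFamilyQualifier(Bytes.toBytes("row2"), Bytes.toBytes("cf"), Bytes.toBytes("b")),
    new KeyFamilyQualifier(Bytes.toBytes("row1"), Bytes.toBytes("cf"), Bytes.toBytes("b")),
    new KeyFamilyQualifier(Bytes.toBytes("row1"), Bytes.toBytes("cf"), Bytes.toBytes("a")))

  val sorted = keys.sortWith((x, y) => x.compareTo(y) < 0)
  sorted.foreach(println)  // prints row1:cf:a, row1:cf:b, row2:cf:b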
Example 86
Source File: HBaseBulkPutExample.scala    From SparkOnHBase   with Apache License 2.0 5 votes vote down vote up
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.spark.SparkContext
import org.apache.hadoop.hbase.{TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Put
import org.apache.spark.SparkConf


object HBaseBulkPutExample {
  def main(args: Array[String]) {
    if (args.length < 2) {
      println("HBaseBulkPutExample {tableName} {columnFamily}")
      return
    }

    val tableName = args(0)
    val columnFamily = args(1)

    val sparkConf = new SparkConf().setAppName("HBaseBulkPutExample " +
      tableName + " " + columnFamily)
    val sc = new SparkContext(sparkConf)

    try {
      //[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])]
      val rdd = sc.parallelize(Array(
        (Bytes.toBytes("1"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))),
        (Bytes.toBytes("2"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))),
        (Bytes.toBytes("3"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))),
        (Bytes.toBytes("4"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))),
        (Bytes.toBytes("5"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5"))))
      ))

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)
      hbaseContext.bulkPut[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])](rdd,
        TableName.valueOf(tableName),
        (putRecord) => {
          val put = new Put(putRecord._1)
          putRecord._2.foreach((putValue) =>
            put.addColumn(putValue._1, putValue._2, putValue._3))
          put
        });
    } finally {
      sc.stop()
    }
  }
} 
Example 87
Source File: HBaseDistributedScanExample.scala    From SparkOnHBase   with Apache License 2.0 5 votes vote down vote up
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.spark.SparkContext
import org.apache.hadoop.hbase.{TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Scan
import org.apache.spark.SparkConf

object HBaseDistributedScanExample {
  def main(args: Array[String]) {
    if (args.length < 1) {
      println("GenerateGraphs {tableName}")
      return
    }

    val tableName = args(0)

    val sparkConf = new SparkConf().setAppName("HBaseDistributedScanExample " + tableName )
    val sc = new SparkContext(sparkConf)

    try {
      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      val scan = new Scan()
      scan.setCaching(100)

      val getRdd = hbaseContext.hbaseRDD(TableName.valueOf(tableName), scan)

      getRdd.foreach(v => println(Bytes.toString(v._1.get())))

      println("Length: " + getRdd.map(r => r._1.copyBytes()).collect().length);

        //.collect().foreach(v => println(Bytes.toString(v._1.get())))
    } finally {
      sc.stop()
    }
  }

} 
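The Scan handed to hbaseRDD controls what the distributed scan reads; setCaching(100) only tunes how many rows each RPC fetches. As a hedged sketch using the HBase 1.x Scan API from the example (the family "c" and the row bounds are made-up values), the scan can be narrowed before building the RDD:

      // Sketch: restrict the scan to one column family and a row-key range.
      val boundedScan = new Scan()
      boundedScan.setCaching(100)
      boundedScan.addFamily(Bytes.toBytes("c"))     // only read column family "c"
      boundedScan.setStartRow(Bytes.toBytes("1"))   // inclusive start row
      boundedScan.setStopRow(Bytes.toBytes("5"))    // exclusive stop row

      val boundedRdd = hbaseContext.hbaseRDD(TableName.valueOf(tableName), boundedScan)
      println("Bounded count: " + boundedRdd.count())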
Example 88
Source File: HBaseBulkPutTimestampExample.scala    From SparkOnHBase   with Apache License 2.0 5 votes vote down vote up
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.spark.SparkContext
import org.apache.hadoop.hbase.{TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Put
import org.apache.spark.SparkConf


object HBaseBulkPutTimestampExample {
  def main(args: Array[String]) {
    if (args.length < 2) {
      System.out.println("HBaseBulkPutTimestampExample {tableName} {columnFamily}")
      return
    }

    val tableName = args(0)
    val columnFamily = args(1)

    val sparkConf = new SparkConf().setAppName("HBaseBulkPutTimestampExample " +
      tableName + " " + columnFamily)
    val sc = new SparkContext(sparkConf)

    try {

      val rdd = sc.parallelize(Array(
        (Bytes.toBytes("6"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))),
        (Bytes.toBytes("7"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))),
        (Bytes.toBytes("8"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))),
        (Bytes.toBytes("9"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))),
        (Bytes.toBytes("10"),
          Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5"))))))

      val conf = HBaseConfiguration.create()

      val timeStamp = System.currentTimeMillis()

      val hbaseContext = new HBaseContext(sc, conf)
      hbaseContext.bulkPut[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])](rdd,
        TableName.valueOf(tableName),
        (putRecord) => {
          val put = new Put(putRecord._1)
          putRecord._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2,
            timeStamp, putValue._3))
          put
        })
    } finally {
      sc.stop()
    }
  }
} 
Example 89
Source File: HBaseSimpleRDD.scala    From spark-hbase-connector   with Apache License 2.0 5 votes vote down vote up
package it.nerdammer.spark.hbase

import it.nerdammer.spark.hbase.conversion.FieldReader
import org.apache.hadoop.hbase.CellUtil
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.{NewHadoopRDD, RDD}
import org.apache.spark.{Partition, TaskContext}

import scala.reflect.ClassTag

class HBaseSimpleRDD[R: ClassTag](hadoopHBase: NewHadoopRDD[ImmutableBytesWritable, Result], builder: HBaseReaderBuilder[R], saltingLength: Int = 0)
                       (implicit mapper: FieldReader[R], saltingProvider: SaltingProviderFactory[String]) extends RDD[R](hadoopHBase) {

  override def getPartitions: Array[Partition] = firstParent[(ImmutableBytesWritable, Result)].partitions

  override def compute(split: Partition, context: TaskContext) = {
    // val cleanConversion = sc.clean ---> next version
    firstParent[(ImmutableBytesWritable, Result)].iterator(split, context)
      .map(e => conversion(e._1, e._2))
  }

  def conversion(key: ImmutableBytesWritable, row: Result) = {

    val columnNames = HBaseUtils.chosenColumns(builder.columns, mapper.columns)

    val columnNamesFC = HBaseUtils.columnsWithFamily(builder.columnFamily, columnNames)

    val columns = columnNamesFC
      .map(t => (Bytes.toBytes(t._1), Bytes.toBytes(t._2)))
      .map(t => if(row.containsColumn(t._1, t._2)) Some(CellUtil.cloneValue(row.getColumnLatestCell(t._1, t._2)).array) else None)
      .toList

    mapper.map(Some(key.get.drop(saltingLength)) :: columns)
  }
} 
Example 90
Source File: HBaseSinkSpec.scala    From incubator-retired-gearpump   with Apache License 2.0 5 votes vote down vote up
package org.apache.gearpump.external.hbase

import akka.actor.ActorSystem
import org.apache.gearpump.Message
import org.apache.gearpump.cluster.UserConfig
import org.apache.gearpump.external.hbase.HBaseSink.{HBaseWriter, HBaseWriterFactory}
import org.apache.gearpump.streaming.MockUtil
import org.apache.gearpump.streaming.task.TaskContext
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.util.Bytes
import org.mockito.Mockito._
import org.scalacheck.Gen
import org.scalatest.mock.MockitoSugar
import org.scalatest.prop.PropertyChecks
import org.scalatest.{Matchers, PropSpec}

class HBaseSinkSpec extends PropSpec with PropertyChecks with Matchers with MockitoSugar {


  property("HBaseSink should invoke HBaseWriter for writing message to HBase") {

    val hbaseWriter = mock[HBaseWriter]
    val hbaseWriterFactory = mock[HBaseWriterFactory]

    implicit val system: ActorSystem = MockUtil.system

    val userConfig = UserConfig.empty
    val tableName = "hbase"

    when(hbaseWriterFactory.getHBaseWriter(userConfig, tableName))
      .thenReturn(hbaseWriter)

    val hbaseSink = new HBaseSink(userConfig, tableName, hbaseWriterFactory)

    hbaseSink.open(MockUtil.mockTaskContext)

    forAll(Gen.alphaStr) { (value: String) =>
      val message = Message(value)
      hbaseSink.write(message)
      verify(hbaseWriter, atLeastOnce()).put(value)
    }

    hbaseSink.close()
    verify(hbaseWriter).close()
  }

  property("HBaseWriter should insert a row successfully") {

    val table = mock[Table]
    val config = mock[Configuration]
    val connection = mock[Connection]
    val taskContext = mock[TaskContext]

    val map = Map[String, String]("HBASESINK" -> "hbasesink", "TABLE_NAME" -> "hbase.table.name",
      "COLUMN_FAMILY" -> "hbase.table.column.family", "COLUMN_NAME" -> "hbase.table.column.name",
      "HBASE_USER" -> "hbase.user", "GEARPUMP_KERBEROS_PRINCIPAL" -> "gearpump.kerberos.principal",
      "GEARPUMP_KEYTAB_FILE" -> "gearpump.keytab.file"
    )
    val userConfig = new UserConfig(map)
    val tableName = "hbase"
    val row = "row"
    val group = "group"
    val name = "name"
    val value = "3.0"

    when(connection.getTable(TableName.valueOf(tableName))).thenReturn(table)

    val put = new Put(Bytes.toBytes(row))
    put.addColumn(Bytes.toBytes(group), Bytes.toBytes(name), Bytes.toBytes(value))

    val hbaseWriter = new HBaseWriter(connection, tableName)
    hbaseWriter.insert(Bytes.toBytes(row), Bytes.toBytes(group), Bytes.toBytes(name),
      Bytes.toBytes(value))

    verify(table).put(MockUtil.argMatch[Put](_.getRow sameElements Bytes.toBytes(row)))
  }
} 
Example 91
Source File: HBaseUtil.scala    From sprue   with Apache License 2.0 5 votes vote down vote up
package com.cloudera.sprue

import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.util.Bytes

object HBaseUtil {
  val columnFamily: String = "cf1"

  def insertIncomingDataIntoHBase(patient: Patient): Put = {
    if (patient.getPatientId == null) {
      return null
    } else {
      val put = new Put(Bytes.toBytes(patient.getPatientId))
      if (patient.getPatientId != null)
        put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("patientId"), Bytes.toBytes(patient.getPatientId))
      if (patient.getLocation != null)
        put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("location"), Bytes.toBytes(patient.getLocation))
      if ((patient.getEvaluationDate : java.lang.Long) != null)
        put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("evaluationDate"), Bytes.toBytes(patient.getEvaluationDate))
      if ((patient.getTemperature  : java.lang.Float) != null)
        put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("temperature"), Bytes.toBytes(patient.getTemperature))
      if ((patient.getWbc  : java.lang.Integer) != null)
        put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("wbc"), Bytes.toBytes(patient.getWbc))

      if ((patient.getHeartRate  : java.lang.Integer) != null)
        put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("heartRate"), Bytes.toBytes(patient.getHeartRate))
      if ((patient.getRespiratoryRate  : java.lang.Integer) != null)
        put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("respiratoryRate"), Bytes.toBytes(patient.getRespiratoryRate))
      if ((patient.getSbp  : java.lang.Integer) != null)
        put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("sbp"), Bytes.toBytes(patient.getSbp))
      if ((patient.getHypotension  : java.lang.Integer) != null)
        put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("hypotension"), Bytes.toBytes(patient.getHypotension))
      if (patient.getInfectionFlag != null)
        put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("infectionFlag"), Bytes.toBytes(patient.getInfectionFlag))
      if ((patient.getOrganFailCount  : java.lang.Integer) != null)
        put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("organFailCount"), Bytes.toBytes(patient.getOrganFailCount))

      put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("evalFinished"), Bytes.toBytes("N"))
      put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("recordUpdatedTime"), Bytes.toBytes(System.currentTimeMillis))

    }
  }

  def insertEvaluatedDataIntoHBase(patient: Patient): Put = {
    if (patient.getPatientId == null) {
      return null
    } else {
      val put = new Put(Bytes.toBytes(patient.getPatientId))
      put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("evalFinished"), Bytes.toBytes("Y"))
      put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("sirsCounter"), Bytes.toBytes(patient.getSirsCounter))
      put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("sirsFlag"), Bytes.toBytes(patient.getSirsFlag))
      put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("sepsisFlag"), Bytes.toBytes(patient.getSepsisFlag))
      put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("severeSepsisFlag"), Bytes.toBytes(patient.getSevereSepsisFlag))
      put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("septicShockFlag"), Bytes.toBytes(patient.getSepticShockFlag))
      put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("organDysfunctionFlag"), Bytes.toBytes(patient.getOrganDysfunctionSyndrome))
      put.add(Bytes.toBytes(columnFamily), Bytes.toBytes("systemEvalTime"), Bytes.toBytes(System.currentTimeMillis))
      put
    }
  }
} 
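Put.add(family, qualifier, value) as used above belongs to the older HBase client API and was deprecated in favor of addColumn, which takes the same arguments. On a newer client the same cell write would look roughly like this (a sketch, not part of the sprue project; "patient-1" is a made-up row key):

    // Equivalent cell write with the non-deprecated API.
    val put = new Put(Bytes.toBytes("patient-1"))
    put.addColumn(Bytes.toBytes("cf1"), Bytes.toBytes("evalFinished"), Bytes.toBytes("N"))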
Example 92
Source File: L6-14HBase.scala    From prosparkstreaming   with Apache License 2.0 5 votes vote down vote up
package org.apress.prospark

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.io.Text
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
import org.json4s.DefaultFormats
import org.json4s.jvalue2extractable
import org.json4s.jvalue2monadic
import org.json4s.native.JsonMethods.parse
import org.json4s.string2JsonInput

object HBaseSinkApp {

  def main(args: Array[String]) {
    if (args.length != 5) {
      System.err.println(
        "Usage: HBaseSinkApp <appname> <hbaseMaster> <tableName> <columnFamilyName> <columnName>")
      System.exit(1)
    }

    val Seq(appName, hbaseMaster, tableName, columnFamilyName, columnName) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val batchInterval = 10
    val windowSize = 20
    val slideInterval = 10

    val ssc = new StreamingContext(conf, Seconds(batchInterval))

    HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env",
      interval = batchInterval)
      .flatMap(rec => {
        implicit val formats = DefaultFormats
        val query = parse(rec) \ "query"
        ((query \ "results" \ "quote").children)
          .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat))
      })
      .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval))
      .foreachRDD(rdd => {
        val hbaseConf = HBaseConfiguration.create()
        hbaseConf.set(TableOutputFormat.OUTPUT_TABLE, tableName)
        hbaseConf.set("hbase.master", hbaseMaster)
        val jobConf = new Configuration(hbaseConf)
        jobConf.set("mapreduce.job.outputformat.class", classOf[TableOutputFormat[Text]].getName)
        rdd.map(rec => {
          val put = new Put(rec._1.getBytes)
          put.addColumn(columnFamilyName.getBytes, columnName.getBytes, Bytes.toBytes(rec._2 / (windowSize / batchInterval)))
          (rec._1, put)
        }).saveAsNewAPIHadoopDataset(jobConf)
      })

    ssc.start()
    ssc.awaitTermination()
  }
} 
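Setting mapreduce.job.outputformat.class by hand works, but a more common way to wire up TableOutputFormat for saveAsNewAPIHadoopDataset is through a mapreduce Job, which fills in the same configuration keys. A hedged sketch of that alternative (mappedRdd is a placeholder for the (rowKey, Put) pairs built inside foreachRDD above):

        import org.apache.hadoop.mapreduce.Job

        val job = Job.getInstance(hbaseConf)  // hbaseConf already carries OUTPUT_TABLE and hbase.master
        job.setOutputFormatClass(classOf[TableOutputFormat[Text]])
        // mappedRdd.saveAsNewAPIHadoopDataset(job.getConfiguration)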
Example 93
Source File: L6-16SparkHBase.scala    From prosparkstreaming   with Apache License 2.0 5 votes vote down vote up
package org.apress.prospark

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
import org.json4s.DefaultFormats
import org.json4s.jvalue2extractable
import org.json4s.jvalue2monadic
import org.json4s.native.JsonMethods.parse
import org.json4s.string2JsonInput

object SparkHBaseBulkPutApp {

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: SparkHBaseBulkPutApp <appname> <tableName> <columnFamilyName> <columnName>")
      System.exit(1)
    }

    val Seq(appName, tableName, columnFamilyName, columnName) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val batchInterval = 10
    val windowSize = 20
    val slideInterval = 10

    val ssc = new StreamingContext(conf, Seconds(batchInterval))

    val hbaseConf = HBaseConfiguration.create()
    val hContext = new HBaseContext(ssc.sparkContext, hbaseConf)

    val windowed = HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env",
      interval = batchInterval)
      .flatMap(rec => {
        implicit val formats = DefaultFormats
        val query = parse(rec) \ "query"
        ((query \ "results" \ "quote").children)
          .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat))
      })
      .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval))

    hContext.streamBulkPut[(String, Float)](windowed, TableName.valueOf(tableName), rec => {
      val put = new Put(rec._1.getBytes)
      put.addColumn(columnFamilyName.getBytes, columnName.getBytes, Bytes.toBytes(rec._2 / (windowSize / batchInterval)))
      put
    })

    ssc.start()
    ssc.awaitTermination()
  }
} 
Example 94
Source File: HbRddReaders.scala    From hbrdd   with Apache License 2.0 5 votes vote down vote up
package top.spoofer.hbrdd.unit

import org.apache.hadoop.hbase.util.Bytes
import org.json4s._

trait HbRddReaders {
  implicit val hbBooleanReader = new HbRddFormatsReader[Boolean] {
    def formatsRead(readData: Array[Byte]): Boolean = Bytes.toBoolean(readData)
  }

  implicit val hbByteArrayReader = new HbRddFormatsReader[Array[Byte]] {
    def formatsRead(readData: Array[Byte]): Array[Byte] = readData
  }

  implicit val hbShortReader = new HbRddFormatsReader[Short] {
    def formatsRead(readData: Array[Byte]): Short = Bytes.toShort(readData)
  }

  implicit val hbIntReader = new HbRddFormatsReader[Int] {
    def formatsRead(readData: Array[Byte]): Int = Bytes.toInt(readData)
  }

  implicit val hbFloatReader = new HbRddFormatsReader[Float] {
    def formatsRead(readData: Array[Byte]): Float = Bytes.toFloat(readData)
  }

  implicit val hbDoubleReader = new HbRddFormatsReader[Double] {
    def formatsRead(readData: Array[Byte]): Double = Bytes.toDouble(readData)
  }

  implicit val hbLongReader = new HbRddFormatsReader[Long] {
    def formatsRead(readData: Array[Byte]): Long = Bytes.toLong(readData)
  }

  implicit val hbStringReader = new HbRddFormatsReader[String] {
    def formatsRead(readData: Array[Byte]): String = Bytes.toString(readData)
  }

  implicit val hbJsonReader = new HbRddFormatsReader[JValue] {
    import org.json4s.jackson.JsonMethods._
    def formatsRead(readData: Array[Byte]): JValue = parse(Bytes.toString(readData))
  }
} 
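Each implicit above is a type-class instance: code that needs to decode a cell asks for an HbRddFormatsReader[A] and the compiler supplies the matching value. A minimal sketch of how such an instance is resolved (the decode helper is hypothetical, not part of hbrdd; it assumes HbRddReaders is mixed in or its members imported):

  // Hypothetical helper: decode raw HBase bytes into A via whichever reader is in implicit scope.
  def decode[A](raw: Array[Byte])(implicit reader: HbRddFormatsReader[A]): A =
    reader.formatsRead(raw)

  val n: Long = decode[Long](Bytes.toBytes(42L))         // resolves hbLongReader
  val s: String = decode[String](Bytes.toBytes("hello")) // resolves hbStringReader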
Example 95
Source File: HbRddWriters.scala    From hbrdd   with Apache License 2.0 5 votes vote down vote up
package top.spoofer.hbrdd.unit

import org.apache.hadoop.hbase.util.Bytes
import org.json4s._

trait HbRddWriters {
  implicit val hbBooleanWriter = new HbRddFormatsWriter[Boolean] {
    def formatsWrite(writeData: Boolean): Array[Byte] = Bytes.toBytes(writeData)
  }

  implicit val hbArrayWriter = new HbRddFormatsWriter[Array[Byte]] {
    def formatsWrite(writeData: Array[Byte]): Array[Byte] = writeData
  }

  implicit val hbShortWriter = new HbRddFormatsWriter[Short] {
    def formatsWrite(writeData: Short): Array[Byte] = Bytes.toBytes(writeData)
  }

  implicit val hbIntWriter = new HbRddFormatsWriter[Int] {
    def formatsWrite(writeData: Int): Array[Byte] = Bytes.toBytes(writeData)
  }

  implicit val hbFloatWriter = new HbRddFormatsWriter[Float] {
    def formatsWrite(writeData: Float): Array[Byte] = Bytes.toBytes(writeData)
  }

  implicit val hbDoubleWrite = new HbRddFormatsWriter[Double] {
    def formatsWrite(writeData: Double): Array[Byte] = Bytes.toBytes(writeData)
  }

  implicit val hbLongWrite = new HbRddFormatsWriter[Long] {
    def formatsWrite(writeData: Long): Array[Byte] = Bytes.toBytes(writeData)
  }

  implicit val hbStringWrite = new HbRddFormatsWriter[String] {
    def formatsWrite(writeData: String): Array[Byte] = Bytes.toBytes(writeData)
  }

  implicit val hbJsonWrite = new HbRddFormatsWriter[JValue] {
    import org.json4s.jackson.JsonMethods._
    def formatsWrite(writeData: JValue): Array[Byte] = Bytes.toBytes(compact(writeData))
  }
} 
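The writers mirror the readers, so a value can be round-tripped through Bytes without touching HBase at all. A small hedged sketch combining both type classes (roundTrip is a made-up helper, not part of hbrdd):

  // Hypothetical round trip: Scala value -> Array[Byte] -> Scala value,
  // driven purely by the implicit writer/reader instances defined above.
  def roundTrip[A](value: A)(implicit w: HbRddFormatsWriter[A], r: HbRddFormatsReader[A]): A =
    r.formatsRead(w.formatsWrite(value))

  assert(roundTrip(3.14) == 3.14)
  assert(roundTrip("spark-on-hbase") == "spark-on-hbase")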
Example 96
Source File: HbasePredicate.scala    From eel-sdk   with Apache License 2.0 5 votes vote down vote up
package io.eels.component.hbase

import io.eels._
import io.eels.schema.{DataType, StructType}
import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp
import org.apache.hadoop.hbase.filter._
import org.apache.hadoop.hbase.util.Bytes

import scala.collection.JavaConversions._

// These are simply marker predicates used for pattern matching
case class ContainsPredicate(name: String, value: Any) extends NamedPredicate(name) {
  override def eval(row: Row): Boolean = true
}

case class RegexPredicate(name: String, value: Any) extends NamedPredicate(name) {
  override def eval(row: Row): Boolean = true
}

case class StartsWithPredicate(name: String, value: Any) extends NamedPredicate(name) {
  override def eval(row: Row): Boolean = true
}

case class NotEqualsPredicate(name: String, value: Any) extends NamedPredicate(name) {
  override def eval(row: Row): Boolean = row.get(name) != value
}

object HbasePredicate {

  private val ByteComparableClazz = classOf[BinaryComparator]
  private val StringComparableClazz = classOf[SubstringComparator]
  private val RegexStringComparableClazz = classOf[RegexStringComparator]
  private val BinaryPrefixComparableClazz = classOf[BinaryPrefixComparator]

  def apply(pred: Predicate)(implicit schema: StructType, serializer: HbaseSerializer): FilterList = pred match {
    case EqualsPredicate(name, value) => new FilterList(hbaseFiler(name, CompareOp.EQUAL, value, ByteComparableClazz))
    case NotEqualsPredicate(name, value) => new FilterList(hbaseFiler(name, CompareOp.NOT_EQUAL, value, ByteComparableClazz))
    case ContainsPredicate(name, value) => new FilterList(hbaseFiler(name, CompareOp.EQUAL, value, StringComparableClazz))
    case StartsWithPredicate(name, value) => new FilterList(hbaseFiler(name, CompareOp.EQUAL, value, BinaryPrefixComparableClazz))
    case RegexPredicate(name, value) => new FilterList(hbaseFiler(name, CompareOp.EQUAL, value, RegexStringComparableClazz))
    case GtPredicate(name, value) => new FilterList(hbaseFiler(name, CompareOp.GREATER, value, ByteComparableClazz))
    case GtePredicate(name, value) => new FilterList(hbaseFiler(name, CompareOp.GREATER_OR_EQUAL, value, ByteComparableClazz))
    case LtPredicate(name, value) => new FilterList(hbaseFiler(name, CompareOp.LESS, value, ByteComparableClazz))
    case LtePredicate(name, value) => new FilterList(hbaseFiler(name, CompareOp.LESS_OR_EQUAL, value, ByteComparableClazz))
    case AndPredicate(predicates: Seq[Predicate]) => new FilterList(FilterList.Operator.MUST_PASS_ALL, predicates.map(apply).flatMap(_.getFilters))
    case OrPredicate(predicates: Seq[Predicate]) => new FilterList(FilterList.Operator.MUST_PASS_ONE, predicates.map(apply).flatMap(_.getFilters))
    case _@predicateType => sys.error(s"Predicate type '${predicateType.getClass.getSimpleName}' is not supported!")
  }

  def hbaseFiler[T](name: String, compareOp: CompareOp, value: Any, comparableClass: Class[T])
                   (implicit schema: StructType, serializer: HbaseSerializer): Filter = {
    val field = schema.fields.find(_.name == name).getOrElse(sys.error(s"Field '$name' in the predicate is not defined in the EEL schema"))
    if (field.key) {
      new RowFilter(compareOp, hbaseComparator(comparableClass, name, field.dataType, value))
    } else {
      new SingleColumnValueFilter(
        Bytes.toBytes(field.columnFamily.getOrElse(sys.error(s"No Column Family defined for field '${field.name}'"))),
        Bytes.toBytes(name),
        compareOp,
        hbaseComparator(comparableClass, name, field.dataType, value))
    }
  }

  def hbaseComparator[T](comparableClass: Class[T], name: String, dataType: DataType, value: Any)
                        (implicit schema: StructType, serializer: HbaseSerializer): ByteArrayComparable = (comparableClass, value) match {
    case (ByteComparableClazz, _) => new BinaryComparator(serializer.toBytes(value, name, dataType))
    case (RegexStringComparableClazz, stringValue: String) => new RegexStringComparator(stringValue)
    case (StringComparableClazz, stringValue: String) => new SubstringComparator(stringValue)
    case (BinaryPrefixComparableClazz, _) => new BinaryPrefixComparator(serializer.toBytes(value, name, dataType))
  }

  // Shorthand predicate names
  def or(left: Predicate, right: Predicate) = OrPredicate(Seq(left, right))

  def and(left: Predicate, right: Predicate) = AndPredicate(Seq(left, right))

  def equals(name: String, value: Any) = EqualsPredicate(name, value)

  def notEquals(name: String, value: Any) = NotEqualsPredicate(name, value)

  def gt(name: String, value: Any) = GtPredicate(name, value)

  def gte(name: String, value: Any) = GtePredicate(name, value)

  def lt(name: String, value: Any) = LtPredicate(name, value)

  def lte(name: String, value: Any) = LtePredicate(name, value)

  def regex(name: String, value: Any) = RegexPredicate(name, value)

  def contains(name: String, value: Any) = ContainsPredicate(name, value)

  def startsWith(name: String, value: Any) = StartsWithPredicate(name, value)

} 
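Every predicate above ultimately becomes a standard HBase Filter, so the FilterList that HbasePredicate.apply produces could also be assembled by hand. A rough sketch of the raw HBase equivalent of and(gt(...), equals(...)) over two non-key columns (the family and column names are illustrative, not from eel-sdk):

  // Hand-built equivalent of an AndPredicate over two SingleColumnValueFilters.
  val ageGt = new SingleColumnValueFilter(
    Bytes.toBytes("personal"), Bytes.toBytes("age"),
    CompareOp.GREATER, new BinaryComparator(Bytes.toBytes(30)))
  val nameEq = new SingleColumnValueFilter(
    Bytes.toBytes("personal"), Bytes.toBytes("name"),
    CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes("alice")))

  val filters = new FilterList(FilterList.Operator.MUST_PASS_ALL, ageGt, nameEq)
  // scan.setFilter(filters) would then apply both conditions server side.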
Example 97
Source File: HBaseStreamingBulkPutExample.scala    From SparkOnHBase   with Apache License 2.0 4 votes vote down vote up
package org.apache.hadoop.hbase.spark.example.hbasecontext

import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.spark.SparkContext
import org.apache.hadoop.hbase.{TableName, HBaseConfiguration}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Put
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.SparkConf


object HBaseStreamingBulkPutExample {
  def main(args: Array[String]) {
    if (args.length < 4) {
      println("HBaseStreamingBulkPutExample " +
        "{host} {port} {tableName} {columnFamily}")
      return
    }

    val host = args(0)
    val port = args(1)
    val tableName = args(2)
    val columnFamily = args(3)

    val sparkConf = new SparkConf().setAppName("HBaseBulkPutTimestampExample " +
      tableName + " " + columnFamily)
    val sc = new SparkContext(sparkConf)
    try {
      val ssc = new StreamingContext(sc, Seconds(1))

      val lines = ssc.socketTextStream(host, port.toInt)

      val conf = HBaseConfiguration.create()

      val hbaseContext = new HBaseContext(sc, conf)

      hbaseContext.streamBulkPut[String](lines,
        TableName.valueOf(tableName),
        (putRecord) => {
          if (putRecord.length() > 0) {
            val put = new Put(Bytes.toBytes(putRecord))
            put.addColumn(Bytes.toBytes("c"), Bytes.toBytes("foo"), Bytes.toBytes("bar"))
            put
          } else {
            null
          }
        })
      ssc.start()
      ssc.awaitTerminationOrTimeout(60000)
    } finally {
      sc.stop()
    }
  }
}