Scala Examples

Example 1
Source File: HBase.scala    From AI   with Apache License 2.0
package com.bigchange.hbase

import com.bigchange.util.HBaseUtil._
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.{Result, _}
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos
import org.apache.hadoop.hbase.util.Base64
import org.apache.spark.SparkContext

  /**
   * Looks up `row` in `table` and reports whether it is present.
   *
   * Only the "row is missing" branch is visible in this excerpt; the rest of
   * the body is cut off.
   *
   * @param row   row key to look up
   * @param table table handle the `Get` is issued against
   * @return `false` when the lookup result is empty
   */
  def existRowKey(row:String, table: Table): Boolean ={

    // NOTE(review): getBytes() without a charset uses the JVM default encoding.
    val get = new Get(row.getBytes())
    val result = table.get(get)

    if (result.isEmpty) {
      // Nothing came back for this key: log it and report the row as absent.
      warn("hbase table don't have this data,execute insert")
      return false



  /**
   * Returns the shared HBase configuration, building the default one on first use.
   *
   * The previous version only logged a warning when `hBaseConfiguration` was
   * still `null` and yielded the result of `warn` instead of a configuration.
   * Now `setDefaultConfiguration` (which assigns `hBaseConfiguration`) is invoked
   * in that case and the populated field is returned on every path.
   *
   * @return the value of `hBaseConfiguration`, initialised first if it was `null`
   */
  def getConfiguration = {
    if (hBaseConfiguration == null) {
      warn("hbase setDefaultConfiguration....")
      // Called for its side effect of assigning hBaseConfiguration.
      setDefaultConfiguration
    }
    hBaseConfiguration
  }

  /**
   * Replaces `hBaseConfiguration` with a fresh HBase configuration and fills in
   * the HDFS HA and ZooKeeper settings used for local testing.
   *
   * The excerpt ends after the last `set` call; no closing lines are added here.
   */
  def setDefaultConfiguration = {

    hBaseConfiguration = HBaseConfiguration.create

    // Options that must be set by hand for local testing; on the cluster they
    // are picked up automatically from the corresponding configuration files.
    hBaseConfiguration.set("fs.defaultFS", "hdfs://ns1") // path of the nameservice
    hBaseConfiguration.set("dfs.nameservices", "ns1")
    hBaseConfiguration.set("dfs.ha.namenodes.ns1", "nn1,nn2") // namenodes of the nameservice
    hBaseConfiguration.set("dfs.namenode.rpc-address.ns1.nn1", "server3:9000") // namenode RPC address
    hBaseConfiguration.set("dfs.namenode.rpc-address.ns1.nn2", "server4:9000") // namenode RPC address
    // Implementation class used for automatic namenode failover.
    hBaseConfiguration.set("dfs.client.failover.proxy.provider.ns1", "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider")
    hBaseConfiguration.set("hbase.rootdir", "hdfs://ns1/hbase")
    hBaseConfiguration.set("hbase.zookeeper.quorum", "server0,server1,server2")
    // The property name used to be an empty string, so the ZooKeeper client
    // port was never applied; use the real HBase property key.
    hBaseConfiguration.set("hbase.zookeeper.property.clientPort", "2181")



Example 2
Source File: HbRddWriter.scala    From hbrdd   with Apache License 2.0
package top.spoofer.hbrdd.hbsupport

import org.apache.hadoop.hbase.client.Put
import org.apache.spark.rdd.RDD
import top.spoofer.hbrdd.config.HbRddConfig
import top.spoofer.hbrdd.unit.HbRddFormatsWriter
import top.spoofer.hbrdd._
import HbRddWritPuter._

trait HbRddWriter {
  type TsValue[A] = (Long, A) // (ts, A)
  val LATEST_TIMESTAMP = Long.MaxValue
final class SingleFamilyRDDWriter[A](
    val rdd: RDD[(String, Map[String, A])],
    val put: HbRddPuter[A]
) extends HbRddWritCommon[A] with Serializable {
  def put2Hbase(tableName: String, family: String)(implicit config: HbRddConfig) = {
    val job = createJob(tableName, config.getHbaseConfig)
    rdd.flatMap({ case (rowId, data) => convert2Writable(rowId, Map(family -> data), put) })
Example 3
Source File: HBaseReaders.scala    From cuesheet   with Apache License 2.0
package com.kakao.cuesheet.convert

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

import scala.collection.JavaConversions._

trait HBaseReaders {
  val sc: SparkContext

  /**
   * String view of a table: takes the output of `hbaseTableBinary` and converts
   * row key, family, qualifier and value with `.string`, passing the timestamp
   * through as is.
   *
   * The closing lines of the method are not part of this excerpt.
   *
   * @param quorum forwarded to `hbaseTableBinary`
   * @param table  forwarded to `hbaseTableBinary`
   * @return (rowkey, ((family, qualifier), (timestamp, value))) tuples
   */
  def hbaseTable(quorum: String, table: String): RDD[(String, ((String, String), (Long, String)))] = {
    hbaseTableBinary(quorum, table).map {
      case (rowkey, ((family, qualifier), (timestamp, value))) =>
        (rowkey.string, ((family.string, qualifier.string), (timestamp, value.string)))

  /**
   * Restricts the output of `hbaseTableBinary` to a single column.
   *
   * An entry is kept only when its family and qualifier byte arrays have the
   * same elements as the ones passed in; the column coordinates are then
   * dropped from the result.
   *
   * @param family    column family bytes to match
   * @param qualifier column qualifier bytes to match
   * @return (rowkey, cell) pairs for the matching column
   */
  def hbaseColumnBinary(quorum: String, table: String, family: Array[Byte], qualifier: Array[Byte]): RDD[(Array[Byte], (Long, Array[Byte]))] = {
    hbaseTableBinary(quorum, table).collect {
      case (rowkey, ((f, q), cell)) if family.sameElements(f) && qualifier.sameElements(q) => (rowkey, cell)

  /**
   * Restricts the output of `hbaseTable` to a single column.
   *
   * An entry is kept only when its family and qualifier strings equal the ones
   * passed in; the column coordinates are then dropped from the result.
   *
   * @param family    column family name to match
   * @param qualifier column qualifier name to match
   * @return (rowkey, cell) pairs for the matching column
   */
  def hbaseColumn(quorum: String, table: String, family: String, qualifier: String): RDD[(String, (Long, String))] = {
    hbaseTable(quorum, table).collect {
      case (rowkey, ((f, q), cell)) if family == f && qualifier == q => (rowkey, cell)
Example 4
Source File: HBaseSimpleRDD.scala    From spark-hbase-connector   with Apache License 2.0
package it.nerdammer.spark.hbase

import it.nerdammer.spark.hbase.conversion.FieldReader
import org.apache.hadoop.hbase.CellUtil
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.{NewHadoopRDD, RDD}
import org.apache.spark.{Partition, TaskContext}

import scala.reflect.ClassTag

class HBaseSimpleRDD[R: ClassTag](hadoopHBase: NewHadoopRDD[ImmutableBytesWritable, Result], builder: HBaseReaderBuilder[R], saltingLength: Int = 0)
                       (implicit mapper: FieldReader[R], saltingProvider: SaltingProviderFactory[String]) extends RDD[R](hadoopHBase) {

  /** Exposes the partitions of the first parent RDD as this RDD's partitions. */
  override def getPartitions: Array[Partition] = firstParent[(ImmutableBytesWritable, Result)].partitions

  override def compute(split: Partition, context: TaskContext) = {
    // val cleanConversion = sc.clean ---> next version
    firstParent[(ImmutableBytesWritable, Result)].iterator(split, context)
      .map(e => conversion(e._1, e._2))

  def conversion(key: ImmutableBytesWritable, row: Result) = {

    val columnNames = HBaseUtils.chosenColumns(builder.columns, mapper.columns)

    val columnNamesFC = HBaseUtils.columnsWithFamily(builder.columnFamily, columnNames)

    val columns = columnNamesFC
      .map(t => (Bytes.toBytes(t._1), Bytes.toBytes(t._2)))
      .map(t => if(row.containsColumn(t._1, t._2)) Some(CellUtil.cloneValue(row.getColumnLatestCell(t._1, t._2)).array) else None)
      .toList :: columns)
Example 5
Source File: GraphX.scala    From unicorn   with Apache License 2.0
package unicorn.narwhal.graph

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.spark.SparkContext

import unicorn.bigtable.hbase.HBaseTable
import unicorn.json._
import unicorn.unibase.graph.{ReadOnlyGraph, GraphSerializer, GraphVertexColumnFamily, GraphOutEdgeColumnFamily}

  def graphx(sc: SparkContext): org.apache.spark.graphx.Graph[JsObject, (String, JsValue)] = {

    val conf = HBaseConfiguration.create()
    conf.set(TableInputFormat.INPUT_TABLE, name)
    conf.setInt(TableInputFormat.SCAN_CACHEDROWS, 500)
    conf.setBoolean(TableInputFormat.SCAN_CACHEBLOCKS, false)
    conf.set(TableInputFormat.SCAN_COLUMNS, s"$GraphVertexColumnFamily $GraphOutEdgeColumnFamily")

    val rdd = sc.newAPIHadoopRDD(

    val rows = rdd.mapPartitions { it =>
      val serializer = new GraphSerializer() { tuple =>
        val row = HBaseTable.getRow(tuple._2)

    val vertices = { vertex =>

    val edges = rows.flatMap { vertex => { edge =>
        org.apache.spark.graphx.Edge(edge.from,, (edge.label,

    org.apache.spark.graphx.Graph(vertices, edges)
Example 6
Source File: Phoenix.scala    From shc   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.hbase.types

import org.apache.phoenix.query.QueryConstants
import org.apache.phoenix.schema._
import org.apache.phoenix.schema.RowKeySchema.RowKeySchemaBuilder
import org.apache.phoenix.schema.types._
import org.apache.spark.sql.Row
import org.apache.spark.sql.execution.datasources.hbase._
import org.apache.spark.sql.types._

class Phoenix(f:Option[Field] = None) extends SHCDataType {
  private var schema: RowKeySchema = null

  def fromBytes(src: HBaseType): Any = {
    if (f.isDefined) {
    } else {
      throw new UnsupportedOperationException(
        "Phoenix coder: without field metadata, 'fromBytes' conversion can not be supported")

  /**
   * Serialises a value with the Phoenix type that matches its runtime class.
   *
   * @param input value to encode; the cases below list the accepted runtime types
   * @return bytes produced by the matching Phoenix `INSTANCE.toBytes`
   * @throws UnsupportedOperationException when `input` matches none of the cases
   */
  def toBytes(input: Any): Array[Byte] = {
    input match {
      case data: Boolean => PBoolean.INSTANCE.toBytes(data)
      case data: Byte => PTinyint.INSTANCE.toBytes(data)
      case data: Array[Byte] => PVarbinary.INSTANCE.toBytes(data)
      case data: Double => PDouble.INSTANCE.toBytes(data)
      case data: Float => PFloat.INSTANCE.toBytes(data)
      case data: Int => PInteger.INSTANCE.toBytes(data)
      case data: Long => PLong.INSTANCE.toBytes(data)
      case data: Short => PSmallint.INSTANCE.toBytes(data)
      case data: String => PVarchar.INSTANCE.toBytes(data)
      // Anything else has no Phoenix counterpart listed here.
      case _ => throw new UnsupportedOperationException(s"unsupported data type $input")

  /** Always `true`: this coder reports row-key support. */
  override def isRowKeySupported(): Boolean = true

  /** Always `true`: this coder reports composite-row-key support. */
  override def isCompositeKeySupported(): Boolean = true

  override def decodeCompositeRowKey(row: Array[Byte], keyFields: Seq[Field]): Map[Field, Any] = {
    if (schema == null) schema = buildSchema(keyFields)
    val ptr: ImmutableBytesWritable = new ImmutableBytesWritable
    val maxOffest = schema.iterator(row, 0, row.length, ptr)
    var ret = Map.empty[Field, Any]
    for (i <- 0 until schema.getFieldCount) {
      if (, i, maxOffest) != null) {
        val value = mapToPhoenixTypeInstance(keyFields(i).dt)
          .toObject(ptr, schema.getField(i).getDataType, SortOrder.getDefault)
        ret += ((keyFields(i), value))

  override def encodeCompositeRowKey(rkIdxedFields: Seq[(Int, Field)], row: Row): Seq[Array[Byte]] = { { case (x, y) =>
      var ret = toBytes(row(x))
      // the last dimension of composite key does not need SEPARATOR
      if (y.length == -1 && x < rkIdxedFields.size - 1)
        ret ++= QueryConstants.SEPARATOR_BYTE_ARRAY

  /**
   * Sets up a `RowKeySchemaBuilder` sized to `keyFields` and registers one field
   * per key, each described by `buildPDatum` and the default sort order.
   *
   * The remainder of the method (including the value it returns) is not part
   * of this excerpt.
   *
   * @param keyFields row-key fields, in key order
   */
  private def buildSchema(keyFields: Seq[Field]): RowKeySchema = {
    val builder: RowKeySchemaBuilder = new RowKeySchemaBuilder(keyFields.length)
    keyFields.foreach{ x =>
      // NOTE(review): the literal `false` is presumably the nullable flag — confirm against the Phoenix API.
      builder.addField(buildPDatum(x.dt), false, SortOrder.getDefault)

  /**
   * Maps a Spark SQL `DataType` to the Phoenix `PDataType` singleton paired
   * with it in the cases below.
   *
   * @param input Spark SQL type to translate
   * @return the `INSTANCE` of the matching Phoenix type
   * @throws UnsupportedOperationException for any type without a case below
   */
  private def mapToPhoenixTypeInstance(input: DataType): PDataType[_] = {
    input match {
      case BooleanType => PBoolean.INSTANCE
      case ByteType => PTinyint.INSTANCE
      case DoubleType => PDouble.INSTANCE
      case IntegerType => PInteger.INSTANCE
      case FloatType => PFloat.INSTANCE
      case LongType => PLong.INSTANCE
      case ShortType => PSmallint.INSTANCE
      case StringType => PVarchar.INSTANCE
      case BinaryType => PVarbinary.INSTANCE
      // Every other Spark type is rejected.
      case _ => throw new UnsupportedOperationException(s"unsupported data type $input")

  /**
   * Wraps a Spark SQL type in an anonymous Phoenix `PDatum`.
   *
   * The datum reports `null` for scale and max length, is non-nullable, uses
   * the default sort order, and resolves its data type through
   * `mapToPhoenixTypeInstance`.
   *
   * @param input Spark SQL type the datum describes
   */
  private def buildPDatum(input: DataType): PDatum = new PDatum {
    override def getScale: Integer = null
    override def isNullable: Boolean = false
    override def getDataType: PDataType[_] = mapToPhoenixTypeInstance(input)
    override def getMaxLength: Integer = null
    override def getSortOrder: SortOrder = SortOrder.getDefault
Example 7
Source File: HBaseUtils.scala    From bigdata-examples   with Apache License 2.0
package com.timeyang.common.util

import com.timeyang.common.config.BaseConf
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos
import org.apache.hadoop.hbase.util.Base64
import org.apache.hadoop.mapreduce.Job

  /**
   * Creates a Hadoop `Job` whose configuration names `tableName` as the
   * `TableOutputFormat` output table.
   *
   * The excerpt ends right after the job is instantiated.
   *
   * @param tableName table to register under `TableOutputFormat.OUTPUT_TABLE`
   */
  def createHbaseOutputJob(tableName: String): Job = {
    // Start from the project's own configuration factory.
    val conf = HBaseUtils.newConf()
    conf.set(TableOutputFormat.OUTPUT_TABLE, tableName)
    val job = Job.getInstance(conf)
