package com.cloudera.sa.hbase.to.hdfs;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.TableNotFoundException;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

public class ExportHBaseTableToDelimiteredSeq {

  public static final String SHOULD_COMPRESSION_CONF = "custom.should.compress";
  public static final String SCHEMA_FILE_LOCATION_CONF = "custom.schema.file.location";
  public static final String OUTPUT_PATH_CONF = "custom.output.path";
  public static final String DELIMITER_CONF = "custom.delimiter";
  public static final String ROW_KEY_COLUMN_CONF = "custom.rowkey.column";

  public static void main(String[] args) throws IOException,
      InterruptedException, ClassNotFoundException {
    // Six arguments are required; the seventh (rowKeyColumn) is optional.
    if (args.length < 6) {
      System.out.println("ExportHBaseTableToDelimiteredSeq {tableName} "
          + "{ColumnFamily} {outputPath} {compressionCodec} "
          + "{schemaLocationOnLocal} {delimiter} {rowKeyColumn.optional}");
      return;
    }

    String table = args[0];
    String columnFamily = args[1];
    String outputPath = args[2];
    String compressionCodec = args[3];
    String schemaFilePath = args[4];
    String delimiter = args[5];

    String rowKeyColumn = "";
    if (args.length > 6) {
      rowKeyColumn = args[6];
    }

    Job job = Job.getInstance();
    HBaseConfiguration.addHbaseResources(job.getConfiguration());

    job.getConfiguration().set(ROW_KEY_COLUMN_CONF, rowKeyColumn);
    job.getConfiguration().set(SCHEMA_FILE_LOCATION_CONF, schemaFilePath);
    job.getConfiguration().set(OUTPUT_PATH_CONF, outputPath);
    job.getConfiguration().set(DELIMITER_CONF, delimiter);

    job.setJarByClass(ExportHBaseTableToDelimiteredSeq.class);
    job.setJobName("ExportHBaseTableToDelimiteredSeq");

    Scan scan = new Scan();
    scan.setCaching(500);       // the default of 1 is bad for MapReduce jobs
    scan.setCacheBlocks(false); // don't set to true for MR jobs
    scan.addFamily(Bytes.toBytes(columnFamily));

    TableMapReduceUtil.initTableMapperJob(
        table,          // input HBase table name
        scan,           // Scan instance to control CF and attribute selection
        MyMapper.class, // mapper
        null,           // mapper output key
        null,           // mapper output value
        job);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));

    if (compressionCodec.equals("snappy")) {
      SequenceFileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);
    } else if (compressionCodec.equals("gzip")) {
      SequenceFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    } else {
      // no compression
    }

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    // Map-only job: every mapper writes its delimited lines directly.
    job.setNumReduceTasks(0);

    boolean success = job.waitForCompletion(true);
    System.exit(success ? 0 : 1);
  }

  public static class MyMapper extends TableMapper<Text, NullWritable> {

    String schemaFileLocation;
    int taskId;
    ArrayList<String> columns = new ArrayList<String>();
    HashMap<String, byte[]> columnValueMap = new HashMap<String, byte[]>();
    String delimiter;
    Text text = new Text();
    byte[] lastRowKey = null;
    String rowKeyColumn;

    @Override
    public void setup(Context context) throws TableNotFoundException,
        IOException {
      taskId = context.getTaskAttemptID().getTaskID().getId();
      schemaFileLocation = context.getConfiguration().get(SCHEMA_FILE_LOCATION_CONF);
      // Note: the schema file is opened through the job's default FileSystem
      // (typically HDFS), so it must be reachable there.
      FileSystem fs = FileSystem.get(context.getConfiguration());
      columns = generateColumnsFromSchemaFile(fs, schemaFileLocation);
      delimiter = context.getConfiguration().get(DELIMITER_CONF);
      rowKeyColumn = context.getConfiguration().get(ROW_KEY_COLUMN_CONF);
    }

    @Override
    public void cleanup(Context context) throws IOException,
        InterruptedException {
      // Flush the last buffered row, if any rows were seen at all.
      if (lastRowKey != null) {
        writeLine(context, Bytes.toString(lastRowKey));
      }
    }

    protected static ArrayList<String> generateColumnsFromSchemaFile(
        FileSystem fs, String schemaFileLocation) throws IOException {
      ArrayList<String> results = new ArrayList<String>();

      BufferedReader reader = new BufferedReader(
          new InputStreamReader(fs.open(new Path(schemaFileLocation))));
      // The schema file is a single comma-separated line of column qualifiers.
      String line = reader.readLine();
      for (String columnName : line.split(",")) {
        results.add(columnName);
      }
      reader.close();

      return results;
    }

    @Override
    public void map(ImmutableBytesWritable row, Result value, Context context)
        throws InterruptedException, IOException {
      KeyValue[] kvs = value.raw();

      if (lastRowKey == null) {
        // copyBytes() because the writable's backing buffer may be reused.
        lastRowKey = row.copyBytes();
      } else if (Bytes.compareTo(lastRowKey, 0, lastRowKey.length,
          row.get(), row.getOffset(), row.getLength()) != 0) {
        // Row key changed: emit the buffered row, then start a new one.
        writeLine(context, Bytes.toString(lastRowKey));
        columnValueMap.clear();
        lastRowKey = row.copyBytes();
      }

      for (KeyValue kv : kvs) {
        String qualifier = Bytes.toString(kv.getQualifier());
        byte[] val = kv.getValue();
        columnValueMap.put(qualifier, val);
      }
    }

    protected void writeLine(Context context, String rowKey)
        throws IOException, InterruptedException {
      StringBuilder strBuilder = new StringBuilder();
      boolean isFirst = true;
      for (String col : columns) {
        if (isFirst) {
          isFirst = false;
        } else {
          strBuilder.append(delimiter);
        }

        byte[] value = columnValueMap.get(col);
        if (value != null) {
          strBuilder.append(Bytes.toString(value));
        } else if (col.equals(rowKeyColumn)) {
          // Columns absent from the row are left empty, except the configured
          // row-key column, which is filled with the row key itself.
          strBuilder.append(rowKey);
        }
      }
      text.set(strBuilder.toString());
      context.write(text, NullWritable.get());
    }
  }
}
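
// Example invocation, for orientation only. The jar name, table name, paths,
// and column names below are hypothetical placeholders, not values defined by
// this class; substitute your own. The schema file is a one-line CSV of column
// qualifiers (e.g. "id,name,amount") and must be readable from the default
// FileSystem (see the note in setup()):
//
//   hadoop jar hbase-to-hdfs.jar \
//       com.cloudera.sa.hbase.to.hdfs.ExportHBaseTableToDelimiteredSeq \
//       myTable myFamily /user/me/out snappy /user/me/schema.csv '|' id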