package com.cloudera.sa.hbase.to.hdfs;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;

import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.Schema.Type;
import org.apache.avro.generic.GenericData.Record;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.avro.mapreduce.AvroKeyOutputFormat;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.TableNotFoundException;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapreduce.Job;

/**
 * Map-only MapReduce job that exports one column family of an HBase table to
 * Avro files on HDFS. The output schema is read from a schema file on HDFS,
 * and each field name in that schema is expected to match an HBase column
 * qualifier; one Avro record is written per HBase row.
 */
public class ExportHBaseTableToAvro {

  public static final String SHOULD_COMPRESSION_CONF = "custom.compressionCodec";
  public static final String SCHEMA_FILE_LOCATION_CONF = "custom.schema.file.location";
  public static final String OUTPUT_PATH_CONF = "custom.output.path";
  public static final String DELIMITER_CONF = "custom.delimiter";
  public static final String ROW_KEY_COLUMN_CONF = "custom.rowkey.column";

  public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    if (args.length < 5) {
      System.out.println("ExportHBaseTableToAvro {tableName} {columnFamily} {outputPath} "
          + "{compressionCodec snappy,gzip} {schemaLocationOnHdfs} {rowKeyColumn.Optional}");
      return;
    }

    String table = args[0];
    String columnFamily = args[1];
    String outputPath = args[2];
    String compressionCodec = args[3];
    String schemaFilePath = args[4];
    String rowKeyColumn = "";

    if (args.length > 5) {
      rowKeyColumn = args[5];
    }

    Job job = Job.getInstance();
    HBaseConfiguration.addHbaseResources(job.getConfiguration());

    job.setJarByClass(ExportHBaseTableToAvro.class);
    job.setJobName("ExportHBaseTableToAvro");

    // Pass the optional row-key column and the schema location down to the mapper
    job.getConfiguration().set(ROW_KEY_COLUMN_CONF, rowKeyColumn);
    job.getConfiguration().set(SCHEMA_FILE_LOCATION_CONF, schemaFilePath);

    Scan scan = new Scan();
    scan.setCaching(500);       // 1 is the default in Scan, which will be bad for MapReduce jobs
    scan.setCacheBlocks(false); // don't set to true for MR jobs
    scan.addFamily(Bytes.toBytes(columnFamily));

    TableMapReduceUtil.initTableMapperJob(
        table,          // input HBase table name
        scan,           // Scan instance to control CF and attribute selection
        MyMapper.class, // mapper
        null,           // mapper output key
        null,           // mapper output value
        job);

    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    AvroKeyOutputFormat.setOutputPath(job, new Path(outputPath));

    Schema.Parser parser = new Schema.Parser();
    FileSystem fs = FileSystem.get(job.getConfiguration());
    AvroJob.setOutputKeySchema(job, parser.parse(fs.open(new Path(schemaFilePath))));

    if (compressionCodec.equals("snappy")) {
      AvroKeyOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);
    } else if (compressionCodec.equals("gzip")) {
      AvroKeyOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    } else {
      // no compression requested; leave the output uncompressed
    }

    job.setNumReduceTasks(0);

    boolean success = job.waitForCompletion(true);
    System.exit(success ? 0 : 1);
  }

  public static class MyMapper extends TableMapper<AvroKey<GenericRecord>, NullWritable> {

    FileSystem fs;
    ArrayList<String> columns = new ArrayList<String>();
    HashMap<String, byte[]> columnValueMap = new HashMap<String, byte[]>();
    Schema schema;
    String rowKeyColumn;
    AvroKey<GenericRecord> avroKey = new AvroKey<GenericRecord>();
    byte[] lastRowKey = null;

    @Override
    public void setup(Context context) throws TableNotFoundException, IOException {
      fs = FileSystem.get(context.getConfiguration());

      String schemaFileLocation = context.getConfiguration().get(SCHEMA_FILE_LOCATION_CONF);
      columns = generateColumnsFromSchemaFile(fs, schemaFileLocation);

      // AvroJob.setOutputKeySchema stores the output key schema under this property
      Schema.Parser parser = new Schema.Parser();
      schema = parser.parse(context.getConfiguration().get("avro.schema.output.key"));

      rowKeyColumn = context.getConfiguration().get(ROW_KEY_COLUMN_CONF, "");
    }

    @Override
    public void cleanup(Context context) throws IOException, InterruptedException {
      // Flush the row that was still being accumulated when the input ended
      writeLine(context);
      columnValueMap.clear();
    }

    protected static ArrayList<String> generateColumnsFromSchemaFile(FileSystem fs, String schemaFileLocation)
        throws IOException {
      Schema.Parser parser = new Schema.Parser();
      Schema schema = parser.parse(fs.open(new Path(schemaFileLocation)));

      ArrayList<String> results = new ArrayList<String>();
      for (Field f : schema.getFields()) {
        results.add(f.name());
      }
      return results;
    }

    @Override
    public void map(ImmutableBytesWritable row, Result value, Context context)
        throws InterruptedException, IOException {

      KeyValue[] kvs = value.raw();

      // Emit the previously accumulated row when the row key changes
      if (lastRowKey == null) {
        lastRowKey = row.get();
      } else if (Bytes.compareTo(lastRowKey, row.get()) != 0) {
        writeLine(context);
        columnValueMap.clear();
        lastRowKey = row.get();
      }

      // Collect column qualifier -> value for the current row
      for (KeyValue kv : kvs) {
        String qualifier = Bytes.toString(kv.getQualifier());
        byte[] val = kv.getValue();
        columnValueMap.put(qualifier, val);
      }
    }

    protected void writeLine(Context context) throws IOException, InterruptedException {
      if (columnValueMap.size() > 0) {
        Record record = new Record(schema);

        for (String col : columns) {
          byte[] value = columnValueMap.get(col);
          if (value != null) {
            putValue(record, col, value);
          }
        }

        if (!rowKeyColumn.isEmpty()) {
          byte[] value = columnValueMap.get(rowKeyColumn);
          if (value != null) {
            putValue(record, rowKeyColumn, value);
          }
        }

        avroKey.datum(record);
        context.write(avroKey, NullWritable.get());
      }
    }

    private void putValue(Record record, String col, byte[] value) {
      Type type = schema.getField(col).schema().getType();
      if (type.equals(Type.STRING)) {
        record.put(col, Bytes.toString(value));
      } else if (type.equals(Type.INT)) {
        record.put(col, Bytes.toInt(value));
      } else if (type.equals(Type.LONG)) {
        record.put(col, Bytes.toLong(value));
      } else {
        throw new RuntimeException("Unknown datatype: " + type);
      }
    }
  }
}
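
/*
 * A minimal usage sketch (not part of the original source). The schema file on HDFS is a plain
 * Avro record schema whose field names match the HBase column qualifiers of the exported column
 * family. All concrete names below (table "user", family "cf", qualifiers "c1"/"c2", the jar name,
 * and the HDFS paths) are illustrative assumptions only.
 *
 *   $ cat user.avsc
 *   {"type": "record", "name": "UserRow", "fields": [
 *     {"name": "c1", "type": "string"},
 *     {"name": "c2", "type": "long"}
 *   ]}
 *   $ hdfs dfs -put user.avsc /schemas/user.avsc
 *   $ hadoop jar hbase-to-hdfs.jar com.cloudera.sa.hbase.to.hdfs.ExportHBaseTableToAvro \
 *       user cf /output/user-avro snappy /schemas/user.avsc
 *
 * The optional sixth argument names a column qualifier whose value should also be written into the
 * record; it is omitted here.
 */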