org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2 Java Examples

The following examples show how to use org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example #1

Source File: HFileAppender.java From tajo with Apache License 2.0

6 votes

/**
 * Initializes this appender by creating the HFile record writer and its output committer.
 *
 * <p>Builds a task-local {@code Configuration} whose output directory is the
 * {@code TajoConstants.RESULT_DIR_NAME} child of {@code stagingDir}, derives a MapReduce
 * task attempt context from the Tajo task attempt id, and asks {@code HFileOutputFormat2}
 * for a record writer bound to that context. The committer's work path is remembered in
 * {@code workingFilePath}.
 *
 * @throws IOException if initialization fails, or if the thread is interrupted while the
 *     record writer is being created (the interrupt flag is restored before rethrowing)
 */
@Override
public void init() throws IOException {
  super.init();

  Configuration taskConf = new Configuration();
  Path stagingResultDir = new Path(stagingDir, TajoConstants.RESULT_DIR_NAME);
  taskConf.set(FileOutputFormat.OUTDIR, stagingResultDir.toString());

  ExecutionBlockId ebId = taskAttemptId.getTaskId().getExecutionBlockId();
  writerContext = new TaskAttemptContextImpl(taskConf,
      new TaskAttemptID(ebId.getQueryId().toString(), ebId.getId(), TaskType.MAP,
          taskAttemptId.getTaskId().getId(), taskAttemptId.getId()));

  HFileOutputFormat2 hFileOutputFormat2 = new HFileOutputFormat2();
  try {
    writer = hFileOutputFormat2.getRecordWriter(writerContext);

    committer = new FileOutputCommitter(FileOutputFormat.getOutputPath(writerContext), writerContext);
    workingFilePath = committer.getWorkPath();
  } catch (InterruptedException e) {
    // Restore the interrupt flag so callers further up the stack can still observe the
    // interruption after it has been translated into an IOException.
    Thread.currentThread().interrupt();
    throw new IOException(e.getMessage(), e);
  }

  LOG.info("Created hbase file writer: " + workingFilePath);
}

Example #2

Source File: HfileBulkExporter.java From super-cloudops with Apache License 2.0

5 votes

/**
 * Runs a MapReduce job that scans an HBase table and writes its content as HFiles.
 * 
 * <p>
 * Options read from the command line: {@code tabname}, {@code user}, {@code zkaddr},
 * {@code T} (HBase MR tmp dir), {@code batchSize}, {@code output} and {@code mapperClass};
 * the last four fall back to the class defaults. The output directory is
 * {@code <output>/<tabname>} and must not exist yet (checked through {@code state(...)},
 * which presumably throws when the condition is false).
 * 
 * @param line
 *            parsed command line carrying the options listed above
 * @throws Exception
 *             if the configuration, the HBase connection or the job fails
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public static void doExporting(CommandLine line) throws Exception {
	// Configuration.
	String tabname = line.getOptionValue("tabname");
	String user = line.getOptionValue("user");
	Configuration conf = new Configuration();
	conf.set("hbase.zookeeper.quorum", line.getOptionValue("zkaddr"));
	conf.set("hbase.fs.tmp.dir", line.getOptionValue("T", DEFAULT_HBASE_MR_TMPDIR));
	conf.set(TableInputFormat.INPUT_TABLE, tabname);
	conf.set(TableInputFormat.SCAN_BATCHSIZE, line.getOptionValue("batchSize", DEFAULT_SCAN_BATCH_SIZE));

	// Check directory.
	String outputDir = line.getOptionValue("output", DEFAULT_HFILE_OUTPUT_DIR) + "/" + tabname;
	FileSystem fs = FileSystem.get(new URI(outputDir), new Configuration(), user);
	state(!fs.exists(new Path(outputDir)), format("HDFS temporary directory already has data, path: '%s'", outputDir));

	// Set scan condition.(if necessary)
	setScanIfNecessary(conf, line);

	// Job.
	TableName tab = TableName.valueOf(tabname);
	Job job = Job.getInstance(conf);
	job.setJobName(HfileBulkExporter.class.getSimpleName() + "@" + tab.getNameAsString());
	job.setJarByClass(HfileBulkExporter.class);
	job.setMapperClass((Class<Mapper>) ClassUtils.getClass(line.getOptionValue("mapperClass", DEFAULT_MAPPER_CLASS)));
	job.setInputFormatClass(TableInputFormat.class);
	job.setMapOutputKeyClass(ImmutableBytesWritable.class);
	job.setMapOutputValueClass(Put.class);

	// The connection is used only to hand the table and its region locator to the
	// incremental-load configuration; close it right afterwards instead of leaking it.
	try (Connection conn = ConnectionFactory.createConnection(conf)) {
		HFileOutputFormat2.configureIncrementalLoad(job, conn.getTable(tab), conn.getRegionLocator(tab));
	}
	FileOutputFormat.setOutputPath(job, new Path(outputDir));
	if (job.waitForCompletion(true)) {
		long total = job.getCounters().findCounter(DEFUALT_COUNTER_GROUP, DEFUALT_COUNTER_TOTAL).getValue();
		long processed = job.getCounters().findCounter(DEFUALT_COUNTER_GROUP, DEFUALT_COUNTER_PROCESSED).getValue();
		log.info(String.format("Exported to successfully! with processed:(%d)/total:(%d)", processed, total));
	}

}

Example #3

Source File: IndexTool.java From hgraphdb with Apache License 2.0

5 votes

/**
 * Configures the job for HFile output, runs it to completion, bulk loads the generated
 * HFiles into the output table and finally deletes the output directory.
 *
 * @param job job to configure and run
 * @param outputPath directory the HFiles are loaded from; removed recursively afterwards
 * @param outputTableName table used to configure the incremental load and to receive the HFiles
 * @param skipDependencyJars when {@code true}, the {@code "tmpjars"} entry is removed from
 *                           the job configuration before the job is submitted
 * @throws Exception if the job does not complete successfully or any HBase or
 *                   filesystem call fails
 */
private void configureRunnableJobUsingBulkLoad(Job job, Path outputPath, TableName outputTableName,
                                               boolean skipDependencyJars) throws Exception {
    final Configuration jobConf = job.getConfiguration();

    job.setMapperClass(getBulkMapperClass());
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);

    try (Connection connection = ConnectionFactory.createConnection(jobConf);
         Admin hbaseAdmin = connection.getAdmin();
         Table outputTable = connection.getTable(outputTableName);
         RegionLocator locator = connection.getRegionLocator(outputTableName)) {
        HFileOutputFormat2.configureIncrementalLoad(job, outputTable, locator);
        if (skipDependencyJars) {
            jobConf.unset("tmpjars");
        }

        // Abort before touching the table when the MapReduce job itself failed.
        if (!job.waitForCompletion(true)) {
            LOG.error("IndexTool job failed!");
            throw new Exception("IndexTool job failed: " + job);
        }

        LOG.info("Loading HFiles from {}", outputPath);
        new LoadIncrementalHFiles(jobConf).doBulkLoad(outputPath, hbaseAdmin, outputTable, locator);
    }

    // Only reached after a successful load: clean up the intermediate HFile directory.
    FileSystem.get(jobConf).delete(outputPath, true);
}

Example #4

Source File: MapReduceHFileSplitterJob.java From hbase with Apache License 2.0

5 votes

/**
 * Builds the HFile splitter job without submitting it.
 *
 * <p>The input directories and the table name are stored in the configuration, and the
 * job is configured for incremental (bulk) load into the directory found under
 * {@code BULK_OUTPUT_CONF_KEY}.
 *
 * @param args command line parameters: {@code args[0]} is the input directories string,
 *     {@code args[1]} the table name
 * @return the configured job
 * @throws IOException if no bulk output directory is configured or job setup fails
 */
public Job createSubmittableJob(String[] args) throws IOException {
  final Configuration conf = getConf();
  final String inputDirs = args[0];
  final String tabName = args[1];
  conf.setStrings(TABLES_KEY, tabName);
  conf.set(FileInputFormat.INPUT_DIR, inputDirs);

  String jobName = conf.get(JOB_NAME_CONF_KEY, NAME + "_" + EnvironmentEdgeManager.currentTime());
  Job job = Job.getInstance(conf, jobName);
  job.setJarByClass(MapReduceHFileSplitterJob.class);
  job.setInputFormatClass(HFileInputFormat.class);
  job.setMapOutputKeyClass(ImmutableBytesWritable.class);

  // Bulk output is the only supported mode; bail out when it is not configured.
  String hfileOutPath = conf.get(BULK_OUTPUT_CONF_KEY);
  if (hfileOutPath == null) {
    throw new IOException("No bulk output directory specified");
  }

  LOG.debug("add incremental job :" + hfileOutPath + " from " + inputDirs);
  TableName tableName = TableName.valueOf(tabName);
  job.setMapperClass(HFileCellMapper.class);
  job.setReducerClass(CellSortReducer.class);
  FileOutputFormat.setOutputPath(job, new Path(hfileOutPath));
  job.setMapOutputValueClass(MapReduceExtendedCell.class);

  // The connection is needed only while the incremental load is being configured.
  try (Connection conn = ConnectionFactory.createConnection(conf);
      Table table = conn.getTable(tableName);
      RegionLocator regionLocator = conn.getRegionLocator(tableName)) {
    HFileOutputFormat2.configureIncrementalLoad(job, table.getDescriptor(), regionLocator);
  }
  LOG.debug("success configuring load incremental job");

  TableMapReduceUtil.addDependencyJars(job.getConfiguration(),
    org.apache.hbase.thirdparty.com.google.common.base.Preconditions.class);
  return job;
}

Example #5

Source File: SimpleHfileToRmdbExporter.java From super-cloudops with Apache License 2.0

4 votes

/**
 * Runs a MapReduce job that scans an HBase table and writes its content as HFiles,
 * removing any previous output directory first.
 * 
 * <p>
 * Options read from the command line: {@code tabname}, {@code user}, {@code zkaddr},
 * {@code T} (HBase MR tmp dir), {@code batchSize}, {@code output} and {@code mapperClass};
 * the last four fall back to the class defaults. The output directory is
 * {@code <output>/<tabname>}; if it already exists it is deleted recursively.
 * 
 * <p>
 * NOTE(review): the job name and jar are taken from {@code HfileBulkExporter} rather than
 * from this class - confirm that this is intended.
 * 
 * @param line
 *            parsed command line carrying the options listed above
 * @throws Exception
 *             if the configuration, the HBase connection or the job fails
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public static void doRmdbExporting(CommandLine line) throws Exception {
	// Configuration.
	String tabname = line.getOptionValue("tabname");
	String user = line.getOptionValue("user");
	Configuration conf = new Configuration();
	conf.set("hbase.zookeeper.quorum", line.getOptionValue("zkaddr"));
	conf.set("hbase.fs.tmp.dir", line.getOptionValue("T", DEFAULT_HBASE_MR_TMPDIR));
	conf.set(TableInputFormat.INPUT_TABLE, tabname);
	conf.set(TableInputFormat.SCAN_BATCHSIZE, line.getOptionValue("batchSize", DEFAULT_SCAN_BATCH_SIZE));

	// Check directory.
	String outputDir = line.getOptionValue("output", DEFAULT_HFILE_OUTPUT_DIR) + "/" + tabname;
	FileSystem fs = FileSystem.get(new URI(outputDir), new Configuration(), user);
	if (fs.exists(new Path(outputDir))) {
		fs.delete(new Path(outputDir), true);
	}

	// Set scan condition.(if necessary)
	HfileBulkExporter.setScanIfNecessary(conf, line);

	// Job.
	TableName tab = TableName.valueOf(tabname);
	Job job = Job.getInstance(conf);
	job.setJobName(HfileBulkExporter.class.getSimpleName() + "@" + tab.getNameAsString());
	job.setJarByClass(HfileBulkExporter.class);
	job.setMapperClass((Class<Mapper>) ClassUtils.getClass(line.getOptionValue("mapperClass", DEFAULT_MAPPER_CLASS)));
	job.setInputFormatClass(TableInputFormat.class);
	job.setMapOutputKeyClass(ImmutableBytesWritable.class);
	job.setMapOutputValueClass(Put.class);

	// The connection is used only to hand the table and its region locator to the
	// incremental-load configuration; close it right afterwards instead of leaking it.
	try (Connection conn = ConnectionFactory.createConnection(conf)) {
		HFileOutputFormat2.configureIncrementalLoad(job, conn.getTable(tab), conn.getRegionLocator(tab));
	}
	FileOutputFormat.setOutputPath(job, new Path(outputDir));
	if (job.waitForCompletion(true)) {
		long total = job.getCounters().findCounter(DEFUALT_COUNTER_GROUP, DEFUALT_COUNTER_TOTAL).getValue();
		long processed = job.getCounters().findCounter(DEFUALT_COUNTER_GROUP, DEFUALT_COUNTER_PROCESSED).getValue();
		log.info(String.format("Exported to successfully! with processed:(%d)/total:(%d)", processed, total));
	}

}

Example #6

Source File: HalyardBulkDelete.java From Halyard with Apache License 2.0

4 votes

/**
 * Runs the bulk delete job against the table given by option {@code t} and bulk loads
 * the HFiles it produces (written to the path given by option {@code f}) back into
 * that table.
 *
 * @param cmd parsed command line; options {@code s}, {@code p}, {@code o} and {@code g}
 *            are copied into the job configuration when present
 * @return 0 when the job completes and the HFiles are loaded, -1 when the job fails
 * @throws Exception if job setup, execution or the bulk load fails
 */
@Override
public int run(CommandLine cmd) throws Exception {
    final String source = cmd.getOptionValue('t');
    TableMapReduceUtil.addDependencyJars(getConf(),
        HalyardExport.class, NTriplesUtil.class, Rio.class, AbstractRDFHandler.class,
        RDFFormat.class, RDFParser.class, HTable.class, HBaseConfiguration.class,
        AuthenticationProtos.class, Trace.class, Gauge.class);
    HBaseConfiguration.addHbaseResources(getConf());

    Job job = Job.getInstance(getConf(), "HalyardDelete " + source);

    // Optional filters are forwarded to the job through its configuration.
    if (cmd.hasOption('s')) {
        job.getConfiguration().set(SUBJECT, cmd.getOptionValue('s'));
    }
    if (cmd.hasOption('p')) {
        job.getConfiguration().set(PREDICATE, cmd.getOptionValue('p'));
    }
    if (cmd.hasOption('o')) {
        job.getConfiguration().set(OBJECT, cmd.getOptionValue('o'));
    }
    if (cmd.hasOption('g')) {
        job.getConfiguration().setStrings(CONTEXTS, cmd.getOptionValues('g'));
    }

    job.setJarByClass(HalyardBulkDelete.class);
    TableMapReduceUtil.initCredentials(job);
    TableMapReduceUtil.initTableMapperJob(source, HalyardTableUtils.scan(null, null),
        DeleteMapper.class, ImmutableBytesWritable.class, LongWritable.class, job);

    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);
    job.setSpeculativeExecution(false);
    job.setMapSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);

    try (HTable hTable = HalyardTableUtils.getTable(getConf(), source, false, 0)) {
        HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(), hTable.getRegionLocator());
        final Path hfileDir = new Path(cmd.getOptionValue('f'));
        FileOutputFormat.setOutputPath(job, hfileDir);
        TableMapReduceUtil.addDependencyJars(job);

        if (!job.waitForCompletion(true)) {
            return -1;
        }
        new LoadIncrementalHFiles(getConf()).doBulkLoad(hfileDir, hTable);
        LOG.info("Bulk Delete Completed..");
        return 0;
    }
}

Example #7

Source File: HalyardBulkLoad.java From Halyard with Apache License 2.0

4 votes

/**
 * Runs the bulk load job: reads the input given by option {@code s}, writes HFiles to
 * the work directory given by option {@code w} and bulk loads them into the table given
 * by option {@code t}.
 *
 * @param cmd parsed command line; the remaining options ({@code i}, {@code d}, {@code r},
 *            {@code b}, {@code g}, {@code o}, {@code e}, {@code m}) are copied into the
 *            configuration as shown below
 * @return 0 when the job completes and the HFiles are loaded, -1 when the job fails
 * @throws Exception if job setup, execution or the bulk load fails
 */
@Override
protected int run(CommandLine cmd) throws Exception {
    final String source = cmd.getOptionValue('s');
    final String workdir = cmd.getOptionValue('w');
    final String target = cmd.getOptionValue('t');

    // Flag-style options map directly onto boolean properties.
    getConf().setBoolean(SKIP_INVALID_PROPERTY, cmd.hasOption('i'));
    getConf().setBoolean(VERIFY_DATATYPE_VALUES_PROPERTY, cmd.hasOption('d'));
    getConf().setBoolean(TRUNCATE_PROPERTY, cmd.hasOption('r'));

    final int splitBits = Integer.parseInt(cmd.getOptionValue('b', "3"));
    getConf().setInt(SPLIT_BITS_PROPERTY, splitBits);

    if (cmd.hasOption('g')) {
        getConf().set(DEFAULT_CONTEXT_PROPERTY, cmd.getOptionValue('g'));
    }
    getConf().setBoolean(OVERRIDE_CONTEXT_PROPERTY, cmd.hasOption('o'));

    // Option 'e' falls back to the current time when absent.
    final String timestampText = cmd.getOptionValue('e', String.valueOf(System.currentTimeMillis()));
    getConf().setLong(DEFAULT_TIMESTAMP_PROPERTY, Long.parseLong(timestampText));

    if (cmd.hasOption('m')) {
        getConf().setLong("mapreduce.input.fileinputformat.split.maxsize", Long.parseLong(cmd.getOptionValue('m')));
    }

    TableMapReduceUtil.addDependencyJars(getConf(),
            NTriplesUtil.class, Rio.class, AbstractRDFHandler.class, RDFFormat.class, RDFParser.class);
    HBaseConfiguration.addHbaseResources(getConf());

    Job job = Job.getInstance(getConf(), "HalyardBulkLoad -> " + workdir + " -> " + target);
    job.setJarByClass(HalyardBulkLoad.class);
    job.setMapperClass(RDFMapper.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);
    job.setInputFormatClass(RioFileInputFormat.class);
    job.setSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);

    try (HTable hTable = HalyardTableUtils.getTable(getConf(), target, true, getConf().getInt(SPLIT_BITS_PROPERTY, 3))) {
        HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(), hTable.getRegionLocator());
        FileInputFormat.setInputDirRecursive(job, true);
        FileInputFormat.setInputPaths(job, source);
        final Path hfileDir = new Path(workdir);
        FileOutputFormat.setOutputPath(job, hfileDir);
        TableMapReduceUtil.addDependencyJars(job);
        TableMapReduceUtil.initCredentials(job);

        if (!job.waitForCompletion(true)) {
            return -1;
        }
        // Truncation, when requested, happens only after the job succeeded and before the load.
        if (getConf().getBoolean(TRUNCATE_PROPERTY, false)) {
            HalyardTableUtils.truncateTable(hTable).close();
        }
        new LoadIncrementalHFiles(getConf()).doBulkLoad(hfileDir, hTable);
        LOG.info("Bulk Load Completed..");
        return 0;
    }
}

Example #8

Source File: HalyardBulkUpdate.java From Halyard with Apache License 2.0

4 votes

/**
 * Runs the bulk update as a sequence of MapReduce stages against the table given by
 * option {@code s}, bulk loading each stage's HFiles before the next stage starts.
 *
 * <p>The number of stages starts at 1 and is raised during stage 0 to the largest number
 * of update expressions found in any query split.
 *
 * @param cmd parsed command line; {@code q} names the query files, {@code w} the work
 *            directory, {@code e} an optional timestamp and {@code i} an optional value
 *            stored under {@code ELASTIC_INDEX_URL}
 * @return 0 when every stage completes, -1 as soon as one stage's job fails
 * @throws Exception if job setup, query parsing, execution or a bulk load fails
 */
public int run(CommandLine cmd) throws Exception {
    final String source = cmd.getOptionValue('s');
    final String queryFiles = cmd.getOptionValue('q');
    final String workdir = cmd.getOptionValue('w');

    // Option 'e' falls back to the current time when absent.
    final String timestampText = cmd.getOptionValue('e', String.valueOf(System.currentTimeMillis()));
    getConf().setLong(DEFAULT_TIMESTAMP_PROPERTY, Long.parseLong(timestampText));
    if (cmd.hasOption('i')) {
        getConf().set(ELASTIC_INDEX_URL, cmd.getOptionValue('i'));
    }

    TableMapReduceUtil.addDependencyJars(getConf(),
           HalyardExport.class, NTriplesUtil.class, Rio.class, AbstractRDFHandler.class,
           RDFFormat.class, RDFParser.class, HTable.class, HBaseConfiguration.class,
           AuthenticationProtos.class, Trace.class, Gauge.class);
    HBaseConfiguration.addHbaseResources(getConf());
    getConf().setStrings(TABLE_NAME_PROPERTY, source);
    // Re-stores the timestamp currently in the configuration (current time if none is set).
    getConf().setLong(DEFAULT_TIMESTAMP_PROPERTY, getConf().getLong(DEFAULT_TIMESTAMP_PROPERTY, System.currentTimeMillis()));

    int stages = 1;
    for (int stage = 0; stage < stages; stage++) {
        Job job = Job.getInstance(getConf(), "HalyardBulkUpdate -> " + workdir + " -> " + source + " stage #" + stage);
        job.getConfiguration().setInt(STAGE_PROPERTY, stage);
        job.setJarByClass(HalyardBulkUpdate.class);
        job.setMapperClass(SPARQLUpdateMapper.class);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(KeyValue.class);
        job.setInputFormatClass(QueryInputFormat.class);
        job.setSpeculativeExecution(false);
        job.setReduceSpeculativeExecution(false);

        try (HTable hTable = HalyardTableUtils.getTable(getConf(), source, false, 0)) {
            HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(), hTable.getRegionLocator());
            QueryInputFormat.setQueriesFromDirRecursive(job.getConfiguration(), queryFiles, true, stage);
            Path stageOutput = new Path(workdir, "stage"+stage);
            FileOutputFormat.setOutputPath(job, stageOutput);
            TableMapReduceUtil.addDependencyJars(job);
            TableMapReduceUtil.initCredentials(job);

            if (stage == 0) {
                // The first pass determines how many stages the loop has to run in total.
                for (InputSplit split : new QueryInputFormat().getSplits(job)) {
                    QueryInputFormat.QueryInputSplit querySplit = (QueryInputFormat.QueryInputSplit)split;
                    int updateCount = QueryParserUtil.parseUpdate(QueryLanguage.SPARQL, querySplit.getQuery(), null).getUpdateExprs().size();
                    stages = Math.max(stages, updateCount);
                    LOG.log(Level.INFO, "{0} contains {1} stages of the update sequence.", new Object[]{querySplit.getQueryName(), updateCount});
                }
                LOG.log(Level.INFO, "Bulk Update will process {0} MapReduce stages.", stages);
            }

            if (!job.waitForCompletion(true)) {
                return -1;
            }
            new LoadIncrementalHFiles(getConf()).doBulkLoad(stageOutput, hTable);
            LOG.log(Level.INFO, "Stage #{0} of {1} completed..", new Object[]{stage, stages});
        }
    }
    LOG.info("Bulk Update Completed..");
    return 0;
}

Example #9

Source File: ConvertToHFiles.java From examples with Apache License 2.0

4 votes

/**
 * Runs a MapReduce job that converts text input into HFiles laid out for the target table.
 *
 * <p>The {@code tag::SETUP[]} / {@code end::SETUP[]} comments and the {@code <n>} callouts
 * look like documentation snippet markers and are kept exactly as they were.
 *
 * <p>NOTE(review): {@code job.setJar(...)} is given a hard-coded absolute path right after
 * {@code setJarByClass(...)} - confirm that this path is valid wherever the tool runs.
 *
 * @param args {@code args[0]} input path, {@code args[1]} HFile output path,
 *     {@code args[2]} target table name
 * @return 0 when the job completes successfully; 1 when it fails or any exception is raised
 * @throws Exception declared by the signature; exceptions raised in the body are logged
 *     and reported through the return value instead
 */
@Override
public int run(String[] args) throws Exception {
  try {
    Configuration conf = HBaseConfiguration.create();
    // try-with-resources so the HBase connection is released on every exit path.
    try (Connection connection = ConnectionFactory.createConnection(conf)) {

      String inputPath = args[0];
      String outputPath = args[1];
      final TableName tableName = TableName.valueOf(args[2]);

      // tag::SETUP[]
      Table table = connection.getTable(tableName);

      Job job = Job.getInstance(conf, "ConvertToHFiles: Convert CSV to HFiles");

      HFileOutputFormat2.configureIncrementalLoad(job, table,
                                        connection.getRegionLocator(tableName)); // <1>
      job.setInputFormatClass(TextInputFormat.class); // <2>

      job.setJarByClass(ConvertToHFiles.class); // <3>
      job.setJar("/home/cloudera/ahae/target/ahae.jar"); // <3>

      job.setMapperClass(ConvertToHFilesMapper.class); // <4>
      job.setMapOutputKeyClass(ImmutableBytesWritable.class); // <5>
      job.setMapOutputValueClass(KeyValue.class); // <6>

      FileInputFormat.setInputPaths(job, inputPath);
      HFileOutputFormat2.setOutputPath(job, new Path(outputPath));
      // end::SETUP[]

      if (!job.waitForCompletion(true)) {
        LOG.error("Failure");
      } else {
        LOG.info("Success");
        return 0;
      }
    }
  } catch (Exception e) {
    if (e instanceof InterruptedException) {
      // Keep the interruption visible to the caller even though it is reported as a failure.
      Thread.currentThread().interrupt();
    }
    // Route the failure through the logger instead of printing the stack trace to stderr.
    LOG.error("ConvertToHFiles failed", e);
  }
  return 1;
}