Java Code Examples for org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil#initTableMapperJob()
The following examples show how to use
org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil#initTableMapperJob().
You can go to the original project or source file by following the links above each example.
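Before the project-specific examples, here is a minimal sketch of the most common overload, initTableMapperJob(table, scan, mapper, outputKeyClass, outputValueClass, job). The class names and table name used here (MinimalTableMapperJob, MyTableMapper, "my_table") are placeholders for illustration only and do not come from any of the projects below.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;

public class MinimalTableMapperJob {

    // Hypothetical pass-through mapper: emits each row key and its Result unchanged.
    static class MyTableMapper extends TableMapper<ImmutableBytesWritable, Result> {
        @Override
        protected void map(ImmutableBytesWritable rowKey, Result result, Context context)
                throws IOException, InterruptedException {
            context.write(rowKey, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        Job job = Job.getInstance(conf, "minimal-table-mapper");
        job.setJarByClass(MinimalTableMapperJob.class);

        Scan scan = new Scan();
        scan.setCaching(500);        // larger caching helps full-table MapReduce scans
        scan.setCacheBlocks(false);  // don't pollute the block cache from an MR job

        // "my_table" is a placeholder table name.
        TableMapReduceUtil.initTableMapperJob(
                "my_table",                    // input HBase table
                scan,                          // Scan controlling which families/columns are read
                MyTableMapper.class,           // mapper class
                ImmutableBytesWritable.class,  // mapper output key class
                Result.class,                  // mapper output value class
                job);

        job.setNumReduceTasks(0);              // map-only job
        job.setOutputFormatClass(NullOutputFormat.class);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Most of the examples below follow this same pattern, differing mainly in how the Scan is configured, which mapper is used, and whether a reducer or a different output format is attached.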
Example 1
Source File: TransformerBaseRunner.java From BigDataArchitect with Apache License 2.0
/**
 * Create the job.
 *
 * @param conf
 * @return
 * @throws IOException
 */
protected Job initJob(Configuration conf) throws IOException {
    Job job = Job.getInstance(conf, this.jobName);
    job.setJarByClass(this.runnerClass);
    // Local run
    TableMapReduceUtil.initTableMapperJob(initScans(job), this.mapperClass,
            this.mapOutputKeyClass, this.mapOutputValueClass, job, false);
    // Cluster run: local submission and packaged (jar) submission
    // TableMapReduceUtil.initTableMapperJob(initScans(job),
    //         this.mapperClass, this.mapOutputKeyClass, this.mapOutputValueClass,
    //         job);
    job.setReducerClass(this.reducerClass);
    job.setOutputKeyClass(this.outputKeyClass);
    job.setOutputValueClass(this.outputValueClass);
    job.setOutputFormatClass(this.outputFormatClass);
    return job;
}
Example 2
Source File: TransformBaseRunner.java From BigDataPlatform with GNU General Public License v3.0
protected Job initJob(Configuration conf) throws IOException {
    Job job = Job.getInstance(conf, this.jobName);
    job.setJarByClass(this.runnerClass);
    // Local run
    // TableMapReduceUtil.initTableMapperJob(initScans(job), this.mapperClass,
    //         this.mapOutputKeyClass, this.mapOutputValueClass, job, false);
    TableMapReduceUtil.initTableMapperJob(initScans(job), this.mapperClass,
            this.mapOutputKeyClass, this.mapOutputValueClass, job, true);
    // Cluster run: local submission and packaged (jar) submission
    // TableMapReduceUtil.initTableMapperJob(initScans(job),
    //         this.mapperClass, this.mapOutputKeyClass, this.mapOutputValueClass,
    //         job);
    job.setReducerClass(this.reducerClass);
    job.setOutputKeyClass(this.outputKeyClass);
    job.setOutputValueClass(this.outputValueClass);
    job.setOutputFormatClass(this.outputFormatClass);
    return job;
}
Example 3
Source File: JobFileProcessor.java From hraven with Apache License 2.0
/**
 * @param conf to use to create and run the job
 * @param scan to be used to scan the raw table.
 * @param totalJobCount the total number of jobs that need to be run in this
 *          batch. Used in job name.
 * @return The job to be submitted to the cluster.
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
private Job getProcessingJob(Configuration conf, Scan scan, int totalJobCount)
    throws IOException {

  Configuration confClone = new Configuration(conf);

  // Turn off speculative execution.
  // Note: must be BEFORE the job construction with the new mapreduce API.
  confClone.setBoolean("mapred.map.tasks.speculative.execution", false);

  // Set up job
  Job job = new Job(confClone, getJobName(totalJobCount));

  // This is a map-only class, skip reduce step
  job.setNumReduceTasks(0);
  job.setJarByClass(JobFileProcessor.class);
  job.setOutputFormatClass(MultiTableOutputFormat.class);

  TableMapReduceUtil.initTableMapperJob(Constants.HISTORY_RAW_TABLE, scan,
      JobFileTableMapper.class, JobFileTableMapper.getOutputKeyClass(),
      JobFileTableMapper.getOutputValueClass(), job);

  return job;
}
Example 4
Source File: TableInputFormatTest.java From hgraphdb with Apache License 2.0
private void runTestOnTable() throws InterruptedException, ClassNotFoundException {
    Job job = null;
    try {
        Configuration conf = graph.configuration().toHBaseConfiguration();
        job = Job.getInstance(conf, "test123");
        job.setOutputFormatClass(NullOutputFormat.class);
        job.setNumReduceTasks(0);
        Scan scan = new Scan();
        scan.addColumn(FAMILY_NAME, COLUMN_NAME);
        scan.setTimeRange(MINSTAMP, MAXSTAMP);
        scan.setMaxVersions();
        TableMapReduceUtil.initTableMapperJob(TABLE_NAME.getNameAsString(), scan,
                ProcessTimeRangeMapper.class, Text.class, Text.class, job, true,
                TableInputFormat.class);
        job.waitForCompletion(true);
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    } finally {
        if (job != null) {
            FileUtil.fullyDelete(
                new File(job.getConfiguration().get("hadoop.tmp.dir")));
        }
    }
}
Example 5
Source File: LeastRecentlyUsedPruner.java From metron with Apache License 2.0
public static void setupHBaseJob(Job job, String sourceTable, String cf) throws IOException {
    Scan scan = new Scan();
    if (cf != null) {
        scan.addFamily(Bytes.toBytes(cf));
    }
    scan.setCaching(500);        // 1 is the default in Scan, which will be bad for MapReduce jobs
    scan.setCacheBlocks(false);  // don't set to true for MR jobs
    // set other scan attrs

    TableMapReduceUtil.initTableMapperJob(
            sourceTable,         // input table
            scan,                // Scan instance to control CF and attribute selection
            PrunerMapper.class,  // mapper class
            null,                // mapper output key
            null,                // mapper output value
            job);
    TableMapReduceUtil.initTableReducerJob(
            sourceTable,         // output table
            null,                // reducer class
            job);
}
Example 6
Source File: CellCounter.java From cloud-bigtable-examples with Apache License 2.0
/**
 * Sets up the actual job.
 *
 * @param conf The current configuration.
 * @param args The command line parameters.
 * @return The newly created job.
 * @throws IOException When setting up the job fails.
 */
public static Job createSubmittableJob(Configuration conf, String[] args)
    throws IOException {
  String tableName = args[0];
  Path outputDir = new Path(args[1]);
  String reportSeparatorString = (args.length > 2) ? args[2] : ":";
  conf.set("ReportSeparator", reportSeparatorString);
  Job job = Job.getInstance(conf, conf.get(JOB_NAME_CONF_KEY, NAME + "_" + tableName));
  job.setJarByClass(CellCounter.class);
  Scan scan = getConfiguredScanForJob(conf, args);
  TableMapReduceUtil.initTableMapperJob(tableName, scan,
      CellCounterMapper.class, ImmutableBytesWritable.class, Result.class, job);
  job.setNumReduceTasks(1);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(IntWritable.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  FileOutputFormat.setOutputPath(job, outputDir);
  job.setReducerClass(IntSumReducer.class);
  return job;
}
Example 7
Source File: IntegrationTestWithCellVisibilityLoadAndVerify.java From hbase with Apache License 2.0
private Job doVerify(Configuration conf, TableDescriptor tableDescriptor, String... auths)
    throws IOException, InterruptedException, ClassNotFoundException {
  Path outputDir = getTestDir(TEST_NAME, "verify-output");
  Job job = new Job(conf);
  job.setJarByClass(this.getClass());
  job.setJobName(TEST_NAME + " Verification for " + tableDescriptor.getTableName());
  setJobScannerConf(job);
  Scan scan = new Scan();
  scan.setAuthorizations(new Authorizations(auths));
  TableMapReduceUtil.initTableMapperJob(tableDescriptor.getTableName().getNameAsString(),
      scan, VerifyMapper.class, NullWritable.class, NullWritable.class, job);
  TableMapReduceUtil.addDependencyJarsForClasses(job.getConfiguration(), AbstractHBaseTool.class);
  int scannerCaching = conf.getInt("verify.scannercaching", SCANNER_CACHING);
  TableMapReduceUtil.setScannerCaching(job, scannerCaching);
  job.setNumReduceTasks(0);
  FileOutputFormat.setOutputPath(job, outputDir);
  assertTrue(job.waitForCompletion(true));
  return job;
}
Example 8
Source File: ActiveUserRunner.java From BigDataArchitect with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    Configuration conf = this.getConf();
    // Initialize the arguments
    this.processArgs(conf, args);

    // Create the job
    Job job = Job.getInstance(conf, "active_user");

    // Set job-related configuration parameters
    job.setJarByClass(ActiveUserRunner.class);
    // HBase input mapper parameters
    // 1. Local run
    TableMapReduceUtil.initTableMapperJob(this.initScans(job), ActiveUserMapper.class,
            StatsUserDimension.class, TimeOutputValue.class, job, false);
    // 2. Cluster run
    // TableMapReduceUtil.initTableMapperJob(null, ActiveUserMapper.class,
    //         StatsUserDimension.class, TimeOutputValue.class, job);

    // Set reducer-related parameters
    job.setReducerClass(ActiveUserReducer.class);
    job.setOutputKeyClass(StatsUserDimension.class);
    job.setOutputValueClass(MapWritableValue.class);

    // Set output-related parameters
    job.setOutputFormatClass(TransformerOutputFormat.class);

    // Start time in milliseconds
    long startTime = System.currentTimeMillis();
    try {
        return job.waitForCompletion(true) ? 0 : -1;
    } finally {
        // End time in milliseconds
        long endTime = System.currentTimeMillis();
        logger.info("Job<" + job.getJobName() + "> succeeded: " + job.isSuccessful()
                + "; start time: " + startTime + "; end time: " + endTime
                + "; elapsed: " + (endTime - startTime) + "ms");
    }
}
Example 9
Source File: NewInstallUserRunner.java From BigDataArchitect with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    Configuration conf = this.getConf();
    // Process the arguments
    this.processArgs(conf, args);

    Job job = Job.getInstance(conf, "new_install_user");

    job.setJarByClass(NewInstallUserRunner.class);
    // Local run
    TableMapReduceUtil.initTableMapperJob(
            initScans(job),
            NewInstallUserMapper.class,
            StatsUserDimension.class,
            TimeOutputValue.class,
            job,
            false);
    // Cluster run: local submission and packaged (jar) submission
    // TableMapReduceUtil.initTableMapperJob(initScans(job), NewInstallUserMapper.class,
    //         StatsUserDimension.class, TimeOutputValue.class, job);

    job.setReducerClass(NewInstallUserReducer.class);
    job.setOutputKeyClass(StatsUserDimension.class);
    job.setOutputValueClass(MapWritableValue.class);

    // job.setInputFormatClass(KeyValueTextInputFormat.class);
    job.setOutputFormatClass(TransformerOutputFormat.class);
    if (job.waitForCompletion(true)) {
        // Job succeeded; now compute the total users
        this.calculateTotalUsers(conf);
        return 0;
    } else {
        return -1;
    }
}
Example 10
Source File: ExportHBaseTableToAvro.java From HBase-ToHDFS with Apache License 2.0
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
  if (args.length == 0) {
    System.out.println("ExportHBaseTableToAvro {tableName} {ColumnFamily} {outputPath} {compressionCodec snappy,gzip} {schemaLocationOnHdfs} {rowKeyColumn.Optional}");
    return;
  }

  String table = args[0];
  String columnFamily = args[1];
  String outputPath = args[2];
  String compressionCodec = args[3];
  String schemaFilePath = args[4];
  String rowKeyColumn = "";

  if (args.length > 5) {
    rowKeyColumn = args[5];
  }

  Job job = Job.getInstance();
  HBaseConfiguration.addHbaseResources(job.getConfiguration());

  job.setJarByClass(ExportHBaseTableToAvro.class);
  job.setJobName("ExportHBaseTableToAvro ");

  job.getConfiguration().set(ROW_KEY_COLUMN_CONF, rowKeyColumn);
  job.getConfiguration().set(SCHEMA_FILE_LOCATION_CONF, schemaFilePath);

  Scan scan = new Scan();
  scan.setCaching(500);        // 1 is the default in Scan, which will be bad for MapReduce jobs
  scan.setCacheBlocks(false);  // don't set to true for MR jobs
  scan.addFamily(Bytes.toBytes(columnFamily));

  TableMapReduceUtil.initTableMapperJob(table, // input HBase table name
      scan,            // Scan instance to control CF and attribute selection
      MyMapper.class,  // mapper
      null,            // mapper output key
      null,            // mapper output value
      job);
  job.setOutputFormatClass(AvroKeyOutputFormat.class);
  AvroKeyOutputFormat.setOutputPath(job, new Path(outputPath));

  Schema.Parser parser = new Schema.Parser();

  FileSystem fs = FileSystem.get(job.getConfiguration());
  AvroJob.setOutputKeySchema(job, parser.parse(fs.open(new Path(schemaFilePath))));

  if (compressionCodec.equals("snappy")) {
    AvroKeyOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);
  } else if (compressionCodec.equals("gzip")) {
    AvroKeyOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
  } else {
    // nothing
  }

  job.setNumReduceTasks(0);

  boolean b = job.waitForCompletion(true);
}
Example 11
Source File: ImportMapReduce.java From hiped2 with Apache License 2.0
/**
 * The MapReduce driver - setup and launch the job.
 *
 * @param args the command-line arguments
 * @return the process exit code
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {

  Cli cli = Cli.builder().setArgs(args).addOptions(CliCommonOpts.OutputFileOption.values()).build();
  int result = cli.runCmd();

  if (result != 0) {
    return result;
  }

  Path output = new Path(cli.getArgValueAsString(CliCommonOpts.OutputFileOption.OUTPUT));

  Configuration conf = super.getConf();

  Scan scan = new Scan();
  scan.addColumn(HBaseWriter.STOCK_DETAILS_COLUMN_FAMILY_AS_BYTES,
      HBaseWriter.STOCK_COLUMN_QUALIFIER_AS_BYTES);

  Job job = new Job(conf);
  job.setJarByClass(ImportMapReduce.class);

  TableMapReduceUtil.initTableMapperJob(
      HBaseWriter.STOCKS_TABLE_NAME,
      scan,
      Exporter.class,
      ImmutableBytesWritable.class,
      Put.class,
      job);

  job.setNumReduceTasks(0);

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(DoubleWritable.class);

  FileOutputFormat.setOutputPath(job, output);

  if (job.waitForCompletion(true)) {
    return 0;
  }
  return 1;
}
Example 12
Source File: ExportHBaseTableToParquet.java From HBase-ToHDFS with Apache License 2.0
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
  if (args.length == 0) {
    System.out
        .println("ExportHBaseTableToParquet {tableName} {ColumnFamily} {outputPath} {compressionCodec snappy,gzip} {schemaLocationOnHdfs} {rowkey.column.optional");
    return;
  }

  String table = args[0];
  String columnFamily = args[1];
  String outputPath = args[2];
  String compressionCodec = args[3];
  String schemaFilePath = args[4];
  String rowKeyColumn = "";

  if (args.length > 5) {
    rowKeyColumn = args[5];
  }

  Job job = Job.getInstance();
  job.getConfiguration().set(ROW_KEY_COLUMN_CONF, rowKeyColumn);
  job.getConfiguration().set(SCHEMA_FILE_LOCATION_CONF, schemaFilePath);

  HBaseConfiguration.addHbaseResources(job.getConfiguration());

  job.setJarByClass(ExportHBaseTableToParquet.class);
  job.setJobName("ExportHBaseTableToParquet ");

  Scan scan = new Scan();
  scan.setCaching(500);        // 1 is the default in Scan, which will be bad for MapReduce jobs
  scan.setCacheBlocks(false);  // don't set to true for MR jobs
  scan.addFamily(Bytes.toBytes(columnFamily));

  TableMapReduceUtil.initTableMapperJob(table, // input HBase table name
      scan,            // Scan instance to control CF and attribute selection
      MyMapper.class,  // mapper
      null,            // mapper output key
      null,            // mapper output value
      job);
  job.setOutputFormatClass(AvroParquetOutputFormat.class);
  AvroParquetOutputFormat.setOutputPath(job, new Path(outputPath));

  Schema.Parser parser = new Schema.Parser();

  FileSystem fs = FileSystem.get(job.getConfiguration());
  AvroParquetOutputFormat.setSchema(job, parser.parse(fs.open(new Path(schemaFilePath))));

  if (compressionCodec.equals("snappy")) {
    AvroParquetOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);
  } else if (compressionCodec.equals("gzip")) {
    AvroParquetOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
  } else {
    // nothing
  }

  job.setNumReduceTasks(0);

  boolean b = job.waitForCompletion(true);
}
Example 13
Source File: IntegrationTestBigLinkedList.java From hbase with Apache License 2.0
public int run(Path outputDir, int numReducers) throws Exception {
  LOG.info("Running Verify with outputDir=" + outputDir + ", numReducers=" + numReducers);

  job = Job.getInstance(getConf());

  job.setJobName("Link Verifier");
  job.setNumReduceTasks(numReducers);
  job.setJarByClass(getClass());

  setJobScannerConf(job);

  Scan scan = new Scan();
  scan.addColumn(FAMILY_NAME, COLUMN_PREV);
  scan.setCaching(10000);
  scan.setCacheBlocks(false);
  if (isMultiUnevenColumnFamilies(getConf())) {
    scan.addColumn(BIG_FAMILY_NAME, BIG_FAMILY_NAME);
    scan.addColumn(TINY_FAMILY_NAME, TINY_FAMILY_NAME);
  }

  TableMapReduceUtil.initTableMapperJob(getTableName(getConf()).getName(), scan,
      VerifyMapper.class, BytesWritable.class, BytesWritable.class, job);
  TableMapReduceUtil.addDependencyJarsForClasses(job.getConfiguration(), AbstractHBaseTool.class);

  job.getConfiguration().setBoolean("mapreduce.map.speculative", false);

  job.setReducerClass(VerifyReducer.class);
  job.setOutputFormatClass(SequenceFileAsBinaryOutputFormat.class);
  job.setOutputKeyClass(BytesWritable.class);
  job.setOutputValueClass(BytesWritable.class);
  TextOutputFormat.setOutputPath(job, outputDir);

  boolean success = job.waitForCompletion(true);

  if (success) {
    Counters counters = job.getCounters();
    if (null == counters) {
      LOG.warn("Counters were null, cannot verify Job completion."
          + " This is commonly a result of insufficient YARN configuration.");
      // We don't have access to the counters to know if we have "bad" counts
      return 0;
    }

    // If we find no unexpected values, the job didn't outright fail
    if (verifyUnexpectedValues(counters)) {
      // We didn't check referenced+unreferenced counts, leave that to visual inspection
      return 0;
    }
  }

  // We failed
  return 1;
}
Example 14
Source File: ExportHBaseTableToDelimiteredSeq.java From HBase-ToHDFS with Apache License 2.0
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
  if (args.length == 0) {
    System.out
        .println("ExportHBaseTableToDelimiteredSeq {tableName} {ColumnFamily} {outputPath} {compressionCodec} {schemaLocationOnLocal} {delimiter} {rowKeyColumn.optional");
    return;
  }

  String table = args[0];
  String columnFamily = args[1];
  String outputPath = args[2];
  String compressionCodec = args[3];
  String schemaFilePath = args[4];
  String delimiter = args[5];

  String rowKeyColumn = "";
  if (args.length > 6) {
    rowKeyColumn = args[6];
  }

  Job job = Job.getInstance();
  job.getConfiguration().set(ROW_KEY_COLUMN_CONF, rowKeyColumn);
  HBaseConfiguration.addHbaseResources(job.getConfiguration());
  job.getConfiguration().set(SCHEMA_FILE_LOCATION_CONF, schemaFilePath);
  job.getConfiguration().set(OUTPUT_PATH_CONF, outputPath);
  job.getConfiguration().set(DELIMITER_CONF, delimiter);

  job.setJarByClass(ExportHBaseTableToDelimiteredSeq.class);
  job.setJobName("ExportHBaseTableToDelimiteredSeq ");

  Scan scan = new Scan();
  scan.setCaching(500);        // 1 is the default in Scan, which will be bad for MapReduce jobs
  scan.setCacheBlocks(false);  // don't set to true for MR jobs
  scan.addFamily(Bytes.toBytes(columnFamily));

  TableMapReduceUtil.initTableMapperJob(table, // input HBase table name
      scan,            // Scan instance to control CF and attribute selection
      MyMapper.class,  // mapper
      null,            // mapper output key
      null,            // mapper output value
      job);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));

  if (compressionCodec.equals("snappy")) {
    SequenceFileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);
  } else if (compressionCodec.equals("gzip")) {
    SequenceFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
  } else {
    // nothing
  }

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(NullWritable.class);

  job.setNumReduceTasks(0);

  boolean b = job.waitForCompletion(true);
}
Example 15
Source File: ScanPerformanceEvaluation.java From hbase with Apache License 2.0
public void testScanMapReduce() throws IOException, InterruptedException, ClassNotFoundException {
  Stopwatch scanOpenTimer = Stopwatch.createUnstarted();
  Stopwatch scanTimer = Stopwatch.createUnstarted();

  Scan scan = getScan();

  String jobName = "testScanMapReduce";

  Job job = new Job(conf);
  job.setJobName(jobName);

  job.setJarByClass(getClass());

  TableMapReduceUtil.initTableMapperJob(
      this.tablename,
      scan,
      MyMapper.class,
      NullWritable.class,
      NullWritable.class,
      job
  );

  job.setNumReduceTasks(0);
  job.setOutputKeyClass(NullWritable.class);
  job.setOutputValueClass(NullWritable.class);
  job.setOutputFormatClass(NullOutputFormat.class);

  scanTimer.start();
  job.waitForCompletion(true);
  scanTimer.stop();

  Counters counters = job.getCounters();
  long numRows = counters.findCounter(ScanCounter.NUM_ROWS).getValue();
  long numCells = counters.findCounter(ScanCounter.NUM_CELLS).getValue();

  long totalBytes = counters.findCounter(HBASE_COUNTER_GROUP_NAME, "BYTES_IN_RESULTS").getValue();
  double throughput = (double) totalBytes / scanTimer.elapsed(TimeUnit.SECONDS);
  double throughputRows = (double) numRows / scanTimer.elapsed(TimeUnit.SECONDS);
  double throughputCells = (double) numCells / scanTimer.elapsed(TimeUnit.SECONDS);

  System.out.println("HBase scan mapreduce: ");
  System.out.println("total time to open scanner: " + scanOpenTimer.elapsed(TimeUnit.MILLISECONDS) + " ms");
  System.out.println("total time to scan: " + scanTimer.elapsed(TimeUnit.MILLISECONDS) + " ms");

  System.out.println("total bytes: " + totalBytes + " bytes ("
      + StringUtils.humanReadableInt(totalBytes) + ")");
  System.out.println("throughput : " + StringUtils.humanReadableInt((long) throughput) + "B/s");
  System.out.println("total rows : " + numRows);
  System.out.println("throughput : " + StringUtils.humanReadableInt((long) throughputRows) + " rows/s");
  System.out.println("total cells : " + numCells);
  System.out.println("throughput : " + StringUtils.humanReadableInt((long) throughputCells) + " cells/s");
}
Example 16
Source File: MobRefReporter.java From hbase with Apache License 2.0
/**
 * Main method for the tool.
 * @return 0 if success, 1 for bad args. 2 if job aborted with an exception,
 *         3 if mr job was unsuccessful
 */
public int run(String[] args) throws IOException, InterruptedException {
  // TODO make family and table optional
  if (args.length != 3) {
    printUsage();
    return 1;
  }
  final String output = args[0];
  final String tableName = args[1];
  final String familyName = args[2];
  final long reportStartTime = EnvironmentEdgeManager.currentTime();
  Configuration conf = getConf();
  try {
    FileSystem fs = FileSystem.get(conf);
    // check whether the current user is the same one with the owner of hbase root
    String currentUserName = UserGroupInformation.getCurrentUser().getShortUserName();
    FileStatus[] hbaseRootFileStat = fs.listStatus(new Path(conf.get(HConstants.HBASE_DIR)));
    if (hbaseRootFileStat.length > 0) {
      String owner = hbaseRootFileStat[0].getOwner();
      if (!owner.equals(currentUserName)) {
        String errorMsg = "The current user[" + currentUserName
            + "] does not have hbase root credentials."
            + " If this job fails due to an inability to read HBase's internal directories, "
            + "you will need to rerun as a user with sufficient permissions. The HBase superuser "
            + "is a safe choice.";
        LOG.warn(errorMsg);
      }
    } else {
      LOG.error("The passed configs point to an HBase dir does not exist: {}",
          conf.get(HConstants.HBASE_DIR));
      throw new IOException("The target HBase does not exist");
    }

    byte[] family;
    int maxVersions;
    TableName tn = TableName.valueOf(tableName);
    try (Connection connection = ConnectionFactory.createConnection(conf);
        Admin admin = connection.getAdmin()) {
      TableDescriptor htd = admin.getDescriptor(tn);
      ColumnFamilyDescriptor hcd = htd.getColumnFamily(Bytes.toBytes(familyName));
      if (hcd == null || !hcd.isMobEnabled()) {
        throw new IOException("Column family " + familyName + " is not a MOB column family");
      }
      family = hcd.getName();
      maxVersions = hcd.getMaxVersions();
    }

    String id = getClass().getSimpleName() + UUID.randomUUID().toString().replace("-", "");
    Job job = null;
    Scan scan = new Scan();
    scan.addFamily(family);
    // Do not retrieve the mob data when scanning
    scan.setAttribute(MobConstants.MOB_SCAN_RAW, Bytes.toBytes(Boolean.TRUE));
    scan.setAttribute(MobConstants.MOB_SCAN_REF_ONLY, Bytes.toBytes(Boolean.TRUE));
    // If a scanner caching value isn't set, pick a smaller default since we know we're doing
    // a full table scan and don't want to impact other clients badly.
    scan.setCaching(conf.getInt(HConstants.HBASE_CLIENT_SCANNER_CACHING, 10000));
    scan.setCacheBlocks(false);
    scan.readVersions(maxVersions);
    conf.set(REPORT_JOB_ID, id);

    job = Job.getInstance(conf);
    job.setJarByClass(getClass());
    TableMapReduceUtil.initTableMapperJob(tn, scan, MobRefMapper.class, Text.class,
        ImmutableBytesWritable.class, job);

    job.setReducerClass(MobRefReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, new Path(output));

    job.setJobName(getClass().getSimpleName() + "-" + tn + "-" + familyName);
    // for use in the reducer. easier than re-parsing it out of the scan string.
    job.getConfiguration().set(TableInputFormat.SCAN_COLUMN_FAMILY, familyName);

    // Use when we start this job as the base point for file "recency".
    job.getConfiguration().setLong(REPORT_START_DATETIME, reportStartTime);

    if (job.waitForCompletion(true)) {
      LOG.info("Finished creating report for '{}', family='{}'", tn, familyName);
    } else {
      System.err.println("Job was not successful");
      return 3;
    }
    return 0;
  } catch (ClassNotFoundException | RuntimeException | IOException | InterruptedException e) {
    System.err.println("Job aborted due to exception " + e);
    return 2; // job failed
  }
}
Example 17
Source File: TwoLevelIndexBuilder.java From yuzhouwan with Apache License 2.0
public static void main(String[] args) throws Exception {
    String rootDir = "hdfs://hadoop1:8020/hbase";
    String zkServer = "hadoop1";
    String port = "2181";

    TwoLevelIndexBuilder conn = new TwoLevelIndexBuilder(rootDir, zkServer, port);

    Configuration conf = conn.conf;
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    // TwoLevelIndexBuilder: TableName, ColumnFamily, Qualifier
    if (otherArgs.length < 3) {
        System.exit(-1);
    }
    // Table name
    String tableName = otherArgs[0];
    // Column family
    String columnFamily = otherArgs[1];
    conf.set("tableName", tableName);
    conf.set("columnFamily", columnFamily);

    // Columns (there may be more than one)
    String[] qualifiers = new String[otherArgs.length - 2];
    System.arraycopy(otherArgs, 2, qualifiers, 0, qualifiers.length);
    // Set the columns
    conf.setStrings("qualifiers", qualifiers);

    Job job = new Job(conf, tableName);
    job.setJarByClass(TwoLevelIndexBuilder.class);
    job.setMapperClass(TowLevelIndexMapper.class);
    job.setNumReduceTasks(0);    // no reduce phase is needed
    job.setInputFormatClass(TableInputFormat.class);
    job.setOutputFormatClass(MultiTableOutputFormat.class);

    TableMapReduceUtil.initTableMapperJob(tableName, new Scan(), TowLevelIndexMapper.class,
            ImmutableBytesWritable.class, Put.class, job);
    job.waitForCompletion(true);
}
Example 18
Source File: HalyardSummary.java From Halyard with Apache License 2.0
@Override
public int run(CommandLine cmd) throws Exception {
    String source = cmd.getOptionValue('s');
    String target = cmd.getOptionValue('t');
    TableMapReduceUtil.addDependencyJars(getConf(),
            HalyardExport.class,
            Rio.class,
            AbstractRDFHandler.class,
            RDFFormat.class,
            RDFParser.class,
            HTable.class,
            HBaseConfiguration.class,
            AuthenticationProtos.class,
            Trace.class,
            Gauge.class);
    HBaseConfiguration.addHbaseResources(getConf());
    Job job = Job.getInstance(getConf(), "HalyardSummary " + source + (target == null ? " update" : " -> " + target));
    job.getConfiguration().set(SOURCE, source);
    if (target != null) job.getConfiguration().set(TARGET, target);
    if (cmd.hasOption('g')) job.getConfiguration().set(TARGET_GRAPH, cmd.getOptionValue('g'));
    if (cmd.hasOption('d')) job.getConfiguration().setInt(DECIMATION_FACTOR, Integer.parseInt(cmd.getOptionValue('d')));
    job.setJarByClass(HalyardSummary.class);
    TableMapReduceUtil.initCredentials(job);

    Scan scan = HalyardTableUtils.scan(new byte[]{HalyardTableUtils.POS_PREFIX}, new byte[]{HalyardTableUtils.POS_PREFIX + 1});

    TableMapReduceUtil.initTableMapperJob(source, scan, SummaryMapper.class,
            ImmutableBytesWritable.class, LongWritable.class, job);
    job.setNumReduceTasks(1);
    job.setCombinerClass(SummaryCombiner.class);
    job.setReducerClass(SummaryReducer.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    if (job.waitForCompletion(true)) {
        LOG.info("Summary Generation Completed..");
        return 0;
    }
    return -1;
}
Example 19
Source File: HalyardBulkDelete.java From Halyard with Apache License 2.0
@Override
public int run(CommandLine cmd) throws Exception {
    String source = cmd.getOptionValue('t');
    TableMapReduceUtil.addDependencyJars(getConf(),
            HalyardExport.class,
            NTriplesUtil.class,
            Rio.class,
            AbstractRDFHandler.class,
            RDFFormat.class,
            RDFParser.class,
            HTable.class,
            HBaseConfiguration.class,
            AuthenticationProtos.class,
            Trace.class,
            Gauge.class);
    HBaseConfiguration.addHbaseResources(getConf());
    Job job = Job.getInstance(getConf(), "HalyardDelete " + source);
    if (cmd.hasOption('s')) {
        job.getConfiguration().set(SUBJECT, cmd.getOptionValue('s'));
    }
    if (cmd.hasOption('p')) {
        job.getConfiguration().set(PREDICATE, cmd.getOptionValue('p'));
    }
    if (cmd.hasOption('o')) {
        job.getConfiguration().set(OBJECT, cmd.getOptionValue('o'));
    }
    if (cmd.hasOption('g')) {
        job.getConfiguration().setStrings(CONTEXTS, cmd.getOptionValues('g'));
    }
    job.setJarByClass(HalyardBulkDelete.class);
    TableMapReduceUtil.initCredentials(job);

    Scan scan = HalyardTableUtils.scan(null, null);

    TableMapReduceUtil.initTableMapperJob(source, scan, DeleteMapper.class,
            ImmutableBytesWritable.class, LongWritable.class, job);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);
    job.setSpeculativeExecution(false);
    job.setMapSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);
    try (HTable hTable = HalyardTableUtils.getTable(getConf(), source, false, 0)) {
        HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(), hTable.getRegionLocator());
        FileOutputFormat.setOutputPath(job, new Path(cmd.getOptionValue('f')));
        TableMapReduceUtil.addDependencyJars(job);
        if (job.waitForCompletion(true)) {
            new LoadIncrementalHFiles(getConf()).doBulkLoad(new Path(cmd.getOptionValue('f')), hTable);
            LOG.info("Bulk Delete Completed..");
            return 0;
        }
    }
    return -1;
}
Example 20
Source File: HalyardStats.java From Halyard with Apache License 2.0
@Override
public int run(CommandLine cmd) throws Exception {
    String source = cmd.getOptionValue('s');
    String target = cmd.getOptionValue('t');
    String targetGraph = cmd.getOptionValue('g');
    String graphContext = cmd.getOptionValue('c');
    String thresh = cmd.getOptionValue('r');
    TableMapReduceUtil.addDependencyJars(getConf(),
            HalyardExport.class,
            NTriplesUtil.class,
            Rio.class,
            AbstractRDFHandler.class,
            RDFFormat.class,
            RDFParser.class,
            HTable.class,
            HBaseConfiguration.class,
            AuthenticationProtos.class,
            Trace.class,
            Gauge.class);
    HBaseConfiguration.addHbaseResources(getConf());
    Job job = Job.getInstance(getConf(), "HalyardStats " + source + (target == null ? " update" : " -> " + target));
    job.getConfiguration().set(SOURCE, source);
    if (target != null) job.getConfiguration().set(TARGET, target);
    if (targetGraph != null) job.getConfiguration().set(TARGET_GRAPH, targetGraph);
    if (graphContext != null) job.getConfiguration().set(GRAPH_CONTEXT, graphContext);
    if (thresh != null) job.getConfiguration().setLong(THRESHOLD, Long.parseLong(thresh));
    job.setJarByClass(HalyardStats.class);
    TableMapReduceUtil.initCredentials(job);

    Scan scan = HalyardTableUtils.scan(null, null);
    if (graphContext != null) { // restricting stats to scan given graph context only
        List<RowRange> ranges = new ArrayList<>();
        byte[] gcHash = HalyardTableUtils.hashKey(SimpleValueFactory.getInstance().createIRI(graphContext));
        ranges.add(rowRange(HalyardTableUtils.CSPO_PREFIX, gcHash));
        ranges.add(rowRange(HalyardTableUtils.CPOS_PREFIX, gcHash));
        ranges.add(rowRange(HalyardTableUtils.COSP_PREFIX, gcHash));
        if (target == null) {
            // add stats context to the scanned row ranges (when in update mode)
            // to delete the related stats during MapReduce
            ranges.add(rowRange(HalyardTableUtils.CSPO_PREFIX,
                    HalyardTableUtils.hashKey(targetGraph == null
                            ? HALYARD.STATS_GRAPH_CONTEXT
                            : SimpleValueFactory.getInstance().createIRI(targetGraph))));
        }
        scan.setFilter(new MultiRowRangeFilter(ranges));
    }
    TableMapReduceUtil.initTableMapperJob(
            source,
            scan,
            StatsMapper.class,
            ImmutableBytesWritable.class,
            LongWritable.class,
            job);
    job.setPartitionerClass(StatsPartitioner.class);
    job.setReducerClass(StatsReducer.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    if (job.waitForCompletion(true)) {
        LOG.info("Stats Generation Completed..");
        return 0;
    }
    return -1;
}