Java Code Examples for org.apache.hadoop.mapreduce.lib.input.FileInputFormat#addInputPaths()

The following examples show how to use org.apache.hadoop.mapreduce.lib.input.FileInputFormat#addInputPaths(). These examples are extracted from open source projects; the source project, file, and license are noted above each example where available.
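addInputPaths(Job, String) takes a comma-separated list of paths and adds every entry to the job's input set, whereas addInputPath(Job, Path) adds a single Path. A minimal sketch of the difference (the paths below are hypothetical):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

Job job = Job.getInstance(new Configuration(), "example");
// one call registers both directories as inputs ...
FileInputFormat.addInputPaths(job, "/data/2015,/data/2016");
// ... and is equivalent to two single-path calls:
// FileInputFormat.addInputPath(job, new Path("/data/2015"));
// FileInputFormat.addInputPath(job, new Path("/data/2016"));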
Example 1
Source Project: BigData-In-Practice   File: LeftJoin.java   License: Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    GenericOptionsParser optionparser = new GenericOptionsParser(conf, args);
    conf = optionparser.getConfiguration();

    Job job = Job.getInstance(conf, "leftjoin");
    job.setJarByClass(LeftJoin.class);
    FileInputFormat.addInputPaths(job, conf.get("input_dir"));
    Path out = new Path(conf.get("output_dir"));
    FileOutputFormat.setOutputPath(job, out);
    job.setNumReduceTasks(conf.getInt("reduce_num", 1));

    job.setMapperClass(LeftJoinMapper.class);
    job.setReducerClass(LeftJoinReduce.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    // Job.getInstance(conf, ...) copies conf, so the separator must be set on
    // the job's own configuration; the key below is the non-deprecated name
    job.getConfiguration().set("mapreduce.output.textoutputformat.separator", ",");

    return (job.waitForCompletion(true) ? 0 : 1);
}
 
Example 2
Source Project: hadoop   File: MultiFileWordCount.java   License: Apache License 2.0
public int run(String[] args) throws Exception {

    if(args.length < 2) {
      printUsage();
      return 2;
    }

    Job job = Job.getInstance(getConf());
    job.setJobName("MultiFileWordCount");
    job.setJarByClass(MultiFileWordCount.class);

    //set the InputFormat of the job to our InputFormat
    job.setInputFormatClass(MyInputFormat.class);
    
    // the keys are words (strings)
    job.setOutputKeyClass(Text.class);
    // the values are counts (ints)
    job.setOutputValueClass(IntWritable.class);

    //use the defined mapper
    job.setMapperClass(MapClass.class);
    //use the WordCount Reducer
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : 1;
  }
 
Example 3
Source Project: big-c   File: MultiFileWordCount.java   License: Apache License 2.0
public int run(String[] args) throws Exception {

    if(args.length < 2) {
      printUsage();
      return 2;
    }

    Job job = Job.getInstance(getConf());
    job.setJobName("MultiFileWordCount");
    job.setJarByClass(MultiFileWordCount.class);

    //set the InputFormat of the job to our InputFormat
    job.setInputFormatClass(MyInputFormat.class);
    
    // the keys are words (strings)
    job.setOutputKeyClass(Text.class);
    // the values are counts (ints)
    job.setOutputValueClass(IntWritable.class);

    //use the defined mapper
    job.setMapperClass(MapClass.class);
    //use the WordCount Reducer
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : 1;
  }
 
Example 4
Source Project: dkpro-c4corpus   File: SimpleTextSearch.java   License: Apache License 2.0
@Override
public int run(String[] args)
        throws Exception
{
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    Job job = Job.getInstance();
    job.setJarByClass(SimpleTextSearch.class);

    job.setJobName(SimpleTextSearch.class.getName());

    // mapper
    job.setMapperClass(TextSearchMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // combiner + reducer
    job.setCombinerClass(TextLongCountingReducer.class);
    job.setReducerClass(TextLongCountingReducer.class);

    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = otherArgs[0];
    String outputPath = otherArgs[1];

    // regex with a phrase to be searched for
    String regex = otherArgs[2];
    job.getConfiguration().set(MAPREDUCE_MAP_REGEX, regex);

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}
 
Example 5
@Override
public int run(String[] args)
        throws Exception
{
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    Job job = Job.getInstance();
    job.setJarByClass(WordCounterExample.class);

    job.setJobName(WordCounterExample.class.getName());

    // mapper
    job.setMapperClass(WordCounterMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // combiner + reducer
    job.setCombinerClass(TextLongCountingReducer.class);
    job.setReducerClass(TextLongCountingReducer.class);

    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = otherArgs[0];
    String outputPath = otherArgs[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}
 
Example 6
/**
 * Job configurator
 *
 * @param job                      job instance
 * @param jarByClass               class of the jar
 * @param mapperClass              mapper
 * @param reducerClass             reducer
 * @param commaSeparatedInputFiles input paths
 * @param outputPath               output
 * @throws IOException I/O exception
 */
public static void configureJob(Job job, Class<?> jarByClass,
        Class<? extends Mapper> mapperClass, Class<? extends Reducer> reducerClass,
        String commaSeparatedInputFiles, String outputPath)
        throws IOException
{
    job.setJarByClass(jarByClass);
    job.setJobName(jarByClass.getName());

    // mapper
    job.setMapperClass(mapperClass);

    // reducer
    job.setReducerClass(reducerClass);

    // input-output is warc
    job.setInputFormatClass(WARCInputFormat.class);
    // prevent producing empty files
    LazyOutputFormat.setOutputFormatClass(job, WARCOutputFormat.class);

    // intermediate data
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(WARCWritable.class);

    // output data
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(WARCWritable.class);

    // set output compression to GZip
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
}
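A hedged usage sketch for the helper above (the tool, mapper, and reducer class names and the paths are hypothetical): since configureJob wires the WARC input format, lazy WARC output, and GZip compression itself, a caller supplies only the job, the jar anchor class, the mapper and reducer, and the comma-separated input string that ends up in addInputPaths().

Job job = Job.getInstance(new Configuration());
// MyWarcMapper / MyWarcReducer are hypothetical classes producing
// Text / WARCWritable intermediate pairs, as the helper expects
configureJob(job, MyTool.class, MyWarcMapper.class, MyWarcReducer.class,
        "/corpus/part1,/corpus/part2", "/corpus/out");
System.exit(job.waitForCompletion(true) ? 0 : 1);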
 
Example 7
@Override
public int run(String[] args)
        throws Exception
{
    Job job = Job.getInstance(getConf());

    job.setJarByClass(Phase3Step1ExtractNearDupInfo.class);
    job.setJobName(Phase3Step1ExtractNearDupInfo.class.getName());

    // mapper
    job.setMapperClass(MapperClass.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(DocumentInfo.class);

    // reducer
    job.setReducerClass(DeDuplicationTextOutputReducer.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(List.class);

    job.setInputFormatClass(WARCInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, DocumentInfoOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;

}
 
Example 8
@Override
public int run(String[] args)
        throws Exception
{
    Job job = Job.getInstance(getConf());
    //set from the command line

    job.setJarByClass(Phase2ExactMatchDeDuplication.class);
    job.setJobName(Phase2ExactMatchDeDuplication.class.getName());

    // mapper
    job.setMapperClass(ExactMatchDetectionMapper.class);

    // we will compress the mapper's output (use fast Snappy compressor)
    job.getConfiguration().setBoolean(Job.MAP_OUTPUT_COMPRESS, true);
    job.getConfiguration()
            .setClass(Job.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);

    // reducer
    job.setReducerClass(UniqueWarcWriterReducer.class);
    // no combiner, as the output classes in mapper and reducer are different!

    // input-output is warc
    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(WARCOutputFormat.class);

    // mapper output data
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(WARCWritable.class);

    // set output compression to GZip
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : 1;
}
 
Example 9
@Override
public int run(String[] args)
        throws Exception
{
    Job job = Job.getInstance(getConf());

    job.setJarByClass(Phase3Step3NearDupTuplesCreation.class);
    job.setJobName(Phase3Step3NearDupTuplesCreation.class.getName());

    // mapper
    job.setMapperClass(CreateTuplesMapper.class);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(TreeSet.class);

    job.setInputFormatClass(TextInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setNumReduceTasks(0); // map-only job: without this, the default identity reducer would run

    return job.waitForCompletion(true) ? 0 : 1;
}
 
Example 10
Source Project: dkpro-c4corpus   File: Phase1FullJob.java   License: Apache License 2.0
@Override
public int run(String[] args)
        throws Exception
{
    Job job = Job.getInstance(getConf());
    // set from the command line

    job.setJarByClass(Phase1FullJob.class);
    job.setJobName(Phase1FullJob.class.getName());

    // mapper
    job.setMapperClass(MapperClass.class);

    // we will compress the mapper's output (use fast Snappy compressor)
    job.getConfiguration().setBoolean(Job.MAP_OUTPUT_COMPRESS, true);
    job.getConfiguration()
            .setClass(Job.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);

    // reducer
    job.setReducerClass(SimpleWarcWriterReducer.class);

    // input-output is warc
    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(WARCOutputFormat.class);

    // mapper output data
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(WARCWritable.class);

    // set output compression to GZip
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : 1;
}
 
Example 11
@Override
public int run(String[] args)
        throws Exception
{

    Job job = Job.getInstance(getConf());
    job.setJarByClass(Phase3Step2DistinctDataJob.class);
    job.setJobName(Phase3Step2DistinctDataJob.class.getName());

    //mapper
    job.setMapperClass(RemoveRedundantDataMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(NullWritable.class);

    //reducer
    job.setReducerClass(RemoveRedundantDataReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    //paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];

    job.setInputFormatClass(TextInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    //i/o paths
    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}
 
Example 12
Source Project: dkpro-c4corpus   File: URIExtractor.java   License: Apache License 2.0
/**
 * {@inheritDoc}
 */
@Override
public int run(String[] args)
        throws Exception
{

    Job job = Job.getInstance(getConf());
    // set from the command line
    job.setJarByClass(URIExtractor.class);
    job.setJobName(URIExtractor.class.getName());

    // mapper
    job.setMapperClass(URIExtractorMapper.class);
    job.setReducerClass(URIExtractorReducer.class);

    // input-output is warc
    job.setInputFormatClass(WARCInputFormat.class);
    // necessary so that Hadoop does not mix up the map output types
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    // set output compression to GZip
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : 1;
}
 
Example 13
@Override
public int run(String[] args)
        throws Exception
{
    Job job = Job.getInstance(getConf());

    for (Map.Entry<String, String> next : job.getConfiguration()) {
        System.out.println(next.getKey() + ": " + next.getValue());
    }

    job.setJarByClass(PagesByURLExtractor.class);
    job.setJobName(PagesByURLExtractor.class.getName());

    // mapper
    job.setMapperClass(MapperClass.class);

    // input
    job.setInputFormatClass(WARCInputFormat.class);

    // output
    job.setOutputFormatClass(WARCOutputFormat.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(WARCWritable.class);
    FileOutputFormat.setCompressOutput(job, true);

    // paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];

    // load IDs to be searched for
    job.getConfiguration().set(MAPREDUCE_MAPPER_URLS, loadURLs(args[2]));

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}
 
Example 14
@Override
public int run(String[] args)
        throws Exception
{
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    Job job = Job.getInstance(conf);
    job.setJarByClass(TextToSentencesSplitter.class);

    job.setJobName(TextToSentencesSplitter.class.getName());

    // mapper
    job.setMapperClass(TextToSentencesSplitter.MapperClass.class);
    job.setInputFormatClass(WARCInputFormat.class);

    // reducer
    job.setReducerClass(ReducerClass.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = otherArgs[0];
    String outputPath = otherArgs[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}
 
Example 15
@Override
public int run(String[] args)
        throws Exception
{
    Job job = Job.getInstance(getConf());

    job.setJarByClass(ContentTypeAndSizeDistribution.class);

    job.setJobName(ContentTypeAndSizeDistribution.class.getName());

    // mapper
    job.setMapperClass(ContentAndSizeMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // reducer
    //        job.setReducerClass(DistributionReducer.class);
    job.setReducerClass(TextLongCountingReducer.class);

    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}
 
Example 16
@Override
public int run(String[] args)
        throws Exception
{
    Job job = Job.getInstance(getConf());

    job.setJarByClass(WordDistributionStatisticsCollector.class);
    job.setJobName(WordDistributionStatisticsCollector.class.getName());

    // mapper
    job.setMapperClass(getMapperClass());
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // reducer
    job.setReducerClass(SumReducer.class);
    job.setInputFormatClass(getInputFormatClass());
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}
 
Example 17
Source Project: dkpro-c4corpus   File: TopDomainCounter.java   License: Apache License 2.0
@Override
public int run(String[] args)
        throws Exception
{
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    Job job = Job.getInstance();
    job.setJarByClass(TopDomainCounter.class);

    job.setJobName(TopDomainCounter.class.getName());

    // mapper
    job.setMapperClass(DomainMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // combiner + reducer
    job.setCombinerClass(TextLongCountingReducer.class);
    job.setReducerClass(TextLongCountingReducer.class);

    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = otherArgs[0];
    String outputPath = otherArgs[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}
 
Example 18
Source Project: hadoop   File: GenericMRLoadGenerator.java   License: Apache License 2.0
/**
 * Configure a job given argv.
 */
public static boolean parseArgs(String[] argv, Job job) throws IOException {
  if (argv.length < 1) {
    return 0 == printUsage();
  }
  for(int i=0; i < argv.length; ++i) {
    if (argv.length == i + 1) {
      System.out.println("ERROR: Required parameter missing from " +
          argv[i]);
      return 0 == printUsage();
    }
    try {
      if ("-r".equals(argv[i])) {
        job.setNumReduceTasks(Integer.parseInt(argv[++i]));
      } else if ("-inFormat".equals(argv[i])) {
        job.setInputFormatClass(
            Class.forName(argv[++i]).asSubclass(InputFormat.class));
      } else if ("-outFormat".equals(argv[i])) {
        job.setOutputFormatClass(
            Class.forName(argv[++i]).asSubclass(OutputFormat.class));
      } else if ("-outKey".equals(argv[i])) {
        job.setOutputKeyClass(
          Class.forName(argv[++i]).asSubclass(WritableComparable.class));
      } else if ("-outValue".equals(argv[i])) {
        job.setOutputValueClass(
          Class.forName(argv[++i]).asSubclass(Writable.class));
      } else if ("-keepmap".equals(argv[i])) {
        job.getConfiguration().set(MAP_PRESERVE_PERCENT, argv[++i]);
      } else if ("-keepred".equals(argv[i])) {
        job.getConfiguration().set(REDUCE_PRESERVE_PERCENT, argv[++i]);
      } else if ("-outdir".equals(argv[i])) {
        FileOutputFormat.setOutputPath(job, new Path(argv[++i]));
      } else if ("-indir".equals(argv[i])) {
        FileInputFormat.addInputPaths(job, argv[++i]);
      } else if ("-inFormatIndirect".equals(argv[i])) {
        job.getConfiguration().setClass(INDIRECT_INPUT_FORMAT,
            Class.forName(argv[++i]).asSubclass(InputFormat.class),
            InputFormat.class);
        job.setInputFormatClass(IndirectInputFormat.class);
      } else {
        System.out.println("Unexpected argument: " + argv[i]);
        return 0 == printUsage();
      }
    } catch (NumberFormatException except) {
      System.out.println("ERROR: Integer expected instead of " + argv[i]);
      return 0 == printUsage();
    } catch (Exception e) {
      throw (IOException)new IOException().initCause(e);
    }
  }
  return true;
}
 
Example 19
Source Project: big-c   File: GenericMRLoadGenerator.java   License: Apache License 2.0
/**
 * Configure a job given argv.
 */
public static boolean parseArgs(String[] argv, Job job) throws IOException {
  if (argv.length < 1) {
    return 0 == printUsage();
  }
  for(int i=0; i < argv.length; ++i) {
    if (argv.length == i + 1) {
      System.out.println("ERROR: Required parameter missing from " +
          argv[i]);
      return 0 == printUsage();
    }
    try {
      if ("-r".equals(argv[i])) {
        job.setNumReduceTasks(Integer.parseInt(argv[++i]));
      } else if ("-inFormat".equals(argv[i])) {
        job.setInputFormatClass(
            Class.forName(argv[++i]).asSubclass(InputFormat.class));
      } else if ("-outFormat".equals(argv[i])) {
        job.setOutputFormatClass(
            Class.forName(argv[++i]).asSubclass(OutputFormat.class));
      } else if ("-outKey".equals(argv[i])) {
        job.setOutputKeyClass(
          Class.forName(argv[++i]).asSubclass(WritableComparable.class));
      } else if ("-outValue".equals(argv[i])) {
        job.setOutputValueClass(
          Class.forName(argv[++i]).asSubclass(Writable.class));
      } else if ("-keepmap".equals(argv[i])) {
        job.getConfiguration().set(MAP_PRESERVE_PERCENT, argv[++i]);
      } else if ("-keepred".equals(argv[i])) {
        job.getConfiguration().set(REDUCE_PRESERVE_PERCENT, argv[++i]);
      } else if ("-outdir".equals(argv[i])) {
        FileOutputFormat.setOutputPath(job, new Path(argv[++i]));
      } else if ("-indir".equals(argv[i])) {
        FileInputFormat.addInputPaths(job, argv[++i]);
      } else if ("-inFormatIndirect".equals(argv[i])) {
        job.getConfiguration().setClass(INDIRECT_INPUT_FORMAT,
            Class.forName(argv[++i]).asSubclass(InputFormat.class),
            InputFormat.class);
        job.setInputFormatClass(IndirectInputFormat.class);
      } else {
        System.out.println("Unexpected argument: " + argv[i]);
        return 0 == printUsage();
      }
    } catch (NumberFormatException except) {
      System.out.println("ERROR: Integer expected instead of " + argv[i]);
      return 0 == printUsage();
    } catch (Exception e) {
      throw (IOException)new IOException().initCause(e);
    }
  }
  return true;
}
 
Example 20
Source Project: phoenix   File: AbstractBulkLoadTool.java   License: Apache License 2.0
/**
 * Submits the jobs to the cluster.
 * Loads the HFiles onto the respective tables.
 * @throws Exception 
 */
public int submitJob(final Configuration conf, final String qualifiedTableName,
    final String inputPaths, final Path outputPath, List<TargetTableRef> tablesToBeLoaded, boolean hasLocalIndexes) throws Exception {
   
    Job job = Job.getInstance(conf, "Phoenix MapReduce import for " + qualifiedTableName);
    FileInputFormat.addInputPaths(job, inputPaths);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(PhoenixTextInputFormat.class);
    job.setMapOutputKeyClass(TableRowkeyPair.class);
    job.setMapOutputValueClass(ImmutableBytesWritable.class);
    job.setOutputKeyClass(TableRowkeyPair.class);
    job.setOutputValueClass(KeyValue.class);
    job.setReducerClass(FormatToKeyValueReducer.class);
    byte[][] splitKeysBeforeJob = null;
    try(org.apache.hadoop.hbase.client.Connection hbaseConn =
            ConnectionFactory.createConnection(job.getConfiguration())) {
        RegionLocator regionLocator = null;
        if(hasLocalIndexes) {
            try{
                regionLocator = hbaseConn.getRegionLocator(
                        TableName.valueOf(qualifiedTableName));
                splitKeysBeforeJob = regionLocator.getStartKeys();
            } finally {
                if (regionLocator != null) regionLocator.close();
            }
        }
        MultiHfileOutputFormat.configureIncrementalLoad(job, tablesToBeLoaded);

        final String tableNamesAsJson = TargetTableRefFunctions.NAMES_TO_JSON
                .apply(tablesToBeLoaded);
        final String logicalNamesAsJson = TargetTableRefFunctions.LOGICAL_NAMES_TO_JSON
                .apply(tablesToBeLoaded);

        job.getConfiguration().set(FormatToBytesWritableMapper.TABLE_NAMES_CONFKEY,
                tableNamesAsJson);
        job.getConfiguration().set(FormatToBytesWritableMapper.LOGICAL_NAMES_CONFKEY,
                logicalNamesAsJson);

        // give subclasses their hook
        setupJob(job);

        LOGGER.info("Running MapReduce import job from {} to {}", inputPaths, outputPath);
        boolean success = job.waitForCompletion(true);

        if (success) {
            if (hasLocalIndexes) {
                try {
                    regionLocator = hbaseConn.getRegionLocator(
                            TableName.valueOf(qualifiedTableName));
                    if(!IndexUtil.matchingSplitKeys(splitKeysBeforeJob,
                            regionLocator.getStartKeys())) {
                        LOGGER.error("The table " + qualifiedTableName + " has local indexes and"
                                + " there is split key mismatch before and after running"
                                + " bulkload job. Please rerun the job otherwise there may be"
                                + " inconsistencies between actual data and index data.");
                        return -1;
                    }
                } finally {
                    if (regionLocator != null) regionLocator.close();
                }
            }
            LOGGER.info("Loading HFiles from {}", outputPath);
            completebulkload(conf,outputPath,tablesToBeLoaded);
            LOGGER.info("Removing output directory {}", outputPath);
            if(!outputPath.getFileSystem(conf).delete(outputPath, true)) {
                LOGGER.error("Failed to delete the output directory {}", outputPath);
            }
            return 0;
        } else {
           return -1;
       }
   }
}