Java Code Examples for org.apache.hadoop.mapreduce.lib.input.FileInputFormat#addInputPaths()

The following examples show how to use org.apache.hadoop.mapreduce.lib.input.FileInputFormat#addInputPaths(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also check out the related API usage on the sidebar.
Example 1
Source File: LeftJoin.java    From BigData-In-Practice with Apache License 2.0 6 votes vote down vote up
/**
 * Configures and runs the "leftjoin" MapReduce job.
 *
 * <p>Generic Hadoop options in {@code args} are parsed first. The job then takes its
 * comma-separated input paths from the {@code input_dir} property, its output directory
 * from {@code output_dir} and its reducer count from {@code reduce_num} (default 1).
 *
 * @param args command-line arguments, including generic Hadoop options
 * @return 0 when the job completes successfully, 1 otherwise
 * @throws Exception if option parsing, job setup or job execution fails
 */
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    GenericOptionsParser optionparser = new GenericOptionsParser(conf, args);
    conf = optionparser.getConfiguration();

    Job job = Job.getInstance(conf, "leftjoin");
    job.setJarByClass(LeftJoin.class);
    FileInputFormat.addInputPaths(job, conf.get("input_dir"));
    Path out = new Path(conf.get("output_dir"));
    FileOutputFormat.setOutputPath(job, out);
    job.setNumReduceTasks(conf.getInt("reduce_num", 1));

    job.setMapperClass(LeftJoinMapper.class);
    job.setReducerClass(LeftJoinReduce.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    // Job.getInstance copies the configuration it is given, so a value set on
    // `conf` at this point would never reach the job. Set the separator on the
    // job's own configuration instead.
    job.getConfiguration().set("mapred.textoutputformat.separator", ",");

    return (job.waitForCompletion(true) ? 0 : 1);
}
 
Example 2
Source File: Phase3Step3NearDupTuplesCreation.java    From dkpro-c4corpus with Apache License 2.0 5 votes vote down vote up
/**
 * Sets up and launches the map-only job that reads text input with
 * {@code CreateTuplesMapper} and writes text output lazily.
 *
 * @param args {@code args[0]} comma-separated input paths, {@code args[1]} output path
 * @return 0 if the job succeeded, 1 otherwise
 * @throws Exception if job setup or execution fails
 */
@Override
public int run(String[] args)
        throws Exception
{
    final Job tuplesJob = Job.getInstance(getConf());

    tuplesJob.setJobName(Phase3Step3NearDupTuplesCreation.class.getName());
    tuplesJob.setJarByClass(Phase3Step3NearDupTuplesCreation.class);

    // map side
    tuplesJob.setMapperClass(CreateTuplesMapper.class);
    tuplesJob.setMapOutputValueClass(TreeSet.class);
    tuplesJob.setMapOutputKeyClass(NullWritable.class);

    // formats
    LazyOutputFormat.setOutputFormatClass(tuplesJob, TextOutputFormat.class);
    tuplesJob.setInputFormatClass(TextInputFormat.class);

    // locations come straight from the positional arguments
    final String inputs = args[0];
    final String destination = args[1];

    FileInputFormat.addInputPaths(tuplesJob, inputs);
    FileOutputFormat.setOutputPath(tuplesJob, new Path(destination));

    // zero reducers (original author's note: must be added or the mapper wont be called)
    tuplesJob.setNumReduceTasks(0);

    if (tuplesJob.waitForCompletion(true)) {
        return 0;
    }
    return 1;
}
 
Example 3
Source File: TopDomainCounter.java    From dkpro-c4corpus with Apache License 2.0 5 votes vote down vote up
/**
 * Configures and runs the top-domain counting job over WARC input.
 *
 * @param args generic Hadoop options followed by the positional arguments:
 *             comma-separated input paths, then the output path
 * @return 0 if the job succeeded, 1 otherwise
 * @throws Exception if option parsing, job setup or job execution fails
 */
@Override
public int run(String[] args)
        throws Exception
{
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    // Build the job from the parsed configuration; a bare Job.getInstance()
    // would start from a fresh Configuration and drop the generic options.
    Job job = Job.getInstance(conf);
    job.setJarByClass(TopDomainCounter.class);

    job.setJobName(TopDomainCounter.class.getName());

    // mapper
    job.setMapperClass(DomainMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // combiner + reducer
    job.setCombinerClass(TextLongCountingReducer.class);
    job.setReducerClass(TextLongCountingReducer.class);

    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = otherArgs[0];
    String outputPath = otherArgs[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}
 
Example 4
Source File: WordDistributionStatisticsCollector.java    From dkpro-c4corpus with Apache License 2.0 5 votes vote down vote up
/**
 * Builds and runs the word-distribution statistics job. The mapper and the
 * input format are obtained from {@code getMapperClass()} and
 * {@code getInputFormatClass()}; counts are reduced with {@code SumReducer}.
 *
 * @param args {@code args[0]} comma-separated input paths, {@code args[1]} output path
 * @return 0 if the job succeeded, 1 otherwise
 * @throws Exception if job setup or execution fails
 */
@Override
public int run(String[] args)
        throws Exception
{
    final Job statsJob = Job.getInstance(getConf());

    statsJob.setJobName(WordDistributionStatisticsCollector.class.getName());
    statsJob.setJarByClass(WordDistributionStatisticsCollector.class);

    // map side and emitted key/value types
    statsJob.setMapperClass(getMapperClass());
    statsJob.setOutputValueClass(IntWritable.class);
    statsJob.setOutputKeyClass(Text.class);

    // reduce side and formats
    statsJob.setReducerClass(SumReducer.class);
    statsJob.setInputFormatClass(getInputFormatClass());
    statsJob.setOutputFormatClass(TextOutputFormat.class);

    // locations
    final String inputs = args[0];
    final String destination = args[1];
    FileInputFormat.addInputPaths(statsJob, inputs);
    FileOutputFormat.setOutputPath(statsJob, new Path(destination));

    final boolean succeeded = statsJob.waitForCompletion(true);
    return succeeded ? 0 : 1;
}
 
Example 5
Source File: ContentTypeAndSizeDistribution.java    From dkpro-c4corpus with Apache License 2.0 5 votes vote down vote up
/**
 * Builds and runs the content-type and size distribution job over WARC input,
 * mapping with {@code ContentAndSizeMapper} and reducing with
 * {@code TextLongCountingReducer}.
 *
 * @param args {@code args[0]} comma-separated input paths, {@code args[1]} output path
 * @return 0 if the job succeeded, 1 otherwise
 * @throws Exception if job setup or execution fails
 */
@Override public int run(String[] args)
        throws Exception
{
    final Job distributionJob = Job.getInstance(getConf());

    distributionJob.setJobName(ContentTypeAndSizeDistribution.class.getName());
    distributionJob.setJarByClass(ContentTypeAndSizeDistribution.class);

    // map side and emitted key/value types
    distributionJob.setMapperClass(ContentAndSizeMapper.class);
    distributionJob.setOutputValueClass(LongWritable.class);
    distributionJob.setOutputKeyClass(Text.class);

    // reduce side
    //        job.setReducerClass(DistributionReducer.class);
    distributionJob.setReducerClass(TextLongCountingReducer.class);

    // formats
    distributionJob.setOutputFormatClass(TextOutputFormat.class);
    distributionJob.setInputFormatClass(WARCInputFormat.class);

    // locations
    final String inputs = args[0];
    final String destination = args[1];
    FileInputFormat.addInputPaths(distributionJob, inputs);
    FileOutputFormat.setOutputPath(distributionJob, new Path(destination));

    if (distributionJob.waitForCompletion(true)) {
        return 0;
    }
    return 1;
}
 
Example 6
Source File: TextToSentencesSplitter.java    From dkpro-c4corpus with Apache License 2.0 5 votes vote down vote up
/**
 * Builds and runs the sentence-splitting job: WARC input is mapped by
 * {@code TextToSentencesSplitter.MapperClass}, reduced by {@code ReducerClass}
 * and written as text.
 *
 * @param args generic Hadoop options followed by the positional arguments:
 *             comma-separated input paths, then the output path
 * @return 0 if the job succeeded, 1 otherwise
 * @throws Exception if option parsing, job setup or job execution fails
 */
@Override
public int run(String[] args)
        throws Exception
{
    final Configuration configuration = getConf();
    final String[] positional =
            new GenericOptionsParser(configuration, args).getRemainingArgs();

    final Job splitterJob = Job.getInstance(configuration);
    splitterJob.setJarByClass(TextToSentencesSplitter.class);
    splitterJob.setJobName(TextToSentencesSplitter.class.getName());

    // map side
    splitterJob.setInputFormatClass(WARCInputFormat.class);
    splitterJob.setMapperClass(TextToSentencesSplitter.MapperClass.class);

    // reduce side and final output
    splitterJob.setReducerClass(ReducerClass.class);
    splitterJob.setOutputFormatClass(TextOutputFormat.class);
    splitterJob.setOutputKeyClass(NullWritable.class);
    splitterJob.setOutputValueClass(Text.class);

    // locations
    final String inputs = positional[0];
    final String destination = positional[1];
    FileInputFormat.addInputPaths(splitterJob, inputs);
    FileOutputFormat.setOutputPath(splitterJob, new Path(destination));

    final boolean succeeded = splitterJob.waitForCompletion(true);
    return succeeded ? 0 : 1;
}
 
Example 7
Source File: PagesByURLExtractor.java    From dkpro-c4corpus with Apache License 2.0 5 votes vote down vote up
/**
 * Builds and runs the job that extracts pages by URL from WARC input and
 * writes compressed WARC output.
 *
 * <p>Before anything is configured, every entry of the job configuration is
 * printed to standard output as {@code key: value}.
 *
 * @param args {@code args[0]} comma-separated input paths, {@code args[1]} output path,
 *             {@code args[2]} argument handed to {@code loadURLs}
 * @return 0 if the job succeeded, 1 otherwise
 * @throws Exception if job setup, URL loading or job execution fails
 */
@Override
public int run(String[] args)
        throws Exception
{
    final Job extractorJob = Job.getInstance(getConf());

    // dump the configuration the job starts from
    for (Map.Entry<String, String> property : extractorJob.getConfiguration()) {
        String line = property.getKey() + ": " + property.getValue();
        System.out.println(line);
    }

    extractorJob.setJobName(PagesByURLExtractor.class.getName());
    extractorJob.setJarByClass(PagesByURLExtractor.class);

    // map side, WARC in
    extractorJob.setMapperClass(MapperClass.class);
    extractorJob.setInputFormatClass(WARCInputFormat.class);

    // WARC out, compressed
    extractorJob.setOutputFormatClass(WARCOutputFormat.class);
    extractorJob.setOutputValueClass(WARCWritable.class);
    extractorJob.setOutputKeyClass(NullWritable.class);
    FileOutputFormat.setCompressOutput(extractorJob, true);

    // locations
    final String inputs = args[0];
    final String destination = args[1];

    // URLs the mapper should look for
    final String urls = loadURLs(args[2]);
    extractorJob.getConfiguration().set(MAPREDUCE_MAPPER_URLS, urls);

    FileInputFormat.addInputPaths(extractorJob, inputs);
    FileOutputFormat.setOutputPath(extractorJob, new Path(destination));

    if (extractorJob.waitForCompletion(true)) {
        return 0;
    }
    return 1;
}
 
Example 8
Source File: URIExtractor.java    From dkpro-c4corpus with Apache License 2.0 5 votes vote down vote up
/**
 * {@inheritDoc}
 *
 * <p>Builds and runs the URI extraction job: WARC input, {@code Text} keys with
 * {@code NullWritable} values as output, compressed with {@code GzipCodec}.
 *
 * @param args {@code args[0]} comma-separated input paths, {@code args[1]} output path
 * @return 0 if the job succeeded, 1 otherwise
 * @throws Exception if job setup or execution fails
 */
@Override
public int run(String[] args)
        throws Exception
{
    final Job uriJob = Job.getInstance(getConf());

    uriJob.setJobName(URIExtractor.class.getName());
    uriJob.setJarByClass(URIExtractor.class);

    // map and reduce implementations
    uriJob.setReducerClass(URIExtractorReducer.class);
    uriJob.setMapperClass(URIExtractorMapper.class);

    // WARC input
    uriJob.setInputFormatClass(WARCInputFormat.class);
    // original author's note: necessary so that Hadoop does not mix the map input format up
    uriJob.setOutputValueClass(NullWritable.class);
    uriJob.setOutputKeyClass(Text.class);

    // gzip-compressed output
    FileOutputFormat.setOutputCompressorClass(uriJob, GzipCodec.class);
    FileOutputFormat.setCompressOutput(uriJob, true);

    // locations
    FileInputFormat.addInputPaths(uriJob, args[0]);
    FileOutputFormat.setOutputPath(uriJob, new Path(args[1]));

    final boolean succeeded = uriJob.waitForCompletion(true);
    return succeeded ? 0 : 1;
}
 
Example 9
Source File: Phase3Step2DistinctDataJob.java    From dkpro-c4corpus with Apache License 2.0 5 votes vote down vote up
/**
 * Builds and runs the distinct-data job: text input is mapped by
 * {@code RemoveRedundantDataMapper} and reduced by
 * {@code RemoveRedundantDataReducer}; text output is created through
 * {@code LazyOutputFormat}.
 *
 * @param args {@code args[0]} comma-separated input paths, {@code args[1]} output path
 * @return 0 if the job succeeded, 1 otherwise
 * @throws Exception if job setup or execution fails
 */
@Override
public int run(String[] args)
        throws Exception
{
    final Job distinctJob = Job.getInstance(getConf());
    distinctJob.setJobName(Phase3Step2DistinctDataJob.class.getName());
    distinctJob.setJarByClass(Phase3Step2DistinctDataJob.class);

    // map side
    distinctJob.setMapperClass(RemoveRedundantDataMapper.class);
    distinctJob.setMapOutputValueClass(NullWritable.class);
    distinctJob.setMapOutputKeyClass(Text.class);

    // reduce side
    distinctJob.setReducerClass(RemoveRedundantDataReducer.class);
    distinctJob.setOutputValueClass(NullWritable.class);
    distinctJob.setOutputKeyClass(Text.class);

    // positional arguments
    final String inputs = args[0];
    final String destination = args[1];

    // formats
    LazyOutputFormat.setOutputFormatClass(distinctJob, TextOutputFormat.class);
    distinctJob.setInputFormatClass(TextInputFormat.class);

    // locations
    FileInputFormat.addInputPaths(distinctJob, inputs);
    FileOutputFormat.setOutputPath(distinctJob, new Path(destination));

    if (distinctJob.waitForCompletion(true)) {
        return 0;
    }
    return 1;
}
 
Example 10
Source File: Phase1FullJob.java    From dkpro-c4corpus with Apache License 2.0 5 votes vote down vote up
/**
 * Builds and runs the phase 1 job: WARC in, WARC out, with Snappy-compressed
 * map output and gzip-compressed final output.
 *
 * @param args {@code args[0]} comma-separated input paths, {@code args[1]} output path
 * @return 0 if the job succeeded, 1 otherwise
 * @throws Exception if job setup or execution fails
 */
@Override
public int run(String[] args)
        throws Exception
{
    // the configuration is whatever getConf() supplies
    final Job phaseJob = Job.getInstance(getConf());

    phaseJob.setJobName(Phase1FullJob.class.getName());
    phaseJob.setJarByClass(Phase1FullJob.class);

    // map side
    phaseJob.setMapperClass(MapperClass.class);

    // compress what the mappers emit, using the Snappy codec
    phaseJob.getConfiguration().setBoolean(Job.MAP_OUTPUT_COMPRESS, true);
    phaseJob.getConfiguration().setClass(
            Job.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);

    // reduce side
    phaseJob.setReducerClass(SimpleWarcWriterReducer.class);

    // WARC on both ends
    phaseJob.setOutputFormatClass(WARCOutputFormat.class);
    phaseJob.setInputFormatClass(WARCInputFormat.class);

    // intermediate key/value types
    phaseJob.setMapOutputValueClass(WARCWritable.class);
    phaseJob.setMapOutputKeyClass(IntWritable.class);

    // gzip-compressed final output
    FileOutputFormat.setOutputCompressorClass(phaseJob, GzipCodec.class);
    FileOutputFormat.setCompressOutput(phaseJob, true);

    // locations
    FileInputFormat.addInputPaths(phaseJob, args[0]);
    FileOutputFormat.setOutputPath(phaseJob, new Path(args[1]));

    final boolean succeeded = phaseJob.waitForCompletion(true);
    return succeeded ? 0 : 1;
}
 
Example 11
Source File: Phase2ExactMatchDeDuplication.java    From dkpro-c4corpus with Apache License 2.0 5 votes vote down vote up
/**
 * Builds and runs the exact-match de-duplication job: WARC in, WARC out, with
 * Snappy-compressed map output and gzip-compressed final output.
 *
 * @param args {@code args[0]} comma-separated input paths, {@code args[1]} output path
 * @return 0 if the job succeeded, 1 otherwise
 * @throws Exception if job setup or execution fails
 */
@Override
public int run(String[] args)
        throws Exception
{
    // the configuration is whatever getConf() supplies
    final Job dedupJob = Job.getInstance(getConf());

    dedupJob.setJobName(Phase2ExactMatchDeDuplication.class.getName());
    dedupJob.setJarByClass(Phase2ExactMatchDeDuplication.class);

    // map side
    dedupJob.setMapperClass(ExactMatchDetectionMapper.class);

    // compress what the mappers emit, using the Snappy codec
    dedupJob.getConfiguration().setBoolean(Job.MAP_OUTPUT_COMPRESS, true);
    dedupJob.getConfiguration().setClass(
            Job.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);

    // reduce side; no combiner is set because mapper and reducer output classes differ
    dedupJob.setReducerClass(UniqueWarcWriterReducer.class);

    // WARC on both ends
    dedupJob.setOutputFormatClass(WARCOutputFormat.class);
    dedupJob.setInputFormatClass(WARCInputFormat.class);

    // intermediate key/value types
    dedupJob.setMapOutputValueClass(WARCWritable.class);
    dedupJob.setMapOutputKeyClass(Text.class);

    // gzip-compressed final output
    FileOutputFormat.setOutputCompressorClass(dedupJob, GzipCodec.class);
    FileOutputFormat.setCompressOutput(dedupJob, true);

    // locations
    FileInputFormat.addInputPaths(dedupJob, args[0]);
    FileOutputFormat.setOutputPath(dedupJob, new Path(args[1]));

    if (dedupJob.waitForCompletion(true)) {
        return 0;
    }
    return 1;
}
 
Example 12
Source File: Phase3Step1ExtractNearDupInfo.java    From dkpro-c4corpus with Apache License 2.0 5 votes vote down vote up
/**
 * Builds and runs the job that extracts near-duplicate information from WARC
 * input, mapping with {@code MapperClass} and reducing with
 * {@code DeDuplicationTextOutputReducer}; output is created through
 * {@code LazyOutputFormat} using {@code DocumentInfoOutputFormat}.
 *
 * @param args {@code args[0]} comma-separated input paths, {@code args[1]} output path
 * @return 0 if the job succeeded, 1 otherwise
 * @throws Exception if job setup or execution fails
 */
@Override
public int run(String[] args)
        throws Exception
{
    final Job nearDupJob = Job.getInstance(getConf());

    nearDupJob.setJobName(Phase3Step1ExtractNearDupInfo.class.getName());
    nearDupJob.setJarByClass(Phase3Step1ExtractNearDupInfo.class);

    // map side
    nearDupJob.setMapperClass(MapperClass.class);
    nearDupJob.setMapOutputValueClass(DocumentInfo.class);
    nearDupJob.setMapOutputKeyClass(Text.class);

    // reduce side
    nearDupJob.setReducerClass(DeDuplicationTextOutputReducer.class);
    nearDupJob.setOutputValueClass(List.class);
    nearDupJob.setOutputKeyClass(NullWritable.class);

    // formats
    LazyOutputFormat.setOutputFormatClass(nearDupJob, DocumentInfoOutputFormat.class);
    nearDupJob.setInputFormatClass(WARCInputFormat.class);

    // locations
    final String inputs = args[0];
    final String destination = args[1];
    FileInputFormat.addInputPaths(nearDupJob, inputs);
    FileOutputFormat.setOutputPath(nearDupJob, new Path(destination));

    final boolean succeeded = nearDupJob.waitForCompletion(true);
    return succeeded ? 0 : 1;
}
 
Example 13
Source File: ConfigurationHelper.java    From dkpro-c4corpus with Apache License 2.0 5 votes vote down vote up
/**
 * Applies the shared WARC-in / WARC-out settings to a job: jar and job name,
 * mapper and reducer, key/value types, gzip output compression and the
 * input/output locations.
 *
 * @param job                      job to configure
 * @param jarByClass               class used to locate the jar; its name also becomes the job name
 * @param mapperClass              mapper implementation
 * @param reducerClass             reducer implementation
 * @param commaSeparatedInputFiles comma-separated input paths
 * @param outputPath               output directory
 * @throws IOException if setting the input or output paths fails
 */
public static void configureJob(Job job, Class<?> jarByClass,
        Class<? extends Mapper> mapperClass, Class<? extends Reducer> reducerClass,
        String commaSeparatedInputFiles, String outputPath)
        throws IOException
{
    job.setJobName(jarByClass.getName());
    job.setJarByClass(jarByClass);

    // processing classes
    job.setReducerClass(reducerClass);
    job.setMapperClass(mapperClass);

    // WARC input; the lazy output wrapper prevents producing empty files
    job.setInputFormatClass(WARCInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, WARCOutputFormat.class);

    // intermediate key/value types
    job.setMapOutputValueClass(WARCWritable.class);
    job.setMapOutputKeyClass(Text.class);

    // final key/value types
    job.setOutputValueClass(WARCWritable.class);
    job.setOutputKeyClass(NullWritable.class);

    // gzip-compressed output
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    FileOutputFormat.setCompressOutput(job, true);

    // locations
    final Path destination = new Path(outputPath);
    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, destination);
}
 
Example 14
Source File: WordCounterExample.java    From dkpro-c4corpus with Apache License 2.0 5 votes vote down vote up
/**
 * Configures and runs the word-counting example job over WARC input.
 *
 * @param args generic Hadoop options followed by the positional arguments:
 *             comma-separated input paths, then the output path
 * @return 0 if the job succeeded, 1 otherwise
 * @throws Exception if option parsing, job setup or job execution fails
 */
@Override
public int run(String[] args)
        throws Exception
{
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    // Build the job from the parsed configuration; a bare Job.getInstance()
    // would start from a fresh Configuration and drop the generic options.
    Job job = Job.getInstance(conf);
    job.setJarByClass(WordCounterExample.class);

    job.setJobName(WordCounterExample.class.getName());

    // mapper
    job.setMapperClass(WordCounterMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // combiner + reducer
    job.setCombinerClass(TextLongCountingReducer.class);
    job.setReducerClass(TextLongCountingReducer.class);

    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = otherArgs[0];
    String outputPath = otherArgs[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}
 
Example 15
Source File: SimpleTextSearch.java    From dkpro-c4corpus with Apache License 2.0 5 votes vote down vote up
/**
 * Configures and runs the simple text search job over WARC input.
 *
 * @param args generic Hadoop options followed by the positional arguments:
 *             comma-separated input paths, the output path, and the regex
 *             stored under {@code MAPREDUCE_MAP_REGEX}
 * @return 0 if the job succeeded, 1 otherwise
 * @throws Exception if option parsing, job setup or job execution fails
 */
@Override
public int run(String[] args)
        throws Exception
{
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    // Build the job from the parsed configuration; a bare Job.getInstance()
    // would start from a fresh Configuration and drop the generic options.
    Job job = Job.getInstance(conf);
    job.setJarByClass(SimpleTextSearch.class);

    job.setJobName(SimpleTextSearch.class.getName());

    // mapper
    job.setMapperClass(TextSearchMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // combiner + reducer
    job.setCombinerClass(TextLongCountingReducer.class);
    job.setReducerClass(TextLongCountingReducer.class);

    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = otherArgs[0];
    String outputPath = otherArgs[1];

    // regex with a phrase to be searched for
    String regex = otherArgs[2];
    job.getConfiguration().set(MAPREDUCE_MAP_REGEX, regex);

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}
 
Example 16
Source File: MultiFileWordCount.java    From big-c with Apache License 2.0 5 votes vote down vote up
/**
 * Builds and runs the "MultiFileWordCount" job.
 *
 * @param args {@code args[0]} comma-separated input paths, {@code args[1]} output path
 * @return 2 after printing usage when fewer than two arguments are given;
 *         otherwise 0 if the job succeeded and 1 if it did not
 * @throws Exception if job setup or execution fails
 */
public int run(String[] args) throws Exception {

    final boolean enoughArguments = args.length >= 2;
    if (!enoughArguments) {
      printUsage();
      return 2;
    }

    final Job wordCount = Job.getInstance(getConf());
    wordCount.setJarByClass(MultiFileWordCount.class);
    wordCount.setJobName("MultiFileWordCount");

    // custom input format supplied alongside this job
    wordCount.setInputFormatClass(MyInputFormat.class);

    // words (Text) are the keys, counts (IntWritable) the values
    wordCount.setOutputValueClass(IntWritable.class);
    wordCount.setOutputKeyClass(Text.class);

    // mapper, plus the same summing class as both combiner and reducer
    wordCount.setMapperClass(MapClass.class);
    wordCount.setReducerClass(IntSumReducer.class);
    wordCount.setCombinerClass(IntSumReducer.class);

    // locations
    FileInputFormat.addInputPaths(wordCount, args[0]);
    FileOutputFormat.setOutputPath(wordCount, new Path(args[1]));

    if (wordCount.waitForCompletion(true)) {
      return 0;
    }
    return 1;
  }
 
Example 17
Source File: MultiFileWordCount.java    From hadoop with Apache License 2.0 5 votes vote down vote up
/**
 * Configures the "MultiFileWordCount" job and waits for it to finish.
 *
 * @param args {@code args[0]} comma-separated input paths, {@code args[1]} output path
 * @return 2 (after printing usage) if fewer than two arguments are supplied,
 *         0 on job success, 1 on job failure
 * @throws Exception if job setup or execution fails
 */
public int run(String[] args) throws Exception {

    if (2 > args.length) {
      printUsage();
      return 2;
    }

    final Job countingJob = Job.getInstance(getConf());
    countingJob.setJarByClass(MultiFileWordCount.class);
    countingJob.setJobName("MultiFileWordCount");

    // processing classes: mapper, then the summing reducer (also used as combiner)
    countingJob.setMapperClass(MapClass.class);
    countingJob.setCombinerClass(IntSumReducer.class);
    countingJob.setReducerClass(IntSumReducer.class);

    // input is read through the job's own input format
    countingJob.setInputFormatClass(MyInputFormat.class);

    // output: word (Text) -> count (IntWritable)
    countingJob.setOutputKeyClass(Text.class);
    countingJob.setOutputValueClass(IntWritable.class);

    // locations
    final Path destination = new Path(args[1]);
    FileInputFormat.addInputPaths(countingJob, args[0]);
    FileOutputFormat.setOutputPath(countingJob, destination);

    final boolean succeeded = countingJob.waitForCompletion(true);
    return succeeded ? 0 : 1;
  }
 
Example 18
Source File: GenericMRLoadGenerator.java    From big-c with Apache License 2.0 4 votes vote down vote up
/**
 * Configure a job given argv.
 *
 * <p>Arguments are consumed as {@code (flag, value)} pairs. On an empty
 * argument list, a flag without a value, an unknown flag or a non-integer
 * {@code -r} value, usage is printed and the result is
 * {@code 0 == printUsage()}.
 *
 * @param argv command-line arguments
 * @param job  job that receives the parsed settings
 * @return {@code true} when every pair was applied
 * @throws IOException wrapping any other exception raised while applying an option
 */
public static boolean parseArgs(String[] argv, Job job) throws IOException {
  if (argv.length < 1) {
    return 0 == printUsage();
  }
  // every option carries exactly one value, so walk the array two at a time
  for (int pos = 0; pos < argv.length; pos += 2) {
    final String flag = argv[pos];
    if (pos + 1 == argv.length) {
      System.out.println("ERROR: Required parameter missing from " +
          flag);
      return 0 == printUsage();
    }
    final String value = argv[pos + 1];
    try {
      if ("-r".equals(flag)) {
        job.setNumReduceTasks(Integer.parseInt(value));
      } else if ("-inFormat".equals(flag)) {
        job.setInputFormatClass(
            Class.forName(value).asSubclass(InputFormat.class));
      } else if ("-outFormat".equals(flag)) {
        job.setOutputFormatClass(
            Class.forName(value).asSubclass(OutputFormat.class));
      } else if ("-outKey".equals(flag)) {
        job.setOutputKeyClass(
          Class.forName(value).asSubclass(WritableComparable.class));
      } else if ("-outValue".equals(flag)) {
        job.setOutputValueClass(
          Class.forName(value).asSubclass(Writable.class));
      } else if ("-keepmap".equals(flag)) {
        job.getConfiguration().set(MAP_PRESERVE_PERCENT, value);
      } else if ("-keepred".equals(flag)) {
        job.getConfiguration().set(REDUCE_PRESERVE_PERCENT, value);
      } else if ("-outdir".equals(flag)) {
        FileOutputFormat.setOutputPath(job, new Path(value));
      } else if ("-indir".equals(flag)) {
        FileInputFormat.addInputPaths(job, value);
      } else if ("-inFormatIndirect".equals(flag)) {
        // remember the real input format, then route input through the indirect one
        job.getConfiguration().setClass(INDIRECT_INPUT_FORMAT,
            Class.forName(value).asSubclass(InputFormat.class),
            InputFormat.class);
        job.setInputFormatClass(IndirectInputFormat.class);
      } else {
        System.out.println("Unexpected argument: " + flag);
        return 0 == printUsage();
      }
    } catch (NumberFormatException except) {
      System.out.println("ERROR: Integer expected instead of " + value);
      return 0 == printUsage();
    } catch (Exception e) {
      throw (IOException)new IOException().initCause(e);
    }
  }
  return true;
}
 
Example 19
Source File: AbstractBulkLoadTool.java    From phoenix with Apache License 2.0 4 votes vote down vote up
/**
 * Submits the jobs to the cluster.
 * Loads the HFiles onto the respective tables.
 *
 * <p>When {@code hasLocalIndexes} is set, the region start keys of the table are
 * captured before the job runs and compared afterwards; a mismatch aborts the
 * load with an error.
 *
 * @param conf               configuration used to create the job, to complete the bulk load
 *                           and to resolve the file system of {@code outputPath}
 * @param qualifiedTableName table name used in the job name and for region lookups
 * @param inputPaths         comma-separated input paths of the job
 * @param outputPath         job output directory; deleted after a successful load
 * @param tablesToBeLoaded   target tables passed to the output format and the bulk load
 * @param hasLocalIndexes    whether split keys must be checked before and after the job
 * @return 0 when the job succeeded and the HFiles were loaded; -1 when the job failed
 *         or the split keys changed while it was running
 * @throws Exception if job setup, job execution or the bulk load fails
 */
public int submitJob(final Configuration conf, final String qualifiedTableName,
    final String inputPaths, final Path outputPath, List<TargetTableRef> tablesToBeLoaded, boolean hasLocalIndexes) throws Exception {

    Job job = Job.getInstance(conf, "Phoenix MapReduce import for " + qualifiedTableName);
    FileInputFormat.addInputPaths(job, inputPaths);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(PhoenixTextInputFormat.class);
    job.setMapOutputKeyClass(TableRowkeyPair.class);
    job.setMapOutputValueClass(ImmutableBytesWritable.class);
    job.setOutputKeyClass(TableRowkeyPair.class);
    job.setOutputValueClass(KeyValue.class);
    job.setReducerClass(FormatToKeyValueReducer.class);
    byte[][] splitKeysBeforeJob = null;
    try(org.apache.hadoop.hbase.client.Connection hbaseConn =
            ConnectionFactory.createConnection(job.getConfiguration())) {
        if(hasLocalIndexes) {
            // Each locator gets its own try-with-resources scope: it is closed
            // exactly once, and a failure in close() cannot hide the original error.
            try (RegionLocator regionLocator = hbaseConn.getRegionLocator(
                    TableName.valueOf(qualifiedTableName))) {
                splitKeysBeforeJob = regionLocator.getStartKeys();
            }
        }
        MultiHfileOutputFormat.configureIncrementalLoad(job, tablesToBeLoaded);

        final String tableNamesAsJson = TargetTableRefFunctions.NAMES_TO_JSON
                .apply(tablesToBeLoaded);
        final String logicalNamesAsJson = TargetTableRefFunctions.LOGICAL_NAMES_TO_JSON
                .apply(tablesToBeLoaded);

        job.getConfiguration().set(FormatToBytesWritableMapper.TABLE_NAMES_CONFKEY,
                tableNamesAsJson);
        job.getConfiguration().set(FormatToBytesWritableMapper.LOGICAL_NAMES_CONFKEY,
                logicalNamesAsJson);

        // give subclasses their hook
        setupJob(job);

        LOGGER.info("Running MapReduce import job from {} to {}", inputPaths, outputPath);
        boolean success = job.waitForCompletion(true);

        if (success) {
            if (hasLocalIndexes) {
                // compare the current start keys with the snapshot taken before the job
                try (RegionLocator regionLocator = hbaseConn.getRegionLocator(
                        TableName.valueOf(qualifiedTableName))) {
                    if(!IndexUtil.matchingSplitKeys(splitKeysBeforeJob,
                            regionLocator.getStartKeys())) {
                        LOGGER.error("The table " + qualifiedTableName + " has local indexes and"
                                + " there is split key mismatch before and after running"
                                + " bulkload job. Please rerun the job otherwise there may be"
                                + " inconsistencies between actual data and index data.");
                        return -1;
                    }
                }
            }
            LOGGER.info("Loading HFiles from {}", outputPath);
            completebulkload(conf,outputPath,tablesToBeLoaded);
            LOGGER.info("Removing output directory {}", outputPath);
            if(!outputPath.getFileSystem(conf).delete(outputPath, true)) {
                LOGGER.error("Failed to delete the output directory {}", outputPath);
            }
            return 0;
        } else {
            return -1;
        }
    }
}
 
Example 20
Source File: GenericMRLoadGenerator.java    From hadoop with Apache License 2.0 4 votes vote down vote up
/**
 * Configure a job given argv.
 *
 * <p>The arguments are read as consecutive {@code (option, argument)} pairs.
 * If the list is empty, an option has no argument, an option is unknown or the
 * {@code -r} argument is not an integer, usage is printed and
 * {@code 0 == printUsage()} is returned.
 *
 * @param argv command-line arguments
 * @param job  job to which the parsed settings are applied
 * @return {@code true} once all pairs have been applied
 * @throws IOException wrapping any other exception thrown while applying an option
 */
public static boolean parseArgs(String[] argv, Job job) throws IOException {
  if (argv.length < 1) {
    return 0 == printUsage();
  }
  // options always come with one argument, hence the stride of two
  for (int at = 0; at < argv.length; at += 2) {
    final String option = argv[at];
    final boolean argumentMissing = (at + 1 == argv.length);
    if (argumentMissing) {
      System.out.println("ERROR: Required parameter missing from " +
          option);
      return 0 == printUsage();
    }
    final String argument = argv[at + 1];
    try {
      if ("-r".equals(option)) {
        job.setNumReduceTasks(Integer.parseInt(argument));
      } else if ("-inFormat".equals(option)) {
        job.setInputFormatClass(
            Class.forName(argument).asSubclass(InputFormat.class));
      } else if ("-outFormat".equals(option)) {
        job.setOutputFormatClass(
            Class.forName(argument).asSubclass(OutputFormat.class));
      } else if ("-outKey".equals(option)) {
        job.setOutputKeyClass(
          Class.forName(argument).asSubclass(WritableComparable.class));
      } else if ("-outValue".equals(option)) {
        job.setOutputValueClass(
          Class.forName(argument).asSubclass(Writable.class));
      } else if ("-keepmap".equals(option)) {
        job.getConfiguration().set(MAP_PRESERVE_PERCENT, argument);
      } else if ("-keepred".equals(option)) {
        job.getConfiguration().set(REDUCE_PRESERVE_PERCENT, argument);
      } else if ("-outdir".equals(option)) {
        FileOutputFormat.setOutputPath(job, new Path(argument));
      } else if ("-indir".equals(option)) {
        FileInputFormat.addInputPaths(job, argument);
      } else if ("-inFormatIndirect".equals(option)) {
        // store the named input format, then install the indirect one on the job
        job.getConfiguration().setClass(INDIRECT_INPUT_FORMAT,
            Class.forName(argument).asSubclass(InputFormat.class),
            InputFormat.class);
        job.setInputFormatClass(IndirectInputFormat.class);
      } else {
        System.out.println("Unexpected argument: " + option);
        return 0 == printUsage();
      }
    } catch (NumberFormatException except) {
      System.out.println("ERROR: Integer expected instead of " + argument);
      return 0 == printUsage();
    } catch (Exception e) {
      throw (IOException)new IOException().initCause(e);
    }
  }
  return true;
}