Java Code Examples for org.apache.hadoop.mapreduce.Job#setOutputFormatClass()

The following examples show how to use org.apache.hadoop.mapreduce.Job#setOutputFormatClass(). These examples are extracted from open source projects; each example notes its original project and source file where available.
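Before the project-specific examples, here is a minimal sketch of the call pattern that every example varies: choose an OutputFormat with setOutputFormatClass(), set output key/value classes the format can write, point FileOutputFormat at an output directory, and run the job. The mapper class name MySeqFileMapper is a hypothetical placeholder, not taken from any of the projects below.

// Minimal, map-only sketch of the common pattern (MySeqFileMapper is a placeholder).
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "setOutputFormatClass sketch");
job.setJarByClass(MySeqFileMapper.class);

job.setMapperClass(MySeqFileMapper.class);
job.setNumReduceTasks(0);                       // map-only job

job.setOutputKeyClass(Text.class);              // must match what the mapper emits
job.setOutputValueClass(IntWritable.class);

job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);  // write binary key/value pairs

FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));

System.exit(job.waitForCompletion(true) ? 0 : 1);

Swapping SequenceFileOutputFormat here for TextOutputFormat, NullOutputFormat, or a project-specific custom format is the main variation across the examples that follow.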
Example 1
public final static void main(final String[] args) throws Exception {
	final Configuration conf = new Configuration();

	final Job job = new Job(conf, "P2Q1");
	job.setJarByClass(P2Q1.class);

	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(IntWritable.class);

	job.setMapperClass(P2Q1Map.class);
	job.setCombinerClass(P2Q1Reduce.class);
	job.setReducerClass(P2Q1Reduce.class);

	job.setInputFormatClass(TextInputFormat.class);
	job.setOutputFormatClass(TextOutputFormat.class);

	FileInputFormat.addInputPath(job, new Path(args[0]));
	FileOutputFormat.setOutputPath(job, new Path(args[1]));

	job.waitForCompletion(true);
}
 
Example 2
Source Project: recsys-offline   File: Step31.java   License: Apache License 2.0
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {  
    Configuration conf1 = new Configuration();  

    Job job1 = new Job(conf1, "wiki  job three1");  
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);  
    job1.setInputFormatClass(SequenceFileInputFormat.class);  
    job1.setNumReduceTasks(1);  
    job1.setJarByClass(Step31.class);  
    job1.setMapperClass(WikiMapper31.class);  
    job1.setMapOutputKeyClass(IntWritable.class);  
    job1.setMapOutputValueClass(VectorOrPrefWritable.class);  
      
    // a reducer is set only so that SequenceFileOutputFormat can be used
    job1.setReducerClass(WiKiReducer31.class);  
    job1.setOutputKeyClass(IntWritable.class);  
    job1.setOutputValueClass(VectorOrPrefWritable.class);  
      
    // this job's input is the output of the previous MapReduce step (MR2)
    SequenceFileInputFormat.addInputPath(job1, new Path(INPUT_PATH));  
    SequenceFileOutputFormat.setOutputPath(job1, new Path(OUTPUT_PATH));     
    if(!job1.waitForCompletion(true)){  
        System.exit(1); // exit on job failure
    }  
}
 
Example 3
Source Project: flink-perf   File: KMeansDriver.java   License: Apache License 2.0
public static void initializeCenters (Configuration conf, FileSystem fs, String pointsPath, String seqFilePath) throws Exception {
	Path points = new Path (pointsPath);
	Path seqFile = new Path (seqFilePath);
	if (fs.exists(seqFile)) {
		fs.delete(seqFile, true);
	}
	Job job = Job.getInstance(conf);
	job.setMapperClass(CenterInitializer.class);
	job.setReducerClass(Reducer.class);
	job.setNumReduceTasks(0);
	job.setMapOutputKeyClass(Centroid.class);
	job.setMapOutputValueClass(Point.class);
	job.setOutputKeyClass(Centroid.class);
	job.setOutputValueClass(Point.class);
	job.setOutputFormatClass(SequenceFileOutputFormat.class);
	job.setInputFormatClass(TextInputFormat.class);
	FileInputFormat.addInputPath(job, new Path(pointsPath));
	FileOutputFormat.setOutputPath(job, seqFile);
	job.waitForCompletion(true);
}
 
Example 4
Source Project: aegisthus   File: Distcp.java   License: Apache License 2.0
protected Job initializeJob() throws IOException {
	Job job = new Job(getConf());
	job.setJarByClass(Distcp.class);

	job.setInputFormatClass(TextInputFormat.class);
	job.setOutputFormatClass(CleanOutputFormat.class);
	job.setMapOutputKeyClass(LongWritable.class);
	job.setMapOutputValueClass(Text.class);
	job.setMapperClass(Map.class);
	job.setReducerClass(Reduce.class);
	job.setPartitionerClass(Partition.class);
	StorageHelper sh = new StorageHelper(job.getConfiguration());
	LOG.info(String.format("temp location for job: %s", sh.getBaseTempLocation()));

	return job;
}
 
Example 5
Source Project: laser   File: Compute.java   License: Apache License 2.0
public static int run(Path model, Configuration baseConf) throws IOException, ClassNotFoundException,
		InterruptedException {
	Configuration conf = new Configuration(baseConf);
	conf.set("com.b5m.laser.msgpack.input.method", "ad_feature");
	conf.set("com.b5m.laser.msgpack.output.method", "precompute_ad_offline_model");
	conf.set("com.b5m.laser.offline.model", model.toString());
	Job job = Job.getInstance(conf);
	job.setJarByClass(Compute.class);
	job.setJobName("per compute stable part from offline model for each user");
	job.setInputFormatClass(MsgpackInputFormat.class);
	job.setOutputFormatClass(MsgpackOutputFormat.class);
	
	job.setOutputKeyClass(Long.class);
	job.setOutputValueClass(Result.class);
	
	job.setMapperClass(Mapper.class);
	job.setNumReduceTasks(0);
	
	boolean succeeded = job.waitForCompletion(true);
	if (!succeeded) {
		throw new IllegalStateException("Job failed!");
	}

	return 0;
}
 
Example 6
Source Project: hraven   File: JobFileProcessor.java   License: Apache License 2.0
/**
 * @param conf to use to create and run the job
 * @param scan to be used to scan the raw table.
 * @param totalJobCount the total number of jobs that need to be run in this
 *          batch. Used in job name.
 * @return The job to be submitted to the cluster.
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
private Job getProcessingJob(Configuration conf, Scan scan, int totalJobCount)
    throws IOException {

  Configuration confClone = new Configuration(conf);

  // Turn off speculative execution.
  // Note: must be BEFORE the job construction with the new mapreduce API.
  confClone.setBoolean("mapred.map.tasks.speculative.execution", false);

  // Set up job
  Job job = new Job(confClone, getJobName(totalJobCount));

  // This is a map-only class, skip reduce step
  job.setNumReduceTasks(0);
  job.setJarByClass(JobFileProcessor.class);
  job.setOutputFormatClass(MultiTableOutputFormat.class);

  TableMapReduceUtil.initTableMapperJob(Constants.HISTORY_RAW_TABLE, scan,
      JobFileTableMapper.class, JobFileTableMapper.getOutputKeyClass(),
      JobFileTableMapper.getOutputValueClass(), job);

  return job;
}
 
Example 7
Source Project: Hadoop-BAM   File: TestBAMOutputFormat.java   License: MIT License
private Path doMapReduce(final String inputFile) throws Exception {
    final FileSystem fileSystem = FileSystem.get(conf);
    final Path inputPath = new Path(inputFile);
    final Path outputPath = fileSystem.makeQualified(new Path("target/out"));
    fileSystem.delete(outputPath, true);

    final Job job = Job.getInstance(conf);
    FileInputFormat.setInputPaths(job, inputPath);

    conf.set(BAMTestNoHeaderOutputFormat.READ_HEADER_FROM_FILE, inputFile);
    job.setInputFormatClass(BAMInputFormat.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(SAMRecordWritable.class);

    job.setOutputFormatClass(BAMTestNoHeaderOutputFormat.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(SAMRecordWritable.class);

    job.setNumReduceTasks(0);
    FileOutputFormat.setOutputPath(job, outputPath);

    final boolean success = job.waitForCompletion(true);
    assertTrue(success);

    return outputPath;
}
 
Example 8
/**
   * Write the sequence file.
   *
   * @param args the command-line arguments
   * @return the process exit code
   * @throws Exception if something goes wrong
   */
  public int run(final String[] args) throws Exception {

    Cli cli = Cli.builder().setArgs(args).addOptions(CliCommonOpts.MrIoOpts.values()).build();
    int result = cli.runCmd();

    if (result != 0) {
      return result;
    }

    Path inputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.INPUT));
    Path outputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.OUTPUT));

    Configuration conf = super.getConf();

    Job job = new Job(conf);
    job.setJarByClass(SequenceFileProtobufMapReduce.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Stock.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapperClass(PbMapper.class);
    job.setReducerClass(PbReducer.class);

    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    SequenceFileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);

    ProtobufSerialization.register(job.getConfiguration());

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    if (job.waitForCompletion(true)) {
      return 0;
    }
    return 1;
  }
 
Example 9
static void configureIncrementalLoad(Job job, HTableDescriptor tableDescriptor, RegionLocator regionLocator,
        Class<? extends OutputFormat<?, ?>> cls) throws IOException, UnsupportedEncodingException {
    Configuration conf = job.getConfiguration();
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(KeyValue.class);
    job.setOutputFormatClass(cls);

    // Based on the configured map output class, set the correct reducer to properly
    // sort the incoming values.
    // TODO it would be nice to pick one or the other of these formats.
    if (KeyValue.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(KeyValueSortReducer.class);
    } else if (Put.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(PutSortReducer.class);
    } else if (Text.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(TextSortReducer.class);
    } else {
        LOG.warn("Unknown map output value type:" + job.getMapOutputValueClass());
    }

    conf.setStrings("io.serializations", conf.get("io.serializations"), MutationSerialization.class.getName(),
            ResultSerialization.class.getName(), KeyValueSerialization.class.getName());

    // Use table's region boundaries for TOP split points.
    LOG.info("Looking up current regions for table " + tableDescriptor.getTableName());
    List<ImmutableBytesWritable> startKeys = getRegionStartKeys(regionLocator);
    LOG.info("Configuring " + startKeys.size() + " reduce partitions " + "to match current region count");
    job.setNumReduceTasks(startKeys.size());

    configurePartitioner(job, startKeys);
    // Set compression algorithms based on column families
    configureCompression(conf, tableDescriptor);
    configureBloomType(tableDescriptor, conf);
    configureBlockSize(tableDescriptor, conf);
    configureDataBlockEncoding(tableDescriptor, conf);

    TableMapReduceUtil.addDependencyJars(job);
    TableMapReduceUtil.initCredentials(job);
    LOG.info("Incremental table " + regionLocator.getName() + " output configured.");
}
 
Example 10
private int doVerify(Path outputDir, int numReducers) throws IOException, InterruptedException,
    ClassNotFoundException {
  job = new Job(getConf());

  job.setJobName("Link Verifier");
  job.setNumReduceTasks(numReducers);
  job.setJarByClass(getClass());

  setJobScannerConf(job);

  Scan scan = new Scan();
  scan.addColumn(FAMILY_NAME, COLUMN_PREV);
  scan.setCaching(10000);
  scan.setCacheBlocks(false);
  String[] split = labels.split(COMMA);

  scan.setAuthorizations(new Authorizations(split[this.labelIndex * 2],
      split[(this.labelIndex * 2) + 1]));

  TableMapReduceUtil.initTableMapperJob(tableName.getName(), scan, VerifyMapper.class,
      BytesWritable.class, BytesWritable.class, job);
  TableMapReduceUtil.addDependencyJars(job.getConfiguration(), AbstractHBaseTool.class);

  job.getConfiguration().setBoolean("mapreduce.map.speculative", false);

  job.setReducerClass(VerifyReducer.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  TextOutputFormat.setOutputPath(job, outputDir);
  boolean success = job.waitForCompletion(true);

  return success ? 0 : 1;
}
 
Example 11
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    if (args.length < 2) {
        System.err.println(
                "Usage: LinkCountCooccurrences configFile outputDir");
        System.exit(2);
    }
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    
    Job job = Job.getInstance(conf, "link count cooccurrences");
    job.setJarByClass(LinkCountCooccurrences.class);
    job.setInputFormatClass(KeyValueInputFormat.class);
    job.setMapperClass(RefMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    conf = job.getConfiguration();
    conf.addResource(otherArgs[0]);
    conf.setClass(MarkLogicConstants.INPUT_KEY_CLASS, Text.class, 
            Writable.class);
    conf.setClass(MarkLogicConstants.INPUT_VALUE_CLASS, Text.class, 
            Writable.class);
    conf.setClass(MarkLogicConstants.INPUT_LEXICON_FUNCTION_CLASS, 
            HrefTitleMap.class, ElemAttrValueCooccurrences.class);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
 
Example 12
public int runRandomInputGenerator(int numMappers, long numNodes, Path tmpOutput,
    Integer width, Integer wrapMultiplier, Integer numWalkers)
    throws Exception {
  LOG.info("Running RandomInputGenerator with numMappers=" + numMappers
      + ", numNodes=" + numNodes);
  Job job = Job.getInstance(getConf());

  job.setJobName("Random Input Generator");
  job.setNumReduceTasks(0);
  job.setJarByClass(getClass());

  job.setInputFormatClass(GeneratorInputFormat.class);
  job.setOutputKeyClass(BytesWritable.class);
  job.setOutputValueClass(NullWritable.class);

  setJobConf(job, numMappers, numNodes, width, wrapMultiplier, numWalkers);

  job.setMapperClass(Mapper.class); //identity mapper

  FileOutputFormat.setOutputPath(job, tmpOutput);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  TableMapReduceUtil.addDependencyJarsForClasses(job.getConfiguration(), Random64.class);

  boolean success = jobCompletion(job);

  return success ? 0 : 1;
}
 
Example 13
Source Project: hadoop   File: TestMRJobs.java   License: Apache License 2.0
protected Job runFailingMapperJob()
throws IOException, InterruptedException, ClassNotFoundException {
  Configuration myConf = new Configuration(mrCluster.getConfig());
  myConf.setInt(MRJobConfig.NUM_MAPS, 1);
  myConf.setInt(MRJobConfig.MAP_MAX_ATTEMPTS, 2); //reduce the number of attempts

  Job job = Job.getInstance(myConf);

  job.setJarByClass(FailingMapper.class);
  job.setJobName("failmapper");
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setInputFormatClass(RandomInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  job.setMapperClass(FailingMapper.class);
  job.setNumReduceTasks(0);
  
  FileOutputFormat.setOutputPath(job, new Path(OUTPUT_ROOT_DIR,
    "failmapper-output"));
  job.addFileToClassPath(APP_JAR); // The AppMaster jar itself.
  job.submit();
  String trackingUrl = job.getTrackingURL();
  String jobId = job.getJobID().toString();
  boolean succeeded = job.waitForCompletion(true);
  Assert.assertFalse(succeeded);
  Assert.assertTrue("Tracking URL was " + trackingUrl +
                    " but didn't Match Job ID " + jobId ,
        trackingUrl.endsWith(jobId.substring(jobId.lastIndexOf("_")) + "/"));
  return job;
}
 
Example 14
Source Project: geowave   File: OSMConversionRunner.java   License: Apache License 2.0
@Override
public int run(final String[] args) throws Exception {

  final Configuration conf = getConf();
  final AccumuloRequiredOptions accumuloOptions =
      (AccumuloRequiredOptions) inputStoreOptions.getFactoryOptions();

  // job settings

  final Job job = Job.getInstance(conf, ingestOptions.getJobName() + "NodeConversion");
  job.setJarByClass(OSMConversionRunner.class);

  job.getConfiguration().set("osm_mapping", ingestOptions.getMappingContents());
  job.getConfiguration().set("arguments", ingestOptions.serializeToString());

  if (ingestOptions.getVisibilityOptions().getVisibility() != null) {
    job.getConfiguration().set(
        AbstractMapReduceIngest.GLOBAL_VISIBILITY_KEY,
        ingestOptions.getVisibilityOptions().getVisibility());
  }

  // input format

  AbstractInputFormat.setConnectorInfo(
      job,
      accumuloOptions.getUser(),
      new PasswordToken(accumuloOptions.getPassword()));
  InputFormatBase.setInputTableName(job, ingestOptions.getQualifiedTableName());
  AbstractInputFormat.setZooKeeperInstance(
      job,
      new ClientConfiguration().withInstance(accumuloOptions.getInstance()).withZkHosts(
          accumuloOptions.getZookeeper()));
  AbstractInputFormat.setScanAuthorizations(
      job,
      new Authorizations(ingestOptions.getVisibilityOptions().getVisibility()));

  final IteratorSetting is = new IteratorSetting(50, "WholeRow", WholeRowIterator.class);
  InputFormatBase.addIterator(job, is);
  job.setInputFormatClass(AccumuloInputFormat.class);
  final Range r = new Range();
  // final ArrayList<Pair<Text, Text>> columns = new ArrayList<>();
  InputFormatBase.setRanges(job, Arrays.asList(r));

  // output format
  GeoWaveOutputFormat.setStoreOptions(job.getConfiguration(), inputStoreOptions);
  final AccumuloOptions options = new AccumuloOptions();
  final AdapterStore as =
      new AdapterStoreImpl(
          new AccumuloOperations(
              accumuloOptions.getZookeeper(),
              accumuloOptions.getInstance(),
              accumuloOptions.getUser(),
              accumuloOptions.getPassword(),
              accumuloOptions.getGeoWaveNamespace(),
              options),
          options);
  for (final FeatureDataAdapter fda : FeatureDefinitionSet.featureAdapters.values()) {
    as.addAdapter(fda);
    GeoWaveOutputFormat.addDataAdapter(job.getConfiguration(), fda);
  }

  final Index primaryIndex =
      new SpatialDimensionalityTypeProvider().createIndex(new SpatialOptions());
  GeoWaveOutputFormat.addIndex(job.getConfiguration(), primaryIndex);
  job.getConfiguration().set(AbstractMapReduceIngest.INDEX_NAMES_KEY, primaryIndex.getName());

  job.setOutputFormatClass(GeoWaveOutputFormat.class);
  job.setMapOutputKeyClass(GeoWaveOutputKey.class);
  job.setMapOutputValueClass(SimpleFeature.class);

  // mapper

  job.setMapperClass(OSMConversionMapper.class);

  // reducer
  job.setNumReduceTasks(0);

  return job.waitForCompletion(true) ? 0 : -1;
}
 
Example 15
Source Project: big-c   File: TestMRJobs.java   License: Apache License 2.0
private void testJobClassloader(boolean useCustomClasses) throws IOException,
    InterruptedException, ClassNotFoundException {
  LOG.info("\n\n\nStarting testJobClassloader()"
      + " useCustomClasses=" + useCustomClasses);

  if (!(new File(MiniMRYarnCluster.APPJAR)).exists()) {
    LOG.info("MRAppJar " + MiniMRYarnCluster.APPJAR
             + " not found. Not running test.");
    return;
  }
  final Configuration sleepConf = new Configuration(mrCluster.getConfig());
  // set master address to local to test that local mode applied iff framework == local
  sleepConf.set(MRConfig.MASTER_ADDRESS, "local");
  sleepConf.setBoolean(MRJobConfig.MAPREDUCE_JOB_CLASSLOADER, true);
  if (useCustomClasses) {
    // to test AM loading user classes such as output format class, we want
    // to blacklist them from the system classes (they need to be prepended
    // as the first match wins)
    String systemClasses = ApplicationClassLoader.SYSTEM_CLASSES_DEFAULT;
    // exclude the custom classes from system classes
    systemClasses = "-" + CustomOutputFormat.class.getName() + ",-" +
        CustomSpeculator.class.getName() + "," +
        systemClasses;
    sleepConf.set(MRJobConfig.MAPREDUCE_JOB_CLASSLOADER_SYSTEM_CLASSES,
        systemClasses);
  }
  sleepConf.set(MRJobConfig.IO_SORT_MB, TEST_IO_SORT_MB);
  sleepConf.set(MRJobConfig.MR_AM_LOG_LEVEL, Level.ALL.toString());
  sleepConf.set(MRJobConfig.MAP_LOG_LEVEL, Level.ALL.toString());
  sleepConf.set(MRJobConfig.REDUCE_LOG_LEVEL, Level.ALL.toString());
  sleepConf.set(MRJobConfig.MAP_JAVA_OPTS, "-verbose:class");
  final SleepJob sleepJob = new SleepJob();
  sleepJob.setConf(sleepConf);
  final Job job = sleepJob.createJob(1, 1, 10, 1, 10, 1);
  job.setMapperClass(ConfVerificationMapper.class);
  job.addFileToClassPath(APP_JAR); // The AppMaster jar itself.
  job.setJarByClass(SleepJob.class);
  job.setMaxMapAttempts(1); // speed up failures
  if (useCustomClasses) {
    // set custom output format class and speculator class
    job.setOutputFormatClass(CustomOutputFormat.class);
    final Configuration jobConf = job.getConfiguration();
    jobConf.setClass(MRJobConfig.MR_AM_JOB_SPECULATOR, CustomSpeculator.class,
        Speculator.class);
    // speculation needs to be enabled for the speculator to be loaded
    jobConf.setBoolean(MRJobConfig.MAP_SPECULATIVE, true);
  }
  job.submit();
  boolean succeeded = job.waitForCompletion(true);
  Assert.assertTrue("Job status: " + job.getStatus().getFailureInfo(),
      succeeded);
}
 
Example 16
Source Project: Halyard   File: HalyardStats.java   License: Apache License 2.0
@Override
public int run(CommandLine cmd) throws Exception {
    String source = cmd.getOptionValue('s');
    String target = cmd.getOptionValue('t');
    String targetGraph = cmd.getOptionValue('g');
    String graphContext = cmd.getOptionValue('c');
    String thresh = cmd.getOptionValue('r');
    TableMapReduceUtil.addDependencyJars(getConf(),
        HalyardExport.class,
        NTriplesUtil.class,
        Rio.class,
        AbstractRDFHandler.class,
        RDFFormat.class,
        RDFParser.class,
        HTable.class,
        HBaseConfiguration.class,
        AuthenticationProtos.class,
        Trace.class,
        Gauge.class);
    HBaseConfiguration.addHbaseResources(getConf());
    Job job = Job.getInstance(getConf(), "HalyardStats " + source + (target == null ? " update" : " -> " + target));
    job.getConfiguration().set(SOURCE, source);
    if (target != null) job.getConfiguration().set(TARGET, target);
    if (targetGraph != null) job.getConfiguration().set(TARGET_GRAPH, targetGraph);
    if (graphContext != null) job.getConfiguration().set(GRAPH_CONTEXT, graphContext);
    if (thresh != null) job.getConfiguration().setLong(THRESHOLD, Long.parseLong(thresh));
    job.setJarByClass(HalyardStats.class);
    TableMapReduceUtil.initCredentials(job);

    Scan scan = HalyardTableUtils.scan(null, null);
    if (graphContext != null) { //restricting stats to scan given graph context only
        List<RowRange> ranges = new ArrayList<>();
        byte[] gcHash = HalyardTableUtils.hashKey(SimpleValueFactory.getInstance().createIRI(graphContext));
        ranges.add(rowRange(HalyardTableUtils.CSPO_PREFIX, gcHash));
        ranges.add(rowRange(HalyardTableUtils.CPOS_PREFIX, gcHash));
        ranges.add(rowRange(HalyardTableUtils.COSP_PREFIX, gcHash));
        if (target == null) { //add stats context to the scanned row ranges (when in update mode) to delete the related stats during MapReduce
            ranges.add(rowRange(HalyardTableUtils.CSPO_PREFIX, HalyardTableUtils.hashKey(targetGraph == null ? HALYARD.STATS_GRAPH_CONTEXT : SimpleValueFactory.getInstance().createIRI(targetGraph))));
        }
        scan.setFilter(new MultiRowRangeFilter(ranges));
    }
    TableMapReduceUtil.initTableMapperJob(
        source,
        scan,
        StatsMapper.class,
        ImmutableBytesWritable.class,
        LongWritable.class,
        job);
    job.setPartitionerClass(StatsPartitioner.class);
    job.setReducerClass(StatsReducer.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    if (job.waitForCompletion(true)) {
        LOG.info("Stats Generation Completed..");
        return 0;
    }
    return -1;
}
 
Example 17
@Override
public int run(String[] args)
        throws Exception
{
    Job job = Job.getInstance(getConf());

    job.setJarByClass(Phase4RemoveDuplicatesUsingReduceSideJoins.class);
    job.setJobName(Phase4RemoveDuplicatesUsingReduceSideJoins.class.getName());

    // paths
    // text files of ids to be deleted
    String textFilePath = args[0];
    // corpus with *.warc.gz
    String commaSeparatedInputFiles = args[1];
    // output
    String outputPath = args[2];

    //second input the look up text file
    MultipleInputs.addInputPath(job, new Path(textFilePath), TextInputFormat.class,
            JoinTextMapper.class);
    //first input the data set (check comma separated availability)
    MultipleInputs.addInputPath(job, new Path(commaSeparatedInputFiles), WARCInputFormat.class,
            JoinWARCMapper.class);

    job.setPartitionerClass(SourceJoiningKeyPartitioner.class);
    job.setGroupingComparatorClass(SourceJoiningGroupingComparator.class);

    job.setMapOutputKeyClass(CompositeKey.class);
    job.setMapOutputValueClass(WARCWritable.class);

    job.setReducerClass(JoinReducer.class);

    job.setOutputFormatClass(WARCOutputFormat.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(WARCWritable.class);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
 
Example 18
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    StringBuilder sb = new StringBuilder();
    for (int j = 2; j < otherArgs.length; j++) {
        sb.append(otherArgs[j]);
    }

    LOGGER.debug("Arguments[ " + otherArgs.length + "]" + "and values respectively [" + otherArgs[0] + "], "
            + otherArgs[1] + ", [" + otherArgs[2] + "]" + ", [" + otherArgs[3] + "]," + otherArgs[4]);

    String inputpath = otherArgs[0];
    String outputpath = "/tmp/jumbune/dvjsonreport" + new Date().getTime();

    String json = otherArgs[1];
    String nullCondition = otherArgs[2];
    String regex = otherArgs[3];
    String dvDir = otherArgs[4];

    if (regex.isEmpty()) {
        conf.set(JsonDataVaildationConstants.REGEX_ARGUMENT, "");
    } else {
        conf.set(JsonDataVaildationConstants.REGEX_ARGUMENT, regex);
    }

    if (nullCondition.isEmpty()) {
        conf.set(JsonDataVaildationConstants.NULL_ARGUMENT, "");
    } else {
        conf.set(JsonDataVaildationConstants.NULL_ARGUMENT, nullCondition);
    }

    conf.set(JsonDataVaildationConstants.SLAVE_DIR, dvDir);
    conf.set(JsonDataVaildationConstants.JSON_ARGUMENT, json);
    FileSystem fs = FileSystem.get(conf);

    @SuppressWarnings("deprecation")
    Job job = new Job(conf, "JSONDataValidation");
    job.setJarByClass(JsonDataValidationExecutor.class);

    job.setInputFormatClass(JsonFileInputFormat.class);

    job.setMapperClass(JsonDataValidationMapper.class);
    job.setPartitionerClass(JsonDataValidationPartitioner.class);
    job.setReducerClass(JsonDataValidationReducer.class);
    job.setNumReduceTasks(5);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(FileKeyViolationBean.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(TotalReducerViolationBean.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    Path[] inputPaths = FileUtil.getAllJsonNestedFilePath(job, inputpath);

    FileInputFormat.setInputPaths(job, inputPaths);
    FileOutputFormat.setOutputPath(job, new Path(outputpath));

    if (fs.exists(new Path(outputpath))) {
        fs.delete(new Path(outputpath), true);
    }

    job.waitForCompletion(true);

    Map<String, JsonViolationReport> jsonMap = readDataFromHdfs(conf, outputpath);
    final Gson gson = new Gson();
    final String jsonReport = gson.toJson(jsonMap);

    LOGGER.info("Completed DataValidation");
    LOGGER.info(JsonDataVaildationConstants.JSON_DV_REPORT + jsonReport);
}
 
Example 19
Source Project: hbase   File: ScanPerformanceEvaluation.java   License: Apache License 2.0
public void testSnapshotScanMapReduce() throws IOException, InterruptedException, ClassNotFoundException {
  Stopwatch scanOpenTimer = Stopwatch.createUnstarted();
  Stopwatch scanTimer = Stopwatch.createUnstarted();

  Scan scan = getScan();

  String jobName = "testSnapshotScanMapReduce";

  Job job = new Job(conf);
  job.setJobName(jobName);

  job.setJarByClass(getClass());

  TableMapReduceUtil.initTableSnapshotMapperJob(
      this.snapshotName,
      scan,
      MyMapper.class,
      NullWritable.class,
      NullWritable.class,
      job,
      true,
      new Path(restoreDir)
  );

  job.setNumReduceTasks(0);
  job.setOutputKeyClass(NullWritable.class);
  job.setOutputValueClass(NullWritable.class);
  job.setOutputFormatClass(NullOutputFormat.class);

  scanTimer.start();
  job.waitForCompletion(true);
  scanTimer.stop();

  Counters counters = job.getCounters();
  long numRows = counters.findCounter(ScanCounter.NUM_ROWS).getValue();
  long numCells = counters.findCounter(ScanCounter.NUM_CELLS).getValue();

  long totalBytes = counters.findCounter(HBASE_COUNTER_GROUP_NAME, "BYTES_IN_RESULTS").getValue();
  double throughput = (double)totalBytes / scanTimer.elapsed(TimeUnit.SECONDS);
  double throughputRows = (double)numRows / scanTimer.elapsed(TimeUnit.SECONDS);
  double throughputCells = (double)numCells / scanTimer.elapsed(TimeUnit.SECONDS);

  System.out.println("HBase scan mapreduce: ");
  System.out.println("total time to open scanner: " +
    scanOpenTimer.elapsed(TimeUnit.MILLISECONDS) + " ms");
  System.out.println("total time to scan: " + scanTimer.elapsed(TimeUnit.MILLISECONDS) + " ms");

  System.out.println("total bytes: " + totalBytes + " bytes ("
      + StringUtils.humanReadableInt(totalBytes) + ")");
  System.out.println("throughput  : " + StringUtils.humanReadableInt((long)throughput) + "B/s");
  System.out.println("total rows  : " + numRows);
  System.out.println("throughput  : " + StringUtils.humanReadableInt((long)throughputRows) + " rows/s");
  System.out.println("total cells : " + numCells);
  System.out.println("throughput  : " + StringUtils.humanReadableInt((long)throughputCells) + " cells/s");
}
 
Example 20
Source Project: nutch-htmlunit   File: DomainStatistics.java   License: Apache License 2.0
public int run(String[] args) throws Exception {
  if (args.length < 3) {
    System.out.println("usage: DomainStatistics inputDirs outDir host|domain|suffix|tld [numOfReducer]");
    return 1;
  }
  String inputDir = args[0];
  String outputDir = args[1];
  int numOfReducers = 1;

  if (args.length > 3) {
    numOfReducers = Integer.parseInt(args[3]);
  }

  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("DomainStatistics: starting at " + sdf.format(start));

  int mode = 0;
  String jobName = "DomainStatistics";
  if(args[2].equals("host")) {
    jobName = "Host statistics";
    mode = MODE_HOST;
  } else if(args[2].equals("domain")) {
    jobName  = "Domain statistics";
    mode = MODE_DOMAIN;
  } else if(args[2].equals("suffix")) {
    jobName = "Suffix statistics";
    mode = MODE_SUFFIX;
  } else if(args[2].equals("tld")) {
    jobName = "TLD statistics";
    mode = MODE_TLD;
  }

  Configuration conf = getConf();
  conf.setInt("domain.statistics.mode", mode);
  conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  Job job = new Job(conf, jobName);
  job.setJarByClass(DomainStatistics.class);

  String[] inputDirsSpecs = inputDir.split(",");
  for (int i = 0; i < inputDirsSpecs.length; i++) {
    FileInputFormat.addInputPath(job, new Path(inputDirsSpecs[i]));
  }

  job.setInputFormatClass(SequenceFileInputFormat.class);
  FileOutputFormat.setOutputPath(job, new Path(outputDir));
  job.setOutputFormatClass(TextOutputFormat.class);

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(LongWritable.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(LongWritable.class);

  job.setMapperClass(DomainStatisticsMapper.class);
  job.setReducerClass(DomainStatisticsReducer.class);
  job.setCombinerClass(DomainStatisticsCombiner.class);
  job.setNumReduceTasks(numOfReducers);

  try {
    job.waitForCompletion(true);
  } catch (Exception e) {
    throw e;
  }

  long end = System.currentTimeMillis();
  LOG.info("DomainStatistics: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
  return 0;
}