Java Code Examples for org.apache.hadoop.mapreduce.lib.output.FileOutputFormat

The following examples show how to use org.apache.hadoop.mapreduce.lib.output.FileOutputFormat. They are extracted from open source projects.
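Most of the examples below share the same driver skeleton: build a Job, set the mapper, reducer, and output key/value classes, point FileInputFormat at the input, and hand FileOutputFormat an output directory that does not exist yet. The following minimal sketch shows just that skeleton; it is not taken from any of the projects below, and the class name MinimalDriver plus the identity Mapper/Reducer are placeholder choices.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MinimalDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "minimal FileOutputFormat usage");
        job.setJarByClass(MinimalDriver.class);
        job.setMapperClass(Mapper.class);    // identity mapper
        job.setReducerClass(Reducer.class);  // identity reducer
        job.setOutputKeyClass(LongWritable.class); // TextInputFormat key type
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        // The output directory must not already exist; FileOutputFormat
        // fails the job during submission if it does.
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}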
Example 1
Project: mumu-parquet   File: MapReduceParquetMapReducer.java
@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] <input> <output>\n",
                getClass().getSimpleName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }

    Job job = new Job(getConf(), "Text to Parquet");
    job.setJarByClass(getClass());
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setMapperClass(TextToParquetMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(AvroParquetOutputFormat.class);
    AvroParquetOutputFormat.setSchema(job, SCHEMA);
    job.setOutputKeyClass(Void.class);
    job.setOutputValueClass(Group.class);
    return job.waitForCompletion(true) ? 0 : 1;
}
 
Example 2
Project: LDA   File: CalParamDriver.java
public static void run(Configuration conf, Path inputPath, Path output, double params) throws IOException, ClassNotFoundException, InterruptedException {
    String jobName = "calculating parameter";
    conf.set("params",String.valueOf(params));

    Job job = new Job(conf, jobName);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(indexToCountWritable.class);
    job.setOutputKeyClass(twoDimensionIndexWritable.class);
    job.setOutputValueClass(Text.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapperClass(CalParamsMapper.class);
    job.setReducerClass(CalParamsReducer.class);

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, output);

    job.setJarByClass(LDADriver.class);
    if (!job.waitForCompletion(true)) {
        throw new InterruptedException("calculating parameter failed");
    }
}
 
Example 3
Project: Hadoop-Codes   File: MaxTempDriver.java
public static void main(String[] args) throws Exception {
	Configuration conf = new Configuration();
	Job job = Job.getInstance(conf, "maxtemp");
	
	job.setMapperClass(MaxTempMapper.class);
	job.setReducerClass(MaxTempReducer.class);

	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(FloatWritable.class);

	FileInputFormat.setInputPaths(job, new Path(args[0]));
	FileOutputFormat.setOutputPath(job, new Path(args[1]));

	if (!job.waitForCompletion(true))
		return;
}
 
Example 4
Project: ditb   File: IntegrationTestBigLinkedList.java
public int runRandomInputGenerator(int numMappers, long numNodes, Path tmpOutput,
    Integer width, Integer wrapMuplitplier) throws Exception {
  LOG.info("Running RandomInputGenerator with numMappers=" + numMappers
      + ", numNodes=" + numNodes);
  Job job = Job.getInstance(getConf());

  job.setJobName("Random Input Generator");
  job.setNumReduceTasks(0);
  job.setJarByClass(getClass());

  job.setInputFormatClass(GeneratorInputFormat.class);
  job.setOutputKeyClass(BytesWritable.class);
  job.setOutputValueClass(NullWritable.class);

  setJobConf(job, numMappers, numNodes, width, wrapMuplitplier);

  job.setMapperClass(Mapper.class); //identity mapper

  FileOutputFormat.setOutputPath(job, tmpOutput);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  boolean success = jobCompletion(job);

  return success ? 0 : 1;
}
 
Example 5
Project: ditb   File: IntegrationTestLoadAndVerify.java
protected void doVerify(Configuration conf, HTableDescriptor htd) throws Exception {
  Path outputDir = getTestDir(TEST_NAME, "verify-output");
  LOG.info("Verify output dir: " + outputDir);

  Job job = Job.getInstance(conf);
  job.setJarByClass(this.getClass());
  job.setJobName(TEST_NAME + " Verification for " + htd.getTableName());
  setJobScannerConf(job);

  Scan scan = new Scan();

  TableMapReduceUtil.initTableMapperJob(
      htd.getTableName().getNameAsString(), scan, VerifyMapper.class,
      BytesWritable.class, BytesWritable.class, job);
  TableMapReduceUtil.addDependencyJars(job.getConfiguration(), AbstractHBaseTool.class);
  int scannerCaching = conf.getInt("verify.scannercaching", SCANNER_CACHING);
  TableMapReduceUtil.setScannerCaching(job, scannerCaching);

  job.setReducerClass(VerifyReducer.class);
  job.setNumReduceTasks(conf.getInt(NUM_REDUCE_TASKS_KEY, NUM_REDUCE_TASKS_DEFAULT));
  FileOutputFormat.setOutputPath(job, outputDir);
  assertTrue(job.waitForCompletion(true));

  long numOutputRecords = job.getCounters().findCounter(Counters.ROWS_WRITTEN).getValue();
  assertEquals(0, numOutputRecords);
}
 
Example 6
Project: big-data-benchmark   File: HadoopWordCount.java
public static void main(String[] args) throws Exception {
    BasicConfigurator.configure();
    Configuration conf = new Configuration();
    conf.setQuietMode(true);

    Job job = Job.getInstance(conf, "WordCount");
    job.setJarByClass(HadoopWordCount.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(Map.class);
    job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1] + "_" + System.currentTimeMillis()));

    long t = System.currentTimeMillis();
    job.waitForCompletion(true);

    System.out.println("TotalTime=" + (System.currentTimeMillis() - t));
}
 
Example 7
Project: learn-to-hadoop   File: MaxTemperatureWithCombiner.java
public static void main(String[] args) throws Exception {
    if(args.length != 2){
        System.err.println("Usage: MaxTemperatureWithCombiner <input path> <output path>");
        System.exit(-1);
    }

    Job job = new Job();
    job.setJarByClass(MaxTemperatureWithCombiner.class);
    job.setJobName("Max Temperature With Combiner");

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(MaxTemperatureMapper.class);
    job.setCombinerClass(MaxTemperatureReducer.class);
    job.setReducerClass(MaxTemperatureReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
 
Example 8
Project: hadoop   File: MapReduceTestUtil.java
/**
 * Creates a simple copy job.
 * 
 * @param conf Configuration object
 * @param outdir Output directory.
 * @param indirs Comma separated input directories.
 * @return Job initialized for a data copy job.
 * @throws Exception If an error occurs creating job configuration.
 */
public static Job createCopyJob(Configuration conf, Path outdir, 
    Path... indirs) throws Exception {
  conf.setInt(MRJobConfig.NUM_MAPS, 3);
  Job theJob = Job.getInstance(conf);
  theJob.setJobName("DataMoveJob");

  FileInputFormat.setInputPaths(theJob, indirs);
  theJob.setMapperClass(DataCopyMapper.class);
  FileOutputFormat.setOutputPath(theJob, outdir);
  theJob.setOutputKeyClass(Text.class);
  theJob.setOutputValueClass(Text.class);
  theJob.setReducerClass(DataCopyReducer.class);
  theJob.setNumReduceTasks(1);
  return theJob;
}
 
Example 9
Project: hadoop   File: MapReduceTestUtil.java
/**
 * Creates a simple fail job.
 * 
 * @param conf Configuration object
 * @param outdir Output directory.
 * @param indirs Comma separated input directories.
 * @return Job initialized for a simple fail job.
 * @throws Exception If an error occurs creating job configuration.
 */
public static Job createFailJob(Configuration conf, Path outdir, 
    Path... indirs) throws Exception {
  FileSystem fs = outdir.getFileSystem(conf);
  if (fs.exists(outdir)) {
    fs.delete(outdir, true);
  }
  conf.setInt(MRJobConfig.MAP_MAX_ATTEMPTS, 2);
  Job theJob = Job.getInstance(conf);
  theJob.setJobName("Fail-Job");

  FileInputFormat.setInputPaths(theJob, indirs);
  theJob.setMapperClass(FailMapper.class);
  theJob.setReducerClass(Reducer.class);
  theJob.setNumReduceTasks(0);
  FileOutputFormat.setOutputPath(theJob, outdir);
  theJob.setOutputKeyClass(Text.class);
  theJob.setOutputValueClass(Text.class);
  return theJob;
}
 
Example 10
Project: ditb   File: IntegrationTestLoadAndVerify.java
protected Job doLoad(Configuration conf, HTableDescriptor htd) throws Exception {
  Path outputDir = getTestDir(TEST_NAME, "load-output");
  LOG.info("Load output dir: " + outputDir);

  NMapInputFormat.setNumMapTasks(conf, conf.getInt(NUM_MAP_TASKS_KEY, NUM_MAP_TASKS_DEFAULT));
  conf.set(TABLE_NAME_KEY, htd.getTableName().getNameAsString());

  Job job = Job.getInstance(conf);
  job.setJobName(TEST_NAME + " Load for " + htd.getTableName());
  job.setJarByClass(this.getClass());
  setMapperClass(job);
  job.setInputFormatClass(NMapInputFormat.class);
  job.setNumReduceTasks(0);
  setJobScannerConf(job);
  FileOutputFormat.setOutputPath(job, outputDir);

  TableMapReduceUtil.addDependencyJars(job);

  TableMapReduceUtil.addDependencyJars(job.getConfiguration(), AbstractHBaseTool.class);
  TableMapReduceUtil.initCredentials(job);
  assertTrue(job.waitForCompletion(true));
  return job;
}
 
Example 11
Project: hadoop   File: MapReduceTestUtil.java
public static Job createJob(Configuration conf, Path inDir, Path outDir, 
    int numInputFiles, int numReds, String input) throws IOException {
  Job job = Job.getInstance(conf);
  FileSystem fs = FileSystem.get(conf);
  if (fs.exists(outDir)) {
    fs.delete(outDir, true);
  }
  if (fs.exists(inDir)) {
    fs.delete(inDir, true);
  }
  fs.mkdirs(inDir);
  for (int i = 0; i < numInputFiles; ++i) {
    DataOutputStream file = fs.create(new Path(inDir, "part-" + i));
    file.writeBytes(input);
    file.close();
  }    

  FileInputFormat.setInputPaths(job, inDir);
  FileOutputFormat.setOutputPath(job, outDir);
  job.setNumReduceTasks(numReds);
  return job;
}
 
Example 12
Project: mapreduce-samples   File: UnitSum.java
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.setFloat("beta", Float.parseFloat(args[3]));
    Job job = Job.getInstance(conf);
    job.setJarByClass(UnitSum.class);

    ChainMapper.addMapper(job, PassMapper.class, Object.class, Text.class, Text.class, DoubleWritable.class, conf);
    ChainMapper.addMapper(job, BetaMapper.class, Text.class, DoubleWritable.class, Text.class, DoubleWritable.class, conf);

    job.setReducerClass(SumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);

    MultipleInputs.addInputPath(job, new Path(args[0]), TextInputFormat.class, PassMapper.class);
    MultipleInputs.addInputPath(job, new Path(args[1]), TextInputFormat.class, BetaMapper.class);

    FileOutputFormat.setOutputPath(job, new Path(args[2]));
    job.waitForCompletion(true);
}
 
Example 13
Project: ditb   File: IntegrationTestWithCellVisibilityLoadAndVerify.java
private Job doVerify(Configuration conf, HTableDescriptor htd, String... auths)
    throws IOException, InterruptedException, ClassNotFoundException {
  Path outputDir = getTestDir(TEST_NAME, "verify-output");
  Job job = new Job(conf);
  job.setJarByClass(this.getClass());
  job.setJobName(TEST_NAME + " Verification for " + htd.getTableName());
  setJobScannerConf(job);
  Scan scan = new Scan();
  scan.setAuthorizations(new Authorizations(auths));
  TableMapReduceUtil.initTableMapperJob(htd.getTableName().getNameAsString(), scan,
      VerifyMapper.class, NullWritable.class, NullWritable.class, job);
  TableMapReduceUtil.addDependencyJars(job.getConfiguration(), AbstractHBaseTool.class);
  int scannerCaching = conf.getInt("verify.scannercaching", SCANNER_CACHING);
  TableMapReduceUtil.setScannerCaching(job, scannerCaching);
  job.setNumReduceTasks(0);
  FileOutputFormat.setOutputPath(job, outputDir);
  assertTrue(job.waitForCompletion(true));
  return job;
}
 
Example 14
Project: hadoop   File: TestJoinDatamerge.java
public void testEmptyJoin() throws Exception {
  Configuration conf = new Configuration();
  Path base = cluster.getFileSystem().makeQualified(new Path("/empty"));
  Path[] src = { new Path(base,"i0"), new Path("i1"), new Path("i2") };
  conf.set(CompositeInputFormat.JOIN_EXPR, CompositeInputFormat.compose("outer",
      MapReduceTestUtil.Fake_IF.class, src));
  MapReduceTestUtil.Fake_IF.setKeyClass(conf, 
    MapReduceTestUtil.IncomparableKey.class);
  Job job = Job.getInstance(conf);
  job.setInputFormatClass(CompositeInputFormat.class);
  FileOutputFormat.setOutputPath(job, new Path(base, "out"));

  job.setMapperClass(Mapper.class);
  job.setReducerClass(Reducer.class);
  job.setOutputKeyClass(MapReduceTestUtil.IncomparableKey.class);
  job.setOutputValueClass(NullWritable.class);

  job.waitForCompletion(true);
  assertTrue(job.isSuccessful());
  base.getFileSystem(conf).delete(base, true);
}
 
Example 15
Project: hadoop   File: TestLocalRunner.java
/**
 * Run a test with a misconfigured number of mappers.
 * Expect failure.
 */
@Test
public void testInvalidMultiMapParallelism() throws Exception {
  Job job = Job.getInstance();

  Path inputPath = createMultiMapsInput();
  Path outputPath = getOutputPath();

  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.getLocal(conf);

  if (fs.exists(outputPath)) {
    fs.delete(outputPath, true);
  }

  job.setMapperClass(StressMapper.class);
  job.setReducerClass(CountingReducer.class);
  job.setNumReduceTasks(1);
  LocalJobRunner.setLocalMaxRunningMaps(job, -6);
  FileInputFormat.addInputPath(job, inputPath);
  FileOutputFormat.setOutputPath(job, outputPath);

  boolean success = job.waitForCompletion(true);
  assertFalse("Job succeeded somehow", success);
}
 
Example 16
Project: hadoop   File: TestLocalRunner.java
/** Test case for zero mappers */
@Test
public void testEmptyMaps() throws Exception {
  Job job = Job.getInstance();
  Path outputPath = getOutputPath();

  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.getLocal(conf);

  if (fs.exists(outputPath)) {
    fs.delete(outputPath, true);
  }

  job.setInputFormatClass(EmptyInputFormat.class);
  job.setNumReduceTasks(1);
  FileOutputFormat.setOutputPath(job, outputPath);

  boolean success = job.waitForCompletion(true);
  assertTrue("Empty job should work", success);
}
 
Example 17
Project: DocIT   File: Cut.java
public static void cut(String name, String in) throws Exception {
	Job job = createJob(name, in);
	job.setNumReduceTasks(0);
	boolean success = job.waitForCompletion(true);

	if (success) {
		// MapReduce reads an input and writes the result to a new location.
		// Hence it cannot modify data in place, and we have to replace the input
		// with the output.
		Path output = FileOutputFormat.getOutputPath(job);
		FileSystem fs = output.getFileSystem(job.getConfiguration());
		Path[] inputs = FileInputFormat.getInputPaths(job);
		for (int i = 0; i < inputs.length; i++) {
			fs.delete(inputs[i], true);
		}
		fs.rename(output, inputs[0]);
	}
}
 
Example 18
Project: Wikipedia-Index   File: TF.java
public static void main(String[] args) throws Exception {
	Configuration conf = new Configuration();
	conf.set("xmlinput.start", "<page>");
	conf.set("xmlinput.end", "</page>");
	
	Job job = Job.getInstance(conf);
	job.setJobName("TermFrequencyCount");
	job.setJarByClass(TF.class);
	
	job.setMapOutputKeyClass(Text.class);
	job.setMapOutputValueClass(IntArrayWritable.class);
	
	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(DoubleWritable.class);
	
	job.setMapperClass(TFMap.class);
	job.setReducerClass(TFReduce.class);
	
	job.setInputFormatClass(XmlInputFormat.class);
	job.setOutputFormatClass(TextOutputFormat.class);
	
	FileInputFormat.addInputPath(job, new Path(args[0]));
	FileOutputFormat.setOutputPath(job, new Path(args[1]));
	boolean wait = job.waitForCompletion(true);
	System.exit(wait ? 0 : 1);
}
 
Example 19
Project: ditb   File: CellCounter.java
/**
 * Sets up the actual job.
 *
 * @param conf The current configuration.
 * @param args The command line parameters.
 * @return The newly created job.
 * @throws IOException When setting up the job fails.
 */
public static Job createSubmittableJob(Configuration conf, String[] args)
    throws IOException {
  String tableName = args[0];
  Path outputDir = new Path(args[1]);
  String reportSeparatorString = (args.length > 2) ? args[2]: ":";
  conf.set("ReportSeparator", reportSeparatorString);
  Job job = new Job(conf, NAME + "_" + tableName);
  job.setJarByClass(CellCounter.class);
  Scan scan = getConfiguredScanForJob(conf, args);
  TableMapReduceUtil.initTableMapperJob(tableName, scan,
      CellCounterMapper.class, ImmutableBytesWritable.class, Result.class, job);
  job.setNumReduceTasks(1);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(IntWritable.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  FileOutputFormat.setOutputPath(job, outputDir);
  job.setReducerClass(IntSumReducer.class);
  return job;
}
 
Example 20
Project: aliyun-maxcompute-data-collectors   File: HBaseBulkImportJob.java
@Override
protected void jobSetup(Job job) throws IOException, ImportException {
  super.jobSetup(job);

  // we shouldn't have gotten here if bulk load dir is not set
  // so let's throw an ImportException
  if(getContext().getDestination() == null){
    throw new ImportException("Can't run HBaseBulkImportJob without a " +
        "valid destination directory.");
  }

  TableMapReduceUtil.addDependencyJars(job.getConfiguration(), Preconditions.class);
  FileOutputFormat.setOutputPath(job, getContext().getDestination());
  HTable hTable = new HTable(job.getConfiguration(), options.getHBaseTable());
  HFileOutputFormat.configureIncrementalLoad(job, hTable);
}
 
Example 21
Project: Deep_learning_using_Java   File: Recommendation_program.java
private Job jobListFriends(String inputPath, String outputPath) throws IOException, InterruptedException, ClassNotFoundException{      
    Job job = new Job();
    job.setJarByClass(WordCount.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    job.setInputFormatClass(KeyValueTextInputFormat.class);   // Need to change the import
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.waitForCompletion(true);

    return job;
}
 
Example 22
Project: hadoop   File: RandomTextWriterJob.java
/**
 * This is the main routine for launching a distributed random write job.
 * It runs 10 maps/node and each node writes 1 gig of data to a DFS file.
 * The reduce doesn't do anything.
 * 
 * @throws IOException 
 */
public int run(String[] args) throws Exception {    
  if (args.length == 0) {
    return printUsage();    
  }
  Job job = createJob(getConf());
  FileOutputFormat.setOutputPath(job, new Path(args[0]));
  Date startTime = new Date();
  System.out.println("Job started: " + startTime);
  int ret = job.waitForCompletion(true) ? 0 : 1;
  Date endTime = new Date();
  System.out.println("Job ended: " + endTime);
  System.out.println("The job took " + 
                     (endTime.getTime() - startTime.getTime()) /1000 + 
                     " seconds.");
  
  return ret;
}
 
Example 23
Project: mapreduce-samples   File: UnitMultiplication.java
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.setFloat("beta", Float.parseFloat(args[3]));
    Job job = Job.getInstance(conf);
    job.setJarByClass(UnitMultiplication.class);

    ChainMapper.addMapper(job, TransitionMapper.class, Object.class, Text.class, Text.class, Text.class, conf);
    ChainMapper.addMapper(job, PRMapper.class, Object.class, Text.class, Text.class, Text.class, conf);

    job.setReducerClass(MultiplicationReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    MultipleInputs.addInputPath(job, new Path(args[0]), TextInputFormat.class, TransitionMapper.class);
    MultipleInputs.addInputPath(job, new Path(args[1]), TextInputFormat.class, PRMapper.class);

    FileOutputFormat.setOutputPath(job, new Path(args[2]));
    job.waitForCompletion(true);
}
 
Example 24
Project: hadoop   File: WordStandardDeviation.java
@Override
public int run(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.println("Usage: wordstddev <in> <out>");
    return 0;
  }

  Configuration conf = getConf();

  Job job = Job.getInstance(conf, "word stddev");
  job.setJarByClass(WordStandardDeviation.class);
  job.setMapperClass(WordStandardDeviationMapper.class);
  job.setCombinerClass(WordStandardDeviationReducer.class);
  job.setReducerClass(WordStandardDeviationReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(LongWritable.class);
  FileInputFormat.addInputPath(job, new Path(args[0]));
  Path outputpath = new Path(args[1]);
  FileOutputFormat.setOutputPath(job, outputpath);
  boolean result = job.waitForCompletion(true);

  // read output and calculate standard deviation
  stddev = readAndCalcStdDev(outputpath, conf);

  return (result ? 0 : 1);
}
 
Example 25
Project: hadoop   File: WordCount.java
public static void main(String[] args) throws Exception {
  Configuration conf = new Configuration();
  String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
  if (otherArgs.length < 2) {
    System.err.println("Usage: wordcount <in> [<in>...] <out>");
    System.exit(2);
  }
  Job job = Job.getInstance(conf, "word count");
  job.setJarByClass(WordCount.class);
  job.setMapperClass(TokenizerMapper.class);
  job.setCombinerClass(IntSumReducer.class);
  job.setReducerClass(IntSumReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  for (int i = 0; i < otherArgs.length - 1; ++i) {
    FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
  }
  FileOutputFormat.setOutputPath(job,
    new Path(otherArgs[otherArgs.length - 1]));
  System.exit(job.waitForCompletion(true) ? 0 : 1);
}
 
Example 26
Project: hadoop   File: WordMean.java
@Override
public int run(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.println("Usage: wordmean <in> <out>");
    return 0;
  }

  Configuration conf = getConf();

  Job job = Job.getInstance(conf, "word mean");
  job.setJarByClass(WordMean.class);
  job.setMapperClass(WordMeanMapper.class);
  job.setCombinerClass(WordMeanReducer.class);
  job.setReducerClass(WordMeanReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(LongWritable.class);
  FileInputFormat.addInputPath(job, new Path(args[0]));
  Path outputpath = new Path(args[1]);
  FileOutputFormat.setOutputPath(job, outputpath);
  boolean result = job.waitForCompletion(true);
  mean = readAndCalcMean(outputpath, conf);

  return (result ? 0 : 1);
}
 
Example 27
Project: mapreduce-samples   File: UnitMultiplication.java
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);
    job.setJarByClass(UnitMultiplication.class);

    ChainMapper.addMapper(job, TransitionMapper.class, Object.class, Text.class, Text.class, Text.class, conf);
    ChainMapper.addMapper(job, PRMapper.class, Object.class, Text.class, Text.class, Text.class, conf);

    job.setReducerClass(MultiplicationReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    MultipleInputs.addInputPath(job, new Path(args[0]), TextInputFormat.class, TransitionMapper.class);
    MultipleInputs.addInputPath(job, new Path(args[1]), TextInputFormat.class, PRMapper.class);

    FileOutputFormat.setOutputPath(job, new Path(args[2]));
    job.waitForCompletion(true);
}
 
Example 28
Project: hadoop   File: TeraChecksum.java
public int run(String[] args) throws Exception {
  Job job = Job.getInstance(getConf());
  if (args.length != 2) {
    usage();
    return 2;
  }
  TeraInputFormat.setInputPaths(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.setJobName("TeraSum");
  job.setJarByClass(TeraChecksum.class);
  job.setMapperClass(ChecksumMapper.class);
  job.setReducerClass(ChecksumReducer.class);
  job.setOutputKeyClass(NullWritable.class);
  job.setOutputValueClass(Unsigned16.class);
  // force a single reducer
  job.setNumReduceTasks(1);
  job.setInputFormatClass(TeraInputFormat.class);
  return job.waitForCompletion(true) ? 0 : 1;
}
 
Example 29
Project: hadoop   File: TeraValidate.java
public int run(String[] args) throws Exception {
  Job job = Job.getInstance(getConf());
  if (args.length != 2) {
    usage();
    return 1;
  }
  TeraInputFormat.setInputPaths(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.setJobName("TeraValidate");
  job.setJarByClass(TeraValidate.class);
  job.setMapperClass(ValidateMapper.class);
  job.setReducerClass(ValidateReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  // force a single reducer
  job.setNumReduceTasks(1);
  // force a single split 
  FileInputFormat.setMinInputSplitSize(job, Long.MAX_VALUE);
  job.setInputFormatClass(TeraInputFormat.class);
  return job.waitForCompletion(true) ? 0 : 1;
}
 
Example 30
Project: hadoop   File: CompressionEmulationUtil.java
/**
 * Configure the {@link Job} for enabling compression emulation.
 */
static void configure(final Job job) throws IOException, InterruptedException,
                                            ClassNotFoundException {
  // set the random text mapper
  job.setMapperClass(RandomTextDataMapper.class);
  job.setNumReduceTasks(0);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);
  job.setInputFormatClass(GenDataFormat.class);
  job.setJarByClass(GenerateData.class);

  // set the output compression true
  FileOutputFormat.setCompressOutput(job, true);
  try {
    FileInputFormat.addInputPath(job, new Path("ignored"));
  } catch (IOException e) {
    LOG.error("Error while adding input path ", e);
  }
}
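Example 30 only turns output compression on via FileOutputFormat.setCompressOutput; the codec itself then comes from the configuration default. As a side note, a codec is usually pinned explicitly with setOutputCompressorClass. A minimal sketch of that pattern (the helper name enableGzipOutput is made up, and GzipCodec is just one possible choice):

static void enableGzipOutput(org.apache.hadoop.mapreduce.Job job) {
    // Compress the job's output files and pick the codec explicitly.
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job,
        org.apache.hadoop.io.compress.GzipCodec.class);
}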
 
Example 31
Project: hadoop   File: LoadJob.java
public Job call() throws IOException, InterruptedException,
                         ClassNotFoundException {
  ugi.doAs(
    new PrivilegedExceptionAction<Job>() {
      public Job run() throws IOException, ClassNotFoundException,
                              InterruptedException {
        job.setMapperClass(LoadMapper.class);
        job.setReducerClass(LoadReducer.class);
        job.setNumReduceTasks(jobdesc.getNumberReduces());
        job.setMapOutputKeyClass(GridmixKey.class);
        job.setMapOutputValueClass(GridmixRecord.class);
        job.setSortComparatorClass(LoadSortComparator.class);
        job.setGroupingComparatorClass(SpecGroupingComparator.class);
        job.setInputFormatClass(LoadInputFormat.class);
        job.setOutputFormatClass(RawBytesOutputFormat.class);
        job.setPartitionerClass(DraftPartitioner.class);
        job.setJarByClass(LoadJob.class);
        job.getConfiguration().setBoolean(Job.USED_GENERIC_PARSER, true);
        FileOutputFormat.setOutputPath(job, outdir);
        job.submit();
        return job;
      }
    });

  return job;
}
 
Example 32
Project: hadoop   File: TestCompressionEmulationUtils.java
/**
 * Runs a GridMix data-generation job.
 */
private static void runDataGenJob(Configuration conf, Path tempDir) 
throws IOException, ClassNotFoundException, InterruptedException {
  JobClient client = new JobClient(conf);
  
  // get the local job runner
  conf.setInt(MRJobConfig.NUM_MAPS, 1);
  
  Job job = Job.getInstance(conf);
  
  CompressionEmulationUtil.configure(job);
  job.setInputFormatClass(CustomInputFormat.class);
  
  // set the output path
  FileOutputFormat.setOutputPath(job, tempDir);
  
  // submit and wait for completion
  job.submit();
  int ret = job.waitForCompletion(true) ? 0 : 1;

  assertEquals("Job Failed", 0, ret);
}
 
Example 33
Project: Machine-Learning-End-to-Endguide-for-Java-developers   File: AveragePageCount.java
public static void main(String[] args) throws Exception {
    Configuration con = new Configuration();
    Job bookJob = Job.getInstance(con, "Average Page Count");
    bookJob.setJarByClass(AveragePageCount.class);
    bookJob.setMapperClass(TextMapper.class);
    bookJob.setReducerClass(AverageReduce.class);
    bookJob.setOutputKeyClass(Text.class);
    bookJob.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(bookJob, new Path("C:/Hadoop/books.txt"));
    FileOutputFormat.setOutputPath(bookJob, new Path("C:/Hadoop/BookOutput"));
    if (bookJob.waitForCompletion(true)) {
        System.exit(0);
    }
}
 
Example 34
Project: ditb   File: TestTableInputFormatScanBase.java
/**
 * Tests a MR scan using specific start and stop rows.
 *
 * @throws IOException
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
protected void testScan(String start, String stop, String last)
throws IOException, InterruptedException, ClassNotFoundException {
  String jobName = "Scan" + (start != null ? start.toUpperCase() : "Empty") +
    "To" + (stop != null ? stop.toUpperCase() : "Empty");
  LOG.info("Before map/reduce startup - job " + jobName);
  Configuration c = new Configuration(TEST_UTIL.getConfiguration());
  Scan scan = new Scan();
  scan.addFamily(INPUT_FAMILY);
  if (start != null) {
    scan.setStartRow(Bytes.toBytes(start));
  }
  c.set(KEY_STARTROW, start != null ? start : "");
  if (stop != null) {
    scan.setStopRow(Bytes.toBytes(stop));
  }
  c.set(KEY_LASTROW, last != null ? last : "");
  LOG.info("scan before: " + scan);
  Job job = new Job(c, jobName);
  TableMapReduceUtil.initTableMapperJob(
    Bytes.toString(TABLE_NAME), scan, ScanMapper.class,
    ImmutableBytesWritable.class, ImmutableBytesWritable.class, job);
  job.setReducerClass(ScanReducer.class);
  job.setNumReduceTasks(1); // one to get final "first" and "last" key
  FileOutputFormat.setOutputPath(job,
      new Path(TEST_UTIL.getDataTestDir(), job.getJobName()));
  LOG.info("Started " + job.getJobName());
  assertTrue(job.waitForCompletion(true));
  LOG.info("After map/reduce completion - job " + jobName);
}
 
Example 35
Project: Java-Data-Science-Made-Easy   File: AveragePageCount.java
public static void main(String[] args) throws Exception {
    Configuration con = new Configuration();
    Job bookJob = Job.getInstance(con, "Average Page Count");
    bookJob.setJarByClass(AveragePageCount.class);
    bookJob.setMapperClass(TextMapper.class);
    bookJob.setReducerClass(AverageReduce.class);
    bookJob.setOutputKeyClass(Text.class);
    bookJob.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(bookJob, new Path("C:/Hadoop/books.txt"));
    FileOutputFormat.setOutputPath(bookJob, new Path("C:/Hadoop/BookOutput"));
    if (bookJob.waitForCompletion(true)) {
        System.exit(0);
    }
}
 
Example 36
Project: Java-for-Data-Science   File: AveragePageCount.java
public static void main(String[] args) throws Exception {
    Configuration con = new Configuration();
    Job bookJob = Job.getInstance(con, "Average Page Count");
    bookJob.setJarByClass(AveragePageCount.class);
    bookJob.setMapperClass(TextMapper.class);
    bookJob.setReducerClass(AverageReduce.class);
    bookJob.setOutputKeyClass(Text.class);
    bookJob.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(bookJob, new Path("C:/Hadoop/books.txt"));
    FileOutputFormat.setOutputPath(bookJob, new Path("C:/Hadoop/BookOutput"));
    if (bookJob.waitForCompletion(true)) {
        System.exit(0);
    }
}
 
Example 37
Project: mapreduce-samples   File: UnitSum.java
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);
    job.setJarByClass(UnitSum.class);
    job.setMapperClass(PassMapper.class);
    job.setReducerClass(SumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.waitForCompletion(true);
}
 
Example 38
Project: Wikipedia-Index   File: PageWordCount.java
public static void main(String[] args) throws Exception {
	Configuration conf = new Configuration();
	conf.set("xmlinput.start", "<page>");
	conf.set("xmlinput.end", "</page>");
			
	Job job = Job.getInstance(conf);
	job.setJobName("PageWordCount");
	job.setJarByClass(PageWordCount.class);
	
	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(IntWritable.class);
	
	job.setMapperClass(PageWordCountMap.class);
	job.setCombinerClass(PageWordCountReduce.class);
	job.setReducerClass(PageWordCountReduce.class);
	
	job.setInputFormatClass(XmlInputFormat.class);
	job.setOutputFormatClass(TextOutputFormat.class);
	
	FileInputFormat.addInputPath(job, new Path(args[0]));
	FileOutputFormat.setOutputPath(job, new Path(args[1]));
	job.waitForCompletion(true);
}
 
Example 39
Project: ditb   File: HashTable.java
public Job createSubmittableJob(String[] args) throws IOException {
  Path partitionsPath = new Path(destPath, PARTITIONS_FILE_NAME);
  generatePartitions(partitionsPath);
  
  Job job = Job.getInstance(getConf(),
        getConf().get("mapreduce.job.name", "hashTable_" + tableHash.tableName));
  Configuration jobConf = job.getConfiguration();
  jobConf.setLong(HASH_BATCH_SIZE_CONF_KEY, tableHash.batchSize);
  job.setJarByClass(HashTable.class);

  TableMapReduceUtil.initTableMapperJob(tableHash.tableName, tableHash.initScan(),
      HashMapper.class, ImmutableBytesWritable.class, ImmutableBytesWritable.class, job);
  
  // use a TotalOrderPartitioner and reducers to group region output into hash files
  job.setPartitionerClass(TotalOrderPartitioner.class);
  TotalOrderPartitioner.setPartitionFile(jobConf, partitionsPath);
  job.setReducerClass(Reducer.class);  // identity reducer
  job.setNumReduceTasks(tableHash.numHashFiles);
  job.setOutputKeyClass(ImmutableBytesWritable.class);
  job.setOutputValueClass(ImmutableBytesWritable.class);
  job.setOutputFormatClass(MapFileOutputFormat.class);
  FileOutputFormat.setOutputPath(job, new Path(destPath, HASH_DATA_DIR));
  
  return job;
}
 
Example 40
Project: hadoop   File: TestCompressionEmulationUtils.java
/**
 * Test that {@link FileQueue} can identify compressed files and provide
 * readers that extract uncompressed data only if input-compression is enabled.
 */
@Test
public void testFileQueueDecompression() throws IOException {
  JobConf conf = new JobConf();
  FileSystem lfs = FileSystem.getLocal(conf);
  String inputLine = "Hi Hello!";
  
  CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
  CompressionEmulationUtil.setInputCompressionEmulationEnabled(conf, true);
  org.apache.hadoop.mapred.FileOutputFormat.setCompressOutput(conf, true);
  org.apache.hadoop.mapred.FileOutputFormat.setOutputCompressorClass(conf, 
                                              GzipCodec.class);

  // define the test's root temp directory
  Path rootTempDir =
      new Path(System.getProperty("test.build.data", "/tmp")).makeQualified(
          lfs.getUri(), lfs.getWorkingDirectory());

  Path tempDir = new Path(rootTempDir, "TestFileQueueDecompression");
  lfs.delete(tempDir, true);

  // create a compressed file
  Path compressedFile = new Path(tempDir, "test");
  OutputStream out = 
    CompressionEmulationUtil.getPossiblyCompressedOutputStream(compressedFile, 
                                                               conf);
  BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out));
  writer.write(inputLine);
  writer.close();
  
  compressedFile = compressedFile.suffix(".gz");
  // now read back the data from the compressed stream using FileQueue
  long fileSize = lfs.listStatus(compressedFile)[0].getLen();
  CombineFileSplit split = 
    new CombineFileSplit(new Path[] {compressedFile}, new long[] {fileSize});
  FileQueue queue = new FileQueue(split, conf);
  byte[] bytes = new byte[inputLine.getBytes().length];
  queue.read(bytes);
  queue.close();
  String readLine = new String(bytes);
  assertEquals("Compression/Decompression error", inputLine, readLine);
}
 
Example 41
Project: HBase-High-Performance-Cookbook   File: InputDriver.java
public static void runJob(Path input, Path output, String vectorClassName,Configuration config)
  throws IOException, InterruptedException, ClassNotFoundException {
  Configuration conf = config;
  conf.set("vector.implementation.class.name", vectorClassName);
  Job job = new Job(conf, "Input Driver running over input: " + input);
	 
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(VectorWritable.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setMapperClass(InputMapper.class);   
  job.setNumReduceTasks(0);
  job.setJarByClass(InputDriver.class);
  
  FileInputFormat.addInputPath(job, input);
  FileOutputFormat.setOutputPath(job, output);
  
  job.waitForCompletion(true);
}
 
Example 42
Project: mumu-mapreduce   File: MonthTrafficStatisticsMapReduce.java
public static void main(String[] args) {
    if (args.length != 2) {
        System.err.println("Usage: Month Traffic Statistics <input path> <output path>");
        System.exit(-1);
    }
    String nginxLogInput = args[0];
    String nginxLogOutput = args[1];

    Configuration configuration = new Configuration();
    try {
        Job job = Job.getInstance(configuration);
        job.setJobName("MonthTrafficStatistics");

        job.setJarByClass(MonthTrafficStatisticsMapReduce.class);

        FileInputFormat.addInputPath(job, new Path(nginxLogInput));
        FileOutputFormat.setOutputPath(job, new Path(nginxLogOutput));

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.setMapperClass(MonthTrafficStatisticsMapper.class);
        job.setReducerClass(MonthTrafficStatisticsReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.waitForCompletion(true);
    } catch (IOException | InterruptedException | ClassNotFoundException e) {
        e.printStackTrace();
    }
}
 
Example 43
Project: mumu-mapreduce   File: YearTrafficStatisticsMapReduce.java
public static void main(String[] args) {
    if (args.length != 2) {
        System.err.println("Usage: Year Traffic Statistics <input path> <output path>");
        System.exit(-1);
    }
    String nginxLogInput = args[0];
    String nginxLogOutput = args[1];

    Configuration configuration = new Configuration();
    try {
        Job job = Job.getInstance(configuration);
        job.setJobName("YearTrafficStatistics");

        job.setJarByClass(YearTrafficStatisticsMapReduce.class);

        FileInputFormat.addInputPath(job, new Path(nginxLogInput));
        FileOutputFormat.setOutputPath(job, new Path(nginxLogOutput));

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.setMapperClass(YearTrafficStatisticsMapper.class);
        job.setReducerClass(YearTrafficStatisticsReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.waitForCompletion(true);
    } catch (IOException | InterruptedException | ClassNotFoundException e) {
        e.printStackTrace();
    }
}
 
Example 44
Project: mumu-mapreduce   File: MaxTemperatureMapReduce.java
public static void main(String[] args) {
    if (args.length != 2) {
        System.err.println("Usage: MaxTemperature <input path> <output path>");
        System.exit(-1);
    }

    String temperatureInput = args[0];
    String temperatureOutput = args[1];
    try {
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        job.setJarByClass(MaxTemperatureMapReduce.class);
        job.setJobName("MaxTemperature");
        FileInputFormat.addInputPath(job, new Path(temperatureInput));
        FileOutputFormat.setOutputPath(job, new Path(temperatureOutput));

        job.setMapperClass(MaxTemperatureMapper.class);
        job.setReducerClass(MaxTemperatureReduce.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.waitForCompletion(true);
    } catch (IOException | InterruptedException | ClassNotFoundException e) {
        e.printStackTrace();
    }
}
 
Example 45
Project: Hadoop-Codes   File: MaximumAverageDriver.java
public static void main(String[] args) throws Exception {
	Configuration conf = new Configuration();
	Job job = Job.getInstance(conf, "maxaverage");
	
	job.setMapperClass(MaximumAverageMapper.class);
	job.setReducerClass(MaximumAverageReducer.class);

	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(FloatWritable.class);

	FileInputFormat.setInputPaths(job, new Path(args[0]));
	FileOutputFormat.setOutputPath(job, new Path(args[1]));

	if (!job.waitForCompletion(true))
		return;
}
 
Example 46
Project: hadoop   File: TestCompressionEmulationUtils.java
/**
 * Test compressible {@link GridmixRecord}.
 */
@Test
public void testCompressibleGridmixRecord() throws IOException {
  JobConf conf = new JobConf();
  CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
  CompressionEmulationUtil.setInputCompressionEmulationEnabled(conf, true);
  
  FileSystem lfs = FileSystem.getLocal(conf);
  int dataSize = 1024 * 1024 * 10; // 10 MB
  float ratio = 0.357F;
  
  // define the test's root temp directory
  Path rootTempDir =
      new Path(System.getProperty("test.build.data", "/tmp")).makeQualified(
          lfs.getUri(), lfs.getWorkingDirectory());

  Path tempDir = new Path(rootTempDir, 
                          "TestPossiblyCompressibleGridmixRecord");
  lfs.delete(tempDir, true);
  
  // define a compressible GridmixRecord
  GridmixRecord record = new GridmixRecord(dataSize, 0);
  record.setCompressibility(true, ratio); // enable compression
  
  conf.setClass(FileOutputFormat.COMPRESS_CODEC, GzipCodec.class, 
                CompressionCodec.class);
  org.apache.hadoop.mapred.FileOutputFormat.setCompressOutput(conf, true);
  
  // write the record to a file
  Path recordFile = new Path(tempDir, "record");
  OutputStream outStream = CompressionEmulationUtil
                             .getPossiblyCompressedOutputStream(recordFile, 
                                                                conf);    
  DataOutputStream out = new DataOutputStream(outStream);
  record.write(out);
  out.close();
  outStream.close();
  
  // open the compressed stream for reading
  Path actualRecordFile = recordFile.suffix(".gz");
  InputStream in = 
    CompressionEmulationUtil
      .getPossiblyDecompressedInputStream(actualRecordFile, conf, 0);
  
  // get the compressed file size
  long compressedFileSize = lfs.listStatus(actualRecordFile)[0].getLen();
  
  GridmixRecord recordRead = new GridmixRecord();
  recordRead.readFields(new DataInputStream(in));
  
  assertEquals("Record size mismatch in a compressible GridmixRecord",
               dataSize, recordRead.getSize());
  assertTrue("Failed to generate a compressible GridmixRecord",
             recordRead.getSize() > compressedFileSize);
  
  // check if the record can generate data with the desired compression ratio
  float seenRatio = ((float)compressedFileSize)/dataSize;
  assertEquals(CompressionEmulationUtil.standardizeCompressionRatio(ratio), 
      CompressionEmulationUtil.standardizeCompressionRatio(seenRatio), 1.0D);
}
 
Example 47
Project: Hadoop-Codes   File: SystemUserDriver.java
public static void main(String[] args) throws Exception {
	Configuration conf = new Configuration();
	Job job = Job.getInstance(conf, "systemuser");
	
	//job.setMapperClass(SystemUserMapper.class);
	job.setMapperClass(DailyCount.class);
	job.setReducerClass(IntSumReducer.class);

	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(IntWritable.class);

	FileInputFormat.setInputPaths(job, new Path(args[0]));
	FileOutputFormat.setOutputPath(job, new Path(args[1]));

	if (!job.waitForCompletion(true))
		return;
}
 
Example 48
Project: aliyun-maxcompute-data-collectors   File: AvroImportMapper.java
@Override
protected void setup(Context context)
    throws IOException, InterruptedException {
  Configuration conf = context.getConfiguration();
  schema = AvroJob.getMapOutputSchema(conf);
  lobLoader = new LargeObjectLoader(conf, FileOutputFormat.getWorkOutputPath(context));
  bigDecimalFormatString = conf.getBoolean(
      ImportJobBase.PROPERTY_BIGDECIMAL_FORMAT,
      ImportJobBase.PROPERTY_BIGDECIMAL_FORMAT_DEFAULT);
}
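The setup method above calls FileOutputFormat.getWorkOutputPath(context), which resolves to the task attempt's private work directory; files created there are promoted to the real output directory only if the attempt commits. A small, hypothetical sketch of the same pattern, assuming it lives inside a Mapper subclass with the usual org.apache.hadoop.fs imports (the file name side-data is made up):

protected void setup(Context context) throws IOException, InterruptedException {
    // Attempt-private directory: its contents are committed with the task.
    Path workDir = FileOutputFormat.getWorkOutputPath(context);
    FileSystem fs = workDir.getFileSystem(context.getConfiguration());
    try (FSDataOutputStream out = fs.create(new Path(workDir, "side-data"))) {
        out.writeBytes("scratch output scoped to this task attempt\n");
    }
}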
 
Example 49
Project: aliyun-maxcompute-data-collectors   File: AvroOutputFormat.java
static <T> void configureDataFileWriter(DataFileWriter<T> writer,
  TaskAttemptContext context) throws UnsupportedEncodingException {
  if (FileOutputFormat.getCompressOutput(context)) {
    int level = context.getConfiguration()
      .getInt(DEFLATE_LEVEL_KEY, DEFAULT_DEFLATE_LEVEL);
    String codecName = context.getConfiguration()
      .get(org.apache.avro.mapred.AvroJob.OUTPUT_CODEC, DEFLATE_CODEC);
    CodecFactory factory =
      codecName.equals(DEFLATE_CODEC) ? CodecFactory.deflateCodec(level)
        : CodecFactory.fromString(codecName);
    writer.setCodec(factory);
  }

  writer.setSyncInterval(context.getConfiguration()
    .getInt(SYNC_INTERVAL_KEY, DEFAULT_SYNC_INTERVAL));

  // copy metadata from job
  for (Map.Entry<String, String> e : context.getConfiguration()) {
    if (e.getKey().startsWith(org.apache.avro.mapred.AvroJob.TEXT_PREFIX)) {
      writer.setMeta(e.getKey()
        .substring(org.apache.avro.mapred.AvroJob.TEXT_PREFIX.length()),
        e.getValue());
    }
    if (e.getKey().startsWith(org.apache.avro.mapred.AvroJob.BINARY_PREFIX)) {
      writer.setMeta(e.getKey()
        .substring(org.apache.avro.mapred.AvroJob.BINARY_PREFIX.length()),
        URLDecoder.decode(e.getValue(), "ISO-8859-1").getBytes("ISO-8859-1"));
    }
  }
}
 
Example 50
Project: aliyun-maxcompute-data-collectors   File: HCatalogTestUtils.java
/**
 * Run a local map reduce job to read records from an HCatalog table.
 * @param dbName database name
 * @param tableName table to read from
 * @param filter HCatalog filter expression
 * @return the records read
 * @throws Exception
 */
public List<HCatRecord> readHCatRecords(String dbName,
  String tableName, String filter) throws Exception {

  HCatReaderMapper.setReadRecordCount(0);
  recsRead.clear();

  // Configuration conf = new Configuration();
  Job job = new Job(conf, "HCatalog reader job");
  job.setJarByClass(this.getClass());
  job.setMapperClass(HCatReaderMapper.class);
  job.getConfiguration()
    .setInt(ConfigurationConstants.PROP_MAPRED_MAP_TASKS, 1);
  // input/output settings
  job.setInputFormatClass(HCatInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  HCatInputFormat.setInput(job, dbName, tableName).setFilter(filter);

  job.setMapOutputKeyClass(BytesWritable.class);
  job.setMapOutputValueClass(Text.class);

  job.setNumReduceTasks(0);

  Path path = new Path(fs.getWorkingDirectory(),
    "mapreduce/HCatTableIndexOutput");
  if (fs.exists(path)) {
    fs.delete(path, true);
  }

  FileOutputFormat.setOutputPath(job, path);

  job.waitForCompletion(true);
  LOG.info("Read " + HCatReaderMapper.readRecordCount + " records");

  return recsRead;
}
 
Example 51
Project: hadoop   File: TestReporter.java
@Test
public void testStatusLimit() throws IOException, InterruptedException,
    ClassNotFoundException {
  Path test = new Path(testRootTempDir, "testStatusLimit");

  Configuration conf = new Configuration();
  Path inDir = new Path(test, "in");
  Path outDir = new Path(test, "out");
  FileSystem fs = FileSystem.get(conf);
  if (fs.exists(inDir)) {
    fs.delete(inDir, true);
  }
  fs.mkdirs(inDir);
  DataOutputStream file = fs.create(new Path(inDir, "part-" + 0));
  file.writeBytes("testStatusLimit");
  file.close();

  if (fs.exists(outDir)) {
    fs.delete(outDir, true);
  }

  Job job = Job.getInstance(conf, "testStatusLimit");

  job.setMapperClass(StatusLimitMapper.class);
  job.setNumReduceTasks(0);

  FileInputFormat.addInputPath(job, inDir);
  FileOutputFormat.setOutputPath(job, outDir);

  job.waitForCompletion(true);

  assertTrue("Job failed", job.isSuccessful());
}
 
Example 52
Project: hadoop   File: TestMapperReducerCleanup.java
@Test
public void testReduceCleanup() throws Exception {
  reset();
  
  Job job = Job.getInstance();

  Path inputPath = createInput();
  Path outputPath = getOutputPath();

  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.getLocal(conf);

  if (fs.exists(outputPath)) {
    fs.delete(outputPath, true);
  }

  job.setMapperClass(TrackingTokenizerMapper.class);
  job.setReducerClass(FailingReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  job.setInputFormatClass(TrackingTextInputFormat.class);
  job.setOutputFormatClass(TrackingTextOutputFormat.class);
  job.setNumReduceTasks(1);
  FileInputFormat.addInputPath(job, inputPath);
  FileOutputFormat.setOutputPath(job, outputPath);

  job.waitForCompletion(true);

  Assert.assertTrue(mapCleanup);
  Assert.assertTrue(reduceCleanup);
  Assert.assertTrue(recordReaderCleanup);
  Assert.assertTrue(recordWriterCleanup);
}
 
Example 53
Project: hadoop   File: LargeSorter.java
/** 
 * Generate the requested number of file splits, with the filename
 * set to the filename of the output file.
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
  List<InputSplit> result = new ArrayList<InputSplit>();
  Path outDir = FileOutputFormat.getOutputPath(job);
  int numSplits = 
        job.getConfiguration().getInt(MRJobConfig.NUM_MAPS, 1);
  for(int i=0; i < numSplits; ++i) {
    result.add(new FileSplit(
        new Path(outDir, "dummy-split-" + i), 0, 1, null));
  }
  return result;
}
 
Example 54
Project: hadoop   File: TestMRJobs.java
protected Job runFailingMapperJob()
throws IOException, InterruptedException, ClassNotFoundException {
  Configuration myConf = new Configuration(mrCluster.getConfig());
  myConf.setInt(MRJobConfig.NUM_MAPS, 1);
  myConf.setInt(MRJobConfig.MAP_MAX_ATTEMPTS, 2); //reduce the number of attempts

  Job job = Job.getInstance(myConf);

  job.setJarByClass(FailingMapper.class);
  job.setJobName("failmapper");
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setInputFormatClass(RandomInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  job.setMapperClass(FailingMapper.class);
  job.setNumReduceTasks(0);
  
  FileOutputFormat.setOutputPath(job, new Path(OUTPUT_ROOT_DIR,
    "failmapper-output"));
  job.addFileToClassPath(APP_JAR); // The AppMaster jar itself.
  job.submit();
  String trackingUrl = job.getTrackingURL();
  String jobId = job.getJobID().toString();
  boolean succeeded = job.waitForCompletion(true);
  Assert.assertFalse(succeeded);
  Assert.assertTrue("Tracking URL was " + trackingUrl +
                    " but didn't Match Job ID " + jobId ,
        trackingUrl.endsWith(jobId.substring(jobId.lastIndexOf("_")) + "/"));
  return job;
}
 
Example 55
Project: hadoop   File: TestSpeculativeExecution.java
private Job runSpecTest(boolean mapspec, boolean redspec)
    throws IOException, ClassNotFoundException, InterruptedException {

  Path first = createTempFile("specexec_map_input1", "a\nz");
  Path secnd = createTempFile("specexec_map_input2", "a\nz");

  Configuration conf = mrCluster.getConfig();
  conf.setBoolean(MRJobConfig.MAP_SPECULATIVE,mapspec);
  conf.setBoolean(MRJobConfig.REDUCE_SPECULATIVE,redspec);
  conf.setClass(MRJobConfig.MR_AM_TASK_ESTIMATOR,
          TestSpecEstimator.class,
          TaskRuntimeEstimator.class);

  Job job = Job.getInstance(conf);
  job.setJarByClass(TestSpeculativeExecution.class);
  job.setMapperClass(SpeculativeMapper.class);
  job.setReducerClass(SpeculativeReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  job.setNumReduceTasks(2);
  FileInputFormat.setInputPaths(job, first);
  FileInputFormat.addInputPath(job, secnd);
  FileOutputFormat.setOutputPath(job, TEST_OUT_DIR);

  // Delete output directory if it exists.
  try {
    localFs.delete(TEST_OUT_DIR,true);
  } catch (IOException e) {
    // ignore
  }

  // Creates the Job Configuration
  job.addFileToClassPath(APP_JAR); // The AppMaster jar itself.
  job.setMaxMapAttempts(2);

  job.submit();

  return job;
}
 
Example 56
Project: hadoop   File: TestMapReduceLazyOutput.java
private static void runTestLazyOutput(Configuration conf, Path output,
    int numReducers, boolean createLazily) 
throws Exception {
  Job job = Job.getInstance(conf, "Test-Lazy-Output");

  FileInputFormat.setInputPaths(job, INPUT);
  FileOutputFormat.setOutputPath(job, output);

  job.setJarByClass(TestMapReduceLazyOutput.class);
  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(Text.class);
  job.setNumReduceTasks(numReducers);

  job.setMapperClass(TestMapper.class);
  job.setReducerClass(TestReducer.class);

  if (createLazily) {
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
  } else {
    job.setOutputFormatClass(TextOutputFormat.class);
  }
  assertTrue(job.waitForCompletion(true));
}
 
Example 57
Project: hadoop   File: TestLineRecordReaderJobs.java
/**
 * Creates and runs an MR job
 *
 * @param conf
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public void createAndRunJob(Configuration conf) throws IOException,
    InterruptedException, ClassNotFoundException {
  Job job = Job.getInstance(conf);
  job.setJarByClass(TestLineRecordReaderJobs.class);
  job.setMapperClass(Mapper.class);
  job.setReducerClass(Reducer.class);
  FileInputFormat.addInputPath(job, inputDir);
  FileOutputFormat.setOutputPath(job, outputDir);
  job.waitForCompletion(true);
}
 
Example 58
Project: hadoop   File: TestLocalRunner.java
/**
 * Test that the GC counter actually increments when we know that we've
 * spent some time in the GC during the mapper.
 */
@Test
public void testGcCounter() throws Exception {
  Path inputPath = getInputPath();
  Path outputPath = getOutputPath();

  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.getLocal(conf);

  // Clear input/output dirs.
  if (fs.exists(outputPath)) {
    fs.delete(outputPath, true);
  }

  if (fs.exists(inputPath)) {
    fs.delete(inputPath, true);
  }

  // Create one input file
  createInputFile(inputPath, 0, 20);

  // Now configure and run the job.
  Job job = Job.getInstance();
  job.setMapperClass(GCMapper.class);
  job.setNumReduceTasks(0);
  job.getConfiguration().set(MRJobConfig.IO_SORT_MB, "25");
  FileInputFormat.addInputPath(job, inputPath);
  FileOutputFormat.setOutputPath(job, outputPath);

  boolean ret = job.waitForCompletion(true);
  assertTrue("job failed", ret);

  // This job should have done *some* gc work.
  // It had to clean up 400,000 objects.
  // We strongly suspect this will result in a few milliseconds effort.
  Counter gcCounter = job.getCounters().findCounter(
      TaskCounter.GC_TIME_MILLIS);
  assertNotNull(gcCounter);
  assertTrue("No time spent in gc", gcCounter.getValue() > 0);
}
 
Example 59
Project: LDA   File: InitDriver.java
public static void run(Configuration conf, Path[] inputPath, Path outputPath) throws IOException, ClassNotFoundException, InterruptedException {
    String jobName = "init matrix";
    Job job = new Job(conf, jobName);

    job.setMapOutputKeyClass(twoDimensionIndexWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(twoDimensionIndexWritable.class);
    job.setOutputValueClass(Text.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapperClass(InitMapper.class);
    job.setReducerClass(InitReducer.class);
    job.setNumReduceTasks(1);

    for (Path path : inputPath) {
        FileInputFormat.addInputPath(job, path);
    }
    Path output = new Path(outputPath, "initDir");
    FileOutputFormat.setOutputPath(job, output);

    job.setJarByClass(LDADriver.class);
    if (!job.waitForCompletion(true)) {
        throw new InterruptedException("Init failed");
    }
}