Java Code Examples for org.apache.hadoop.mapreduce.lib.output.FileOutputFormat

The following examples show how to use org.apache.hadoop.mapreduce.lib.output.FileOutputFormat. They are extracted from open source projects; where available, the source project, source file, and license are noted above each example.
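Most of the examples below follow the same core pattern: create a Job, configure the mapper, reducer, and key/value classes, point FileInputFormat at the inputs, and hand FileOutputFormat the output directory. As a minimal, hedged sketch of just the FileOutputFormat side (class and path names are placeholders; note that the output directory must not exist before the job runs):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class OutputPathSketch {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "output-path-sketch");
    // Set the directory where part files will be written; FileOutputFormat
    // fails the job if this directory already exists.
    FileOutputFormat.setOutputPath(job, new Path(args[0]));
    // Optionally compress the output with gzip.
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
  }
}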
Example 1
Source Project: hiped2   Source File: Main.java    License: Apache License 2.0
public static void runSortJob(Configuration conf, Path input, Path outputPath)
    throws Exception {

  Job job = new Job(conf);
  job.setJarByClass(Main.class);
  job.setMapperClass(SortMapReduce.Map.class);
  job.setReducerClass(SortMapReduce.Reduce.class);

  job.setInputFormatClass(KeyValueTextInputFormat.class);

  job.setMapOutputKeyClass(Person.class);
  job.setMapOutputValueClass(Person.class);

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  job.setPartitionerClass(PersonNamePartitioner.class);
  job.setSortComparatorClass(PersonComparator.class);
  job.setGroupingComparatorClass(PersonNameComparator.class);

  FileInputFormat.setInputPaths(job, input);
  FileOutputFormat.setOutputPath(job, outputPath);

  job.waitForCompletion(true);
}
 
Example 2
/**
 * Refer to MRCompactorAvroKeyDedupJobRunner#configureInputAndOutputPaths(Job).
 *
 * @return true if no valid input paths are present for the MR job to process, where a path is
 * valid if it is a directory containing one or more files.
 */
protected boolean configureInputAndOutputPaths(Job job, FileSystemDataset dataset) throws IOException {
  boolean emptyDirectoryFlag = false;

  String mrOutputBase = this.state.getProp(MRCompactor.COMPACTION_JOB_DIR);
  CompactionPathParser parser = new CompactionPathParser(this.state);
  CompactionPathParser.CompactionParserResult rst = parser.parse(dataset);
  this.mrOutputPath = concatPaths(mrOutputBase, rst.getDatasetName(), rst.getDstSubDir(), rst.getTimeString());

  log.info("Cleaning temporary MR output directory: " + mrOutputPath);
  this.fs.delete(mrOutputPath, true);

  this.mapReduceInputPaths = getGranularInputPaths(dataset.datasetRoot());
  if (this.mapReduceInputPaths.isEmpty()) {
    this.mapReduceInputPaths.add(dataset.datasetRoot());
    emptyDirectoryFlag = true;
  }
  this.oldFiles = new HashSet<>();
  for (Path path : mapReduceInputPaths) {
    oldFiles.add(this.fs.makeQualified(path).toString());
    FileInputFormat.addInputPath(job, path);
  }

  FileOutputFormat.setOutputPath(job, mrOutputPath);
  return emptyDirectoryFlag;
}
 
Example 3
Source Project: secure-data-service   Source File: BigDiffHadoop.java    License: Apache License 2.0
public void execute(String inputPath1, String inputPath2, String outputPath) throws Exception {
    Configuration conf = new Configuration();

    Job job = new Job(conf, "bigdiff");

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.addInputPath(job, new Path(inputPath1));
    FileInputFormat.addInputPath(job, new Path(inputPath2));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.waitForCompletion(true);
}
 
Example 4
Source Project: big-c   Source File: TestJoinDatamerge.java    License: Apache License 2.0
private static void checkOuterConsistency(Job job, Path[] src) 
    throws IOException {
  Path outf = FileOutputFormat.getOutputPath(job);
  FileStatus[] outlist = cluster.getFileSystem().listStatus(outf, new 
                           Utils.OutputFileUtils.OutputFilesFilter());
  assertEquals("number of part files is more than 1. It is" + outlist.length,
    1, outlist.length);
  assertTrue("output file with zero length" + outlist[0].getLen(),
    0 < outlist[0].getLen());
  SequenceFile.Reader r =
    new SequenceFile.Reader(cluster.getFileSystem(),
        outlist[0].getPath(), job.getConfiguration());
  IntWritable k = new IntWritable();
  IntWritable v = new IntWritable();
  while (r.next(k, v)) {
    assertEquals("counts does not match", v.get(),
      countProduct(k, src, job.getConfiguration()));
  }
  r.close();
}
 
Example 5
Source Project: parquet-flinktacular   Source File: ParquetAvroExample.java    License: Apache License 2.0
public static void writeAvro(DataSet<Tuple2<Void, Person>> data, String outputPath) throws IOException {
	// Set up the Hadoop job
	Job job = Job.getInstance();

	// Set up Hadoop Output Format
	HadoopOutputFormat hadoopOutputFormat = new HadoopOutputFormat(new AvroParquetOutputFormat(), job);

	FileOutputFormat.setOutputPath(job, new Path(outputPath));

	AvroParquetOutputFormat.setSchema(job, Person.getClassSchema());
	ParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
	ParquetOutputFormat.setEnableDictionary(job, true);

	// Output & Execute
	data.output(hadoopOutputFormat);
}
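One thing worth noting: in Flink's DataSet API, data.output(...) only registers the sink; nothing is written until the surrounding program calls execute() on its ExecutionEnvironment. A hedged sketch of the calling side follows; readAvro is an assumed helper for building the DataSet and is not shown in the original source:

// Hypothetical driver around writeAvro(...).
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
DataSet<Tuple2<Void, Person>> people = readAvro(env, "file:///tmp/people-in"); // assumed helper
writeAvro(people, "file:///tmp/people-out");
env.execute("ParquetAvroExample");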
 
Example 6
Source Project: bigdata-tutorial   Source File: MapperInputSplitInfo.java    License: Apache License 2.0
public static void main(String[] args) throws Exception {
	Configuration conf = new Configuration();
	String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
	if (otherArgs.length != 2) {
		System.err.println("Usage: MapperInputSplitInfo <in> <out>");
		System.exit(2);
	}
	Job job = Job.getInstance(conf, MapperInputSplitInfo.class.getSimpleName());
	job.setJarByClass(MapperInputSplitInfo.class);
	job.setMapperClass(MyMapper.class);
	job.setCombinerClass(MyReducer.class);
	job.setReducerClass(MyReducer.class);
	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(IntWritable.class);
	FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
	FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
	System.exit(job.waitForCompletion(true) ? 0 : 1);
}
 
Example 7
/**
 * Gets a configured instance of the stored {@link FileOutputFormat} in the configuration.
 *
 * @param conf the configuration to reference the keys from.
 * @return a configured instance of the stored {@link FileOutputFormat} in the configuration.
 * @throws IOException if there's an issue getting an instance of a FileOutputFormat from the
 *     configuration.
 */
@SuppressWarnings("rawtypes")
public static FileOutputFormat getFileOutputFormat(Configuration conf) throws IOException {
  // Ensure the BigQuery output information is valid.
  getMandatoryConfig(conf, OUTPUT_FORMAT_CLASS);

  Class<?> confClass = OUTPUT_FORMAT_CLASS.get(conf, conf::getClass);

  // Fail if the default value was used, or the class isn't a FileOutputFormat.
  if (confClass == null) {
    throw new IOException(
        "Unable to resolve value for the configuration key '"
            + OUTPUT_FORMAT_CLASS.getKey()
            + "'.");
  } else if (!FileOutputFormat.class.isAssignableFrom(confClass)) {
    throw new IOException("The class " + confClass.getName() + " is not a FileOutputFormat.");
  }

  Class<? extends FileOutputFormat> fileOutputClass =
      confClass.asSubclass(FileOutputFormat.class);

  // Create a new instance and configure it if it's configurable.
  return ReflectionUtils.newInstance(fileOutputClass, conf);
}
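A hedged sketch of calling the helper above. The real configuration key is whatever OUTPUT_FORMAT_CLASS.getKey() returns in the source project; "bigquery.output.format.class" below is a stand-in for illustration only:

// Hypothetical caller; TextOutputFormat stands in for any FileOutputFormat subclass.
Configuration conf = new Configuration();
conf.setClass("bigquery.output.format.class",   // stand-in for OUTPUT_FORMAT_CLASS.getKey()
    TextOutputFormat.class, FileOutputFormat.class);
FileOutputFormat format = getFileOutputFormat(conf);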
 
Example 8
Source Project: Cubert   Source File: JobExecutor.java    License: Apache License 2.0
protected void setOutput() throws IOException
{
    JsonNode output = get(root, "output");
    JsonNode params = output.get("params");
    if (params == null)
        params = mapper.createObjectNode();

    Path outputPath = new Path(getText(output, "path"));
    FileOutputFormat.setOutputPath(job, outputPath);

    if (params.has("overwrite") && Boolean.parseBoolean(getText(params, "overwrite")))
    {
        fs.delete(outputPath, true);
    }

    BlockSchema schema = new BlockSchema(output.get("schema"));

    Storage storage = StorageFactory.get(getText(output, "type"));
    storage.prepareOutput(job, conf, params, schema, outputPath);
}
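Working backwards from the getters above, the "output" JSON node this method consumes looks roughly like {"path": "...", "type": "...", "schema": [...], "params": {"overwrite": "true"}}; the exact schema representation is defined by Cubert's BlockSchema and is inferred here from the code, not taken from project documentation.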
 
Example 9
Source Project: wifi   Source File: WordCount.java    License: Apache License 2.0
public static void main(String[] args) throws Exception {
	Configuration conf = new Configuration();
	String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
	if (otherArgs.length != 2) {
		System.out.println("Usage: wordcount <in> <out>");
		System.exit(2);
	}
	Job job = new Job(conf, "word count");
	job.setJarByClass(WordCount.class);
	job.setMapperClass(TokenizerMapper.class);
	job.setCombinerClass(IntSumReducer.class);
	job.setReducerClass(IntSumReducer.class);
	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(IntWritable.class);
	FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
	FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
	System.exit(job.waitForCompletion(true) ? 0 : 1);
}
 
Example 10
Source Project: RDFS   Source File: GridmixJob.java    License: Apache License 2.0
public Job call() throws IOException, InterruptedException,
                         ClassNotFoundException {
  job.setMapperClass(GridmixMapper.class);
  job.setReducerClass(GridmixReducer.class);
  job.setNumReduceTasks(jobdesc.getNumberReduces());
  job.setMapOutputKeyClass(GridmixKey.class);
  job.setMapOutputValueClass(GridmixRecord.class);
  job.setSortComparatorClass(GridmixKey.Comparator.class);
  job.setGroupingComparatorClass(SpecGroupingComparator.class);
  job.setInputFormatClass(GridmixInputFormat.class);
  job.setOutputFormatClass(RawBytesOutputFormat.class);
  job.setPartitionerClass(DraftPartitioner.class);
  job.setJarByClass(GridmixJob.class);
  job.getConfiguration().setInt("gridmix.job.seq", seq);
  job.getConfiguration().set(ORIGNAME, null == jobdesc.getJobID()
      ? "<unknown>" : jobdesc.getJobID().toString());
  job.getConfiguration().setBoolean("mapred.used.genericoptionsparser", true);
  FileInputFormat.addInputPath(job, new Path("ignored"));
  FileOutputFormat.setOutputPath(job, outdir);
  job.submit();
  return job;
}
 
Example 11
Source Project: bigdata-tutorial   Source File: WordCount.java    License: Apache License 2.0
public static void main(String[] args) throws Exception {
	Configuration conf = new Configuration();
	String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
	if (otherArgs.length != 2) {
		System.err.println("Usage: wordcount <in> <out>");
		System.exit(2);
	}
	Job job = new Job(conf, "word count");
	job.setJarByClass(WordCount.class);
	job.setMapperClass(TokenizerMapper.class);
	job.setCombinerClass(IntSumReducer.class);
	job.setReducerClass(IntSumReducer.class);
	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(IntWritable.class);
	FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
	FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
	System.exit(job.waitForCompletion(true) ? 0 : 1);
}
 
Example 12
Source Project: hadoop   Source File: MapReduceTestUtil.java    License: Apache License 2.0
/**
 * Creates a simple copy job.
 * 
 * @param conf Configuration object
 * @param outdir Output directory.
 * @param indirs Input directories (varargs).
 * @return Job initialized for a data copy job.
 * @throws Exception If an error occurs creating job configuration.
 */
public static Job createCopyJob(Configuration conf, Path outdir, 
    Path... indirs) throws Exception {
  conf.setInt(MRJobConfig.NUM_MAPS, 3);
  Job theJob = Job.getInstance(conf);
  theJob.setJobName("DataMoveJob");

  FileInputFormat.setInputPaths(theJob, indirs);
  theJob.setMapperClass(DataCopyMapper.class);
  FileOutputFormat.setOutputPath(theJob, outdir);
  theJob.setOutputKeyClass(Text.class);
  theJob.setOutputValueClass(Text.class);
  theJob.setReducerClass(DataCopyReducer.class);
  theJob.setNumReduceTasks(1);
  return theJob;
}
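Given the signature above, a hedged sketch of driving this helper from a test (all paths are placeholders):

Configuration conf = new Configuration();
// createCopyJob takes the output directory first, then input directories as varargs.
Job copy = MapReduceTestUtil.createCopyJob(conf,
    new Path("/tmp/copy-out"),
    new Path("/tmp/in1"), new Path("/tmp/in2"));
boolean ok = copy.waitForCompletion(true);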
 
Example 13
Source Project: MLHadoop   Source File: MatMulDriver.java    License: Apache License 2.0
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
	Configuration conf = new Configuration();
	// A is an m-by-n matrix; B is an n-by-p matrix.
	conf.set("m", args[0]);
	conf.set("n", args[1]);
	conf.set("p", args[2]);
	Job job = new Job(conf, "Matrix_Multiplication");
	job.setJarByClass(MatMulDriver.class);
	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(Text.class);
	job.setMapperClass(MatMulMap.class);
	// Don't use a combiner here: there is no scope for combining the map output, and the job can get stuck.
	//job.setCombinerClass(MatMulModGenReduce.class);
	job.setReducerClass(MatMulReduce.class);
	//args[3] is the input path.
	FileInputFormat.addInputPath(job, new Path(args[3]));
	//args[4] is the output path.
	FileOutputFormat.setOutputPath(job, new Path(args[4]));
	System.exit(job.waitForCompletion(true)?0:1);
}
 
Example 14
Source Project: hbase   Source File: HashTable.java    License: Apache License 2.0
public Job createSubmittableJob(String[] args) throws IOException {
  Path partitionsPath = new Path(destPath, PARTITIONS_FILE_NAME);
  generatePartitions(partitionsPath);

  Job job = Job.getInstance(getConf(),
        getConf().get("mapreduce.job.name", "hashTable_" + tableHash.tableName));
  Configuration jobConf = job.getConfiguration();
  jobConf.setLong(HASH_BATCH_SIZE_CONF_KEY, tableHash.batchSize);
  jobConf.setBoolean(IGNORE_TIMESTAMPS, tableHash.ignoreTimestamps);
  job.setJarByClass(HashTable.class);

  TableMapReduceUtil.initTableMapperJob(tableHash.tableName, tableHash.initScan(),
      HashMapper.class, ImmutableBytesWritable.class, ImmutableBytesWritable.class, job);

  // use a TotalOrderPartitioner and reducers to group region output into hash files
  job.setPartitionerClass(TotalOrderPartitioner.class);
  TotalOrderPartitioner.setPartitionFile(jobConf, partitionsPath);
  job.setReducerClass(Reducer.class);  // identity reducer
  job.setNumReduceTasks(tableHash.numHashFiles);
  job.setOutputKeyClass(ImmutableBytesWritable.class);
  job.setOutputValueClass(ImmutableBytesWritable.class);
  job.setOutputFormatClass(MapFileOutputFormat.class);
  FileOutputFormat.setOutputPath(job, new Path(destPath, HASH_DATA_DIR));

  return job;
}
 
Example 15
Source Project: Hadoop-BAM   Source File: TestCRAMOutputFormat.java    License: MIT License
private Path doMapReduce(final String inputFile) throws Exception {
    final FileSystem fileSystem = FileSystem.get(conf);
    final Path inputPath = new Path(inputFile);
    final Path outputPath = fileSystem.makeQualified(new Path("target/out"));
    fileSystem.delete(outputPath, true);

    final Job job = Job.getInstance(conf);
    FileInputFormat.setInputPaths(job, inputPath);

    job.setInputFormatClass(CRAMInputFormat.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(SAMRecordWritable.class);

    conf.set(CRAMTestNoHeaderOutputFormat.READ_HEADER_FROM_FILE, inputFile);
    job.setOutputFormatClass(CRAMTestNoHeaderOutputFormat.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(SAMRecordWritable.class);

    job.setNumReduceTasks(0);
    FileOutputFormat.setOutputPath(job, outputPath);

    final boolean success = job.waitForCompletion(true);
    assertTrue(success);

    return outputPath;
}
 
Example 16
Source Project: hbase   Source File: IntegrationTestLoadAndVerify.java    License: Apache License 2.0
protected Job doLoad(Configuration conf, TableDescriptor tableDescriptor) throws Exception {
  Path outputDir = getTestDir(TEST_NAME, "load-output");
  LOG.info("Load output dir: " + outputDir);

  NMapInputFormat.setNumMapTasks(conf, conf.getInt(NUM_MAP_TASKS_KEY, NUM_MAP_TASKS_DEFAULT));
  conf.set(TABLE_NAME_KEY, tableDescriptor.getTableName().getNameAsString());

  Job job = Job.getInstance(conf);
  job.setJobName(TEST_NAME + " Load for " + tableDescriptor.getTableName());
  job.setJarByClass(this.getClass());
  setMapperClass(job);
  job.setInputFormatClass(NMapInputFormat.class);
  job.setNumReduceTasks(0);
  setJobScannerConf(job);
  FileOutputFormat.setOutputPath(job, outputDir);

  TableMapReduceUtil.addDependencyJars(job);

  TableMapReduceUtil.addDependencyJarsForClasses(job.getConfiguration(), AbstractHBaseTool.class);
  TableMapReduceUtil.initCredentials(job);
  assertTrue(job.waitForCompletion(true));
  return job;
}
 
Example 17
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        System.err.println("Usage: ElemValueCooccurrencesTest configFile outputDir");
        System.exit(2);
    }

    Job job = Job.getInstance(conf);
    job.setJarByClass(ElemValueCooccurrencesTest.class);
    job.setInputFormatClass(ValueInputFormat.class);
    job.setMapperClass(ElemCooccurrencesMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    conf = job.getConfiguration();
    conf.addResource(otherArgs[0]);
    conf.setClass(MarkLogicConstants.INPUT_VALUE_CLASS, Text.class, 
            Writable.class);
    conf.setClass(MarkLogicConstants.INPUT_LEXICON_FUNCTION_CLASS, 
        ElemValueCooccurrencesFunction.class, ElemValueCooccurrences.class);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
 
Example 18
Source Project: flink-perf   Source File: KMeansDriver.java    License: Apache License 2.0
public static void initializeCenters (Configuration conf, FileSystem fs, String pointsPath, String seqFilePath) throws Exception {
	Path points = new Path (pointsPath);
	Path seqFile = new Path (seqFilePath);
	if (fs.exists(seqFile)) {
		fs.delete(seqFile, true);
	}
	Job job = Job.getInstance(conf);
	job.setMapperClass(CenterInitializer.class);
	job.setReducerClass(Reducer.class);
	job.setNumReduceTasks(0);
	job.setMapOutputKeyClass(Centroid.class);
	job.setMapOutputValueClass(Point.class);
	job.setOutputKeyClass(Centroid.class);
	job.setOutputValueClass(Point.class);
	job.setOutputFormatClass(SequenceFileOutputFormat.class);
	job.setInputFormatClass(TextInputFormat.class);
	FileInputFormat.addInputPath(job, new Path(pointsPath));
	FileOutputFormat.setOutputPath(job, seqFile);
	job.waitForCompletion(true);
}
 
Example 19
Source Project: MapReduce-Demo   Source File: Step6.java    License: MIT License
public static boolean run(Configuration config, Map<String, String> paths) 
		throws IOException, ClassNotFoundException, InterruptedException {
	String jobName = "step6";
	Job job = Job.getInstance(config, jobName);
	job.setJarByClass(Step6.class);
	job.setJar("export\\ItemCF.jar");
	job.setMapperClass(Step6_Mapper.class);
	job.setReducerClass(Step6_Reducer.class);		
	job.setMapOutputKeyClass(PairWritable.class);
	job.setMapOutputValueClass(Text.class);
	//job.setSortComparatorClass(ScoreSort.class);		// custom sort comparator
	job.setGroupingComparatorClass(UserGroup.class);	// custom grouping comparator
	
	Path inPath = new Path(paths.get("Step6Input"));
	Path outpath = new Path(paths.get("Step6Output"));
	FileInputFormat.addInputPath(job, inPath);
	FileOutputFormat.setOutputPath(job, outpath);		
	FileSystem fs = FileSystem.get(config);
	if (fs.exists(outpath)) {
		fs.delete(outpath, true);
	}
	
	return job.waitForCompletion(true);		
}
 
Example 20
Source Project: bigdata-tutorial   Source File: XflowDstIPCount.java    License: Apache License 2.0
public static void main(String[] args) throws Exception {
	String[] otherArgs = new GenericOptionsParser(args).getRemainingArgs();
	if (otherArgs.length != 2) {
		System.err.println("Usage: xflowdstipcount <in> <out>");
		System.exit(2);
	}
	Job job = Job.getInstance();
	job.setJobName("xflow dstip count");
	job.setJarByClass(XflowDstIPCount.class);
	job.setMapperClass(ParesDstIPMapper.class);
	job.setCombinerClass(IntSumReducer.class);
	job.setReducerClass(IntSumReducer.class);
	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(IntWritable.class);
	FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
	FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
	System.exit(job.waitForCompletion(true) ? 0 : 1);
}
 
Example 21
Source Project: MLHadoop   Source File: NBCDriver.java    License: Apache License 2.0
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
	Configuration conf = new Configuration();
	// The test input for which you want to find the activity that the person should be doing
	conf.set("test_input", args[0]);
	Job job = new Job(conf);
	job.setJarByClass(NBCDriver.class);
	job.setJobName("Naive_Bayes_calssifier using Hadoop");
	FileInputFormat.setInputPaths(job, new Path(args[1]));
	FileOutputFormat.setOutputPath(job, new Path(args[2]));
	job.setMapperClass(NBCMap.class);
	job.setReducerClass(NBCReduce.class);
	job.setMapOutputKeyClass(IntWritable.class);
	job.setMapOutputValueClass(Text.class);
	job.setOutputKeyClass(IntWritable.class);
	job.setOutputValueClass(Text.class);
	boolean success = job.waitForCompletion(true);
	System.exit(success ? 0 : 1);
}
 
Example 22
Source Project: flink-perf   Source File: PageRankDriver.java    License: Apache License 2.0
public static void calculateNextRanks (Configuration conf, FileSystem fs, String inputPath, String outputPath) throws Exception {
	Path outFile = new Path (outputPath);
	if (fs.exists(outFile)) {
		fs.delete(outFile, true);
	}
	Job job = Job.getInstance(conf);
	job.setJarByClass(PageRankMapper.class);
	job.setMapperClass(PageRankMapper.class);
	job.setReducerClass(PageRankReducer.class);
	job.setMapOutputKeyClass(LongWritable.class);
	job.setMapOutputValueClass(Message.class);
	job.setOutputKeyClass(LongWritable.class);
	job.setOutputValueClass(Message.class);
	job.setInputFormatClass(SequenceFileInputFormat.class);
	job.setOutputFormatClass(SequenceFileOutputFormat.class);
	FileInputFormat.addInputPath(job, new Path(inputPath));
	FileOutputFormat.setOutputPath(job, outFile);
	job.waitForCompletion(true);
}
 
Example 23
Source Project: yuzhouwan   Source File: PatentMainController.java    License: Apache License 2.0
/**
 * The main method wires up the Patent job pipeline: Mapper -> Combiner -> Reducer -> Partitioner.
 *
 * @param args command-line arguments: input path and output path
 * @throws Exception if the job fails
 */
public static void main(String[] args) throws Exception {

    // Configure and initialize the Job
    Job job = Job.getInstance(new Configuration());

    // Set the job's main class
    job.setJarByClass(PatentMainController.class);

    // Configure the job's Mapper
    job.setMapperClass(PatentMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    // Set the job's input path
    FileInputFormat.setInputPaths(job, new Path(args[0]));

    // Configure the job's Combiner
    job.setCombinerClass(InverseIndexByKeywordCombiner.class);

    // Configure the job's Reducer
    job.setReducerClass(PatentReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    // Set the job's output path
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setPartitionerClass(PatentPartitioner.class);
    // Set the number of reduce tasks
    job.setNumReduceTasks(ConfUtil.getMax());

    // Submit the job and wait for it to finish
    job.waitForCompletion(true);
}
 
Example 24
Source Project: RDFS   Source File: SecondarySort.java    License: Apache License 2.0
public static void main(String[] args) throws Exception {
  Configuration conf = new Configuration();
  String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
  if (otherArgs.length != 2) {
    System.err.println("Usage: secondarysrot <in> <out>");
    System.exit(2);
  }
  Job job = new Job(conf, "secondary sort");
  job.setJarByClass(SecondarySort.class);
  job.setMapperClass(MapClass.class);
  job.setReducerClass(Reduce.class);

  // group and partition by the first int in the pair
  job.setPartitionerClass(FirstPartitioner.class);
  job.setGroupingComparatorClass(FirstGroupingComparator.class);

  // the map output is IntPair, IntWritable
  job.setMapOutputKeyClass(IntPair.class);
  job.setMapOutputValueClass(IntWritable.class);

  // the reduce output is Text, IntWritable
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  
  FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
  FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
  System.exit(job.waitForCompletion(true) ? 0 : 1);
}
 
Example 25
Source Project: hadoop   Source File: TestLocalModeWithNewApis.java    License: Apache License 2.0
@Test
public void testNewApis() throws Exception {
  Random r = new Random(System.currentTimeMillis());
  Path tmpBaseDir = new Path("/tmp/wc-" + r.nextInt());
  final Path inDir = new Path(tmpBaseDir, "input");
  final Path outDir = new Path(tmpBaseDir, "output");
  String input = "The quick brown fox\nhas many silly\nred fox sox\n";
  FileSystem inFs = inDir.getFileSystem(conf);
  FileSystem outFs = outDir.getFileSystem(conf);
  outFs.delete(outDir, true);
  if (!inFs.mkdirs(inDir)) {
    throw new IOException("Mkdirs failed to create " + inDir.toString());
  }
  {
    DataOutputStream file = inFs.create(new Path(inDir, "part-0"));
    file.writeBytes(input);
    file.close();
  }

  Job job = Job.getInstance(conf, "word count");
  job.setJarByClass(TestLocalModeWithNewApis.class);
  job.setMapperClass(TokenizerMapper.class);
  job.setCombinerClass(IntSumReducer.class);
  job.setReducerClass(IntSumReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  FileInputFormat.addInputPath(job, inDir);
  FileOutputFormat.setOutputPath(job, outDir);
  assertEquals(true, job.waitForCompletion(true));

  String output = readOutput(outDir, conf);
  assertEquals("The\t1\nbrown\t1\nfox\t2\nhas\t1\nmany\t1\n" +
               "quick\t1\nred\t1\nsilly\t1\nsox\t1\n", output);
  
  outFs.delete(tmpBaseDir, true);
}
 
Example 26
Source Project: dkpro-c4corpus   Source File: ConfigurationHelper.java    License: Apache License 2.0
/**
 * Configures a WARC-in, WARC-out MapReduce job.
 *
 * @param job                      job instance
 * @param jarByClass               class used to locate the job jar
 * @param mapperClass              mapper
 * @param reducerClass             reducer
 * @param commaSeparatedInputFiles comma-separated input paths
 * @param outputPath               output path
 * @throws IOException I/O exception
 */
public static void configureJob(Job job, Class<?> jarByClass,
        Class<? extends Mapper> mapperClass, Class<? extends Reducer> reducerClass,
        String commaSeparatedInputFiles, String outputPath)
        throws IOException
{
    job.setJarByClass(jarByClass);
    job.setJobName(jarByClass.getName());

    // mapper
    job.setMapperClass(mapperClass);

    // reducer
    job.setReducerClass(reducerClass);

    // input-output is warc
    job.setInputFormatClass(WARCInputFormat.class);
    // prevent producing empty files
    LazyOutputFormat.setOutputFormatClass(job, WARCOutputFormat.class);

    // intermediate data
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(WARCWritable.class);

    // output data
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(WARCWritable.class);

    // set output compression to GZip
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
}
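A hedged sketch of a driver calling this configurator; MyDriver, MyWarcMapper, MyWarcReducer, and the paths are hypothetical names, not classes from the original project:

// Hypothetical driver for configureJob(...).
Job job = Job.getInstance(new Configuration());
ConfigurationHelper.configureJob(job, MyDriver.class,
    MyWarcMapper.class, MyWarcReducer.class,
    "/data/warc/part1,/data/warc/part2", "/data/warc-out");
System.exit(job.waitForCompletion(true) ? 0 : 1);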
 
Example 27
Source Project: pravega-samples   Source File: TeraStreamValidate.java    License: Apache License 2.0
public int run(String[] args) throws Exception {
  if (args.length != 5) {
    usage();
    return 2;
  }
  LOG.info("starting");
  Path inputDir = new Path(args[0]);
  Path outputDir = new Path(args[1]);
  getConf().setStrings(INPUT_URI_STRING, args[2]);
  getConf().setStrings(INPUT_SCOPE_NAME, args[3]);
  getConf().setStrings(INPUT_STREAM_NAME, args[4]);
  getConf().setStrings(INPUT_DESERIALIZER, TextSerializer.class.getName());

  getConf().setInt(MRJobConfig.NUM_MAPS, 1);
  Job job = Job.getInstance(getConf());

  TeraInputFormat.setInputPaths(job, inputDir);
  FileOutputFormat.setOutputPath(job, outputDir);

  job.setJobName("TeraStreamValidate");
  job.setJarByClass(TeraStreamValidate.class);

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setMapperClass(TeraSortMapper.class);
  job.setNumReduceTasks(1);

  job.setInputFormatClass(PravegaInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  int ret = job.waitForCompletion(true) ? 0 : 1;
  LOG.info("done");
  return ret;
}
 
Example 28
Source Project: hadoop   Source File: TestLocalRunner.java    License: Apache License 2.0
/**
 * Run a test which creates a SequenceMapper / IdentityReducer
 * job over a set of generated number files.
 */
private void doMultiReducerTest(int numMaps, int numReduces,
    int parallelMaps, int parallelReduces) throws Exception {

  Path in = getNumberDirPath();
  Path out = getOutputPath();

  // Clear data from any previous tests.
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.getLocal(conf);
  if (fs.exists(out)) {
    fs.delete(out, true);
  }

  if (fs.exists(in)) {
    fs.delete(in, true);
  }

  for (int i = 0; i < numMaps; i++) {
    makeNumberFile(i, 100);
  }

  Job job = Job.getInstance();
  job.setNumReduceTasks(numReduces);

  job.setMapperClass(SequenceMapper.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(NullWritable.class);
  FileInputFormat.addInputPath(job, in);
  FileOutputFormat.setOutputPath(job, out);

  LocalJobRunner.setLocalMaxRunningMaps(job, parallelMaps);
  LocalJobRunner.setLocalMaxRunningReduces(job, parallelReduces);

  boolean result = job.waitForCompletion(true);
  assertTrue("Job failed!!", result);

  verifyNumberJob(numMaps);
}
 
Example 29
Source Project: hiped2   Source File: AvroParquetMapReduce.java    License: Apache License 2.0
/**
 * The MapReduce driver - setup and launch the job.
 *
 * @param args the command-line arguments
 * @return the process exit code
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {

  Cli cli = Cli.builder().setArgs(args).addOptions(CliCommonOpts.MrIoOpts.values()).build();
  int result = cli.runCmd();

  if (result != 0) {
    return result;
  }

  Path inputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.INPUT));
  Path outputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.OUTPUT));

  Configuration conf = super.getConf();

  Job job = new Job(conf);
  job.setJarByClass(AvroParquetMapReduce.class);

  job.setInputFormatClass(AvroParquetInputFormat.class);
  AvroParquetInputFormat.setInputPaths(job, inputPath);

  job.setMapperClass(Map.class);
  job.setReducerClass(Reduce.class);

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(DoubleWritable.class);

  job.setOutputFormatClass(AvroParquetOutputFormat.class);
  FileOutputFormat.setOutputPath(job, outputPath);
  AvroParquetOutputFormat.setSchema(job, StockAvg.SCHEMA$);

  return job.waitForCompletion(true) ? 0 : 1;
}
 
Example 30
public static void run() throws IOException, ClassNotFoundException,
		InterruptedException {
	String inputPath1 = ItemBasedCFDriver.path.get("step7InputPath1");
	String inputPath2 = ItemBasedCFDriver.path.get("step7InputPath2");
	String outputPath = ItemBasedCFDriver.path.get("step7OutputPath");

	Configuration conf = new Configuration();
	conf.set("mapred.textoutputformat.separator", ",");

	Job job = Job.getInstance(conf);

	HDFS hdfs = new HDFS(conf);
	hdfs.rmr(outputPath);

	job.setMapperClass(Step1_Mapper.class);
	job.setJarByClass(IAndKMatrixMultiplicationStep1.class);

	job.setMapOutputKeyClass(Text.class);
	job.setMapOutputValueClass(Text.class);
	job.setInputFormatClass(TextInputFormat.class);
	job.setOutputFormatClass(TextOutputFormat.class);

	FileInputFormat.setInputPaths(job, new Path(inputPath1), new Path(
			inputPath2));
	FileOutputFormat.setOutputPath(job, new Path(outputPath));

	job.waitForCompletion(true);
}