org.apache.hadoop.mapreduce.lib.output.FileOutputFormat Java Examples
The following examples show how to use
org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: Main.java From hiped2 with Apache License 2.0 | 6 votes |
/**
 * Builds the sort job and blocks until it has finished running.
 *
 * <p>The boolean returned by {@code waitForCompletion} is discarded, so a job
 * that finishes unsuccessfully is not reported to the caller.
 *
 * @param conf       configuration the job is created from
 * @param input      input path, read through {@code KeyValueTextInputFormat}
 * @param outputPath path registered as the job's output location
 * @throws Exception propagated from job configuration or execution
 */
public static void runSortJob(Configuration conf, Path input, Path outputPath)
    throws Exception {
  Job sortJob = new Job(conf);
  sortJob.setJarByClass(Main.class);

  // Input side.
  sortJob.setInputFormatClass(KeyValueTextInputFormat.class);
  FileInputFormat.setInputPaths(sortJob, input);

  // Map stage and its intermediate key/value types.
  sortJob.setMapperClass(SortMapReduce.Map.class);
  sortJob.setMapOutputKeyClass(Person.class);
  sortJob.setMapOutputValueClass(Person.class);

  // Partitioner, sort comparator and grouping comparator applied between map and reduce.
  sortJob.setPartitionerClass(PersonNamePartitioner.class);
  sortJob.setSortComparatorClass(PersonComparator.class);
  sortJob.setGroupingComparatorClass(PersonNameComparator.class);

  // Reduce stage and final output.
  sortJob.setReducerClass(SortMapReduce.Reduce.class);
  sortJob.setOutputKeyClass(Text.class);
  sortJob.setOutputValueClass(Text.class);
  FileOutputFormat.setOutputPath(sortJob, outputPath);

  sortJob.waitForCompletion(true);
}
Example #2
Source File: CompactionJobConfigurator.java From incubator-gobblin with Apache License 2.0 | 6 votes |
/**
 * Registers the input paths and the output path of the compaction MR job for a dataset.
 * Refer to MRCompactorAvroKeyDedupJobRunner#configureInputAndOutputPaths(Job).
 *
 * <p>The output path is assembled from the {@code MRCompactor.COMPACTION_JOB_DIR} property
 * and the dataset name, destination sub-directory and time string produced by
 * {@code CompactionPathParser}; whatever already exists at that path is deleted first.
 * Every input path is also recorded, fully qualified, in {@code oldFiles}.
 *
 * @param job     the job whose input and output paths are set
 * @param dataset the dataset being compacted
 * @return {@code true} if {@code getGranularInputPaths} returned no paths and the dataset
 *         root was therefore used as the only input path; {@code false} otherwise
 * @throws IOException propagated from the file-system and path-lookup calls
 */
protected boolean configureInputAndOutputPaths(Job job, FileSystemDataset dataset)
    throws IOException {
  String mrOutputBase = this.state.getProp(MRCompactor.COMPACTION_JOB_DIR);
  CompactionPathParser pathParser = new CompactionPathParser(this.state);
  CompactionPathParser.CompactionParserResult parsed = pathParser.parse(dataset);

  this.mrOutputPath = concatPaths(
      mrOutputBase, parsed.getDatasetName(), parsed.getDstSubDir(), parsed.getTimeString());

  log.info("Cleaning temporary MR output directory: " + mrOutputPath);
  this.fs.delete(mrOutputPath, true);

  this.mapReduceInputPaths = getGranularInputPaths(dataset.datasetRoot());
  boolean usedDatasetRootFallback = this.mapReduceInputPaths.isEmpty();
  if (usedDatasetRootFallback) {
    // No granular paths were found: fall back to the dataset root itself.
    this.mapReduceInputPaths.add(dataset.datasetRoot());
  }

  this.oldFiles = new HashSet<>();
  for (Path inputPath : mapReduceInputPaths) {
    oldFiles.add(this.fs.makeQualified(inputPath).toString());
    FileInputFormat.addInputPath(job, inputPath);
  }

  FileOutputFormat.setOutputPath(job, mrOutputPath);
  return usedDatasetRootFallback;
}
Example #3
Source File: BigDiffHadoop.java From secure-data-service with Apache License 2.0 | 6 votes |
/**
 * Runs the "bigdiff" job over two inputs and blocks until it finishes.
 *
 * <p>Both inputs are read with {@code TextInputFormat}; output is written with
 * {@code TextOutputFormat}. The job's completion status is discarded.
 *
 * @param inputPath1 first input path
 * @param inputPath2 second input path
 * @param outputPath output path for the job
 * @throws Exception propagated from job configuration or execution
 */
public void execute(String inputPath1, String inputPath2, String outputPath)
    throws Exception {
  Job diffJob = new Job(new Configuration(), "bigdiff");

  // Input: both paths, in the order given.
  diffJob.setInputFormatClass(TextInputFormat.class);
  FileInputFormat.addInputPath(diffJob, new Path(inputPath1));
  FileInputFormat.addInputPath(diffJob, new Path(inputPath2));

  // Processing stages.
  diffJob.setMapperClass(Map.class);
  diffJob.setReducerClass(Reduce.class);

  // Output: text key/value pairs.
  diffJob.setOutputFormatClass(TextOutputFormat.class);
  diffJob.setOutputKeyClass(Text.class);
  diffJob.setOutputValueClass(Text.class);
  FileOutputFormat.setOutputPath(diffJob, new Path(outputPath));

  diffJob.waitForCompletion(true);
}
Example #4
Source File: TestJoinDatamerge.java From big-c with Apache License 2.0 | 6 votes |
/**
 * Verifies the output of a finished job: exactly one non-empty output file must exist,
 * and every (key, value) record in it must have a value equal to
 * {@code countProduct(key, src, conf)}.
 *
 * @param job the completed job whose output path is inspected
 * @param src the source paths passed through to {@code countProduct}
 * @throws IOException propagated from file-system or sequence-file access
 */
private static void checkOuterConsistency(Job job, Path[] src) throws IOException {
  Path outf = FileOutputFormat.getOutputPath(job);
  FileStatus[] outlist = cluster.getFileSystem().listStatus(outf,
      new Utils.OutputFileUtils.OutputFilesFilter());
  assertEquals("number of part files is more than 1. It is" + outlist.length,
      1, outlist.length);
  assertTrue("output file with zero length" + outlist[0].getLen(),
      0 < outlist[0].getLen());

  SequenceFile.Reader r = new SequenceFile.Reader(cluster.getFileSystem(),
      outlist[0].getPath(), job.getConfiguration());
  // Close the reader in a finally block so that a failing assertion inside the
  // loop does not leak the underlying file handle.
  try {
    IntWritable k = new IntWritable();
    IntWritable v = new IntWritable();
    while (r.next(k, v)) {
      assertEquals("counts does not match", v.get(),
          countProduct(k, src, job.getConfiguration()));
    }
  } finally {
    r.close();
  }
}
Example #5
Source File: ParquetAvroExample.java From parquet-flinktacular with Apache License 2.0 | 6 votes |
public static void writeAvro(DataSet<Tuple2<Void, Person>> data, String outputPath) throws IOException { // Set up the Hadoop Input Format Job job = Job.getInstance(); // Set up Hadoop Output Format HadoopOutputFormat hadoopOutputFormat = new HadoopOutputFormat(new AvroParquetOutputFormat(), job); FileOutputFormat.setOutputPath(job, new Path(outputPath)); AvroParquetOutputFormat.setSchema(job, Person.getClassSchema()); ParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY); ParquetOutputFormat.setEnableDictionary(job, true); // Output & Execute data.output(hadoopOutputFormat); }
Example #6
Source File: MapperInputSplitInfo.java From bigdata-tutorial with Apache License 2.0 | 6 votes |
/**
 * Command-line entry point: runs the MapperInputSplitInfo job.
 *
 * <p>After generic Hadoop options are stripped, exactly two arguments are expected:
 * the input path and the output path. Exits with 2 on a usage error, 0 when the
 * job succeeds and 1 when it does not.
 *
 * @param args command-line arguments
 * @throws Exception propagated from job configuration or execution
 */
public static void main(String[] args) throws Exception {
  Configuration conf = new Configuration();
  String[] remaining = new GenericOptionsParser(conf, args).getRemainingArgs();
  boolean wrongArgCount = remaining.length != 2;
  if (wrongArgCount) {
    System.err.println("Usage: MapperInputSplitInfo <in> <out>");
    System.exit(2);
  }

  Job splitInfoJob = Job.getInstance(conf, MapperInputSplitInfo.class.getSimpleName());
  splitInfoJob.setJarByClass(MapperInputSplitInfo.class);

  // The same reducer class also serves as the combiner.
  splitInfoJob.setMapperClass(MyMapper.class);
  splitInfoJob.setCombinerClass(MyReducer.class);
  splitInfoJob.setReducerClass(MyReducer.class);

  splitInfoJob.setOutputKeyClass(Text.class);
  splitInfoJob.setOutputValueClass(IntWritable.class);

  Path inputPath = new Path(remaining[0]);
  Path outputPath = new Path(remaining[1]);
  FileInputFormat.addInputPath(splitInfoJob, inputPath);
  FileOutputFormat.setOutputPath(splitInfoJob, outputPath);

  boolean succeeded = splitInfoJob.waitForCompletion(true);
  System.exit(succeeded ? 0 : 1);
}
Example #7
Source File: BigQueryOutputConfiguration.java From hadoop-connectors with Apache License 2.0 | 6 votes |
/** * Gets a configured instance of the stored {@link FileOutputFormat} in the configuration. * * @param conf the configuration to reference the keys from. * @return a configured instance of the stored {@link FileOutputFormat} in the configuration. * @throws IOException if there's an issue getting an instance of a FileOutputFormat from the * configuration. */ @SuppressWarnings("rawtypes") public static FileOutputFormat getFileOutputFormat(Configuration conf) throws IOException { // Ensure the BigQuery output information is valid. getMandatoryConfig(conf, OUTPUT_FORMAT_CLASS); Class<?> confClass = OUTPUT_FORMAT_CLASS.get(conf, conf::getClass); // Fail if the default value was used, or the class isn't a FileOutputFormat. if (confClass == null) { throw new IOException( "Unable to resolve value for the configuration key '" + OUTPUT_FORMAT_CLASS.getKey() + "'."); } else if (!FileOutputFormat.class.isAssignableFrom(confClass)) { throw new IOException("The class " + confClass.getName() + " is not a FileOutputFormat."); } Class<? extends FileOutputFormat> fileOutputClass = confClass.asSubclass(FileOutputFormat.class); // Create a new instance and configure it if it's configurable. return ReflectionUtils.newInstance(fileOutputClass, conf); }
Example #8
Source File: JobExecutor.java From Cubert with Apache License 2.0 | 6 votes |
/**
 * Configures the job's output from the "output" node of the job description.
 *
 * <p>Reads the output path, optional "params", "schema" and storage "type"; registers the
 * path on the job, deletes it first-hand when the "overwrite" param parses as true, and
 * then lets the selected {@code Storage} prepare the output.
 *
 * @throws IOException propagated from file-system access or storage preparation
 */
protected void setOutput() throws IOException {
  JsonNode outputNode = get(root, "output");

  // Fall back to an empty object so later lookups need no null checks.
  JsonNode outputParams = outputNode.get("params");
  if (outputParams == null) {
    outputParams = mapper.createObjectNode();
  }

  Path destination = new Path(getText(outputNode, "path"));
  FileOutputFormat.setOutputPath(job, destination);

  if (outputParams.has("overwrite")) {
    boolean overwrite = Boolean.parseBoolean(getText(outputParams, "overwrite"));
    if (overwrite) {
      fs.delete(destination, true);
    }
  }

  BlockSchema outputSchema = new BlockSchema(outputNode.get("schema"));
  Storage outputStorage = StorageFactory.get(getText(outputNode, "type"));
  outputStorage.prepareOutput(job, conf, outputParams, outputSchema, destination);
}
Example #9
Source File: WordCount.java From wifi with Apache License 2.0 | 6 votes |
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf,args).getRemainingArgs(); // System.out.println(otherArgs); if(otherArgs.length != 2) { System.out.println("Usage:wordcount <in> <out>"); System.exit(2); } // if(args.length != 2) { // System.out.println("param error!"); // System.exit(-1); // } Job job = new Job(conf, "word count"); job.setJarByClass(WordCount.class); job.setMapperClass(TokenizerMapper.class); job.setCombinerClass(IntSumReducer.class); job.setReducerClass(IntSumReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); System.exit(job.waitForCompletion(true) ? 0 : 1); }
Example #10
Source File: GridmixJob.java From RDFS with Apache License 2.0 | 6 votes |
/**
 * Configures the Gridmix job and submits it without waiting for completion.
 *
 * <p>The original job name stored under {@code ORIGNAME} is the job ID from the job
 * description, or {@code "<unknown>"} when the description has none. The input path is
 * set to the literal {@code "ignored"}.
 *
 * @return the submitted job
 * @throws IOException            propagated from job submission
 * @throws InterruptedException   propagated from job submission
 * @throws ClassNotFoundException propagated from job submission
 */
public Job call() throws IOException, InterruptedException, ClassNotFoundException {
  // Map / reduce stages.
  job.setMapperClass(GridmixMapper.class);
  job.setReducerClass(GridmixReducer.class);
  job.setNumReduceTasks(jobdesc.getNumberReduces());

  // Intermediate types, ordering, grouping and partitioning.
  job.setMapOutputKeyClass(GridmixKey.class);
  job.setMapOutputValueClass(GridmixRecord.class);
  job.setSortComparatorClass(GridmixKey.Comparator.class);
  job.setGroupingComparatorClass(SpecGroupingComparator.class);
  job.setPartitionerClass(DraftPartitioner.class);

  // I/O formats and jar.
  job.setInputFormatClass(GridmixInputFormat.class);
  job.setOutputFormatClass(RawBytesOutputFormat.class);
  job.setJarByClass(GridmixJob.class);

  // Bookkeeping entries written straight into the job configuration.
  job.getConfiguration().setInt("gridmix.job.seq", seq);
  String originalName;
  if (null == jobdesc.getJobID()) {
    originalName = "<unknown>";
  } else {
    originalName = jobdesc.getJobID().toString();
  }
  job.getConfiguration().set(ORIGNAME, originalName);
  job.getConfiguration().setBoolean("mapred.used.genericoptionsparser", true);

  FileInputFormat.addInputPath(job, new Path("ignored"));
  FileOutputFormat.setOutputPath(job, outdir);

  job.submit();
  return job;
}
Example #11
Source File: WordCount.java From bigdata-tutorial with Apache License 2.0 | 6 votes |
/**
 * Command-line entry point: runs the word-count job.
 *
 * <p>After generic Hadoop options are stripped, exactly two arguments are expected:
 * the input path and the output path. Exits with 2 on a usage error, 0 when the
 * job succeeds and 1 when it does not.
 *
 * @param args command-line arguments, possibly including generic Hadoop options
 * @throws Exception propagated from job configuration or execution
 */
public static void main(String[] args) throws Exception {
  Configuration conf = new Configuration();
  String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
  if (otherArgs.length != 2) {
    System.err.println("Usage: wordcount <in> <out>");
    System.exit(2);
  }

  // Use the static factory instead of the deprecated Job(Configuration, String) constructor.
  Job job = Job.getInstance(conf, "word count");
  job.setJarByClass(WordCount.class);
  job.setMapperClass(TokenizerMapper.class);
  job.setCombinerClass(IntSumReducer.class);
  job.setReducerClass(IntSumReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);

  FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
  FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

  System.exit(job.waitForCompletion(true) ? 0 : 1);
}
Example #12
Source File: MapReduceTestUtil.java From hadoop with Apache License 2.0 | 6 votes |
/**
 * Creates a simple copy job named "DataMoveJob" with a single reduce task.
 *
 * <p>Note that {@code MRJobConfig.NUM_MAPS} is set to 3 on the {@code conf} object
 * passed in, before the job is created from it.
 *
 * @param conf Configuration object; modified as described above.
 * @param outdir Output directory.
 * @param indirs One or more input directories.
 * @return Job initialized for a data copy job; it is not submitted here.
 * @throws Exception If an error occurs creating job configuration.
 */
public static Job createCopyJob(Configuration conf, Path outdir, Path... indirs)
    throws Exception {
  conf.setInt(MRJobConfig.NUM_MAPS, 3);

  Job copyJob = Job.getInstance(conf);
  copyJob.setJobName("DataMoveJob");

  // Input and output locations.
  FileInputFormat.setInputPaths(copyJob, indirs);
  FileOutputFormat.setOutputPath(copyJob, outdir);

  // Copy mapper and reducer, single reduce task.
  copyJob.setMapperClass(DataCopyMapper.class);
  copyJob.setReducerClass(DataCopyReducer.class);
  copyJob.setNumReduceTasks(1);

  copyJob.setOutputKeyClass(Text.class);
  copyJob.setOutputValueClass(Text.class);
  return copyJob;
}
Example #13
Source File: MatMulDriver.java From MLHadoop with Apache License 2.0 | 6 votes |
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { Configuration conf = new Configuration(); // A is an m-by-n matrix; B is an n-by-p matrix. conf.set("m", args[0]); conf.set("n", args[1]); conf.set("p", args[2]); Job job = new Job(conf, "Matrix_Multiplication"); job.setJarByClass(MatMulDriver.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setMapperClass(MatMulMap.class); //Don't use combiner if there is no scope of combining the output. Otherwise the job will get stuck. //job.setCombinerClass(MatMulModGenReduce.class); job.setReducerClass(MatMulReduce.class); //args[3] is the input path. FileInputFormat.addInputPath(job, new Path(args[3])); //args[4] is the output path. FileOutputFormat.setOutputPath(job, new Path(args[4])); System.exit(job.waitForCompletion(true)?0:1); }
Example #14
Source File: HashTable.java From hbase with Apache License 2.0 | 6 votes |
public Job createSubmittableJob(String[] args) throws IOException { Path partitionsPath = new Path(destPath, PARTITIONS_FILE_NAME); generatePartitions(partitionsPath); Job job = Job.getInstance(getConf(), getConf().get("mapreduce.job.name", "hashTable_" + tableHash.tableName)); Configuration jobConf = job.getConfiguration(); jobConf.setLong(HASH_BATCH_SIZE_CONF_KEY, tableHash.batchSize); jobConf.setBoolean(IGNORE_TIMESTAMPS, tableHash.ignoreTimestamps); job.setJarByClass(HashTable.class); TableMapReduceUtil.initTableMapperJob(tableHash.tableName, tableHash.initScan(), HashMapper.class, ImmutableBytesWritable.class, ImmutableBytesWritable.class, job); // use a TotalOrderPartitioner and reducers to group region output into hash files job.setPartitionerClass(TotalOrderPartitioner.class); TotalOrderPartitioner.setPartitionFile(jobConf, partitionsPath); job.setReducerClass(Reducer.class); // identity reducer job.setNumReduceTasks(tableHash.numHashFiles); job.setOutputKeyClass(ImmutableBytesWritable.class); job.setOutputValueClass(ImmutableBytesWritable.class); job.setOutputFormatClass(MapFileOutputFormat.class); FileOutputFormat.setOutputPath(job, new Path(destPath, HASH_DATA_DIR)); return job; }
Example #15
Source File: TestCRAMOutputFormat.java From Hadoop-BAM with MIT License | 6 votes |
/**
 * Runs a map-only job that reads {@code inputFile} with {@code CRAMInputFormat} and writes
 * it with {@code CRAMTestNoHeaderOutputFormat} to {@code target/out}, asserting success.
 *
 * <p>Any previous content of the output path is deleted before the job runs.
 *
 * @param inputFile path of the input file
 * @return the qualified output path of the job
 * @throws Exception propagated from file-system access or job execution
 */
private Path doMapReduce(final String inputFile) throws Exception {
  final FileSystem fileSystem = FileSystem.get(conf);
  final Path inputPath = new Path(inputFile);
  final Path outputPath = fileSystem.makeQualified(new Path("target/out"));
  fileSystem.delete(outputPath, true);

  final Job job = Job.getInstance(conf);
  FileInputFormat.setInputPaths(job, inputPath);

  job.setInputFormatClass(CRAMInputFormat.class);
  job.setMapOutputKeyClass(LongWritable.class);
  job.setMapOutputValueClass(SAMRecordWritable.class);

  // Job.getInstance(conf) works on a copy of the configuration, so a value set on
  // `conf` after that point is not visible to the job. Set it on the job's own
  // configuration as well; the write to `conf` is kept for existing readers of it.
  conf.set(CRAMTestNoHeaderOutputFormat.READ_HEADER_FROM_FILE, inputFile);
  job.getConfiguration().set(CRAMTestNoHeaderOutputFormat.READ_HEADER_FROM_FILE, inputFile);

  job.setOutputFormatClass(CRAMTestNoHeaderOutputFormat.class);
  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(SAMRecordWritable.class);

  job.setNumReduceTasks(0);
  FileOutputFormat.setOutputPath(job, outputPath);

  final boolean success = job.waitForCompletion(true);
  assertTrue(success);

  return outputPath;
}
Example #16
Source File: IntegrationTestLoadAndVerify.java From hbase with Apache License 2.0 | 6 votes |
/**
 * Runs the map-only load job for the given table and asserts that it succeeds.
 *
 * <p>The number of map tasks and the table name are written into {@code conf} before the
 * job is created from it.
 *
 * @param conf            configuration the job is created from; modified as described above
 * @param tableDescriptor descriptor of the table being loaded
 * @return the completed job
 * @throws Exception propagated from job configuration or execution
 */
protected Job doLoad(Configuration conf, TableDescriptor tableDescriptor) throws Exception {
  Path loadOutputDir = getTestDir(TEST_NAME, "load-output");
  LOG.info("Load output dir: " + loadOutputDir);

  int numMapTasks = conf.getInt(NUM_MAP_TASKS_KEY, NUM_MAP_TASKS_DEFAULT);
  NMapInputFormat.setNumMapTasks(conf, numMapTasks);
  conf.set(TABLE_NAME_KEY, tableDescriptor.getTableName().getNameAsString());

  Job loadJob = Job.getInstance(conf);
  loadJob.setJobName(TEST_NAME + " Load for " + tableDescriptor.getTableName());
  loadJob.setJarByClass(this.getClass());
  setMapperClass(loadJob);
  loadJob.setInputFormatClass(NMapInputFormat.class);
  loadJob.setNumReduceTasks(0);
  setJobScannerConf(loadJob);
  FileOutputFormat.setOutputPath(loadJob, loadOutputDir);

  // Dependency jars and credentials for the job.
  TableMapReduceUtil.addDependencyJars(loadJob);
  TableMapReduceUtil.addDependencyJarsForClasses(
      loadJob.getConfiguration(), AbstractHBaseTool.class);
  TableMapReduceUtil.initCredentials(loadJob);

  boolean loadSucceeded = loadJob.waitForCompletion(true);
  assertTrue(loadSucceeded);
  return loadJob;
}
Example #17
Source File: ElemValueCooccurrencesTest.java From marklogic-contentpump with Apache License 2.0 | 6 votes |
/**
 * Command-line entry point: runs the element-value co-occurrences job.
 *
 * <p>After generic Hadoop options are stripped, two arguments are required: the
 * configuration resource and the output directory. Exits with 2 on a usage error,
 * 0 when the job succeeds and 1 when it does not.
 *
 * @param args command-line arguments, possibly including generic Hadoop options
 * @throws Exception propagated from job configuration or execution
 */
public static void main(String[] args) throws Exception {
  Configuration conf = new Configuration();
  String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
  // Both otherArgs[0] (config file) and otherArgs[1] (output dir) are read below,
  // so at least two arguments are required.
  if (otherArgs.length < 2) {
    System.err.println("Usage: ElemValueCooccurrencesTest configFile outputDir");
    System.exit(2);
  }

  Job job = Job.getInstance(conf);
  job.setJarByClass(ElemValueCooccurrencesTest.class);
  job.setInputFormatClass(ValueInputFormat.class);
  job.setMapperClass(ElemCooccurrencesMapper.class);
  job.setMapOutputKeyClass(LongWritable.class);
  job.setMapOutputValueClass(Text.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

  // From here on, work on the job's own configuration object.
  conf = job.getConfiguration();
  conf.addResource(otherArgs[0]);
  conf.setClass(MarkLogicConstants.INPUT_VALUE_CLASS, Text.class, Writable.class);
  conf.setClass(MarkLogicConstants.INPUT_LEXICON_FUNCTION_CLASS,
      ElemValueCooccurrencesFunction.class, ElemValueCooccurrences.class);

  System.exit(job.waitForCompletion(true) ? 0 : 1);
}
Example #18
Source File: KMeansDriver.java From flink-perf with Apache License 2.0 | 6 votes |
/**
 * Runs a map-only job that reads the points file as text and writes
 * (Centroid, Point) pairs as a sequence file, blocking until the job finishes.
 *
 * <p>An existing output at {@code seqFilePath} is deleted first. The job's completion
 * status is discarded.
 *
 * @param conf        configuration the job is created from
 * @param fs          file system used to check for and delete existing output
 * @param pointsPath  input path of the points file
 * @param seqFilePath output path for the sequence file
 * @throws Exception propagated from file-system access or job execution
 */
public static void initializeCenters(
    Configuration conf, FileSystem fs, String pointsPath, String seqFilePath)
    throws Exception {
  Path pointsInput = new Path(pointsPath);
  Path centersOutput = new Path(seqFilePath);

  // Clear any output left from an earlier run.
  if (fs.exists(centersOutput)) {
    fs.delete(centersOutput, true);
  }

  Job initJob = Job.getInstance(conf);

  // Map-only: a reducer class is set but zero reduce tasks are requested.
  initJob.setMapperClass(CenterInitializer.class);
  initJob.setReducerClass(Reducer.class);
  initJob.setNumReduceTasks(0);

  initJob.setMapOutputKeyClass(Centroid.class);
  initJob.setMapOutputValueClass(Point.class);
  initJob.setOutputKeyClass(Centroid.class);
  initJob.setOutputValueClass(Point.class);

  initJob.setInputFormatClass(TextInputFormat.class);
  initJob.setOutputFormatClass(SequenceFileOutputFormat.class);
  FileInputFormat.addInputPath(initJob, pointsInput);
  FileOutputFormat.setOutputPath(initJob, centersOutput);

  initJob.waitForCompletion(true);
}
Example #19
Source File: Step6.java From MapReduce-Demo with MIT License | 6 votes |
public static boolean run(Configuration config, Map<String, String> paths) throws IOException, ClassNotFoundException, InterruptedException { String jobName = "step6"; Job job = Job.getInstance(config, jobName); job.setJarByClass(Step6.class); job.setJar("export\\ItemCF.jar"); job.setMapperClass(Step6_Mapper.class); job.setReducerClass(Step6_Reducer.class); job.setMapOutputKeyClass(PairWritable.class); job.setMapOutputValueClass(Text.class); //job.setSortComparatorClass(ScoreSort.class); //自定义排序 job.setGroupingComparatorClass(UserGroup.class); //自定义分组 Path inPath = new Path(paths.get("Step6Input")); Path outpath = new Path(paths.get("Step6Output")); FileInputFormat.addInputPath(job, inPath); FileOutputFormat.setOutputPath(job, outpath); FileSystem fs = FileSystem.get(config); if (fs.exists(outpath)) { fs.delete(outpath, true); } return job.waitForCompletion(true); }
Example #20
Source File: XflowDstIPCount.java From bigdata-tutorial with Apache License 2.0 | 6 votes |
/**
 * Command-line entry point: runs the "xflow dstip count" job.
 *
 * <p>After generic Hadoop options are stripped, exactly two arguments are expected:
 * the input path and the output path. Exits with 2 on a usage error, 0 when the
 * job succeeds and 1 when it does not.
 *
 * @param args command-line arguments, possibly including generic Hadoop options
 * @throws Exception propagated from job configuration or execution
 */
public static void main(String[] args) throws Exception {
  GenericOptionsParser parser = new GenericOptionsParser(args);
  String[] otherArgs = parser.getRemainingArgs();
  if (otherArgs.length != 2) {
    System.err.println("Usage: xflowdstipcount <in> <out>");
    System.exit(2);
  }

  // Create the job from the configuration the parser populated; a bare
  // Job.getInstance() would start from a fresh configuration and silently drop
  // every generic option given on the command line.
  Job job = Job.getInstance(parser.getConfiguration());
  job.setJobName("xflow dstip count");
  job.setJarByClass(XflowDstIPCount.class);
  job.setMapperClass(ParesDstIPMapper.class);
  job.setCombinerClass(IntSumReducer.class);
  job.setReducerClass(IntSumReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);

  FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
  FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

  System.exit(job.waitForCompletion(true) ? 0 : 1);
}
Example #21
Source File: NBCDriver.java From MLHadoop with Apache License 2.0 | 6 votes |
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { Configuration conf=new Configuration(); // The test input for which you want to find the acitivity that the Person should be doing conf.set("test_input", args[0]); Job job = new Job(conf); job.setJarByClass(NBCDriver.class); job.setJobName("Naive_Bayes_calssifier using Hadoop"); FileInputFormat.setInputPaths(job, new Path(args[1])); FileOutputFormat.setOutputPath(job, new Path(args[2])); job.setMapperClass(NBCMap.class); job.setReducerClass(NBCReduce.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(Text.class); boolean success = job.waitForCompletion(true); System.exit(success ? 0 : 1); }
Example #22
Source File: PageRankDriver.java From flink-perf with Apache License 2.0 | 6 votes |
/**
 * Runs one PageRank job from {@code inputPath} to {@code outputPath}, blocking until
 * the job finishes.
 *
 * <p>Input and output are both sequence files. An existing output path is deleted first.
 * The job's completion status is discarded.
 *
 * @param conf       configuration the job is created from
 * @param fs         file system used to check for and delete existing output
 * @param inputPath  input path of the job
 * @param outputPath output path of the job
 * @throws Exception propagated from file-system access or job execution
 */
public static void calculateNextRanks(
    Configuration conf, FileSystem fs, String inputPath, String outputPath)
    throws Exception {
  Path ranksOutput = new Path(outputPath);

  // Clear any output left from an earlier run.
  boolean staleOutput = fs.exists(ranksOutput);
  if (staleOutput) {
    fs.delete(ranksOutput, true);
  }

  Job rankJob = Job.getInstance(conf);
  rankJob.setJarByClass(PageRankMapper.class);

  // Map stage.
  rankJob.setMapperClass(PageRankMapper.class);
  rankJob.setMapOutputKeyClass(LongWritable.class);
  rankJob.setMapOutputValueClass(Message.class);

  // Reduce stage.
  rankJob.setReducerClass(PageRankReducer.class);
  rankJob.setOutputKeyClass(LongWritable.class);
  rankJob.setOutputValueClass(Message.class);

  // Sequence files on both ends.
  rankJob.setInputFormatClass(SequenceFileInputFormat.class);
  rankJob.setOutputFormatClass(SequenceFileOutputFormat.class);
  FileInputFormat.addInputPath(rankJob, new Path(inputPath));
  FileOutputFormat.setOutputPath(rankJob, ranksOutput);

  rankJob.waitForCompletion(true);
}
Example #23
Source File: PatentMainController.java From yuzhouwan with Apache License 2.0 | 5 votes |
/** * Main方法里面,设置了 Patent任务流程,Mapper ->Combiner ->Reducer ->Partitioner. * * @param args * @throws Exception */ public static void main(String[] args) throws Exception { //配置 Job,并完成初始化 Job job = Job.getInstance(new Configuration()); //指定 Job的主类 job.setJarByClass(PatentMainController.class); //指定 Job的 Mapper组件 job.setMapperClass(PatentMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); //指定 Job的数据输入地址 FileInputFormat.setInputPaths(job, new Path(args[0])); //指定 Job的 Combiner组件 job.setCombinerClass(InverseIndexByKeywordCombiner.class); job.setReducerClass(InverseIndexByKeywordCombiner.class); //指定 Job的 Reducer组件 job.setReducerClass(PatentReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); //指定 Job的数据输出地址 FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setPartitionerClass(PatentPartitioner.class); //指定最大的 Task数量 job.setNumReduceTasks(ConfUtil.getMax()); //提交并等待执行完成 job.waitForCompletion(true); }
Example #24
Source File: SecondarySort.java From RDFS with Apache License 2.0 | 5 votes |
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length != 2) { System.err.println("Usage: secondarysrot <in> <out>"); System.exit(2); } Job job = new Job(conf, "secondary sort"); job.setJarByClass(SecondarySort.class); job.setMapperClass(MapClass.class); job.setReducerClass(Reduce.class); // group and partition by the first int in the pair job.setPartitionerClass(FirstPartitioner.class); job.setGroupingComparatorClass(FirstGroupingComparator.class); // the map output is IntPair, IntWritable job.setMapOutputKeyClass(IntPair.class); job.setMapOutputValueClass(IntWritable.class); // the reduce output is Text, IntWritable job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileInputFormat.addInputPath(job, new Path(otherArgs[0])); FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); System.exit(job.waitForCompletion(true) ? 0 : 1); }
Example #25
Source File: TestLocalModeWithNewApis.java From hadoop with Apache License 2.0 | 5 votes |
/**
 * Runs a word-count job over a small generated input file under a randomly named
 * directory in {@code /tmp} and checks the exact output text.
 *
 * <p>The temporary base directory is removed in a {@code finally} block, so it is cleaned
 * up whether the test passes or fails.
 *
 * @throws Exception propagated from file-system access or job execution
 */
@Test
public void testNewApis() throws Exception {
  Random r = new Random(System.currentTimeMillis());
  Path tmpBaseDir = new Path("/tmp/wc-" + r.nextInt());
  final Path inDir = new Path(tmpBaseDir, "input");
  final Path outDir = new Path(tmpBaseDir, "output");
  String input = "The quick brown fox\nhas many silly\nred fox sox\n";
  FileSystem inFs = inDir.getFileSystem(conf);
  FileSystem outFs = outDir.getFileSystem(conf);

  try {
    outFs.delete(outDir, true);
    if (!inFs.mkdirs(inDir)) {
      throw new IOException("Mkdirs failed to create " + inDir.toString());
    }

    // Close the stream even when the write fails.
    DataOutputStream file = inFs.create(new Path(inDir, "part-0"));
    try {
      file.writeBytes(input);
    } finally {
      file.close();
    }

    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(TestLocalModeWithNewApis.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, inDir);
    FileOutputFormat.setOutputPath(job, outDir);
    assertEquals(job.waitForCompletion(true), true);

    String output = readOutput(outDir, conf);
    assertEquals("The\t1\nbrown\t1\nfox\t2\nhas\t1\nmany\t1\n"
        + "quick\t1\nred\t1\nsilly\t1\nsox\t1\n", output);
  } finally {
    outFs.delete(tmpBaseDir, true);
  }
}
Example #26
Source File: ConfigurationHelper.java From dkpro-c4corpus with Apache License 2.0 | 5 votes |
/** * Job configurator * * @param job job instance * @param jarByClass class of the jar * @param mapperClass mapper * @param reducerClass reducer * @param commaSeparatedInputFiles input paths * @param outputPath output * @throws IOException I/O exception */ public static void configureJob(Job job, Class<?> jarByClass, Class<? extends Mapper> mapperClass, Class<? extends Reducer> reducerClass, String commaSeparatedInputFiles, String outputPath) throws IOException { job.setJarByClass(jarByClass); job.setJobName(jarByClass.getName()); // mapper job.setMapperClass(mapperClass); // reducer job.setReducerClass(reducerClass); // input-output is warc job.setInputFormatClass(WARCInputFormat.class); // prevent producing empty files LazyOutputFormat.setOutputFormatClass(job, WARCOutputFormat.class); // intermediate data job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(WARCWritable.class); // output data job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(WARCWritable.class); // set output compression to GZip FileOutputFormat.setCompressOutput(job, true); FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class); FileInputFormat.addInputPaths(job, commaSeparatedInputFiles); FileOutputFormat.setOutputPath(job, new Path(outputPath)); }
Example #27
Source File: TeraStreamValidate.java From pravega-samples with Apache License 2.0 | 5 votes |
/**
 * Runs the TeraStreamValidate job and blocks until it finishes.
 *
 * <p>Exactly five arguments are expected: input directory, output directory, and the
 * values stored under {@code INPUT_URI_STRING}, {@code INPUT_SCOPE_NAME} and
 * {@code INPUT_STREAM_NAME}, in that order.
 *
 * @param args command-line arguments as described above
 * @return 2 on a usage error, 0 when the job succeeds, 1 when it does not
 * @throws Exception propagated from job configuration or execution
 */
public int run(String[] args) throws Exception {
  boolean wrongArgCount = args.length != 5;
  if (wrongArgCount) {
    usage();
    return 2;
  }
  LOG.info("starting");

  Path validateInput = new Path(args[0]);
  Path validateOutput = new Path(args[1]);

  // Stream settings, written to the configuration before the job is created from it.
  getConf().setStrings(INPUT_URI_STRING, args[2]);
  getConf().setStrings(INPUT_SCOPE_NAME, args[3]);
  getConf().setStrings(INPUT_STREAM_NAME, args[4]);
  getConf().setStrings(INPUT_DESERIALIZER, TextSerializer.class.getName());
  getConf().setInt(MRJobConfig.NUM_MAPS, 1);

  Job validateJob = Job.getInstance(getConf());
  TeraInputFormat.setInputPaths(validateJob, validateInput);
  FileOutputFormat.setOutputPath(validateJob, validateOutput);

  validateJob.setJobName("TeraStreamValidate");
  validateJob.setJarByClass(TeraStreamValidate.class);

  validateJob.setMapperClass(TeraSortMapper.class);
  validateJob.setNumReduceTasks(1);
  validateJob.setOutputKeyClass(Text.class);
  validateJob.setOutputValueClass(Text.class);

  validateJob.setInputFormatClass(PravegaInputFormat.class);
  validateJob.setOutputFormatClass(SequenceFileOutputFormat.class);

  int exitCode;
  if (validateJob.waitForCompletion(true)) {
    exitCode = 0;
  } else {
    exitCode = 1;
  }
  LOG.info("done");
  return exitCode;
}
Example #28
Source File: TestLocalRunner.java From hadoop with Apache License 2.0 | 5 votes |
/** * Run a test which creates a SequenceMapper / IdentityReducer * job over a set of generated number files. */ private void doMultiReducerTest(int numMaps, int numReduces, int parallelMaps, int parallelReduces) throws Exception { Path in = getNumberDirPath(); Path out = getOutputPath(); // Clear data from any previous tests. Configuration conf = new Configuration(); FileSystem fs = FileSystem.getLocal(conf); if (fs.exists(out)) { fs.delete(out, true); } if (fs.exists(in)) { fs.delete(in, true); } for (int i = 0; i < numMaps; i++) { makeNumberFile(i, 100); } Job job = Job.getInstance(); job.setNumReduceTasks(numReduces); job.setMapperClass(SequenceMapper.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(NullWritable.class); FileInputFormat.addInputPath(job, in); FileOutputFormat.setOutputPath(job, out); LocalJobRunner.setLocalMaxRunningMaps(job, parallelMaps); LocalJobRunner.setLocalMaxRunningReduces(job, parallelReduces); boolean result = job.waitForCompletion(true); assertTrue("Job failed!!", result); verifyNumberJob(numMaps); }
Example #29
Source File: AvroParquetMapReduce.java From hiped2 with Apache License 2.0 | 5 votes |
/**
 * MapReduce driver: parses the command line, configures the job and runs it.
 *
 * <p>Input is read with {@code AvroParquetInputFormat}; output is written with
 * {@code AvroParquetOutputFormat} using the {@code StockAvg} schema.
 *
 * @param args the command-line arguments
 * @return the non-zero result of command-line parsing if it failed; otherwise 0 when the
 *         job succeeds and 1 when it does not
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {
  Cli cli = Cli.builder()
      .setArgs(args)
      .addOptions(CliCommonOpts.MrIoOpts.values())
      .build();
  int parseStatus = cli.runCmd();
  if (0 != parseStatus) {
    return parseStatus;
  }

  Path inputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.INPUT));
  Path outputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.OUTPUT));

  Configuration conf = super.getConf();
  Job avgJob = new Job(conf);
  avgJob.setJarByClass(AvroParquetMapReduce.class);

  // Parquet input.
  avgJob.setInputFormatClass(AvroParquetInputFormat.class);
  AvroParquetInputFormat.setInputPaths(avgJob, inputPath);

  // Processing stages and intermediate types.
  avgJob.setMapperClass(Map.class);
  avgJob.setReducerClass(Reduce.class);
  avgJob.setMapOutputKeyClass(Text.class);
  avgJob.setMapOutputValueClass(DoubleWritable.class);

  // Parquet output with the StockAvg schema.
  avgJob.setOutputFormatClass(AvroParquetOutputFormat.class);
  FileOutputFormat.setOutputPath(avgJob, outputPath);
  AvroParquetOutputFormat.setSchema(avgJob, StockAvg.SCHEMA$);

  if (avgJob.waitForCompletion(true)) {
    return 0;
  }
  return 1;
}
Example #30
Source File: IAndKMatrixMultiplicationStep1.java From RecommendationEngine with MIT License | 5 votes |
/**
 * Configures and runs the first matrix-multiplication step, blocking until it finishes.
 *
 * <p>The two input paths and the output path are read from {@code ItemBasedCFDriver.path}
 * under the keys {@code "step7InputPath1"}, {@code "step7InputPath2"} and
 * {@code "step7OutputPath"}. The output path is removed through {@code HDFS.rmr} before
 * the job is configured, and the text output separator is set to a comma. The job's
 * completion status is discarded.
 *
 * @throws IOException            propagated from file-system access or job execution
 * @throws ClassNotFoundException propagated from job execution
 * @throws InterruptedException   propagated from job execution
 */
public static void run() throws IOException, ClassNotFoundException, InterruptedException {
  String firstInput = ItemBasedCFDriver.path.get("step7InputPath1");
  String secondInput = ItemBasedCFDriver.path.get("step7InputPath2");
  String outputLocation = ItemBasedCFDriver.path.get("step7OutputPath");

  Configuration conf = new Configuration();
  conf.set("mapred.textoutputformat.separator", ",");

  Job stepJob = Job.getInstance(conf);

  // Remove output left over from an earlier run.
  new HDFS(conf).rmr(outputLocation);

  stepJob.setMapperClass(Step1_Mapper.class);
  stepJob.setJarByClass(IAndKMatrixMultiplicationStep1.class);

  stepJob.setMapOutputKeyClass(Text.class);
  stepJob.setMapOutputValueClass(Text.class);

  stepJob.setInputFormatClass(TextInputFormat.class);
  stepJob.setOutputFormatClass(TextOutputFormat.class);

  Path firstInputPath = new Path(firstInput);
  Path secondInputPath = new Path(secondInput);
  FileInputFormat.setInputPaths(stepJob, firstInputPath, secondInputPath);
  FileOutputFormat.setOutputPath(stepJob, new Path(outputLocation));

  stepJob.waitForCompletion(true);
}