Java Code Examples for org.apache.hadoop.mapreduce.Job#setGroupingComparatorClass()
The following examples show how to use
org.apache.hadoop.mapreduce.Job#setGroupingComparatorClass().
These examples are extracted from open source projects.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: MapReduce-Demo File: Step6.java License: MIT License | 6 votes |
public static boolean run(Configuration config, Map<String, String> paths) throws IOException, ClassNotFoundException, InterruptedException { String jobName = "step6"; Job job = Job.getInstance(config, jobName); job.setJarByClass(Step6.class); job.setJar("export\\ItemCF.jar"); job.setMapperClass(Step6_Mapper.class); job.setReducerClass(Step6_Reducer.class); job.setMapOutputKeyClass(PairWritable.class); job.setMapOutputValueClass(Text.class); //job.setSortComparatorClass(ScoreSort.class); //自定义排序 job.setGroupingComparatorClass(UserGroup.class); //自定义分组 Path inPath = new Path(paths.get("Step6Input")); Path outpath = new Path(paths.get("Step6Output")); FileInputFormat.addInputPath(job, inPath); FileOutputFormat.setOutputPath(job, outpath); FileSystem fs = FileSystem.get(config); if (fs.exists(outpath)) { fs.delete(outpath, true); } return job.waitForCompletion(true); }
Example 2
Source Project: MLHadoop File: merge_results_driver.java License: Apache License 2.0 | 6 votes |
/**
 * Configures the supplied job for the final result-merging step and runs it to completion.
 *
 * <p>Keys are {@code TextPair} and values are {@code Text} on both the map and the reduce
 * side. Partitioning, grouping and sorting use {@code TextPairPartitioner},
 * {@code NaturalKeyGroupingComparator} and {@code TextPairComparator} respectively.
 *
 * @param job      job to configure; no input path is set in this method
 * @param out_path output directory of the job
 * @return {@code true} if the job completed successfully
 * @throws IOException            if job submission fails
 * @throws InterruptedException   if waiting for the job is interrupted
 * @throws ClassNotFoundException if a class required by the job cannot be found
 */
public static boolean runWithJob(Job job, String out_path)
        throws IOException, InterruptedException, ClassNotFoundException {
    // Identity of the job.
    job.setJobName("Final Step: Merging results and creating separate LU decomposed components of input matrix");
    job.setJarByClass(merge_results_driver.class);

    // Map and reduce implementations.
    job.setMapperClass(lu_decomposition.naive_gausssian.MergeResults.merge_results_mapper.class);
    job.setReducerClass(lu_decomposition.naive_gausssian.MergeResults.merge_results_reducer.class);

    // Key/value types.
    job.setMapOutputKeyClass(TextPair.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(TextPair.class);
    job.setOutputValueClass(Text.class);

    // Partitioning, sorting and grouping of the composite key.
    job.setPartitionerClass(TextPairPartitioner.class);
    job.setSortComparatorClass(TextPairComparator.class);
    job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class);

    Path outputDir = new Path(out_path);
    FileOutputFormat.setOutputPath(job, outputDir);

    return job.waitForCompletion(true);
}
Example 3
Source Project: BigDataArchitect File: InboundBounceRunner.java License: Apache License 2.0 | 5 votes |
@Override protected void beforeRunJob(Job job) throws IOException { super.beforeRunJob(job); // 自定义二次排序 job.setGroupingComparatorClass(InboundBounceGroupingComparator.class); job.setPartitionerClass(InboundBouncePartitioner.class); }
Example 4
Source Project: rya File: JoinSelectStatisticsTest.java License: Apache License 2.0 | 5 votes |
/**
 * Runs the join-select aggregation job over the two sequence-file inputs
 * ({@code PROSPECTSOUT} and {@code SPOOUT}) and writes sequence-file output into a freshly
 * named directory under the system temp directory, which is stored in {@code tempDir}.
 *
 * @param args command-line arguments; not read by this method
 * @return {@code 0} if the job was successful, {@code 1} otherwise
 * @throws Exception if configuration, temp-file creation or job execution fails
 */
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    String outpath = conf.get(OUTPUTPATH);

    // A Job works on its own copy of the Configuration it is created from, so this flag
    // has to be set before the job is created; setting it afterwards never reaches the job.
    conf.setBoolean(MRJobConfig.MAPREDUCE_JOB_USER_CLASSPATH_FIRST, true);

    Job job = Job.getInstance(conf, this.getClass().getSimpleName() + "_" + System.currentTimeMillis());
    job.setJarByClass(this.getClass());

    // Both inputs are sequence files handled by the same mapper.
    MultipleInputs.addInputPath(job, new Path(PROSPECTSOUT.getAbsolutePath()),
            SequenceFileInputFormat.class, JoinSelectAggregateMapper.class);
    MultipleInputs.addInputPath(job, new Path(SPOOUT.getAbsolutePath()),
            SequenceFileInputFormat.class, JoinSelectAggregateMapper.class);

    job.setMapOutputKeyClass(CompositeType.class);
    job.setMapOutputValueClass(TripleCard.class);

    // The temp file is created only to discover its parent directory; the job output goes
    // to a timestamp-named sibling of that file.
    tempDir = new File(File.createTempFile(outpath, "txt").getParentFile(), System.currentTimeMillis() + "");
    SequenceFileOutputFormat.setOutputPath(job, new Path(tempDir.getAbsolutePath()));
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(TripleEntry.class);
    job.setOutputValueClass(CardList.class);

    job.setSortComparatorClass(JoinSelectSortComparator.class);
    job.setGroupingComparatorClass(JoinSelectGroupComparator.class);
    job.setPartitionerClass(JoinSelectPartitioner.class);
    job.setReducerClass(JoinReducer.class);
    job.setNumReduceTasks(32);

    job.waitForCompletion(true);
    return job.isSuccessful() ? 0 : 1;
}
Example 5
Source Project: hadoop File: SecondarySort.java License: Apache License 2.0 | 5 votes |
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length != 2) { System.err.println("Usage: secondarysort <in> <out>"); System.exit(2); } Job job = Job.getInstance(conf, "secondary sort"); job.setJarByClass(SecondarySort.class); job.setMapperClass(MapClass.class); job.setReducerClass(Reduce.class); // group and partition by the first int in the pair job.setPartitionerClass(FirstPartitioner.class); job.setGroupingComparatorClass(FirstGroupingComparator.class); // the map output is IntPair, IntWritable job.setMapOutputKeyClass(IntPair.class); job.setMapOutputValueClass(IntWritable.class); // the reduce output is Text, IntWritable job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileInputFormat.addInputPath(job, new Path(otherArgs[0])); FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); System.exit(job.waitForCompletion(true) ? 0 : 1); }
Example 6
Source Project: big-c File: SecondarySort.java License: Apache License 2.0 | 5 votes |
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length != 2) { System.err.println("Usage: secondarysort <in> <out>"); System.exit(2); } Job job = Job.getInstance(conf, "secondary sort"); job.setJarByClass(SecondarySort.class); job.setMapperClass(MapClass.class); job.setReducerClass(Reduce.class); // group and partition by the first int in the pair job.setPartitionerClass(FirstPartitioner.class); job.setGroupingComparatorClass(FirstGroupingComparator.class); // the map output is IntPair, IntWritable job.setMapOutputKeyClass(IntPair.class); job.setMapOutputValueClass(IntWritable.class); // the reduce output is Text, IntWritable job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileInputFormat.addInputPath(job, new Path(otherArgs[0])); FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); System.exit(job.waitForCompletion(true) ? 0 : 1); }
Example 7
Source Project: hiped2 File: SortMapReduce.java License: Apache License 2.0 | 5 votes |
/**
 * The MapReduce driver: parses the command line, configures the sort job and runs it.
 *
 * @param args the command-line arguments
 * @return the non-zero status reported by the command-line parser if parsing failed;
 *         otherwise {@code 0} when the job succeeds and {@code 1} when it fails
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {
    Cli cli = Cli.builder()
            .setArgs(args)
            .addOptions(CliCommonOpts.MrIoOpts.values())
            .build();
    int parseStatus = cli.runCmd();
    if (parseStatus != 0) {
        return parseStatus;
    }

    Path inputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.INPUT));
    Path outputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.OUTPUT));

    Configuration conf = super.getConf();
    Job job = new Job(conf);
    job.setJarByClass(SortMapReduce.class);

    job.setInputFormatClass(KeyValueTextInputFormat.class);
    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    // Map output is keyed by Person; the final output is plain text.
    job.setMapOutputKeyClass(Person.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Secondary-sort wiring: partitioner, sort comparator and grouping comparator.
    job.setPartitionerClass(PersonNamePartitioner.class);
    job.setSortComparatorClass(PersonComparator.class);
    job.setGroupingComparatorClass(PersonNameComparator.class);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    return job.waitForCompletion(true) ? 0 : 1;
}
Example 8
Source Project: ignite File: HadoopGroupingTest.java License: Apache License 2.0 | 5 votes |
/**
 * Submits a grouping job and verifies that the shared test state is empty once it is done.
 *
 * @param combiner when {@code true}, {@code MyReducer} is installed as the combiner with
 *        zero reduce tasks and {@code YearComparator} as the combiner key grouping
 *        comparator; when {@code false}, {@code MyReducer} is installed as the reducer
 *        with four reduce tasks and {@code YearComparator} as the grouping comparator
 * @throws Exception If failed.
 */
public void doTestGrouping(boolean combiner) throws Exception {
    // Start from a clean shared state.
    HadoopGroupingTestState.values().clear();

    Job job = Job.getInstance();

    job.setInputFormatClass(InFormat.class);
    job.setOutputFormatClass(OutFormat.class);

    job.setOutputKeyClass(YearTemperature.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(Mapper.class);

    if (!combiner) {
        job.setReducerClass(MyReducer.class);
        job.setNumReduceTasks(4);
        job.setGroupingComparatorClass(YearComparator.class);
    }
    else {
        job.setCombinerClass(MyReducer.class);
        job.setNumReduceTasks(0);
        job.setCombinerKeyGroupingComparatorClass(YearComparator.class);
    }

    // Submit and wait for the result (argument 30000 is presumably a timeout in ms).
    grid(0).hadoop()
        .submit(new HadoopJobId(UUID.randomUUID(), 2), createJobInfo(job.getConfiguration(), null))
        .get(30000);

    assertTrue(HadoopGroupingTestState.values().isEmpty());
}
Example 9
Source Project: halvade File: MapReduceRunner.java License: GNU General Public License v3.0 | 4 votes |
/**
 * Configures and runs a Halvade MapReduce job for the given job type.
 *
 * <p>The method terminates the JVM with status -2 when the output directory already
 * exists. Mapper, reducer, comparators, output value class and the number of reduce tasks
 * are chosen from {@code jobType} and the flags in {@code halvadeOpts}; later settings
 * deliberately override earlier ones (for example the BAM-input mapper replaces the
 * aligner mapper).
 *
 * @param halvadeConf configuration the job is created from
 * @param tmpOutDir   output directory of the job; must not exist yet
 * @param jobType     one of the {@code HalvadeResourceManager} job-type constants
 * @return the value returned by {@code runTimedJob}
 * @throws IOException            if file-system access fails
 * @throws URISyntaxException     if the output directory or binaries location is not a valid URI
 * @throws InterruptedException   declared for the job run
 * @throws ClassNotFoundException declared for the job run
 */
protected int runHalvadeJob(Configuration halvadeConf, String tmpOutDir, int jobType)
        throws IOException, URISyntaxException, InterruptedException, ClassNotFoundException {
    // Resource setup and job-name suffix depend on the pipeline.
    String pipelineSuffix = "";
    if (jobType == HalvadeResourceManager.RNA_SHMEM_PASS2) {
        HalvadeConf.setIsPass2(halvadeConf, true);
        HalvadeResourceManager.setJobResources(halvadeOpts, halvadeConf, jobType, false, halvadeOpts.useBamInput);
        pipelineSuffix = RNA_PASS2;
    } else if (jobType == HalvadeResourceManager.DNA) {
        HalvadeResourceManager.setJobResources(halvadeOpts, halvadeConf, jobType, false, halvadeOpts.useBamInput);
        pipelineSuffix = DNA;
    }
    halvadeOpts.splitChromosomes(halvadeConf, 0);
    HalvadeConf.setOutDir(halvadeConf, tmpOutDir);

    // Refuse to overwrite an existing output directory.
    FileSystem outputFs = FileSystem.get(new URI(tmpOutDir), halvadeConf);
    boolean outputExists = outputFs.exists(new Path(tmpOutDir));
    if (outputExists) {
        Logger.INFO("The output directory \'" + tmpOutDir + "\' already exists.");
        Logger.INFO("ERROR: Please remove this directory before trying again.");
        System.exit(-2);
    }

    if (halvadeOpts.useBamInput) {
        setHeaderFile(halvadeOpts.in, halvadeConf);
    }
    if (halvadeOpts.rnaPipeline) {
        HalvadeConf.setPass2Suffix(halvadeConf, pass2suffix);
    }

    Job halvadeJob = Job.getInstance(halvadeConf, "Halvade" + pipelineSuffix);
    halvadeJob.addCacheArchive(new URI(halvadeOpts.halvadeBinaries));
    halvadeJob.setJarByClass(be.ugent.intec.halvade.hadoop.mapreduce.HalvadeMapper.class);
    addInputFiles(halvadeOpts.in, halvadeConf, halvadeJob);
    FileOutputFormat.setOutputPath(halvadeJob, new Path(tmpOutDir));

    // Default mapper/reducer per pipeline; may be overridden further down.
    if (jobType == HalvadeResourceManager.RNA_SHMEM_PASS2) {
        halvadeJob.setMapperClass(be.ugent.intec.halvade.hadoop.mapreduce.StarAlignPassXMapper.class);
        halvadeJob.setReducerClass(be.ugent.intec.halvade.hadoop.mapreduce.RnaGATKReducer.class);
    } else if (jobType == HalvadeResourceManager.DNA) {
        halvadeJob.setMapperClass(halvadeOpts.alignmentTools[halvadeOpts.aln]);
        halvadeJob.setReducerClass(be.ugent.intec.halvade.hadoop.mapreduce.DnaGATKReducer.class);
    }

    halvadeJob.setMapOutputKeyClass(ChromosomeRegion.class);
    halvadeJob.setMapOutputValueClass(SAMRecordWritable.class);
    halvadeJob.setInputFormatClass(HalvadeTextInputFormat.class);
    halvadeJob.setOutputKeyClass(Text.class);

    // Sorting/grouping and output value type depend on whether BAM output is merged.
    if (halvadeOpts.mergeBam) {
        halvadeJob.setSortComparatorClass(SimpleChrRegionComparator.class);
        halvadeJob.setOutputValueClass(SAMRecordWritable.class);
    } else {
        halvadeJob.setPartitionerClass(ChrRgPartitioner.class);
        halvadeJob.setSortComparatorClass(ChrRgSortComparator.class);
        halvadeJob.setGroupingComparatorClass(ChrRgGroupingComparator.class);
        halvadeJob.setOutputValueClass(VariantContextWritable.class);
    }

    // Number of reduce tasks: one for merging, none for align-only, configured otherwise.
    if (halvadeOpts.mergeBam) {
        halvadeJob.setReducerClass(be.ugent.intec.halvade.hadoop.mapreduce.BamMergeReducer.class);
        halvadeJob.setNumReduceTasks(1);
    } else if (halvadeOpts.justAlign) {
        halvadeJob.setNumReduceTasks(0);
    } else {
        halvadeJob.setNumReduceTasks(halvadeOpts.reduces);
        if (halvadeOpts.countOnly) {
            halvadeJob.setReducerClass(be.ugent.intec.halvade.hadoop.mapreduce.CountReadsReducer.class);
            halvadeJob.setOutputValueClass(LongWritable.class);
        }
    }

    // BAM input replaces both the mapper and the input format chosen above.
    if (halvadeOpts.useBamInput) {
        halvadeJob.setMapperClass(be.ugent.intec.halvade.hadoop.mapreduce.AlignedBamMapper.class);
        halvadeJob.setInputFormatClass(BAMInputFormat.class);
    }

    return runTimedJob(halvadeJob, "Halvade Job");
}
Example 10
Source Project: incubator-retired-blur File: IndexerJobDriver.java License: Apache License 2.0 | 4 votes |
/**
 * Runs the "Blur Row Updater" job for a table, combining three inputs: the existing table
 * data, the row-id lookup built from the partitioned input, and every in-progress path
 * holding new data.
 *
 * @param uuid               passed to {@code buildPartitionedInputData}
 * @param descriptor         table descriptor; supplies the table URI and output setup
 * @param inprogressPathList paths containing the new data
 * @param table              table name, used in the job name and pruning settings
 * @param fileCache          local cache path for the Blur input format
 * @param outputPath         output path of the job
 * @param reducerMultipler   reducer multiplier handed to {@code BlurOutputFormat}
 * @param tmpPath            passed to {@code buildPartitionedInputData}
 * @param tableStats         not read by this method
 * @param snapshot           passed to {@code buildPartitionedInputData}
 * @return {@code true} if the job completed successfully
 * @throws ClassNotFoundException if a class required by the job cannot be found
 * @throws IOException            if input preparation or job submission fails
 * @throws InterruptedException   if waiting for the job is interrupted
 */
private boolean runAutomatic(String uuid, TableDescriptor descriptor, List<Path> inprogressPathList, String table,
        Path fileCache, Path outputPath, int reducerMultipler, Path tmpPath, TableStats tableStats, String snapshot)
        throws ClassNotFoundException, IOException, InterruptedException {
    PartitionedInputResult partitioned = buildPartitionedInputData(uuid, tmpPath, descriptor, inprogressPathList,
            snapshot, fileCache);

    Job job = Job.getInstance(getConf(), "Blur Row Updater for table [" + table + "]");

    InputSplitPruneUtil.setBlurLookupRowIdFromNewDataCounts(job, table, partitioned._rowIdsFromNewData);
    InputSplitPruneUtil.setBlurLookupRowIdUpdateFromNewDataCounts(job, table, partitioned._rowIdsToUpdateFromNewData);
    InputSplitPruneUtil.setBlurLookupRowIdFromIndexCounts(job, table, partitioned._rowIdsFromIndex);
    InputSplitPruneUtil.setTable(job, table);

    BlurInputFormat.setLocalCachePath(job, fileCache);

    // Input 1 - existing data: the table itself, read through the update snapshot so all
    // existing documents are streamed through.
    Path existingTablePath = new Path(descriptor.getTableUri());
    BlurInputFormat.addTable(job, descriptor, MRUPDATE_SNAPSHOT);
    MultipleInputs.addInputPath(job, existingTablePath, PrunedBlurInputFormat.class, ExistingDataMapper.class);

    // Input 2 - existing data: the row-id lookup over the partitioned input.
    ExistingDataIndexLookupMapper.setSnapshot(job, MRUPDATE_SNAPSHOT);
    FileInputFormat.addInputPath(job, partitioned._partitionedInputData);
    MultipleInputs.addInputPath(job, partitioned._partitionedInputData, PrunedSequenceFileInputFormat.class,
            ExistingDataIndexLookupMapper.class);

    // Input 3 - new data: one sequence-file input per in-progress path.
    for (Path newDataPath : inprogressPathList) {
        FileInputFormat.addInputPath(job, newDataPath);
        MultipleInputs.addInputPath(job, newDataPath, SequenceFileInputFormat.class, NewDataMapper.class);
    }

    BlurOutputFormat.setOutputPath(job, outputPath);
    BlurOutputFormat.setupJob(job, descriptor);

    job.setReducerClass(UpdateReducer.class);
    job.setMapOutputKeyClass(IndexKey.class);
    job.setMapOutputValueClass(IndexValue.class);
    job.setPartitionerClass(IndexKeyPartitioner.class);
    job.setGroupingComparatorClass(IndexKeyWritableComparator.class);

    BlurOutputFormat.setReducerMultiplier(job, reducerMultipler);

    boolean completed = job.waitForCompletion(true);
    LOG.info("Counters [" + job.getCounters() + "]");
    return completed;
}
Example 11
Source Project: aegisthus File: Aegisthus.java License: Apache License 2.0 | 4 votes |
@Override public int run(String[] args) throws Exception { Job job = Job.getInstance(getConf()); Configuration configuration = job.getConfiguration(); job.setJarByClass(Aegisthus.class); CommandLine cl = getOptions(args); if (cl == null) { return 1; } // Check all of the paths and load the sstable version from the input filenames List<Path> paths = Lists.newArrayList(); if (cl.hasOption(Feature.CMD_ARG_INPUT_FILE)) { for (String input : cl.getOptionValues(Feature.CMD_ARG_INPUT_FILE)) { paths.add(new Path(input)); } } if (cl.hasOption(Feature.CMD_ARG_INPUT_DIR)) { paths.addAll(getDataFiles(configuration, cl.getOptionValue(Feature.CMD_ARG_INPUT_DIR))); } LOG.info("Processing paths: {}", paths); // At this point we have the version of sstable that we can use for this run Descriptor.Version version = Descriptor.Version.CURRENT; if (cl.hasOption(Feature.CMD_ARG_SSTABLE_OUTPUT_VERSION)) { version = new Descriptor.Version(cl.getOptionValue(Feature.CMD_ARG_SSTABLE_OUTPUT_VERSION)); } configuration.set(Feature.CONF_SSTABLE_VERSION, version.toString()); if (configuration.get(Feature.CONF_CQL_SCHEMA) != null) { setConfigurationFromCql(configuration); } if(cl.hasOption(Feature.CMD_ARG_COMBINE_SPLITS)) { job.setInputFormatClass(AegisthusCombinedInputFormat.class); } else { job.setInputFormatClass(AegisthusInputFormat.class); } job.setMapOutputKeyClass(AegisthusKey.class); job.setMapOutputValueClass(AtomWritable.class); job.setOutputKeyClass(AegisthusKey.class); job.setOutputValueClass(RowWritable.class); job.setMapperClass(AegisthusKeyMapper.class); job.setReducerClass(CassSSTableReducer.class); job.setGroupingComparatorClass(AegisthusKeyGroupingComparator.class); job.setPartitionerClass(AegisthusKeyPartitioner.class); job.setSortComparatorClass(AegisthusKeySortingComparator.class); TextInputFormat.setInputPaths(job, paths.toArray(new Path[paths.size()])); if (cl.hasOption(Feature.CMD_ARG_PRODUCE_SSTABLE)) { job.setOutputFormatClass(SSTableOutputFormat.class); } else { 
job.setOutputFormatClass(JsonOutputFormat.class); } CustomFileNameFileOutputFormat.setOutputPath(job, new Path(cl.getOptionValue(Feature.CMD_ARG_OUTPUT_DIR))); job.submit(); if (configuration.getBoolean(Feature.CONF_SHUTDOWN_HOOK, true)) { Runtime.getRuntime().addShutdownHook(new JobKiller(job)); } System.out.println(job.getJobID()); System.out.println(job.getTrackingURL()); boolean success = job.waitForCompletion(true); if (success) { Counter errorCounter = job.getCounters().findCounter("aegisthus", "error_skipped_input"); long errorCount = errorCounter != null ? errorCounter.getValue() : 0L; int maxAllowed = configuration.getInt(Feature.CONF_MAX_CORRUPT_FILES_TO_SKIP, 0); if (errorCounter != null && errorCounter.getValue() > maxAllowed) { LOG.error("Found {} corrupt files which is greater than the max allowed {}", errorCount, maxAllowed); success = false; } else if (errorCount > 0) { LOG.warn("Found {} corrupt files but not failing the job because the max allowed is {}", errorCount, maxAllowed); } } return success ? 0 : 1; }
Example 12
Source Project: hadoop-arch-book File: MRSessionize.java License: Apache License 2.0 | 4 votes |
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length != 2) { System.err.println("Usage: MRSessionize <in> <out>"); System.exit(2); } Job job = new Job(conf, "MapReduce Sessionization"); job.setJarByClass(MRSessionize.class); job.setMapperClass(SessionizeMapper.class); job.setReducerClass(SessionizeReducer.class); // WARNING: do NOT set the Combiner class // from the same IP in one place before we can do sessionization // Also, our reducer doesn't return the same key,value types it takes // It can't be used on the result of a previous reducer job.setMapOutputKeyClass(IpTimestampKey.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(Text.class); // We need these for secondary sorting. // We need to shuffle the records (between Map and Reduce phases) by using IP address as key, since that is // the field we are using for determining uniqueness of users. However, when the records arrive to the reducers, // we would like them to be sorted in ascending order of their timestamps. This concept is known as secondary // sorting since we are "secondarily" sorting the records by another key (timestamp, in our case) in addition // to the shuffle key (also called the "partition" key). // So, to get some terminology straight. // Natural key (aka Shuffle key or Partition key) is the key we use to shuffle. IP address in our case // Secondary Sorting Key is the key we use to sort within each partition that gets sent to the user. Timestamp // in our case. // Together, the natural key and secondary sorting key form what we call the composite key. This key is called // IpTimestampKey in our example. // For secondary sorting, even though we are partitioning and shuffling by only the natural key, the map output // key and the reduce input key is the composite key. 
We, however, use a custom partitioner and custom grouping // comparator that only uses the natural key part of the composite key to partition and group respectively (both // happen during the shuffle phase). // However, we have a different sort comparator which also gets used in the shuffle phase but determines how // the records are sorted when they enter the reduce phase. This custom sort comparator in our case will make use // of the entire composite key. // We found http://vangjee.wordpress.com/2012/03/20/secondary-sorting-aka-sorting-values-in-hadoops-mapreduce-programming-paradigm/ // to be very helpful, if you'd like to read more on the subject. job.setPartitionerClass(NaturalKeyPartitioner.class); job.setGroupingComparatorClass(NaturalKeyComparator.class); job.setSortComparatorClass(CompositeKeyComparator.class); FileInputFormat.addInputPath(job, new Path(otherArgs[0])); FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); System.exit(job.waitForCompletion(true) ? 0 : 1); }
Example 13
Source Project: 163-bigdate-note File: ParseLogJob.java License: GNU General Public License v3.0 | 4 votes |
public int run(String[] args) throws Exception { //创建job Configuration config = getConf(); //添加自定义配置 config.addResource("mr.xml"); Job job = Job.getInstance(config); //通过job设置一些参数 //通过job设置一些参数 job.setJarByClass(ParseLogJob.class); job.setJobName("parselog"); job.setMapperClass(LogMapper.class); job.setReducerClass(LogReducer.class); job.setMapOutputKeyClass(TextLongWritable.class); job.setGroupingComparatorClass(TextLongGroupComparator.class); job.setPartitionerClass(TextLongPartition.class); job.setMapOutputValueClass(LogWritable.class); job.setOutputValueClass(Text.class); //设置CombineFileInputFormat job.setInputFormatClass(CombineTextInputFormat.class); //添加分布式缓存 job.addCacheFile(new URI(config.get("ip.file.path"))); //设置OutputFormat job.setOutputFormatClass(LogOutputFormat.class); //添加输入和输出数据 FileInputFormat.addInputPath(job, new Path(args[0])); Path outputPath = new Path(args[1]); FileOutputFormat.setOutputPath(job, outputPath); //设置压缩类型 // FileOutputFormat.setCompressOutput(job, true); // FileOutputFormat.setOutputCompressorClass(job, LzopCodec.class); FileSystem fs = FileSystem.get(config); if (fs.exists(outputPath)) { fs.delete(outputPath, true); } //运行程序 if (!job.waitForCompletion(true)) { throw new RuntimeException(job.getJobName() + "failed!"); } return 0; }
Example 14
Source Project: hbase File: IntegrationTestBulkLoad.java License: Apache License 2.0 | 4 votes |
/** * After adding data to the table start a mr job to * @throws IOException * @throws ClassNotFoundException * @throws InterruptedException */ private void runCheck() throws IOException, ClassNotFoundException, InterruptedException { LOG.info("Running check"); Configuration conf = getConf(); String jobName = getTablename() + "_check" + EnvironmentEdgeManager.currentTime(); Path p = util.getDataTestDirOnTestFS(jobName); Job job = new Job(conf); job.setJarByClass(getClass()); job.setJobName(jobName); job.setPartitionerClass(NaturalKeyPartitioner.class); job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class); job.setSortComparatorClass(CompositeKeyComparator.class); Scan scan = new Scan(); scan.addFamily(CHAIN_FAM); scan.addFamily(SORT_FAM); scan.readVersions(1); scan.setCacheBlocks(false); scan.setBatch(1000); int replicaCount = conf.getInt(NUM_REPLICA_COUNT_KEY, NUM_REPLICA_COUNT_DEFAULT); if (replicaCount != NUM_REPLICA_COUNT_DEFAULT) { scan.setConsistency(Consistency.TIMELINE); } TableMapReduceUtil.initTableMapperJob( getTablename().getName(), scan, LinkedListCheckingMapper.class, LinkKey.class, LinkChain.class, job ); job.setReducerClass(LinkedListCheckingReducer.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(NullWritable.class); FileOutputFormat.setOutputPath(job, p); assertEquals(true, job.waitForCompletion(true)); // Delete the files. util.getTestFileSystem().delete(p, true); }
Example 15
Source Project: clickstream-tutorial File: MRSessionize.java License: Apache License 2.0 | 4 votes |
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length != 2) { System.err.println("Usage: MRSessionize <in> <out>"); System.exit(2); } Job job = new Job(conf, "MapReduce Sessionization"); job.setJarByClass(MRSessionize.class); job.setMapperClass(SessionizeMapper.class); job.setReducerClass(SessionizeReducer.class); // WARNING: do NOT set the Combiner class // from the same IP in one place before we can do sessionization // Also, our reducer doesn't return the same key,value types it takes // It can't be used on the result of a previous reducer job.setMapOutputKeyClass(IpTimestampKey.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(Text.class); // We need these for secondary sorting. // We need to shuffle the records (between Map and Reduce phases) by using IP address as key, since that is // the field we are using for determining uniqueness of users. However, when the records arrive to the reducers, // we would like them to be sorted in ascending order of their timestamps. This concept is known as secondary // sorting since we are "secondarily" sorting the records by another key (timestamp, in our case) in addition // to the shuffle key (also called the "partition" key). // So, to get some terminology straight. // Natural key (aka Shuffle key or Partition key) is the key we use to shuffle. IP address in our case // Secondary Sorting Key is the key we use to sort within each partition that gets sent to the user. Timestamp // in our case. // Together, the natural key and secondary sorting key form what we call the composite key. This key is called // IpTimestampKey in our example. // For secondary sorting, even though we are partitioning and shuffling by only the natural key, the map output // key and the reduce input key is the composite key. 
We, however, use a custom partitioner and custom grouping // comparator that only uses the natural key part of the composite key to partition and group respectively (both // happen during the shuffle phase). // However, we have a different sort comparator which also gets used in the shuffle phase but determines how // the records are sorted when they enter the reduce phase. This custom sort comparator in our case will make use // of the entire composite key. // We found http://vangjee.wordpress.com/2012/03/20/secondary-sorting-aka-sorting-values-in-hadoops-mapreduce-programming-paradigm/ // to be very helpful, if you'd like to read more on the subject. job.setPartitionerClass(NaturalKeyPartitioner.class); job.setGroupingComparatorClass(NaturalKeyComparator.class); job.setSortComparatorClass(CompositeKeyComparator.class); FileInputFormat.addInputPath(job, new Path(otherArgs[0])); FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); System.exit(job.waitForCompletion(true) ? 0 : 1); }
Example 16
Source Project: 163-bigdate-note File: ParseLogJob.java License: GNU General Public License v3.0 | 4 votes |
public int run(String[] args) throws Exception { //创建job Configuration config = getConf(); //添加自定义配置 config.addResource("mr.xml"); Job job = Job.getInstance(config); //通过job设置一些参数 job.setJarByClass(ParseLogJob.class); job.setJobName("parselog"); job.setMapperClass(LogMapper.class); job.setReducerClass(LogReducer.class); job.setMapOutputKeyClass(TextLongWritable.class); job.setGroupingComparatorClass(TextLongGroupComparator.class); job.setPartitionerClass(TextLongPartition.class); job.setMapOutputValueClass(LogWritable.class); job.setOutputValueClass(Text.class); //设置CombineFileInputFormat job.setInputFormatClass(CombineTextInputFormat.class); //添加分布式缓存 job.addCacheFile(new URI(config.get("ip.file.path"))); //设置OutputFormat job.setOutputFormatClass(LogOutputFormat.class); //添加输入和输出数据 FileInputFormat.addInputPath(job, new Path(args[0])); Path outputPath = new Path(args[1]); FileOutputFormat.setOutputPath(job, outputPath); //设置压缩类型 // FileOutputFormat.setCompressOutput(job, true); // FileOutputFormat.setOutputCompressorClass(job, LzopCodec.class); FileSystem fs = FileSystem.get(config); if (fs.exists(outputPath)) { fs.delete(outputPath, true); } //运行程序 if (!job.waitForCompletion(true)) { throw new RuntimeException(job.getJobName() + "failed!"); } return 0; }
Example 17
Source Project: 163-bigdate-note File: ParseLogJob.java License: GNU General Public License v3.0 | 4 votes |
public int run(String[] args) throws Exception { //创建job Configuration config = getConf(); //添加自定义配置 config.addResource("mr.xml"); Job job = Job.getInstance(config); //通过job设置一些参数 job.setJarByClass(ParseLogJob.class); job.setJobName("parselog"); job.setMapperClass(LogMapper.class); job.setReducerClass(LogReducer.class); job.setMapOutputKeyClass(TextLongWritable.class); job.setGroupingComparatorClass(TextLongGroupComparator.class); job.setPartitionerClass(TextLongPartition.class); job.setMapOutputValueClass(LogWritable.class); job.setOutputValueClass(Text.class); //设置CombineFileInputFormat job.setInputFormatClass(CombineTextInputFormat.class); //添加分布式缓存 job.addCacheFile(new URI(config.get("ip.file.path"))); //设置OutputFormat job.setOutputFormatClass(LogOutputFormat.class); //添加输入和输出数据 FileInputFormat.addInputPath(job, new Path(args[0])); Path outputPath = new Path(args[1]); FileOutputFormat.setOutputPath(job, outputPath); //设置压缩类型 // FileOutputFormat.setCompressOutput(job, true); // FileOutputFormat.setOutputCompressorClass(job, LzopCodec.class); FileSystem fs = FileSystem.get(config); if (fs.exists(outputPath)) { fs.delete(outputPath, true); } //运行程序 if (!job.waitForCompletion(true)) { throw new RuntimeException(job.getJobName() + "failed!"); } return 0; }
Example 18
Source Project: dkpro-c4corpus File: Phase4RemoveDuplicatesUsingReduceSideJoins.java License: Apache License 2.0 | 4 votes |
@Override public int run(String[] args) throws Exception { Job job = Job.getInstance(getConf()); job.setJarByClass(Phase4RemoveDuplicatesUsingReduceSideJoins.class); job.setJobName(Phase4RemoveDuplicatesUsingReduceSideJoins.class.getName()); // paths // text files of ids to be deleted String textFilePath = args[0]; // corpus with *.warc.gz String commaSeparatedInputFiles = args[1]; // output String outputPath = args[2]; //second input the look up text file MultipleInputs.addInputPath(job, new Path(textFilePath), TextInputFormat.class, JoinTextMapper.class); //first input the data set (check comma separated availability) MultipleInputs.addInputPath(job, new Path(commaSeparatedInputFiles), WARCInputFormat.class, JoinWARCMapper.class); job.setPartitionerClass(SourceJoiningKeyPartitioner.class); job.setGroupingComparatorClass(SourceJoiningGroupingComparator.class); job.setMapOutputKeyClass(CompositeKey.class); job.setMapOutputValueClass(WARCWritable.class); job.setReducerClass(JoinReducer.class); job.setOutputFormatClass(WARCOutputFormat.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(WARCWritable.class); FileOutputFormat.setOutputPath(job, new Path(outputPath)); FileOutputFormat.setCompressOutput(job, true); FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class); return job.waitForCompletion(true) ? 0 : 1; }
Example 19
Source Project: hiped2 File: SampleMapReduce.java License: Apache License 2.0 | 3 votes |
/**
 * Configures and runs the sort job, deleting the output directory recursively before the
 * job starts. The job's completion status is not reported to the caller.
 *
 * @param input  input path(s), passed as a string to {@code FileInputFormat.setInputPaths}
 * @param output output directory
 * @throws Exception if file-system access or job execution fails
 */
public static void runSortJob(String input, String output) throws Exception {
    Configuration conf = new Configuration();

    Job job = new Job(conf);
    job.setJarByClass(SampleMapReduce.class);

    job.setInputFormatClass(KeyValueTextInputFormat.class);
    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    // Map output is keyed by Person; the final output is plain text.
    job.setMapOutputKeyClass(Person.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Secondary-sort wiring.
    job.setPartitionerClass(PersonNamePartitioner.class);
    job.setSortComparatorClass(PersonComparator.class);
    job.setGroupingComparatorClass(PersonNameComparator.class);

    Path outputPath = new Path(output);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, outputPath);

    // Clear any previous output.
    FileSystem outputFs = outputPath.getFileSystem(conf);
    outputFs.delete(outputPath, true);

    job.waitForCompletion(true);
}
Example 20
Source Project: BigDataArchitect File: MyTopN.java License: Apache License 2.0 | 2 votes |
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(true); conf.set("mapreduce.framework.name","local"); conf.set("mapreduce.app-submission.cross-platform","true"); String[] other = new GenericOptionsParser(conf, args).getRemainingArgs(); Job job = Job.getInstance(conf); job.setJarByClass(MyTopN.class); job.setJobName("TopN"); job.setJar("C:\\Users\\Administrator\\IdeaProjects\\msbhadoop\\target\\hadoop-hdfs-1.0-0.1.jar"); //客户端规划的时候讲join的右表cache到mapTask出现的节点上 job.addCacheFile(new Path("/data/topn/dict/dict.txt").toUri()); //初学者,关注的是client端的代码梳理:因为把这块写明白了,其实你也就真的知道这个作业的开发原理; //maptask //input TextInputFormat.addInputPath(job,new Path(other[0])); Path outPath = new Path(other[1]); if(outPath.getFileSystem(conf).exists(outPath)) outPath.getFileSystem(conf).delete(outPath,true); TextOutputFormat.setOutputPath(job,outPath); //key //map job.setMapperClass(TMapper.class); job.setMapOutputKeyClass(TKey.class); job.setMapOutputValueClass(IntWritable.class); //partitioner 按 年,月 分区 -》 分区 > 分组 按 年分区!!!!!! //分区器潜台词:满足 相同的key获得相同的分区号就可以~! job.setPartitionerClass(TPartitioner.class); //sortComparator 年,月,温度 且 温度倒序 job.setSortComparatorClass(TSortComparator.class); //combine // job.setCombinerClass(); //reducetask //groupingComparator job.setGroupingComparatorClass(TGroupingComparator.class); //reduce job.setReducerClass(TReducer.class); job.waitForCompletion(true); }