Java Code Examples for org.apache.hadoop.mapreduce.Job#setOutputFormatClass()
The following examples show how to use org.apache.hadoop.mapreduce.Job#setOutputFormatClass().
Each example links to the original project and source file it was taken from.
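Job#setOutputFormatClass(Class) tells the MapReduce framework which OutputFormat implementation writes the job's final key/value pairs, so it is usually set alongside setOutputKeyClass and setOutputValueClass. Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below; the OutputFormatExample class name and the argument paths are illustrative) showing the typical call order for a map-only job that reads and writes plain text. Swapping TextOutputFormat for, say, SequenceFileOutputFormat only requires that the declared output key/value classes remain compatible with the chosen format.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class OutputFormatExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "setOutputFormatClass example");
        job.setJarByClass(OutputFormatExample.class);

        // Identity mapper, map-only job: each input record passes straight through.
        job.setMapperClass(Mapper.class);
        job.setNumReduceTasks(0);

        // The output format decides how final key/value pairs are written;
        // it must be able to handle the declared output key and value classes.
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        // args[0] = input directory, args[1] = output directory (placeholders).
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}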
Example 1
Source File: TestBAMOutputFormat.java From Hadoop-BAM with MIT License
private Path doMapReduce(final String inputFile) throws Exception {
    final FileSystem fileSystem = FileSystem.get(conf);
    final Path inputPath = new Path(inputFile);
    final Path outputPath = fileSystem.makeQualified(new Path("target/out"));
    fileSystem.delete(outputPath, true);

    final Job job = Job.getInstance(conf);
    FileInputFormat.setInputPaths(job, inputPath);
    conf.set(BAMTestNoHeaderOutputFormat.READ_HEADER_FROM_FILE, inputFile);
    job.setInputFormatClass(BAMInputFormat.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(SAMRecordWritable.class);
    job.setOutputFormatClass(BAMTestNoHeaderOutputFormat.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(SAMRecordWritable.class);
    job.setNumReduceTasks(0);
    FileOutputFormat.setOutputPath(job, outputPath);

    final boolean success = job.waitForCompletion(true);
    assertTrue(success);
    return outputPath;
}
Example 2
Source File: JobFileProcessor.java From hraven with Apache License 2.0
/**
 * @param conf to use to create and run the job
 * @param scan to be used to scan the raw table.
 * @param totalJobCount the total number of jobs that need to be run in this
 *          batch. Used in job name.
 * @return The job to be submitted to the cluster.
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
private Job getProcessingJob(Configuration conf, Scan scan, int totalJobCount)
        throws IOException {
    Configuration confClone = new Configuration(conf);
    // Turn off speculative execution.
    // Note: must be BEFORE the job construction with the new mapreduce API.
    confClone.setBoolean("mapred.map.tasks.speculative.execution", false);
    // Set up job
    Job job = new Job(confClone, getJobName(totalJobCount));
    // This is a map-only class, skip reduce step
    job.setNumReduceTasks(0);
    job.setJarByClass(JobFileProcessor.class);
    job.setOutputFormatClass(MultiTableOutputFormat.class);
    TableMapReduceUtil.initTableMapperJob(Constants.HISTORY_RAW_TABLE, scan,
        JobFileTableMapper.class, JobFileTableMapper.getOutputKeyClass(),
        JobFileTableMapper.getOutputValueClass(), job);
    return job;
}
Example 3
Source File: P2Q1.java From IntroToHadoopAndMR__Udacity_Course with Apache License 2.0
public final static void main(final String[] args) throws Exception {
    final Configuration conf = new Configuration();
    final Job job = new Job(conf, "P2Q1");
    job.setJarByClass(P2Q1.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(P2Q1Map.class);
    job.setCombinerClass(P2Q1Reduce.class);
    job.setReducerClass(P2Q1Reduce.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.waitForCompletion(true);
}
Example 4
Source File: Compute.java From laser with Apache License 2.0
public static int run(Path model, Configuration baseConf)
        throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = new Configuration(baseConf);
    conf.set("com.b5m.laser.msgpack.input.method", "ad_feature");
    conf.set("com.b5m.laser.msgpack.output.method", "precompute_ad_offline_model");
    conf.set("com.b5m.laser.offline.model", model.toString());

    Job job = Job.getInstance(conf);
    job.setJarByClass(Compute.class);
    job.setJobName("per compute stable part from offline model for each user");

    job.setInputFormatClass(MsgpackInputFormat.class);
    job.setOutputFormatClass(MsgpackOutputFormat.class);
    job.setOutputKeyClass(Long.class);
    job.setOutputValueClass(Result.class);

    job.setMapperClass(Mapper.class);
    job.setNumReduceTasks(0);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
    return 0;
}
Example 5
Source File: Distcp.java From aegisthus with Apache License 2.0
protected Job initializeJob() throws IOException {
    Job job = new Job(getConf());
    job.setJarByClass(Distcp.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(CleanOutputFormat.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);
    job.setPartitionerClass(Partition.class);

    StorageHelper sh = new StorageHelper(job.getConfiguration());
    LOG.info(String.format("temp location for job: %s", sh.getBaseTempLocation()));

    return job;
}
Example 6
Source File: Step31.java From recsys-offline with Apache License 2.0
public static void main(String[] args)
        throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf1 = new Configuration();

    Job job1 = new Job(conf1, "wiki job three1");
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);
    job1.setInputFormatClass(SequenceFileInputFormat.class);
    job1.setNumReduceTasks(1);
    job1.setJarByClass(Step31.class);
    job1.setMapperClass(WikiMapper31.class);
    job1.setMapOutputKeyClass(IntWritable.class);
    job1.setMapOutputValueClass(VectorOrPrefWritable.class);

    // set a reducer only to use SequenceFileOutputFormat
    job1.setReducerClass(WiKiReducer31.class);
    job1.setOutputKeyClass(IntWritable.class);
    job1.setOutputValueClass(VectorOrPrefWritable.class);

    // this MR's input is the MR2's output
    SequenceFileInputFormat.addInputPath(job1, new Path(INPUT_PATH));
    SequenceFileOutputFormat.setOutputPath(job1, new Path(OUTPUT_PATH));

    if (!job1.waitForCompletion(true)) {
        System.exit(1); // run error then exit
    }
}
Example 7
Source File: KMeansDriver.java From flink-perf with Apache License 2.0
public static void initializeCenters(Configuration conf, FileSystem fs,
        String pointsPath, String seqFilePath) throws Exception {
    Path points = new Path(pointsPath);
    Path seqFile = new Path(seqFilePath);
    if (fs.exists(seqFile)) {
        fs.delete(seqFile, true);
    }

    Job job = Job.getInstance(conf);
    job.setMapperClass(CenterInitializer.class);
    job.setReducerClass(Reducer.class);
    job.setNumReduceTasks(0);
    job.setMapOutputKeyClass(Centroid.class);
    job.setMapOutputValueClass(Point.class);
    job.setOutputKeyClass(Centroid.class);
    job.setOutputValueClass(Point.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setInputFormatClass(TextInputFormat.class);

    FileInputFormat.addInputPath(job, new Path(pointsPath));
    FileOutputFormat.setOutputPath(job, seqFile);

    job.waitForCompletion(true);
}
Example 8
Source File: TestMRJobs.java From hadoop with Apache License 2.0
protected Job runFailingMapperJob()
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration myConf = new Configuration(mrCluster.getConfig());
    myConf.setInt(MRJobConfig.NUM_MAPS, 1);
    myConf.setInt(MRJobConfig.MAP_MAX_ATTEMPTS, 2); // reduce the number of attempts

    Job job = Job.getInstance(myConf);

    job.setJarByClass(FailingMapper.class);
    job.setJobName("failmapper");
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(RandomInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setMapperClass(FailingMapper.class);
    job.setNumReduceTasks(0);

    FileOutputFormat.setOutputPath(job, new Path(OUTPUT_ROOT_DIR, "failmapper-output"));
    job.addFileToClassPath(APP_JAR); // The AppMaster jar itself.
    job.submit();

    String trackingUrl = job.getTrackingURL();
    String jobId = job.getJobID().toString();
    boolean succeeded = job.waitForCompletion(true);
    Assert.assertFalse(succeeded);
    Assert.assertTrue("Tracking URL was " + trackingUrl
        + " but didn't Match Job ID " + jobId,
        trackingUrl.endsWith(jobId.substring(jobId.lastIndexOf("_")) + "/"));
    return job;
}
Example 9
Source File: IntegrationTestBigLinkedList.java From hbase with Apache License 2.0
public int runRandomInputGenerator(int numMappers, long numNodes, Path tmpOutput,
        Integer width, Integer wrapMultiplier, Integer numWalkers) throws Exception {
    LOG.info("Running RandomInputGenerator with numMappers=" + numMappers
        + ", numNodes=" + numNodes);
    Job job = Job.getInstance(getConf());

    job.setJobName("Random Input Generator");
    job.setNumReduceTasks(0);
    job.setJarByClass(getClass());

    job.setInputFormatClass(GeneratorInputFormat.class);
    job.setOutputKeyClass(BytesWritable.class);
    job.setOutputValueClass(NullWritable.class);

    setJobConf(job, numMappers, numNodes, width, wrapMultiplier, numWalkers);

    job.setMapperClass(Mapper.class); // identity mapper

    FileOutputFormat.setOutputPath(job, tmpOutput);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    TableMapReduceUtil.addDependencyJarsForClasses(job.getConfiguration(), Random64.class);

    boolean success = jobCompletion(job);

    return success ? 0 : 1;
}
Example 10
Source File: LinkCountCooccurrences.java From marklogic-contentpump with Apache License 2.0
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    if (args.length < 2) {
        System.err.println("Usage: LinkCountCooccurrences configFile outputDir");
        System.exit(2);
    }
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    Job job = Job.getInstance(conf, "link count cooccurrences");
    job.setJarByClass(LinkCountCooccurrences.class);
    job.setInputFormatClass(KeyValueInputFormat.class);
    job.setMapperClass(RefMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    conf = job.getConfiguration();
    conf.addResource(otherArgs[0]);
    conf.setClass(MarkLogicConstants.INPUT_KEY_CLASS, Text.class, Writable.class);
    conf.setClass(MarkLogicConstants.INPUT_VALUE_CLASS, Text.class, Writable.class);
    conf.setClass(MarkLogicConstants.INPUT_LEXICON_FUNCTION_CLASS,
        HrefTitleMap.class, ElemAttrValueCooccurrences.class);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
Example 11
Source File: IntegrationTestBigLinkedListWithVisibility.java From hbase with Apache License 2.0
private int doVerify(Path outputDir, int numReducers)
        throws IOException, InterruptedException, ClassNotFoundException {
    job = new Job(getConf());

    job.setJobName("Link Verifier");
    job.setNumReduceTasks(numReducers);
    job.setJarByClass(getClass());

    setJobScannerConf(job);

    Scan scan = new Scan();
    scan.addColumn(FAMILY_NAME, COLUMN_PREV);
    scan.setCaching(10000);
    scan.setCacheBlocks(false);
    String[] split = labels.split(COMMA);
    scan.setAuthorizations(new Authorizations(split[this.labelIndex * 2],
        split[(this.labelIndex * 2) + 1]));

    TableMapReduceUtil.initTableMapperJob(tableName.getName(), scan, VerifyMapper.class,
        BytesWritable.class, BytesWritable.class, job);
    TableMapReduceUtil.addDependencyJars(job.getConfiguration(), AbstractHBaseTool.class);

    job.getConfiguration().setBoolean("mapreduce.map.speculative", false);

    job.setReducerClass(VerifyReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, outputDir);
    boolean success = job.waitForCompletion(true);

    return success ? 0 : 1;
}
Example 12
Source File: HFileOutputFormat3.java From kylin-on-parquet-v2 with Apache License 2.0
static void configureIncrementalLoad(Job job, HTableDescriptor tableDescriptor,
        RegionLocator regionLocator, Class<? extends OutputFormat<?, ?>> cls)
        throws IOException, UnsupportedEncodingException {
    Configuration conf = job.getConfiguration();
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(KeyValue.class);
    job.setOutputFormatClass(cls);

    // Based on the configured map output class, set the correct reducer to properly
    // sort the incoming values.
    // TODO it would be nice to pick one or the other of these formats.
    if (KeyValue.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(KeyValueSortReducer.class);
    } else if (Put.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(PutSortReducer.class);
    } else if (Text.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(TextSortReducer.class);
    } else {
        LOG.warn("Unknown map output value type:" + job.getMapOutputValueClass());
    }

    conf.setStrings("io.serializations", conf.get("io.serializations"),
        MutationSerialization.class.getName(), ResultSerialization.class.getName(),
        KeyValueSerialization.class.getName());

    // Use table's region boundaries for TOP split points.
    LOG.info("Looking up current regions for table " + tableDescriptor.getTableName());
    List<ImmutableBytesWritable> startKeys = getRegionStartKeys(regionLocator);
    LOG.info("Configuring " + startKeys.size() + " reduce partitions "
        + "to match current region count");
    job.setNumReduceTasks(startKeys.size());

    configurePartitioner(job, startKeys);
    // Set compression algorithms based on column families
    configureCompression(conf, tableDescriptor);
    configureBloomType(tableDescriptor, conf);
    configureBlockSize(tableDescriptor, conf);
    configureDataBlockEncoding(tableDescriptor, conf);

    TableMapReduceUtil.addDependencyJars(job);
    TableMapReduceUtil.initCredentials(job);
    LOG.info("Incremental table " + regionLocator.getName() + " output configured.");
}
Example 13
Source File: SequenceFileProtobufMapReduce.java From hiped2 with Apache License 2.0
/**
 * Write the sequence file.
 *
 * @param args the command-line arguments
 * @return the process exit code
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {
    Cli cli = Cli.builder().setArgs(args).addOptions(CliCommonOpts.MrIoOpts.values()).build();
    int result = cli.runCmd();
    if (result != 0) {
        return result;
    }

    Path inputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.INPUT));
    Path outputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.OUTPUT));

    Configuration conf = super.getConf();

    Job job = new Job(conf);
    job.setJarByClass(SequenceFileProtobufMapReduce.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Stock.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(PbMapper.class);
    job.setReducerClass(PbReducer.class);

    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    SequenceFileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);

    ProtobufSerialization.register(job.getConfiguration());

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    if (job.waitForCompletion(true)) {
        return 0;
    }
    return 1;
}
Example 14
Source File: JsonDataValidationExecutor.java From jumbune with GNU Lesser General Public License v3.0
public static void main(String[] args)
        throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    StringBuilder sb = new StringBuilder();
    for (int j = 2; j < otherArgs.length; j++) {
        sb.append(otherArgs[j]);
    }

    LOGGER.debug("Arguments[ " + otherArgs.length + "]" + "and values respectively ["
        + otherArgs[0] + "], " + otherArgs[1] + ", [" + otherArgs[2] + "]" + ", ["
        + otherArgs[3] + "]," + otherArgs[4]);

    String inputpath = otherArgs[0];
    String outputpath = "/tmp/jumbune/dvjsonreport" + new Date().getTime();

    String json = otherArgs[1];
    String nullCondition = otherArgs[2];
    String regex = otherArgs[3];
    String dvDir = otherArgs[4];

    if (regex.isEmpty()) {
        conf.set(JsonDataVaildationConstants.REGEX_ARGUMENT, "");
    } else {
        conf.set(JsonDataVaildationConstants.REGEX_ARGUMENT, regex);
    }

    if (nullCondition.isEmpty()) {
        conf.set(JsonDataVaildationConstants.NULL_ARGUMENT, "");
    } else {
        conf.set(JsonDataVaildationConstants.NULL_ARGUMENT, nullCondition);
    }

    conf.set(JsonDataVaildationConstants.SLAVE_DIR, dvDir);
    conf.set(JsonDataVaildationConstants.JSON_ARGUMENT, json);
    FileSystem fs = FileSystem.get(conf);

    @SuppressWarnings("deprecation")
    Job job = new Job(conf, "JSONDataValidation");
    job.setJarByClass(JsonDataValidationExecutor.class);

    job.setInputFormatClass(JsonFileInputFormat.class);
    job.setMapperClass(JsonDataValidationMapper.class);
    job.setPartitionerClass(JsonDataValidationPartitioner.class);
    job.setReducerClass(JsonDataValidationReducer.class);
    job.setNumReduceTasks(5);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(FileKeyViolationBean.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(TotalReducerViolationBean.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    Path[] inputPaths = FileUtil.getAllJsonNestedFilePath(job, inputpath);
    FileInputFormat.setInputPaths(job, inputPaths);
    FileOutputFormat.setOutputPath(job, new Path(outputpath));

    if (fs.exists(new Path(outputpath))) {
        fs.delete(new Path(outputpath), true);
    }

    job.waitForCompletion(true);

    Map<String, JsonViolationReport> jsonMap = readDataFromHdfs(conf, outputpath);
    final Gson gson = new Gson();
    final String jsonReport = gson.toJson(jsonMap);

    LOGGER.info("Completed DataValidation");
    LOGGER.info(JsonDataVaildationConstants.JSON_DV_REPORT + jsonReport);
}
Example 15
Source File: Phase4RemoveDuplicatesUsingReduceSideJoins.java From dkpro-c4corpus with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJarByClass(Phase4RemoveDuplicatesUsingReduceSideJoins.class);
    job.setJobName(Phase4RemoveDuplicatesUsingReduceSideJoins.class.getName());

    // paths
    // text files of ids to be deleted
    String textFilePath = args[0];
    // corpus with *.warc.gz
    String commaSeparatedInputFiles = args[1];
    // output
    String outputPath = args[2];

    // second input the look up text file
    MultipleInputs.addInputPath(job, new Path(textFilePath), TextInputFormat.class,
        JoinTextMapper.class);
    // first input the data set (check comma separated availability)
    MultipleInputs.addInputPath(job, new Path(commaSeparatedInputFiles), WARCInputFormat.class,
        JoinWARCMapper.class);

    job.setPartitionerClass(SourceJoiningKeyPartitioner.class);
    job.setGroupingComparatorClass(SourceJoiningGroupingComparator.class);

    job.setMapOutputKeyClass(CompositeKey.class);
    job.setMapOutputValueClass(WARCWritable.class);

    job.setReducerClass(JoinReducer.class);

    job.setOutputFormatClass(WARCOutputFormat.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(WARCWritable.class);

    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
Example 16
Source File: HalyardStats.java From Halyard with Apache License 2.0
@Override
public int run(CommandLine cmd) throws Exception {
    String source = cmd.getOptionValue('s');
    String target = cmd.getOptionValue('t');
    String targetGraph = cmd.getOptionValue('g');
    String graphContext = cmd.getOptionValue('c');
    String thresh = cmd.getOptionValue('r');
    TableMapReduceUtil.addDependencyJars(getConf(),
        HalyardExport.class,
        NTriplesUtil.class,
        Rio.class,
        AbstractRDFHandler.class,
        RDFFormat.class,
        RDFParser.class,
        HTable.class,
        HBaseConfiguration.class,
        AuthenticationProtos.class,
        Trace.class,
        Gauge.class);
    HBaseConfiguration.addHbaseResources(getConf());
    Job job = Job.getInstance(getConf(), "HalyardStats " + source
        + (target == null ? " update" : " -> " + target));
    job.getConfiguration().set(SOURCE, source);
    if (target != null) job.getConfiguration().set(TARGET, target);
    if (targetGraph != null) job.getConfiguration().set(TARGET_GRAPH, targetGraph);
    if (graphContext != null) job.getConfiguration().set(GRAPH_CONTEXT, graphContext);
    if (thresh != null) job.getConfiguration().setLong(THRESHOLD, Long.parseLong(thresh));
    job.setJarByClass(HalyardStats.class);
    TableMapReduceUtil.initCredentials(job);

    Scan scan = HalyardTableUtils.scan(null, null);
    if (graphContext != null) { // restricting stats to scan given graph context only
        List<RowRange> ranges = new ArrayList<>();
        byte[] gcHash = HalyardTableUtils.hashKey(
            SimpleValueFactory.getInstance().createIRI(graphContext));
        ranges.add(rowRange(HalyardTableUtils.CSPO_PREFIX, gcHash));
        ranges.add(rowRange(HalyardTableUtils.CPOS_PREFIX, gcHash));
        ranges.add(rowRange(HalyardTableUtils.COSP_PREFIX, gcHash));
        if (target == null) {
            // add stats context to the scanned row ranges (when in update mode)
            // to delete the related stats during MapReduce
            ranges.add(rowRange(HalyardTableUtils.CSPO_PREFIX,
                HalyardTableUtils.hashKey(targetGraph == null
                    ? HALYARD.STATS_GRAPH_CONTEXT
                    : SimpleValueFactory.getInstance().createIRI(targetGraph))));
        }
        scan.setFilter(new MultiRowRangeFilter(ranges));
    }
    TableMapReduceUtil.initTableMapperJob(source, scan, StatsMapper.class,
        ImmutableBytesWritable.class, LongWritable.class, job);
    job.setPartitionerClass(StatsPartitioner.class);
    job.setReducerClass(StatsReducer.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    if (job.waitForCompletion(true)) {
        LOG.info("Stats Generation Completed..");
        return 0;
    }
    return -1;
}
Example 17
Source File: ScanPerformanceEvaluation.java From hbase with Apache License 2.0
public void testSnapshotScanMapReduce()
        throws IOException, InterruptedException, ClassNotFoundException {
    Stopwatch scanOpenTimer = Stopwatch.createUnstarted();
    Stopwatch scanTimer = Stopwatch.createUnstarted();

    Scan scan = getScan();

    String jobName = "testSnapshotScanMapReduce";

    Job job = new Job(conf);
    job.setJobName(jobName);
    job.setJarByClass(getClass());

    TableMapReduceUtil.initTableSnapshotMapperJob(this.snapshotName, scan, MyMapper.class,
        NullWritable.class, NullWritable.class, job, true, new Path(restoreDir));

    job.setNumReduceTasks(0);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(NullWritable.class);
    job.setOutputFormatClass(NullOutputFormat.class);

    scanTimer.start();
    job.waitForCompletion(true);
    scanTimer.stop();

    Counters counters = job.getCounters();
    long numRows = counters.findCounter(ScanCounter.NUM_ROWS).getValue();
    long numCells = counters.findCounter(ScanCounter.NUM_CELLS).getValue();

    long totalBytes = counters.findCounter(HBASE_COUNTER_GROUP_NAME, "BYTES_IN_RESULTS").getValue();
    double throughput = (double) totalBytes / scanTimer.elapsed(TimeUnit.SECONDS);
    double throughputRows = (double) numRows / scanTimer.elapsed(TimeUnit.SECONDS);
    double throughputCells = (double) numCells / scanTimer.elapsed(TimeUnit.SECONDS);

    System.out.println("HBase scan mapreduce: ");
    System.out.println("total time to open scanner: "
        + scanOpenTimer.elapsed(TimeUnit.MILLISECONDS) + " ms");
    System.out.println("total time to scan: " + scanTimer.elapsed(TimeUnit.MILLISECONDS) + " ms");

    System.out.println("total bytes: " + totalBytes + " bytes ("
        + StringUtils.humanReadableInt(totalBytes) + ")");
    System.out.println("throughput : " + StringUtils.humanReadableInt((long) throughput) + "B/s");
    System.out.println("total rows : " + numRows);
    System.out.println("throughput : " + StringUtils.humanReadableInt((long) throughputRows) + " rows/s");
    System.out.println("total cells : " + numCells);
    System.out.println("throughput : " + StringUtils.humanReadableInt((long) throughputCells) + " cells/s");
}
Example 18
Source File: DomainStatistics.java From nutch-htmlunit with Apache License 2.0
public int run(String[] args) throws Exception {
    if (args.length < 3) {
        System.out.println("usage: DomainStatistics inputDirs outDir host|domain|suffix|tld [numOfReducer]");
        return 1;
    }
    String inputDir = args[0];
    String outputDir = args[1];
    int numOfReducers = 1;

    if (args.length > 3) {
        numOfReducers = Integer.parseInt(args[3]);
    }

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("DomainStatistics: starting at " + sdf.format(start));

    int mode = 0;
    String jobName = "DomainStatistics";
    if (args[2].equals("host")) {
        jobName = "Host statistics";
        mode = MODE_HOST;
    } else if (args[2].equals("domain")) {
        jobName = "Domain statistics";
        mode = MODE_DOMAIN;
    } else if (args[2].equals("suffix")) {
        jobName = "Suffix statistics";
        mode = MODE_SUFFIX;
    } else if (args[2].equals("tld")) {
        jobName = "TLD statistics";
        mode = MODE_TLD;
    }

    Configuration conf = getConf();
    conf.setInt("domain.statistics.mode", mode);
    conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    Job job = new Job(conf, jobName);
    job.setJarByClass(DomainStatistics.class);

    String[] inputDirsSpecs = inputDir.split(",");
    for (int i = 0; i < inputDirsSpecs.length; i++) {
        FileInputFormat.addInputPath(job, new Path(inputDirsSpecs[i]));
    }

    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    job.setMapperClass(DomainStatisticsMapper.class);
    job.setReducerClass(DomainStatisticsReducer.class);
    job.setCombinerClass(DomainStatisticsCombiner.class);
    job.setNumReduceTasks(numOfReducers);

    try {
        job.waitForCompletion(true);
    } catch (Exception e) {
        throw e;
    }

    long end = System.currentTimeMillis();
    LOG.info("DomainStatistics: finished at " + sdf.format(end) + ", elapsed: "
        + TimingUtil.elapsedTime(start, end));

    return 0;
}
Example 19
Source File: TestMRJobs.java From big-c with Apache License 2.0
private void testJobClassloader(boolean useCustomClasses)
        throws IOException, InterruptedException, ClassNotFoundException {
    LOG.info("\n\n\nStarting testJobClassloader()"
        + " useCustomClasses=" + useCustomClasses);
    if (!(new File(MiniMRYarnCluster.APPJAR)).exists()) {
        LOG.info("MRAppJar " + MiniMRYarnCluster.APPJAR
            + " not found. Not running test.");
        return;
    }
    final Configuration sleepConf = new Configuration(mrCluster.getConfig());
    // set master address to local to test that local mode applied iff framework == local
    sleepConf.set(MRConfig.MASTER_ADDRESS, "local");
    sleepConf.setBoolean(MRJobConfig.MAPREDUCE_JOB_CLASSLOADER, true);
    if (useCustomClasses) {
        // to test AM loading user classes such as output format class, we want
        // to blacklist them from the system classes (they need to be prepended
        // as the first match wins)
        String systemClasses = ApplicationClassLoader.SYSTEM_CLASSES_DEFAULT;
        // exclude the custom classes from system classes
        systemClasses = "-" + CustomOutputFormat.class.getName() + ",-"
            + CustomSpeculator.class.getName() + ","
            + systemClasses;
        sleepConf.set(MRJobConfig.MAPREDUCE_JOB_CLASSLOADER_SYSTEM_CLASSES,
            systemClasses);
    }
    sleepConf.set(MRJobConfig.IO_SORT_MB, TEST_IO_SORT_MB);
    sleepConf.set(MRJobConfig.MR_AM_LOG_LEVEL, Level.ALL.toString());
    sleepConf.set(MRJobConfig.MAP_LOG_LEVEL, Level.ALL.toString());
    sleepConf.set(MRJobConfig.REDUCE_LOG_LEVEL, Level.ALL.toString());
    sleepConf.set(MRJobConfig.MAP_JAVA_OPTS, "-verbose:class");
    final SleepJob sleepJob = new SleepJob();
    sleepJob.setConf(sleepConf);
    final Job job = sleepJob.createJob(1, 1, 10, 1, 10, 1);
    job.setMapperClass(ConfVerificationMapper.class);
    job.addFileToClassPath(APP_JAR); // The AppMaster jar itself.
    job.setJarByClass(SleepJob.class);
    job.setMaxMapAttempts(1); // speed up failures
    if (useCustomClasses) {
        // set custom output format class and speculator class
        job.setOutputFormatClass(CustomOutputFormat.class);
        final Configuration jobConf = job.getConfiguration();
        jobConf.setClass(MRJobConfig.MR_AM_JOB_SPECULATOR, CustomSpeculator.class,
            Speculator.class);
        // speculation needs to be enabled for the speculator to be loaded
        jobConf.setBoolean(MRJobConfig.MAP_SPECULATIVE, true);
    }
    job.submit();
    boolean succeeded = job.waitForCompletion(true);
    Assert.assertTrue("Job status: " + job.getStatus().getFailureInfo(),
        succeeded);
}
Example 20
Source File: OSMConversionRunner.java From geowave with Apache License 2.0
@Override
public int run(final String[] args) throws Exception {

    final Configuration conf = getConf();
    final AccumuloRequiredOptions accumuloOptions =
        (AccumuloRequiredOptions) inputStoreOptions.getFactoryOptions();

    // job settings
    final Job job = Job.getInstance(conf, ingestOptions.getJobName() + "NodeConversion");
    job.setJarByClass(OSMConversionRunner.class);

    job.getConfiguration().set("osm_mapping", ingestOptions.getMappingContents());
    job.getConfiguration().set("arguments", ingestOptions.serializeToString());

    if (ingestOptions.getVisibilityOptions().getVisibility() != null) {
        job.getConfiguration().set(
            AbstractMapReduceIngest.GLOBAL_VISIBILITY_KEY,
            ingestOptions.getVisibilityOptions().getVisibility());
    }

    // input format
    AbstractInputFormat.setConnectorInfo(
        job,
        accumuloOptions.getUser(),
        new PasswordToken(accumuloOptions.getPassword()));
    InputFormatBase.setInputTableName(job, ingestOptions.getQualifiedTableName());
    AbstractInputFormat.setZooKeeperInstance(
        job,
        new ClientConfiguration().withInstance(accumuloOptions.getInstance()).withZkHosts(
            accumuloOptions.getZookeeper()));
    AbstractInputFormat.setScanAuthorizations(
        job,
        new Authorizations(ingestOptions.getVisibilityOptions().getVisibility()));
    final IteratorSetting is = new IteratorSetting(50, "WholeRow", WholeRowIterator.class);
    InputFormatBase.addIterator(job, is);
    job.setInputFormatClass(AccumuloInputFormat.class);
    final Range r = new Range();
    // final ArrayList<Pair<Text, Text>> columns = new ArrayList<>();
    InputFormatBase.setRanges(job, Arrays.asList(r));

    // output format
    GeoWaveOutputFormat.setStoreOptions(job.getConfiguration(), inputStoreOptions);
    final AccumuloOptions options = new AccumuloOptions();
    final AdapterStore as =
        new AdapterStoreImpl(
            new AccumuloOperations(
                accumuloOptions.getZookeeper(),
                accumuloOptions.getInstance(),
                accumuloOptions.getUser(),
                accumuloOptions.getPassword(),
                accumuloOptions.getGeoWaveNamespace(),
                options),
            options);
    for (final FeatureDataAdapter fda : FeatureDefinitionSet.featureAdapters.values()) {
        as.addAdapter(fda);
        GeoWaveOutputFormat.addDataAdapter(job.getConfiguration(), fda);
    }
    final Index primaryIndex =
        new SpatialDimensionalityTypeProvider().createIndex(new SpatialOptions());
    GeoWaveOutputFormat.addIndex(job.getConfiguration(), primaryIndex);
    job.getConfiguration().set(AbstractMapReduceIngest.INDEX_NAMES_KEY, primaryIndex.getName());
    job.setOutputFormatClass(GeoWaveOutputFormat.class);
    job.setMapOutputKeyClass(GeoWaveOutputKey.class);
    job.setMapOutputValueClass(SimpleFeature.class);

    // mapper
    job.setMapperClass(OSMConversionMapper.class);
    // reducer
    job.setNumReduceTasks(0);

    return job.waitForCompletion(true) ? 0 : -1;
}