Java Code Examples for org.apache.hadoop.mapreduce.lib.output.TextOutputFormat#setOutputPath()
The following examples show how to use org.apache.hadoop.mapreduce.lib.output.TextOutputFormat#setOutputPath().
Links above each example lead to the original project and source file.
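TextOutputFormat.setOutputPath() is inherited from FileOutputFormat: it records the job's output directory in the job configuration, and that directory must not already exist when the job runs, otherwise the output check fails with a FileAlreadyExistsException. This is why several examples below delete the output path before submitting. As a minimal sketch of the call site (the class name and argument handling are illustrative, not taken from any project below):

// A minimal driver sketch (illustrative only): wire TextInputFormat/TextOutputFormat and
// set the output directory with TextOutputFormat.setOutputPath().
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class SetOutputPathExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "setOutputPath-example");
        job.setJarByClass(SetOutputPathExample.class);

        // Read plain text lines from the first argument.
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path(args[0]));

        // Write tab-separated key/value text files to the second argument.
        // The directory must not exist yet, or the job fails its output check.
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

With no mapper or reducer configured, the identity defaults simply copy the input records (keyed by byte offset) into part-r-* text files under the output directory.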
Example 1
Source File: WordCount.java From flink with Apache License 2.0 | 5 votes |
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <result path>");
        return;
    }

    final String inputPath = args[0];
    final String outputPath = args[1];

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // Set up the Hadoop Input Format
    Job job = Job.getInstance();
    HadoopInputFormat<LongWritable, Text> hadoopInputFormat =
        new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, job);
    TextInputFormat.addInputPath(job, new Path(inputPath));

    // Create a Flink job with it
    DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

    // Tokenize the line and convert from Writable "Text" to String for better handling
    DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer());

    // Sum up the words
    DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);

    // Convert String back to Writable "Text" for use with Hadoop Output Format
    DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper());

    // Set up Hadoop Output Format
    HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat =
        new HadoopOutputFormat<Text, IntWritable>(new TextOutputFormat<Text, IntWritable>(), job);
    hadoopOutputFormat.getConfiguration().set("mapreduce.output.textoutputformat.separator", " ");
    hadoopOutputFormat.getConfiguration().set("mapred.textoutputformat.separator", " "); // set the value for both, since this test
    TextOutputFormat.setOutputPath(job, new Path(outputPath));

    // Output & Execute
    hadoopResult.output(hadoopOutputFormat);
    env.execute("Word Count");
}
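The driver above references a Tokenizer and a HadoopDatatypeMapper that are not shown on this page. The following is a sketch of what such classes could look like for Flink's DataSet API; it is illustrative, not the project's actual implementation, and assumes the usual imports (org.apache.flink.api.common.functions.FlatMapFunction and MapFunction, org.apache.flink.api.java.tuple.Tuple2, org.apache.flink.util.Collector) alongside the Hadoop Writable types already used by the driver.

// Hypothetical helpers for the Flink word-count driver above.
public static final class Tokenizer
        implements FlatMapFunction<Tuple2<LongWritable, Text>, Tuple2<String, Integer>> {
    @Override
    public void flatMap(Tuple2<LongWritable, Text> value, Collector<Tuple2<String, Integer>> out) {
        // Split the Hadoop Text line into lowercase words and emit (word, 1) pairs.
        for (String token : value.f1.toString().toLowerCase().split("\\W+")) {
            if (token.length() > 0) {
                out.collect(new Tuple2<>(token, 1));
            }
        }
    }
}

public static final class HadoopDatatypeMapper
        implements MapFunction<Tuple2<String, Integer>, Tuple2<Text, IntWritable>> {
    @Override
    public Tuple2<Text, IntWritable> map(Tuple2<String, Integer> value) {
        // Convert back to Hadoop Writable types so HadoopOutputFormat can serialize them.
        return new Tuple2<>(new Text(value.f0), new IntWritable(value.f1));
    }
}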
Example 2
Source File: TestInputOutputFormat.java From parquet-mr with Apache License 2.0 | 5 votes |
@Test
public void testSchemaEvolution() throws Exception {
    final Configuration conf = new Configuration();
    final Path inputPath = new Path("target/test/thrift/schema_evolution/in");
    final Path parquetPath = new Path("target/test/thrift/schema_evolution/parquet");
    final Path outputPath = new Path("target/test/thrift/schema_evolution/out");
    final FileSystem fileSystem = parquetPath.getFileSystem(conf);
    fileSystem.delete(inputPath, true);
    final FSDataOutputStream in = fileSystem.create(inputPath);
    in.writeUTF("Alice\nBob\nCharles\n");
    in.close();
    fileSystem.delete(parquetPath, true);
    fileSystem.delete(outputPath, true);
    {
        write(conf, inputPath, new Path(parquetPath, "V1"), TestInputOutputFormat.SchemaEvolutionMapper1.class, StructV1.class);
        write(conf, inputPath, new Path(parquetPath, "V2"), TestInputOutputFormat.SchemaEvolutionMapper2.class, StructV2.class);
        write(conf, inputPath, new Path(parquetPath, "V3"), TestInputOutputFormat.SchemaEvolutionMapper3.class, StructV3.class);
    }
    {
        final Job job = new Job(conf, "read");
        job.setInputFormatClass(ParquetThriftInputFormat.class);
        ParquetThriftInputFormat.setInputPaths(job, new Path(parquetPath, "*"));
        ParquetThriftInputFormat.setThriftClass(job.getConfiguration(), StructV3.class);
        job.setMapperClass(TestInputOutputFormat.SchemaEvolutionReadMapper.class);
        job.setNumReduceTasks(0);
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, outputPath);
        waitForJob(job);
    }
    read(outputPath + "/part-m-00000", 3);
    read(outputPath + "/part-m-00001", 3);
    read(outputPath + "/part-m-00002", 3);
}
Example 3
Source File: BusyLegs.java From gemfirexd-oss with Apache License 2.0 | 5 votes |
public int run(String[] args) throws Exception {
    GfxdDataSerializable.initTypes();
    Configuration conf = getConf();

    Path outputPath = new Path(args[0]);
    String hdfsHomeDir = args[1];
    String tableName = args[2];

    outputPath.getFileSystem(conf).delete(outputPath, true);

    conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir);
    conf.set(RowInputFormat.INPUT_TABLE, tableName);
    conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);

    Job job = Job.getInstance(conf, "Busy Leg Count");
    job.setInputFormatClass(RowInputFormat.class);

    // configure mapper and reducer
    job.setMapperClass(SampleMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    // configure output
    TextOutputFormat.setOutputPath(job, outputPath);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
Example 4
Source File: ReduceSideJoinBloomFilter.java From hadoop-map-reduce-patterns with Apache License 2.0 | 5 votes |
@Override
public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();
    GenericOptionsParser parser = new GenericOptionsParser(conf, args);
    String[] otherArgs = parser.getRemainingArgs();
    if (otherArgs.length != 4) {
        printUsage();
    }
    Job job = new Job(conf, "ReduceSideJoinBloomFilter");
    job.setJarByClass(ReduceSideJoinBloomFilter.class);

    // Use MultipleInputs to set which input uses what mapper
    // This will keep parsing of each data set separate from a logical standpoint
    // The first two elements of the args array are the two inputs
    MultipleInputs.addInputPath(job, new Path(args[0]), TextInputFormat.class, UserJoinMapper.class);
    MultipleInputs.addInputPath(job, new Path(args[1]), TextInputFormat.class, CommentJoinMapperWithBloom.class);
    job.getConfiguration().set("join.type", args[2]);

    job.setReducerClass(UserJoinReducer.class);

    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, new Path(args[3]));

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 2;
}
Example 5
Source File: BusyAirports.java From gemfirexd-oss with Apache License 2.0 | 5 votes |
public int run(String[] args) throws Exception {
    GfxdDataSerializable.initTypes();
    Configuration conf = getConf();

    Path outputPath = new Path(args[0]);
    String hdfsHomeDir = args[1];
    String tableName = args[2];

    outputPath.getFileSystem(conf).delete(outputPath, true);

    conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir);
    conf.set(RowInputFormat.INPUT_TABLE, tableName);
    conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);

    Job job = Job.getInstance(conf, "Busy Airport Count");
    job.setInputFormatClass(RowInputFormat.class);

    // configure mapper and reducer
    job.setMapperClass(SampleMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    // configure output
    TextOutputFormat.setOutputPath(job, outputPath);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
Example 6
Source File: RegexExample.java From accumulo-examples with Apache License 2.0 | 5 votes |
public static void main(String[] args) throws Exception {
    Opts opts = new Opts();
    opts.parseArgs(RegexExample.class.getName(), args);

    Job job = Job.getInstance(opts.getHadoopConfig());
    job.setJobName(RegexExample.class.getSimpleName());
    job.setJarByClass(RegexExample.class);

    job.setInputFormatClass(AccumuloInputFormat.class);

    IteratorSetting regex = new IteratorSetting(50, "regex", RegExFilter.class);
    RegExFilter.setRegexs(regex, opts.rowRegex, opts.columnFamilyRegex, opts.columnQualifierRegex,
        opts.valueRegex, false);

    AccumuloInputFormat.configure().clientProperties(opts.getClientProperties())
        .table(opts.tableName).addIterator(regex).store(job);

    job.setMapperClass(RegexMapper.class);
    job.setMapOutputKeyClass(Key.class);
    job.setMapOutputValueClass(Value.class);
    job.setNumReduceTasks(0);

    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, new Path(opts.destination));

    log.info("setRowRegex: " + opts.rowRegex);
    log.info("setColumnFamilyRegex: " + opts.columnFamilyRegex);
    log.info("setColumnQualifierRegex: " + opts.columnQualifierRegex);
    log.info("setValueRegex: " + opts.valueRegex);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
Example 7
Source File: TestInputOutputFormatWithPadding.java From parquet-mr with Apache License 2.0 | 4 votes |
@Test
public void testBasicBehaviorWithPadding() throws Exception {
    HadoopOutputFile.getBlockFileSystems().add("file");

    File inputFile = temp.newFile();
    FileOutputStream out = new FileOutputStream(inputFile);
    out.write(FILE_CONTENT.getBytes("UTF-8"));
    out.close();

    File tempFolder = temp.newFolder();
    tempFolder.delete();
    Path tempPath = new Path(tempFolder.toURI());

    File outputFolder = temp.newFile();
    outputFolder.delete();

    Configuration conf = new Configuration();
    // May test against multiple hadoop versions
    conf.set("dfs.block.size", "1024");
    conf.set("dfs.blocksize", "1024");
    conf.set("dfs.blockSize", "1024");
    conf.set("fs.local.block.size", "1024");

    // don't use a cached FS with a different block size
    conf.set("fs.file.impl.disable.cache", "true");

    // disable summary metadata, it isn't needed
    conf.set("parquet.enable.summary-metadata", "false");
    conf.set("parquet.example.schema", PARQUET_TYPE.toString());

    {
        Job writeJob = new Job(conf, "write");
        writeJob.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(writeJob, new Path(inputFile.toString()));

        writeJob.setOutputFormatClass(ParquetOutputFormat.class);
        writeJob.setMapperClass(Writer.class);
        writeJob.setNumReduceTasks(0); // write directly to Parquet without reduce
        ParquetOutputFormat.setWriteSupportClass(writeJob, GroupWriteSupport.class);
        ParquetOutputFormat.setBlockSize(writeJob, 1024);
        ParquetOutputFormat.setPageSize(writeJob, 512);
        ParquetOutputFormat.setDictionaryPageSize(writeJob, 512);
        ParquetOutputFormat.setEnableDictionary(writeJob, true);
        ParquetOutputFormat.setMaxPaddingSize(writeJob, 1023); // always pad
        ParquetOutputFormat.setOutputPath(writeJob, tempPath);

        waitForJob(writeJob);
    }

    // make sure padding was added
    File parquetFile = getDataFile(tempFolder);
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, new Path(parquetFile.toString()),
        ParquetMetadataConverter.NO_FILTER);
    for (BlockMetaData block : footer.getBlocks()) {
        Assert.assertTrue("Block should start at a multiple of the block size",
            block.getStartingPos() % 1024 == 0);
    }

    {
        Job readJob = new Job(conf, "read");
        readJob.setInputFormatClass(NoSplits.class);
        ParquetInputFormat.setReadSupportClass(readJob, GroupReadSupport.class);
        TextInputFormat.addInputPath(readJob, tempPath);

        readJob.setOutputFormatClass(TextOutputFormat.class);
        readJob.setMapperClass(Reader.class);
        readJob.setNumReduceTasks(0); // write directly to text without reduce
        TextOutputFormat.setOutputPath(readJob, new Path(outputFolder.toString()));

        waitForJob(readJob);
    }

    File dataFile = getDataFile(outputFolder);
    Assert.assertNotNull("Should find a data file", dataFile);

    StringBuilder contentBuilder = new StringBuilder();
    for (String line : Files.readAllLines(dataFile.toPath(), StandardCharsets.UTF_8)) {
        contentBuilder.append(line);
    }
    String reconstructed = contentBuilder.toString();
    Assert.assertEquals("Should match written file content", FILE_CONTENT, reconstructed);

    HadoopOutputFile.getBlockFileSystems().remove("file");
}
Example 8
Source File: MobRefReporter.java From hbase with Apache License 2.0 | 4 votes |
/**
 * Main method for the tool.
 * @return 0 if success, 1 for bad args. 2 if job aborted with an exception,
 *         3 if mr job was unsuccessful
 */
public int run(String[] args) throws IOException, InterruptedException {
    // TODO make family and table optional
    if (args.length != 3) {
        printUsage();
        return 1;
    }
    final String output = args[0];
    final String tableName = args[1];
    final String familyName = args[2];
    final long reportStartTime = EnvironmentEdgeManager.currentTime();
    Configuration conf = getConf();
    try {
        FileSystem fs = FileSystem.get(conf);
        // check whether the current user is the same one with the owner of hbase root
        String currentUserName = UserGroupInformation.getCurrentUser().getShortUserName();
        FileStatus[] hbaseRootFileStat = fs.listStatus(new Path(conf.get(HConstants.HBASE_DIR)));
        if (hbaseRootFileStat.length > 0) {
            String owner = hbaseRootFileStat[0].getOwner();
            if (!owner.equals(currentUserName)) {
                String errorMsg = "The current user[" + currentUserName
                    + "] does not have hbase root credentials."
                    + " If this job fails due to an inability to read HBase's internal directories, "
                    + "you will need to rerun as a user with sufficient permissions. The HBase superuser "
                    + "is a safe choice.";
                LOG.warn(errorMsg);
            }
        } else {
            LOG.error("The passed configs point to an HBase dir does not exist: {}",
                conf.get(HConstants.HBASE_DIR));
            throw new IOException("The target HBase does not exist");
        }

        byte[] family;
        int maxVersions;
        TableName tn = TableName.valueOf(tableName);
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Admin admin = connection.getAdmin()) {
            TableDescriptor htd = admin.getDescriptor(tn);
            ColumnFamilyDescriptor hcd = htd.getColumnFamily(Bytes.toBytes(familyName));
            if (hcd == null || !hcd.isMobEnabled()) {
                throw new IOException("Column family " + familyName + " is not a MOB column family");
            }
            family = hcd.getName();
            maxVersions = hcd.getMaxVersions();
        }

        String id = getClass().getSimpleName() + UUID.randomUUID().toString().replace("-", "");
        Job job = null;
        Scan scan = new Scan();
        scan.addFamily(family);
        // Do not retrieve the mob data when scanning
        scan.setAttribute(MobConstants.MOB_SCAN_RAW, Bytes.toBytes(Boolean.TRUE));
        scan.setAttribute(MobConstants.MOB_SCAN_REF_ONLY, Bytes.toBytes(Boolean.TRUE));
        // If a scanner caching value isn't set, pick a smaller default since we know we're doing
        // a full table scan and don't want to impact other clients badly.
        scan.setCaching(conf.getInt(HConstants.HBASE_CLIENT_SCANNER_CACHING, 10000));
        scan.setCacheBlocks(false);
        scan.readVersions(maxVersions);
        conf.set(REPORT_JOB_ID, id);

        job = Job.getInstance(conf);
        job.setJarByClass(getClass());
        TableMapReduceUtil.initTableMapperJob(tn, scan, MobRefMapper.class, Text.class,
            ImmutableBytesWritable.class, job);

        job.setReducerClass(MobRefReducer.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path(output));

        job.setJobName(getClass().getSimpleName() + "-" + tn + "-" + familyName);
        // for use in the reducer. easier than re-parsing it out of the scan string.
        job.getConfiguration().set(TableInputFormat.SCAN_COLUMN_FAMILY, familyName);

        // Use when we start this job as the base point for file "recency".
        job.getConfiguration().setLong(REPORT_START_DATETIME, reportStartTime);

        if (job.waitForCompletion(true)) {
            LOG.info("Finished creating report for '{}', family='{}'", tn, familyName);
        } else {
            System.err.println("Job was not successful");
            return 3;
        }
        return 0;

    } catch (ClassNotFoundException | RuntimeException | IOException | InterruptedException e) {
        System.err.println("Job aborted due to exception " + e);
        return 2; // job failed
    }
}
Example 9
Source File: TopBusyAirport.java From gemfirexd-oss with Apache License 2.0 | 4 votes |
public int run(String[] args) throws Exception {
    GfxdDataSerializable.initTypes();
    Configuration conf = getConf();

    Path outputPath = new Path(args[0]);
    Path intermediateOutputPath = new Path(args[0] + "_int");
    String hdfsHomeDir = args[1];
    String tableName = args[2];

    outputPath.getFileSystem(conf).delete(outputPath, true);
    intermediateOutputPath.getFileSystem(conf).delete(intermediateOutputPath, true);

    conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir);
    conf.set(RowInputFormat.INPUT_TABLE, tableName);
    conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);

    Job job = Job.getInstance(conf, "Busy Airport Count");
    job.setInputFormatClass(RowInputFormat.class);

    // configure mapper and reducer
    job.setMapperClass(SampleMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    // Only have one reduce task so that all of the results from mapping are
    // processed in one place.
    job.setNumReduceTasks(1);

    // configure output
    TextOutputFormat.setOutputPath(job, intermediateOutputPath);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    int rc = job.waitForCompletion(true) ? 0 : 1;
    if (rc == 0) {
        Job topJob = Job.getInstance(getConf(), "Top Busy Airport");

        // We want the task to run on a single VM
        topJob.setNumReduceTasks(1);

        // Set the inputs
        topJob.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(topJob, intermediateOutputPath);

        // Set the mapper and reducer
        topJob.setMapperClass(TopBusyAirportMapper.class);
        topJob.setReducerClass(TopBusyAirportReducer.class);

        // Set the outputs
        TextOutputFormat.setOutputPath(topJob, outputPath);
        topJob.setOutputFormatClass(TextOutputFormat.class);
        topJob.setOutputKeyClass(Text.class);
        topJob.setOutputValueClass(IntWritable.class);

        topJob.setMapOutputKeyClass(Text.class);
        topJob.setMapOutputValueClass(StringIntPair.class);

        rc = topJob.waitForCompletion(true) ? 0 : 1;
    }
    return rc;
}
Example 10
Source File: TopBusyAirportGemfirexd.java From gemfirexd-oss with Apache License 2.0 | 4 votes |
public int run(String[] args) throws Exception {
    GfxdDataSerializable.initTypes();
    Configuration conf = getConf();

    Path outputPath = new Path(args[0]);
    Path intermediateOutputPath = new Path(args[0] + "_int");
    String hdfsHomeDir = args[1];
    String tableName = args[2];

    outputPath.getFileSystem(conf).delete(outputPath, true);
    intermediateOutputPath.getFileSystem(conf).delete(intermediateOutputPath, true);

    conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir);
    conf.set(RowInputFormat.INPUT_TABLE, tableName);
    conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);

    Job job = Job.getInstance(conf, "Busy Airport Count");
    job.setJarByClass(TopBusyAirportGemfirexd.class);

    job.setInputFormatClass(RowInputFormat.class);

    // configure mapper and reducer
    job.setMapperClass(SampleMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    // configure output
    TextOutputFormat.setOutputPath(job, intermediateOutputPath);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    int rc = job.waitForCompletion(true) ? 0 : 1;
    if (rc == 0) {
        Configuration topConf = getConf();

        String gemfirexdUrl = topConf.get("gemfirexd.url", "jdbc:gemfirexd://localhost:1527");
        topConf.set(RowOutputFormat.OUTPUT_URL, gemfirexdUrl);
        topConf.set(RowOutputFormat.OUTPUT_TABLE, "APP.BUSY_AIRPORT");
        Configuration.dumpConfiguration(topConf, new PrintWriter(System.out));

        Job topJob = Job.getInstance(topConf, "Top Busy Airport");

        // We want the task to run on a single VM
        topJob.setNumReduceTasks(1);

        // Set the inputs
        topJob.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(topJob, intermediateOutputPath);

        // Set the mapper and reducer
        topJob.setMapperClass(TopBusyAirportMapper.class);
        topJob.setReducerClass(TopBusyAirportReducer.class);

        topJob.setMapOutputKeyClass(Text.class);
        topJob.setMapOutputValueClass(StringIntPair.class);

        // Set the outputs
        TextOutputFormat.setOutputPath(topJob, outputPath);
        topJob.setOutputFormatClass(RowOutputFormat.class);
        topJob.setOutputKeyClass(Key.class);
        topJob.setOutputValueClass(BusyAirportModel.class);

        rc = topJob.waitForCompletion(true) ? 0 : 1;
    }
    return rc;
}
Example 11
Source File: TestJob.java From hadoop-louvain-community with Apache License 2.0 | 4 votes |
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = new Configuration();
    Job job = new Job(conf);
    job.setJobName(TestJob.class.getName());

    job.setJarByClass(TestJob.class);
    job.setMapperClass(MapJob.class);
    job.setReducerClass(ReduceJob.class);

    // Hello there ZipFileInputFormat!
    job.setInputFormatClass(GraphInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    TextOutputFormat.setOutputPath(job, new Path(args[1]));

    job.waitForCompletion(true);
}
Example 12
Source File: JoinFilterExampleMRJob.java From hadoop-arch-book with Apache License 2.0 | 4 votes |
public int run(String[] args) throws Exception {
    String inputFoo = args[0];
    String inputBar = args[1];
    String output = args[2];
    String fooValueMaxFilter = args[3];
    String joinValueMaxFilter = args[4];
    int numberOfReducers = Integer.parseInt(args[5]);

    // A
    Job job = Job.getInstance();

    // B
    job.setJarByClass(JoinFilterExampleMRJob.class);
    job.setJobName("JoinFilterExampleMRJob");

    // C
    Configuration config = job.getConfiguration();
    config.set(FOO_TABLE_CONF, inputFoo);
    config.set(BAR_TABLE_CONF, inputBar);
    config.set(FOO_VAL_MAX_CONF, fooValueMaxFilter);
    config.set(JOIN_VAL_MAX_CONF, joinValueMaxFilter);

    // D
    job.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(job, new Path(inputFoo));
    TextInputFormat.addInputPath(job, new Path(inputBar));

    // E
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, new Path(output));

    // F
    job.setMapperClass(JoinFilterMapper.class);
    job.setReducerClass(JoinFilterReducer.class);
    job.setPartitionerClass(JoinFilterPartitioner.class);

    // G
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    // H
    job.setNumReduceTasks(numberOfReducers);

    // I
    job.waitForCompletion(true);
    return 0;
}
Example 13
Source File: TestNewCombinerGrouping.java From big-c with Apache License 2.0 | 4 votes |
@Test
public void testCombiner() throws Exception {
    if (!new File(TEST_ROOT_DIR).mkdirs()) {
        throw new RuntimeException("Could not create test dir: " + TEST_ROOT_DIR);
    }
    File in = new File(TEST_ROOT_DIR, "input");
    if (!in.mkdirs()) {
        throw new RuntimeException("Could not create test dir: " + in);
    }
    File out = new File(TEST_ROOT_DIR, "output");
    PrintWriter pw = new PrintWriter(new FileWriter(new File(in, "data.txt")));
    pw.println("A|a,1");
    pw.println("A|b,2");
    pw.println("B|a,3");
    pw.println("B|b,4");
    pw.println("B|c,5");
    pw.close();

    JobConf conf = new JobConf();
    conf.set("mapreduce.framework.name", "local");
    Job job = new Job(conf);
    TextInputFormat.setInputPaths(job, new Path(in.getPath()));
    TextOutputFormat.setOutputPath(job, new Path(out.getPath()));

    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setGroupingComparatorClass(GroupComparator.class);

    job.setCombinerKeyGroupingComparatorClass(GroupComparator.class);
    job.setCombinerClass(Combiner.class);
    job.getConfiguration().setInt("min.num.spills.for.combine", 0);

    job.submit();
    job.waitForCompletion(false);
    if (job.isSuccessful()) {
        Counters counters = job.getCounters();

        long combinerInputRecords = counters.findCounter(
            "org.apache.hadoop.mapreduce.TaskCounter", "COMBINE_INPUT_RECORDS").getValue();
        long combinerOutputRecords = counters.findCounter(
            "org.apache.hadoop.mapreduce.TaskCounter", "COMBINE_OUTPUT_RECORDS").getValue();
        Assert.assertTrue(combinerInputRecords > 0);
        Assert.assertTrue(combinerInputRecords > combinerOutputRecords);

        BufferedReader br = new BufferedReader(new FileReader(
            new File(out, "part-r-00000")));
        Set<String> output = new HashSet<String>();
        String line = br.readLine();
        Assert.assertNotNull(line);
        output.add(line.substring(0, 1) + line.substring(4, 5));
        line = br.readLine();
        Assert.assertNotNull(line);
        output.add(line.substring(0, 1) + line.substring(4, 5));
        line = br.readLine();
        Assert.assertNull(line);
        br.close();

        Set<String> expected = new HashSet<String>();
        expected.add("A2");
        expected.add("B5");

        Assert.assertEquals(expected, output);
    } else {
        Assert.fail("Job failed");
    }
}
Example 14
Source File: BasicJobChaining.java From hadoop-map-reduce-patterns with Apache License 2.0 | 4 votes |
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 3) {
        System.err.println("Usage: JobChainingDriver <posts> <users> <out>");
        System.exit(2);
    }

    Path postInput = new Path(otherArgs[0]);
    Path userInput = new Path(otherArgs[1]);
    Path outputDirIntermediate = new Path(otherArgs[2] + "_int");
    Path outputDir = new Path(otherArgs[2]);

    // Setup first job to counter user posts
    Job countingJob = new Job(conf, "JobChaining-Counting");
    countingJob.setJarByClass(BasicJobChaining.class);

    // Set our mapper and reducer, we can use the API's long sum reducer for
    // a combiner!
    countingJob.setMapperClass(UserIdCountMapper.class);
    countingJob.setCombinerClass(LongSumReducer.class);
    countingJob.setReducerClass(UserIdSumReducer.class);

    countingJob.setOutputKeyClass(Text.class);
    countingJob.setOutputValueClass(LongWritable.class);

    countingJob.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(countingJob, postInput);

    countingJob.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(countingJob, outputDirIntermediate);

    // Execute job and grab exit code
    int code = countingJob.waitForCompletion(true) ? 0 : 1;

    if (code == 0) {
        // Calculate the average posts per user by getting counter values
        double numRecords = (double) countingJob.getCounters()
            .findCounter(AVERAGE_CALC_GROUP, UserIdCountMapper.RECORDS_COUNTER_NAME)
            .getValue();
        double numUsers = (double) countingJob.getCounters()
            .findCounter(AVERAGE_CALC_GROUP, UserIdSumReducer.USERS_COUNTER_NAME)
            .getValue();

        double averagePostsPerUser = numRecords / numUsers;

        // Setup binning job
        Job binningJob = new Job(new Configuration(), "JobChaining-Binning");
        binningJob.setJarByClass(BasicJobChaining.class);

        // Set mapper and the average posts per user
        binningJob.setMapperClass(UserIdBinningMapper.class);
        UserIdBinningMapper.setAveragePostsPerUser(binningJob, averagePostsPerUser);

        binningJob.setNumReduceTasks(0);

        binningJob.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(binningJob, outputDirIntermediate);

        // Add two named outputs for below/above average
        MultipleOutputs.addNamedOutput(binningJob, MULTIPLE_OUTPUTS_BELOW_NAME,
            TextOutputFormat.class, Text.class, Text.class);
        MultipleOutputs.addNamedOutput(binningJob, MULTIPLE_OUTPUTS_ABOVE_NAME,
            TextOutputFormat.class, Text.class, Text.class);
        MultipleOutputs.setCountersEnabled(binningJob, true);

        TextOutputFormat.setOutputPath(binningJob, outputDir);

        // Add the user files to the DistributedCache
        FileStatus[] userFiles = FileSystem.get(conf).listStatus(userInput);
        for (FileStatus status : userFiles) {
            DistributedCache.addCacheFile(status.getPath().toUri(),
                binningJob.getConfiguration());
        }

        // Execute job and grab exit code
        code = binningJob.waitForCompletion(true) ? 0 : 1;
    }

    // Clean up the intermediate output
    FileSystem.get(conf).delete(outputDirIntermediate, true);

    System.exit(code);
}
Example 15
Source File: TotalOrderSortingStage.java From hadoop-map-reduce-patterns with Apache License 2.0 | 4 votes |
@SuppressWarnings("unchecked")
@Override
public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path inputPath = new Path(args[0]);
    Path partitionFile = new Path(args[1] + "_partitions.lst");
    Path outputStage = new Path(args[1] + "_staging");
    Path outputOrder = new Path(args[1]);

    // Configure job to prepare for sampling
    Job sampleJob = new Job(conf, "TotalOrderSortingStage");
    sampleJob.setJarByClass(TotalOrderSortingStage.class);

    // Use the mapper implementation with zero reduce tasks
    sampleJob.setMapperClass(LastAccessMapper.class);
    sampleJob.setNumReduceTasks(0);

    sampleJob.setOutputKeyClass(Text.class);
    sampleJob.setOutputValueClass(Text.class);

    TextInputFormat.setInputPaths(sampleJob, inputPath);

    // Set the output format to a sequence file
    sampleJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(sampleJob, outputStage);

    // Submit the job and get completion code.
    int code = sampleJob.waitForCompletion(true) ? 0 : 1;

    if (code == 0) {
        Job orderJob = new Job(conf, "TotalOrderSortingStage");
        orderJob.setJarByClass(TotalOrderSortingStage.class);

        // Here, use the identity mapper to output the key/value pairs in
        // the SequenceFile
        orderJob.setMapperClass(Mapper.class);
        orderJob.setReducerClass(ValuesReducer.class);

        // Set the number of reduce tasks to an appropriate number for the
        // amount of data being sorted
        orderJob.setNumReduceTasks(10);

        // Use Hadoop's TotalOrderPartitioner class
        orderJob.setPartitionerClass(TotalOrderPartitioner.class);

        // Set the partition file
        TotalOrderPartitioner.setPartitionFile(orderJob.getConfiguration(), partitionFile);

        orderJob.setOutputKeyClass(Text.class);
        orderJob.setOutputValueClass(Text.class);

        // Set the input to the previous job's output
        orderJob.setInputFormatClass(SequenceFileInputFormat.class);
        SequenceFileInputFormat.setInputPaths(orderJob, outputStage);

        // Set the output path to the command line parameter
        TextOutputFormat.setOutputPath(orderJob, outputOrder);

        // Set the separator to an empty string
        orderJob.getConfiguration().set("mapred.textoutputformat.separator", "");

        // Use the InputSampler to go through the output of the previous
        // job, sample it, and create the partition file
        InputSampler.writePartitionFile(orderJob,
            new InputSampler.RandomSampler(.001, 10000));

        // Submit the job
        code = orderJob.waitForCompletion(true) ? 0 : 2;
    }

    // Clean up the partition file and the staging directory
    FileSystem.get(new Configuration()).delete(partitionFile, false);
    FileSystem.get(new Configuration()).delete(outputStage, true);

    return code;
}
Example 16
Source File: TestInputOutputFormat.java From parquet-mr with Apache License 2.0 | 4 votes |
private void runMapReduceJob(CompressionCodecName codec, Map<String, String> extraConf)
        throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = new Configuration(this.conf);
    for (Map.Entry<String, String> entry : extraConf.entrySet()) {
        conf.set(entry.getKey(), entry.getValue());
    }
    final FileSystem fileSystem = parquetPath.getFileSystem(conf);
    fileSystem.delete(parquetPath, true);
    fileSystem.delete(outputPath, true);
    {
        writeJob = new Job(conf, "write");
        TextInputFormat.addInputPath(writeJob, inputPath);
        writeJob.setInputFormatClass(TextInputFormat.class);
        writeJob.setNumReduceTasks(0);
        ParquetOutputFormat.setCompression(writeJob, codec);
        ParquetOutputFormat.setOutputPath(writeJob, parquetPath);
        writeJob.setOutputFormatClass(ParquetOutputFormat.class);
        writeJob.setMapperClass(readMapperClass);

        ParquetOutputFormat.setWriteSupportClass(writeJob, MyWriteSupport.class);
        GroupWriteSupport.setSchema(
            MessageTypeParser.parseMessageType(writeSchema),
            writeJob.getConfiguration());
        writeJob.submit();
        waitForJob(writeJob);
    }
    {
        conf.set(ReadSupport.PARQUET_READ_SCHEMA, readSchema);
        readJob = new Job(conf, "read");

        readJob.setInputFormatClass(ParquetInputFormat.class);
        ParquetInputFormat.setReadSupportClass(readJob, MyReadSupport.class);

        ParquetInputFormat.setInputPaths(readJob, parquetPath);
        readJob.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(readJob, outputPath);
        readJob.setMapperClass(writeMapperClass);
        readJob.setNumReduceTasks(0);
        readJob.submit();
        waitForJob(readJob);
    }
}
Example 17
Source File: MyFof.java From BigDataArchitect with Apache License 2.0 | 2 votes |
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration(true);
    conf.set("mapreduce.framework.name", "local");
    conf.set("mapreduce.app-submission.cross-platform", "true");

    String[] other = new GenericOptionsParser(conf, args).getRemainingArgs();

    Job job = Job.getInstance(conf);
    job.setJarByClass(MyFof.class);
    job.setJobName("fof");

    // For beginners, focus on how the client-side code is organized: once this part is
    // clear, you really understand how the job is developed.

    // maptask
    // input
    TextInputFormat.addInputPath(job, new Path(other[0]));

    Path outPath = new Path(other[1]);
    if (outPath.getFileSystem(conf).exists(outPath))
        outPath.getFileSystem(conf).delete(outPath, true);
    TextOutputFormat.setOutputPath(job, outPath);

    // key
    // map
    job.setMapperClass(FMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    // reducetask
    // reduce
    // job.setNumReduceTasks(0);
    job.setReducerClass(FReducer.class);

    job.waitForCompletion(true);
}
Example 18
Source File: MyWordCount.java From BigDataArchitect with Apache License 2.0 | 2 votes |
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration(true);

    GenericOptionsParser parser = new GenericOptionsParser(conf, args);
    // The helper class sets -D and similar options directly into conf and leaves the
    // remaining command options behind.
    String[] othargs = parser.getRemainingArgs();

    // Let the framework know the job is submitted from a Windows (heterogeneous) platform.
    conf.set("mapreduce.app-submission.cross-platform", "true");

    // conf.set("mapreduce.framework.name","local");
    // System.out.println(conf.get("mapreduce.framework.name"));

    Job job = Job.getInstance(conf);

    // FileInputFormat.setMinInputSplitSize(job,2222);
    // job.setInputFormatClass(ooxx.class);

    job.setJar("C:\\Users\\admin\\IdeaProjects\\msbhadoop\\target\\hadoop-hdfs-1.0-0.1.jar");
    // This absolutely must be set.
    job.setJarByClass(MyWordCount.class);

    job.setJobName("mashibing");

    Path infile = new Path(othargs[0]);
    TextInputFormat.addInputPath(job, infile);

    Path outfile = new Path(othargs[1]);
    if (outfile.getFileSystem(conf).exists(outfile))
        outfile.getFileSystem(conf).delete(outfile, true);
    TextOutputFormat.setOutputPath(job, outfile);

    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setReducerClass(MyReducer.class);
    // job.setNumReduceTasks(2);

    // Submit the job, then poll for progress until the job is complete
    job.waitForCompletion(true);
}
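The driver above references MyMapper and MyReducer, which are not shown on this page. A typical word-count mapper/reducer pair that would satisfy the driver's (Text, IntWritable) map output types might look like the following; treat it as an illustrative sketch, not the repository's actual code.

// Hypothetical word-count mapper and reducer matching the driver's type configuration.
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private static final IntWritable ONE = new IntWritable(1);
    private final Text word = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Emit (word, 1) for every token in the input line.
        StringTokenizer itr = new StringTokenizer(value.toString());
        while (itr.hasMoreTokens()) {
            word.set(itr.nextToken());
            context.write(word, ONE);
        }
    }
}

class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private final IntWritable result = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // Sum the counts for each word and write the total.
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}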