Java Code Examples for org.apache.hadoop.mapreduce.lib.output.TextOutputFormat#setOutputPath()
The following examples show how to use org.apache.hadoop.mapreduce.lib.output.TextOutputFormat#setOutputPath().
Links above each example lead to the original project and source file.
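TextOutputFormat.setOutputPath() is inherited from FileOutputFormat: it records the job's output directory in the job configuration, and that directory must not already exist when the job runs, otherwise the output check fails with a FileAlreadyExistsException. This is why several examples below delete the output path before submitting. As a minimal sketch of the call site (the class name and argument handling are illustrative, not taken from any project below):

// A minimal driver sketch (illustrative only): wire TextInputFormat/TextOutputFormat and
// set the output directory with TextOutputFormat.setOutputPath().
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class SetOutputPathExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "setOutputPath-example");
        job.setJarByClass(SetOutputPathExample.class);

        // Read plain text lines from the first argument.
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path(args[0]));

        // Write tab-separated key/value text files to the second argument.
        // The directory must not exist yet, or the job fails its output check.
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

With no mapper or reducer configured, the identity defaults simply copy the input records (keyed by byte offset) into part-r-* text files under the output directory.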
Example 1
Source File: WordCount.java From flink with Apache License 2.0 | 5 votes |
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <result path>");
        return;
    }

    final String inputPath = args[0];
    final String outputPath = args[1];

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // Set up the Hadoop Input Format
    Job job = Job.getInstance();
    HadoopInputFormat<LongWritable, Text> hadoopInputFormat =
        new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, job);
    TextInputFormat.addInputPath(job, new Path(inputPath));

    // Create a Flink job with it
    DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

    // Tokenize the line and convert from Writable "Text" to String for better handling
    DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer());

    // Sum up the words
    DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);

    // Convert String back to Writable "Text" for use with Hadoop Output Format
    DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper());

    // Set up Hadoop Output Format
    HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat =
        new HadoopOutputFormat<Text, IntWritable>(new TextOutputFormat<Text, IntWritable>(), job);
    hadoopOutputFormat.getConfiguration().set("mapreduce.output.textoutputformat.separator", " ");
    hadoopOutputFormat.getConfiguration().set("mapred.textoutputformat.separator", " "); // set the value for both, since this test
    TextOutputFormat.setOutputPath(job, new Path(outputPath));

    // Output & Execute
    hadoopResult.output(hadoopOutputFormat);
    env.execute("Word Count");
}
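The driver above references a Tokenizer and a HadoopDatatypeMapper that are not shown on this page. The following is a sketch of what such classes could look like for Flink's DataSet API; it is illustrative, not the project's actual implementation, and assumes the usual imports (org.apache.flink.api.common.functions.FlatMapFunction and MapFunction, org.apache.flink.api.java.tuple.Tuple2, org.apache.flink.util.Collector) alongside the Hadoop Writable types already used by the driver.

// Hypothetical helpers for the Flink word-count driver above.
public static final class Tokenizer
        implements FlatMapFunction<Tuple2<LongWritable, Text>, Tuple2<String, Integer>> {
    @Override
    public void flatMap(Tuple2<LongWritable, Text> value, Collector<Tuple2<String, Integer>> out) {
        // Split the Hadoop Text line into lowercase words and emit (word, 1) pairs.
        for (String token : value.f1.toString().toLowerCase().split("\\W+")) {
            if (token.length() > 0) {
                out.collect(new Tuple2<>(token, 1));
            }
        }
    }
}

public static final class HadoopDatatypeMapper
        implements MapFunction<Tuple2<String, Integer>, Tuple2<Text, IntWritable>> {
    @Override
    public Tuple2<Text, IntWritable> map(Tuple2<String, Integer> value) {
        // Convert back to Hadoop Writable types so HadoopOutputFormat can serialize them.
        return new Tuple2<>(new Text(value.f0), new IntWritable(value.f1));
    }
}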
Example 2
Source File: TestInputOutputFormat.java From parquet-mr with Apache License 2.0 | 5 votes |
@Test
public void testSchemaEvolution() throws Exception {
    final Configuration conf = new Configuration();
    final Path inputPath = new Path("target/test/thrift/schema_evolution/in");
    final Path parquetPath = new Path("target/test/thrift/schema_evolution/parquet");
    final Path outputPath = new Path("target/test/thrift/schema_evolution/out");
    final FileSystem fileSystem = parquetPath.getFileSystem(conf);
    fileSystem.delete(inputPath, true);
    final FSDataOutputStream in = fileSystem.create(inputPath);
    in.writeUTF("Alice\nBob\nCharles\n");
    in.close();
    fileSystem.delete(parquetPath, true);
    fileSystem.delete(outputPath, true);
    {
        write(conf, inputPath, new Path(parquetPath, "V1"), TestInputOutputFormat.SchemaEvolutionMapper1.class, StructV1.class);
        write(conf, inputPath, new Path(parquetPath, "V2"), TestInputOutputFormat.SchemaEvolutionMapper2.class, StructV2.class);
        write(conf, inputPath, new Path(parquetPath, "V3"), TestInputOutputFormat.SchemaEvolutionMapper3.class, StructV3.class);
    }
    {
        final Job job = new Job(conf, "read");
        job.setInputFormatClass(ParquetThriftInputFormat.class);
        ParquetThriftInputFormat.setInputPaths(job, new Path(parquetPath, "*"));
        ParquetThriftInputFormat.setThriftClass(job.getConfiguration(), StructV3.class);
        job.setMapperClass(TestInputOutputFormat.SchemaEvolutionReadMapper.class);
        job.setNumReduceTasks(0);
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, outputPath);
        waitForJob(job);
    }
    read(outputPath + "/part-m-00000", 3);
    read(outputPath + "/part-m-00001", 3);
    read(outputPath + "/part-m-00002", 3);
}
Example 3
Source File: BusyLegs.java From gemfirexd-oss with Apache License 2.0 | 5 votes |
public int run(String[] args) throws Exception {
    GfxdDataSerializable.initTypes();
    Configuration conf = getConf();

    Path outputPath = new Path(args[0]);
    String hdfsHomeDir = args[1];
    String tableName = args[2];

    outputPath.getFileSystem(conf).delete(outputPath, true);

    conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir);
    conf.set(RowInputFormat.INPUT_TABLE, tableName);
    conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);

    Job job = Job.getInstance(conf, "Busy Leg Count");
    job.setInputFormatClass(RowInputFormat.class);

    // configure mapper and reducer
    job.setMapperClass(SampleMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    // configure output
    TextOutputFormat.setOutputPath(job, outputPath);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
Example 4
Source File: ReduceSideJoinBloomFilter.java From hadoop-map-reduce-patterns with Apache License 2.0 | 5 votes |
@Override
public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();
    GenericOptionsParser parser = new GenericOptionsParser(conf, args);
    String[] otherArgs = parser.getRemainingArgs();
    if (otherArgs.length != 4) {
        printUsage();
    }
    Job job = new Job(conf, "ReduceSideJoinBloomFilter");
    job.setJarByClass(ReduceSideJoinBloomFilter.class);

    // Use MultipleInputs to set which input uses what mapper
    // This will keep parsing of each data set separate from a logical standpoint
    // The first two elements of the args array are the two inputs
    MultipleInputs.addInputPath(job, new Path(args[0]), TextInputFormat.class, UserJoinMapper.class);
    MultipleInputs.addInputPath(job, new Path(args[1]), TextInputFormat.class, CommentJoinMapperWithBloom.class);
    job.getConfiguration().set("join.type", args[2]);

    job.setReducerClass(UserJoinReducer.class);

    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, new Path(args[3]));

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 2;
}
Example 5
Source File: BusyAirports.java From gemfirexd-oss with Apache License 2.0 | 5 votes |
public int run(String[] args) throws Exception {
    GfxdDataSerializable.initTypes();
    Configuration conf = getConf();

    Path outputPath = new Path(args[0]);
    String hdfsHomeDir = args[1];
    String tableName = args[2];

    outputPath.getFileSystem(conf).delete(outputPath, true);

    conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir);
    conf.set(RowInputFormat.INPUT_TABLE, tableName);
    conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);

    Job job = Job.getInstance(conf, "Busy Airport Count");
    job.setInputFormatClass(RowInputFormat.class);

    // configure mapper and reducer
    job.setMapperClass(SampleMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    // configure output
    TextOutputFormat.setOutputPath(job, outputPath);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
Example 6
Source File: RegexExample.java From accumulo-examples with Apache License 2.0 | 5 votes |
public static void main(String[] args) throws Exception {
    Opts opts = new Opts();
    opts.parseArgs(RegexExample.class.getName(), args);

    Job job = Job.getInstance(opts.getHadoopConfig());
    job.setJobName(RegexExample.class.getSimpleName());
    job.setJarByClass(RegexExample.class);

    job.setInputFormatClass(AccumuloInputFormat.class);

    IteratorSetting regex = new IteratorSetting(50, "regex", RegExFilter.class);
    RegExFilter.setRegexs(regex, opts.rowRegex, opts.columnFamilyRegex, opts.columnQualifierRegex,
        opts.valueRegex, false);

    AccumuloInputFormat.configure().clientProperties(opts.getClientProperties())
        .table(opts.tableName).addIterator(regex).store(job);

    job.setMapperClass(RegexMapper.class);
    job.setMapOutputKeyClass(Key.class);
    job.setMapOutputValueClass(Value.class);
    job.setNumReduceTasks(0);

    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, new Path(opts.destination));

    log.info("setRowRegex: " + opts.rowRegex);
    log.info("setColumnFamilyRegex: " + opts.columnFamilyRegex);
    log.info("setColumnQualifierRegex: " + opts.columnQualifierRegex);
    log.info("setValueRegex: " + opts.valueRegex);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
Example 7
Source File: TestInputOutputFormatWithPadding.java From parquet-mr with Apache License 2.0 | 4 votes |
@Test
public void testBasicBehaviorWithPadding() throws Exception {
    HadoopOutputFile.getBlockFileSystems().add("file");

    File inputFile = temp.newFile();
    FileOutputStream out = new FileOutputStream(inputFile);
    out.write(FILE_CONTENT.getBytes("UTF-8"));
    out.close();

    File tempFolder = temp.newFolder();
    tempFolder.delete();
    Path tempPath = new Path(tempFolder.toURI());

    File outputFolder = temp.newFile();
    outputFolder.delete();

    Configuration conf = new Configuration();
    // May test against multiple hadoop versions
    conf.set("dfs.block.size", "1024");
    conf.set("dfs.blocksize", "1024");
    conf.set("dfs.blockSize", "1024");
    conf.set("fs.local.block.size", "1024");

    // don't use a cached FS with a different block size
    conf.set("fs.file.impl.disable.cache", "true");

    // disable summary metadata, it isn't needed
    conf.set("parquet.enable.summary-metadata", "false");
    conf.set("parquet.example.schema", PARQUET_TYPE.toString());

    {
        Job writeJob = new Job(conf, "write");
        writeJob.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(writeJob, new Path(inputFile.toString()));

        writeJob.setOutputFormatClass(ParquetOutputFormat.class);
        writeJob.setMapperClass(Writer.class);
        writeJob.setNumReduceTasks(0); // write directly to Parquet without reduce
        ParquetOutputFormat.setWriteSupportClass(writeJob, GroupWriteSupport.class);
        ParquetOutputFormat.setBlockSize(writeJob, 1024);
        ParquetOutputFormat.setPageSize(writeJob, 512);
        ParquetOutputFormat.setDictionaryPageSize(writeJob, 512);
        ParquetOutputFormat.setEnableDictionary(writeJob, true);
        ParquetOutputFormat.setMaxPaddingSize(writeJob, 1023); // always pad
        ParquetOutputFormat.setOutputPath(writeJob, tempPath);

        waitForJob(writeJob);
    }

    // make sure padding was added
    File parquetFile = getDataFile(tempFolder);
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, new Path(parquetFile.toString()),
        ParquetMetadataConverter.NO_FILTER);
    for (BlockMetaData block : footer.getBlocks()) {
        Assert.assertTrue("Block should start at a multiple of the block size",
            block.getStartingPos() % 1024 == 0);
    }

    {
        Job readJob = new Job(conf, "read");
        readJob.setInputFormatClass(NoSplits.class);
        ParquetInputFormat.setReadSupportClass(readJob, GroupReadSupport.class);
        TextInputFormat.addInputPath(readJob, tempPath);

        readJob.setOutputFormatClass(TextOutputFormat.class);
        readJob.setMapperClass(Reader.class);
        readJob.setNumReduceTasks(0); // write directly to text without reduce
        TextOutputFormat.setOutputPath(readJob, new Path(outputFolder.toString()));

        waitForJob(readJob);
    }

    File dataFile = getDataFile(outputFolder);
    Assert.assertNotNull("Should find a data file", dataFile);

    StringBuilder contentBuilder = new StringBuilder();
    for (String line : Files.readAllLines(dataFile.toPath(), StandardCharsets.UTF_8)) {
        contentBuilder.append(line);
    }
    String reconstructed = contentBuilder.toString();
    Assert.assertEquals("Should match written file content", FILE_CONTENT, reconstructed);

    HadoopOutputFile.getBlockFileSystems().remove("file");
}
Example 8
Source File: MobRefReporter.java From hbase with Apache License 2.0 | 4 votes |
/**
 * Main method for the tool.
 * @return 0 if success, 1 for bad args. 2 if job aborted with an exception,
 *         3 if mr job was unsuccessful
 */
public int run(String[] args) throws IOException, InterruptedException {
    // TODO make family and table optional
    if (args.length != 3) {
        printUsage();
        return 1;
    }
    final String output = args[0];
    final String tableName = args[1];
    final String familyName = args[2];
    final long reportStartTime = EnvironmentEdgeManager.currentTime();
    Configuration conf = getConf();
    try {
        FileSystem fs = FileSystem.get(conf);
        // check whether the current user is the same one with the owner of hbase root
        String currentUserName = UserGroupInformation.getCurrentUser().getShortUserName();
        FileStatus[] hbaseRootFileStat = fs.listStatus(new Path(conf.get(HConstants.HBASE_DIR)));
        if (hbaseRootFileStat.length > 0) {
            String owner = hbaseRootFileStat[0].getOwner();
            if (!owner.equals(currentUserName)) {
                String errorMsg = "The current user[" + currentUserName
                    + "] does not have hbase root credentials."
                    + " If this job fails due to an inability to read HBase's internal directories, "
                    + "you will need to rerun as a user with sufficient permissions. The HBase superuser "
                    + "is a safe choice.";
                LOG.warn(errorMsg);
            }
        } else {
            LOG.error("The passed configs point to an HBase dir does not exist: {}",
                conf.get(HConstants.HBASE_DIR));
            throw new IOException("The target HBase does not exist");
        }

        byte[] family;
        int maxVersions;
        TableName tn = TableName.valueOf(tableName);
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Admin admin = connection.getAdmin()) {
            TableDescriptor htd = admin.getDescriptor(tn);
            ColumnFamilyDescriptor hcd = htd.getColumnFamily(Bytes.toBytes(familyName));
            if (hcd == null || !hcd.isMobEnabled()) {
                throw new IOException("Column family " + familyName + " is not a MOB column family");
            }
            family = hcd.getName();
            maxVersions = hcd.getMaxVersions();
        }

        String id = getClass().getSimpleName() + UUID.randomUUID().toString().replace("-", "");
        Job job = null;
        Scan scan = new Scan();
        scan.addFamily(family);
        // Do not retrieve the mob data when scanning
        scan.setAttribute(MobConstants.MOB_SCAN_RAW, Bytes.toBytes(Boolean.TRUE));
        scan.setAttribute(MobConstants.MOB_SCAN_REF_ONLY, Bytes.toBytes(Boolean.TRUE));
        // If a scanner caching value isn't set, pick a smaller default since we know we're doing
        // a full table scan and don't want to impact other clients badly.
        scan.setCaching(conf.getInt(HConstants.HBASE_CLIENT_SCANNER_CACHING, 10000));
        scan.setCacheBlocks(false);
        scan.readVersions(maxVersions);
        conf.set(REPORT_JOB_ID, id);

        job = Job.getInstance(conf);
        job.setJarByClass(getClass());
        TableMapReduceUtil.initTableMapperJob(tn, scan, MobRefMapper.class, Text.class,
            ImmutableBytesWritable.class, job);

        job.setReducerClass(MobRefReducer.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path(output));

        job.setJobName(getClass().getSimpleName() + "-" + tn + "-" + familyName);
        // for use in the reducer. easier than re-parsing it out of the scan string.
        job.getConfiguration().set(TableInputFormat.SCAN_COLUMN_FAMILY, familyName);

        // Use when we start this job as the base point for file "recency".
        job.getConfiguration().setLong(REPORT_START_DATETIME, reportStartTime);

        if (job.waitForCompletion(true)) {
            LOG.info("Finished creating report for '{}', family='{}'", tn, familyName);
        } else {
            System.err.println("Job was not successful");
            return 3;
        }
        return 0;

    } catch (ClassNotFoundException | RuntimeException | IOException | InterruptedException e) {
        System.err.println("Job aborted due to exception " + e);
        return 2; // job failed
    }
}
Example 9
Source File: TopBusyAirport.java From gemfirexd-oss with Apache License 2.0 | 4 votes |
public int run(String[] args) throws Exception {
    GfxdDataSerializable.initTypes();
    Configuration conf = getConf();

    Path outputPath = new Path(args[0]);
    Path intermediateOutputPath = new Path(args[0] + "_int");
    String hdfsHomeDir = args[1];
    String tableName = args[2];

    outputPath.getFileSystem(conf).delete(outputPath, true);
    intermediateOutputPath.getFileSystem(conf).delete(intermediateOutputPath, true);

    conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir);
    conf.set(RowInputFormat.INPUT_TABLE, tableName);
    conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);

    Job job = Job.getInstance(conf, "Busy Airport Count");
    job.setInputFormatClass(RowInputFormat.class);

    // configure mapper and reducer
    job.setMapperClass(SampleMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    // Only have one reduce task so that all of the results from mapping are
    // processed in one place.
    job.setNumReduceTasks(1);

    // configure output
    TextOutputFormat.setOutputPath(job, intermediateOutputPath);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    int rc = job.waitForCompletion(true) ? 0 : 1;
    if (rc == 0) {
        Job topJob = Job.getInstance(getConf(), "Top Busy Airport");

        // We want the task to run on a single VM
        topJob.setNumReduceTasks(1);

        // Set the inputs
        topJob.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(topJob, intermediateOutputPath);

        // Set the mapper and reducer
        topJob.setMapperClass(TopBusyAirportMapper.class);
        topJob.setReducerClass(TopBusyAirportReducer.class);

        // Set the outputs
        TextOutputFormat.setOutputPath(topJob, outputPath);
        topJob.setOutputFormatClass(TextOutputFormat.class);
        topJob.setOutputKeyClass(Text.class);
        topJob.setOutputValueClass(IntWritable.class);

        topJob.setMapOutputKeyClass(Text.class);
        topJob.setMapOutputValueClass(StringIntPair.class);

        rc = topJob.waitForCompletion(true) ? 0 : 1;
    }
    return rc;
}
Example 10
Source File: TopBusyAirportGemfirexd.java From gemfirexd-oss with Apache License 2.0 | 4 votes |
public int run(String[] args) throws Exception {
    GfxdDataSerializable.initTypes();
    Configuration conf = getConf();

    Path outputPath = new Path(args[0]);
    Path intermediateOutputPath = new Path(args[0] + "_int");
    String hdfsHomeDir = args[1];
    String tableName = args[2];

    outputPath.getFileSystem(conf).delete(outputPath, true);
    intermediateOutputPath.getFileSystem(conf).delete(intermediateOutputPath, true);

    conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir);
    conf.set(RowInputFormat.INPUT_TABLE, tableName);
    conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);

    Job job = Job.getInstance(conf, "Busy Airport Count");
    job.setJarByClass(TopBusyAirportGemfirexd.class);

    job.setInputFormatClass(RowInputFormat.class);

    // configure mapper and reducer
    job.setMapperClass(SampleMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    // configure output
    TextOutputFormat.setOutputPath(job, intermediateOutputPath);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    int rc = job.waitForCompletion(true) ? 0 : 1;
    if (rc == 0) {
        Configuration topConf = getConf();

        String gemfirexdUrl = topConf.get("gemfirexd.url", "jdbc:gemfirexd://localhost:1527");
        topConf.set(RowOutputFormat.OUTPUT_URL, gemfirexdUrl);
        topConf.set(RowOutputFormat.OUTPUT_TABLE, "APP.BUSY_AIRPORT");
        Configuration.dumpConfiguration(topConf, new PrintWriter(System.out));

        Job topJob = Job.getInstance(topConf, "Top Busy Airport");

        // We want the task to run on a single VM
        topJob.setNumReduceTasks(1);

        // Set the inputs
        topJob.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(topJob, intermediateOutputPath);

        // Set the mapper and reducer
        topJob.setMapperClass(TopBusyAirportMapper.class);
        topJob.setReducerClass(TopBusyAirportReducer.class);

        topJob.setMapOutputKeyClass(Text.class);
        topJob.setMapOutputValueClass(StringIntPair.class);

        // Set the outputs
        TextOutputFormat.setOutputPath(topJob, outputPath);
        topJob.setOutputFormatClass(RowOutputFormat.class);
        topJob.setOutputKeyClass(Key.class);
        topJob.setOutputValueClass(BusyAirportModel.class);

        rc = topJob.waitForCompletion(true) ? 0 : 1;
    }
    return rc;
}
Example 11
Source File: TestJob.java From hadoop-louvain-community with Apache License 2.0 | 4 votes |
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = new Configuration();
    Job job = new Job(conf);
    job.setJobName(TestJob.class.getName());

    job.setJarByClass(TestJob.class);
    job.setMapperClass(MapJob.class);
    job.setReducerClass(ReduceJob.class);

    // Hello there ZipFileInputFormat!
    job.setInputFormatClass(GraphInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    TextOutputFormat.setOutputPath(job, new Path(args[1]));

    job.waitForCompletion(true);
}
Example 12
Source File: JoinFilterExampleMRJob.java From hadoop-arch-book with Apache License 2.0 | 4 votes |
public int run(String[] args) throws Exception {
    String inputFoo = args[0];
    String inputBar = args[1];
    String output = args[2];
    String fooValueMaxFilter = args[3];
    String joinValueMaxFilter = args[4];
    int numberOfReducers = Integer.parseInt(args[5]);

    // A
    Job job = Job.getInstance();

    // B
    job.setJarByClass(JoinFilterExampleMRJob.class);
    job.setJobName("JoinFilterExampleMRJob");

    // C
    Configuration config = job.getConfiguration();
    config.set(FOO_TABLE_CONF, inputFoo);
    config.set(BAR_TABLE_CONF, inputBar);
    config.set(FOO_VAL_MAX_CONF, fooValueMaxFilter);
    config.set(JOIN_VAL_MAX_CONF, joinValueMaxFilter);

    // D
    job.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(job, new Path(inputFoo));
    TextInputFormat.addInputPath(job, new Path(inputBar));

    // E
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, new Path(output));

    // F
    job.setMapperClass(JoinFilterMapper.class);
    job.setReducerClass(JoinFilterReducer.class);
    job.setPartitionerClass(JoinFilterPartitioner.class);

    // G
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    // H
    job.setNumReduceTasks(numberOfReducers);

    // I
    job.waitForCompletion(true);
    return 0;
}
Example 13
Source File: TestNewCombinerGrouping.java From big-c with Apache License 2.0 | 4 votes |
@Test
public void testCombiner() throws Exception {
    if (!new File(TEST_ROOT_DIR).mkdirs()) {
        throw new RuntimeException("Could not create test dir: " + TEST_ROOT_DIR);
    }
    File in = new File(TEST_ROOT_DIR, "input");
    if (!in.mkdirs()) {
        throw new RuntimeException("Could not create test dir: " + in);
    }
    File out = new File(TEST_ROOT_DIR, "output");
    PrintWriter pw = new PrintWriter(new FileWriter(new File(in, "data.txt")));
    pw.println("A|a,1");
    pw.println("A|b,2");
    pw.println("B|a,3");
    pw.println("B|b,4");
    pw.println("B|c,5");
    pw.close();

    JobConf conf = new JobConf();
    conf.set("mapreduce.framework.name", "local");
    Job job = new Job(conf);
    TextInputFormat.setInputPaths(job, new Path(in.getPath()));
    TextOutputFormat.setOutputPath(job, new Path(out.getPath()));

    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setGroupingComparatorClass(GroupComparator.class);

    job.setCombinerKeyGroupingComparatorClass(GroupComparator.class);
    job.setCombinerClass(Combiner.class);
    job.getConfiguration().setInt("min.num.spills.for.combine", 0);

    job.submit();
    job.waitForCompletion(false);
    if (job.isSuccessful()) {
        Counters counters = job.getCounters();

        long combinerInputRecords = counters.findCounter(
            "org.apache.hadoop.mapreduce.TaskCounter", "COMBINE_INPUT_RECORDS").getValue();
        long combinerOutputRecords = counters.findCounter(
            "org.apache.hadoop.mapreduce.TaskCounter", "COMBINE_OUTPUT_RECORDS").getValue();
        Assert.assertTrue(combinerInputRecords > 0);
        Assert.assertTrue(combinerInputRecords > combinerOutputRecords);

        BufferedReader br = new BufferedReader(new FileReader(
            new File(out, "part-r-00000")));
        Set<String> output = new HashSet<String>();
        String line = br.readLine();
        Assert.assertNotNull(line);
        output.add(line.substring(0, 1) + line.substring(4, 5));
        line = br.readLine();
        Assert.assertNotNull(line);
        output.add(line.substring(0, 1) + line.substring(4, 5));
        line = br.readLine();
        Assert.assertNull(line);
        br.close();

        Set<String> expected = new HashSet<String>();
        expected.add("A2");
        expected.add("B5");

        Assert.assertEquals(expected, output);
    } else {
        Assert.fail("Job failed");
    }
}
Example 14
Source File: BasicJobChaining.java From hadoop-map-reduce-patterns with Apache License 2.0 | 4 votes |
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 3) {
        System.err.println("Usage: JobChainingDriver <posts> <users> <out>");
        System.exit(2);
    }

    Path postInput = new Path(otherArgs[0]);
    Path userInput = new Path(otherArgs[1]);
    Path outputDirIntermediate = new Path(otherArgs[2] + "_int");
    Path outputDir = new Path(otherArgs[2]);

    // Setup first job to counter user posts
    Job countingJob = new Job(conf, "JobChaining-Counting");
    countingJob.setJarByClass(BasicJobChaining.class);

    // Set our mapper and reducer, we can use the API's long sum reducer for
    // a combiner!
    countingJob.setMapperClass(UserIdCountMapper.class);
    countingJob.setCombinerClass(LongSumReducer.class);
    countingJob.setReducerClass(UserIdSumReducer.class);

    countingJob.setOutputKeyClass(Text.class);
    countingJob.setOutputValueClass(LongWritable.class);

    countingJob.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(countingJob, postInput);

    countingJob.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(countingJob, outputDirIntermediate);

    // Execute job and grab exit code
    int code = countingJob.waitForCompletion(true) ? 0 : 1;

    if (code == 0) {
        // Calculate the average posts per user by getting counter values
        double numRecords = (double) countingJob.getCounters()
            .findCounter(AVERAGE_CALC_GROUP, UserIdCountMapper.RECORDS_COUNTER_NAME)
            .getValue();
        double numUsers = (double) countingJob.getCounters()
            .findCounter(AVERAGE_CALC_GROUP, UserIdSumReducer.USERS_COUNTER_NAME)
            .getValue();

        double averagePostsPerUser = numRecords / numUsers;

        // Setup binning job
        Job binningJob = new Job(new Configuration(), "JobChaining-Binning");
        binningJob.setJarByClass(BasicJobChaining.class);

        // Set mapper and the average posts per user
        binningJob.setMapperClass(UserIdBinningMapper.class);
        UserIdBinningMapper.setAveragePostsPerUser(binningJob, averagePostsPerUser);

        binningJob.setNumReduceTasks(0);

        binningJob.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(binningJob, outputDirIntermediate);

        // Add two named outputs for below/above average
        MultipleOutputs.addNamedOutput(binningJob, MULTIPLE_OUTPUTS_BELOW_NAME,
            TextOutputFormat.class, Text.class, Text.class);
        MultipleOutputs.addNamedOutput(binningJob, MULTIPLE_OUTPUTS_ABOVE_NAME,
            TextOutputFormat.class, Text.class, Text.class);
        MultipleOutputs.setCountersEnabled(binningJob, true);

        TextOutputFormat.setOutputPath(binningJob, outputDir);

        // Add the user files to the DistributedCache
        FileStatus[] userFiles = FileSystem.get(conf).listStatus(userInput);
        for (FileStatus status : userFiles) {
            DistributedCache.addCacheFile(status.getPath().toUri(),
                binningJob.getConfiguration());
        }

        // Execute job and grab exit code
        code = binningJob.waitForCompletion(true) ? 0 : 1;
    }

    // Clean up the intermediate output
    FileSystem.get(conf).delete(outputDirIntermediate, true);

    System.exit(code);
}
Example 15
Source File: TotalOrderSortingStage.java From hadoop-map-reduce-patterns with Apache License 2.0 | 4 votes |
@SuppressWarnings("unchecked")
@Override
public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path inputPath = new Path(args[0]);
    Path partitionFile = new Path(args[1] + "_partitions.lst");
    Path outputStage = new Path(args[1] + "_staging");
    Path outputOrder = new Path(args[1]);

    // Configure job to prepare for sampling
    Job sampleJob = new Job(conf, "TotalOrderSortingStage");
    sampleJob.setJarByClass(TotalOrderSortingStage.class);

    // Use the mapper implementation with zero reduce tasks
    sampleJob.setMapperClass(LastAccessMapper.class);
    sampleJob.setNumReduceTasks(0);

    sampleJob.setOutputKeyClass(Text.class);
    sampleJob.setOutputValueClass(Text.class);

    TextInputFormat.setInputPaths(sampleJob, inputPath);

    // Set the output format to a sequence file
    sampleJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(sampleJob, outputStage);

    // Submit the job and get completion code.
    int code = sampleJob.waitForCompletion(true) ? 0 : 1;

    if (code == 0) {
        Job orderJob = new Job(conf, "TotalOrderSortingStage");
        orderJob.setJarByClass(TotalOrderSortingStage.class);

        // Here, use the identity mapper to output the key/value pairs in
        // the SequenceFile
        orderJob.setMapperClass(Mapper.class);
        orderJob.setReducerClass(ValuesReducer.class);

        // Set the number of reduce tasks to an appropriate number for the
        // amount of data being sorted
        orderJob.setNumReduceTasks(10);

        // Use Hadoop's TotalOrderPartitioner class
        orderJob.setPartitionerClass(TotalOrderPartitioner.class);

        // Set the partition file
        TotalOrderPartitioner.setPartitionFile(orderJob.getConfiguration(), partitionFile);

        orderJob.setOutputKeyClass(Text.class);
        orderJob.setOutputValueClass(Text.class);

        // Set the input to the previous job's output
        orderJob.setInputFormatClass(SequenceFileInputFormat.class);
        SequenceFileInputFormat.setInputPaths(orderJob, outputStage);

        // Set the output path to the command line parameter
        TextOutputFormat.setOutputPath(orderJob, outputOrder);

        // Set the separator to an empty string
        orderJob.getConfiguration().set("mapred.textoutputformat.separator", "");

        // Use the InputSampler to go through the output of the previous
        // job, sample it, and create the partition file
        InputSampler.writePartitionFile(orderJob,
            new InputSampler.RandomSampler(.001, 10000));

        // Submit the job
        code = orderJob.waitForCompletion(true) ? 0 : 2;
    }

    // Clean up the partition file and the staging directory
    FileSystem.get(new Configuration()).delete(partitionFile, false);
    FileSystem.get(new Configuration()).delete(outputStage, true);

    return code;
}
Example 16
Source File: TestInputOutputFormat.java From parquet-mr with Apache License 2.0 | 4 votes |
private void runMapReduceJob(CompressionCodecName codec, Map<String, String> extraConf)
        throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = new Configuration(this.conf);
    for (Map.Entry<String, String> entry : extraConf.entrySet()) {
        conf.set(entry.getKey(), entry.getValue());
    }
    final FileSystem fileSystem = parquetPath.getFileSystem(conf);
    fileSystem.delete(parquetPath, true);
    fileSystem.delete(outputPath, true);
    {
        writeJob = new Job(conf, "write");
        TextInputFormat.addInputPath(writeJob, inputPath);
        writeJob.setInputFormatClass(TextInputFormat.class);
        writeJob.setNumReduceTasks(0);
        ParquetOutputFormat.setCompression(writeJob, codec);
        ParquetOutputFormat.setOutputPath(writeJob, parquetPath);
        writeJob.setOutputFormatClass(ParquetOutputFormat.class);
        writeJob.setMapperClass(readMapperClass);

        ParquetOutputFormat.setWriteSupportClass(writeJob, MyWriteSupport.class);
        GroupWriteSupport.setSchema(
            MessageTypeParser.parseMessageType(writeSchema),
            writeJob.getConfiguration());
        writeJob.submit();
        waitForJob(writeJob);
    }
    {
        conf.set(ReadSupport.PARQUET_READ_SCHEMA, readSchema);
        readJob = new Job(conf, "read");

        readJob.setInputFormatClass(ParquetInputFormat.class);
        ParquetInputFormat.setReadSupportClass(readJob, MyReadSupport.class);

        ParquetInputFormat.setInputPaths(readJob, parquetPath);
        readJob.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(readJob, outputPath);
        readJob.setMapperClass(writeMapperClass);
        readJob.setNumReduceTasks(0);
        readJob.submit();
        waitForJob(readJob);
    }
}
Example 17
Source File: MyFof.java From BigDataArchitect with Apache License 2.0 | 2 votes |
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration(true);
    conf.set("mapreduce.framework.name", "local");
    conf.set("mapreduce.app-submission.cross-platform", "true");

    String[] other = new GenericOptionsParser(conf, args).getRemainingArgs();

    Job job = Job.getInstance(conf);
    job.setJarByClass(MyFof.class);
    job.setJobName("fof");

    // For beginners, focus on how the client-side code is organized: once this part is
    // clear, you really understand how the job is developed.

    // maptask
    // input
    TextInputFormat.addInputPath(job, new Path(other[0]));

    Path outPath = new Path(other[1]);
    if (outPath.getFileSystem(conf).exists(outPath))
        outPath.getFileSystem(conf).delete(outPath, true);
    TextOutputFormat.setOutputPath(job, outPath);

    // key
    // map
    job.setMapperClass(FMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    // reducetask
    // reduce
    // job.setNumReduceTasks(0);
    job.setReducerClass(FReducer.class);

    job.waitForCompletion(true);
}
Example 18
Source File: MyWordCount.java From BigDataArchitect with Apache License 2.0 | 2 votes |
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration(true);

    GenericOptionsParser parser = new GenericOptionsParser(conf, args);
    // The helper class sets -D and similar options directly into conf and leaves the
    // remaining command options behind.
    String[] othargs = parser.getRemainingArgs();

    // Let the framework know the job is submitted from a Windows (heterogeneous) platform.
    conf.set("mapreduce.app-submission.cross-platform", "true");

    // conf.set("mapreduce.framework.name","local");
    // System.out.println(conf.get("mapreduce.framework.name"));

    Job job = Job.getInstance(conf);

    // FileInputFormat.setMinInputSplitSize(job,2222);
    // job.setInputFormatClass(ooxx.class);

    job.setJar("C:\\Users\\admin\\IdeaProjects\\msbhadoop\\target\\hadoop-hdfs-1.0-0.1.jar");
    // This absolutely must be set.
    job.setJarByClass(MyWordCount.class);

    job.setJobName("mashibing");

    Path infile = new Path(othargs[0]);
    TextInputFormat.addInputPath(job, infile);

    Path outfile = new Path(othargs[1]);
    if (outfile.getFileSystem(conf).exists(outfile))
        outfile.getFileSystem(conf).delete(outfile, true);
    TextOutputFormat.setOutputPath(job, outfile);

    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setReducerClass(MyReducer.class);
    // job.setNumReduceTasks(2);

    // Submit the job, then poll for progress until the job is complete
    job.waitForCompletion(true);
}
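The driver above references MyMapper and MyReducer, which are not shown on this page. A typical word-count mapper/reducer pair that would satisfy the driver's (Text, IntWritable) map output types might look like the following; treat it as an illustrative sketch, not the repository's actual code.

// Hypothetical word-count mapper and reducer matching the driver's type configuration.
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private static final IntWritable ONE = new IntWritable(1);
    private final Text word = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Emit (word, 1) for every token in the input line.
        StringTokenizer itr = new StringTokenizer(value.toString());
        while (itr.hasMoreTokens()) {
            word.set(itr.nextToken());
            context.write(word, ONE);
        }
    }
}

class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private final IntWritable result = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // Sum the counts for each word and write the total.
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}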