Java Code Examples for org.apache.hadoop.examples.WordCount

The following examples show how to use org.apache.hadoop.examples.WordCount. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: spydra   Source File: LifecycleIT.java   License: Apache License 2.0   Votes: 4
/**
 * Returns the location that the {@code WordCount} class was loaded from.
 *
 * <p>The value is the path component of the code-source location URL reported
 * by the class's protection domain (presumably the Hadoop examples jar, going
 * by the method name).
 *
 * @return the path component of the code-source location of {@code WordCount}
 */
private String getExamplesJarPath() {
  // Use the wildcard type rather than the raw Class type to avoid a rawtypes warning.
  Class<?> clazz = WordCount.class;
  return clazz.getProtectionDomain().getCodeSource().getLocation().getPath();
}
 
Example 2
Source Project: RDFS   Source File: TestMiniMRLocalFS.java   License: Apache License 2.0   Votes: 4
/**
 * Runs a job built from the SecondarySort mapper and reducer over a small set
 * of integer pairs, using two reduce tasks, and checks the contents of both
 * reducer output files.
 *
 * @param conf configuration used to obtain the local file system and create the job
 */
private void runSecondarySort(Configuration conf)
    throws IOException, InterruptedException, ClassNotFoundException {
  FileSystem fs = FileSystem.getLocal(conf);
  final Path inputDir = new Path(TEST_ROOT_DIR + "/in");
  final Path outputDir = new Path(TEST_ROOT_DIR + "/out");

  // start from clean input and output directories
  fs.delete(inputDir, true);
  fs.delete(outputDir, true);

  // one "first second" pair per line
  final String inputPairs =
      "-1 -4\n"
      + "-3 23\n"
      + "5 10\n"
      + "-1 -2\n"
      + "-1 300\n"
      + "-1 10\n"
      + "4 1\n"
      + "4 2\n"
      + "4 10\n"
      + "4 -1\n"
      + "4 -10\n"
      + "10 20\n"
      + "10 30\n"
      + "10 25\n";
  TestMapReduceLocal.writeFile("in/part1", inputPairs);

  Job sortJob = new Job(conf, "word count");
  sortJob.setJarByClass(WordCount.class);
  sortJob.setNumReduceTasks(2);
  sortJob.setMapperClass(SecondarySort.MapClass.class);
  sortJob.setReducerClass(SecondarySort.Reduce.class);

  // group and partition by the first int in the pair
  sortJob.setPartitionerClass(FirstPartitioner.class);
  sortJob.setGroupingComparatorClass(FirstGroupingComparator.class);

  // the map output is IntPair, IntWritable
  sortJob.setMapOutputKeyClass(IntPair.class);
  sortJob.setMapOutputValueClass(IntWritable.class);

  // the reduce output is Text, IntWritable
  sortJob.setOutputKeyClass(Text.class);
  sortJob.setOutputValueClass(IntWritable.class);

  FileInputFormat.addInputPath(sortJob, inputDir);
  FileOutputFormat.setOutputPath(sortJob, outputDir);
  assertTrue(sortJob.waitForCompletion(true));

  // line written between groups in the expected reducer output
  final String separator = "------------------------------------------------\n";

  final String expectedPart0 =
      separator
      + "4\t-10\n4\t-1\n4\t1\n4\t2\n4\t10\n"
      + separator
      + "10\t20\n10\t25\n10\t30\n";
  String actualPart0 = TestMapReduceLocal.readFile("out/part-r-00000");
  assertEquals(expectedPart0, actualPart0);

  final String expectedPart1 =
      separator
      + "-3\t23\n"
      + separator
      + "-1\t-4\n-1\t-2\n-1\t10\n-1\t300\n"
      + separator
      + "5\t10\n";
  String actualPart1 = TestMapReduceLocal.readFile("out/part-r-00001");
  assertEquals(expectedPart1, actualPart1);
}
 
Example 3
Source Project: RDFS   Source File: TestMapReduceLocal.java   License: Apache License 2.0   Votes: 4
/**
 * Runs a word-count job (TokenizerMapper, with IntSumReducer as both combiner
 * and reducer) over two small input files, then checks the reducer output and
 * the relationships between the task counters.
 *
 * @param conf configuration used to create the job
 */
private void runWordCount(Configuration conf)
    throws IOException, InterruptedException, ClassNotFoundException {
  final String taskCounterGroup = "org.apache.hadoop.mapred.Task$Counter";
  final Path inputDir = new Path(TEST_ROOT_DIR + "/in");
  final Path outputDir = new Path(TEST_ROOT_DIR + "/out");

  // start from clean input and output directories
  localFs.delete(inputDir, true);
  localFs.delete(outputDir, true);
  writeFile("in/part1", "this is a test\nof word count test\ntest\n");
  writeFile("in/part2", "more test");

  Job wordCountJob = new Job(conf, "word count");
  wordCountJob.setJarByClass(WordCount.class);
  wordCountJob.setMapperClass(TokenizerMapper.class);
  wordCountJob.setCombinerClass(IntSumReducer.class);
  wordCountJob.setReducerClass(IntSumReducer.class);
  wordCountJob.setOutputKeyClass(Text.class);
  wordCountJob.setOutputValueClass(IntWritable.class);
  wordCountJob.setInputFormatClass(TrackingTextInputFormat.class);
  FileInputFormat.addInputPath(wordCountJob, inputDir);
  FileOutputFormat.setOutputPath(wordCountJob, outputDir);
  assertTrue(wordCountJob.waitForCompletion(false));

  // one "word<TAB>count" line per distinct word
  final String expectedOutput =
      "a\t1\ncount\t1\nis\t1\nmore\t1\nof\t1\ntest\t4\nthis\t1\nword\t1\n";
  String actualOutput = readFile("out/part-r-00000");
  System.out.println(actualOutput);
  assertEquals(expectedOutput, actualOutput);

  Counters counters = wordCountJob.getCounters();
  System.out.println("Counters: " + counters);
  long combineInputRecords =
      counters.findCounter(taskCounterGroup, "COMBINE_INPUT_RECORDS").getValue();
  long combineOutputRecords =
      counters.findCounter(taskCounterGroup, "COMBINE_OUTPUT_RECORDS").getValue();
  long reduceInputRecords =
      counters.findCounter(taskCounterGroup, "REDUCE_INPUT_RECORDS").getValue();
  long mapOutputRecords =
      counters.findCounter(taskCounterGroup, "MAP_OUTPUT_RECORDS").getValue();
  long reduceOutputRecords =
      counters.findCounter(taskCounterGroup, "REDUCE_OUTPUT_RECORDS").getValue();
  long reduceInputGroups =
      counters.findCounter(taskCounterGroup, "REDUCE_INPUT_GROUPS").getValue();

  assertEquals("map out = combine in", mapOutputRecords, combineInputRecords);
  assertEquals("combine out = reduce in", combineOutputRecords, reduceInputRecords);
  assertTrue("combine in > combine out", combineInputRecords > combineOutputRecords);
  assertEquals("reduce groups = reduce out", reduceInputGroups, reduceOutputRecords);

  // a group name the job never used is expected to come back empty
  CounterGroup unusedGroup = counters.getGroup("Random Group");
  assertEquals(0, unusedGroup.size());
}
 
Example 4
Source Project: hadoop-gpu   Source File: TestMiniMRLocalFS.java   License: Apache License 2.0   Votes: 4
/**
 * Runs a job built from the SecondarySort mapper and reducer over a small set
 * of integer pairs, using two reduce tasks, and checks the contents of both
 * reducer output files.
 *
 * @param conf configuration used to obtain the local file system and create the job
 */
private void runSecondarySort(Configuration conf)
    throws IOException, InterruptedException, ClassNotFoundException {
  FileSystem fs = FileSystem.getLocal(conf);
  final Path inputDir = new Path(TEST_ROOT_DIR + "/in");
  final Path outputDir = new Path(TEST_ROOT_DIR + "/out");

  // start from clean input and output directories
  fs.delete(inputDir, true);
  fs.delete(outputDir, true);

  // one "first second" pair per line
  final String inputPairs =
      "-1 -4\n"
      + "-3 23\n"
      + "5 10\n"
      + "-1 -2\n"
      + "-1 300\n"
      + "-1 10\n"
      + "4 1\n"
      + "4 2\n"
      + "4 10\n"
      + "4 -1\n"
      + "4 -10\n"
      + "10 20\n"
      + "10 30\n"
      + "10 25\n";
  TestMapReduceLocal.writeFile("in/part1", inputPairs);

  Job sortJob = new Job(conf, "word count");
  sortJob.setJarByClass(WordCount.class);
  sortJob.setNumReduceTasks(2);
  sortJob.setMapperClass(SecondarySort.MapClass.class);
  sortJob.setReducerClass(SecondarySort.Reduce.class);

  // group and partition by the first int in the pair
  sortJob.setPartitionerClass(FirstPartitioner.class);
  sortJob.setGroupingComparatorClass(FirstGroupingComparator.class);

  // the map output is IntPair, IntWritable
  sortJob.setMapOutputKeyClass(IntPair.class);
  sortJob.setMapOutputValueClass(IntWritable.class);

  // the reduce output is Text, IntWritable
  sortJob.setOutputKeyClass(Text.class);
  sortJob.setOutputValueClass(IntWritable.class);

  FileInputFormat.addInputPath(sortJob, inputDir);
  FileOutputFormat.setOutputPath(sortJob, outputDir);
  assertTrue(sortJob.waitForCompletion(true));

  // line written between groups in the expected reducer output
  final String separator = "------------------------------------------------\n";

  final String expectedPart0 =
      separator
      + "4\t-10\n4\t-1\n4\t1\n4\t2\n4\t10\n"
      + separator
      + "10\t20\n10\t25\n10\t30\n";
  String actualPart0 = TestMapReduceLocal.readFile("out/part-r-00000");
  assertEquals(expectedPart0, actualPart0);

  final String expectedPart1 =
      separator
      + "-3\t23\n"
      + separator
      + "-1\t-4\n-1\t-2\n-1\t10\n-1\t300\n"
      + separator
      + "5\t10\n";
  String actualPart1 = TestMapReduceLocal.readFile("out/part-r-00001");
  assertEquals(expectedPart1, actualPart1);
}
 
Example 5
Source Project: hadoop-gpu   Source File: TestMapReduceLocal.java   License: Apache License 2.0   Votes: 4
/**
 * Runs a word-count job (TokenizerMapper, with IntSumReducer as both combiner
 * and reducer) over two small input files, then checks the reducer output and
 * the relationships between the map, combine and reduce record counters.
 *
 * @param conf configuration used to create the job
 */
private void runWordCount(Configuration conf)
    throws IOException, InterruptedException, ClassNotFoundException {
  final String taskCounterGroup = "org.apache.hadoop.mapred.Task$Counter";
  final Path inputDir = new Path(TEST_ROOT_DIR + "/in");
  final Path outputDir = new Path(TEST_ROOT_DIR + "/out");

  // start from clean input and output directories
  localFs.delete(inputDir, true);
  localFs.delete(outputDir, true);
  writeFile("in/part1", "this is a test\nof word count test\ntest\n");
  writeFile("in/part2", "more test");

  Job wordCountJob = new Job(conf, "word count");
  wordCountJob.setJarByClass(WordCount.class);
  wordCountJob.setMapperClass(TokenizerMapper.class);
  wordCountJob.setCombinerClass(IntSumReducer.class);
  wordCountJob.setReducerClass(IntSumReducer.class);
  wordCountJob.setOutputKeyClass(Text.class);
  wordCountJob.setOutputValueClass(IntWritable.class);
  wordCountJob.setInputFormatClass(TrackingTextInputFormat.class);
  FileInputFormat.addInputPath(wordCountJob, inputDir);
  FileOutputFormat.setOutputPath(wordCountJob, outputDir);
  assertTrue(wordCountJob.waitForCompletion(false));

  // one "word<TAB>count" line per distinct word
  final String expectedOutput =
      "a\t1\ncount\t1\nis\t1\nmore\t1\nof\t1\ntest\t4\nthis\t1\nword\t1\n";
  String actualOutput = readFile("out/part-r-00000");
  System.out.println(actualOutput);
  assertEquals(expectedOutput, actualOutput);

  Counters counters = wordCountJob.getCounters();
  System.out.println("Counters: " + counters);
  long combineInputRecords =
      counters.findCounter(taskCounterGroup, "COMBINE_INPUT_RECORDS").getValue();
  long combineOutputRecords =
      counters.findCounter(taskCounterGroup, "COMBINE_OUTPUT_RECORDS").getValue();
  long reduceInputRecords =
      counters.findCounter(taskCounterGroup, "REDUCE_INPUT_RECORDS").getValue();
  long mapOutputRecords =
      counters.findCounter(taskCounterGroup, "MAP_OUTPUT_RECORDS").getValue();

  assertEquals("map out = combine in", mapOutputRecords, combineInputRecords);
  assertEquals("combine out = reduce in", combineOutputRecords, reduceInputRecords);
  assertTrue("combine in > combine out", combineInputRecords > combineOutputRecords);

  // a group name the job never used is expected to come back empty
  CounterGroup unusedGroup = counters.getGroup("Random Group");
  assertEquals(0, unusedGroup.size());
}