// Java source code of BinningTags

package com.eftimoff.mapreduce.organization.binning;

import java.io.IOException;
import java.util.Map;

import org.apache.commons.lang.StringEscapeUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.eftimoff.mapreduce.utils.MRDPUtils;

/**
 * Map-only job that copies input records into named "bins" using
 * {@link MultipleOutputs}.
 * <p>
 * Each input value is parsed with {@link MRDPUtils#transformXmlToMap(String)}
 * (presumably one XML row per line -- confirm against the input data). A
 * record is written, unchanged, to:
 * <ul>
 * <li><code>&lt;tag&gt;-tag</code> for every one of its tags that matches one
 * of the binned tag names (case-insensitively), and</li>
 * <li><code>hadoop-post</code> if its <code>Body</code> attribute contains the
 * word "hadoop" (case-insensitively).</li>
 * </ul>
 * <p>
 * Usage: <code>BinningTags &lt;in&gt; &lt;out&gt;</code>
 */
public class BinningTags extends Configured implements Tool {

	/**
	 * Mapper that routes each record to zero or more bins through the
	 * "bins" named output. Nothing is emitted through the regular
	 * {@code context.write} path.
	 */
	public static class BinningMapper extends
			Mapper<Object, Text, Text, NullWritable> {

		/** Named output registered by the driver in {@code run}. */
		private static final String NAMED_OUTPUT = "bins";

		/** Suffix appended to a tag name to form its base output path. */
		private static final String TAG_BIN_SUFFIX = "-tag";

		/** Base output path for records whose body mentions "hadoop". */
		private static final String HADOOP_POST_BIN = "hadoop-post";

		/** Tags (lower case) that get a bin of their own. */
		private static final String[] BINNED_TAGS = { "hadoop", "schrader",
				"presta", "innertube" };

		private MultipleOutputs<Text, NullWritable> mos;

		/**
		 * Creates the {@link MultipleOutputs} helper bound to this task's
		 * context.
		 */
		@Override
		protected void setup(Context context) {
			mos = new MultipleOutputs<Text, NullWritable>(context);
		}

		/**
		 * Bins a single record by its tags and by its body text.
		 *
		 * @param key
		 *            input key; not used
		 * @param value
		 *            the raw record; written as-is to every matching bin
		 * @param context
		 *            task context; not written to directly
		 * @throws IOException
		 *             if writing to a bin fails
		 * @throws InterruptedException
		 *             if writing to a bin is interrupted
		 */
		@Override
		protected void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {
			Map<String, String> parsed = MRDPUtils.transformXmlToMap(value
					.toString());

			// A record without a Tags attribute is still eligible for the
			// body-based bin below, so only the tag binning is skipped.
			String rawtags = parsed.get("Tags");
			if (rawtags != null) {
				writeTagBins(rawtags, value);
			}

			// If the post contains the word "hadoop", write it to its own bin
			String post = parsed.get("Body");
			if (post != null && post.toLowerCase().contains("hadoop")) {
				mos.write(NAMED_OUTPUT, value, NullWritable.get(),
						HADOOP_POST_BIN);
			}
		}

		/**
		 * Writes {@code value} to the bin of every binned tag found in
		 * {@code rawtags}.
		 *
		 * @param rawtags
		 *            HTML-escaped tag list; after unescaping the tags are
		 *            delimited by ><, i.e. <tag1><tag2><tag3>
		 * @param value
		 *            the record to write
		 */
		private void writeTagBins(String rawtags, Text value)
				throws IOException, InterruptedException {
			String[] tagTokens = StringEscapeUtils.unescapeHtml(rawtags).split(
					"><");
			for (String tag : tagTokens) {
				// Strip the leading < / trailing > left over from the split.
				// Plain replace avoids compiling a regex for every tag.
				String groomed = tag.replace("<", "").replace(">", "");
				for (String binnedTag : BINNED_TAGS) {
					if (groomed.equalsIgnoreCase(binnedTag)) {
						mos.write(NAMED_OUTPUT, value, NullWritable.get(),
								binnedTag + TAG_BIN_SUFFIX);
					}
				}
			}
		}

		/**
		 * Closes the {@link MultipleOutputs} helper so that all bins are
		 * flushed.
		 */
		@Override
		protected void cleanup(Context context) throws IOException,
				InterruptedException {
			// Close multiple outputs!
			mos.close();
		}
	}

	/**
	 * Configures and runs the binning job.
	 *
	 * @param args
	 *            command-line arguments; after generic Hadoop options are
	 *            removed, exactly two must remain: the input path and the
	 *            output path
	 * @return 0 if the job succeeded, 1 if it failed, 2 if the arguments
	 *         were invalid
	 * @throws Exception
	 *             if the job cannot be configured or submitted
	 */
	@Override
	public int run(String[] args) throws Exception {
		// Use the configuration injected by ToolRunner so that generic
		// options (-D, -fs, ...) it already parsed are honoured. Fall back to
		// a fresh one when run() is invoked without setConf().
		Configuration conf = getConf();
		if (conf == null) {
			conf = new Configuration();
		}
		// ToolRunner has normally stripped generic options already; parsing
		// again keeps direct callers that still pass them working.
		GenericOptionsParser parser = new GenericOptionsParser(conf, args);
		String[] otherArgs = parser.getRemainingArgs();
		if (otherArgs.length != 2) {
			System.err.println("Usage: BinningTags <in> <out>");
			ToolRunner.printGenericCommandUsage(System.err);
			// Report the error to the caller; main() turns it into the
			// process exit code.
			return 2;
		}
		Job job = new Job(conf, "Binning Tags");
		job.setJarByClass(BinningTags.class);
		// Configure the MultipleOutputs by adding an output called "bins"
		// With the proper output format and mapper key/value pairs
		MultipleOutputs.addNamedOutput(job, "bins", TextOutputFormat.class,
				Text.class, NullWritable.class);
		// Enable the counters for the job
		// If there are a significant number of different named outputs, this
		// should be disabled
		MultipleOutputs.setCountersEnabled(job, true);
		// Map-only job
		job.setNumReduceTasks(0);
		job.setMapperClass(BinningMapper.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(NullWritable.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(NullWritable.class);
		FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
		FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
		boolean success = job.waitForCompletion(true);
		return success ? 0 : 1;
	}

	/**
	 * Command-line entry point; exits the JVM with the code returned by
	 * {@link #run(String[])}.
	 *
	 * @param args
	 *            generic Hadoop options followed by the input and output
	 *            paths
	 * @throws Exception
	 *             if the job cannot be configured or submitted
	 */
	public static void main(String[] args) throws Exception {
		int res = ToolRunner.run(new Configuration(), new BinningTags(), args);
		System.exit(res);
	}
}