package com.eftimoff.mapreduce.summarization.numerical.average; import java.io.IOException; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Date; import java.util.Map; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.util.GenericOptionsParser; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import com.eftimoff.mapreduce.utils.MRDPUtils; public class Average extends Configured implements Tool { public static class AverageMapper extends Mapper<Object, Text, IntWritable, CountAverageTuple> { private IntWritable outHour = new IntWritable(); private CountAverageTuple outCountAverage = new CountAverageTuple(); private final static SimpleDateFormat frmt = new SimpleDateFormat( "yyyy-MM-dd'T'HH:mm:ss.SSS"); @SuppressWarnings("deprecation") public void map(Object key, Text value, Context context) throws IOException, InterruptedException { Map<String, String> parsed = MRDPUtils.transformXmlToMap(value.toString()); // Grab the "CreationDate" field, // since it is what we are grouping by String strDate = parsed.get("CreationDate"); // Grab the comment to find the length String text = parsed.get("Text"); if (strDate == null || text == null) { return; } // get the hour this comment was posted in Date creationDate; try { creationDate = frmt.parse(strDate); } catch (ParseException e) { return; } outHour.set(creationDate.getHours()); // get the comment length outCountAverage.setCount(1); outCountAverage.setAverage(text.length()); // write out the hour with the comment length context.write(outHour, outCountAverage); } } public static class AverageReducer extends Reducer<IntWritable, CountAverageTuple, IntWritable, CountAverageTuple> { private CountAverageTuple result = new CountAverageTuple(); public void reduce(IntWritable key, Iterable<CountAverageTuple> values, Context context) throws IOException, InterruptedException { float sum = 0; float count = 0; // Iterate through all input values for this key for (CountAverageTuple val : values) { sum += val.getCount() * val.getAverage(); count += val.getCount(); } result.setCount(count); result.setAverage(sum / count); context.write(key, result); } } public static void main(String[] args) throws Exception { int res = ToolRunner.run(new Configuration(), new Average(), args); System.exit(res); } @Override public int run(String[] arg0) throws Exception { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, arg0).getRemainingArgs(); if (otherArgs.length != 2) { System.err.println("Usage: Average <in> <out>"); System.exit(2); } Job job = new Job(conf, "StackOverflow Comment Average"); job.setJarByClass(Average.class); job.setMapperClass(AverageMapper.class); job.setCombinerClass(AverageReducer.class); job.setReducerClass(AverageReducer.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(CountAverageTuple.class); FileInputFormat.addInputPath(job, new Path(otherArgs[0])); FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); boolean success = job.waitForCompletion(true); return success ? 0 : 1; } }