package org.commoncrawl.examples.mapreduce;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.StringTokenizer;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.log4j.Logger;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveRecord;

/**
 * Holder for the word-count mapper and the counters it reports.
 */
public class WordCounterMap {
  private static final Logger LOG = Logger.getLogger(WordCounterMap.class);

  /** Job counters incremented by {@link WordCountMapper}. */
  protected static enum MAPPERCOUNTER {
    /** Incremented once for every record whose mimetype is {@code text/plain}. */
    RECORDS_IN,
    /** Incremented when a {@code text/plain} record yields no tokens. */
    EMPTY_PAGE_TEXT,
    /** Incremented when processing a record throws. */
    EXCEPTIONS,
    /** Incremented for every record whose mimetype is not {@code text/plain}. */
    NON_PLAIN_TEXT
  }

  /**
   * Emits {@code (token, 1)} for every whitespace-delimited token found in the
   * {@code text/plain} records of an {@link ArchiveReader}.
   */
  protected static class WordCountMapper extends Mapper<Text, ArchiveReader, Text, LongWritable> {
    // Output objects are reused across writes to avoid allocating one per token.
    private final Text outKey = new Text();
    private final LongWritable outVal = new LongWritable(1);

    /**
     * Iterates over every record in {@code value}, tokenizes the body of each
     * {@code text/plain} record and writes one {@code (token, 1)} pair per token.
     *
     * <p>A failure while handling one record is logged and counted under
     * {@link MAPPERCOUNTER#EXCEPTIONS}; the remaining records are still processed.
     * The exception is an interrupt: the thread's interrupt flag is restored and
     * the method returns without processing further records.
     *
     * @param key     input key; not used by this mapper
     * @param value   reader over the archive records to process
     * @param context Hadoop context used for output and counters
     * @throws IOException declared for compatibility with the overridden method
     */
    @Override
    public void map(Text key, ArchiveReader value, Context context) throws IOException {
      for (ArchiveRecord r : value) {
        try {
          // Constant-first equals: a record without a mimetype is simply "not plain
          // text" rather than a NullPointerException counted as a failure.
          if ("text/plain".equals(r.getHeader().getMimetype())) {
            context.getCounter(MAPPERCOUNTER.RECORDS_IN).increment(1);
            // Guarded so the message (and its available() call) is only built
            // when debug logging is actually enabled.
            if (LOG.isDebugEnabled()) {
              LOG.debug(r.getHeader().getUrl() + " -- " + r.available());
            }
            // Convenience function that reads the full message into a raw byte array
            byte[] rawData = IOUtils.toByteArray(r, r.available());
            // Decode with an explicit charset so results do not depend on each
            // node's platform default.
            // NOTE(review): assumes record bodies are UTF-8 encoded -- confirm for
            // the archives this job is run against.
            String content = new String(rawData, StandardCharsets.UTF_8);
            // Grab each word from the document
            StringTokenizer tokenizer = new StringTokenizer(content);
            if (!tokenizer.hasMoreTokens()) {
              context.getCounter(MAPPERCOUNTER.EMPTY_PAGE_TEXT).increment(1);
            } else {
              while (tokenizer.hasMoreTokens()) {
                outKey.set(tokenizer.nextToken());
                context.write(outKey, outVal);
              }
            }
          } else {
            context.getCounter(MAPPERCOUNTER.NON_PLAIN_TEXT).increment(1);
          }
        } catch (InterruptedException ex) {
          LOG.error("Caught Exception", ex);
          context.getCounter(MAPPERCOUNTER.EXCEPTIONS).increment(1);
          // Do not swallow the interrupt: restore the flag so the caller can
          // observe it, and stop working on the remaining records.
          Thread.currentThread().interrupt();
          return;
        } catch (Exception ex) {
          // Best effort: one bad record must not fail the whole split.
          LOG.error("Caught Exception", ex);
          context.getCounter(MAPPERCOUNTER.EXCEPTIONS).increment(1);
        }
      }
    }
  }
}