// Java source code of the MapReduce (MR) job

package mx1202.wla1;

import java.io.IOException;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * MR_WLA用于网站日志分析
 *
 * @className MR_WLA
 * @author mxlee
 * @email [email protected]
 * @date 2016年12月2日
 */
public class MR_WLA extends Configured implements Tool {

	public static void main(String[] args) throws Exception {
		ToolRunner.run(new MR_WLA(), args);
	}

	public int run(String[] args) throws Exception {
		String jobName = "wla_baidu";

		String inputPath = args[0];
		String outputPath = args[1];
		Path path = new Path(outputPath);
		// 删除输出目录
		path.getFileSystem(getConf()).delete(path, true);

		// 1、把所有代码组织到类似于Topology的类中
		Job job = Job.getInstance(getConf(), jobName);

		// 2、一定要打包运行，必须写下面一行代码
		job.setJarByClass(MR_WLA.class);

		// 3、指定输入的hdfs
		FileInputFormat.setInputPaths(job, inputPath);

		// 4、指定map类
		job.setMapperClass(WLA_Mapper.class);

		// 5、指定map输出的<key,value>的类型
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);

		// 6、指定reduce类
		job.setReducerClass(WLA_Reducer.class);

		// 7、指定reduce输出的<key,value>的类型
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);

		// 8、指定输出的hdfs
		FileOutputFormat.setOutputPath(job, new Path(outputPath));

		return job.waitForCompletion(true) ? 0 : 1;
	}

	/**
	 * WLA_Mapper用于网站日志分组
	 *
	 * @className WLA_Mapper
	 * @author mxlee
	 * @email [email protected]
	 * @date 2016年12月2日
	 */
	public static class WLA_Mapper extends Mapper<LongWritable, Text, Text, Text> {

		@Override
		protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
				throws IOException, InterruptedException {
			// 格式[2016-11-29 00:02:07 INFO ]
			// (cn.baidu.core.inteceptor.LogInteceptor:55) - [0 183.136.190.51
			// null http://www.baidu.cn/payment]
			String log = value.toString();// 网站访问日志
			String str = "(cn.baidu.core.inteceptor.LogInteceptor:55)";
			String baseUrl = "http://www.baidu.cn/";
			int len = str.length();
			int urlLen = baseUrl.length();
			if (log.indexOf(str) != -1) {
				String[] log1 = log.split(str);
				// 分析第一段[2016-11-29 00:29:58 INFO
				String visitTime = log1[0].substring(1, 20);// 获取访问时间
				// 分析第二段112.90.82.196 null
				// http://www.baidu.cn/course/jobOffline]
				String[] split2 = log1[1].split("\t");
				String ip = split2[1];// 获取ip
				String url = split2[3];// 获取网址
				String subUrl = "http://www.baidu.cn";
				if (url.length() - 1 > urlLen) {
					subUrl = url.substring(urlLen, url.length() - 1);
				}
				String result = visitTime + "," + subUrl;
				context.write(new Text(ip), new Text(result));
			}
		}

	}

	/**
	 * WLA_Reducer用于处理分组后的数据
	 *
	 * @className WLA_Reducer
	 * @author mxlee
	 * @email [email protected]
	 * @date 2016年12月2日
	 */
	public static class WLA_Reducer extends Reducer<Text, Text, Text, Text> {

		@Override
		protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context context)
				throws IOException, InterruptedException {

			long firstTime = Long.MAX_VALUE;// 首次访问时间
			String startTime = null;
			String endTime = null;
			long lastTime = Long.MIN_VALUE;
			String firstPage = null;// 首次访问页面
			String lastPage = null;
			int count = 0;// 访问页面次数

			for (Text value : values) {
				count++;
				String[] split = value.toString().split(",");

				if (TimeUtil.transDate(split[0]) < firstTime) {
					firstTime = TimeUtil.transDate(split[0]);// yyyy-MM-dd
																// HH:mm:ss
					startTime = split[0].substring(11, 19);
					firstPage = split[1];
				}

				if (TimeUtil.transDate(split[0]) > lastTime) {
					lastTime = TimeUtil.transDate(split[0]);
					endTime = split[0].substring(11, 19);
					lastPage = split[1];
				}

			} // end for

			long time = 0;
			if ((lastTime - firstTime) % (1000 * 60) > 0) {
				time = (lastTime - firstTime) / (1000 * 60) + 1;
			} else {
				time = (lastTime - firstTime) / (1000 * 60);
			}
			String result = startTime + "\t" + firstPage + "\t" + endTime + "\t" + lastPage + "\t" + count + "\t" + time
					+ "分钟";
			context.write(key, new Text(result));

		}// end reduce

	}// end class

}