java source code of CalculateSimilarityStep6

package org.tinylcy.similarity;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.tinylcy.driver.ItemBasedCFDriver;
import org.tinylcy.hdfs.HDFS;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;


/*
 * 
 * 计算物品的相似度矩阵；
 * 首先需要将Step4和Step5的输出数据缓存到每个结点；
 * 输入数据为Step3的输出数据（物品的同现矩阵）；
 * 输出数据格式为：ItemID_i ItemID_j similarity;
 *
 */
public class CalculateSimilarityStep6 {

	private static final Pattern DELIMITER = Pattern.compile("[:, ]");

	public static class Step6_Mapper extends
			Mapper<LongWritable, Text, Text, DoubleWritable> {

		// 在每个map结点缓存每个物品被几个用户喜欢
		private Map<Integer, Integer> map_A = new HashMap<Integer, Integer>();
		// 在每个map结点缓存每个用户喜欢几个物品
		private Map<Integer, Integer> map_B = new HashMap<Integer, Integer>();

		private Text k = new Text();
		private DoubleWritable v = new DoubleWritable();

		public void setup(Context context) throws IOException,
				InterruptedException {

			Configuration conf = context.getConfiguration();
			Path[] paths = DistributedCache.getLocalCacheFiles(conf);

			Path file1Path = paths[0];
			Path file2Path = paths[1];

			String line = null;
			String[] tokens = null;

			BufferedReader reader = new BufferedReader(new FileReader(
					file1Path.toString()));

			try {
				while ((line = reader.readLine()) != null) {
					tokens = line.split(":");
					map_A.put(Integer.parseInt(tokens[0]),
							Integer.parseInt(tokens[1]));
				}
			} finally {
				reader.close();
			}

			reader = new BufferedReader(new FileReader(file2Path.toString()));
			try {
				while ((line = reader.readLine()) != null) {
					tokens = line.split(":");
					map_B.put(Integer.parseInt(tokens[0]),
							Integer.parseInt(tokens[1]));
				}
			} finally {
				reader.close();
			}

		}

		public void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			String[] tokens = DELIMITER.split(value.toString());
			int itemID_a = Integer.parseInt(tokens[0]);
			int itemID_b = Integer.parseInt(tokens[1]);
			int userID = -1;
			double sum = 0.0;
			double similarity = 0.0;

			if (tokens.length > 2) {
				for (int i = 2; i < tokens.length; i++) {
					userID = Integer.parseInt(tokens[i]);
					sum += 1 / (Math.log(1 + map_B.get(userID)));
				}
				similarity = sum
						/ Math.sqrt(map_A.get(itemID_a) * map_A.get(itemID_b));
				k.set(itemID_a + " " + itemID_b);
				v.set(similarity);
				context.write(k, v);
			}
		}
	}

	public static void run() throws IOException, ClassNotFoundException,
			InterruptedException, URISyntaxException {

		Configuration conf = new Configuration();
		HDFS hdfs = new HDFS(conf);

		// hdfs.download(HDFS.HDFSPATH + "/step4/part-r-00000",
		// ItemBasedCFDriver.LOCALPATH + "/step4/part-r-00000");
		// hdfs.download(HDFS.HDFSPATH + "/step5/part-r-00000",
		// ItemBasedCFDriver.LOCALPATH + "/step5/part-r-00000");

		// String inputPath1 = "/var/ItemBased/step4/part-r-00000";//
		// 每个物品被几个用户喜欢
		// String inputPath2 = "/var/ItemBased/step5/part-r-00000";// 每个用户喜欢几个物品

		String inputPath1 = HDFS.HDFSPATH + "/step4/part-r-00000";
		String inputPath2 = HDFS.HDFSPATH + "/step5/part-r-00000";

		String inputPath3 = ItemBasedCFDriver.path.get("step6InputPath");
		String outputPath = ItemBasedCFDriver.path.get("step6OutputPath");

		conf.set("mapreduce.output.textoutputformat.separator", ":");

		DistributedCache.addCacheFile(new Path(inputPath1).toUri(), conf);
		DistributedCache.addCacheFile(new Path(inputPath2).toUri(), conf);

		Job job = Job.getInstance(conf);

		hdfs.rmr(outputPath);

		job.setMapperClass(Step6_Mapper.class);

		job.setJarByClass(CalculateSimilarityStep6.class);

		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(DoubleWritable.class);

		job.setInputFormatClass(TextInputFormat.class);
		job.setOutputFormatClass(TextOutputFormat.class);

		FileInputFormat.setInputPaths(job, new Path(inputPath3));
		FileOutputFormat.setOutputPath(job, new Path(outputPath));

		job.waitForCompletion(true);
	}

}