Java Code Examples for org.apache.hadoop.filecache.DistributedCache#addCacheFile()

The following examples show how to use org.apache.hadoop.filecache.DistributedCache#addCacheFile(). Each example is taken from an open source project and is labeled with its source file, originating project, and license.
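Before the examples, a minimal sketch of the two halves of the DistributedCache contract may help (the class, path, and type names here are illustrative, not drawn from any project below): the driver registers an HDFS file with addCacheFile(), and each task reads the local copy the framework has already downloaded via getLocalCacheFiles().

// A minimal usage sketch; names are illustrative only.
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class CacheFileSketch {

  // Driver side: register an HDFS file before submitting the job.
  public static void registerSideFile(Configuration conf) throws Exception {
    DistributedCache.addCacheFile(new URI("/lookup/side-data.txt"), conf);
  }

  // Task side: the framework has copied the file to the local disk
  // of every node that runs a task.
  public static class SideDataMapper
      extends Mapper<LongWritable, Text, LongWritable, Text> {

    @Override
    protected void setup(Context context) throws IOException {
      Path[] cached =
          DistributedCache.getLocalCacheFiles(context.getConfiguration());
      if (cached != null && cached.length > 0) {
        BufferedReader reader =
            new BufferedReader(new FileReader(cached[0].toString()));
        try {
          // ... load the side file into an in-memory lookup structure ...
        } finally {
          reader.close();
        }
      }
    }
  }
}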
Example 1
Source File: TeraSort.java    From hadoop-gpu with Apache License 2.0
public int run(String[] args) throws Exception {
  LOG.info("starting");
  JobConf job = (JobConf) getConf();
  Path inputDir = new Path(args[0]);
  inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
  Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
  URI partitionUri = new URI(partitionFile.toString() +
                             "#" + TeraInputFormat.PARTITION_FILENAME);
  TeraInputFormat.setInputPaths(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.setJobName("TeraSort");
  job.setJarByClass(TeraSort.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setInputFormat(TeraInputFormat.class);
  job.setOutputFormat(TeraOutputFormat.class);
  job.setPartitionerClass(TotalOrderPartitioner.class);
  TeraInputFormat.writePartitionFile(job, partitionFile);
  DistributedCache.addCacheFile(partitionUri, job);
  DistributedCache.createSymlink(job);
  job.setInt("dfs.replication", 1);
  TeraOutputFormat.setFinalSync(job, true);
  JobClient.runJob(job);
  LOG.info("done");
  return 0;
}
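Note the idiom used here and repeated verbatim in Examples 2 and 7: the "#" + TeraInputFormat.PARTITION_FILENAME fragment on the cache URI names a symlink, and DistributedCache.createSymlink(job) asks the framework to create that link in each task's working directory, so tasks can open the partition file by its bare name. A hypothetical task-side read of the symlinked file (not part of TeraSort itself) reduces to:

// Hypothetical task-side read of a file cached with a "#name" URI fragment.
// With createSymlink() enabled, the fragment name resolves as a relative
// path in the task's working directory.
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

public class SymlinkReadSketch {
  public static BufferedReader openPartitionFile() throws IOException {
    // "_partition.lst" is the value of TeraInputFormat.PARTITION_FILENAME.
    return new BufferedReader(new FileReader("_partition.lst"));
  }
}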
 
Example 2
Source File: TeraSort.java    From hadoop-book with Apache License 2.0
public int run(String[] args) throws Exception {
  LOG.info("starting");
  JobConf job = (JobConf) getConf();
  Path inputDir = new Path(args[0]);
  inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
  Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
  URI partitionUri = new URI(partitionFile.toString() +
                             "#" + TeraInputFormat.PARTITION_FILENAME);
  TeraInputFormat.setInputPaths(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.setJobName("TeraSort");
  job.setJarByClass(TeraSort.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setInputFormat(TeraInputFormat.class);
  job.setOutputFormat(TeraOutputFormat.class);
  job.setPartitionerClass(TotalOrderPartitioner.class);
  TeraInputFormat.writePartitionFile(job, partitionFile);
  DistributedCache.addCacheFile(partitionUri, job);
  DistributedCache.createSymlink(job);
  job.setInt("dfs.replication", 1);
  TeraOutputFormat.setFinalSync(job, true);
  JobClient.runJob(job);
  LOG.info("done");
  return 0;
}
 
Example 3
Source File: BloomFilter.java    From hadoop-map-reduce-patterns with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
	Configuration conf = new Configuration();
	GenericOptionsParser parser = new GenericOptionsParser(conf, args);
	String[] otherArgs = parser.getRemainingArgs();
	if (otherArgs.length != 3) {
		System.err.println("Usage: BloomFilter <bloom_filter_file> <in> <out>");
		ToolRunner.printGenericCommandUsage(System.err);
		System.exit(2);
	}

	DistributedCache.addCacheFile(new URI(otherArgs[0]), conf);
	Job job = new Job(conf, "Bloom Filter");
	job.setJarByClass(BloomFilter.class);
	job.setMapperClass(BloomFilterMapper.class);
	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(NullWritable.class);
	FileInputFormat.addInputPath(job, new Path(otherArgs[1]));
	FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));
	boolean success = job.waitForCompletion(true);

	return success ? 0 : 1;
}
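Example 3 shows only the driver; the source of BloomFilterMapper is not included. A hedged sketch of what such a mapper might look like (names and filtering logic are assumptions, not the project's actual code): deserialize the cached filter in setup(), then use membership tests to drop records in map().

// A sketch of a mapper that consumes the cached Bloom filter file;
// this is an assumption, not the project's BloomFilterMapper source.
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.IOException;

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.bloom.BloomFilter;
import org.apache.hadoop.util.bloom.Key;

public class CachedBloomFilterMapper
    extends Mapper<LongWritable, Text, Text, NullWritable> {

  private final BloomFilter filter = new BloomFilter();

  @Override
  protected void setup(Context context) throws IOException {
    // Deserialize the filter file registered with addCacheFile().
    Path[] files =
        DistributedCache.getLocalCacheFiles(context.getConfiguration());
    DataInputStream in =
        new DataInputStream(new FileInputStream(files[0].toString()));
    try {
      filter.readFields(in);
    } finally {
      in.close();
    }
  }

  @Override
  protected void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    // Keep only records whose first token may be in the filter set
    // (Bloom filters admit false positives but no false negatives).
    String firstToken = value.toString().split("\\s+")[0];
    if (filter.membershipTest(new Key(firstToken.getBytes()))) {
      context.write(value, NullWritable.get());
    }
  }
}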
 
Example 4
Source File: JobLibLoader.java    From SpyGlass with Apache License 2.0
public static void addFiletoCache(String libPathStr, Configuration config) {
	try {
		Path filePath = new Path(libPathStr);
		DistributedCache.addCacheFile(filePath.toUri(), config);
		// DistributedCache.createSymlink(config);

		// config.set("mapred.cache.files", libPathStr);
		// config.set("mapred.create.symlink", "yes");

	} catch (Exception e) {
		e.printStackTrace();
	}
}
 
Example 5
Source File: GroupedKeyRangePartitioner.java    From accumulo-recipes with Apache License 2.0
/**
 * Sets the HDFS file name to use, containing a newline-separated list of Base64-encoded split points that represent ranges for partitioning.
 */
public static void addSplitFile(JobContext job, String group, String file) {
    URI uri = new Path(file).toUri();
    DistributedCache.addCacheFile(uri, job.getConfiguration());
    String[] groups = job.getConfiguration().getStrings(GROUPS_KEY);
    if (groups == null || Arrays.binarySearch(groups, group) == -1) {
        String[] newGroups = groups != null ? Arrays.copyOf(groups, groups.length + 1) : new String[1];
        newGroups[newGroups.length - 1] = group;
        job.getConfiguration().setStrings(GROUPS_KEY, newGroups);
        job.getConfiguration().set(GROUPS_KEY + "." + group, file);
    }
}
 
Example 6
Source File: MRWordCountFeatures.java    From hadoop-book with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), MRWordCount.class);
    conf.setJobName("WordCountFeatures");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MapFeatures.class);
    conf.setCombinerClass(ReduceFeatures.class);
    conf.setReducerClass(ReduceFeatures.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    List<String> other_args = new ArrayList<String>();

    for (int i = 0; i < args.length; ++i) {
        if ("-skip".equals(args[i])) {
            DistributedCache.addCacheFile(new Path(args[++i]).toUri(), conf);
            conf.setBoolean("wordcount.skip.patterns", true);
        } else {
            other_args.add(args[i]);
        }
    }

    FileInputFormat.setInputPaths(conf, new Path(other_args.get(0)));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    JobClient.runJob(conf);
    return 0;
}
 
Example 7
Source File: TeraSort.java    From RDFS with Apache License 2.0
public int run(String[] args) throws Exception {
  LOG.info("starting");
  JobConf job = (JobConf) getConf();
  Path inputDir = new Path(args[0]);
  inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
  Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
  URI partitionUri = new URI(partitionFile.toString() +
                             "#" + TeraInputFormat.PARTITION_FILENAME);
  TeraInputFormat.setInputPaths(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.setJobName("TeraSort");
  job.setJarByClass(TeraSort.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setInputFormat(TeraInputFormat.class);
  job.setOutputFormat(TeraOutputFormat.class);
  job.setPartitionerClass(TotalOrderPartitioner.class);
  TeraInputFormat.writePartitionFile(job, partitionFile);
  DistributedCache.addCacheFile(partitionUri, job);
  DistributedCache.createSymlink(job);
  job.setInt("dfs.replication", 1);
  TeraOutputFormat.setFinalSync(job, true);
  long startTime = System.currentTimeMillis();
  JobClient.runJob(job);
  long endTime = System.currentTimeMillis();
  System.out.println((float)(endTime-startTime)/1000);
  LOG.info("done");
  return 0;
}
 
Example 8
Source File: AccumuloMrGeoRangePartitioner.java    From mrgeo with Apache License 2.0
/**
 * Sets the HDFS file name to use, containing a newline-separated list of Base64-encoded split points that represent ranges for partitioning.
 */
public static void setSplitFile(JobContext job, String file)
{
  URI uri = new Path(file).toUri();
  DistributedCache.addCacheFile(uri, job.getConfiguration());
  job.getConfiguration().set(CUTFILE_KEY, uri.getPath());
}
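Examples 5 and 8 both cache a split file and record its location in the configuration for a partitioner to read back later. A hedged sketch of that read-back step, assuming the format the javadoc describes (newline-separated, Base64-encoded split points) and that Apache commons-codec is on the classpath:

// Reads back a split file of newline-separated, Base64-encoded split
// points. A sketch only; not taken from either project.
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.codec.binary.Base64;
import org.apache.hadoop.io.Text;

public class SplitFileReader {
  public static List<Text> readSplits(String localFile) throws IOException {
    List<Text> splits = new ArrayList<Text>();
    BufferedReader in = new BufferedReader(new FileReader(localFile));
    try {
      String line;
      while ((line = in.readLine()) != null) {
        splits.add(new Text(Base64.decodeBase64(line.getBytes("UTF-8"))));
      }
    } finally {
      in.close();
    }
    return splits;
  }
}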
 
Example 9
Source File: Main.java    From hiped2 with Apache License 2.0
public static void runJob(Path inputPath,
                          Path smallFilePath,
                          Path outputPath)
    throws Exception {

  Configuration conf = new Configuration();

  FileSystem fs = smallFilePath.getFileSystem(conf);

  FileStatus smallFilePathStatus = fs.getFileStatus(smallFilePath);

  if (smallFilePathStatus.isDir()) {
    for (FileStatus f : fs.listStatus(smallFilePath)) {
      if (f.getPath().getName().startsWith("part")) {
        DistributedCache.addCacheFile(f.getPath().toUri(), conf);
      }
    }
  } else {
    DistributedCache.addCacheFile(smallFilePath.toUri(), conf);
  }

  Job job = new Job(conf);

  job.setJarByClass(Main.class);
  job.setMapperClass(GenericReplicatedJoin.class);

  job.setInputFormatClass(KeyValueTextInputFormat.class);

  job.setNumReduceTasks(0);

  outputPath.getFileSystem(conf).delete(outputPath, true);

  FileInputFormat.setInputPaths(job, inputPath);
  FileOutputFormat.setOutputPath(job, outputPath);

  job.waitForCompletion(true);
}
 
Example 10
Source File: ReplicatedFilterJob.java    From hiped2 with Apache License 2.0
public static void runJob(Configuration conf,
                          Path usersPath,
                          Path uniqueUsersPath,
                          Path outputPath)
    throws Exception {

  FileSystem fs = uniqueUsersPath.getFileSystem(conf);

  FileStatus uniqueUserStatus = fs.getFileStatus(uniqueUsersPath);

  if (uniqueUserStatus.isDir()) {
    for (FileStatus f : fs.listStatus(uniqueUsersPath)) {
      if (f.getPath().getName().startsWith("part")) {
        DistributedCache.addCacheFile(f.getPath().toUri(), conf);
      }
    }
  } else {
    DistributedCache.addCacheFile(uniqueUsersPath.toUri(), conf);
  }

  Job job = new Job(conf);

  job.setJarByClass(ReplicatedFilterJob.class);
  job.setMapperClass(ReplicatedFilterJob.class);

  job.setNumReduceTasks(0);

  job.setInputFormatClass(KeyValueTextInputFormat.class);

  outputPath.getFileSystem(conf).delete(outputPath, true);

  FileInputFormat.setInputPaths(job, usersPath);
  FileOutputFormat.setOutputPath(job, outputPath);

  if (!job.waitForCompletion(true)) {
    throw new Exception("Job failed");
  }
}
 
Example 11
Source File: FinalJoinJob.java    From hiped2 with Apache License 2.0
public static void runJob(Configuration conf,
                          Path userLogsPath,
                          Path usersPath,
                          Path outputPath)
    throws Exception {

  FileSystem fs = usersPath.getFileSystem(conf);

  FileStatus usersStatus = fs.getFileStatus(usersPath);

  if (usersStatus.isDir()) {
    for (FileStatus f : fs.listStatus(usersPath)) {
      if (f.getPath().getName().startsWith("part")) {
        DistributedCache.addCacheFile(f.getPath().toUri(), conf);
      }
    }
  } else {
    DistributedCache.addCacheFile(usersPath.toUri(), conf);
  }

  Job job = new Job(conf);

  job.setJarByClass(FinalJoinJob.class);
  job.setMapperClass(GenericReplicatedJoin.class);

  job.setNumReduceTasks(0);

  job.setInputFormatClass(KeyValueTextInputFormat.class);

  outputPath.getFileSystem(conf).delete(outputPath, true);

  FileInputFormat.setInputPaths(job, userLogsPath);
  FileOutputFormat.setOutputPath(job, outputPath);

  if (!job.waitForCompletion(true)) {
    throw new Exception("Job failed");
  }
}
 
Example 12
Source File: AvroUtils.java    From ml-ease with Apache License 2.0
/**
 * Given a path to an output folder, finds the existing "*.avro" files and adds
 * them as cache files to be distributed.
 *
 * @param conf Job configuration
 * @param outPath The path to the HDFS directory that has part files to cache
 * @throws RuntimeException if no file is found at outPath
 */
public static void addAvroCacheFiles(JobConf conf, Path outPath) throws Exception
{
   FileStatus[] partFiles = getAvroPartFiles(conf, outPath);
   if (partFiles.length == 0)
   {      
     throw new RuntimeException("DistributedCacheFileUtils: No (part) file is found to cache at location:" + outPath );
   }
   
   for (FileStatus partFile : partFiles)
   {
     // add the file and set fileRead to true, since we have read at least one file
     DistributedCache.addCacheFile(partFile.getPath().toUri(), conf);
   }
 }
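The getAvroPartFiles() helper called above is not shown in this listing. A plausible reconstruction (an assumption, not ml-ease's actual code) simply lists the "*.avro" files under the given path:

// Hypothetical reconstruction of the getAvroPartFiles() helper.
import java.io.IOException;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.mapred.JobConf;

public class AvroPartFileLister {
  public static FileStatus[] getAvroPartFiles(JobConf conf, Path outPath)
      throws IOException {
    FileSystem fs = outPath.getFileSystem(conf);
    return fs.listStatus(outPath, new PathFilter() {
      @Override
      public boolean accept(Path path) {
        return path.getName().endsWith(".avro");
      }
    });
  }
}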
 
Example 13
Source File: JobExecutor.java    From Cubert with Apache License 2.0
protected void cacheFiles() throws URISyntaxException,
        IOException
{
    if (!root.has("cachedFiles") || root.get("cachedFiles").isNull()
            || root.get("cachedFiles").size() == 0)
        return;

    for (JsonNode cachedFile : root.path("cachedFiles"))
    {
        URI uri = new URI(cachedFile.getTextValue());
        print.f("CACHING file %s", uri);
        DistributedCache.addCacheFile(uri, conf);
    }
}
 
Example 14
Source File: MapJoin.java    From BigData-In-Practice with Apache License 2.0
public static void main(String[] args) {

        try {
            // Create the configuration
            Configuration conf = new Configuration();
            // Parse the command-line arguments
            String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
            // Abort when the arguments are invalid
            if (otherArgs.length != 3) {
                System.err.println("Usage: MyMapJoin <in1> <in2> <out>");
                System.exit(1);
            }

            // Assign the paths
            INPUT_PATH1 = otherArgs[0];
            INPUT_PATH2 = otherArgs[1];
            OUT_PATH = otherArgs[2];
            // Get a handle on the file system
            FileSystem fileSystem = FileSystem.get(new URI(OUT_PATH), conf);
            // Delete the output directory if it already exists
            if (fileSystem.exists(new Path(OUT_PATH))) {
                fileSystem.delete(new Path(OUT_PATH), true);
            }
            // File(s) to load into the distributed cache (any number of files can be added)
            DistributedCache.addCacheFile(new Path(INPUT_PATH2).toUri(), conf);

            // Create the job
            Job job = new Job(conf, MapJoin.class.getName());
            // Essential when the job is run from a jar
            job.setJarByClass(MapJoin.class);
            // 1.1 Set the input directory and the input format class
            FileInputFormat.setInputPaths(job, INPUT_PATH1);
            job.setInputFormatClass(TextInputFormat.class);

            // 1.2 Set the custom Mapper class and the map output key/value types
            job.setMapperClass(MapJoinMapper.class);
            job.setMapOutputKeyClass(NullWritable.class);
            job.setMapOutputValueClass(Emp_Dep.class);

            // 1.3 Set the partitioner and the number of reducers
            job.setPartitionerClass(HashPartitioner.class);
            job.setNumReduceTasks(0);

            FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));
            // Submit the job and exit
            System.exit(job.waitForCompletion(true) ? 0 : 1);

        } catch (Exception e) {
            e.printStackTrace();
        }
    }
 
Example 15
Source File: BloomJoin.java    From hiped2 with Apache License 2.0
public static void runJob(String inputPath,
                          Path outputPath,
                          Path bloomFilterPath)
    throws Exception {

  Configuration conf = new Configuration();

  DistributedCache.addCacheFile(bloomFilterPath.toUri(), conf);

  Job job = new Job(conf);

  job.setJarByClass(BloomJoin.class);
  job.setMapperClass(Map.class);

  job.setInputFormatClass(KeyValueTextInputFormat.class);

  job.setNumReduceTasks(0);

  outputPath.getFileSystem(conf).delete(outputPath, true);

  FileInputFormat.setInputPaths(job, inputPath);
  FileOutputFormat.setOutputPath(job, outputPath);

  job.waitForCompletion(true);
}
 
Example 16
Source File: TestMiniMRMapRedDebugScript.java    From RDFS with Apache License 2.0
/**
 * Launches failed map task and debugs the failed task
 * @param conf configuration for the mapred job
 * @param inDir input path
 * @param outDir output path
 * @param debugDir debug directory where script is present
 * @param debugScript The debug script to run
 * @param input Input text
 * @return the output of the debug script
 * @throws IOException
 */
public String launchFailMapAndDebug(JobConf conf,
                                    Path inDir,
                                    Path outDir,
                                    Path debugDir,
                                    String debugScript,
                                    String input)
throws IOException {

  // set up the input file system and write input text.
  FileSystem inFs = inDir.getFileSystem(conf);
  FileSystem outFs = outDir.getFileSystem(conf);
  outFs.delete(outDir, true);
  if (!inFs.mkdirs(inDir)) {
    throw new IOException("Mkdirs failed to create " + inDir.toString());
  }
  {
    // write input into input file
    DataOutputStream file = inFs.create(new Path(inDir, "part-0"));
    file.writeBytes(input);
    file.close();
  }

  // configure the mapred Job for failing map task.
  conf.setJobName("failmap");
  conf.setMapperClass(MapClass.class);        
  conf.setReducerClass(IdentityReducer.class);
  conf.setNumMapTasks(1);
  conf.setNumReduceTasks(0);
  conf.setMapDebugScript(debugScript);
  FileInputFormat.setInputPaths(conf, inDir);
  FileOutputFormat.setOutputPath(conf, outDir);
  String TEST_ROOT_DIR = new Path(System.getProperty("test.build.data",
                                    "/tmp")).toString().replace(' ', '+');
  conf.set("test.build.data", TEST_ROOT_DIR);

  // copy debug script to cache from local file system.
  FileSystem debugFs = debugDir.getFileSystem(conf);
  Path scriptPath = new Path(debugDir, "testscript.txt");
  Path cachePath = new Path("/cacheDir");
  if (!debugFs.mkdirs(cachePath)) {
    throw new IOException("Mkdirs failed to create " + cachePath.toString());
  }
  debugFs.copyFromLocalFile(scriptPath, cachePath);

  URI uri = debugFs.getUri().resolve(cachePath + "/testscript.txt#testscript");
  DistributedCache.createSymlink(conf);
  DistributedCache.addCacheFile(uri, conf);

  RunningJob job = null;
  // run the job. It will fail with IOException.
  try {
    job = new JobClient(conf).submitJob(conf);
  } catch (IOException e) {
    LOG.info("Running Job failed", e);
  }

  JobID jobId = job.getID();
  // construct the task id of first map task of failmap
  TaskAttemptID taskId = new TaskAttemptID(new TaskID(jobId, true, 0), 0);
  // wait for the job to finish.
  while (!job.isComplete()) ;
  
  // return the output of debugout log.
  return readTaskLog(TaskLog.LogName.DEBUGOUT,taskId, false);
}
 
Example 17
Source File: TestTaskLogsMonitor.java    From RDFS with Apache License 2.0
/**
 * Test the truncation of DEBUGOUT file by {@link TaskLogsMonitor}
 * @throws IOException 
 */
@Test
public void testDebugLogsTruncationWithMiniMR() throws IOException {

  MiniMRCluster mr = null;
  try {
    JobConf clusterConf = new JobConf();
    clusterConf.setLong(TaskTracker.MAP_USERLOG_RETAIN_SIZE, 10000L);
    clusterConf.setLong(TaskTracker.REDUCE_USERLOG_RETAIN_SIZE, 10000L);
    mr = new MiniMRCluster(1, "file:///", 3, null, null, clusterConf);

    JobConf conf = mr.createJobConf();

    Path inDir = new Path(TEST_ROOT_DIR + "/input");
    Path outDir = new Path(TEST_ROOT_DIR + "/output");
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(outDir)) {
      fs.delete(outDir, true);
    }
    if (!fs.exists(inDir)) {
      fs.mkdirs(inDir);
    }
    String input = "The quick brown fox jumped over the lazy dog";
    DataOutputStream file = fs.create(new Path(inDir, "part-0"));
    file.writeBytes(input);
    file.close();

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(conf, inDir);
    FileOutputFormat.setOutputPath(conf, outDir);
    conf.setNumMapTasks(1);
    conf.setMaxMapAttempts(1);
    conf.setNumReduceTasks(0);
    conf.setMapperClass(TestMiniMRMapRedDebugScript.MapClass.class);

    // copy debug script to cache from local file system.
    Path scriptPath = new Path(TEST_ROOT_DIR, "debug-script.txt");
    String debugScriptContent =
        "for ((i=0;i<1000;i++)); " + "do "
            + "echo \"Lots of logs! Lots of logs! "
            + "Waiting to be truncated! Lots of logs!\";" + "done";
    DataOutputStream scriptFile = fs.create(scriptPath);
    scriptFile.writeBytes(debugScriptContent);
    scriptFile.close();
    new File(scriptPath.toUri().getPath()).setExecutable(true);

    URI uri = scriptPath.toUri();
    DistributedCache.createSymlink(conf);
    DistributedCache.addCacheFile(uri, conf);
    conf.setMapDebugScript(scriptPath.toUri().getPath());

    RunningJob job = null;
    try {
      JobClient jc = new JobClient(conf);
      job = jc.submitJob(conf);
      try {
        jc.monitorAndPrintJob(conf, job);
      } catch (InterruptedException e) {
        //
      }
    } catch (IOException ioe) {
    } finally {
      for (TaskCompletionEvent tce : job.getTaskCompletionEvents(0)) {
        File debugOutFile =
            TaskLog.getTaskLogFile(tce.getTaskAttemptId(),
                TaskLog.LogName.DEBUGOUT);
        if (debugOutFile.exists()) {
          long length = debugOutFile.length();
          assertTrue("DEBUGOUT log file length for "
              + tce.getTaskAttemptId() + " is " + length
              + " and not =10000", length == 10000);
        }
      }
    }
  } finally { 
    if (mr != null) {
      mr.shutdown();
    }
  }
}
 
Example 18
Source File: BasicJobChaining.java    From hadoop-map-reduce-patterns with Apache License 2.0
public static void main(String[] args) throws Exception {
	Configuration conf = new Configuration();
	String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

	if (otherArgs.length != 3) {
		System.err.println("Usage: JobChainingDriver <posts> <users> <out>");
		System.exit(2);
	}

	Path postInput = new Path(otherArgs[0]);
	Path userInput = new Path(otherArgs[1]);
	Path outputDirIntermediate = new Path(otherArgs[2] + "_int");
	Path outputDir = new Path(otherArgs[2]);

	// Setup first job to count user posts
	Job countingJob = new Job(conf, "JobChaining-Counting");
	countingJob.setJarByClass(BasicJobChaining.class);

	// Set our mapper and reducer, we can use the API's long sum reducer for
	// a combiner!
	countingJob.setMapperClass(UserIdCountMapper.class);
	countingJob.setCombinerClass(LongSumReducer.class);
	countingJob.setReducerClass(UserIdSumReducer.class);

	countingJob.setOutputKeyClass(Text.class);
	countingJob.setOutputValueClass(LongWritable.class);

	countingJob.setInputFormatClass(TextInputFormat.class);

	TextInputFormat.addInputPath(countingJob, postInput);

	countingJob.setOutputFormatClass(TextOutputFormat.class);
	TextOutputFormat.setOutputPath(countingJob, outputDirIntermediate);

	// Execute job and grab exit code
	int code = countingJob.waitForCompletion(true) ? 0 : 1;

	if (code == 0) {
		// Calculate the average posts per user by getting counter values
		double numRecords = (double) countingJob.getCounters()
				.findCounter(AVERAGE_CALC_GROUP, UserIdCountMapper.RECORDS_COUNTER_NAME)
				.getValue();
		double numUsers = (double) countingJob.getCounters()
				.findCounter(AVERAGE_CALC_GROUP, UserIdSumReducer.USERS_COUNTER_NAME)
				.getValue();

		double averagePostsPerUser = numRecords / numUsers;

		// Setup binning job
		Job binningJob = new Job(new Configuration(), "JobChaining-Binning");
		binningJob.setJarByClass(BasicJobChaining.class);

		// Set mapper and the average posts per user
		binningJob.setMapperClass(UserIdBinningMapper.class);
		UserIdBinningMapper.setAveragePostsPerUser(binningJob, averagePostsPerUser);

		binningJob.setNumReduceTasks(0);

		binningJob.setInputFormatClass(TextInputFormat.class);
		TextInputFormat.addInputPath(binningJob, outputDirIntermediate);

		// Add two named outputs for below/above average
		MultipleOutputs.addNamedOutput(binningJob, MULTIPLE_OUTPUTS_BELOW_NAME,
				TextOutputFormat.class, Text.class, Text.class);

		MultipleOutputs.addNamedOutput(binningJob, MULTIPLE_OUTPUTS_ABOVE_NAME,
				TextOutputFormat.class, Text.class, Text.class);
		MultipleOutputs.setCountersEnabled(binningJob, true);

		TextOutputFormat.setOutputPath(binningJob, outputDir);

		// Add the user files to the DistributedCache
		FileStatus[] userFiles = FileSystem.get(conf).listStatus(userInput);
		for (FileStatus status : userFiles) {
			DistributedCache.addCacheFile(status.getPath().toUri(),
					binningJob.getConfiguration());
		}

		// Execute job and grab exit code
		code = binningJob.waitForCompletion(true) ? 0 : 1;
	}

	// Clean up the intermediate output
	FileSystem.get(conf).delete(outputDirIntermediate, true);

	System.exit(code);
}
 
Example 19
Source File: TestMRWithDistributedCache.java    From hadoop with Apache License 2.0
@Test (timeout = 1000)
public void testDeprecatedFunctions() throws Exception {
  DistributedCache.addLocalArchives(conf, "Test Local Archives 1");
  Assert.assertEquals("Test Local Archives 1",
      conf.get(DistributedCache.CACHE_LOCALARCHIVES));
  Assert.assertEquals(1,
      DistributedCache.getLocalCacheArchives(conf).length);
  Assert.assertEquals("Test Local Archives 1",
      DistributedCache.getLocalCacheArchives(conf)[0].getName());
  DistributedCache.addLocalArchives(conf, "Test Local Archives 2");
  Assert.assertEquals("Test Local Archives 1,Test Local Archives 2",
      conf.get(DistributedCache.CACHE_LOCALARCHIVES));
  Assert.assertEquals(2,
      DistributedCache.getLocalCacheArchives(conf).length);
  Assert.assertEquals("Test Local Archives 2",
      DistributedCache.getLocalCacheArchives(conf)[1].getName());
  DistributedCache.setLocalArchives(conf, "Test Local Archives 3");
  Assert.assertEquals("Test Local Archives 3",
      conf.get(DistributedCache.CACHE_LOCALARCHIVES));
  Assert.assertEquals(1,
      DistributedCache.getLocalCacheArchives(conf).length);
  Assert.assertEquals("Test Local Archives 3",
      DistributedCache.getLocalCacheArchives(conf)[0].getName());

  DistributedCache.addLocalFiles(conf, "Test Local Files 1");
  Assert.assertEquals("Test Local Files 1",
      conf.get(DistributedCache.CACHE_LOCALFILES));
  Assert.assertEquals(1,
      DistributedCache.getLocalCacheFiles(conf).length);
  Assert.assertEquals("Test Local Files 1",
      DistributedCache.getLocalCacheFiles(conf)[0].getName());
  DistributedCache.addLocalFiles(conf, "Test Local Files 2");
  Assert.assertEquals("Test Local Files 1,Test Local Files 2",
      conf.get(DistributedCache.CACHE_LOCALFILES));
  Assert.assertEquals(2,
      DistributedCache.getLocalCacheFiles(conf).length);
  Assert.assertEquals("Test Local Files 2",
      DistributedCache.getLocalCacheFiles(conf)[1].getName());
  DistributedCache.setLocalFiles(conf, "Test Local Files 3");
  Assert.assertEquals("Test Local Files 3",
      conf.get(DistributedCache.CACHE_LOCALFILES));
  Assert.assertEquals(1,
      DistributedCache.getLocalCacheFiles(conf).length);
  Assert.assertEquals("Test Local Files 3",
      DistributedCache.getLocalCacheFiles(conf)[0].getName());

  DistributedCache.setArchiveTimestamps(conf, "1234567890");
  Assert.assertEquals(1234567890,
      conf.getLong(DistributedCache.CACHE_ARCHIVES_TIMESTAMPS, 0));
  Assert.assertEquals(1,
      DistributedCache.getArchiveTimestamps(conf).length);
  Assert.assertEquals(1234567890,
      DistributedCache.getArchiveTimestamps(conf)[0]);
  DistributedCache.setFileTimestamps(conf, "1234567890");
  Assert.assertEquals(1234567890,
      conf.getLong(DistributedCache.CACHE_FILES_TIMESTAMPS, 0));
  Assert.assertEquals(1,
      DistributedCache.getFileTimestamps(conf).length);
  Assert.assertEquals(1234567890,
      DistributedCache.getFileTimestamps(conf)[0]);

  DistributedCache.createAllSymlink(conf, new File("Test Job Cache Dir"),
      new File("Test Work Dir"));
  Assert.assertNull(conf.get(DistributedCache.CACHE_SYMLINK));
  Assert.assertTrue(DistributedCache.getSymlink(conf));

  Assert.assertTrue(symlinkFile.createNewFile());
  FileStatus fileStatus =
      DistributedCache.getFileStatus(conf, symlinkFile.toURI());
  Assert.assertNotNull(fileStatus);
  Assert.assertEquals(fileStatus.getModificationTime(),
      DistributedCache.getTimestamp(conf, symlinkFile.toURI()));
  Assert.assertTrue(symlinkFile.delete());

  DistributedCache.addCacheArchive(symlinkFile.toURI(), conf);
  Assert.assertEquals(symlinkFile.toURI().toString(),
      conf.get(DistributedCache.CACHE_ARCHIVES));
  Assert.assertEquals(1, DistributedCache.getCacheArchives(conf).length);
  Assert.assertEquals(symlinkFile.toURI(),
      DistributedCache.getCacheArchives(conf)[0]);

  DistributedCache.addCacheFile(symlinkFile.toURI(), conf);
  Assert.assertEquals(symlinkFile.toURI().toString(),
      conf.get(DistributedCache.CACHE_FILES));
  Assert.assertEquals(1, DistributedCache.getCacheFiles(conf).length);
  Assert.assertEquals(symlinkFile.toURI(),
      DistributedCache.getCacheFiles(conf)[0]);
}
 
Example 20
Source File: CalculateSimilarityStep6.java    From RecommendationEngine with MIT License
public static void run() throws IOException, ClassNotFoundException,
		InterruptedException, URISyntaxException {

	Configuration conf = new Configuration();
	HDFS hdfs = new HDFS(conf);

	// hdfs.download(HDFS.HDFSPATH + "/step4/part-r-00000",
	// ItemBasedCFDriver.LOCALPATH + "/step4/part-r-00000");
	// hdfs.download(HDFS.HDFSPATH + "/step5/part-r-00000",
	// ItemBasedCFDriver.LOCALPATH + "/step5/part-r-00000");

	// String inputPath1 = "/var/ItemBased/step4/part-r-00000";
	// (how many users like each item)
	// String inputPath2 = "/var/ItemBased/step5/part-r-00000";
	// (how many items each user likes)

	String inputPath1 = HDFS.HDFSPATH + "/step4/part-r-00000";
	String inputPath2 = HDFS.HDFSPATH + "/step5/part-r-00000";

	String inputPath3 = ItemBasedCFDriver.path.get("step6InputPath");
	String outputPath = ItemBasedCFDriver.path.get("step6OutputPath");

	conf.set("mapreduce.output.textoutputformat.separator", ":");

	DistributedCache.addCacheFile(new Path(inputPath1).toUri(), conf);
	DistributedCache.addCacheFile(new Path(inputPath2).toUri(), conf);

	Job job = Job.getInstance(conf);

	hdfs.rmr(outputPath);

	job.setMapperClass(Step6_Mapper.class);

	job.setJarByClass(CalculateSimilarityStep6.class);

	job.setMapOutputKeyClass(Text.class);
	job.setMapOutputValueClass(DoubleWritable.class);

	job.setInputFormatClass(TextInputFormat.class);
	job.setOutputFormatClass(TextOutputFormat.class);

	FileInputFormat.setInputPaths(job, new Path(inputPath3));
	FileOutputFormat.setOutputPath(job, new Path(outputPath));

	job.waitForCompletion(true);
}