Java Code Examples for org.apache.hadoop.mapreduce.filecache.DistributedCache#addCacheFile()

The following examples show how to use org.apache.hadoop.mapreduce.filecache.DistributedCache#addCacheFile(), drawn from several open-source projects. The originating source file and license are noted with each example.
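Before the examples, here is a minimal, self-contained sketch of the pattern they all follow: a driver registers a file with the distributed cache so that the framework localizes it on every task node. The class name and paths below are placeholders rather than code from any of the projects listed, and on current Hadoop versions the non-deprecated Job#addCacheFile is usually preferred over the static DistributedCache call.

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;

public class CacheFileDriverSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "cache-file-example");

    // The URI fragment ("#lookup.txt") becomes the name of the symlink created
    // in each task's working directory when the file is localized.
    DistributedCache.addCacheFile(new URI("/data/lookup.txt#lookup.txt"), job.getConfiguration());

    // Equivalent call through the non-deprecated Job API:
    // job.addCacheFile(new URI("/data/lookup.txt#lookup.txt"));
  }
}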
Example 1
Source File: MRJobLauncher.java    From incubator-gobblin with Apache License 2.0
/**
 * Add local non-jar files the job depends on to DistributedCache.
 */
@SuppressWarnings("deprecation")
private void addLocalFiles(Path jobFileDir, String jobFileList, Configuration conf) throws IOException {
  DistributedCache.createSymlink(conf);
  for (String jobFile : SPLITTER.split(jobFileList)) {
    Path srcJobFile = new Path(jobFile);
    // DistributedCache requires an absolute path, so we need to use makeQualified.
    Path destJobFile = new Path(this.fs.makeQualified(jobFileDir), srcJobFile.getName());
    // Copy the file from local file system to HDFS
    this.fs.copyFromLocalFile(srcJobFile, destJobFile);
    // Create a URI that is in the form path#symlink
    URI destFileUri = URI.create(destJobFile.toUri().getPath() + "#" + destJobFile.getName());
    LOG.info(String.format("Adding %s to DistributedCache", destFileUri));
    // Finally add the file to DistributedCache with a symlink named after the file name
    DistributedCache.addCacheFile(destFileUri, conf);
  }
}
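On the task side, a file added with a path#symlink URI is normally visible under that symlink name in the task's working directory, so it can be read with plain Java I/O. The mapper below is a hedged illustration of that read path; the file name, key/value types, and class name are hypothetical and not part of MRJobLauncher.

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class SymlinkReadingMapper extends Mapper<LongWritable, Text, Text, Text> {

  @Override
  protected void setup(Context context) throws IOException {
    // "side-data.properties" stands in for whatever name was used as the URI
    // fragment when the driver called DistributedCache.addCacheFile().
    try (BufferedReader reader = new BufferedReader(new FileReader("side-data.properties"))) {
      String line;
      while ((line = reader.readLine()) != null) {
        // parse the side data needed by map()
      }
    }
  }
}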
 
Example 2
Source File: GCPCredentialCopier.java    From circus-train with Apache License 2.0
private void linkRelativePathInDistributedCache(Configuration conf, Path source, Path destination)
  throws URISyntaxException, IOException {
  /*
   * The "#" fragment links the HDFS location of the key file to the local file system path expected
   * by the credential provider, so that the GoogleHadoopFileSystem can resolve the key file through
   * a local file system URI when the DistCp job runs, even though it actually lives in a
   * distributed file system.
   */
  String cacheFileUri = destination.toString() + "#" + source;
  DistributedCache.addCacheFile(new URI(cacheFileUri), conf);
  LOG.info("mapreduce.job.cache.files : {}", conf.get("mapreduce.job.cache.files"));
  conf.set(GCP_KEYFILE_CACHED_LOCATION, source.toString());
}
 
Example 3
Source File: TestMRApps.java    From hadoop with Apache License 2.0
@SuppressWarnings("deprecation")
@Test
public void testSetupDistributedCacheConflicts() throws Exception {
  Configuration conf = new Configuration();
  conf.setClass("fs.mockfs.impl", MockFileSystem.class, FileSystem.class);
  
  URI mockUri = URI.create("mockfs://mock/");
  FileSystem mockFs = ((FilterFileSystem)FileSystem.get(mockUri, conf))
      .getRawFileSystem();
  
  URI archive = new URI("mockfs://mock/tmp/something.zip#something");
  Path archivePath = new Path(archive);
  URI file = new URI("mockfs://mock/tmp/something.txt#something");
  Path filePath = new Path(file);
  
  when(mockFs.resolvePath(archivePath)).thenReturn(archivePath);
  when(mockFs.resolvePath(filePath)).thenReturn(filePath);
  
  DistributedCache.addCacheArchive(archive, conf);
  conf.set(MRJobConfig.CACHE_ARCHIVES_TIMESTAMPS, "10");
  conf.set(MRJobConfig.CACHE_ARCHIVES_SIZES, "10");
  conf.set(MRJobConfig.CACHE_ARCHIVES_VISIBILITIES, "true");
  DistributedCache.addCacheFile(file, conf);
  conf.set(MRJobConfig.CACHE_FILE_TIMESTAMPS, "11");
  conf.set(MRJobConfig.CACHE_FILES_SIZES, "11");
  conf.set(MRJobConfig.CACHE_FILE_VISIBILITIES, "true");
  Map<String, LocalResource> localResources = 
    new HashMap<String, LocalResource>();
  MRApps.setupDistributedCache(conf, localResources);
  
  assertEquals(1, localResources.size());
  LocalResource lr = localResources.get("something");
  //Archive wins
  assertNotNull(lr);
  assertEquals(10L, lr.getSize());
  assertEquals(10L, lr.getTimestamp());
  assertEquals(LocalResourceType.ARCHIVE, lr.getType());
}
 
Example 4
Source File: TestMRApps.java    From hadoop with Apache License 2.0
@SuppressWarnings("deprecation")
@Test
public void testSetupDistributedCacheConflictsFiles() throws Exception {
  Configuration conf = new Configuration();
  conf.setClass("fs.mockfs.impl", MockFileSystem.class, FileSystem.class);
  
  URI mockUri = URI.create("mockfs://mock/");
  FileSystem mockFs = ((FilterFileSystem)FileSystem.get(mockUri, conf))
      .getRawFileSystem();
  
  URI file = new URI("mockfs://mock/tmp/something.zip#something");
  Path filePath = new Path(file);
  URI file2 = new URI("mockfs://mock/tmp/something.txt#something");
  Path file2Path = new Path(file2);
  
  when(mockFs.resolvePath(filePath)).thenReturn(filePath);
  when(mockFs.resolvePath(file2Path)).thenReturn(file2Path);
  
  DistributedCache.addCacheFile(file, conf);
  DistributedCache.addCacheFile(file2, conf);
  conf.set(MRJobConfig.CACHE_FILE_TIMESTAMPS, "10,11");
  conf.set(MRJobConfig.CACHE_FILES_SIZES, "10,11");
  conf.set(MRJobConfig.CACHE_FILE_VISIBILITIES, "true,true");
  Map<String, LocalResource> localResources = 
    new HashMap<String, LocalResource>();
  MRApps.setupDistributedCache(conf, localResources);
  
  assertEquals(1, localResources.size());
  LocalResource lr = localResources.get("something");
  //First one wins
  assertNotNull(lr);
  assertEquals(10L, lr.getSize());
  assertEquals(10L, lr.getTimestamp());
  assertEquals(LocalResourceType.FILE, lr.getType());
}
 
Example 5
Source File: TestMRApps.java    From big-c with Apache License 2.0
@SuppressWarnings("deprecation")
@Test
public void testSetupDistributedCacheConflicts() throws Exception {
  Configuration conf = new Configuration();
  conf.setClass("fs.mockfs.impl", MockFileSystem.class, FileSystem.class);
  
  URI mockUri = URI.create("mockfs://mock/");
  FileSystem mockFs = ((FilterFileSystem)FileSystem.get(mockUri, conf))
      .getRawFileSystem();
  
  URI archive = new URI("mockfs://mock/tmp/something.zip#something");
  Path archivePath = new Path(archive);
  URI file = new URI("mockfs://mock/tmp/something.txt#something");
  Path filePath = new Path(file);
  
  when(mockFs.resolvePath(archivePath)).thenReturn(archivePath);
  when(mockFs.resolvePath(filePath)).thenReturn(filePath);
  
  DistributedCache.addCacheArchive(archive, conf);
  conf.set(MRJobConfig.CACHE_ARCHIVES_TIMESTAMPS, "10");
  conf.set(MRJobConfig.CACHE_ARCHIVES_SIZES, "10");
  conf.set(MRJobConfig.CACHE_ARCHIVES_VISIBILITIES, "true");
  DistributedCache.addCacheFile(file, conf);
  conf.set(MRJobConfig.CACHE_FILE_TIMESTAMPS, "11");
  conf.set(MRJobConfig.CACHE_FILES_SIZES, "11");
  conf.set(MRJobConfig.CACHE_FILE_VISIBILITIES, "true");
  Map<String, LocalResource> localResources = 
    new HashMap<String, LocalResource>();
  MRApps.setupDistributedCache(conf, localResources);
  
  assertEquals(1, localResources.size());
  LocalResource lr = localResources.get("something");
  //Archive wins
  assertNotNull(lr);
  assertEquals(10L, lr.getSize());
  assertEquals(10L, lr.getTimestamp());
  assertEquals(LocalResourceType.ARCHIVE, lr.getType());
}
 
Example 6
Source File: TestMRApps.java    From big-c with Apache License 2.0
@SuppressWarnings("deprecation")
@Test
public void testSetupDistributedCacheConflictsFiles() throws Exception {
  Configuration conf = new Configuration();
  conf.setClass("fs.mockfs.impl", MockFileSystem.class, FileSystem.class);
  
  URI mockUri = URI.create("mockfs://mock/");
  FileSystem mockFs = ((FilterFileSystem)FileSystem.get(mockUri, conf))
      .getRawFileSystem();
  
  URI file = new URI("mockfs://mock/tmp/something.zip#something");
  Path filePath = new Path(file);
  URI file2 = new URI("mockfs://mock/tmp/something.txt#something");
  Path file2Path = new Path(file2);
  
  when(mockFs.resolvePath(filePath)).thenReturn(filePath);
  when(mockFs.resolvePath(file2Path)).thenReturn(file2Path);
  
  DistributedCache.addCacheFile(file, conf);
  DistributedCache.addCacheFile(file2, conf);
  conf.set(MRJobConfig.CACHE_FILE_TIMESTAMPS, "10,11");
  conf.set(MRJobConfig.CACHE_FILES_SIZES, "10,11");
  conf.set(MRJobConfig.CACHE_FILE_VISIBILITIES, "true,true");
  Map<String, LocalResource> localResources = 
    new HashMap<String, LocalResource>();
  MRApps.setupDistributedCache(conf, localResources);
  
  assertEquals(1, localResources.size());
  LocalResource lr = localResources.get("something");
  //First one wins
  assertNotNull(lr);
  assertEquals(10L, lr.getSize());
  assertEquals(10L, lr.getTimestamp());
  assertEquals(LocalResourceType.FILE, lr.getType());
}
 
Example 7
Source File: MRJobLauncher.java    From incubator-gobblin with Apache License 2.0
@VisibleForTesting
static void serializeJobState(FileSystem fs, Path mrJobDir, Configuration conf, JobState jobState, Job job)
    throws IOException {
  Path jobStateFilePath = new Path(mrJobDir, JOB_STATE_FILE_NAME);
  // Write the job state with an empty task set (work units are read by the mapper from a different file)
  try (DataOutputStream dataOutputStream = new DataOutputStream(fs.create(jobStateFilePath))) {
    jobState.write(dataOutputStream, false,
        conf.getBoolean(SERIALIZE_PREVIOUS_WORKUNIT_STATES_KEY, DEFAULT_SERIALIZE_PREVIOUS_WORKUNIT_STATES));
  }

  job.getConfiguration().set(ConfigurationKeys.JOB_STATE_FILE_PATH_KEY, jobStateFilePath.toString());

  DistributedCache.addCacheFile(jobStateFilePath.toUri(), job.getConfiguration());
  job.getConfiguration().set(ConfigurationKeys.JOB_STATE_DISTRIBUTED_CACHE_NAME, jobStateFilePath.getName());
}
 
Example 8
Source File: MRJobLauncher.java    From incubator-gobblin with Apache License 2.0
/**
 * Add non-jar files already on HDFS that the job depends on to DistributedCache.
 */
@SuppressWarnings("deprecation")
private void addHDFSFiles(String jobFileList, Configuration conf) {
  DistributedCache.createSymlink(conf);
  jobFileList = PasswordManager.getInstance(this.jobProps).readPassword(jobFileList);
  for (String jobFile : SPLITTER.split(jobFileList)) {
    Path srcJobFile = new Path(jobFile);
    // Create a URI that is in the form path#symlink
    URI srcFileUri = URI.create(srcJobFile.toUri().getPath() + "#" + srcJobFile.getName());
    LOG.info(String.format("Adding %s to DistributedCache", srcFileUri));
    // Finally add the file to DistributedCache with a symlink named after the file name
    DistributedCache.addCacheFile(srcFileUri, conf);
  }
}
 
Example 9
Source File: TestMRApps.java    From hadoop with Apache License 2.0
@SuppressWarnings("deprecation")
@Test (timeout = 30000)
public void testSetupDistributedCache() throws Exception {
  Configuration conf = new Configuration();
  conf.setClass("fs.mockfs.impl", MockFileSystem.class, FileSystem.class);
  
  URI mockUri = URI.create("mockfs://mock/");
  FileSystem mockFs = ((FilterFileSystem)FileSystem.get(mockUri, conf))
      .getRawFileSystem();
  
  URI archive = new URI("mockfs://mock/tmp/something.zip");
  Path archivePath = new Path(archive);
  URI file = new URI("mockfs://mock/tmp/something.txt#something");
  Path filePath = new Path(file);
  
  when(mockFs.resolvePath(archivePath)).thenReturn(archivePath);
  when(mockFs.resolvePath(filePath)).thenReturn(filePath);
  
  DistributedCache.addCacheArchive(archive, conf);
  conf.set(MRJobConfig.CACHE_ARCHIVES_TIMESTAMPS, "10");
  conf.set(MRJobConfig.CACHE_ARCHIVES_SIZES, "10");
  conf.set(MRJobConfig.CACHE_ARCHIVES_VISIBILITIES, "true");
  DistributedCache.addCacheFile(file, conf);
  conf.set(MRJobConfig.CACHE_FILE_TIMESTAMPS, "11");
  conf.set(MRJobConfig.CACHE_FILES_SIZES, "11");
  conf.set(MRJobConfig.CACHE_FILE_VISIBILITIES, "true");
  Map<String, LocalResource> localResources = 
    new HashMap<String, LocalResource>();
  MRApps.setupDistributedCache(conf, localResources);
  assertEquals(2, localResources.size());
  LocalResource lr = localResources.get("something.zip");
  assertNotNull(lr);
  assertEquals(10L, lr.getSize());
  assertEquals(10L, lr.getTimestamp());
  assertEquals(LocalResourceType.ARCHIVE, lr.getType());
  lr = localResources.get("something");
  assertNotNull(lr);
  assertEquals(11L, lr.getSize());
  assertEquals(11L, lr.getTimestamp());
  assertEquals(LocalResourceType.FILE, lr.getType());
}
 
Example 10
Source File: Job.java    From hadoop with Apache License 2.0
/**
 * Add a file to be localized
 * @param uri The uri of the cache to be localized
 */
public void addCacheFile(URI uri) {
  ensureState(JobState.DEFINE);
  DistributedCache.addCacheFile(uri, conf);
}
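A driver would typically call this wrapper instead of the deprecated static API, registering the URI while the job is still being defined. The sketch below is a hypothetical usage, with a placeholder HDFS path and symlink name:

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class AddCacheFileUsage {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "cache-file-usage");

    // Must be called before submission, while the job is still in the DEFINE state.
    job.addCacheFile(new URI("hdfs:///data/lookup/airports.csv#airports.csv"));

    // ... configure mapper, reducer, input and output paths, then submit the job ...
  }
}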
 
Example 11
Source File: TestMRApps.java    From big-c with Apache License 2.0
@SuppressWarnings("deprecation")
@Test (timeout = 30000)
public void testSetupDistributedCache() throws Exception {
  Configuration conf = new Configuration();
  conf.setClass("fs.mockfs.impl", MockFileSystem.class, FileSystem.class);
  
  URI mockUri = URI.create("mockfs://mock/");
  FileSystem mockFs = ((FilterFileSystem)FileSystem.get(mockUri, conf))
      .getRawFileSystem();
  
  URI archive = new URI("mockfs://mock/tmp/something.zip");
  Path archivePath = new Path(archive);
  URI file = new URI("mockfs://mock/tmp/something.txt#something");
  Path filePath = new Path(file);
  
  when(mockFs.resolvePath(archivePath)).thenReturn(archivePath);
  when(mockFs.resolvePath(filePath)).thenReturn(filePath);
  
  DistributedCache.addCacheArchive(archive, conf);
  conf.set(MRJobConfig.CACHE_ARCHIVES_TIMESTAMPS, "10");
  conf.set(MRJobConfig.CACHE_ARCHIVES_SIZES, "10");
  conf.set(MRJobConfig.CACHE_ARCHIVES_VISIBILITIES, "true");
  DistributedCache.addCacheFile(file, conf);
  conf.set(MRJobConfig.CACHE_FILE_TIMESTAMPS, "11");
  conf.set(MRJobConfig.CACHE_FILES_SIZES, "11");
  conf.set(MRJobConfig.CACHE_FILE_VISIBILITIES, "true");
  Map<String, LocalResource> localResources = 
    new HashMap<String, LocalResource>();
  MRApps.setupDistributedCache(conf, localResources);
  assertEquals(2, localResources.size());
  LocalResource lr = localResources.get("something.zip");
  assertNotNull(lr);
  assertEquals(10L, lr.getSize());
  assertEquals(10L, lr.getTimestamp());
  assertEquals(LocalResourceType.ARCHIVE, lr.getType());
  lr = localResources.get("something");
  assertNotNull(lr);
  assertEquals(11L, lr.getSize());
  assertEquals(11L, lr.getTimestamp());
  assertEquals(LocalResourceType.FILE, lr.getType());
}
 
Example 12
Source File: Job.java    From big-c with Apache License 2.0
/**
 * Add a file to be localized
 * @param uri The uri of the cache to be localized
 */
public void addCacheFile(URI uri) {
  ensureState(JobState.DEFINE);
  DistributedCache.addCacheFile(uri, conf);
}
 
Example 13
Source File: TotalSortMapReduce.java    From hiped2 with Apache License 2.0
/**
 * The MapReduce driver - setup and launch the job.
 *
 * @param args the command-line arguments
 * @return the process exit code
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {

  int numReducers = 2;

  Cli cli = Cli.builder().setArgs(args).addOptions(CliOpts.values()).build();
  int result = cli.runCmd();

  if (result != 0) {
    return result;
  }

  Path input = new Path(cli.getArgValueAsString(CliOpts.INPUT));
  Path partitionFile = new Path(cli.getArgValueAsString(CliOpts.PARTITION));
  Path output = new Path(cli.getArgValueAsString(CliOpts.OUTPUT));


  InputSampler.Sampler<Text, Text> sampler =
      new InputSampler.RandomSampler<Text, Text>(0.1, 10000, 10);

  Configuration conf = super.getConf();

  Job job = new Job(conf);
  job.setJarByClass(TotalSortMapReduce.class);

  job.setNumReduceTasks(numReducers);

  job.setInputFormatClass(KeyValueTextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  job.setPartitionerClass(TotalOrderPartitioner.class);

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);

  TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
  FileInputFormat.setInputPaths(job, input);
  FileOutputFormat.setOutputPath(job, output);

  InputSampler.writePartitionFile(job, sampler);

  URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
  // Add to the job's own Configuration: new Job(conf) took a copy of conf, so
  // changes made to conf at this point would not reach the submitted job.
  DistributedCache.addCacheFile(partitionUri, job.getConfiguration());

  if (job.waitForCompletion(true)) {
    return 0;
  }
  return 1;
}
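Since run() relies on super.getConf(), TotalSortMapReduce presumably follows the usual Configured/Tool pattern, in which case it would be launched through ToolRunner so that generic options are parsed into the Configuration before run() executes. A hedged launch sketch, assuming the class implements Tool:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

public class TotalSortLauncher {
  public static void main(String[] args) throws Exception {
    // TotalSortMapReduce is assumed to implement org.apache.hadoop.util.Tool,
    // consistent with its use of super.getConf() in run().
    int exitCode = ToolRunner.run(new Configuration(), new TotalSortMapReduce(), args);
    System.exit(exitCode);
  }
}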