Java Code Examples for org.apache.hadoop.filecache.DistributedCache#addFileToClassPath()

The following examples show how to use org.apache.hadoop.filecache.DistributedCache#addFileToClassPath(). You can vote up the examples you like or vote down the ones you don't, and you can go to the original project or source file by following the links above each example. You may also check out the related API usage on the sidebar.
Example 1
Source File: JobLibLoader.java    From SpyGlass with Apache License 2.0 6 votes vote down vote up
/**
 * Recursively scans {@code libPathStr} on the default FileSystem and adds
 * every jar file found to the job's distributed-cache classpath.
 *
 * @param libPathStr directory to scan for jar files (recursed into)
 * @param config     job configuration receiving the classpath entries
 */
public static void loadJars(String libPathStr, Configuration config) {
	
	try {
		Path libPath = new Path(libPathStr);

		FileSystem fs = FileSystem.get(config);

		// 'true' => recurse into sub-directories so nested lib layouts are found.
		RemoteIterator<LocatedFileStatus> itr = fs.listFiles(libPath, true);

		while (itr.hasNext()) {
			LocatedFileStatus f = itr.next();

			// NOTE(review): endsWith("jar") also matches names like "foojar";
			// endsWith(".jar") is probably intended — confirm before tightening.
			if (!f.isDirectory() && f.getPath().getName().endsWith("jar")) {
				logger.info("Loading Jar : " + f.getPath().getName());
				DistributedCache.addFileToClassPath(f.getPath(), config);
			}
		}
	} catch (Exception e) {
		// Log the full stack trace; printStackTrace() + logger.error(e.toString())
		// dumped to stderr and dropped the trace from the application log.
		logger.error("Failed to load jars from " + libPathStr, e);
	}
}
 
Example 2
Source File: MRCompactorJobRunner.java    From incubator-gobblin with Apache License 2.0 5 votes vote down vote up
/**
 * Registers every jar found under the configured compaction-jar directory
 * on the job's distributed-cache classpath. Does nothing when the dataset's
 * job properties carry no {@code MRCompactor.COMPACTION_JARS} entry.
 *
 * @param conf job configuration receiving the classpath entries
 * @throws IOException if the jar directory cannot be listed or a jar
 *                     cannot be added to the cache
 */
private void addJars(Configuration conf) throws IOException {
  if (this.dataset.jobProps().contains(MRCompactor.COMPACTION_JARS)) {
    Path jarDir = new Path(this.dataset.jobProps().getProp(MRCompactor.COMPACTION_JARS));
    for (FileStatus jarStatus : this.fs.listStatus(jarDir)) {
      DistributedCache.addFileToClassPath(jarStatus.getPath(), conf, this.fs);
    }
  }
}
 
Example 3
Source File: CompactionJobConfigurator.java    From incubator-gobblin with Apache License 2.0 5 votes vote down vote up
/**
 * Adds each jar in the directory named by {@code MRCompactor.COMPACTION_JARS}
 * to the distributed-cache classpath; returns immediately when the state
 * carries no such key.
 *
 * @param conf  job configuration receiving the classpath entries
 * @param state job state possibly holding the compaction-jar directory
 * @param fs    filesystem on which the jar directory lives
 * @throws IOException if listing the directory or caching a jar fails
 */
protected void addJars(Configuration conf, State state, FileSystem fs) throws IOException {
  if (!state.contains(MRCompactor.COMPACTION_JARS)) {
    return;
  }
  FileStatus[] jars = fs.listStatus(new Path(state.getProp(MRCompactor.COMPACTION_JARS)));
  for (FileStatus jar : jars) {
    DistributedCache.addFileToClassPath(jar.getPath(), conf, fs);
  }
}
 
Example 4
Source File: JobControlCompiler.java    From spork with Apache License 2.0 5 votes vote down vote up
/**
 * Ensures the jar at {@code url} is on the job's distributed-cache classpath.
 * If an equivalent file is already registered in the cache it is reused;
 * otherwise the jar is copied to HDFS first and then added.
 *
 * @param pigContext the pigContext (used when shipping the jar to HDFS)
 * @param conf the job conf to register the cache entry on
 * @param url the url of the jar to be added to the distributed cache
 * @throws IOException if the cache lookup, HDFS copy, or registration fails
 */
@SuppressWarnings("deprecation")
private static void putJarOnClassPathThroughDistributedCache(
        PigContext pigContext,
        Configuration conf,
        URL url) throws IOException {

    // Turn on the symlink feature
    DistributedCache.createSymlink(conf);

    // Reuse an entry already placed in the cache (e.g. by an external
    // scheduler) instead of shipping the same jar a second time.
    Path distCachePath = getExistingDistCacheFilePath(conf, url);
    if (distCachePath != null) {
        log.info("Jar file " + url + " already in DistributedCache as "
                + distCachePath + ". Not copying to hdfs and adding again");
        // Path already in dist cache
        if (!HadoopShims.isHadoopYARN()) {
            // Mapreduce in YARN includes $PWD/* which will add all *.jar files in classpath.
            // So don't have to ensure that the jar is separately added to mapreduce.job.classpath.files
            // But path may only be in 'mapred.cache.files' and not be in
            // 'mapreduce.job.classpath.files' in Hadoop 1.x. So adding it there
            DistributedCache.addFileToClassPath(distCachePath, conf, distCachePath.getFileSystem(conf));
        }
    }
    else {
        // REGISTER always copies locally the jar file. see PigServer.registerJar()
        Path pathInHDFS = shipToHDFS(pigContext, conf, url);
        DistributedCache.addFileToClassPath(pathInHDFS, conf, FileSystem.get(conf));
        log.info("Added jar " + url + " to DistributedCache through " + pathInHDFS);
    }

}
 
Example 5
Source File: DependencyLoader.java    From mrgeo with Apache License 2.0 5 votes vote down vote up
/**
 * Ensures {@code file} is present in HDFS under {@code hdfsBase} and
 * registered on the distributed-cache classpath, copying it up only when
 * the HDFS copy is missing or older than the local file. Paths already
 * recorded in {@code existing} are skipped entirely.
 *
 * @param conf     job configuration receiving the classpath entry
 * @param existing set of HDFS paths handled so far (updated on success)
 * @param fs       target filesystem
 * @param hdfsBase HDFS directory jars are staged under
 * @param file     local file to stage and register
 * @throws IOException on any filesystem or cache failure
 */
private static void addFileToClasspath(Configuration conf, Set<String> existing, FileSystem fs, Path hdfsBase,
    File file) throws IOException
{
  Path target = new Path(hdfsBase, file.getName());
  String key = target.toString();

  if (existing.contains(key))
  {
    return; // already staged during this run
  }

  // The HDFS copy is reusable when it exists and is at least as new as the
  // local file.
  boolean upToDate = fs.exists(target)
      && file.lastModified() <= fs.getFileStatus(target).getModificationTime();

  if (upToDate)
  {
    log.debug(file.getPath() + " up to date");
  }
  else
  {
    log.debug("Copying " + file.getPath() + " to HDFS for distribution");
    fs.copyFromLocalFile(new Path(file.getCanonicalFile().toURI()), target);
  }

  DistributedCache.addFileToClassPath(target, conf, fs);
  existing.add(key);
}
 
Example 6
Source File: QueryTestParams.java    From elasticsearch-hadoop with Apache License 2.0 5 votes vote down vote up
@SuppressWarnings("deprecation")
/**
 * Ships the DSL and URI query fixture files to the distributed cache so
 * remote tasks can load them from the classpath. Returns {@code cfg}
 * unchanged when running in local mode, where the cache is unnecessary.
 *
 * @param cfg job configuration to provision
 * @return the same configuration instance, for chaining
 * @throws java.io.UncheckedIOException if unpacking or caching a resource fails
 */
@SuppressWarnings("deprecation")
public <T extends Configuration> T provisionQueries(T cfg) {
    if (HadoopCfgUtils.isLocal(cfg)) {
        return cfg;
    }

    try {
        DistributedCache.addFileToClassPath(new Path(TestData.unpackResource(QUERY_DSL, stagingLocation).getAbsolutePath()), cfg);
        DistributedCache.addFileToClassPath(new Path(TestData.unpackResource(QUERY_URI, stagingLocation).getAbsolutePath()), cfg);
    } catch (IOException ex) {
        // Previously swallowed silently, leaving the job to fail later with a
        // confusing missing-resource error; surface the failure at its source
        // and preserve the cause.
        throw new java.io.UncheckedIOException("Failed to provision query resources", ex);
    }
    return cfg;
}
 
Example 7
Source File: TestJobControlCompiler.java    From spork with Apache License 2.0 4 votes vote down vote up
/**
 * Verifies that compiling a job does not put duplicate jar entries on the
 * distributed cache, covering both symlinked ("path#alias") and plain jar
 * specifications, and that entries added externally before compilation are
 * reused rather than re-shipped to a temporary HDFS path.
 */
 @Test
 public void testNoDuplicateJarsInDistributedCache() throws Exception {

     // JobControlCompiler setup
     final PigServer pigServer = new PigServer(ExecType.MAPREDUCE);
     PigContext pigContext = pigServer.getPigContext();
     pigContext.connect();

     // Pre-populate the cache (simulating an external scheduler such as Oozie),
     // then register overlapping jars through Pig itself.
     Configuration conf = new Configuration();
     DistributedCache.addFileToClassPath(new Path(new URI("/lib/udf-0.jar#udf.jar")), conf, FileSystem.get(conf));
     DistributedCache.addFileToClassPath(new Path(new URI("/lib/udf1.jar#diffname.jar")), conf, FileSystem.get(conf));
     DistributedCache.addFileToClassPath(new Path(new URI("/lib/udf2.jar")), conf, FileSystem.get(conf));
     createAndAddResource("udf.jar", pigContext);
     createAndAddResource("udf1.jar", pigContext);
     createAndAddResource("udf2.jar", pigContext);
     createAndAddResource("another.jar", pigContext);

     final JobConf jobConf = compileTestJob(pigContext, conf);

     // verifying the jar gets on distributed cache
     URI[] cacheURIs = DistributedCache.getCacheFiles(jobConf);
     Path[] fileClassPaths = DistributedCache.getFileClassPaths(jobConf);
     // expected - 1. udf.jar#udf.jar, 2. udf1.jar#diffname.jar 3. udf2.jar (same added twice)
     // 4. another.jar and 5. udf1.jar, and not duplicate udf.jar
     System.out.println("cache.files= " + Arrays.toString(cacheURIs));
     System.out.println("classpath.files= " + Arrays.toString(fileClassPaths));
     if (HadoopShims.isHadoopYARN()) {
         // Default jars - pig, antlr, joda-time, automaton
         // Other jars - udf.jar#udf.jar, udf1.jar#diffname.jar, udf2.jar, udf1.jar, another.jar
         // NOTE(review): the expected total is 9; the original per-group counts
         // ("5" defaults / "10" others) did not add up — verify against the build.
         Assert.assertEquals("size 9 for " + Arrays.toString(cacheURIs), 9,
                 Arrays.asList(StringUtils.join(cacheURIs, ",").split(",")).size());
         Assert.assertEquals("size 9 for " + Arrays.toString(fileClassPaths), 9,
                 Arrays.asList(StringUtils.join(fileClassPaths, ",").split(",")).size());
     } else {
         // Default jars - 5. Has guava in addition
         // There will be same entries duplicated for udf.jar and udf2.jar
         Assert.assertEquals("size 12 for " + Arrays.toString(cacheURIs), 12,
                 Arrays.asList(StringUtils.join(cacheURIs, ",").split(",")).size());
         Assert.assertEquals("size 12 for " + Arrays.toString(fileClassPaths), 12,
                 Arrays.asList(StringUtils.join(fileClassPaths, ",").split(",")).size());
     }

     // Count occurrences of the resources
     Map<String, Integer> occurrences = new HashMap<String, Integer>();

     for (URI cacheURI : cacheURIs) {
         Integer val = occurrences.get(cacheURI.toString());
         val = (val == null) ? 1 : ++val;
         occurrences.put(cacheURI.toString(), val);
     }
     if (HadoopShims.isHadoopYARN()) {
         Assert.assertEquals(9, occurrences.size());
     } else {
         Assert.assertEquals(10, occurrences.size()); //guava jar in addition
     }

     for (String file : occurrences.keySet()) {
         if (!HadoopShims.isHadoopYARN() && (file.endsWith("udf.jar") || file.endsWith("udf2.jar"))) {
             // Same path added twice which is ok. It should not be a shipped to hdfs temp path.
             // We assert path is same by checking count
             Assert.assertEquals("Two occurrences for " + file, 2, (int) occurrences.get(file));
         } else {
             // check that only single occurrence even though we added once to dist cache (simulating via Oozie)
             // and second time through pig register jar when there is symlink
             Assert.assertEquals("One occurrence for " + file, 1, (int) occurrences.get(file));
         }
     }
 }