Java Code Examples for org.apache.hadoop.filecache.DistributedCache#getCacheFiles()

The following examples show how to use org.apache.hadoop.filecache.DistributedCache#getCacheFiles(). The project and source file each snippet comes from are noted above it.
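
Before the project examples, here is a minimal, self-contained sketch of the usual round trip: a file is registered with DistributedCache.addCacheFile() and later read back with DistributedCache.getCacheFiles(). The path and the "#lookup" symlink name are placeholders chosen for illustration, not taken from any of the projects below.

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;

public class CacheFileRoundTrip {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // Register an HDFS file with the distributed cache; the optional
        // "#lookup" fragment becomes the symlink name in the task's working directory.
        DistributedCache.addCacheFile(new URI("/data/lookup.txt#lookup"), conf);

        // Later, typically in a Mapper/Reducer setup() method, the registered
        // URIs can be read back from the job configuration.
        URI[] cacheFiles = DistributedCache.getCacheFiles(conf);
        if (cacheFiles != null) {
            for (URI uri : cacheFiles) {
                System.out.println("cache file: " + uri);
            }
        }
    }
}

Note that DistributedCache is deprecated in Hadoop 2 in favour of the equivalent methods on org.apache.hadoop.mapreduce.Job (addCacheFile/getCacheFiles); the examples below come from projects that still use the classic API.
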
Example 1
Source File: CrossProductOperation.java    From incubator-retired-mrql with Apache License 2.0
@Override
protected void setup ( Context context ) throws IOException, InterruptedException {
    super.setup(context);
    try {
        conf = context.getConfiguration();
        Plan.conf = conf;
        Config.read(Plan.conf);
        Tree code = Tree.parse(conf.get("mrql.reducer"));
        reduce_fnc = functional_argument(conf,code);
        code = Tree.parse(conf.get("mrql.mapper"));
        map_fnc = functional_argument(conf,code);
        if (conf.get("mrql.zero") != null) {
            code = Tree.parse(conf.get("mrql.zero"));
            result = Interpreter.evalE(code);
            code = Tree.parse(conf.get("mrql.accumulator"));
            acc_fnc = functional_argument(conf,code);
        } else result = null;
        counter = conf.get("mrql.counter");
        // URIs registered in the distributed cache and their localized paths
        uris = DistributedCache.getCacheFiles(conf);
        local_paths = DistributedCache.getLocalCacheFiles(conf);
        index = 0;
    } catch (Exception e) {
        throw new Error("Cannot setup the crossProduct: "+e);
    }
}
 
Example 2
Source File: JobControlCompiler.java    From spork with Apache License 2.0
private static Path getExistingDistCacheFilePath(Configuration conf, URL url) throws IOException {
    URI[] cacheFileUris = DistributedCache.getCacheFiles(conf);
    if (cacheFileUris != null) {
        String fileName = url.getRef() == null ? FilenameUtils.getName(url.getPath()) : url.getRef();
        for (URI cacheFileUri : cacheFileUris) {
            Path path = new Path(cacheFileUri);
            String cacheFileName = cacheFileUri.getFragment() == null ? path.getName() : cacheFileUri.getFragment();
            // Match if:
            //   - both file names are the same and neither has a symlink, or
            //   - both symlinks are the same, or
            //   - the symlink of an existing cache file matches the name of the new
            //     file to be added. That is the case when hbase-0.98.4.jar#hbase.jar
            //     is configured via Oozie and "register hbase.jar" is done in the Pig script.
            // If two different files are symlinked to the same name, there is a conflict
            // and Hadoop itself does not guarantee which file ends up under that name,
            // so we are fine either way.
            if (fileName.equals(cacheFileName)) {
                return path;
            }
        }
    }
    return null;
}
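
The matching above hinges on how a cache entry is named: when a URI carries a "#fragment", the fragment is the symlink name the task sees; otherwise the last path component is used. A minimal sketch of that naming rule (the jar paths are made up for illustration, apart from the hbase example mentioned in the comment above):

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;

public class CacheFileNaming {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // One entry with a symlink fragment, one without.
        DistributedCache.addCacheFile(new URI("/lib/hbase-0.98.4.jar#hbase.jar"), conf);
        DistributedCache.addCacheFile(new URI("/lib/example-udf.jar"), conf);

        for (URI uri : DistributedCache.getCacheFiles(conf)) {
            // Same rule as getExistingDistCacheFilePath() above: prefer the
            // fragment, fall back to the file name of the path.
            String name = uri.getFragment() == null
                    ? new Path(uri).getName() : uri.getFragment();
            System.out.println(uri + " -> " + name);
        }
    }
}

This prints "hbase.jar" for the first entry and "example-udf.jar" for the second, which is also why two different files symlinked to the same name would conflict.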
 
Example 3
Source File: TestJobControlCompiler.java    From spork with Apache License 2.0
@Test
public void testAddArchiveToDistributedCache() throws IOException {
    final File textFile = File.createTempFile("file", ".txt");
    textFile.deleteOnExit();

    final List<File> zipArchives = createFiles(".zip");
    zipArchives.add(textFile);
    final List<File> tarArchives = createFiles(".tgz", ".tar.gz", ".tar");

    final PigServer pigServer = new PigServer(ExecType.MAPREDUCE);
    final PigContext pigContext = pigServer.getPigContext();
    pigContext.connect();
    pigContext.getProperties().put("pig.streaming.ship.files",
            StringUtils.join(zipArchives, ","));
    pigContext.getProperties().put("pig.streaming.cache.files",
            StringUtils.join(tarArchives, ","));

    final JobConf jobConf = compileTestJob(pigContext, CONF);

    // the archives should land in the archive cache; only the plain text file
    // is expected among the regular cache files
    URI[] uris = DistributedCache.getCacheFiles(jobConf);
    int sizeTxt = 0;
    for (int i = 0; i < uris.length; i++) {
        if (uris[i].toString().endsWith(".txt")) {
            sizeTxt++;
        }
    }
    Assert.assertTrue(sizeTxt == 1);
    assertFilesInDistributedCache(
            DistributedCache.getCacheArchives(jobConf), 4, ".zip", ".tgz",
            ".tar.gz", ".tar");
}
 
Example 4
Source File: TestJobControlCompiler.java    From spork with Apache License 2.0
/**
 * Tests that no duplicate jars are added to the distributed cache (which might
 * cause conflicts), using both symlinked and plain jar specifications.
 */
 @Test
 public void testNoDuplicateJarsInDistributedCache() throws Exception {

     // JobControlCompiler setup
     final PigServer pigServer = new PigServer(ExecType.MAPREDUCE);
     PigContext pigContext = pigServer.getPigContext();
     pigContext.connect();

     Configuration conf = new Configuration();
     DistributedCache.addFileToClassPath(new Path(new URI("/lib/udf-0.jar#udf.jar")), conf, FileSystem.get(conf));
     DistributedCache.addFileToClassPath(new Path(new URI("/lib/udf1.jar#diffname.jar")), conf, FileSystem.get(conf));
     DistributedCache.addFileToClassPath(new Path(new URI("/lib/udf2.jar")), conf, FileSystem.get(conf));
     createAndAddResource("udf.jar", pigContext);
     createAndAddResource("udf1.jar", pigContext);
     createAndAddResource("udf2.jar", pigContext);
     createAndAddResource("another.jar", pigContext);

     final JobConf jobConf = compileTestJob(pigContext, conf);

     // verifying the jar gets on distributed cache
     URI[] cacheURIs = DistributedCache.getCacheFiles(jobConf);
     Path[] fileClassPaths = DistributedCache.getFileClassPaths(jobConf);
     // expected: 1. udf.jar#udf.jar, 2. udf1.jar#diffname.jar, 3. udf2.jar (the same one added twice),
     // 4. another.jar and 5. udf1.jar; udf.jar must not be duplicated
     System.out.println("cache.files= " + Arrays.toString(cacheURIs));
     System.out.println("classpath.files= " + Arrays.toString(fileClassPaths));
     if (HadoopShims.isHadoopYARN()) {
         // Default jars - 4 (pig, antlr, joda-time, automaton)
         // Other jars - 5 (udf.jar#udf.jar, udf1.jar#diffname.jar, udf2.jar, udf1.jar, another.jar)
         Assert.assertEquals("size 9 for " + Arrays.toString(cacheURIs), 9,
                 Arrays.asList(StringUtils.join(cacheURIs, ",").split(",")).size());
         Assert.assertEquals("size 9 for " + Arrays.toString(fileClassPaths), 9,
                 Arrays.asList(StringUtils.join(fileClassPaths, ",").split(",")).size());
     } else {
         // Default jars - 5 (has guava in addition)
         // The same entries will be duplicated for udf.jar and udf2.jar
         Assert.assertEquals("size 12 for " + Arrays.toString(cacheURIs), 12,
                 Arrays.asList(StringUtils.join(cacheURIs, ",").split(",")).size());
         Assert.assertEquals("size 12 for " + Arrays.toString(fileClassPaths), 12,
                 Arrays.asList(StringUtils.join(fileClassPaths, ",").split(",")).size());
     }

     // Count occurrences of the resources
     Map<String, Integer> occurrences = new HashMap<String, Integer>();

     for (URI cacheURI : cacheURIs) {
         Integer val = occurrences.get(cacheURI.toString());
         val = (val == null) ? 1 : ++val;
         occurrences.put(cacheURI.toString(), val);
     }
     if (HadoopShims.isHadoopYARN()) {
         Assert.assertEquals(9, occurrences.size());
     } else {
         Assert.assertEquals(10, occurrences.size()); //guava jar in addition
     }

     for (String file : occurrences.keySet()) {
         if (!HadoopShims.isHadoopYARN() && (file.endsWith("udf.jar") || file.endsWith("udf2.jar"))) {
             // The same path was added twice, which is OK. It should not be shipped
             // to an HDFS temp path. We assert the path is the same by checking the count.
             Assert.assertEquals("Two occurrences for " + file, 2, (int) occurrences.get(file));
         } else {
             // check that there is only a single occurrence even though the jar was added
             // once to the distributed cache (simulating Oozie) and a second time through
             // Pig's register jar when there is a symlink
             Assert.assertEquals("One occurrence for " + file, 1, (int) occurrences.get(file));
         }
     }
 }
 
Example 5
Source File: Submitter.java    From RDFS with Apache License 2.0
private static void setupPipesJob(JobConf conf) throws IOException {
  // default map output types to Text
  if (!getIsJavaMapper(conf)) {
    conf.setMapRunnerClass(PipesMapRunner.class);
    // Save the user's partitioner and hook in ours.
    setJavaPartitioner(conf, conf.getPartitionerClass());
    conf.setPartitionerClass(PipesPartitioner.class);
  }
  if (!getIsJavaReducer(conf)) {
    conf.setReducerClass(PipesReducer.class);
    if (!getIsJavaRecordWriter(conf)) {
      conf.setOutputFormat(NullOutputFormat.class);
    }
  }
  String textClassname = Text.class.getName();
  setIfUnset(conf, "mapred.mapoutput.key.class", textClassname);
  setIfUnset(conf, "mapred.mapoutput.value.class", textClassname);
  setIfUnset(conf, "mapred.output.key.class", textClassname);
  setIfUnset(conf, "mapred.output.value.class", textClassname);
  
  // Use PipesNonJavaInputFormat if necessary to handle progress reporting
  // from C++ RecordReaders ...
  if (!getIsJavaRecordReader(conf) && !getIsJavaMapper(conf)) {
    conf.setClass("mapred.pipes.user.inputformat", 
                  conf.getInputFormat().getClass(), InputFormat.class);
    conf.setInputFormat(PipesNonJavaInputFormat.class);
  }
  
  String exec = getExecutable(conf);
  if (exec == null) {
    throw new IllegalArgumentException("No application program defined.");
  }
  // add default debug script only when executable is expressed as
  // <path>#<executable>
  if (exec.contains("#")) {
    DistributedCache.createSymlink(conf);
    // set default gdb commands for map and reduce task 
    String defScript = "$HADOOP_HOME/src/c++/pipes/debug/pipes-default-script";
    setIfUnset(conf,"mapred.map.task.debug.script",defScript);
    setIfUnset(conf,"mapred.reduce.task.debug.script",defScript);
  }
  // prepend the executable's URI to whatever is already in the file cache
  URI[] fileCache = DistributedCache.getCacheFiles(conf);
  if (fileCache == null) {
    fileCache = new URI[1];
  } else {
    URI[] tmp = new URI[fileCache.length+1];
    System.arraycopy(fileCache, 0, tmp, 1, fileCache.length);
    fileCache = tmp;
  }
  try {
    fileCache[0] = new URI(exec);
  } catch (URISyntaxException e) {
    IOException ie = new IOException("Problem parsing execable URI " + exec);
    ie.initCause(e);
    throw ie;
  }
  DistributedCache.setCacheFiles(fileCache, conf);
}
 
Example 6
Source File: HadoopIOUtils.java    From elasticsearch-hadoop with Apache License 2.0
public static InputStream open(String resource, Configuration conf) {
    ClassLoader loader = conf.getClassLoader();

    if (loader == null) {
        loader = Thread.currentThread().getContextClassLoader();
    }

    if (loader == null) {
        loader = HadoopIOUtils.class.getClassLoader();
    }

    boolean trace = log.isTraceEnabled();

    try {
        // no prefix means classpath
        if (!resource.contains(":")) {

            InputStream result = loader.getResourceAsStream(resource);
            if (result != null) {
                if (trace) {
                    log.trace(String.format("Loaded resource %s from classpath", resource));
                }
                return result;
            }
            // fall back to the distributed cache
            URI[] uris = DistributedCache.getCacheFiles(conf);
            if (uris != null) {
                for (URI uri : uris) {
                    if (uri.toString().contains(resource)) {
                        if (trace) {
                            log.trace(String.format("Loaded resource %s from distributed cache", resource));
                        }
                        return uri.toURL().openStream();
                    }
                }
            }
        }

        // fall back to file system
        Path p = new Path(resource);
        FileSystem fs = p.getFileSystem(conf);
        return fs.open(p);
    } catch (IOException ex) {
        throw new EsHadoopIllegalArgumentException(String.format("Cannot open stream for resource %s", resource));
    }
}