Java Code Examples for org.apache.hadoop.filecache.DistributedCache

The following examples show how to use org.apache.hadoop.filecache.DistributedCache. They are extracted from open source projects; the source project, source file, and license are listed above each example.
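DistributedCache usage has two sides: the driver registers files with the job configuration before submission, and each task reads the locally materialized copies, typically once in setup(). The sketch below illustrates that pattern in one place; the lookup file path, the "#lookup" symlink name, and the class names are hypothetical placeholders rather than code from any of the projects listed here. (Newer Hadoop releases deprecate this class in favour of methods on Job, but the static API shown matches the examples on this page.)

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;

public class DistributedCacheSketch {

    // Driver side: register an HDFS file with the cache before the job is submitted.
    public static Job createJob(Configuration conf) throws IOException {
        // Hypothetical HDFS path; the "#lookup" fragment names the symlink created on each node.
        DistributedCache.addCacheFile(URI.create("/data/lookup.txt#lookup"), conf);
        DistributedCache.createSymlink(conf);
        return new Job(conf, "distributed-cache-sketch");
    }

    // Task side: the framework copies each cached file onto the local disk of every node,
    // so it can be read once in setup() rather than once per record.
    public static class CacheAwareMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

        @Override
        protected void setup(Context context) throws IOException {
            Path[] localFiles = DistributedCache.getLocalCacheFiles(context.getConfiguration());
            if (localFiles == null || localFiles.length == 0) {
                return; // nothing was cached for this job
            }
            BufferedReader reader = new BufferedReader(new FileReader(localFiles[0].toString()));
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    // parse the line and populate an in-memory lookup structure here
                }
            } finally {
                reader.close();
            }
        }
    }
}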
Example 1
Source Project: BigData-In-Practice   Source File: MapJoin.java    License: Apache License 2.0
@Override
protected void setup(Mapper<LongWritable, Text, NullWritable, Emp_Dep>.Context context) throws IOException, InterruptedException {
    // During setup, load the file to be joined into memory from the distributed cache
    Path[] paths = DistributedCache.getLocalCacheFiles(context.getConfiguration());
    // Only one file was cached here, so take the first entry and create a BufferedReader to read it
    BufferedReader reader = new BufferedReader(new FileReader(paths[0].toString()));

    String str = null;
    try {
        // Read line by line
        while ((str = reader.readLine()) != null) {
            // Split each record of the cached table
            String[] splits = str.split("\t");
            // Store the useful fields from the split array in a Map
            joinData.put(Integer.parseInt(splits[0]), splits[1]);
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        reader.close();
    }

}
 
Example 2
@Override
protected void setup ( Context context ) throws IOException,InterruptedException {
    super.setup(context);
    try {
        conf = context.getConfiguration();
        Plan.conf = conf;
        Config.read(Plan.conf);
        Tree code = Tree.parse(conf.get("mrql.reducer"));
        reduce_fnc = functional_argument(conf,code);
        code = Tree.parse(conf.get("mrql.mapper"));
        map_fnc = functional_argument(conf,code);
        if (conf.get("mrql.zero") != null) {
            code = Tree.parse(conf.get("mrql.zero"));
            result = Interpreter.evalE(code);
            code = Tree.parse(conf.get("mrql.accumulator"));
            acc_fnc = functional_argument(conf,code);
        } else result = null;
        counter = conf.get("mrql.counter");
        uris = DistributedCache.getCacheFiles(conf);
        local_paths = DistributedCache.getLocalCacheFiles(conf);
        index = 0;
    } catch (Exception e) {
        throw new Error("Cannot setup the crossProduct: "+e);
    }
}
 
Example 3
Source Project: ml-ease   Source File: AvroDistributedCacheFileReader.java    License: Apache License 2.0
@Override
protected List<Path> getPaths(String filePath) throws IOException
{
  Path[] localFiles = DistributedCache.getLocalCacheFiles(getConf());
  List<Path> paths = new ArrayList<Path>();
  
  for (Path file: localFiles)
  {
    if (!file.toString().contains(filePath))
    {
      continue;
    }
    
    paths.add(file);
  }
    
  return paths;
}
 
Example 4
Source Project: examples   Source File: SolrOutputFormat.java    License: Apache License 2.0
public static void addSolrConfToDistributedCache(Job job, File solrHomeZip)
    throws IOException {
  // Make a reasonably unique name for the zip file in the distributed cache
  // to avoid collisions if multiple jobs are running.
  String hdfsZipName = UUID.randomUUID().toString() + '.'
      + ZIP_FILE_BASE_NAME;
  Configuration jobConf = job.getConfiguration();
  jobConf.set(ZIP_NAME, hdfsZipName);

  Path zipPath = new Path("/tmp", getZipName(jobConf));
  FileSystem fs = FileSystem.get(jobConf);
  fs.copyFromLocalFile(new Path(solrHomeZip.toString()), zipPath);
  final URI baseZipUrl = fs.getUri().resolve(
      zipPath.toString() + '#' + getZipName(jobConf));

  DistributedCache.addCacheArchive(baseZipUrl, jobConf);
  LOG.debug("Set Solr distributed cache: {}", Arrays.asList(job.getCacheArchives()));
  LOG.debug("Set zipPath: {}", zipPath);
  // Actually send the path for the configuration zip file
  jobConf.set(SETUP_OK, zipPath.toString());
}
 
Example 5
Source Project: spork   Source File: JobControlCompiler.java    License: Apache License 2.0
private static Path getExistingDistCacheFilePath(Configuration conf, URL url) throws IOException {
    URI[] cacheFileUris = DistributedCache.getCacheFiles(conf);
    if (cacheFileUris != null) {
        String fileName = url.getRef() == null ? FilenameUtils.getName(url.getPath()) : url.getRef();
        for (URI cacheFileUri : cacheFileUris) {
            Path path = new Path(cacheFileUri);
            String cacheFileName = cacheFileUri.getFragment() == null ? path.getName() : cacheFileUri.getFragment();
            // Match
            //     - if both filenames are same and no symlinks (or)
            //     - if both symlinks are same (or)
            //     - symlink of existing cache file is same as the name of the new file to be added.
            //         That would be the case when hbase-0.98.4.jar#hbase.jar is configured via Oozie
            // and register hbase.jar is done in the pig script.
            // If two different files are symlinked to the same name, then there is a conflict
            // and hadoop itself does not guarantee which file will be symlinked to that name.
            // So we are good.
            if (fileName.equals(cacheFileName)) {
                return path;
            }
        }
    }
    return null;
}
 
Example 6
Source Project: spork   Source File: L2.java    License: Apache License 2.0
public void configure(JobConf conf) {
    try {
        Path[] paths = DistributedCache.getLocalCacheFiles(conf);
        if (paths == null || paths.length < 1) {
            throw new RuntimeException("DistributedCache no work.");
        }

        // Open the small table
        BufferedReader reader =
            new BufferedReader(new InputStreamReader(new
            FileInputStream(paths[0].toString())));
        String line;
        hash = new HashSet<String>(500);
        while ((line = reader.readLine()) != null) {
            if (line.length() < 1) continue;
            String[] fields = line.split("\u0001");
            hash.add(fields[0]);
        }
        reader.close();
    } catch (IOException ioe) {
        throw new RuntimeException(ioe);
    }
}
 
Example 7
public void setup(Context context) throws IOException,
		InterruptedException {
	Path[] files = DistributedCache.getLocalCacheFiles(context
			.getConfiguration());
	// Read all files in the DistributedCache
	for (Path p : files) {
		BufferedReader rdr = new BufferedReader(new InputStreamReader(
				new GZIPInputStream(new FileInputStream(new File(
						p.toString())))));
		String line = null;
		// For each record in the user file
		while ((line = rdr.readLine()) != null) {
			// Get the user ID for this record
			Map<String, String> parsed = MRDPUtils
					.transformXmlToMap(line);
			String userId = parsed.get("Id");
			// Map the user ID to the record
			userIdToInfo.put(userId, line);
		}
		rdr.close();
	}
	// Get the join type from the configuration
	joinType = context.getConfiguration().get("join.type");
}
 
Example 8
Source Project: hadoop-map-reduce-patterns   Source File: BloomFilter.java    License: Apache License 2.0
@Override
public int run(String[] args) throws Exception {
	Configuration conf = new Configuration();
	GenericOptionsParser parser = new GenericOptionsParser(conf, args);
	String[] otherArgs = parser.getRemainingArgs();
	if (otherArgs.length != 3) {
		System.err
				.println("Usage: BloomFilter <bloom_filter_file> <in> <out>");
		ToolRunner.printGenericCommandUsage(System.err);
		System.exit(2);
	}

	DistributedCache.addCacheFile(new URI(otherArgs[0]), conf);
	Job job = new Job(conf, "Bloom Filter");
	job.setJarByClass(BloomFilter.class);
	job.setMapperClass(BloomFilterMapper.class);
	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(NullWritable.class);
	FileInputFormat.addInputPath(job, new Path(otherArgs[1]));
	FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));
	boolean success = job.waitForCompletion(true);

	return success ? 0 : 1;
}
 
Example 9
Source Project: elasticsearch-hadoop   Source File: HiveSuite.java    License: Apache License 2.0
@SuppressWarnings("deprecation")
@BeforeClass
public static void setup() throws Exception {
    if (!isLocal) {
        hadoopConfig = HdpBootstrap.hadoopConfig();

        HdfsUtils.copyFromLocal(Provisioner.ESHADOOP_TESTING_JAR, Provisioner.HDFS_ES_HDP_LIB);
        hdfsEsLib = HdfsUtils.qualify(Provisioner.HDFS_ES_HDP_LIB, hadoopConfig);
        // copy jar to DistributedCache
        try {
            DistributedCache.addArchiveToClassPath(new Path(Provisioner.HDFS_ES_HDP_LIB), hadoopConfig);
        } catch (IOException ex) {
            throw new RuntimeException("Cannot provision Hive", ex);
        }

        hdfsResource = "/eshdp/hive/hive-compund.dat";
        HdfsUtils.copyFromLocal(originalResource, hdfsResource);
        hdfsResource = HdfsUtils.qualify(hdfsResource, hadoopConfig);

        hdfsJsonResource = "/eshdp/hive/hive-compund.json";
        HdfsUtils.copyFromLocal(originalResource, hdfsJsonResource);
        hdfsJsonResource = HdfsUtils.qualify(hdfsJsonResource, hadoopConfig);
    }
}
 
Example 10
Source Project: hadoop-book   Source File: TeraSort.java    License: Apache License 2.0
public int run(String[] args) throws Exception {
  LOG.info("starting");
  JobConf job = (JobConf) getConf();
  Path inputDir = new Path(args[0]);
  inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
  Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
  URI partitionUri = new URI(partitionFile.toString() +
                             "#" + TeraInputFormat.PARTITION_FILENAME);
  TeraInputFormat.setInputPaths(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.setJobName("TeraSort");
  job.setJarByClass(TeraSort.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setInputFormat(TeraInputFormat.class);
  job.setOutputFormat(TeraOutputFormat.class);
  job.setPartitionerClass(TotalOrderPartitioner.class);
  TeraInputFormat.writePartitionFile(job, partitionFile);
  DistributedCache.addCacheFile(partitionUri, job);
  DistributedCache.createSymlink(job);
  job.setInt("dfs.replication", 1);
  TeraOutputFormat.setFinalSync(job, true);
  JobClient.runJob(job);
  LOG.info("done");
  return 0;
}
 
Example 11
Source Project: hadoop-book   Source File: MapFeatures.java    License: Apache License 2.0
@Override
public void configure(JobConf job) {
    caseSensitive = job.getBoolean("wordcount.case.sensitive", true);
    inputFile = job.get("map.input.file");

    if (job.getBoolean("wordcount.skip.patterns", false)) {
        Path[] patternsFiles = new Path[0];
        try {
            patternsFiles = DistributedCache.getLocalCacheFiles(job);
        } catch (IOException ioe) {
            System.err.println("Caught exception getting cached files: "
                    + StringUtils.stringifyException(ioe));
        }
        for (Path patternsFile : patternsFiles) {
            parseSkipFile(patternsFile);
        }
    }
}
 
Example 12
Source Project: multimedia-indexing   Source File: VisualJob.java    License: Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    String inputPath = args[0];
    String outputPath = args[1];
    if (!IS_LOCAL & args.length >= 3) {
        String configFile = args[2];
        if (configFile != null) {
            getConf().addResource(configFile);
        }
        //The learning files have to be uploaded to the s3 bucket first
        //Then when starting the job, they have to be added to the hadoop distributed cache
        DistributedCache.addCacheFile(new URI("s3n://gr-mklab/learningfiles/surf_l2_128c_0.csv#surf_l2_128c_0.csv"), getConf());
        DistributedCache.addCacheFile(new URI("s3n://gr-mklab/learningfiles/surf_l2_128c_1.csv#surf_l2_128c_1.csv"), getConf());
        DistributedCache.addCacheFile(new URI("s3n://gr-mklab/learningfiles/surf_l2_128c_2.csv#surf_l2_128c_2.csv"), getConf());
        DistributedCache.addCacheFile(new URI("s3n://gr-mklab/learningfiles/surf_l2_128c_3.csv#surf_l2_128c_3.csv"), getConf());
        DistributedCache.addCacheFile(new URI("s3n://gr-mklab/learningfiles/pca_surf_4x128_32768to1024.txt#pca_surf_4x128_32768to1024.txt"), getConf());
        DistributedCache.addCacheFile(new URI("s3n://gr-mklab/learningfiles/qcoarse_1024d_8192k.csv#qcoarse_1024d_8192k.csv"), getConf());
        DistributedCache.addCacheFile(new URI("s3n://gr-mklab/learningfiles/pq_1024_64x8_rp_ivf_8192k.csv#pq_1024_64x8_rp_ivf_8192k.csv"), getConf());
    }

    Job job = createJob(inputPath, outputPath);
    return job.waitForCompletion(true) ? 0 : -1;
}
 
Example 13
Source Project: SpyGlass   Source File: JobLibLoader.java    License: Apache License 2.0
public static void loadJars(String libPathStr, Configuration config) {
	
	try {
		Path libPath = new Path(libPathStr);

		FileSystem fs = FileSystem.get(config);

		RemoteIterator<LocatedFileStatus> itr = fs.listFiles(libPath, true);

		while (itr.hasNext()) {
			LocatedFileStatus f = itr.next();

			if (!f.isDirectory() && f.getPath().getName().endsWith("jar")) {
				logger.info("Loading Jar : " + f.getPath().getName());
				DistributedCache.addFileToClassPath(f.getPath(), config);
			}
		}
	} catch (Exception e) {
		e.printStackTrace();
		logger.error(e.toString());
	}
}
 
Example 14
Source Project: hadoop-gpu   Source File: TeraSort.java    License: Apache License 2.0
public int run(String[] args) throws Exception {
  LOG.info("starting");
  JobConf job = (JobConf) getConf();
  Path inputDir = new Path(args[0]);
  inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
  Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
  URI partitionUri = new URI(partitionFile.toString() +
                             "#" + TeraInputFormat.PARTITION_FILENAME);
  TeraInputFormat.setInputPaths(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.setJobName("TeraSort");
  job.setJarByClass(TeraSort.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setInputFormat(TeraInputFormat.class);
  job.setOutputFormat(TeraOutputFormat.class);
  job.setPartitionerClass(TotalOrderPartitioner.class);
  TeraInputFormat.writePartitionFile(job, partitionFile);
  DistributedCache.addCacheFile(partitionUri, job);
  DistributedCache.createSymlink(job);
  job.setInt("dfs.replication", 1);
  TeraOutputFormat.setFinalSync(job, true);
  JobClient.runJob(job);
  LOG.info("done");
  return 0;
}
 
Example 15
Source Project: Cubert   Source File: JobExecutor.java    License: Apache License 2.0
protected void cacheFiles() throws URISyntaxException,
        IOException
{
    if (!root.has("cachedFiles") || root.get("cachedFiles").isNull()
            || root.get("cachedFiles").size() == 0)
        return;

    for (JsonNode cachedFile : root.path("cachedFiles"))
    {
        URI uri = new URI(cachedFile.getTextValue());
        print.f("CACHING file %s", uri);
        DistributedCache.addCacheFile(uri, conf);
    }
}
 
Example 16
Source Project: ml-ease   Source File: AvroUtils.java    License: Apache License 2.0
/**
 * Given a path to an output folder, it finds the existing "*.avro" files and adds 
 * them as cache files to be distributed. Throws an exception if no files are found/added.
 * 
 * @param conf Job configuration
 * @param outPath The path to the hdfs directory that has part files to cache
 * @throws Exception If no file is found at outPath throws a RuntimeException 
 */
public static void addAvroCacheFiles(JobConf conf, Path outPath) throws Exception
{
   FileStatus[] partFiles = getAvroPartFiles(conf, outPath);
   if (partFiles.length == 0)
   {      
     throw new RuntimeException("DistributedCacheFileUtils: No (part) file is found to cache at location:" + outPath );
   }
   
   for (FileStatus partFile : partFiles)
   {
     // add the file and set fileRead to true, since we have read at least one file
     DistributedCache.addCacheFile(partFile.getPath().toUri(), conf);
   }
 }
 
Example 17
Source Project: incubator-gobblin   Source File: MRCompactorJobRunner.java    License: Apache License 2.0
private void addJars(Configuration conf) throws IOException {
  if (!this.dataset.jobProps().contains(MRCompactor.COMPACTION_JARS)) {
    return;
  }
  Path jarFileDir = new Path(this.dataset.jobProps().getProp(MRCompactor.COMPACTION_JARS));
  for (FileStatus status : this.fs.listStatus(jarFileDir)) {
    DistributedCache.addFileToClassPath(status.getPath(), conf, this.fs);
  }
}
 
Example 18
protected void addJars(Configuration conf, State state, FileSystem fs) throws IOException {
  if (!state.contains(MRCompactor.COMPACTION_JARS)) {
    return;
  }
  Path jarFileDir = new Path(state.getProp(MRCompactor.COMPACTION_JARS));
  for (FileStatus status : fs.listStatus(jarFileDir)) {
    DistributedCache.addFileToClassPath(status.getPath(), conf, fs);
  }
}
 
Example 19
Source Project: hiped2   Source File: BloomJoin.java    License: Apache License 2.0
@Override
protected void setup(
    Context context)
    throws IOException, InterruptedException {

  Path[] files = DistributedCache.getLocalCacheFiles(context.getConfiguration());
  filter = BloomFilterDumper.fromFile(
      new File(files[0].toString()));

  System.out.println("Filter = " + filter);
}
 
Example 20
Source Project: hiped2   Source File: FinalJoinJob.java    License: Apache License 2.0
public static void runJob(Configuration conf,
                          Path userLogsPath,
                          Path usersPath,
                          Path outputPath)
    throws Exception {

  FileSystem fs = usersPath.getFileSystem(conf);

  FileStatus usersStatus = fs.getFileStatus(usersPath);

  if (usersStatus.isDir()) {
    for (FileStatus f : fs.listStatus(usersPath)) {
      if (f.getPath().getName().startsWith("part")) {
        DistributedCache.addCacheFile(f.getPath().toUri(), conf);
      }
    }
  } else {
    DistributedCache.addCacheFile(usersPath.toUri(), conf);
  }

  Job job = new Job(conf);

  job.setJarByClass(FinalJoinJob.class);
  job.setMapperClass(GenericReplicatedJoin.class);

  job.setNumReduceTasks(0);

  job.setInputFormatClass(KeyValueTextInputFormat.class);

  outputPath.getFileSystem(conf).delete(outputPath, true);

  FileInputFormat.setInputPaths(job, userLogsPath);
  FileOutputFormat.setOutputPath(job, outputPath);

  if (!job.waitForCompletion(true)) {
    throw new Exception("Job failed");
  }
}
 
Example 21
Source Project: hiped2   Source File: ReplicatedFilterJob.java    License: Apache License 2.0
public static void runJob(Configuration conf,
                          Path usersPath,
                          Path uniqueUsersPath,
                          Path outputPath)
    throws Exception {

  FileSystem fs = uniqueUsersPath.getFileSystem(conf);

  FileStatus uniqueUserStatus = fs.getFileStatus(uniqueUsersPath);

  if (uniqueUserStatus.isDir()) {
    for (FileStatus f : fs.listStatus(uniqueUsersPath)) {
      if (f.getPath().getName().startsWith("part")) {
        DistributedCache.addCacheFile(f.getPath().toUri(), conf);
      }
    }
  } else {
    DistributedCache.addCacheFile(uniqueUsersPath.toUri(), conf);
  }

  Job job = new Job(conf);

  job.setJarByClass(ReplicatedFilterJob.class);
  job.setMapperClass(ReplicatedFilterJob.class);

  job.setNumReduceTasks(0);

  job.setInputFormatClass(KeyValueTextInputFormat.class);

  outputPath.getFileSystem(conf).delete(outputPath, true);

  FileInputFormat.setInputPaths(job, usersPath);
  FileOutputFormat.setOutputPath(job, outputPath);

  if (!job.waitForCompletion(true)) {
    throw new Exception("Job failed");
  }
}
 
Example 22
Source Project: hiped2   Source File: Main.java    License: Apache License 2.0
public static void runJob(Path inputPath,
                          Path smallFilePath,
                          Path outputPath)
    throws Exception {

  Configuration conf = new Configuration();

  FileSystem fs = smallFilePath.getFileSystem(conf);

  FileStatus smallFilePathStatus = fs.getFileStatus(smallFilePath);

  if (smallFilePathStatus.isDir()) {
    for (FileStatus f : fs.listStatus(smallFilePath)) {
      if (f.getPath().getName().startsWith("part")) {
        DistributedCache.addCacheFile(f.getPath().toUri(), conf);
      }
    }
  } else {
    DistributedCache.addCacheFile(smallFilePath.toUri(), conf);
  }

  Job job = new Job(conf);

  job.setJarByClass(Main.class);
  job.setMapperClass(GenericReplicatedJoin.class);

  job.setInputFormatClass(KeyValueTextInputFormat.class);

  job.setNumReduceTasks(0);

  outputPath.getFileSystem(conf).delete(outputPath, true);

  FileInputFormat.setInputPaths(job, inputPath);
  FileOutputFormat.setOutputPath(job, outputPath);

  job.waitForCompletion(true);
}
 
Example 23
Source Project: spork   Source File: JobControlCompiler.java    License: Apache License 2.0
private static void addToDistributedCache(URI uri, Configuration conf) {
    if (DISTRIBUTED_CACHE_ARCHIVE_MATCHER.reset(uri.toString()).find()) {
        DistributedCache.addCacheArchive(uri, conf);
    } else {
        DistributedCache.addCacheFile(uri, conf);
    }
}
 
Example 24
Source Project: spork   Source File: JobControlCompiler.java    License: Apache License 2.0
/**
 * If the url is not in HDFS, copies the file from the local file system to HDFS before adding it to the distributed cache
 * @param pigContext the pigContext
 * @param conf the job conf
 * @param url the url to be added to distributed cache
 * @return the path as seen on distributed cache
 * @throws IOException
 */
@SuppressWarnings("deprecation")
private static void putJarOnClassPathThroughDistributedCache(
        PigContext pigContext,
        Configuration conf,
        URL url) throws IOException {

    // Turn on the symlink feature
    DistributedCache.createSymlink(conf);

    Path distCachePath = getExistingDistCacheFilePath(conf, url);
    if (distCachePath != null) {
        log.info("Jar file " + url + " already in DistributedCache as "
                + distCachePath + ". Not copying to hdfs and adding again");
        // Path already in dist cache
        if (!HadoopShims.isHadoopYARN()) {
            // Mapreduce in YARN includes $PWD/* which will add all *.jar files in classpath.
            // So don't have to ensure that the jar is separately added to mapreduce.job.classpath.files
            // But path may only be in 'mapred.cache.files' and not be in
            // 'mapreduce.job.classpath.files' in Hadoop 1.x. So adding it there
            DistributedCache.addFileToClassPath(distCachePath, conf, distCachePath.getFileSystem(conf));
        }
    }
    else {
        // REGISTER always copies locally the jar file. see PigServer.registerJar()
        Path pathInHDFS = shipToHDFS(pigContext, conf, url);
        DistributedCache.addFileToClassPath(pathInHDFS, conf, FileSystem.get(conf));
        log.info("Added jar " + url + " to DistributedCache through " + pathInHDFS);
    }

}
 
Example 25
Source Project: spork   Source File: TestJobControlCompiler.java    License: Apache License 2.0
/**
 * specifically tests that REGISTERED jars get added to distributed cache
 * @throws Exception
 */
@Test
public void testJarAddedToDistributedCache() throws Exception {

  // creating a jar with a UDF *not* in the current classloader
  File tmpFile = File.createTempFile("Some_", ".jar");
  tmpFile.deleteOnExit();
  String className = createTestJar(tmpFile);
  final String testUDFFileName = className+".class";

  // JobControlCompiler setup
  PigServer pigServer = new PigServer(ExecType.MAPREDUCE);
  PigContext pigContext = pigServer.getPigContext();
  pigContext.connect();
  pigContext.addJar(tmpFile.getAbsolutePath());
  JobControlCompiler jobControlCompiler = new JobControlCompiler(pigContext, CONF);
  MROperPlan plan = new MROperPlan();
  MapReduceOper mro = new MapReduceOper(new OperatorKey());
  mro.UDFs = new HashSet<String>();
  mro.UDFs.add(className+"()");
  plan.add(mro);

  // compiling the job
  JobControl jobControl = jobControlCompiler.compile(plan , "test");
  JobConf jobConf = jobControl.getWaitingJobs().get(0).getJobConf();

  // verifying the jar gets on distributed cache
  Path[] fileClassPaths = DistributedCache.getFileClassPaths(jobConf);
  // guava jar is not shipped with Hadoop 2.x
  Assert.assertEquals("size for "+Arrays.toString(fileClassPaths), HadoopShims.isHadoopYARN() ? 5 : 6, fileClassPaths.length);
  Path distributedCachePath = fileClassPaths[0];
  Assert.assertEquals("ends with jar name: "+distributedCachePath, distributedCachePath.getName(), tmpFile.getName());
  // hadoop bug requires path to not contain hdfs://hostname in front
  Assert.assertTrue("starts with /: "+distributedCachePath,
      distributedCachePath.toString().startsWith("/"));
  Assert.assertTrue("jar pushed to distributed cache should contain testUDF",
      jarContainsFileNamed(new File(fileClassPaths[0].toUri().getPath()), testUDFFileName));
}
 
Example 26
Source Project: spork   Source File: TestJobControlCompiler.java    License: Apache License 2.0
@Test
public void testAddArchiveToDistributedCache() throws IOException {
    final File textFile = File.createTempFile("file", ".txt");
    textFile.deleteOnExit();

    final List<File> zipArchives = createFiles(".zip");
    zipArchives.add(textFile);
    final List<File> tarArchives = createFiles(".tgz", ".tar.gz", ".tar");

    final PigServer pigServer = new PigServer(ExecType.MAPREDUCE);
    final PigContext pigContext = pigServer.getPigContext();
    pigContext.connect();
    pigContext.getProperties().put("pig.streaming.ship.files",
            StringUtils.join(zipArchives, ","));
    pigContext.getProperties().put("pig.streaming.cache.files",
            StringUtils.join(tarArchives, ","));

    final JobConf jobConf = compileTestJob(pigContext, CONF);

    URI[] uris = DistributedCache.getCacheFiles(jobConf);
    int sizeTxt = 0;
    for (int i = 0; i < uris.length; i++) {
        if (uris[i].toString().endsWith(".txt")) {
            sizeTxt++;
        }
    }
    Assert.assertTrue(sizeTxt == 1);
    assertFilesInDistributedCache(
            DistributedCache.getCacheArchives(jobConf), 4, ".zip", ".tgz",
            ".tar.gz", ".tar");
}
 
Example 27
Source Project: datafu   Source File: DistributedCacheHelper.java    License: Apache License 2.0
/**
 * Deserializes an object from a path in HDFS.
 * 
 * @param conf Hadoop configuration
 * @param path Path to deserialize from
 * @return Deserialized object
 * @throws IOException IOException
 */
public static Object readObject(Configuration conf, org.apache.hadoop.fs.Path path) throws IOException
{
  String localPath = null;
  Path[] localCacheFiles = DistributedCache.getLocalCacheFiles(conf);
  for (Path localCacheFile : localCacheFiles)
  {
    if (localCacheFile.getName().endsWith(path.getName()))
    {
      localPath = localCacheFile.getName();
      break;
    }
  }
  if (localPath == null)
  {
    throw new RuntimeException("Could not find " + path + " in local cache");
  }
  FileInputStream inputStream = new FileInputStream(new File(localPath));
  ObjectInputStream objStream = new ObjectInputStream(inputStream);
  
  try
  {
    try {
      return objStream.readObject();
    } catch (ClassNotFoundException e) {
      throw new RuntimeException(e);
    }
  }
  finally
  {
    objStream.close();
    inputStream.close();
  }
}
 
Example 28
Source Project: mrgeo   Source File: AccumuloMrGeoRangePartitioner.java    License: Apache License 2.0
/**
 * Sets the hdfs file name to use, containing a newline separated list of Base64 encoded split points that represent ranges for partitioning
 */
public static void setSplitFile(JobContext job, String file)
{
  URI uri = new Path(file).toUri();
  DistributedCache.addCacheFile(uri, job.getConfiguration());
  job.getConfiguration().set(CUTFILE_KEY, uri.getPath());
}
 
Example 29
Source Project: mrgeo   Source File: DependencyLoader.java    License: Apache License 2.0
private static void addFileToClasspath(Configuration conf, Set<String> existing, FileSystem fs, Path hdfsBase,
    File file) throws IOException
{
  Path hdfsPath = new Path(hdfsBase, file.getName());
  if (!existing.contains(hdfsPath.toString()))
  {
    if (fs.exists(hdfsPath))
    {
      // check the timestamp and exit if the one in hdfs is "newer"
      FileStatus status = fs.getFileStatus(hdfsPath);

      if (file.lastModified() <= status.getModificationTime())
      {
        log.debug(file.getPath() + " up to date");
        DistributedCache.addFileToClassPath(hdfsPath, conf, fs);

        existing.add(hdfsPath.toString());
        return;
      }
    }

    // copy the file...
    log.debug("Copying " + file.getPath() + " to HDFS for distribution");

    fs.copyFromLocalFile(new Path(file.getCanonicalFile().toURI()), hdfsPath);
    DistributedCache.addFileToClassPath(hdfsPath, conf, fs);
    existing.add(hdfsPath.toString());
  }
}
 
Example 30
Source Project: RDFS   Source File: TeraSort.java    License: Apache License 2.0
public int run(String[] args) throws Exception {
  LOG.info("starting");
  JobConf job = (JobConf) getConf();
  Path inputDir = new Path(args[0]);
  inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
  Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
  URI partitionUri = new URI(partitionFile.toString() +
                             "#" + TeraInputFormat.PARTITION_FILENAME);
  TeraInputFormat.setInputPaths(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.setJobName("TeraSort");
  job.setJarByClass(TeraSort.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setInputFormat(TeraInputFormat.class);
  job.setOutputFormat(TeraOutputFormat.class);
  job.setPartitionerClass(TotalOrderPartitioner.class);
  TeraInputFormat.writePartitionFile(job, partitionFile);
  DistributedCache.addCacheFile(partitionUri, job);
  DistributedCache.createSymlink(job);
  job.setInt("dfs.replication", 1);
  TeraOutputFormat.setFinalSync(job, true);
  long startTime = System.currentTimeMillis();
  JobClient.runJob(job);
  long endTime = System.currentTimeMillis();
  System.out.println((float)(endTime-startTime)/1000);
  LOG.info("done");
  return 0;
}