Java Code Examples for org.apache.hadoop.filecache.DistributedCache

The following examples show how to use org.apache.hadoop.filecache.DistributedCache. They are extracted from open source projects; the source project, source file, and license are listed above each example.
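DistributedCache usage has two sides: the driver registers files with the job configuration before submission, and each task reads the locally materialized copies, typically once in setup(). The sketch below illustrates that pattern in one place; the lookup file path, the "#lookup" symlink name, and the class names are hypothetical placeholders rather than code from any of the projects listed here. (Newer Hadoop releases deprecate this class in favour of methods on Job, but the static API shown matches the examples on this page.)

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;

public class DistributedCacheSketch {

    // Driver side: register an HDFS file with the cache before the job is submitted.
    public static Job createJob(Configuration conf) throws IOException {
        // Hypothetical HDFS path; the "#lookup" fragment names the symlink created on each node.
        DistributedCache.addCacheFile(URI.create("/data/lookup.txt#lookup"), conf);
        DistributedCache.createSymlink(conf);
        return new Job(conf, "distributed-cache-sketch");
    }

    // Task side: the framework copies each cached file onto the local disk of every node,
    // so it can be read once in setup() rather than once per record.
    public static class CacheAwareMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

        @Override
        protected void setup(Context context) throws IOException {
            Path[] localFiles = DistributedCache.getLocalCacheFiles(context.getConfiguration());
            if (localFiles == null || localFiles.length == 0) {
                return; // nothing was cached for this job
            }
            BufferedReader reader = new BufferedReader(new FileReader(localFiles[0].toString()));
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    // parse the line and populate an in-memory lookup structure here
                }
            } finally {
                reader.close();
            }
        }
    }
}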
Example 1
Source Project: BigData-In-Practice   Source File: MapJoin.java    License: Apache License 2.0
@Override
protected void setup(Mapper<LongWritable, Text, NullWritable, Emp_Dep>.Context context) throws IOException, InterruptedException {
    // During setup, load the file to be joined into memory from the distributed cache
    Path[] paths = DistributedCache.getLocalCacheFiles(context.getConfiguration());
    // Only one file was cached here, so take the first entry and create a BufferedReader to read it
    BufferedReader reader = new BufferedReader(new FileReader(paths[0].toString()));

    String str = null;
    try {
        // Read line by line
        while ((str = reader.readLine()) != null) {
            // Split each record of the cached table
            String[] splits = str.split("\t");
            // Store the useful fields from the split array in a Map
            joinData.put(Integer.parseInt(splits[0]), splits[1]);
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        reader.close();
    }

}
 
Example 2
@Override
protected void setup ( Context context ) throws IOException,InterruptedException {
    super.setup(context);
    try {
        conf = context.getConfiguration();
        Plan.conf = conf;
        Config.read(Plan.conf);
        Tree code = Tree.parse(conf.get("mrql.reducer"));
        reduce_fnc = functional_argument(conf,code);
        code = Tree.parse(conf.get("mrql.mapper"));
        map_fnc = functional_argument(conf,code);
        if (conf.get("mrql.zero") != null) {
            code = Tree.parse(conf.get("mrql.zero"));
            result = Interpreter.evalE(code);
            code = Tree.parse(conf.get("mrql.accumulator"));
            acc_fnc = functional_argument(conf,code);
        } else result = null;
        counter = conf.get("mrql.counter");
        uris = DistributedCache.getCacheFiles(conf);
        local_paths = DistributedCache.getLocalCacheFiles(conf);
        index = 0;
    } catch (Exception e) {
        throw new Error("Cannot setup the crossProduct: "+e);
    }
}
 
Example 3
Source Project: ml-ease   Source File: AvroDistributedCacheFileReader.java    License: Apache License 2.0
@Override
protected List<Path> getPaths(String filePath) throws IOException
{
  Path[] localFiles = DistributedCache.getLocalCacheFiles(getConf());
  List<Path> paths = new ArrayList<Path>();
  
  for (Path file: localFiles)
  {
    if (!file.toString().contains(filePath))
    {
      continue;
    }
    
    paths.add(file);
  }
    
  return paths;
}
 
Example 4
Source Project: examples   Source File: SolrOutputFormat.java    License: Apache License 2.0
public static void addSolrConfToDistributedCache(Job job, File solrHomeZip)
    throws IOException {
  // Make a reasonably unique name for the zip file in the distributed cache
  // to avoid collisions if multiple jobs are running.
  String hdfsZipName = UUID.randomUUID().toString() + '.'
      + ZIP_FILE_BASE_NAME;
  Configuration jobConf = job.getConfiguration();
  jobConf.set(ZIP_NAME, hdfsZipName);

  Path zipPath = new Path("/tmp", getZipName(jobConf));
  FileSystem fs = FileSystem.get(jobConf);
  fs.copyFromLocalFile(new Path(solrHomeZip.toString()), zipPath);
  final URI baseZipUrl = fs.getUri().resolve(
      zipPath.toString() + '#' + getZipName(jobConf));

  DistributedCache.addCacheArchive(baseZipUrl, jobConf);
  LOG.debug("Set Solr distributed cache: {}", Arrays.asList(job.getCacheArchives()));
  LOG.debug("Set zipPath: {}", zipPath);
  // Actually send the path for the configuration zip file
  jobConf.set(SETUP_OK, zipPath.toString());
}
 
Example 5
Source Project: spork   Source File: JobControlCompiler.java    License: Apache License 2.0
private static Path getExistingDistCacheFilePath(Configuration conf, URL url) throws IOException {
    URI[] cacheFileUris = DistributedCache.getCacheFiles(conf);
    if (cacheFileUris != null) {
        String fileName = url.getRef() == null ? FilenameUtils.getName(url.getPath()) : url.getRef();
        for (URI cacheFileUri : cacheFileUris) {
            Path path = new Path(cacheFileUri);
            String cacheFileName = cacheFileUri.getFragment() == null ? path.getName() : cacheFileUri.getFragment();
            // Match
            //     - if both filenames are same and no symlinks (or)
            //     - if both symlinks are same (or)
            //     - symlink of existing cache file is same as the name of the new file to be added.
            //         That would be the case when hbase-0.98.4.jar#hbase.jar is configured via Oozie
            // and register hbase.jar is done in the pig script.
            // If two different files are symlinked to the same name, then there is a conflict
            // and hadoop itself does not guarantee which file will be symlinked to that name.
            // So we are good.
            if (fileName.equals(cacheFileName)) {
                return path;
            }
        }
    }
    return null;
}
 
Example 6
Source Project: spork   Source File: L2.java    License: Apache License 2.0
public void configure(JobConf conf) {
    try {
        Path[] paths = DistributedCache.getLocalCacheFiles(conf);
        if (paths == null || paths.length < 1) {
            throw new RuntimeException("DistributedCache no work.");
        }

        // Open the small table
        BufferedReader reader =
            new BufferedReader(new InputStreamReader(new
            FileInputStream(paths[0].toString())));
        String line;
        hash = new HashSet<String>(500);
        while ((line = reader.readLine()) != null) {
            if (line.length() < 1) continue;
            String[] fields = line.split("\u0001");
            hash.add(fields[0]);
        }
        reader.close();
    } catch (IOException ioe) {
        throw new RuntimeException(ioe);
    }
}
 
Example 7
public void setup(Context context) throws IOException,
		InterruptedException {
	Path[] files = DistributedCache.getLocalCacheFiles(context
			.getConfiguration());
	// Read all files in the DistributedCache
	for (Path p : files) {
		BufferedReader rdr = new BufferedReader(new InputStreamReader(
				new GZIPInputStream(new FileInputStream(new File(
						p.toString())))));
		String line = null;
		// For each record in the user file
		while ((line = rdr.readLine()) != null) {
			// Get the user ID for this record
			Map<String, String> parsed = MRDPUtils
					.transformXmlToMap(line);
			String userId = parsed.get("Id");
			// Map the user ID to the record
			userIdToInfo.put(userId, line);
		}
		rdr.close();
	}
	// Get the join type from the configuration
	joinType = context.getConfiguration().get("join.type");
}
 
Example 8
Source Project: hadoop-map-reduce-patterns   Source File: BloomFilter.java    License: Apache License 2.0
@Override
public int run(String[] args) throws Exception {
	Configuration conf = new Configuration();
	GenericOptionsParser parser = new GenericOptionsParser(conf, args);
	String[] otherArgs = parser.getRemainingArgs();
	if (otherArgs.length != 3) {
		System.err
				.println("Usage: BloomFilter <bloom_filter_file> <in> <out>");
		ToolRunner.printGenericCommandUsage(System.err);
		System.exit(2);
	}

	DistributedCache.addCacheFile(new URI(otherArgs[0]), conf);
	Job job = new Job(conf, "Bloom Filter");
	job.setJarByClass(BloomFilter.class);
	job.setMapperClass(BloomFilterMapper.class);
	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(NullWritable.class);
	FileInputFormat.addInputPath(job, new Path(otherArgs[1]));
	FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));
	boolean success = job.waitForCompletion(true);

	return success ? 0 : 1;
}
 
Example 9
Source Project: elasticsearch-hadoop   Source File: HiveSuite.java    License: Apache License 2.0
@SuppressWarnings("deprecation")
@BeforeClass
public static void setup() throws Exception {
    if (!isLocal) {
        hadoopConfig = HdpBootstrap.hadoopConfig();

        HdfsUtils.copyFromLocal(Provisioner.ESHADOOP_TESTING_JAR, Provisioner.HDFS_ES_HDP_LIB);
        hdfsEsLib = HdfsUtils.qualify(Provisioner.HDFS_ES_HDP_LIB, hadoopConfig);
        // copy jar to DistributedCache
        try {
            DistributedCache.addArchiveToClassPath(new Path(Provisioner.HDFS_ES_HDP_LIB), hadoopConfig);
        } catch (IOException ex) {
            throw new RuntimeException("Cannot provision Hive", ex);
        }

        hdfsResource = "/eshdp/hive/hive-compund.dat";
        HdfsUtils.copyFromLocal(originalResource, hdfsResource);
        hdfsResource = HdfsUtils.qualify(hdfsResource, hadoopConfig);

        hdfsJsonResource = "/eshdp/hive/hive-compund.json";
        HdfsUtils.copyFromLocal(originalResource, hdfsJsonResource);
        hdfsJsonResource = HdfsUtils.qualify(hdfsJsonResource, hadoopConfig);
    }
}
 
Example 10
Source Project: hadoop-book   Source File: TeraSort.java    License: Apache License 2.0
public int run(String[] args) throws Exception {
  LOG.info("starting");
  JobConf job = (JobConf) getConf();
  Path inputDir = new Path(args[0]);
  inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
  Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
  URI partitionUri = new URI(partitionFile.toString() +
                             "#" + TeraInputFormat.PARTITION_FILENAME);
  TeraInputFormat.setInputPaths(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.setJobName("TeraSort");
  job.setJarByClass(TeraSort.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setInputFormat(TeraInputFormat.class);
  job.setOutputFormat(TeraOutputFormat.class);
  job.setPartitionerClass(TotalOrderPartitioner.class);
  TeraInputFormat.writePartitionFile(job, partitionFile);
  DistributedCache.addCacheFile(partitionUri, job);
  DistributedCache.createSymlink(job);
  job.setInt("dfs.replication", 1);
  TeraOutputFormat.setFinalSync(job, true);
  JobClient.runJob(job);
  LOG.info("done");
  return 0;
}
 
Example 11
Source Project: hadoop-book   Source File: MapFeatures.java    License: Apache License 2.0
@Override
public void configure(JobConf job) {
    caseSensitive = job.getBoolean("wordcount.case.sensitive", true);
    inputFile = job.get("map.input.file");

    if (job.getBoolean("wordcount.skip.patterns", false)) {
        Path[] patternsFiles = new Path[0];
        try {
            patternsFiles = DistributedCache.getLocalCacheFiles(job);
        } catch (IOException ioe) {
            System.err.println("Caught exception getting cached files: "
                    + StringUtils.stringifyException(ioe));
        }
        for (Path patternsFile : patternsFiles) {
            parseSkipFile(patternsFile);
        }
    }
}
 
Example 12
Source Project: multimedia-indexing   Source File: VisualJob.java    License: Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    String inputPath = args[0];
    String outputPath = args[1];
    if (!IS_LOCAL & args.length >= 3) {
        String configFile = args[2];
        if (configFile != null) {
            getConf().addResource(configFile);
        }
        //The learning files have to be uploaded to the s3 bucket first
        //Then when starting the job, they have to be added to the hadoop distributed cache
        DistributedCache.addCacheFile(new URI("s3n://gr-mklab/learningfiles/surf_l2_128c_0.csv#surf_l2_128c_0.csv"), getConf());
        DistributedCache.addCacheFile(new URI("s3n://gr-mklab/learningfiles/surf_l2_128c_1.csv#surf_l2_128c_1.csv"), getConf());
        DistributedCache.addCacheFile(new URI("s3n://gr-mklab/learningfiles/surf_l2_128c_2.csv#surf_l2_128c_2.csv"), getConf());
        DistributedCache.addCacheFile(new URI("s3n://gr-mklab/learningfiles/surf_l2_128c_3.csv#surf_l2_128c_3.csv"), getConf());
        DistributedCache.addCacheFile(new URI("s3n://gr-mklab/learningfiles/pca_surf_4x128_32768to1024.txt#pca_surf_4x128_32768to1024.txt"), getConf());
        DistributedCache.addCacheFile(new URI("s3n://gr-mklab/learningfiles/qcoarse_1024d_8192k.csv#qcoarse_1024d_8192k.csv"), getConf());
        DistributedCache.addCacheFile(new URI("s3n://gr-mklab/learningfiles/pq_1024_64x8_rp_ivf_8192k.csv#pq_1024_64x8_rp_ivf_8192k.csv"), getConf());
    }

    Job job = createJob(inputPath, outputPath);
    return job.waitForCompletion(true) ? 0 : -1;
}
 
Example 13
Source Project: SpyGlass   Source File: JobLibLoader.java    License: Apache License 2.0
public static void loadJars(String libPathStr, Configuration config) {
	
	try {
		Path libPath = new Path(libPathStr);

		FileSystem fs = FileSystem.get(config);

		RemoteIterator<LocatedFileStatus> itr = fs.listFiles(libPath, true);

		while (itr.hasNext()) {
			LocatedFileStatus f = itr.next();

			if (!f.isDirectory() && f.getPath().getName().endsWith("jar")) {
				logger.info("Loading Jar : " + f.getPath().getName());
				DistributedCache.addFileToClassPath(f.getPath(), config);
			}
		}
	} catch (Exception e) {
		e.printStackTrace();
		logger.error(e.toString());
	}
}
 
Example 14
Source Project: hadoop-gpu   Source File: TeraSort.java    License: Apache License 2.0
public int run(String[] args) throws Exception {
  LOG.info("starting");
  JobConf job = (JobConf) getConf();
  Path inputDir = new Path(args[0]);
  inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
  Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
  URI partitionUri = new URI(partitionFile.toString() +
                             "#" + TeraInputFormat.PARTITION_FILENAME);
  TeraInputFormat.setInputPaths(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.setJobName("TeraSort");
  job.setJarByClass(TeraSort.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setInputFormat(TeraInputFormat.class);
  job.setOutputFormat(TeraOutputFormat.class);
  job.setPartitionerClass(TotalOrderPartitioner.class);
  TeraInputFormat.writePartitionFile(job, partitionFile);
  DistributedCache.addCacheFile(partitionUri, job);
  DistributedCache.createSymlink(job);
  job.setInt("dfs.replication", 1);
  TeraOutputFormat.setFinalSync(job, true);
  JobClient.runJob(job);
  LOG.info("done");
  return 0;
}
 
Example 15
Source Project: Cubert   Source File: JobExecutor.java    License: Apache License 2.0
protected void cacheFiles() throws URISyntaxException,
        IOException
{
    if (!root.has("cachedFiles") || root.get("cachedFiles").isNull()
            || root.get("cachedFiles").size() == 0)
        return;

    for (JsonNode cachedFile : root.path("cachedFiles"))
    {
        URI uri = new URI(cachedFile.getTextValue());
        print.f("CACHING file %s", uri);
        DistributedCache.addCacheFile(uri, conf);
    }
}
 
Example 16
Source Project: ml-ease   Source File: AvroUtils.java    License: Apache License 2.0
/**
 * Given a path to an output folder, it finds the existing "*.avro" files and adds 
 * them as cache files to be distributed. Throws an exception if no files are found/added.
 * 
 * @param conf Job configuration
 * @param outPath The path to the hdfs directory that has part files to cache
 * @throws Exception If no file is found at outPath throws a RuntimeException 
 */
public static void addAvroCacheFiles(JobConf conf, Path outPath) throws Exception
{
   FileStatus[] partFiles = getAvroPartFiles(conf, outPath);
   if (partFiles.length == 0)
   {      
     throw new RuntimeException("DistributedCacheFileUtils: No (part) file is found to cache at location:" + outPath );
   }
   
   for (FileStatus partFile : partFiles)
   {
     // add the file and set fileRead to true, since we have read at least one file
     DistributedCache.addCacheFile(partFile.getPath().toUri(), conf);
   }
 }
 
Example 17
Source Project: incubator-gobblin   Source File: MRCompactorJobRunner.java    License: Apache License 2.0
private void addJars(Configuration conf) throws IOException {
  if (!this.dataset.jobProps().contains(MRCompactor.COMPACTION_JARS)) {
    return;
  }
  Path jarFileDir = new Path(this.dataset.jobProps().getProp(MRCompactor.COMPACTION_JARS));
  for (FileStatus status : this.fs.listStatus(jarFileDir)) {
    DistributedCache.addFileToClassPath(status.getPath(), conf, this.fs);
  }
}
 
Example 18
protected void addJars(Configuration conf, State state, FileSystem fs) throws IOException {
  if (!state.contains(MRCompactor.COMPACTION_JARS)) {
    return;
  }
  Path jarFileDir = new Path(state.getProp(MRCompactor.COMPACTION_JARS));
  for (FileStatus status : fs.listStatus(jarFileDir)) {
    DistributedCache.addFileToClassPath(status.getPath(), conf, fs);
  }
}
 
Example 19
Source Project: hiped2   Source File: BloomJoin.java    License: Apache License 2.0
@Override
protected void setup(
    Context context)
    throws IOException, InterruptedException {

  Path[] files = DistributedCache.getLocalCacheFiles(context.getConfiguration());
  filter = BloomFilterDumper.fromFile(
      new File(files[0].toString()));

  System.out.println("Filter = " + filter);
}
 
Example 20
Source Project: hiped2   Source File: FinalJoinJob.java    License: Apache License 2.0
public static void runJob(Configuration conf,
                          Path userLogsPath,
                          Path usersPath,
                          Path outputPath)
    throws Exception {

  FileSystem fs = usersPath.getFileSystem(conf);

  FileStatus usersStatus = fs.getFileStatus(usersPath);

  if (usersStatus.isDir()) {
    for (FileStatus f : fs.listStatus(usersPath)) {
      if (f.getPath().getName().startsWith("part")) {
        DistributedCache.addCacheFile(f.getPath().toUri(), conf);
      }
    }
  } else {
    DistributedCache.addCacheFile(usersPath.toUri(), conf);
  }

  Job job = new Job(conf);

  job.setJarByClass(FinalJoinJob.class);
  job.setMapperClass(GenericReplicatedJoin.class);

  job.setNumReduceTasks(0);

  job.setInputFormatClass(KeyValueTextInputFormat.class);

  outputPath.getFileSystem(conf).delete(outputPath, true);

  FileInputFormat.setInputPaths(job, userLogsPath);
  FileOutputFormat.setOutputPath(job, outputPath);

  if (!job.waitForCompletion(true)) {
    throw new Exception("Job failed");
  }
}
 
Example 21
Source Project: hiped2   Source File: ReplicatedFilterJob.java    License: Apache License 2.0
public static void runJob(Configuration conf,
                          Path usersPath,
                          Path uniqueUsersPath,
                          Path outputPath)
    throws Exception {

  FileSystem fs = uniqueUsersPath.getFileSystem(conf);

  FileStatus uniqueUserStatus = fs.getFileStatus(uniqueUsersPath);

  if (uniqueUserStatus.isDir()) {
    for (FileStatus f : fs.listStatus(uniqueUsersPath)) {
      if (f.getPath().getName().startsWith("part")) {
        DistributedCache.addCacheFile(f.getPath().toUri(), conf);
      }
    }
  } else {
    DistributedCache.addCacheFile(uniqueUsersPath.toUri(), conf);
  }

  Job job = new Job(conf);

  job.setJarByClass(ReplicatedFilterJob.class);
  job.setMapperClass(ReplicatedFilterJob.class);

  job.setNumReduceTasks(0);

  job.setInputFormatClass(KeyValueTextInputFormat.class);

  outputPath.getFileSystem(conf).delete(outputPath, true);

  FileInputFormat.setInputPaths(job, usersPath);
  FileOutputFormat.setOutputPath(job, outputPath);

  if (!job.waitForCompletion(true)) {
    throw new Exception("Job failed");
  }
}
 
Example 22
Source Project: hiped2   Source File: Main.java    License: Apache License 2.0
public static void runJob(Path inputPath,
                          Path smallFilePath,
                          Path outputPath)
    throws Exception {

  Configuration conf = new Configuration();

  FileSystem fs = smallFilePath.getFileSystem(conf);

  FileStatus smallFilePathStatus = fs.getFileStatus(smallFilePath);

  if (smallFilePathStatus.isDir()) {
    for (FileStatus f : fs.listStatus(smallFilePath)) {
      if (f.getPath().getName().startsWith("part")) {
        DistributedCache.addCacheFile(f.getPath().toUri(), conf);
      }
    }
  } else {
    DistributedCache.addCacheFile(smallFilePath.toUri(), conf);
  }

  Job job = new Job(conf);

  job.setJarByClass(Main.class);
  job.setMapperClass(GenericReplicatedJoin.class);

  job.setInputFormatClass(KeyValueTextInputFormat.class);

  job.setNumReduceTasks(0);

  outputPath.getFileSystem(conf).delete(outputPath, true);

  FileInputFormat.setInputPaths(job, inputPath);
  FileOutputFormat.setOutputPath(job, outputPath);

  job.waitForCompletion(true);
}
 
Example 23
Source Project: spork   Source File: JobControlCompiler.java    License: Apache License 2.0
private static void addToDistributedCache(URI uri, Configuration conf) {
    if (DISTRIBUTED_CACHE_ARCHIVE_MATCHER.reset(uri.toString()).find()) {
        DistributedCache.addCacheArchive(uri, conf);
    } else {
        DistributedCache.addCacheFile(uri, conf);
    }
}
 
Example 24
Source Project: spork   Source File: JobControlCompiler.java    License: Apache License 2.0
/**
 * If the url is not in HDFS, copies the file from the local file system to HDFS before adding it to the distributed cache
 * @param pigContext the pigContext
 * @param conf the job conf
 * @param url the url to be added to distributed cache
 * @return the path as seen on distributed cache
 * @throws IOException
 */
@SuppressWarnings("deprecation")
private static void putJarOnClassPathThroughDistributedCache(
        PigContext pigContext,
        Configuration conf,
        URL url) throws IOException {

    // Turn on the symlink feature
    DistributedCache.createSymlink(conf);

    Path distCachePath = getExistingDistCacheFilePath(conf, url);
    if (distCachePath != null) {
        log.info("Jar file " + url + " already in DistributedCache as "
                + distCachePath + ". Not copying to hdfs and adding again");
        // Path already in dist cache
        if (!HadoopShims.isHadoopYARN()) {
            // Mapreduce in YARN includes $PWD/* which will add all *.jar files in classpath.
            // So don't have to ensure that the jar is separately added to mapreduce.job.classpath.files
            // But path may only be in 'mapred.cache.files' and not be in
            // 'mapreduce.job.classpath.files' in Hadoop 1.x. So adding it there
            DistributedCache.addFileToClassPath(distCachePath, conf, distCachePath.getFileSystem(conf));
        }
    }
    else {
        // REGISTER always copies locally the jar file. see PigServer.registerJar()
        Path pathInHDFS = shipToHDFS(pigContext, conf, url);
        DistributedCache.addFileToClassPath(pathInHDFS, conf, FileSystem.get(conf));
        log.info("Added jar " + url + " to DistributedCache through " + pathInHDFS);
    }

}
 
Example 25
Source Project: spork   Source File: TestJobControlCompiler.java    License: Apache License 2.0
/**
 * specifically tests that REGISTERED jars get added to distributed cache
 * @throws Exception
 */
@Test
public void testJarAddedToDistributedCache() throws Exception {

  // creating a jar with a UDF *not* in the current classloader
  File tmpFile = File.createTempFile("Some_", ".jar");
  tmpFile.deleteOnExit();
  String className = createTestJar(tmpFile);
  final String testUDFFileName = className+".class";

  // JobControlCompiler setup
  PigServer pigServer = new PigServer(ExecType.MAPREDUCE);
  PigContext pigContext = pigServer.getPigContext();
  pigContext.connect();
  pigContext.addJar(tmpFile.getAbsolutePath());
  JobControlCompiler jobControlCompiler = new JobControlCompiler(pigContext, CONF);
  MROperPlan plan = new MROperPlan();
  MapReduceOper mro = new MapReduceOper(new OperatorKey());
  mro.UDFs = new HashSet<String>();
  mro.UDFs.add(className+"()");
  plan.add(mro);

  // compiling the job
  JobControl jobControl = jobControlCompiler.compile(plan , "test");
  JobConf jobConf = jobControl.getWaitingJobs().get(0).getJobConf();

  // verifying the jar gets on distributed cache
  Path[] fileClassPaths = DistributedCache.getFileClassPaths(jobConf);
  // guava jar is not shipped with Hadoop 2.x
  Assert.assertEquals("size for "+Arrays.toString(fileClassPaths), HadoopShims.isHadoopYARN() ? 5 : 6, fileClassPaths.length);
  Path distributedCachePath = fileClassPaths[0];
  Assert.assertEquals("ends with jar name: "+distributedCachePath, distributedCachePath.getName(), tmpFile.getName());
  // hadoop bug requires path to not contain hdfs://hostname in front
  Assert.assertTrue("starts with /: "+distributedCachePath,
      distributedCachePath.toString().startsWith("/"));
  Assert.assertTrue("jar pushed to distributed cache should contain testUDF",
      jarContainsFileNamed(new File(fileClassPaths[0].toUri().getPath()), testUDFFileName));
}
 
Example 26
Source Project: spork   Source File: TestJobControlCompiler.java    License: Apache License 2.0
@Test
public void testAddArchiveToDistributedCache() throws IOException {
    final File textFile = File.createTempFile("file", ".txt");
    textFile.deleteOnExit();

    final List<File> zipArchives = createFiles(".zip");
    zipArchives.add(textFile);
    final List<File> tarArchives = createFiles(".tgz", ".tar.gz", ".tar");

    final PigServer pigServer = new PigServer(ExecType.MAPREDUCE);
    final PigContext pigContext = pigServer.getPigContext();
    pigContext.connect();
    pigContext.getProperties().put("pig.streaming.ship.files",
            StringUtils.join(zipArchives, ","));
    pigContext.getProperties().put("pig.streaming.cache.files",
            StringUtils.join(tarArchives, ","));

    final JobConf jobConf = compileTestJob(pigContext, CONF);

    URI[] uris = DistributedCache.getCacheFiles(jobConf);
    int sizeTxt = 0;
    for (int i = 0; i < uris.length; i++) {
        if (uris[i].toString().endsWith(".txt")) {
            sizeTxt++;
        }
    }
    Assert.assertTrue(sizeTxt == 1);
    assertFilesInDistributedCache(
            DistributedCache.getCacheArchives(jobConf), 4, ".zip", ".tgz",
            ".tar.gz", ".tar");
}
 
Example 27
Source Project: datafu   Source File: DistributedCacheHelper.java    License: Apache License 2.0
/**
 * Deserializes an object from a path in HDFS.
 * 
 * @param conf Hadoop configuration
 * @param path Path to deserialize from
 * @return Deserialized object
 * @throws IOException IOException
 */
public static Object readObject(Configuration conf, org.apache.hadoop.fs.Path path) throws IOException
{
  String localPath = null;
  Path[] localCacheFiles = DistributedCache.getLocalCacheFiles(conf);
  for (Path localCacheFile : localCacheFiles)
  {
    if (localCacheFile.getName().endsWith(path.getName()))
    {
      localPath = localCacheFile.getName();
      break;
    }
  }
  if (localPath == null)
  {
    throw new RuntimeException("Could not find " + path + " in local cache");
  }
  FileInputStream inputStream = new FileInputStream(new File(localPath));
  ObjectInputStream objStream = new ObjectInputStream(inputStream);
  
  try
  {
    try {
      return objStream.readObject();
    } catch (ClassNotFoundException e) {
      throw new RuntimeException(e);
    }
  }
  finally
  {
    objStream.close();
    inputStream.close();
  }
}
 
Example 28
Source Project: mrgeo   Source File: AccumuloMrGeoRangePartitioner.java    License: Apache License 2.0
/**
 * Sets the hdfs file name to use, containing a newline separated list of Base64 encoded split points that represent ranges for partitioning
 */
public static void setSplitFile(JobContext job, String file)
{
  URI uri = new Path(file).toUri();
  DistributedCache.addCacheFile(uri, job.getConfiguration());
  job.getConfiguration().set(CUTFILE_KEY, uri.getPath());
}
 
Example 29
Source Project: mrgeo   Source File: DependencyLoader.java    License: Apache License 2.0
private static void addFileToClasspath(Configuration conf, Set<String> existing, FileSystem fs, Path hdfsBase,
    File file) throws IOException
{
  Path hdfsPath = new Path(hdfsBase, file.getName());
  if (!existing.contains(hdfsPath.toString()))
  {
    if (fs.exists(hdfsPath))
    {
      // check the timestamp and exit if the one in hdfs is "newer"
      FileStatus status = fs.getFileStatus(hdfsPath);

      if (file.lastModified() <= status.getModificationTime())
      {
        log.debug(file.getPath() + " up to date");
        DistributedCache.addFileToClassPath(hdfsPath, conf, fs);

        existing.add(hdfsPath.toString());
        return;
      }
    }

    // copy the file...
    log.debug("Copying " + file.getPath() + " to HDFS for distribution");

    fs.copyFromLocalFile(new Path(file.getCanonicalFile().toURI()), hdfsPath);
    DistributedCache.addFileToClassPath(hdfsPath, conf, fs);
    existing.add(hdfsPath.toString());
  }
}
 
Example 30
Source Project: RDFS   Source File: TeraSort.java    License: Apache License 2.0
public int run(String[] args) throws Exception {
  LOG.info("starting");
  JobConf job = (JobConf) getConf();
  Path inputDir = new Path(args[0]);
  inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
  Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
  URI partitionUri = new URI(partitionFile.toString() +
                             "#" + TeraInputFormat.PARTITION_FILENAME);
  TeraInputFormat.setInputPaths(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.setJobName("TeraSort");
  job.setJarByClass(TeraSort.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setInputFormat(TeraInputFormat.class);
  job.setOutputFormat(TeraOutputFormat.class);
  job.setPartitionerClass(TotalOrderPartitioner.class);
  TeraInputFormat.writePartitionFile(job, partitionFile);
  DistributedCache.addCacheFile(partitionUri, job);
  DistributedCache.createSymlink(job);
  job.setInt("dfs.replication", 1);
  TeraOutputFormat.setFinalSync(job, true);
  long startTime = System.currentTimeMillis();
  JobClient.runJob(job);
  long endTime = System.currentTimeMillis();
  System.out.println((float)(endTime-startTime)/1000);
  LOG.info("done");
  return 0;
}