org.apache.hadoop.mapreduce.filecache.DistributedCache Java Examples

The following examples show how to use org.apache.hadoop.mapreduce.filecache.DistributedCache. They are drawn from open-source projects; the source file, originating project, and license are noted above each example.
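Before the examples, here is a minimal end-to-end sketch of the workflow most of them build on: the client registers an HDFS file at job-setup time, and each task reads back the copy that the framework localized. The class name, the path /data/labels.txt, and the "labels" symlink are illustrative, not taken from any project below.

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;

@SuppressWarnings("deprecation")
public class DistributedCacheSketch {

  // Client side: register an HDFS file under a "#labels" symlink before submission.
  static void registerSideFile(Job job) {
    Configuration conf = job.getConfiguration();
    DistributedCache.createSymlink(conf);
    DistributedCache.addCacheFile(URI.create("/data/labels.txt#labels"), conf);
  }

  // Task side: the framework has localized the file; read it from local disk.
  static class SideFileMapper extends Mapper<LongWritable, Text, Text, Text> {
    @Override
    protected void setup(Context context) throws IOException {
      Path[] localFiles =
          DistributedCache.getLocalCacheFiles(context.getConfiguration());
      try (BufferedReader reader =
          new BufferedReader(new FileReader(localFiles[0].toString()))) {
        // ... load the side data into memory ...
      }
    }
  }
}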
Example #1
Source File: MRApps.java    From big-c with Apache License 2.0
public static void setupDistributedCache( 
    Configuration conf, 
    Map<String, LocalResource> localResources) 
throws IOException {
  
  // Cache archives
  parseDistributedCacheArtifacts(conf, localResources,  
      LocalResourceType.ARCHIVE, 
      DistributedCache.getCacheArchives(conf), 
      DistributedCache.getArchiveTimestamps(conf),
      getFileSizes(conf, MRJobConfig.CACHE_ARCHIVES_SIZES), 
      DistributedCache.getArchiveVisibilities(conf));
  
  // Cache files
  parseDistributedCacheArtifacts(conf, 
      localResources,  
      LocalResourceType.FILE, 
      DistributedCache.getCacheFiles(conf),
      DistributedCache.getFileTimestamps(conf),
      getFileSizes(conf, MRJobConfig.CACHE_FILES_SIZES),
      DistributedCache.getFileVisibilities(conf));
}
 
Example #2
Source File: MRJobLauncher.java    From incubator-gobblin with Apache License 2.0
/**
 * Add local non-jar files the job depends on to DistributedCache.
 */
@SuppressWarnings("deprecation")
private void addLocalFiles(Path jobFileDir, String jobFileList, Configuration conf) throws IOException {
  DistributedCache.createSymlink(conf);
  for (String jobFile : SPLITTER.split(jobFileList)) {
    Path srcJobFile = new Path(jobFile);
    // DistributedCache requires absolute path, so we need to use makeQualified.
    Path destJobFile = new Path(this.fs.makeQualified(jobFileDir), srcJobFile.getName());
    // Copy the file from local file system to HDFS
    this.fs.copyFromLocalFile(srcJobFile, destJobFile);
    // Create a URI that is in the form path#symlink
    URI destFileUri = URI.create(destJobFile.toUri().getPath() + "#" + destJobFile.getName());
    LOG.info(String.format("Adding %s to DistributedCache", destFileUri));
    // Finally add the file to DistributedCache with a symlink named after the file name
    DistributedCache.addCacheFile(destFileUri, conf);
  }
}
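Because each cache URI carries a "#<fileName>" fragment and createSymlink(conf) has been called, the framework drops a symlink named after the file into every task's working directory, so a task can open the file by its bare name. A task-side sketch under that assumption (the file name is illustrative):

  // Inside a map or reduce task: "my-job.properties" is a symlink in the task's
  // working directory, created from the "#my-job.properties" fragment added above.
  java.io.File jobFile = new java.io.File("my-job.properties");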
 
Example #3
Source File: TestMRApps.java    From big-c with Apache License 2.0
@Test
@SuppressWarnings("deprecation")
public void testSetupDistributedCacheConflictsFiles() throws Exception {
  Configuration conf = new Configuration();
  conf.setClass("fs.mockfs.impl", MockFileSystem.class, FileSystem.class);
  
  URI mockUri = URI.create("mockfs://mock/");
  FileSystem mockFs = ((FilterFileSystem)FileSystem.get(mockUri, conf))
      .getRawFileSystem();
  
  URI file = new URI("mockfs://mock/tmp/something.zip#something");
  Path filePath = new Path(file);
  URI file2 = new URI("mockfs://mock/tmp/something.txt#something");
  Path file2Path = new Path(file2);
  
  when(mockFs.resolvePath(filePath)).thenReturn(filePath);
  when(mockFs.resolvePath(file2Path)).thenReturn(file2Path);
  
  DistributedCache.addCacheFile(file, conf);
  DistributedCache.addCacheFile(file2, conf);
  conf.set(MRJobConfig.CACHE_FILE_TIMESTAMPS, "10,11");
  conf.set(MRJobConfig.CACHE_FILES_SIZES, "10,11");
  conf.set(MRJobConfig.CACHE_FILE_VISIBILITIES, "true,true");
  Map<String, LocalResource> localResources = 
    new HashMap<String, LocalResource>();
  MRApps.setupDistributedCache(conf, localResources);
  
  assertEquals(1, localResources.size());
  LocalResource lr = localResources.get("something");
  //First one wins
  assertNotNull(lr);
  assertEquals(10L, lr.getSize());
  assertEquals(10L, lr.getTimestamp());
  assertEquals(LocalResourceType.FILE, lr.getType());
}
 
Example #4
Source File: TestMRApps.java    From hadoop with Apache License 2.0
@Test
@SuppressWarnings("deprecation")
public void testSetupDistributedCacheConflicts() throws Exception {
  Configuration conf = new Configuration();
  conf.setClass("fs.mockfs.impl", MockFileSystem.class, FileSystem.class);
  
  URI mockUri = URI.create("mockfs://mock/");
  FileSystem mockFs = ((FilterFileSystem)FileSystem.get(mockUri, conf))
      .getRawFileSystem();
  
  URI archive = new URI("mockfs://mock/tmp/something.zip#something");
  Path archivePath = new Path(archive);
  URI file = new URI("mockfs://mock/tmp/something.txt#something");
  Path filePath = new Path(file);
  
  when(mockFs.resolvePath(archivePath)).thenReturn(archivePath);
  when(mockFs.resolvePath(filePath)).thenReturn(filePath);
  
  DistributedCache.addCacheArchive(archive, conf);
  conf.set(MRJobConfig.CACHE_ARCHIVES_TIMESTAMPS, "10");
  conf.set(MRJobConfig.CACHE_ARCHIVES_SIZES, "10");
  conf.set(MRJobConfig.CACHE_ARCHIVES_VISIBILITIES, "true");
  DistributedCache.addCacheFile(file, conf);
  conf.set(MRJobConfig.CACHE_FILE_TIMESTAMPS, "11");
  conf.set(MRJobConfig.CACHE_FILES_SIZES, "11");
  conf.set(MRJobConfig.CACHE_FILE_VISIBILITIES, "true");
  Map<String, LocalResource> localResources = 
    new HashMap<String, LocalResource>();
  MRApps.setupDistributedCache(conf, localResources);
  
  assertEquals(1, localResources.size());
  LocalResource lr = localResources.get("something");
  //Archive wins
  assertNotNull(lr);
  assertEquals(10L, lr.getSize());
  assertEquals(10L, lr.getTimestamp());
  assertEquals(LocalResourceType.ARCHIVE, lr.getType());
}
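Together with the preceding test, this pins down how MRApps resolves symlink-name collisions: when two cache files localize to the same name, the first registration wins, and when a file and an archive collide, the archive's entry is the one kept.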
 
Example #5
Source File: TestMRAppWithCombiner.java    From hadoop with Apache License 2.0
@Test
public void testCombinerShouldUpdateTheReporter() throws Exception {
  JobConf conf = new JobConf(mrCluster.getConfig());
  int numMaps = 5;
  int numReds = 2;
  Path in = new Path(mrCluster.getTestWorkDir().getAbsolutePath(),
      "testCombinerShouldUpdateTheReporter-in");
  Path out = new Path(mrCluster.getTestWorkDir().getAbsolutePath(),
      "testCombinerShouldUpdateTheReporter-out");
  createInputOutPutFolder(in, out, numMaps);
  conf.setJobName("test-job-with-combiner");
  conf.setMapperClass(IdentityMapper.class);
  conf.setCombinerClass(MyCombinerToCheckReporter.class);
  //conf.setJarByClass(MyCombinerToCheckReporter.class);
  conf.setReducerClass(IdentityReducer.class);
  DistributedCache.addFileToClassPath(TestMRJobs.APP_JAR, conf);
  conf.setOutputCommitter(CustomOutputCommitter.class);
  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputKeyClass(LongWritable.class);
  conf.setOutputValueClass(Text.class);

  FileInputFormat.setInputPaths(conf, in);
  FileOutputFormat.setOutputPath(conf, out);
  conf.setNumMapTasks(numMaps);
  conf.setNumReduceTasks(numReds);
  
  runJob(conf);
}
 
Example #6
Source File: TestMROldApiJobs.java    From hadoop with Apache License 2.0
static boolean runJob(JobConf conf, Path inDir, Path outDir, int numMaps, 
                         int numReds) throws IOException, InterruptedException {

  FileSystem fs = FileSystem.get(conf);
  if (fs.exists(outDir)) {
    fs.delete(outDir, true);
  }
  if (!fs.exists(inDir)) {
    fs.mkdirs(inDir);
  }
  String input = "The quick brown fox\n" + "has many silly\n"
      + "red fox sox\n";
  for (int i = 0; i < numMaps; ++i) {
    DataOutputStream file = fs.create(new Path(inDir, "part-" + i));
    file.writeBytes(input);
    file.close();
  }

  DistributedCache.addFileToClassPath(TestMRJobs.APP_JAR, conf, fs);
  conf.setOutputCommitter(CustomOutputCommitter.class);
  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputKeyClass(LongWritable.class);
  conf.setOutputValueClass(Text.class);

  FileInputFormat.setInputPaths(conf, inDir);
  FileOutputFormat.setOutputPath(conf, outDir);
  conf.setNumMapTasks(numMaps);
  conf.setNumReduceTasks(numReds);

  JobClient jobClient = new JobClient(conf);
  
  RunningJob job = jobClient.submitJob(conf);
  return jobClient.monitorAndPrintJob(conf, job);
}
 
Example #7
Source File: MRJobLauncher.java    From incubator-gobblin with Apache License 2.0
private void addHdfsJars(String hdfsJarFileList, Configuration conf) throws IOException {
  for (String jarFile : SPLITTER.split(hdfsJarFileList)) {
    FileStatus[] status = this.fs.listStatus(new Path(jarFile));
    for (FileStatus fileStatus : status) {
      if (!fileStatus.isDirectory()) {
        Path path = new Path(jarFile, fileStatus.getPath().getName());
        LOG.info(String.format("Adding %s to classpath", path));
        DistributedCache.addFileToClassPath(path, conf, this.fs);
      }
    }
  }
}
 
Example #8
Source File: MRJobLauncher.java    From incubator-gobblin with Apache License 2.0
/**
 * Add non-jar files already on HDFS that the job depends on to DistributedCache.
 */
@SuppressWarnings("deprecation")
private void addHDFSFiles(String jobFileList, Configuration conf) {
  DistributedCache.createSymlink(conf);
  jobFileList = PasswordManager.getInstance(this.jobProps).readPassword(jobFileList);
  for (String jobFile : SPLITTER.split(jobFileList)) {
    Path srcJobFile = new Path(jobFile);
    // Create a URI that is in the form path#symlink
    URI srcFileUri = URI.create(srcJobFile.toUri().getPath() + "#" + srcJobFile.getName());
    LOG.info(String.format("Adding %s to DistributedCache", srcFileUri));
    // Finally add the file to DistributedCache with a symlink named after the file name
    DistributedCache.addCacheFile(srcFileUri, conf);
  }
}
 
Example #9
Source File: MRJobLauncher.java    From incubator-gobblin with Apache License 2.0
@VisibleForTesting
static void serializeJobState(FileSystem fs, Path mrJobDir, Configuration conf, JobState jobState, Job job)
    throws IOException {
  Path jobStateFilePath = new Path(mrJobDir, JOB_STATE_FILE_NAME);
  // Write the job state with an empty task set (work units are read by the mapper from a different file)
  try (DataOutputStream dataOutputStream = new DataOutputStream(fs.create(jobStateFilePath))) {
    jobState.write(dataOutputStream, false,
        conf.getBoolean(SERIALIZE_PREVIOUS_WORKUNIT_STATES_KEY, DEFAULT_SERIALIZE_PREVIOUS_WORKUNIT_STATES));
  }

  job.getConfiguration().set(ConfigurationKeys.JOB_STATE_FILE_PATH_KEY, jobStateFilePath.toString());

  DistributedCache.addCacheFile(jobStateFilePath.toUri(), job.getConfiguration());
  job.getConfiguration().set(ConfigurationKeys.JOB_STATE_DISTRIBUTED_CACHE_NAME, jobStateFilePath.getName());
}
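The file's base name is stored under JOB_STATE_DISTRIBUTED_CACHE_NAME so that the mappers can pick the job-state file out of their localized cache files by name.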
 
Example #10
Source File: GCPCredentialCopier.java    From circus-train with Apache License 2.0
private void linkRelativePathInDistributedCache(Configuration conf, Path source, Path destination)
  throws URISyntaxException, IOException {
  /*
   * The "#" fragment links the HDFS location of the key file to the local file system
   * credential provider path, so that the GoogleHadoopFileSystem can subsequently
   * resolve it from a local file system URI even though it sits in a distributed file
   * system when the DistCp job runs.
   */
  String cacheFileUri = destination.toString() + "#" + source;
  DistributedCache.addCacheFile(new URI(cacheFileUri), conf);
  LOG.info("mapreduce.job.cache.files : {}", conf.get("mapreduce.job.cache.files"));
  conf.set(GCP_KEYFILE_CACHED_LOCATION, source.toString());
}
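Note that the fragment here is the full source path rather than a bare file name: the localized symlink then reproduces the local credential path that is stored under GCP_KEYFILE_CACHED_LOCATION, which is what lets the provider find it at run time.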
 
Example #11
Source File: JobContextImpl.java    From tez with Apache License 2.0
/**
 * Get the archive entries in classpath as an array of Path
 */
public Path[] getArchiveClassPaths() {
  return DistributedCache.getArchiveClassPaths(conf);
}
 
Example #12
Source File: MRJobLauncher.java    From incubator-gobblin with Apache License 2.0
/**
 * Add framework or job-specific jars to the classpath through DistributedCache
 * so the mappers can use them.
 */
@SuppressWarnings("deprecation")
private void addJars(Path jarFileDir, String jarFileList, Configuration conf) throws IOException {
  LocalFileSystem lfs = FileSystem.getLocal(conf);
  for (String jarFile : SPLITTER.split(jarFileList)) {
    Path srcJarFile = new Path(jarFile);
    FileStatus[] fileStatusList = lfs.globStatus(srcJarFile);

    for (FileStatus status : fileStatusList) {
      // The first copy attempt for a FileStatus may fail because the file already
      // exists or because another job instance is still copying it, since all Gobblin
      // jobs share the same jar file directory. retryCount caps the retries so a
      // runaway retry loop cannot hang the job.
      int retryCount = 0;
      boolean shouldFileBeAddedIntoDC = true;
      Path destJarFile = calculateDestJarFile(status, jarFileDir);
      // Keep copying destJarFile to HDFS until it exists and its size matches the local file.
      while (!this.fs.exists(destJarFile) || fs.getFileStatus(destJarFile).getLen() != status.getLen()) {
        try {
          if (this.fs.exists(destJarFile) && fs.getFileStatus(destJarFile).getLen() != status.getLen()) {
            Thread.sleep(WAITING_TIME_ON_IMCOMPLETE_UPLOAD);
            throw new IOException("Waiting for file to complete on uploading ... ");
          }
          // The first parameter (delSrc) is false so the source file is kept.
          // The second parameter (overwrite) is false (the default is true); if the
          // file already exists on the target, an IOException is thrown.
          this.fs.copyFromLocalFile(false, false, status.getPath(), destJarFile);
        } catch (IOException | InterruptedException e) {
          LOG.warn("Path:" + destJarFile + " is not copied successfully. Will require retry.");
          retryCount += 1;
          if (retryCount >= this.jarFileMaximumRetry) {
            LOG.error("The jar file:" + destJarFile + "failed in being copied into hdfs", e);
            // If retry reaches upper limit, skip copying this file.
            shouldFileBeAddedIntoDC = false;
            break;
          }
        }
      }
      if (shouldFileBeAddedIntoDC) {
        // Then add the jar file on HDFS to the classpath
        LOG.info(String.format("Adding %s to classpath", destJarFile));
        DistributedCache.addFileToClassPath(destJarFile, conf, this.fs);
      }
    }
  }
}
 
Example #13
Source File: Job.java    From big-c with Apache License 2.0
/**
 * Originally intended to enable symlinks, but currently symlinks cannot be
 * disabled.
 */
@Deprecated
public void createSymlink() {
  ensureState(JobState.DEFINE);
  DistributedCache.createSymlink(conf);
}
 
Example #14
Source File: MiniCluster.java    From spork with Apache License 2.0
@Override
protected void setupMiniDfsAndMrClusters() {
    try {
        final int dataNodes = 4;     // There will be 4 data nodes
        final int taskTrackers = 4;  // There will be 4 task tracker nodes

        System.setProperty("hadoop.log.dir", "build/test/logs");
        // Create the dir that holds hadoop-site.xml file
        // Delete if hadoop-site.xml exists already
        CONF_DIR.mkdirs();
        if(CONF_FILE.exists()) {
            CONF_FILE.delete();
        }

        // Builds and starts the mini dfs and mapreduce clusters
        Configuration config = new Configuration();
        config.set("yarn.scheduler.capacity.root.queues", "default");
        config.set("yarn.scheduler.capacity.root.default.capacity", "100");
        m_dfs = new MiniDFSCluster(config, dataNodes, true, null);
        m_fileSys = m_dfs.getFileSystem();
        m_dfs_conf = m_dfs.getConfiguration(0);

        //Create user home directory
        m_fileSys.mkdirs(m_fileSys.getWorkingDirectory());

        m_mr = new MiniMRYarnCluster("PigMiniCluster", taskTrackers);
        m_mr.init(m_dfs_conf);
        m_mr.start();

        // Write the necessary config info to hadoop-site.xml
        m_mr_conf = new Configuration(m_mr.getConfig());

        m_conf = m_mr_conf;
        m_conf.set("fs.default.name", m_dfs_conf.get("fs.default.name"));
        m_conf.unset(MRConfiguration.JOB_CACHE_FILES);

        m_conf.setInt(MRConfiguration.IO_SORT_MB, 200);
        m_conf.set(MRConfiguration.CHILD_JAVA_OPTS, "-Xmx512m");

        m_conf.setInt(MRConfiguration.SUMIT_REPLICATION, 2);
        m_conf.setInt(MRConfiguration.MAP_MAX_ATTEMPTS, 2);
        m_conf.setInt(MRConfiguration.REDUCE_MAX_ATTEMPTS, 2);
        m_conf.set("dfs.datanode.address", "0.0.0.0:0");
        m_conf.set("dfs.datanode.http.address", "0.0.0.0:0");
        m_conf.set("pig.jobcontrol.sleep", "100");
        m_conf.writeXml(new FileOutputStream(CONF_FILE));
        m_fileSys.copyFromLocalFile(new Path(CONF_FILE.getAbsoluteFile().toString()),
                new Path("/pigtest/conf/hadoop-site.xml"));
        DistributedCache.addFileToClassPath(new Path("/pigtest/conf/hadoop-site.xml"), m_conf);

        System.err.println("XXX: Setting fs.default.name to: " + m_dfs_conf.get("fs.default.name"));
        // Set the system properties needed by Pig
        System.setProperty("cluster", m_conf.get(MRConfiguration.JOB_TRACKER));
        //System.setProperty("namenode", m_dfs_conf.get("fs.default.name"));
        System.setProperty("namenode", m_conf.get("fs.default.name"));
        System.setProperty("junit.hadoop.conf", CONF_DIR.getPath());
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
 
Example #15
Source File: TotalSortMapReduce.java    From hiped2 with Apache License 2.0
/**
 * The MapReduce driver - setup and launch the job.
 *
 * @param args the command-line arguments
 * @return the process exit code
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {

  int numReducers = 2;

  Cli cli = Cli.builder().setArgs(args).addOptions(CliOpts.values()).build();
  int result = cli.runCmd();

  if (result != 0) {
    return result;
  }

  Path input = new Path(cli.getArgValueAsString(CliOpts.INPUT));
  Path partitionFile = new Path(cli.getArgValueAsString(CliOpts.PARTITION));
  Path output = new Path(cli.getArgValueAsString(CliOpts.OUTPUT));


  InputSampler.Sampler<Text, Text> sampler =
      new InputSampler.RandomSampler<Text, Text>(0.1, 10000, 10);

  Configuration conf = super.getConf();

  Job job = new Job(conf);
  job.setJarByClass(TotalSortMapReduce.class);

  job.setNumReduceTasks(numReducers);

  job.setInputFormatClass(KeyValueTextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  job.setPartitionerClass(TotalOrderPartitioner.class);

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);

  TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
  FileInputFormat.setInputPaths(job, input);
  FileOutputFormat.setOutputPath(job, output);

  InputSampler.writePartitionFile(job, sampler);

  URI partitionUri = new URI(partitionFile.toString() +
      "#" + "_sortPartitioning");
  // Add the partition file to the job's own configuration; new Job(conf) took a
  // copy of conf, so adding the cache file to conf here would never reach the
  // submitted job.
  DistributedCache.addCacheFile(partitionUri, job.getConfiguration());

  if (job.waitForCompletion(true)) {
    return 0;
  }
  return 1;
}
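The "#_sortPartitioning" fragment controls the symlink name under which the partition file appears in each task's working directory once it is localized.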
 
Example #16
Source File: JobContextImpl.java    From incubator-tez with Apache License 2.0
/**
 * Get the file entries in classpath as an array of Path
 */
public Path[] getFileClassPaths() {
  return DistributedCache.getFileClassPaths(conf);
}
 
Example #17
Source File: Job.java    From big-c with Apache License 2.0
/**
 * Set the given set of files
 * @param files The list of files that need to be localized
 */
public void setCacheFiles(URI[] files) {
  ensureState(JobState.DEFINE);
  DistributedCache.setCacheFiles(files, conf);
}
 
Example #18
Source File: Job.java    From big-c with Apache License 2.0
/**
 * Set the given set of archives
 * @param archives The list of archives that need to be localized
 */
public void setCacheArchives(URI[] archives) {
  ensureState(JobState.DEFINE);
  DistributedCache.setCacheArchives(archives, conf);
}
 
Example #19
Source File: Submitter.java    From big-c with Apache License 2.0
private static void setupPipesJob(JobConf conf) throws IOException {
  // default map output types to Text
  if (!getIsJavaMapper(conf)) {
    conf.setMapRunnerClass(PipesMapRunner.class);
    // Save the user's partitioner and hook in ours.
    setJavaPartitioner(conf, conf.getPartitionerClass());
    conf.setPartitionerClass(PipesPartitioner.class);
  }
  if (!getIsJavaReducer(conf)) {
    conf.setReducerClass(PipesReducer.class);
    if (!getIsJavaRecordWriter(conf)) {
      conf.setOutputFormat(NullOutputFormat.class);
    }
  }
  String textClassname = Text.class.getName();
  setIfUnset(conf, MRJobConfig.MAP_OUTPUT_KEY_CLASS, textClassname);
  setIfUnset(conf, MRJobConfig.MAP_OUTPUT_VALUE_CLASS, textClassname);
  setIfUnset(conf, MRJobConfig.OUTPUT_KEY_CLASS, textClassname);
  setIfUnset(conf, MRJobConfig.OUTPUT_VALUE_CLASS, textClassname);
  
  // Use PipesNonJavaInputFormat if necessary to handle progress reporting
  // from C++ RecordReaders ...
  if (!getIsJavaRecordReader(conf) && !getIsJavaMapper(conf)) {
    conf.setClass(Submitter.INPUT_FORMAT, 
                  conf.getInputFormat().getClass(), InputFormat.class);
    conf.setInputFormat(PipesNonJavaInputFormat.class);
  }
  
  String exec = getExecutable(conf);
  if (exec == null) {
    throw new IllegalArgumentException("No application program defined.");
  }
  // add default debug script only when executable is expressed as
  // <path>#<executable>
  if (exec.contains("#")) {
    // set default gdb commands for map and reduce task 
    String defScript = "$HADOOP_PREFIX/src/c++/pipes/debug/pipes-default-script";
    setIfUnset(conf, MRJobConfig.MAP_DEBUG_SCRIPT,defScript);
    setIfUnset(conf, MRJobConfig.REDUCE_DEBUG_SCRIPT,defScript);
  }
  URI[] fileCache = DistributedCache.getCacheFiles(conf);
  if (fileCache == null) {
    fileCache = new URI[1];
  } else {
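    // Shift the existing entries right by one; slot 0 is reserved for the executable URI.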
    URI[] tmp = new URI[fileCache.length+1];
    System.arraycopy(fileCache, 0, tmp, 1, fileCache.length);
    fileCache = tmp;
  }
  try {
    fileCache[0] = new URI(exec);
  } catch (URISyntaxException e) {
    IOException ie = new IOException("Problem parsing execable URI " + exec);
    ie.initCause(e);
    throw ie;
  }
  DistributedCache.setCacheFiles(fileCache, conf);
}
 
Example #20
Source File: Application.java    From big-c with Apache License 2.0
/**
 * Start the child process to handle the task for us.
 * @param conf the task's configuration
 * @param recordReader the fake record reader to update progress with
 * @param output the collector to send output to
 * @param reporter the reporter for the task
 * @param outputKeyClass the class of the output keys
 * @param outputValueClass the class of the output values
 * @throws IOException
 * @throws InterruptedException
 */
Application(JobConf conf, 
            RecordReader<FloatWritable, NullWritable> recordReader, 
            OutputCollector<K2,V2> output, Reporter reporter,
            Class<? extends K2> outputKeyClass,
            Class<? extends V2> outputValueClass
            ) throws IOException, InterruptedException {
  serverSocket = new ServerSocket(0);
  Map<String, String> env = new HashMap<String,String>();
  // add TMPDIR environment variable with the value of java.io.tmpdir
  env.put("TMPDIR", System.getProperty("java.io.tmpdir"));
  env.put(Submitter.PORT, 
          Integer.toString(serverSocket.getLocalPort()));
  
  //Add token to the environment if security is enabled
  Token<JobTokenIdentifier> jobToken = TokenCache.getJobToken(conf
      .getCredentials());
  // This password is used as shared secret key between this application and
  // child pipes process
  byte[]  password = jobToken.getPassword();
  String localPasswordFile = new File(".") + Path.SEPARATOR
      + "jobTokenPassword";
  writePasswordToLocalFile(localPasswordFile, password, conf);
  env.put("hadoop.pipes.shared.secret.location", localPasswordFile);
 
  List<String> cmd = new ArrayList<String>();
  String interpretor = conf.get(Submitter.INTERPRETOR);
  if (interpretor != null) {
    cmd.add(interpretor);
  }
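  // Submitter (previous example) placed the pipes executable at slot 0 of the cache files.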
  String executable = DistributedCache.getLocalCacheFiles(conf)[0].toString();
  if (!FileUtil.canExecute(new File(executable))) {
    // LinuxTaskController sets +x permissions on all distcache files already.
    // In case of DefaultTaskController, set permissions here.
    FileUtil.chmod(executable, "u+x");
  }
  cmd.add(executable);
  // wrap the command in a stdout/stderr capture
  // we are starting map/reduce task of the pipes job. this is not a cleanup
  // attempt. 
  TaskAttemptID taskid = 
    TaskAttemptID.forName(conf.get(MRJobConfig.TASK_ATTEMPT_ID));
  File stdout = TaskLog.getTaskLogFile(taskid, false, TaskLog.LogName.STDOUT);
  File stderr = TaskLog.getTaskLogFile(taskid, false, TaskLog.LogName.STDERR);
  long logLength = TaskLog.getTaskLogLength(conf);
  cmd = TaskLog.captureOutAndError(null, cmd, stdout, stderr, logLength,
                                   false);
  
  process = runClient(cmd, env);
  clientSocket = serverSocket.accept();
  
  String challenge = getSecurityChallenge();
  String digestToSend = createDigest(password, challenge);
  String digestExpected = createDigest(password, digestToSend);
  
  handler = new OutputHandler<K2, V2>(output, reporter, recordReader, 
      digestExpected);
  K2 outputKey = (K2)
    ReflectionUtils.newInstance(outputKeyClass, conf);
  V2 outputValue = (V2) 
    ReflectionUtils.newInstance(outputValueClass, conf);
  downlink = new BinaryProtocol<K1, V1, K2, V2>(clientSocket, handler, 
                                outputKey, outputValue, conf);
  
  downlink.authenticate(digestToSend, challenge);
  waitForAuthentication();
  LOG.debug("Authentication succeeded");
  downlink.start();
  downlink.setJobConf(conf);
}
 
Example #28
Source File: JobContextImpl.java    From tez with Apache License 2.0 4 votes vote down vote up
/**
 * Get the file entries in classpath as an array of Path
 */
public Path[] getFileClassPaths() {
  return DistributedCache.getFileClassPaths(conf);
}
 
Example #21
Source File: Job.java    From big-c with Apache License 2.0
/**
 * Add a file to be localized
 * @param uri The uri of the cache to be localized
 */
public void addCacheFile(URI uri) {
  ensureState(JobState.DEFINE);
  DistributedCache.addCacheFile(uri, conf);
}
 
Example #22
Source File: Job.java    From big-c with Apache License 2.0
/**
 * Add an archive to be localized
 * @param uri The uri of the cache to be localized
 */
public void addCacheArchive(URI uri) {
  ensureState(JobState.DEFINE);
  DistributedCache.addCacheArchive(uri, conf);
}
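 
The Job methods in Examples #17, #18, #21 and #22 are thin wrappers over the deprecated DistributedCache statics, so new code can stay on the Job API entirely. A minimal sketch; the paths, symlink names, and job name are illustrative:

  Job job = Job.getInstance(new Configuration(), "cache-demo");
  job.addCacheFile(URI.create("/data/lookup.txt#lookup"));
  job.addCacheArchive(URI.create("/data/bundle.zip#bundle"));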