Java Code Examples for org.apache.hadoop.filecache.DistributedCache#createSymlink()

The following examples show how to use org.apache.hadoop.filecache.DistributedCache#createSymlink(). Each example comes from an open-source project; the source file, project, and license are noted above the code.
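Before the per-project examples, here is a minimal sketch of the pattern they all share: a file is added to the distributed cache with a URI whose "#name" fragment names the symlink, and createSymlink() asks the framework to materialize that symlink in each task's working directory. The class name, HDFS path, and fragment below are hypothetical; on Hadoop 2.x createSymlink() is a deprecated no-op because symlinks are always created, while on the 1.x line the call is required for the fragment name to appear.

import java.net.URI;

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.mapred.JobConf;

public class CacheSymlinkSketch {
  // Hypothetical helper: ships a lookup file through the distributed cache and
  // exposes it to tasks as the symlink "lookup.txt".
  public static void addCachedFileWithSymlink(JobConf conf) throws Exception {
    URI cacheUri = new URI("hdfs://namenode:8020/data/lookup.txt#lookup.txt");
    DistributedCache.addCacheFile(cacheUri, conf);
    // Turn on symlink creation for all cached files in this job.
    DistributedCache.createSymlink(conf);
  }
}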
Example 1
Source File: TeraSort.java    From hadoop-book with Apache License 2.0
public int run(String[] args) throws Exception {
  LOG.info("starting");
  JobConf job = (JobConf) getConf();
  Path inputDir = new Path(args[0]);
  inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
  Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
  URI partitionUri = new URI(partitionFile.toString() +
                             "#" + TeraInputFormat.PARTITION_FILENAME);
  TeraInputFormat.setInputPaths(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.setJobName("TeraSort");
  job.setJarByClass(TeraSort.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setInputFormat(TeraInputFormat.class);
  job.setOutputFormat(TeraOutputFormat.class);
  job.setPartitionerClass(TotalOrderPartitioner.class);
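  // Ship the sampled partition file through the distributed cache and request a
  // symlink so each task can open it by the "#" fragment name in its working directory.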
  TeraInputFormat.writePartitionFile(job, partitionFile);
  DistributedCache.addCacheFile(partitionUri, job);
  DistributedCache.createSymlink(job);
  job.setInt("dfs.replication", 1);
  TeraOutputFormat.setFinalSync(job, true);
  JobClient.runJob(job);
  LOG.info("done");
  return 0;
}
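On the task side, the fragment name becomes a relative path in the task's working directory; TeraSort's partitioner relies on this to open TeraInputFormat.PARTITION_FILENAME locally. A minimal sketch of reading a symlinked cache file by that relative name (the class and method below are illustrative, not part of the example above):

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

public class ReadSymlinkedCacheFile {
  // symlinkName is the "#" fragment used on the cache URI, e.g. "_partition.lst".
  public static String readFirstLine(String symlinkName) throws IOException {
    try (BufferedReader reader = new BufferedReader(new FileReader(symlinkName))) {
      return reader.readLine();
    }
  }
}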
 
Example 2
Source File: TeraSort.java    From hadoop-gpu with Apache License 2.0
public int run(String[] args) throws Exception {
  LOG.info("starting");
  JobConf job = (JobConf) getConf();
  Path inputDir = new Path(args[0]);
  inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
  Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
  URI partitionUri = new URI(partitionFile.toString() +
                             "#" + TeraInputFormat.PARTITION_FILENAME);
  TeraInputFormat.setInputPaths(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.setJobName("TeraSort");
  job.setJarByClass(TeraSort.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setInputFormat(TeraInputFormat.class);
  job.setOutputFormat(TeraOutputFormat.class);
  job.setPartitionerClass(TotalOrderPartitioner.class);
  TeraInputFormat.writePartitionFile(job, partitionFile);
  DistributedCache.addCacheFile(partitionUri, job);
  DistributedCache.createSymlink(job);
  job.setInt("dfs.replication", 1);
  TeraOutputFormat.setFinalSync(job, true);
  JobClient.runJob(job);
  LOG.info("done");
  return 0;
}
 
Example 3
Source File: JobControlCompiler.java    From spork with Apache License 2.0
/**
 * If the url is not already in HDFS, copies it there from the local file system
 * before adding it to the distributed cache.
 * @param pigContext the pigContext
 * @param conf the job conf
 * @param url the url of the jar to be added to the distributed cache
 * @throws IOException
 */
@SuppressWarnings("deprecation")
private static void putJarOnClassPathThroughDistributedCache(
        PigContext pigContext,
        Configuration conf,
        URL url) throws IOException {

    // Turn on the symlink feature
    DistributedCache.createSymlink(conf);

    Path distCachePath = getExistingDistCacheFilePath(conf, url);
    if (distCachePath != null) {
        log.info("Jar file " + url + " already in DistributedCache as "
                + distCachePath + ". Not copying to hdfs and adding again");
        // Path already in dist cache
        if (!HadoopShims.isHadoopYARN()) {
            // MapReduce on YARN includes $PWD/* on the task classpath, which picks up all
            // *.jar files, so the jar does not need to be added to
            // mapreduce.job.classpath.files separately. On Hadoop 1.x, however, the path may
            // only be in 'mapred.cache.files' and not in 'mapreduce.job.classpath.files',
            // so add it there as well.
            DistributedCache.addFileToClassPath(distCachePath, conf, distCachePath.getFileSystem(conf));
        }
    }
    else {
        // REGISTER always copies the jar file locally; see PigServer.registerJar()
        Path pathInHDFS = shipToHDFS(pigContext, conf, url);
        DistributedCache.addFileToClassPath(pathInHDFS, conf, FileSystem.get(conf));
        log.info("Added jar " + url + " to DistributedCache through " + pathInHDFS);
    }

}
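Note on the design: the symlink feature is switched on once, up front, so that every file the job later ships through the distributed cache (registered jars, UDF resources, scripts) also appears by name in the task working directory. addFileToClassPath() then records the jar both in the cache file list ('mapred.cache.files') and in the job classpath list referenced in the comments above, so tasks pick it up on their classpath.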
 
Example 4
Source File: TeraSort.java    From RDFS with Apache License 2.0
public int run(String[] args) throws Exception {
  LOG.info("starting");
  JobConf job = (JobConf) getConf();
  Path inputDir = new Path(args[0]);
  inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
  Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
  URI partitionUri = new URI(partitionFile.toString() +
                             "#" + TeraInputFormat.PARTITION_FILENAME);
  TeraInputFormat.setInputPaths(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.setJobName("TeraSort");
  job.setJarByClass(TeraSort.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setInputFormat(TeraInputFormat.class);
  job.setOutputFormat(TeraOutputFormat.class);
  job.setPartitionerClass(TotalOrderPartitioner.class);
  TeraInputFormat.writePartitionFile(job, partitionFile);
  DistributedCache.addCacheFile(partitionUri, job);
  DistributedCache.createSymlink(job);
  job.setInt("dfs.replication", 1);
  TeraOutputFormat.setFinalSync(job, true);
  long startTime = System.currentTimeMillis();
  JobClient.runJob(job);
  long endTime = System.currentTimeMillis();
  System.out.println((float)(endTime-startTime)/1000);
  LOG.info("done");
  return 0;
}
 
Example 5
Source File: Submitter.java    From RDFS with Apache License 2.0
private static void setupPipesJob(JobConf conf) throws IOException {
  // default map output types to Text
  if (!getIsJavaMapper(conf)) {
    conf.setMapRunnerClass(PipesMapRunner.class);
    // Save the user's partitioner and hook in ours.
    setJavaPartitioner(conf, conf.getPartitionerClass());
    conf.setPartitionerClass(PipesPartitioner.class);
  }
  if (!getIsJavaReducer(conf)) {
    conf.setReducerClass(PipesReducer.class);
    if (!getIsJavaRecordWriter(conf)) {
      conf.setOutputFormat(NullOutputFormat.class);
    }
  }
  String textClassname = Text.class.getName();
  setIfUnset(conf, "mapred.mapoutput.key.class", textClassname);
  setIfUnset(conf, "mapred.mapoutput.value.class", textClassname);
  setIfUnset(conf, "mapred.output.key.class", textClassname);
  setIfUnset(conf, "mapred.output.value.class", textClassname);
  
  // Use PipesNonJavaInputFormat if necessary to handle progress reporting
  // from C++ RecordReaders ...
  if (!getIsJavaRecordReader(conf) && !getIsJavaMapper(conf)) {
    conf.setClass("mapred.pipes.user.inputformat", 
                  conf.getInputFormat().getClass(), InputFormat.class);
    conf.setInputFormat(PipesNonJavaInputFormat.class);
  }
  
  String exec = getExecutable(conf);
  if (exec == null) {
    throw new IllegalArgumentException("No application program defined.");
  }
  // add default debug script only when executable is expressed as
  // <path>#<executable>
  if (exec.contains("#")) {
    DistributedCache.createSymlink(conf);
    // set default gdb commands for map and reduce task 
    String defScript = "$HADOOP_HOME/src/c++/pipes/debug/pipes-default-script";
    setIfUnset(conf,"mapred.map.task.debug.script",defScript);
    setIfUnset(conf,"mapred.reduce.task.debug.script",defScript);
  }
  URI[] fileCache = DistributedCache.getCacheFiles(conf);
  if (fileCache == null) {
    fileCache = new URI[1];
  } else {
    URI[] tmp = new URI[fileCache.length+1];
    System.arraycopy(fileCache, 0, tmp, 1, fileCache.length);
    fileCache = tmp;
  }
  try {
    fileCache[0] = new URI(exec);
  } catch (URISyntaxException e) {
    IOException ie = new IOException("Problem parsing execable URI " + exec);
    ie.initCause(e);
    throw ie;
  }
  DistributedCache.setCacheFiles(fileCache, conf);
}
 
Example 6
Source File: TestMiniMRMapRedDebugScript.java    From RDFS with Apache License 2.0
/**
 * Launches a map task that is expected to fail and runs the debug script on the failed task.
 * @param conf configuration for the mapred job
 * @param inDir input path
 * @param outDir output path
 * @param debugDir debug directory where the script is present
 * @param debugScript the command that executes the script
 * @param input input text
 * @return the output of the debug script
 * @throws IOException
 */
public String launchFailMapAndDebug(JobConf conf,
                                    Path inDir,
                                    Path outDir,
                                    Path debugDir,
                                    String debugScript,
                                    String input)
throws IOException {

  // set up the input file system and write input text.
  FileSystem inFs = inDir.getFileSystem(conf);
  FileSystem outFs = outDir.getFileSystem(conf);
  outFs.delete(outDir, true);
  if (!inFs.mkdirs(inDir)) {
    throw new IOException("Mkdirs failed to create " + inDir.toString());
  }
  {
    // write input into input file
    DataOutputStream file = inFs.create(new Path(inDir, "part-0"));
    file.writeBytes(input);
    file.close();
  }

  // configure the mapred Job for failing map task.
  conf.setJobName("failmap");
  conf.setMapperClass(MapClass.class);        
  conf.setReducerClass(IdentityReducer.class);
  conf.setNumMapTasks(1);
  conf.setNumReduceTasks(0);
  conf.setMapDebugScript(debugScript);
  FileInputFormat.setInputPaths(conf, inDir);
  FileOutputFormat.setOutputPath(conf, outDir);
  String TEST_ROOT_DIR = new Path(System.getProperty("test.build.data",
                                    "/tmp")).toString().replace(' ', '+');
  conf.set("test.build.data", TEST_ROOT_DIR);

  // copy debug script to cache from local file system.
  FileSystem debugFs = debugDir.getFileSystem(conf);
  Path scriptPath = new Path(debugDir,"testscript.txt");
  Path cachePath = new Path("/cacheDir");
  if (!debugFs.mkdirs(cachePath)) {
    throw new IOException("Mkdirs failed to create " + cachePath.toString());
  }
  debugFs.copyFromLocalFile(scriptPath,cachePath);
  
  URI uri = debugFs.getUri().resolve(cachePath+"/testscript.txt#testscript");
  DistributedCache.createSymlink(conf);
  DistributedCache.addCacheFile(uri, conf);

  RunningJob job =null;
  // run the job. It will fail with IOException.
  try {
    job = new JobClient(conf).submitJob(conf);
  } catch (IOException e) {
  	LOG.info("Running Job failed", e);
  }

  JobID jobId = job.getID();
  // construct the task id of first map task of failmap
  TaskAttemptID taskId = new TaskAttemptID(new TaskID(jobId,true, 0), 0);
  // wait for the job to finish.
  while (!job.isComplete()) ;
  
  // return the output of debugout log.
  return readTaskLog(TaskLog.LogName.DEBUGOUT,taskId, false);
}
 
Example 7
Source File: TestTaskLogsMonitor.java    From RDFS with Apache License 2.0
/**
 * Test the truncation of DEBUGOUT file by {@link TaskLogsMonitor}
 * @throws IOException 
 */
@Test
public void testDebugLogsTruncationWithMiniMR() throws IOException {

  MiniMRCluster mr = null;
  try {
    JobConf clusterConf = new JobConf();
    clusterConf.setLong(TaskTracker.MAP_USERLOG_RETAIN_SIZE, 10000L);
    clusterConf.setLong(TaskTracker.REDUCE_USERLOG_RETAIN_SIZE, 10000L);
    mr = new MiniMRCluster(1, "file:///", 3, null, null, clusterConf);

    JobConf conf = mr.createJobConf();

    Path inDir = new Path(TEST_ROOT_DIR + "/input");
    Path outDir = new Path(TEST_ROOT_DIR + "/output");
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(outDir)) {
      fs.delete(outDir, true);
    }
    if (!fs.exists(inDir)) {
      fs.mkdirs(inDir);
    }
    String input = "The quick brown fox jumped over the lazy dog";
    DataOutputStream file = fs.create(new Path(inDir, "part-0"));
    file.writeBytes(input);
    file.close();

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(conf, inDir);
    FileOutputFormat.setOutputPath(conf, outDir);
    conf.setNumMapTasks(1);
    conf.setMaxMapAttempts(1);
    conf.setNumReduceTasks(0);
    conf.setMapperClass(TestMiniMRMapRedDebugScript.MapClass.class);

    // copy debug script to cache from local file system.
    Path scriptPath = new Path(TEST_ROOT_DIR, "debug-script.txt");
    String debugScriptContent =
        "for ((i=0;i<1000;i++)); " + "do "
            + "echo \"Lots of logs! Lots of logs! "
            + "Waiting to be truncated! Lots of logs!\";" + "done";
    DataOutputStream scriptFile = fs.create(scriptPath);
    scriptFile.writeBytes(debugScriptContent);
    scriptFile.close();
    new File(scriptPath.toUri().getPath()).setExecutable(true);

    URI uri = scriptPath.toUri();
    DistributedCache.createSymlink(conf);
    DistributedCache.addCacheFile(uri, conf);
    conf.setMapDebugScript(scriptPath.toUri().getPath());

    RunningJob job = null;
    try {
      JobClient jc = new JobClient(conf);
      job = jc.submitJob(conf);
      try {
        jc.monitorAndPrintJob(conf, job);
      } catch (InterruptedException e) {
        //
      }
    } catch (IOException ioe) {
    } finally{
      for (TaskCompletionEvent tce : job.getTaskCompletionEvents(0)) {
        File debugOutFile =
            TaskLog.getTaskLogFile(tce.getTaskAttemptId(),
                TaskLog.LogName.DEBUGOUT);
        if (debugOutFile.exists()) {
          long length = debugOutFile.length();
          assertTrue("DEBUGOUT log file length for "
              + tce.getTaskAttemptId() + " is " + length
              + " and not =10000", length == 10000);
        }
      }
    }
  } finally { 
    if (mr != null) {
      mr.shutdown();
    }
  }
}
 
Example 8
Source File: TestMiniMRMapRedDebugScript.java    From hadoop-gpu with Apache License 2.0
/**
 * Launches a map task that is expected to fail and runs the debug script on the failed task.
 * @param conf configuration for the mapred job
 * @param inDir input path
 * @param outDir output path
 * @param debugDir debug directory where the script is present
 * @param debugScript the command that executes the script
 * @param input input text
 * @return the output of the debug script
 * @throws IOException
 */
public String launchFailMapAndDebug(JobConf conf,
                                    Path inDir,
                                    Path outDir,
                                    Path debugDir,
                                    String debugScript,
                                    String input)
throws IOException {

  // set up the input file system and write input text.
  FileSystem inFs = inDir.getFileSystem(conf);
  FileSystem outFs = outDir.getFileSystem(conf);
  outFs.delete(outDir, true);
  if (!inFs.mkdirs(inDir)) {
    throw new IOException("Mkdirs failed to create " + inDir.toString());
  }
  {
    // write input into input file
    DataOutputStream file = inFs.create(new Path(inDir, "part-0"));
    file.writeBytes(input);
    file.close();
  }

  // configure the mapred Job for failing map task.
  conf.setJobName("failmap");
  conf.setMapperClass(MapClass.class);        
  conf.setReducerClass(IdentityReducer.class);
  conf.setNumMapTasks(1);
  conf.setNumReduceTasks(0);
  conf.setMapDebugScript(debugScript);
  FileInputFormat.setInputPaths(conf, inDir);
  FileOutputFormat.setOutputPath(conf, outDir);
  String TEST_ROOT_DIR = new Path(System.getProperty("test.build.data",
                                    "/tmp")).toString().replace(' ', '+');
  conf.set("test.build.data", TEST_ROOT_DIR);

  // copy debug script to cache from local file system.
  FileSystem debugFs = debugDir.getFileSystem(conf);
  Path scriptPath = new Path(debugDir,"testscript.txt");
  Path cachePath = new Path("/cacheDir");
  if (!debugFs.mkdirs(cachePath)) {
    throw new IOException("Mkdirs failed to create " + cachePath.toString());
  }
  debugFs.copyFromLocalFile(scriptPath,cachePath);
  
  URI uri = debugFs.getUri().resolve(cachePath+"/testscript.txt#testscript");
  DistributedCache.createSymlink(conf);
  DistributedCache.addCacheFile(uri, conf);

  RunningJob job =null;
  // run the job. It will fail with IOException.
  try {
    job = new JobClient(conf).submitJob(conf);
  } catch (IOException e) {
  	LOG.info("Running Job failed", e);
  }

  JobID jobId = job.getID();
  // construct the task id of first map task of failmap
  TaskAttemptID taskId = new TaskAttemptID(new TaskID(jobId,true, 0), 0);
  // wait for the job to finish.
  while (!job.isComplete()) ;
  
  // return the output of debugout log.
  return readTaskLog(TaskLog.LogName.DEBUGOUT,taskId, false);
}