Java Code Examples for org.apache.hadoop.mapred.JobConf#setNumReduceTasks()

The following examples show how to use org.apache.hadoop.mapred.JobConf#setNumReduceTasks(). They are drawn from a range of open-source projects; the source file and originating project are noted above each example.
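Before the project-specific examples, the snippet below is a minimal, self-contained sketch of the pattern most of them follow: build a JobConf, configure the mapper, reducer, and output classes, pick the number of reduce tasks with setNumReduceTasks() (0 makes the job map-only), and submit it with JobClient.runJob(). The class name NumReduceTasksExample and the use of IdentityMapper/IdentityReducer are illustrative choices, not code taken from any of the projects below.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class NumReduceTasksExample {
  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf(NumReduceTasksExample.class);
    job.setJobName("num-reduce-tasks-example");

    // Input and output paths are taken from the command line.
    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);

    // Identity mapper/reducer simply pass records through.
    job.setMapperClass(IdentityMapper.class);
    job.setReducerClass(IdentityReducer.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    // Request 4 reduce tasks; setNumReduceTasks(0) would skip the reduce phase entirely.
    job.setNumReduceTasks(4);

    JobClient.runJob(job);
  }
}

Setting the count to 0 writes map output straight through the output format, which is what the TeraGen, DistCp, and RandomTextWriter examples below rely on.
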
Example 1
Source File: NNBench.java    From hadoop with Apache License 2.0
/**
 * Run the test
 * 
 * @throws IOException on error
 */
public static void runTests() throws IOException {
  config.setLong("io.bytes.per.checksum", bytesPerChecksum);
  
  JobConf job = new JobConf(config, NNBench.class);

  job.setJobName("NNBench-" + operation);
  FileInputFormat.setInputPaths(job, new Path(baseDir, CONTROL_DIR_NAME));
  job.setInputFormat(SequenceFileInputFormat.class);
  
  // Explicitly set number of max map attempts to 1.
  job.setMaxMapAttempts(1);
  
  // Explicitly turn off speculative execution
  job.setSpeculativeExecution(false);

  job.setMapperClass(NNBenchMapper.class);
  job.setReducerClass(NNBenchReducer.class);

  FileOutputFormat.setOutputPath(job, new Path(baseDir, OUTPUT_DIR_NAME));
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setNumReduceTasks((int) numberOfReduces);
  JobClient.runJob(job);
}
 
Example 2
Source File: TeraGen.java    From hadoop-gpu with Apache License 2.0
/**
 * @param args the cli arguments
 */
public int run(String[] args) throws IOException {
  JobConf job = (JobConf) getConf();
  setNumberOfRows(job, Long.parseLong(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.setJobName("TeraGen");
  job.setJarByClass(TeraGen.class);
  job.setMapperClass(SortGenMapper.class);
  job.setNumReduceTasks(0);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setInputFormat(RangeInputFormat.class);
  job.setOutputFormat(TeraOutputFormat.class);
  JobClient.runJob(job);
  return 0;
}
 
Example 3
Source File: SliveTest.java    From big-c with Apache License 2.0
/**
 * Sets up a job conf for the given job using the given config object. Ensures
 * that the correct input format is set, along with the mapper and reducer
 * classes, the input and output key and value classes, and any other job
 * configuration.
 * 
 * @param config the config extractor that supplies the job configuration
 * @return JobConf representing the job to be run
 * @throws IOException
 */
private JobConf getJob(ConfigExtractor config) throws IOException {
  JobConf job = new JobConf(config.getConfig(), SliveTest.class);
  job.setInputFormat(DummyInputFormat.class);
  FileOutputFormat.setOutputPath(job, config.getOutputPath());
  job.setMapperClass(SliveMapper.class);
  job.setPartitionerClass(SlivePartitioner.class);
  job.setReducerClass(SliveReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setOutputFormat(TextOutputFormat.class);
  TextOutputFormat.setCompressOutput(job, false);
  job.setNumReduceTasks(config.getReducerAmount());
  job.setNumMapTasks(config.getMapAmount());
  return job;
}
 
Example 4
Source File: JobControlTestUtils.java    From RDFS with Apache License 2.0
/**
 * Creates a simple copy job.
 * 
 * @param indirs List of input directories.
 * @param outdir Output directory.
 * @return JobConf initialised for a simple copy job.
 * @throws Exception If an error occurs creating job configuration.
 */
static JobConf createCopyJob(List<Path> indirs, Path outdir) throws Exception {

  Configuration defaults = new Configuration();
  JobConf theJob = new JobConf(defaults, TestJobControl.class);
  theJob.setJobName("DataMoveJob");

  FileInputFormat.setInputPaths(theJob, indirs.toArray(new Path[0]));
  theJob.setMapperClass(DataCopy.class);
  FileOutputFormat.setOutputPath(theJob, outdir);
  theJob.setOutputKeyClass(Text.class);
  theJob.setOutputValueClass(Text.class);
  theJob.setReducerClass(DataCopy.class);
  theJob.setNumMapTasks(12);
  theJob.setNumReduceTasks(4);
  return theJob;
}
 
Example 5
Source File: TeraValidate.java    From hadoop-book with Apache License 2.0
public int run(String[] args) throws Exception {
  JobConf job = (JobConf) getConf();
  TeraInputFormat.setInputPaths(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.setJobName("TeraValidate");
  job.setJarByClass(TeraValidate.class);
  job.setMapperClass(ValidateMapper.class);
  job.setReducerClass(ValidateReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  // force a single reducer
  job.setNumReduceTasks(1);
  // force a single split 
  job.setLong("mapred.min.split.size", Long.MAX_VALUE);
  job.setInputFormat(TeraInputFormat.class);
  JobClient.runJob(job);
  return 0;
}
 
Example 6
Source File: TestTableMapReduceUtil.java    From hbase with Apache License 2.0
@Test
@SuppressWarnings("deprecation")
public void shoudBeValidMapReduceEvaluation() throws Exception {
  Configuration cfg = UTIL.getConfiguration();
  JobConf jobConf = new JobConf(cfg);
  try {
    jobConf.setJobName("process row task");
    jobConf.setNumReduceTasks(1);
    TableMapReduceUtil.initTableMapJob(TABLE_NAME, new String(COLUMN_FAMILY),
        ClassificatorMapper.class, ImmutableBytesWritable.class, Put.class,
        jobConf);
    TableMapReduceUtil.initTableReduceJob(TABLE_NAME,
        ClassificatorRowReduce.class, jobConf);
    RunningJob job = JobClient.runJob(jobConf);
    assertTrue(job.isSuccessful());
  } finally {
    if (jobConf != null)
      FileUtil.fullyDelete(new File(jobConf.get("hadoop.tmp.dir")));
  }
}
 
Example 7
Source File: TestTableMapReduceUtil.java    From hbase with Apache License 2.0
/**
 * Check that the given number of reduce tasks for the given job configuration
 * does not exceed the number of regions for the given table.
 */
@Test
public void shouldNumberOfReduceTaskNotExceedNumberOfRegionsForGivenTable()
    throws IOException {
  Assert.assertNotNull(presidentsTable);
  Configuration cfg = UTIL.getConfiguration();
  JobConf jobConf = new JobConf(cfg);
  TableMapReduceUtil.setNumReduceTasks(TABLE_NAME, jobConf);
  TableMapReduceUtil.limitNumReduceTasks(TABLE_NAME, jobConf);
  TableMapReduceUtil.setScannerCaching(jobConf, 100);
  assertEquals(1, jobConf.getNumReduceTasks());
  assertEquals(100, jobConf.getInt("hbase.client.scanner.caching", 0));

  jobConf.setNumReduceTasks(10);
  TableMapReduceUtil.setNumMapTasks(TABLE_NAME, jobConf);
  TableMapReduceUtil.limitNumReduceTasks(TABLE_NAME, jobConf);
  assertEquals(1, jobConf.getNumReduceTasks());
}
 
Example 8
Source File: TeraGen.java    From RDFS with Apache License 2.0
/**
 * @param args the cli arguments
 */
public int run(String[] args) throws IOException {
  JobConf job = (JobConf) getConf();
  setNumberOfRows(job, Long.parseLong(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.setJobName("TeraGen");
  job.setJarByClass(TeraGen.class);
  job.setMapperClass(SortGenMapper.class);
  job.setNumReduceTasks(0);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setInputFormat(RangeInputFormat.class);
  job.setOutputFormat(TeraOutputFormat.class);
  JobClient.runJob(job);
  return 0;
}
 
Example 9
Source File: DistCp.java    From RDFS with Apache License 2.0
private static JobConf createJobConfForCopyByChunk(Configuration conf) {
  JobConf jobconf = new JobConf(conf, DistCp.class);
  jobconf.setJobName(NAME);

  // turn off speculative execution, because DFS doesn't handle
  // multiple writers to the same file.
  jobconf.setMapSpeculativeExecution(false);

  jobconf.setOutputKeyClass(Text.class);
  jobconf.setOutputValueClass(Text.class);

  jobconf.setInputFormat(CopyByChunkInputFormat.class);
  jobconf.setMapperClass(CopyFilesByChunkMapper.class);     
    
  jobconf.setNumReduceTasks(0);
  return jobconf;
}
 
Example 10
Source File: UtilsForTests.java    From RDFS with Apache License 2.0
/**
 * Configure a waiting job
 */
static void configureWaitingJobConf(JobConf jobConf, Path inDir,
                                    Path outputPath, int numMaps, int numRed,
                                    String jobName, String mapSignalFilename,
                                    String redSignalFilename)
throws IOException {
  jobConf.setJobName(jobName);
  jobConf.setInputFormat(NonSplitableSequenceFileInputFormat.class);
  jobConf.setOutputFormat(SequenceFileOutputFormat.class);
  FileInputFormat.setInputPaths(jobConf, inDir);
  FileOutputFormat.setOutputPath(jobConf, outputPath);
  jobConf.setMapperClass(UtilsForTests.HalfWaitingMapper.class);
  jobConf.setReducerClass(IdentityReducer.class);
  jobConf.setOutputKeyClass(BytesWritable.class);
  jobConf.setOutputValueClass(BytesWritable.class);
  jobConf.setInputFormat(RandomInputFormat.class);
  jobConf.setNumMapTasks(numMaps);
  jobConf.setNumReduceTasks(numRed);
  jobConf.setJar("build/test/testjar/testjob.jar");
  jobConf.set(getTaskSignalParameter(true), mapSignalFilename);
  jobConf.set(getTaskSignalParameter(false), redSignalFilename);
}
 
Example 11
Source File: TestMROldApiJobs.java    From hadoop with Apache License 2.0
static boolean runJob(JobConf conf, Path inDir, Path outDir, int numMaps, 
                         int numReds) throws IOException, InterruptedException {

  FileSystem fs = FileSystem.get(conf);
  if (fs.exists(outDir)) {
    fs.delete(outDir, true);
  }
  if (!fs.exists(inDir)) {
    fs.mkdirs(inDir);
  }
  String input = "The quick brown fox\n" + "has many silly\n"
      + "red fox sox\n";
  for (int i = 0; i < numMaps; ++i) {
    DataOutputStream file = fs.create(new Path(inDir, "part-" + i));
    file.writeBytes(input);
    file.close();
  }

  DistributedCache.addFileToClassPath(TestMRJobs.APP_JAR, conf, fs);
  conf.setOutputCommitter(CustomOutputCommitter.class);
  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputKeyClass(LongWritable.class);
  conf.setOutputValueClass(Text.class);

  FileInputFormat.setInputPaths(conf, inDir);
  FileOutputFormat.setOutputPath(conf, outDir);
  conf.setNumMapTasks(numMaps);
  conf.setNumReduceTasks(numReds);

  JobClient jobClient = new JobClient(conf);
  
  RunningJob job = jobClient.submitJob(conf);
  return jobClient.monitorAndPrintJob(conf, job);
}
 
Example 12
Source File: DistCh.java    From hadoop with Apache License 2.0
private static JobConf createJobConf(Configuration conf) {
  JobConf jobconf = new JobConf(conf, DistCh.class);
  jobconf.setJobName(NAME);
  jobconf.setMapSpeculativeExecution(false);

  jobconf.setInputFormat(ChangeInputFormat.class);
  jobconf.setOutputKeyClass(Text.class);
  jobconf.setOutputValueClass(Text.class);

  jobconf.setMapperClass(ChangeFilesMapper.class);
  jobconf.setNumReduceTasks(0);
  return jobconf;
}
 
Example 13
Source File: UpdateColumnJob.java    From indexr with Apache License 2.0
public boolean doRun(Config upcolConfig) throws Exception {
    JobConf jobConf = new JobConf(getConf(), UpdateColumnJob.class);
    jobConf.setKeepFailedTaskFiles(false);
    jobConf.setNumReduceTasks(0);
    String jobName = String.format("indexr-upcol-%s-%s-%s",
            upcolConfig.table,
            LocalDateTime.now().format(timeFormatter),
            RandomStringUtils.randomAlphabetic(5));
    jobConf.setJobName(jobName);
    jobConf.set(CONFKEY, JsonUtil.toJson(upcolConfig));
    Path workDir = new Path(jobConf.getWorkingDirectory(), jobName);
    jobConf.setWorkingDirectory(workDir);

    Job job = Job.getInstance(jobConf);
    job.setInputFormatClass(SegmentInputFormat.class);
    job.setMapperClass(UpColSegmentMapper.class);
    job.setJarByClass(UpdateColumnJob.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setMapSpeculativeExecution(false);
    job.setOutputFormatClass(UpColSegmentOutputFormat.class);

    job.submit();
    boolean ok = job.waitForCompletion(true);
    if (!ok) {
        TaskReport[] reports = job.getTaskReports(TaskType.MAP);
        if (reports != null) {
            for (TaskReport report : reports) {
                log.error("Error in task [%s] : %s", report.getTaskId(), Arrays.toString(report.getDiagnostics()));
            }
        }
    }
    return ok;
}
 
Example 14
Source File: TestMapProcessor.java    From incubator-tez with Apache License 2.0
public void setUpJobConf(JobConf job) {
  job.set(TezRuntimeFrameworkConfigs.LOCAL_DIRS, workDir.toString());
  job.set(MRConfig.LOCAL_DIR, workDir.toString());
  job.setClass(
      Constants.TEZ_RUNTIME_TASK_OUTPUT_MANAGER,
      TezLocalTaskOutputFiles.class, 
      TezTaskOutput.class);
  job.set(TezJobConfig.TEZ_RUNTIME_PARTITIONER_CLASS, MRPartitioner.class.getName());
  job.setNumReduceTasks(1);
}
 
Example 15
Source File: TestMapProcessor.java    From tez with Apache License 2.0
public void setUpJobConf(JobConf job) {
  job.set(TezRuntimeFrameworkConfigs.LOCAL_DIRS, workDir.toString());
  job.set(MRConfig.LOCAL_DIR, workDir.toString());
  job.setClass(
      Constants.TEZ_RUNTIME_TASK_OUTPUT_MANAGER,
      TezTaskOutputFiles.class,
      TezTaskOutput.class);
  job.set(TezRuntimeConfiguration.TEZ_RUNTIME_PARTITIONER_CLASS, MRPartitioner.class.getName());
  job.setNumReduceTasks(1);
}
 
Example 16
Source File: CloudBurst.java    From emr-sample-apps with Apache License 2.0
public static RunningJob alignall(String refpath, 
		                          String qrypath,
		                          String outpath,
		                          int MIN_READ_LEN,
		                          int MAX_READ_LEN,
		                          int K,
		                          int ALLOW_DIFFERENCES,
		                          boolean FILTER_ALIGNMENTS,
		                          int NUM_MAP_TASKS,
		                          int NUM_REDUCE_TASKS,
		                          int BLOCK_SIZE,
		                          int REDUNDANCY) throws IOException, Exception
{
	int SEED_LEN   = MIN_READ_LEN / (K+1);
	int FLANK_LEN  = MAX_READ_LEN-SEED_LEN+K;
	
	System.out.println("refath: "            + refpath);
	System.out.println("qrypath: "           + qrypath);
	System.out.println("outpath: "           + outpath);
	System.out.println("MIN_READ_LEN: "      + MIN_READ_LEN);
	System.out.println("MAX_READ_LEN: "      + MAX_READ_LEN);
	System.out.println("K: "                 + K);
	System.out.println("SEED_LEN: "          + SEED_LEN);
	System.out.println("FLANK_LEN: "         + FLANK_LEN);
	System.out.println("ALLOW_DIFFERENCES: " + ALLOW_DIFFERENCES);
	System.out.println("FILTER_ALIGNMENTS: " + FILTER_ALIGNMENTS);
	System.out.println("NUM_MAP_TASKS: "     + NUM_MAP_TASKS);
	System.out.println("NUM_REDUCE_TASKS: "  + NUM_REDUCE_TASKS);
	System.out.println("BLOCK_SIZE: "        + BLOCK_SIZE);
	System.out.println("REDUNDANCY: "        + REDUNDANCY);
	
	JobConf conf = new JobConf(MerReduce.class);
	conf.setJobName("CloudBurst");
	conf.setNumMapTasks(NUM_MAP_TASKS);
	conf.setNumReduceTasks(NUM_REDUCE_TASKS);
	
	FileInputFormat.addInputPath(conf, new Path(refpath));
	FileInputFormat.addInputPath(conf, new Path(qrypath));

	conf.set("refpath",           refpath);
	conf.set("qrypath",           qrypath);
	conf.set("MIN_READ_LEN",      Integer.toString(MIN_READ_LEN));
	conf.set("MAX_READ_LEN",      Integer.toString(MAX_READ_LEN));
	conf.set("K",                 Integer.toString(K));
	conf.set("SEED_LEN",          Integer.toString(SEED_LEN));
	conf.set("FLANK_LEN",         Integer.toString(FLANK_LEN));
	conf.set("ALLOW_DIFFERENCES", Integer.toString(ALLOW_DIFFERENCES));
	conf.set("BLOCK_SIZE",        Integer.toString(BLOCK_SIZE));
	conf.set("REDUNDANCY",        Integer.toString(REDUNDANCY));
	conf.set("FILTER_ALIGNMENTS", (FILTER_ALIGNMENTS ? "1" : "0"));
	
	conf.setMapperClass(MapClass.class);
	
	conf.setInputFormat(SequenceFileInputFormat.class);			
	conf.setMapOutputKeyClass(BytesWritable.class);
	conf.setMapOutputValueClass(BytesWritable.class);
	
	conf.setReducerClass(ReduceClass.class);		
	conf.setOutputKeyClass(IntWritable.class);
	conf.setOutputValueClass(BytesWritable.class);
	conf.setOutputFormat(SequenceFileOutputFormat.class);

	Path oPath = new Path(outpath);
	FileOutputFormat.setOutputPath(conf, oPath);
	System.err.println("  Removing old results");
	FileSystem.get(conf).delete(oPath);
	
	RunningJob rj = JobClient.runJob(conf);
	System.err.println("CloudBurst Finished");
	return rj;
}
 
Example 17
Source File: RandomTextWriter.java    From hadoop-gpu with Apache License 2.0
/**
 * This is the main routine for launching a distributed random write job.
 * It runs 10 maps/node and each node writes 1 gig of data to a DFS file.
 * The reduce doesn't do anything.
 * 
 * @throws IOException 
 */
public int run(String[] args) throws Exception {    
  if (args.length == 0) {
    return printUsage();    
  }
  
  JobConf job = new JobConf(getConf());
  
  job.setJarByClass(RandomTextWriter.class);
  job.setJobName("random-text-writer");
  
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  
  job.setInputFormat(RandomWriter.RandomInputFormat.class);
  job.setMapperClass(Map.class);        
  
  JobClient client = new JobClient(job);
  ClusterStatus cluster = client.getClusterStatus();
  int numMapsPerHost = job.getInt("test.randomtextwrite.maps_per_host", 10);
  long numBytesToWritePerMap = job.getLong("test.randomtextwrite.bytes_per_map",
                                           1*1024*1024*1024);
  if (numBytesToWritePerMap == 0) {
    System.err.println("Cannot have test.randomtextwrite.bytes_per_map set to 0");
    return -2;
  }
  long totalBytesToWrite = job.getLong("test.randomtextwrite.total_bytes", 
       numMapsPerHost*numBytesToWritePerMap*cluster.getTaskTrackers());
  int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
  if (numMaps == 0 && totalBytesToWrite > 0) {
    numMaps = 1;
    job.setLong("test.randomtextwrite.bytes_per_map", totalBytesToWrite);
  }
  
  Class<? extends OutputFormat> outputFormatClass = 
    SequenceFileOutputFormat.class;
  List<String> otherArgs = new ArrayList<String>();
  for(int i=0; i < args.length; ++i) {
    try {
      if ("-outFormat".equals(args[i])) {
        outputFormatClass = 
          Class.forName(args[++i]).asSubclass(OutputFormat.class);
      } else {
        otherArgs.add(args[i]);
      }
    } catch (ArrayIndexOutOfBoundsException except) {
      System.out.println("ERROR: Required parameter missing from " +
          args[i-1]);
      return printUsage(); // exits
    }
  }

  job.setOutputFormat(outputFormatClass);
  FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(0)));
  
  job.setNumMapTasks(numMaps);
  System.out.println("Running " + numMaps + " maps.");
  
  // reducer NONE
  job.setNumReduceTasks(0);
  
  Date startTime = new Date();
  System.out.println("Job started: " + startTime);
  JobClient.runJob(job);
  Date endTime = new Date();
  System.out.println("Job ended: " + endTime);
  System.out.println("The job took " + 
                     (endTime.getTime() - startTime.getTime()) /1000 + 
                     " seconds.");
  
  return 0;
}
 
Example 18
Source File: FreeGenerator.java    From anthelion with Apache License 2.0
public int run(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] [-normalize]");
    System.err.println("\tinputDir\tinput directory containing one or more input files.");
    System.err.println("\t\tEach text file contains a list of URLs, one URL per line");
    System.err.println("\tsegmentsDir\toutput directory, where new segment will be created");
    System.err.println("\t-filter\trun current URLFilters on input URLs");
    System.err.println("\t-normalize\trun current URLNormalizers on input URLs");
    return -1;
  }
  boolean filter = false;
  boolean normalize = false;
  if (args.length > 2) {
    for (int i = 2; i < args.length; i++) {
      if (args[i].equals("-filter")) {
        filter = true;
      } else if (args[i].equals("-normalize")) {
        normalize = true;
      } else {
        LOG.error("Unknown argument: " + args[i] + ", exiting ...");
        return -1;
      }
    }
  }
  
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("FreeGenerator: starting at " + sdf.format(start));

  JobConf job = new NutchJob(getConf());
  job.setBoolean(FILTER_KEY, filter);
  job.setBoolean(NORMALIZE_KEY, normalize);
  FileInputFormat.addInputPath(job, new Path(args[0]));
  job.setInputFormat(TextInputFormat.class);
  job.setMapperClass(FG.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Generator.SelectorEntry.class);
  job.setPartitionerClass(URLPartitioner.class);
  job.setReducerClass(FG.class);
  String segName = Generator.generateSegmentName();
  job.setNumReduceTasks(job.getNumMapTasks());
  job.setOutputFormat(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(CrawlDatum.class);
  job.setOutputKeyComparatorClass(Generator.HashComparator.class);
  FileOutputFormat.setOutputPath(job, new Path(args[1],
      new Path(segName, CrawlDatum.GENERATE_DIR_NAME)));
  try {
    JobClient.runJob(job);
  } catch (Exception e) {
    LOG.error("FAILED: " + StringUtils.stringifyException(e));
    return -1;
  }
  long end = System.currentTimeMillis();
  LOG.info("FreeGenerator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
  return 0;
}
 
Example 19
Source File: IndexUpdater.java    From RDFS with Apache License 2.0
JobConf createJob(Configuration conf, Path[] inputPaths, Path outputPath,
    int numMapTasks, Shard[] shards) throws IOException {
  // set the starting generation for each shard
  // when a reduce task fails, a new reduce task
  // has to know where to re-start
  setShardGeneration(conf, shards);

  // iconf.set sets properties in conf
  IndexUpdateConfiguration iconf = new IndexUpdateConfiguration(conf);
  Shard.setIndexShards(iconf, shards);

  // MapTask.MapOutputBuffer uses "io.sort.mb" to decide its max buffer size
  // (max buffer size = 1/2 * "io.sort.mb").
  // Here we half-en "io.sort.mb" because we use the other half memory to
  // build an intermediate form/index in Combiner.
  iconf.setIOSortMB(iconf.getIOSortMB() / 2);

  // create the job configuration
  JobConf jobConf = new JobConf(conf, IndexUpdater.class);
  jobConf.setJobName(this.getClass().getName() + "_"
      + System.currentTimeMillis());

  // provided by application
  FileInputFormat.setInputPaths(jobConf, inputPaths);
  FileOutputFormat.setOutputPath(jobConf, outputPath);

  jobConf.setNumMapTasks(numMapTasks);

  // already set shards
  jobConf.setNumReduceTasks(shards.length);

  jobConf.setInputFormat(iconf.getIndexInputFormatClass());

  Path[] inputs = FileInputFormat.getInputPaths(jobConf);
  StringBuilder buffer = new StringBuilder(inputs[0].toString());
  for (int i = 1; i < inputs.length; i++) {
    buffer.append(",");
    buffer.append(inputs[i].toString());
  }
  LOG.info("mapred.input.dir = " + buffer.toString());
  LOG.info("mapred.output.dir = " + 
           FileOutputFormat.getOutputPath(jobConf).toString());
  LOG.info("mapred.map.tasks = " + jobConf.getNumMapTasks());
  LOG.info("mapred.reduce.tasks = " + jobConf.getNumReduceTasks());
  LOG.info(shards.length + " shards = " + iconf.getIndexShards());
  // better if we don't create the input format instance
  LOG.info("mapred.input.format.class = "
      + jobConf.getInputFormat().getClass().getName());

  // set by the system
  jobConf.setMapOutputKeyClass(IndexUpdateMapper.getMapOutputKeyClass());
  jobConf.setMapOutputValueClass(IndexUpdateMapper.getMapOutputValueClass());
  jobConf.setOutputKeyClass(IndexUpdateReducer.getOutputKeyClass());
  jobConf.setOutputValueClass(IndexUpdateReducer.getOutputValueClass());

  jobConf.setMapperClass(IndexUpdateMapper.class);
  jobConf.setPartitionerClass(IndexUpdatePartitioner.class);
  jobConf.setCombinerClass(IndexUpdateCombiner.class);
  jobConf.setReducerClass(IndexUpdateReducer.class);

  jobConf.setOutputFormat(IndexUpdateOutputFormat.class);

  return jobConf;
}
 
Example 20
Source File: RandomTextWriter.java    From RDFS with Apache License 2.0
/**
 * This is the main routine for launching a distributed random write job.
 * It runs 10 maps/node and each node writes 1 gig of data to a DFS file.
 * The reduce doesn't do anything.
 * 
 * @throws IOException 
 */
public int run(String[] args) throws Exception {    
  if (args.length == 0) {
    return printUsage();    
  }
  
  JobConf job = new JobConf(getConf());
  
  job.setJarByClass(RandomTextWriter.class);
  job.setJobName("random-text-writer");
  
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  
  job.setInputFormat(RandomWriter.RandomInputFormat.class);
  job.setMapperClass(Map.class);        
  
  JobClient client = new JobClient(job);
  ClusterStatus cluster = client.getClusterStatus();
  int numMapsPerHost = job.getInt("test.randomtextwrite.maps_per_host", 10);
  long numBytesToWritePerMap = job.getLong("test.randomtextwrite.bytes_per_map",
                                           1*1024*1024*1024);
  if (numBytesToWritePerMap == 0) {
    System.err.println("Cannot have test.randomtextwrite.bytes_per_map set to 0");
    return -2;
  }
  long totalBytesToWrite = job.getLong("test.randomtextwrite.total_bytes", 
       numMapsPerHost*numBytesToWritePerMap*cluster.getTaskTrackers());
  int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
  if (numMaps == 0 && totalBytesToWrite > 0) {
    numMaps = 1;
    job.setLong("test.randomtextwrite.bytes_per_map", totalBytesToWrite);
  }
  
  Class<? extends OutputFormat> outputFormatClass = 
    SequenceFileOutputFormat.class;
  List<String> otherArgs = new ArrayList<String>();
  for(int i=0; i < args.length; ++i) {
    try {
      if ("-outFormat".equals(args[i])) {
        outputFormatClass = 
          Class.forName(args[++i]).asSubclass(OutputFormat.class);
      } else {
        otherArgs.add(args[i]);
      }
    } catch (ArrayIndexOutOfBoundsException except) {
      System.out.println("ERROR: Required parameter missing from " +
          args[i-1]);
      return printUsage(); // exits
    }
  }

  job.setOutputFormat(outputFormatClass);
  FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(0)));
  
  job.setNumMapTasks(numMaps);
  System.out.println("Running " + numMaps + " maps.");
  
  // reducer NONE
  job.setNumReduceTasks(0);
  
  Date startTime = new Date();
  System.out.println("Job started: " + startTime);
  JobClient.runJob(job);
  Date endTime = new Date();
  System.out.println("Job ended: " + endTime);
  System.out.println("The job took " + 
                     (endTime.getTime() - startTime.getTime()) /1000 + 
                     " seconds.");
  
  return 0;
}