Java Code Examples for org.apache.hadoop.mapred.JobConf#setPartitionerClass()

The following examples show how to use org.apache.hadoop.mapred.JobConf#setPartitionerClass(). Each example is drawn from an open-source project; the source file, project, and license are noted above each snippet.
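Before the project-specific examples, a minimal sketch may help show what setPartitionerClass() expects: a class implementing the old-API org.apache.hadoop.mapred.Partitioner interface. FirstCharPartitioner below is illustrative only and not taken from any of the projects listed here.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Partitioner;

public class FirstCharPartitioner implements Partitioner<Text, IntWritable> {

  @Override
  public void configure(JobConf job) {
    // No per-job configuration is needed for this sketch.
  }

  @Override
  public int getPartition(Text key, IntWritable value, int numPartitions) {
    // Route keys by their first character, guarding against empty keys
    // and keeping the bucket index non-negative.
    if (key.getLength() == 0) {
      return 0;
    }
    return (key.charAt(0) & Integer.MAX_VALUE) % numPartitions;
  }
}

It is wired into a job the same way the examples below do it:

job.setPartitionerClass(FirstCharPartitioner.class);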
Example 1
Source File: SliveTest.java    From hadoop with Apache License 2.0
/**
 * Sets up a job conf for the given job using the given config object. Ensures
 * that the correct input format is set, along with the mapper and reducer
 * classes, the input and output key and value classes, and any other job
 * configuration.
 * 
 * @param config
 * @return JobConf representing the job to be run
 * @throws IOException
 */
private JobConf getJob(ConfigExtractor config) throws IOException {
  JobConf job = new JobConf(config.getConfig(), SliveTest.class);
  job.setInputFormat(DummyInputFormat.class);
  FileOutputFormat.setOutputPath(job, config.getOutputPath());
  job.setMapperClass(SliveMapper.class);
  job.setPartitionerClass(SlivePartitioner.class);
  job.setReducerClass(SliveReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setOutputFormat(TextOutputFormat.class);
  TextOutputFormat.setCompressOutput(job, false);
  job.setNumReduceTasks(config.getReducerAmount());
  job.setNumMapTasks(config.getMapAmount());
  return job;
}
 
Example 2
Source File: SliveTest.java    From big-c with Apache License 2.0
/**
 * Sets up a job conf for the given job using the given config object. Ensures
 * that the correct input format is set, along with the mapper and reducer
 * classes, the input and output key and value classes, and any other job
 * configuration.
 * 
 * @param config
 * @return JobConf representing the job to be run
 * @throws IOException
 */
private JobConf getJob(ConfigExtractor config) throws IOException {
  JobConf job = new JobConf(config.getConfig(), SliveTest.class);
  job.setInputFormat(DummyInputFormat.class);
  FileOutputFormat.setOutputPath(job, config.getOutputPath());
  job.setMapperClass(SliveMapper.class);
  job.setPartitionerClass(SlivePartitioner.class);
  job.setReducerClass(SliveReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setOutputFormat(TextOutputFormat.class);
  TextOutputFormat.setCompressOutput(job, false);
  job.setNumReduceTasks(config.getReducerAmount());
  job.setNumMapTasks(config.getMapAmount());
  return job;
}
 
Example 3
Source File: ItemModelTest.java    From ml-ease with Apache License 2.0
@Override
public void run() throws Exception
{
  JobConfig props = super.getJobConfig();
  List<String> lambdastr = props.getStringList(LAMBDA, ",");
  String outBasePath = props.getString(OUTPUT_BASE_PATH);
  for (String lambda : lambdastr)
  {
    String outPath = outBasePath + "/lambda-" + lambda;
    props.put("output.path", outPath);
    JobConf conf = createJobConf(PerItemTestMapper.class, PerItemTestReducer.class);
    AvroUtils.addAvroCacheFilesAndSetTheProperty(conf,
                                                 new Path(props.get(MODEL_PATH)),
                                                 MODEL_PATH);
    conf.set(ITEM_KEY, props.getString(ITEM_KEY));
    conf.setFloat(LAMBDA, Float.parseFloat(lambda));
    conf.setBoolean(BINARY_FEATURE, props.getBoolean(BINARY_FEATURE, false));
    conf.setPartitionerClass(PerItemTestPartitioner.class);
    conf.setInt(NUM_REDUCERS, conf.getNumReduceTasks());
    AvroUtils.runAvroJob(conf);
  }
}
 
Example 4
Source File: TableMapReduceUtil.java    From hbase with Apache License 2.0
/**
 * Use this before submitting a TableReduce job. It will
 * appropriately set up the JobConf.
 *
 * @param table  The output table.
 * @param reducer  The reducer class to use.
 * @param job  The current job configuration to adjust.
 * @param partitioner  Partitioner to use. Pass <code>null</code> to use
 * default partitioner.
 * @param addDependencyJars upload HBase jars and jars for any of the configured
 *           job classes via the distributed cache (tmpjars).
 * @throws IOException When determining the region count fails.
 */
public static void initTableReduceJob(String table,
  Class<? extends TableReduce> reducer, JobConf job, Class partitioner,
  boolean addDependencyJars) throws IOException {
  job.setOutputFormat(TableOutputFormat.class);
  job.setReducerClass(reducer);
  job.set(TableOutputFormat.OUTPUT_TABLE, table);
  job.setOutputKeyClass(ImmutableBytesWritable.class);
  job.setOutputValueClass(Put.class);
  job.setStrings("io.serializations", job.get("io.serializations"),
      MutationSerialization.class.getName(), ResultSerialization.class.getName());
  if (partitioner == HRegionPartitioner.class) {
    job.setPartitionerClass(HRegionPartitioner.class);
    int regions =
      MetaTableAccessor.getRegionCount(HBaseConfiguration.create(job), TableName.valueOf(table));
    if (job.getNumReduceTasks() > regions) {
      job.setNumReduceTasks(regions);
    }
  } else if (partitioner != null) {
    job.setPartitionerClass(partitioner);
  }
  if (addDependencyJars) {
    addDependencyJars(job);
  }
  initCredentials(job);
}
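For orientation, here is a hedged sketch of how initTableReduceJob might be called from a driver. The table name "scores", the column layout, and the reducer stub are illustrative assumptions, not part of the HBase source above; only the reduce-side wiring is shown.

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapred.HRegionPartitioner;
import org.apache.hadoop.hbase.mapred.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapred.TableReduce;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

public class TableReduceDriver {

  // Minimal reducer stub: writes one Put per key. The column family "f"
  // and qualifier "seen" are assumed names for illustration.
  public static class MyTableReducer implements TableReduce<Text, Text> {
    @Override
    public void configure(JobConf job) {}

    @Override
    public void close() throws IOException {}

    @Override
    public void reduce(Text key, Iterator<Text> values,
        OutputCollector<ImmutableBytesWritable, Put> output, Reporter reporter)
        throws IOException {
      byte[] row = key.copyBytes();
      Put put = new Put(row);
      put.addColumn(Bytes.toBytes("f"), Bytes.toBytes("seen"), Bytes.toBytes("1"));
      output.collect(new ImmutableBytesWritable(row), put);
    }
  }

  public static void main(String[] args) throws IOException {
    JobConf job = new JobConf(HBaseConfiguration.create(), TableReduceDriver.class);
    // Mapper and input configuration are omitted; passing
    // HRegionPartitioner.class caps the reduce-task count at the
    // table's region count, as the method above shows.
    TableMapReduceUtil.initTableReduceJob("scores", MyTableReducer.class, job,
        HRegionPartitioner.class, true);
  }
}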
 
Example 5
Source File: TeraSort.java    From hadoop-book with Apache License 2.0
public int run(String[] args) throws Exception {
  LOG.info("starting");
  JobConf job = (JobConf) getConf();
  Path inputDir = new Path(args[0]);
  inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
  Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
  URI partitionUri = new URI(partitionFile.toString() +
                             "#" + TeraInputFormat.PARTITION_FILENAME);
  TeraInputFormat.setInputPaths(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.setJobName("TeraSort");
  job.setJarByClass(TeraSort.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setInputFormat(TeraInputFormat.class);
  job.setOutputFormat(TeraOutputFormat.class);
  job.setPartitionerClass(TotalOrderPartitioner.class);
  TeraInputFormat.writePartitionFile(job, partitionFile);
  DistributedCache.addCacheFile(partitionUri, job);
  DistributedCache.createSymlink(job);
  job.setInt("dfs.replication", 1);
  TeraOutputFormat.setFinalSync(job, true);
  JobClient.runJob(job);
  LOG.info("done");
  return 0;
}
 
Example 6
Source File: TeraSort.java    From hadoop-gpu with Apache License 2.0
public int run(String[] args) throws Exception {
  LOG.info("starting");
  JobConf job = (JobConf) getConf();
  Path inputDir = new Path(args[0]);
  inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
  Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
  URI partitionUri = new URI(partitionFile.toString() +
                             "#" + TeraInputFormat.PARTITION_FILENAME);
  TeraInputFormat.setInputPaths(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.setJobName("TeraSort");
  job.setJarByClass(TeraSort.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setInputFormat(TeraInputFormat.class);
  job.setOutputFormat(TeraOutputFormat.class);
  job.setPartitionerClass(TotalOrderPartitioner.class);
  TeraInputFormat.writePartitionFile(job, partitionFile);
  DistributedCache.addCacheFile(partitionUri, job);
  DistributedCache.createSymlink(job);
  job.setInt("dfs.replication", 1);
  TeraOutputFormat.setFinalSync(job, true);
  JobClient.runJob(job);
  LOG.info("done");
  return 0;
}
 
Example 7
Source File: SleepJob.java    From RDFS with Apache License 2.0
public JobConf setupJobConf(int numMapper, int numReducer, 
                              long mapSleepTime, int mapSleepCount, 
                              long reduceSleepTime, int reduceSleepCount,
                              boolean doSpeculation, List<String> slowMaps,
                              List<String> slowReduces, int slowRatio,
                              int countersPerTask, List<String> hosts,
                              int hostsPerSplit, boolean setup) {
  
  JobConf job = new JobConf(getConf(), SleepJob.class);
  job.setNumMapTasks(numMapper);
  job.setNumReduceTasks(numReducer);
  job.setMapperClass(SleepJob.class);
  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(NullWritable.class);
  job.setReducerClass(SleepJob.class);
  job.setOutputFormat(NullOutputFormat.class);
  job.setJobSetupCleanupNeeded(setup);
  job.setInputFormat(SleepInputFormat.class);
  job.setPartitionerClass(SleepJob.class);
  job.setJobName("Sleep job");
  FileInputFormat.addInputPath(job, new Path("ignored"));
  job.setLong("sleep.job.map.sleep.time", mapSleepTime);
  job.setLong("sleep.job.reduce.sleep.time", reduceSleepTime);
  job.setInt("sleep.job.map.sleep.count", mapSleepCount);
  job.setInt("sleep.job.reduce.sleep.count", reduceSleepCount);
  job.setSpeculativeExecution(doSpeculation);
  job.setInt(SLOW_RATIO, slowRatio);
  job.setStrings(SLOW_MAPS, slowMaps.toArray(new String[slowMaps.size()]));
  job.setStrings(SLOW_REDUCES, slowReduces.toArray(new String[slowReduces.size()]));
  job.setInt("sleep.job.counters.per.task", countersPerTask);
  job.setStrings(HOSTS_FOR_LOCALITY, hosts.toArray(new String[hosts.size()]));
  job.setInt(HOSTS_PER_SPLIT, hostsPerSplit);
  return job;
}
 
Example 8
Source File: TeraSort.java    From RDFS with Apache License 2.0
public int run(String[] args) throws Exception {
  LOG.info("starting");
  JobConf job = (JobConf) getConf();
  Path inputDir = new Path(args[0]);
  inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
  Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
  URI partitionUri = new URI(partitionFile.toString() +
                             "#" + TeraInputFormat.PARTITION_FILENAME);
  TeraInputFormat.setInputPaths(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.setJobName("TeraSort");
  job.setJarByClass(TeraSort.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setInputFormat(TeraInputFormat.class);
  job.setOutputFormat(TeraOutputFormat.class);
  job.setPartitionerClass(TotalOrderPartitioner.class);
  TeraInputFormat.writePartitionFile(job, partitionFile);
  DistributedCache.addCacheFile(partitionUri, job);
  DistributedCache.createSymlink(job);
  job.setInt("dfs.replication", 1);
  TeraOutputFormat.setFinalSync(job, true);
  long startTime = System.currentTimeMillis();
  JobClient.runJob(job);
  long endTime = System.currentTimeMillis();
  System.out.println((float)(endTime-startTime)/1000);
  LOG.info("done");
  return 0;
}
 
Example 9
Source File: Submitter.java    From hadoop with Apache License 2.0
private static void setupPipesJob(JobConf conf) throws IOException {
  // default map output types to Text
  if (!getIsJavaMapper(conf)) {
    conf.setMapRunnerClass(PipesMapRunner.class);
    // Save the user's partitioner and hook in ours.
    setJavaPartitioner(conf, conf.getPartitionerClass());
    conf.setPartitionerClass(PipesPartitioner.class);
  }
  if (!getIsJavaReducer(conf)) {
    conf.setReducerClass(PipesReducer.class);
    if (!getIsJavaRecordWriter(conf)) {
      conf.setOutputFormat(NullOutputFormat.class);
    }
  }
  String textClassname = Text.class.getName();
  setIfUnset(conf, MRJobConfig.MAP_OUTPUT_KEY_CLASS, textClassname);
  setIfUnset(conf, MRJobConfig.MAP_OUTPUT_VALUE_CLASS, textClassname);
  setIfUnset(conf, MRJobConfig.OUTPUT_KEY_CLASS, textClassname);
  setIfUnset(conf, MRJobConfig.OUTPUT_VALUE_CLASS, textClassname);
  
  // Use PipesNonJavaInputFormat if necessary to handle progress reporting
  // from C++ RecordReaders ...
  if (!getIsJavaRecordReader(conf) && !getIsJavaMapper(conf)) {
    conf.setClass(Submitter.INPUT_FORMAT, 
                  conf.getInputFormat().getClass(), InputFormat.class);
    conf.setInputFormat(PipesNonJavaInputFormat.class);
  }
  
  String exec = getExecutable(conf);
  if (exec == null) {
    throw new IllegalArgumentException("No application program defined.");
  }
  // add default debug script only when executable is expressed as
  // <path>#<executable>
  if (exec.contains("#")) {
    // set default gdb commands for map and reduce task 
    String defScript = "$HADOOP_PREFIX/src/c++/pipes/debug/pipes-default-script";
    setIfUnset(conf, MRJobConfig.MAP_DEBUG_SCRIPT,defScript);
    setIfUnset(conf, MRJobConfig.REDUCE_DEBUG_SCRIPT,defScript);
  }
  URI[] fileCache = DistributedCache.getCacheFiles(conf);
  if (fileCache == null) {
    fileCache = new URI[1];
  } else {
    URI[] tmp = new URI[fileCache.length+1];
    System.arraycopy(fileCache, 0, tmp, 1, fileCache.length);
    fileCache = tmp;
  }
  try {
    fileCache[0] = new URI(exec);
  } catch (URISyntaxException e) {
    IOException ie = new IOException("Problem parsing execable URI " + exec);
    ie.initCause(e);
    throw ie;
  }
  DistributedCache.setCacheFiles(fileCache, conf);
}
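Examples 9, 10, and 12 hook PipesPartitioner in front of whatever partitioner the user configured. The following is a minimal sketch of that save-and-delegate pattern; DelegatingPartitioner and the "example.saved.partitioner" key are illustrative stand-ins, not Hadoop's actual PipesPartitioner or its conf key.

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Partitioner;
import org.apache.hadoop.mapred.lib.HashPartitioner;
import org.apache.hadoop.util.ReflectionUtils;

public class DelegatingPartitioner implements Partitioner<Text, Text> {

  private Partitioner<Text, Text> delegate;

  @Override
  @SuppressWarnings("unchecked")
  public void configure(JobConf conf) {
    // Recover the partitioner that was saved before the swap; the key
    // here stands in for the one setJavaPartitioner writes.
    Class<?> saved = conf.getClass("example.saved.partitioner",
        HashPartitioner.class);
    delegate = (Partitioner<Text, Text>) ReflectionUtils.newInstance(saved, conf);
  }

  @Override
  public int getPartition(Text key, Text value, int numPartitions) {
    // Forward every call to the partitioner the user originally configured.
    return delegate.getPartition(key, value, numPartitions);
  }
}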
 
Example 10
Source File: Submitter.java    From big-c with Apache License 2.0
private static void setupPipesJob(JobConf conf) throws IOException {
  // default map output types to Text
  if (!getIsJavaMapper(conf)) {
    conf.setMapRunnerClass(PipesMapRunner.class);
    // Save the user's partitioner and hook in ours.
    setJavaPartitioner(conf, conf.getPartitionerClass());
    conf.setPartitionerClass(PipesPartitioner.class);
  }
  if (!getIsJavaReducer(conf)) {
    conf.setReducerClass(PipesReducer.class);
    if (!getIsJavaRecordWriter(conf)) {
      conf.setOutputFormat(NullOutputFormat.class);
    }
  }
  String textClassname = Text.class.getName();
  setIfUnset(conf, MRJobConfig.MAP_OUTPUT_KEY_CLASS, textClassname);
  setIfUnset(conf, MRJobConfig.MAP_OUTPUT_VALUE_CLASS, textClassname);
  setIfUnset(conf, MRJobConfig.OUTPUT_KEY_CLASS, textClassname);
  setIfUnset(conf, MRJobConfig.OUTPUT_VALUE_CLASS, textClassname);
  
  // Use PipesNonJavaInputFormat if necessary to handle progress reporting
  // from C++ RecordReaders ...
  if (!getIsJavaRecordReader(conf) && !getIsJavaMapper(conf)) {
    conf.setClass(Submitter.INPUT_FORMAT, 
                  conf.getInputFormat().getClass(), InputFormat.class);
    conf.setInputFormat(PipesNonJavaInputFormat.class);
  }
  
  String exec = getExecutable(conf);
  if (exec == null) {
    throw new IllegalArgumentException("No application program defined.");
  }
  // add default debug script only when executable is expressed as
  // <path>#<executable>
  if (exec.contains("#")) {
    // set default gdb commands for map and reduce task 
    String defScript = "$HADOOP_PREFIX/src/c++/pipes/debug/pipes-default-script";
    setIfUnset(conf, MRJobConfig.MAP_DEBUG_SCRIPT,defScript);
    setIfUnset(conf, MRJobConfig.REDUCE_DEBUG_SCRIPT,defScript);
  }
  URI[] fileCache = DistributedCache.getCacheFiles(conf);
  if (fileCache == null) {
    fileCache = new URI[1];
  } else {
    URI[] tmp = new URI[fileCache.length+1];
    System.arraycopy(fileCache, 0, tmp, 1, fileCache.length);
    fileCache = tmp;
  }
  try {
    fileCache[0] = new URI(exec);
  } catch (URISyntaxException e) {
    IOException ie = new IOException("Problem parsing execable URI " + exec);
    ie.initCause(e);
    throw ie;
  }
  DistributedCache.setCacheFiles(fileCache, conf);
}
 
Example 11
Source File: FreeGenerator.java    From anthelion with Apache License 2.0
public int run(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] [-normalize]");
    System.err.println("\tinputDir\tinput directory containing one or more input files.");
    System.err.println("\t\tEach text file contains a list of URLs, one URL per line");
    System.err.println("\tsegmentsDir\toutput directory, where new segment will be created");
    System.err.println("\t-filter\trun current URLFilters on input URLs");
    System.err.println("\t-normalize\trun current URLNormalizers on input URLs");
    return -1;
  }
  boolean filter = false;
  boolean normalize = false;
  if (args.length > 2) {
    for (int i = 2; i < args.length; i++) {
      if (args[i].equals("-filter")) {
        filter = true;
      } else if (args[i].equals("-normalize")) {
        normalize = true;
      } else {
        LOG.error("Unknown argument: " + args[i] + ", exiting ...");
        return -1;
      }
    }
  }
  
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("FreeGenerator: starting at " + sdf.format(start));

  JobConf job = new NutchJob(getConf());
  job.setBoolean(FILTER_KEY, filter);
  job.setBoolean(NORMALIZE_KEY, normalize);
  FileInputFormat.addInputPath(job, new Path(args[0]));
  job.setInputFormat(TextInputFormat.class);
  job.setMapperClass(FG.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Generator.SelectorEntry.class);
  job.setPartitionerClass(URLPartitioner.class);
  job.setReducerClass(FG.class);
  String segName = Generator.generateSegmentName();
  job.setNumReduceTasks(job.getNumMapTasks());
  job.setOutputFormat(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(CrawlDatum.class);
  job.setOutputKeyComparatorClass(Generator.HashComparator.class);
  FileOutputFormat.setOutputPath(job, new Path(args[1],
      new Path(segName, CrawlDatum.GENERATE_DIR_NAME)));
  try {
    JobClient.runJob(job);
  } catch (Exception e) {
    LOG.error("FAILED: " + StringUtils.stringifyException(e));
    return -1;
  }
  long end = System.currentTimeMillis();
  LOG.info("FreeGenerator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
  return 0;
}
 
Example 12
Source File: Submitter.java    From RDFS with Apache License 2.0
private static void setupPipesJob(JobConf conf) throws IOException {
  // default map output types to Text
  if (!getIsJavaMapper(conf)) {
    conf.setMapRunnerClass(PipesMapRunner.class);
    // Save the user's partitioner and hook in ours.
    setJavaPartitioner(conf, conf.getPartitionerClass());
    conf.setPartitionerClass(PipesPartitioner.class);
  }
  if (!getIsJavaReducer(conf)) {
    conf.setReducerClass(PipesReducer.class);
    if (!getIsJavaRecordWriter(conf)) {
      conf.setOutputFormat(NullOutputFormat.class);
    }
  }
  String textClassname = Text.class.getName();
  setIfUnset(conf, "mapred.mapoutput.key.class", textClassname);
  setIfUnset(conf, "mapred.mapoutput.value.class", textClassname);
  setIfUnset(conf, "mapred.output.key.class", textClassname);
  setIfUnset(conf, "mapred.output.value.class", textClassname);
  
  // Use PipesNonJavaInputFormat if necessary to handle progress reporting
  // from C++ RecordReaders ...
  if (!getIsJavaRecordReader(conf) && !getIsJavaMapper(conf)) {
    conf.setClass("mapred.pipes.user.inputformat", 
                  conf.getInputFormat().getClass(), InputFormat.class);
    conf.setInputFormat(PipesNonJavaInputFormat.class);
  }
  
  String exec = getExecutable(conf);
  if (exec == null) {
    throw new IllegalArgumentException("No application program defined.");
  }
  // add default debug script only when executable is expressed as
  // <path>#<executable>
  if (exec.contains("#")) {
    DistributedCache.createSymlink(conf);
    // set default gdb commands for map and reduce task 
    String defScript = "$HADOOP_HOME/src/c++/pipes/debug/pipes-default-script";
    setIfUnset(conf,"mapred.map.task.debug.script",defScript);
    setIfUnset(conf,"mapred.reduce.task.debug.script",defScript);
  }
  URI[] fileCache = DistributedCache.getCacheFiles(conf);
  if (fileCache == null) {
    fileCache = new URI[1];
  } else {
    URI[] tmp = new URI[fileCache.length+1];
    System.arraycopy(fileCache, 0, tmp, 1, fileCache.length);
    fileCache = tmp;
  }
  try {
    fileCache[0] = new URI(exec);
  } catch (URISyntaxException e) {
    IOException ie = new IOException("Problem parsing execable URI " + exec);
    ie.initCause(e);
    throw ie;
  }
  DistributedCache.setCacheFiles(fileCache, conf);
}
 
Example 13
Source File: IndexUpdater.java    From RDFS with Apache License 2.0
JobConf createJob(Configuration conf, Path[] inputPaths, Path outputPath,
    int numMapTasks, Shard[] shards) throws IOException {
  // set the starting generation for each shard;
  // when a reduce task fails, the new reduce task
  // has to know where to restart
  setShardGeneration(conf, shards);

  // iconf.set sets properties in conf
  IndexUpdateConfiguration iconf = new IndexUpdateConfiguration(conf);
  Shard.setIndexShards(iconf, shards);

  // MapTask.MapOutputBuffer uses "io.sort.mb" to decide its max buffer size
  // (max buffer size = 1/2 * "io.sort.mb").
  // Here we halve "io.sort.mb" because we use the other half of the memory
  // to build an intermediate form/index in the Combiner.
  iconf.setIOSortMB(iconf.getIOSortMB() / 2);

  // create the job configuration
  JobConf jobConf = new JobConf(conf, IndexUpdater.class);
  jobConf.setJobName(this.getClass().getName() + "_"
      + System.currentTimeMillis());

  // provided by application
  FileInputFormat.setInputPaths(jobConf, inputPaths);
  FileOutputFormat.setOutputPath(jobConf, outputPath);

  jobConf.setNumMapTasks(numMapTasks);

  // already set shards
  jobConf.setNumReduceTasks(shards.length);

  jobConf.setInputFormat(iconf.getIndexInputFormatClass());

  Path[] inputs = FileInputFormat.getInputPaths(jobConf);
  StringBuilder buffer = new StringBuilder(inputs[0].toString());
  for (int i = 1; i < inputs.length; i++) {
    buffer.append(",");
    buffer.append(inputs[i].toString());
  }
  LOG.info("mapred.input.dir = " + buffer.toString());
  LOG.info("mapred.output.dir = " + 
           FileOutputFormat.getOutputPath(jobConf).toString());
  LOG.info("mapred.map.tasks = " + jobConf.getNumMapTasks());
  LOG.info("mapred.reduce.tasks = " + jobConf.getNumReduceTasks());
  LOG.info(shards.length + " shards = " + iconf.getIndexShards());
  // better if we don't create the input format instance
  LOG.info("mapred.input.format.class = "
      + jobConf.getInputFormat().getClass().getName());

  // set by the system
  jobConf.setMapOutputKeyClass(IndexUpdateMapper.getMapOutputKeyClass());
  jobConf.setMapOutputValueClass(IndexUpdateMapper.getMapOutputValueClass());
  jobConf.setOutputKeyClass(IndexUpdateReducer.getOutputKeyClass());
  jobConf.setOutputValueClass(IndexUpdateReducer.getOutputValueClass());

  jobConf.setMapperClass(IndexUpdateMapper.class);
  jobConf.setPartitionerClass(IndexUpdatePartitioner.class);
  jobConf.setCombinerClass(IndexUpdateCombiner.class);
  jobConf.setReducerClass(IndexUpdateReducer.class);

  jobConf.setOutputFormat(IndexUpdateOutputFormat.class);

  return jobConf;
}
 
Example 14
Source File: FreeGenerator.java    From nutch-htmlunit with Apache License 2.0
public int run(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] [-normalize]");
    System.err.println("\tinputDir\tinput directory containing one or more input files.");
    System.err.println("\t\tEach text file contains a list of URLs, one URL per line");
    System.err.println("\tsegmentsDir\toutput directory, where new segment will be created");
    System.err.println("\t-filter\trun current URLFilters on input URLs");
    System.err.println("\t-normalize\trun current URLNormalizers on input URLs");
    return -1;
  }
  boolean filter = false;
  boolean normalize = false;
  if (args.length > 2) {
    for (int i = 2; i < args.length; i++) {
      if (args[i].equals("-filter")) {
        filter = true;
      } else if (args[i].equals("-normalize")) {
        normalize = true;
      } else {
        LOG.error("Unknown argument: " + args[i] + ", exiting ...");
        return -1;
      }
    }
  }

  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("FreeGenerator: starting at " + sdf.format(start));

  JobConf job = new NutchJob(getConf());
  job.setBoolean(FILTER_KEY, filter);
  job.setBoolean(NORMALIZE_KEY, normalize);
  FileInputFormat.addInputPath(job, new Path(args[0]));
  job.setInputFormat(TextInputFormat.class);
  job.setMapperClass(FG.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Generator.SelectorEntry.class);
  job.setPartitionerClass(URLPartitioner.class);
  job.setReducerClass(FG.class);
  String segName = Generator.generateSegmentName();
  job.setNumReduceTasks(job.getNumMapTasks());
  job.setOutputFormat(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(CrawlDatum.class);
  job.setOutputKeyComparatorClass(Generator.HashComparator.class);
  FileOutputFormat.setOutputPath(job, new Path(args[1],
      new Path(segName, CrawlDatum.GENERATE_DIR_NAME)));
  try {
    JobClient.runJob(job);
  } catch (Exception e) {
    LOG.error("FAILED: " + StringUtils.stringifyException(e));
    return -1;
  }
  long end = System.currentTimeMillis();
  LOG.info("FreeGenerator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
  return 0;
}
 
Example 15
Source File: IndexUpdater.java    From hadoop-gpu with Apache License 2.0
JobConf createJob(Configuration conf, Path[] inputPaths, Path outputPath,
    int numMapTasks, Shard[] shards) throws IOException {
  // set the starting generation for each shard;
  // when a reduce task fails, the new reduce task
  // has to know where to restart
  setShardGeneration(conf, shards);

  // iconf.set sets properties in conf
  IndexUpdateConfiguration iconf = new IndexUpdateConfiguration(conf);
  Shard.setIndexShards(iconf, shards);

  // MapTask.MapOutputBuffer uses "io.sort.mb" to decide its max buffer size
  // (max buffer size = 1/2 * "io.sort.mb").
  // Here we halve "io.sort.mb" because we use the other half of the memory
  // to build an intermediate form/index in the Combiner.
  iconf.setIOSortMB(iconf.getIOSortMB() / 2);

  // create the job configuration
  JobConf jobConf = new JobConf(conf, IndexUpdater.class);
  jobConf.setJobName(this.getClass().getName() + "_"
      + System.currentTimeMillis());

  // provided by application
  FileInputFormat.setInputPaths(jobConf, inputPaths);
  FileOutputFormat.setOutputPath(jobConf, outputPath);

  jobConf.setNumMapTasks(numMapTasks);

  // already set shards
  jobConf.setNumReduceTasks(shards.length);

  jobConf.setInputFormat(iconf.getIndexInputFormatClass());

  Path[] inputs = FileInputFormat.getInputPaths(jobConf);
  StringBuilder buffer = new StringBuilder(inputs[0].toString());
  for (int i = 1; i < inputs.length; i++) {
    buffer.append(",");
    buffer.append(inputs[i].toString());
  }
  LOG.info("mapred.input.dir = " + buffer.toString());
  LOG.info("mapred.output.dir = " + 
           FileOutputFormat.getOutputPath(jobConf).toString());
  LOG.info("mapred.map.tasks = " + jobConf.getNumMapTasks());
  LOG.info("mapred.reduce.tasks = " + jobConf.getNumReduceTasks());
  LOG.info(shards.length + " shards = " + iconf.getIndexShards());
  // better if we don't create the input format instance
  LOG.info("mapred.input.format.class = "
      + jobConf.getInputFormat().getClass().getName());

  // set by the system
  jobConf.setMapOutputKeyClass(IndexUpdateMapper.getMapOutputKeyClass());
  jobConf.setMapOutputValueClass(IndexUpdateMapper.getMapOutputValueClass());
  jobConf.setOutputKeyClass(IndexUpdateReducer.getOutputKeyClass());
  jobConf.setOutputValueClass(IndexUpdateReducer.getOutputValueClass());

  jobConf.setMapperClass(IndexUpdateMapper.class);
  jobConf.setPartitionerClass(IndexUpdatePartitioner.class);
  jobConf.setCombinerClass(IndexUpdateCombiner.class);
  jobConf.setReducerClass(IndexUpdateReducer.class);

  jobConf.setOutputFormat(IndexUpdateOutputFormat.class);

  return jobConf;
}