Java Code Examples for org.apache.hadoop.mapred.JobConf#getNumReduceTasks()

The following examples show how to use org.apache.hadoop.mapred.JobConf#getNumReduceTasks(). They are drawn from open-source projects; the source file and originating project are listed above each example.
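Before the project examples, here is a minimal, self-contained sketch of the call itself (not taken from any of the projects below; the job name and reduce count are placeholders): the driver sets the reduce count on the JobConf, and downstream code reads it back with getNumReduceTasks() instead of hard-coding it.

import org.apache.hadoop.mapred.JobConf;

public class ReduceCountSketch {
  public static void main(String[] args) {
    // Hypothetical driver configuration.
    JobConf conf = new JobConf(ReduceCountSketch.class);
    conf.setJobName("reduce-count-sketch");
    conf.setNumReduceTasks(4);

    // Partitioners, samplers and output-spec checks read the value back
    // rather than hard-coding it.
    int reducers = conf.getNumReduceTasks();   // 4 (the default is 1)
    System.out.println("Configured reduce tasks: " + reducers);
  }
}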
Example 1
Source File: ZephyrOutputFormat.java    From zephyr with Apache License 2.0
@Override
public void checkOutputSpecs(FileSystem ignored, JobConf job) throws FileAlreadyExistsException, InvalidJobConfException, IOException {
    // Ensure that the output directory is set and not already there
    Path outDir = getOutputPath(job);
    if (outDir == null && job.getNumReduceTasks() != 0) {
        throw new InvalidJobConfException("Output directory not set in JobConf.");
    }
    if (outDir != null) {
        FileSystem fs = outDir.getFileSystem(job);
        // normalize the output directory
        outDir = fs.makeQualified(outDir);
        setOutputPath(job, outDir);

        // get delegation token for the outDir's file system
        TokenCache.obtainTokensForNamenodes(job.getCredentials(), new Path[]{outDir}, job);
        String jobUuid = job.get("zephyr.job.uuid");
        if (jobUuid == null)
            throw new InvalidJobConfException("This output format REQUIRES the value zephyr.job.uuid to be specified in the job configuration!");
        // // check its existence
        // if (fs.exists(outDir)) {
        // throw new FileAlreadyExistsException("Output directory " + outDir
        // + " already exists");
        // }
    }
}
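The first check above is worth calling out: a missing output directory is only an error when the job actually has reducers. A minimal illustration of that contract (not part of the zephyr project; the class name is a placeholder):

import org.apache.hadoop.mapred.JobConf;

public class MapOnlyCheckSketch {
  public static void main(String[] args) {
    JobConf mapOnly = new JobConf();
    mapOnly.setNumReduceTasks(0);        // map-only: checkOutputSpecs() above tolerates a null output path
    JobConf withReducers = new JobConf();
    withReducers.setNumReduceTasks(8);   // reducers present: the output path must be set
    System.out.println(mapOnly.getNumReduceTasks() + " vs " + withReducers.getNumReduceTasks());
  }
}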
 
Example 2
Source File: TableMapReduceUtil.java    From hbase with Apache License 2.0
/**
 * Use this before submitting a TableReduce job. It will
 * appropriately set up the JobConf.
 *
 * @param table  The output table.
 * @param reducer  The reducer class to use.
 * @param job  The current job configuration to adjust.
 * @param partitioner  Partitioner to use. Pass <code>null</code> to use
 * default partitioner.
 * @param addDependencyJars upload HBase jars and jars for any of the configured
 *           job classes via the distributed cache (tmpjars).
 * @throws IOException When determining the region count fails.
 */
public static void initTableReduceJob(String table,
  Class<? extends TableReduce> reducer, JobConf job, Class partitioner,
  boolean addDependencyJars) throws IOException {
  job.setOutputFormat(TableOutputFormat.class);
  job.setReducerClass(reducer);
  job.set(TableOutputFormat.OUTPUT_TABLE, table);
  job.setOutputKeyClass(ImmutableBytesWritable.class);
  job.setOutputValueClass(Put.class);
  job.setStrings("io.serializations", job.get("io.serializations"),
      MutationSerialization.class.getName(), ResultSerialization.class.getName());
  if (partitioner == HRegionPartitioner.class) {
    job.setPartitionerClass(HRegionPartitioner.class);
    int regions =
      MetaTableAccessor.getRegionCount(HBaseConfiguration.create(job), TableName.valueOf(table));
    if (job.getNumReduceTasks() > regions) {
      job.setNumReduceTasks(regions);
    }
  } else if (partitioner != null) {
    job.setPartitionerClass(partitioner);
  }
  if (addDependencyJars) {
    addDependencyJars(job);
  }
  initCredentials(job);
}
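For context, a driver might call the helper above roughly as follows. This is a hedged sketch rather than HBase documentation: the driver class, reducer class and table name are placeholders, and HRegionPartitioner is passed so that the reduce count is capped at the region count as shown in the method body.

import org.apache.hadoop.hbase.mapred.HRegionPartitioner;
import org.apache.hadoop.hbase.mapred.TableMapReduceUtil;
import org.apache.hadoop.mapred.JobConf;

public class TableReduceDriverSketch {
  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf(TableReduceDriverSketch.class);
    job.setJobName("write-to-hbase-sketch");
    job.setNumReduceTasks(32);   // may be lowered to the table's region count
    // MyTableReduce is a hypothetical placeholder implementing
    // org.apache.hadoop.hbase.mapred.TableReduce; substitute a real reducer.
    TableMapReduceUtil.initTableReduceJob(
        "my_output_table", MyTableReduce.class, job,
        HRegionPartitioner.class, true);
    // After the call, job.getNumReduceTasks() is at most the region count.
  }
}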
 
Example 3
Source File: TotalOrderPartitioner.java    From RDFS with Apache License 2.0
/**
 * Read in the partition file and build indexing data structures.
 * If the keytype is {@link org.apache.hadoop.io.BinaryComparable} and
 * <tt>total.order.partitioner.natural.order</tt> is not false, a trie
 * of the first <tt>total.order.partitioner.max.trie.depth</tt>(2) + 1 bytes
 * will be built. Otherwise, keys will be located using a binary search of
 * the partition keyset using the {@link org.apache.hadoop.io.RawComparator}
 * defined for this job. The input file must be sorted with the same
 * comparator and contain {@link org.apache.hadoop.mapred.JobConf#getNumReduceTasks} - 1 keys.
 */
@SuppressWarnings("unchecked") // keytype from conf not static
public void configure(JobConf job) {
  try {
    String parts = getPartitionFile(job);
    final Path partFile = new Path(parts);
    final FileSystem fs = (DEFAULT_PATH.equals(parts))
      ? FileSystem.getLocal(job)     // assume in DistributedCache
      : partFile.getFileSystem(job);

    Class<K> keyClass = (Class<K>)job.getMapOutputKeyClass();
    K[] splitPoints = readPartitions(fs, partFile, keyClass, job);
    if (splitPoints.length != job.getNumReduceTasks() - 1) {
      throw new IOException("Wrong number of partitions in keyset");
    }
    RawComparator<K> comparator =
      (RawComparator<K>) job.getOutputKeyComparator();
    for (int i = 0; i < splitPoints.length - 1; ++i) {
      if (comparator.compare(splitPoints[i], splitPoints[i+1]) >= 0) {
        throw new IOException("Split points are out of order");
      }
    }
    boolean natOrder =
      job.getBoolean("total.order.partitioner.natural.order", true);
    if (natOrder && BinaryComparable.class.isAssignableFrom(keyClass)) {
      partitions = buildTrie((BinaryComparable[])splitPoints, 0,
          splitPoints.length, new byte[0],
          job.getInt("total.order.partitioner.max.trie.depth", 2));
    } else {
      partitions = new BinarySearchNode(splitPoints, comparator);
    }
  } catch (IOException e) {
    throw new IllegalArgumentException("Can't read partitions file", e);
  }
}
 
Example 4
Source File: InputSampler.java    From hadoop-gpu with Apache License 2.0
/**
 * Write a partition file for the given job, using the Sampler provided.
 * Queries the sampler for a sample keyset, sorts by the output key
 * comparator, selects the keys for each rank, and writes to the destination
 * returned from {@link org.apache.hadoop.mapred.lib.TotalOrderPartitioner#getPartitionFile}.
 */
@SuppressWarnings("unchecked") // getInputFormat, getOutputKeyComparator
public static <K,V> void writePartitionFile(JobConf job,
    Sampler<K,V> sampler) throws IOException {
  final InputFormat<K,V> inf = (InputFormat<K,V>) job.getInputFormat();
  int numPartitions = job.getNumReduceTasks();
  K[] samples = sampler.getSample(inf, job);
  LOG.info("Using " + samples.length + " samples");
  RawComparator<K> comparator =
    (RawComparator<K>) job.getOutputKeyComparator();
  Arrays.sort(samples, comparator);
  Path dst = new Path(TotalOrderPartitioner.getPartitionFile(job));
  FileSystem fs = dst.getFileSystem(job);
  if (fs.exists(dst)) {
    fs.delete(dst, false);
  }
  SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, dst,
      job.getMapOutputKeyClass(), NullWritable.class);
  NullWritable nullValue = NullWritable.get();
  float stepSize = samples.length / (float) numPartitions;
  int last = -1;
  for(int i = 1; i < numPartitions; ++i) {
    int k = Math.round(stepSize * i);
    while (last >= k && comparator.compare(samples[last], samples[k]) == 0) {
      ++k;
    }
    writer.append(samples[k], nullValue);
    last = k;
  }
  writer.close();
}
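Examples 3 and 4 meet in the job driver: InputSampler.writePartitionFile writes getNumReduceTasks() - 1 split keys, and TotalOrderPartitioner.configure later reads that file back and verifies the count. Below is a rough sketch of that wiring with the old mapred API; the paths, input format, key types and sampling parameters are illustrative assumptions, not taken from either project.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.lib.InputSampler;
import org.apache.hadoop.mapred.lib.TotalOrderPartitioner;

public class TotalOrderDriverSketch {
  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf(TotalOrderDriverSketch.class);
    job.setNumReduceTasks(16);                          // 16 partitions, so 15 split keys
    job.setInputFormat(KeyValueTextInputFormat.class);  // Text keys for sampling
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    FileInputFormat.setInputPaths(job, new Path("/tmp/sort/in"));          // illustrative
    TotalOrderPartitioner.setPartitionFile(job, new Path("/tmp/sort/_partitions"));

    // Sample roughly 10% of keys, capped at 10,000, from at most 10 splits.
    InputSampler.Sampler<Text, Text> sampler =
        new InputSampler.RandomSampler<Text, Text>(0.1, 10000, 10);
    InputSampler.writePartitionFile(job, sampler);
    // configure() above later checks: splitPoints.length == getNumReduceTasks() - 1
  }
}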
 
Example 5
Source File: TotalOrderPartitioner.java    From hadoop-gpu with Apache License 2.0
/**
 * Read in the partition file and build indexing data structures.
 * If the keytype is {@link org.apache.hadoop.io.BinaryComparable} and
 * <tt>total.order.partitioner.natural.order</tt> is not false, a trie
 * of the first <tt>total.order.partitioner.max.trie.depth</tt>(2) + 1 bytes
 * will be built. Otherwise, keys will be located using a binary search of
 * the partition keyset using the {@link org.apache.hadoop.io.RawComparator}
 * defined for this job. The input file must be sorted with the same
 * comparator and contain {@link org.apache.hadoop.mapred.JobConf#getNumReduceTasks} - 1 keys.
 */
@SuppressWarnings("unchecked") // keytype from conf not static
public void configure(JobConf job) {
  try {
    String parts = getPartitionFile(job);
    final Path partFile = new Path(parts);
    final FileSystem fs = (DEFAULT_PATH.equals(parts))
      ? FileSystem.getLocal(job)     // assume in DistributedCache
      : partFile.getFileSystem(job);

    Class<K> keyClass = (Class<K>)job.getMapOutputKeyClass();
    K[] splitPoints = readPartitions(fs, partFile, keyClass, job);
    if (splitPoints.length != job.getNumReduceTasks() - 1) {
      throw new IOException("Wrong number of partitions in keyset");
    }
    RawComparator<K> comparator =
      (RawComparator<K>) job.getOutputKeyComparator();
    for (int i = 0; i < splitPoints.length - 1; ++i) {
      if (comparator.compare(splitPoints[i], splitPoints[i+1]) >= 0) {
        throw new IOException("Split points are out of order");
      }
    }
    boolean natOrder =
      job.getBoolean("total.order.partitioner.natural.order", true);
    if (natOrder && BinaryComparable.class.isAssignableFrom(keyClass)) {
      partitions = buildTrie((BinaryComparable[])splitPoints, 0,
          splitPoints.length, new byte[0],
          job.getInt("total.order.partitioner.max.trie.depth", 2));
    } else {
      partitions = new BinarySearchNode(splitPoints, comparator);
    }
  } catch (IOException e) {
    throw new IllegalArgumentException("Can't read partitions file", e);
  }
}
 
Example 6
Source File: TeraInputFormat.java    From hadoop-gpu with Apache License 2.0
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * @param conf the job to sample
 * @param partFile where to write the output file to
 * @throws IOException if something goes wrong
 */
public static void writePartitionFile(JobConf conf, 
                                      Path partFile) throws IOException {
  TeraInputFormat inFormat = new TeraInputFormat();
  TextSampler sampler = new TextSampler();
  Text key = new Text();
  Text value = new Text();
  int partitions = conf.getNumReduceTasks();
  long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
  InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks());
  int samples = Math.min(10, splits.length);
  long recordsPerSample = sampleSize / samples;
  int sampleStep = splits.length / samples;
  long records = 0;
  // take N samples from different parts of the input
  for(int i=0; i < samples; ++i) {
    RecordReader<Text,Text> reader = 
      inFormat.getRecordReader(splits[sampleStep * i], conf, null);
    while (reader.next(key, value)) {
      sampler.addKey(key);
      records += 1;
      if ((i+1) * recordsPerSample <= records) {
        break;
      }
    }
  }
  FileSystem outFs = partFile.getFileSystem(conf);
  if (outFs.exists(partFile)) {
    outFs.delete(partFile, false);
  }
  SequenceFile.Writer writer = 
    SequenceFile.createWriter(outFs, conf, partFile, Text.class, 
                              NullWritable.class);
  NullWritable nullValue = NullWritable.get();
  for(Text split : sampler.createPartitions(partitions)) {
    writer.append(split, nullValue);
  }
  writer.close();
}
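Because the partition count is taken from conf.getNumReduceTasks(), the reduce count must be final before sampling. A short driver fragment in that spirit (the paths are illustrative, and the import assumes the classic examples package for TeraInputFormat):

import java.io.IOException;
import org.apache.hadoop.examples.terasort.TeraInputFormat;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class TeraPartitionSketch {
  public static void main(String[] args) throws IOException {
    JobConf conf = new JobConf(TeraPartitionSketch.class);
    conf.setNumReduceTasks(64);                          // 64 partitions, 63 sample keys
    FileInputFormat.setInputPaths(conf, new Path("/teragen-out"));
    TeraInputFormat.writePartitionFile(conf, new Path("/terasort/_partition.lst"));
  }
}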
 
Example 7
Source File: TeraInputFormat.java    From hadoop-book with Apache License 2.0
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 *
 * @param conf the job to sample
 * @param partFile where to write the output file to
 * @throws IOException if something goes wrong
 */
public static void writePartitionFile(JobConf conf,
        Path partFile) throws IOException {
    TeraInputFormat inFormat = new TeraInputFormat();
    TextSampler sampler = new TextSampler();
    Text key = new Text();
    Text value = new Text();
    int partitions = conf.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
    InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks());
    int samples = Math.min(10, splits.length);
    long recordsPerSample = sampleSize / samples;
    int sampleStep = splits.length / samples;
    long records = 0;
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        RecordReader<Text, Text> reader =
                inFormat.getRecordReader(splits[sampleStep * i], conf, null);
        while (reader.next(key, value)) {
            sampler.addKey(key);
            records += 1;
            if ((i + 1) * recordsPerSample <= records) {
                break;
            }
        }
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    if (outFs.exists(partFile)) {
        outFs.delete(partFile, false);
    }
    SequenceFile.Writer writer =
            SequenceFile.createWriter(outFs, conf, partFile, Text.class,
            NullWritable.class);
    NullWritable nullValue = NullWritable.get();
    for (Text split : sampler.createPartitions(partitions)) {
        writer.append(split, nullValue);
    }
    writer.close();
}
 
Example 8
Source File: FetcherOutputFormat.java    From nutch-htmlunit with Apache License 2.0
public void checkOutputSpecs(FileSystem fs, JobConf job) throws IOException {
  Path out = FileOutputFormat.getOutputPath(job);
  if ((out == null) && (job.getNumReduceTasks() != 0)) {
    throw new InvalidJobConfException("Output directory not set in JobConf.");
  }
  if (fs == null) {
  	fs = out.getFileSystem(job);
  }
  if (fs.exists(new Path(out, CrawlDatum.FETCH_DIR_NAME)))
    throw new IOException("Segment already fetched!");
}
 
Example 9
Source File: InputSampler.java    From RDFS with Apache License 2.0
/**
 * Write a partition file for the given job, using the Sampler provided.
 * Queries the sampler for a sample keyset, sorts by the output key
 * comparator, selects the keys for each rank, and writes to the destination
 * returned from {@link org.apache.hadoop.mapred.lib.TotalOrderPartitioner#getPartitionFile}.
 */
@SuppressWarnings("unchecked") // getInputFormat, getOutputKeyComparator
public static <K,V> void writePartitionFile(JobConf job,
    Sampler<K,V> sampler) throws IOException {
  final InputFormat<K,V> inf = (InputFormat<K,V>) job.getInputFormat();
  int numPartitions = job.getNumReduceTasks();
  K[] samples = sampler.getSample(inf, job);
  LOG.info("Using " + samples.length + " samples");
  RawComparator<K> comparator =
    (RawComparator<K>) job.getOutputKeyComparator();
  Arrays.sort(samples, comparator);
  Path dst = new Path(TotalOrderPartitioner.getPartitionFile(job));
  FileSystem fs = dst.getFileSystem(job);
  if (fs.exists(dst)) {
    fs.delete(dst, false);
  }
  SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, dst,
      job.getMapOutputKeyClass(), NullWritable.class);
  NullWritable nullValue = NullWritable.get();
  float stepSize = samples.length / (float) numPartitions;
  int last = -1;
  for(int i = 1; i < numPartitions; ++i) {
    int k = Math.round(stepSize * i);
    while (last >= k && comparator.compare(samples[last], samples[k]) == 0) {
      ++k;
    }
    writer.append(samples[k], nullValue);
    last = k;
  }
  writer.close();
}
 
Example 10
Source File: JobSubmitter.java    From hadoop with Apache License 2.0
private void checkSpecs(Job job) throws ClassNotFoundException, 
    InterruptedException, IOException {
  JobConf jConf = (JobConf)job.getConfiguration();
  // Check the output specification
  if (jConf.getNumReduceTasks() == 0 ? 
      jConf.getUseNewMapper() : jConf.getUseNewReducer()) {
    org.apache.hadoop.mapreduce.OutputFormat<?, ?> output =
      ReflectionUtils.newInstance(job.getOutputFormatClass(),
        job.getConfiguration());
    output.checkOutputSpecs(job);
  } else {
    jConf.getOutputFormat().checkOutputSpecs(jtFs, jConf);
  }
}
 
Example 11
Source File: TeraInputFormat.java    From RDFS with Apache License 2.0
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * @param conf the job to sample
 * @param partFile where to write the output file to
 * @throws IOException if something goes wrong
 */
public static void writePartitionFile(JobConf conf, 
                                      Path partFile) throws IOException {
  TeraInputFormat inFormat = new TeraInputFormat();
  TextSampler sampler = new TextSampler();
  Text key = new Text();
  Text value = new Text();
  int partitions = conf.getNumReduceTasks();
  long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
  InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks());
  int samples = Math.min(10, splits.length);
  long recordsPerSample = sampleSize / samples;
  int sampleStep = splits.length / samples;
  long records = 0;
  // take N samples from different parts of the input
  for(int i=0; i < samples; ++i) {
    RecordReader<Text,Text> reader = 
      inFormat.getRecordReader(splits[sampleStep * i], conf, null);
    while (reader.next(key, value)) {
      sampler.addKey(key);
      records += 1;
      if ((i+1) * recordsPerSample <= records) {
        break;
      }
    }
  }
  FileSystem outFs = partFile.getFileSystem(conf);
  if (outFs.exists(partFile)) {
    outFs.delete(partFile, false);
  }
  SequenceFile.Writer writer = 
    SequenceFile.createWriter(outFs, conf, partFile, Text.class, 
                              NullWritable.class);
  NullWritable nullValue = NullWritable.get();
  for(Text split : sampler.createPartitions(partitions)) {
    writer.append(split, nullValue);
  }
  writer.close();
}
 
Example 12
Source File: SleepJob.java    From RDFS with Apache License 2.0
public RecordReader<IntWritable,IntWritable> getRecordReader(
    InputSplit ignored, JobConf conf, Reporter reporter)
    throws IOException {
  final int count = conf.getInt("sleep.job.map.sleep.count", 1);
  if (count < 0) throw new IOException("Invalid map count: " + count);
  final int redcount = conf.getInt("sleep.job.reduce.sleep.count", 1);
  if (redcount < 0)
    throw new IOException("Invalid reduce count: " + redcount);
  final int emitPerMapTask = (redcount * conf.getNumReduceTasks());
  return new RecordReader<IntWritable,IntWritable>() {
    private int records = 0;
    private int emitCount = 0;
    
    public boolean next(IntWritable key, IntWritable value)
        throws IOException {
      key.set(emitCount);
      int emit = emitPerMapTask / count;
      if ((emitPerMapTask) % count > records) {
        ++emit;
      }
      emitCount += emit;
      value.set(emit);
      return records++ < count;
    }
    public IntWritable createKey() { return new IntWritable(); }
    public IntWritable createValue() { return new IntWritable(); }
    public long getPos() throws IOException { return records; }
    public void close() throws IOException { }
    public float getProgress() throws IOException {
      return records / ((float)count);
    }
  };
}
 
Example 13
Source File: TableMapReduceUtil.java    From hbase with Apache License 2.0
/**
 * Ensures that the given number of reduce tasks for the given job
 * configuration does not exceed the number of regions for the given table.
 *
 * @param table  The table to get the region count for.
 * @param job  The current job configuration to adjust.
 * @throws IOException When retrieving the table details fails.
 */
// Used by tests.
public static void limitNumReduceTasks(String table, JobConf job)
throws IOException {
  int regions =
    MetaTableAccessor.getRegionCount(HBaseConfiguration.create(job), TableName.valueOf(table));
  if (job.getNumReduceTasks() > regions)
    job.setNumReduceTasks(regions);
}
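A caller would typically pick an optimistic reduce count first and then let the helper cap it; a hedged sketch follows (the table name and initial count are placeholders, and resolving the region count needs a reachable HBase cluster):

import org.apache.hadoop.hbase.mapred.TableMapReduceUtil;
import org.apache.hadoop.mapred.JobConf;

public class LimitReducersSketch {
  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf();
    job.setNumReduceTasks(100);   // optimistic initial value
    TableMapReduceUtil.limitNumReduceTasks("my_table", job);
    System.out.println("Effective reducers: " + job.getNumReduceTasks());
  }
}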
 
Example 14
Source File: FetcherOutputFormat.java    From anthelion with Apache License 2.0
public void checkOutputSpecs(FileSystem fs, JobConf job) throws IOException {
  Path out = FileOutputFormat.getOutputPath(job);
  if ((out == null) && (job.getNumReduceTasks() != 0)) {
    throw new InvalidJobConfException("Output directory not set in JobConf.");
  }
  if (fs == null) {
  	fs = out.getFileSystem(job);
  }
  if (fs.exists(new Path(out, CrawlDatum.FETCH_DIR_NAME)))
    throw new IOException("Segment already fetched!");
}
 
Example 15
Source File: JobSubmitter.java    From big-c with Apache License 2.0
private void checkSpecs(Job job) throws ClassNotFoundException, 
    InterruptedException, IOException {
  JobConf jConf = (JobConf)job.getConfiguration();
  // Check the output specification
  if (jConf.getNumReduceTasks() == 0 ? 
      jConf.getUseNewMapper() : jConf.getUseNewReducer()) {
    org.apache.hadoop.mapreduce.OutputFormat<?, ?> output =
      ReflectionUtils.newInstance(job.getOutputFormatClass(),
        job.getConfiguration());
    output.checkOutputSpecs(job);
  } else {
    jConf.getOutputFormat().checkOutputSpecs(jtFs, jConf);
  }
}
 
Example 16
Source File: SegmentMerger.java    From nutch-htmlunit with Apache License 2.0
public void configure(JobConf conf) {
  setConf(conf);
  if (sliceSize > 0) {
    sliceSize = sliceSize / conf.getNumReduceTasks();
  }
}
 
Example 17
Source File: CrawlDbReader.java    From nutch-htmlunit with Apache License 2.0
public void configure(JobConf job) {
  topN = job.getLong("db.reader.topn", 100) / job.getNumReduceTasks();
}
 
Example 18
Source File: CrawlDbReader.java    From anthelion with Apache License 2.0
public void configure(JobConf job) {
  topN = job.getLong("db.reader.topn", 100) / job.getNumReduceTasks();
}
 
Example 19
Source File: SegmentMerger.java    From anthelion with Apache License 2.0
public void configure(JobConf conf) {
  setConf(conf);
  if (sliceSize > 0) {
    sliceSize = sliceSize / conf.getNumReduceTasks();
  }
}
 
Example 20
Source File: TaskCalculator.java    From emr-dynamodb-connector with Apache License 2.0
public int getMaxMapTasks() throws IOException {
  JobConf conf = (JobConf) jobClient.getConf();

  // Total number of nodes in the cluster
  int nodes = jobClient.getClusterStatus().getTaskTrackers();
  log.info("Cluster has " + nodes + " active nodes.");
  if (nodes == 0) {
    log.warn("Cluster doesn't have any nodes");
    return 0;
  }

  // Memory per slot
  int slotMemory = conf.getInt("yarn.scheduler.minimum-allocation-mb", 1024); // Default value
  // from yarn-default.xml

  // Number of slots in a core node
  int nodeMemory = nodeCapacityProvider.getCoreNodeMemoryMB();
  int nodeSlots = nodeMemory / slotMemory;

  // Number of slots for a mapper
  int mapMemory = conf.getInt(MRJobConfig.MAP_MEMORY_MB, MRJobConfig.DEFAULT_MAP_MEMORY_MB);
  int mapSlots = (int) Math.ceil((double) mapMemory / slotMemory);

  // Number of slots for an application master
  int amMemory = conf.getInt(MRJobConfig.MR_AM_VMEM_MB, MRJobConfig.DEFAULT_MR_AM_VMEM_MB);
  int appMasterSlots = (int) Math.ceil((double) amMemory / slotMemory);

  // Number of slots for a reducer
  int reduceMemory = conf.getInt(MRJobConfig.REDUCE_MEMORY_MB, MRJobConfig
      .DEFAULT_REDUCE_MEMORY_MB);
  int reduceSlots = (int) Math.ceil((double) reduceMemory / slotMemory);

  // Number of reducers
  int reducers = conf.getNumReduceTasks();

  // Calculate the number of mappers
  int mappers = yarnContainerAllocator.getMaxMappers(nodes, reducers, nodeSlots,
      appMasterSlots, mapSlots, reduceSlots);

  log.info("Slot size: " + slotMemory + "MB.");
  log.info("Node manager can allocate " + nodeMemory + "MB (" + nodeSlots + " slots) for "
      + "containers on each node.");
  log.info("Each mapper needs: " + mapMemory + "MB. (" + mapSlots + " slots)");
  log.info("Each reducer needs: " + reduceMemory + "MB. (" + reduceSlots + " slots)");
  log.info("MapReduce Application Manager needs: " + amMemory + " MB. (" + appMasterSlots + " "
      + "slots)");
  log.info("Number of reducers: " + reducers);
  log.info("Max number of cluster map tasks: " + mappers);

  if (mappers < 1) {
    log.warn("The calculated max number of concurrent map tasks is less than 1. Use 1 instead.");
    mappers = 1;
  }

  return mappers;
}
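To make the slot arithmetic concrete, here is the same ceiling-based calculation with made-up numbers (illustrative values, not defaults of any particular cluster): a 1024 MB slot and a 12288 MB core node give 12 slots per node; 1536 MB mappers need 2 slots each, while 3072 MB reducers and a 3072 MB application master need 3 each.

// Standalone arithmetic check of the slot computation above (illustrative numbers).
public class SlotMathSketch {
  public static void main(String[] args) {
    int slotMemory = 1024;                                     // MB per slot
    int nodeMemory = 12288;                                    // MB per core node
    int nodeSlots = nodeMemory / slotMemory;                   // 12
    int mapSlots = (int) Math.ceil(1536.0 / slotMemory);       // 2
    int reduceSlots = (int) Math.ceil(3072.0 / slotMemory);    // 3
    int appMasterSlots = (int) Math.ceil(3072.0 / slotMemory); // 3
    System.out.printf("nodeSlots=%d mapSlots=%d reduceSlots=%d amSlots=%d%n",
        nodeSlots, mapSlots, reduceSlots, appMasterSlots);
    // How many of the remaining slots go to mappers is decided by
    // yarnContainerAllocator.getMaxMappers(), which is project-specific.
  }
}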