Java Code Examples for org.apache.hadoop.mapreduce.Job#getNumReduceTasks()

The following examples show how to use org.apache.hadoop.mapreduce.Job#getNumReduceTasks(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: PGBulkloadExportJob.java    From aliyun-maxcompute-data-collectors with Apache License 2.0 5 votes vote down vote up
/**
 * Ensures the job is configured with at least one reduce task.
 *
 * <p>If the job currently reports fewer than one reduce task, the count is
 * set to one; otherwise the configured value is left untouched.
 *
 * @param job the job whose reduce task count is inspected and possibly raised
 * @return the reduce task count reported by the job after the adjustment
 * @throws IOException declared by the overridden method; nothing in this
 *         body throws it directly
 */
@Override
protected int configureNumReduceTasks(Job job) throws IOException {
  final boolean hasReducer = job.getNumReduceTasks() >= 1;
  if (!hasReducer) {
    // Raise the count to the minimum of one reducer.
    job.setNumReduceTasks(1);
  }
  // Re-read from the job so the caller sees the value actually in effect.
  return job.getNumReduceTasks();
}
 
Example 2
Source File: InputSampler.java    From hadoop with Apache License 2.0 5 votes vote down vote up
/**
 * Write a partition file for the given job, using the Sampler provided.
 * Queries the sampler for a sample keyset, sorts by the output key
 * comparator, selects the keys for each rank, and writes to the destination
 * returned from {@link TotalOrderPartitioner#getPartitionFile}.
 *
 * <p>One key is written per reducer boundary, i.e.
 * {@link Job#getNumReduceTasks()} - 1 keys in total; with one reducer or
 * fewer, no keys are appended. Any file already present at the destination is
 * deleted before writing. The writer is always closed, even when appending a
 * key fails.
 *
 * @param job the job supplying the configuration, input format, sort
 *        comparator, map output key class and number of reduce tasks
 * @param sampler the sampler queried for the sample keyset
 * @throws IOException if a file-system operation or the writer fails
 * @throws ClassNotFoundException declared by the signature; presumably raised
 *         when a job class cannot be resolved
 * @throws InterruptedException declared by the signature; presumably raised
 *         when sampling is interrupted
 */
@SuppressWarnings("unchecked") // getInputFormat, getOutputKeyComparator
public static <K,V> void writePartitionFile(Job job, Sampler<K,V> sampler) 
    throws IOException, ClassNotFoundException, InterruptedException {
  Configuration conf = job.getConfiguration();
  final InputFormat inf = 
      ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
  int numPartitions = job.getNumReduceTasks();
  K[] samples = (K[])sampler.getSample(inf, job);
  LOG.info("Using " + samples.length + " samples");
  RawComparator<K> comparator =
    (RawComparator<K>) job.getSortComparator();
  Arrays.sort(samples, comparator);
  Path dst = new Path(TotalOrderPartitioner.getPartitionFile(conf));
  FileSystem fs = dst.getFileSystem(conf);
  // Replace any stale partition file left at the destination.
  if (fs.exists(dst)) {
    fs.delete(dst, false);
  }
  SequenceFile.Writer writer = SequenceFile.createWriter(fs, 
    conf, dst, job.getMapOutputKeyClass(), NullWritable.class);
  try {
    NullWritable nullValue = NullWritable.get();
    // Distance, in samples, between consecutive split points.
    float stepSize = samples.length / (float) numPartitions;
    int last = -1;
    for(int i = 1; i < numPartitions; ++i) {
      int k = Math.round(stepSize * i);
      // Skip forward past keys equal to the previously written split point so
      // that no two split points compare as equal.
      // NOTE(review): k is not bounds-checked against samples.length here.
      while (last >= k && comparator.compare(samples[last], samples[k]) == 0) {
        ++k;
      }
      writer.append(samples[k], nullValue);
      last = k;
    }
  } finally {
    // Release the underlying stream even if appending a key failed.
    writer.close();
  }
}
 
Example 3
Source File: InputSampler.java    From big-c with Apache License 2.0 5 votes vote down vote up
/**
 * Write a partition file for the given job, using the Sampler provided.
 * Queries the sampler for a sample keyset, sorts by the output key
 * comparator, selects the keys for each rank, and writes to the destination
 * returned from {@link TotalOrderPartitioner#getPartitionFile}.
 *
 * <p>One key is written per reducer boundary, i.e.
 * {@link Job#getNumReduceTasks()} - 1 keys in total; with one reducer or
 * fewer, no keys are appended. Any file already present at the destination is
 * deleted before writing. The writer is always closed, even when appending a
 * key fails.
 *
 * @param job the job supplying the configuration, input format, sort
 *        comparator, map output key class and number of reduce tasks
 * @param sampler the sampler queried for the sample keyset
 * @throws IOException if a file-system operation or the writer fails
 * @throws ClassNotFoundException declared by the signature; presumably raised
 *         when a job class cannot be resolved
 * @throws InterruptedException declared by the signature; presumably raised
 *         when sampling is interrupted
 */
@SuppressWarnings("unchecked") // getInputFormat, getOutputKeyComparator
public static <K,V> void writePartitionFile(Job job, Sampler<K,V> sampler) 
    throws IOException, ClassNotFoundException, InterruptedException {
  Configuration conf = job.getConfiguration();
  final InputFormat inf = 
      ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
  int numPartitions = job.getNumReduceTasks();
  K[] samples = (K[])sampler.getSample(inf, job);
  LOG.info("Using " + samples.length + " samples");
  RawComparator<K> comparator =
    (RawComparator<K>) job.getSortComparator();
  Arrays.sort(samples, comparator);
  Path dst = new Path(TotalOrderPartitioner.getPartitionFile(conf));
  FileSystem fs = dst.getFileSystem(conf);
  // Replace any stale partition file left at the destination.
  if (fs.exists(dst)) {
    fs.delete(dst, false);
  }
  SequenceFile.Writer writer = SequenceFile.createWriter(fs, 
    conf, dst, job.getMapOutputKeyClass(), NullWritable.class);
  try {
    NullWritable nullValue = NullWritable.get();
    // Distance, in samples, between consecutive split points.
    float stepSize = samples.length / (float) numPartitions;
    int last = -1;
    for(int i = 1; i < numPartitions; ++i) {
      int k = Math.round(stepSize * i);
      // Skip forward past keys equal to the previously written split point so
      // that no two split points compare as equal.
      // NOTE(review): k is not bounds-checked against samples.length here.
      while (last >= k && comparator.compare(samples[last], samples[k]) == 0) {
        ++k;
      }
      writer.append(samples[k], nullValue);
      last = k;
    }
  } finally {
    // Release the underlying stream even if appending a key failed.
    writer.close();
  }
}
 
Example 4
Source File: IngestWithReducerJobRunner.java    From geowave with Apache License 2.0 5 votes vote down vote up
/**
 * Installs {@code IngestReducer} as the job's reducer and, when the job is
 * configured with one reduce task or fewer, raises the count to 8.
 *
 * @param job the job to configure
 */
@Override
protected void setupReducer(final Job job) {
  job.setReducerClass(IngestReducer.class);
  final int configuredReducers = job.getNumReduceTasks();
  if (configuredReducers > 1) {
    // More than one reducer was requested; respect that setting.
    return;
  }
  // Per the original author's note, one reducer is the framework default, so
  // a count of one (or less) is replaced with this runner's default of 8.
  job.setNumReduceTasks(8);
}
 
Example 5
Source File: TotalOrderPartitioner.java    From hadoop with Apache License 2.0 4 votes vote down vote up
/**
 * Loads the partition file named by the configuration and builds the lookup
 * structure stored in {@code partitions}.
 *
 * <p>The split points are read with {@code readPartitions}, must number
 * exactly {@link Job#getNumReduceTasks()} - 1, and must be strictly
 * increasing under the job's sort comparator. When {@code NATURAL_ORDER} is
 * not disabled (it defaults to {@code true}) and the map output key class is
 * a {@link org.apache.hadoop.io.BinaryComparable}, a trie is built with a
 * depth limit read from {@code MAX_TRIE_DEPTH} (default 200). Otherwise keys
 * are located by binary search over the split points using the job's
 * {@link org.apache.hadoop.io.RawComparator}.
 *
 * @param conf configuration supplying the partition file location, the job
 *        settings and the lookup options
 * @throws IllegalArgumentException if an {@link IOException} is raised while
 *         reading or validating the partition file
 */
@SuppressWarnings("unchecked") // keytype from conf not static
public void setConf(Configuration conf) {
  try {
    this.conf = conf;
    final String partitionFileName = getPartitionFile(conf);
    final Path partitionPath = new Path(partitionFileName);
    final FileSystem fileSystem;
    if (DEFAULT_PATH.equals(partitionFileName)) {
      // assume in DistributedCache
      fileSystem = FileSystem.getLocal(conf);
    } else {
      fileSystem = partitionPath.getFileSystem(conf);
    }

    final Job job = Job.getInstance(conf);
    final Class<K> keyType = (Class<K>) job.getMapOutputKeyClass();
    final K[] splits = readPartitions(fileSystem, partitionPath, keyType, conf);

    // N reducers need exactly N - 1 boundaries between them.
    final int expectedSplits = job.getNumReduceTasks() - 1;
    if (splits.length != expectedSplits) {
      throw new IOException("Wrong number of partitions in keyset");
    }

    // Each split point must sort strictly after its predecessor.
    final RawComparator<K> sortOrder =
        (RawComparator<K>) job.getSortComparator();
    for (int next = 1; next < splits.length; ++next) {
      if (sortOrder.compare(splits[next - 1], splits[next]) >= 0) {
        throw new IOException("Split points are out of order");
      }
    }

    final boolean naturalOrder = conf.getBoolean(NATURAL_ORDER, true);
    final boolean useTrie =
        naturalOrder && BinaryComparable.class.isAssignableFrom(keyType);
    if (!useTrie) {
      partitions = new BinarySearchNode(splits, sortOrder);
    } else {
      // Original author's rationale for the default: the depth limit only
      // guards against stack overflow or bloat in the pathological case of
      // long split points that mostly look alike, so the default is large
      // but not huge.
      final int maxDepth = conf.getInt(MAX_TRIE_DEPTH, 200);
      partitions = buildTrie((BinaryComparable[]) splits, 0,
          splits.length, new byte[0], maxDepth);
    }
  } catch (IOException e) {
    throw new IllegalArgumentException("Can't read partitions file", e);
  }
}
 
Example 6
Source File: TotalOrderPartitioner.java    From big-c with Apache License 2.0 4 votes vote down vote up
/**
 * Loads the partition file named by the configuration and builds the lookup
 * structure stored in {@code partitions}.
 *
 * <p>The split points are read with {@code readPartitions}, must number
 * exactly {@link Job#getNumReduceTasks()} - 1, and must be strictly
 * increasing under the job's sort comparator. When {@code NATURAL_ORDER} is
 * not disabled (it defaults to {@code true}) and the map output key class is
 * a {@link org.apache.hadoop.io.BinaryComparable}, a trie is built with a
 * depth limit read from {@code MAX_TRIE_DEPTH} (default 200). Otherwise keys
 * are located by binary search over the split points using the job's
 * {@link org.apache.hadoop.io.RawComparator}.
 *
 * @param conf configuration supplying the partition file location, the job
 *        settings and the lookup options
 * @throws IllegalArgumentException if an {@link IOException} is raised while
 *         reading or validating the partition file
 */
@SuppressWarnings("unchecked") // keytype from conf not static
public void setConf(Configuration conf) {
  try {
    this.conf = conf;
    final String partitionFileName = getPartitionFile(conf);
    final Path partitionPath = new Path(partitionFileName);
    final FileSystem fileSystem;
    if (DEFAULT_PATH.equals(partitionFileName)) {
      // assume in DistributedCache
      fileSystem = FileSystem.getLocal(conf);
    } else {
      fileSystem = partitionPath.getFileSystem(conf);
    }

    final Job job = Job.getInstance(conf);
    final Class<K> keyType = (Class<K>) job.getMapOutputKeyClass();
    final K[] splits = readPartitions(fileSystem, partitionPath, keyType, conf);

    // N reducers need exactly N - 1 boundaries between them.
    final int expectedSplits = job.getNumReduceTasks() - 1;
    if (splits.length != expectedSplits) {
      throw new IOException("Wrong number of partitions in keyset");
    }

    // Each split point must sort strictly after its predecessor.
    final RawComparator<K> sortOrder =
        (RawComparator<K>) job.getSortComparator();
    for (int next = 1; next < splits.length; ++next) {
      if (sortOrder.compare(splits[next - 1], splits[next]) >= 0) {
        throw new IOException("Split points are out of order");
      }
    }

    final boolean naturalOrder = conf.getBoolean(NATURAL_ORDER, true);
    final boolean useTrie =
        naturalOrder && BinaryComparable.class.isAssignableFrom(keyType);
    if (!useTrie) {
      partitions = new BinarySearchNode(splits, sortOrder);
    } else {
      // Original author's rationale for the default: the depth limit only
      // guards against stack overflow or bloat in the pathological case of
      // long split points that mostly look alike, so the default is large
      // but not huge.
      final int maxDepth = conf.getInt(MAX_TRIE_DEPTH, 200);
      partitions = buildTrie((BinaryComparable[]) splits, 0,
          splits.length, new byte[0], maxDepth);
    }
  } catch (IOException e) {
    throw new IllegalArgumentException("Can't read partitions file", e);
  }
}
 
Example 7
Source File: TableMapReduceUtil.java    From hbase with Apache License 2.0 4 votes vote down vote up
/**
 * Configures a job that writes its reduce output to an HBase table. Call
 * this before submitting the job.
 *
 * <p>The job's configuration is merged with the HBase configuration, the
 * output format is set to {@code TableOutputFormat}, the mutation and result
 * serializations are appended to {@code io.serializations}, and the output
 * key/value classes are set. When {@code HRegionPartitioner} is requested,
 * the number of reduce tasks is capped at the table's region count.
 *
 * @param table  The output table.
 * @param reducer  The reducer class to use; left unset on the job when
 * <code>null</code>.
 * @param job  The current job to adjust.  Make sure the passed job is
 * carrying all necessary HBase configuration.
 * @param partitioner  Partitioner to use. Pass <code>null</code> to use
 * default partitioner.
 * @param quorumAddress Distant cluster to write to; default is null for
 * output to the cluster that is designated in <code>hbase-site.xml</code>.
 * To write to a different cluster (e.g. when copying tables between
 * clusters), pass that cluster's zookeeper ensemble in the form
 * <code> &lt;hbase.zookeeper.quorum&gt;:&lt;
 *             hbase.zookeeper.client.port&gt;:&lt;zookeeper.znode.parent&gt;
 * </code> such as <code>server,server2,server3:2181:/hbase</code>.
 * The value is validated before it is stored.
 * @param serverClass redefined hbase.regionserver.class; applied only when
 * <code>serverImpl</code> is also non-null
 * @param serverImpl redefined hbase.regionserver.impl; applied only when
 * <code>serverClass</code> is also non-null
 * @param addDependencyJars upload HBase jars and jars for any of the configured
 *           job classes via the distributed cache (tmpjars).
 * @throws IOException When determining the region count fails.
 */
public static void initTableReducerJob(String table,
  Class<? extends TableReducer> reducer, Job job,
  Class partitioner, String quorumAddress, String serverClass,
  String serverImpl, boolean addDependencyJars) throws IOException {

  final Configuration jobConf = job.getConfiguration();
  HBaseConfiguration.merge(jobConf, HBaseConfiguration.create(jobConf));
  job.setOutputFormatClass(TableOutputFormat.class);
  if (reducer != null) {
    job.setReducerClass(reducer);
  }
  jobConf.set(TableOutputFormat.OUTPUT_TABLE, table);

  // Keep whatever serializations are already configured and append the
  // HBase ones after them.
  final String existingSerializations = jobConf.get("io.serializations");
  jobConf.setStrings("io.serializations", existingSerializations,
      MutationSerialization.class.getName(), ResultSerialization.class.getName());

  // If passed a quorum/ensemble address, pass it on to TableOutputFormat.
  if (quorumAddress != null) {
    // Validation happens first so a malformed key is never stored.
    ZKConfig.validateClusterKey(quorumAddress);
    jobConf.set(TableOutputFormat.QUORUM_ADDRESS,quorumAddress);
  }

  // The region server overrides are only meaningful as a pair.
  final boolean overrideRegionServer = serverClass != null && serverImpl != null;
  if (overrideRegionServer) {
    jobConf.set(TableOutputFormat.REGION_SERVER_CLASS, serverClass);
    jobConf.set(TableOutputFormat.REGION_SERVER_IMPL, serverImpl);
  }

  job.setOutputKeyClass(ImmutableBytesWritable.class);
  job.setOutputValueClass(Writable.class);

  if (partitioner != null) {
    if (partitioner == HRegionPartitioner.class) {
      job.setPartitionerClass(HRegionPartitioner.class);
      // Cap the reducer count at the number of regions in the table.
      final int regionCount =
          MetaTableAccessor.getRegionCount(jobConf, TableName.valueOf(table));
      if (regionCount < job.getNumReduceTasks()) {
        job.setNumReduceTasks(regionCount);
      }
    } else {
      job.setPartitionerClass(partitioner);
    }
  }

  if (addDependencyJars) {
    addDependencyJars(job);
  }

  initCredentials(job);
}
 
Example 8
Source File: TableMapReduceUtil.java    From hbase with Apache License 2.0 3 votes vote down vote up
/**
 * Caps the job's reduce task count at the number of regions of the given
 * table. A count that is already at or below the region count is left
 * unchanged.
 *
 * @param table  The table to get the region count for.
 * @param job  The current job to adjust.
 * @throws IOException When retrieving the table details fails.
 */
public static void limitNumReduceTasks(String table, Job job)
throws IOException {
  final int regionCount = MetaTableAccessor.getRegionCount(
      job.getConfiguration(), TableName.valueOf(table));
  final boolean tooManyReducers = regionCount < job.getNumReduceTasks();
  if (tooManyReducers) {
    job.setNumReduceTasks(regionCount);
  }
}