Java Code Examples for org.apache.hadoop.mapred.JobConf#getInputFormat()

The following examples show how to use org.apache.hadoop.mapred.JobConf#getInputFormat(). You can vote up the examples you like or vote down the ones you don't, and follow the links above each example to go to the original project or source file. Related API usage is listed in the sidebar.
Example 1
Source File: HiveMetadataUtils.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Resolves the input-format class for {@code table}, registers it on a fresh
 * {@link JobConf} derived from {@code hiveConf}, and returns what
 * {@link JobConf#getInputFormat()} then yields.
 *
 * <p>The work runs inside a {@code ContextClassLoaderSwapper} scope, which is
 * closed again when the method exits (presumably restoring the previous thread
 * context class loader; confirm against that class).
 *
 * @param table table whose input format is wanted
 * @param hiveConf configuration the job configuration is copied from
 * @return the input format reported by the job configuration
 */
public static InputFormat<?, ?> getInputFormat(Table table, final HiveConf hiveConf) {
  try (final ContextClassLoaderSwapper swapper = ContextClassLoaderSwapper.newInstance()) {
    final JobConf jobConf = new JobConf(hiveConf);
    jobConf.setInputFormat(getInputFormatClass(jobConf, table, null));
    return jobConf.getInputFormat();
  }
}
 
Example 2
Source File: HiveMetadataUtils.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the {@link InputFormat} for the given table.
 *
 * <p>A new {@link JobConf} is created from {@code hiveConf}, the table's
 * input-format class (as resolved by {@code getInputFormatClass}) is set on it,
 * and the result of {@link JobConf#getInputFormat()} is handed back. The whole
 * sequence is wrapped in a {@code ContextClassLoaderSwapper} resource.
 *
 * @param table table to look the input format up for
 * @param hiveConf base configuration for the temporary job configuration
 * @return the input format obtained from the temporary job configuration
 */
public static InputFormat<?, ?> getInputFormat(Table table, final HiveConf hiveConf) {
  try (final ContextClassLoaderSwapper classLoaderScope = ContextClassLoaderSwapper.newInstance()) {
    final JobConf conf = new JobConf(hiveConf);
    final Class<? extends InputFormat> formatClass = getInputFormatClass(conf, table, null);
    conf.setInputFormat(formatClass);
    final InputFormat<?, ?> format = conf.getInputFormat();
    return format;
  }
}
 
Example 3
Source File: AbstractEvaluatorToPartitionStrategy.java    From reef with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the strategy and computes the input splits of every partition.
 *
 * <p>Each serialized partition is deserialized, a {@link JobConf} is built for
 * its path and the given input format class name, and that job's
 * {@link InputFormat} is asked for the partition's desired number of splits.
 * The per-partition results are accumulated and finally passed to
 * {@code init}; {@code setUp} is invoked before any splits are computed.
 *
 * @param inputFormatClassName name of the input format class; must not be empty
 * @param serializedDataPartitions serialized partitions; must not be empty
 * @throws RuntimeException wrapping the {@link IOException} if splits cannot
 *         be obtained from the input format
 */
@SuppressWarnings("rawtypes")
AbstractEvaluatorToPartitionStrategy(
    final String inputFormatClassName, final Set<String> serializedDataPartitions) {
  LOG.fine("AbstractEvaluatorToPartitionStrategy injected");
  Validate.notEmpty(inputFormatClassName);
  Validate.notEmpty(serializedDataPartitions);

  this.locationToSplits = new ConcurrentHashMap<>();
  this.evaluatorToSplits = new ConcurrentHashMap<>();
  this.unallocatedSplits = new LinkedBlockingQueue<>();
  setUp();

  final Map<DistributedDataSetPartition, InputSplit[]> partitionToSplits = new HashMap<>();
  for (final String serialized : serializedDataPartitions) {
    final DistributedDataSetPartition partition =
        DistributedDataSetPartitionSerializer.deserialize(serialized);
    final ExternalConstructor<JobConf> jobConfFactory =
        new JobConfExternalConstructor(inputFormatClassName, partition.getPath());
    try {
      final JobConf conf = jobConfFactory.newInstance();
      final InputFormat format = conf.getInputFormat();
      final InputSplit[] splits = format.getSplits(conf, partition.getDesiredSplits());
      // Guarded so the array is only rendered to a string when FINEST is enabled.
      if (LOG.isLoggable(Level.FINEST)) {
        LOG.log(Level.FINEST, "Splits for partition: {0} {1}",
            new Object[] {partition, Arrays.toString(splits)});
      }
      this.totalNumberOfSplits += splits.length;
      partitionToSplits.put(partition, splits);
    } catch (final IOException e) {
      throw new RuntimeException("Unable to get InputSplits using the specified InputFormat", e);
    }
  }
  init(partitionToSplits);
  LOG.log(Level.FINE, "Total Number of splits: {0}", this.totalNumberOfSplits);
}
 
Example 4
Source File: InputSampler.java    From RDFS with Apache License 2.0 5 votes vote down vote up
/**
 * Write a partition file for the given job, using the Sampler provided.
 * Queries the sampler for a sample keyset, sorts by the output key
 * comparator, selects the keys for each rank, and writes to the destination
 * returned from {@link
   org.apache.hadoop.mapred.lib.TotalOrderPartitioner#getPartitionFile}.
 * An existing file at the destination is deleted before writing. The
 * writer is closed even if selecting or appending a key fails, so the
 * open output is not leaked on error.
 *
 * @param job job configuration supplying the input format, the number of
 *            reduce tasks, the output key comparator and the destination
 * @param sampler sampler queried for the sample keyset
 * @throws IOException propagated from the sampler, file system or writer
 */
@SuppressWarnings("unchecked") // getInputFormat, getOutputKeyComparator
public static <K,V> void writePartitionFile(JobConf job,
    Sampler<K,V> sampler) throws IOException {
  final InputFormat<K,V> inf = (InputFormat<K,V>) job.getInputFormat();
  int numPartitions = job.getNumReduceTasks();
  K[] samples = sampler.getSample(inf, job);
  LOG.info("Using " + samples.length + " samples");
  RawComparator<K> comparator =
    (RawComparator<K>) job.getOutputKeyComparator();
  Arrays.sort(samples, comparator);
  Path dst = new Path(TotalOrderPartitioner.getPartitionFile(job));
  FileSystem fs = dst.getFileSystem(job);
  if (fs.exists(dst)) {
    fs.delete(dst, false);
  }
  SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, dst,
      job.getMapOutputKeyClass(), NullWritable.class);
  try {
    NullWritable nullValue = NullWritable.get();
    float stepSize = samples.length / (float) numPartitions;
    int last = -1;
    // One split key per partition boundary: numPartitions - 1 keys in total.
    for(int i = 1; i < numPartitions; ++i) {
      int k = Math.round(stepSize * i);
      // Advance while the candidate compares equal to the key written last,
      // so the same key is not emitted for two boundaries.
      while (last >= k && comparator.compare(samples[last], samples[k]) == 0) {
        ++k;
      }
      writer.append(samples[k], nullValue);
      last = k;
    }
  } finally {
    // Release the writer on both the success and the failure path.
    writer.close();
  }
}
 
Example 5
Source File: MRHelpers.java    From incubator-tez with Apache License 2.0 5 votes vote down vote up
/**
 * Generates old-API (mapred) input splits for the job, sorted with
 * {@code OldInputSplitComparator}.
 *
 * <p>When {@code inputFormatName} is null or empty, splits come straight from
 * the input format configured on {@code jobConf}. Otherwise the configured
 * format must be exactly {@code TezGroupedSplitsInputFormat}; the named class
 * is instantiated reflectively and wrapped in a new grouped format whose
 * desired split count is {@code numTasks}.
 *
 * @param jobConf job configuration to read the input format from
 * @param inputFormatName class name of the format to wrap, or null/empty
 * @param numTasks desired number of splits for the grouped format
 * @return the sorted splits
 * @throws IOException propagated from {@code getSplits}
 * @throws TezUncheckedException if the configured format is not the grouped
 *         format, or the named class cannot be found
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
@Private
public static org.apache.hadoop.mapred.InputSplit[] generateOldSplits(
    JobConf jobConf, String inputFormatName, int numTasks) throws IOException {
  final org.apache.hadoop.mapred.InputFormat configuredFormat = jobConf.getInputFormat();
  final org.apache.hadoop.mapred.InputFormat splitSource;

  if (inputFormatName == null || inputFormatName.isEmpty()) {
    splitSource = configuredFormat;
  } else {
    final Class<?> expectedClass =
        org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat.class;
    final Class<?> actualClass = configuredFormat.getClass();
    if (!actualClass.equals(expectedClass)) {
      throw new TezUncheckedException(
          "Expected " + expectedClass.getName()
          + " in conf but got: " + actualClass.getName());
    }

    final org.apache.hadoop.mapred.InputFormat wrappedFormat;
    try {
      wrappedFormat = (org.apache.hadoop.mapred.InputFormat)
          ReflectionUtils.newInstance(Class.forName(inputFormatName), jobConf);
    } catch (ClassNotFoundException e) {
      throw new TezUncheckedException(e);
    }

    final org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat grouped =
        new org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat();
    grouped.setConf(jobConf);
    grouped.setInputFormat(wrappedFormat);
    grouped.setDesiredNumberOfSplits(numTasks);
    splitSource = grouped;
  }

  final org.apache.hadoop.mapred.InputSplit[] result =
      splitSource.getSplits(jobConf, jobConf.getNumMapTasks());
  // sort the splits into order based on size, so that the biggest
  // go first
  Arrays.sort(result, new OldInputSplitComparator());
  return result;
}
 
Example 6
Source File: MRInputHelpers.java    From tez with Apache License 2.0 5 votes vote down vote up
/**
 * Generates old-API (mapred) input splits for the job.
 *
 * <p>The input format is taken from {@code jobConf}; any exception raised
 * while obtaining it is rethrown wrapped in {@code TezUncheckedException}.
 * With {@code groupSplits} set, that format is wrapped in a
 * {@code TezGroupedSplitsInputFormat} whose desired split count is
 * {@code numTasks}. With {@code sortSplits} set, the resulting array is
 * sorted using {@code OldInputSplitComparator}.
 *
 * @param jobConf job configuration to read the input format from
 * @param groupSplits whether to wrap the format in the grouped format
 * @param sortSplits whether to sort the splits before returning
 * @param numTasks desired number of splits when grouping
 * @return the generated splits
 * @throws IOException propagated from {@code getSplits}
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
private static org.apache.hadoop.mapred.InputSplit[] generateOldSplits(
    JobConf jobConf, boolean groupSplits, boolean sortSplits, int numTasks)
    throws IOException {

  // This is the real InputFormat
  final org.apache.hadoop.mapred.InputFormat realFormat;
  try {
    realFormat = jobConf.getInputFormat();
  } catch (Exception e) {
    throw new TezUncheckedException(e);
  }

  final org.apache.hadoop.mapred.InputFormat splitSource;
  if (!groupSplits) {
    splitSource = realFormat;
  } else {
    final org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat grouped =
        new org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat();
    grouped.setConf(jobConf);
    grouped.setInputFormat(realFormat);
    grouped.setDesiredNumberOfSplits(numTasks);
    splitSource = grouped;
  }

  final org.apache.hadoop.mapred.InputSplit[] result =
      splitSource.getSplits(jobConf, jobConf.getNumMapTasks());
  if (!sortSplits) {
    return result;
  }
  // sort the splits into order based on size, so that the biggest
  // go first
  Arrays.sort(result, new OldInputSplitComparator());
  return result;
}
 
Example 7
Source File: InputSampler.java    From hadoop-gpu with Apache License 2.0 5 votes vote down vote up
/**
 * Write a partition file for the given job, using the Sampler provided.
 * Queries the sampler for a sample keyset, sorts by the output key
 * comparator, selects the keys for each rank, and writes to the destination
 * returned from {@link
   org.apache.hadoop.mapred.lib.TotalOrderPartitioner#getPartitionFile}.
 * An existing file at the destination is deleted before writing. The
 * writer is closed even if selecting or appending a key fails, so the
 * open output is not leaked on error.
 *
 * @param job job configuration supplying the input format, the number of
 *            reduce tasks, the output key comparator and the destination
 * @param sampler sampler queried for the sample keyset
 * @throws IOException propagated from the sampler, file system or writer
 */
@SuppressWarnings("unchecked") // getInputFormat, getOutputKeyComparator
public static <K,V> void writePartitionFile(JobConf job,
    Sampler<K,V> sampler) throws IOException {
  final InputFormat<K,V> inf = (InputFormat<K,V>) job.getInputFormat();
  int numPartitions = job.getNumReduceTasks();
  K[] samples = sampler.getSample(inf, job);
  LOG.info("Using " + samples.length + " samples");
  RawComparator<K> comparator =
    (RawComparator<K>) job.getOutputKeyComparator();
  Arrays.sort(samples, comparator);
  Path dst = new Path(TotalOrderPartitioner.getPartitionFile(job));
  FileSystem fs = dst.getFileSystem(job);
  if (fs.exists(dst)) {
    fs.delete(dst, false);
  }
  SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, dst,
      job.getMapOutputKeyClass(), NullWritable.class);
  try {
    NullWritable nullValue = NullWritable.get();
    float stepSize = samples.length / (float) numPartitions;
    int last = -1;
    // One split key per partition boundary: numPartitions - 1 keys in total.
    for(int i = 1; i < numPartitions; ++i) {
      int k = Math.round(stepSize * i);
      // Advance while the candidate compares equal to the key written last,
      // so the same key is not emitted for two boundaries.
      while (last >= k && comparator.compare(samples[last], samples[k]) == 0) {
        ++k;
      }
      writer.append(samples[k], nullValue);
      last = k;
    }
  } finally {
    // Release the writer on both the success and the failure path.
    writer.close();
  }
}
 
Example 8
Source File: InputFormatExternalConstructor.java    From reef with Apache License 2.0 4 votes vote down vote up
/**
 * Stores the injected job configuration and the input format obtained from
 * it via {@link JobConf#getInputFormat()}.
 *
 * @param jobConf job configuration to keep and to read the input format from
 */
@Inject
public InputFormatExternalConstructor(final JobConf jobConf) {
  this.jobConf = jobConf;
  this.inputFormat = this.jobConf.getInputFormat();
}