org.apache.hadoop.mapred.JobConf#getInputFormat

Source File: HiveMetadataUtils.java From dremio-oss with Apache License 2.0

5 votes

/**
 * Obtains the input format instance for the given table.
 *
 * <p>A fresh {@link JobConf} is created from {@code hiveConf}, the class returned by
 * {@code getInputFormatClass} is registered on it, and the instance is then read back through
 * {@link JobConf#getInputFormat()}. All of this happens while a
 * {@code ContextClassLoaderSwapper} is open; it is closed when the method exits
 * (NOTE(review): presumably this restores the thread's context class loader -- confirm).
 *
 * @param table table whose input format is requested
 * @param hiveConf configuration used to seed the job configuration
 * @return the input format produced by the job configuration
 */
public static InputFormat<?, ?> getInputFormat(Table table, final HiveConf hiveConf) {
  try (final ContextClassLoaderSwapper swapper = ContextClassLoaderSwapper.newInstance()) {
    final JobConf jobConf = new JobConf(hiveConf);
    // No partition is supplied here, hence the null third argument.
    final Class<? extends InputFormat> formatClass = getInputFormatClass(jobConf, table, null);
    jobConf.setInputFormat(formatClass);
    final InputFormat<?, ?> format = jobConf.getInputFormat();
    return format;
  }
}

Source File: HiveMetadataUtils.java From dremio-oss with Apache License 2.0

5 votes

/**
 * Resolves and instantiates the input format associated with a table.
 *
 * <p>The input format class comes from {@code getInputFormatClass}; it is set on a new
 * {@link JobConf} derived from {@code hiveConf}, and the resulting instance is fetched with
 * {@link JobConf#getInputFormat()}. The whole sequence runs inside a try-with-resources
 * scope holding a {@code ContextClassLoaderSwapper}
 * (NOTE(review): presumably swaps the context class loader for the duration -- confirm).
 *
 * @param table table whose input format is requested
 * @param hiveConf configuration the job configuration is built from
 * @return the input format produced by the job configuration
 */
public static InputFormat<?, ?> getInputFormat(Table table, final HiveConf hiveConf) {
  try (final ContextClassLoaderSwapper loaderScope = ContextClassLoaderSwapper.newInstance()) {
    final JobConf conf = new JobConf(hiveConf);
    // The third argument is passed as null, exactly as in the lookup-by-table-only case.
    final Class<? extends InputFormat> resolvedClass = getInputFormatClass(conf, table, null);
    conf.setInputFormat(resolvedClass);
    final InputFormat<?, ?> instance = conf.getInputFormat();
    return instance;
  }
}

Source File: AbstractEvaluatorToPartitionStrategy.java From reef with Apache License 2.0

5 votes

/**
 * Builds the strategy by computing the input splits of every serialized data partition.
 *
 * <p>For each entry of {@code serializedDataPartitions} the partition is deserialized, a
 * {@link JobConf} is created for its path and the given input format class name, and the
 * input format is asked for the partition's desired number of splits. The per-partition
 * splits are then handed to {@code init}, and the running total is kept in
 * {@code totalNumberOfSplits}.
 *
 * @param inputFormatClassName name of the input format class; must not be empty
 * @param serializedDataPartitions serialized partitions; must not be empty
 * @throws RuntimeException wrapping the {@link IOException} if the splits cannot be computed
 */
@SuppressWarnings("rawtypes")
AbstractEvaluatorToPartitionStrategy(
    final String inputFormatClassName, final Set<String> serializedDataPartitions) {
  LOG.fine("AbstractEvaluatorToPartitionStrategy injected");
  Validate.notEmpty(inputFormatClassName);
  Validate.notEmpty(serializedDataPartitions);

  locationToSplits = new ConcurrentHashMap<>();
  evaluatorToSplits = new ConcurrentHashMap<>();
  unallocatedSplits = new LinkedBlockingQueue<>();
  setUp();

  final Map<DistributedDataSetPartition, InputSplit[]> partitionSplits = new HashMap<>();
  for (final String encodedPartition : serializedDataPartitions) {
    final DistributedDataSetPartition partition =
        DistributedDataSetPartitionSerializer.deserialize(encodedPartition);
    final ExternalConstructor<JobConf> confConstructor =
        new JobConfExternalConstructor(inputFormatClassName, partition.getPath());

    final InputSplit[] splits;
    try {
      final JobConf conf = confConstructor.newInstance();
      final InputFormat format = conf.getInputFormat();
      splits = format.getSplits(conf, partition.getDesiredSplits());
    } catch (final IOException e) {
      throw new RuntimeException("Unable to get InputSplits using the specified InputFormat", e);
    }

    // Guarded so the array is only rendered to a string when FINEST logging is on.
    if (LOG.isLoggable(Level.FINEST)) {
      final Object[] logArgs = {partition, Arrays.toString(splits)};
      LOG.log(Level.FINEST, "Splits for partition: {0} {1}", logArgs);
    }
    this.totalNumberOfSplits += splits.length;
    partitionSplits.put(partition, splits);
  }

  init(partitionSplits);
  LOG.log(Level.FINE, "Total Number of splits: {0}", this.totalNumberOfSplits);
}

Source File: InputSampler.java From RDFS with Apache License 2.0

5 votes

/**
 * Write a partition file for the given job, using the Sampler provided.
 * Queries the sampler for a sample keyset, sorts by the output key
 * comparator, selects the keys for each rank, and writes to the destination
 * returned from {@link
   org.apache.hadoop.mapred.lib.TotalOrderPartitioner#getPartitionFile}.
 * An existing file at the destination is deleted first. The writer is always
 * closed, including when appending a key fails.
 *
 * @param job job configuration supplying the input format, the number of
 *            reduce tasks, the output key comparator and the destination
 * @param sampler sampler queried for the sample keyset
 * @throws IOException if sampling or any file system operation fails
 */
@SuppressWarnings("unchecked") // getInputFormat, getOutputKeyComparator
public static <K,V> void writePartitionFile(JobConf job,
    Sampler<K,V> sampler) throws IOException {
  final InputFormat<K,V> inf = (InputFormat<K,V>) job.getInputFormat();
  int numPartitions = job.getNumReduceTasks();
  K[] samples = sampler.getSample(inf, job);
  LOG.info("Using " + samples.length + " samples");
  RawComparator<K> comparator =
    (RawComparator<K>) job.getOutputKeyComparator();
  Arrays.sort(samples, comparator);
  Path dst = new Path(TotalOrderPartitioner.getPartitionFile(job));
  FileSystem fs = dst.getFileSystem(job);
  if (fs.exists(dst)) {
    fs.delete(dst, false);
  }
  SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, dst,
      job.getMapOutputKeyClass(), NullWritable.class);
  try {
    NullWritable nullValue = NullWritable.get();
    float stepSize = samples.length / (float) numPartitions;
    int last = -1;
    // numPartitions - 1 split points are written, one per partition boundary.
    for(int i = 1; i < numPartitions; ++i) {
      int k = Math.round(stepSize * i);
      // Step past samples that compare equal to the previously written key
      // so the same key is not emitted twice in a row.
      while (last >= k && comparator.compare(samples[last], samples[k]) == 0) {
        ++k;
      }
      writer.append(samples[k], nullValue);
      last = k;
    }
  } finally {
    // Release the underlying stream even if a write above threw.
    writer.close();
  }
}

Source File: MRHelpers.java From incubator-tez with Apache License 2.0

5 votes

/**
 * Generates splits with the old (mapred) API and returns them sorted with
 * {@code OldInputSplitComparator}.
 *
 * <p>When {@code inputFormatName} is null or empty, the input format configured on
 * {@code jobConf} computes the splits directly. Otherwise the configured format must be
 * {@code TezGroupedSplitsInputFormat}; the named class is instantiated reflectively and
 * wrapped in a new grouped format whose desired number of splits is {@code numTasks}.
 *
 * @param jobConf job configuration providing the input format and the number of map tasks
 * @param inputFormatName class name of the real input format to wrap, or null/empty
 * @param numTasks desired number of splits for the grouped format
 * @return the generated splits, sorted
 * @throws IOException if the input format fails to compute splits
 * @throws TezUncheckedException if the configured format is not the grouped format while a
 *         name was given, or if the named class cannot be found
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
@Private
public static org.apache.hadoop.mapred.InputSplit[] generateOldSplits(
    JobConf jobConf, String inputFormatName, int numTasks) throws IOException {
  final org.apache.hadoop.mapred.InputFormat configuredFormat = jobConf.getInputFormat();
  final org.apache.hadoop.mapred.InputFormat splitSource;

  if (inputFormatName == null || inputFormatName.isEmpty()) {
    splitSource = configuredFormat;
  } else {
    final Class<?> groupedClass =
        org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat.class;
    if (!configuredFormat.getClass().equals(groupedClass)) {
      throw new TezUncheckedException(
          "Expected " + groupedClass.getName()
          + " in conf but got: " + configuredFormat.getClass().getName());
    }

    final org.apache.hadoop.mapred.InputFormat realFormat;
    try {
      realFormat = (org.apache.hadoop.mapred.InputFormat)
          ReflectionUtils.newInstance(Class.forName(inputFormatName), jobConf);
    } catch (ClassNotFoundException e) {
      throw new TezUncheckedException(e);
    }

    final org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat wrapper =
        new org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat();
    wrapper.setConf(jobConf);
    wrapper.setInputFormat(realFormat);
    wrapper.setDesiredNumberOfSplits(numTasks);
    splitSource = wrapper;
  }

  final org.apache.hadoop.mapred.InputSplit[] result =
      splitSource.getSplits(jobConf, jobConf.getNumMapTasks());
  // sort the splits into order based on size, so that the biggest
  // go first
  Arrays.sort(result, new OldInputSplitComparator());
  return result;
}

Source File: MRInputHelpers.java From tez with Apache License 2.0

5 votes

/**
 * Generates splits with the old (mapred) API.
 *
 * <p>The input format configured on {@code jobConf} is used directly, or wrapped in a
 * {@code TezGroupedSplitsInputFormat} with {@code numTasks} as the desired number of splits
 * when {@code groupSplits} is set. The splits are sorted with
 * {@code OldInputSplitComparator} only when {@code sortSplits} is set.
 *
 * @param jobConf job configuration providing the input format and the number of map tasks
 * @param groupSplits whether to wrap the input format in the grouped format
 * @param sortSplits whether to sort the resulting splits
 * @param numTasks desired number of splits for the grouped format
 * @return the generated splits
 * @throws IOException if the input format fails to compute splits
 * @throws TezUncheckedException if the input format cannot be obtained from {@code jobConf}
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
private static org.apache.hadoop.mapred.InputSplit[] generateOldSplits(
    JobConf jobConf, boolean groupSplits, boolean sortSplits, int numTasks)
    throws IOException {

  // This is the real InputFormat
  final org.apache.hadoop.mapred.InputFormat realFormat;
  try {
    realFormat = jobConf.getInputFormat();
  } catch (Exception e) {
    throw new TezUncheckedException(e);
  }

  final org.apache.hadoop.mapred.InputFormat splitSource;
  if (!groupSplits) {
    splitSource = realFormat;
  } else {
    final org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat wrapper =
        new org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat();
    wrapper.setConf(jobConf);
    wrapper.setInputFormat(realFormat);
    wrapper.setDesiredNumberOfSplits(numTasks);
    splitSource = wrapper;
  }

  final org.apache.hadoop.mapred.InputSplit[] result =
      splitSource.getSplits(jobConf, jobConf.getNumMapTasks());
  if (sortSplits) {
    // sort the splits into order based on size, so that the biggest
    // go first
    Arrays.sort(result, new OldInputSplitComparator());
  }
  return result;
}

Source File: InputSampler.java From hadoop-gpu with Apache License 2.0

5 votes

/**
 * Write a partition file for the given job, using the Sampler provided.
 * Queries the sampler for a sample keyset, sorts by the output key
 * comparator, selects the keys for each rank, and writes to the destination
 * returned from {@link
   org.apache.hadoop.mapred.lib.TotalOrderPartitioner#getPartitionFile}.
 * Any file already present at the destination is removed beforehand, and the
 * writer is closed on every exit path, including failures while appending.
 *
 * @param job job configuration supplying the input format, the number of
 *            reduce tasks, the output key comparator and the destination
 * @param sampler sampler queried for the sample keyset
 * @throws IOException if sampling or any file system operation fails
 */
@SuppressWarnings("unchecked") // getInputFormat, getOutputKeyComparator
public static <K,V> void writePartitionFile(JobConf job,
    Sampler<K,V> sampler) throws IOException {
  final InputFormat<K,V> inf = (InputFormat<K,V>) job.getInputFormat();
  int numPartitions = job.getNumReduceTasks();
  K[] samples = sampler.getSample(inf, job);
  LOG.info("Using " + samples.length + " samples");
  RawComparator<K> comparator =
    (RawComparator<K>) job.getOutputKeyComparator();
  Arrays.sort(samples, comparator);
  Path dst = new Path(TotalOrderPartitioner.getPartitionFile(job));
  FileSystem fs = dst.getFileSystem(job);
  if (fs.exists(dst)) {
    fs.delete(dst, false);
  }
  SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, dst,
      job.getMapOutputKeyClass(), NullWritable.class);
  try {
    NullWritable nullValue = NullWritable.get();
    float stepSize = samples.length / (float) numPartitions;
    int last = -1;
    // One key is appended for each boundary between adjacent partitions.
    for(int i = 1; i < numPartitions; ++i) {
      int k = Math.round(stepSize * i);
      // Advance beyond samples equal to the key written last time so that
      // consecutive split points differ.
      while (last >= k && comparator.compare(samples[last], samples[k]) == 0) {
        ++k;
      }
      writer.append(samples[k], nullValue);
      last = k;
    }
  } finally {
    // Close unconditionally so a failed append does not leak the writer.
    writer.close();
  }
}

Source File: InputFormatExternalConstructor.java From reef with Apache License 2.0

4 votes

/**
 * Stores the injected job configuration and caches the input format obtained from it.
 *
 * @param jobConf job configuration from which the input format is read
 */
@Inject
public InputFormatExternalConstructor(final JobConf jobConf) {
  this.jobConf = jobConf;
  // Read once here so the input format is available as a field afterwards.
  this.inputFormat = jobConf.getInputFormat();
}

Java Code Examples for org.apache.hadoop.mapred.JobConf#getInputFormat()