org.apache.hadoop.mapred.InputFormat Java Examples

The following examples show how to use org.apache.hadoop.mapred.InputFormat. Each example is taken from an open source project; the source file, project, and license are noted above each listing.
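Before the project examples, here is a minimal, self-contained sketch of the core contract: getSplits() partitions the input into InputSplits, and getRecordReader() iterates over the records of one split. The class name InputFormatContractDemo and the input path /tmp/input are hypothetical, not taken from any project below.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class InputFormatContractDemo {
  public static void main(String[] args) throws IOException {
    JobConf job = new JobConf();
    FileInputFormat.setInputPaths(job, new Path("/tmp/input")); // hypothetical input path
    TextInputFormat format = new TextInputFormat();
    format.configure(job); // TextInputFormat is JobConfigurable

    // The numSplits argument is only a hint; the format may return more or fewer splits.
    InputSplit[] splits = format.getSplits(job, 1);
    for (InputSplit split : splits) {
      RecordReader<LongWritable, Text> reader =
          format.getRecordReader(split, job, Reporter.NULL);
      LongWritable key = reader.createKey();
      Text value = reader.createValue();
      try {
        while (reader.next(key, value)) {
          System.out.println(key + "\t" + value); // key is the byte offset of the line
        }
      } finally {
        reader.close();
      }
    }
  }
}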
Example #1
Source File: InputSampler.java    From big-c with Apache License 2.0
/**
 * For each split sampled, emit when the ratio of the number of records
 * retained to the total record count is less than the specified
 * frequency.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>();
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);
  int splitStep = splits.length / splitsToSample;
  long records = 0;
  long kept = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
        job, Reporter.NULL);
    K key = reader.createKey();
    V value = reader.createValue();
    while (reader.next(key, value)) {
      ++records;
      if ((double) kept / records < freq) {
        ++kept;
        samples.add(key);
        key = reader.createKey();
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
 
Example #2
Source File: TestMultipleInputs.java    From RDFS with Apache License 2.0
public void testAddInputPathWithMapper() {
  final JobConf conf = new JobConf();
  MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class,
     MapClass.class);
  MultipleInputs.addInputPath(conf, new Path("/bar"),
     KeyValueTextInputFormat.class, MapClass2.class);
  final Map<Path, InputFormat> inputs = MultipleInputs
     .getInputFormatMap(conf);
  final Map<Path, Class<? extends Mapper>> maps = MultipleInputs
     .getMapperTypeMap(conf);

  assertEquals(TextInputFormat.class, inputs.get(new Path("/foo")).getClass());
  assertEquals(KeyValueTextInputFormat.class, inputs.get(new Path("/bar"))
     .getClass());
  assertEquals(MapClass.class, maps.get(new Path("/foo")));
  assertEquals(MapClass2.class, maps.get(new Path("/bar")));
}
 
Example #3
Source File: InputSampler.java    From big-c with Apache License 2.0
/**
 * From each split sampled, take the first numSamples / numSplits records.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);
  int splitStep = splits.length / splitsToSample;
  int samplesPerSplit = numSamples / splitsToSample;
  long records = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
        job, Reporter.NULL);
    K key = reader.createKey();
    V value = reader.createValue();
    while (reader.next(key, value)) {
      samples.add(key);
      key = reader.createKey();
      ++records;
      if ((i+1) * samplesPerSplit <= records) {
        break;
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
 
Example #4
Source File: InternalHiveSplitFactory.java    From presto with Apache License 2.0
public InternalHiveSplitFactory(
        FileSystem fileSystem,
        String partitionName,
        InputFormat<?, ?> inputFormat,
        Properties schema,
        List<HivePartitionKey> partitionKeys,
        TupleDomain<HiveColumnHandle> effectivePredicate,
        BooleanSupplier partitionMatchSupplier,
        TableToPartitionMapping tableToPartitionMapping,
        Optional<BucketConversion> bucketConversion,
        boolean forceLocalScheduling,
        boolean s3SelectPushdownEnabled)
{
    this.fileSystem = requireNonNull(fileSystem, "fileSystem is null");
    this.partitionName = requireNonNull(partitionName, "partitionName is null");
    this.inputFormat = requireNonNull(inputFormat, "inputFormat is null");
    this.schema = requireNonNull(schema, "schema is null");
    this.partitionKeys = requireNonNull(partitionKeys, "partitionKeys is null");
    pathDomain = getPathDomain(requireNonNull(effectivePredicate, "effectivePredicate is null"));
    this.partitionMatchSupplier = requireNonNull(partitionMatchSupplier, "partitionMatchSupplier is null");
    this.tableToPartitionMapping = requireNonNull(tableToPartitionMapping, "tableToPartitionMapping is null");
    this.bucketConversion = requireNonNull(bucketConversion, "bucketConversion is null");
    this.forceLocalScheduling = forceLocalScheduling;
    this.s3SelectPushdownEnabled = s3SelectPushdownEnabled;
}
 
Example #5
Source File: InputSampler.java    From hadoop-gpu with Apache License 2.0
/**
 * From each split sampled, take the first numSamples / numSplits records.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);
  int splitStep = splits.length / splitsToSample;
  int samplesPerSplit = numSamples / splitsToSample;
  long records = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
        job, Reporter.NULL);
    K key = reader.createKey();
    V value = reader.createValue();
    while (reader.next(key, value)) {
      samples.add(key);
      key = reader.createKey();
      ++records;
      if ((i+1) * samplesPerSplit <= records) {
        break;
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
 
Example #6
Source File: HdfsSerDeImportServiceTest.java    From hadoop-etl-udfs with MIT License
private void runImportRCFile(ExaIterator ctx, List<HCatTableColumn> columns, List<HCatTableColumn> partitionColumns, List<OutputColumnSpec> outputColumns, String file) throws Exception {
    List<HCatSerDeParameter> serDeParameters = new ArrayList<>();
    serDeParameters.add(new HCatSerDeParameter("serialization.format", "1"));
    
    String inputFormatClassName = "org.apache.hadoop.hive.ql.io.RCFileInputFormat";
    String serDeClassName = "org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe";
    String hdfsUser = "hdfs";
    boolean useKerberos = false;
    
    List<String> hdfsServers = new ArrayList<>();
    hdfsServers.add("file:///");
    final Configuration conf = new Configuration();
    FileSystem fs = HdfsService.getFileSystem(hdfsServers, conf);
    
    InputFormat<?, ?> inputFormat = (InputFormat<?, ?>) UdfUtils.getInstanceByName(inputFormatClassName);
    AbstractSerDe serDe = (AbstractSerDe) UdfUtils.getInstanceByName(serDeClassName);
    HdfsSerDeImportService.importFile(fs, file, partitionColumns, inputFormat, serDe, serDeParameters, hdfsServers, hdfsUser, columns, outputColumns, useKerberos, false, ctx);
}
 
Example #7
Source File: HiveMetadataUtils.java    From dremio-oss with Apache License 2.0
private static List<InputSplit> getInputSplits(final InputFormat<?, ?> format, final JobConf job) {
  InputSplit[] inputSplits;
  try {
    // Parquet logic in hive-3.1.1 does not check recursively by default.
    job.set(FileInputFormat.INPUT_DIR_RECURSIVE, "true");
    inputSplits = format.getSplits(job, 1);
  } catch (IOException e) {
    throw new RuntimeException(e);
  }

  if (null == inputSplits) {
    return Collections.emptyList();
  } else {
    return Arrays.asList(inputSplits);
  }
}
 
Example #8
Source File: HiveMetadataUtils.java    From dremio-oss with Apache License 2.0
/**
 * When impersonation is not possible and when last modified times are not available,
 * {@link HiveReaderProto.FileSystemPartitionUpdateKey} should not be generated.
 *
 * @param hiveStorageCapabilities The capabilities of the storage mechanism.
 * @param format                  The file input format.
 * @return true if FSUpdateKeys should be generated. False if not.
 */
public static boolean shouldGenerateFileSystemUpdateKeys(final HiveStorageCapabilities hiveStorageCapabilities,
                                                         final InputFormat<?, ?> format) {

  if (!hiveStorageCapabilities.supportsImpersonation() && !hiveStorageCapabilities.supportsLastModifiedTime()) {
    return false;
  }

  // Files in a filesystem have last modified times and filesystem permissions. Generate
  // FileSystemPartitionUpdateKeys for formats representing files. Subclasses of FileInputFormat
  // as well as OrcInputFormat represent files.
  if ((format instanceof FileInputFormat) || (format instanceof OrcInputFormat)) {
    return true;
  }

  return false;
}
 
Example #9
Source File: HiveUtils.java    From incubator-gobblin with Apache License 2.0
/**
 * Get paths from a Hive location using the provided input format.
 */
public static Set<Path> getPaths(InputFormat<?, ?> inputFormat, Path location) throws IOException {
  JobConf jobConf = new JobConf(getHadoopConfiguration());

  Set<Path> paths = Sets.newHashSet();

  FileInputFormat.addInputPaths(jobConf, location.toString());
  InputSplit[] splits = inputFormat.getSplits(jobConf, 1000);
  for (InputSplit split : splits) {
    if (!(split instanceof FileSplit)) {
      throw new IOException("Not a file split. Found " + split.getClass().getName());
    }
    FileSplit fileSplit = (FileSplit) split;
    paths.add(fileSplit.getPath());
  }

  return paths;
}
 
Example #10
Source File: HiveUtilities.java    From dremio-oss with Apache License 2.0
/**
 * Get the {@link InputFormat} class for the given table and partition definitions. The class name is taken
 * from inputFormat when explicitly specified; otherwise it is resolved via the storage handler named by
 * storageHandlerName.
 * @param jobConf the job configuration
 * @param inputFormat explicitly specified InputFormat class name, if any
 * @param storageHandlerName storage handler class name, if any
 * @return the InputFormat class
 * @throws Exception
 */
public static final Class<? extends InputFormat<?, ?>> getInputFormatClass(final JobConf jobConf, Optional<String> inputFormat,
  Optional<String> storageHandlerName) throws Exception {
  if (inputFormat.isPresent()) {
    return (Class<? extends InputFormat<?, ?>>) Class.forName(inputFormat.get());
  }

  if (storageHandlerName.isPresent()) {
    try (final ContextClassLoaderSwapper swapper = ContextClassLoaderSwapper.newInstance()) {
      // HiveUtils.getStorageHandler() depends on the current context classloader when you query an HBase table
      // and don't have an HBase session open.
      final HiveStorageHandler storageHandler = HiveUtils.getStorageHandler(jobConf, storageHandlerName.get());
      return (Class<? extends InputFormat<?, ?>>) storageHandler.getInputFormatClass();
    }
  }

  throw new ExecutionSetupException("Unable to get Hive table InputFormat class. There is neither " +
    "InputFormat class explicitly specified nor a StorageHandler class provided.");
}
 
Example #11
Source File: SequenceFileRecordReader.java    From Bats with Apache License 2.0
private org.apache.hadoop.mapred.RecordReader<BytesWritable, BytesWritable> getRecordReader(
  final InputFormat<BytesWritable, BytesWritable> inputFormat,
  final JobConf jobConf) throws ExecutionSetupException {
  try {
    final UserGroupInformation ugi = ImpersonationUtil.createProxyUgi(this.opUserName, this.queryUserName);
    return ugi.doAs(new PrivilegedExceptionAction<org.apache.hadoop.mapred.RecordReader<BytesWritable, BytesWritable>>() {
      @Override
      public org.apache.hadoop.mapred.RecordReader<BytesWritable, BytesWritable> run() throws Exception {
        return inputFormat.getRecordReader(split, jobConf, Reporter.NULL);
      }
    });
  } catch (IOException | InterruptedException e) {
    throw new ExecutionSetupException(
      String.format("Error in creating sequencefile reader for file: %s, start: %d, length: %d",
        split.getPath(), split.getStart(), split.getLength()), e);
  }
}
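The doAs wrapper above is the standard Hadoop impersonation pattern; ImpersonationUtil is Drill's own helper around the same call. With plain Hadoop APIs the proxy UGI would be built roughly like this (the user name is hypothetical):

UserGroupInformation proxyUgi = UserGroupInformation.createProxyUser(
    "queryUser", UserGroupInformation.getLoginUser()); // "queryUser" is a hypothetical user name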
 
Example #12
Source File: InputSampler.java    From hadoop with Apache License 2.0
/**
 * For each split sampled, emit when the ratio of the number of records
 * retained to the total record count is less than the specified
 * frequency.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>();
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);
  int splitStep = splits.length / splitsToSample;
  long records = 0;
  long kept = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
        job, Reporter.NULL);
    K key = reader.createKey();
    V value = reader.createValue();
    while (reader.next(key, value)) {
      ++records;
      if ((double) kept / records < freq) {
        ++kept;
        samples.add(key);
        key = reader.createKey();
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
 
Example #13
Source File: HiveMetadataUtils.java    From dremio-oss with Apache License 2.0
/**
 * Find the row count based on stats in the Hive metastore, or estimate it from the file size, file type,
 * record size and split size.
 *
 * @param statsParams         parameters controlling the stats calculations
 * @param statsFromMetastore  stats read from the Hive metastore
 * @param sizeRatio           ratio of this split contributing to all stats in the given <i>statsFromMetastore</i>
 * @param splitSizeInBytes    size of this split in bytes
 * @param format              input format of the split
 * @param estimatedRecordSize estimated size of a single record in bytes
 * @return the metastore-based row count when metastore stats are used, otherwise the maximum of the
 *         size-based estimate and the metastore count
 */
public static long findRowCountInSplit(StatsEstimationParameters statsParams, HiveDatasetStats statsFromMetastore,
                                       final double sizeRatio, final long splitSizeInBytes, InputFormat<?, ?> format,
                                       final int estimatedRecordSize) {

  final Class<? extends InputFormat> inputFormat =
    format == null ? null : ((Class<? extends InputFormat>) format.getClass());

  double compressionFactor = 1.0;
  if (MapredParquetInputFormat.class.equals(inputFormat)) {
    compressionFactor = 30;
  } else if (OrcInputFormat.class.equals(inputFormat)) {
    compressionFactor = 30f;
  } else if (AvroContainerInputFormat.class.equals(inputFormat)) {
    compressionFactor = 10f;
  } else if (RCFileInputFormat.class.equals(inputFormat)) {
    compressionFactor = 10f;
  }

  final long estimatedRowCount = (long) Math.ceil(splitSizeInBytes * compressionFactor / estimatedRecordSize);

  // Metastore stats are for complete partition. Multiply it by the size ratio of this split
  final long metastoreRowCount = (long) Math.ceil(sizeRatio * statsFromMetastore.getRecordCount());

  logger.trace("Hive stats estimation: compression factor '{}', recordSize '{}', estimated '{}', from metastore '{}'",
    compressionFactor, estimatedRecordSize, estimatedRowCount, metastoreRowCount);

  if (statsParams.useMetastoreStats() && statsFromMetastore.hasContent()) {
    return metastoreRowCount;
  }

  // return the maximum of estimate and metastore count
  return Math.max(estimatedRowCount, metastoreRowCount);
}
 
Example #14
Source File: HiveMetadataUtils.java    From dremio-oss with Apache License 2.0
private static List<InputSplit> getInputSplits(final InputFormat<?, ?> format, final JobConf job) {
  InputSplit[] inputSplits;
  try {
    inputSplits = format.getSplits(job, 1);
  } catch (IOException e) {
    throw new RuntimeException(e);
  }

  if (null == inputSplits) {
    return Collections.emptyList();
  } else {
    return Arrays.asList(inputSplits);
  }
}
 
Example #15
Source File: MultipleInputs.java    From hadoop-gpu with Apache License 2.0
/**
 * Add a {@link Path} with a custom {@link InputFormat} and
 * {@link Mapper} to the list of inputs for the map-reduce job.
 * 
 * @param conf The configuration of the job
 * @param path {@link Path} to be added to the list of inputs for the job
 * @param inputFormatClass {@link InputFormat} class to use for this path
 * @param mapperClass {@link Mapper} class to use for this path
 */
public static void addInputPath(JobConf conf, Path path,
    Class<? extends InputFormat> inputFormatClass,
    Class<? extends Mapper> mapperClass) {

  addInputPath(conf, path, inputFormatClass);

  String mapperMapping = path.toString() + ";" + mapperClass.getName();
  String mappers = conf.get("mapred.input.dir.mappers");
  conf.set("mapred.input.dir.mappers", mappers == null ? mapperMapping
     : mappers + "," + mapperMapping);

  conf.setMapperClass(DelegatingMapper.class);
}
 
Example #16
Source File: HiveMetadataUtils.java    From dremio-oss with Apache License 2.0
private static List<ColumnInfo> buildColumnInfo(final Table table, final InputFormat<?, ?> format,
    final boolean includeComplexParquetCols) {
  final List<ColumnInfo> columnInfos = new ArrayList<>();
  for (FieldSchema hiveField : table.getSd().getCols()) {
    final TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(hiveField.getType());
    Field f = HiveSchemaConverter.getArrowFieldFromHiveType(hiveField.getName(), typeInfo, format, includeComplexParquetCols);
    if (f != null) {
      columnInfos.add(getColumnInfo(typeInfo));
    }
  }
  return columnInfos;
}
 
Example #17
Source File: TaggedInputSplit.java    From hadoop-gpu with Apache License 2.0
/**
 * Creates a new TaggedInputSplit.
 * 
 * @param inputSplit The InputSplit to be tagged
 * @param conf The configuration to use
 * @param inputFormatClass The InputFormat class to use for this job
 * @param mapperClass The Mapper class to use for this job
 */
public TaggedInputSplit(InputSplit inputSplit, Configuration conf,
    Class<? extends InputFormat> inputFormatClass,
    Class<? extends Mapper> mapperClass) {
  this.inputSplitClass = inputSplit.getClass();
  this.inputSplit = inputSplit;
  this.conf = conf;
  this.inputFormatClass = inputFormatClass;
  this.mapperClass = mapperClass;
}
 
Example #18
Source File: CompositeInputFormat.java    From hadoop-gpu with Apache License 2.0
/**
 * Convenience method for constructing composite formats.
 * Given operation (op), Object class (inf), set of paths (p) return:
 * {@code <op>(tbl(<inf>,<p1>),tbl(<inf>,<p2>),...,tbl(<inf>,<pn>)) }
 */
public static String compose(String op, Class<? extends InputFormat> inf,
    Path... path) {
  ArrayList<String> tmp = new ArrayList<String>(path.length);
  for (Path p : path) {
    tmp.add(p.toString());
  }
  return compose(op, inf, tmp.toArray(new String[0]));
}
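A hedged usage sketch (the paths are hypothetical): the composed expression is set as the join expression that CompositeInputFormat parses at job submission. The map-side join framework requires that both inputs are sorted and identically partitioned.

JobConf job = new JobConf();
job.setInputFormat(CompositeInputFormat.class);
job.set("mapred.join.expr", CompositeInputFormat.compose(
    "inner", SequenceFileInputFormat.class,
    new Path("/data/left"), new Path("/data/right"))); // hypothetical paths
// Yields an expression of the form:
// inner(tbl(org.apache.hadoop.mapred.SequenceFileInputFormat,"/data/left"),
//       tbl(org.apache.hadoop.mapred.SequenceFileInputFormat,"/data/right"))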
 
Example #19
Source File: HdfsSerDeExportServiceTest.java    From hadoop-etl-udfs with MIT License
private void importFile(ExaIterator ctx, List<HCatTableColumn> columns, List<HCatTableColumn> partitionColumns, String file, String inputFormatName, String serdeName) throws Exception {
    List<HCatSerDeParameter> serDeParameters = new ArrayList<>();
    serDeParameters.add(new HCatSerDeParameter("serialization.format", "1"));
    String hdfsUser = "hdfs";
    boolean useKerberos = false;
    List<String> hdfsServers = new ArrayList<>();
    hdfsServers.add("file:///");
    final Configuration conf = new Configuration();
    FileSystem fs = HdfsService.getFileSystem(hdfsServers, conf);
    InputFormat<?, ?> inputFormat = (InputFormat<?, ?>) UdfUtils.getInstanceByName(inputFormatName);
    AbstractSerDe serDe = (AbstractSerDe) UdfUtils.getInstanceByName(serdeName);
    List<OutputColumnSpec> outputColumns = OutputColumnSpecUtil.generateDefaultOutputSpecification(columns, new ArrayList<HCatTableColumn>());
    HdfsSerDeImportService.importFile(fs, file, partitionColumns, inputFormat, serDe, serDeParameters, hdfsServers, hdfsUser, columns, outputColumns, useKerberos, false, ctx);
}
 
Example #20
Source File: DelegatingInputFormat.java    From RDFS with Apache License 2.0
@SuppressWarnings("unchecked")
public RecordReader<K, V> getRecordReader(InputSplit split, JobConf conf,
    Reporter reporter) throws IOException {

  // Find the InputFormat and then the RecordReader from the
  // TaggedInputSplit.

  TaggedInputSplit taggedInputSplit = (TaggedInputSplit) split;
  InputFormat<K, V> inputFormat = (InputFormat<K, V>) ReflectionUtils
     .newInstance(taggedInputSplit.getInputFormatClass(), conf);
  return inputFormat.getRecordReader(taggedInputSplit.getInputSplit(), conf,
     reporter);
}
 
Example #21
Source File: HiveMetadataUtils.java    From dremio-oss with Apache License 2.0
private static List<ColumnInfo> buildColumnInfo(final Table table, final InputFormat<?, ?> format, final boolean includeComplexParquetCols) {
  final List<ColumnInfo> columnInfos = new ArrayList<>();
  for (FieldSchema hiveField : table.getSd().getCols()) {
    final TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(hiveField.getType());
    Field f = HiveSchemaConverter.getArrowFieldFromHiveType(hiveField.getName(), typeInfo, format, includeComplexParquetCols);
    if (f != null) {
      columnInfos.add(getColumnInfo(typeInfo));
    }
  }
  return columnInfos;
}
 
Example #22
Source File: GemFireXDFragmenter.java    From gemfirexd-oss with Apache License 2.0
private InputSplit[] getSplits() throws IOException {
  InputFormat<Key, Row> inputFormat = this.gfxdManager.getInputFormat();
  try {
    return inputFormat.getSplits(this.jobConf, 1);
  } catch (FileNotFoundException fnfe) {
    throw new FileNotFoundException(
        "Table "
            + this.gfxdManager.getTable()
            + " not found. "
            + "The LOCATION string may contain incorrect value for one or more of the following:"
            + "1. Path to HDFSSTORE (homeDir), 2. Schema name or 3. Table name. "
            + GemFireXDManager.LOCATION_FORMAT);
  }
}
 
Example #23
Source File: DelegatingInputFormat.java    From big-c with Apache License 2.0
@SuppressWarnings("unchecked")
public RecordReader<K, V> getRecordReader(InputSplit split, JobConf conf,
    Reporter reporter) throws IOException {

  // Find the InputFormat and then the RecordReader from the
  // TaggedInputSplit.

  TaggedInputSplit taggedInputSplit = (TaggedInputSplit) split;
  InputFormat<K, V> inputFormat = (InputFormat<K, V>) ReflectionUtils
     .newInstance(taggedInputSplit.getInputFormatClass(), conf);
  return inputFormat.getRecordReader(taggedInputSplit.getInputSplit(), conf,
     reporter);
}
 
Example #24
Source File: HiveUtils.java    From incubator-gobblin with Apache License 2.0
/**
 * @return an instance of the {@link InputFormat} in this {@link StorageDescriptor}.
 */
public static InputFormat<?, ?> getInputFormat(StorageDescriptor sd) throws IOException {
  try {
    InputFormat<?, ?> inputFormat =
        ConstructorUtils.invokeConstructor((Class<? extends InputFormat>) Class.forName(sd.getInputFormat()));
    if (inputFormat instanceof JobConfigurable) {
      ((JobConfigurable) inputFormat).configure(new JobConf(getHadoopConfiguration()));
    }
    return inputFormat;
  } catch (ReflectiveOperationException re) {
    throw new IOException("Failed to instantiate input format.", re);
  }
}
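This helper composes naturally with getPaths() from the incubator-gobblin example earlier in this list; a sketch, assuming a Hive metastore Table object named table is already in scope:

StorageDescriptor sd = table.getSd();             // Table/StorageDescriptor are Hive metastore API classes
InputFormat<?, ?> inputFormat = HiveUtils.getInputFormat(sd);
Set<Path> paths = HiveUtils.getPaths(inputFormat, new Path(sd.getLocation()));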
 
Example #25
Source File: InputSampler.java    From RDFS with Apache License 2.0
/**
 * Write a partition file for the given job, using the Sampler provided.
 * Queries the sampler for a sample keyset, sorts by the output key
 * comparator, selects the keys for each rank, and writes to the destination
 * returned from {@link org.apache.hadoop.mapred.lib.TotalOrderPartitioner#getPartitionFile}.
 */
@SuppressWarnings("unchecked") // getInputFormat, getOutputKeyComparator
public static <K,V> void writePartitionFile(JobConf job,
    Sampler<K,V> sampler) throws IOException {
  final InputFormat<K,V> inf = (InputFormat<K,V>) job.getInputFormat();
  int numPartitions = job.getNumReduceTasks();
  K[] samples = sampler.getSample(inf, job);
  LOG.info("Using " + samples.length + " samples");
  RawComparator<K> comparator =
    (RawComparator<K>) job.getOutputKeyComparator();
  Arrays.sort(samples, comparator);
  Path dst = new Path(TotalOrderPartitioner.getPartitionFile(job));
  FileSystem fs = dst.getFileSystem(job);
  if (fs.exists(dst)) {
    fs.delete(dst, false);
  }
  SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, dst,
      job.getMapOutputKeyClass(), NullWritable.class);
  NullWritable nullValue = NullWritable.get();
  float stepSize = samples.length / (float) numPartitions;
  int last = -1;
  for(int i = 1; i < numPartitions; ++i) {
    int k = Math.round(stepSize * i);
    while (last >= k && comparator.compare(samples[last], samples[k]) == 0) {
      ++k;
    }
    writer.append(samples[k], nullValue);
    last = k;
  }
  writer.close();
}
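A typical caller pairs a sampler with TotalOrderPartitioner; a minimal sketch, assuming the job's input format and a Text map output key are already configured:

JobConf job = new JobConf();
// ... input paths, input format and map output key class (Text) are assumed to be set ...
job.setNumReduceTasks(4);
// Keep each key with probability 0.01, up to 1000 samples from at most 10 splits.
InputSampler.Sampler<Text, Text> sampler =
    new InputSampler.RandomSampler<Text, Text>(0.01, 1000, 10);
InputSampler.writePartitionFile(job, sampler);
job.setPartitionerClass(TotalOrderPartitioner.class);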
 
Example #26
Source File: MultipleInputs.java    From hadoop with Apache License 2.0
/**
 * Add a {@link Path} with a custom {@link InputFormat} and
 * {@link Mapper} to the list of inputs for the map-reduce job.
 * 
 * @param conf The configuration of the job
 * @param path {@link Path} to be added to the list of inputs for the job
 * @param inputFormatClass {@link InputFormat} class to use for this path
 * @param mapperClass {@link Mapper} class to use for this path
 */
public static void addInputPath(JobConf conf, Path path,
    Class<? extends InputFormat> inputFormatClass,
    Class<? extends Mapper> mapperClass) {

  addInputPath(conf, path, inputFormatClass);

  String mapperMapping = path.toString() + ";" + mapperClass.getName();
  String mappers = conf.get("mapreduce.input.multipleinputs.dir.mappers");
  conf.set("mapreduce.input.multipleinputs.dir.mappers", mappers == null ? mapperMapping
     : mappers + "," + mapperMapping);

  conf.setMapperClass(DelegatingMapper.class);
}
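A driver typically calls this once per input; a sketch in which LogMapper and UserMapper are hypothetical mapper classes:

JobConf job = new JobConf();
MultipleInputs.addInputPath(job, new Path("/data/logs"),
    TextInputFormat.class, LogMapper.class);          // LogMapper is hypothetical
MultipleInputs.addInputPath(job, new Path("/data/users"),
    KeyValueTextInputFormat.class, UserMapper.class); // UserMapper is hypothetical
// Each call registers a format/mapper pair and installs the delegating
// classes that dispatch per split at runtime.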
 
Example #27
Source File: MultipleInputs.java    From hadoop with Apache License 2.0
/**
 * Add a {@link Path} with a custom {@link InputFormat} to the list of
 * inputs for the map-reduce job.
 * 
 * @param conf The configuration of the job
 * @param path {@link Path} to be added to the list of inputs for the job
 * @param inputFormatClass {@link InputFormat} class to use for this path
 */
public static void addInputPath(JobConf conf, Path path,
    Class<? extends InputFormat> inputFormatClass) {

  String inputFormatMapping = path.toString() + ";"
     + inputFormatClass.getName();
  String inputFormats = conf.get("mapreduce.input.multipleinputs.dir.formats");
  conf.set("mapreduce.input.multipleinputs.dir.formats",
     inputFormats == null ? inputFormatMapping : inputFormats + ","
         + inputFormatMapping);

  conf.setInputFormat(DelegatingInputFormat.class);
}
 
Example #28
Source File: InputOutputInfo.java    From systemds with Apache License 2.0
public InputOutputInfo(Class<? extends InputFormat> formatClsIn, Class<? extends OutputFormat> formatClsOut,
	Class<? extends Writable> keyCls, Class<? extends Writable> valueCls)
{
	inputFormatClass = formatClsIn;
	outputFormatClass = formatClsOut;
	keyClass = keyCls;
	valueClass = valueCls;
}
 
Example #29
Source File: TaggedInputSplit.java    From RDFS with Apache License 2.0
/**
 * Creates a new TaggedInputSplit.
 * 
 * @param inputSplit The InputSplit to be tagged
 * @param conf The configuration to use
 * @param inputFormatClass The InputFormat class to use for this job
 * @param mapperClass The Mapper class to use for this job
 */
public TaggedInputSplit(InputSplit inputSplit, Configuration conf,
    Class<? extends InputFormat> inputFormatClass,
    Class<? extends Mapper> mapperClass) {
  this.inputSplitClass = inputSplit.getClass();
  this.inputSplit = inputSplit;
  this.conf = conf;
  this.inputFormatClass = inputFormatClass;
  this.mapperClass = mapperClass;
}