org.apache.hadoop.mapreduce.InputFormat Java Examples

The following examples show how to use org.apache.hadoop.mapreduce.InputFormat. Each example is taken from an open-source project; the source file and project are noted above each snippet.
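Before the examples, it helps to recall the contract these snippets exercise: an InputFormat describes how a job's input is split into InputSplits (getSplits) and how each split is read as key/value records (createRecordReader). The skeleton below is a minimal sketch of that contract, not taken from any project listed here; the class name, path, and reuse of LineRecordReader are illustrative placeholders.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

// Minimal sketch of the InputFormat contract; names and split layout are placeholders.
public class SketchInputFormat extends InputFormat<LongWritable, Text> {

  @Override
  public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    // Decide how the input is partitioned; a real format would inspect the job
    // configuration and the underlying storage. Here a single dummy split is returned.
    List<InputSplit> splits = new ArrayList<>();
    splits.add(new FileSplit(new Path("/tmp/example.txt"), 0, 0L, new String[0]));
    return splits;
  }

  @Override
  public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
      throws IOException, InterruptedException {
    // Delegate record reading to an existing reader; LineRecordReader yields
    // (byte offset, line of text) pairs and is initialized later by the framework.
    return new LineRecordReader();
  }
}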
Example #1
Source File: AbstractHadoopJob.java    From kylin-on-parquet-v2 with Apache License 2.0
public static double getTotalMapInputMB(Job job)
        throws ClassNotFoundException, IOException, InterruptedException, JobException {
    if (job == null) {
        throw new JobException("Job is null");
    }

    long mapInputBytes = 0;
    InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
    for (InputSplit split : input.getSplits(job)) {
        mapInputBytes += split.getLength();
    }
    
    // 0 input bytes is possible when the segment range hits no partition on a partitioned hive table (KYLIN-2470) 
    if (mapInputBytes == 0) {
        logger.warn("Map input splits are 0 bytes, something is wrong?");
    }
    
    double totalMapInputMB = (double) mapInputBytes / 1024 / 1024;
    return totalMapInputMB;
}
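One plausible use of such an estimate, shown only for illustration and not taken from Kylin's actual code, is sizing the reducer count from the map input volume:

// Illustrative sketch: derive a reducer count from the estimated map input size.
// The 500 MB-per-reducer figure is an arbitrary example, not a Kylin default.
double totalMapInputMB = AbstractHadoopJob.getTotalMapInputMB(job);
int numReducers = Math.max(1, (int) Math.ceil(totalMapInputMB / 500.0));
job.setNumReduceTasks(numReducers);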
 
Example #2
Source File: MRJobTestUtil.java    From sqoop-on-spark with Apache License 2.0
@SuppressWarnings("deprecation")
public static boolean runJob(Configuration conf,
    Class<? extends InputFormat<?,?>> inputFormatClass,
    Class<? extends Mapper<?,?,?,?>> mapperClass,
    Class<? extends OutputFormat<?,?>> outputFormatClass) throws IOException,
    InterruptedException, ClassNotFoundException {
  Job job = new Job(conf);
  job.setInputFormatClass(inputFormatClass);
  job.setMapperClass(mapperClass);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(NullWritable.class);
  job.setOutputFormatClass(outputFormatClass);
  
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(NullWritable.class);

  boolean ret = job.waitForCompletion(true);

  // Hadoop 1.0 (and 0.20) has a nasty bug where the output committer is not
  // called in LocalJobRunner
  if (isHadoop1()) {
    callOutputCommitter(job, outputFormatClass);
  }

  return ret;
}
 
Example #3
Source File: HadoopElementIterator.java    From tinkerpop with Apache License 2.0
public HadoopElementIterator(final HadoopGraph graph) {
    try {
        this.graph = graph;
        final Configuration configuration = ConfUtil.makeHadoopConfiguration(this.graph.configuration());
        final InputFormat<NullWritable, VertexWritable> inputFormat = ConfUtil.getReaderAsInputFormat(configuration);
        if (inputFormat instanceof FileInputFormat) {
            final Storage storage = FileSystemStorage.open(configuration);
            if (!this.graph.configuration().containsKey(Constants.GREMLIN_HADOOP_INPUT_LOCATION))
                return; // there is no input location and thus, no data (empty graph)
            if (!Constants.getSearchGraphLocation(this.graph.configuration().getInputLocation(), storage).isPresent())
                return; // there is no data at the input location (empty graph)
            configuration.set(Constants.MAPREDUCE_INPUT_FILEINPUTFORMAT_INPUTDIR, Constants.getSearchGraphLocation(this.graph.configuration().getInputLocation(), storage).get());
        }
        final List<InputSplit> splits = inputFormat.getSplits(new JobContextImpl(configuration, new JobID(UUID.randomUUID().toString(), 1)));
        for (final InputSplit split : splits) {
            this.readers.add(inputFormat.createRecordReader(split, new TaskAttemptContextImpl(configuration, new TaskAttemptID())));
        }
    } catch (final Exception e) {
        throw new IllegalStateException(e.getMessage(), e);
    }
}
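Readers obtained this way are not yet initialized: when an InputFormat is driven outside a MapReduce job (the InputSampler examples later on do the same), the caller must invoke RecordReader#initialize before nextKeyValue(). A variant of the constructor loop that does so, shown as a sketch rather than the project's actual code:

// Sketch: initialize each reader right after creating it; inside a MapReduce task
// the framework calls initialize() for us, but in driver-side code we must do it.
for (final InputSplit split : splits) {
    final TaskAttemptContext taskContext =
            new TaskAttemptContextImpl(configuration, new TaskAttemptID());
    final RecordReader<NullWritable, VertexWritable> reader =
            inputFormat.createRecordReader(split, taskContext);
    reader.initialize(split, taskContext);
    this.readers.add(reader);
}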
 
Example #4
Source File: HadoopFormatIOReadTest.java    From beam with Apache License 2.0
/**
 * This test validates behavior of {@link
 * HadoopInputFormatBoundedSource#computeSplitsIfNecessary() computeSplits()} when the Hadoop
 * InputFormat's {@link InputFormat#getSplits(JobContext) getSplits(JobContext)} returns a null
 * value.
 */
@Test
public void testComputeSplitsIfGetSplitsReturnsNullValue() throws Exception {
  InputFormat<Text, Employee> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
  SerializableSplit mockInputSplit = Mockito.mock(SerializableSplit.class);
  Mockito.when(mockInputFormat.getSplits(Mockito.any(JobContext.class))).thenReturn(null);
  HadoopInputFormatBoundedSource<Text, Employee> hifSource =
      new HadoopInputFormatBoundedSource<>(
          serConf,
          WritableCoder.of(Text.class),
          AvroCoder.of(Employee.class),
          null, // No key translation required.
          null, // No value translation required.
          mockInputSplit);
  thrown.expect(IOException.class);
  thrown.expectMessage("Error in computing splits, getSplits() returns null.");
  hifSource.setInputFormatObj(mockInputFormat);
  hifSource.computeSplitsIfNecessary();
}
 
Example #5
Source File: HadoopFormatIOCassandraIT.java    From beam with Apache License 2.0
/**
 * Returns the Hadoop configuration for reading data from Cassandra. To read data from Cassandra
 * using HadoopFormatIO, the following properties must be set: InputFormat class, InputFormat key
 * class, InputFormat value class, Thrift address, Thrift port, partitioner class, keyspace, and
 * column family name.
 */
private static Configuration getConfiguration(HadoopFormatIOTestOptions options) {
  Configuration conf = new Configuration();
  conf.set(CASSANDRA_THRIFT_PORT_PROPERTY, options.getCassandraServerPort().toString());
  conf.set(CASSANDRA_THRIFT_ADDRESS_PROPERTY, options.getCassandraServerIp());
  conf.set(CASSANDRA_PARTITIONER_CLASS_PROPERTY, CASSANDRA_PARTITIONER_CLASS_VALUE);
  conf.set(CASSANDRA_KEYSPACE_PROPERTY, CASSANDRA_KEYSPACE);
  conf.set(CASSANDRA_COLUMNFAMILY_PROPERTY, CASSANDRA_TABLE);
  // Set user name and password if Cassandra instance has security configured.
  conf.set(USERNAME, options.getCassandraUserName());
  conf.set(PASSWORD, options.getCassandraPassword());
  conf.set(INPUT_KEYSPACE_USERNAME_CONFIG, options.getCassandraUserName());
  conf.set(INPUT_KEYSPACE_PASSWD_CONFIG, options.getCassandraPassword());
  conf.setClass(
      "mapreduce.job.inputformat.class",
      org.apache.cassandra.hadoop.cql3.CqlInputFormat.class,
      InputFormat.class);
  conf.setClass("key.class", Long.class, Object.class);
  conf.setClass("value.class", Row.class, Object.class);
  return conf;
}
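A sketch of how such a configuration is then handed to the read transform; the pipeline object is assumed to exist in the surrounding test, and the key/value types mirror the "key.class" and "value.class" entries set above:

// Sketch: feed the Cassandra configuration into HadoopFormatIO.read().
// 'pipeline' and 'options' are assumed to be provided by the surrounding test.
Configuration conf = getConfiguration(options);
PCollection<KV<Long, Row>> rows =
    pipeline.apply(HadoopFormatIO.<Long, Row>read().withConfiguration(conf));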
 
Example #6
Source File: HadoopScanRunner.java    From titan1withtp3.1 with Apache License 2.0
public static ScanMetrics runScanJob(ScanJob scanJob, Configuration conf, String confRootField,
                                 org.apache.hadoop.conf.Configuration hadoopConf,
                                 Class<? extends InputFormat> inputFormat)
        throws IOException, InterruptedException, ClassNotFoundException {

    ModifiableHadoopConfiguration scanConf =
            ModifiableHadoopConfiguration.of(TitanHadoopConfiguration.MAPRED_NS, hadoopConf);

    tryToLoadClassByName(scanJob);

    // Set the ScanJob class
    scanConf.set(TitanHadoopConfiguration.SCAN_JOB_CLASS, scanJob.getClass().getName());

    String jobName = HadoopScanMapper.class.getSimpleName() + "[" + scanJob + "]";

    return runJob(conf, confRootField, hadoopConf, inputFormat, jobName, HadoopScanMapper.class);
}
 
Example #7
Source File: TestTableInputFormat.java    From hbase with Apache License 2.0
void testInputFormat(Class<? extends InputFormat> clazz)
    throws IOException, InterruptedException, ClassNotFoundException {
  final Job job = MapreduceTestingShim.createJob(UTIL.getConfiguration());
  job.setInputFormatClass(clazz);
  job.setOutputFormatClass(NullOutputFormat.class);
  job.setMapperClass(ExampleVerifier.class);
  job.setNumReduceTasks(0);

  LOG.debug("submitting job.");
  assertTrue("job failed!", job.waitForCompletion(true));
  assertEquals("Saw the wrong number of instances of the filtered-for row.", 2, job.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":row", "aaa").getValue());
  assertEquals("Saw any instances of the filtered out row.", 0, job.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":row", "bbb").getValue());
  assertEquals("Saw the wrong number of instances of columnA.", 1, job.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":family", "columnA").getValue());
  assertEquals("Saw the wrong number of instances of columnB.", 1, job.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":family", "columnB").getValue());
  assertEquals("Saw the wrong count of values for the filtered-for row.", 2, job.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":value", "value aaa").getValue());
  assertEquals("Saw the wrong count of values for the filtered-out row.", 0, job.getCounters()
      .findCounter(TestTableInputFormat.class.getName() + ":value", "value bbb").getValue());
}
 
Example #8
Source File: HadoopFormatIOReadTest.java    From beam with Apache License 2.0
/**
 * This test validates the functionality of the {@link HadoopFormatIO.Read#validateTransform()
 * Read.validateTransform()} function when the input type of myKeyTranslate (a simple function
 * provided by the user for key translation) is not the same as the Hadoop InputFormat's key class
 * (the property set in the configuration as "key.class").
 */
@Test
public void testReadValidationFailsWithWrongInputTypeKeyTranslationFunction() {
  SimpleFunction<LongWritable, String> myKeyTranslateWithWrongInputType =
      new SimpleFunction<LongWritable, String>() {
        @Override
        public String apply(LongWritable input) {
          return input.toString();
        }
      };
  HadoopFormatIO.Read<String, Employee> read =
      HadoopFormatIO.<String, Employee>read()
          .withConfiguration(serConf.get())
          .withKeyTranslation(myKeyTranslateWithWrongInputType);
  thrown.expect(IllegalArgumentException.class);
  thrown.expectMessage(
      String.format(
          "Key translation's input type is not same as hadoop InputFormat : %s key "
              + "class : %s",
          serConf.get().getClass("mapreduce.job.inputformat.class", InputFormat.class),
          serConf.get().getClass("key.class", Object.class)));
  read.validateTransform();
}
 
Example #9
Source File: TestCombineFileInputFormat.java    From big-c with Apache License 2.0
@Test
public void testReinit() throws Exception {
  // Test that a split containing multiple files works correctly,
  // with the child RecordReader getting its initialize() method
  // called a second time.
  TaskAttemptID taskId = new TaskAttemptID("jt", 0, TaskType.MAP, 0, 0);
  Configuration conf = new Configuration();
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskId);

  // This will create a CombineFileRecordReader that itself contains a
  // DummyRecordReader.
  InputFormat inputFormat = new ChildRRInputFormat();

  Path [] files = { new Path("file1"), new Path("file2") };
  long [] lengths = { 1, 1 };

  CombineFileSplit split = new CombineFileSplit(files, lengths);
  RecordReader rr = inputFormat.createRecordReader(split, context);
  assertTrue("Unexpected RR type!", rr instanceof CombineFileRecordReader);

  // first initialize() call comes from MapTask. We'll do it here.
  rr.initialize(split, context);

  // First value is first filename.
  assertTrue(rr.nextKeyValue());
  assertEquals("file1", rr.getCurrentValue().toString());

  // The inner RR will return false, because it only emits one (k, v) pair.
  // But there's another sub-split to process. This returns true to us.
  assertTrue(rr.nextKeyValue());
  
  // And the 2nd rr will have its initialize method called correctly.
  assertEquals("file2", rr.getCurrentValue().toString());
  
  // But after both child RR's have returned their singleton (k, v), this
  // should also return false.
  assertFalse(rr.nextKeyValue());
}
 
Example #10
Source File: MneMapreducePersonDataTest.java    From mnemonic with Apache License 2.0
@Test(enabled = true, dependsOnMethods = { "testWritePersonData" })
public void testReadPersonData() throws Exception {
  long sumage = 0L;
  long reccnt = 0L;
  File folder = new File(m_workdir.toString());
  File[] listfiles = folder.listFiles();
  for (int idx = 0; idx < listfiles.length; ++idx) {
    if (listfiles[idx].isFile()
        && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null))
        && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) {
      System.out.println(String.format("Verifying : %s", listfiles[idx].getName()));
      FileSplit split = new FileSplit(
          new Path(m_workdir, listfiles[idx].getName()), 0, 0L, new String[0]);
      InputFormat<NullWritable, MneDurableInputValue<Person<Long>>> inputFormat =
          new MneInputFormat<MneDurableInputValue<Person<Long>>, Person<Long>>();
      RecordReader<NullWritable, MneDurableInputValue<Person<Long>>> reader =
          inputFormat.createRecordReader(split, m_tacontext);
      MneDurableInputValue<Person<Long>> personval = null;
      while (reader.nextKeyValue()) {
        personval = reader.getCurrentValue();
        AssertJUnit.assertTrue(personval.getValue().getAge() < 51);
        sumage += personval.getValue().getAge();
        ++reccnt;
      }
      reader.close();
    }
  }
  AssertJUnit.assertEquals(m_reccnt, reccnt);
  AssertJUnit.assertEquals(m_sumage, sumage);
  System.out.println(String.format("The checksum of ages is %d", sumage));
}
 
Example #11
Source File: TestCombineFileInputFormat.java    From big-c with Apache License 2.0
@Test
public void testRecordReaderInit() throws InterruptedException, IOException {
  // Test that we properly initialize the child recordreader when
  // CombineFileInputFormat and CombineFileRecordReader are used.

  TaskAttemptID taskId = new TaskAttemptID("jt", 0, TaskType.MAP, 0, 0);
  Configuration conf1 = new Configuration();
  conf1.set(DUMMY_KEY, "STATE1");
  TaskAttemptContext context1 = new TaskAttemptContextImpl(conf1, taskId);

  // This will create a CombineFileRecordReader that itself contains a
  // DummyRecordReader.
  InputFormat inputFormat = new ChildRRInputFormat();

  Path [] files = { new Path("file1") };
  long [] lengths = { 1 };

  CombineFileSplit split = new CombineFileSplit(files, lengths);

  RecordReader rr = inputFormat.createRecordReader(split, context1);
  assertTrue("Unexpected RR type!", rr instanceof CombineFileRecordReader);

  // Verify that the initial configuration is the one being used.
  // Right after construction the dummy key should have value "STATE1"
  assertEquals("Invalid initial dummy key value", "STATE1",
    rr.getCurrentKey().toString());

  // Switch the active context for the RecordReader...
  Configuration conf2 = new Configuration();
  conf2.set(DUMMY_KEY, "STATE2");
  TaskAttemptContext context2 = new TaskAttemptContextImpl(conf2, taskId);
  rr.initialize(split, context2);

  // And verify that the new context is updated into the child record reader.
  assertEquals("Invalid secondary dummy key value", "STATE2",
    rr.getCurrentKey().toString());
}
 
Example #12
Source File: HadoopInputFormatTest.java    From flink with Apache License 2.0
private HadoopInputFormat<String, Long> setupHadoopInputFormat(InputFormat<String, Long> inputFormat, Job job,
																RecordReader<String, Long> recordReader) {

	HadoopInputFormat<String, Long> hadoopInputFormat = new HadoopInputFormat<>(inputFormat,
			String.class, Long.class, job);
	hadoopInputFormat.recordReader = recordReader;

	return hadoopInputFormat;
}
 
Example #13
Source File: HadoopScanRunner.java    From titan1withtp3.1 with Apache License 2.0
/**
 * Run a ScanJob on Hadoop MapReduce.
 * <p>
 * The {@code confRootField} parameter must be a string in the format
 * {@code package.package...class#fieldname}, where {@code fieldname} is the
 * name of a public static field on the class specified by the portion of the
 * string before the {@code #}.  The {@code #} itself is just a separator and
 * is discarded.
 * <p>
 * When a MapReduce task process prepares to execute the {@code ScanJob}, it will
 * read the public static field named by {@code confRootField} and cast it to a
 * {@link ConfigNamespace}.  This namespace object becomes the root of a
 * {@link Configuration} instantiated, populated with the key-value pairs
 * from the {@code conf} parameter, and then passed into the {@code ScanJob}.
 * <p>
 * This method blocks until the ScanJob completes, then returns the metrics
 * generated by the job during its execution.  It does not timeout.
 *
 * @param conf configuration settings for the ScanJob
 * @param confRootField the root of the ScanJob's configuration
 * @param hadoopConf the Configuration passed to the MapReduce Job
 * @param inputFormat the InputFormat<StaticBuffer, Iterable<Entry>>
 *        that reads (row, columns) pairs out of a Titan edgestore
 * @return metrics generated by the ScanJob
 * @throws IOException if the job fails for any reason
 * @throws ClassNotFoundException if {@code scanJob.getClass()} cannot be loaded, or if Hadoop
 *         MapReduce's internal job-submission-related reflection fails
 * @throws InterruptedException if interrupted while waiting for the Hadoop
 *         MapReduce job to complete
 */
public static ScanMetrics runJob(Configuration conf, String confRootField,
                                 org.apache.hadoop.conf.Configuration hadoopConf,
                                 Class<? extends InputFormat> inputFormat, String jobName,
                                 Class<? extends Mapper> mapperClass)
        throws IOException, InterruptedException, ClassNotFoundException {

    Preconditions.checkArgument(null != hadoopConf);
    Preconditions.checkArgument(null != inputFormat);

    if (null != conf) {
        Preconditions.checkArgument(null != confRootField,
                "Configuration root field must be provided when configuration instance is provided");
    }

    ModifiableHadoopConfiguration scanConf =
            ModifiableHadoopConfiguration.of(TitanHadoopConfiguration.MAPRED_NS, hadoopConf);

    if (null != confRootField) {
        // Set the scanjob configuration root
        scanConf.set(TitanHadoopConfiguration.SCAN_JOB_CONFIG_ROOT, confRootField);

        // Instantiate scanjob configuration root
        ConfigNamespace confRoot = HadoopScanMapper.getJobRoot(confRootField);

        // Create writable view of scanjob configuration atop the Hadoop Configuration instance, where all keys are prefixed with SCAN_JOB_CONFIG_KEYS
        ModifiableConfiguration hadoopJobConf = ModifiableHadoopConfiguration.prefixView(confRoot,
                TitanHadoopConfiguration.SCAN_JOB_CONFIG_KEYS, scanConf);

        // Copy scanjob settings from the Titan Configuration instance to the Hadoop Configuration instance
        Map<String, Object> jobConfMap = conf.getSubset(confRoot);
        for (Map.Entry<String, Object> jobConfEntry : jobConfMap.entrySet()) {
            hadoopJobConf.set((ConfigOption) ConfigElement.parse(confRoot, jobConfEntry.getKey()).element, jobConfEntry.getValue());
        }
    }

    return runJob(scanConf.getHadoopConfiguration(), inputFormat, jobName, mapperClass);
}
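A call to this method looks roughly like the sketch below; the ScanJob class, its configuration-root field, and the InputFormat are hypothetical placeholders rather than real Titan classes:

// Sketch with placeholder names: "com.example.MyScanJob#ROOT_NS" must name a public
// static ConfigNamespace field on MyScanJob, as described in the javadoc above.
ScanMetrics metrics = HadoopScanRunner.runJob(
        scanJobConfiguration,               // Titan Configuration for the ScanJob (may be null)
        "com.example.MyScanJob#ROOT_NS",    // confRootField (hypothetical)
        hadoopConfiguration,                // org.apache.hadoop.conf.Configuration
        EdgestoreInputFormat.class,         // placeholder InputFormat<StaticBuffer, Iterable<Entry>>
        "my-scan-job",                      // jobName
        HadoopScanMapper.class);            // mapperClass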
 
Example #14
Source File: JsonLoader.java    From spork with Apache License 2.0
@SuppressWarnings("unchecked")
public InputFormat getInputFormat() throws IOException {
    // We will use TextInputFormat, the default Hadoop input format for
    // text.  It has a LongWritable key that we will ignore, and the value
    // is a Text (a string writable) that the JSON data is in.
    return new TextInputFormat();
}
 
Example #15
Source File: InputSampler.java    From hadoop with Apache License 2.0
/**
 * Write a partition file for the given job, using the Sampler provided.
 * Queries the sampler for a sample keyset, sorts by the output key
 * comparator, selects the keys for each rank, and writes to the destination
 * returned from {@link TotalOrderPartitioner#getPartitionFile}.
 */
@SuppressWarnings("unchecked") // getInputFormat, getOutputKeyComparator
public static <K,V> void writePartitionFile(Job job, Sampler<K,V> sampler) 
    throws IOException, ClassNotFoundException, InterruptedException {
  Configuration conf = job.getConfiguration();
  final InputFormat inf = 
      ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
  int numPartitions = job.getNumReduceTasks();
  K[] samples = (K[])sampler.getSample(inf, job);
  LOG.info("Using " + samples.length + " samples");
  RawComparator<K> comparator =
    (RawComparator<K>) job.getSortComparator();
  Arrays.sort(samples, comparator);
  Path dst = new Path(TotalOrderPartitioner.getPartitionFile(conf));
  FileSystem fs = dst.getFileSystem(conf);
  if (fs.exists(dst)) {
    fs.delete(dst, false);
  }
  SequenceFile.Writer writer = SequenceFile.createWriter(fs, 
    conf, dst, job.getMapOutputKeyClass(), NullWritable.class);
  NullWritable nullValue = NullWritable.get();
  float stepSize = samples.length / (float) numPartitions;
  int last = -1;
  for(int i = 1; i < numPartitions; ++i) {
    int k = Math.round(stepSize * i);
    while (last >= k && comparator.compare(samples[last], samples[k]) == 0) {
      ++k;
    }
    writer.append(samples[k], nullValue);
    last = k;
  }
  writer.close();
}
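A typical caller pairs this with TotalOrderPartitioner, roughly as in the sketch below; the sampler parameters and partition-file path are arbitrary, and the job's map output key is assumed to be Text:

// Sketch: sample the input, write the partition file, and sort with TotalOrderPartitioner.
job.setPartitionerClass(TotalOrderPartitioner.class);
TotalOrderPartitioner.setPartitionFile(job.getConfiguration(),
    new Path("/tmp/partitions.lst"));                       // placeholder path
InputSampler.Sampler<Text, Text> sampler =
    new InputSampler.RandomSampler<>(0.1, 10000, 10);       // freq, numSamples, maxSplitsSampled
InputSampler.writePartitionFile(job, sampler);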
 
Example #16
Source File: HadoopFormatIOReadTest.java    From beam with Apache License 2.0
/**
 * This test validates behavior of {@link
 * HadoopInputFormatBoundedSource#computeSplitsIfNecessary() computeSplits()} when the Hadoop
 * InputFormat's {@link InputFormat#getSplits(JobContext) getSplits(JobContext)} returns an
 * InputSplit list containing null values.
 */
@Test
public void testComputeSplitsIfGetSplitsReturnsListHavingNullValues() throws Exception {
  // InputSplit list having null value.
  InputSplit mockInputSplit =
      Mockito.mock(InputSplit.class, Mockito.withSettings().extraInterfaces(Writable.class));
  List<InputSplit> inputSplitList = new ArrayList<>();
  inputSplitList.add(mockInputSplit);
  inputSplitList.add(null);
  InputFormat<Text, Employee> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
  Mockito.when(mockInputFormat.getSplits(Mockito.any(JobContext.class)))
      .thenReturn(inputSplitList);
  HadoopInputFormatBoundedSource<Text, Employee> hifSource =
      new HadoopInputFormatBoundedSource<>(
          serConf,
          WritableCoder.of(Text.class),
          AvroCoder.of(Employee.class),
          null, // No key translation required.
          null, // No value translation required.
          new SerializableSplit());
  thrown.expect(IOException.class);
  thrown.expectMessage(
      "Error in computing splits, split is null in InputSplits list populated "
          + "by getSplits() : ");
  hifSource.setInputFormatObj(mockInputFormat);
  hifSource.computeSplitsIfNecessary();
}
 
Example #17
Source File: InputSampler.java    From hadoop with Apache License 2.0
/**
 * From each split sampled, take the first numSamples / numSplits records.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, Job job) 
    throws IOException, InterruptedException {
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());
  int samplesPerSplit = numSamples / splitsToSample;
  long records = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
        job.getConfiguration(), new TaskAttemptID());
    RecordReader<K,V> reader = inf.createRecordReader(
        splits.get(i), samplingContext);
    reader.initialize(splits.get(i), samplingContext);
    while (reader.nextKeyValue()) {
      samples.add(ReflectionUtils.copy(job.getConfiguration(),
                                       reader.getCurrentKey(), null));
      ++records;
      if ((i+1) * samplesPerSplit <= records) {
        break;
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
 
Example #18
Source File: InputSampler.java    From hadoop with Apache License 2.0
/**
 * For each split sampled, emit when the ratio of the number of records
 * retained to the total record count is less than the specified
 * frequency.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, Job job) 
    throws IOException, InterruptedException {
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>();
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());
  long records = 0;
  long kept = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
        job.getConfiguration(), new TaskAttemptID());
    RecordReader<K,V> reader = inf.createRecordReader(
        splits.get(i), samplingContext);
    reader.initialize(splits.get(i), samplingContext);
    while (reader.nextKeyValue()) {
      ++records;
      if ((double) kept / records < freq) {
        samples.add(ReflectionUtils.copy(job.getConfiguration(),
                             reader.getCurrentKey(), null));
        ++kept;
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
 
Example #19
Source File: ExportJobBase.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
@Override
protected Class<? extends InputFormat> getInputFormatClass()
    throws ClassNotFoundException {
  Class<? extends InputFormat> configuredIF = super.getInputFormatClass();
  if (null == configuredIF) {
    return ExportInputFormat.class;
  } else {
    return configuredIF;
  }
}
 
Example #20
Source File: TestMultipleInputs.java    From big-c with Apache License 2.0
@SuppressWarnings("unchecked")
public void testAddInputPathWithFormat() throws IOException {
  final Job conf = Job.getInstance();
  MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class);
  MultipleInputs.addInputPath(conf, new Path("/bar"),
      KeyValueTextInputFormat.class);
  final Map<Path, InputFormat> inputs = MultipleInputs
     .getInputFormatMap(conf);
  assertEquals(TextInputFormat.class, inputs.get(new Path("/foo")).getClass());
  assertEquals(KeyValueTextInputFormat.class, inputs.get(new Path("/bar"))
     .getClass());
}
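Beyond the format-only variant tested here, MultipleInputs can also bind a distinct Mapper to each path; a sketch, with the mapper classes as hypothetical placeholders:

// Sketch: one InputFormat and one Mapper per input path (FooMapper and BarMapper are hypothetical).
Job job = Job.getInstance(new Configuration());
MultipleInputs.addInputPath(job, new Path("/foo"), TextInputFormat.class, FooMapper.class);
MultipleInputs.addInputPath(job, new Path("/bar"), KeyValueTextInputFormat.class, BarMapper.class);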
 
Example #21
Source File: DelegatingRecordReader.java    From hadoop with Apache License 2.0
/**
 * Constructs the DelegatingRecordReader.
 * 
 * @param split TaggedInputSplit object
 * @param context TaskAttemptContext object
 *  
 * @throws IOException
 * @throws InterruptedException
 */
@SuppressWarnings("unchecked")
public DelegatingRecordReader(InputSplit split, TaskAttemptContext context)
    throws IOException, InterruptedException {
  // Find the InputFormat and then the RecordReader from the
  // TaggedInputSplit.
  TaggedInputSplit taggedInputSplit = (TaggedInputSplit) split;
  InputFormat<K, V> inputFormat = (InputFormat<K, V>) ReflectionUtils
      .newInstance(taggedInputSplit.getInputFormatClass(), context
          .getConfiguration());
  originalRR = inputFormat.createRecordReader(taggedInputSplit
      .getInputSplit(), context);
}
 
Example #22
Source File: TableMapReduceUtil.java    From hbase with Apache License 2.0
/**
 * Use this before submitting a TableMap job. It will appropriately set up
 * the job.
 *
 * @param table  The table name to read from.
 * @param scan  The scan instance with the columns, time range etc.
 * @param mapper  The mapper class to use.
 * @param outputKeyClass  The class of the output key.
 * @param outputValueClass  The class of the output value.
 * @param job  The current job to adjust.  Make sure the passed job is
 * carrying all necessary HBase configuration.
 * @param addDependencyJars upload HBase jars and jars for any of the configured
 *           job classes via the distributed cache (tmpjars).
 * @param initCredentials whether to initialize hbase auth credentials for the job
 * @param inputFormatClass the input format
 * @throws IOException When setting up the details fails.
 */
public static void initTableMapperJob(String table, Scan scan,
    Class<? extends TableMapper> mapper,
    Class<?> outputKeyClass,
    Class<?> outputValueClass, Job job,
    boolean addDependencyJars, boolean initCredentials,
    Class<? extends InputFormat> inputFormatClass)
throws IOException {
  job.setInputFormatClass(inputFormatClass);
  if (outputValueClass != null) job.setMapOutputValueClass(outputValueClass);
  if (outputKeyClass != null) job.setMapOutputKeyClass(outputKeyClass);
  job.setMapperClass(mapper);
  if (Put.class.equals(outputValueClass)) {
    job.setCombinerClass(PutCombiner.class);
  }
  Configuration conf = job.getConfiguration();
  HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf));
  conf.set(TableInputFormat.INPUT_TABLE, table);
  conf.set(TableInputFormat.SCAN, convertScanToString(scan));
  conf.setStrings("io.serializations", conf.get("io.serializations"),
      MutationSerialization.class.getName(), ResultSerialization.class.getName(),
      CellSerialization.class.getName());
  if (addDependencyJars) {
    addDependencyJars(job);
  }
  if (initCredentials) {
    initCredentials(job);
  }
}
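A caller wires this up roughly as in the sketch below; the table name, scan tuning values, and mapper class are placeholders:

// Sketch: configure a table-scanning map-only job; MyTableMapper is a hypothetical TableMapper.
Job job = Job.getInstance(HBaseConfiguration.create(), "my-table-scan");
Scan scan = new Scan();
scan.setCaching(500);          // larger scanner caching reduces RPC round trips
scan.setCacheBlocks(false);    // avoid polluting the block cache from MapReduce scans
TableMapReduceUtil.initTableMapperJob(
    "my_table", scan, MyTableMapper.class,
    ImmutableBytesWritable.class, Result.class, job,
    true,                      // addDependencyJars
    true,                      // initCredentials
    TableInputFormat.class);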
 
Example #23
Source File: HadoopFormatIOReadTest.java    From beam with Apache License 2.0
/**
 * This test validates the functionality of the {@link
 * HadoopFormatIO.Read#withConfiguration(Configuration) withConfiguration(Configuration)} function
 * when the value class is not provided by the user in the configuration.
 */
@Test
public void testReadValidationFailsMissingValueClassInConf() {
  Configuration configuration = new Configuration();
  configuration.setClass(
      "mapreduce.job.inputformat.class", EmployeeInputFormat.class, InputFormat.class);
  configuration.setClass("key.class", Text.class, Object.class);
  thrown.expect(IllegalArgumentException.class);
  HadoopFormatIO.<Text, Employee>read().withConfiguration(configuration);
}
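For contrast, a configuration that passes this validation supplies all three classes; a sketch using the same test fixtures:

// Sketch: a configuration accepted by withConfiguration(), with the input format,
// key class, and value class all present.
Configuration configuration = new Configuration();
configuration.setClass(
    "mapreduce.job.inputformat.class", EmployeeInputFormat.class, InputFormat.class);
configuration.setClass("key.class", Text.class, Object.class);
configuration.setClass("value.class", Employee.class, Object.class);
HadoopFormatIO.Read<Text, Employee> read =
    HadoopFormatIO.<Text, Employee>read().withConfiguration(configuration);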
 
Example #24
Source File: TestCombineFileInputFormat.java    From hadoop with Apache License 2.0
@Test
public void testReinit() throws Exception {
  // Test that a split containing multiple files works correctly,
  // with the child RecordReader getting its initialize() method
  // called a second time.
  TaskAttemptID taskId = new TaskAttemptID("jt", 0, TaskType.MAP, 0, 0);
  Configuration conf = new Configuration();
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskId);

  // This will create a CombineFileRecordReader that itself contains a
  // DummyRecordReader.
  InputFormat inputFormat = new ChildRRInputFormat();

  Path [] files = { new Path("file1"), new Path("file2") };
  long [] lengths = { 1, 1 };

  CombineFileSplit split = new CombineFileSplit(files, lengths);
  RecordReader rr = inputFormat.createRecordReader(split, context);
  assertTrue("Unexpected RR type!", rr instanceof CombineFileRecordReader);

  // first initialize() call comes from MapTask. We'll do it here.
  rr.initialize(split, context);

  // First value is first filename.
  assertTrue(rr.nextKeyValue());
  assertEquals("file1", rr.getCurrentValue().toString());

  // The inner RR will return false, because it only emits one (k, v) pair.
  // But there's another sub-split to process. This returns true to us.
  assertTrue(rr.nextKeyValue());
  
  // And the 2nd rr will have its initialize method called correctly.
  assertEquals("file2", rr.getCurrentValue().toString());
  
  // But after both child RR's have returned their singleton (k, v), this
  // should also return false.
  assertFalse(rr.nextKeyValue());
}
 
Example #25
Source File: TestCombineFileInputFormat.java    From hadoop with Apache License 2.0
@Test
public void testRecordReaderInit() throws InterruptedException, IOException {
  // Test that we properly initialize the child recordreader when
  // CombineFileInputFormat and CombineFileRecordReader are used.

  TaskAttemptID taskId = new TaskAttemptID("jt", 0, TaskType.MAP, 0, 0);
  Configuration conf1 = new Configuration();
  conf1.set(DUMMY_KEY, "STATE1");
  TaskAttemptContext context1 = new TaskAttemptContextImpl(conf1, taskId);

  // This will create a CombineFileRecordReader that itself contains a
  // DummyRecordReader.
  InputFormat inputFormat = new ChildRRInputFormat();

  Path [] files = { new Path("file1") };
  long [] lengths = { 1 };

  CombineFileSplit split = new CombineFileSplit(files, lengths);

  RecordReader rr = inputFormat.createRecordReader(split, context1);
  assertTrue("Unexpected RR type!", rr instanceof CombineFileRecordReader);

  // Verify that the initial configuration is the one being used.
  // Right after construction the dummy key should have value "STATE1"
  assertEquals("Invalid initial dummy key value", "STATE1",
    rr.getCurrentKey().toString());

  // Switch the active context for the RecordReader...
  Configuration conf2 = new Configuration();
  conf2.set(DUMMY_KEY, "STATE2");
  TaskAttemptContext context2 = new TaskAttemptContextImpl(conf2, taskId);
  rr.initialize(split, context2);

  // And verify that the new context is updated into the child record reader.
  assertEquals("Invalid secondary dummy key value", "STATE2",
    rr.getCurrentKey().toString());
}
 
Example #26
Source File: InjectableConnManager.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
/**
 * Allow the user to inject custom mapper, input, and output formats
 * into the importTable() process.
 */
@Override
@SuppressWarnings("unchecked")
public void importTable(ImportJobContext context)
    throws IOException, ImportException {

  SqoopOptions options = context.getOptions();
  Configuration conf = options.getConf();

  Class<? extends Mapper> mapperClass = (Class<? extends Mapper>)
      conf.getClass(MAPPER_KEY, Mapper.class);
  Class<? extends InputFormat> ifClass = (Class<? extends InputFormat>)
      conf.getClass(INPUT_FORMAT_KEY, TextInputFormat.class);
  Class<? extends OutputFormat> ofClass = (Class<? extends OutputFormat>)
      conf.getClass(OUTPUT_FORMAT_KEY, TextOutputFormat.class);

  Class<? extends ImportJobBase> jobClass = (Class<? extends ImportJobBase>)
      conf.getClass(IMPORT_JOB_KEY, ImportJobBase.class);

  String tableName = context.getTableName();

  // Instantiate the user's chosen ImportJobBase instance.
  ImportJobBase importJob = ReflectionUtils.newInstance(jobClass, conf);

  // And configure the dependencies to inject
  importJob.setOptions(options);
  importJob.setMapperClass(mapperClass);
  importJob.setInputFormatClass(ifClass);
  importJob.setOutputFormatClass(ofClass);

  importJob.runImport(tableName, context.getJarFile(),
      getSplitColumn(options, tableName), conf);
}
 
Example #27
Source File: HadoopInputFormatTest.java    From Flink-CEPplus with Apache License 2.0
private HadoopInputFormat<String, Long> setupHadoopInputFormat(InputFormat<String, Long> inputFormat, Job job,
																RecordReader<String, Long> recordReader) {

	HadoopInputFormat<String, Long> hadoopInputFormat = new HadoopInputFormat<>(inputFormat,
			String.class, Long.class, job);
	hadoopInputFormat.recordReader = recordReader;

	return hadoopInputFormat;
}
 
Example #28
Source File: ImportJobBase.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
public ImportJobBase(final SqoopOptions opts,
    final Class<? extends Mapper> mapperClass,
    final Class<? extends InputFormat> inputFormatClass,
    final Class<? extends OutputFormat> outputFormatClass,
    final ImportJobContext context) {
  super(opts, mapperClass, inputFormatClass, outputFormatClass, context);
}
 
Example #29
Source File: MneMapreduceLongDataTest.java    From mnemonic with Apache License 2.0
@Test(enabled = true, dependsOnMethods = { "testWriteLongData" })
public void testReadLongData() throws Exception {
  long sum = 0L;
  long reccnt = 0L;
  File folder = new File(m_workdir.toString());
  File[] listfiles = folder.listFiles();
  for (int idx = 0; idx < listfiles.length; ++idx) {
    if (listfiles[idx].isFile()
        && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null))
        && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) {
      System.out.println(String.format("Verifying : %s", listfiles[idx].getName()));
      FileSplit split = new FileSplit(
          new Path(m_workdir, listfiles[idx].getName()), 0, 0L, new String[0]);
      InputFormat<NullWritable, MneDurableInputValue<Long>> inputFormat =
          new MneInputFormat<MneDurableInputValue<Long>, Long>();
      RecordReader<NullWritable, MneDurableInputValue<Long>> reader =
          inputFormat.createRecordReader(split, m_tacontext);
      MneDurableInputValue<Long> mdval = null;
      while (reader.nextKeyValue()) {
        mdval = reader.getCurrentValue();
        sum += mdval.getValue();
        ++reccnt;
      }
      reader.close();
    }
  }
  AssertJUnit.assertEquals(m_sum, sum);
  AssertJUnit.assertEquals(m_reccnt, reccnt);
  System.out.println(String.format("The checksum of long data is %d", sum));
}
 
Example #30
Source File: DelegatingRecordReader.java    From big-c with Apache License 2.0
/**
 * Constructs the DelegatingRecordReader.
 * 
 * @param split TaggedInputSplit object
 * @param context TaskAttemptContext object
 *  
 * @throws IOException
 * @throws InterruptedException
 */
@SuppressWarnings("unchecked")
public DelegatingRecordReader(InputSplit split, TaskAttemptContext context)
    throws IOException, InterruptedException {
  // Find the InputFormat and then the RecordReader from the
  // TaggedInputSplit.
  TaggedInputSplit taggedInputSplit = (TaggedInputSplit) split;
  InputFormat<K, V> inputFormat = (InputFormat<K, V>) ReflectionUtils
      .newInstance(taggedInputSplit.getInputFormatClass(), context
          .getConfiguration());
  originalRR = inputFormat.createRecordReader(taggedInputSplit
      .getInputSplit(), context);
}