Java Code Examples for org.apache.hadoop.mapreduce.InputFormat#createRecordReader()

The following examples show how to use org.apache.hadoop.mapreduce.InputFormat#createRecordReader(). Each example is taken from an open source project; the source file, project, and license are noted above the code.
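Nearly every example below follows the same four-step pattern: get the splits from the InputFormat, call createRecordReader() for each split, initialize() the returned reader, and iterate with nextKeyValue(). The minimal sketch below illustrates that pattern with TextInputFormat; the class name, input path, and dummy TaskAttemptID are illustrative assumptions rather than code from any of the projects listed on this page.

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskType;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class CreateRecordReaderSketch {
    public static void main(String[] args) throws Exception {
        // Assumed input path; point this at a real file or directory.
        Job job = Job.getInstance(new Configuration());
        FileInputFormat.setInputPaths(job, new Path("/tmp/input"));

        InputFormat<LongWritable, Text> format = new TextInputFormat();
        // A dummy task attempt, similar to the test contexts used in the examples below.
        TaskAttemptID taskId = new TaskAttemptID("jt", 0, TaskType.MAP, 0, 0);
        TaskAttemptContext context =
                new TaskAttemptContextImpl(job.getConfiguration(), taskId);

        List<InputSplit> splits = format.getSplits(job);
        for (InputSplit split : splits) {
            // createRecordReader() only constructs the reader;
            // initialize() must be called before nextKeyValue().
            RecordReader<LongWritable, Text> reader =
                    format.createRecordReader(split, context);
            reader.initialize(split, context);
            try {
                while (reader.nextKeyValue()) {
                    System.out.println(reader.getCurrentKey() + "\t" + reader.getCurrentValue());
                }
            } finally {
                reader.close();
            }
        }
    }
}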
Example 1
Source File: StatementPatternStorageTest.java    From rya with Apache License 2.0
protected List<StatementPatternStorage> createStorages(final String location) throws IOException, InterruptedException {
    final List<StatementPatternStorage> storages = new ArrayList<StatementPatternStorage>();
    StatementPatternStorage storage = new StatementPatternStorage();
    final InputFormat<?, ?> inputFormat = storage.getInputFormat();
    Job job = Job.getInstance(new Configuration());
    storage.setLocation(location, job);
    final List<InputSplit> splits = inputFormat.getSplits(job);
    assertNotNull(splits);

    for (final InputSplit inputSplit : splits) {
        storage = new StatementPatternStorage();
        job = Job.getInstance(new Configuration());
        storage.setLocation(location, job);
        final TaskAttemptContext taskAttemptContext = new TaskAttemptContextImpl(job.getConfiguration(),
                new TaskAttemptID("jtid", 0, TaskType.REDUCE, 0, 0));
        final RecordReader<?, ?> recordReader = inputFormat.createRecordReader(inputSplit,
                taskAttemptContext);
        recordReader.initialize(inputSplit, taskAttemptContext);

        storage.prepareToRead(recordReader, null);
        storages.add(storage);
    }
    return storages;
}
 
Example 2
Source File: TestCombineTextInputFormat.java    From big-c with Apache License 2.0
private static List<Text> readSplit(InputFormat<LongWritable,Text> format,
  InputSplit split, Job job) throws IOException, InterruptedException {
  List<Text> result = new ArrayList<Text>();
  Configuration conf = job.getConfiguration();
  TaskAttemptContext context = MapReduceTestUtil.
    createDummyMapTaskAttemptContext(conf);
  RecordReader<LongWritable, Text> reader = format.createRecordReader(split,
    MapReduceTestUtil.createDummyMapTaskAttemptContext(conf));
  MapContext<LongWritable,Text,LongWritable,Text> mcontext =
    new MapContextImpl<LongWritable,Text,LongWritable,Text>(conf,
    context.getTaskAttemptID(), reader, null, null,
    MapReduceTestUtil.createDummyReporter(),
    split);
  reader.initialize(split, mcontext);
  while (reader.nextKeyValue()) {
    result.add(new Text(reader.getCurrentValue()));
  }
  return result;
}
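A helper like readSplit above is typically invoked once per split of a configured job. The fragment below is a hypothetical usage sketch, not part of the test: it assumes a Job named job whose input paths are already set, and uses CombineTextInputFormat, the format exercised by this test class.

// Illustrative fragment only; assumes a configured Job named "job".
InputFormat<LongWritable, Text> format = new CombineTextInputFormat();
for (InputSplit split : format.getSplits(job)) {
    for (Text line : readSplit(format, split, job)) {
        System.out.println(line);
    }
}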
 
Example 3
Source File: TestMRSequenceFileInputFilter.java    From hadoop with Apache License 2.0
private int countRecords(int numSplits) 
    throws IOException, InterruptedException {
  InputFormat<Text, BytesWritable> format =
    new SequenceFileInputFilter<Text, BytesWritable>();
  if (numSplits == 0) {
    numSplits =
      random.nextInt(MAX_LENGTH / (SequenceFile.SYNC_INTERVAL / 20)) + 1;
  }
  FileInputFormat.setMaxInputSplitSize(job, 
    fs.getFileStatus(inFile).getLen() / numSplits);
  TaskAttemptContext context = MapReduceTestUtil.
    createDummyMapTaskAttemptContext(job.getConfiguration());
  // check each split
  int count = 0;
  for (InputSplit split : format.getSplits(job)) {
    RecordReader<Text, BytesWritable> reader =
      format.createRecordReader(split, context);
    MapContext<Text, BytesWritable, Text, BytesWritable> mcontext = 
      new MapContextImpl<Text, BytesWritable, Text, BytesWritable>(
      job.getConfiguration(), 
      context.getTaskAttemptID(), reader, null, null, 
      MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mcontext);
    try {
      while (reader.nextKeyValue()) {
        LOG.info("Accept record " + reader.getCurrentKey().toString());
        count++;
      }
    } finally {
      reader.close();
    }
  }
  return count;
}
 
Example 4
Source File: GraphFilterRecordReader.java    From tinkerpop with Apache License 2.0
@Override
public void initialize(final InputSplit inputSplit, final TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
    final Configuration configuration = taskAttemptContext.getConfiguration();
    final InputFormat<NullWritable, VertexWritable> inputFormat = ReflectionUtils.newInstance(configuration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, InputFormat.class, InputFormat.class), configuration);
    if (!(inputFormat instanceof GraphFilterAware) && configuration.get(Constants.GREMLIN_HADOOP_GRAPH_FILTER, null) != null)
        this.graphFilter = VertexProgramHelper.deserialize(ConfUtil.makeApacheConfiguration(configuration), Constants.GREMLIN_HADOOP_GRAPH_FILTER);
    this.recordReader = inputFormat.createRecordReader(inputSplit, taskAttemptContext);
    this.recordReader.initialize(inputSplit, taskAttemptContext);
}
 
Example 5
Source File: TestCombineFileInputFormat.java    From hadoop with Apache License 2.0
@Test
public void testReinit() throws Exception {
  // Test that a split containing multiple files works correctly,
  // with the child RecordReader getting its initialize() method
  // called a second time.
  TaskAttemptID taskId = new TaskAttemptID("jt", 0, TaskType.MAP, 0, 0);
  Configuration conf = new Configuration();
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskId);

  // This will create a CombineFileRecordReader that itself contains a
  // DummyRecordReader.
  InputFormat inputFormat = new ChildRRInputFormat();

  Path [] files = { new Path("file1"), new Path("file2") };
  long [] lengths = { 1, 1 };

  CombineFileSplit split = new CombineFileSplit(files, lengths);
  RecordReader rr = inputFormat.createRecordReader(split, context);
  assertTrue("Unexpected RR type!", rr instanceof CombineFileRecordReader);

  // first initialize() call comes from MapTask. We'll do it here.
  rr.initialize(split, context);

  // First value is first filename.
  assertTrue(rr.nextKeyValue());
  assertEquals("file1", rr.getCurrentValue().toString());

  // The inner RR will return false, because it only emits one (k, v) pair.
  // But there's another sub-split to process. This returns true to us.
  assertTrue(rr.nextKeyValue());
  
  // And the 2nd rr will have its initialize method called correctly.
  assertEquals("file2", rr.getCurrentValue().toString());
  
  // But after both child RR's have returned their singleton (k, v), this
  // should also return false.
  assertFalse(rr.nextKeyValue());
}
 
Example 6
Source File: InputSampler.java    From big-c with Apache License 2.0
/**
 * From each split sampled, take the first numSamples / numSplits records.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, Job job) 
    throws IOException, InterruptedException {
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());
  int samplesPerSplit = numSamples / splitsToSample;
  long records = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
        job.getConfiguration(), new TaskAttemptID());
    RecordReader<K,V> reader = inf.createRecordReader(
        splits.get(i), samplingContext);
    reader.initialize(splits.get(i), samplingContext);
    while (reader.nextKeyValue()) {
      samples.add(ReflectionUtils.copy(job.getConfiguration(),
                                       reader.getCurrentKey(), null));
      ++records;
      if ((i+1) * samplesPerSplit <= records) {
        break;
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
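A sampler implementation like the one above is rarely called directly; InputSampler.writePartitionFile() drives getSample() and writes the sampled keys as partition boundaries for TotalOrderPartitioner. The sketch below shows that common wiring; the paths, input format, and sampler parameters are assumptions for illustration and do not come from this source file.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class SamplerUsageSketch {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "total-order-sort");

        // Assumed input: tab-separated text whose key type (Text) matches the sampler.
        job.setInputFormatClass(KeyValueTextInputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path("/tmp/input"));

        job.setPartitionerClass(TotalOrderPartitioner.class);
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(),
                new Path("/tmp/_partitions"));

        // Take up to 1000 samples from at most 10 splits (illustrative values).
        InputSampler.Sampler<Text, Text> sampler =
                new InputSampler.SplitSampler<Text, Text>(1000, 10);

        // writePartitionFile() calls getSample(), sorts the sampled keys, and
        // writes the partition boundaries consumed by TotalOrderPartitioner.
        InputSampler.writePartitionFile(job, sampler);
    }
}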
 
Example 7
Source File: MneMapreducePersonDataTest.java    From mnemonic with Apache License 2.0
@Test(enabled = true, dependsOnMethods = { "testWritePersonData" })
public void testReadPersonData() throws Exception {
  long sumage = 0L;
  long reccnt = 0L;
  File folder = new File(m_workdir.toString());
  File[] listfiles = folder.listFiles();
  for (int idx = 0; idx < listfiles.length; ++idx) {
    if (listfiles[idx].isFile()
        && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null))
        && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) {
      System.out.println(String.format("Verifying : %s", listfiles[idx].getName()));
      FileSplit split = new FileSplit(
          new Path(m_workdir, listfiles[idx].getName()), 0, 0L, new String[0]);
      InputFormat<NullWritable, MneDurableInputValue<Person<Long>>> inputFormat =
          new MneInputFormat<MneDurableInputValue<Person<Long>>, Person<Long>>();
      RecordReader<NullWritable, MneDurableInputValue<Person<Long>>> reader =
          inputFormat.createRecordReader(split, m_tacontext);
      MneDurableInputValue<Person<Long>> personval = null;
      while (reader.nextKeyValue()) {
        personval = reader.getCurrentValue();
        AssertJUnit.assertTrue(personval.getValue().getAge() < 51);
        sumage += personval.getValue().getAge();
        ++reccnt;
      }
      reader.close();
    }
  }
  AssertJUnit.assertEquals(m_reccnt, reccnt);
  AssertJUnit.assertEquals(m_sumage, sumage);
  System.out.println(String.format("The checksum of ages is %d", sumage));
}
 
Example 8
Source File: IndexedStorage.java    From spork with Apache License 2.0
/**
 * IndexableLoadFunc interface implementation
 */
@Override
public void initialize(Configuration conf) throws IOException {
    try {
        InputFormat inputFormat = this.getInputFormat();
        TaskAttemptID id = HadoopShims.getNewTaskAttemptID();

        if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
            conf.set(MRConfiguration.JOB_CREDENTIALS_BINARY,
                    System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
        }
        List<FileSplit> fileSplits = inputFormat.getSplits(HadoopShims.createJobContext(conf, null));
        this.readers = new IndexedStorageRecordReader[fileSplits.size()];

        int idx = 0;
        Iterator<FileSplit> it = fileSplits.iterator();
        while (it.hasNext()) {
            FileSplit fileSplit = it.next();
            TaskAttemptContext context = HadoopShims.createTaskAttemptContext(conf, id);
            IndexedStorageRecordReader r = (IndexedStorageRecordReader) inputFormat.createRecordReader(fileSplit, context);
            r.initialize(fileSplit, context);
            this.readers[idx] = r;
            idx++;
        }

        Arrays.sort(this.readers, this.readerComparator);
    } catch (InterruptedException e) {
        throw new IOException(e);
    }
}
 
Example 9
Source File: MneMapreduceLongDataTest.java    From mnemonic with Apache License 2.0
@Test(enabled = true, dependsOnMethods = { "testWriteLongData" })
public void testReadLongData() throws Exception {
  long sum = 0L;
  long reccnt = 0L;
  File folder = new File(m_workdir.toString());
  File[] listfiles = folder.listFiles();
  for (int idx = 0; idx < listfiles.length; ++idx) {
    if (listfiles[idx].isFile()
        && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null))
        && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) {
      System.out.println(String.format("Verifying : %s", listfiles[idx].getName()));
      FileSplit split = new FileSplit(
          new Path(m_workdir, listfiles[idx].getName()), 0, 0L, new String[0]);
      InputFormat<NullWritable, MneDurableInputValue<Long>> inputFormat =
          new MneInputFormat<MneDurableInputValue<Long>, Long>();
      RecordReader<NullWritable, MneDurableInputValue<Long>> reader =
          inputFormat.createRecordReader(split, m_tacontext);
      MneDurableInputValue<Long> mdval = null;
      while (reader.nextKeyValue()) {
        mdval = reader.getCurrentValue();
        sum += mdval.getValue();
        ++reccnt;
      }
      reader.close();
    }
  }
  AssertJUnit.assertEquals(m_sum, sum);
  AssertJUnit.assertEquals(m_reccnt, reccnt);
  System.out.println(String.format("The checksum of long data is %d", sum));
}
 
Example 10
Source File: InputSampler.java    From hadoop with Apache License 2.0
/**
 * For each split sampled, emit when the ratio of the number of records
 * retained to the total record count is less than the specified
 * frequency.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, Job job) 
    throws IOException, InterruptedException {
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>();
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());
  long records = 0;
  long kept = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
        job.getConfiguration(), new TaskAttemptID());
    RecordReader<K,V> reader = inf.createRecordReader(
        splits.get(i), samplingContext);
    reader.initialize(splits.get(i), samplingContext);
    while (reader.nextKeyValue()) {
      ++records;
      if ((double) kept / records < freq) {
        samples.add(ReflectionUtils.copy(job.getConfiguration(),
                             reader.getCurrentKey(), null));
        ++kept;
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
 
Example 11
Source File: CubertInputFormat.java    From Cubert with Apache License 2.0
@Override
public RecordReader<K, V> createRecordReader(InputSplit split,
                                             TaskAttemptContext context) throws IOException,
        InterruptedException
{
    Configuration conf = context.getConfiguration();
    ConfigurationDiff confDiff = new ConfigurationDiff(conf);

    MultiMapperSplit mmSplit = (MultiMapperSplit) split;
    int multiMapperIndex = mmSplit.getMultiMapperIndex();

    confDiff.applyDiff(multiMapperIndex);

    // reset the conf to multiMapperIndex
    InputSplit actualSplit = mmSplit.getActualSplit();

    // get the actual input format class
    InputFormat<K, V> actualInputFormat = getActualInputFormat(context);

    RecordReader<K, V> reader = null;

    if (actualSplit instanceof CombineFileSplit)
    {
        reader =
                new CombinedFileRecordReader<K, V>(actualInputFormat,
                                                   (CombineFileSplit) actualSplit,
                                                   context);
    }
    else
    {
        reader = actualInputFormat.createRecordReader(actualSplit, context);
    }

    // confDiff.undoDiff(multiMapperIndex);

    return new MultiMapperRecordReader<K, V>(reader);
}
 
Example 12
Source File: TestMRSequenceFileInputFilter.java    From big-c with Apache License 2.0
private int countRecords(int numSplits) 
    throws IOException, InterruptedException {
  InputFormat<Text, BytesWritable> format =
    new SequenceFileInputFilter<Text, BytesWritable>();
  if (numSplits == 0) {
    numSplits =
      random.nextInt(MAX_LENGTH / (SequenceFile.SYNC_INTERVAL / 20)) + 1;
  }
  FileInputFormat.setMaxInputSplitSize(job, 
    fs.getFileStatus(inFile).getLen() / numSplits);
  TaskAttemptContext context = MapReduceTestUtil.
    createDummyMapTaskAttemptContext(job.getConfiguration());
  // check each split
  int count = 0;
  for (InputSplit split : format.getSplits(job)) {
    RecordReader<Text, BytesWritable> reader =
      format.createRecordReader(split, context);
    MapContext<Text, BytesWritable, Text, BytesWritable> mcontext = 
      new MapContextImpl<Text, BytesWritable, Text, BytesWritable>(
      job.getConfiguration(), 
      context.getTaskAttemptID(), reader, null, null, 
      MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mcontext);
    try {
      while (reader.nextKeyValue()) {
        LOG.info("Accept record " + reader.getCurrentKey().toString());
        count++;
      }
    } finally {
      reader.close();
    }
  }
  return count;
}
 
Example 13
Source File: TestCombineFileInputFormat.java    From big-c with Apache License 2.0
@Test
public void testRecordReaderInit() throws InterruptedException, IOException {
  // Test that we properly initialize the child recordreader when
  // CombineFileInputFormat and CombineFileRecordReader are used.

  TaskAttemptID taskId = new TaskAttemptID("jt", 0, TaskType.MAP, 0, 0);
  Configuration conf1 = new Configuration();
  conf1.set(DUMMY_KEY, "STATE1");
  TaskAttemptContext context1 = new TaskAttemptContextImpl(conf1, taskId);

  // This will create a CombineFileRecordReader that itself contains a
  // DummyRecordReader.
  InputFormat inputFormat = new ChildRRInputFormat();

  Path [] files = { new Path("file1") };
  long [] lengths = { 1 };

  CombineFileSplit split = new CombineFileSplit(files, lengths);

  RecordReader rr = inputFormat.createRecordReader(split, context1);
  assertTrue("Unexpected RR type!", rr instanceof CombineFileRecordReader);

  // Verify that the initial configuration is the one being used.
  // Right after construction the dummy key should have value "STATE1"
  assertEquals("Invalid initial dummy key value", "STATE1",
    rr.getCurrentKey().toString());

  // Switch the active context for the RecordReader...
  Configuration conf2 = new Configuration();
  conf2.set(DUMMY_KEY, "STATE2");
  TaskAttemptContext context2 = new TaskAttemptContextImpl(conf2, taskId);
  rr.initialize(split, context2);

  // And verify that the new context is updated into the child record reader.
  assertEquals("Invalid secondary dummy key value", "STATE2",
    rr.getCurrentKey().toString());
}
 
Example 14
Source File: TestCombineFileInputFormat.java    From big-c with Apache License 2.0
@Test
public void testReinit() throws Exception {
  // Test that a split containing multiple files works correctly,
  // with the child RecordReader getting its initialize() method
  // called a second time.
  TaskAttemptID taskId = new TaskAttemptID("jt", 0, TaskType.MAP, 0, 0);
  Configuration conf = new Configuration();
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskId);

  // This will create a CombineFileRecordReader that itself contains a
  // DummyRecordReader.
  InputFormat inputFormat = new ChildRRInputFormat();

  Path [] files = { new Path("file1"), new Path("file2") };
  long [] lengths = { 1, 1 };

  CombineFileSplit split = new CombineFileSplit(files, lengths);
  RecordReader rr = inputFormat.createRecordReader(split, context);
  assertTrue("Unexpected RR type!", rr instanceof CombineFileRecordReader);

  // first initialize() call comes from MapTask. We'll do it here.
  rr.initialize(split, context);

  // First value is first filename.
  assertTrue(rr.nextKeyValue());
  assertEquals("file1", rr.getCurrentValue().toString());

  // The inner RR will return false, because it only emits one (k, v) pair.
  // But there's another sub-split to process. This returns true to us.
  assertTrue(rr.nextKeyValue());
  
  // And the 2nd rr will have its initialize method called correctly.
  assertEquals("file2", rr.getCurrentValue().toString());
  
  // But after both child RR's have returned their singleton (k, v), this
  // should also return false.
  assertFalse(rr.nextKeyValue());
}
 
Example 15
Source File: InputSampler.java    From big-c with Apache License 2.0
/**
 * Randomize the split order, then take the specified number of keys from
 * each split sampled, where each key is selected with the specified
 * probability and possibly replaced by a subsequently selected key when
 * the quota of keys from that split is satisfied.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, Job job) 
    throws IOException, InterruptedException {
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());

  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);
  LOG.debug("seed: " + seed);
  // shuffle splits
  for (int i = 0; i < splits.size(); ++i) {
    InputSplit tmp = splits.get(i);
    int j = r.nextInt(splits.size());
    splits.set(i, splits.get(j));
    splits.set(j, tmp);
  }
  // our target rate is in terms of the maximum number of sample splits,
  // but we accept the possibility of sampling additional splits to hit
  // the target sample keyset
  for (int i = 0; i < splitsToSample ||
                 (i < splits.size() && samples.size() < numSamples); ++i) {
    TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
        job.getConfiguration(), new TaskAttemptID());
    RecordReader<K,V> reader = inf.createRecordReader(
        splits.get(i), samplingContext);
    reader.initialize(splits.get(i), samplingContext);
    while (reader.nextKeyValue()) {
      if (r.nextDouble() <= freq) {
        if (samples.size() < numSamples) {
          samples.add(ReflectionUtils.copy(job.getConfiguration(),
                                           reader.getCurrentKey(), null));
        } else {
          // When exceeding the maximum number of samples, replace a
          // random element with this one, then adjust the frequency
          // to reflect the possibility of existing elements being
          // pushed out
          int ind = r.nextInt(numSamples);
          if (ind != numSamples) {
            samples.set(ind, ReflectionUtils.copy(job.getConfiguration(),
                             reader.getCurrentKey(), null));
          }
          freq *= (numSamples - 1) / (double) numSamples;
        }
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
 
Example 16
Source File: TestMRSequenceFileAsBinaryOutputFormat.java    From hadoop with Apache License 2.0
public void testBinary() throws IOException, InterruptedException {
  Configuration conf = new Configuration();
  Job job = Job.getInstance(conf);
  
  Path outdir = new Path(System.getProperty("test.build.data", "/tmp"),
                  "outseq");
  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);

  FileOutputFormat.setOutputPath(job, outdir);

  SequenceFileAsBinaryOutputFormat.setSequenceFileOutputKeyClass(job, 
                                        IntWritable.class );
  SequenceFileAsBinaryOutputFormat.setSequenceFileOutputValueClass(job, 
                                        DoubleWritable.class ); 

  SequenceFileAsBinaryOutputFormat.setCompressOutput(job, true);
  SequenceFileAsBinaryOutputFormat.setOutputCompressionType(job, 
                                                     CompressionType.BLOCK);

  BytesWritable bkey = new BytesWritable();
  BytesWritable bval = new BytesWritable();

  TaskAttemptContext context = 
    MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
  OutputFormat<BytesWritable, BytesWritable> outputFormat = 
    new SequenceFileAsBinaryOutputFormat();
  OutputCommitter committer = outputFormat.getOutputCommitter(context);
  committer.setupJob(job);
  RecordWriter<BytesWritable, BytesWritable> writer = outputFormat.
    getRecordWriter(context);

  IntWritable iwritable = new IntWritable();
  DoubleWritable dwritable = new DoubleWritable();
  DataOutputBuffer outbuf = new DataOutputBuffer();
  LOG.info("Creating data by SequenceFileAsBinaryOutputFormat");
  try {
    for (int i = 0; i < RECORDS; ++i) {
      iwritable = new IntWritable(r.nextInt());
      iwritable.write(outbuf);
      bkey.set(outbuf.getData(), 0, outbuf.getLength());
      outbuf.reset();
      dwritable = new DoubleWritable(r.nextDouble());
      dwritable.write(outbuf);
      bval.set(outbuf.getData(), 0, outbuf.getLength());
      outbuf.reset();
      writer.write(bkey, bval);
    }
  } finally {
    writer.close(context);
  }
  committer.commitTask(context);
  committer.commitJob(job);

  InputFormat<IntWritable, DoubleWritable> iformat =
    new SequenceFileInputFormat<IntWritable, DoubleWritable>();
  int count = 0;
  r.setSeed(seed);
  SequenceFileInputFormat.setInputPaths(job, outdir);
  LOG.info("Reading data by SequenceFileInputFormat");
  for (InputSplit split : iformat.getSplits(job)) {
    RecordReader<IntWritable, DoubleWritable> reader =
      iformat.createRecordReader(split, context);
    MapContext<IntWritable, DoubleWritable, BytesWritable, BytesWritable> 
      mcontext = new MapContextImpl<IntWritable, DoubleWritable,
        BytesWritable, BytesWritable>(job.getConfiguration(), 
        context.getTaskAttemptID(), reader, null, null, 
        MapReduceTestUtil.createDummyReporter(), 
        split);
    reader.initialize(split, mcontext);
    try {
      int sourceInt;
      double sourceDouble;
      while (reader.nextKeyValue()) {
        sourceInt = r.nextInt();
        sourceDouble = r.nextDouble();
        iwritable = reader.getCurrentKey();
        dwritable = reader.getCurrentValue();
        assertEquals(
            "Keys don't match: " + "*" + iwritable.get() + ":" + 
                                         sourceInt + "*",
            sourceInt, iwritable.get());
        assertTrue(
            "Vals don't match: " + "*" + dwritable.get() + ":" +
                                         sourceDouble + "*",
            Double.compare(dwritable.get(), sourceDouble) == 0 );
        ++count;
      }
    } finally {
      reader.close();
    }
  }
  assertEquals("Some records not found", RECORDS, count);
}
 
Example 17
Source File: MneMapreduceChunkDataTest.java    From mnemonic with Apache License 2.0
@Test(enabled = true, dependsOnMethods = { "testWriteChunkData" })
public void testReadChunkData() throws Exception {
  List<String> partfns = new ArrayList<String>();
  long reccnt = 0L;
  long tsize = 0L;
  Checksum cs = new CRC32();
  cs.reset();
  File folder = new File(m_workdir.toString());
  File[] listfiles = folder.listFiles();
  for (int idx = 0; idx < listfiles.length; ++idx) {
    if (listfiles[idx].isFile()
        && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null))
        && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) {
      partfns.add(listfiles[idx].getName());
    }
  }
  Collections.sort(partfns); // keep the order for checksum
  for (int idx = 0; idx < partfns.size(); ++idx) {
    System.out.println(String.format("Verifying : %s", partfns.get(idx)));
    FileSplit split = new FileSplit(
        new Path(m_workdir, partfns.get(idx)), 0, 0L, new String[0]);
    InputFormat<NullWritable, MneDurableInputValue<DurableChunk<?>>> inputFormat =
        new MneInputFormat<MneDurableInputValue<DurableChunk<?>>, DurableChunk<?>>();
    RecordReader<NullWritable, MneDurableInputValue<DurableChunk<?>>> reader =
        inputFormat.createRecordReader(split, m_tacontext);
    MneDurableInputValue<DurableChunk<?>> dchkval = null;
    while (reader.nextKeyValue()) {
      dchkval = reader.getCurrentValue();
      byte b;
      for (int j = 0; j < dchkval.getValue().getSize(); ++j) {
        b = unsafe.getByte(dchkval.getValue().get() + j);
        cs.update(b);
      }
      tsize += dchkval.getValue().getSize();
      ++reccnt;
    }
    reader.close();
  }
  AssertJUnit.assertEquals(m_reccnt, reccnt);
  AssertJUnit.assertEquals(m_totalsize, tsize);
  AssertJUnit.assertEquals(m_checksum, cs.getValue());
  System.out.println(String.format("The checksum of chunk is %d", m_checksum));
}
 
Example 18
Source File: TestMRSequenceFileAsBinaryOutputFormat.java    From big-c with Apache License 2.0
public void testBinary() throws IOException, InterruptedException {
  Configuration conf = new Configuration();
  Job job = Job.getInstance(conf);
  
  Path outdir = new Path(System.getProperty("test.build.data", "/tmp"),
                  "outseq");
  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);

  FileOutputFormat.setOutputPath(job, outdir);

  SequenceFileAsBinaryOutputFormat.setSequenceFileOutputKeyClass(job, 
                                        IntWritable.class );
  SequenceFileAsBinaryOutputFormat.setSequenceFileOutputValueClass(job, 
                                        DoubleWritable.class ); 

  SequenceFileAsBinaryOutputFormat.setCompressOutput(job, true);
  SequenceFileAsBinaryOutputFormat.setOutputCompressionType(job, 
                                                     CompressionType.BLOCK);

  BytesWritable bkey = new BytesWritable();
  BytesWritable bval = new BytesWritable();

  TaskAttemptContext context = 
    MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
  OutputFormat<BytesWritable, BytesWritable> outputFormat = 
    new SequenceFileAsBinaryOutputFormat();
  OutputCommitter committer = outputFormat.getOutputCommitter(context);
  committer.setupJob(job);
  RecordWriter<BytesWritable, BytesWritable> writer = outputFormat.
    getRecordWriter(context);

  IntWritable iwritable = new IntWritable();
  DoubleWritable dwritable = new DoubleWritable();
  DataOutputBuffer outbuf = new DataOutputBuffer();
  LOG.info("Creating data by SequenceFileAsBinaryOutputFormat");
  try {
    for (int i = 0; i < RECORDS; ++i) {
      iwritable = new IntWritable(r.nextInt());
      iwritable.write(outbuf);
      bkey.set(outbuf.getData(), 0, outbuf.getLength());
      outbuf.reset();
      dwritable = new DoubleWritable(r.nextDouble());
      dwritable.write(outbuf);
      bval.set(outbuf.getData(), 0, outbuf.getLength());
      outbuf.reset();
      writer.write(bkey, bval);
    }
  } finally {
    writer.close(context);
  }
  committer.commitTask(context);
  committer.commitJob(job);

  InputFormat<IntWritable, DoubleWritable> iformat =
    new SequenceFileInputFormat<IntWritable, DoubleWritable>();
  int count = 0;
  r.setSeed(seed);
  SequenceFileInputFormat.setInputPaths(job, outdir);
  LOG.info("Reading data by SequenceFileInputFormat");
  for (InputSplit split : iformat.getSplits(job)) {
    RecordReader<IntWritable, DoubleWritable> reader =
      iformat.createRecordReader(split, context);
    MapContext<IntWritable, DoubleWritable, BytesWritable, BytesWritable> 
      mcontext = new MapContextImpl<IntWritable, DoubleWritable,
        BytesWritable, BytesWritable>(job.getConfiguration(), 
        context.getTaskAttemptID(), reader, null, null, 
        MapReduceTestUtil.createDummyReporter(), 
        split);
    reader.initialize(split, mcontext);
    try {
      int sourceInt;
      double sourceDouble;
      while (reader.nextKeyValue()) {
        sourceInt = r.nextInt();
        sourceDouble = r.nextDouble();
        iwritable = reader.getCurrentKey();
        dwritable = reader.getCurrentValue();
        assertEquals(
            "Keys don't match: " + "*" + iwritable.get() + ":" + 
                                         sourceInt + "*",
            sourceInt, iwritable.get());
        assertTrue(
            "Vals don't match: " + "*" + dwritable.get() + ":" +
                                         sourceDouble + "*",
            Double.compare(dwritable.get(), sourceDouble) == 0 );
        ++count;
      }
    } finally {
      reader.close();
    }
  }
  assertEquals("Some records not found", RECORDS, count);
}
 
Example 19
Source File: TestCombineSequenceFileInputFormat.java    From hadoop with Apache License 2.0
@Test(timeout=10000)
public void testFormat() throws IOException, InterruptedException {
  Job job = Job.getInstance(conf);

  Random random = new Random();
  long seed = random.nextLong();
  random.setSeed(seed);

  localFs.delete(workDir, true);
  FileInputFormat.setInputPaths(job, workDir);

  final int length = 10000;
  final int numFiles = 10;

  // create files with a variety of lengths
  createFiles(length, numFiles, random, job);

  TaskAttemptContext context = MapReduceTestUtil.
    createDummyMapTaskAttemptContext(job.getConfiguration());
  // create a combine split for the files
  InputFormat<IntWritable,BytesWritable> format =
    new CombineSequenceFileInputFormat<IntWritable,BytesWritable>();
  for (int i = 0; i < 3; i++) {
    int numSplits =
      random.nextInt(length/(SequenceFile.SYNC_INTERVAL/20)) + 1;
    LOG.info("splitting: requesting = " + numSplits);
    List<InputSplit> splits = format.getSplits(job);
    LOG.info("splitting: got =        " + splits.size());

    // we should have a single split as the length is comfortably smaller than
    // the block size
    assertEquals("We got more than one splits!", 1, splits.size());
    InputSplit split = splits.get(0);
    assertEquals("It should be CombineFileSplit",
      CombineFileSplit.class, split.getClass());

    // check the split
    BitSet bits = new BitSet(length);
    RecordReader<IntWritable,BytesWritable> reader =
      format.createRecordReader(split, context);
    MapContext<IntWritable,BytesWritable,IntWritable,BytesWritable> mcontext =
      new MapContextImpl<IntWritable,BytesWritable,IntWritable,BytesWritable>(job.getConfiguration(),
      context.getTaskAttemptID(), reader, null, null,
      MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mcontext);
    assertEquals("reader class is CombineFileRecordReader.",
      CombineFileRecordReader.class, reader.getClass());

    try {
      while (reader.nextKeyValue()) {
        IntWritable key = reader.getCurrentKey();
        BytesWritable value = reader.getCurrentValue();
        assertNotNull("Value should not be null.", value);
        final int k = key.get();
        LOG.debug("read " + k);
        assertFalse("Key in multiple partitions.", bits.get(k));
        bits.set(k);
      }
    } finally {
      reader.close();
    }
    assertEquals("Some keys in no partition.", length, bits.cardinality());
  }
}
 
Example 20
Source File: TestMRSequenceFileAsBinaryInputFormat.java    From hadoop with Apache License 2.0
public void testBinary() throws IOException, InterruptedException {
  Job job = Job.getInstance();
  FileSystem fs = FileSystem.getLocal(job.getConfiguration());
  Path dir = new Path(System.getProperty("test.build.data",".") + "/mapred");
  Path file = new Path(dir, "testbinary.seq");
  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);

  fs.delete(dir, true);
  FileInputFormat.setInputPaths(job, dir);

  Text tkey = new Text();
  Text tval = new Text();

  SequenceFile.Writer writer = new SequenceFile.Writer(fs,
    job.getConfiguration(), file, Text.class, Text.class);
  try {
    for (int i = 0; i < RECORDS; ++i) {
      tkey.set(Integer.toString(r.nextInt(), 36));
      tval.set(Long.toString(r.nextLong(), 36));
      writer.append(tkey, tval);
    }
  } finally {
    writer.close();
  }
  TaskAttemptContext context = MapReduceTestUtil.
    createDummyMapTaskAttemptContext(job.getConfiguration());
  InputFormat<BytesWritable,BytesWritable> bformat =
    new SequenceFileAsBinaryInputFormat();

  int count = 0;
  r.setSeed(seed);
  BytesWritable bkey = new BytesWritable();
  BytesWritable bval = new BytesWritable();
  Text cmpkey = new Text();
  Text cmpval = new Text();
  DataInputBuffer buf = new DataInputBuffer();
  FileInputFormat.setInputPaths(job, file);
  for (InputSplit split : bformat.getSplits(job)) {
    RecordReader<BytesWritable, BytesWritable> reader =
          bformat.createRecordReader(split, context);
    MapContext<BytesWritable, BytesWritable, BytesWritable, BytesWritable> 
      mcontext = new MapContextImpl<BytesWritable, BytesWritable,
        BytesWritable, BytesWritable>(job.getConfiguration(), 
        context.getTaskAttemptID(), reader, null, null, 
        MapReduceTestUtil.createDummyReporter(), 
        split);
    reader.initialize(split, mcontext);
    try {
      while (reader.nextKeyValue()) {
        bkey = reader.getCurrentKey();
        bval = reader.getCurrentValue();
        tkey.set(Integer.toString(r.nextInt(), 36));
        tval.set(Long.toString(r.nextLong(), 36));
        buf.reset(bkey.getBytes(), bkey.getLength());
        cmpkey.readFields(buf);
        buf.reset(bval.getBytes(), bval.getLength());
        cmpval.readFields(buf);
        assertTrue(
          "Keys don't match: " + "*" + cmpkey.toString() + ":" +
          tkey.toString() + "*",
          cmpkey.toString().equals(tkey.toString()));
        assertTrue(
          "Vals don't match: " + "*" + cmpval.toString() + ":" +
          tval.toString() + "*",
          cmpval.toString().equals(tval.toString()));
        ++count;
      }
    } finally {
      reader.close();
    }
  }
  assertEquals("Some records not found", RECORDS, count);
}