Java Code Examples for org.apache.hadoop.mapreduce.RecordReader#close()

The following examples show how to use org.apache.hadoop.mapreduce.RecordReader#close(). You can vote up the examples you like or vote down the ones you don't, and follow the links above each example to go to the original project or source file. You may also check out related API usage in the sidebar.
Example 1
Source File: EthereumFormatHadoopTest.java    From hadoopcryptoledger with Apache License 2.0 6 votes vote down vote up
/**
 * Reads the {@code eth1346406.bin.gz} test resource through
 * {@link EthereumBlockFileInputFormat} and checks that the single split
 * yields exactly one block containing six transactions.
 */
@Test
public void readEthereumBlockInputFormatBlock1346406GzipCompressed() throws IOException, EthereumBlockReadException, ParseException, InterruptedException {
	Configuration conf = new Configuration(defaultConf);
	ClassLoader classLoader = getClass().getClassLoader();
	String fileName="eth1346406.bin.gz";
	String fileNameBlock=classLoader.getResource("testdata/"+fileName).getFile();
	Path file = new Path(fileNameBlock);
	Job job = Job.getInstance(conf);
	FileInputFormat.setInputPaths(job, file);
	EthereumBlockFileInputFormat format = new EthereumBlockFileInputFormat();

	List<InputSplit> splits = format.getSplits(job);
	TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
	assertEquals( 1, splits.size(),"Only one split generated for block 1346406");
	RecordReader<BytesWritable, EthereumBlock> reader = format.createRecordReader(splits.get(0), context);
	assertNotNull( reader,"Format returned  null RecordReader");
	reader.initialize(splits.get(0),context);
	// Close the reader even when one of the assertions below fails.
	try {
		assertTrue( reader.nextKeyValue(),"Input Split for block 1346406 contains at least one block");
		// The key is not inspected; it is fetched only to exercise the accessor.
		reader.getCurrentKey();
		EthereumBlock block = reader.getCurrentValue();
		assertEquals( 6, block.getEthereumTransactions().size(),"Block 1346406 must have 6 transactions");
		assertFalse( reader.nextKeyValue(),"No further blocks in block 1346406");
	} finally {
		reader.close();
	}
}
 
Example 2
Source File: EthereumFormatHadoopTest.java    From hadoopcryptoledger with Apache License 2.0 6 votes vote down vote up
/**
 * Reads the {@code eth1346406.bin} test resource through
 * {@link EthereumBlockFileInputFormat} and checks that the single split
 * yields exactly one block containing six transactions.
 */
@Test
public void readEthereumBlockInputFormatBlock1346406() throws IOException, EthereumBlockReadException, ParseException, InterruptedException {
	Configuration conf = new Configuration(defaultConf);
	ClassLoader classLoader = getClass().getClassLoader();
	String fileName="eth1346406.bin";
	String fileNameBlock=classLoader.getResource("testdata/"+fileName).getFile();
	Path file = new Path(fileNameBlock);
	Job job = Job.getInstance(conf);
	FileInputFormat.setInputPaths(job, file);
	EthereumBlockFileInputFormat format = new EthereumBlockFileInputFormat();

	List<InputSplit> splits = format.getSplits(job);
	TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
	assertEquals( 1, splits.size(),"Only one split generated for block 1346406");
	RecordReader<BytesWritable, EthereumBlock> reader = format.createRecordReader(splits.get(0), context);
	assertNotNull( reader,"Format returned  null RecordReader");
	reader.initialize(splits.get(0),context);
	// Close the reader even when one of the assertions below fails.
	try {
		assertTrue( reader.nextKeyValue(),"Input Split for block 1346406 contains at least one block");
		// The key is not inspected; it is fetched only to exercise the accessor.
		reader.getCurrentKey();
		EthereumBlock block = reader.getCurrentValue();
		assertEquals( 6, block.getEthereumTransactions().size(),"Block 1346406 must have 6 transactions");
		assertFalse( reader.nextKeyValue(),"No further blocks in block 1346406");
	} finally {
		reader.close();
	}
}
 
Example 3
Source File: EthereumFormatHadoopTest.java    From hadoopcryptoledger with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the {@code block403419.bin} test resource through
 * {@link EthereumBlockFileInputFormat} and checks the transaction count,
 * the coinbase and the parent hash of the single block it contains.
 */
@Test
public void readEthereumBlockInputFormatBlock403419() throws IOException, EthereumBlockReadException, ParseException, InterruptedException {
	Configuration conf = new Configuration(defaultConf);
	ClassLoader classLoader = getClass().getClassLoader();
	String fileName="block403419.bin";
	String fileNameBlock=classLoader.getResource("testdata/"+fileName).getFile();
	Path file = new Path(fileNameBlock);
	Job job = Job.getInstance(conf);
	FileInputFormat.setInputPaths(job, file);
	EthereumBlockFileInputFormat format = new EthereumBlockFileInputFormat();

	List<InputSplit> splits = format.getSplits(job);
	TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
	assertEquals( 1, splits.size(),"Only one split generated for block 403419");
	RecordReader<BytesWritable, EthereumBlock> reader = format.createRecordReader(splits.get(0), context);
	assertNotNull( reader,"Format returned  null RecordReader");
	reader.initialize(splits.get(0),context);
	// Close the reader even when one of the assertions below fails.
	try {
		assertTrue( reader.nextKeyValue(),"Input Split for block 403419 contains at least one block");
		// The key is not inspected; it is fetched only to exercise the accessor.
		reader.getCurrentKey();
		EthereumBlock block = reader.getCurrentValue();
		assertEquals( 2, block.getEthereumTransactions().size(),"Block 403419 must have 2 transactions");
		EthereumBlockHeader ethereumBlockHeader = block.getEthereumBlockHeader();
		assertEquals(
				"f8b483dba2c3b7176a3da549ad41a48bb3121069",
				bytesToHex(ethereumBlockHeader.getCoinBase()).toLowerCase(),
				"Block 403419 was mined by f8b483dba2c3b7176a3da549ad41a48bb3121069"
		);
		assertEquals(
				"08741fa532c05804d9c1086a311e47cc024bbc43980f561041ad1fbb3c223322",
				bytesToHex(ethereumBlockHeader.getParentHash()).toLowerCase(),
				"The parent of block 403419 has hash 08741fa532c05804d9c1086a311e47cc024bbc43980f561041ad1fbb3c223322"
		);
		assertFalse( reader.nextKeyValue(),"No further blocks in block 403419");
	} finally {
		reader.close();
	}
}
 
Example 4
Source File: InputSampler.java    From big-c with Apache License 2.0 5 votes vote down vote up
/**
 * From each split sampled, take the first numSamples / numSplits records.
 * The per-split quota is cumulative: reading of split {@code i} stops once
 * the total number of keys collected reaches {@code (i+1) * samplesPerSplit}.
 *
 * @param inf input format used to compute the splits and create the readers
 * @param job job whose configuration is used for reading and copying keys
 * @return the sampled keys; empty when there is no split to sample
 * @throws IOException if computing the splits or reading a split fails
 * @throws InterruptedException if reading is interrupted
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, Job job) 
    throws IOException, InterruptedException {
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());
  if (splitsToSample <= 0) {
    // Nothing to sample; this also avoids the division by zero below.
    return (K[])samples.toArray();
  }
  int samplesPerSplit = numSamples / splitsToSample;
  long records = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
        job.getConfiguration(), new TaskAttemptID());
    RecordReader<K,V> reader = inf.createRecordReader(
        splits.get(i), samplingContext);
    reader.initialize(splits.get(i), samplingContext);
    // Release the reader even if reading or copying a key fails.
    try {
      while (reader.nextKeyValue()) {
        samples.add(ReflectionUtils.copy(job.getConfiguration(),
                                         reader.getCurrentKey(), null));
        ++records;
        if ((i+1) * samplesPerSplit <= records) {
          break;
        }
      }
    } finally {
      reader.close();
    }
  }
  return (K[])samples.toArray();
}
 
Example 5
Source File: HadoopElementIterator.java    From tinkerpop with Apache License 2.0 5 votes vote down vote up
/**
 * Closes every underlying record reader. A failure on one reader does not
 * stop the remaining readers from being closed.
 *
 * @throws IllegalStateException if at least one reader failed to close; it
 *         wraps the first {@link IOException}, with any later ones attached
 *         to that exception as suppressed
 */
@Override
public void close() {
    IOException firstFailure = null;
    for (final RecordReader reader : this.readers) {
        try {
            reader.close();
        } catch (final IOException e) {
            if (firstFailure == null) {
                firstFailure = e;
            } else {
                firstFailure.addSuppressed(e);
            }
        }
    }
    if (firstFailure != null) {
        throw new IllegalStateException(firstFailure.getMessage(), firstFailure);
    }
}
 
Example 6
Source File: Chain.java    From hadoop with Apache License 2.0 5 votes vote down vote up
/**
 * Runs the mapper stored at position {@code index}, feeding it from a
 * {@code ChainRecordReader} and collecting its output through a
 * {@code ChainRecordWriter}, both built on the supplied context. The reader
 * and then the writer are closed once the mapper has finished.
 *
 * @param context task context the chain reader and writer are built on
 * @param index position of the mapper (and its configuration) in the chain
 */
@SuppressWarnings("unchecked")
void runMapper(TaskInputOutputContext context, int index) throws IOException,
    InterruptedException {
  final Mapper stage = mappers.get(index);
  final RecordReader chainReader = new ChainRecordReader(context);
  final RecordWriter chainWriter = new ChainRecordWriter(context);
  final Mapper.Context stageContext =
      createMapContext(chainReader, chainWriter, context, getConf(index));

  stage.run(stageContext);

  // Input side first, then the output side.
  chainReader.close();
  chainWriter.close(context);
}
 
Example 7
Source File: CompositeRecordReader.java    From hadoop with Apache License 2.0 5 votes vote down vote up
/**
 * Close all child RRs and, when present, the join collector.
 * Every resource is given a chance to close even if an earlier one fails.
 *
 * @throws IOException the first failure encountered while closing; any
 *         later failures are attached to it as suppressed exceptions
 */
public void close() throws IOException {
  IOException firstFailure = null;
  if (kids != null) {
    for (RecordReader<K,? extends Writable> rr : kids) {
      try {
        rr.close();
      } catch (IOException e) {
        if (firstFailure == null) {
          firstFailure = e;
        } else {
          firstFailure.addSuppressed(e);
        }
      }
    }
  }
  if (jc != null) {
    try {
      jc.close();
    } catch (IOException e) {
      if (firstFailure == null) {
        firstFailure = e;
      } else {
        firstFailure.addSuppressed(e);
      }
    }
  }
  if (firstFailure != null) {
    throw firstFailure;
  }
}
 
Example 8
Source File: InputSampler.java    From big-c with Apache License 2.0 5 votes vote down vote up
/**
 * For each split sampled, emit when the ratio of the number of records
 * retained to the total record count is less than the specified
 * frequency.
 *
 * @param inf input format used to compute the splits and create the readers
 * @param job job whose configuration is used for reading and copying keys
 * @return the keys that were retained
 * @throws IOException if computing the splits or reading a split fails
 * @throws InterruptedException if reading is interrupted
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, Job job) 
    throws IOException, InterruptedException {
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>();
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());
  long records = 0;
  long kept = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
        job.getConfiguration(), new TaskAttemptID());
    RecordReader<K,V> reader = inf.createRecordReader(
        splits.get(i), samplingContext);
    reader.initialize(splits.get(i), samplingContext);
    // Release the reader even if reading or copying a key fails.
    try {
      while (reader.nextKeyValue()) {
        // records is incremented first, so the ratio never divides by zero
        ++records;
        if ((double) kept / records < freq) {
          samples.add(ReflectionUtils.copy(job.getConfiguration(),
                               reader.getCurrentKey(), null));
          ++kept;
        }
      }
    } finally {
      reader.close();
    }
  }
  return (K[])samples.toArray();
}
 
Example 9
Source File: Chain.java    From big-c with Apache License 2.0 5 votes vote down vote up
/**
 * Executes the chain mapper found at {@code index}. Input is served by a
 * {@code ChainRecordReader} and output goes to a {@code ChainRecordWriter},
 * both wrapping the given context; both are closed after the mapper returns.
 *
 * @param context task context wrapped by the chain reader and writer
 * @param index position of the mapper (and its configuration) in the chain
 */
@SuppressWarnings("unchecked")
void runMapper(TaskInputOutputContext context, int index) throws IOException,
    InterruptedException {
  final Mapper selected = mappers.get(index);
  final RecordReader input = new ChainRecordReader(context);
  final RecordWriter output = new ChainRecordWriter(context);
  final Mapper.Context wrappedContext =
      createMapContext(input, output, context, getConf(index));

  selected.run(wrappedContext);

  // Close the reader before the writer.
  input.close();
  output.close(context);
}
 
Example 10
Source File: TestFixedLengthInputFormat.java    From big-c with Apache License 2.0 5 votes vote down vote up
/**
 * Reads every record of {@code split} with a reader obtained from
 * {@code format} and returns each value's bytes (up to its length) as a
 * String. The reader is always closed before returning.
 *
 * @param format input format that creates the record reader
 * @param split split to read
 * @param job job supplying the configuration
 * @return the record values in the order they were read
 */
private static List<String> readSplit(FixedLengthInputFormat format, 
                                      InputSplit split, 
                                      Job job) throws Exception {
  final List<String> records = new ArrayList<String>();
  final TaskAttemptContext attemptContext =
      MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
  final RecordReader<LongWritable, BytesWritable> recordReader =
      format.createRecordReader(split, attemptContext);
  final MapContext<LongWritable, BytesWritable, LongWritable, BytesWritable>
      mapContext =
      new MapContextImpl<LongWritable, BytesWritable, LongWritable,
      BytesWritable>(job.getConfiguration(), attemptContext.getTaskAttemptID(),
      recordReader, null, null, MapReduceTestUtil.createDummyReporter(), split);
  try {
    recordReader.initialize(split, mapContext);
    while (recordReader.nextKeyValue()) {
      // The key is fetched but not used by this helper.
      final LongWritable key = recordReader.getCurrentKey();
      final BytesWritable payload = recordReader.getCurrentValue();
      // Only the first getLength() bytes of the backing array are valid.
      records.add(new String(payload.getBytes(), 0, payload.getLength()));
    }
  } finally {
    recordReader.close();
  }
  return records;
}
 
Example 11
Source File: MneMapreducePersonDataTest.java    From mnemonic with Apache License 2.0 5 votes vote down vote up
/**
 * Reads back every matching output file in the work directory and checks
 * that the record count and the sum of ages equal the expected
 * {@code m_reccnt} and {@code m_sumage}.
 */
@Test(enabled = true, dependsOnMethods = { "testWritePersonData" })
public void testReadPersonData() throws Exception {
  long sumage = 0L;
  long reccnt = 0L;
  File folder = new File(m_workdir.toString());
  File[] listfiles = folder.listFiles();
  // listFiles() returns null when the path cannot be listed as a directory;
  // fail with a clear message instead of a NullPointerException below.
  AssertJUnit.assertNotNull("Cannot list work directory " + folder, listfiles);
  for (int idx = 0; idx < listfiles.length; ++idx) {
    if (listfiles[idx].isFile()
        && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null))
        && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) {
      System.out.println(String.format("Verifying : %s", listfiles[idx].getName()));
      FileSplit split = new FileSplit(
          new Path(m_workdir, listfiles[idx].getName()), 0, 0L, new String[0]);
      InputFormat<NullWritable, MneDurableInputValue<Person<Long>>> inputFormat =
          new MneInputFormat<MneDurableInputValue<Person<Long>>, Person<Long>>();
      RecordReader<NullWritable, MneDurableInputValue<Person<Long>>> reader =
          inputFormat.createRecordReader(split, m_tacontext);
      // Close the reader even when an assertion inside the loop fails.
      try {
        MneDurableInputValue<Person<Long>> personval = null;
        while (reader.nextKeyValue()) {
          personval = reader.getCurrentValue();
          AssertJUnit.assertTrue(personval.getValue().getAge() < 51);
          sumage += personval.getValue().getAge();
          ++reccnt;
        }
      } finally {
        reader.close();
      }
    }
  }
  AssertJUnit.assertEquals(m_reccnt, reccnt);
  AssertJUnit.assertEquals(m_sumage, sumage);
  System.out.println(String.format("The checksum of ages is %d", sumage));
}
 
Example 12
Source File: InputSampler.java    From hadoop with Apache License 2.0 4 votes vote down vote up
/**
 * Randomize the split order, then take the specified number of keys from
 * each split sampled, where each key is selected with the specified
 * probability and possibly replaced by a subsequently selected key when
 * the quota of keys from that split is satisfied.
 *
 * @param inf input format used to compute the splits and create the readers
 * @param job job whose configuration is used for reading and copying keys
 * @return the sampled keys
 * @throws IOException if computing the splits or reading a split fails
 * @throws InterruptedException if reading is interrupted
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, Job job) 
    throws IOException, InterruptedException {
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());

  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);
  LOG.debug("seed: " + seed);
  // shuffle splits
  for (int i = 0; i < splits.size(); ++i) {
    InputSplit tmp = splits.get(i);
    int j = r.nextInt(splits.size());
    splits.set(i, splits.get(j));
    splits.set(j, tmp);
  }
  // our target rate is in terms of the maximum number of sample splits,
  // but we accept the possibility of sampling additional splits to hit
  // the target sample keyset
  for (int i = 0; i < splitsToSample ||
                 (i < splits.size() && samples.size() < numSamples); ++i) {
    TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
        job.getConfiguration(), new TaskAttemptID());
    RecordReader<K,V> reader = inf.createRecordReader(
        splits.get(i), samplingContext);
    reader.initialize(splits.get(i), samplingContext);
    // Release the reader even if reading or copying a key fails.
    try {
      while (reader.nextKeyValue()) {
        if (r.nextDouble() <= freq) {
          if (samples.size() < numSamples) {
            samples.add(ReflectionUtils.copy(job.getConfiguration(),
                                             reader.getCurrentKey(), null));
          } else {
            // When exceeding the maximum number of samples, replace a
            // random element with this one, then adjust the frequency
            // to reflect the possibility of existing elements being
            // pushed out
            // nextInt(numSamples) is always < numSamples, so the former
            // "ind != numSamples" guard was always true and is dropped.
            int ind = r.nextInt(numSamples);
            samples.set(ind, ReflectionUtils.copy(job.getConfiguration(),
                             reader.getCurrentKey(), null));
            freq *= (numSamples - 1) / (double) numSamples;
          }
        }
      }
    } finally {
      reader.close();
    }
  }
  return (K[])samples.toArray();
}
 
Example 13
Source File: TestCombineSequenceFileInputFormat.java    From hadoop with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a set of files in the work directory, then three times over asks
 * CombineSequenceFileInputFormat for splits, expects a single
 * CombineFileSplit, and reads it back checking that every key in
 * {@code [0, length)} is seen exactly once.
 */
@Test(timeout=10000)
public void testFormat() throws IOException, InterruptedException {
  final Job job = Job.getInstance(conf);

  // Draw a seed from a fresh generator, then reseed the generator with it.
  final Random random = new Random();
  final long seed = random.nextLong();
  random.setSeed(seed);

  localFs.delete(workDir, true);
  FileInputFormat.setInputPaths(job, workDir);

  final int length = 10000;
  final int numFiles = 10;

  // populate the work directory with files of varying lengths
  createFiles(length, numFiles, random, job);

  final TaskAttemptContext context =
    MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
  // the format under test, which combines the files into one split
  final InputFormat<IntWritable,BytesWritable> format =
    new CombineSequenceFileInputFormat<IntWritable,BytesWritable>();
  int round = 0;
  while (round < 3) {
    final int numSplits =
      random.nextInt(length/(SequenceFile.SYNC_INTERVAL/20)) + 1;
    LOG.info("splitting: requesting = " + numSplits);
    final List<InputSplit> splits = format.getSplits(job);
    LOG.info("splitting: got =        " + splits.size());

    // the total length is comfortably below the block size, so exactly one
    // split is expected
    assertEquals("We got more than one splits!", 1, splits.size());
    final InputSplit split = splits.get(0);
    assertEquals("It should be CombineFileSplit",
      CombineFileSplit.class, split.getClass());

    // read the split back and record which keys were seen
    final BitSet seen = new BitSet(length);
    final RecordReader<IntWritable,BytesWritable> reader =
      format.createRecordReader(split, context);
    final MapContext<IntWritable,BytesWritable,IntWritable,BytesWritable> mcontext =
      new MapContextImpl<IntWritable,BytesWritable,IntWritable,BytesWritable>(
        job.getConfiguration(), context.getTaskAttemptID(), reader, null, null,
        MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mcontext);
    assertEquals("reader class is CombineFileRecordReader.",
      CombineFileRecordReader.class, reader.getClass());

    try {
      while (reader.nextKeyValue()) {
        final IntWritable currentKey = reader.getCurrentKey();
        final BytesWritable currentValue = reader.getCurrentValue();
        assertNotNull("Value should not be null.", currentValue);
        final int k = currentKey.get();
        LOG.debug("read " + k);
        assertFalse("Key in multiple partitions.", seen.get(k));
        seen.set(k);
      }
    } finally {
      reader.close();
    }
    assertEquals("Some keys in no partition.", length, seen.cardinality());
    round++;
  }
}
 
Example 14
Source File: TestCombineSequenceFileInputFormat.java    From big-c with Apache License 2.0 4 votes vote down vote up
/**
 * Writes a batch of files into the work directory and, in three rounds,
 * requests splits from CombineSequenceFileInputFormat, expects exactly one
 * CombineFileSplit, and verifies by reading it that each key in
 * {@code [0, length)} appears exactly once.
 */
@Test(timeout=10000)
public void testFormat() throws IOException, InterruptedException {
  final Job job = Job.getInstance(conf);

  // Draw a seed from a fresh generator, then reseed the generator with it.
  final Random random = new Random();
  final long seed = random.nextLong();
  random.setSeed(seed);

  localFs.delete(workDir, true);
  FileInputFormat.setInputPaths(job, workDir);

  final int length = 10000;
  final int numFiles = 10;

  // populate the work directory with files of varying lengths
  createFiles(length, numFiles, random, job);

  final TaskAttemptContext context =
    MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
  // the format under test, which combines the files into one split
  final InputFormat<IntWritable,BytesWritable> format =
    new CombineSequenceFileInputFormat<IntWritable,BytesWritable>();
  int round = 0;
  while (round < 3) {
    final int numSplits =
      random.nextInt(length/(SequenceFile.SYNC_INTERVAL/20)) + 1;
    LOG.info("splitting: requesting = " + numSplits);
    final List<InputSplit> splits = format.getSplits(job);
    LOG.info("splitting: got =        " + splits.size());

    // the total length is comfortably below the block size, so exactly one
    // split is expected
    assertEquals("We got more than one splits!", 1, splits.size());
    final InputSplit split = splits.get(0);
    assertEquals("It should be CombineFileSplit",
      CombineFileSplit.class, split.getClass());

    // read the split back and record which keys were seen
    final BitSet seen = new BitSet(length);
    final RecordReader<IntWritable,BytesWritable> reader =
      format.createRecordReader(split, context);
    final MapContext<IntWritable,BytesWritable,IntWritable,BytesWritable> mcontext =
      new MapContextImpl<IntWritable,BytesWritable,IntWritable,BytesWritable>(
        job.getConfiguration(), context.getTaskAttemptID(), reader, null, null,
        MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mcontext);
    assertEquals("reader class is CombineFileRecordReader.",
      CombineFileRecordReader.class, reader.getClass());

    try {
      while (reader.nextKeyValue()) {
        final IntWritable currentKey = reader.getCurrentKey();
        final BytesWritable currentValue = reader.getCurrentValue();
        assertNotNull("Value should not be null.", currentValue);
        final int k = currentKey.get();
        LOG.debug("read " + k);
        assertFalse("Key in multiple partitions.", seen.get(k));
        seen.set(k);
      }
    } finally {
      reader.close();
    }
    assertEquals("Some keys in no partition.", length, seen.cardinality());
    round++;
  }
}
 
Example 15
Source File: TestMRKeyValueTextInputFormat.java    From hadoop with Apache License 2.0 4 votes vote down vote up
/**
 * Writes BZip2-compressed files of tab-separated "2*i&lt;TAB&gt;i" lines for
 * a range of lengths and checks that KeyValueTextInputFormat reports the
 * file as splittable and that, across all splits, every value is read
 * exactly once.
 */
@Test
public void testSplitableCodecs() throws Exception {
  final Job job = Job.getInstance(defaultConf);
  final Configuration conf = job.getConfiguration();

  // Create the codec
  CompressionCodec codec = null;
  try {
    codec = (CompressionCodec)
    ReflectionUtils.newInstance(conf.getClassByName("org.apache.hadoop.io.compress.BZip2Codec"), conf);
  } catch (ClassNotFoundException cnfe) {
    // keep the original exception as the cause so the failure is diagnosable
    throw new IOException("Illegal codec!", cnfe);
  }
  Path file = new Path(workDir, "test"+codec.getDefaultExtension());

  int seed = new Random().nextInt();
  LOG.info("seed = " + seed);
  Random random = new Random(seed);

  localFs.delete(workDir, true);
  FileInputFormat.setInputPaths(job, workDir);

  final int MAX_LENGTH = 500000;
  FileInputFormat.setMaxInputSplitSize(job, MAX_LENGTH / 20);
  // for a variety of lengths
  for (int length = 0; length < MAX_LENGTH;
       length += random.nextInt(MAX_LENGTH / 4) + 1) {

    LOG.info("creating; entries = " + length);

    // create a file with length entries
    Writer writer =
      new OutputStreamWriter(codec.createOutputStream(localFs.create(file)));
    try {
      for (int i = 0; i < length; i++) {
        writer.write(Integer.toString(i * 2));
        writer.write("\t");
        writer.write(Integer.toString(i));
        writer.write("\n");
      }
    } finally {
      writer.close();
    }

    // try splitting the file in a variety of sizes
    KeyValueTextInputFormat format = new KeyValueTextInputFormat();
    assertTrue("KVTIF claims not splittable", format.isSplitable(job, file));
    for (int i = 0; i < 3; i++) {
      // numSplits is only logged, but drawing it keeps the random sequence
      // (and therefore the generated lengths) unchanged
      int numSplits = random.nextInt(MAX_LENGTH / 2000) + 1;
      LOG.info("splitting: requesting = " + numSplits);
      List<InputSplit> splits = format.getSplits(job);
      LOG.info("splitting: got =        " + splits.size());

      // check each split
      BitSet bits = new BitSet(length);
      for (int j = 0; j < splits.size(); j++) {
        LOG.debug("split["+j+"]= " + splits.get(j));
        TaskAttemptContext context = MapReduceTestUtil.
          createDummyMapTaskAttemptContext(job.getConfiguration());
        RecordReader<Text, Text> reader = format.createRecordReader(
          splits.get(j), context);
        MapContext<Text, Text, Text, Text> mcontext =
          new MapContextImpl<Text, Text, Text, Text>(job.getConfiguration(),
          context.getTaskAttemptID(), reader, null, null,
          MapReduceTestUtil.createDummyReporter(), splits.get(j));
        reader.initialize(splits.get(j), mcontext);

        Text key = null;
        Text value = null;
        try {
          int count = 0;
          while (reader.nextKeyValue()) {
            key = reader.getCurrentKey();
            value = reader.getCurrentValue();
            final int k = Integer.parseInt(key.toString());
            final int v = Integer.parseInt(value.toString());
            assertEquals("Bad key", 0, k % 2);
            assertEquals("Mismatched key/value", k / 2, v);
            LOG.debug("read " + k + "," + v);
            assertFalse(k + "," + v + " in multiple partitions.",bits.get(v));
            bits.set(v);
            count++;
          }
          if (count > 0) {
            LOG.info("splits["+j+"]="+splits.get(j)+" count=" + count);
          } else {
            LOG.debug("splits["+j+"]="+splits.get(j)+" count=" + count);
          }
        } finally {
          reader.close();
        }
      }
      assertEquals("Some keys in no partition.", length, bits.cardinality());
    }

  }
}
 
Example 16
Source File: TestMRKeyValueTextInputFormat.java    From hadoop with Apache License 2.0 4 votes vote down vote up
/**
 * Writes files of tab-separated "2*i&lt;TAB&gt;i" lines for a range of
 * lengths and checks that reading every split produced by
 * KeyValueTextInputFormat yields Text keys and values, with each value
 * seen exactly once.
 */
@Test
public void testFormat() throws Exception {
  final Job job = Job.getInstance(new Configuration(defaultConf));
  final Path file = new Path(workDir, "test.txt");

  final int seed = new Random().nextInt();
  LOG.info("seed = " + seed);
  final Random random = new Random(seed);

  localFs.delete(workDir, true);
  FileInputFormat.setInputPaths(job, workDir);

  final int MAX_LENGTH = 10000;
  // step through a variety of lengths, growing by a random increment
  int length = 0;
  while (length < MAX_LENGTH) {

    LOG.debug("creating; entries = " + length);

    // write a file holding 'length' key/value lines
    final Writer out = new OutputStreamWriter(localFs.create(file));
    try {
      for (int line = 0; line < length; line++) {
        out.write(Integer.toString(line * 2));
        out.write("\t");
        out.write(Integer.toString(line));
        out.write("\n");
      }
    } finally {
      out.close();
    }

    // split the file three times and verify each set of splits
    final KeyValueTextInputFormat format = new KeyValueTextInputFormat();
    for (int attempt = 0; attempt < 3; attempt++) {
      final int numSplits = random.nextInt(MAX_LENGTH / 20) + 1;
      LOG.debug("splitting: requesting = " + numSplits);
      final List<InputSplit> splits = format.getSplits(job);
      LOG.debug("splitting: got =        " + splits.size());

      // read every split, marking each value that is seen
      final BitSet seen = new BitSet(length);
      for (int j = 0; j < splits.size(); j++) {
        final InputSplit split = splits.get(j);
        LOG.debug("split["+j+"]= " + split);
        final TaskAttemptContext context = MapReduceTestUtil.
          createDummyMapTaskAttemptContext(job.getConfiguration());
        final RecordReader<Text, Text> reader =
          format.createRecordReader(split, context);
        assertEquals("reader class is KeyValueLineRecordReader.", 
          KeyValueLineRecordReader.class, reader.getClass());
        final MapContext<Text, Text, Text, Text> mcontext = 
          new MapContextImpl<Text, Text, Text, Text>(job.getConfiguration(), 
          context.getTaskAttemptID(), reader, null, null, 
          MapReduceTestUtil.createDummyReporter(), split);
        reader.initialize(split, mcontext);

        try {
          int count = 0;
          while (reader.nextKeyValue()) {
            final Text key = reader.getCurrentKey();
            assertEquals("Key class is Text.", Text.class, key.getClass());
            final Text value = reader.getCurrentValue();
            assertEquals("Value class is Text.", Text.class, value.getClass());
            final int k = Integer.parseInt(key.toString());
            final int v = Integer.parseInt(value.toString());
            assertEquals("Bad key", 0, k % 2);
            assertEquals("Mismatched key/value", k / 2, v);
            LOG.debug("read " + v);
            assertFalse("Key in multiple partitions.", seen.get(v));
            seen.set(v);
            count++;
          }
          LOG.debug("splits[" + j + "]=" + split +" count=" + count);
        } finally {
          reader.close();
        }
      }
      assertEquals("Some keys in no partition.", length, seen.cardinality());
    }

    length += random.nextInt(MAX_LENGTH / 10) + 1;
  }
}
 
Example 17
Source File: TestMRSequenceFileAsBinaryInputFormat.java    From big-c with Apache License 2.0 4 votes vote down vote up
/**
 * Writes RECORDS random Text key/value pairs to a sequence file, reads them
 * back through SequenceFileAsBinaryInputFormat, deserializes the raw bytes
 * into Text, and compares them with the same pseudo-random sequence
 * regenerated from the saved seed.
 */
public void testBinary() throws IOException, InterruptedException {
  final Job job = Job.getInstance();
  final FileSystem fs = FileSystem.getLocal(job.getConfiguration());
  final Path dir = new Path(System.getProperty("test.build.data",".") + "/mapred");
  final Path file = new Path(dir, "testbinary.seq");
  final Random r = new Random();
  final long seed = r.nextLong();
  r.setSeed(seed);

  fs.delete(dir, true);
  FileInputFormat.setInputPaths(job, dir);

  final Text tkey = new Text();
  final Text tval = new Text();

  // write phase: RECORDS pairs drawn from the seeded generator
  final SequenceFile.Writer writer = new SequenceFile.Writer(fs,
    job.getConfiguration(), file, Text.class, Text.class);
  try {
    for (int i = 0; i < RECORDS; ++i) {
      tkey.set(Integer.toString(r.nextInt(), 36));
      tval.set(Long.toString(r.nextLong(), 36));
      writer.append(tkey, tval);
    }
  } finally {
    writer.close();
  }

  final TaskAttemptContext context = MapReduceTestUtil.
    createDummyMapTaskAttemptContext(job.getConfiguration());
  final InputFormat<BytesWritable,BytesWritable> bformat =
    new SequenceFileAsBinaryInputFormat();

  // read phase: rewind the generator so it replays the written sequence
  int count = 0;
  r.setSeed(seed);
  final Text cmpkey = new Text();
  final Text cmpval = new Text();
  final DataInputBuffer buf = new DataInputBuffer();
  FileInputFormat.setInputPaths(job, file);
  for (InputSplit split : bformat.getSplits(job)) {
    final RecordReader<BytesWritable, BytesWritable> reader =
          bformat.createRecordReader(split, context);
    final MapContext<BytesWritable, BytesWritable, BytesWritable, BytesWritable> 
      mcontext = new MapContextImpl<BytesWritable, BytesWritable,
        BytesWritable, BytesWritable>(job.getConfiguration(), 
        context.getTaskAttemptID(), reader, null, null, 
        MapReduceTestUtil.createDummyReporter(), 
        split);
    reader.initialize(split, mcontext);
    try {
      while (reader.nextKeyValue()) {
        final BytesWritable bkey = reader.getCurrentKey();
        final BytesWritable bval = reader.getCurrentValue();
        // expected pair, regenerated in the same order it was written
        tkey.set(Integer.toString(r.nextInt(), 36));
        tval.set(Long.toString(r.nextLong(), 36));
        // decode the raw key and value bytes back into Text
        buf.reset(bkey.getBytes(), bkey.getLength());
        cmpkey.readFields(buf);
        buf.reset(bval.getBytes(), bval.getLength());
        cmpval.readFields(buf);
        final boolean keysMatch = cmpkey.toString().equals(tkey.toString());
        assertTrue(
          "Keys don't match: " + "*" + cmpkey.toString() + ":" +
          tkey.toString() + "*",
          keysMatch);
        final boolean valsMatch = cmpval.toString().equals(tval.toString());
        assertTrue(
          "Vals don't match: " + "*" + cmpval.toString() + ":" +
          tval.toString() + "*",
          valsMatch);
        ++count;
      }
    } finally {
      reader.close();
    }
  }
  assertEquals("Some records not found", RECORDS, count);
}
 
Example 18
Source File: GFRecordReaderJUnitTest.java    From gemfirexd-oss with Apache License 2.0 4 votes vote down vote up
/**
 * Flushes {@code entryCount} events into each of {@code bucketCount}
 * buckets, then checks that GFInputFormat returns a single
 * CombineFileSplit covering all buckets and that reading it yields every
 * key that was written.
 */
public void testGFRecordReaderNHop1Split() throws Exception {
  cluster = super.initMiniCluster(CLUSTER_PORT, 1);
  
  int entryCount = 2;
  int bucketCount = 3;
  HashSet<String> keySet = new HashSet<String>();
  
  for (int j = 0; j < bucketCount; j++) {
    HdfsSortedOplogOrganizer bucket = new HdfsSortedOplogOrganizer(
        regionManager, j);
    ArrayList<TestEvent> items = new ArrayList<TestEvent>();
    for (int i = 0; i < entryCount; i++) {
      String key = "key - " + j + " : " + i;
      items.add(new TestEvent(key, ("value-" + System.nanoTime())));
      keySet.add(key);
    }
    bucket.flush(items.iterator(), entryCount);
  }
  
  assertEquals(entryCount * bucketCount, keySet.size());
  
  Configuration conf = hdfsStore.getFileSystem().getConf();
  GFInputFormat gfInputFormat = new GFInputFormat();
  Job job = Job.getInstance(conf, "test");
  
  conf = job.getConfiguration();
  conf.set(GFInputFormat.INPUT_REGION, getName());
  conf.set(GFInputFormat.HOME_DIR, testDataDir.getName());
  conf.setBoolean(GFInputFormat.CHECKPOINT, false);
  
  List<InputSplit> splits = gfInputFormat.getSplits(job);
  assertEquals(1, splits.size());
  
  CombineFileSplit split = (CombineFileSplit) splits.get(0);
  assertEquals(bucketCount, split.getNumPaths());
  
  TaskAttemptContext context = new TaskAttemptContextImpl(conf,
      new TaskAttemptID());
  RecordReader<GFKey, PersistedEventImpl> reader = gfInputFormat
      .createRecordReader(split, context);
  reader.initialize(split, context);
  
  // Close the reader even when a read or the final assertion fails.
  try {
    while (reader.nextKeyValue()) {
      keySet.remove(reader.getCurrentKey().getKey());
    }
    // every written key must have been read back and removed
    assertEquals(0, keySet.size());
  } finally {
    reader.close();
  }
}
 
Example 19
Source File: InputSampler.java    From big-c with Apache License 2.0 4 votes vote down vote up
/**
 * Randomize the split order, then take the specified number of keys from
 * each split sampled, where each key is selected with the specified
 * probability and possibly replaced by a subsequently selected key when
 * the quota of keys from that split is satisfied.
 *
 * @param inf input format used to compute the splits and to open a record
 *        reader on each sampled split
 * @param job job passed to {@code getSplits} and whose configuration is used
 *        for the sampling contexts and for copying keys
 * @return the sampled keys, at most {@code numSamples} of them
 * @throws IOException if computing splits or reading a split fails
 * @throws InterruptedException if split computation or reading is interrupted
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, Job job) 
    throws IOException, InterruptedException {
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());

  // Draw a seed and log it so the sampling run can be identified from logs.
  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);
  LOG.debug("seed: " + seed);
  // shuffle splits
  for (int i = 0; i < splits.size(); ++i) {
    InputSplit tmp = splits.get(i);
    int j = r.nextInt(splits.size());
    splits.set(i, splits.get(j));
    splits.set(j, tmp);
  }
  // our target rate is in terms of the maximum number of sample splits,
  // but we accept the possibility of sampling additional splits to hit
  // the target sample keyset
  for (int i = 0; i < splitsToSample ||
                 (i < splits.size() && samples.size() < numSamples); ++i) {
    TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
        job.getConfiguration(), new TaskAttemptID());
    RecordReader<K,V> reader = inf.createRecordReader(
        splits.get(i), samplingContext);
    try {
      reader.initialize(splits.get(i), samplingContext);
      while (reader.nextKeyValue()) {
        if (r.nextDouble() <= freq) {
          if (samples.size() < numSamples) {
            // Copy the key: the reader's current key is not stored directly.
            samples.add(ReflectionUtils.copy(job.getConfiguration(),
                                             reader.getCurrentKey(), null));
          } else {
            // When exceeding the maximum number of samples, replace a
            // random element with this one, then adjust the frequency
            // to reflect the possibility of existing elements being
            // pushed out.
            // nextInt(numSamples) is always in [0, numSamples), so the
            // index is valid without any further check.
            int ind = r.nextInt(numSamples);
            samples.set(ind, ReflectionUtils.copy(job.getConfiguration(),
                             reader.getCurrentKey(), null));
            // NOTE(review): freq is declared outside this method, so this
            // decay presumably persists across calls - confirm intended.
            freq *= (numSamples - 1) / (double) numSamples;
          }
        }
      }
    } finally {
      // Release the reader even if initialization or a read fails.
      reader.close();
    }
  }
  return (K[])samples.toArray();
}
 
Example 20
Source File: TestCombineTextInputFormat.java    From big-c with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a set of input files, asks {@link CombineTextInputFormat} for
 * splits three times, and on each round checks that exactly one
 * {@link CombineFileSplit} comes back, that it is read through a
 * {@link CombineFileRecordReader}, and that the values read cover
 * {@code length} distinct integers with no value seen twice.
 */
@Test(timeout=10000)
public void testFormat() throws Exception {
  final int length = 10000;
  final int numFiles = 10;

  Job job = Job.getInstance(new Configuration(defaultConf));

  // Log the seed so a failing run can be reproduced.
  Random random = new Random();
  long seed = random.nextLong();
  LOG.info("seed = " + seed);
  random.setSeed(seed);

  // Start from an empty work directory and register it as the job input.
  localFs.delete(workDir, true);
  FileInputFormat.setInputPaths(job, workDir);

  // create files with various lengths
  createFiles(length, numFiles, random);

  // create a combined split for the files
  CombineTextInputFormat format = new CombineTextInputFormat();
  for (int round = 0; round < 3; round++) {
    int requested = 1 + random.nextInt(length / 20);
    LOG.info("splitting: requesting = " + requested);
    List<InputSplit> allSplits = format.getSplits(job);
    LOG.info("splitting: got =        " + allSplits.size());

    // we should have a single split as the length is comfortably smaller than
    // the block size
    assertEquals("We got more than one splits!", 1, allSplits.size());
    InputSplit onlySplit = allSplits.get(0);
    assertEquals("It should be CombineFileSplit",
        CombineFileSplit.class, onlySplit.getClass());

    // check the split: one bit per value that has been read
    BitSet seen = new BitSet(length);
    LOG.debug("split= " + onlySplit);
    Configuration jobConf = job.getConfiguration();
    TaskAttemptContext context =
        MapReduceTestUtil.createDummyMapTaskAttemptContext(jobConf);
    RecordReader<LongWritable, Text> reader =
        format.createRecordReader(onlySplit, context);
    assertEquals("reader class is CombineFileRecordReader.",
        CombineFileRecordReader.class, reader.getClass());
    MapContext<LongWritable,Text,LongWritable,Text> mapContext =
        new MapContextImpl<LongWritable,Text,LongWritable,Text>(
            jobConf, context.getTaskAttemptID(), reader, null, null,
            MapReduceTestUtil.createDummyReporter(), onlySplit);
    reader.initialize(onlySplit, mapContext);

    try {
      int recordsRead = 0;
      while (reader.nextKeyValue()) {
        LongWritable currentKey = reader.getCurrentKey();
        assertNotNull("Key should not be null.", currentKey);
        // Each record's text is parsed as the integer it represents.
        final int parsed =
            Integer.parseInt(reader.getCurrentValue().toString());
        LOG.debug("read " + parsed);
        assertFalse("Key in multiple partitions.", seen.get(parsed));
        seen.set(parsed);
        recordsRead++;
      }
      LOG.debug("split=" + onlySplit + " count=" + recordsRead);
    } finally {
      reader.close();
    }
    // Every one of the `length` values must have been read.
    assertEquals("Some keys in no partition.", length, seen.cardinality());
  }
}