Java Code Examples for org.apache.hadoop.mapreduce.RecordReader#nextKeyValue()

The following examples show how to use org.apache.hadoop.mapreduce.RecordReader#nextKeyValue() . You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: GobblinWorkUnitsInputFormatTest.java    From incubator-gobblin with Apache License 2.0 6 votes vote down vote up
@Test
public void testRecordReader()
    throws Exception {

  List<String> paths = Lists.newArrayList("/path1", "/path2");
  GobblinWorkUnitsInputFormat.GobblinSplit split = new GobblinWorkUnitsInputFormat.GobblinSplit(paths);

  GobblinWorkUnitsInputFormat inputFormat = new GobblinWorkUnitsInputFormat();
  RecordReader<LongWritable, Text> recordReader =
      inputFormat.createRecordReader(split, new TaskAttemptContextImpl(new Configuration(), new TaskAttemptID("a", 1,
      TaskType.MAP, 1, 1)));

  recordReader.nextKeyValue();
  Assert.assertEquals(recordReader.getCurrentKey().get(), 0);
  Assert.assertEquals(recordReader.getCurrentValue().toString(), "/path1");

  recordReader.nextKeyValue();
  Assert.assertEquals(recordReader.getCurrentKey().get(), 1);
  Assert.assertEquals(recordReader.getCurrentValue().toString(), "/path2");

  Assert.assertFalse(recordReader.nextKeyValue());

}
 
Example 2
Source File: TestCombineTextInputFormat.java    From big-c with Apache License 2.0 6 votes vote down vote up
private static List<Text> readSplit(InputFormat<LongWritable,Text> format,
  InputSplit split, Job job) throws IOException, InterruptedException {
  List<Text> result = new ArrayList<Text>();
  Configuration conf = job.getConfiguration();
  TaskAttemptContext context = MapReduceTestUtil.
    createDummyMapTaskAttemptContext(conf);
  RecordReader<LongWritable, Text> reader = format.createRecordReader(split,
    MapReduceTestUtil.createDummyMapTaskAttemptContext(conf));
  MapContext<LongWritable,Text,LongWritable,Text> mcontext =
    new MapContextImpl<LongWritable,Text,LongWritable,Text>(conf,
    context.getTaskAttemptID(), reader, null, null,
    MapReduceTestUtil.createDummyReporter(),
    split);
  reader.initialize(split, mcontext);
  while (reader.nextKeyValue()) {
    result.add(new Text(reader.getCurrentValue()));
  }
  return result;
}
 
Example 3
Source File: TestMRKeyValueTextInputFormat.java    From hadoop with Apache License 2.0 6 votes vote down vote up
private static List<Text> readSplit(KeyValueTextInputFormat format, 
    InputSplit split, Job job) throws IOException, InterruptedException {
  List<Text> result = new ArrayList<Text>();
  Configuration conf = job.getConfiguration();
  TaskAttemptContext context = MapReduceTestUtil.
    createDummyMapTaskAttemptContext(conf);
  RecordReader<Text, Text> reader = format.createRecordReader(split, 
    MapReduceTestUtil.createDummyMapTaskAttemptContext(conf));
  MapContext<Text, Text, Text, Text> mcontext = 
    new MapContextImpl<Text, Text, Text, Text>(conf, 
    context.getTaskAttemptID(), reader, null, null,
    MapReduceTestUtil.createDummyReporter(), 
    split);
  reader.initialize(split, mcontext);
  while (reader.nextKeyValue()) {
    result.add(new Text(reader.getCurrentValue()));
  }
  reader.close();
  return result;
}
 
Example 4
Source File: TestGetAllFields.java    From logparser with Apache License 2.0 5 votes vote down vote up
@Test
public void testGetAllField() throws IOException, InterruptedException {
    ApacheHttpdLogfileInputFormat inputFormat =
        new ApacheHttpdLogfileInputFormat(
            "common",
            Collections.singleton(ApacheHttpdLogfileRecordReader.FIELDS), Collections.emptyMap(), Collections.emptyList());
    RecordReader<LongWritable, ParsedRecord> reader = inputFormat.getRecordReader();

    Set<String> possibleFields = new HashSet<>(100);
    while (reader.nextKeyValue()){
        ParsedRecord currentValue = reader.getCurrentValue();
        String value = currentValue.getString(ApacheHttpdLogfileRecordReader.FIELDS);
        if (value == null) {
            continue;
        }
        possibleFields.add(value);
    }
    assertTrue(possibleFields.containsAll(Arrays.asList(
        // A subset of all fields that come out
        "STRING:connection.client.user",
        "IP:connection.client.host.last",
        "TIME.STAMP:request.receive.time.last",
        "TIME.EPOCH:request.receive.time.epoch",
        "STRING:request.status.last",
        "STRING:connection.client.user.last",
        "HTTP.METHOD:request.firstline.original.method",
        "HTTP.URI:request.firstline.uri",
        "HTTP.PATH:request.firstline.uri.path",
        "HTTP.QUERYSTRING:request.firstline.uri.query",
        "STRING:request.firstline.uri.query.*",
        "BYTES:response.body.bytesclf",
        "BYTESCLF:response.body.bytesclf",
        "IP:connection.client.host")));
}
 
Example 5
Source File: MneMapreduceLongDataTest.java    From mnemonic with Apache License 2.0 5 votes vote down vote up
@Test(enabled = true, dependsOnMethods = { "testWriteLongData" })
public void testReadLongData() throws Exception {
  long sum = 0L;
  long reccnt = 0L;
  File folder = new File(m_workdir.toString());
  File[] listfiles = folder.listFiles();
  for (int idx = 0; idx < listfiles.length; ++idx) {
    if (listfiles[idx].isFile()
        && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null))
        && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) {
      System.out.println(String.format("Verifying : %s", listfiles[idx].getName()));
      FileSplit split = new FileSplit(
          new Path(m_workdir, listfiles[idx].getName()), 0, 0L, new String[0]);
      InputFormat<NullWritable, MneDurableInputValue<Long>> inputFormat =
          new MneInputFormat<MneDurableInputValue<Long>, Long>();
      RecordReader<NullWritable, MneDurableInputValue<Long>> reader =
          inputFormat.createRecordReader(split, m_tacontext);
      MneDurableInputValue<Long> mdval = null;
      while (reader.nextKeyValue()) {
        mdval = reader.getCurrentValue();
        sum += mdval.getValue();
        ++reccnt;
      }
      reader.close();
    }
  }
  AssertJUnit.assertEquals(m_sum, sum);
  AssertJUnit.assertEquals(m_reccnt, reccnt);
  System.out.println(String.format("The checksum of long data is %d", sum));
}
 
Example 6
Source File: InputSampler.java    From hadoop with Apache License 2.0 5 votes vote down vote up
/**
 * For each split sampled, emit when the ratio of the number of records
 * retained to the total record count is less than the specified
 * frequency.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, Job job) 
    throws IOException, InterruptedException {
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>();
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());
  long records = 0;
  long kept = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
        job.getConfiguration(), new TaskAttemptID());
    RecordReader<K,V> reader = inf.createRecordReader(
        splits.get(i), samplingContext);
    reader.initialize(splits.get(i), samplingContext);
    while (reader.nextKeyValue()) {
      ++records;
      if ((double) kept / records < freq) {
        samples.add(ReflectionUtils.copy(job.getConfiguration(),
                             reader.getCurrentKey(), null));
        ++kept;
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
 
Example 7
Source File: TestMRSequenceFileInputFilter.java    From big-c with Apache License 2.0 5 votes vote down vote up
private int countRecords(int numSplits) 
    throws IOException, InterruptedException {
  InputFormat<Text, BytesWritable> format =
    new SequenceFileInputFilter<Text, BytesWritable>();
  if (numSplits == 0) {
    numSplits =
      random.nextInt(MAX_LENGTH / (SequenceFile.SYNC_INTERVAL / 20)) + 1;
  }
  FileInputFormat.setMaxInputSplitSize(job, 
    fs.getFileStatus(inFile).getLen() / numSplits);
  TaskAttemptContext context = MapReduceTestUtil.
    createDummyMapTaskAttemptContext(job.getConfiguration());
  // check each split
  int count = 0;
  for (InputSplit split : format.getSplits(job)) {
    RecordReader<Text, BytesWritable> reader =
      format.createRecordReader(split, context);
    MapContext<Text, BytesWritable, Text, BytesWritable> mcontext = 
      new MapContextImpl<Text, BytesWritable, Text, BytesWritable>(
      job.getConfiguration(), 
      context.getTaskAttemptID(), reader, null, null, 
      MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mcontext);
    try {
      while (reader.nextKeyValue()) {
        LOG.info("Accept record " + reader.getCurrentKey().toString());
        count++;
      }
    } finally {
      reader.close();
    }
  }
  return count;
}
 
Example 8
Source File: EthereumFormatHadoopTest.java    From hadoopcryptoledger with Apache License 2.0 5 votes vote down vote up
@Test
public void readEthereumBlockInputFormatBlock0to10() throws IOException, EthereumBlockReadException, ParseException, InterruptedException {
	Configuration conf = new Configuration(defaultConf);
	ClassLoader classLoader = getClass().getClassLoader();
	String fileName="eth0to10.bin";
	String fileNameBlock=classLoader.getResource("testdata/"+fileName).getFile();
	Path file = new Path(fileNameBlock);
	Job job = Job.getInstance(conf);
	FileInputFormat.setInputPaths(job, file);
	EthereumBlockFileInputFormat format = new EthereumBlockFileInputFormat();

	List<InputSplit> splits = format.getSplits(job);
	TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
	assertEquals( 1, splits.size(),"Only one split generated for block 0..10");
	RecordReader<BytesWritable, EthereumBlock> reader = format.createRecordReader(splits.get(0), context);
	assertNotNull( reader,"Format returned  null RecordReader");
	reader.initialize(splits.get(0),context);
	BytesWritable key = new BytesWritable();
	EthereumBlock block = new EthereumBlock();
	int count=0;
	while (count<11) {
		if (reader.nextKeyValue()) {
			count++;
		}
	}
	assertEquals(11,count,"Block 0..10 contains 11 blocks");

	assertFalse( reader.nextKeyValue(),"No further blocks in block 0..10");
	reader.close();
}
 
Example 9
Source File: TestFixedLengthInputFormat.java    From hadoop with Apache License 2.0 5 votes vote down vote up
private static List<String> readSplit(FixedLengthInputFormat format, 
                                      InputSplit split, 
                                      Job job) throws Exception {
  List<String> result = new ArrayList<String>();
  TaskAttemptContext context = MapReduceTestUtil.
      createDummyMapTaskAttemptContext(job.getConfiguration());
  RecordReader<LongWritable, BytesWritable> reader =
      format.createRecordReader(split, context);
  MapContext<LongWritable, BytesWritable, LongWritable, BytesWritable>
      mcontext =
      new MapContextImpl<LongWritable, BytesWritable, LongWritable,
      BytesWritable>(job.getConfiguration(), context.getTaskAttemptID(),
      reader, null, null, MapReduceTestUtil.createDummyReporter(), split);
  LongWritable key;
  BytesWritable value;
  try {
    reader.initialize(split, mcontext);
    while (reader.nextKeyValue()) {
      key = reader.getCurrentKey();
      value = reader.getCurrentValue();
      result.add(new String(value.getBytes(), 0, value.getLength()));
    }
  } finally {
    reader.close();
  }
  return result;
}
 
Example 10
Source File: EthereumFormatHadoopTest.java    From hadoopcryptoledger with Apache License 2.0 5 votes vote down vote up
@Test
public void readEthereumBlockInputFormatBlock3510000to3510010() throws IOException, EthereumBlockReadException, ParseException, InterruptedException {
	Configuration conf = new Configuration(defaultConf);
	ClassLoader classLoader = getClass().getClassLoader();
	String fileName="eth351000to3510010.bin";
	String fileNameBlock=classLoader.getResource("testdata/"+fileName).getFile();
	Path file = new Path(fileNameBlock);
	Job job = Job.getInstance(conf);
	FileInputFormat.setInputPaths(job, file);
	EthereumBlockFileInputFormat format = new EthereumBlockFileInputFormat();

	List<InputSplit> splits = format.getSplits(job);
	TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
	assertEquals( 1, splits.size(),"Only one split generated for block 3510000 .. 3510010");
	RecordReader<BytesWritable, EthereumBlock> reader = format.createRecordReader(splits.get(0), context);
	assertNotNull( reader,"Format returned  null RecordReader");
	reader.initialize(splits.get(0),context);
	BytesWritable key = new BytesWritable();
	EthereumBlock block = new EthereumBlock();
	int count=0;
	while (count<11) {
		if (reader.nextKeyValue()) {
			count++;
		}
	}
	assertEquals(11,count,"Block 3510000 .. 3510010 contains 11 blocks");

	assertFalse( reader.nextKeyValue(),"No further blocks in block 3510000 .. 3510010");
	reader.close();
}
 
Example 11
Source File: InputSampler.java    From big-c with Apache License 2.0 5 votes vote down vote up
/**
 * For each split sampled, emit when the ratio of the number of records
 * retained to the total record count is less than the specified
 * frequency.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, Job job) 
    throws IOException, InterruptedException {
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>();
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());
  long records = 0;
  long kept = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
        job.getConfiguration(), new TaskAttemptID());
    RecordReader<K,V> reader = inf.createRecordReader(
        splits.get(i), samplingContext);
    reader.initialize(splits.get(i), samplingContext);
    while (reader.nextKeyValue()) {
      ++records;
      if ((double) kept / records < freq) {
        samples.add(ReflectionUtils.copy(job.getConfiguration(),
                             reader.getCurrentKey(), null));
        ++kept;
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
 
Example 12
Source File: TestCombineTextInputFormat.java    From hadoop with Apache License 2.0 4 votes vote down vote up
@Test(timeout=10000)
public void testFormat() throws Exception {
  Job job = Job.getInstance(new Configuration(defaultConf));

  Random random = new Random();
  long seed = random.nextLong();
  LOG.info("seed = " + seed);
  random.setSeed(seed);

  localFs.delete(workDir, true);
  FileInputFormat.setInputPaths(job, workDir);

  final int length = 10000;
  final int numFiles = 10;

  // create files with various lengths
  createFiles(length, numFiles, random);

  // create a combined split for the files
  CombineTextInputFormat format = new CombineTextInputFormat();
  for (int i = 0; i < 3; i++) {
    int numSplits = random.nextInt(length/20) + 1;
    LOG.info("splitting: requesting = " + numSplits);
    List<InputSplit> splits = format.getSplits(job);
    LOG.info("splitting: got =        " + splits.size());

    // we should have a single split as the length is comfortably smaller than
    // the block size
    assertEquals("We got more than one splits!", 1, splits.size());
    InputSplit split = splits.get(0);
    assertEquals("It should be CombineFileSplit",
      CombineFileSplit.class, split.getClass());

    // check the split
    BitSet bits = new BitSet(length);
    LOG.debug("split= " + split);
    TaskAttemptContext context = MapReduceTestUtil.
      createDummyMapTaskAttemptContext(job.getConfiguration());
    RecordReader<LongWritable, Text> reader =
      format.createRecordReader(split, context);
    assertEquals("reader class is CombineFileRecordReader.",
      CombineFileRecordReader.class, reader.getClass());
    MapContext<LongWritable,Text,LongWritable,Text> mcontext =
      new MapContextImpl<LongWritable,Text,LongWritable,Text>(job.getConfiguration(),
      context.getTaskAttemptID(), reader, null, null,
      MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mcontext);

    try {
      int count = 0;
      while (reader.nextKeyValue()) {
        LongWritable key = reader.getCurrentKey();
        assertNotNull("Key should not be null.", key);
        Text value = reader.getCurrentValue();
        final int v = Integer.parseInt(value.toString());
        LOG.debug("read " + v);
        assertFalse("Key in multiple partitions.", bits.get(v));
        bits.set(v);
        count++;
      }
      LOG.debug("split=" + split + " count=" + count);
    } finally {
      reader.close();
    }
    assertEquals("Some keys in no partition.", length, bits.cardinality());
  }
}
 
Example 13
Source File: GraphXEdgeInputFormatTest.java    From rya with Apache License 2.0 4 votes vote down vote up
@SuppressWarnings("rawtypes")
@Test
public void testInputFormat() throws Exception {
    RyaStatement input = RyaStatement.builder()
        .setSubject(new RyaIRI("http://www.google.com"))
        .setPredicate(new RyaIRI("http://some_other_uri"))
        .setObject(new RyaIRI("http://www.yahoo.com"))
        .setColumnVisibility(new byte[0])
        .setValue(new byte[0])
        .build();

    apiImpl.add(input);

    Job jobConf = Job.getInstance();

    GraphXEdgeInputFormat.setMockInstance(jobConf, instance.getInstanceName());
    GraphXEdgeInputFormat.setConnectorInfo(jobConf, username, password);
    GraphXEdgeInputFormat.setTableLayout(jobConf, TABLE_LAYOUT.SPO);
    GraphXEdgeInputFormat.setInputTableName(jobConf, table);
    GraphXEdgeInputFormat.setInputTableName(jobConf, table);

    GraphXEdgeInputFormat.setScanIsolation(jobConf, false);
    GraphXEdgeInputFormat.setLocalIterators(jobConf, false);
    GraphXEdgeInputFormat.setOfflineTableScan(jobConf, false);

    GraphXEdgeInputFormat inputFormat = new GraphXEdgeInputFormat();

    JobContext context = new JobContextImpl(jobConf.getConfiguration(), jobConf.getJobID());

    List<InputSplit> splits = inputFormat.getSplits(context);

    Assert.assertEquals(1, splits.size());

    TaskAttemptContext taskAttemptContext = new TaskAttemptContextImpl(context.getConfiguration(), new TaskAttemptID(new TaskID(), 1));

    RecordReader reader = inputFormat.createRecordReader(splits.get(0), taskAttemptContext);

    RecordReader ryaStatementRecordReader = (RecordReader) reader;
    ryaStatementRecordReader.initialize(splits.get(0), taskAttemptContext);

    List<Edge> results = new ArrayList<Edge>();
    while(ryaStatementRecordReader.nextKeyValue()) {
        Edge writable = (Edge) ryaStatementRecordReader.getCurrentValue();
        long srcId = writable.srcId();
        long destId = writable.dstId();
        RyaTypeWritable rtw = null;
        Object text = ryaStatementRecordReader.getCurrentKey();
        Edge<RyaTypeWritable> edge = new Edge<RyaTypeWritable>(srcId, destId, rtw);
        results.add(edge);

        System.out.println(text);
    }

    System.out.println(results.size());
    System.out.println(results);
    Assert.assertTrue(results.size() == 2);
}
 
Example 14
Source File: TestCombineSequenceFileInputFormat.java    From hadoop with Apache License 2.0 4 votes vote down vote up
@Test(timeout=10000)
public void testFormat() throws IOException, InterruptedException {
  Job job = Job.getInstance(conf);

  Random random = new Random();
  long seed = random.nextLong();
  random.setSeed(seed);

  localFs.delete(workDir, true);
  FileInputFormat.setInputPaths(job, workDir);

  final int length = 10000;
  final int numFiles = 10;

  // create files with a variety of lengths
  createFiles(length, numFiles, random, job);

  TaskAttemptContext context = MapReduceTestUtil.
    createDummyMapTaskAttemptContext(job.getConfiguration());
  // create a combine split for the files
  InputFormat<IntWritable,BytesWritable> format =
    new CombineSequenceFileInputFormat<IntWritable,BytesWritable>();
  for (int i = 0; i < 3; i++) {
    int numSplits =
      random.nextInt(length/(SequenceFile.SYNC_INTERVAL/20)) + 1;
    LOG.info("splitting: requesting = " + numSplits);
    List<InputSplit> splits = format.getSplits(job);
    LOG.info("splitting: got =        " + splits.size());

    // we should have a single split as the length is comfortably smaller than
    // the block size
    assertEquals("We got more than one splits!", 1, splits.size());
    InputSplit split = splits.get(0);
    assertEquals("It should be CombineFileSplit",
      CombineFileSplit.class, split.getClass());

    // check the split
    BitSet bits = new BitSet(length);
    RecordReader<IntWritable,BytesWritable> reader =
      format.createRecordReader(split, context);
    MapContext<IntWritable,BytesWritable,IntWritable,BytesWritable> mcontext =
      new MapContextImpl<IntWritable,BytesWritable,IntWritable,BytesWritable>(job.getConfiguration(),
      context.getTaskAttemptID(), reader, null, null,
      MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mcontext);
    assertEquals("reader class is CombineFileRecordReader.",
      CombineFileRecordReader.class, reader.getClass());

    try {
      while (reader.nextKeyValue()) {
        IntWritable key = reader.getCurrentKey();
        BytesWritable value = reader.getCurrentValue();
        assertNotNull("Value should not be null.", value);
        final int k = key.get();
        LOG.debug("read " + k);
        assertFalse("Key in multiple partitions.", bits.get(k));
        bits.set(k);
      }
    } finally {
      reader.close();
    }
    assertEquals("Some keys in no partition.", length, bits.cardinality());
  }
}
 
Example 15
Source File: TestMRKeyValueTextInputFormat.java    From hadoop with Apache License 2.0 4 votes vote down vote up
@Test
public void testSplitableCodecs() throws Exception {
  final Job job = Job.getInstance(defaultConf);
  final Configuration conf = job.getConfiguration();

  // Create the codec
  CompressionCodec codec = null;
  try {
    codec = (CompressionCodec)
    ReflectionUtils.newInstance(conf.getClassByName("org.apache.hadoop.io.compress.BZip2Codec"), conf);
  } catch (ClassNotFoundException cnfe) {
    throw new IOException("Illegal codec!");
  }
  Path file = new Path(workDir, "test"+codec.getDefaultExtension());

  int seed = new Random().nextInt();
  LOG.info("seed = " + seed);
  Random random = new Random(seed);

  localFs.delete(workDir, true);
  FileInputFormat.setInputPaths(job, workDir);

  final int MAX_LENGTH = 500000;
  FileInputFormat.setMaxInputSplitSize(job, MAX_LENGTH / 20);
  // for a variety of lengths
  for (int length = 0; length < MAX_LENGTH;
       length += random.nextInt(MAX_LENGTH / 4) + 1) {

    LOG.info("creating; entries = " + length);

    // create a file with length entries
    Writer writer =
      new OutputStreamWriter(codec.createOutputStream(localFs.create(file)));
    try {
      for (int i = 0; i < length; i++) {
        writer.write(Integer.toString(i * 2));
        writer.write("\t");
        writer.write(Integer.toString(i));
        writer.write("\n");
      }
    } finally {
      writer.close();
    }

    // try splitting the file in a variety of sizes
    KeyValueTextInputFormat format = new KeyValueTextInputFormat();
    assertTrue("KVTIF claims not splittable", format.isSplitable(job, file));
    for (int i = 0; i < 3; i++) {
      int numSplits = random.nextInt(MAX_LENGTH / 2000) + 1;
      LOG.info("splitting: requesting = " + numSplits);
      List<InputSplit> splits = format.getSplits(job);
      LOG.info("splitting: got =        " + splits.size());

      // check each split
      BitSet bits = new BitSet(length);
      for (int j = 0; j < splits.size(); j++) {
        LOG.debug("split["+j+"]= " + splits.get(j));
        TaskAttemptContext context = MapReduceTestUtil.
          createDummyMapTaskAttemptContext(job.getConfiguration());
        RecordReader<Text, Text> reader = format.createRecordReader(
          splits.get(j), context);
        Class<?> clazz = reader.getClass();
        MapContext<Text, Text, Text, Text> mcontext =
          new MapContextImpl<Text, Text, Text, Text>(job.getConfiguration(),
          context.getTaskAttemptID(), reader, null, null,
          MapReduceTestUtil.createDummyReporter(), splits.get(j));
        reader.initialize(splits.get(j), mcontext);

        Text key = null;
        Text value = null;
        try {
          int count = 0;
          while (reader.nextKeyValue()) {
            key = reader.getCurrentKey();
            value = reader.getCurrentValue();
            final int k = Integer.parseInt(key.toString());
            final int v = Integer.parseInt(value.toString());
            assertEquals("Bad key", 0, k % 2);
            assertEquals("Mismatched key/value", k / 2, v);
            LOG.debug("read " + k + "," + v);
            assertFalse(k + "," + v + " in multiple partitions.",bits.get(v));
            bits.set(v);
            count++;
          }
          if (count > 0) {
            LOG.info("splits["+j+"]="+splits.get(j)+" count=" + count);
          } else {
            LOG.debug("splits["+j+"]="+splits.get(j)+" count=" + count);
          }
        } finally {
          reader.close();
        }
      }
      assertEquals("Some keys in no partition.", length, bits.cardinality());
    }

  }
}
 
Example 16
Source File: TestCombineSequenceFileInputFormat.java    From big-c with Apache License 2.0 4 votes vote down vote up
@Test(timeout=10000)
public void testFormat() throws IOException, InterruptedException {
  Job job = Job.getInstance(conf);

  Random random = new Random();
  long seed = random.nextLong();
  random.setSeed(seed);

  localFs.delete(workDir, true);
  FileInputFormat.setInputPaths(job, workDir);

  final int length = 10000;
  final int numFiles = 10;

  // create files with a variety of lengths
  createFiles(length, numFiles, random, job);

  TaskAttemptContext context = MapReduceTestUtil.
    createDummyMapTaskAttemptContext(job.getConfiguration());
  // create a combine split for the files
  InputFormat<IntWritable,BytesWritable> format =
    new CombineSequenceFileInputFormat<IntWritable,BytesWritable>();
  for (int i = 0; i < 3; i++) {
    int numSplits =
      random.nextInt(length/(SequenceFile.SYNC_INTERVAL/20)) + 1;
    LOG.info("splitting: requesting = " + numSplits);
    List<InputSplit> splits = format.getSplits(job);
    LOG.info("splitting: got =        " + splits.size());

    // we should have a single split as the length is comfortably smaller than
    // the block size
    assertEquals("We got more than one splits!", 1, splits.size());
    InputSplit split = splits.get(0);
    assertEquals("It should be CombineFileSplit",
      CombineFileSplit.class, split.getClass());

    // check the split
    BitSet bits = new BitSet(length);
    RecordReader<IntWritable,BytesWritable> reader =
      format.createRecordReader(split, context);
    MapContext<IntWritable,BytesWritable,IntWritable,BytesWritable> mcontext =
      new MapContextImpl<IntWritable,BytesWritable,IntWritable,BytesWritable>(job.getConfiguration(),
      context.getTaskAttemptID(), reader, null, null,
      MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mcontext);
    assertEquals("reader class is CombineFileRecordReader.",
      CombineFileRecordReader.class, reader.getClass());

    try {
      while (reader.nextKeyValue()) {
        IntWritable key = reader.getCurrentKey();
        BytesWritable value = reader.getCurrentValue();
        assertNotNull("Value should not be null.", value);
        final int k = key.get();
        LOG.debug("read " + k);
        assertFalse("Key in multiple partitions.", bits.get(k));
        bits.set(k);
      }
    } finally {
      reader.close();
    }
    assertEquals("Some keys in no partition.", length, bits.cardinality());
  }
}
 
Example 17
Source File: TestMRSequenceFileAsBinaryOutputFormat.java    From big-c with Apache License 2.0 4 votes vote down vote up
public void testBinary() throws IOException, InterruptedException {
  Configuration conf = new Configuration();
  Job job = Job.getInstance(conf);
  
  Path outdir = new Path(System.getProperty("test.build.data", "/tmp"),
                  "outseq");
  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);

  FileOutputFormat.setOutputPath(job, outdir);

  SequenceFileAsBinaryOutputFormat.setSequenceFileOutputKeyClass(job, 
                                        IntWritable.class );
  SequenceFileAsBinaryOutputFormat.setSequenceFileOutputValueClass(job, 
                                        DoubleWritable.class ); 

  SequenceFileAsBinaryOutputFormat.setCompressOutput(job, true);
  SequenceFileAsBinaryOutputFormat.setOutputCompressionType(job, 
                                                     CompressionType.BLOCK);

  BytesWritable bkey = new BytesWritable();
  BytesWritable bval = new BytesWritable();

  TaskAttemptContext context = 
    MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
  OutputFormat<BytesWritable, BytesWritable> outputFormat = 
    new SequenceFileAsBinaryOutputFormat();
  OutputCommitter committer = outputFormat.getOutputCommitter(context);
  committer.setupJob(job);
  RecordWriter<BytesWritable, BytesWritable> writer = outputFormat.
    getRecordWriter(context);

  IntWritable iwritable = new IntWritable();
  DoubleWritable dwritable = new DoubleWritable();
  DataOutputBuffer outbuf = new DataOutputBuffer();
  LOG.info("Creating data by SequenceFileAsBinaryOutputFormat");
  try {
    for (int i = 0; i < RECORDS; ++i) {
      iwritable = new IntWritable(r.nextInt());
      iwritable.write(outbuf);
      bkey.set(outbuf.getData(), 0, outbuf.getLength());
      outbuf.reset();
      dwritable = new DoubleWritable(r.nextDouble());
      dwritable.write(outbuf);
      bval.set(outbuf.getData(), 0, outbuf.getLength());
      outbuf.reset();
      writer.write(bkey, bval);
    }
  } finally {
    writer.close(context);
  }
  committer.commitTask(context);
  committer.commitJob(job);

  InputFormat<IntWritable, DoubleWritable> iformat =
    new SequenceFileInputFormat<IntWritable, DoubleWritable>();
  int count = 0;
  r.setSeed(seed);
  SequenceFileInputFormat.setInputPaths(job, outdir);
  LOG.info("Reading data by SequenceFileInputFormat");
  for (InputSplit split : iformat.getSplits(job)) {
    RecordReader<IntWritable, DoubleWritable> reader =
      iformat.createRecordReader(split, context);
    MapContext<IntWritable, DoubleWritable, BytesWritable, BytesWritable> 
      mcontext = new MapContextImpl<IntWritable, DoubleWritable,
        BytesWritable, BytesWritable>(job.getConfiguration(), 
        context.getTaskAttemptID(), reader, null, null, 
        MapReduceTestUtil.createDummyReporter(), 
        split);
    reader.initialize(split, mcontext);
    try {
      int sourceInt;
      double sourceDouble;
      while (reader.nextKeyValue()) {
        sourceInt = r.nextInt();
        sourceDouble = r.nextDouble();
        iwritable = reader.getCurrentKey();
        dwritable = reader.getCurrentValue();
        assertEquals(
            "Keys don't match: " + "*" + iwritable.get() + ":" + 
                                         sourceInt + "*",
            sourceInt, iwritable.get());
        assertTrue(
            "Vals don't match: " + "*" + dwritable.get() + ":" +
                                         sourceDouble + "*",
            Double.compare(dwritable.get(), sourceDouble) == 0 );
        ++count;
      }
    } finally {
      reader.close();
    }
  }
  assertEquals("Some records not found", RECORDS, count);
}
 
Example 18
Source File: InputSampler.java    From big-c with Apache License 2.0 4 votes vote down vote up
/**
 * Randomize the split order, then take the specified number of keys from
 * each split sampled, where each key is selected with the specified
 * probability and possibly replaced by a subsequently selected key when
 * the quota of keys from that split is satisfied.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, Job job) 
    throws IOException, InterruptedException {
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());

  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);
  LOG.debug("seed: " + seed);
  // shuffle splits
  for (int i = 0; i < splits.size(); ++i) {
    InputSplit tmp = splits.get(i);
    int j = r.nextInt(splits.size());
    splits.set(i, splits.get(j));
    splits.set(j, tmp);
  }
  // our target rate is in terms of the maximum number of sample splits,
  // but we accept the possibility of sampling additional splits to hit
  // the target sample keyset
  for (int i = 0; i < splitsToSample ||
                 (i < splits.size() && samples.size() < numSamples); ++i) {
    TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
        job.getConfiguration(), new TaskAttemptID());
    RecordReader<K,V> reader = inf.createRecordReader(
        splits.get(i), samplingContext);
    reader.initialize(splits.get(i), samplingContext);
    while (reader.nextKeyValue()) {
      if (r.nextDouble() <= freq) {
        if (samples.size() < numSamples) {
          samples.add(ReflectionUtils.copy(job.getConfiguration(),
                                           reader.getCurrentKey(), null));
        } else {
          // When exceeding the maximum number of samples, replace a
          // random element with this one, then adjust the frequency
          // to reflect the possibility of existing elements being
          // pushed out
          int ind = r.nextInt(numSamples);
          if (ind != numSamples) {
            samples.set(ind, ReflectionUtils.copy(job.getConfiguration(),
                             reader.getCurrentKey(), null));
          }
          freq *= (numSamples - 1) / (double) numSamples;
        }
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
 
Example 19
Source File: RecordReaderWriterTest.java    From tinkerpop with Apache License 2.0 4 votes vote down vote up
private static void validateFileSplits(final List<FileSplit> fileSplits, final Configuration configuration,
                                       final Class<? extends InputFormat<NullWritable, VertexWritable>> inputFormatClass,
                                       final Optional<Class<? extends OutputFormat<NullWritable, VertexWritable>>> outFormatClass) throws Exception {

    final InputFormat inputFormat = ReflectionUtils.newInstance(inputFormatClass, configuration);
    final TaskAttemptContext job = new TaskAttemptContextImpl(configuration, new TaskAttemptID(UUID.randomUUID().toString(), 0, TaskType.MAP, 0, 0));

    int vertexCount = 0;
    int outEdgeCount = 0;
    int inEdgeCount = 0;

    final OutputFormat<NullWritable, VertexWritable> outputFormat = outFormatClass.isPresent() ? ReflectionUtils.newInstance(outFormatClass.get(), configuration) : null;
    final RecordWriter<NullWritable, VertexWritable> writer = null == outputFormat ? null : outputFormat.getRecordWriter(job);

    boolean foundKeyValue = false;
    for (final FileSplit split : fileSplits) {
        logger.info("\treading file split {}", split.getPath().getName() + " ({}", split.getStart() + "..." + (split.getStart() + split.getLength()), "{} {} bytes)");
        final RecordReader reader = inputFormat.createRecordReader(split, job);

        float lastProgress = -1f;
        while (reader.nextKeyValue()) {
            //System.out.println("" + reader.getProgress() + "> " + reader.getCurrentKey() + ": " + reader.getCurrentValue());
            final float progress = reader.getProgress();
            assertTrue(progress >= lastProgress);
            assertEquals(NullWritable.class, reader.getCurrentKey().getClass());
            final VertexWritable vertexWritable = (VertexWritable) reader.getCurrentValue();
            if (null != writer) writer.write(NullWritable.get(), vertexWritable);
            vertexCount++;
            outEdgeCount = outEdgeCount + (int) IteratorUtils.count(vertexWritable.get().edges(Direction.OUT));
            inEdgeCount = inEdgeCount + (int) IteratorUtils.count(vertexWritable.get().edges(Direction.IN));
            //
            final Vertex vertex = vertexWritable.get();
            assertEquals(Integer.class, vertex.id().getClass());
            if (vertex.value("name").equals("SUGAR MAGNOLIA")) {
                foundKeyValue = true;
                assertEquals(92, IteratorUtils.count(vertex.edges(Direction.OUT)));
                assertEquals(77, IteratorUtils.count(vertex.edges(Direction.IN)));
            }
            lastProgress = progress;
        }
    }

    assertEquals(8049, outEdgeCount);
    assertEquals(8049, inEdgeCount);
    assertEquals(outEdgeCount, inEdgeCount);
    assertEquals(808, vertexCount);
    assertTrue(foundKeyValue);

    if (null != writer) {
        writer.close(new TaskAttemptContextImpl(configuration, job.getTaskAttemptID()));
        for (int i = 1; i < 10; i++) {
            final File outputDirectory = new File(new URL(configuration.get("mapreduce.output.fileoutputformat.outputdir")).toURI());
            final List<FileSplit> splits = generateFileSplits(new File(outputDirectory.getAbsoluteFile() + "/_temporary/0/_temporary/" + job.getTaskAttemptID().getTaskID().toString().replace("task", "attempt") + "_0" + "/part-m-00000"), i);
            validateFileSplits(splits, configuration, inputFormatClass, Optional.empty());
        }
    }
}
 
Example 20
Source File: MneMapreduceChunkDataTest.java    From mnemonic with Apache License 2.0 4 votes vote down vote up
@Test(enabled = true, dependsOnMethods = { "testWriteChunkData" })
public void testReadChunkData() throws Exception {
  List<String> partfns = new ArrayList<String>();
  long reccnt = 0L;
  long tsize = 0L;
  Checksum cs = new CRC32();
  cs.reset();
  File folder = new File(m_workdir.toString());
  File[] listfiles = folder.listFiles();
  for (int idx = 0; idx < listfiles.length; ++idx) {
    if (listfiles[idx].isFile()
        && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null))
        && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) {
      partfns.add(listfiles[idx].getName());
    }
  }
  Collections.sort(partfns); // keep the order for checksum
  for (int idx = 0; idx < partfns.size(); ++idx) {
    System.out.println(String.format("Verifying : %s", partfns.get(idx)));
    FileSplit split = new FileSplit(
        new Path(m_workdir, partfns.get(idx)), 0, 0L, new String[0]);
    InputFormat<NullWritable, MneDurableInputValue<DurableChunk<?>>> inputFormat =
        new MneInputFormat<MneDurableInputValue<DurableChunk<?>>, DurableChunk<?>>();
    RecordReader<NullWritable, MneDurableInputValue<DurableChunk<?>>> reader =
        inputFormat.createRecordReader(split, m_tacontext);
    MneDurableInputValue<DurableChunk<?>> dchkval = null;
    while (reader.nextKeyValue()) {
      dchkval = reader.getCurrentValue();
      byte b;
      for (int j = 0; j < dchkval.getValue().getSize(); ++j) {
        b = unsafe.getByte(dchkval.getValue().get() + j);
        cs.update(b);
      }
      tsize += dchkval.getValue().getSize();
      ++reccnt;
    }
    reader.close();
  }
  AssertJUnit.assertEquals(m_reccnt, reccnt);
  AssertJUnit.assertEquals(m_totalsize, tsize);
  AssertJUnit.assertEquals(m_checksum, cs.getValue());
  System.out.println(String.format("The checksum of chunk is %d", m_checksum));
}