Java Code Examples for org.apache.hadoop.mapreduce.InputFormat#getSplits()

The following examples show how to use org.apache.hadoop.mapreduce.InputFormat#getSplits() . You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example 1

Source File: AbstractHadoopJob.java From Kylin with Apache License 2.0

6 votes

protected double getTotalMapInputMB() throws ClassNotFoundException, IOException, InterruptedException, JobException {
    if (job == null) {
        throw new JobException("Job is null");
    }

    long mapInputBytes = 0;
    InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
    for (InputSplit split : input.getSplits(job)) {
        mapInputBytes += split.getLength();
    }
    if (mapInputBytes == 0) {
        throw new IllegalArgumentException("Map input splits are 0 bytes, something is wrong!");
    }
    double totalMapInputMB = (double) mapInputBytes / 1024 / 1024;
    return totalMapInputMB;
}

Example 2

Source File: StatementPatternStorageTest.java From rya with Apache License 2.0

6 votes

protected List<StatementPatternStorage> createStorages(final String location) throws IOException, InterruptedException {
    final List<StatementPatternStorage> storages = new ArrayList<StatementPatternStorage>();
    StatementPatternStorage storage = new StatementPatternStorage();
    final InputFormat<?, ?> inputFormat = storage.getInputFormat();
    Job job = Job.getInstance(new Configuration());
    storage.setLocation(location, job);
    final List<InputSplit> splits = inputFormat.getSplits(job);
    assertNotNull(splits);

    for (final InputSplit inputSplit : splits) {
        storage = new StatementPatternStorage();
        job = Job.getInstance(new Configuration());
        storage.setLocation(location, job);
        final TaskAttemptContext taskAttemptContext = new TaskAttemptContextImpl(job.getConfiguration(),
                new TaskAttemptID("jtid", 0, TaskType.REDUCE, 0, 0));
        final RecordReader<?, ?> recordReader = inputFormat.createRecordReader(inputSplit,
                taskAttemptContext);
        recordReader.initialize(inputSplit, taskAttemptContext);

        storage.prepareToRead(recordReader, null);
        storages.add(storage);
    }
    return storages;
}

Example 3

Source File: HadoopElementIterator.java From tinkerpop with Apache License 2.0

6 votes

public HadoopElementIterator(final HadoopGraph graph) {
    try {
        this.graph = graph;
        final Configuration configuration = ConfUtil.makeHadoopConfiguration(this.graph.configuration());
        final InputFormat<NullWritable, VertexWritable> inputFormat = ConfUtil.getReaderAsInputFormat(configuration);
        if (inputFormat instanceof FileInputFormat) {
            final Storage storage = FileSystemStorage.open(configuration);
            if (!this.graph.configuration().containsKey(Constants.GREMLIN_HADOOP_INPUT_LOCATION))
                return; // there is no input location and thus, no data (empty graph)
            if (!Constants.getSearchGraphLocation(this.graph.configuration().getInputLocation(), storage).isPresent())
                return; // there is no data at the input location (empty graph)
            configuration.set(Constants.MAPREDUCE_INPUT_FILEINPUTFORMAT_INPUTDIR, Constants.getSearchGraphLocation(this.graph.configuration().getInputLocation(), storage).get());
        }
        final List<InputSplit> splits = inputFormat.getSplits(new JobContextImpl(configuration, new JobID(UUID.randomUUID().toString(), 1)));
        for (final InputSplit split : splits) {
            this.readers.add(inputFormat.createRecordReader(split, new TaskAttemptContextImpl(configuration, new TaskAttemptID())));
        }
    } catch (final Exception e) {
        throw new IllegalStateException(e.getMessage(), e);
    }
}

Example 4

Source File: AbstractHadoopJob.java From kylin with Apache License 2.0

6 votes

public static double getTotalMapInputMB(Job job)
        throws ClassNotFoundException, IOException, InterruptedException, JobException {
    if (job == null) {
        throw new JobException("Job is null");
    }

    long mapInputBytes = 0;
    InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
    for (InputSplit split : input.getSplits(job)) {
        mapInputBytes += split.getLength();
    }
    
    // 0 input bytes is possible when the segment range hits no partition on a partitioned hive table (KYLIN-2470) 
    if (mapInputBytes == 0) {
        logger.warn("Map input splits are 0 bytes, something is wrong?");
    }
    
    double totalMapInputMB = (double) mapInputBytes / 1024 / 1024;
    return totalMapInputMB;
}

Example 5

Source File: AbstractHadoopJob.java From kylin-on-parquet-v2 with Apache License 2.0

6 votes

public static double getTotalMapInputMB(Job job)
        throws ClassNotFoundException, IOException, InterruptedException, JobException {
    if (job == null) {
        throw new JobException("Job is null");
    }

    long mapInputBytes = 0;
    InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
    for (InputSplit split : input.getSplits(job)) {
        mapInputBytes += split.getLength();
    }
    
    // 0 input bytes is possible when the segment range hits no partition on a partitioned hive table (KYLIN-2470) 
    if (mapInputBytes == 0) {
        logger.warn("Map input splits are 0 bytes, something is wrong?");
    }
    
    double totalMapInputMB = (double) mapInputBytes / 1024 / 1024;
    return totalMapInputMB;
}

Example 6

Source File: TestMRSequenceFileInputFilter.java From big-c with Apache License 2.0

5 votes

private int countRecords(int numSplits) 
    throws IOException, InterruptedException {
  InputFormat<Text, BytesWritable> format =
    new SequenceFileInputFilter<Text, BytesWritable>();
  if (numSplits == 0) {
    numSplits =
      random.nextInt(MAX_LENGTH / (SequenceFile.SYNC_INTERVAL / 20)) + 1;
  }
  FileInputFormat.setMaxInputSplitSize(job, 
    fs.getFileStatus(inFile).getLen() / numSplits);
  TaskAttemptContext context = MapReduceTestUtil.
    createDummyMapTaskAttemptContext(job.getConfiguration());
  // check each split
  int count = 0;
  for (InputSplit split : format.getSplits(job)) {
    RecordReader<Text, BytesWritable> reader =
      format.createRecordReader(split, context);
    MapContext<Text, BytesWritable, Text, BytesWritable> mcontext = 
      new MapContextImpl<Text, BytesWritable, Text, BytesWritable>(
      job.getConfiguration(), 
      context.getTaskAttemptID(), reader, null, null, 
      MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mcontext);
    try {
      while (reader.nextKeyValue()) {
        LOG.info("Accept record " + reader.getCurrentKey().toString());
        count++;
      }
    } finally {
      reader.close();
    }
  }
  return count;
}

Example 7

Source File: IndexedStorage.java From spork with Apache License 2.0

5 votes

/**
 * IndexableLoadFunc interface implementation
 */
@Override
public void initialize(Configuration conf) throws IOException {
    try {
        InputFormat inputFormat = this.getInputFormat();
        TaskAttemptID id = HadoopShims.getNewTaskAttemptID();

        if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
                    conf.set(MRConfiguration.JOB_CREDENTIALS_BINARY, System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
        }
        List<FileSplit> fileSplits = inputFormat.getSplits(HadoopShims.createJobContext(conf, null));
        this.readers = new IndexedStorageRecordReader[fileSplits.size()];

        int idx = 0;
        Iterator<FileSplit> it = fileSplits.iterator();
        while (it.hasNext()) {
            FileSplit fileSplit = it.next();
            TaskAttemptContext context = HadoopShims.createTaskAttemptContext(conf, id);
            IndexedStorageRecordReader r = (IndexedStorageRecordReader) inputFormat.createRecordReader(fileSplit, context);
            r.initialize(fileSplit, context);
            this.readers[idx] = r;
            idx++;
        }

        Arrays.sort(this.readers, this.readerComparator);
    } catch (InterruptedException e) {
        throw new IOException(e);
    }
}

Example 8

Source File: InputSampler.java From big-c with Apache License 2.0

5 votes

/**
 * For each split sampled, emit when the ratio of the number of records
 * retained to the total record count is less than the specified
 * frequency.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, Job job) 
    throws IOException, InterruptedException {
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>();
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());
  long records = 0;
  long kept = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
        job.getConfiguration(), new TaskAttemptID());
    RecordReader<K,V> reader = inf.createRecordReader(
        splits.get(i), samplingContext);
    reader.initialize(splits.get(i), samplingContext);
    while (reader.nextKeyValue()) {
      ++records;
      if ((double) kept / records < freq) {
        samples.add(ReflectionUtils.copy(job.getConfiguration(),
                             reader.getCurrentKey(), null));
        ++kept;
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}

Example 9

Source File: InputSampler.java From big-c with Apache License 2.0

5 votes

/**
 * From each split sampled, take the first numSamples / numSplits records.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, Job job) 
    throws IOException, InterruptedException {
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());
  int samplesPerSplit = numSamples / splitsToSample;
  long records = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
        job.getConfiguration(), new TaskAttemptID());
    RecordReader<K,V> reader = inf.createRecordReader(
        splits.get(i), samplingContext);
    reader.initialize(splits.get(i), samplingContext);
    while (reader.nextKeyValue()) {
      samples.add(ReflectionUtils.copy(job.getConfiguration(),
                                       reader.getCurrentKey(), null));
      ++records;
      if ((i+1) * samplesPerSplit <= records) {
        break;
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}

Example 10

Source File: InputSampler.java From hadoop with Apache License 2.0

5 votes

/**
 * For each split sampled, emit when the ratio of the number of records
 * retained to the total record count is less than the specified
 * frequency.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, Job job) 
    throws IOException, InterruptedException {
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>();
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());
  long records = 0;
  long kept = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
        job.getConfiguration(), new TaskAttemptID());
    RecordReader<K,V> reader = inf.createRecordReader(
        splits.get(i), samplingContext);
    reader.initialize(splits.get(i), samplingContext);
    while (reader.nextKeyValue()) {
      ++records;
      if ((double) kept / records < freq) {
        samples.add(ReflectionUtils.copy(job.getConfiguration(),
                             reader.getCurrentKey(), null));
        ++kept;
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}

Example 11

Source File: InputSampler.java From hadoop with Apache License 2.0

5 votes

/**
 * From each split sampled, take the first numSamples / numSplits records.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, Job job) 
    throws IOException, InterruptedException {
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());
  int samplesPerSplit = numSamples / splitsToSample;
  long records = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
        job.getConfiguration(), new TaskAttemptID());
    RecordReader<K,V> reader = inf.createRecordReader(
        splits.get(i), samplingContext);
    reader.initialize(splits.get(i), samplingContext);
    while (reader.nextKeyValue()) {
      samples.add(ReflectionUtils.copy(job.getConfiguration(),
                                       reader.getCurrentKey(), null));
      ++records;
      if ((i+1) * samplesPerSplit <= records) {
        break;
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}

Example 12

Source File: TestMRSequenceFileInputFilter.java From hadoop with Apache License 2.0

5 votes

private int countRecords(int numSplits) 
    throws IOException, InterruptedException {
  InputFormat<Text, BytesWritable> format =
    new SequenceFileInputFilter<Text, BytesWritable>();
  if (numSplits == 0) {
    numSplits =
      random.nextInt(MAX_LENGTH / (SequenceFile.SYNC_INTERVAL / 20)) + 1;
  }
  FileInputFormat.setMaxInputSplitSize(job, 
    fs.getFileStatus(inFile).getLen() / numSplits);
  TaskAttemptContext context = MapReduceTestUtil.
    createDummyMapTaskAttemptContext(job.getConfiguration());
  // check each split
  int count = 0;
  for (InputSplit split : format.getSplits(job)) {
    RecordReader<Text, BytesWritable> reader =
      format.createRecordReader(split, context);
    MapContext<Text, BytesWritable, Text, BytesWritable> mcontext = 
      new MapContextImpl<Text, BytesWritable, Text, BytesWritable>(
      job.getConfiguration(), 
      context.getTaskAttemptID(), reader, null, null, 
      MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mcontext);
    try {
      while (reader.nextKeyValue()) {
        LOG.info("Accept record " + reader.getCurrentKey().toString());
        count++;
      }
    } finally {
      reader.close();
    }
  }
  return count;
}

Example 13

Source File: TestMRSequenceFileAsBinaryInputFormat.java From big-c with Apache License 2.0

4 votes

public void testBinary() throws IOException, InterruptedException {
  Job job = Job.getInstance();
  FileSystem fs = FileSystem.getLocal(job.getConfiguration());
  Path dir = new Path(System.getProperty("test.build.data",".") + "/mapred");
  Path file = new Path(dir, "testbinary.seq");
  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);

  fs.delete(dir, true);
  FileInputFormat.setInputPaths(job, dir);

  Text tkey = new Text();
  Text tval = new Text();

  SequenceFile.Writer writer = new SequenceFile.Writer(fs,
    job.getConfiguration(), file, Text.class, Text.class);
  try {
    for (int i = 0; i < RECORDS; ++i) {
      tkey.set(Integer.toString(r.nextInt(), 36));
      tval.set(Long.toString(r.nextLong(), 36));
      writer.append(tkey, tval);
    }
  } finally {
    writer.close();
  }
  TaskAttemptContext context = MapReduceTestUtil.
    createDummyMapTaskAttemptContext(job.getConfiguration());
  InputFormat<BytesWritable,BytesWritable> bformat =
    new SequenceFileAsBinaryInputFormat();

  int count = 0;
  r.setSeed(seed);
  BytesWritable bkey = new BytesWritable();
  BytesWritable bval = new BytesWritable();
  Text cmpkey = new Text();
  Text cmpval = new Text();
  DataInputBuffer buf = new DataInputBuffer();
  FileInputFormat.setInputPaths(job, file);
  for (InputSplit split : bformat.getSplits(job)) {
    RecordReader<BytesWritable, BytesWritable> reader =
          bformat.createRecordReader(split, context);
    MapContext<BytesWritable, BytesWritable, BytesWritable, BytesWritable> 
      mcontext = new MapContextImpl<BytesWritable, BytesWritable,
        BytesWritable, BytesWritable>(job.getConfiguration(), 
        context.getTaskAttemptID(), reader, null, null, 
        MapReduceTestUtil.createDummyReporter(), 
        split);
    reader.initialize(split, mcontext);
    try {
      while (reader.nextKeyValue()) {
        bkey = reader.getCurrentKey();
        bval = reader.getCurrentValue();
        tkey.set(Integer.toString(r.nextInt(), 36));
        tval.set(Long.toString(r.nextLong(), 36));
        buf.reset(bkey.getBytes(), bkey.getLength());
        cmpkey.readFields(buf);
        buf.reset(bval.getBytes(), bval.getLength());
        cmpval.readFields(buf);
        assertTrue(
          "Keys don't match: " + "*" + cmpkey.toString() + ":" +
          tkey.toString() + "*",
          cmpkey.toString().equals(tkey.toString()));
        assertTrue(
          "Vals don't match: " + "*" + cmpval.toString() + ":" +
          tval.toString() + "*",
          cmpval.toString().equals(tval.toString()));
        ++count;
      }
    } finally {
      reader.close();
    }
  }
  assertEquals("Some records not found", RECORDS, count);
}

Example 14

Source File: TestMRSequenceFileAsBinaryOutputFormat.java From big-c with Apache License 2.0

4 votes

public void testBinary() throws IOException, InterruptedException {
  Configuration conf = new Configuration();
  Job job = Job.getInstance(conf);
  
  Path outdir = new Path(System.getProperty("test.build.data", "/tmp"),
                  "outseq");
  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);

  FileOutputFormat.setOutputPath(job, outdir);

  SequenceFileAsBinaryOutputFormat.setSequenceFileOutputKeyClass(job, 
                                        IntWritable.class );
  SequenceFileAsBinaryOutputFormat.setSequenceFileOutputValueClass(job, 
                                        DoubleWritable.class ); 

  SequenceFileAsBinaryOutputFormat.setCompressOutput(job, true);
  SequenceFileAsBinaryOutputFormat.setOutputCompressionType(job, 
                                                     CompressionType.BLOCK);

  BytesWritable bkey = new BytesWritable();
  BytesWritable bval = new BytesWritable();

  TaskAttemptContext context = 
    MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
  OutputFormat<BytesWritable, BytesWritable> outputFormat = 
    new SequenceFileAsBinaryOutputFormat();
  OutputCommitter committer = outputFormat.getOutputCommitter(context);
  committer.setupJob(job);
  RecordWriter<BytesWritable, BytesWritable> writer = outputFormat.
    getRecordWriter(context);

  IntWritable iwritable = new IntWritable();
  DoubleWritable dwritable = new DoubleWritable();
  DataOutputBuffer outbuf = new DataOutputBuffer();
  LOG.info("Creating data by SequenceFileAsBinaryOutputFormat");
  try {
    for (int i = 0; i < RECORDS; ++i) {
      iwritable = new IntWritable(r.nextInt());
      iwritable.write(outbuf);
      bkey.set(outbuf.getData(), 0, outbuf.getLength());
      outbuf.reset();
      dwritable = new DoubleWritable(r.nextDouble());
      dwritable.write(outbuf);
      bval.set(outbuf.getData(), 0, outbuf.getLength());
      outbuf.reset();
      writer.write(bkey, bval);
    }
  } finally {
    writer.close(context);
  }
  committer.commitTask(context);
  committer.commitJob(job);

  InputFormat<IntWritable, DoubleWritable> iformat =
    new SequenceFileInputFormat<IntWritable, DoubleWritable>();
  int count = 0;
  r.setSeed(seed);
  SequenceFileInputFormat.setInputPaths(job, outdir);
  LOG.info("Reading data by SequenceFileInputFormat");
  for (InputSplit split : iformat.getSplits(job)) {
    RecordReader<IntWritable, DoubleWritable> reader =
      iformat.createRecordReader(split, context);
    MapContext<IntWritable, DoubleWritable, BytesWritable, BytesWritable> 
      mcontext = new MapContextImpl<IntWritable, DoubleWritable,
        BytesWritable, BytesWritable>(job.getConfiguration(), 
        context.getTaskAttemptID(), reader, null, null, 
        MapReduceTestUtil.createDummyReporter(), 
        split);
    reader.initialize(split, mcontext);
    try {
      int sourceInt;
      double sourceDouble;
      while (reader.nextKeyValue()) {
        sourceInt = r.nextInt();
        sourceDouble = r.nextDouble();
        iwritable = reader.getCurrentKey();
        dwritable = reader.getCurrentValue();
        assertEquals(
            "Keys don't match: " + "*" + iwritable.get() + ":" + 
                                         sourceInt + "*",
            sourceInt, iwritable.get());
        assertTrue(
            "Vals don't match: " + "*" + dwritable.get() + ":" +
                                         sourceDouble + "*",
            Double.compare(dwritable.get(), sourceDouble) == 0 );
        ++count;
      }
    } finally {
      reader.close();
    }
  }
  assertEquals("Some records not found", RECORDS, count);
}

Example 15

Source File: TestCombineSequenceFileInputFormat.java From big-c with Apache License 2.0

4 votes

@Test(timeout=10000)
public void testFormat() throws IOException, InterruptedException {
  Job job = Job.getInstance(conf);

  Random random = new Random();
  long seed = random.nextLong();
  random.setSeed(seed);

  localFs.delete(workDir, true);
  FileInputFormat.setInputPaths(job, workDir);

  final int length = 10000;
  final int numFiles = 10;

  // create files with a variety of lengths
  createFiles(length, numFiles, random, job);

  TaskAttemptContext context = MapReduceTestUtil.
    createDummyMapTaskAttemptContext(job.getConfiguration());
  // create a combine split for the files
  InputFormat<IntWritable,BytesWritable> format =
    new CombineSequenceFileInputFormat<IntWritable,BytesWritable>();
  for (int i = 0; i < 3; i++) {
    int numSplits =
      random.nextInt(length/(SequenceFile.SYNC_INTERVAL/20)) + 1;
    LOG.info("splitting: requesting = " + numSplits);
    List<InputSplit> splits = format.getSplits(job);
    LOG.info("splitting: got =        " + splits.size());

    // we should have a single split as the length is comfortably smaller than
    // the block size
    assertEquals("We got more than one splits!", 1, splits.size());
    InputSplit split = splits.get(0);
    assertEquals("It should be CombineFileSplit",
      CombineFileSplit.class, split.getClass());

    // check the split
    BitSet bits = new BitSet(length);
    RecordReader<IntWritable,BytesWritable> reader =
      format.createRecordReader(split, context);
    MapContext<IntWritable,BytesWritable,IntWritable,BytesWritable> mcontext =
      new MapContextImpl<IntWritable,BytesWritable,IntWritable,BytesWritable>(job.getConfiguration(),
      context.getTaskAttemptID(), reader, null, null,
      MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mcontext);
    assertEquals("reader class is CombineFileRecordReader.",
      CombineFileRecordReader.class, reader.getClass());

    try {
      while (reader.nextKeyValue()) {
        IntWritable key = reader.getCurrentKey();
        BytesWritable value = reader.getCurrentValue();
        assertNotNull("Value should not be null.", value);
        final int k = key.get();
        LOG.debug("read " + k);
        assertFalse("Key in multiple partitions.", bits.get(k));
        bits.set(k);
      }
    } finally {
      reader.close();
    }
    assertEquals("Some keys in no partition.", length, bits.cardinality());
  }
}

Example 16

Source File: InputSampler.java From big-c with Apache License 2.0

4 votes

/**
 * Randomize the split order, then take the specified number of keys from
 * each split sampled, where each key is selected with the specified
 * probability and possibly replaced by a subsequently selected key when
 * the quota of keys from that split is satisfied.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, Job job) 
    throws IOException, InterruptedException {
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());

  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);
  LOG.debug("seed: " + seed);
  // shuffle splits
  for (int i = 0; i < splits.size(); ++i) {
    InputSplit tmp = splits.get(i);
    int j = r.nextInt(splits.size());
    splits.set(i, splits.get(j));
    splits.set(j, tmp);
  }
  // our target rate is in terms of the maximum number of sample splits,
  // but we accept the possibility of sampling additional splits to hit
  // the target sample keyset
  for (int i = 0; i < splitsToSample ||
                 (i < splits.size() && samples.size() < numSamples); ++i) {
    TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
        job.getConfiguration(), new TaskAttemptID());
    RecordReader<K,V> reader = inf.createRecordReader(
        splits.get(i), samplingContext);
    reader.initialize(splits.get(i), samplingContext);
    while (reader.nextKeyValue()) {
      if (r.nextDouble() <= freq) {
        if (samples.size() < numSamples) {
          samples.add(ReflectionUtils.copy(job.getConfiguration(),
                                           reader.getCurrentKey(), null));
        } else {
          // When exceeding the maximum number of samples, replace a
          // random element with this one, then adjust the frequency
          // to reflect the possibility of existing elements being
          // pushed out
          int ind = r.nextInt(numSamples);
          if (ind != numSamples) {
            samples.set(ind, ReflectionUtils.copy(job.getConfiguration(),
                             reader.getCurrentKey(), null));
          }
          freq *= (numSamples - 1) / (double) numSamples;
        }
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}

Example 17

Source File: InputSampler.java From hadoop with Apache License 2.0

4 votes

/**
 * Randomize the split order, then take the specified number of keys from
 * each split sampled, where each key is selected with the specified
 * probability and possibly replaced by a subsequently selected key when
 * the quota of keys from that split is satisfied.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, Job job) 
    throws IOException, InterruptedException {
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());

  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);
  LOG.debug("seed: " + seed);
  // shuffle splits
  for (int i = 0; i < splits.size(); ++i) {
    InputSplit tmp = splits.get(i);
    int j = r.nextInt(splits.size());
    splits.set(i, splits.get(j));
    splits.set(j, tmp);
  }
  // our target rate is in terms of the maximum number of sample splits,
  // but we accept the possibility of sampling additional splits to hit
  // the target sample keyset
  for (int i = 0; i < splitsToSample ||
                 (i < splits.size() && samples.size() < numSamples); ++i) {
    TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
        job.getConfiguration(), new TaskAttemptID());
    RecordReader<K,V> reader = inf.createRecordReader(
        splits.get(i), samplingContext);
    reader.initialize(splits.get(i), samplingContext);
    while (reader.nextKeyValue()) {
      if (r.nextDouble() <= freq) {
        if (samples.size() < numSamples) {
          samples.add(ReflectionUtils.copy(job.getConfiguration(),
                                           reader.getCurrentKey(), null));
        } else {
          // When exceeding the maximum number of samples, replace a
          // random element with this one, then adjust the frequency
          // to reflect the possibility of existing elements being
          // pushed out
          int ind = r.nextInt(numSamples);
          if (ind != numSamples) {
            samples.set(ind, ReflectionUtils.copy(job.getConfiguration(),
                             reader.getCurrentKey(), null));
          }
          freq *= (numSamples - 1) / (double) numSamples;
        }
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}

Example 18

Source File: TestCombineSequenceFileInputFormat.java From hadoop with Apache License 2.0

4 votes

@Test(timeout=10000)
public void testFormat() throws IOException, InterruptedException {
  Job job = Job.getInstance(conf);

  Random random = new Random();
  long seed = random.nextLong();
  random.setSeed(seed);

  localFs.delete(workDir, true);
  FileInputFormat.setInputPaths(job, workDir);

  final int length = 10000;
  final int numFiles = 10;

  // create files with a variety of lengths
  createFiles(length, numFiles, random, job);

  TaskAttemptContext context = MapReduceTestUtil.
    createDummyMapTaskAttemptContext(job.getConfiguration());
  // create a combine split for the files
  InputFormat<IntWritable,BytesWritable> format =
    new CombineSequenceFileInputFormat<IntWritable,BytesWritable>();
  for (int i = 0; i < 3; i++) {
    int numSplits =
      random.nextInt(length/(SequenceFile.SYNC_INTERVAL/20)) + 1;
    LOG.info("splitting: requesting = " + numSplits);
    List<InputSplit> splits = format.getSplits(job);
    LOG.info("splitting: got =        " + splits.size());

    // we should have a single split as the length is comfortably smaller than
    // the block size
    assertEquals("We got more than one splits!", 1, splits.size());
    InputSplit split = splits.get(0);
    assertEquals("It should be CombineFileSplit",
      CombineFileSplit.class, split.getClass());

    // check the split
    BitSet bits = new BitSet(length);
    RecordReader<IntWritable,BytesWritable> reader =
      format.createRecordReader(split, context);
    MapContext<IntWritable,BytesWritable,IntWritable,BytesWritable> mcontext =
      new MapContextImpl<IntWritable,BytesWritable,IntWritable,BytesWritable>(job.getConfiguration(),
      context.getTaskAttemptID(), reader, null, null,
      MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mcontext);
    assertEquals("reader class is CombineFileRecordReader.",
      CombineFileRecordReader.class, reader.getClass());

    try {
      while (reader.nextKeyValue()) {
        IntWritable key = reader.getCurrentKey();
        BytesWritable value = reader.getCurrentValue();
        assertNotNull("Value should not be null.", value);
        final int k = key.get();
        LOG.debug("read " + k);
        assertFalse("Key in multiple partitions.", bits.get(k));
        bits.set(k);
      }
    } finally {
      reader.close();
    }
    assertEquals("Some keys in no partition.", length, bits.cardinality());
  }
}

Example 19

Source File: TestMRSequenceFileAsBinaryInputFormat.java From hadoop with Apache License 2.0

4 votes

public void testBinary() throws IOException, InterruptedException {
  Job job = Job.getInstance();
  FileSystem fs = FileSystem.getLocal(job.getConfiguration());
  Path dir = new Path(System.getProperty("test.build.data",".") + "/mapred");
  Path file = new Path(dir, "testbinary.seq");
  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);

  fs.delete(dir, true);
  FileInputFormat.setInputPaths(job, dir);

  Text tkey = new Text();
  Text tval = new Text();

  SequenceFile.Writer writer = new SequenceFile.Writer(fs,
    job.getConfiguration(), file, Text.class, Text.class);
  try {
    for (int i = 0; i < RECORDS; ++i) {
      tkey.set(Integer.toString(r.nextInt(), 36));
      tval.set(Long.toString(r.nextLong(), 36));
      writer.append(tkey, tval);
    }
  } finally {
    writer.close();
  }
  TaskAttemptContext context = MapReduceTestUtil.
    createDummyMapTaskAttemptContext(job.getConfiguration());
  InputFormat<BytesWritable,BytesWritable> bformat =
    new SequenceFileAsBinaryInputFormat();

  int count = 0;
  r.setSeed(seed);
  BytesWritable bkey = new BytesWritable();
  BytesWritable bval = new BytesWritable();
  Text cmpkey = new Text();
  Text cmpval = new Text();
  DataInputBuffer buf = new DataInputBuffer();
  FileInputFormat.setInputPaths(job, file);
  for (InputSplit split : bformat.getSplits(job)) {
    RecordReader<BytesWritable, BytesWritable> reader =
          bformat.createRecordReader(split, context);
    MapContext<BytesWritable, BytesWritable, BytesWritable, BytesWritable> 
      mcontext = new MapContextImpl<BytesWritable, BytesWritable,
        BytesWritable, BytesWritable>(job.getConfiguration(), 
        context.getTaskAttemptID(), reader, null, null, 
        MapReduceTestUtil.createDummyReporter(), 
        split);
    reader.initialize(split, mcontext);
    try {
      while (reader.nextKeyValue()) {
        bkey = reader.getCurrentKey();
        bval = reader.getCurrentValue();
        tkey.set(Integer.toString(r.nextInt(), 36));
        tval.set(Long.toString(r.nextLong(), 36));
        buf.reset(bkey.getBytes(), bkey.getLength());
        cmpkey.readFields(buf);
        buf.reset(bval.getBytes(), bval.getLength());
        cmpval.readFields(buf);
        assertTrue(
          "Keys don't match: " + "*" + cmpkey.toString() + ":" +
          tkey.toString() + "*",
          cmpkey.toString().equals(tkey.toString()));
        assertTrue(
          "Vals don't match: " + "*" + cmpval.toString() + ":" +
          tval.toString() + "*",
          cmpval.toString().equals(tval.toString()));
        ++count;
      }
    } finally {
      reader.close();
    }
  }
  assertEquals("Some records not found", RECORDS, count);
}

Example 20

Source File: TestMRSequenceFileAsBinaryOutputFormat.java From hadoop with Apache License 2.0

4 votes

public void testBinary() throws IOException, InterruptedException {
  Configuration conf = new Configuration();
  Job job = Job.getInstance(conf);
  
  Path outdir = new Path(System.getProperty("test.build.data", "/tmp"),
                  "outseq");
  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);

  FileOutputFormat.setOutputPath(job, outdir);

  SequenceFileAsBinaryOutputFormat.setSequenceFileOutputKeyClass(job, 
                                        IntWritable.class );
  SequenceFileAsBinaryOutputFormat.setSequenceFileOutputValueClass(job, 
                                        DoubleWritable.class ); 

  SequenceFileAsBinaryOutputFormat.setCompressOutput(job, true);
  SequenceFileAsBinaryOutputFormat.setOutputCompressionType(job, 
                                                     CompressionType.BLOCK);

  BytesWritable bkey = new BytesWritable();
  BytesWritable bval = new BytesWritable();

  TaskAttemptContext context = 
    MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
  OutputFormat<BytesWritable, BytesWritable> outputFormat = 
    new SequenceFileAsBinaryOutputFormat();
  OutputCommitter committer = outputFormat.getOutputCommitter(context);
  committer.setupJob(job);
  RecordWriter<BytesWritable, BytesWritable> writer = outputFormat.
    getRecordWriter(context);

  IntWritable iwritable = new IntWritable();
  DoubleWritable dwritable = new DoubleWritable();
  DataOutputBuffer outbuf = new DataOutputBuffer();
  LOG.info("Creating data by SequenceFileAsBinaryOutputFormat");
  try {
    for (int i = 0; i < RECORDS; ++i) {
      iwritable = new IntWritable(r.nextInt());
      iwritable.write(outbuf);
      bkey.set(outbuf.getData(), 0, outbuf.getLength());
      outbuf.reset();
      dwritable = new DoubleWritable(r.nextDouble());
      dwritable.write(outbuf);
      bval.set(outbuf.getData(), 0, outbuf.getLength());
      outbuf.reset();
      writer.write(bkey, bval);
    }
  } finally {
    writer.close(context);
  }
  committer.commitTask(context);
  committer.commitJob(job);

  InputFormat<IntWritable, DoubleWritable> iformat =
    new SequenceFileInputFormat<IntWritable, DoubleWritable>();
  int count = 0;
  r.setSeed(seed);
  SequenceFileInputFormat.setInputPaths(job, outdir);
  LOG.info("Reading data by SequenceFileInputFormat");
  for (InputSplit split : iformat.getSplits(job)) {
    RecordReader<IntWritable, DoubleWritable> reader =
      iformat.createRecordReader(split, context);
    MapContext<IntWritable, DoubleWritable, BytesWritable, BytesWritable> 
      mcontext = new MapContextImpl<IntWritable, DoubleWritable,
        BytesWritable, BytesWritable>(job.getConfiguration(), 
        context.getTaskAttemptID(), reader, null, null, 
        MapReduceTestUtil.createDummyReporter(), 
        split);
    reader.initialize(split, mcontext);
    try {
      int sourceInt;
      double sourceDouble;
      while (reader.nextKeyValue()) {
        sourceInt = r.nextInt();
        sourceDouble = r.nextDouble();
        iwritable = reader.getCurrentKey();
        dwritable = reader.getCurrentValue();
        assertEquals(
            "Keys don't match: " + "*" + iwritable.get() + ":" + 
                                         sourceInt + "*",
            sourceInt, iwritable.get());
        assertTrue(
            "Vals don't match: " + "*" + dwritable.get() + ":" +
                                         sourceDouble + "*",
            Double.compare(dwritable.get(), sourceDouble) == 0 );
        ++count;
      }
    } finally {
      reader.close();
    }
  }
  assertEquals("Some records not found", RECORDS, count);
}