Java Code Examples for org.apache.hadoop.mapred.lib.CombineFileSplit

The following examples show how to use org.apache.hadoop.mapred.lib.CombineFileSplit. They are extracted from open source projects; each example lists the project and source file it comes from.
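
Before the project-specific examples, here is a minimal, self-contained sketch of the class itself: a CombineFileSplit packs several files (or ranges of files) into one old-API (org.apache.hadoop.mapred) split and exposes per-chunk paths, start offsets, and lengths. The file names, offsets, lengths, and host names below are made-up placeholders, not values taken from any of the projects listed here.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.CombineFileSplit;

public class CombineFileSplitSketch {
  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf();

    // Two illustrative files packed into a single combined split
    // (paths, offsets, lengths, and hosts are placeholders).
    Path[] paths   = { new Path("/data/part-00000"), new Path("/data/part-00001") };
    long[] offsets = { 0L, 0L };
    long[] lengths = { 128L, 256L };
    String[] hosts = { "host1", "host2" };

    CombineFileSplit split = new CombineFileSplit(job, paths, offsets, lengths, hosts);

    // Per-chunk accessors mirror the arrays passed to the constructor.
    for (int i = 0; i < split.getPaths().length; i++) {
      System.out.println(split.getPath(i) + " offset=" + split.getOffset(i)
          + " length=" + split.getLength(i));
    }
    // getLength() with no argument is the total length across all chunks.
    System.out.println("total length = " + split.getLength());
  }
}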
Example 1
Source Project: gemfirexd-oss   Source File: FragmenterJUnitTest.java   License: Apache License 2.0
private void verifyFragments(InputSplit[] fs, List<Fragment> fragments) throws Exception {
  log("Total fragments [expected, actual]: " + fs.length + ", " + fragments.size());
  assertEquals(fs.length, fragments.size());

  for (int i = 0; i < fs.length; i++) {
    CombineFileSplit split = (CombineFileSplit) fs[i];
    Fragment frag = fragments.get(i);

    log("Number of hosts hosting the fragment [expected, actual]: " + fs[i].getLocations().length + ",  " +  frag.getReplicas().length);
    assertEquals(fs[i].getLocations().length, frag.getReplicas().length);

    log("Fragment source name [expected, actual]: " + split.getPath(0).toString() +  ",  " + frag.getSourceName());
    assertEquals(split.getPath(0).toString(), "/" + frag.getSourceName());

    for (int j = 0; j < frag.getReplicas().length; j++) {
      log("Fragment host [expected, actual]: " + fs[i].getLocations()[j] + ",  " + frag.getReplicas()[j]);
      assertEquals(fs[i].getLocations()[j], frag.getReplicas()[j]);

      log(" User data [expected, actual]: " + null + ",  " + frag.getUserData());
      assertEquals(null, frag.getUserData());
    }
  }
}
 
Example 2
Source Project: gemfirexd-oss   Source File: RowInputFormat.java   License: Apache License 2.0
private InputSplit[] createSplits(JobConf job, Collection<FileStatus> hoplogs)
    throws IOException {
  if (hoplogs == null || hoplogs.isEmpty()) {
    return new InputSplit[0];
  }
  
  HoplogOptimizedSplitter splitter = new HoplogOptimizedSplitter(hoplogs);
  List<org.apache.hadoop.mapreduce.InputSplit> mr2Splits = splitter.getOptimizedSplits(conf);
  InputSplit[] splits = new InputSplit[mr2Splits.size()];
  int i = 0;
  for (org.apache.hadoop.mapreduce.InputSplit inputSplit : mr2Splits) {
    org.apache.hadoop.mapreduce.lib.input.CombineFileSplit mr2Split =
        (org.apache.hadoop.mapreduce.lib.input.CombineFileSplit) inputSplit;

    CombineFileSplit split = new CombineFileSplit(job, mr2Split.getPaths(),
        mr2Split.getStartOffsets(), mr2Split.getLengths(),
        mr2Split.getLocations());
    splits[i] = getSplit(split);
    i++;
  }

  return splits;
}
 
Example 3
Source Project: gemfirexd-oss   Source File: GemFireXDFragmenter.java   License: Apache License 2.0
@Override
public List<Fragment> getFragments() throws IOException {
  InputSplit[] splits;
  // try {
  splits = getSplits();
  // } finally {
  //   this.gfxdManager.resetLonerSystemInUse();
  // }

  for (InputSplit split : splits) {
    CombineFileSplit cSplit = (CombineFileSplit) split;

    if (cSplit.getLength() > 0L) {
      String filepath = cSplit.getPath(0).toUri().getPath();
      filepath = filepath.substring(1);
      if (this.gfxdManager.getLogger().isDebugEnabled()) {
        this.gfxdManager.getLogger().debug("fragment-filepath " + filepath);
      }
      byte[] data = this.gfxdManager.populateUserData(cSplit);
      this.fragments.add(new Fragment(filepath, cSplit.getLocations(), data));
    }
  }
  return this.fragments;
}
 
Example 4
Source Project: gemfirexd-oss   Source File: GFInputFormat.java   License: Apache License 2.0
/**
 * Creates an input split for every block occupied by hoplogs of the input
 * regions
 * 
 * @param job 
 * @param hoplogs
 * @return array of input splits of type file input split
 * @throws IOException
 */
private InputSplit[] createSplits(JobConf job, Collection<FileStatus> hoplogs)
    throws IOException {
  if (hoplogs == null || hoplogs.isEmpty()) {
    return new InputSplit[0];
  }

  HoplogOptimizedSplitter splitter = new HoplogOptimizedSplitter(hoplogs);
  List<org.apache.hadoop.mapreduce.InputSplit> mr2Splits = splitter.getOptimizedSplits(conf);
  InputSplit[] splits = new InputSplit[mr2Splits.size()];
  int i = 0;
  for (org.apache.hadoop.mapreduce.InputSplit inputSplit : mr2Splits) {
    org.apache.hadoop.mapreduce.lib.input.CombineFileSplit mr2Split =
        (org.apache.hadoop.mapreduce.lib.input.CombineFileSplit) inputSplit;

    CombineFileSplit split = new CombineFileSplit(job, mr2Split.getPaths(),
        mr2Split.getStartOffsets(), mr2Split.getLengths(),
        mr2Split.getLocations());
    splits[i] = split;
    i++;
  }

  return splits;
}
 
Example 5
static RecordReader<NullWritable, DynamoDBItemWritable> getRecordReader(
    InputSplit inputSplit, JobConf job, Reporter reporter) throws IOException {
  // CombineFileSplit indicates the new export format which includes a manifest file
  if (inputSplit instanceof CombineFileSplit) {
    int version = job.getInt(DynamoDBConstants.EXPORT_FORMAT_VERSION, -1);
    if (version != ExportManifestRecordWriter.FORMAT_VERSION) {
      throw new IOException("Unknown version: " + job.get(DynamoDBConstants
          .EXPORT_FORMAT_VERSION));
    }
    return new ImportCombineFileRecordReader((CombineFileSplit) inputSplit, job, reporter);
  } else if (inputSplit instanceof FileSplit) {
    // FileSplit indicates the old data pipeline format which doesn't include a manifest file
    Path path = ((FileSplit) inputSplit).getPath();
    return new ImportRecordReader(job, path);
  } else {
    throw new IOException("Expecting CombineFileSplit or FileSplit but the input split type is:"
        + " " + inputSplit.getClass());
  }
}
 
Example 6
Source Project: hudi   Source File: HoodieCombineHiveInputFormat.java   License: Apache License 2.0
public CombineHiveInputSplit(JobConf job, CombineFileSplit inputSplitShim,
    Map<Path, PartitionDesc> pathToPartitionInfo) throws IOException {
  this.inputSplitShim = inputSplitShim;
  this.pathToPartitionInfo = pathToPartitionInfo;
  if (job != null) {
    if (this.pathToPartitionInfo == null) {
      this.pathToPartitionInfo = Utilities.getMapWork(job).getPathToPartitionInfo();
    }

    // extract all the inputFormatClass names for each chunk in the
    // CombinedSplit.
    Path[] ipaths = inputSplitShim.getPaths();
    if (ipaths.length > 0) {
      PartitionDesc part = getPartitionFromPath(this.pathToPartitionInfo, ipaths[0],
          IOPrepareCache.get().getPartitionDescMap());
      inputFormatClassName = part.getInputFileFormatClass().getName();
    }
  }
}
 
Example 7
Source Project: hudi   Source File: HoodieCombineHiveInputFormat.java   License: Apache License 2.0
@Override
public RecordReader getRecordReader(JobConf job, CombineFileSplit split, Reporter reporter,
    Class<RecordReader<K, V>> rrClass) throws IOException {
  isRealTime = Boolean.valueOf(job.get("hudi.hive.realtime", "false"));
  if (isRealTime) {
    List<RecordReader> recordReaders = new LinkedList<>();
    ValidationUtils.checkArgument(split instanceof HoodieCombineRealtimeFileSplit, "Only "
        + HoodieCombineRealtimeFileSplit.class.getName() + " allowed, found " + split.getClass().getName());
    for (InputSplit inputSplit : ((HoodieCombineRealtimeFileSplit) split).getRealtimeFileSplits()) {
      if (split.getPaths().length == 0) {
        continue;
      }
      FileInputFormat inputFormat = HoodieInputFormatUtils.getInputFormat(split.getPath(0).toString(), true, job);
      recordReaders.add(inputFormat.getRecordReader(inputSplit, job, reporter));
    }
    return new HoodieCombineRealtimeRecordReader(job, split, recordReaders);
  }
  return new HadoopShimsSecure.CombineFileRecordReader(job, split, reporter, rrClass);
}
 
Example 8
Source Project: gemfirexd-oss   Source File: GemFireXDManager.java   License: Apache License 2.0
/**
 * Make sure we do not generate a lot of data here as this will be duplicated
 * per split and sent to HAWQ master and later to datanodes.
 * 
 * The sequence in which data is written to out must match the sequence it is
 * read in {@link #readUserData()}
 * 
 * <p>
 * Only called from Fragmenter.
 * 
 * @param cSplit
 * @return
 */
public byte[] populateUserData(CombineFileSplit cSplit) throws IOException {
  // Construct user data
  ByteArrayOutputStream baos = new ByteArrayOutputStream();
  DataOutput out = new DataOutputStream(baos);

  // TODO Uncomment below statement (and its corresponding code in
  // readUserData()) when loner system is started from fragmenter as well as
  // from accessor.
  // 1. restart loner
  // out.write(RESTART_LONER_SYSTEM_CODE);
  // out.writeBoolean(this.restartLoner);

  // 2. home dir
  out.write(HOME_DIR_CODE);
  out.writeUTF(this.homeDir);

  // 3. schema.table
  out.write(SCHEMA_TABLE_NAME_CODE);
  out.writeUTF(this.schemaTableName);

  out.write(SPLIT_CODE);
  cSplit.write(out);

  // Serialize it and return
  return baos.toByteArray();
}
 
Example 9
Source Project: gemfirexd-oss   Source File: GemFireXDManager.java   License: Apache License 2.0
/**
 * Only called from the Accessor. The order of the switch cases must match
 * the order in which the data is written in
 * {@link #populateUserData(CombineFileSplit)}.
 *
 * @throws IOException
 */
public void readUserData() throws IOException {
  byte[] data = this.inputData.getFragmentMetadata();
  if (data != null && data.length > 0) {
    boolean done = false;
    ByteArrayDataInput in = new ByteArrayDataInput();
    in.initialize(data, null);
    while (!done) {
      try {
        switch (in.readByte()) {
        case HOME_DIR_CODE:
          this.homeDir = in.readUTF();
          this.logger.debug("Accessor received home dir: " + this.homeDir);
          break;
        case SCHEMA_TABLE_NAME_CODE:
          this.schemaTableName = in.readUTF();
          this.logger.debug("Accessor received schemaTable name: "
              + this.schemaTableName);
          break;
        case SPLIT_CODE:
          this.split = new CombineFileSplit();
          this.split.readFields(in);
          this.logger.debug("Accessor split read, total length: " + this.split.getLength());
          done = true;
          break;
        default:
          this.logger.error("Internal error: Invalid data from fragmenter.");
          done = true;
          break;
        }
      } catch (EOFException eofe) {
        this.logger.error("Internal error: Invalid data from fragmenter.");
        break; // from while().
      }
    }
  }
}
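
The two GemFireXDManager examples above serialize a CombineFileSplit into the fragment user data with write() and restore it with readFields(). The sketch below shows just that Writable round trip in isolation; the path, length, and host are placeholder values, not taken from the GemFireXD code.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.CombineFileSplit;

public class SplitUserDataRoundTrip {
  public static void main(String[] args) throws Exception {
    // A one-file split; path, length, and host are placeholders.
    CombineFileSplit original = new CombineFileSplit(new JobConf(),
        new Path[] { new Path("/data/part-00000") },
        new long[] { 0L }, new long[] { 64L }, new String[] { "host1" });

    // Write the split, as populateUserData() does after the SPLIT_CODE marker.
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    original.write(new DataOutputStream(baos));

    // Rebuild it from the bytes, as readUserData() does in the SPLIT_CODE case.
    CombineFileSplit copy = new CombineFileSplit();
    copy.readFields(new DataInputStream(new ByteArrayInputStream(baos.toByteArray())));

    System.out.println("total length after round trip = " + copy.getLength());
  }
}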
 
Example 10
Source Project: gemfirexd-oss   Source File: AbstractGFRecordReader.java   License: Apache License 2.0
/**
 * Initializes instance of record reader using file split and job
 * configuration
 * 
 * @param split
 * @param conf
 * @throws IOException
 */
public void initialize(CombineFileSplit split, JobConf conf) throws IOException {
  CombineFileSplit cSplit = (CombineFileSplit) split;
  Path[] path = cSplit.getPaths();
  long[] start = cSplit.getStartOffsets();
  long[] len = cSplit.getLengths();

  FileSystem fs = cSplit.getPath(0).getFileSystem(conf);
  this.splitIterator = HDFSSplitIterator.newInstance(fs, path, start, len, 0l, 0l);
}
 
Example 11
Source Project: gemfirexd-oss   Source File: GFInputFormat.java   License: Apache License 2.0
@Override
public RecordReader<GFKey, PersistedEventImpl> getRecordReader(
    InputSplit split, JobConf job, Reporter reporter) throws IOException {

  CombineFileSplit cSplit = (CombineFileSplit) split;
  AbstractGFRecordReader reader = new AbstractGFRecordReader();
  reader.initialize(cSplit, job);
  return reader;
}
 
Example 12
public ImportCombineFileRecordReader(CombineFileSplit combineFileSplit, JobConf job, Reporter
    reporter) throws IOException {
  this.combineFileSplit = combineFileSplit;
  this.job = job;
  this.reporter = reporter;

  processedPathCount = 0;
  currentRecordReader = getRecordReader(combineFileSplit.getPath(processedPathCount));
}
 
Example 13
Source Project: hudi   Source File: HoodieCombineHiveInputFormat.java   License: Apache License 2.0
/**
 * Create a generic Hive RecordReader than can iterate over all chunks in a CombinedFileSplit.
 */
@Override
public RecordReader getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
  if (!(split instanceof CombineHiveInputSplit)) {
    return super.getRecordReader(split, job, reporter);
  }

  CombineHiveInputSplit hsplit = (CombineHiveInputSplit) split;

  String inputFormatClassName = null;
  Class<?> inputFormatClass;
  try {
    inputFormatClassName = hsplit.inputFormatClassName();
    inputFormatClass = job.getClassByName(inputFormatClassName);
  } catch (Exception e) {
    throw new IOException("cannot find class " + inputFormatClassName);
  }

  pushProjectionsAndFilters(job, inputFormatClass, hsplit.getPath(0));

  if (inputFormatClass.getName().equals(HoodieParquetInputFormat.class.getName())) {
    return ShimLoader.getHadoopShims().getCombineFileInputFormat().getRecordReader(job, (CombineFileSplit) split,
        reporter, CombineHiveRecordReader.class);
  } else if (inputFormatClass.getName().equals(HoodieParquetRealtimeInputFormat.class.getName())) {
    HoodieCombineFileInputFormatShim shims = new HoodieCombineFileInputFormatShim();
    IOContextMap.get(job).setInputPath(((CombineHiveInputSplit) split).getPath(0));
    return shims.getRecordReader(job, ((CombineHiveInputSplit) split).getInputSplitShim(),
        reporter, CombineHiveRecordReader.class);
  } else {
    throw new HoodieException("Unexpected input format : " + inputFormatClassName);
  }
}
 
Example 14
Source Project: hudi   Source File: HoodieCombineRealtimeRecordReader.java   License: Apache License 2.0
public HoodieCombineRealtimeRecordReader(JobConf jobConf, CombineFileSplit split,
    List<RecordReader> readers) {
  try {
    ValidationUtils.checkArgument(((HoodieCombineRealtimeFileSplit) split).getRealtimeFileSplits().size() == readers
        .size(), "Num Splits does not match number of unique RecordReaders!");
    for (InputSplit rtSplit : ((HoodieCombineRealtimeFileSplit) split).getRealtimeFileSplits()) {
      LOG.info("Creating new RealtimeRecordReader for split");
      recordReaders.add(
          new HoodieRealtimeRecordReader((HoodieRealtimeFileSplit) rtSplit, jobConf, readers.remove(0)));
    }
    currentRecordReader = recordReaders.remove(0);
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
 
Example 15
Source Project: hadoop   Source File: TestCombineSequenceFileInputFormat.java   License: Apache License 2.0
@Test(timeout=10000)
public void testFormat() throws Exception {
  JobConf job = new JobConf(conf);

  Reporter reporter = Reporter.NULL;

  Random random = new Random();
  long seed = random.nextLong();
  LOG.info("seed = "+seed);
  random.setSeed(seed);

  localFs.delete(workDir, true);

  FileInputFormat.setInputPaths(job, workDir);

  final int length = 10000;
  final int numFiles = 10;

  // create a file with various lengths
  createFiles(length, numFiles, random);

  // create a combine split for the files
  InputFormat<IntWritable, BytesWritable> format =
    new CombineSequenceFileInputFormat<IntWritable, BytesWritable>();
  IntWritable key = new IntWritable();
  BytesWritable value = new BytesWritable();
  for (int i = 0; i < 3; i++) {
    int numSplits =
      random.nextInt(length/(SequenceFile.SYNC_INTERVAL/20))+1;
    LOG.info("splitting: requesting = " + numSplits);
    InputSplit[] splits = format.getSplits(job, numSplits);
    LOG.info("splitting: got =        " + splits.length);

    // we should have a single split as the length is comfortably smaller than
    // the block size
    assertEquals("We got more than one splits!", 1, splits.length);
    InputSplit split = splits[0];
    assertEquals("It should be CombineFileSplit",
      CombineFileSplit.class, split.getClass());

    // check each split
    BitSet bits = new BitSet(length);
    RecordReader<IntWritable, BytesWritable> reader =
      format.getRecordReader(split, job, reporter);
    try {
      while (reader.next(key, value)) {
        assertFalse("Key in multiple partitions.", bits.get(key.get()));
        bits.set(key.get());
      }
    } finally {
      reader.close();
    }
    assertEquals("Some keys in no partition.", length, bits.cardinality());
  }
}
 
Example 16
Source Project: hadoop   Source File: TestCombineTextInputFormat.java   License: Apache License 2.0
@Test(timeout=10000)
public void testFormat() throws Exception {
  JobConf job = new JobConf(defaultConf);

  Random random = new Random();
  long seed = random.nextLong();
  LOG.info("seed = "+seed);
  random.setSeed(seed);

  localFs.delete(workDir, true);
  FileInputFormat.setInputPaths(job, workDir);

  final int length = 10000;
  final int numFiles = 10;

  createFiles(length, numFiles, random);

  // create a combined split for the files
  CombineTextInputFormat format = new CombineTextInputFormat();
  LongWritable key = new LongWritable();
  Text value = new Text();
  for (int i = 0; i < 3; i++) {
    int numSplits = random.nextInt(length/20)+1;
    LOG.info("splitting: requesting = " + numSplits);
    InputSplit[] splits = format.getSplits(job, numSplits);
    LOG.info("splitting: got =        " + splits.length);

    // we should have a single split as the length is comfortably smaller than
    // the block size
    assertEquals("We got more than one splits!", 1, splits.length);
    InputSplit split = splits[0];
    assertEquals("It should be CombineFileSplit",
      CombineFileSplit.class, split.getClass());

    // check the split
    BitSet bits = new BitSet(length);
    LOG.debug("split= " + split);
    RecordReader<LongWritable, Text> reader =
      format.getRecordReader(split, job, voidReporter);
    try {
      int count = 0;
      while (reader.next(key, value)) {
        int v = Integer.parseInt(value.toString());
        LOG.debug("read " + v);
        if (bits.get(v)) {
          LOG.warn("conflict with " + v +
                   " at position "+reader.getPos());
        }
        assertFalse("Key in multiple partitions.", bits.get(v));
        bits.set(v);
        count++;
      }
      LOG.info("splits="+split+" count=" + count);
    } finally {
      reader.close();
    }
    assertEquals("Some keys in no partition.", length, bits.cardinality());
  }
}
 
Example 17
Source Project: gemfirexd-oss   Source File: EventInputFormatTest.java   License: Apache License 2.0
public void testEventInputFormat() throws Exception {
  getConnection();
  Connection conn = startNetserverAndGetLocalNetConnection();
  
  Statement st = conn.createStatement();
  st.execute("create hdfsstore myhdfs namenode 'localhost' homedir '" + HDFS_DIR + "' batchtimeinterval 5000 milliseconds");
  st.execute("create table app.mytab1 (col1 int primary key, col2 varchar(100)) persistent hdfsstore (myhdfs) BUCKETS 1");

  PreparedStatement ps = conn.prepareStatement("insert into mytab1 values (?, ?)");
  int NUM_ENTRIES = 20;
  for(int i = 0; i < NUM_ENTRIES; i++) {
    ps.setInt(1, i);
    ps.setString(2, "Value-" + System.nanoTime());
    ps.execute();
  }
  //Wait for data to get to HDFS...
  String qname = HDFSStoreFactoryImpl.getEventQueueName("/APP/MYTAB1");
  st.execute("CALL SYS.WAIT_FOR_SENDER_QUEUE_FLUSH('" + qname + "', 1, 0)");
  
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);
  FileStatus[] list = fs.listStatus(new Path(HDFS_DIR + "/APP_MYTAB1/0/"));
  assertEquals(1, list.length);
  
  conf.set(RowInputFormat.INPUT_TABLE, "MYTAB1");
  conf.set(RowInputFormat.HOME_DIR, HDFS_DIR);
  
  JobConf job = new JobConf(conf);
  job.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);
  RowInputFormat ipformat = new RowInputFormat();
  InputSplit[] splits = ipformat.getSplits(job, 2);
  assertEquals(1, splits.length);
  CombineFileSplit split = (CombineFileSplit) splits[0];
  assertEquals(1, split.getPaths().length);
  assertEquals(list[0].getPath().toString(), split.getPath(0).toString());
  assertEquals(0, split.getOffset(0));
  assertEquals(list[0].getLen(), split.getLength(0));

  RecordReader<Key, Row> rr = ipformat.getRecordReader(split, job, null);
  Key key = rr.createKey();
  Row value = rr.createValue();

  int count = 0;
  while (rr.next(key, value)) {
    assertEquals(count++, value.getRowAsResultSet().getInt("col1"));
  }
  
  assertEquals(20, count);
  
  TestUtil.shutDown();
}
 
Example 18
Source Project: gemfirexd-oss   Source File: EventInputFormatTest.java   License: Apache License 2.0
public void testNoSecureHdfsCheck() throws Exception {
  getConnection();
  Connection conn = startNetserverAndGetLocalNetConnection();
  
  
  Statement st = conn.createStatement();
  st.execute("create hdfsstore myhdfs namenode 'localhost' homedir '" + HDFS_DIR + "'  batchtimeinterval 5000 milliseconds");
  st.execute("create table app.mytab1 (col1 int primary key, col2 varchar(100)) persistent hdfsstore (myhdfs) BUCKETS 1");
  
  PreparedStatement ps = conn.prepareStatement("insert into mytab1 values (?, ?)");
  int NUM_ENTRIES = 20;
  for(int i = 0; i < NUM_ENTRIES; i++) {
    ps.setInt(1, i);
    ps.setString(2, "Value-" + System.nanoTime());
    ps.execute();
  }
  //Wait for data to get to HDFS...
  String qname = HDFSStoreFactoryImpl.getEventQueueName("/APP/MYTAB1");
  st.execute("CALL SYS.WAIT_FOR_SENDER_QUEUE_FLUSH('" + qname + "', 1, 0)");
  
  stopNetServer();
  FabricServiceManager.currentFabricServiceInstance().stop(new Properties());
  
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);
  FileStatus[] list = fs.listStatus(new Path(HDFS_DIR + "/APP_MYTAB1/0/"));
  assertEquals(1, list.length);
  
  conf.set(RowInputFormat.INPUT_TABLE, "MYTAB1");
  conf.set(RowInputFormat.HOME_DIR, HDFS_DIR);
  conf.set("hadoop.security.authentication", "kerberos");
  
  JobConf job = new JobConf(conf);
  job.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);
  RowInputFormat ipformat = new RowInputFormat();
  InputSplit[] splits = ipformat.getSplits(job, 2);
  assertEquals(1, splits.length);
  CombineFileSplit split = (CombineFileSplit) splits[0];
  assertEquals(1, split.getPaths().length);
  assertEquals(list[0].getPath().toString(), split.getPath(0).toString());
  assertEquals(0, split.getOffset(0));
  assertEquals(list[0].getLen(), split.getLength(0));
  
  RecordReader<Key, Row> rr = ipformat.getRecordReader(split, job, null);
  Key key = rr.createKey();
  Row value = rr.createValue();
  
  int count = 0;
  while (rr.next(key, value)) {
    assertEquals(count++, value.getRowAsResultSet().getInt("col1"));
  }
  
  assertEquals(20, count);
  
  TestUtil.shutDown();
}
 
Example 19
Source Project: gemfirexd-oss   Source File: TestDataHelper.java   License: Apache License 2.0
private static InputSplit getCombineSplit(Path path, long l, long m, String[] strings) {
  Path[] paths = {path};
  long[] offsets = {l};
  long[] lengths = {m};
  return new CombineFileSplit(null, paths, offsets, lengths, strings);
}
 
Example 20
Source Project: gemfirexd-oss   Source File: TestGemFireXDManager.java   License: Apache License 2.0
@Override
public byte[] populateUserData(CombineFileSplit fileSplit) throws IOException {
  return null;
}
 
Example 21
Source Project: gemfirexd-oss   Source File: GFXDHiveInputFormat.java   License: Apache License 2.0
@Override
protected InputSplit getSplit(CombineFileSplit split) {
  return new GFXDHiveSplit(split);
}
 
Example 22
Source Project: gemfirexd-oss   Source File: GFXDHiveSplit.java   License: Apache License 2.0
public GFXDHiveSplit(CombineFileSplit fileSplit) {
  this.combineFileSplit = fileSplit;
}