org.apache.hadoop.mapred.InputSplit Java Examples

The following examples show how to use org.apache.hadoop.mapred.InputSplit. Each example is taken from an open-source project; the source file, project, and license are noted above each snippet.
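Before the project examples, here is a minimal, self-contained sketch of the typical lifecycle of the old mapred API: an InputFormat computes InputSplit instances, and each split is handed to a RecordReader that streams key/value pairs. The file path, the split-count hint, and the use of TextInputFormat are illustrative assumptions, not taken from any of the projects below.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class InputSplitLifecycleSketch {

  public static void main(String[] args) throws IOException {
    JobConf job = new JobConf();
    // Illustrative input path; point this at a real local or HDFS file.
    FileInputFormat.setInputPaths(job, new Path("/tmp/sample.txt"));

    TextInputFormat format = new TextInputFormat();
    format.configure(job);

    // Ask the InputFormat for splits; the hint (1) may be ignored by the implementation.
    InputSplit[] splits = format.getSplits(job, 1);

    for (InputSplit split : splits) {
      // Each split is turned into a RecordReader that iterates key/value pairs.
      RecordReader<LongWritable, Text> reader =
          format.getRecordReader(split, job, Reporter.NULL);
      LongWritable key = reader.createKey();
      Text value = reader.createValue();
      try {
        while (reader.next(key, value)) {
          System.out.println(key.get() + "\t" + value);
        }
      } finally {
        reader.close();
      }
    }
  }
}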
Example #1
Source File: HiveWarehouseDataSourceReader.java    From spark-llap with Apache License 2.0
protected List<DataReaderFactory<ColumnarBatch>> getSplitsFactories(String query) {
  List<DataReaderFactory<ColumnarBatch>> tasks = new ArrayList<>();
  try {
    JobConf jobConf = JobUtil.createJobConf(options, query);
    LlapBaseInputFormat llapInputFormat = new LlapBaseInputFormat(false, Long.MAX_VALUE);
    //numSplits arg not currently supported, use 1 as dummy arg
    InputSplit[] splits = llapInputFormat.getSplits(jobConf, 1);
    for (InputSplit split : splits) {
      tasks.add(getDataReaderFactory(split, jobConf, getArrowAllocatorMax()));
    }
  } catch (IOException e) {
    LOG.error("Unable to submit query to HS2");
    throw new RuntimeException(e);
  }
  return tasks;
}
 
Example #2
Source File: OrcInputFormat.java    From hive-dwrf with Apache License 2.0
@Override
public RecordReader<NullWritable, OrcLazyRow>
    getRecordReader(InputSplit inputSplit, JobConf conf,
                    Reporter reporter) throws IOException {
  ReaderWriterProfiler.setProfilerOptions(conf);
  FileSplit fileSplit = (FileSplit) inputSplit;
  Path path = fileSplit.getPath();
  FileSystem fs = path.getFileSystem(conf);
  reporter.setStatus(fileSplit.toString());

  return new OrcRecordReader(
      OrcFile.createReader(fs, path, conf),
      conf,
      fileSplit.getStart(),
      fileSplit.getLength()
  );
}
 
Example #3
Source File: ImportRecordReaderFactory.java    From emr-dynamodb-connector with Apache License 2.0
static RecordReader<NullWritable, DynamoDBItemWritable> getRecordReader(
    InputSplit inputSplit, JobConf job, Reporter reporter) throws IOException {
  // CombineFileSplit indicates the new export format which includes a manifest file
  if (inputSplit instanceof CombineFileSplit) {
    int version = job.getInt(DynamoDBConstants.EXPORT_FORMAT_VERSION, -1);
    if (version != ExportManifestRecordWriter.FORMAT_VERSION) {
      throw new IOException("Unknown version: " + job.get(DynamoDBConstants
          .EXPORT_FORMAT_VERSION));
    }
    return new ImportCombineFileRecordReader((CombineFileSplit) inputSplit, job, reporter);
  } else if (inputSplit instanceof FileSplit) {
    // FileSplit indicates the old data pipeline format which doesn't include a manifest file
    Path path = ((FileSplit) inputSplit).getPath();
    return new ImportRecordReader(job, path);
  } else {
    throw new IOException("Expecting CombineFileSplit or FileSplit but the input split type is:"
        + " " + inputSplit.getClass());
  }
}
 
Example #4
Source File: HiveDynamoDBInputFormat.java    From emr-dynamodb-connector with Apache License 2.0
@Override
public RecordReader<Text, DynamoDBItemWritable> getRecordReader(InputSplit split, JobConf conf,
    Reporter reporter) throws
    IOException {
  reporter.progress();

  Map<String, String> columnMapping =
      HiveDynamoDBUtil.fromJsonString(conf.get(DynamoDBConstants.DYNAMODB_COLUMN_MAPPING));
  Map<String, String> hiveTypeMapping = HiveDynamoDBUtil.extractHiveTypeMapping(conf);
  DynamoDBQueryFilter queryFilter = getQueryFilter(conf, columnMapping, hiveTypeMapping);
  DynamoDBSplit bbSplit = (DynamoDBSplit) split;
  bbSplit.setDynamoDBFilterPushdown(queryFilter);

  Collection<String> attributes = (columnMapping == null ? null : columnMapping.values());
  DynamoDBRecordReaderContext context = buildHiveDynamoDBRecordReaderContext(bbSplit, conf,
      reporter, attributes);
  return new DefaultDynamoDBRecordReader(context);
}
 
Example #5
Source File: AbstractEvaluatorToPartitionStrategy.java    From reef with Apache License 2.0
/**
 * Allocates the first available split into the evaluator.
 *
 * @param evaluatorId
 *          the evaluator id
 * @param value
 *          the queue of splits
 * @return a numberedSplit or null if it cannot find one
 */
protected NumberedSplit<InputSplit> allocateSplit(final String evaluatorId,
    final BlockingQueue<NumberedSplit<InputSplit>> value) {
  if (value == null) {
    LOG.log(Level.FINE, "Queue of splits can't be empty. Returning null");
    return null;
  }
  while (true) {
    final NumberedSplit<InputSplit> split = value.poll();
    if (split == null) {
      return null;
    }
    if (value == unallocatedSplits || unallocatedSplits.remove(split)) {
      LOG.log(Level.FINE, "Found split-" + split.getIndex() + " in the queue");
      final NumberedSplit<InputSplit> old = evaluatorToSplits.putIfAbsent(evaluatorId, split);
      if (old != null) {
        throw new RuntimeException("Trying to assign different splits to the same evaluator is not supported");
      } else {
        LOG.log(Level.FINE, "Returning " + split.getIndex());
        return split;
      }
    }
  }
}
 
Example #6
Source File: FrameReaderTextCell.java    From systemds with Apache License 2.0
protected void readTextCellFrameFromHDFS( Path path, JobConf job, FileSystem fs, FrameBlock dest, 
		ValueType[] schema, String[] names, long rlen, long clen)
	throws IOException
{
	if( fs.isDirectory(path) ) {
		FileInputFormat.addInputPath(job, path);
		TextInputFormat informat = new TextInputFormat();
		informat.configure(job);
		InputSplit[] splits = informat.getSplits(job, 1);
		for(InputSplit split: splits)
			readTextCellFrameFromInputSplit(split, informat, job, dest);
	}
	else {
		readRawTextCellFrameFromHDFS(path, job, fs, dest, schema, names, rlen, clen);
	}
}
 
Example #7
Source File: MR4CInputFormatTest.java    From mr4c with Apache License 2.0
private void doTest(List<List<String>> frameSplits, int overlapBefore, int overlapAfter, Integer chunkSize) throws Exception {
	AlgorithmConfig algoConfig = m_mgr.getExecutionSource().getAlgorithmConfig();
	algoConfig.addDimension(new DimensionConfig("frame", true, overlapBefore, overlapAfter, null, chunkSize, false));
	algoConfig.addDimension(new DimensionConfig("type", false, 0, 0, null, null, false));

	Set<Set<DataKey>> expectedKeySplits = buildExpectedSplits(frameSplits);
	MR4CInputFormat format = new MR4CInputFormat();
	InputSplit[] splits = format.getSplits( m_mgr.getExecutionSource(), 4);

	Set<Set<DataKey>> actualKeySplits=  new HashSet<Set<DataKey>>();
	for ( InputSplit split : splits ) {
		MR4CInputSplit bbSplit = (MR4CInputSplit) split;
		actualKeySplits.add(new HashSet<DataKey>(bbSplit.getKeys().getKeys()));
	}
	assertEquals(expectedKeySplits, actualKeySplits);
}
 
Example #8
Source File: HadoopInputFormatTest.java    From flink with Apache License 2.0
@Test
public void testOpenWithConfigurableReader() throws Exception {
	ConfigurableDummyRecordReader recordReader = mock(ConfigurableDummyRecordReader.class);
	DummyInputFormat inputFormat = mock(DummyInputFormat.class);
	when(inputFormat.getRecordReader(any(InputSplit.class), any(JobConf.class), any(Reporter.class))).thenReturn(recordReader);

	HadoopInputFormat<String, Long> hadoopInputFormat = new HadoopInputFormat<>(inputFormat, String.class, Long.class, new JobConf());
	hadoopInputFormat.open(getHadoopInputSplit());

	verify(inputFormat, times(1)).getRecordReader(any(InputSplit.class), any(JobConf.class), any(Reporter.class));
	verify(recordReader, times(1)).setConf(any(JobConf.class));
	verify(recordReader, times(1)).createKey();
	verify(recordReader, times(1)).createValue();

	assertThat(hadoopInputFormat.fetched, is(false));

}
 
Example #9
Source File: HadoopInputFormatTest.java    From flink with Apache License 2.0
@Test
public void testOpenClose() throws Exception {
	DummyRecordReader recordReader = mock(DummyRecordReader.class);
	DummyInputFormat inputFormat = mock(DummyInputFormat.class);
	when(inputFormat.getRecordReader(any(InputSplit.class), any(JobConf.class), any(Reporter.class))).thenReturn(recordReader);

	HadoopInputFormat<String, Long> hadoopInputFormat = new HadoopInputFormat<>(inputFormat, String.class, Long.class, new JobConf());
	hadoopInputFormat.open(getHadoopInputSplit());

	verify(inputFormat, times(1)).getRecordReader(any(InputSplit.class), any(JobConf.class), any(Reporter.class));
	verify(recordReader, times(1)).createKey();
	verify(recordReader, times(1)).createValue();

	assertThat(hadoopInputFormat.fetched, is(false));

	hadoopInputFormat.close();
	verify(recordReader, times(1)).close();
}
 
Example #10
Source File: HdfsDataFragmenter.java    From pxf with Apache License 2.0
protected List<InputSplit> getSplits(Path path) throws IOException {
    PxfInputFormat pxfInputFormat = new PxfInputFormat();
    PxfInputFormat.setInputPaths(jobConf, path);
    InputSplit[] splits = pxfInputFormat.getSplits(jobConf, 1);
    List<InputSplit> result = new ArrayList<>();

    /*
     * HD-2547: If the file is empty, an empty split is returned: no
     * locations and no length.
     */
    if (splits != null) {
        for (InputSplit split : splits) {
            if (split.getLength() > 0) {
                result.add(split);
            }
        }
    }

    return result;
}
 
Example #11
Source File: EthereumFormatHadoopTest.java    From hadoopcryptoledger with Apache License 2.0
@Test
public void readEthereumBlockInputFormatBlock3346406() throws IOException, EthereumBlockReadException, ParseException, InterruptedException {
  JobConf job = new JobConf(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "eth3346406.bin";
  String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
  Path file = new Path(fileNameBlock);
  FileInputFormat.setInputPaths(job, file);
  EthereumBlockFileInputFormat format = new EthereumBlockFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);

  assertEquals(1, inputSplits.length, "Only one split generated for block 3346406");
  RecordReader<BytesWritable, EthereumBlock> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNotNull(reader, "Format returned null RecordReader");

  BytesWritable key = new BytesWritable();
  EthereumBlock block = new EthereumBlock();
  assertTrue(reader.next(key, block), "Input split for block 3346406 contains at least one block");
  assertEquals(7, block.getEthereumTransactions().size(), "Block 3346406 must have 7 transactions");
  assertFalse(reader.next(key, block), "No further blocks in block 3346406");
  reader.close();
}
 
Example #12
Source File: SplittableXmlInputFormat.java    From Hive-XML-SerDe with Apache License 2.0
@Override
public RecordReader<LongWritable, Text> getRecordReader(InputSplit inputSplit, JobConf job, Reporter reporter) throws IOException {

    InputStream inputStream;
    try {
        inputStream = getInputStream(job, (FileSplit) inputSplit);
    } catch (ClassNotFoundException e) {
        // Rethrow rather than swallow; a swallowed exception would leave inputStream null.
        throw new IOException(e);
    }
    long start = ((FileSplit) inputSplit).getStart();
    long end = start + inputSplit.getLength();

    return new HiveXmlRecordReader(job, inputStream, start, end);
}
 
Example #13
Source File: StormParsedInputFormat.java    From incubator-retired-mrql with Apache License 2.0
@Override
public RecordReader<MRContainer, MRContainer> getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
    StormEvaluator.load_source_dir();  // load the parsed source parameters from a file
    String path = ((FileSplit)split).getPath().toString();
    ParsedDataSource ds = (ParsedDataSource)DataSource.get(path,Plan.conf);
    return new ParsedRecordReader((FileSplit)split,job,ds.parser,(Trees)ds.args);
}
 
Example #14
Source File: TezGroupedSplit.java    From incubator-tez with Apache License 2.0
public void addSplit(InputSplit split) {
  wrappedSplits.add(split);
  try {
    length += split.getLength();
  } catch (Exception e) {
    throw new TezUncheckedException(e);
  }
}
 
Example #15
Source File: ParquetRecordReaderWrapper.java    From parquet-mr with Apache License 2.0
public ParquetRecordReaderWrapper(
    final ParquetInputFormat<ArrayWritable> newInputFormat,
    final InputSplit oldSplit,
    final JobConf oldJobConf,
    final Reporter reporter)
        throws IOException, InterruptedException {
  this(newInputFormat, oldSplit, oldJobConf, reporter,
      (new HiveBindingFactory()).create());
}
 
Example #16
Source File: DBInputFormat.java    From hadoop-gpu with Apache License 2.0
/** {@inheritDoc} */
@SuppressWarnings("unchecked")
public RecordReader<LongWritable, T> getRecordReader(InputSplit split,
    JobConf job, Reporter reporter) throws IOException {

  Class inputClass = dbConf.getInputClass();
  try {
    return new DBRecordReader((DBInputSplit) split, inputClass, job);
  }
  catch (SQLException ex) {
    throw new IOException(ex.getMessage());
  }
}
 
Example #17
Source File: MRInputHelpers.java    From tez with Apache License 2.0
@SuppressWarnings({ "rawtypes", "unchecked" })
private static org.apache.hadoop.mapred.InputSplit[] generateOldSplits(
    JobConf jobConf, boolean groupSplits, boolean sortSplits, int numTasks)
    throws IOException {

  // This is the real InputFormat
  org.apache.hadoop.mapred.InputFormat inputFormat;
  try {
    inputFormat = jobConf.getInputFormat();
  } catch (Exception e) {
    throw new TezUncheckedException(e);
  }

  org.apache.hadoop.mapred.InputFormat finalInputFormat = inputFormat;

  if (groupSplits) {
    org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat groupedFormat =
        new org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat();
    groupedFormat.setConf(jobConf);
    groupedFormat.setInputFormat(inputFormat);
    groupedFormat.setDesiredNumberOfSplits(numTasks);
    finalInputFormat = groupedFormat;
  } else {
    finalInputFormat = inputFormat;
  }
  org.apache.hadoop.mapred.InputSplit[] splits = finalInputFormat
      .getSplits(jobConf, jobConf.getNumMapTasks());
  if (sortSplits) {
    // sort the splits into order based on size, so that the biggest
    // go first
    Arrays.sort(splits, new OldInputSplitComparator());
  }
  return splits;
}
 
Example #18
Source File: BinaryProtocol.java    From hadoop with Apache License 2.0
public void runMap(InputSplit split, int numReduces, 
                   boolean pipedInput) throws IOException {
  WritableUtils.writeVInt(stream, MessageType.RUN_MAP.code);
  writeObject(split);
  WritableUtils.writeVInt(stream, numReduces);
  WritableUtils.writeVInt(stream, pipedInput ? 1 : 0);
}
 
Example #19
Source File: InputFormatGrakn.java    From grakn with GNU Affero General Public License v3.0
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
    TaskAttemptContext tac = HadoopCompat.newTaskAttemptContext(jobConf, new TaskAttemptID());
    List<org.apache.hadoop.mapreduce.InputSplit> newInputSplits = this.getSplits(tac);
    InputSplit[] oldInputSplits = new InputSplit[newInputSplits.size()];
    for (int i = 0; i < newInputSplits.size(); i++) {
        oldInputSplits[i] = (ColumnFamilySplit) newInputSplits.get(i);
    }
    return oldInputSplits;
}
 
Example #20
Source File: CompositeInputSplit.java    From big-c with Apache License 2.0
/**
 * Collect a set of hosts from all child InputSplits.
 */
public String[] getLocations() throws IOException {
  HashSet<String> hosts = new HashSet<String>();
  for (InputSplit s : splits) {
    String[] hints = s.getLocations();
    if (hints != null && hints.length > 0) {
      for (String host : hints) {
        hosts.add(host);
      }
    }
  }
  return hosts.toArray(new String[hosts.size()]);
}
 
Example #21
Source File: LoadGeneratorMR.java    From hadoop with Apache License 2.0
public RecordReader<LongWritable, Text> getRecordReader(
    InputSplit ignored, JobConf conf, Reporter reporter) throws IOException {

  return new RecordReader<LongWritable, Text>() {

    boolean sentOneRecord = false;

    public boolean next(LongWritable key, Text value)
        throws IOException {
      key.set(1);
      value.set("dummy");
      if (sentOneRecord == false) { // first call
        sentOneRecord = true;
        return true;
      }
      return false; // we have sent one record - we are done
    }

    public LongWritable createKey() {
      return new LongWritable();
    }
    public Text createValue() {
      return new Text();
    }
    public long getPos() throws IOException {
      return 1;
    }
    public void close() throws IOException {
    }
    public float getProgress() throws IOException {
      return 1;
    }
  };
}
 
Example #22
Source File: DummyInputFormat.java    From hadoop with Apache License 2.0
public RecordReader<Object, Object> getRecordReader(InputSplit split,
    JobConf job, Reporter reporter) throws IOException {
  return new RecordReader<Object, Object>() {

    boolean once = false;

    public boolean next(Object key, Object value) throws IOException {
      if (!once) {
        once = true;
        return true;
      }
      return false;
    }

    public Object createKey() {
      return new Object();
    }

    public Object createValue() {
      return new Object();
    }

    public long getPos() throws IOException {
      return 0L;
    }

    public void close() throws IOException {
    }

    public float getProgress() throws IOException {
      return 0.0f;
    }
  };
}
 
Example #23
Source File: TezGroupedSplit.java    From incubator-tez with Apache License 2.0
public TezGroupedSplit(int numSplits, String wrappedInputFormatName,
    String[] locations, String rack) {
  this.wrappedSplits = new ArrayList<InputSplit>(numSplits);
  this.wrappedInputFormatName = wrappedInputFormatName;
  this.locations = locations;
  this.rack = rack;
}
 
Example #24
Source File: GFInputFormat.java    From gemfirexd-oss with Apache License 2.0
@Override
public RecordReader<GFKey, PersistedEventImpl> getRecordReader(
    InputSplit split, JobConf job, Reporter reporter) throws IOException {

  CombineFileSplit cSplit = (CombineFileSplit) split;
  AbstractGFRecordReader reader = new AbstractGFRecordReader();
  reader.initialize(cSplit, job);
  return reader;
}
 
Example #25
Source File: BackgroundHiveSplitLoader.java    From presto with Apache License 2.0
private ListenableFuture<?> addSplitsToSource(InputSplit[] targetSplits, InternalHiveSplitFactory splitFactory)
        throws IOException
{
    ListenableFuture<?> lastResult = COMPLETED_FUTURE;
    for (InputSplit inputSplit : targetSplits) {
        Optional<InternalHiveSplit> internalHiveSplit = splitFactory.createInternalHiveSplit((FileSplit) inputSplit);
        if (internalHiveSplit.isPresent()) {
            lastResult = hiveSplitSource.addToQueue(internalHiveSplit.get());
        }
        if (stopped) {
            return COMPLETED_FUTURE;
        }
    }
    return lastResult;
}
 
Example #26
Source File: MRReaderMapred.java    From incubator-tez with Apache License 2.0
public MRReaderMapred(JobConf jobConf, InputSplit inputSplit, TezCounters tezCounters,
    TezCounter inputRecordCounter) throws IOException {
  this.jobConf = jobConf;
  this.tezCounters = tezCounters;
  this.inputRecordCounter = inputRecordCounter;
  inputFormat = this.jobConf.getInputFormat();
  if (inputSplit != null) {
    this.inputSplit = inputSplit;
    setupOldRecordReader();
  }
}
 
Example #27
Source File: DelegatingInputFormat.java    From RDFS with Apache License 2.0
@SuppressWarnings("unchecked")
public RecordReader<K, V> getRecordReader(InputSplit split, JobConf conf,
    Reporter reporter) throws IOException {

  // Find the InputFormat and then the RecordReader from the
  // TaggedInputSplit.

  TaggedInputSplit taggedInputSplit = (TaggedInputSplit) split;
  InputFormat<K, V> inputFormat = (InputFormat<K, V>) ReflectionUtils
     .newInstance(taggedInputSplit.getInputFormatClass(), conf);
  return inputFormat.getRecordReader(taggedInputSplit.getInputSplit(), conf,
     reporter);
}
 
Example #28
Source File: ReaderTextCellParallel.java    From systemds with Apache License 2.0
public ReadTask( InputSplit split, TextInputFormat informat, JobConf job, MatrixBlock dest, long rlen, long clen, boolean mm, FileFormatPropertiesMM mmProps ) {
	_split = split;
	_sparse = dest.isInSparseFormat();
	_informat = informat;
	_job = job;
	_dest = dest;
	_rlen = rlen;
	_clen = clen;
	_matrixMarket = mm;
	_mmProps = mmProps;
}
 
Example #29
Source File: DBInputFormat.java    From hadoop with Apache License 2.0
/** {@inheritDoc} */
public InputSplit[] getSplits(JobConf job, int chunks) throws IOException {
  List<org.apache.hadoop.mapreduce.InputSplit> newSplits = 
    super.getSplits(Job.getInstance(job));
  InputSplit[] ret = new InputSplit[newSplits.size()];
  int i = 0;
  for (org.apache.hadoop.mapreduce.InputSplit s : newSplits) {
    org.apache.hadoop.mapreduce.lib.db.DBInputFormat.DBInputSplit split =
        (org.apache.hadoop.mapreduce.lib.db.DBInputFormat.DBInputSplit) s;
    ret[i++] = new DBInputSplit(split.getStart(), split.getEnd());
  }
  return ret;
}
 
Example #30
Source File: TestInputOutputFormat.java    From hive-dwrf with Apache License 2.0
@Test
public void testEmptyFile() throws Exception {
  JobConf job = new JobConf(conf);
  Properties properties = new Properties();
  HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
  FileSinkOperator.RecordWriter writer =
      outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true,
          properties, Reporter.NULL);
  writer.close(true);
  properties.setProperty("columns", "x,y");
  properties.setProperty("columns.types", "int:int");
  SerDe serde = new OrcSerde();
  serde.initialize(conf, properties);
  InputFormat<?,?> in = new OrcInputFormat();
  FileInputFormat.setInputPaths(conf, testFilePath.toString());
  InputSplit[] splits = in.getSplits(conf, 1);
  assertEquals(1, splits.length);

  // read the whole file
  conf.set("hive.io.file.readcolumn.ids", "0,1");
  org.apache.hadoop.mapred.RecordReader reader =
      in.getRecordReader(splits[0], conf, Reporter.NULL);
  Object key = reader.createKey();
  Object value = reader.createValue();
  assertEquals(0.0, reader.getProgress(), 0.00001);
  assertEquals(0, reader.getPos());
  assertEquals(false, reader.next(key, value));
  reader.close();
  assertEquals(null, serde.getSerDeStats());
}