org.apache.hadoop.mapred.RecordReader Java Examples
The following examples show how to use
org.apache.hadoop.mapred.RecordReader.
Each example is taken from an open-source project; the project and source file it comes from are noted above each snippet.
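All of the examples below follow the same basic pattern of the old mapred API: an InputFormat produces InputSplits, getRecordReader() turns a split into a RecordReader, and the caller loops with createKey()/createValue()/next() until next() returns false, then closes the reader. The following is a minimal, self-contained sketch of that driver loop using the built-in TextInputFormat; the input path and class name are placeholders and are not taken from any of the projects below.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class RecordReaderSketch {
  public static void main(String[] args) throws IOException {
    JobConf job = new JobConf();
    // placeholder path; point it at any local text file
    FileInputFormat.setInputPaths(job, new Path("/tmp/input.txt"));

    TextInputFormat format = new TextInputFormat();
    format.configure(job);

    // ask the InputFormat for splits, then drive a RecordReader over each split
    for (InputSplit split : format.getSplits(job, 1)) {
      RecordReader<LongWritable, Text> reader = format.getRecordReader(split, job, Reporter.NULL);
      LongWritable key = reader.createKey(); // byte offset of the line
      Text value = reader.createValue();     // the line itself
      try {
        while (reader.next(key, value)) {    // next() reuses the key/value objects
          System.out.println(key.get() + "\t" + value);
        }
      } finally {
        reader.close();
      }
    }
  }
}

Note that next() reuses the key and value objects rather than allocating new ones per record, which is why the examples below create them once before the read loop.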
Example #1
Source File: OfficeFormatHadoopExcelLowFootPrintStaXTest.java From hadoopoffice with Apache License 2.0
@Test
public void readExcelInputFormatExcel2013SingleSheetEncryptedNegativeLowFootprint() throws IOException {
  JobConf job = new JobConf(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "excel2013encrypt.xlsx";
  String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
  Path file = new Path(fileNameSpreadSheet);
  FileInputFormat.setInputPaths(job, file);
  // set locale to the one of the test data
  job.set("hadoopoffice.read.locale.bcp47", "de");
  // low footprint
  job.set("hadoopoffice.read.lowFootprint", "true");
  job.set("hadoopoffice.read.lowFootprint.parser", "stax");
  // for decryption simply set the password
  job.set("hadoopoffice.read.security.crypt.password", "test2");
  ExcelFileInputFormat format = new ExcelFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);
  assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
  RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNull(reader, "Null record reader implies invalid password");
}
Example #2
Source File: ExcelCellFileInputFormat.java From hadoopoffice with Apache License 2.0
@Override
public RecordReader<Text, SpreadSheetCellDAO> getRecordReader(InputSplit split, JobConf job, Reporter reporter)
    throws IOException {
  /** Create reader **/
  try {
    // send configuration option to MS Excel. The format of the Excel file (old vs. new) is detected automatically
    job.set(HadoopOfficeReadConfiguration.CONF_MIMETYPE, "ms-excel");
    return new ExcelCellRecordReader((FileSplit) split, job, reporter);
  } catch (FormatNotUnderstoodException e) {
    // log
    LOGIF.error(e);
  } catch (GeneralSecurityException gse) {
    LOGIF.error(gse);
  }
  return null;
}
Example #3
Source File: RealtimeUnmergedRecordReader.java From hudi with Apache License 2.0
/**
 * Construct an unmerged record reader that consumes both parquet and log records in parallel and buffers them
 * for upstream clients to consume.
 *
 * @param split File split
 * @param job Job Configuration
 * @param realReader Parquet Reader
 */
public RealtimeUnmergedRecordReader(HoodieRealtimeFileSplit split, JobConf job,
    RecordReader<NullWritable, ArrayWritable> realReader) {
  super(split, job);
  this.parquetReader = new SafeParquetRecordReaderWrapper(realReader);
  // Iterator for consuming records from parquet file
  this.parquetRecordsIterator = new RecordReaderValueIterator<>(this.parquetReader);
  this.executor = new BoundedInMemoryExecutor<>(getMaxCompactionMemoryInBytes(), getParallelProducers(),
      Option.empty(), x -> x, new DefaultSizeEstimator<>());
  // Consumer of this record reader
  this.iterator = this.executor.getQueue().iterator();
  this.logRecordScanner = new HoodieUnMergedLogRecordScanner(FSUtils.getFs(split.getPath().toString(), jobConf),
      split.getBasePath(), split.getDeltaLogPaths(), getReaderSchema(), split.getMaxCommitTime(),
      Boolean.parseBoolean(jobConf.get(HoodieRealtimeConfig.COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP,
          HoodieRealtimeConfig.DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED)),
      false, jobConf.getInt(HoodieRealtimeConfig.MAX_DFS_STREAM_BUFFER_SIZE_PROP,
          HoodieRealtimeConfig.DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE),
      record -> {
        // convert Hoodie log record to Hadoop AvroWritable and buffer
        GenericRecord rec = (GenericRecord) record.getData().getInsertValue(getReaderSchema()).get();
        ArrayWritable aWritable = (ArrayWritable) HoodieRealtimeRecordReaderUtils.avroToArrayWritable(rec, getHiveSchema());
        this.executor.getQueue().insertRecord(aWritable);
      });
  // Start reading and buffering
  this.executor.startProducers();
}
Example #4
Source File: OfficeFormatHadoopExcelLowFootPrintSAXTest.java From hadoopoffice with Apache License 2.0
@Test
public void readExcelInputFormatExcel2003SingleSheetEncryptedNegativeLowFootprint() throws IOException {
  JobConf job = new JobConf(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "excel2003encrypt.xls";
  String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
  Path file = new Path(fileNameSpreadSheet);
  FileInputFormat.setInputPaths(job, file);
  // set locale to the one of the test data
  job.set("hadoopoffice.read.locale.bcp47", "de");
  // low footprint
  job.set("hadoopoffice.read.lowFootprint", "true");
  // for decryption simply set the password
  job.set("hadoopoffice.read.security.crypt.password", "test2");
  ExcelFileInputFormat format = new ExcelFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);
  assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
  RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNull(reader, "Null record reader implies invalid password");
}
Example #5
Source File: ReaderTextCellParallel.java From systemds with Apache License 2.0
@Override
public Void call() throws Exception {
  LongWritable key = new LongWritable();
  Text value = new Text();
  FastStringTokenizer st = new FastStringTokenizer(' ');

  RecordReader<LongWritable, Text> reader = _informat.getRecordReader(_split, _job, Reporter.NULL);
  try {
    // counting without locking as conflicts unlikely
    while (reader.next(key, value)) {
      if (value.toString().charAt(0) == '%')
        continue;
      st.reset(value.toString());
      _rNnz[(int) st.nextLong() - 1]++;
      if (_isSymmetric)
        _rNnz[(int) st.nextLong() - 1]++;
    }
  } finally {
    IOUtilFunctions.closeSilently(reader);
  }
  return null;
}
Example #6
Source File: HoodieParquetInputFormat.java From hudi with Apache License 2.0
@Override
public RecordReader<NullWritable, ArrayWritable> getRecordReader(final InputSplit split, final JobConf job,
    final Reporter reporter) throws IOException {
  // TODO enable automatic predicate pushdown after fixing issues
  // FileSplit fileSplit = (FileSplit) split;
  // HoodieTableMetadata metadata = getTableMetadata(fileSplit.getPath().getParent());
  // String tableName = metadata.getTableName();
  // String mode = HoodieHiveUtil.readMode(job, tableName);

  // if (HoodieHiveUtil.INCREMENTAL_SCAN_MODE.equals(mode)) {
  //   FilterPredicate predicate = constructHoodiePredicate(job, tableName, split);
  //   LOG.info("Setting parquet predicate push down as " + predicate);
  //   ParquetInputFormat.setFilterPredicate(job, predicate);
  //   clearOutExistingPredicate(job);
  // }
  return super.getRecordReader(split, job, reporter);
}
Example #7
Source File: TestDBInputFormat.java From hadoop with Apache License 2.0
/**
 * Test the DBInputFormat class. The class should split the result into chunks.
 * @throws Exception
 */
@Test(timeout = 10000)
public void testDBInputFormat() throws Exception {
  JobConf configuration = new JobConf();
  setupDriver(configuration);

  DBInputFormat<NullDBWritable> format = new DBInputFormat<NullDBWritable>();
  format.setConf(configuration);
  format.setConf(configuration);
  DBInputFormat.DBInputSplit splitter = new DBInputFormat.DBInputSplit(1, 10);
  Reporter reporter = mock(Reporter.class);
  RecordReader<LongWritable, NullDBWritable> reader = format.getRecordReader(splitter, configuration, reporter);

  configuration.setInt(MRJobConfig.NUM_MAPS, 3);
  InputSplit[] lSplits = format.getSplits(configuration, 3);
  assertEquals(5, lSplits[0].getLength());
  assertEquals(3, lSplits.length);

  // test reader. Some simple tests
  assertEquals(LongWritable.class, reader.createKey().getClass());
  assertEquals(0, reader.getPos());
  assertEquals(0, reader.getProgress(), 0.001);
  reader.close();
}
Example #8
Source File: FrameReaderTextCSVParallel.java From systemds with Apache License 2.0
@Override
public Long call() throws Exception {
  RecordReader<LongWritable, Text> reader = _informat.getRecordReader(_split, _job, Reporter.NULL);
  LongWritable key = new LongWritable();
  Text value = new Text();
  long nrows = 0;

  // count rows from the first non-header row
  try {
    if (_firstSplit && _hasHeader)
      reader.next(key, value);
    while (reader.next(key, value)) {
      String val = value.toString();
      nrows += (val.startsWith(TfUtils.TXMTD_MVPREFIX) || val.startsWith(TfUtils.TXMTD_NDPREFIX)) ? 0 : 1;
    }
  } finally {
    IOUtilFunctions.closeSilently(reader);
  }
  return nrows;
}
Example #9
Source File: ImportRecordReaderFactory.java From emr-dynamodb-connector with Apache License 2.0
static RecordReader<NullWritable, DynamoDBItemWritable> getRecordReader(
    InputSplit inputSplit, JobConf job, Reporter reporter) throws IOException {
  // CombineFileSplit indicates the new export format which includes a manifest file
  if (inputSplit instanceof CombineFileSplit) {
    int version = job.getInt(DynamoDBConstants.EXPORT_FORMAT_VERSION, -1);
    if (version != ExportManifestRecordWriter.FORMAT_VERSION) {
      throw new IOException("Unknown version: " + job.get(DynamoDBConstants.EXPORT_FORMAT_VERSION));
    }
    return new ImportCombineFileRecordReader((CombineFileSplit) inputSplit, job, reporter);
  } else if (inputSplit instanceof FileSplit) {
    // FileSplit indicates the old data pipeline format which doesn't include a manifest file
    Path path = ((FileSplit) inputSplit).getPath();
    return new ImportRecordReader(job, path);
  } else {
    throw new IOException("Expecting CombineFileSplit or FileSplit but the input split type is: "
        + inputSplit.getClass());
  }
}
Example #10
Source File: AutoInputFormat.java From hadoop with Apache License 2.0
public RecordReader getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
  FileSplit fileSplit = (FileSplit) split;
  FileSystem fs = FileSystem.get(fileSplit.getPath().toUri(), job);
  FSDataInputStream is = fs.open(fileSplit.getPath());
  byte[] header = new byte[3];
  RecordReader reader = null;
  try {
    is.readFully(header);
  } catch (EOFException eof) {
    reader = textInputFormat.getRecordReader(split, job, reporter);
  } finally {
    is.close();
  }
  if (header[0] == 'S' && header[1] == 'E' && header[2] == 'Q') {
    reader = seqFileInputFormat.getRecordReader(split, job, reporter);
  } else {
    reader = textInputFormat.getRecordReader(split, job, reporter);
  }
  return reader;
}
Example #11
Source File: TestHoodieParquetInputFormat.java From hudi with Apache License 2.0
private void ensureRecordsInCommit(String msg, String commit, int expectedNumberOfRecordsInCommit,
    int totalExpected) throws IOException {
  int actualCount = 0;
  int totalCount = 0;
  InputSplit[] splits = inputFormat.getSplits(jobConf, 1);
  for (InputSplit split : splits) {
    RecordReader<NullWritable, ArrayWritable> recordReader = inputFormat.getRecordReader(split, jobConf, null);
    NullWritable key = recordReader.createKey();
    ArrayWritable writable = recordReader.createValue();

    while (recordReader.next(key, writable)) {
      // writable returns an array with [field1, field2, _hoodie_commit_time, _hoodie_commit_seqno]
      // Take the commit time and compare with the one we are interested in
      if (commit.equals((writable.get()[2]).toString())) {
        actualCount++;
      }
      totalCount++;
    }
  }
  assertEquals(expectedNumberOfRecordsInCommit, actualCount, msg);
  assertEquals(totalExpected, totalCount, msg);
}
Example #12
Source File: HoodieCombineHiveInputFormat.java From hudi with Apache License 2.0
@Override
public RecordReader getRecordReader(JobConf job, CombineFileSplit split, Reporter reporter,
    Class<RecordReader<K, V>> rrClass) throws IOException {
  isRealTime = Boolean.valueOf(job.get("hudi.hive.realtime", "false"));
  if (isRealTime) {
    List<RecordReader> recordReaders = new LinkedList<>();
    ValidationUtils.checkArgument(split instanceof HoodieCombineRealtimeFileSplit, "Only "
        + HoodieCombineRealtimeFileSplit.class.getName() + " allowed, found " + split.getClass().getName());
    for (InputSplit inputSplit : ((HoodieCombineRealtimeFileSplit) split).getRealtimeFileSplits()) {
      if (split.getPaths().length == 0) {
        continue;
      }
      FileInputFormat inputFormat = HoodieInputFormatUtils.getInputFormat(split.getPath(0).toString(), true, job);
      recordReaders.add(inputFormat.getRecordReader(inputSplit, job, reporter));
    }
    return new HoodieCombineRealtimeRecordReader(job, split, recordReaders);
  }
  return new HadoopShimsSecure.CombineFileRecordReader(job, split, reporter, rrClass);
}
Example #13
Source File: EthereumFormatHadoopTest.java From hadoopcryptoledger with Apache License 2.0
@Test
public void readEthereumBlockInputFormatBlock1()
    throws IOException, EthereumBlockReadException, ParseException, InterruptedException {
  JobConf job = new JobConf(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "eth1.bin";
  String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
  Path file = new Path(fileNameBlock);
  FileInputFormat.setInputPaths(job, file);
  EthereumBlockFileInputFormat format = new EthereumBlockFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);
  assertEquals(1, inputSplits.length, "Only one split generated for genesis block");
  RecordReader<BytesWritable, EthereumBlock> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNotNull(reader, "Format returned null RecordReader");
  BytesWritable key = new BytesWritable();
  EthereumBlock block = new EthereumBlock();
  assertTrue(reader.next(key, block), "Input Split for block 1 contains at least one block");
  assertEquals(0, block.getEthereumTransactions().size(), "Block 1 must have 0 transactions");
  assertFalse(reader.next(key, block), "No further blocks in block 1");
  reader.close();
}
Example #14
Source File: HiveDynamoDBInputFormat.java From emr-dynamodb-connector with Apache License 2.0
@Override
public RecordReader<Text, DynamoDBItemWritable> getRecordReader(InputSplit split, JobConf conf, Reporter reporter)
    throws IOException {
  reporter.progress();

  Map<String, String> columnMapping =
      HiveDynamoDBUtil.fromJsonString(conf.get(DynamoDBConstants.DYNAMODB_COLUMN_MAPPING));
  Map<String, String> hiveTypeMapping = HiveDynamoDBUtil.extractHiveTypeMapping(conf);
  DynamoDBQueryFilter queryFilter = getQueryFilter(conf, columnMapping, hiveTypeMapping);

  DynamoDBSplit bbSplit = (DynamoDBSplit) split;
  bbSplit.setDynamoDBFilterPushdown(queryFilter);

  Collection<String> attributes = (columnMapping == null ? null : columnMapping.values());
  DynamoDBRecordReaderContext context = buildHiveDynamoDBRecordReaderContext(bbSplit, conf, reporter, attributes);
  return new DefaultDynamoDBRecordReader(context);
}
Example #15
Source File: LoadGeneratorMR.java From big-c with Apache License 2.0
public RecordReader<LongWritable, Text> getRecordReader(InputSplit ignored, JobConf conf, Reporter reporter)
    throws IOException {
  return new RecordReader<LongWritable, Text>() {
    boolean sentOneRecord = false;

    public boolean next(LongWritable key, Text value) throws IOException {
      key.set(1);
      value.set("dummy");
      if (sentOneRecord == false) { // first call
        sentOneRecord = true;
        return true;
      }
      return false; // we have sent one record - we are done
    }

    public LongWritable createKey() {
      return new LongWritable();
    }

    public Text createValue() {
      return new Text();
    }

    public long getPos() throws IOException {
      return 1;
    }

    public void close() throws IOException {
    }

    public float getProgress() throws IOException {
      return 1;
    }
  };
}
Example #16
Source File: ExcelFileInputFormat.java From hadoopoffice with Apache License 2.0
@Override
public RecordReader<Text, ArrayWritable> getRecordReader(InputSplit split, JobConf job, Reporter reporter)
    throws IOException {
  /** Create reader **/
  try {
    // send configuration option to MS Excel. The format of the Excel file (old vs. new) is detected automatically
    job.set(HadoopOfficeReadConfiguration.CONF_MIMETYPE, "ms-excel");
    return new ExcelRecordReader((FileSplit) split, job, reporter);
  } catch (FormatNotUnderstoodException e) {
    // log
    LOGIF.error(e);
  } catch (GeneralSecurityException gse) {
    LOGIF.error(gse);
  }
  return null;
}
Example #17
Source File: DelegatingInputFormat.java From big-c with Apache License 2.0
@SuppressWarnings("unchecked")
public RecordReader<K, V> getRecordReader(InputSplit split, JobConf conf, Reporter reporter) throws IOException {
  // Find the InputFormat and then the RecordReader from the TaggedInputSplit.
  TaggedInputSplit taggedInputSplit = (TaggedInputSplit) split;
  InputFormat<K, V> inputFormat = (InputFormat<K, V>) ReflectionUtils
      .newInstance(taggedInputSplit.getInputFormatClass(), conf);
  return inputFormat.getRecordReader(taggedInputSplit.getInputSplit(), conf, reporter);
}
Example #18
Source File: S3SelectRecordCursor.java From presto with Apache License 2.0
public S3SelectRecordCursor(
    Configuration configuration,
    Path path,
    RecordReader<K, V> recordReader,
    long totalBytes,
    Properties splitSchema,
    List<HiveColumnHandle> columns,
    DateTimeZone hiveStorageTimeZone)
{
  super(configuration, path, recordReader, totalBytes, updateSplitSchema(splitSchema, columns), columns,
      hiveStorageTimeZone);
}
Example #19
Source File: CompositeRecordReader.java From hadoop with Apache License 2.0
/**
 * Report progress as the minimum of all child RR progress.
 */
public float getProgress() throws IOException {
  float ret = 1.0f;
  for (RecordReader<K, ? extends Writable> rr : kids) {
    ret = Math.min(ret, rr.getProgress());
  }
  return ret;
}
Example #20
Source File: NLineInputFormat.java From hadoop with Apache License 2.0
public RecordReader<LongWritable, Text> getRecordReader(InputSplit genericSplit, JobConf job, Reporter reporter)
    throws IOException {
  reporter.setStatus(genericSplit.toString());
  return new LineRecordReader(job, (FileSplit) genericSplit);
}
Example #21
Source File: TestDatamerge.java From big-c with Apache License 2.0
public RecordReader<K, V> getRecordReader(InputSplit ignored, JobConf conf, Reporter reporter) {
  return new RecordReader<K, V>() {
    public boolean next(K key, V value) throws IOException {
      return false;
    }
    public K createKey() {
      return ReflectionUtils.newInstance(keyclass, null);
    }
    public V createValue() {
      return ReflectionUtils.newInstance(valclass, null);
    }
    public long getPos() throws IOException {
      return 0L;
    }
    public void close() throws IOException {
    }
    public float getProgress() throws IOException {
      return 0.0f;
    }
  };
}
Example #22
Source File: TestDatamerge.java From hadoop with Apache License 2.0
public RecordReader<K, V> getRecordReader(InputSplit ignored, JobConf conf, Reporter reporter) {
  return new RecordReader<K, V>() {
    public boolean next(K key, V value) throws IOException {
      return false;
    }
    public K createKey() {
      return ReflectionUtils.newInstance(keyclass, null);
    }
    public V createValue() {
      return ReflectionUtils.newInstance(valclass, null);
    }
    public long getPos() throws IOException {
      return 0L;
    }
    public void close() throws IOException {
    }
    public float getProgress() throws IOException {
      return 0.0f;
    }
  };
}
Example #23
Source File: OfficeFormatHadoopExcelLowFootPrintStaXTest.java From hadoopoffice with Apache License 2.0
@Test
public void readExcelInputFormatExcel2013MultiSheetHeaderRegExLowFootprint() throws IOException {
  JobConf job = new JobConf(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "multisheetheader.xlsx";
  String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
  Path file = new Path(fileNameSpreadSheet);
  FileInputFormat.setInputPaths(job, file);
  // set locale to the one of the test data
  job.set("hadoopoffice.read.locale.bcp47", "us");
  job.set("hadoopoffice.read.header.read", "true");
  job.set("hadoopoffice.read.header.skipheaderinallsheets", "true");
  job.set("hadoopoffice.read.header.column.names.regex", "column");
  job.set("hadoopoffice.read.header.column.names.replace", "spalte");
  job.set("hadoopoffice.read.lowFootprint", "true");
  job.set("hadoopoffice.read.lowFootprint.parser", "stax");
  ExcelFileInputFormat format = new ExcelFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);
  assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
  RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNotNull(reader, "Format returned null RecordReader");
  assertEquals("spalte1", ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[0],
      " header column 1 correctly read");
  assertEquals("spalte2", ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[1],
      " header column 2 correctly read");
  assertEquals("spalte3", ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[2],
      " header column 3 correctly read");
}
Example #24
Source File: EthereumFormatHadoopTest.java From hadoopcryptoledger with Apache License 2.0
@Test
public void readEthereumBlockInputFormatBlock403419()
    throws IOException, EthereumBlockReadException, ParseException, InterruptedException {
  JobConf job = new JobConf(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "block403419.bin";
  String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
  Path file = new Path(fileNameBlock);
  FileInputFormat.setInputPaths(job, file);
  EthereumBlockFileInputFormat format = new EthereumBlockFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);
  assertEquals(1, inputSplits.length, "Only one split generated for block 403419");
  RecordReader<BytesWritable, EthereumBlock> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNotNull(reader, "Format returned null RecordReader");
  BytesWritable key = new BytesWritable();
  EthereumBlock block = new EthereumBlock();
  assertTrue(reader.next(key, block), "Input Split for block 403419 contains at least one block");
  assertEquals(2, block.getEthereumTransactions().size(), "Block 403419 must have 2 transactions");
  EthereumBlockHeader ethereumBlockHeader = block.getEthereumBlockHeader();
  assertEquals("f8b483dba2c3b7176a3da549ad41a48bb3121069",
      bytesToHex(ethereumBlockHeader.getCoinBase()).toLowerCase(),
      "Block 403419 was mined by f8b483dba2c3b7176a3da549ad41a48bb3121069");
  assertEquals("08741fa532c05804d9c1086a311e47cc024bbc43980f561041ad1fbb3c223322",
      bytesToHex(ethereumBlockHeader.getParentHash()).toLowerCase(),
      "The parent of block 403419 has hash 08741fa532c05804d9c1086a311e47cc024bbc43980f561041ad1fbb3c223322");
  assertFalse(reader.next(key, block), "No further blocks in block 403419");
  reader.close();
}
Example #25
Source File: OutputHandler.java From hadoop with Apache License 2.0
/**
 * Create a handler that will handle any records output from the application.
 * @param collector the "real" collector that takes the output
 * @param reporter the reporter for reporting progress
 */
public OutputHandler(OutputCollector<K, V> collector, Reporter reporter,
    RecordReader<FloatWritable, NullWritable> recordReader, String expectedDigest) {
  this.reporter = reporter;
  this.collector = collector;
  this.recordReader = recordReader;
  this.expectedDigest = expectedDigest;
}
Example #26
Source File: PipeMapRunner.java From big-c with Apache License 2.0
public void run(RecordReader<K1, V1> input, OutputCollector<K2, V2> output, Reporter reporter)
    throws IOException {
  PipeMapper pipeMapper = (PipeMapper) getMapper();
  pipeMapper.startOutputThreads(output, reporter);
  super.run(input, output, reporter);
}
Example #27
Source File: NLineInputFormat.java From big-c with Apache License 2.0
public RecordReader<LongWritable, Text> getRecordReader(InputSplit genericSplit, JobConf job, Reporter reporter)
    throws IOException {
  reporter.setStatus(genericSplit.toString());
  return new LineRecordReader(job, (FileSplit) genericSplit);
}
Example #28
Source File: OfficeFormatHadoopExcelLowFootPrintSAXTest.java From hadoopoffice with Apache License 2.0
@Test
public void readExcelInputFormatExcel2013MultiSheetHeaderRegExLowFootprint() throws IOException {
  JobConf job = new JobConf(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "multisheetheader.xlsx";
  String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
  Path file = new Path(fileNameSpreadSheet);
  FileInputFormat.setInputPaths(job, file);
  // set locale to the one of the test data
  job.set("hadoopoffice.read.locale.bcp47", "us");
  job.set("hadoopoffice.read.header.read", "true");
  job.set("hadoopoffice.read.header.skipheaderinallsheets", "true");
  job.set("hadoopoffice.read.header.column.names.regex", "column");
  job.set("hadoopoffice.read.header.column.names.replace", "spalte");
  job.set("hadoopoffice.read.lowFootprint", "true");
  job.set("hadoopoffice.read.lowFootprint.parser", "sax");
  ExcelFileInputFormat format = new ExcelFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);
  assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
  RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNotNull(reader, "Format returned null RecordReader");
  assertEquals("spalte1", ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[0],
      " header column 1 correctly read");
  assertEquals("spalte2", ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[1],
      " header column 2 correctly read");
  assertEquals("spalte3", ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[2],
      " header column 3 correctly read");
}
Example #29
Source File: HoodieParquetRealtimeInputFormat.java From hudi with Apache License 2.0
@Override
public RecordReader<NullWritable, ArrayWritable> getRecordReader(final InputSplit split, final JobConf jobConf,
    final Reporter reporter) throws IOException {
  // Hive on Spark invokes multiple getRecordReaders from different threads in the same spark task (and hence the
  // same JVM) unlike Hive on MR. Due to this, accesses to JobConf, which is shared across all threads, are at
  // risk of experiencing race conditions. Hence, we synchronize on the JobConf object here. There is negligible
  // latency incurred here due to the synchronization since getRecordReader is called once per split before the
  // actual heavy lifting of reading the parquet files happens.
  if (jobConf.get(HoodieInputFormatUtils.HOODIE_READ_COLUMNS_PROP) == null) {
    synchronized (jobConf) {
      LOG.info(
          "Before adding Hoodie columns, Projections :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR)
              + ", Ids :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR));
      if (jobConf.get(HoodieInputFormatUtils.HOODIE_READ_COLUMNS_PROP) == null) {
        // Hive (across all versions) fails for queries like select count(`_hoodie_commit_time`) from table;
        // In this case, the projection fields get removed. Looking at HiveInputFormat implementation, in some cases
        // hoodie additional projection columns are reset after calling setConf and only natural projections
        // (ones found in select queries) are set. Things would break because of this.
        // For e.g. _hoodie_record_key would be missing and merge step would throw exceptions.
        // To fix this, hoodie columns are appended late at the time the record reader gets built instead of
        // construction time.
        cleanProjectionColumnIds(jobConf);
        addRequiredProjectionFields(jobConf);

        this.conf = jobConf;
        this.conf.set(HoodieInputFormatUtils.HOODIE_READ_COLUMNS_PROP, "true");
      }
    }
  }

  LOG.info("Creating record reader with readCols :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR)
      + ", Ids :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR));
  // sanity check
  ValidationUtils.checkArgument(split instanceof HoodieRealtimeFileSplit,
      "HoodieRealtimeRecordReader can only work on HoodieRealtimeFileSplit and not with " + split);

  return new HoodieRealtimeRecordReader((HoodieRealtimeFileSplit) split, jobConf,
      super.getRecordReader(split, jobConf, reporter));
}
Example #30
Source File: DummyInputFormat.java From big-c with Apache License 2.0
public RecordReader<Object, Object> getRecordReader(InputSplit split, JobConf job, Reporter reporter)
    throws IOException {
  return new RecordReader<Object, Object>() {
    boolean once = false;

    public boolean next(Object key, Object value) throws IOException {
      if (!once) {
        once = true;
        return true;
      }
      return false;
    }

    public Object createKey() {
      return new Object();
    }

    public Object createValue() {
      return new Object();
    }

    public long getPos() throws IOException {
      return 0L;
    }

    public void close() throws IOException {
    }

    public float getProgress() throws IOException {
      return 0.0f;
    }
  };
}