org.apache.hadoop.mapred.RecordReader Java Examples

The following examples show how to use org.apache.hadoop.mapred.RecordReader. You can go to the original project or source file by following the links above each example.
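Most of the examples that follow share the same basic consumption pattern: obtain a RecordReader from an InputFormat, create the reusable key and value objects with createKey() and createValue(), iterate with next(key, value) until it returns false, and close the reader. The sketch below is a minimal, self-contained illustration of that pattern using Hadoop's standard TextInputFormat; it is not taken from any of the projects above, and the input path is a placeholder.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class RecordReaderUsageSketch {

  public static void main(String[] args) throws IOException {
    JobConf job = new JobConf();
    // Placeholder input path; point this at real data before running.
    FileInputFormat.setInputPaths(job, new Path("/tmp/input.txt"));
    TextInputFormat format = new TextInputFormat();
    format.configure(job);

    for (InputSplit split : format.getSplits(job, 1)) {
      RecordReader<LongWritable, Text> reader =
          format.getRecordReader(split, job, Reporter.NULL);
      try {
        LongWritable key = reader.createKey();   // reusable key object
        Text value = reader.createValue();       // reusable value object
        while (reader.next(key, value)) {        // returns false once the split is exhausted
          System.out.println(key.get() + "\t" + value);
        }
      } finally {
        reader.close();
      }
    }
  }
}
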
Example #1
Source File: OfficeFormatHadoopExcelLowFootPrintStaXTest.java    From hadoopoffice with Apache License 2.0
@Test
public void readExcelInputFormatExcel2013SingleSheetEncryptedNegativeLowFootprint() throws IOException {
	JobConf job = new JobConf(defaultConf);
	ClassLoader classLoader = getClass().getClassLoader();
	String fileName = "excel2013encrypt.xlsx";
	String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
	Path file = new Path(fileNameSpreadSheet);
	FileInputFormat.setInputPaths(job, file);
	// set locale to the one of the test data
	job.set("hadoopoffice.read.locale.bcp47", "de");
	// low footprint
	job.set("hadoopoffice.read.lowFootprint", "true");

	job.set("hadoopoffice.read.lowFootprint.parser", "stax");
	// for decryption simply set the password

	job.set("hadoopoffice.read.security.crypt.password", "test2");
	ExcelFileInputFormat format = new ExcelFileInputFormat();
	format.configure(job);
	InputSplit[] inputSplits = format.getSplits(job, 1);
	assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
	RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
	assertNull(reader, "Null record reader implies invalid password");
}
 
Example #2
Source File: ExcelCellFileInputFormat.java    From hadoopoffice with Apache License 2.0
@Override
public RecordReader<Text, SpreadSheetCellDAO> getRecordReader(InputSplit split, JobConf job, Reporter reporter)
		throws IOException {
	/** Create reader **/
	try {
		// send configuration option to ms excel. The format of the Excel (old vs new) is detected automatically
		job.set(HadoopOfficeReadConfiguration.CONF_MIMETYPE, "ms-excel");
		return new ExcelCellRecordReader((FileSplit) split, job, reporter);
	} catch (FormatNotUnderstoodException e) {
		// log
		LOGIF.error(e);
	} catch (GeneralSecurityException gse) {
		LOGIF.error(gse);
	}
	return null;
}
 
Example #3
Source File: RealtimeUnmergedRecordReader.java    From hudi with Apache License 2.0
/**
 * Construct an unmerged record reader that consumes both parquet and log records in parallel and buffers them
 * for upstream clients to consume.
 *
 * @param split File split
 * @param job Job configuration
 * @param realReader Parquet reader
 */
public RealtimeUnmergedRecordReader(HoodieRealtimeFileSplit split, JobConf job,
    RecordReader<NullWritable, ArrayWritable> realReader) {
  super(split, job);
  this.parquetReader = new SafeParquetRecordReaderWrapper(realReader);
  // Iterator for consuming records from parquet file
  this.parquetRecordsIterator = new RecordReaderValueIterator<>(this.parquetReader);
  this.executor = new BoundedInMemoryExecutor<>(getMaxCompactionMemoryInBytes(), getParallelProducers(),
      Option.empty(), x -> x, new DefaultSizeEstimator<>());
  // Consumer of this record reader
  this.iterator = this.executor.getQueue().iterator();
  this.logRecordScanner = new HoodieUnMergedLogRecordScanner(FSUtils.getFs(split.getPath().toString(), jobConf),
      split.getBasePath(), split.getDeltaLogPaths(), getReaderSchema(), split.getMaxCommitTime(),
      Boolean.parseBoolean(jobConf.get(HoodieRealtimeConfig.COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP, HoodieRealtimeConfig.DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED)),
      false, jobConf.getInt(HoodieRealtimeConfig.MAX_DFS_STREAM_BUFFER_SIZE_PROP, HoodieRealtimeConfig.DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE), record -> {
        // convert Hoodie log record to Hadoop AvroWritable and buffer
        GenericRecord rec = (GenericRecord) record.getData().getInsertValue(getReaderSchema()).get();
        ArrayWritable aWritable = (ArrayWritable) HoodieRealtimeRecordReaderUtils.avroToArrayWritable(rec, getHiveSchema());
        this.executor.getQueue().insertRecord(aWritable);
  });
  // Start reading and buffering
  this.executor.startProducers();
}
 
Example #4
Source File: OfficeFormatHadoopExcelLowFootPrintSAXTest.java    From hadoopoffice with Apache License 2.0
@Test
public void readExcelInputFormatExcel2003SingleSheetEncryptedNegativeLowFootprint() throws IOException {
	JobConf job = new JobConf(defaultConf);
	ClassLoader classLoader = getClass().getClassLoader();
	String fileName = "excel2003encrypt.xls";
	String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
	Path file = new Path(fileNameSpreadSheet);
	FileInputFormat.setInputPaths(job, file);
	// set locale to the one of the test data
	job.set("hadoopoffice.read.locale.bcp47", "de");

	// low footprint
	job.set("hadoopoffice.read.lowFootprint", "true");
	// for decryption simply set the password
	job.set("hadoopoffice.read.security.crypt.password", "test2");
	ExcelFileInputFormat format = new ExcelFileInputFormat();
	format.configure(job);
	InputSplit[] inputSplits = format.getSplits(job, 1);
	assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
	RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
	assertNull(reader, "Null record reader implies invalid password");
}
 
Example #5
Source File: ReaderTextCellParallel.java    From systemds with Apache License 2.0
@Override
public Void call() throws Exception {
	LongWritable key = new LongWritable();
	Text value = new Text();
	FastStringTokenizer st = new FastStringTokenizer(' ');
	
	RecordReader<LongWritable,Text> reader = _informat.getRecordReader(_split, _job, Reporter.NULL);
	try {
		//counting without locking as conflicts unlikely
		while( reader.next(key, value) ) {
			if( value.toString().charAt(0) == '%' )
				continue;
			st.reset( value.toString() );
			_rNnz[(int)st.nextLong()-1] ++;
			if( _isSymmetric )
				_rNnz[(int)st.nextLong()-1] ++;
		}
	}
	finally {
		IOUtilFunctions.closeSilently(reader);
	}
	return null;
}
 
Example #6
Source File: HoodieParquetInputFormat.java    From hudi with Apache License 2.0
@Override
public RecordReader<NullWritable, ArrayWritable> getRecordReader(final InputSplit split, final JobConf job,
    final Reporter reporter) throws IOException {
  // TODO enable automatic predicate pushdown after fixing issues
  // FileSplit fileSplit = (FileSplit) split;
  // HoodieTableMetadata metadata = getTableMetadata(fileSplit.getPath().getParent());
  // String tableName = metadata.getTableName();
  // String mode = HoodieHiveUtil.readMode(job, tableName);

  // if (HoodieHiveUtil.INCREMENTAL_SCAN_MODE.equals(mode)) {
  // FilterPredicate predicate = constructHoodiePredicate(job, tableName, split);
  // LOG.info("Setting parquet predicate push down as " + predicate);
  // ParquetInputFormat.setFilterPredicate(job, predicate);
  // clearOutExistingPredicate(job);
  // }
  return super.getRecordReader(split, job, reporter);
}
 
Example #7
Source File: TestDBInputFormat.java    From hadoop with Apache License 2.0
/**
 * Test the DBInputFormat class. The class should split the result into chunks.
 * @throws Exception
 */
@Test(timeout = 10000)
public void testDBInputFormat() throws Exception {
  JobConf configuration = new JobConf();
  setupDriver(configuration);
  
  DBInputFormat<NullDBWritable> format = new DBInputFormat<NullDBWritable>();
  format.setConf(configuration);
  format.setConf(configuration);
  DBInputFormat.DBInputSplit splitter = new DBInputFormat.DBInputSplit(1, 10);
  Reporter reporter = mock(Reporter.class);
  RecordReader<LongWritable, NullDBWritable> reader = format.getRecordReader(
      splitter, configuration, reporter);

  configuration.setInt(MRJobConfig.NUM_MAPS, 3);
  InputSplit[] lSplits = format.getSplits(configuration, 3);
  assertEquals(5, lSplits[0].getLength());
  assertEquals(3, lSplits.length);

  // test reader: some simple tests
  assertEquals(LongWritable.class, reader.createKey().getClass());
  assertEquals(0, reader.getPos());
  assertEquals(0, reader.getProgress(), 0.001);
  reader.close();
}
 
Example #8
Source File: FrameReaderTextCSVParallel.java    From systemds with Apache License 2.0
@Override
public Long call() 
	throws Exception 
{
	RecordReader<LongWritable, Text> reader = _informat.getRecordReader(_split, _job, Reporter.NULL);
	LongWritable key = new LongWritable();
	Text value = new Text();
	long nrows = 0;
	
	// count rows from the first non-header row
	try {
		if ( _firstSplit && _hasHeader )
			reader.next(key, value);
		while ( reader.next(key, value) ) {
			String val = value.toString();
			nrows += ( val.startsWith(TfUtils.TXMTD_MVPREFIX)
				|| val.startsWith(TfUtils.TXMTD_NDPREFIX)) ? 0 : 1; 
		}
	} 
	finally {
		IOUtilFunctions.closeSilently(reader);
	}

	return nrows;
}
 
Example #9
Source File: ImportRecordReaderFactory.java    From emr-dynamodb-connector with Apache License 2.0
static RecordReader<NullWritable, DynamoDBItemWritable> getRecordReader(
    InputSplit inputSplit, JobConf job, Reporter reporter) throws IOException {
  // CombineFileSplit indicates the new export format which includes a manifest file
  if (inputSplit instanceof CombineFileSplit) {
    int version = job.getInt(DynamoDBConstants.EXPORT_FORMAT_VERSION, -1);
    if (version != ExportManifestRecordWriter.FORMAT_VERSION) {
      throw new IOException("Unknown version: " + job.get(DynamoDBConstants
          .EXPORT_FORMAT_VERSION));
    }
    return new ImportCombineFileRecordReader((CombineFileSplit) inputSplit, job, reporter);
  } else if (inputSplit instanceof FileSplit) {
    // FileSplit indicates the old data pipeline format which doesn't include a manifest file
    Path path = ((FileSplit) inputSplit).getPath();
    return new ImportRecordReader(job, path);
  } else {
    throw new IOException("Expecting CombineFileSplit or FileSplit but the input split type is:"
        + " " + inputSplit.getClass());
  }
}
 
Example #10
Source File: AutoInputFormat.java    From hadoop with Apache License 2.0
public RecordReader getRecordReader(InputSplit split, JobConf job,
  Reporter reporter) throws IOException {
  FileSplit fileSplit = (FileSplit) split;
  FileSystem fs = FileSystem.get(fileSplit.getPath().toUri(), job);
  FSDataInputStream is = fs.open(fileSplit.getPath());
  byte[] header = new byte[3];
  RecordReader reader = null;
  try {
    is.readFully(header);
  } catch (EOFException eof) {
    reader = textInputFormat.getRecordReader(split, job, reporter);
  } finally {
    is.close();
  }
  if (header[0] == 'S' && header[1] == 'E' && header[2] == 'Q') {
    reader = seqFileInputFormat.getRecordReader(split, job, reporter);
  } else {
    reader = textInputFormat.getRecordReader(split, job, reporter);
  }
  return reader;
}
 
Example #11
Source File: TestHoodieParquetInputFormat.java    From hudi with Apache License 2.0
private void ensureRecordsInCommit(String msg, String commit, int expectedNumberOfRecordsInCommit,
    int totalExpected) throws IOException {
  int actualCount = 0;
  int totalCount = 0;
  InputSplit[] splits = inputFormat.getSplits(jobConf, 1);
  for (InputSplit split : splits) {
    RecordReader<NullWritable, ArrayWritable> recordReader = inputFormat.getRecordReader(split, jobConf, null);
    NullWritable key = recordReader.createKey();
    ArrayWritable writable = recordReader.createValue();

    while (recordReader.next(key, writable)) {
      // writable returns an array with [field1, field2, _hoodie_commit_time,
      // _hoodie_commit_seqno]
      // Take the commit time and compare with the one we are interested in
      if (commit.equals((writable.get()[2]).toString())) {
        actualCount++;
      }
      totalCount++;
    }
  }
  assertEquals(expectedNumberOfRecordsInCommit, actualCount, msg);
  assertEquals(totalExpected, totalCount, msg);
}
 
Example #12
Source File: HoodieCombineHiveInputFormat.java    From hudi with Apache License 2.0
@Override
public RecordReader getRecordReader(JobConf job, CombineFileSplit split, Reporter reporter,
    Class<RecordReader<K, V>> rrClass) throws IOException {
  isRealTime = Boolean.valueOf(job.get("hudi.hive.realtime", "false"));
  if (isRealTime) {
    List<RecordReader> recordReaders = new LinkedList<>();
    ValidationUtils.checkArgument(split instanceof HoodieCombineRealtimeFileSplit, "Only "
        + HoodieCombineRealtimeFileSplit.class.getName() + " allowed, found " + split.getClass().getName());
    for (InputSplit inputSplit : ((HoodieCombineRealtimeFileSplit) split).getRealtimeFileSplits()) {
      if (split.getPaths().length == 0) {
        continue;
      }
      FileInputFormat inputFormat = HoodieInputFormatUtils.getInputFormat(split.getPath(0).toString(), true, job);
      recordReaders.add(inputFormat.getRecordReader(inputSplit, job, reporter));
    }
    return new HoodieCombineRealtimeRecordReader(job, split, recordReaders);
  }
  return new HadoopShimsSecure.CombineFileRecordReader(job, split, reporter, rrClass);
}
 
Example #13
Source File: EthereumFormatHadoopTest.java    From hadoopcryptoledger with Apache License 2.0
@Test
public void readEthereumBlockInputFormatBlock1() throws IOException, EthereumBlockReadException, ParseException, InterruptedException {
	JobConf job = new JobConf(defaultConf);
	ClassLoader classLoader = getClass().getClassLoader();
	String fileName = "eth1.bin";
	String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
	Path file = new Path(fileNameBlock);
	FileInputFormat.setInputPaths(job, file);
	EthereumBlockFileInputFormat format = new EthereumBlockFileInputFormat();
	format.configure(job);
	InputSplit[] inputSplits = format.getSplits(job, 1);

	assertEquals(1, inputSplits.length, "Only one split generated for genesis block");
	RecordReader<BytesWritable, EthereumBlock> reader = format.getRecordReader(inputSplits[0], job, reporter);
	assertNotNull(reader, "Format returned null RecordReader");
	BytesWritable key = new BytesWritable();
	EthereumBlock block = new EthereumBlock();
	assertTrue(reader.next(key, block), "Input Split for block 1 contains at least one block");
	assertEquals(0, block.getEthereumTransactions().size(), "Block 1 must have 0 transactions");
	assertFalse(reader.next(key, block), "No further blocks in block 1");
	reader.close();
}
 
Example #14
Source File: HiveDynamoDBInputFormat.java    From emr-dynamodb-connector with Apache License 2.0
@Override
public RecordReader<Text, DynamoDBItemWritable> getRecordReader(InputSplit split, JobConf conf,
    Reporter reporter) throws
    IOException {
  reporter.progress();

  Map<String, String> columnMapping =
      HiveDynamoDBUtil.fromJsonString(conf.get(DynamoDBConstants.DYNAMODB_COLUMN_MAPPING));
  Map<String, String> hiveTypeMapping = HiveDynamoDBUtil.extractHiveTypeMapping(conf);
  DynamoDBQueryFilter queryFilter = getQueryFilter(conf, columnMapping, hiveTypeMapping);
  DynamoDBSplit bbSplit = (DynamoDBSplit) split;
  bbSplit.setDynamoDBFilterPushdown(queryFilter);

  Collection<String> attributes = (columnMapping == null ? null : columnMapping.values());
  DynamoDBRecordReaderContext context = buildHiveDynamoDBRecordReaderContext(bbSplit, conf,
      reporter, attributes);
  return new DefaultDynamoDBRecordReader(context);
}
 
Example #15
Source File: LoadGeneratorMR.java    From big-c with Apache License 2.0
public RecordReader<LongWritable, Text> getRecordReader(
    InputSplit ignored, JobConf conf, Reporter reporter) throws IOException {

  return new RecordReader<LongWritable, Text>() {

    boolean sentOneRecord = false;

    public boolean next(LongWritable key, Text value)
        throws IOException {
      key.set(1);
      value.set("dummy");
      if (sentOneRecord == false) { // first call
        sentOneRecord = true;
        return true;
      }
      return false; // we have sent one record - we are done
    }

    public LongWritable createKey() {
      return new LongWritable();
    }
    public Text createValue() {
      return new Text();
    }
    public long getPos() throws IOException {
      return 1;
    }
    public void close() throws IOException {
    }
    public float getProgress() throws IOException {
      return 1;
    }
  };
}
 
Example #16
Source File: ExcelFileInputFormat.java    From hadoopoffice with Apache License 2.0
@Override
public RecordReader<Text, ArrayWritable> getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
	/** Create reader **/
	try {
		// send configuration option to ms excel. The format of the Excel (old vs new) is detected automatically
		job.set(HadoopOfficeReadConfiguration.CONF_MIMETYPE, "ms-excel");
		return new ExcelRecordReader((FileSplit) split, job, reporter);
	} catch (FormatNotUnderstoodException e) {
		// log
		LOGIF.error(e);
	} catch (GeneralSecurityException gse) {
		LOGIF.error(gse);
	}
	return null;
}
 
Example #17
Source File: DelegatingInputFormat.java    From big-c with Apache License 2.0
@SuppressWarnings("unchecked")
public RecordReader<K, V> getRecordReader(InputSplit split, JobConf conf,
    Reporter reporter) throws IOException {

  // Find the InputFormat and then the RecordReader from the
  // TaggedInputSplit.

  TaggedInputSplit taggedInputSplit = (TaggedInputSplit) split;
  InputFormat<K, V> inputFormat = (InputFormat<K, V>) ReflectionUtils
     .newInstance(taggedInputSplit.getInputFormatClass(), conf);
  return inputFormat.getRecordReader(taggedInputSplit.getInputSplit(), conf,
     reporter);
}
 
Example #18
Source File: S3SelectRecordCursor.java    From presto with Apache License 2.0
public S3SelectRecordCursor(
        Configuration configuration,
        Path path,
        RecordReader<K, V> recordReader,
        long totalBytes,
        Properties splitSchema,
        List<HiveColumnHandle> columns,
        DateTimeZone hiveStorageTimeZone)
{
    super(configuration, path, recordReader, totalBytes, updateSplitSchema(splitSchema, columns), columns, hiveStorageTimeZone);
}
 
Example #19
Source File: CompositeRecordReader.java    From hadoop with Apache License 2.0
/**
 * Report progress as the minimum of all child RR progress.
 */
public float getProgress() throws IOException {
  float ret = 1.0f;
  for (RecordReader<K,? extends Writable> rr : kids) {
    ret = Math.min(ret, rr.getProgress());
  }
  return ret;
}
 
Example #20
Source File: NLineInputFormat.java    From hadoop with Apache License 2.0
public RecordReader<LongWritable, Text> getRecordReader(
                                          InputSplit genericSplit,
                                          JobConf job,
                                          Reporter reporter) 
throws IOException {
  reporter.setStatus(genericSplit.toString());
  return new LineRecordReader(job, (FileSplit) genericSplit);
}
 
Example #21
Source File: TestDatamerge.java    From big-c with Apache License 2.0
public RecordReader<K,V> getRecordReader(
    InputSplit ignored, JobConf conf, Reporter reporter) {
  return new RecordReader<K,V>() {
    public boolean next(K key, V value) throws IOException { return false; }
    public K createKey() {
      return ReflectionUtils.newInstance(keyclass, null);
    }
    public V createValue() {
      return ReflectionUtils.newInstance(valclass, null);
    }
    public long getPos() throws IOException { return 0L; }
    public void close() throws IOException { }
    public float getProgress() throws IOException { return 0.0f; }
  };
}
 
Example #22
Source File: TestDatamerge.java    From hadoop with Apache License 2.0
public RecordReader<K,V> getRecordReader(
    InputSplit ignored, JobConf conf, Reporter reporter) {
  return new RecordReader<K,V>() {
    public boolean next(K key, V value) throws IOException { return false; }
    public K createKey() {
      return ReflectionUtils.newInstance(keyclass, null);
    }
    public V createValue() {
      return ReflectionUtils.newInstance(valclass, null);
    }
    public long getPos() throws IOException { return 0L; }
    public void close() throws IOException { }
    public float getProgress() throws IOException { return 0.0f; }
  };
}
 
Example #23
Source File: OfficeFormatHadoopExcelLowFootPrintStaXTest.java    From hadoopoffice with Apache License 2.0
@Test
public void readExcelInputFormatExcel2013MultiSheetHeaderRegExLowFootprint() throws IOException {
	JobConf job = new JobConf(defaultConf);
	ClassLoader classLoader = getClass().getClassLoader();
	String fileName = "multisheetheader.xlsx";
	String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
	Path file = new Path(fileNameSpreadSheet);
	FileInputFormat.setInputPaths(job, file);
	// set locale to the one of the test data
	job.set("hadoopoffice.read.locale.bcp47", "us");
	job.set("hadoopoffice.read.header.read", "true");
	job.set("hadoopoffice.read.header.skipheaderinallsheets", "true");
	job.set("hadoopoffice.read.header.column.names.regex", "column");
	job.set("hadoopoffice.read.header.column.names.replace", "spalte");
	job.set("hadoopoffice.read.lowFootprint", "true");

	job.set("hadoopoffice.read.lowFootprint.parser", "stax");
	ExcelFileInputFormat format = new ExcelFileInputFormat();
	format.configure(job);
	InputSplit[] inputSplits = format.getSplits(job, 1);
	assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
	RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
	assertNotNull(reader, "Format returned  null RecordReader");

	assertEquals("spalte1", ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[0],
			" header column 1 correctly read");
	assertEquals("spalte2", ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[1],
			" header column 2 correctly read");
	assertEquals("spalte3", ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[2],
			" header column 3 correctly read");
}
 
Example #24
Source File: EthereumFormatHadoopTest.java    From hadoopcryptoledger with Apache License 2.0
@Test
public void readEthereumBlockInputFormatBlock403419() throws IOException, EthereumBlockReadException, ParseException, InterruptedException {
	JobConf job = new JobConf(defaultConf);
	ClassLoader classLoader = getClass().getClassLoader();
	String fileName = "block403419.bin";
	String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
	Path file = new Path(fileNameBlock);
	FileInputFormat.setInputPaths(job, file);
	EthereumBlockFileInputFormat format = new EthereumBlockFileInputFormat();
	format.configure(job);
	InputSplit[] inputSplits = format.getSplits(job, 1);

	assertEquals(1, inputSplits.length, "Only one split generated for block 403419");
	RecordReader<BytesWritable, EthereumBlock> reader = format.getRecordReader(inputSplits[0], job, reporter);
	assertNotNull(reader, "Format returned null RecordReader");
	BytesWritable key = new BytesWritable();
	EthereumBlock block = new EthereumBlock();
	assertTrue(reader.next(key, block), "Input Split for block 403419 contains at least one block");
	assertEquals(2, block.getEthereumTransactions().size(), "Block 403419 must have 2 transactions");
	EthereumBlockHeader ethereumBlockHeader = block.getEthereumBlockHeader();
	assertEquals(
			"f8b483dba2c3b7176a3da549ad41a48bb3121069",
			bytesToHex(ethereumBlockHeader.getCoinBase()).toLowerCase(),
			"Block 403419 was mined by f8b483dba2c3b7176a3da549ad41a48bb3121069");
	assertEquals(
			"08741fa532c05804d9c1086a311e47cc024bbc43980f561041ad1fbb3c223322",
			bytesToHex(ethereumBlockHeader.getParentHash()).toLowerCase(),
			"The parent of block 403419 has hash 08741fa532c05804d9c1086a311e47cc024bbc43980f561041ad1fbb3c223322");
	assertFalse(reader.next(key, block), "No further blocks in block 403419");

	reader.close();
}
 
Example #25
Source File: OutputHandler.java    From hadoop with Apache License 2.0
/**
 * Create a handler that will handle any records output from the application.
 * @param collector the "real" collector that takes the output
 * @param reporter the reporter for reporting progress
 */
public OutputHandler(OutputCollector<K, V> collector, Reporter reporter, 
                     RecordReader<FloatWritable,NullWritable> recordReader,
                     String expectedDigest) {
  this.reporter = reporter;
  this.collector = collector;
  this.recordReader = recordReader;
  this.expectedDigest = expectedDigest;
}
 
Example #26
Source File: PipeMapRunner.java    From big-c with Apache License 2.0
public void run(RecordReader<K1, V1> input, OutputCollector<K2, V2> output,
                Reporter reporter)
       throws IOException {
  PipeMapper pipeMapper = (PipeMapper)getMapper();
  pipeMapper.startOutputThreads(output, reporter);
  super.run(input, output, reporter);
}
 
Example #27
Source File: NLineInputFormat.java    From big-c with Apache License 2.0
public RecordReader<LongWritable, Text> getRecordReader(
                                          InputSplit genericSplit,
                                          JobConf job,
                                          Reporter reporter) 
throws IOException {
  reporter.setStatus(genericSplit.toString());
  return new LineRecordReader(job, (FileSplit) genericSplit);
}
 
Example #28
Source File: OfficeFormatHadoopExcelLowFootPrintSAXTest.java    From hadoopoffice with Apache License 2.0
@Test
public void readExcelInputFormatExcel2013MultiSheetHeaderRegExLowFootprint() throws IOException {
	JobConf job = new JobConf(defaultConf);
	ClassLoader classLoader = getClass().getClassLoader();
	String fileName = "multisheetheader.xlsx";
	String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
	Path file = new Path(fileNameSpreadSheet);
	FileInputFormat.setInputPaths(job, file);
	// set locale to the one of the test data
	job.set("hadoopoffice.read.locale.bcp47", "us");
	job.set("hadoopoffice.read.header.read", "true");
	job.set("hadoopoffice.read.header.skipheaderinallsheets", "true");
	job.set("hadoopoffice.read.header.column.names.regex", "column");
	job.set("hadoopoffice.read.header.column.names.replace", "spalte");
	job.set("hadoopoffice.read.lowFootprint", "true");

	job.set("hadoopoffice.read.lowFootprint.parser", "sax");
	ExcelFileInputFormat format = new ExcelFileInputFormat();
	format.configure(job);
	InputSplit[] inputSplits = format.getSplits(job, 1);
	assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
	RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
	assertNotNull(reader, "Format returned  null RecordReader");

	assertEquals("spalte1", ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[0],
			" header column 1 correctly read");
	assertEquals("spalte2", ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[1],
			" header column 2 correctly read");
	assertEquals("spalte3", ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[2],
			" header column 3 correctly read");
}
 
Example #29
Source File: HoodieParquetRealtimeInputFormat.java    From hudi with Apache License 2.0
@Override
public RecordReader<NullWritable, ArrayWritable> getRecordReader(final InputSplit split, final JobConf jobConf,
    final Reporter reporter) throws IOException {
  // Hive on Spark invokes multiple getRecordReaders from different threads in the same Spark task (and hence the
  // same JVM), unlike Hive on MR. Because of this, accesses to the JobConf, which is shared across all threads, are
  // at risk of race conditions, so we synchronize on the JobConf object here. The synchronization adds negligible
  // latency because getRecordReader is called once per split, before the actual heavy lifting of reading the
  // parquet files happens.
  if (jobConf.get(HoodieInputFormatUtils.HOODIE_READ_COLUMNS_PROP) == null) {
    synchronized (jobConf) {
      LOG.info(
          "Before adding Hoodie columns, Projections :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR)
              + ", Ids :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR));
      if (jobConf.get(HoodieInputFormatUtils.HOODIE_READ_COLUMNS_PROP) == null) {
        // Hive (across all versions) fails for queries like select count(`_hoodie_commit_time`) from table;
        // in this case, the projection fields get removed. Looking at the HiveInputFormat implementation, in some
        // cases the hoodie additional projection columns are reset after calling setConf and only the natural
        // projections (the ones found in select queries) are set. Things would break because of this:
        // for example, _hoodie_record_key would be missing and the merge step would throw exceptions.
        // To fix this, hoodie columns are appended late, at the time the record reader gets built, instead of at
        // construction time.
        cleanProjectionColumnIds(jobConf);
        addRequiredProjectionFields(jobConf);

        this.conf = jobConf;
        this.conf.set(HoodieInputFormatUtils.HOODIE_READ_COLUMNS_PROP, "true");
      }
    }
  }

  LOG.info("Creating record reader with readCols :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR)
      + ", Ids :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR));
  // sanity check
  ValidationUtils.checkArgument(split instanceof HoodieRealtimeFileSplit,
      "HoodieRealtimeRecordReader can only work on HoodieRealtimeFileSplit and not with " + split);

  return new HoodieRealtimeRecordReader((HoodieRealtimeFileSplit) split, jobConf,
      super.getRecordReader(split, jobConf, reporter));
}
 
Example #30
Source File: DummyInputFormat.java    From big-c with Apache License 2.0
public RecordReader<Object, Object> getRecordReader(InputSplit split,
    JobConf job, Reporter reporter) throws IOException {
  return new RecordReader<Object, Object>() {

    boolean once = false;

    public boolean next(Object key, Object value) throws IOException {
      if (!once) {
        once = true;
        return true;
      }
      return false;
    }

    public Object createKey() {
      return new Object();
    }

    public Object createValue() {
      return new Object();
    }

    public long getPos() throws IOException {
      return 0L;
    }

    public void close() throws IOException {
    }

    public float getProgress() throws IOException {
      return 0.0f;
    }
  };
}