org.apache.hadoop.mapred.RecordReader Java Examples
The following examples show how to use
org.apache.hadoop.mapred.RecordReader.
Each example is taken from an open-source project; the project and source file it comes from are noted above each snippet.
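All of the examples below follow the same basic pattern of the old mapred API: an InputFormat produces InputSplits, getRecordReader() turns a split into a RecordReader, and the caller loops with createKey()/createValue()/next() until next() returns false, then closes the reader. The following is a minimal, self-contained sketch of that driver loop using the built-in TextInputFormat; the input path and class name are placeholders and are not taken from any of the projects below.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class RecordReaderSketch {
  public static void main(String[] args) throws IOException {
    JobConf job = new JobConf();
    // placeholder path; point it at any local text file
    FileInputFormat.setInputPaths(job, new Path("/tmp/input.txt"));

    TextInputFormat format = new TextInputFormat();
    format.configure(job);

    // ask the InputFormat for splits, then drive a RecordReader over each split
    for (InputSplit split : format.getSplits(job, 1)) {
      RecordReader<LongWritable, Text> reader = format.getRecordReader(split, job, Reporter.NULL);
      LongWritable key = reader.createKey(); // byte offset of the line
      Text value = reader.createValue();     // the line itself
      try {
        while (reader.next(key, value)) {    // next() reuses the key/value objects
          System.out.println(key.get() + "\t" + value);
        }
      } finally {
        reader.close();
      }
    }
  }
}

Note that next() reuses the key and value objects rather than allocating new ones per record, which is why the examples below create them once before the read loop.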
Example #1
Source File: OfficeFormatHadoopExcelLowFootPrintStaXTest.java From hadoopoffice with Apache License 2.0
@Test
public void readExcelInputFormatExcel2013SingleSheetEncryptedNegativeLowFootprint() throws IOException {
  JobConf job = new JobConf(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "excel2013encrypt.xlsx";
  String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
  Path file = new Path(fileNameSpreadSheet);
  FileInputFormat.setInputPaths(job, file);
  // set locale to the one of the test data
  job.set("hadoopoffice.read.locale.bcp47", "de");
  // low footprint
  job.set("hadoopoffice.read.lowFootprint", "true");
  job.set("hadoopoffice.read.lowFootprint.parser", "stax");
  // for decryption simply set the password
  job.set("hadoopoffice.read.security.crypt.password", "test2");
  ExcelFileInputFormat format = new ExcelFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);
  assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
  RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNull(reader, "Null record reader implies invalid password");
}
Example #2
Source File: ExcelCellFileInputFormat.java From hadoopoffice with Apache License 2.0
@Override
public RecordReader<Text, SpreadSheetCellDAO> getRecordReader(InputSplit split, JobConf job, Reporter reporter)
    throws IOException {
  /** Create reader **/
  try {
    // send configuration option to MS Excel. The format of the Excel file (old vs. new) is detected automatically
    job.set(HadoopOfficeReadConfiguration.CONF_MIMETYPE, "ms-excel");
    return new ExcelCellRecordReader((FileSplit) split, job, reporter);
  } catch (FormatNotUnderstoodException e) {
    // log
    LOGIF.error(e);
  } catch (GeneralSecurityException gse) {
    LOGIF.error(gse);
  }
  return null;
}
Example #3
Source File: RealtimeUnmergedRecordReader.java From hudi with Apache License 2.0
/**
 * Construct an unmerged record reader that consumes both parquet and log records in parallel and buffers them
 * for upstream clients to consume.
 *
 * @param split File split
 * @param job Job Configuration
 * @param realReader Parquet Reader
 */
public RealtimeUnmergedRecordReader(HoodieRealtimeFileSplit split, JobConf job,
    RecordReader<NullWritable, ArrayWritable> realReader) {
  super(split, job);
  this.parquetReader = new SafeParquetRecordReaderWrapper(realReader);
  // Iterator for consuming records from parquet file
  this.parquetRecordsIterator = new RecordReaderValueIterator<>(this.parquetReader);
  this.executor = new BoundedInMemoryExecutor<>(getMaxCompactionMemoryInBytes(), getParallelProducers(),
      Option.empty(), x -> x, new DefaultSizeEstimator<>());
  // Consumer of this record reader
  this.iterator = this.executor.getQueue().iterator();
  this.logRecordScanner = new HoodieUnMergedLogRecordScanner(FSUtils.getFs(split.getPath().toString(), jobConf),
      split.getBasePath(), split.getDeltaLogPaths(), getReaderSchema(), split.getMaxCommitTime(),
      Boolean.parseBoolean(jobConf.get(HoodieRealtimeConfig.COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP,
          HoodieRealtimeConfig.DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED)),
      false, jobConf.getInt(HoodieRealtimeConfig.MAX_DFS_STREAM_BUFFER_SIZE_PROP,
          HoodieRealtimeConfig.DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE),
      record -> {
        // convert Hoodie log record to Hadoop AvroWritable and buffer
        GenericRecord rec = (GenericRecord) record.getData().getInsertValue(getReaderSchema()).get();
        ArrayWritable aWritable = (ArrayWritable) HoodieRealtimeRecordReaderUtils.avroToArrayWritable(rec, getHiveSchema());
        this.executor.getQueue().insertRecord(aWritable);
      });
  // Start reading and buffering
  this.executor.startProducers();
}
Example #4
Source File: OfficeFormatHadoopExcelLowFootPrintSAXTest.java From hadoopoffice with Apache License 2.0
@Test
public void readExcelInputFormatExcel2003SingleSheetEncryptedNegativeLowFootprint() throws IOException {
  JobConf job = new JobConf(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "excel2003encrypt.xls";
  String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
  Path file = new Path(fileNameSpreadSheet);
  FileInputFormat.setInputPaths(job, file);
  // set locale to the one of the test data
  job.set("hadoopoffice.read.locale.bcp47", "de");
  // low footprint
  job.set("hadoopoffice.read.lowFootprint", "true");
  // for decryption simply set the password
  job.set("hadoopoffice.read.security.crypt.password", "test2");
  ExcelFileInputFormat format = new ExcelFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);
  assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
  RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNull(reader, "Null record reader implies invalid password");
}
Example #5
Source File: ReaderTextCellParallel.java From systemds with Apache License 2.0
@Override
public Void call() throws Exception {
  LongWritable key = new LongWritable();
  Text value = new Text();
  FastStringTokenizer st = new FastStringTokenizer(' ');

  RecordReader<LongWritable, Text> reader = _informat.getRecordReader(_split, _job, Reporter.NULL);
  try {
    // counting without locking as conflicts unlikely
    while (reader.next(key, value)) {
      if (value.toString().charAt(0) == '%')
        continue;
      st.reset(value.toString());
      _rNnz[(int) st.nextLong() - 1]++;
      if (_isSymmetric)
        _rNnz[(int) st.nextLong() - 1]++;
    }
  } finally {
    IOUtilFunctions.closeSilently(reader);
  }
  return null;
}
Example #6
Source File: HoodieParquetInputFormat.java From hudi with Apache License 2.0
@Override
public RecordReader<NullWritable, ArrayWritable> getRecordReader(final InputSplit split, final JobConf job,
    final Reporter reporter) throws IOException {
  // TODO enable automatic predicate pushdown after fixing issues
  // FileSplit fileSplit = (FileSplit) split;
  // HoodieTableMetadata metadata = getTableMetadata(fileSplit.getPath().getParent());
  // String tableName = metadata.getTableName();
  // String mode = HoodieHiveUtil.readMode(job, tableName);

  // if (HoodieHiveUtil.INCREMENTAL_SCAN_MODE.equals(mode)) {
  //   FilterPredicate predicate = constructHoodiePredicate(job, tableName, split);
  //   LOG.info("Setting parquet predicate push down as " + predicate);
  //   ParquetInputFormat.setFilterPredicate(job, predicate);
  //   clearOutExistingPredicate(job);
  // }
  return super.getRecordReader(split, job, reporter);
}
Example #7
Source File: TestDBInputFormat.java From hadoop with Apache License 2.0
/**
 * Test the DBInputFormat class. The class should split the result into chunks.
 * @throws Exception
 */
@Test(timeout = 10000)
public void testDBInputFormat() throws Exception {
  JobConf configuration = new JobConf();
  setupDriver(configuration);

  DBInputFormat<NullDBWritable> format = new DBInputFormat<NullDBWritable>();
  format.setConf(configuration);
  format.setConf(configuration);
  DBInputFormat.DBInputSplit splitter = new DBInputFormat.DBInputSplit(1, 10);
  Reporter reporter = mock(Reporter.class);
  RecordReader<LongWritable, NullDBWritable> reader = format.getRecordReader(splitter, configuration, reporter);

  configuration.setInt(MRJobConfig.NUM_MAPS, 3);
  InputSplit[] lSplits = format.getSplits(configuration, 3);
  assertEquals(5, lSplits[0].getLength());
  assertEquals(3, lSplits.length);

  // test reader. Some simple tests
  assertEquals(LongWritable.class, reader.createKey().getClass());
  assertEquals(0, reader.getPos());
  assertEquals(0, reader.getProgress(), 0.001);
  reader.close();
}
Example #8
Source File: FrameReaderTextCSVParallel.java From systemds with Apache License 2.0
@Override
public Long call() throws Exception {
  RecordReader<LongWritable, Text> reader = _informat.getRecordReader(_split, _job, Reporter.NULL);
  LongWritable key = new LongWritable();
  Text value = new Text();
  long nrows = 0;

  // count rows from the first non-header row
  try {
    if (_firstSplit && _hasHeader)
      reader.next(key, value);
    while (reader.next(key, value)) {
      String val = value.toString();
      nrows += (val.startsWith(TfUtils.TXMTD_MVPREFIX) || val.startsWith(TfUtils.TXMTD_NDPREFIX)) ? 0 : 1;
    }
  } finally {
    IOUtilFunctions.closeSilently(reader);
  }
  return nrows;
}
Example #9
Source File: ImportRecordReaderFactory.java From emr-dynamodb-connector with Apache License 2.0
static RecordReader<NullWritable, DynamoDBItemWritable> getRecordReader(
    InputSplit inputSplit, JobConf job, Reporter reporter) throws IOException {
  // CombineFileSplit indicates the new export format which includes a manifest file
  if (inputSplit instanceof CombineFileSplit) {
    int version = job.getInt(DynamoDBConstants.EXPORT_FORMAT_VERSION, -1);
    if (version != ExportManifestRecordWriter.FORMAT_VERSION) {
      throw new IOException("Unknown version: " + job.get(DynamoDBConstants.EXPORT_FORMAT_VERSION));
    }
    return new ImportCombineFileRecordReader((CombineFileSplit) inputSplit, job, reporter);
  } else if (inputSplit instanceof FileSplit) {
    // FileSplit indicates the old data pipeline format which doesn't include a manifest file
    Path path = ((FileSplit) inputSplit).getPath();
    return new ImportRecordReader(job, path);
  } else {
    throw new IOException("Expecting CombineFileSplit or FileSplit but the input split type is: "
        + inputSplit.getClass());
  }
}
Example #10
Source File: AutoInputFormat.java From hadoop with Apache License 2.0
public RecordReader getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
  FileSplit fileSplit = (FileSplit) split;
  FileSystem fs = FileSystem.get(fileSplit.getPath().toUri(), job);
  FSDataInputStream is = fs.open(fileSplit.getPath());
  byte[] header = new byte[3];
  RecordReader reader = null;
  try {
    is.readFully(header);
  } catch (EOFException eof) {
    reader = textInputFormat.getRecordReader(split, job, reporter);
  } finally {
    is.close();
  }
  if (header[0] == 'S' && header[1] == 'E' && header[2] == 'Q') {
    reader = seqFileInputFormat.getRecordReader(split, job, reporter);
  } else {
    reader = textInputFormat.getRecordReader(split, job, reporter);
  }
  return reader;
}
Example #11
Source File: TestHoodieParquetInputFormat.java From hudi with Apache License 2.0
private void ensureRecordsInCommit(String msg, String commit, int expectedNumberOfRecordsInCommit,
    int totalExpected) throws IOException {
  int actualCount = 0;
  int totalCount = 0;
  InputSplit[] splits = inputFormat.getSplits(jobConf, 1);
  for (InputSplit split : splits) {
    RecordReader<NullWritable, ArrayWritable> recordReader = inputFormat.getRecordReader(split, jobConf, null);
    NullWritable key = recordReader.createKey();
    ArrayWritable writable = recordReader.createValue();

    while (recordReader.next(key, writable)) {
      // writable returns an array with [field1, field2, _hoodie_commit_time, _hoodie_commit_seqno]
      // Take the commit time and compare with the one we are interested in
      if (commit.equals((writable.get()[2]).toString())) {
        actualCount++;
      }
      totalCount++;
    }
  }
  assertEquals(expectedNumberOfRecordsInCommit, actualCount, msg);
  assertEquals(totalExpected, totalCount, msg);
}
Example #12
Source File: HoodieCombineHiveInputFormat.java From hudi with Apache License 2.0
@Override
public RecordReader getRecordReader(JobConf job, CombineFileSplit split, Reporter reporter,
    Class<RecordReader<K, V>> rrClass) throws IOException {
  isRealTime = Boolean.valueOf(job.get("hudi.hive.realtime", "false"));
  if (isRealTime) {
    List<RecordReader> recordReaders = new LinkedList<>();
    ValidationUtils.checkArgument(split instanceof HoodieCombineRealtimeFileSplit, "Only "
        + HoodieCombineRealtimeFileSplit.class.getName() + " allowed, found " + split.getClass().getName());
    for (InputSplit inputSplit : ((HoodieCombineRealtimeFileSplit) split).getRealtimeFileSplits()) {
      if (split.getPaths().length == 0) {
        continue;
      }
      FileInputFormat inputFormat = HoodieInputFormatUtils.getInputFormat(split.getPath(0).toString(), true, job);
      recordReaders.add(inputFormat.getRecordReader(inputSplit, job, reporter));
    }
    return new HoodieCombineRealtimeRecordReader(job, split, recordReaders);
  }
  return new HadoopShimsSecure.CombineFileRecordReader(job, split, reporter, rrClass);
}
Example #13
Source File: EthereumFormatHadoopTest.java From hadoopcryptoledger with Apache License 2.0
@Test
public void readEthereumBlockInputFormatBlock1()
    throws IOException, EthereumBlockReadException, ParseException, InterruptedException {
  JobConf job = new JobConf(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "eth1.bin";
  String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
  Path file = new Path(fileNameBlock);
  FileInputFormat.setInputPaths(job, file);
  EthereumBlockFileInputFormat format = new EthereumBlockFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);
  assertEquals(1, inputSplits.length, "Only one split generated for genesis block");
  RecordReader<BytesWritable, EthereumBlock> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNotNull(reader, "Format returned null RecordReader");
  BytesWritable key = new BytesWritable();
  EthereumBlock block = new EthereumBlock();
  assertTrue(reader.next(key, block), "Input Split for block 1 contains at least one block");
  assertEquals(0, block.getEthereumTransactions().size(), "Block 1 must have 0 transactions");
  assertFalse(reader.next(key, block), "No further blocks in block 1");
  reader.close();
}
Example #14
Source File: HiveDynamoDBInputFormat.java From emr-dynamodb-connector with Apache License 2.0
@Override
public RecordReader<Text, DynamoDBItemWritable> getRecordReader(InputSplit split, JobConf conf, Reporter reporter)
    throws IOException {
  reporter.progress();

  Map<String, String> columnMapping =
      HiveDynamoDBUtil.fromJsonString(conf.get(DynamoDBConstants.DYNAMODB_COLUMN_MAPPING));
  Map<String, String> hiveTypeMapping = HiveDynamoDBUtil.extractHiveTypeMapping(conf);
  DynamoDBQueryFilter queryFilter = getQueryFilter(conf, columnMapping, hiveTypeMapping);

  DynamoDBSplit bbSplit = (DynamoDBSplit) split;
  bbSplit.setDynamoDBFilterPushdown(queryFilter);

  Collection<String> attributes = (columnMapping == null ? null : columnMapping.values());
  DynamoDBRecordReaderContext context = buildHiveDynamoDBRecordReaderContext(bbSplit, conf, reporter, attributes);
  return new DefaultDynamoDBRecordReader(context);
}
Example #15
Source File: LoadGeneratorMR.java From big-c with Apache License 2.0
public RecordReader<LongWritable, Text> getRecordReader(InputSplit ignored, JobConf conf, Reporter reporter)
    throws IOException {
  return new RecordReader<LongWritable, Text>() {
    boolean sentOneRecord = false;

    public boolean next(LongWritable key, Text value) throws IOException {
      key.set(1);
      value.set("dummy");
      if (sentOneRecord == false) { // first call
        sentOneRecord = true;
        return true;
      }
      return false; // we have sent one record - we are done
    }

    public LongWritable createKey() {
      return new LongWritable();
    }

    public Text createValue() {
      return new Text();
    }

    public long getPos() throws IOException {
      return 1;
    }

    public void close() throws IOException {
    }

    public float getProgress() throws IOException {
      return 1;
    }
  };
}
Example #16
Source File: ExcelFileInputFormat.java From hadoopoffice with Apache License 2.0
@Override
public RecordReader<Text, ArrayWritable> getRecordReader(InputSplit split, JobConf job, Reporter reporter)
    throws IOException {
  /** Create reader **/
  try {
    // send configuration option to MS Excel. The format of the Excel file (old vs. new) is detected automatically
    job.set(HadoopOfficeReadConfiguration.CONF_MIMETYPE, "ms-excel");
    return new ExcelRecordReader((FileSplit) split, job, reporter);
  } catch (FormatNotUnderstoodException e) {
    // log
    LOGIF.error(e);
  } catch (GeneralSecurityException gse) {
    LOGIF.error(gse);
  }
  return null;
}
Example #17
Source File: DelegatingInputFormat.java From big-c with Apache License 2.0
@SuppressWarnings("unchecked")
public RecordReader<K, V> getRecordReader(InputSplit split, JobConf conf, Reporter reporter) throws IOException {
  // Find the InputFormat and then the RecordReader from the TaggedInputSplit.
  TaggedInputSplit taggedInputSplit = (TaggedInputSplit) split;
  InputFormat<K, V> inputFormat = (InputFormat<K, V>) ReflectionUtils
      .newInstance(taggedInputSplit.getInputFormatClass(), conf);
  return inputFormat.getRecordReader(taggedInputSplit.getInputSplit(), conf, reporter);
}
Example #18
Source File: S3SelectRecordCursor.java From presto with Apache License 2.0
public S3SelectRecordCursor(
    Configuration configuration,
    Path path,
    RecordReader<K, V> recordReader,
    long totalBytes,
    Properties splitSchema,
    List<HiveColumnHandle> columns,
    DateTimeZone hiveStorageTimeZone)
{
  super(configuration, path, recordReader, totalBytes, updateSplitSchema(splitSchema, columns), columns,
      hiveStorageTimeZone);
}
Example #19
Source File: CompositeRecordReader.java From hadoop with Apache License 2.0
/**
 * Report progress as the minimum of all child RR progress.
 */
public float getProgress() throws IOException {
  float ret = 1.0f;
  for (RecordReader<K, ? extends Writable> rr : kids) {
    ret = Math.min(ret, rr.getProgress());
  }
  return ret;
}
Example #20
Source File: NLineInputFormat.java From hadoop with Apache License 2.0
public RecordReader<LongWritable, Text> getRecordReader(InputSplit genericSplit, JobConf job, Reporter reporter)
    throws IOException {
  reporter.setStatus(genericSplit.toString());
  return new LineRecordReader(job, (FileSplit) genericSplit);
}
Example #21
Source File: TestDatamerge.java From big-c with Apache License 2.0
public RecordReader<K, V> getRecordReader(InputSplit ignored, JobConf conf, Reporter reporter) {
  return new RecordReader<K, V>() {
    public boolean next(K key, V value) throws IOException {
      return false;
    }
    public K createKey() {
      return ReflectionUtils.newInstance(keyclass, null);
    }
    public V createValue() {
      return ReflectionUtils.newInstance(valclass, null);
    }
    public long getPos() throws IOException {
      return 0L;
    }
    public void close() throws IOException {
    }
    public float getProgress() throws IOException {
      return 0.0f;
    }
  };
}
Example #22
Source File: TestDatamerge.java From hadoop with Apache License 2.0
public RecordReader<K, V> getRecordReader(InputSplit ignored, JobConf conf, Reporter reporter) {
  return new RecordReader<K, V>() {
    public boolean next(K key, V value) throws IOException {
      return false;
    }
    public K createKey() {
      return ReflectionUtils.newInstance(keyclass, null);
    }
    public V createValue() {
      return ReflectionUtils.newInstance(valclass, null);
    }
    public long getPos() throws IOException {
      return 0L;
    }
    public void close() throws IOException {
    }
    public float getProgress() throws IOException {
      return 0.0f;
    }
  };
}
Example #23
Source File: OfficeFormatHadoopExcelLowFootPrintStaXTest.java From hadoopoffice with Apache License 2.0
@Test
public void readExcelInputFormatExcel2013MultiSheetHeaderRegExLowFootprint() throws IOException {
  JobConf job = new JobConf(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "multisheetheader.xlsx";
  String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
  Path file = new Path(fileNameSpreadSheet);
  FileInputFormat.setInputPaths(job, file);
  // set locale to the one of the test data
  job.set("hadoopoffice.read.locale.bcp47", "us");
  job.set("hadoopoffice.read.header.read", "true");
  job.set("hadoopoffice.read.header.skipheaderinallsheets", "true");
  job.set("hadoopoffice.read.header.column.names.regex", "column");
  job.set("hadoopoffice.read.header.column.names.replace", "spalte");
  job.set("hadoopoffice.read.lowFootprint", "true");
  job.set("hadoopoffice.read.lowFootprint.parser", "stax");
  ExcelFileInputFormat format = new ExcelFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);
  assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
  RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNotNull(reader, "Format returned null RecordReader");
  assertEquals("spalte1", ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[0],
      " header column 1 correctly read");
  assertEquals("spalte2", ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[1],
      " header column 2 correctly read");
  assertEquals("spalte3", ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[2],
      " header column 3 correctly read");
}
Example #24
Source File: EthereumFormatHadoopTest.java From hadoopcryptoledger with Apache License 2.0
@Test
public void readEthereumBlockInputFormatBlock403419()
    throws IOException, EthereumBlockReadException, ParseException, InterruptedException {
  JobConf job = new JobConf(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "block403419.bin";
  String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
  Path file = new Path(fileNameBlock);
  FileInputFormat.setInputPaths(job, file);
  EthereumBlockFileInputFormat format = new EthereumBlockFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);
  assertEquals(1, inputSplits.length, "Only one split generated for block 403419");
  RecordReader<BytesWritable, EthereumBlock> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNotNull(reader, "Format returned null RecordReader");
  BytesWritable key = new BytesWritable();
  EthereumBlock block = new EthereumBlock();
  assertTrue(reader.next(key, block), "Input Split for block 403419 contains at least one block");
  assertEquals(2, block.getEthereumTransactions().size(), "Block 403419 must have 2 transactions");
  EthereumBlockHeader ethereumBlockHeader = block.getEthereumBlockHeader();
  assertEquals("f8b483dba2c3b7176a3da549ad41a48bb3121069",
      bytesToHex(ethereumBlockHeader.getCoinBase()).toLowerCase(),
      "Block 403419 was mined by f8b483dba2c3b7176a3da549ad41a48bb3121069");
  assertEquals("08741fa532c05804d9c1086a311e47cc024bbc43980f561041ad1fbb3c223322",
      bytesToHex(ethereumBlockHeader.getParentHash()).toLowerCase(),
      "The parent of block 403419 has hash 08741fa532c05804d9c1086a311e47cc024bbc43980f561041ad1fbb3c223322");
  assertFalse(reader.next(key, block), "No further blocks in block 403419");
  reader.close();
}
Example #25
Source File: OutputHandler.java From hadoop with Apache License 2.0
/**
 * Create a handler that will handle any records output from the application.
 * @param collector the "real" collector that takes the output
 * @param reporter the reporter for reporting progress
 */
public OutputHandler(OutputCollector<K, V> collector, Reporter reporter,
    RecordReader<FloatWritable, NullWritable> recordReader, String expectedDigest) {
  this.reporter = reporter;
  this.collector = collector;
  this.recordReader = recordReader;
  this.expectedDigest = expectedDigest;
}
Example #26
Source File: PipeMapRunner.java From big-c with Apache License 2.0
public void run(RecordReader<K1, V1> input, OutputCollector<K2, V2> output, Reporter reporter)
    throws IOException {
  PipeMapper pipeMapper = (PipeMapper) getMapper();
  pipeMapper.startOutputThreads(output, reporter);
  super.run(input, output, reporter);
}
Example #27
Source File: NLineInputFormat.java From big-c with Apache License 2.0
public RecordReader<LongWritable, Text> getRecordReader(InputSplit genericSplit, JobConf job, Reporter reporter)
    throws IOException {
  reporter.setStatus(genericSplit.toString());
  return new LineRecordReader(job, (FileSplit) genericSplit);
}
Example #28
Source File: OfficeFormatHadoopExcelLowFootPrintSAXTest.java From hadoopoffice with Apache License 2.0
@Test
public void readExcelInputFormatExcel2013MultiSheetHeaderRegExLowFootprint() throws IOException {
  JobConf job = new JobConf(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "multisheetheader.xlsx";
  String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
  Path file = new Path(fileNameSpreadSheet);
  FileInputFormat.setInputPaths(job, file);
  // set locale to the one of the test data
  job.set("hadoopoffice.read.locale.bcp47", "us");
  job.set("hadoopoffice.read.header.read", "true");
  job.set("hadoopoffice.read.header.skipheaderinallsheets", "true");
  job.set("hadoopoffice.read.header.column.names.regex", "column");
  job.set("hadoopoffice.read.header.column.names.replace", "spalte");
  job.set("hadoopoffice.read.lowFootprint", "true");
  job.set("hadoopoffice.read.lowFootprint.parser", "sax");
  ExcelFileInputFormat format = new ExcelFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);
  assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
  RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNotNull(reader, "Format returned null RecordReader");
  assertEquals("spalte1", ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[0],
      " header column 1 correctly read");
  assertEquals("spalte2", ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[1],
      " header column 2 correctly read");
  assertEquals("spalte3", ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[2],
      " header column 3 correctly read");
}
Example #29
Source File: HoodieParquetRealtimeInputFormat.java From hudi with Apache License 2.0
@Override
public RecordReader<NullWritable, ArrayWritable> getRecordReader(final InputSplit split, final JobConf jobConf,
    final Reporter reporter) throws IOException {
  // Hive on Spark invokes multiple getRecordReaders from different threads in the same spark task (and hence the
  // same JVM) unlike Hive on MR. Due to this, accesses to JobConf, which is shared across all threads, are at
  // risk of experiencing race conditions. Hence, we synchronize on the JobConf object here. There is negligible
  // latency incurred here due to the synchronization since getRecordReader is called once per split before the
  // actual heavy lifting of reading the parquet files happens.
  if (jobConf.get(HoodieInputFormatUtils.HOODIE_READ_COLUMNS_PROP) == null) {
    synchronized (jobConf) {
      LOG.info(
          "Before adding Hoodie columns, Projections :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR)
              + ", Ids :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR));
      if (jobConf.get(HoodieInputFormatUtils.HOODIE_READ_COLUMNS_PROP) == null) {
        // Hive (across all versions) fails for queries like select count(`_hoodie_commit_time`) from table;
        // In this case, the projection fields get removed. Looking at HiveInputFormat implementation, in some cases
        // hoodie additional projection columns are reset after calling setConf and only natural projections
        // (ones found in select queries) are set. Things would break because of this.
        // For e.g. _hoodie_record_key would be missing and merge step would throw exceptions.
        // To fix this, hoodie columns are appended late at the time the record reader gets built instead of
        // construction time.
        cleanProjectionColumnIds(jobConf);
        addRequiredProjectionFields(jobConf);

        this.conf = jobConf;
        this.conf.set(HoodieInputFormatUtils.HOODIE_READ_COLUMNS_PROP, "true");
      }
    }
  }

  LOG.info("Creating record reader with readCols :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR)
      + ", Ids :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR));
  // sanity check
  ValidationUtils.checkArgument(split instanceof HoodieRealtimeFileSplit,
      "HoodieRealtimeRecordReader can only work on HoodieRealtimeFileSplit and not with " + split);

  return new HoodieRealtimeRecordReader((HoodieRealtimeFileSplit) split, jobConf,
      super.getRecordReader(split, jobConf, reporter));
}
Example #30
Source File: DummyInputFormat.java From big-c with Apache License 2.0
public RecordReader<Object, Object> getRecordReader(InputSplit split, JobConf job, Reporter reporter)
    throws IOException {
  return new RecordReader<Object, Object>() {
    boolean once = false;

    public boolean next(Object key, Object value) throws IOException {
      if (!once) {
        once = true;
        return true;
      }
      return false;
    }

    public Object createKey() {
      return new Object();
    }

    public Object createValue() {
      return new Object();
    }

    public long getPos() throws IOException {
      return 0L;
    }

    public void close() throws IOException {
    }

    public float getProgress() throws IOException {
      return 0.0f;
    }
  };
}