org.apache.parquet.hadoop.ParquetReader Java Examples
The following examples show how to use
org.apache.parquet.hadoop.ParquetReader.
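Before the individual examples, here is a minimal, self-contained sketch of the canonical read loop: the builder takes a ReadSupport plus a Path, and read() returns null once the file is exhausted. The class name and file path below are placeholders, not taken from any example on this page.

import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;

public class MinimalParquetRead {
  public static void main(String[] args) throws Exception {
    // Placeholder path; point this at a real Parquet file.
    Path path = new Path("/tmp/example.parquet");
    // ParquetReader implements Closeable, so try-with-resources closes it.
    try (ParquetReader<Group> reader =
        ParquetReader.builder(new GroupReadSupport(), path).build()) {
      Group record;
      while ((record = reader.read()) != null) { // read() returns null at EOF
        System.out.println(record);
      }
    }
  }
}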
Example #1
Source File: ReadBenchmarks.java From parquet-mr with Apache License 2.0
private void read(Path parquetFile, int nRows, Blackhole blackhole) throws IOException {
  ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), parquetFile)
      .withConf(configuration)
      .build();
  for (int i = 0; i < nRows; i++) {
    Group group = reader.read();
    blackhole.consume(group.getBinary("binary_field", 0));
    blackhole.consume(group.getInteger("int32_field", 0));
    blackhole.consume(group.getLong("int64_field", 0));
    blackhole.consume(group.getBoolean("boolean_field", 0));
    blackhole.consume(group.getFloat("float_field", 0));
    blackhole.consume(group.getDouble("double_field", 0));
    blackhole.consume(group.getBinary("flba_field", 0));
    blackhole.consume(group.getInt96("int96_field", 0));
  }
  reader.close();
}
Example #2
Source File: TestThriftToParquetFileWriter.java From parquet-mr with Apache License 2.0
@Test
public void testWriteFileListOfMap() throws IOException, InterruptedException, TException {
  Map<String, String> map1 = new HashMap<String, String>();
  map1.put("key11", "value11");
  map1.put("key12", "value12");
  Map<String, String> map2 = new HashMap<String, String>();
  map2.put("key21", "value21");
  final TestMapInList listMap = new TestMapInList("listmap", Arrays.asList(map1, map2));
  final Path fileToCreate = createFile(listMap);
  ParquetReader<Group> reader = createRecordReader(fileToCreate);
  Group g = null;
  while ((g = reader.read()) != null) {
    assertEquals(listMap.names.size(), g.getGroup("names", 0).getFieldRepetitionCount("names_tuple"));
    assertEquals(listMap.names.get(0).size(), g.getGroup("names", 0).getGroup("names_tuple", 0).getFieldRepetitionCount("map"));
    assertEquals(listMap.names.get(1).size(), g.getGroup("names", 0).getGroup("names_tuple", 1).getFieldRepetitionCount("map"));
  }
}
Example #3
Source File: PutParquetTest.java From nifi with Apache License 2.0
private void verifyAvroParquetUsers(final Path avroParquetUsers, final int numExpectedUsers) throws IOException {
  final ParquetReader.Builder<GenericRecord> readerBuilder = AvroParquetReader
      .<GenericRecord>builder(avroParquetUsers)
      .withConf(testConf);

  int currUser = 0;
  try (final ParquetReader<GenericRecord> reader = readerBuilder.build()) {
    GenericRecord nextRecord;
    while ((nextRecord = reader.read()) != null) {
      Assert.assertNotNull(nextRecord);
      Assert.assertEquals("name" + currUser, nextRecord.get("name").toString());
      Assert.assertEquals(currUser, nextRecord.get("favorite_number"));
      Assert.assertEquals("blue" + currUser, nextRecord.get("favorite_color").toString());
      currUser++;
    }
  }
  Assert.assertEquals(numExpectedUsers, currUser);
}
Example #4
Source File: TestThriftToParquetFileWriter.java From parquet-mr with Apache License 2.0
@Test
public void testWriteFileMapOfLists() throws IOException, InterruptedException, TException {
  Map<List<String>, List<String>> map = new HashMap<List<String>, List<String>>();
  map.put(Arrays.asList("key1", "key2"), Arrays.asList("val1", "val2"));
  final TestListsInMap mapList = new TestListsInMap("maplists", map);
  final Path fileToCreate = createFile(mapList);
  ParquetReader<Group> reader = createRecordReader(fileToCreate);
  Group g = null;
  while ((g = reader.read()) != null) {
    assertEquals("key1", g.getGroup("names", 0).getGroup("map", 0).getGroup("key", 0).getBinary("key_tuple", 0).toStringUsingUTF8());
    assertEquals("key2", g.getGroup("names", 0).getGroup("map", 0).getGroup("key", 0).getBinary("key_tuple", 1).toStringUsingUTF8());
    assertEquals("val1", g.getGroup("names", 0).getGroup("map", 0).getGroup("value", 0).getBinary("value_tuple", 0).toStringUsingUTF8());
    assertEquals("val2", g.getGroup("names", 0).getGroup("map", 0).getGroup("value", 0).getBinary("value_tuple", 1).toStringUsingUTF8());
  }
}
Example #5
Source File: ParquetIO.java From beam with Apache License 2.0
@ProcessElement
public void processElement(ProcessContext processContext) throws Exception {
  FileIO.ReadableFile file = processContext.element();

  if (!file.getMetadata().isReadSeekEfficient()) {
    ResourceId filename = file.getMetadata().resourceId();
    throw new RuntimeException(String.format("File has to be seekable: %s", filename));
  }

  SeekableByteChannel seekableByteChannel = file.openSeekable();

  AvroParquetReader.Builder builder =
      AvroParquetReader.<GenericRecord>builder(new BeamParquetInputFile(seekableByteChannel));
  if (modelClass != null) {
    // all GenericData implementations have a static get method
    builder = builder.withDataModel((GenericData) modelClass.getMethod("get").invoke(null));
  }

  try (ParquetReader<GenericRecord> reader = builder.build()) {
    GenericRecord read;
    while ((read = reader.read()) != null) {
      processContext.output(read);
    }
  }
}
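The reflective modelClass.getMethod("get").invoke(null) above resolves the singleton of whichever GenericData subclass was configured (GenericData, SpecificData, ReflectData all expose a static get()). When the model is known at compile time, a non-reflective equivalent — shown here only as an illustration — would be:

// Direct equivalent when the data model is fixed at compile time:
builder = builder.withDataModel(org.apache.avro.reflect.ReflectData.get());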
Example #6
Source File: TestThriftToParquetFileWriter.java From parquet-mr with Apache License 2.0
@Test
public void testWriteFile() throws IOException, InterruptedException, TException {
  final AddressBook a = new AddressBook(
      Arrays.asList(
          new Person(
              new Name("Bob", "Roberts"),
              0,
              "[email protected]",
              Arrays.asList(new PhoneNumber("1234567890")))));

  final Path fileToCreate = createFile(a);

  ParquetReader<Group> reader = createRecordReader(fileToCreate);

  Group g = null;
  int i = 0;
  while ((g = reader.read()) != null) {
    assertEquals(a.persons.size(), g.getFieldRepetitionCount("persons"));
    assertEquals(a.persons.get(0).email, g.getGroup("persons", 0).getGroup(0, 0).getString("email", 0));
    // just some sanity check, we're testing the various layers somewhere else
    ++i;
  }
  assertEquals("read 1 record", 1, i);
}
Example #7
Source File: LargeInputFileIT.java From datacollector with Apache License 2.0
public void validateParquetFile(Path parquetFile, long recordCount) throws IOException {
  ParquetReader reader = AvroParquetReader.builder(parquetFile)
      .build();

  for (long i = 0; i < recordCount; i++) {
    GenericData.Record actualRow = (GenericData.Record) reader.read();
    Assert.assertNotNull("Can't read row " + i, actualRow);

    Assert.assertEquals("Value different in row " + i + " for key b", actualRow.get("b"), i % 2 == 0);
    Assert.assertEquals("Value different in row " + i + " for key s", actualRow.get("s"), new Utf8(String.valueOf(i)));
    Assert.assertEquals("Value different in row " + i + " for key l", actualRow.get("l"), i);
    Assert.assertEquals("Value different in row " + i + " for key l100", actualRow.get("l100"), i % 100);
    Assert.assertEquals("Value different in row " + i + " for key s100", actualRow.get("s100"), new Utf8(String.valueOf(i % 100)));
  }

  Assert.assertNull("Parquet file contains more rows than expected", reader.read());
}
Example #8
Source File: AvroTestUtil.java From parquet-mr with Apache License 2.0
public static <D> List<D> read(Configuration conf, GenericData model, Schema schema, File file) throws IOException {
  List<D> data = new ArrayList<D>();
  AvroReadSupport.setRequestedProjection(conf, schema);
  AvroReadSupport.setAvroReadSchema(conf, schema);
  try (ParquetReader<D> fileReader = AvroParquetReader
      .<D>builder(HadoopInputFile.fromPath(new Path(file.toString()), conf))
      .withDataModel(model) // reflect disables compatibility
      .build()) {
    D datum;
    while ((datum = fileReader.read()) != null) {
      data.add(datum);
    }
  }
  return data;
}
Example #9
Source File: BaseAvroParquetConvertIT.java From datacollector with Apache License 2.0
public void validateParquetFile(Path parquetFile, List<Map<String, Object>> data) throws IOException {
  ParquetReader reader = AvroParquetReader.builder(parquetFile)
      .build();

  int position = 0;
  for (Map<String, Object> expectedRow : data) {
    GenericData.Record actualRow = (GenericData.Record) reader.read();
    Assert.assertNotNull("Can't read row " + position, actualRow);

    for (Map.Entry<String, Object> entry : expectedRow.entrySet()) {
      Object value = actualRow.get(entry.getKey());
      Assert.assertEquals("Different value on row " + position + " for key " + entry.getKey(), entry.getValue(), value);
    }
    position++;
  }

  Assert.assertNull("Parquet file contains more rows than expected", reader.read());
}
Example #10
Source File: ParquetHdfsDataWriterTest.java From incubator-gobblin with Apache License 2.0
private List<TestRecord> readParquetFilesAvro(File outputFile) throws IOException {
  ParquetReader<org.apache.gobblin.test.avro.TestRecord> reader = null;
  List<TestRecord> records = new ArrayList<>();
  try {
    reader = new AvroParquetReader<>(new Path(outputFile.toString()));
    for (org.apache.gobblin.test.avro.TestRecord value = reader.read(); value != null; value = reader.read()) {
      records.add(new TestRecord(value.getPartition(), value.getSequence(), value.getPayload()));
    }
  } finally {
    if (reader != null) {
      try {
        reader.close();
      } catch (Exception ex) {
        System.out.println(ex.getMessage());
      }
    }
  }
  return records;
}
Example #11
Source File: ParquetHdfsDataWriterTest.java From incubator-gobblin with Apache License 2.0
protected List<TestRecord> readParquetFilesProto(File outputFile) throws IOException {
  ParquetReader<TestRecordProtos.TestRecordOrBuilder> reader = null;
  List<TestRecord> records = new ArrayList<>();
  try {
    reader = new ProtoParquetReader<>(new Path(outputFile.toString()));
    for (TestRecordProtos.TestRecordOrBuilder value = reader.read(); value != null; value = reader.read()) {
      records.add(new TestRecord(value.getPartition(), value.getSequence(), value.getPayload()));
    }
  } finally {
    if (reader != null) {
      try {
        reader.close();
      } catch (Exception ex) {
        System.out.println(ex.getMessage());
      }
    }
  }
  return records;
}
Example #12
Source File: ParquetHdfsDataWriterTest.java From incubator-gobblin with Apache License 2.0
protected List<TestRecord> readParquetFilesGroup(File outputFile) throws IOException {
  ParquetReader<Group> reader = null;
  List<Group> records = new ArrayList<>();
  try {
    reader = new ParquetReader<>(new Path(outputFile.toString()), new SimpleReadSupport());
    for (Group value = reader.read(); value != null; value = reader.read()) {
      records.add(value);
    }
  } finally {
    if (reader != null) {
      try {
        reader.close();
      } catch (Exception ex) {
        System.out.println(ex.getMessage());
      }
    }
  }
  return records.stream().map(value -> new TestRecord(
      value.getInteger(TestConstants.PARTITION_FIELD_NAME, 0),
      value.getLong(TestConstants.SEQUENCE_FIELD_NAME, 0),
      value.getString(TestConstants.PAYLOAD_FIELD_NAME, 0)
  )).collect(Collectors.toList());
}
Example #13
Source File: TestPruneColumnsCommand.java From parquet-mr with Apache License 2.0
private void validateColumns(String inputFile, List<String> prunePaths) throws IOException {
  ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), new Path(inputFile)).withConf(conf).build();
  for (int i = 0; i < numRecord; i++) {
    Group group = reader.read();
    if (!prunePaths.contains("DocId")) {
      assertEquals(1L, group.getLong("DocId", 0));
    }
    if (!prunePaths.contains("Name")) {
      assertEquals("foo", group.getBinary("Name", 0).toStringUsingUTF8());
    }
    if (!prunePaths.contains("Gender")) {
      assertEquals("male", group.getBinary("Gender", 0).toStringUsingUTF8());
    }
    if (!prunePaths.contains("Links")) {
      Group subGroup = group.getGroup("Links", 0);
      if (!prunePaths.contains("Links.Backward")) {
        assertEquals(2L, subGroup.getLong("Backward", 0));
      }
      if (!prunePaths.contains("Links.Forward")) {
        assertEquals(3L, subGroup.getLong("Forward", 0));
      }
    }
  }
  reader.close();
}
Example #14
Source File: TestSpecificReadWrite.java From parquet-mr with Apache License 2.0
@Test
public void testFilterOnSubAttribute() throws IOException {
  Path path = writeCarsToParquetFile(1, CompressionCodecName.UNCOMPRESSED, false);

  ParquetReader<Car> reader = new AvroParquetReader<Car>(testConf, path, column("engine.type", equalTo(EngineType.DIESEL)));
  assertEquals(reader.read().toString(), getVwPassat().toString());
  assertNull(reader.read());

  reader = new AvroParquetReader<Car>(testConf, path, column("engine.capacity", equalTo(1.4f)));
  assertEquals(getVwPolo().toString(), reader.read().toString());
  assertNull(reader.read());

  reader = new AvroParquetReader<Car>(testConf, path, column("engine.hasTurboCharger", equalTo(true)));
  assertEquals(getBmwMini().toString(), reader.read().toString());
  assertNull(reader.read());
}
Example #15
Source File: TestSpecificReadWrite.java From parquet-mr with Apache License 2.0
@Test
public void testAvroReadSchema() throws IOException {
  Path path = writeCarsToParquetFile(1, CompressionCodecName.UNCOMPRESSED, false);
  Configuration conf = new Configuration(testConf);
  AvroReadSupport.setAvroReadSchema(conf, NewCar.SCHEMA$);

  try (ParquetReader<NewCar> reader = new AvroParquetReader<>(conf, path)) {
    for (NewCar car = reader.read(); car != null; car = reader.read()) {
      assertNotNull(car.getEngine());
      assertNotNull(car.getBrand());
      assertEquals(2010, car.getYear());
      assertNotNull(car.getVin());
      assertNull(car.getDescription());
      assertEquals(5, car.getOpt());
    }
  }
}
Example #16
Source File: TestBackwardCompatibility.java From parquet-mr with Apache License 2.0
@Test
public void testCompatStringCompatibility() throws IOException {
  // some older versions of Parquet used avro.schema instead of
  // parquet.avro.schema and didn't annotate binary with UTF8 when the type
  // was converted from an Avro string. this validates that the old read
  // schema is recognized and used to read the file as expected.
  Path testFile = new Path(Resources.getResource("strings-2.parquet").getFile());
  Configuration conf = new Configuration();
  ParquetReader<GenericRecord> reader = AvroParquetReader
      .builder(new AvroReadSupport<GenericRecord>(), testFile)
      .withConf(conf)
      .build();
  GenericRecord r;
  while ((r = reader.read()) != null) {
    Assert.assertTrue("Should read value into a String",
        r.get("text") instanceof String);
  }
}
Example #17
Source File: TestFiltersWithMissingColumns.java From parquet-mr with Apache License 2.0
public static long countFilteredRecords(Path path, FilterPredicate pred) throws IOException {
  ParquetReader<Group> reader = ParquetReader
      .builder(new GroupReadSupport(), path)
      .withFilter(FilterCompat.get(pred))
      .build();

  long count = 0;
  try {
    while (reader.read() != null) {
      count += 1;
    }
  } finally {
    reader.close();
  }
  return count;
}
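A hypothetical call site for the helper above; the column name "id", the value 7L, and the file path are illustrative, not taken from the original test:

// Hypothetical usage; assumes the file has a 64-bit column named "id".
import static org.apache.parquet.filter2.predicate.FilterApi.eq;
import static org.apache.parquet.filter2.predicate.FilterApi.longColumn;

FilterPredicate pred = eq(longColumn("id"), 7L);
long matches = countFilteredRecords(new Path("/tmp/data.parquet"), pred);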
Example #18
Source File: PageChecksumReadBenchmarks.java From parquet-mr with Apache License 2.0
private void readFile(Path file, int nRows, boolean verifyChecksums, Blackhole blackhole) throws IOException {
  try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file)
      .withConf(configuration)
      .usePageChecksumVerification(verifyChecksums)
      .build()) {
    for (int i = 0; i < nRows; i++) {
      Group group = reader.read();
      blackhole.consume(group.getLong("long_field", 0));
      blackhole.consume(group.getBinary("binary_field", 0));
      Group subgroup = group.getGroup("group", 0);
      blackhole.consume(subgroup.getInteger("int_field", 0));
      blackhole.consume(subgroup.getInteger("int_field", 1));
      blackhole.consume(subgroup.getInteger("int_field", 2));
      blackhole.consume(subgroup.getInteger("int_field", 3));
    }
  }
}
Example #19
Source File: PhoneBookWriter.java From parquet-mr with Apache License 2.0
private static ParquetReader<Group> createReader(Path file, Filter filter) throws IOException {
  Configuration conf = new Configuration();
  GroupWriteSupport.setSchema(schema, conf);

  return ParquetReader.builder(new GroupReadSupport(), file)
      .withConf(conf)
      .withFilter(filter)
      .build();
}
Example #20
Source File: TestSpecificReadWrite.java From parquet-mr with Apache License 2.0
@Test
public void testFilterMatchesNoBlocks() throws IOException {
  Path path = writeCarsToParquetFile(10000, CompressionCodecName.UNCOMPRESSED, false, DEFAULT_BLOCK_SIZE / 64, DEFAULT_PAGE_SIZE / 64);
  try (ParquetReader<Car> reader = new AvroParquetReader<>(testConf, path, column("make", equalTo("Bogus")))) {
    assertNull(reader.read());
  }
}
Example #21
Source File: TestBackwardCompatibility.java From parquet-mr with Apache License 2.0
@Test
public void testStringCompatibility() throws IOException {
  Path testFile = new Path(Resources.getResource("strings-2.parquet").getFile());
  Configuration conf = new Configuration();
  conf.setBoolean(AvroReadSupport.AVRO_COMPATIBILITY, false);
  ParquetReader<GenericRecord> reader = AvroParquetReader
      .builder(new AvroReadSupport<GenericRecord>(), testFile)
      .withConf(conf)
      .build();
  GenericRecord r;
  while ((r = reader.read()) != null) {
    Assert.assertTrue("Should read value into a Utf8",
        r.get("text") instanceof Utf8);
  }
}
Example #22
Source File: TestSpecificReadWrite.java From parquet-mr with Apache License 2.0
@Test
public void testFilterWithDictionary() throws IOException {
  Path path = writeCarsToParquetFile(1, CompressionCodecName.UNCOMPRESSED, true);
  try (ParquetReader<Car> reader = new AvroParquetReader<>(testConf, path, column("make", equalTo("Volkswagen")))) {
    assertEquals(getVwPolo().toString(), reader.read().toString());
    assertEquals(getVwPassat().toString(), reader.read().toString());
    assertNull(reader.read());
  }
}
Example #23
Source File: TestSpecificReadWrite.java From parquet-mr with Apache License 2.0
@Test
public void testProjection() throws IOException {
  Path path = writeCarsToParquetFile(1, CompressionCodecName.UNCOMPRESSED, false);
  Configuration conf = new Configuration(testConf);

  Schema schema = Car.getClassSchema();
  List<Schema.Field> fields = schema.getFields();

  // Schema.Parser parser = new Schema.Parser();
  List<Schema.Field> projectedFields = new ArrayList<Schema.Field>();
  for (Schema.Field field : fields) {
    String name = field.name();
    if ("optionalExtra".equals(name) || "serviceHistory".equals(name)) {
      continue;
    }

    // Schema schemaClone = parser.parse(field.schema().toString(false));
    Schema.Field fieldClone = new Schema.Field(name, field.schema(), field.doc(), field.defaultVal());
    projectedFields.add(fieldClone);
  }

  Schema projectedSchema = Schema.createRecord(schema.getName(), schema.getDoc(), schema.getNamespace(), schema.isError());
  projectedSchema.setFields(projectedFields);
  AvroReadSupport.setRequestedProjection(conf, projectedSchema);

  try (ParquetReader<Car> reader = new AvroParquetReader<Car>(conf, path)) {
    for (Car car = reader.read(); car != null; car = reader.read()) {
      assertTrue(car.getDoors() == 4 || car.getDoors() == 5);
      assertNotNull(car.getEngine());
      assertNotNull(car.getMake());
      assertNotNull(car.getModel());
      assertEquals(2010, car.getYear());
      assertNotNull(car.getVin());
      assertNull(car.getOptionalExtra());
      assertNull(car.getServiceHistory());
    }
  }
}
Example #24
Source File: TestSpecificReadWrite.java From parquet-mr with Apache License 2.0
@Test
public void testFilterMatchesFinalBlockOnly() throws IOException {
  File tmp = File.createTempFile(getClass().getSimpleName(), ".tmp");
  tmp.deleteOnExit();
  tmp.delete();
  Path path = new Path(tmp.getPath());

  Car vwPolo = getVwPolo();
  Car vwPassat = getVwPassat();
  Car bmwMini = getBmwMini();

  try (ParquetWriter<Car> writer = new AvroParquetWriter<Car>(path, Car.SCHEMA$,
      CompressionCodecName.UNCOMPRESSED, DEFAULT_BLOCK_SIZE / 128, DEFAULT_PAGE_SIZE / 128, false)) {
    for (int i = 0; i < 10000; i++) {
      writer.write(vwPolo);
      writer.write(vwPassat);
      writer.write(vwPolo);
    }
    writer.write(bmwMini); // only write BMW in last block
  }

  try (ParquetReader<Car> reader = new AvroParquetReader<Car>(testConf, path,
      column("make", equalTo("BMW")))) {
    assertEquals(getBmwMini().toString(), reader.read().toString());
    assertNull(reader.read());
  }
}
Example #25
Source File: TestThriftToParquetFileWriter.java From parquet-mr with Apache License 2.0
private ParquetReader<Group> createRecordReader(Path parquetFilePath) throws IOException {
  Configuration configuration = new Configuration(true);

  GroupReadSupport readSupport = new GroupReadSupport();
  ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, parquetFilePath);
  MessageType schema = readFooter.getFileMetaData().getSchema();

  readSupport.init(configuration, null, schema);
  return new ParquetReader<Group>(parquetFilePath, readSupport);
}
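For comparison only: the direct ParquetReader constructor used above is deprecated in more recent parquet-mr releases. A sketch of a builder-based equivalent, assuming the same GroupReadSupport, might look like this; the builder reads the footer and initializes the ReadSupport internally, so the explicit readFooter/init steps fall away.

// Hedged alternative sketch, not part of the original test.
private ParquetReader<Group> createRecordReaderViaBuilder(Path parquetFilePath) throws IOException {
  Configuration configuration = new Configuration(true);
  // builder() handles footer reading and ReadSupport initialization during build().
  return ParquetReader.builder(new GroupReadSupport(), parquetFilePath)
      .withConf(configuration)
      .build();
}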
Example #26
Source File: ThriftParquetReader.java From parquet-mr with Apache License 2.0
public ParquetReader<T> build() throws IOException {
  ReadSupport<T> readSupport;

  if (thriftClass != null) {
    readSupport = new ThriftReadSupport<T>(thriftClass);
  } else {
    readSupport = new ThriftReadSupport<T>();
  }

  return ParquetReader.builder(readSupport, file).withConf(conf).withFilter(filter).build();
}
Example #27
Source File: PhoneBookWriter.java From parquet-mr with Apache License 2.0
public static List<Group> readFile(File f, Filter filter) throws IOException {
  ParquetReader<Group> reader = createReader(new Path(f.getAbsolutePath()), filter);

  Group current;
  List<Group> users = new ArrayList<Group>();

  current = reader.read();
  while (current != null) {
    users.add(current);
    current = reader.read();
  }

  return users;
}
Example #28
Source File: TestBinary.java From parquet-mr with Apache License 2.0
@Test
public void testBinary() throws IOException {
  StringAndBinary expected = new StringAndBinary("test",
      ByteBuffer.wrap(new byte[] { -123, 20, 33 }));
  File temp = tempDir.newFile(UUID.randomUUID().toString());
  temp.deleteOnExit();
  temp.delete();

  Path path = new Path(temp.getPath());

  ThriftParquetWriter<StringAndBinary> writer = new ThriftParquetWriter<StringAndBinary>(
      path, StringAndBinary.class, CompressionCodecName.SNAPPY);
  writer.write(expected);
  writer.close();

  ParquetReader<StringAndBinary> reader = ThriftParquetReader.<StringAndBinary>build(path)
      .withThriftClass(StringAndBinary.class)
      .build();
  StringAndBinary record = reader.read();
  reader.close();

  assertSchema(ParquetFileReader.readFooter(new Configuration(), path));
  assertEquals("Should match after serialization round trip", expected, record);
}
Example #29
Source File: PhoneBookWriter.java From parquet-mr with Apache License 2.0
public static List<User> readUsers(ParquetReader.Builder<Group> builder) throws IOException {
  ParquetReader<Group> reader =
      builder.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString()).build();

  List<User> users = new ArrayList<>();
  for (Group group = reader.read(); group != null; group = reader.read()) {
    users.add(userFromGroup(group));
  }
  return users;
}
Example #30
Source File: ScroogeBinaryTest.java From parquet-mr with Apache License 2.0
@Test
@SuppressWarnings("unchecked")
public void testScroogeBinaryDecoding() throws Exception {
  StringAndBinary expected = new StringAndBinary.Immutable("test",
      ByteBuffer.wrap(new byte[] { -123, 20, 33 }));

  File temp = tempDir.newFile(UUID.randomUUID().toString());
  temp.deleteOnExit();
  temp.delete();

  Path path = new Path(temp.getPath());

  ParquetWriter<StringAndBinary> writer = new ParquetWriter<StringAndBinary>(
      path, new Configuration(), new ScroogeWriteSupport<StringAndBinary>(StringAndBinary.class));
  writer.write(expected);
  writer.close();

  Configuration conf = new Configuration();
  conf.set("parquet.thrift.converter.class", ScroogeRecordConverter.class.getName());
  ParquetReader<StringAndBinary> reader = ParquetReader.<StringAndBinary>builder(new ScroogeReadSupport(), path)
      .withConf(conf)
      .build();
  StringAndBinary record = reader.read();
  reader.close();

  Assert.assertEquals("String should match after serialization round trip",
      "test", record.s());
  Assert.assertEquals("ByteBuffer should match after serialization round trip",
      ByteBuffer.wrap(new byte[] { -123, 20, 33 }), record.b());
}