Java Code Examples for org.apache.hadoop.hive.ql.io.orc.Reader

The following examples show how to use org.apache.hadoop.hive.ql.io.orc.Reader. They are extracted from open source projects; the source project, source file, and license are noted above each example.
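Taken together, the examples follow one core pattern: open a Reader with OrcFile.createReader, obtain its StructObjectInspector, and iterate rows through a RecordReader. The sketch below is a minimal illustration of that pattern, not code from any of the listed projects; the file path is a placeholder and the usual Hadoop/Hive imports are assumed.

Configuration conf = new Configuration();
Path path = new Path("/tmp/example.orc"); // placeholder path
Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
StructObjectInspector inspector = (StructObjectInspector) reader.getObjectInspector();
RecordReader rows = reader.rows();
Object row = null;
while (rows.hasNext()) {
    row = rows.next(row); // the reader reuses the row object across iterations
    List<Object> fields = inspector.getStructFieldsDataAsList(row);
    // examine individual field values here
}
rows.close();
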
Example 1
Source Project: spork   Source File: TestOrcStorage.java    License: Apache License 2.0
@Test
public void testMultiStore() throws Exception {
    pigServer.setBatchOn();
    pigServer.registerQuery("A = load '" + INPUT1 + "' as (a0:int, a1:chararray);");
    pigServer.registerQuery("B = order A by a0;");
    pigServer.registerQuery("store B into '" + OUTPUT2 + "' using OrcStorage();");
    pigServer.registerQuery("store B into '" + OUTPUT3 +"' using OrcStorage('-c SNAPPY');");
    pigServer.executeBatch();

    Path outputFilePath = new Path(new Path(OUTPUT2), "part-r-00000");
    Reader reader = OrcFile.createReader(fs, outputFilePath);
    assertEquals(reader.getNumberOfRows(), 2);
    assertEquals(reader.getCompression(), CompressionKind.ZLIB);

    Path outputFilePath2 = new Path(new Path(OUTPUT3), "part-r-00000");
    reader = OrcFile.createReader(fs, outputFilePath2);
    assertEquals(reader.getNumberOfRows(), 2);
    assertEquals(reader.getCompression(), CompressionKind.SNAPPY);

    verifyData(outputFilePath, outputFilePath2, fs, 2);
}
 
Example 2
Source Project: presto   Source File: OrcTester.java    License: Apache License 2.0
private static void assertFileContentsOrcHive(
        Type type,
        TempFile tempFile,
        Iterable<?> expectedValues)
        throws Exception
{
    JobConf configuration = new JobConf(new Configuration(false));
    configuration.set(READ_COLUMN_IDS_CONF_STR, "0");
    configuration.setBoolean(READ_ALL_COLUMNS, false);

    Reader reader = OrcFile.createReader(
            new Path(tempFile.getFile().getAbsolutePath()),
            new ReaderOptions(configuration));
    RecordReader recordReader = reader.rows();

    StructObjectInspector rowInspector = (StructObjectInspector) reader.getObjectInspector();
    StructField field = rowInspector.getStructFieldRef("test");

    Iterator<?> iterator = expectedValues.iterator();
    Object rowData = null;
    while (recordReader.hasNext()) {
        rowData = recordReader.next(rowData);
        Object expectedValue = iterator.next();

        Object actualValue = rowInspector.getStructFieldData(rowData, field);
        actualValue = decodeRecordReaderValue(type, actualValue);
        assertColumnValueEquals(type, actualValue, expectedValue);
    }
    assertFalse(iterator.hasNext());
}
 
Example 3
Source Project: presto   Source File: OrcFileRewriter.java    License: Apache License 2.0
public static OrcFileInfo rewrite(File input, File output, BitSet rowsToDelete)
        throws IOException
{
    try (FileSystem fileSystem = new SyncingFileSystem(CONFIGURATION)) {
        Reader reader = createReader(fileSystem, path(input));

        if (reader.getNumberOfRows() < rowsToDelete.length()) {
            throw new IOException("File has fewer rows than deletion vector");
        }
        int deleteRowCount = rowsToDelete.cardinality();
        if (reader.getNumberOfRows() == deleteRowCount) {
            return new OrcFileInfo(0, 0);
        }
        if (reader.getNumberOfRows() >= Integer.MAX_VALUE) {
            throw new IOException("File has too many rows");
        }
        int inputRowCount = toIntExact(reader.getNumberOfRows());

        WriterOptions writerOptions = OrcFile.writerOptions(CONFIGURATION)
                .memory(new NullMemoryManager())
                .fileSystem(fileSystem)
                .compress(reader.getCompression())
                .inspector(reader.getObjectInspector());

        long start = System.nanoTime();
        try (Closer<RecordReader, IOException> recordReader = closer(reader.rows(), RecordReader::close);
                Closer<Writer, IOException> writer = closer(createWriter(path(output), writerOptions), Writer::close)) {
            if (reader.hasMetadataValue(OrcFileMetadata.KEY)) {
                ByteBuffer orcFileMetadata = reader.getMetadataValue(OrcFileMetadata.KEY);
                writer.get().addUserMetadata(OrcFileMetadata.KEY, orcFileMetadata);
            }
            OrcFileInfo fileInfo = rewrite(recordReader.get(), writer.get(), rowsToDelete, inputRowCount);
            log.debug("Rewrote file %s in %s (input rows: %s, output rows: %s)", input.getName(), nanosSince(start), inputRowCount, inputRowCount - deleteRowCount);
            return fileInfo;
        }
    }
}
 
Example 4
Source Project: pxf   Source File: HiveORCVectorizedAccessor.java    License: Apache License 2.0
@Override
public boolean openForRead() throws Exception {
    Reader.Options options = new Reader.Options();
    addColumns(options);
    addFragments(options);
    orcReader = getOrcReader();
    vrr = orcReader.rowsOptions(options);
    batch = orcReader.getSchema().createRowBatch();
    return vrr.hasNext();
}
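openForRead only prepares the vectorized reader; a caller then drains it batch by batch. The following is a rough sketch of that consumption, assuming the Hive 2.x RecordReader whose nextBatch(VectorizedRowBatch) returns false at end of input; the column layout is illustrative.

while (vrr.nextBatch(batch)) {
    for (int row = 0; row < batch.size; row++) {
        // e.g., for a BIGINT column at position 0:
        // long value = ((LongColumnVector) batch.cols[0]).vector[row];
    }
}
vrr.close();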
 
Example 5
Source Project: pxf   Source File: HiveORCVectorizedAccessor.java    License: Apache License 2.0
/**
 * Updates the reader options to include only the projected columns.
 *
 * @param options reader options to modify
 * @throws Exception if the column metadata cannot be read
 */
private void addColumns(Reader.Options options) throws Exception {
    boolean[] includeColumns = new boolean[context.getColumns() + 1];
    for (ColumnDescriptor col : context.getTupleDescription()) {
        if (col.isProjected()) {
            includeColumns[col.columnIndex() + 1] = true;
        }
    }
    options.include(includeColumns);
}
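The getColumns() + 1 sizing and the columnIndex() + 1 offset follow the ORC include-array convention: entries are indexed by type id, and id 0 is the root struct itself, so for a flat schema top-level column i maps to index i + 1. For a hypothetical three-column schema with only the second column projected, the array would be:

// index 0 = root struct; indexes 1..3 = the three top-level columns
boolean[] includeColumns = new boolean[]{false, false, true, false};
options.include(includeColumns);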
 
Example 6
Source Project: pxf   Source File: HiveUtilities.java    License: Apache License 2.0
/**
 * Creates an ORC file reader.
 *
 * @param configuration  Hadoop configuration
 * @param requestContext input data with the given data source
 * @return ORC file reader
 */
public static Reader getOrcReader(Configuration configuration, RequestContext requestContext) {
    try {
        Path path = new Path(requestContext.getDataSource());
        return OrcFile.createReader(path.getFileSystem(configuration), path);
    } catch (Exception e) {
        throw new RuntimeException("Exception while getting orc reader", e);
    }
}
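The two-argument createReader(FileSystem, Path) overload used here is the older factory method. With the options-based overload that several later examples use, the same call would look roughly like this:

return OrcFile.createReader(path,
        OrcFile.readerOptions(configuration).filesystem(path.getFileSystem(configuration)));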
 
Example 7
Source Project: dremio-oss   Source File: HiveRecordReaders.java    License: Apache License 2.0
Reader(final HiveTableXattr tableAttr, final SplitAndPartitionInfo split,
    final List<SchemaPath> projectedColumns, final OperatorContext context, final JobConf jobConf,
    final SerDe tableSerDe, final StructObjectInspector tableOI, final SerDe partitionSerDe,
    final StructObjectInspector partitionOI, final ScanFilter filter, final Collection<List<String>> referencedTables,
    final UserGroupInformation readerUgi) {
  super(tableAttr, split, projectedColumns, context, jobConf, tableSerDe, tableOI, partitionSerDe, partitionOI, filter,
    referencedTables, readerUgi);
}
 
Example 8
Source Project: dremio-oss   Source File: HiveRecordReaders.java    License: Apache License 2.0
Reader(final HiveTableXattr tableAttr, final SplitAndPartitionInfo split,
    final List<SchemaPath> projectedColumns, final OperatorContext context, final JobConf jobConf,
    final AbstractSerDe tableSerDe, final StructObjectInspector tableOI, final AbstractSerDe partitionSerDe,
    final StructObjectInspector partitionOI, final ScanFilter filter, final Collection<List<String>> referencedTables,
    final UserGroupInformation readerUgi) {
  super(tableAttr, split, projectedColumns, context, jobConf, tableSerDe, tableOI, partitionSerDe, partitionOI, filter,
    referencedTables, readerUgi);
}
 
Example 9
Source Project: incubator-gobblin   Source File: OrcTestTools.java    License: Apache License 2.0
/**
 * Reads all ORC files under the given path into an in-memory representation.
 */
@Override
public TreeMap<String, OrcRowIterator> readAllRecordsInBinaryDirectory(FileSystem fs, Path path) throws IOException {
  TreeMap<String, OrcRowIterator> output = new TreeMap<>();
  if (!fs.exists(path)) {
    return output;
  }
  PathFilter pathFilter = new HiddenFilter();
  for (FileStatus status : FileListUtils.listFilesRecursively(fs, path, pathFilter)) {
    String key = PathUtils.relativizePath(status.getPath(), path).toString();
    Reader orcReader = OrcFile.createReader(fs, status.getPath());
    RecordReader recordReader = orcReader.rows();

    output.put(key, new OrcRowIterator(TypeInfoUtils.getTypeInfoFromObjectInspector(orcReader.getObjectInspector()),
        new AbstractIterator<Writable>() {
          @Override
          protected Writable computeNext() {
            try {
              if (recordReader.hasNext()) {
                return (Writable) recordReader.next(null);
              } else {
                recordReader.close();
                endOfData();
                return null;
              }
            } catch (IOException ioe) {
              log.warn("Failed to process orc record reader, will terminate reader immediately", ioe);
              endOfData();
              return null;
            }
          }
        }));
  }

  return output;
}
 
Example 10
Source Project: spork   Source File: OrcStorage.java    License: Apache License 2.0
private TypeInfo getTypeInfoFromLocation(String location, Job job) throws IOException {
    FileSystem fs = FileSystem.get(job.getConfiguration());
    Path path = getFirstFile(location, fs);
    if (path == null) {
        log.info("Cannot find any ORC files under " + location +
                ". The script probably contains multiple load/store statements.");
        return null;
    }
    Reader reader = OrcFile.createReader(fs, path);
    ObjectInspector oip = reader.getObjectInspector();
    return TypeInfoUtils.getTypeInfoFromObjectInspector(oip);
}
 
Example 11
Source Project: spork   Source File: TestOrcStorage.java    License: Apache License 2.0
private void verifyData(Path orcFile, Iterator<Tuple> iter, FileSystem fs, int expectedTotalRows) throws Exception {
    int expectedRows = 0;
    int actualRows = 0;
    Reader orcReader = OrcFile.createReader(fs, orcFile);
    ObjectInspector oi = orcReader.getObjectInspector();
    StructObjectInspector soi = (StructObjectInspector) oi;

    RecordReader reader = orcReader.rows(null);
    Object row = null;

    while (reader.hasNext()) {
        row = reader.next(row);
        expectedRows++;
        List<?> orcRow = soi.getStructFieldsDataAsList(row);
        if (!iter.hasNext()) {
            break;
        }
        Tuple t = iter.next();
        assertEquals(orcRow.size(), t.size());
        actualRows++;

        for (int i = 0; i < orcRow.size(); i++) {
            Object expected = orcRow.get(i);
            Object actual = t.get(i);
            compareData(expected, actual);
        }
    }
    assertFalse(iter.hasNext());
    assertEquals(expectedRows, actualRows);
    assertEquals(expectedTotalRows, actualRows);

    reader.close();
}
 
Example 12
Source Project: spork   Source File: TestOrcStorage.java    License: Apache License 2.0
private void verifyData(Path orcFile, Path pigOrcFile, FileSystem fs, int expectedTotalRows) throws Exception {
    int expectedRows = 0;
    int actualRows = 0;
    Reader orcReaderExpected = OrcFile.createReader(fs, orcFile);
    StructObjectInspector soiExpected = (StructObjectInspector) orcReaderExpected.getObjectInspector();
    // the actual data is the Pig-written output file
    Reader orcReaderActual = OrcFile.createReader(fs, pigOrcFile);
    StructObjectInspector soiActual = (StructObjectInspector) orcReaderActual.getObjectInspector();

    RecordReader readerExpected = orcReaderExpected.rows(null);
    Object expectedRow = null;
    RecordReader readerActual = orcReaderActual.rows(null);
    Object actualRow = null;

    while (readerExpected.hasNext()) {
        expectedRow = readerExpected.next(expectedRow);
        expectedRows++;
        List<?> orcRowExpected = soiExpected.getStructFieldsDataAsList(expectedRow);
        if (!readerActual.hasNext()) {
            break;
        }
        actualRow = readerActual.next(actualRow);
        actualRows++;
        List<?> orcRowActual = soiActual.getStructFieldsDataAsList(actualRow);
        assertEquals(orcRowExpected.size(), orcRowActual.size());

        for (int i = 0; i < orcRowExpected.size(); i++) {
            assertEquals(orcRowExpected.get(i), orcRowActual.get(i));
        }
    }
    assertFalse(readerActual.hasNext());
    assertEquals(expectedRows, actualRows);
    assertEquals(expectedTotalRows, actualRows);

    readerExpected.close();
    readerActual.close();
}
 
Example 13
Source Project: nifi   Source File: PutORCTest.java    License: Apache License 2.0
private void verifyORCUsers(final Path orcUsers, final int numExpectedUsers, BiFunction<List<Object>, Integer, Void> assertFunction) throws IOException {
    Reader reader = OrcFile.createReader(orcUsers, OrcFile.readerOptions(testConf));
    RecordReader recordReader = reader.rows();

    TypeInfo typeInfo =
            TypeInfoUtils.getTypeInfoFromTypeString("struct<name:string,favorite_number:int,favorite_color:string,scale:double>");
    StructObjectInspector inspector = (StructObjectInspector)
            OrcStruct.createObjectInspector(typeInfo);

    int currUser = 0;
    Object nextRecord = null;
    while ((nextRecord = recordReader.next(nextRecord)) != null) {
        Assert.assertNotNull(nextRecord);
        Assert.assertTrue("Not an OrcStruct", nextRecord instanceof OrcStruct);
        List<Object> x = inspector.getStructFieldsDataAsList(nextRecord);

        if (assertFunction == null) {
            assertEquals("name" + currUser, x.get(0).toString());
            assertEquals(currUser, ((IntWritable) x.get(1)).get());
            assertEquals("blue" + currUser, x.get(2).toString());
            assertEquals(10.0 * currUser, ((DoubleWritable) x.get(3)).get(), Double.MIN_VALUE);
        } else {
            assertFunction.apply(x, currUser);
        }
        currUser++;
    }

    assertEquals(numExpectedUsers, currUser);
}
 
Example 14
Source Project: localization_nifi   Source File: TestConvertAvroToORC.java    License: Apache License 2.0
@Test
public void test_onTrigger_primitive_record() throws Exception {
    GenericData.Record record = TestNiFiOrcUtils.buildPrimitiveAvroRecord(10, 20L, true, 30.0f, 40, StandardCharsets.UTF_8.encode("Hello"), "World");

    DatumWriter<GenericData.Record> writer = new GenericDatumWriter<>(record.getSchema());
    DataFileWriter<GenericData.Record> fileWriter = new DataFileWriter<>(writer);
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    fileWriter.create(record.getSchema(), out);
    fileWriter.append(record);
    // Put another record in
    record = TestNiFiOrcUtils.buildPrimitiveAvroRecord(1, 2L, false, 3.0f, 4L, StandardCharsets.UTF_8.encode("I am"), "another record");
    fileWriter.append(record);
    // And one more
    record = TestNiFiOrcUtils.buildPrimitiveAvroRecord(100, 200L, true, 300.0f, 400L, StandardCharsets.UTF_8.encode("Me"), "too!");
    fileWriter.append(record);
    fileWriter.flush();
    fileWriter.close();
    out.close();
    Map<String, String> attributes = new HashMap<String, String>() {{
        put(CoreAttributes.FILENAME.key(), "test.avro");
    }};
    runner.enqueue(out.toByteArray(), attributes);
    runner.run();

    runner.assertAllFlowFilesTransferred(ConvertAvroToORC.REL_SUCCESS, 1);

    // Write the flow file out to disk, since the ORC Reader needs a path
    MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToORC.REL_SUCCESS).get(0);
    assertEquals("CREATE EXTERNAL TABLE IF NOT EXISTS test_record (int INT, long BIGINT, boolean BOOLEAN, float FLOAT, double DOUBLE, bytes BINARY, string STRING)"
            + " STORED AS ORC", resultFlowFile.getAttribute(ConvertAvroToORC.HIVE_DDL_ATTRIBUTE));
    assertEquals("3", resultFlowFile.getAttribute(ConvertAvroToORC.RECORD_COUNT_ATTRIBUTE));
    assertEquals("test.orc", resultFlowFile.getAttribute(CoreAttributes.FILENAME.key()));
    byte[] resultContents = runner.getContentAsByteArray(resultFlowFile);
    FileOutputStream fos = new FileOutputStream("target/test1.orc");
    fos.write(resultContents);
    fos.flush();
    fos.close();

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    Reader reader = OrcFile.createReader(new Path("target/test1.orc"), OrcFile.readerOptions(conf).filesystem(fs));
    RecordReader rows = reader.rows();
    Object o = rows.next(null);
    assertNotNull(o);
    assertTrue(o instanceof OrcStruct);
    TypeInfo resultSchema = TestNiFiOrcUtils.buildPrimitiveOrcSchema();
    StructObjectInspector inspector = (StructObjectInspector) OrcStruct.createObjectInspector(resultSchema);

    // Check some fields in the first row
    Object intFieldObject = inspector.getStructFieldData(o, inspector.getStructFieldRef("int"));
    assertTrue(intFieldObject instanceof IntWritable);
    assertEquals(10, ((IntWritable) intFieldObject).get());
    Object stringFieldObject = inspector.getStructFieldData(o, inspector.getStructFieldRef("string"));
    assertTrue(stringFieldObject instanceof Text);
    assertEquals("World", stringFieldObject.toString());

}
 
Example 15
Source Project: localization_nifi   Source File: TestConvertAvroToORC.java    License: Apache License 2.0
@Test
public void test_onTrigger_complex_record() throws Exception {

    Map<String, Double> mapData1 = new TreeMap<String, Double>() {{
        put("key1", 1.0);
        put("key2", 2.0);
    }};

    GenericData.Record record = TestNiFiOrcUtils.buildComplexAvroRecord(10, mapData1, "DEF", 3.0f, Arrays.asList(10, 20));

    DatumWriter<GenericData.Record> writer = new GenericDatumWriter<>(record.getSchema());
    DataFileWriter<GenericData.Record> fileWriter = new DataFileWriter<>(writer);
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    fileWriter.create(record.getSchema(), out);
    fileWriter.append(record);

    // Put another record in
    Map<String, Double> mapData2 = new TreeMap<String, Double>() {{
        put("key1", 3.0);
        put("key2", 4.0);
    }};

    record = TestNiFiOrcUtils.buildComplexAvroRecord(null, mapData2, "XYZ", 4L, Arrays.asList(100, 200));
    fileWriter.append(record);

    fileWriter.flush();
    fileWriter.close();
    out.close();

    Map<String, String> attributes = new HashMap<String, String>() {{
        put(CoreAttributes.FILENAME.key(), "test");
    }};
    runner.enqueue(out.toByteArray(), attributes);
    runner.run();

    runner.assertAllFlowFilesTransferred(ConvertAvroToORC.REL_SUCCESS, 1);

    // Write the flow file out to disk, since the ORC Reader needs a path
    MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToORC.REL_SUCCESS).get(0);
    assertEquals("CREATE EXTERNAL TABLE IF NOT EXISTS complex_record " +
            "(myInt INT, myMap MAP<STRING, DOUBLE>, myEnum STRING, myLongOrFloat UNIONTYPE<BIGINT, FLOAT>, myIntList ARRAY<INT>)"
            + " STORED AS ORC", resultFlowFile.getAttribute(ConvertAvroToORC.HIVE_DDL_ATTRIBUTE));
    assertEquals("2", resultFlowFile.getAttribute(ConvertAvroToORC.RECORD_COUNT_ATTRIBUTE));
    assertEquals("test.orc", resultFlowFile.getAttribute(CoreAttributes.FILENAME.key()));
    byte[] resultContents = runner.getContentAsByteArray(resultFlowFile);
    FileOutputStream fos = new FileOutputStream("target/test1.orc");
    fos.write(resultContents);
    fos.flush();
    fos.close();

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    Reader reader = OrcFile.createReader(new Path("target/test1.orc"), OrcFile.readerOptions(conf).filesystem(fs));
    RecordReader rows = reader.rows();
    Object o = rows.next(null);
    assertNotNull(o);
    assertTrue(o instanceof OrcStruct);
    TypeInfo resultSchema = TestNiFiOrcUtils.buildComplexOrcSchema();
    StructObjectInspector inspector = (StructObjectInspector) OrcStruct.createObjectInspector(resultSchema);

    // Check some fields in the first row
    Object intFieldObject = inspector.getStructFieldData(o, inspector.getStructFieldRef("myInt"));
    assertTrue(intFieldObject instanceof IntWritable);
    assertEquals(10, ((IntWritable) intFieldObject).get());

    // This is pretty awkward and messy. The map object is a Map (not a MapWritable) but the keys are writables (in this case Text)
    // and so are the values (DoubleWritables in this case).
    Object mapFieldObject = inspector.getStructFieldData(o, inspector.getStructFieldRef("myMap"));
    assertTrue(mapFieldObject instanceof Map);
    Map map = (Map) mapFieldObject;
    Object mapValue = map.get(new Text("key1"));
    assertNotNull(mapValue);
    assertTrue(mapValue instanceof DoubleWritable);
    assertEquals(1.0, ((DoubleWritable) mapValue).get(), Double.MIN_VALUE);

    mapValue = map.get(new Text("key2"));
    assertNotNull(mapValue);
    assertTrue(mapValue instanceof DoubleWritable);
    assertEquals(2.0, ((DoubleWritable) mapValue).get(), Double.MIN_VALUE);
}
 
Example 16
Source Project: localization_nifi   Source File: TestConvertAvroToORC.java    License: Apache License 2.0
@Test
public void test_onTrigger_array_of_records() throws Exception {
    final Schema schema = new Schema.Parser().parse(new File("src/test/resources/array_of_records.avsc"));
    List<GenericRecord> innerRecords = new LinkedList<>();

    final GenericRecord outerRecord = new GenericData.Record(schema);

    Schema arraySchema = schema.getField("records").schema();
    Schema innerRecordSchema = arraySchema.getElementType();
    final GenericRecord innerRecord1 = new GenericData.Record(innerRecordSchema);
    innerRecord1.put("name", "Joe");
    innerRecord1.put("age", 42);

    innerRecords.add(innerRecord1);

    final GenericRecord innerRecord2 = new GenericData.Record(innerRecordSchema);
    innerRecord2.put("name", "Mary");
    innerRecord2.put("age", 28);

    innerRecords.add(innerRecord2);

    GenericData.Array<GenericRecord> array = new GenericData.Array<>(arraySchema, innerRecords);
    outerRecord.put("records", array);

    final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    try (DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter)) {
        dataFileWriter.create(schema, out);
        dataFileWriter.append(outerRecord);
    }
    out.close();

    // Build a flow file from the Avro record
    Map<String, String> attributes = new HashMap<String, String>() {{
        put(CoreAttributes.FILENAME.key(), "test");
    }};
    runner.enqueue(out.toByteArray(), attributes);
    runner.run();

    runner.assertAllFlowFilesTransferred(ConvertAvroToORC.REL_SUCCESS, 1);

    // Write the flow file out to disk, since the ORC Reader needs a path
    MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToORC.REL_SUCCESS).get(0);
    assertEquals("CREATE EXTERNAL TABLE IF NOT EXISTS org_apache_nifi_outer_record " +
            "(records ARRAY<STRUCT<name:STRING, age:INT>>)"
            + " STORED AS ORC", resultFlowFile.getAttribute(ConvertAvroToORC.HIVE_DDL_ATTRIBUTE));
    assertEquals("1", resultFlowFile.getAttribute(ConvertAvroToORC.RECORD_COUNT_ATTRIBUTE));
    assertEquals("test.orc", resultFlowFile.getAttribute(CoreAttributes.FILENAME.key()));
    byte[] resultContents = runner.getContentAsByteArray(resultFlowFile);
    FileOutputStream fos = new FileOutputStream("target/test1.orc");
    fos.write(resultContents);
    fos.flush();
    fos.close();

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    Reader reader = OrcFile.createReader(new Path("target/test1.orc"), OrcFile.readerOptions(conf).filesystem(fs));
    RecordReader rows = reader.rows();
    Object o = rows.next(null);
    assertNotNull(o);
    assertTrue(o instanceof OrcStruct);
    StructObjectInspector inspector = (StructObjectInspector) OrcStruct.createObjectInspector(NiFiOrcUtils.getOrcField(schema));

    // Verify the record contains an array
    Object arrayFieldObject = inspector.getStructFieldData(o, inspector.getStructFieldRef("records"));
    assertTrue(arrayFieldObject instanceof ArrayList);
    ArrayList<?> arrayField = (ArrayList<?>) arrayFieldObject;
    assertEquals(2, arrayField.size());

    // Verify the first element. Should be a record with two fields "name" and "age"
    Object element = arrayField.get(0);
    assertTrue(element instanceof OrcStruct);
    StructObjectInspector elementInspector = (StructObjectInspector) OrcStruct.createObjectInspector(NiFiOrcUtils.getOrcField(innerRecordSchema));
    Object nameObject = elementInspector.getStructFieldData(element, elementInspector.getStructFieldRef("name"));
    assertTrue(nameObject instanceof Text);
    assertEquals("Joe", nameObject.toString());
    Object ageObject = elementInspector.getStructFieldData(element, elementInspector.getStructFieldRef("age"));
    assertTrue(ageObject instanceof IntWritable);
    assertEquals(42, ((IntWritable) ageObject).get());

    // Verify the second element. Should be a record with two fields "name" and "age"
    element = arrayField.get(1);
    assertTrue(element instanceof OrcStruct);
    nameObject = elementInspector.getStructFieldData(element, elementInspector.getStructFieldRef("name"));
    assertTrue(nameObject instanceof Text);
    assertEquals("Mary", nameObject.toString());
    ageObject = elementInspector.getStructFieldData(element, elementInspector.getStructFieldRef("age"));
    assertTrue(ageObject instanceof IntWritable);
    assertEquals(28, ((IntWritable) ageObject).get());
}
 
Example 17
Source Project: pxf   Source File: HiveAccessor.java    License: Apache License 2.0
/**
 * @return ORC file reader
 */
protected Reader getOrcReader() {
    return HiveUtilities.getOrcReader(configuration, context);
}
 
Example 18
Source Project: pxf   Source File: HivePlugin.java    License: Apache License 2.0
/**
 * @return ORC file reader
 */
protected Reader getOrcReader() {
    return HiveUtilities.getOrcReader(configuration, context);
}
 
Example 19
Source Project: nifi   Source File: TestConvertAvroToORC.java    License: Apache License 2.0
@Test
public void test_onTrigger_nested_complex_record() throws Exception {

    Map<String, List<Double>> mapData1 = new TreeMap<String, List<Double>>() {{
        put("key1", Arrays.asList(1.0, 2.0));
        put("key2", Arrays.asList(3.0, 4.0));
    }};

    Map<String, String> arrayMap11 = new TreeMap<String, String>() {{
        put("key1", "v1");
        put("key2", "v2");
    }};
    Map<String, String> arrayMap12 = new TreeMap<String, String>() {{
        put("key3", "v3");
        put("key4", "v4");
    }};

    GenericData.Record record = TestNiFiOrcUtils.buildNestedComplexAvroRecord(mapData1, Arrays.asList(arrayMap11, arrayMap12));

    DatumWriter<GenericData.Record> writer = new GenericDatumWriter<>(record.getSchema());
    DataFileWriter<GenericData.Record> fileWriter = new DataFileWriter<>(writer);
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    fileWriter.create(record.getSchema(), out);
    fileWriter.append(record);

    // Put another record in
    Map<String, List<Double>> mapData2 = new TreeMap<String, List<Double>>() {{
        put("key1", Arrays.asList(-1.0, -2.0));
        put("key2", Arrays.asList(-3.0, -4.0));
    }};

    Map<String, String> arrayMap21 = new TreeMap<String, String>() {{
        put("key1", "v-1");
        put("key2", "v-2");
    }};
    Map<String, String> arrayMap22 = new TreeMap<String, String>() {{
        put("key3", "v-3");
        put("key4", "v-4");
    }};

    record = TestNiFiOrcUtils.buildNestedComplexAvroRecord(mapData2, Arrays.asList(arrayMap21, arrayMap22));
    fileWriter.append(record);

    fileWriter.flush();
    fileWriter.close();
    out.close();

    Map<String, String> attributes = new HashMap<String, String>() {{
        put(CoreAttributes.FILENAME.key(), "test");
    }};
    runner.enqueue(out.toByteArray(), attributes);
    runner.run();

    runner.assertAllFlowFilesTransferred(ConvertAvroToORC.REL_SUCCESS, 1);

    // Write the flow file out to disk, since the ORC Reader needs a path
    MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToORC.REL_SUCCESS).get(0);
    assertEquals("CREATE EXTERNAL TABLE IF NOT EXISTS nested_complex_record " +
            "(myMapOfArray MAP<STRING, ARRAY<DOUBLE>>, myArrayOfMap ARRAY<MAP<STRING, STRING>>)"
            + " STORED AS ORC", resultFlowFile.getAttribute(ConvertAvroToORC.HIVE_DDL_ATTRIBUTE));
    assertEquals("2", resultFlowFile.getAttribute(ConvertAvroToORC.RECORD_COUNT_ATTRIBUTE));
    assertEquals("test.orc", resultFlowFile.getAttribute(CoreAttributes.FILENAME.key()));
    byte[] resultContents = runner.getContentAsByteArray(resultFlowFile);
    FileOutputStream fos = new FileOutputStream("target/test1.orc");
    fos.write(resultContents);
    fos.flush();
    fos.close();

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    Reader reader = OrcFile.createReader(new Path("target/test1.orc"), OrcFile.readerOptions(conf).filesystem(fs));
    RecordReader rows = reader.rows();
    Object o = rows.next(null);
    assertNotNull(o);
    assertTrue(o instanceof OrcStruct);
    TypeInfo resultSchema = TestNiFiOrcUtils.buildNestedComplexOrcSchema();
    StructObjectInspector inspector = (StructObjectInspector) OrcStruct.createObjectInspector(resultSchema);


    // check values
    Object myMapOfArray = inspector.getStructFieldData(o, inspector.getStructFieldRef("myMapOfArray"));
    assertTrue(myMapOfArray instanceof Map);
    Map map = (Map) myMapOfArray;
    Object mapValue = map.get(new Text("key1"));
    assertNotNull(mapValue);
    assertTrue(mapValue instanceof List);
    assertEquals(Arrays.asList(new DoubleWritable(1.0), new DoubleWritable(2.0)), mapValue);

    Object myArrayOfMap = inspector.getStructFieldData(o, inspector.getStructFieldRef("myArrayOfMap"));
    assertTrue(myArrayOfMap instanceof List);
    List list = (List) myArrayOfMap;
    Object el0 = list.get(0);
    assertNotNull(el0);
    assertTrue(el0 instanceof Map);
    assertEquals(new Text("v1"), ((Map) el0).get(new Text("key1")));
}
 
Example 20
Source Project: pxf   Source File: HiveORCVectorizedAccessor.java    License: Apache License 2.0
/**
 * A file might have multiple splits, so this method restricts the
 * reader to a single split.
 * @param options reader options to modify
 */
private void addFragments(Reader.Options options) {
    FileSplit fileSplit = HdfsUtilities.parseFileSplit(context);
    options.range(fileSplit.getStart(), fileSplit.getLength());
}
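Here range(start, length) confines the reader to the stripes whose start offsets fall inside the split, so parallel accessors each read a disjoint share of the file. A short sketch with illustrative values, reusing the orcReader field opened in Example 4:

// Illustrative values only: read the stripes starting in the first 128 MB.
Reader.Options options = new Reader.Options().range(0L, 128L * 1024 * 1024);
RecordReader rows = orcReader.rowsOptions(options);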