org.apache.hadoop.hive.ql.io.orc.OrcSerde Java Examples

The following examples show how to use org.apache.hadoop.hive.ql.io.orc.OrcSerde. The examples are taken from open-source projects; each one lists the source file, project, and license it comes from.
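Most of the write-side examples below follow the same pattern: build a struct ObjectInspector that describes the row, reuse one mutable row object, serialize it with OrcSerde, and hand the resulting Writable to an ORC record writer. The following is a minimal, self-contained sketch of that pattern, assuming the Hive 1.x/2.x ORC classes are on the classpath; the output path and column name are illustrative only and are not taken from any of the projects below.

import java.util.Arrays;
import java.util.List;
import java.util.Properties;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
import org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcSerde;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;

public class OrcSerdeWriteSketch {
    public static void main(String[] args) throws Exception {
        // Describe a single BIGINT column named "id".
        List<ObjectInspector> fieldInspectors =
                Arrays.asList((ObjectInspector) PrimitiveObjectInspectorFactory.javaLongObjectInspector);
        StandardStructObjectInspector inspector = ObjectInspectorFactory
                .getStandardStructObjectInspector(Arrays.asList("id"), fieldInspectors);

        OrcSerde serde = new OrcSerde();

        // The ORC schema is taken from the ObjectInspector on the first write,
        // so an empty Properties object is enough for this sketch.
        FileSinkOperator.RecordWriter writer = new OrcOutputFormat().getHiveRecordWriter(
                new JobConf(), new Path("/tmp/orc-serde-sketch.orc"), Writable.class, false,
                new Properties(), Reporter.NULL);

        Object row = inspector.create();
        StructField field = inspector.getAllStructFieldRefs().get(0);
        for (long i = 0; i < 10; i++) {
            inspector.setStructFieldData(row, field, i);        // mutate the reusable row
            Writable record = serde.serialize(row, inspector);  // wrap it as an ORC-aware Writable
            writer.write(record);                               // append to the ORC file
        }
        writer.close(false);
    }
}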
Example #1
Source File: TestOrcReaderMemoryUsage.java    From presto with Apache License 2.0
/**
 * Write a file that contains a number of rows with 1 BIGINT column, and some rows have null values.
 */
private static TempFile createSingleColumnFileWithNullValues(int rows)
        throws IOException, SerDeException
{
    Serializer serde = new OrcSerde();
    TempFile tempFile = new TempFile();
    FileSinkOperator.RecordWriter writer = createOrcRecordWriter(tempFile.getFile(), ORC_12, CompressionKind.NONE, BIGINT);
    SettableStructObjectInspector objectInspector = createSettableStructObjectInspector("test", BIGINT);
    Object row = objectInspector.create();
    StructField field = objectInspector.getAllStructFieldRefs().get(0);

    for (int i = 0; i < rows; i++) {
        if (i % 10 == 0) {
            objectInspector.setStructFieldData(row, field, null);
        }
        else {
            objectInspector.setStructFieldData(row, field, (long) i);
        }

        Writable record = serde.serialize(row, objectInspector);
        writer.write(record);
    }

    writer.close(false);
    return tempFile;
}
 
Example #2
Source File: TestOrcReaderMemoryUsage.java    From presto with Apache License 2.0
/**
 * Write a file that contains a number of rows with 1 VARCHAR column, and all values are non-null.
 */
private static TempFile createSingleColumnVarcharFile(int count, int length)
        throws Exception
{
    Serializer serde = new OrcSerde();
    TempFile tempFile = new TempFile();
    FileSinkOperator.RecordWriter writer = createOrcRecordWriter(tempFile.getFile(), ORC_12, CompressionKind.NONE, VARCHAR);
    SettableStructObjectInspector objectInspector = createSettableStructObjectInspector("test", VARCHAR);
    Object row = objectInspector.create();
    StructField field = objectInspector.getAllStructFieldRefs().get(0);

    for (int i = 0; i < count; i++) {
        objectInspector.setStructFieldData(row, field, Strings.repeat("0", length));
        Writable record = serde.serialize(row, objectInspector);
        writer.write(record);
    }

    writer.close(false);
    return tempFile;
}
 
Example #3
Source File: OrcTester.java    From presto with Apache License 2.0
public static DataSize writeOrcFileColumnHive(File outputFile, RecordWriter recordWriter, Type type, Iterator<?> values)
        throws Exception
{
    SettableStructObjectInspector objectInspector = createSettableStructObjectInspector("test", type);
    Object row = objectInspector.create();

    List<StructField> fields = ImmutableList.copyOf(objectInspector.getAllStructFieldRefs());
    Serializer serializer = new OrcSerde();

    while (values.hasNext()) {
        Object value = values.next();
        value = preprocessWriteValueHive(type, value);
        objectInspector.setStructFieldData(row, fields.get(0), value);

        Writable record = serializer.serialize(row, objectInspector);
        recordWriter.write(record);
    }

    recordWriter.close(false);
    return succinctBytes(outputFile.length());
}
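A hypothetical invocation of this helper (the createOrcRecordWriter helper, ORC_12, and BIGINT follow the surrounding tests; the path and row count are illustrative only):

// Hypothetical usage sketch; assumes the same static helpers and constants as the tests above.
File outputFile = new File("/tmp/orc-tester-column.orc");
RecordWriter recordWriter = createOrcRecordWriter(outputFile, ORC_12, CompressionKind.NONE, BIGINT);
DataSize written = writeOrcFileColumnHive(
        outputFile, recordWriter, BIGINT, LongStream.range(0, 1_000).boxed().iterator());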
 
Example #4
Source File: TestOrcReaderPositions.java    From presto with Apache License 2.0
private static void createMultiStripeFile(File file)
        throws IOException, ReflectiveOperationException, SerDeException
{
    FileSinkOperator.RecordWriter writer = createOrcRecordWriter(file, ORC_12, CompressionKind.NONE, BIGINT);

    Serializer serde = new OrcSerde();
    SettableStructObjectInspector objectInspector = createSettableStructObjectInspector("test", BIGINT);
    Object row = objectInspector.create();
    StructField field = objectInspector.getAllStructFieldRefs().get(0);

    for (int i = 0; i < 300; i += 3) {
        if ((i > 0) && (i % 60 == 0)) {
            flushWriter(writer);
        }

        objectInspector.setStructFieldData(row, field, (long) i);
        Writable record = serde.serialize(row, objectInspector);
        writer.write(record);
    }

    writer.close(false);
}
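flushWriter (a test helper not shown here) forces a stripe boundary so the file ends up with multiple stripes. An alternative, assuming the writer honors the standard orc.stripe.size table property, is to ask for small stripes when the record writer is created; a hypothetical sketch:

// Hypothetical alternative: request small stripes up front instead of flushing manually.
Properties tableProperties = new Properties();
tableProperties.setProperty("orc.stripe.size", "65536"); // ~64 KB stripes, illustrative value
FileSinkOperator.RecordWriter writer = new OrcOutputFormat().getHiveRecordWriter(
        new JobConf(), new Path(file.toURI()), Writable.class, false, tableProperties, Reporter.NULL);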
 
Example #5
Source File: TestOrcReaderPositions.java    From presto with Apache License 2.0
private static void createSequentialFile(File file, int count)
        throws IOException, SerDeException
{
    FileSinkOperator.RecordWriter writer = createOrcRecordWriter(file, ORC_12, CompressionKind.NONE, BIGINT);

    Serializer serde = new OrcSerde();
    SettableStructObjectInspector objectInspector = createSettableStructObjectInspector("test", BIGINT);
    Object row = objectInspector.create();
    StructField field = objectInspector.getAllStructFieldRefs().get(0);

    for (int i = 0; i < count; i++) {
        objectInspector.setStructFieldData(row, field, (long) i);
        Writable record = serde.serialize(row, objectInspector);
        writer.write(record);
    }

    writer.close(false);
}
 
Example #6
Source File: HdfsHelper.java    From DataLink with Apache License 2.0
OrcWriterProxy(Configuration config, String fileName) throws IOException {
    // initialize the column configuration
    columns = config.getListConfiguration(Key.COLUMN);

    // initialize the struct object inspector from the column names and types
    List<String> columnNames = getColumnNames(columns);
    List<ObjectInspector> columnTypeInspectors = getColumnTypeInspectors(columns);
    inspector = (StructObjectInspector) ObjectInspectorFactory
            .getStandardStructObjectInspector(columnNames, columnTypeInspectors);

    // initialize the record writer, enabling output compression when configured
    String compress = config.getString(Key.COMPRESS, null);
    FileOutputFormat outFormat = new OrcOutputFormat();
    if (null != compress && !"NONE".equalsIgnoreCase(compress)) {
        Class<? extends CompressionCodec> codecClass = getCompressCodec(compress);
        if (null != codecClass) {
            outFormat.setOutputCompressorClass(conf, codecClass);
        }
    }
    writer = outFormat.getRecordWriter(fileSystem, conf, fileName, Reporter.NULL);

    // initialize the OrcSerde used to turn rows into ORC Writables
    orcSerde = new OrcSerde();
}
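The constructor only wires up the columns, inspector, record writer, and serde; the write path is not shown. A hypothetical companion method, assuming the fields initialized above and that each incoming row arrives as a List of field values matching the inspector:

// Hypothetical write method for the proxy above (field names assumed from the constructor).
public void write(List<Object> recordList) throws IOException {
    // A standard struct inspector over a list of column inspectors accepts a List of
    // field values, so the row can be handed to OrcSerde directly.
    Writable orcRecord = orcSerde.serialize(recordList, inspector);
    writer.write(NullWritable.get(), orcRecord);
}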
 
Example #7
Source File: HiveOrcSerDeManagerTest.java    From incubator-gobblin with Apache License 2.0
/**
 * Test custom serde config
 */
@Test
public void testCustomSerdeConfig() throws IOException {
  State state = new State();
  state.setProp(HiveOrcSerDeManager.SERDE_TYPE_KEY, OrcSerde.class.getName());
  state.setProp(HiveOrcSerDeManager.INPUT_FORMAT_CLASS_KEY, "customInputFormat");
  state.setProp(HiveOrcSerDeManager.OUTPUT_FORMAT_CLASS_KEY, "customOutputFormat");

  HiveOrcSerDeManager manager = new HiveOrcSerDeManager(state);
  HiveRegistrationUnit registrationUnit = (new HiveTable.Builder()).withDbName(TEST_DB).withTableName(TEST_TABLE).build();

  manager.addSerDeProperties(this.testRegisterPath, registrationUnit);

  examineSchema(registrationUnit);
  Assert.assertEquals(registrationUnit.getSerDeType().get(), OrcSerde.class.getName());
  Assert.assertEquals(registrationUnit.getInputFormat().get(), "customInputFormat");
  Assert.assertEquals(registrationUnit.getOutputFormat().get(), "customOutputFormat");
}
 
Example #8
Source File: TestOrcReaderMemoryUsage.java    From presto with Apache License 2.0
/**
 * Write a file that contains a given number of maps where each row has 10 entries in total
 * and some entries have null keys/values.
 */
private static TempFile createSingleColumnMapFileWithNullValues(Type mapType, int rows)
        throws IOException, SerDeException
{
    Serializer serde = new OrcSerde();
    TempFile tempFile = new TempFile();
    FileSinkOperator.RecordWriter writer = createOrcRecordWriter(tempFile.getFile(), ORC_12, CompressionKind.NONE, mapType);
    SettableStructObjectInspector objectInspector = createSettableStructObjectInspector("test", mapType);
    Object row = objectInspector.create();
    StructField field = objectInspector.getAllStructFieldRefs().get(0);

    for (int i = 1; i <= rows; i++) {
        HashMap<Long, Long> map = new HashMap<>();

        for (int j = 1; j <= 8; j++) {
            Long value = (long) j;
            map.put(value, value);
        }

        // Add null values so that the StreamReader nullVectors are not empty.
        map.put(null, 0L);
        map.put(0L, null);

        objectInspector.setStructFieldData(row, field, map);
        Writable record = serde.serialize(row, objectInspector);
        writer.write(record);
    }
    writer.close(false);
    return tempFile;
}
 
Example #9
Source File: TestOrcReaderPositions.java    From presto with Apache License 2.0
private static void createGrowingSequentialFile(File file, int count, int step, int initialLength)
        throws IOException, SerDeException
{
    FileSinkOperator.RecordWriter writer = createOrcRecordWriter(file, ORC_12, CompressionKind.NONE, VARCHAR);

    Serializer serde = new OrcSerde();
    SettableStructObjectInspector objectInspector = createSettableStructObjectInspector("test", VARCHAR);
    Object row = objectInspector.create();
    StructField field = objectInspector.getAllStructFieldRefs().get(0);

    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < initialLength; i++) {
        builder.append("0");
    }
    String seedString = builder.toString();

    // gradually grow the length of a cell
    int previousLength = initialLength;
    for (int i = 0; i < count; i++) {
        if ((i / step + 1) * initialLength > previousLength) {
            previousLength = (i / step + 1) * initialLength;
            builder.append(seedString);
        }
        objectInspector.setStructFieldData(row, field, builder.toString());
        Writable record = serde.serialize(row, objectInspector);
        writer.write(record);
    }

    writer.close(false);
}
 
Example #10
Source File: HdfsHelper.java    From DataLink with Apache License 2.0
public OrcSerde getOrcSerde(Configuration config){
    String fieldDelimiter = config.getString(Key.FIELD_DELIMITER);
    String compress = config.getString(Key.COMPRESS);
    String encoding = config.getString(Key.ENCODING);

    OrcSerde orcSerde = new OrcSerde();
    Properties properties = new Properties();
    properties.setProperty("orc.bloom.filter.columns", fieldDelimiter);
    properties.setProperty("orc.compress", compress);
    properties.setProperty("orc.encoding.strategy", encoding);

    orcSerde.initialize(conf, properties);
    return orcSerde;
}
 
Example #11
Source File: HiveCatalogUtil.java    From tajo with Apache License 2.0
public static String getDataFormat(StorageDescriptor descriptor) {
  Preconditions.checkNotNull(descriptor);

  String serde = descriptor.getSerdeInfo().getSerializationLib();
  String inputFormat = descriptor.getInputFormat();

  if (LazySimpleSerDe.class.getName().equals(serde)) {
    if (TextInputFormat.class.getName().equals(inputFormat)) {
      return BuiltinStorages.TEXT;
    } else if (SequenceFileInputFormat.class.getName().equals(inputFormat)) {
      return BuiltinStorages.SEQUENCE_FILE;
    } else {
      throw new TajoRuntimeException(new UnknownDataFormatException(inputFormat));
    }
  } else if (LazyBinarySerDe.class.getName().equals(serde)) {
    if (SequenceFileInputFormat.class.getName().equals(inputFormat)) {
      return BuiltinStorages.SEQUENCE_FILE;
    } else {
      throw new TajoRuntimeException(new UnknownDataFormatException(inputFormat));
    }
  } else if (LazyBinaryColumnarSerDe.class.getName().equals(serde) || ColumnarSerDe.class.getName().equals(serde)) {
    if (RCFileInputFormat.class.getName().equals(inputFormat)) {
      return BuiltinStorages.RCFILE;
    } else {
      throw new TajoRuntimeException(new UnknownDataFormatException(inputFormat));
    }
  } else if (ParquetHiveSerDe.class.getName().equals(serde)) {
    return BuiltinStorages.PARQUET;
  } else if (AvroSerDe.class.getName().equals(serde)) {
    return BuiltinStorages.AVRO;
  } else if (OrcSerde.class.getName().equals(serde)) {
    return BuiltinStorages.ORC;
  } else if (RegexSerDe.class.getName().equals(serde)) {
    return BuiltinStorages.REGEX;
  } else {
    throw new TajoRuntimeException(new UnknownDataFormatException(inputFormat));
  }
}
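A hedged usage sketch, building a StorageDescriptor by hand with the Hive metastore Thrift setters (for an ORC table the serialization library alone is enough to resolve the format):

// Illustrative call: a descriptor whose serde is OrcSerde maps to BuiltinStorages.ORC.
SerDeInfo serdeInfo = new SerDeInfo();
serdeInfo.setSerializationLib(OrcSerde.class.getName());

StorageDescriptor descriptor = new StorageDescriptor();
descriptor.setSerdeInfo(serdeInfo);
descriptor.setInputFormat(OrcInputFormat.class.getName());

String dataFormat = HiveCatalogUtil.getDataFormat(descriptor); // returns BuiltinStorages.ORC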
 
Example #12
Source File: HiveDialectITCase.java    From flink with Apache License 2.0
@Test
public void testCreateTable() throws Exception {
	String location = warehouse + "/external_location";
	tableEnv.executeSql(String.format(
			"create external table tbl1 (d decimal(10,0),ts timestamp) partitioned by (p string) location '%s' tblproperties('k1'='v1')", location));
	Table hiveTable = hiveCatalog.getHiveTable(new ObjectPath("default", "tbl1"));
	assertEquals(TableType.EXTERNAL_TABLE.toString(), hiveTable.getTableType());
	assertEquals(1, hiveTable.getPartitionKeysSize());
	assertEquals(location, locationPath(hiveTable.getSd().getLocation()));
	assertEquals("v1", hiveTable.getParameters().get("k1"));
	assertFalse(hiveTable.getParameters().containsKey(SqlCreateHiveTable.TABLE_LOCATION_URI));

	tableEnv.executeSql("create table tbl2 (s struct<ts:timestamp,bin:binary>) stored as orc");
	hiveTable = hiveCatalog.getHiveTable(new ObjectPath("default", "tbl2"));
	assertEquals(TableType.MANAGED_TABLE.toString(), hiveTable.getTableType());
	assertEquals(OrcSerde.class.getName(), hiveTable.getSd().getSerdeInfo().getSerializationLib());
	assertEquals(OrcInputFormat.class.getName(), hiveTable.getSd().getInputFormat());
	assertEquals(OrcOutputFormat.class.getName(), hiveTable.getSd().getOutputFormat());

	tableEnv.executeSql("create table tbl3 (m map<timestamp,binary>) partitioned by (p1 bigint,p2 tinyint) " +
			"row format serde 'org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe'");
	hiveTable = hiveCatalog.getHiveTable(new ObjectPath("default", "tbl3"));
	assertEquals(2, hiveTable.getPartitionKeysSize());
	assertEquals(LazyBinarySerDe.class.getName(), hiveTable.getSd().getSerdeInfo().getSerializationLib());

	tableEnv.executeSql("create table tbl4 (x int,y smallint) row format delimited fields terminated by '|' lines terminated by '\n'");
	hiveTable = hiveCatalog.getHiveTable(new ObjectPath("default", "tbl4"));
	assertEquals("|", hiveTable.getSd().getSerdeInfo().getParameters().get(serdeConstants.FIELD_DELIM));
	assertEquals("|", hiveTable.getSd().getSerdeInfo().getParameters().get(serdeConstants.SERIALIZATION_FORMAT));
	assertEquals("\n", hiveTable.getSd().getSerdeInfo().getParameters().get(serdeConstants.LINE_DELIM));

	tableEnv.executeSql("create table tbl5 (m map<bigint,string>) row format delimited collection items terminated by ';' " +
			"map keys terminated by ':'");
	hiveTable = hiveCatalog.getHiveTable(new ObjectPath("default", "tbl5"));
	assertEquals(";", hiveTable.getSd().getSerdeInfo().getParameters().get(serdeConstants.COLLECTION_DELIM));
	assertEquals(":", hiveTable.getSd().getSerdeInfo().getParameters().get(serdeConstants.MAPKEY_DELIM));
}
 
Example #13
Source File: OrcFileWriter.java    From presto with Apache License 2.0
private static OrcSerde createSerializer(Properties properties)
{
    OrcSerde serde = new OrcSerde();
    serde.initialize(CONFIGURATION, properties);
    return serde;
}
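The Properties argument is not shown in this snippet; OrcSerde.initialize builds its ObjectInspector from the standard Hive table properties. A hedged sketch of what the caller typically supplies (the column names and types are examples only; the colon-separated type list matches how the metastore stores columns.types):

// Illustrative table properties for a two-column table (values are examples only).
Properties properties = new Properties();
properties.setProperty("columns", "id,name");              // serdeConstants.LIST_COLUMNS
properties.setProperty("columns.types", "bigint:string");  // serdeConstants.LIST_COLUMN_TYPES
OrcSerde serde = createSerializer(properties);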
 
Example #14
Source File: OrcPageSourceFactory.java    From presto with Apache License 2.0
@Override
public Optional<ReaderPageSourceWithProjections> createPageSource(
        Configuration configuration,
        ConnectorSession session,
        Path path,
        long start,
        long length,
        long fileSize,
        Properties schema,
        List<HiveColumnHandle> columns,
        TupleDomain<HiveColumnHandle> effectivePredicate,
        DateTimeZone hiveStorageTimeZone,
        Optional<AcidInfo> acidInfo)
{
    if (!isDeserializerClass(schema, OrcSerde.class)) {
        return Optional.empty();
    }

    // per HIVE-13040 and ORC-162, empty files are allowed
    if (fileSize == 0) {
        ReaderPageSourceWithProjections context = noProjectionAdaptation(new FixedPageSource(ImmutableList.of()));
        return Optional.of(context);
    }

    Optional<ReaderProjections> projectedReaderColumns = projectBaseColumns(columns);

    ConnectorPageSource orcPageSource = createOrcPageSource(
            hdfsEnvironment,
            session.getUser(),
            configuration,
            path,
            start,
            length,
            fileSize,
            projectedReaderColumns
                    .map(ReaderProjections::getReaderColumns)
                    .orElse(columns),
            columns,
            isUseOrcColumnNames(session),
            isFullAcidTable(Maps.fromProperties(schema)),
            effectivePredicate,
            hiveStorageTimeZone,
            orcReaderOptions
                    .withMaxMergeDistance(getOrcMaxMergeDistance(session))
                    .withMaxBufferSize(getOrcMaxBufferSize(session))
                    .withStreamBufferSize(getOrcStreamBufferSize(session))
                    .withTinyStripeThreshold(getOrcTinyStripeThreshold(session))
                    .withMaxReadBlockSize(getOrcMaxReadBlockSize(session))
                    .withLazyReadSmallRanges(getOrcLazyReadSmallRanges(session))
                    .withNestedLazy(isOrcNestedLazy(session))
                    .withBloomFiltersEnabled(isOrcBloomFiltersEnabled(session)),
            acidInfo,
            stats);

    return Optional.of(new ReaderPageSourceWithProjections(orcPageSource, projectedReaderColumns));
}
 
Example #15
Source File: TestOrcPageSourceMemoryTracking.java    From presto with Apache License 2.0
public TestPreparer(String tempFilePath, List<TestColumn> testColumns, int numRows, int stripeRows)
        throws Exception
{
    OrcSerde serde = new OrcSerde();
    schema = new Properties();
    schema.setProperty("columns",
            testColumns.stream()
                    .map(TestColumn::getName)
                    .collect(Collectors.joining(",")));
    schema.setProperty("columns.types",
            testColumns.stream()
                    .map(TestColumn::getType)
                    .collect(Collectors.joining(",")));
    schema.setProperty(FILE_INPUT_FORMAT, OrcInputFormat.class.getName());
    schema.setProperty(SERIALIZATION_LIB, serde.getClass().getName());

    partitionKeys = testColumns.stream()
            .filter(TestColumn::isPartitionKey)
            .map(input -> new HivePartitionKey(input.getName(), (String) input.getWriteValue()))
            .collect(toList());

    partitonName = String.join("/", partitionKeys.stream()
            .map(partitionKey -> format("%s=%s", partitionKey.getName(), partitionKey.getValue()))
            .collect(toImmutableList()));

    ImmutableList.Builder<HiveColumnHandle> columnsBuilder = ImmutableList.builder();
    ImmutableList.Builder<Type> typesBuilder = ImmutableList.builder();
    int nextHiveColumnIndex = 0;
    for (int i = 0; i < testColumns.size(); i++) {
        TestColumn testColumn = testColumns.get(i);
        int columnIndex = testColumn.isPartitionKey() ? -1 : nextHiveColumnIndex++;

        ObjectInspector inspector = testColumn.getObjectInspector();
        HiveType hiveType = HiveType.valueOf(inspector.getTypeName());
        Type type = hiveType.getType(TYPE_MANAGER);

        columnsBuilder.add(createBaseColumn(testColumn.getName(), columnIndex, hiveType, type, testColumn.isPartitionKey() ? PARTITION_KEY : REGULAR, Optional.empty()));
        typesBuilder.add(type);
    }
    columns = columnsBuilder.build();
    types = typesBuilder.build();

    fileSplit = createTestFile(tempFilePath, serde, null, testColumns, numRows, stripeRows);
}
 
Example #16
Source File: HiveMetaStoreUtilsTest.java    From incubator-gobblin with Apache License 2.0
@Test
public void testGetTableOrc() {
  final String databaseName = "db";
  final String tableName = "tbl";
  HiveTable.Builder builder = new HiveTable.Builder();
  builder.withDbName(databaseName).withTableName(tableName);

  HiveTable hiveTable = builder.build();

  // SerDe props
  State serdeProps = new State();
  serdeProps.setProp("columns", "timestamp,namespace,name,metadata");
  serdeProps.setProp("columns.types", "bigint,string,string,map<string,string>");

  hiveTable.getProps().addAll(serdeProps);

  hiveTable.setInputFormat(OrcInputFormat.class.getName());
  hiveTable.setOutputFormat(OrcOutputFormat.class.getName());
  hiveTable.setSerDeType(OrcSerde.class.getName());

  Table table = HiveMetaStoreUtils.getTable(hiveTable);
  Assert.assertEquals(table.getDbName(), databaseName);
  Assert.assertEquals(table.getTableName(), tableName);

  StorageDescriptor sd = table.getSd();
  Assert.assertEquals(sd.getInputFormat(), OrcInputFormat.class.getName());
  Assert.assertEquals(sd.getOutputFormat(), OrcOutputFormat.class.getName());
  Assert.assertNotNull(sd.getSerdeInfo());
  Assert.assertEquals(sd.getSerdeInfo().getSerializationLib(), OrcSerde.class.getName());

  // verify column name
  List<FieldSchema> fields = sd.getCols();
  Assert.assertTrue(fields != null && fields.size() == 4);
  FieldSchema fieldA = fields.get(0);
  Assert.assertEquals(fieldA.getName(), "timestamp");
  Assert.assertEquals(fieldA.getType(), "bigint");

  FieldSchema fieldB = fields.get(1);
  Assert.assertEquals(fieldB.getName(), "namespace");
  Assert.assertEquals(fieldB.getType(), "string");

  FieldSchema fieldC = fields.get(2);
  Assert.assertEquals(fieldC.getName(), "name");
  Assert.assertEquals(fieldC.getType(), "string");


  FieldSchema fieldD = fields.get(3);
  Assert.assertEquals(fieldD.getName(), "metadata");
  Assert.assertEquals(fieldD.getType(), "map<string,string>");
}