Java Code Examples for org.apache.hadoop.hive.serde2.Serializer

The following examples show how to use org.apache.hadoop.hive.serde2.Serializer. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: presto   Source File: TestOrcReaderMemoryUsage.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Write a file that contains a number of rows with 1 BIGINT column, and some rows have null values.
 */
private static TempFile createSingleColumnFileWithNullValues(int rows)
        throws IOException, SerDeException
{
    Serializer serde = new OrcSerde();
    TempFile tempFile = new TempFile();
    FileSinkOperator.RecordWriter writer = createOrcRecordWriter(tempFile.getFile(), ORC_12, CompressionKind.NONE, BIGINT);
    SettableStructObjectInspector objectInspector = createSettableStructObjectInspector("test", BIGINT);
    Object row = objectInspector.create();
    StructField field = objectInspector.getAllStructFieldRefs().get(0);

    for (int i = 0; i < rows; i++) {
        if (i % 10 == 0) {
            objectInspector.setStructFieldData(row, field, null);
        }
        else {
            objectInspector.setStructFieldData(row, field, (long) i);
        }

        Writable record = serde.serialize(row, objectInspector);
        writer.write(record);
    }

    writer.close(false);
    return tempFile;
}
 
Example 2
Source Project: presto   Source File: TestOrcReaderMemoryUsage.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Write a file that contains a number of rows with 1 VARCHAR column, and all values are not null.
 */
private static TempFile createSingleColumnVarcharFile(int count, int length)
        throws Exception
{
    Serializer serde = new OrcSerde();
    TempFile tempFile = new TempFile();
    FileSinkOperator.RecordWriter writer = createOrcRecordWriter(tempFile.getFile(), ORC_12, CompressionKind.NONE, VARCHAR);
    SettableStructObjectInspector objectInspector = createSettableStructObjectInspector("test", VARCHAR);
    Object row = objectInspector.create();
    StructField field = objectInspector.getAllStructFieldRefs().get(0);

    for (int i = 0; i < count; i++) {
        objectInspector.setStructFieldData(row, field, Strings.repeat("0", length));
        Writable record = serde.serialize(row, objectInspector);
        writer.write(record);
    }

    writer.close(false);
    return tempFile;
}
 
Example 3
Source Project: presto   Source File: OrcTester.java    License: Apache License 2.0 6 votes vote down vote up
public static DataSize writeOrcFileColumnHive(File outputFile, RecordWriter recordWriter, Type type, Iterator<?> values)
        throws Exception
{
    SettableStructObjectInspector objectInspector = createSettableStructObjectInspector("test", type);
    Object row = objectInspector.create();

    List<StructField> fields = ImmutableList.copyOf(objectInspector.getAllStructFieldRefs());
    Serializer serializer = new OrcSerde();

    while (values.hasNext()) {
        Object value = values.next();
        value = preprocessWriteValueHive(type, value);
        objectInspector.setStructFieldData(row, fields.get(0), value);

        Writable record = serializer.serialize(row, objectInspector);
        recordWriter.write(record);
    }

    recordWriter.close(false);
    return succinctBytes(outputFile.length());
}
 
Example 4
Source Project: presto   Source File: TestOrcReaderPositions.java    License: Apache License 2.0 6 votes vote down vote up
private static void createMultiStripeFile(File file)
        throws IOException, ReflectiveOperationException, SerDeException
{
    FileSinkOperator.RecordWriter writer = createOrcRecordWriter(file, ORC_12, CompressionKind.NONE, BIGINT);

    Serializer serde = new OrcSerde();
    SettableStructObjectInspector objectInspector = createSettableStructObjectInspector("test", BIGINT);
    Object row = objectInspector.create();
    StructField field = objectInspector.getAllStructFieldRefs().get(0);

    for (int i = 0; i < 300; i += 3) {
        if ((i > 0) && (i % 60 == 0)) {
            flushWriter(writer);
        }

        objectInspector.setStructFieldData(row, field, (long) i);
        Writable record = serde.serialize(row, objectInspector);
        writer.write(record);
    }

    writer.close(false);
}
 
Example 5
Source Project: presto   Source File: TestOrcReaderPositions.java    License: Apache License 2.0 6 votes vote down vote up
private static void createSequentialFile(File file, int count)
        throws IOException, SerDeException
{
    FileSinkOperator.RecordWriter writer = createOrcRecordWriter(file, ORC_12, CompressionKind.NONE, BIGINT);

    Serializer serde = new OrcSerde();
    SettableStructObjectInspector objectInspector = createSettableStructObjectInspector("test", BIGINT);
    Object row = objectInspector.create();
    StructField field = objectInspector.getAllStructFieldRefs().get(0);

    for (int i = 0; i < count; i++) {
        objectInspector.setStructFieldData(row, field, (long) i);
        Writable record = serde.serialize(row, objectInspector);
        writer.write(record);
    }

    writer.close(false);
}
 
Example 6
private static void createMultiStripeFile(File file)
        throws IOException, ReflectiveOperationException, SerDeException
{
    FileSinkOperator.RecordWriter writer = createOrcRecordWriter(file, ORC_12, OrcTester.Compression.NONE, javaLongObjectInspector);

    @SuppressWarnings("deprecation") Serializer serde = new OrcSerde();
    SettableStructObjectInspector objectInspector = createSettableStructObjectInspector("test", javaLongObjectInspector);
    Object row = objectInspector.create();
    StructField field = objectInspector.getAllStructFieldRefs().get(0);

    for (int i = 0; i < 300; i += 3) {
        if ((i > 0) && (i % 60 == 0)) {
            flushWriter(writer);
        }

        objectInspector.setStructFieldData(row, field, (long) i);
        Writable record = serde.serialize(row, objectInspector);
        writer.write(record);
    }

    writer.close(false);
}
 
Example 7
private static void createSequentialFile(File file, int count)
        throws IOException, ReflectiveOperationException, SerDeException
{
    FileSinkOperator.RecordWriter writer = createOrcRecordWriter(file, ORC_12, OrcTester.Compression.NONE, javaLongObjectInspector);

    @SuppressWarnings("deprecation") Serializer serde = new OrcSerde();
    SettableStructObjectInspector objectInspector = createSettableStructObjectInspector("test", javaLongObjectInspector);
    Object row = objectInspector.create();
    StructField field = objectInspector.getAllStructFieldRefs().get(0);

    for (int i = 0; i < count; i++) {
        objectInspector.setStructFieldData(row, field, (long) i);
        Writable record = serde.serialize(row, objectInspector);
        writer.write(record);
    }

    writer.close(false);
}
 
Example 8
Source Project: presto   Source File: TestOrcReaderMemoryUsage.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Write a file that contains a given number of maps where each row has 10 entries in total
 * and some entries have null keys/values.
 */
private static TempFile createSingleColumnMapFileWithNullValues(Type mapType, int rows)
        throws IOException, SerDeException
{
    Serializer serde = new OrcSerde();
    TempFile tempFile = new TempFile();
    FileSinkOperator.RecordWriter writer = createOrcRecordWriter(tempFile.getFile(), ORC_12, CompressionKind.NONE, mapType);
    SettableStructObjectInspector objectInspector = createSettableStructObjectInspector("test", mapType);
    Object row = objectInspector.create();
    StructField field = objectInspector.getAllStructFieldRefs().get(0);

    for (int i = 1; i <= rows; i++) {
        HashMap<Long, Long> map = new HashMap<>();

        for (int j = 1; j <= 8; j++) {
            Long value = (long) j;
            map.put(value, value);
        }

        // Add null values so that the StreamReader nullVectors are not empty.
        map.put(null, 0L);
        map.put(0L, null);

        objectInspector.setStructFieldData(row, field, map);
        Writable record = serde.serialize(row, objectInspector);
        writer.write(record);
    }
    writer.close(false);
    return tempFile;
}
 
Example 9
Source Project: presto   Source File: TestOrcReaderPositions.java    License: Apache License 2.0 5 votes vote down vote up
private static void createGrowingSequentialFile(File file, int count, int step, int initialLength)
        throws IOException, SerDeException
{
    FileSinkOperator.RecordWriter writer = createOrcRecordWriter(file, ORC_12, CompressionKind.NONE, VARCHAR);

    Serializer serde = new OrcSerde();
    SettableStructObjectInspector objectInspector = createSettableStructObjectInspector("test", VARCHAR);
    Object row = objectInspector.create();
    StructField field = objectInspector.getAllStructFieldRefs().get(0);

    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < initialLength; i++) {
        builder.append("0");
    }
    String seedString = builder.toString();

    // gradually grow the length of a cell
    int previousLength = initialLength;
    for (int i = 0; i < count; i++) {
        if ((i / step + 1) * initialLength > previousLength) {
            previousLength = (i / step + 1) * initialLength;
            builder.append(seedString);
        }
        objectInspector.setStructFieldData(row, field, builder.toString());
        Writable record = serde.serialize(row, objectInspector);
        writer.write(record);
    }

    writer.close(false);
}
 
Example 10
Source Project: presto   Source File: RcFileTester.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public Serializer createSerializer()
{
    try {
        ColumnarSerDe columnarSerDe = new ColumnarSerDe();
        Properties tableProperties = new Properties();
        tableProperties.setProperty("columns", "test");
        tableProperties.setProperty("columns.types", "string");
        columnarSerDe.initialize(new JobConf(false), tableProperties);
        return columnarSerDe;
    }
    catch (SerDeException e) {
        throw new RuntimeException(e);
    }
}
 
Example 11
Source Project: presto   Source File: RcFileTester.java    License: Apache License 2.0 5 votes vote down vote up
private static DataSize writeRcFileColumnOld(File outputFile, Format format, Compression compression, Type type, Iterator<?> values)
        throws Exception
{
    ObjectInspector columnObjectInspector = getJavaObjectInspector(type);
    RecordWriter recordWriter = createRcFileWriterOld(outputFile, compression, columnObjectInspector);

    SettableStructObjectInspector objectInspector = createSettableStructObjectInspector("test", columnObjectInspector);
    Object row = objectInspector.create();

    List<StructField> fields = ImmutableList.copyOf(objectInspector.getAllStructFieldRefs());
    Serializer serializer = format.createSerializer();

    Properties tableProperties = new Properties();
    tableProperties.setProperty("columns", "test");
    tableProperties.setProperty("columns.types", objectInspector.getTypeName());
    serializer.initialize(new JobConf(false), tableProperties);

    while (values.hasNext()) {
        Object value = values.next();
        value = preprocessWriteValueOld(type, value);
        objectInspector.setStructFieldData(row, fields.get(0), value);

        Writable record = serializer.serialize(row, objectInspector);
        recordWriter.write(record);
    }

    recordWriter.close(false);
    return DataSize.ofBytes(outputFile.length()).succinct();
}
 
Example 12
Source Project: flink   Source File: HiveWriterFactory.java    License: Apache License 2.0 5 votes vote down vote up
private void checkInitialize() throws Exception {
	if (initialized) {
		return;
	}

	JobConf jobConf = confWrapper.conf();
	Object serdeLib = Class.forName(serDeInfo.getSerializationLib()).newInstance();
	Preconditions.checkArgument(serdeLib instanceof Serializer && serdeLib instanceof Deserializer,
			"Expect a SerDe lib implementing both Serializer and Deserializer, but actually got "
					+ serdeLib.getClass().getName());
	this.recordSerDe = (Serializer) serdeLib;
	ReflectionUtils.setConf(recordSerDe, jobConf);

	// TODO: support partition properties, for now assume they're same as table properties
	SerDeUtils.initializeSerDe((Deserializer) recordSerDe, jobConf, tableProperties, null);

	this.formatFields = allColumns.length - partitionColumns.length;
	this.hiveConversions = new HiveObjectConversion[formatFields];
	this.converters = new DataFormatConverter[formatFields];
	List<ObjectInspector> objectInspectors = new ArrayList<>(hiveConversions.length);
	for (int i = 0; i < formatFields; i++) {
		DataType type = allTypes[i];
		ObjectInspector objectInspector = HiveInspectors.getObjectInspector(type);
		objectInspectors.add(objectInspector);
		hiveConversions[i] = HiveInspectors.getConversion(
				objectInspector, type.getLogicalType(), hiveShim);
		converters[i] = DataFormatConverters.getConverterForDataType(type);
	}

	this.formatInspector = ObjectInspectorFactory.getStandardStructObjectInspector(
			Arrays.asList(allColumns).subList(0, formatFields),
			objectInspectors);
	this.initialized = true;
}
 
Example 13
Source Project: presto   Source File: RcFileTester.java    License: Apache License 2.0 4 votes vote down vote up
@Override
public Serializer createSerializer()
{
    return new LazyBinaryColumnarSerDe();
}
 
Example 14
Source Project: presto   Source File: TestOrcPageSourceMemoryTracking.java    License: Apache License 2.0 4 votes vote down vote up
public static FileSplit createTestFile(
        String filePath,
        Serializer serializer,
        String compressionCodec,
        List<TestColumn> testColumns,
        int numRows,
        int stripeRows)
        throws Exception
{
    // filter out partition keys, which are not written to the file
    testColumns = testColumns.stream()
            .filter(column -> !column.isPartitionKey())
            .collect(toImmutableList());

    Properties tableProperties = new Properties();
    tableProperties.setProperty(
            "columns",
            testColumns.stream()
                    .map(TestColumn::getName)
                    .collect(Collectors.joining(",")));

    tableProperties.setProperty(
            "columns.types",
            testColumns.stream()
                    .map(TestColumn::getType)
                    .collect(Collectors.joining(",")));

    serializer.initialize(CONFIGURATION, tableProperties);

    JobConf jobConf = new JobConf();
    if (compressionCodec != null) {
        CompressionCodec codec = new CompressionCodecFactory(CONFIGURATION).getCodecByName(compressionCodec);
        jobConf.set(COMPRESS_CODEC, codec.getClass().getName());
        jobConf.set(COMPRESS_TYPE, SequenceFile.CompressionType.BLOCK.toString());
    }

    RecordWriter recordWriter = createRecordWriter(new Path(filePath), CONFIGURATION);

    try {
        SettableStructObjectInspector objectInspector = getStandardStructObjectInspector(
                testColumns.stream()
                        .map(TestColumn::getName)
                        .collect(toImmutableList()),
                testColumns.stream()
                        .map(TestColumn::getObjectInspector)
                        .collect(toImmutableList()));

        Object row = objectInspector.create();

        List<StructField> fields = ImmutableList.copyOf(objectInspector.getAllStructFieldRefs());

        for (int rowNumber = 0; rowNumber < numRows; rowNumber++) {
            for (int i = 0; i < testColumns.size(); i++) {
                Object writeValue = testColumns.get(i).getWriteValue();
                if (writeValue instanceof Slice) {
                    writeValue = ((Slice) writeValue).getBytes();
                }
                objectInspector.setStructFieldData(row, fields.get(i), writeValue);
            }

            Writable record = serializer.serialize(row, objectInspector);
            recordWriter.write(record);
            if (rowNumber % stripeRows == stripeRows - 1) {
                flushStripe(recordWriter);
            }
        }
    }
    finally {
        recordWriter.close(false);
    }

    Path path = new Path(filePath);
    path.getFileSystem(CONFIGURATION).setVerifyChecksum(true);
    File file = new File(filePath);
    return new FileSplit(path, 0, file.length(), new String[0]);
}
 
Example 15
Source Project: presto   Source File: AbstractTestHiveFileFormats.java    License: Apache License 2.0 4 votes vote down vote up
public static FileSplit createTestFile(
        String filePath,
        HiveStorageFormat storageFormat,
        HiveCompressionCodec compressionCodec,
        List<TestColumn> testColumns,
        int numRows)
        throws Exception
{
    HiveOutputFormat<?, ?> outputFormat = newInstance(storageFormat.getOutputFormat(), HiveOutputFormat.class);
    Serializer serializer = newInstance(storageFormat.getSerDe(), Serializer.class);

    // filter out partition keys, which are not written to the file
    testColumns = testColumns.stream()
            .filter(column -> !column.isPartitionKey())
            .collect(toImmutableList());

    Properties tableProperties = new Properties();
    tableProperties.setProperty(
            "columns",
            testColumns.stream()
                    .map(TestColumn::getName)
                    .collect(Collectors.joining(",")));
    tableProperties.setProperty(
            "columns.types",
            testColumns.stream()
                    .map(TestColumn::getType)
                    .collect(Collectors.joining(",")));
    serializer.initialize(new Configuration(false), tableProperties);

    JobConf jobConf = new JobConf();
    configureCompression(jobConf, compressionCodec);

    RecordWriter recordWriter = outputFormat.getHiveRecordWriter(
            jobConf,
            new Path(filePath),
            Text.class,
            compressionCodec != HiveCompressionCodec.NONE,
            tableProperties,
            () -> {});

    try {
        serializer.initialize(new Configuration(false), tableProperties);

        SettableStructObjectInspector objectInspector = getStandardStructObjectInspector(
                testColumns.stream()
                        .map(TestColumn::getName)
                        .collect(toImmutableList()),
                testColumns.stream()
                        .map(TestColumn::getObjectInspector)
                        .collect(toImmutableList()));

        Object row = objectInspector.create();

        List<StructField> fields = ImmutableList.copyOf(objectInspector.getAllStructFieldRefs());

        for (int rowNumber = 0; rowNumber < numRows; rowNumber++) {
            for (int i = 0; i < testColumns.size(); i++) {
                Object writeValue = testColumns.get(i).getWriteValue();
                if (writeValue instanceof Slice) {
                    writeValue = ((Slice) writeValue).getBytes();
                }
                objectInspector.setStructFieldData(row, fields.get(i), writeValue);
            }

            Writable record = serializer.serialize(row, objectInspector);
            recordWriter.write(record);
        }
    }
    finally {
        recordWriter.close(false);
    }

    // todo to test with compression, the file must be renamed with the compression extension
    Path path = new Path(filePath);
    path.getFileSystem(new Configuration(false)).setVerifyChecksum(true);
    File file = new File(filePath);
    return new FileSplit(path, 0, file.length(), new String[0]);
}
 
Example 16
Source Project: flink   Source File: HiveTableOutputFormat.java    License: Apache License 2.0 4 votes vote down vote up
@Override
public void open(int taskNumber, int numTasks) throws IOException {
	try {
		StorageDescriptor sd = hiveTablePartition.getStorageDescriptor();
		Object serdeLib = Class.forName(sd.getSerdeInfo().getSerializationLib()).newInstance();
		Preconditions.checkArgument(serdeLib instanceof Serializer && serdeLib instanceof Deserializer,
				"Expect a SerDe lib implementing both Serializer and Deserializer, but actually got " + serdeLib.getClass().getName());
		recordSerDe = (Serializer) serdeLib;
		ReflectionUtils.setConf(recordSerDe, jobConf);
		// TODO: support partition properties, for now assume they're same as table properties
		SerDeUtils.initializeSerDe((Deserializer) recordSerDe, jobConf, tableProperties, null);
		outputClass = recordSerDe.getSerializedClass();
	} catch (IllegalAccessException | SerDeException | InstantiationException | ClassNotFoundException e) {
		throw new FlinkRuntimeException("Error initializing Hive serializer", e);
	}

	TaskAttemptID taskAttemptID = TaskAttemptID.forName("attempt__0000_r_"
		+ String.format("%" + (6 - Integer.toString(taskNumber).length()) + "s", " ").replace(" ", "0")
		+ taskNumber + "_0");

	this.jobConf.set("mapred.task.id", taskAttemptID.toString());
	this.jobConf.setInt("mapred.task.partition", taskNumber);
	// for hadoop 2.2
	this.jobConf.set("mapreduce.task.attempt.id", taskAttemptID.toString());
	this.jobConf.setInt("mapreduce.task.partition", taskNumber);

	this.context = new TaskAttemptContextImpl(this.jobConf, taskAttemptID);

	if (!isDynamicPartition) {
		staticWriter = writerForLocation(hiveTablePartition.getStorageDescriptor().getLocation());
	} else {
		dynamicPartitionOffset = fieldNames.length - partitionColumns.size() + hiveTablePartition.getPartitionSpec().size();
	}

	numNonPartitionColumns = isPartitioned ? fieldNames.length - partitionColumns.size() : fieldNames.length;
	hiveConversions = new HiveObjectConversion[numNonPartitionColumns];
	List<ObjectInspector> objectInspectors = new ArrayList<>(hiveConversions.length);
	for (int i = 0; i < numNonPartitionColumns; i++) {
		ObjectInspector objectInspector = HiveInspectors.getObjectInspector(fieldTypes[i]);
		objectInspectors.add(objectInspector);
		hiveConversions[i] = HiveInspectors.getConversion(objectInspector, fieldTypes[i].getLogicalType());
	}

	if (!isPartitioned) {
		rowObjectInspector = ObjectInspectorFactory.getStandardStructObjectInspector(
			Arrays.asList(fieldNames),
			objectInspectors);
	} else {
		rowObjectInspector = ObjectInspectorFactory.getStandardStructObjectInspector(
			Arrays.asList(fieldNames).subList(0, fieldNames.length - partitionColumns.size()),
			objectInspectors);
		defaultPartitionName = jobConf.get(HiveConf.ConfVars.DEFAULTPARTITIONNAME.varname,
				HiveConf.ConfVars.DEFAULTPARTITIONNAME.defaultStrVal);
	}
}
 
Example 17
Source Project: presto   Source File: RcFileTester.java    License: Apache License 2.0 votes vote down vote up
public abstract Serializer createSerializer();