Java Code Examples for org.apache.hadoop.hive.ql.io.IOConstants

The following examples show how to use org.apache.hadoop.hive.ql.io.IOConstants. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: presto   Source File: TestCachingOrcDataSource.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Builds a Hive ORC record writer that writes to {@code outputFile}.
 *
 * <p>The ORC write format version and the compression kind are set on a fresh {@link JobConf};
 * the table properties declare a single column named "test" whose type name comes from
 * {@code columnObjectInspector}, plus a fixed stripe-size setting of "120000".
 *
 * @param outputFile file the writer targets
 * @param format ORC format; {@code ORC_12} selects "0.12", anything else "0.11"
 * @param compression compression kind; anything other than {@code NONE} marks the output compressed
 * @param columnObjectInspector inspector supplying the column type name
 * @throws IOException declared by the signature
 */
private static FileSinkOperator.RecordWriter createOrcRecordWriter(File outputFile, Format format, CompressionKind compression, ObjectInspector columnObjectInspector)
        throws IOException
{
    JobConf conf = new JobConf();
    String writeFormatVersion = (format == ORC_12) ? "0.12" : "0.11";
    OrcConf.WRITE_FORMAT.setString(conf, writeFormatVersion);
    OrcConf.COMPRESS.setString(conf, compression.name());

    Properties schema = new Properties();
    schema.setProperty(IOConstants.COLUMNS, "test");
    schema.setProperty(IOConstants.COLUMNS_TYPES, columnObjectInspector.getTypeName());
    schema.setProperty(OrcConf.STRIPE_SIZE.getAttribute(), "120000");

    OrcOutputFormat outputFormat = new OrcOutputFormat();
    Path outputPath = new Path(outputFile.toURI());
    boolean compressed = compression != NONE;

    // The last argument is a no-op callback.
    return outputFormat.getHiveRecordWriter(conf, outputPath, Text.class, compressed, schema, () -> {});
}
 
Example 2
Source Project: presto   Source File: OrcFileWriter.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Test-visible constructor.
 *
 * <p>Validates that ids and types line up one-to-one and that ids are unique, publishes the
 * column names (the ids rendered as strings) and Hive type names as table properties for the
 * serializer, opens the record writer on {@code target}, and prepares the struct inspector and
 * the reusable row object.
 */
@VisibleForTesting
OrcFileWriter(List<Long> columnIds, List<Type> columnTypes, File target, boolean writeMetadata)
{
    requireNonNull(columnTypes, "columnTypes is null");
    this.columnTypes = ImmutableList.copyOf(columnTypes);
    checkArgument(columnIds.size() == columnTypes.size(), "ids and types mismatch");
    checkArgument(isUnique(columnIds), "ids must be unique");

    List<StorageType> storage = ImmutableList.copyOf(toStorageTypes(columnTypes));
    Iterable<String> hiveTypes = storage.stream()
            .map(StorageType::getHiveTypeName)
            .collect(toList());
    // Column names are simply the column ids rendered as strings.
    List<String> names = columnIds.stream().map(Objects::toString).collect(toImmutableList());

    // Names are comma-separated, Hive type names colon-separated.
    String joinedNames = Joiner.on(',').join(names);
    String joinedTypes = Joiner.on(':').join(hiveTypes);
    Properties tableProperties = new Properties();
    tableProperties.setProperty(IOConstants.COLUMNS, joinedNames);
    tableProperties.setProperty(IOConstants.COLUMNS_TYPES, joinedTypes);

    serializer = createSerializer(tableProperties);
    Path targetPath = new Path(target.toURI());
    recordWriter = createRecordWriter(targetPath, columnIds, columnTypes, writeMetadata);

    tableInspector = getStandardStructObjectInspector(names, getJavaObjectInspectors(storage));
    structFields = ImmutableList.copyOf(tableInspector.getAllStructFieldRefs());
    orcRow = tableInspector.create();
}
 
Example 3
Source Project: dremio-oss   Source File: HiveUtilities.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Prepares <i>job</i> for reading a transactional (ACID) table. Does nothing when the table
 * properties carried by <i>job</i> do not mark the table as transactional.
 * @param job job configuration; must already contain the table properties
 */
public static void addACIDPropertiesIfNeeded(final JobConf job) {
  if (AcidUtils.isTablePropertyTransactional(job)) {
    AcidUtils.setTransactionalTableScan(job, true);

    // Schema evolution is already configured when it is enabled and both properties are set.
    final boolean evolutionSchemaPresent = Utilities.isSchemaEvolutionEnabled(job, true)
        && job.get(IOConstants.SCHEMA_EVOLUTION_COLUMNS) != null
        && job.get(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES) != null;

    if (!evolutionSchemaPresent) {
      // Otherwise copy the column names and types over from the table properties.
      final String tableColumns = job.get(serdeConstants.LIST_COLUMNS);
      job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, tableColumns);
      final String tableColumnTypes = job.get(serdeConstants.LIST_COLUMN_TYPES);
      job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, tableColumnTypes);
    }
  }
}
 
Example 4
Source Project: dremio-oss   Source File: HiveUtilities.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Sets the configuration needed to read a transactional (ACID) table. The table properties
 * must already be present in <i>job</i>; non-transactional tables are left untouched.
 * @param job job configuration holding the table properties
 */
public static void addACIDPropertiesIfNeeded(final JobConf job) {
  final boolean transactional = AcidUtils.isTablePropertyTransactional(job);
  if (!transactional) {
    return;
  }

  AcidUtils.setAcidOperationalProperties(job, true, null);

  // Fill in the schema evolution properties unless evolution is enabled and both are already set.
  if (!Utilities.isSchemaEvolutionEnabled(job, true)
      || job.get(IOConstants.SCHEMA_EVOLUTION_COLUMNS) == null
      || job.get(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES) == null) {
    // The values come from the column list and column types in the table properties.
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, job.get(serdeConstants.LIST_COLUMNS));
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, job.get(serdeConstants.LIST_COLUMN_TYPES));
  }
}
 
Example 5
Source Project: flink   Source File: HiveTableInputFormat.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Publishes the table schema and the projected column ids to {@code jobConf}.
 *
 * <p>The columns/types properties cover every field, partition columns included. The
 * schema-evolution properties and the read-column ids only cover the leading non-partition fields.
 */
private void addSchemaToConf(JobConf jobConf) {
	// Hive type string for every field, partition columns included.
	List<String> hiveTypeNames = Arrays.stream(fieldTypes)
			.map(fieldType -> HiveTypeUtil.toHiveTypeInfo(fieldType, true).toString())
			.collect(Collectors.toList());
	String allNames = String.join(",", fieldNames);
	jobConf.set(IOConstants.COLUMNS, allNames);
	String allTypes = String.join(",", hiveTypeNames);
	jobConf.set(IOConstants.COLUMNS_TYPES, allTypes);

	// Schema evolution only sees the non-partition columns, which come first.
	int dataColumnCount = fieldNames.length - partitionKeys.size();
	String[] dataColumnNames = Arrays.copyOfRange(fieldNames, 0, dataColumnCount);
	jobConf.set(SCHEMA_EVOLUTION_COLUMNS, String.join(",", dataColumnNames));
	List<String> dataColumnTypes = hiveTypeNames.subList(0, dataColumnCount);
	jobConf.set(SCHEMA_EVOLUTION_COLUMNS_TYPES, String.join(",", dataColumnTypes));

	// Older parquet readers also expect the selected column indices, partition columns excluded.
	String projectedIds = Arrays.stream(selectedFields)
			.filter(index -> index < dataColumnCount)
			.mapToObj(String::valueOf)
			.collect(Collectors.joining(","));
	jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, projectedIds);
}
 
Example 6
Source Project: presto   Source File: IcebergFileWriterFactory.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates a Parquet-backed {@link IcebergFileWriter} for {@code outputPath}.
 *
 * <p>Column names (comma-separated) and Hive type names (colon-separated) are passed as table
 * properties. The Parquet schema converted from {@code icebergSchema} and the session's
 * compression codec are set on {@code jobConf} before the writer is constructed.
 */
private IcebergFileWriter createParquetWriter(
        Path outputPath,
        Schema icebergSchema,
        List<IcebergColumnHandle> columns,
        JobConf jobConf,
        ConnectorSession session)
{
    String joinedNames = columns.stream()
            .map(IcebergColumnHandle::getName)
            .collect(joining(","));
    Properties tableProperties = new Properties();
    tableProperties.setProperty(IOConstants.COLUMNS, joinedNames);

    String joinedTypes = columns.stream()
            .map(handle -> toHiveType(handle.getType()).getHiveTypeName().toString())
            .collect(joining(":"));
    tableProperties.setProperty(IOConstants.COLUMNS_TYPES, joinedTypes);

    setParquetSchema(jobConf, convert(icebergSchema, "table"));
    String codecName = getCompressionCodec(session).getParquetCompressionCodec().name();
    jobConf.set(ParquetOutputFormat.COMPRESSION, codecName);

    List<String> columnNames = columns.stream()
            .map(IcebergColumnHandle::getName)
            .collect(toImmutableList());

    return new IcebergRecordFileWriter(
            outputPath,
            columnNames,
            fromHiveStorageFormat(HiveStorageFormat.PARQUET),
            tableProperties,
            HiveStorageFormat.PARQUET.getEstimatedWriterSystemMemoryUsage(),
            jobConf,
            typeManager,
            session);
}
 
Example 7
Source Project: presto   Source File: ParquetRecordWriterUtil.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates a Parquet record writer for {@code target}.
 *
 * <p>When {@code conf} does not yet carry a Parquet Hive schema, one is derived from the column
 * names and types in {@code properties} and stored on {@code conf} first.
 */
private static RecordWriter createParquetWriter(Path target, JobConf conf, Properties properties)
        throws IOException
{
    boolean schemaMissing = conf.get(DataWritableWriteSupport.PARQUET_HIVE_SCHEMA) == null;
    if (schemaMissing) {
        String joinedNames = properties.getProperty(IOConstants.COLUMNS);
        List<String> names = Splitter.on(',').splitToList(joinedNames);
        String typeString = properties.getProperty(IOConstants.COLUMNS_TYPES);
        List<TypeInfo> types = getTypeInfosFromTypeString(typeString);
        setParquetSchema(conf, HiveSchemaConverter.convert(names, types));
    }

    DataWritableWriteSupport writeSupport = new DataWritableWriteSupport();
    ParquetOutputFormat<ParquetHiveRecord> parquetFormat = new ParquetOutputFormat<>(writeSupport);
    String targetName = target.toString();

    return new ParquetRecordWriterWrapper(parquetFormat, conf, targetName, Reporter.NULL, properties);
}
 
Example 8
Source Project: indexr   Source File: IndexRSerde.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Initializes the serde from the table properties: reads column names and types, defaults
 * every column to "string" when no types are given, builds the row object inspector and
 * resets the statistics.
 *
 * @throws IllegalArgumentException if the number of column names and column types differ
 */
@Override
public void initialize(Configuration conf, Properties tbl) throws SerDeException {
    final String names = tbl.getProperty(IOConstants.COLUMNS);
    final String types = tbl.getProperty(IOConstants.COLUMNS_TYPES);

    columnNames = Strings.isEmpty(names)
            ? new ArrayList<String>()
            : Arrays.asList(names.split(","));

    // Without declared types, treat every column as a string.
    final String effectiveTypes = Strings.isEmpty(types)
            ? StringUtils.repeat("string", ":", columnNames.size())
            : types;
    columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(effectiveTypes);

    final boolean sameArity = columnNames.size() == columnTypes.size();
    if (!sameArity) {
        throw new IllegalArgumentException("IndexRHiveSerde initialization failed. Number of column " +
                "name and column type differs. columnNames = " + columnNames + ", columnTypes = " +
                columnTypes);
    }

    final StructTypeInfo rowType = (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes);
    this.objInspector = new ArrayWritableObjectInspector(rowType);

    stats = new SerDeStats();
    serdeSize = 0;
}
 
Example 9
Source Project: tajo   Source File: TestHiveCatalogStore.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates a sequence-file table configured with the binary serde, then checks that the Hive
 * table's input/output formats match the SEQUENCEFILE descriptor and that name, URI, column
 * names and the serde option survive a round trip through the store.
 */
@Test
public void testTableUsingSequenceFileWithBinarySerde() throws Exception {
  KeyValueSet serdeOptions = new KeyValueSet();
  serdeOptions.set(StorageConstants.SEQUENCEFILE_SERDE, StorageConstants.DEFAULT_BINARY_SERDE);
  TableMeta tableMeta = new TableMeta(BuiltinStorages.SEQUENCE_FILE, serdeOptions);

  org.apache.tajo.catalog.Schema regionSchema = SchemaBuilder.builder()
      .add("r_regionkey", TajoDataTypes.Type.INT4)
      .add("r_name", TajoDataTypes.Type.TEXT)
      .add("r_comment", TajoDataTypes.Type.TEXT)
      .build();

  String qualifiedName = IdentifierUtil.buildFQName(DB_NAME, REGION);
  Path tablePath = new Path(warehousePath, new Path(DB_NAME, REGION));
  TableDesc created = new TableDesc(qualifiedName, regionSchema, tableMeta, tablePath.toUri());
  store.createTable(created.getProto());
  assertTrue(store.existTable(DB_NAME, REGION));

  // The Hive-side storage descriptor must use the sequence-file formats.
  StorageFormatDescriptor sequenceFile = formatFactory.get(IOConstants.SEQUENCEFILE);
  org.apache.hadoop.hive.ql.metadata.Table hiveTable = store.getHiveTable(DB_NAME, REGION);
  assertEquals(sequenceFile.getInputFormat(), hiveTable.getSd().getInputFormat());
  assertEquals(sequenceFile.getOutputFormat(), hiveTable.getSd().getOutputFormat());

  // Round trip: what comes back from the store must match what was created.
  TableDesc loaded = new TableDesc(store.getTable(DB_NAME, REGION));
  assertEquals(created.getName(), loaded.getName());
  assertEquals(created.getUri(), loaded.getUri());
  assertEquals(created.getSchema().size(), loaded.getSchema().size());
  int column = 0;
  while (column < created.getSchema().size()) {
    assertEquals(created.getSchema().getColumn(column).getSimpleName(),
        loaded.getSchema().getColumn(column).getSimpleName());
    column++;
  }

  assertEquals(StorageConstants.DEFAULT_BINARY_SERDE,
    loaded.getMeta().getProperty(StorageConstants.SEQUENCEFILE_SERDE));
  store.dropTable(DB_NAME, REGION);
}
 
Example 10
Source Project: tajo   Source File: TestHiveCatalogStore.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates a sequence-file table configured with the text serde, a "\u0001" delimiter and the
 * default null text, then checks the Hive formats and that name, URI, column names and all
 * three options survive a round trip through the store.
 */
@Test
public void testTableUsingSequenceFileWithTextSerde() throws Exception {
  KeyValueSet serdeOptions = new KeyValueSet();
  serdeOptions.set(StorageConstants.SEQUENCEFILE_SERDE, StorageConstants.DEFAULT_TEXT_SERDE);
  serdeOptions.set(StorageConstants.TEXT_DELIMITER, "\u0001");
  serdeOptions.set(StorageConstants.TEXT_NULL, NullDatum.DEFAULT_TEXT);
  TableMeta tableMeta = new TableMeta(BuiltinStorages.SEQUENCE_FILE, serdeOptions);

  org.apache.tajo.catalog.Schema regionSchema = SchemaBuilder.builder()
      .add("r_regionkey", TajoDataTypes.Type.INT4)
      .add("r_name", TajoDataTypes.Type.TEXT)
      .add("r_comment", TajoDataTypes.Type.TEXT)
      .build();

  String qualifiedName = IdentifierUtil.buildFQName(DB_NAME, REGION);
  Path tablePath = new Path(warehousePath, new Path(DB_NAME, REGION));
  TableDesc created = new TableDesc(qualifiedName, regionSchema, tableMeta, tablePath.toUri());
  store.createTable(created.getProto());
  assertTrue(store.existTable(DB_NAME, REGION));

  // The Hive-side storage descriptor must use the sequence-file formats.
  StorageFormatDescriptor sequenceFile = formatFactory.get(IOConstants.SEQUENCEFILE);
  org.apache.hadoop.hive.ql.metadata.Table hiveTable = store.getHiveTable(DB_NAME, REGION);
  assertEquals(sequenceFile.getInputFormat(), hiveTable.getSd().getInputFormat());
  assertEquals(sequenceFile.getOutputFormat(), hiveTable.getSd().getOutputFormat());

  // Round trip: what comes back from the store must match what was created.
  TableDesc loaded = new TableDesc(store.getTable(DB_NAME, REGION));
  assertEquals(created.getName(), loaded.getName());
  assertEquals(created.getUri(), loaded.getUri());
  assertEquals(created.getSchema().size(), loaded.getSchema().size());
  int column = 0;
  while (column < created.getSchema().size()) {
    assertEquals(created.getSchema().getColumn(column).getSimpleName(),
        loaded.getSchema().getColumn(column).getSimpleName());
    column++;
  }

  assertEquals(StorageConstants.DEFAULT_TEXT_SERDE, loaded.getMeta().getProperty(StorageConstants.SEQUENCEFILE_SERDE));
  // The stored delimiter is unescaped before it is compared with the raw character.
  String storedDelimiter = loaded.getMeta().getProperty(StorageConstants.TEXT_DELIMITER);
  assertEquals("\u0001", StringEscapeUtils.unescapeJava(storedDelimiter));
  assertEquals(NullDatum.DEFAULT_TEXT, loaded.getMeta().getProperty(StorageConstants.TEXT_NULL));
  store.dropTable(DB_NAME, REGION);
}
 
Example 11
Source Project: tajo   Source File: TestHiveCatalogStore.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates a "PARQUET" customer table, then checks that the Hive table's input/output formats
 * match the PARQUET descriptor and that name, URI and column names survive a round trip
 * through the store.
 */
@Test
public void testTableUsingParquet() throws Exception {
  TableMeta tableMeta = new TableMeta("PARQUET", new KeyValueSet());

  org.apache.tajo.catalog.Schema customerSchema = SchemaBuilder.builder()
      .add("c_custkey", TajoDataTypes.Type.INT4)
      .add("c_name", TajoDataTypes.Type.TEXT)
      .add("c_address", TajoDataTypes.Type.TEXT)
      .add("c_nationkey", TajoDataTypes.Type.INT4)
      .add("c_phone", TajoDataTypes.Type.TEXT)
      .add("c_acctbal", TajoDataTypes.Type.FLOAT8)
      .add("c_mktsegment", TajoDataTypes.Type.TEXT)
      .add("c_comment", TajoDataTypes.Type.TEXT)
      .build();

  String qualifiedName = IdentifierUtil.buildFQName(DB_NAME, CUSTOMER);
  Path tablePath = new Path(warehousePath, new Path(DB_NAME, CUSTOMER));
  TableDesc created = new TableDesc(qualifiedName, customerSchema, tableMeta, tablePath.toUri());
  store.createTable(created.getProto());
  assertTrue(store.existTable(DB_NAME, CUSTOMER));

  // The Hive-side storage descriptor must use the Parquet formats.
  StorageFormatDescriptor parquet = formatFactory.get(IOConstants.PARQUET);
  org.apache.hadoop.hive.ql.metadata.Table hiveTable = store.getHiveTable(DB_NAME, CUSTOMER);
  assertEquals(parquet.getInputFormat(), hiveTable.getSd().getInputFormat());
  assertEquals(parquet.getOutputFormat(), hiveTable.getSd().getOutputFormat());

  // Round trip: what comes back from the store must match what was created.
  TableDesc loaded = new TableDesc(store.getTable(DB_NAME, CUSTOMER));
  assertEquals(created.getName(), loaded.getName());
  assertEquals(created.getUri(), loaded.getUri());
  assertEquals(created.getSchema().size(), loaded.getSchema().size());
  int column = 0;
  while (column < created.getSchema().size()) {
    assertEquals(created.getSchema().getColumn(column).getSimpleName(),
        loaded.getSchema().getColumn(column).getSimpleName());
    column++;
  }

  store.dropTable(DB_NAME, CUSTOMER);
}
 
Example 12
Source Project: incubator-gobblin   Source File: HiveSerDeConverter.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Stores the comma-joined column names and column types, as reported by an
 * {@link AvroObjectInspectorGenerator} built from the schema determined from the state's
 * properties, on {@code state}.
 */
private void setColumnsIfPossible(WorkUnitState state)
    throws SerDeException {
  AvroObjectInspectorGenerator generator = new AvroObjectInspectorGenerator(
      AvroSerdeUtils.determineSchemaOrReturnErrorSchema(state.getProperties()));

  List<String> names = generator.getColumnNames();
  List<TypeInfo> types = generator.getColumnTypes();
  String joinedNames = StringUtils.join(names, ",");
  String joinedTypes = StringUtils.join(types, ",");

  state.setProp(IOConstants.COLUMNS, joinedNames);
  state.setProp(IOConstants.COLUMNS_TYPES, joinedTypes);
}
 
Example 13
Source Project: parquet-mr   Source File: MapredParquetOutputFormat.java    License: Apache License 2.0 5 votes vote down vote up
/**
 *
 * Derives the parquet schema from the hive column names and types found in the table
 * properties, stores it on the job configuration, and returns the record writer wrapper
 * around the real output format.
 */
@Override
public FileSinkOperator.RecordWriter getHiveRecordWriter(
    final JobConf jobConf,
    final Path finalOutPath,
    final Class<? extends Writable> valueClass,
    final boolean isCompressed,
    final Properties tableProperties,
    final Progressable progress) throws IOException {

  LOG.info("creating new record writer...{}", this);

  final String names = tableProperties.getProperty(IOConstants.COLUMNS);
  final String types = tableProperties.getProperty(IOConstants.COLUMNS_TYPES);

  // An empty property means "no columns"; splitting "" would yield one empty name instead.
  final List<String> columnNames = names.isEmpty()
      ? new ArrayList<String>()
      : Arrays.asList(names.split(","));
  final List<TypeInfo> columnTypes = types.isEmpty()
      ? new ArrayList<TypeInfo>()
      : TypeInfoUtils.getTypeInfosFromTypeString(types);

  DataWritableWriteSupport.setSchema(HiveSchemaConverter.convert(columnNames, columnTypes), jobConf);

  final String outputLocation = finalOutPath.toString();
  return getParquerRecordWriterWrapper(realOutputFormat, jobConf, outputLocation, progress);
}
 
Example 14
Source Project: parquet-mr   Source File: ParquetHiveSerDe.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Initializes the serde from the table properties: parses column names and types, builds the
 * struct object inspector for a row, and resets the statistics and last-operation status.
 *
 * @throws IllegalArgumentException if the number of column names and column types differ
 */
@Override
public final void initialize(final Configuration conf, final Properties tbl) throws SerDeException {

  // Column names and types as declared on the table
  final String names = tbl.getProperty(IOConstants.COLUMNS);
  final String types = tbl.getProperty(IOConstants.COLUMNS_TYPES);

  // An empty property means "no columns"; splitting "" would yield one empty name instead.
  final List<String> columnNames = names.isEmpty()
      ? new ArrayList<String>()
      : Arrays.asList(names.split(","));
  final List<TypeInfo> columnTypes = types.isEmpty()
      ? new ArrayList<TypeInfo>()
      : TypeInfoUtils.getTypeInfosFromTypeString(types);

  final boolean sameArity = columnNames.size() == columnTypes.size();
  if (!sameArity) {
    throw new IllegalArgumentException("ParquetHiveSerde initialization failed. Number of column " +
      "name and column type differs. columnNames = " + columnNames + ", columnTypes = " +
      columnTypes);
  }

  // Row-level type info and its inspector
  final TypeInfo rowTypeInfo = TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes);
  this.objInspector = new ArrayWritableObjectInspector((StructTypeInfo) rowTypeInfo);

  // Statistics start from a clean slate
  stats = new SerDeStats();
  serializedSize = 0;
  deserializedSize = 0;
  status = LAST_OPERATION.UNKNOWN;
}
 
Example 15
Source Project: parquet-mr   Source File: DataWritableReadSupport.java    License: Apache License 2.0 5 votes vote down vote up
/**
 *
 * Creates the Parquet {@link ReadContext} carrying the requested schema during the init phase.
 *
 * <p>When the configuration lists the table columns, a table schema is assembled from them:
 * columns present in the file keep the file's type, columns absent from the file are added as
 * optional binary fields. The requested schema is then built from the projected column ids.
 * Ids that fall outside the configured column list are skipped instead of failing with an
 * {@link IndexOutOfBoundsException}. When no columns are configured, the file schema is used
 * as-is.
 *
 * @param configuration needed to get the wanted columns
 * @param keyValueMetaData // unused
 * @param fileSchema parquet file schema
 * @return the parquet ReadContext
 */
@Override
public org.apache.parquet.hadoop.api.ReadSupport.ReadContext init(final Configuration configuration,
    final Map<String, String> keyValueMetaData, final MessageType fileSchema) {
  final String columns = configuration.get(IOConstants.COLUMNS);
  final Map<String, String> contextMetadata = new HashMap<String, String>();
  if (columns != null) {
    final List<String> listColumns = getColumns(columns);

    final List<Type> typeListTable = new ArrayList<Type>();
    for (final String col : listColumns) {
      // listColumns contains partition columns which are metadata only
      if (fileSchema.containsField(col)) {
        typeListTable.add(fileSchema.getType(col));
      } else {
        // below allows schema evolution
        typeListTable.add(new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, col));
      }
    }
    final MessageType tableSchema = new MessageType(TABLE_SCHEMA, typeListTable);
    contextMetadata.put(HIVE_SCHEMA_KEY, tableSchema.toString());

    final List<Integer> indexColumnsWanted = ColumnProjectionUtils.getReadColumnIDs(configuration);

    final List<Type> typeListWanted = new ArrayList<Type>();
    for (final Integer idx : indexColumnsWanted) {
      // Skip ids that do not map to a configured column rather than throwing from get(idx).
      if (idx >= 0 && idx < listColumns.size()) {
        typeListWanted.add(tableSchema.getType(listColumns.get(idx)));
      }
    }
    final MessageType requestedSchemaByUser = resolveSchemaAccess(
        new MessageType(fileSchema.getName(), typeListWanted), fileSchema, configuration);

    return new ReadContext(requestedSchemaByUser, contextMetadata);
  } else {
    contextMetadata.put(HIVE_SCHEMA_KEY, fileSchema.toString());
    return new ReadContext(fileSchema, contextMetadata);
  }
}
 
Example 16
Source Project: presto   Source File: HiveUtil.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Returns the column names listed under {@code IOConstants.COLUMNS} in {@code schema}, split
 * with {@code COLUMN_NAMES_SPLITTER}; a missing property is treated as the empty string.
 */
public static List<String> getColumnNames(Properties schema)
{
    String joinedNames = schema.getProperty(IOConstants.COLUMNS, "");
    return COLUMN_NAMES_SPLITTER.splitToList(joinedNames);
}
 
Example 17
Source Project: presto   Source File: HiveUtil.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Returns the Hive types parsed from the {@code IOConstants.COLUMNS_TYPES} property of
 * {@code schema}; a missing property is treated as the empty string.
 */
public static List<HiveType> getColumnTypes(Properties schema)
{
    String typeString = schema.getProperty(IOConstants.COLUMNS_TYPES, "");
    return toHiveTypes(typeString);
}
 
Example 18
Source Project: localization_nifi   Source File: OrcFlowFileWriter.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Creates a writer and initializes its state from the supplied options.
 *
 * <p>After copying the options into fields, the constructor resolves the full column list
 * (from {@code IOConstants.COLUMNS} in {@code conf}, falling back to the inspector), derives
 * the buffer size, selects the bloom filter columns, builds the tree writer, validates the
 * row index stride and finally registers this writer with the memory manager.
 *
 * @throws IllegalArgumentException if {@code rowIndexStride} is positive but smaller than
 *         {@code MIN_ROW_INDEX_STRIDE}
 * @throws IOException declared by the signature
 */
public OrcFlowFileWriter(OutputStream flowFileOutputStream,
                         Path path,
                         Configuration conf,
                         ObjectInspector inspector,
                         long stripeSize,
                         CompressionKind compress,
                         int bufferSize,
                         int rowIndexStride,
                         MemoryManager memoryManager,
                         boolean addBlockPadding,
                         OrcFile.Version version,
                         OrcFile.WriterCallback callback,
                         EncodingStrategy encodingStrategy,
                         CompressionStrategy compressionStrategy,
                         float paddingTolerance,
                         long blockSizeValue,
                         String bloomFilterColumnNames,
                         double bloomFilterFpp) throws IOException {
    this.flowFileOutputStream = flowFileOutputStream;
    this.path = path;
    this.conf = conf;
    this.callback = callback;
    // The callback context hands this writer back to the callback; none is created without a callback.
    callbackContext = (callback != null) ? () -> OrcFlowFileWriter.this : null;
    // Both stripe-size fields start from the requested stripe size.
    this.adjustedStripeSize = stripeSize;
    this.defaultStripeSize = stripeSize;
    this.version = version;
    this.encodingStrategy = encodingStrategy;
    this.compressionStrategy = compressionStrategy;
    this.addBlockPadding = addBlockPadding;
    this.blockSize = blockSizeValue;
    this.paddingTolerance = paddingTolerance;
    this.compress = compress;
    this.rowIndexStride = rowIndexStride;
    this.memoryManager = memoryManager;
    // A positive stride enables index building.
    buildIndex = rowIndexStride > 0;
    codec = createCodec(compress);
    // Prefer the column list from the configuration; otherwise derive it from the inspector.
    String allColumns = conf.get(IOConstants.COLUMNS);
    if (allColumns == null) {
        allColumns = getColumnNamesFromInspector(inspector);
    }
    this.bufferSize = getEstimatedBufferSize(allColumns, bufferSize);
    if (version == OrcFile.Version.V_0_11) {
        /* do not write bloom filters for ORC v11 */
        this.bloomFilterColumns =
                OrcUtils.includeColumns(null, allColumns, inspector);
    } else {
        this.bloomFilterColumns =
                OrcUtils.includeColumns(bloomFilterColumnNames, allColumns, inspector);
    }
    this.bloomFilterFpp = bloomFilterFpp;
    treeWriter = createTreeWriter(inspector, streamFactory, false);
    // The stride is only validated when index building is enabled.
    if (buildIndex && rowIndexStride < MIN_ROW_INDEX_STRIDE) {
        throw new IllegalArgumentException("Row stride must be at least " +
                MIN_ROW_INDEX_STRIDE);
    }

    // ensure that we are able to handle callbacks before we register ourselves
    memoryManager.addWriter(path, stripeSize, this);
}
 
Example 19
Source Project: localization_nifi   Source File: OrcFlowFileWriter.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Test-visible overload that estimates the buffer size for {@code bs} using the column list
 * stored under {@code IOConstants.COLUMNS} in this writer's configuration.
 */
@VisibleForTesting
int getEstimatedBufferSize(int bs) {
    final String configuredColumns = conf.get(IOConstants.COLUMNS);
    return getEstimatedBufferSize(configuredColumns, bs);
}
 
Example 20
Source Project: tajo   Source File: TestHiveCatalogStore.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Creates a text-file customer table, verifies the Hive-side formats and the round trip
 * through the store, then exercises setting and unsetting table properties.
 */
@Test
public void testTableUsingTextFile() throws Exception {
  TableMeta meta = new TableMeta(BuiltinStorages.TEXT, new KeyValueSet());

  org.apache.tajo.catalog.Schema schema = SchemaBuilder.builder()
      .add("c_custkey", TajoDataTypes.Type.INT4)
      .add("c_name", TajoDataTypes.Type.TEXT)
      .add("c_address", TajoDataTypes.Type.TEXT)
      .add("c_nationkey", TajoDataTypes.Type.INT4)
      .add("c_phone", TajoDataTypes.Type.TEXT)
      .add("c_acctbal", TajoDataTypes.Type.FLOAT8)
      .add("c_mktsegment", TajoDataTypes.Type.TEXT)
      .add("c_comment", TajoDataTypes.Type.TEXT)
      .build();

  TableDesc table = new TableDesc(IdentifierUtil.buildFQName(DB_NAME, CUSTOMER), schema, meta,
      new Path(warehousePath, new Path(DB_NAME, CUSTOMER)).toUri());
  store.createTable(table.getProto());
  assertTrue(store.existTable(DB_NAME, CUSTOMER));

  // The input format must match the TEXTFILE descriptor.
  StorageFormatDescriptor descriptor = formatFactory.get(IOConstants.TEXTFILE);
  org.apache.hadoop.hive.ql.metadata.Table hiveTable = store.getHiveTable(DB_NAME, CUSTOMER);
  assertEquals(descriptor.getInputFormat(), hiveTable.getSd().getInputFormat());
  //IgnoreKeyTextOutputFormat was deprecated
  assertEquals(HiveIgnoreKeyTextOutputFormat.class.getName(), hiveTable.getSd().getOutputFormat());

  // Round trip: name, URI and column names read back from the store must match.
  TableDesc table1 = new TableDesc(store.getTable(DB_NAME, CUSTOMER));
  assertEquals(table.getName(), table1.getName());
  assertEquals(table.getUri(), table1.getUri());
  assertEquals(table.getSchema().size(), table1.getSchema().size());
  for (int i = 0; i < table.getSchema().size(); i++) {
    assertEquals(table.getSchema().getColumn(i).getSimpleName(), table1.getSchema().getColumn(i).getSimpleName());
  }

  // The stored delimiter is compared against the Java-escaped default field delimiter.
  assertEquals(StringEscapeUtils.escapeJava(StorageConstants.DEFAULT_FIELD_DELIMITER),
      table1.getMeta().getProperty(StorageConstants.TEXT_DELIMITER));

  // Set two extra properties and check they are visible alongside the existing delimiter.
  Map<String, String> expected = getProperties(DB_NAME, CUSTOMER);
  Map<String, String> toSet = new ImmutableMap.Builder<String, String>()
      .put("key1", "value1")
      .put("key2", "value2")
      .build();
  expected.putAll(toSet);

  setProperty(DB_NAME, CUSTOMER, toSet);
  Map<String, String> actual = getProperties(DB_NAME, CUSTOMER);
  assertEquals(actual.get(StorageConstants.TEXT_DELIMITER), expected.get(StorageConstants.TEXT_DELIMITER));
  assertEquals(actual.get("key1"), expected.get("key1"));
  assertEquals(actual.get("key2"), expected.get("key2"));

  // Unset one property that was set ("key2") and one that was never set ("key3").
  Set<String> toUnset = Sets.newHashSet("key2", "key3");
  for (String key : toUnset) {
    expected.remove(key);
  }
  unSetProperty(DB_NAME, CUSTOMER, toUnset);
  actual = getProperties(DB_NAME, CUSTOMER);
  assertEquals(actual.get(StorageConstants.TEXT_DELIMITER), expected.get(StorageConstants.TEXT_DELIMITER));
  assertEquals(actual.get("key1"), expected.get("key1"));
  assertNull(actual.get("key2"));

  store.dropTable(DB_NAME, CUSTOMER);
}
 
Example 21
Source Project: tajo   Source File: TestHiveCatalogStore.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Creates an RCFile region table configured with the binary serde, verifies the Hive-side
 * formats and the round trip through the store, then exercises setting and unsetting table
 * properties.
 */
@Test
public void testTableUsingRCFileWithBinarySerde() throws Exception {
  KeyValueSet options = new KeyValueSet();
  options.set(StorageConstants.RCFILE_SERDE, StorageConstants.DEFAULT_BINARY_SERDE);
  TableMeta meta = new TableMeta(BuiltinStorages.RCFILE, options);

  org.apache.tajo.catalog.Schema schema = SchemaBuilder.builder()
      .add("r_regionkey", TajoDataTypes.Type.INT4)
      .add("r_name", TajoDataTypes.Type.TEXT)
      .add("r_comment", TajoDataTypes.Type.TEXT)
      .build();

  TableDesc table = new TableDesc(IdentifierUtil.buildFQName(DB_NAME, REGION), schema, meta,
      new Path(warehousePath, new Path(DB_NAME, REGION)).toUri());
  store.createTable(table.getProto());
  assertTrue(store.existTable(DB_NAME, REGION));

  // Input and output formats must match the RCFILE descriptor.
  StorageFormatDescriptor descriptor = formatFactory.get(IOConstants.RCFILE);
  org.apache.hadoop.hive.ql.metadata.Table hiveTable = store.getHiveTable(DB_NAME, REGION);
  assertEquals(descriptor.getInputFormat(), hiveTable.getSd().getInputFormat());
  assertEquals(descriptor.getOutputFormat(), hiveTable.getSd().getOutputFormat());

  // Round trip: name, URI and column names read back from the store must match.
  TableDesc table1 = new TableDesc(store.getTable(DB_NAME, REGION));
  assertEquals(table.getName(), table1.getName());
  assertEquals(table.getUri(), table1.getUri());
  assertEquals(table.getSchema().size(), table1.getSchema().size());
  for (int i = 0; i < table.getSchema().size(); i++) {
    assertEquals(table.getSchema().getColumn(i).getSimpleName(), table1.getSchema().getColumn(i).getSimpleName());
  }

  assertEquals(StorageConstants.DEFAULT_BINARY_SERDE,
      table1.getMeta().getProperty(StorageConstants.RCFILE_SERDE));

  // Set two extra properties and check they are visible alongside the existing ones.
  Map<String, String> expected = getProperties(DB_NAME, REGION);
  Map<String, String> toSet = new ImmutableMap.Builder<String, String>()
      .put("key1", "value1")
      .put("key2", "value2")
      .build();
  expected.putAll(toSet);

  setProperty(DB_NAME, REGION, toSet);
  Map<String, String> actual = getProperties(DB_NAME, REGION);
  assertEquals(actual.get(StorageConstants.TEXT_DELIMITER), expected.get(StorageConstants.TEXT_DELIMITER));
  assertEquals(actual.get("key1"), expected.get("key1"));
  assertEquals(actual.get("key2"), expected.get("key2"));

  // Unset one property that was set ("key2") and one that was never set ("key3").
  Set<String> toUnset = Sets.newHashSet("key2", "key3");
  for (String key : toUnset) {
    expected.remove(key);
  }
  unSetProperty(DB_NAME, REGION, toUnset);
  actual = getProperties(DB_NAME, REGION);
  assertEquals(actual.get(StorageConstants.TEXT_DELIMITER), expected.get(StorageConstants.TEXT_DELIMITER));
  assertEquals(actual.get("key1"), expected.get("key1"));
  assertNull(actual.get("key2"));

  store.dropTable(DB_NAME, REGION);
}
 
Example 22
Source Project: tajo   Source File: TestHiveCatalogStore.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Creates an RCFile table configured with the text SerDe and checks that it round-trips
 * through the catalog store.
 *
 * <p>The test verifies, in order: the Hive table reports the RCFile input/output formats,
 * the table read back from the store matches the original name, URI and column names, the
 * RCFile SerDe property is preserved, and table properties can be set and then unset.
 * The table is dropped at the end of the test.
 *
 * @throws Exception if any catalog store operation fails
 */
@Test
public void testTableUsingRCFileWithTextSerde() throws Exception {
  KeyValueSet options = new KeyValueSet();
  options.set(StorageConstants.RCFILE_SERDE, StorageConstants.DEFAULT_TEXT_SERDE);
  TableMeta meta = new TableMeta(BuiltinStorages.RCFILE, options);

  org.apache.tajo.catalog.Schema schema = SchemaBuilder.builder()
      .add("r_regionkey", TajoDataTypes.Type.INT4)
      .add("r_name", TajoDataTypes.Type.TEXT)
      .add("r_comment", TajoDataTypes.Type.TEXT)
      .build();

  TableDesc table = new TableDesc(IdentifierUtil.buildFQName(DB_NAME, REGION), schema, meta,
      new Path(warehousePath, new Path(DB_NAME, REGION)).toUri());
  store.createTable(table.getProto());
  assertTrue(store.existTable(DB_NAME, REGION));

  // The Hive-side table must use the formats registered for RCFile.
  StorageFormatDescriptor descriptor = formatFactory.get(IOConstants.RCFILE);
  org.apache.hadoop.hive.ql.metadata.Table hiveTable = store.getHiveTable(DB_NAME, REGION);
  assertEquals(descriptor.getInputFormat(), hiveTable.getSd().getInputFormat());
  assertEquals(descriptor.getOutputFormat(), hiveTable.getSd().getOutputFormat());

  // The table read back from the store must match what was created.
  TableDesc table1 = new TableDesc(store.getTable(DB_NAME, REGION));
  assertEquals(table.getName(), table1.getName());
  assertEquals(table.getUri(), table1.getUri());
  assertEquals(table.getSchema().size(), table1.getSchema().size());
  for (int i = 0; i < table.getSchema().size(); i++) {
    assertEquals(table.getSchema().getColumn(i).getSimpleName(), table1.getSchema().getColumn(i).getSimpleName());
  }

  assertEquals(StorageConstants.DEFAULT_TEXT_SERDE, table1.getMeta().getProperty(StorageConstants.RCFILE_SERDE));

  // Set two extra properties and check they are visible alongside the existing ones.
  Map<String, String> expected = getProperties(DB_NAME, REGION);
  Map<String, String> toSet = new ImmutableMap.Builder<String, String>()
      .put("key1", "value1")
      .put("key2", "value2")
      .build();
  expected.putAll(toSet);

  setProperty(DB_NAME, REGION, toSet);
  Map<String, String> actual = getProperties(DB_NAME, REGION);
  // Expected value goes first so that failure messages label the two values correctly.
  assertEquals(expected.get(StorageConstants.TEXT_DELIMITER), actual.get(StorageConstants.TEXT_DELIMITER));
  assertEquals(expected.get("key1"), actual.get("key1"));
  assertEquals(expected.get("key2"), actual.get("key2"));

  // Unset one existing key ("key2") and one key that was never set ("key3").
  Set<String> toUnset = Sets.newHashSet("key2", "key3");
  for (String key : toUnset) {
    expected.remove(key);
  }
  unSetProperty(DB_NAME, REGION, toUnset);
  actual = getProperties(DB_NAME, REGION);
  assertEquals(expected.get(StorageConstants.TEXT_DELIMITER), actual.get(StorageConstants.TEXT_DELIMITER));
  assertEquals(expected.get("key1"), actual.get("key1"));
  assertNull(actual.get("key2"));

  store.dropTable(DB_NAME, REGION);
}
 
Example 23
Source Project: tajo   Source File: TestHiveCatalogStore.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Creates a text table with a custom field delimiter and a custom null marker (both
 * control characters, stored in Java-escaped form) and checks that it round-trips through
 * the catalog store.
 *
 * <p>The test verifies, in order: the Hive table reports the expected input/output formats,
 * the table read back from the store matches the original name, URI and column names, the
 * delimiter and null-marker properties are preserved, and table properties can be set and
 * then unset. The table is dropped at the end of the test.
 *
 * @throws Exception if any catalog store operation fails
 */
@Test
public void testTableWithNullValue() throws Exception {
  KeyValueSet options = new KeyValueSet();
  options.set(StorageConstants.TEXT_DELIMITER, StringEscapeUtils.escapeJava("\u0002"));
  options.set(StorageConstants.TEXT_NULL, StringEscapeUtils.escapeJava("\u0003"));
  TableMeta meta = new TableMeta(BuiltinStorages.TEXT, options);

  org.apache.tajo.catalog.Schema schema = SchemaBuilder.builder()
      .add("s_suppkey", TajoDataTypes.Type.INT4)
      .add("s_name", TajoDataTypes.Type.TEXT)
      .add("s_address", TajoDataTypes.Type.TEXT)
      .add("s_nationkey", TajoDataTypes.Type.INT4)
      .add("s_phone", TajoDataTypes.Type.TEXT)
      .add("s_acctbal", TajoDataTypes.Type.FLOAT8)
      .add("s_comment", TajoDataTypes.Type.TEXT)
      .build();

  TableDesc table = new TableDesc(IdentifierUtil.buildFQName(DB_NAME, SUPPLIER), schema, meta,
      new Path(warehousePath, new Path(DB_NAME, SUPPLIER)).toUri());

  store.createTable(table.getProto());
  assertTrue(store.existTable(DB_NAME, SUPPLIER));

  StorageFormatDescriptor descriptor = formatFactory.get(IOConstants.TEXTFILE);
  org.apache.hadoop.hive.ql.metadata.Table hiveTable = store.getHiveTable(DB_NAME, SUPPLIER);
  assertEquals(descriptor.getInputFormat(), hiveTable.getSd().getInputFormat());
  // IgnoreKeyTextOutputFormat was deprecated, so the output format is compared against
  // HiveIgnoreKeyTextOutputFormat instead of the descriptor's output format.
  assertEquals(HiveIgnoreKeyTextOutputFormat.class.getName(), hiveTable.getSd().getOutputFormat());

  // The table read back from the store must match what was created.
  TableDesc table1 = new TableDesc(store.getTable(DB_NAME, SUPPLIER));
  assertEquals(table.getName(), table1.getName());
  assertEquals(table.getUri(), table1.getUri());
  assertEquals(table.getSchema().size(), table1.getSchema().size());
  for (int i = 0; i < table.getSchema().size(); i++) {
    assertEquals(table.getSchema().getColumn(i).getSimpleName(), table1.getSchema().getColumn(i).getSimpleName());
  }

  assertEquals(table.getMeta().getProperty(StorageConstants.TEXT_DELIMITER),
      table1.getMeta().getProperty(StorageConstants.TEXT_DELIMITER));

  assertEquals(table.getMeta().getProperty(StorageConstants.TEXT_NULL),
      table1.getMeta().getProperty(StorageConstants.TEXT_NULL));

  // Expected value goes first so that failure messages label the two values correctly.
  assertEquals(StringEscapeUtils.escapeJava("\u0002"),
      table1.getMeta().getProperty(StorageConstants.TEXT_DELIMITER));

  assertEquals(StringEscapeUtils.escapeJava("\u0003"),
      table1.getMeta().getProperty(StorageConstants.TEXT_NULL));

  // Set two extra properties and check they are visible alongside the existing ones.
  Map<String, String> expected = getProperties(DB_NAME, SUPPLIER);
  Map<String, String> toSet = new ImmutableMap.Builder<String, String>()
      .put("key1", "value1")
      .put("key2", "value2")
      .build();
  expected.putAll(toSet);

  setProperty(DB_NAME, SUPPLIER, toSet);
  Map<String, String> actual = getProperties(DB_NAME, SUPPLIER);
  assertEquals(expected.get(StorageConstants.TEXT_DELIMITER), actual.get(StorageConstants.TEXT_DELIMITER));
  assertEquals(expected.get("key1"), actual.get("key1"));
  assertEquals(expected.get("key2"), actual.get("key2"));

  // Unset one existing key ("key2") and one key that was never set ("key3").
  Set<String> toUnset = Sets.newHashSet("key2", "key3");
  for (String key : toUnset) {
    expected.remove(key);
  }
  unSetProperty(DB_NAME, SUPPLIER, toUnset);
  actual = getProperties(DB_NAME, SUPPLIER);
  assertEquals(expected.get(StorageConstants.TEXT_DELIMITER), actual.get(StorageConstants.TEXT_DELIMITER));
  assertEquals(expected.get("key1"), actual.get("key1"));
  assertNull(actual.get("key2"));

  store.dropTable(DB_NAME, SUPPLIER);
}
 
Example 24
Source Project: nifi   Source File: OrcFlowFileWriter.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Builds an ORC writer from the supplied settings and registers it with the memory manager.
 *
 * <p>The statement order below is deliberate: later helper calls may read fields assigned
 * earlier, and registration with the memory manager happens last.
 *
 * @param flowFileOutputStream   stream stored on the writer as its output target
 * @param path                   path stored on the writer and used as the key when
 *                               registering with {@code memoryManager}
 * @param conf                   configuration; {@link IOConstants#COLUMNS} is read from it
 * @param inspector              object inspector; used for the column names when the
 *                               configuration has none, for the bloom filter column
 *                               selection, and for building the tree writer
 * @param stripeSize             used as both the default and the adjusted stripe size
 * @param compress               compression kind; also used to create the codec
 * @param bufferSize             requested buffer size, passed to the buffer size estimate
 * @param rowIndexStride         index building is enabled when this is positive, in which
 *                               case it must be at least {@code MIN_ROW_INDEX_STRIDE}
 * @param memoryManager          manager the new writer is registered with
 * @param addBlockPadding        stored on the writer
 * @param version                ORC file version; bloom filter columns are not selected
 *                               for {@code V_0_11}
 * @param callback               optional writer callback; a callback context is created
 *                               only when this is non-null
 * @param encodingStrategy       stored on the writer
 * @param compressionStrategy    stored on the writer
 * @param paddingTolerance       stored on the writer
 * @param blockSizeValue         stored on the writer as the block size
 * @param bloomFilterColumnNames column selection for bloom filters; ignored for
 *                               {@code V_0_11}
 * @param bloomFilterFpp         stored on the writer
 * @throws IOException              declared by this constructor; presumably raised by one
 *                                  of the helper calls it makes (not visible here)
 * @throws IllegalArgumentException if index building is enabled and {@code rowIndexStride}
 *                                  is below {@code MIN_ROW_INDEX_STRIDE}
 */
public OrcFlowFileWriter(OutputStream flowFileOutputStream,
                         Path path,
                         Configuration conf,
                         ObjectInspector inspector,
                         long stripeSize,
                         CompressionKind compress,
                         int bufferSize,
                         int rowIndexStride,
                         MemoryManager memoryManager,
                         boolean addBlockPadding,
                         OrcFile.Version version,
                         OrcFile.WriterCallback callback,
                         EncodingStrategy encodingStrategy,
                         CompressionStrategy compressionStrategy,
                         float paddingTolerance,
                         long blockSizeValue,
                         String bloomFilterColumnNames,
                         double bloomFilterFpp) throws IOException {
    // Capture the plain settings first.
    this.flowFileOutputStream = flowFileOutputStream;
    this.path = path;
    this.conf = conf;
    this.callback = callback;
    // A callback context is only needed when a callback was supplied.
    if (callback == null) {
        this.callbackContext = null;
    } else {
        this.callbackContext = () -> OrcFlowFileWriter.this;
    }
    this.adjustedStripeSize = stripeSize;
    this.defaultStripeSize = stripeSize;
    this.version = version;
    this.encodingStrategy = encodingStrategy;
    this.compressionStrategy = compressionStrategy;
    this.addBlockPadding = addBlockPadding;
    this.blockSize = blockSizeValue;
    this.paddingTolerance = paddingTolerance;
    this.compress = compress;
    this.rowIndexStride = rowIndexStride;
    this.memoryManager = memoryManager;
    this.buildIndex = rowIndexStride > 0;
    this.codec = createCodec(compress);

    // Prefer the configured column list; otherwise derive it from the inspector.
    final String configuredColumns = conf.get(IOConstants.COLUMNS);
    final String columnNames = (configuredColumns != null)
            ? configuredColumns
            : getColumnNamesFromInspector(inspector);
    this.bufferSize = getEstimatedBufferSize(columnNames, bufferSize);

    if (version != OrcFile.Version.V_0_11) {
        this.bloomFilterColumns =
                OrcUtils.includeColumns(bloomFilterColumnNames, columnNames, inspector);
    } else {
        /* do not write bloom filters for ORC v11 */
        this.bloomFilterColumns =
                OrcUtils.includeColumns(null, columnNames, inspector);
    }
    this.bloomFilterFpp = bloomFilterFpp;
    this.treeWriter = createTreeWriter(inspector, streamFactory, false);

    final boolean strideTooSmall = rowIndexStride < MIN_ROW_INDEX_STRIDE;
    if (buildIndex && strideTooSmall) {
        throw new IllegalArgumentException("Row stride must be at least " + MIN_ROW_INDEX_STRIDE);
    }

    // Register with the memory manager last, once the writer can handle its callbacks.
    memoryManager.addWriter(path, stripeSize, this);
}
 
Example 25
Source Project: nifi   Source File: OrcFlowFileWriter.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Estimates the buffer size using the column list stored in the configuration under
 * {@link IOConstants#COLUMNS}, delegating to the two-argument overload.
 *
 * @param bs the requested buffer size, passed through unchanged
 * @return the value returned by the two-argument overload
 */
@VisibleForTesting
int getEstimatedBufferSize(int bs) {
    final String configuredColumns = conf.get(IOConstants.COLUMNS);
    return getEstimatedBufferSize(configuredColumns, bs);
}