org.apache.hadoop.hive.ql.io.IOConstants Java Examples
The following examples show how to use org.apache.hadoop.hive.ql.io.IOConstants. They are drawn from open-source projects including Presto, Flink, Dremio, Parquet, NiFi, Gobblin, IndexR, and Tajo; the source file and license are noted above each example.
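Most of the examples revolve around two keys, IOConstants.COLUMNS and IOConstants.COLUMNS_TYPES, which carry a table's column names and Hive type string through a Properties or JobConf object: writers publish the schema under these keys, and SerDes or readers parse it back out. Below is a minimal sketch of that pattern; the column names, types, and class name (IOConstantsSketch) are made up for illustration.

import java.util.Properties;

import org.apache.hadoop.hive.ql.io.IOConstants;

public class IOConstantsSketch {
    public static void main(String[] args) {
        Properties tableProperties = new Properties();
        // Column names are comma-separated; type names are usually ':'-separated (some integrations use ',').
        tableProperties.setProperty(IOConstants.COLUMNS, "id,name");
        tableProperties.setProperty(IOConstants.COLUMNS_TYPES, "bigint:string");

        // A SerDe or record writer initialized with these properties can recover the schema later.
        System.out.println(tableProperties.getProperty(IOConstants.COLUMNS));
        System.out.println(tableProperties.getProperty(IOConstants.COLUMNS_TYPES));
    }
}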
Example #1
Source File: OrcFileWriter.java From presto with Apache License 2.0 | 6 votes |
@VisibleForTesting
OrcFileWriter(List<Long> columnIds, List<Type> columnTypes, File target, boolean writeMetadata)
{
    this.columnTypes = ImmutableList.copyOf(requireNonNull(columnTypes, "columnTypes is null"));
    checkArgument(columnIds.size() == columnTypes.size(), "ids and types mismatch");
    checkArgument(isUnique(columnIds), "ids must be unique");

    List<StorageType> storageTypes = ImmutableList.copyOf(toStorageTypes(columnTypes));
    Iterable<String> hiveTypeNames = storageTypes.stream().map(StorageType::getHiveTypeName).collect(toList());
    List<String> columnNames = columnIds.stream()
            .map(Objects::toString)
            .collect(toImmutableList());

    Properties properties = new Properties();
    properties.setProperty(IOConstants.COLUMNS, Joiner.on(',').join(columnNames));
    properties.setProperty(IOConstants.COLUMNS_TYPES, Joiner.on(':').join(hiveTypeNames));

    serializer = createSerializer(properties);
    recordWriter = createRecordWriter(new Path(target.toURI()), columnIds, columnTypes, writeMetadata);

    tableInspector = getStandardStructObjectInspector(columnNames, getJavaObjectInspectors(storageTypes));
    structFields = ImmutableList.copyOf(tableInspector.getAllStructFieldRefs());
    orcRow = tableInspector.create();
}
Example #2
Source File: HiveTableInputFormat.java From flink with Apache License 2.0 | 6 votes |
private void addSchemaToConf(JobConf jobConf) {
    // set columns/types -- including partition cols
    List<String> typeStrs = Arrays.stream(fieldTypes)
            .map(t -> HiveTypeUtil.toHiveTypeInfo(t, true).toString())
            .collect(Collectors.toList());
    jobConf.set(IOConstants.COLUMNS, String.join(",", fieldNames));
    jobConf.set(IOConstants.COLUMNS_TYPES, String.join(",", typeStrs));
    // set schema evolution -- excluding partition cols
    int numNonPartCol = fieldNames.length - partitionKeys.size();
    jobConf.set(SCHEMA_EVOLUTION_COLUMNS, String.join(",", Arrays.copyOfRange(fieldNames, 0, numNonPartCol)));
    jobConf.set(SCHEMA_EVOLUTION_COLUMNS_TYPES, String.join(",", typeStrs.subList(0, numNonPartCol)));

    // in older versions, parquet reader also expects the selected col indices in conf, excluding part cols
    String readColIDs = Arrays.stream(selectedFields)
            .filter(i -> i < numNonPartCol)
            .mapToObj(String::valueOf)
            .collect(Collectors.joining(","));
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, readColIDs);
}
Example #3
Source File: HiveUtilities.java From dremio-oss with Apache License 2.0 | 6 votes |
/**
 * Helper method which sets config to read transactional (ACID) tables. Prerequisite is <i>job</i>
 * contains the table properties.
 * @param job
 */
public static void addACIDPropertiesIfNeeded(final JobConf job) {
    if (!AcidUtils.isTablePropertyTransactional(job)) {
        return;
    }

    AcidUtils.setAcidOperationalProperties(job, true, null);

    // Add ACID related properties
    if (Utilities.isSchemaEvolutionEnabled(job, true) &&
        job.get(IOConstants.SCHEMA_EVOLUTION_COLUMNS) != null &&
        job.get(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES) != null) {
        // If the schema evolution columns and types are already set, then there is no additional conf to set.
        return;
    }

    // Get them from table properties and set them as schema evolution properties
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, job.get(serdeConstants.LIST_COLUMNS));
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, job.get(serdeConstants.LIST_COLUMN_TYPES));
}
Example #4
Source File: HiveUtilities.java From dremio-oss with Apache License 2.0 | 6 votes |
/**
 * Helper method which sets config to read transactional (ACID) tables. Prerequisite is <i>job</i>
 * contains the table properties.
 * @param job
 */
public static void addACIDPropertiesIfNeeded(final JobConf job) {
    if (!AcidUtils.isTablePropertyTransactional(job)) {
        return;
    }

    AcidUtils.setTransactionalTableScan(job, true);

    // Add ACID related properties
    if (Utilities.isSchemaEvolutionEnabled(job, true) &&
        job.get(IOConstants.SCHEMA_EVOLUTION_COLUMNS) != null &&
        job.get(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES) != null) {
        // If the schema evolution columns and types are already set, then there is no additional conf to set.
        return;
    }

    // Get them from table properties and set them as schema evolution properties
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, job.get(serdeConstants.LIST_COLUMNS));
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, job.get(serdeConstants.LIST_COLUMN_TYPES));
}
Example #5
Source File: TestCachingOrcDataSource.java From presto with Apache License 2.0 | 6 votes |
private static FileSinkOperator.RecordWriter createOrcRecordWriter(File outputFile, Format format, CompressionKind compression, ObjectInspector columnObjectInspector)
        throws IOException
{
    JobConf jobConf = new JobConf();
    OrcConf.WRITE_FORMAT.setString(jobConf, format == ORC_12 ? "0.12" : "0.11");
    OrcConf.COMPRESS.setString(jobConf, compression.name());

    Properties tableProperties = new Properties();
    tableProperties.setProperty(IOConstants.COLUMNS, "test");
    tableProperties.setProperty(IOConstants.COLUMNS_TYPES, columnObjectInspector.getTypeName());
    tableProperties.setProperty(OrcConf.STRIPE_SIZE.getAttribute(), "120000");

    return new OrcOutputFormat().getHiveRecordWriter(
            jobConf,
            new Path(outputFile.toURI()),
            Text.class,
            compression != NONE,
            tableProperties,
            () -> {});
}
Example #6
Source File: ParquetRecordWriterUtil.java From presto with Apache License 2.0 | 5 votes |
private static RecordWriter createParquetWriter(Path target, JobConf conf, Properties properties)
        throws IOException
{
    if (conf.get(DataWritableWriteSupport.PARQUET_HIVE_SCHEMA) == null) {
        List<String> columnNames = Splitter.on(',').splitToList(properties.getProperty(IOConstants.COLUMNS));
        List<TypeInfo> columnTypes = getTypeInfosFromTypeString(properties.getProperty(IOConstants.COLUMNS_TYPES));

        MessageType schema = HiveSchemaConverter.convert(columnNames, columnTypes);
        setParquetSchema(conf, schema);
    }

    ParquetOutputFormat<ParquetHiveRecord> outputFormat = new ParquetOutputFormat<>(new DataWritableWriteSupport());

    return new ParquetRecordWriterWrapper(outputFormat, conf, target.toString(), Reporter.NULL, properties);
}
Example #7
Source File: DataWritableReadSupport.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * It creates the readContext for Parquet side with the requested schema during the init phase.
 *
 * @param configuration needed to get the wanted columns
 * @param keyValueMetaData // unused
 * @param fileSchema parquet file schema
 * @return the parquet ReadContext
 */
@Override
public org.apache.parquet.hadoop.api.ReadSupport.ReadContext init(final Configuration configuration,
        final Map<String, String> keyValueMetaData, final MessageType fileSchema) {
    final String columns = configuration.get(IOConstants.COLUMNS);
    final Map<String, String> contextMetadata = new HashMap<String, String>();
    if (columns != null) {
        final List<String> listColumns = getColumns(columns);

        final List<Type> typeListTable = new ArrayList<Type>();
        for (final String col : listColumns) {
            // listColumns contains partition columns which are metadata only
            if (fileSchema.containsField(col)) {
                typeListTable.add(fileSchema.getType(col));
            } else {
                // below allows schema evolution
                typeListTable.add(new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, col));
            }
        }
        MessageType tableSchema = new MessageType(TABLE_SCHEMA, typeListTable);
        contextMetadata.put(HIVE_SCHEMA_KEY, tableSchema.toString());

        MessageType requestedSchemaByUser = tableSchema;
        final List<Integer> indexColumnsWanted = ColumnProjectionUtils.getReadColumnIDs(configuration);

        final List<Type> typeListWanted = new ArrayList<Type>();
        for (final Integer idx : indexColumnsWanted) {
            typeListWanted.add(tableSchema.getType(listColumns.get(idx)));
        }
        requestedSchemaByUser = resolveSchemaAccess(new MessageType(fileSchema.getName(), typeListWanted),
                fileSchema, configuration);

        return new ReadContext(requestedSchemaByUser, contextMetadata);
    } else {
        contextMetadata.put(HIVE_SCHEMA_KEY, fileSchema.toString());
        return new ReadContext(fileSchema, contextMetadata);
    }
}
Example #8
Source File: ParquetHiveSerDe.java From parquet-mr with Apache License 2.0 | 5 votes |
@Override
public final void initialize(final Configuration conf, final Properties tbl) throws SerDeException {
    final TypeInfo rowTypeInfo;
    final List<String> columnNames;
    final List<TypeInfo> columnTypes;
    // Get column names and sort order
    final String columnNameProperty = tbl.getProperty(IOConstants.COLUMNS);
    final String columnTypeProperty = tbl.getProperty(IOConstants.COLUMNS_TYPES);

    if (columnNameProperty.length() == 0) {
        columnNames = new ArrayList<String>();
    } else {
        columnNames = Arrays.asList(columnNameProperty.split(","));
    }
    if (columnTypeProperty.length() == 0) {
        columnTypes = new ArrayList<TypeInfo>();
    } else {
        columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);
    }
    if (columnNames.size() != columnTypes.size()) {
        throw new IllegalArgumentException("ParquetHiveSerde initialization failed. Number of column " +
                "name and column type differs. columnNames = " + columnNames + ", columnTypes = " + columnTypes);
    }
    // Create row related objects
    rowTypeInfo = TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes);
    this.objInspector = new ArrayWritableObjectInspector((StructTypeInfo) rowTypeInfo);

    // Stats part
    stats = new SerDeStats();
    serializedSize = 0;
    deserializedSize = 0;
    status = LAST_OPERATION.UNKNOWN;
}
Example #9
Source File: IcebergFileWriterFactory.java From presto with Apache License 2.0 | 5 votes |
private IcebergFileWriter createParquetWriter(
        Path outputPath,
        Schema icebergSchema,
        List<IcebergColumnHandle> columns,
        JobConf jobConf,
        ConnectorSession session)
{
    Properties properties = new Properties();
    properties.setProperty(IOConstants.COLUMNS, columns.stream()
            .map(IcebergColumnHandle::getName)
            .collect(joining(",")));
    properties.setProperty(IOConstants.COLUMNS_TYPES, columns.stream()
            .map(column -> toHiveType(column.getType()).getHiveTypeName().toString())
            .collect(joining(":")));

    setParquetSchema(jobConf, convert(icebergSchema, "table"));
    jobConf.set(ParquetOutputFormat.COMPRESSION, getCompressionCodec(session).getParquetCompressionCodec().name());

    return new IcebergRecordFileWriter(
            outputPath,
            columns.stream()
                    .map(IcebergColumnHandle::getName)
                    .collect(toImmutableList()),
            fromHiveStorageFormat(HiveStorageFormat.PARQUET),
            properties,
            HiveStorageFormat.PARQUET.getEstimatedWriterSystemMemoryUsage(),
            jobConf,
            typeManager,
            session);
}
Example #10
Source File: IndexRSerde.java From indexr with Apache License 2.0 | 5 votes |
@Override
public void initialize(Configuration conf, Properties tbl) throws SerDeException {
    String columnNameProperty = tbl.getProperty(IOConstants.COLUMNS);
    String columnTypeProperty = tbl.getProperty(IOConstants.COLUMNS_TYPES);

    if (Strings.isEmpty(columnNameProperty)) {
        columnNames = new ArrayList<String>();
    } else {
        columnNames = Arrays.asList(columnNameProperty.split(","));
    }
    if (Strings.isEmpty(columnTypeProperty)) {
        columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(StringUtils.repeat("string", ":", columnNames.size()));
    } else {
        columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);
    }

    if (columnNames.size() != columnTypes.size()) {
        throw new IllegalArgumentException("IndexRHiveSerde initialization failed. Number of column " +
                "name and column type differs. columnNames = " + columnNames + ", columnTypes = " + columnTypes);
    }

    TypeInfo rowTypeInfo = TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes);
    this.objInspector = new ArrayWritableObjectInspector((StructTypeInfo) rowTypeInfo);

    stats = new SerDeStats();
    serdeSize = 0;
}
Example #11
Source File: MapredParquetOutputFormat.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Create the parquet schema from the hive schema, and return the RecordWriterWrapper which
 * contains the real output format
 */
@Override
public FileSinkOperator.RecordWriter getHiveRecordWriter(
        final JobConf jobConf,
        final Path finalOutPath,
        final Class<? extends Writable> valueClass,
        final boolean isCompressed,
        final Properties tableProperties,
        final Progressable progress) throws IOException {

    LOG.info("creating new record writer...{}", this);

    final String columnNameProperty = tableProperties.getProperty(IOConstants.COLUMNS);
    final String columnTypeProperty = tableProperties.getProperty(IOConstants.COLUMNS_TYPES);
    List<String> columnNames;
    List<TypeInfo> columnTypes;

    if (columnNameProperty.length() == 0) {
        columnNames = new ArrayList<String>();
    } else {
        columnNames = Arrays.asList(columnNameProperty.split(","));
    }

    if (columnTypeProperty.length() == 0) {
        columnTypes = new ArrayList<TypeInfo>();
    } else {
        columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);
    }

    DataWritableWriteSupport.setSchema(HiveSchemaConverter.convert(columnNames, columnTypes), jobConf);

    return getParquerRecordWriterWrapper(realOutputFormat, jobConf, finalOutPath.toString(), progress);
}
Example #12
Source File: HiveSerDeConverter.java From incubator-gobblin with Apache License 2.0 | 5 votes |
private void setColumnsIfPossible(WorkUnitState state) throws SerDeException {
    AvroObjectInspectorGenerator aoig = new AvroObjectInspectorGenerator(
            AvroSerdeUtils.determineSchemaOrReturnErrorSchema(state.getProperties()));
    List<String> columnNames = aoig.getColumnNames();
    List<TypeInfo> columnTypes = aoig.getColumnTypes();

    state.setProp(IOConstants.COLUMNS, StringUtils.join(columnNames, ","));
    state.setProp(IOConstants.COLUMNS_TYPES, StringUtils.join(columnTypes, ","));
}
Example #13
Source File: TestHiveCatalogStore.java From tajo with Apache License 2.0 | 5 votes |
@Test
public void testTableUsingParquet() throws Exception {
    TableMeta meta = new TableMeta("PARQUET", new KeyValueSet());

    org.apache.tajo.catalog.Schema schema = SchemaBuilder.builder()
        .add("c_custkey", TajoDataTypes.Type.INT4)
        .add("c_name", TajoDataTypes.Type.TEXT)
        .add("c_address", TajoDataTypes.Type.TEXT)
        .add("c_nationkey", TajoDataTypes.Type.INT4)
        .add("c_phone", TajoDataTypes.Type.TEXT)
        .add("c_acctbal", TajoDataTypes.Type.FLOAT8)
        .add("c_mktsegment", TajoDataTypes.Type.TEXT)
        .add("c_comment", TajoDataTypes.Type.TEXT)
        .build();

    TableDesc table = new TableDesc(IdentifierUtil.buildFQName(DB_NAME, CUSTOMER), schema, meta,
        new Path(warehousePath, new Path(DB_NAME, CUSTOMER)).toUri());
    store.createTable(table.getProto());
    assertTrue(store.existTable(DB_NAME, CUSTOMER));

    StorageFormatDescriptor descriptor = formatFactory.get(IOConstants.PARQUET);
    org.apache.hadoop.hive.ql.metadata.Table hiveTable = store.getHiveTable(DB_NAME, CUSTOMER);
    assertEquals(descriptor.getInputFormat(), hiveTable.getSd().getInputFormat());
    assertEquals(descriptor.getOutputFormat(), hiveTable.getSd().getOutputFormat());

    TableDesc table1 = new TableDesc(store.getTable(DB_NAME, CUSTOMER));
    assertEquals(table.getName(), table1.getName());
    assertEquals(table.getUri(), table1.getUri());
    assertEquals(table.getSchema().size(), table1.getSchema().size());
    for (int i = 0; i < table.getSchema().size(); i++) {
        assertEquals(table.getSchema().getColumn(i).getSimpleName(),
            table1.getSchema().getColumn(i).getSimpleName());
    }

    store.dropTable(DB_NAME, CUSTOMER);
}
Example #14
Source File: TestHiveCatalogStore.java From tajo with Apache License 2.0 | 5 votes |
@Test
public void testTableUsingSequenceFileWithBinarySerde() throws Exception {
    KeyValueSet options = new KeyValueSet();
    options.set(StorageConstants.SEQUENCEFILE_SERDE, StorageConstants.DEFAULT_BINARY_SERDE);
    TableMeta meta = new TableMeta(BuiltinStorages.SEQUENCE_FILE, options);

    org.apache.tajo.catalog.Schema schema = SchemaBuilder.builder()
        .add("r_regionkey", TajoDataTypes.Type.INT4)
        .add("r_name", TajoDataTypes.Type.TEXT)
        .add("r_comment", TajoDataTypes.Type.TEXT)
        .build();

    TableDesc table = new TableDesc(IdentifierUtil.buildFQName(DB_NAME, REGION), schema, meta,
        new Path(warehousePath, new Path(DB_NAME, REGION)).toUri());
    store.createTable(table.getProto());
    assertTrue(store.existTable(DB_NAME, REGION));

    StorageFormatDescriptor descriptor = formatFactory.get(IOConstants.SEQUENCEFILE);
    org.apache.hadoop.hive.ql.metadata.Table hiveTable = store.getHiveTable(DB_NAME, REGION);
    assertEquals(descriptor.getInputFormat(), hiveTable.getSd().getInputFormat());
    assertEquals(descriptor.getOutputFormat(), hiveTable.getSd().getOutputFormat());

    TableDesc table1 = new TableDesc(store.getTable(DB_NAME, REGION));
    assertEquals(table.getName(), table1.getName());
    assertEquals(table.getUri(), table1.getUri());
    assertEquals(table.getSchema().size(), table1.getSchema().size());
    for (int i = 0; i < table.getSchema().size(); i++) {
        assertEquals(table.getSchema().getColumn(i).getSimpleName(),
            table1.getSchema().getColumn(i).getSimpleName());
    }

    assertEquals(StorageConstants.DEFAULT_BINARY_SERDE,
        table1.getMeta().getProperty(StorageConstants.SEQUENCEFILE_SERDE));

    store.dropTable(DB_NAME, REGION);
}
Example #15
Source File: TestHiveCatalogStore.java From tajo with Apache License 2.0 | 5 votes |
@Test
public void testTableUsingSequenceFileWithTextSerde() throws Exception {
    KeyValueSet options = new KeyValueSet();
    options.set(StorageConstants.SEQUENCEFILE_SERDE, StorageConstants.DEFAULT_TEXT_SERDE);
    options.set(StorageConstants.TEXT_DELIMITER, "\u0001");
    options.set(StorageConstants.TEXT_NULL, NullDatum.DEFAULT_TEXT);
    TableMeta meta = new TableMeta(BuiltinStorages.SEQUENCE_FILE, options);

    org.apache.tajo.catalog.Schema schema = SchemaBuilder.builder()
        .add("r_regionkey", TajoDataTypes.Type.INT4)
        .add("r_name", TajoDataTypes.Type.TEXT)
        .add("r_comment", TajoDataTypes.Type.TEXT)
        .build();

    TableDesc table = new TableDesc(IdentifierUtil.buildFQName(DB_NAME, REGION), schema, meta,
        new Path(warehousePath, new Path(DB_NAME, REGION)).toUri());
    store.createTable(table.getProto());
    assertTrue(store.existTable(DB_NAME, REGION));

    StorageFormatDescriptor descriptor = formatFactory.get(IOConstants.SEQUENCEFILE);
    org.apache.hadoop.hive.ql.metadata.Table hiveTable = store.getHiveTable(DB_NAME, REGION);
    assertEquals(descriptor.getInputFormat(), hiveTable.getSd().getInputFormat());
    assertEquals(descriptor.getOutputFormat(), hiveTable.getSd().getOutputFormat());

    TableDesc table1 = new TableDesc(store.getTable(DB_NAME, REGION));
    assertEquals(table.getName(), table1.getName());
    assertEquals(table.getUri(), table1.getUri());
    assertEquals(table.getSchema().size(), table1.getSchema().size());
    for (int i = 0; i < table.getSchema().size(); i++) {
        assertEquals(table.getSchema().getColumn(i).getSimpleName(),
            table1.getSchema().getColumn(i).getSimpleName());
    }

    assertEquals(StorageConstants.DEFAULT_TEXT_SERDE,
        table1.getMeta().getProperty(StorageConstants.SEQUENCEFILE_SERDE));
    assertEquals("\u0001",
        StringEscapeUtils.unescapeJava(table1.getMeta().getProperty(StorageConstants.TEXT_DELIMITER)));
    assertEquals(NullDatum.DEFAULT_TEXT, table1.getMeta().getProperty(StorageConstants.TEXT_NULL));

    store.dropTable(DB_NAME, REGION);
}
Example #16
Source File: OrcFlowFileWriter.java From nifi with Apache License 2.0 | 4 votes |
public OrcFlowFileWriter(OutputStream flowFileOutputStream,
                         Path path,
                         Configuration conf,
                         ObjectInspector inspector,
                         long stripeSize,
                         CompressionKind compress,
                         int bufferSize,
                         int rowIndexStride,
                         MemoryManager memoryManager,
                         boolean addBlockPadding,
                         OrcFile.Version version,
                         OrcFile.WriterCallback callback,
                         EncodingStrategy encodingStrategy,
                         CompressionStrategy compressionStrategy,
                         float paddingTolerance,
                         long blockSizeValue,
                         String bloomFilterColumnNames,
                         double bloomFilterFpp) throws IOException {
    this.flowFileOutputStream = flowFileOutputStream;
    this.path = path;
    this.conf = conf;
    this.callback = callback;
    callbackContext = (callback != null) ? () -> OrcFlowFileWriter.this : null;
    this.adjustedStripeSize = stripeSize;
    this.defaultStripeSize = stripeSize;
    this.version = version;
    this.encodingStrategy = encodingStrategy;
    this.compressionStrategy = compressionStrategy;
    this.addBlockPadding = addBlockPadding;
    this.blockSize = blockSizeValue;
    this.paddingTolerance = paddingTolerance;
    this.compress = compress;
    this.rowIndexStride = rowIndexStride;
    this.memoryManager = memoryManager;
    buildIndex = rowIndexStride > 0;
    codec = createCodec(compress);
    String allColumns = conf.get(IOConstants.COLUMNS);
    if (allColumns == null) {
        allColumns = getColumnNamesFromInspector(inspector);
    }
    this.bufferSize = getEstimatedBufferSize(allColumns, bufferSize);
    if (version == OrcFile.Version.V_0_11) {
        /* do not write bloom filters for ORC v11 */
        this.bloomFilterColumns = OrcUtils.includeColumns(null, allColumns, inspector);
    } else {
        this.bloomFilterColumns = OrcUtils.includeColumns(bloomFilterColumnNames, allColumns, inspector);
    }
    this.bloomFilterFpp = bloomFilterFpp;
    treeWriter = createTreeWriter(inspector, streamFactory, false);
    if (buildIndex && rowIndexStride < MIN_ROW_INDEX_STRIDE) {
        throw new IllegalArgumentException("Row stride must be at least " + MIN_ROW_INDEX_STRIDE);
    }
    // ensure that we are able to handle callbacks before we register ourselves
    memoryManager.addWriter(path, stripeSize, this);
}
Example #17
Source File: OrcFlowFileWriter.java From nifi with Apache License 2.0 | 4 votes |
@VisibleForTesting
int getEstimatedBufferSize(int bs) {
    return getEstimatedBufferSize(conf.get(IOConstants.COLUMNS), bs);
}
Example #18
Source File: TestHiveCatalogStore.java From tajo with Apache License 2.0 | 4 votes |
@Test
public void testTableWithNullValue() throws Exception {
    KeyValueSet options = new KeyValueSet();
    options.set(StorageConstants.TEXT_DELIMITER, StringEscapeUtils.escapeJava("\u0002"));
    options.set(StorageConstants.TEXT_NULL, StringEscapeUtils.escapeJava("\u0003"));
    TableMeta meta = new TableMeta(BuiltinStorages.TEXT, options);

    org.apache.tajo.catalog.Schema schema = SchemaBuilder.builder()
        .add("s_suppkey", TajoDataTypes.Type.INT4)
        .add("s_name", TajoDataTypes.Type.TEXT)
        .add("s_address", TajoDataTypes.Type.TEXT)
        .add("s_nationkey", TajoDataTypes.Type.INT4)
        .add("s_phone", TajoDataTypes.Type.TEXT)
        .add("s_acctbal", TajoDataTypes.Type.FLOAT8)
        .add("s_comment", TajoDataTypes.Type.TEXT)
        .build();

    TableDesc table = new TableDesc(IdentifierUtil.buildFQName(DB_NAME, SUPPLIER), schema, meta,
        new Path(warehousePath, new Path(DB_NAME, SUPPLIER)).toUri());
    store.createTable(table.getProto());
    assertTrue(store.existTable(DB_NAME, SUPPLIER));

    StorageFormatDescriptor descriptor = formatFactory.get(IOConstants.TEXTFILE);
    org.apache.hadoop.hive.ql.metadata.Table hiveTable = store.getHiveTable(DB_NAME, SUPPLIER);
    assertEquals(descriptor.getInputFormat(), hiveTable.getSd().getInputFormat());
    //IgnoreKeyTextOutputFormat was deprecated
    assertEquals(HiveIgnoreKeyTextOutputFormat.class.getName(), hiveTable.getSd().getOutputFormat());

    TableDesc table1 = new TableDesc(store.getTable(DB_NAME, SUPPLIER));
    assertEquals(table.getName(), table1.getName());
    assertEquals(table.getUri(), table1.getUri());
    assertEquals(table.getSchema().size(), table1.getSchema().size());
    for (int i = 0; i < table.getSchema().size(); i++) {
        assertEquals(table.getSchema().getColumn(i).getSimpleName(),
            table1.getSchema().getColumn(i).getSimpleName());
    }

    assertEquals(table.getMeta().getProperty(StorageConstants.TEXT_DELIMITER),
        table1.getMeta().getProperty(StorageConstants.TEXT_DELIMITER));
    assertEquals(table.getMeta().getProperty(StorageConstants.TEXT_NULL),
        table1.getMeta().getProperty(StorageConstants.TEXT_NULL));
    assertEquals(table1.getMeta().getProperty(StorageConstants.TEXT_DELIMITER),
        StringEscapeUtils.escapeJava("\u0002"));
    assertEquals(table1.getMeta().getProperty(StorageConstants.TEXT_NULL),
        StringEscapeUtils.escapeJava("\u0003"));

    Map<String, String> expected = getProperties(DB_NAME, SUPPLIER);
    Map<String, String> toSet = new ImmutableMap.Builder<String, String>()
        .put("key1", "value1")
        .put("key2", "value2")
        .build();
    expected.putAll(toSet);

    setProperty(DB_NAME, SUPPLIER, toSet);
    Map<String, String> actual = getProperties(DB_NAME, SUPPLIER);
    assertEquals(actual.get(StorageConstants.TEXT_DELIMITER), expected.get(StorageConstants.TEXT_DELIMITER));
    assertEquals(actual.get("key1"), expected.get("key1"));
    assertEquals(actual.get("key2"), expected.get("key2"));

    Set<String> toUnset = Sets.newHashSet("key2", "key3");
    for (String key : toUnset) {
        expected.remove(key);
    }
    unSetProperty(DB_NAME, SUPPLIER, toUnset);
    actual = getProperties(DB_NAME, SUPPLIER);
    assertEquals(actual.get(StorageConstants.TEXT_DELIMITER), expected.get(StorageConstants.TEXT_DELIMITER));
    assertEquals(actual.get("key1"), expected.get("key1"));
    assertNull(actual.get("key2"));

    store.dropTable(DB_NAME, SUPPLIER);
}
Example #19
Source File: TestHiveCatalogStore.java From tajo with Apache License 2.0 | 4 votes |
@Test
public void testTableUsingRCFileWithTextSerde() throws Exception {
    KeyValueSet options = new KeyValueSet();
    options.set(StorageConstants.RCFILE_SERDE, StorageConstants.DEFAULT_TEXT_SERDE);
    TableMeta meta = new TableMeta(BuiltinStorages.RCFILE, options);

    org.apache.tajo.catalog.Schema schema = SchemaBuilder.builder()
        .add("r_regionkey", TajoDataTypes.Type.INT4)
        .add("r_name", TajoDataTypes.Type.TEXT)
        .add("r_comment", TajoDataTypes.Type.TEXT)
        .build();

    TableDesc table = new TableDesc(IdentifierUtil.buildFQName(DB_NAME, REGION), schema, meta,
        new Path(warehousePath, new Path(DB_NAME, REGION)).toUri());
    store.createTable(table.getProto());
    assertTrue(store.existTable(DB_NAME, REGION));

    StorageFormatDescriptor descriptor = formatFactory.get(IOConstants.RCFILE);
    org.apache.hadoop.hive.ql.metadata.Table hiveTable = store.getHiveTable(DB_NAME, REGION);
    assertEquals(descriptor.getInputFormat(), hiveTable.getSd().getInputFormat());
    assertEquals(descriptor.getOutputFormat(), hiveTable.getSd().getOutputFormat());

    TableDesc table1 = new TableDesc(store.getTable(DB_NAME, REGION));
    assertEquals(table.getName(), table1.getName());
    assertEquals(table.getUri(), table1.getUri());
    assertEquals(table.getSchema().size(), table1.getSchema().size());
    for (int i = 0; i < table.getSchema().size(); i++) {
        assertEquals(table.getSchema().getColumn(i).getSimpleName(),
            table1.getSchema().getColumn(i).getSimpleName());
    }

    assertEquals(StorageConstants.DEFAULT_TEXT_SERDE, table1.getMeta().getProperty(StorageConstants.RCFILE_SERDE));

    Map<String, String> expected = getProperties(DB_NAME, REGION);
    Map<String, String> toSet = new ImmutableMap.Builder<String, String>()
        .put("key1", "value1")
        .put("key2", "value2")
        .build();
    expected.putAll(toSet);

    setProperty(DB_NAME, REGION, toSet);
    Map<String, String> actual = getProperties(DB_NAME, REGION);
    assertEquals(actual.get(StorageConstants.TEXT_DELIMITER), expected.get(StorageConstants.TEXT_DELIMITER));
    assertEquals(actual.get("key1"), expected.get("key1"));
    assertEquals(actual.get("key2"), expected.get("key2"));

    Set<String> toUnset = Sets.newHashSet("key2", "key3");
    for (String key : toUnset) {
        expected.remove(key);
    }
    unSetProperty(DB_NAME, REGION, toUnset);
    actual = getProperties(DB_NAME, REGION);
    assertEquals(actual.get(StorageConstants.TEXT_DELIMITER), expected.get(StorageConstants.TEXT_DELIMITER));
    assertEquals(actual.get("key1"), expected.get("key1"));
    assertNull(actual.get("key2"));

    store.dropTable(DB_NAME, REGION);
}
Example #20
Source File: TestHiveCatalogStore.java From tajo with Apache License 2.0 | 4 votes |
@Test
public void testTableUsingRCFileWithBinarySerde() throws Exception {
    KeyValueSet options = new KeyValueSet();
    options.set(StorageConstants.RCFILE_SERDE, StorageConstants.DEFAULT_BINARY_SERDE);
    TableMeta meta = new TableMeta(BuiltinStorages.RCFILE, options);

    org.apache.tajo.catalog.Schema schema = SchemaBuilder.builder()
        .add("r_regionkey", TajoDataTypes.Type.INT4)
        .add("r_name", TajoDataTypes.Type.TEXT)
        .add("r_comment", TajoDataTypes.Type.TEXT)
        .build();

    TableDesc table = new TableDesc(IdentifierUtil.buildFQName(DB_NAME, REGION), schema, meta,
        new Path(warehousePath, new Path(DB_NAME, REGION)).toUri());
    store.createTable(table.getProto());
    assertTrue(store.existTable(DB_NAME, REGION));

    StorageFormatDescriptor descriptor = formatFactory.get(IOConstants.RCFILE);
    org.apache.hadoop.hive.ql.metadata.Table hiveTable = store.getHiveTable(DB_NAME, REGION);
    assertEquals(descriptor.getInputFormat(), hiveTable.getSd().getInputFormat());
    assertEquals(descriptor.getOutputFormat(), hiveTable.getSd().getOutputFormat());

    TableDesc table1 = new TableDesc(store.getTable(DB_NAME, REGION));
    assertEquals(table.getName(), table1.getName());
    assertEquals(table.getUri(), table1.getUri());
    assertEquals(table.getSchema().size(), table1.getSchema().size());
    for (int i = 0; i < table.getSchema().size(); i++) {
        assertEquals(table.getSchema().getColumn(i).getSimpleName(),
            table1.getSchema().getColumn(i).getSimpleName());
    }

    assertEquals(StorageConstants.DEFAULT_BINARY_SERDE, table1.getMeta().getProperty(StorageConstants.RCFILE_SERDE));

    Map<String, String> expected = getProperties(DB_NAME, REGION);
    Map<String, String> toSet = new ImmutableMap.Builder<String, String>()
        .put("key1", "value1")
        .put("key2", "value2")
        .build();
    expected.putAll(toSet);

    setProperty(DB_NAME, REGION, toSet);
    Map<String, String> actual = getProperties(DB_NAME, REGION);
    assertEquals(actual.get(StorageConstants.TEXT_DELIMITER), expected.get(StorageConstants.TEXT_DELIMITER));
    assertEquals(actual.get("key1"), expected.get("key1"));
    assertEquals(actual.get("key2"), expected.get("key2"));

    Set<String> toUnset = Sets.newHashSet("key2", "key3");
    for (String key : toUnset) {
        expected.remove(key);
    }
    unSetProperty(DB_NAME, REGION, toUnset);
    actual = getProperties(DB_NAME, REGION);
    assertEquals(actual.get(StorageConstants.TEXT_DELIMITER), expected.get(StorageConstants.TEXT_DELIMITER));
    assertEquals(actual.get("key1"), expected.get("key1"));
    assertNull(actual.get("key2"));

    store.dropTable(DB_NAME, REGION);
}
Example #21
Source File: TestHiveCatalogStore.java From tajo with Apache License 2.0 | 4 votes |
@Test
public void testTableUsingTextFile() throws Exception {
    TableMeta meta = new TableMeta(BuiltinStorages.TEXT, new KeyValueSet());

    org.apache.tajo.catalog.Schema schema = SchemaBuilder.builder()
        .add("c_custkey", TajoDataTypes.Type.INT4)
        .add("c_name", TajoDataTypes.Type.TEXT)
        .add("c_address", TajoDataTypes.Type.TEXT)
        .add("c_nationkey", TajoDataTypes.Type.INT4)
        .add("c_phone", TajoDataTypes.Type.TEXT)
        .add("c_acctbal", TajoDataTypes.Type.FLOAT8)
        .add("c_mktsegment", TajoDataTypes.Type.TEXT)
        .add("c_comment", TajoDataTypes.Type.TEXT)
        .build();

    TableDesc table = new TableDesc(IdentifierUtil.buildFQName(DB_NAME, CUSTOMER), schema, meta,
        new Path(warehousePath, new Path(DB_NAME, CUSTOMER)).toUri());
    store.createTable(table.getProto());
    assertTrue(store.existTable(DB_NAME, CUSTOMER));

    StorageFormatDescriptor descriptor = formatFactory.get(IOConstants.TEXTFILE);
    org.apache.hadoop.hive.ql.metadata.Table hiveTable = store.getHiveTable(DB_NAME, CUSTOMER);
    assertEquals(descriptor.getInputFormat(), hiveTable.getSd().getInputFormat());
    //IgnoreKeyTextOutputFormat was deprecated
    assertEquals(HiveIgnoreKeyTextOutputFormat.class.getName(), hiveTable.getSd().getOutputFormat());

    TableDesc table1 = new TableDesc(store.getTable(DB_NAME, CUSTOMER));
    assertEquals(table.getName(), table1.getName());
    assertEquals(table.getUri(), table1.getUri());
    assertEquals(table.getSchema().size(), table1.getSchema().size());
    for (int i = 0; i < table.getSchema().size(); i++) {
        assertEquals(table.getSchema().getColumn(i).getSimpleName(),
            table1.getSchema().getColumn(i).getSimpleName());
    }

    assertEquals(StringEscapeUtils.escapeJava(StorageConstants.DEFAULT_FIELD_DELIMITER),
        table1.getMeta().getProperty(StorageConstants.TEXT_DELIMITER));

    Map<String, String> expected = getProperties(DB_NAME, CUSTOMER);
    Map<String, String> toSet = new ImmutableMap.Builder<String, String>()
        .put("key1", "value1")
        .put("key2", "value2")
        .build();
    expected.putAll(toSet);

    setProperty(DB_NAME, CUSTOMER, toSet);
    Map<String, String> actual = getProperties(DB_NAME, CUSTOMER);
    assertEquals(actual.get(StorageConstants.TEXT_DELIMITER), expected.get(StorageConstants.TEXT_DELIMITER));
    assertEquals(actual.get("key1"), expected.get("key1"));
    assertEquals(actual.get("key2"), expected.get("key2"));

    Set<String> toUnset = Sets.newHashSet("key2", "key3");
    for (String key : toUnset) {
        expected.remove(key);
    }
    unSetProperty(DB_NAME, CUSTOMER, toUnset);
    actual = getProperties(DB_NAME, CUSTOMER);
    assertEquals(actual.get(StorageConstants.TEXT_DELIMITER), expected.get(StorageConstants.TEXT_DELIMITER));
    assertEquals(actual.get("key1"), expected.get("key1"));
    assertNull(actual.get("key2"));

    store.dropTable(DB_NAME, CUSTOMER);
}
Example #22
Source File: OrcFlowFileWriter.java From localization_nifi with Apache License 2.0 | 4 votes |
@VisibleForTesting
int getEstimatedBufferSize(int bs) {
    return getEstimatedBufferSize(conf.get(IOConstants.COLUMNS), bs);
}
Example #23
Source File: OrcFlowFileWriter.java From localization_nifi with Apache License 2.0 | 4 votes |
public OrcFlowFileWriter(OutputStream flowFileOutputStream,
                         Path path,
                         Configuration conf,
                         ObjectInspector inspector,
                         long stripeSize,
                         CompressionKind compress,
                         int bufferSize,
                         int rowIndexStride,
                         MemoryManager memoryManager,
                         boolean addBlockPadding,
                         OrcFile.Version version,
                         OrcFile.WriterCallback callback,
                         EncodingStrategy encodingStrategy,
                         CompressionStrategy compressionStrategy,
                         float paddingTolerance,
                         long blockSizeValue,
                         String bloomFilterColumnNames,
                         double bloomFilterFpp) throws IOException {
    this.flowFileOutputStream = flowFileOutputStream;
    this.path = path;
    this.conf = conf;
    this.callback = callback;
    callbackContext = (callback != null) ? () -> OrcFlowFileWriter.this : null;
    this.adjustedStripeSize = stripeSize;
    this.defaultStripeSize = stripeSize;
    this.version = version;
    this.encodingStrategy = encodingStrategy;
    this.compressionStrategy = compressionStrategy;
    this.addBlockPadding = addBlockPadding;
    this.blockSize = blockSizeValue;
    this.paddingTolerance = paddingTolerance;
    this.compress = compress;
    this.rowIndexStride = rowIndexStride;
    this.memoryManager = memoryManager;
    buildIndex = rowIndexStride > 0;
    codec = createCodec(compress);
    String allColumns = conf.get(IOConstants.COLUMNS);
    if (allColumns == null) {
        allColumns = getColumnNamesFromInspector(inspector);
    }
    this.bufferSize = getEstimatedBufferSize(allColumns, bufferSize);
    if (version == OrcFile.Version.V_0_11) {
        /* do not write bloom filters for ORC v11 */
        this.bloomFilterColumns = OrcUtils.includeColumns(null, allColumns, inspector);
    } else {
        this.bloomFilterColumns = OrcUtils.includeColumns(bloomFilterColumnNames, allColumns, inspector);
    }
    this.bloomFilterFpp = bloomFilterFpp;
    treeWriter = createTreeWriter(inspector, streamFactory, false);
    if (buildIndex && rowIndexStride < MIN_ROW_INDEX_STRIDE) {
        throw new IllegalArgumentException("Row stride must be at least " + MIN_ROW_INDEX_STRIDE);
    }
    // ensure that we are able to handle callbacks before we register ourselves
    memoryManager.addWriter(path, stripeSize, this);
}
Example #24
Source File: HiveUtil.java From presto with Apache License 2.0 | 4 votes |
public static List<HiveType> getColumnTypes(Properties schema)
{
    return toHiveTypes(schema.getProperty(IOConstants.COLUMNS_TYPES, ""));
}
Example #25
Source File: HiveUtil.java From presto with Apache License 2.0 | 4 votes |
public static List<String> getColumnNames(Properties schema)
{
    return COLUMN_NAMES_SPLITTER.splitToList(schema.getProperty(IOConstants.COLUMNS, ""));
}
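As a companion to Examples #24 and #25, here is a minimal sketch of turning the two properties back into a usable schema with TypeInfoUtils, as several of the SerDe examples above do. The property values and the class name SchemaFromProperties are illustrative only.

import java.util.Arrays;
import java.util.List;
import java.util.Properties;

import org.apache.hadoop.hive.ql.io.IOConstants;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

public class SchemaFromProperties {
    public static void main(String[] args) {
        Properties schema = new Properties();
        schema.setProperty(IOConstants.COLUMNS, "id,name");
        schema.setProperty(IOConstants.COLUMNS_TYPES, "bigint:string");

        // Split the comma-separated column names.
        List<String> columnNames = Arrays.asList(schema.getProperty(IOConstants.COLUMNS, "").split(","));
        // Parse the Hive type string into TypeInfo objects.
        List<TypeInfo> columnTypes =
                TypeInfoUtils.getTypeInfosFromTypeString(schema.getProperty(IOConstants.COLUMNS_TYPES, ""));

        System.out.println(columnNames);
        System.out.println(columnTypes);
    }
}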