org.apache.hadoop.hive.ql.io.IOConstants Java Examples

The following examples show how to use org.apache.hadoop.hive.ql.io.IOConstants. They are drawn from several open-source projects; the source file, project, and license for each example are noted above its code.
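
Nearly all of the examples below follow the same basic pattern: join the column names and Hive type names into delimited strings and register them under IOConstants.COLUMNS and IOConstants.COLUMNS_TYPES on a Properties object or JobConf. The following is a minimal, self-contained sketch of that pattern; the schema is hypothetical and used purely for illustration, and note that the type delimiter varies between projects (comma in the Flink example, colon in the Presto and Iceberg ones).

import org.apache.hadoop.hive.ql.io.IOConstants;

import java.util.Arrays;
import java.util.List;
import java.util.Properties;

public class IOConstantsSketch {
    public static void main(String[] args) {
        // Hypothetical schema, for illustration only.
        List<String> columnNames = Arrays.asList("id", "name", "score");
        List<String> hiveTypeNames = Arrays.asList("bigint", "string", "double");

        Properties properties = new Properties();
        // Column names are comma-separated; type names are joined with ':' here,
        // matching the Presto and Iceberg examples below.
        properties.setProperty(IOConstants.COLUMNS, String.join(",", columnNames));
        properties.setProperty(IOConstants.COLUMNS_TYPES, String.join(":", hiveTypeNames));

        System.out.println(properties);
    }
}
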
Example #1
Source File: OrcFileWriter.java    From presto with Apache License 2.0
@VisibleForTesting
OrcFileWriter(List<Long> columnIds, List<Type> columnTypes, File target, boolean writeMetadata)
{
    this.columnTypes = ImmutableList.copyOf(requireNonNull(columnTypes, "columnTypes is null"));
    checkArgument(columnIds.size() == columnTypes.size(), "ids and types mismatch");
    checkArgument(isUnique(columnIds), "ids must be unique");

    List<StorageType> storageTypes = ImmutableList.copyOf(toStorageTypes(columnTypes));
    Iterable<String> hiveTypeNames = storageTypes.stream().map(StorageType::getHiveTypeName).collect(toList());
    List<String> columnNames = columnIds.stream()
            .map(Objects::toString)
            .collect(toImmutableList());

    Properties properties = new Properties();
    properties.setProperty(IOConstants.COLUMNS, Joiner.on(',').join(columnNames));
    properties.setProperty(IOConstants.COLUMNS_TYPES, Joiner.on(':').join(hiveTypeNames));

    serializer = createSerializer(properties);
    recordWriter = createRecordWriter(new Path(target.toURI()), columnIds, columnTypes, writeMetadata);

    tableInspector = getStandardStructObjectInspector(columnNames, getJavaObjectInspectors(storageTypes));
    structFields = ImmutableList.copyOf(tableInspector.getAllStructFieldRefs());
    orcRow = tableInspector.create();
}
 
Example #2
Source File: HiveTableInputFormat.java    From flink with Apache License 2.0
private void addSchemaToConf(JobConf jobConf) {
	// set columns/types -- including partition cols
	List<String> typeStrs = Arrays.stream(fieldTypes)
			.map(t -> HiveTypeUtil.toHiveTypeInfo(t, true).toString())
			.collect(Collectors.toList());
	jobConf.set(IOConstants.COLUMNS, String.join(",", fieldNames));
	jobConf.set(IOConstants.COLUMNS_TYPES, String.join(",", typeStrs));
	// set schema evolution -- excluding partition cols
	int numNonPartCol = fieldNames.length - partitionKeys.size();
	jobConf.set(SCHEMA_EVOLUTION_COLUMNS, String.join(",", Arrays.copyOfRange(fieldNames, 0, numNonPartCol)));
	jobConf.set(SCHEMA_EVOLUTION_COLUMNS_TYPES, String.join(",", typeStrs.subList(0, numNonPartCol)));

	// in older versions, parquet reader also expects the selected col indices in conf, excluding part cols
	String readColIDs = Arrays.stream(selectedFields)
			.filter(i -> i < numNonPartCol)
			.mapToObj(String::valueOf)
			.collect(Collectors.joining(","));
	jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, readColIDs);
}
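
On the read side, the projected column indices written above under ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR are typically recovered with ColumnProjectionUtils.getReadColumnIDs, as Example #7 below does. A minimal sketch, assuming a JobConf populated the same way (the index values are hypothetical):

import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.mapred.JobConf;

import java.util.List;

public class ReadColumnIdsSketch {
    public static void main(String[] args) {
        JobConf jobConf = new JobConf();
        // Written by the producer side, e.g. addSchemaToConf above.
        jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0,2");

        // Readers such as DataWritableReadSupport (Example #7) recover the projection like this:
        List<Integer> wantedIndices = ColumnProjectionUtils.getReadColumnIDs(jobConf);
        System.out.println(wantedIndices); // expected: [0, 2]
    }
}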
 
Example #3
Source File: HiveUtilities.java    From dremio-oss with Apache License 2.0
/**
 * Helper method which sets the config needed to read transactional (ACID) tables. Prerequisite:
 * <i>job</i> already contains the table properties.
 * @param job the job configuration to update
 */
public static void addACIDPropertiesIfNeeded(final JobConf job) {
  if (!AcidUtils.isTablePropertyTransactional(job)) {
    return;
  }

  AcidUtils.setAcidOperationalProperties(job, true, null);

  // Add ACID related properties
  if (Utilities.isSchemaEvolutionEnabled(job, true) &&
      job.get(IOConstants.SCHEMA_EVOLUTION_COLUMNS) != null &&
      job.get(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES) != null) {
    // If the schema evolution columns and types are already set, then there is no additional conf to set.
    return;
  }

  // Get them from table properties and set them as schema evolution properties
  job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, job.get(serdeConstants.LIST_COLUMNS));
  job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, job.get(serdeConstants.LIST_COLUMN_TYPES));

}
 
Example #4
Source File: HiveUtilities.java    From dremio-oss with Apache License 2.0
/**
 * Helper method which sets the config needed to read transactional (ACID) tables. Prerequisite:
 * <i>job</i> already contains the table properties.
 * @param job the job configuration to update
 */
public static void addACIDPropertiesIfNeeded(final JobConf job) {
  if (!AcidUtils.isTablePropertyTransactional(job)) {
    return;
  }

  AcidUtils.setTransactionalTableScan(job, true);

  // Add ACID related properties
  if (Utilities.isSchemaEvolutionEnabled(job, true) &&
      job.get(IOConstants.SCHEMA_EVOLUTION_COLUMNS) != null &&
      job.get(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES) != null) {
    // If the schema evolution columns and types are already set, then there is no additional conf to set.
    return;
  }

  // Get them from table properties and set them as schema evolution properties
  job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, job.get(serdeConstants.LIST_COLUMNS));
  job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, job.get(serdeConstants.LIST_COLUMN_TYPES));

}
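
The two variants above differ only in how they flag the ACID scan; the schema-evolution part is the same in both: the IOConstants.SCHEMA_EVOLUTION_* keys simply mirror the standard serde properties. A minimal sketch of that mapping with a hypothetical schema, using only Hive and Hadoop classes rather than the Dremio helper:

import org.apache.hadoop.hive.ql.io.IOConstants;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.mapred.JobConf;

public class SchemaEvolutionPropsSketch {
    public static void main(String[] args) {
        JobConf job = new JobConf();
        // Standard Hive serde properties (hypothetical schema).
        job.set(serdeConstants.LIST_COLUMNS, "id,name");
        job.set(serdeConstants.LIST_COLUMN_TYPES, "bigint:string");

        // The helpers above copy them into the schema evolution keys when reading ACID tables.
        job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, job.get(serdeConstants.LIST_COLUMNS));
        job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, job.get(serdeConstants.LIST_COLUMN_TYPES));

        System.out.println(job.get(IOConstants.SCHEMA_EVOLUTION_COLUMNS));       // id,name
        System.out.println(job.get(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES)); // bigint:string
    }
}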
 
Example #5
Source File: TestCachingOrcDataSource.java    From presto with Apache License 2.0
private static FileSinkOperator.RecordWriter createOrcRecordWriter(File outputFile, Format format, CompressionKind compression, ObjectInspector columnObjectInspector)
        throws IOException
{
    JobConf jobConf = new JobConf();
    OrcConf.WRITE_FORMAT.setString(jobConf, format == ORC_12 ? "0.12" : "0.11");
    OrcConf.COMPRESS.setString(jobConf, compression.name());

    Properties tableProperties = new Properties();
    tableProperties.setProperty(IOConstants.COLUMNS, "test");
    tableProperties.setProperty(IOConstants.COLUMNS_TYPES, columnObjectInspector.getTypeName());
    tableProperties.setProperty(OrcConf.STRIPE_SIZE.getAttribute(), "120000");

    return new OrcOutputFormat().getHiveRecordWriter(
            jobConf,
            new Path(outputFile.toURI()),
            Text.class,
            compression != NONE,
            tableProperties,
            () -> {});
}
 
Example #6
Source File: ParquetRecordWriterUtil.java    From presto with Apache License 2.0
private static RecordWriter createParquetWriter(Path target, JobConf conf, Properties properties)
        throws IOException
{
    if (conf.get(DataWritableWriteSupport.PARQUET_HIVE_SCHEMA) == null) {
        List<String> columnNames = Splitter.on(',').splitToList(properties.getProperty(IOConstants.COLUMNS));
        List<TypeInfo> columnTypes = getTypeInfosFromTypeString(properties.getProperty(IOConstants.COLUMNS_TYPES));
        MessageType schema = HiveSchemaConverter.convert(columnNames, columnTypes);
        setParquetSchema(conf, schema);
    }

    ParquetOutputFormat<ParquetHiveRecord> outputFormat = new ParquetOutputFormat<>(new DataWritableWriteSupport());

    return new ParquetRecordWriterWrapper(outputFormat, conf, target.toString(), Reporter.NULL, properties);
}
 
Example #7
Source File: DataWritableReadSupport.java    From parquet-mr with Apache License 2.0
/**
 * Creates the Parquet-side ReadContext with the requested schema during the init phase.
 *
 * @param configuration needed to get the wanted columns
 * @param keyValueMetaData file key/value metadata (unused)
 * @param fileSchema parquet file schema
 * @return the parquet ReadContext
 */
@Override
public org.apache.parquet.hadoop.api.ReadSupport.ReadContext init(final Configuration configuration,
    final Map<String, String> keyValueMetaData, final MessageType fileSchema) {
  final String columns = configuration.get(IOConstants.COLUMNS);
  final Map<String, String> contextMetadata = new HashMap<String, String>();
  if (columns != null) {
    final List<String> listColumns = getColumns(columns);

    final List<Type> typeListTable = new ArrayList<Type>();
    for (final String col : listColumns) {
      // listColumns contains partition columns which are metadata only
      if (fileSchema.containsField(col)) {
        typeListTable.add(fileSchema.getType(col));
      } else {
        // below allows schema evolution
        typeListTable.add(new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, col));
      }
    }
    MessageType tableSchema = new MessageType(TABLE_SCHEMA, typeListTable);
    contextMetadata.put(HIVE_SCHEMA_KEY, tableSchema.toString());

    MessageType requestedSchemaByUser = tableSchema;
    final List<Integer> indexColumnsWanted = ColumnProjectionUtils.getReadColumnIDs(configuration);

    final List<Type> typeListWanted = new ArrayList<Type>();
    for (final Integer idx : indexColumnsWanted) {
      typeListWanted.add(tableSchema.getType(listColumns.get(idx)));
    }
    requestedSchemaByUser = resolveSchemaAccess(new MessageType(fileSchema.getName(),
            typeListWanted), fileSchema, configuration);

    return new ReadContext(requestedSchemaByUser, contextMetadata);
  } else {
    contextMetadata.put(HIVE_SCHEMA_KEY, fileSchema.toString());
    return new ReadContext(fileSchema, contextMetadata);
  }
}
 
Example #8
Source File: ParquetHiveSerDe.java    From parquet-mr with Apache License 2.0
@Override
public final void initialize(final Configuration conf, final Properties tbl) throws SerDeException {

  final TypeInfo rowTypeInfo;
  final List<String> columnNames;
  final List<TypeInfo> columnTypes;
  // Get column names and types
  final String columnNameProperty = tbl.getProperty(IOConstants.COLUMNS);
  final String columnTypeProperty = tbl.getProperty(IOConstants.COLUMNS_TYPES);

  if (columnNameProperty.length() == 0) {
    columnNames = new ArrayList<String>();
  } else {
    columnNames = Arrays.asList(columnNameProperty.split(","));
  }
  if (columnTypeProperty.length() == 0) {
    columnTypes = new ArrayList<TypeInfo>();
  } else {
    columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);
  }
  if (columnNames.size() != columnTypes.size()) {
    throw new IllegalArgumentException("ParquetHiveSerde initialization failed. Number of column " +
      "name and column type differs. columnNames = " + columnNames + ", columnTypes = " +
      columnTypes);
  }
  // Create row related objects
  rowTypeInfo = TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes);
  this.objInspector = new ArrayWritableObjectInspector((StructTypeInfo) rowTypeInfo);

  // Stats part
  stats = new SerDeStats();
  serializedSize = 0;
  deserializedSize = 0;
  status = LAST_OPERATION.UNKNOWN;
}
 
Example #9
Source File: IcebergFileWriterFactory.java    From presto with Apache License 2.0
private IcebergFileWriter createParquetWriter(
        Path outputPath,
        Schema icebergSchema,
        List<IcebergColumnHandle> columns,
        JobConf jobConf,
        ConnectorSession session)
{
    Properties properties = new Properties();
    properties.setProperty(IOConstants.COLUMNS, columns.stream()
            .map(IcebergColumnHandle::getName)
            .collect(joining(",")));
    properties.setProperty(IOConstants.COLUMNS_TYPES, columns.stream()
            .map(column -> toHiveType(column.getType()).getHiveTypeName().toString())
            .collect(joining(":")));

    setParquetSchema(jobConf, convert(icebergSchema, "table"));
    jobConf.set(ParquetOutputFormat.COMPRESSION, getCompressionCodec(session).getParquetCompressionCodec().name());

    return new IcebergRecordFileWriter(
            outputPath,
            columns.stream()
                    .map(IcebergColumnHandle::getName)
                    .collect(toImmutableList()),
            fromHiveStorageFormat(HiveStorageFormat.PARQUET),
            properties,
            HiveStorageFormat.PARQUET.getEstimatedWriterSystemMemoryUsage(),
            jobConf,
            typeManager,
            session);
}
 
Example #10
Source File: IndexRSerde.java    From indexr with Apache License 2.0
@Override
public void initialize(Configuration conf, Properties tbl) throws SerDeException {
    String columnNameProperty = tbl.getProperty(IOConstants.COLUMNS);
    String columnTypeProperty = tbl.getProperty(IOConstants.COLUMNS_TYPES);

    if (Strings.isEmpty(columnNameProperty)) {
        columnNames = new ArrayList<String>();
    } else {
        columnNames = Arrays.asList(columnNameProperty.split(","));
    }
    if (Strings.isEmpty(columnTypeProperty)) {
        columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(StringUtils.repeat("string", ":", columnNames.size()));
    } else {
        columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);
    }
    if (columnNames.size() != columnTypes.size()) {
        throw new IllegalArgumentException("IndexRHiveSerde initialization failed. Number of column " +
                "name and column type differs. columnNames = " + columnNames + ", columnTypes = " +
                columnTypes);
    }

    TypeInfo rowTypeInfo = TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes);
    this.objInspector = new ArrayWritableObjectInspector((StructTypeInfo) rowTypeInfo);

    stats = new SerDeStats();
    serdeSize = 0;
}
 
Example #11
Source File: MapredParquetOutputFormat.java    From parquet-mr with Apache License 2.0
/**
 * Creates the Parquet schema from the Hive schema and returns the RecordWriterWrapper,
 * which contains the real output format.
 */
@Override
public FileSinkOperator.RecordWriter getHiveRecordWriter(
    final JobConf jobConf,
    final Path finalOutPath,
    final Class<? extends Writable> valueClass,
    final boolean isCompressed,
    final Properties tableProperties,
    final Progressable progress) throws IOException {

  LOG.info("creating new record writer...{}", this);

  final String columnNameProperty = tableProperties.getProperty(IOConstants.COLUMNS);
  final String columnTypeProperty = tableProperties.getProperty(IOConstants.COLUMNS_TYPES);
  List<String> columnNames;
  List<TypeInfo> columnTypes;

  if (columnNameProperty.length() == 0) {
    columnNames = new ArrayList<String>();
  } else {
    columnNames = Arrays.asList(columnNameProperty.split(","));
  }

  if (columnTypeProperty.length() == 0) {
    columnTypes = new ArrayList<TypeInfo>();
  } else {
    columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);
  }

  DataWritableWriteSupport.setSchema(HiveSchemaConverter.convert(columnNames, columnTypes), jobConf);
  return getParquerRecordWriterWrapper(realOutputFormat, jobConf, finalOutPath.toString(), progress);
}
 
Example #12
Source File: HiveSerDeConverter.java    From incubator-gobblin with Apache License 2.0
private void setColumnsIfPossible(WorkUnitState state)
    throws SerDeException {
  AvroObjectInspectorGenerator aoig = new AvroObjectInspectorGenerator(
      AvroSerdeUtils.determineSchemaOrReturnErrorSchema(state.getProperties()));
  List<String> columnNames = aoig.getColumnNames();
  List<TypeInfo> columnTypes = aoig.getColumnTypes();

  state.setProp(IOConstants.COLUMNS, StringUtils.join(columnNames, ","));
  state.setProp(IOConstants.COLUMNS_TYPES, StringUtils.join(columnTypes, ","));
}
 
Example #13
Source File: TestHiveCatalogStore.java    From tajo with Apache License 2.0
@Test
public void testTableUsingParquet() throws Exception {
  TableMeta meta = new TableMeta("PARQUET", new KeyValueSet());

  org.apache.tajo.catalog.Schema schema = SchemaBuilder.builder()
      .add("c_custkey", TajoDataTypes.Type.INT4)
      .add("c_name", TajoDataTypes.Type.TEXT)
      .add("c_address", TajoDataTypes.Type.TEXT)
      .add("c_nationkey", TajoDataTypes.Type.INT4)
      .add("c_phone", TajoDataTypes.Type.TEXT)
      .add("c_acctbal", TajoDataTypes.Type.FLOAT8)
      .add("c_mktsegment", TajoDataTypes.Type.TEXT)
      .add("c_comment", TajoDataTypes.Type.TEXT)
      .build();

  TableDesc table = new TableDesc(IdentifierUtil.buildFQName(DB_NAME, CUSTOMER), schema, meta,
      new Path(warehousePath, new Path(DB_NAME, CUSTOMER)).toUri());
  store.createTable(table.getProto());
  assertTrue(store.existTable(DB_NAME, CUSTOMER));

  StorageFormatDescriptor descriptor = formatFactory.get(IOConstants.PARQUET);
  org.apache.hadoop.hive.ql.metadata.Table hiveTable = store.getHiveTable(DB_NAME, CUSTOMER);
  assertEquals(descriptor.getInputFormat(), hiveTable.getSd().getInputFormat());
  assertEquals(descriptor.getOutputFormat(), hiveTable.getSd().getOutputFormat());

  TableDesc table1 = new TableDesc(store.getTable(DB_NAME, CUSTOMER));
  assertEquals(table.getName(), table1.getName());
  assertEquals(table.getUri(), table1.getUri());
  assertEquals(table.getSchema().size(), table1.getSchema().size());
  for (int i = 0; i < table.getSchema().size(); i++) {
    assertEquals(table.getSchema().getColumn(i).getSimpleName(), table1.getSchema().getColumn(i).getSimpleName());
  }

  store.dropTable(DB_NAME, CUSTOMER);
}
 
Example #14
Source File: TestHiveCatalogStore.java    From tajo with Apache License 2.0
@Test
public void testTableUsingSequenceFileWithBinarySerde() throws Exception {
  KeyValueSet options = new KeyValueSet();
  options.set(StorageConstants.SEQUENCEFILE_SERDE, StorageConstants.DEFAULT_BINARY_SERDE);
  TableMeta meta = new TableMeta(BuiltinStorages.SEQUENCE_FILE, options);

  org.apache.tajo.catalog.Schema schema = SchemaBuilder.builder()
      .add("r_regionkey", TajoDataTypes.Type.INT4)
      .add("r_name", TajoDataTypes.Type.TEXT)
      .add("r_comment", TajoDataTypes.Type.TEXT)
      .build();

  TableDesc table = new TableDesc(IdentifierUtil.buildFQName(DB_NAME, REGION), schema, meta,
      new Path(warehousePath, new Path(DB_NAME, REGION)).toUri());
  store.createTable(table.getProto());
  assertTrue(store.existTable(DB_NAME, REGION));

  StorageFormatDescriptor descriptor = formatFactory.get(IOConstants.SEQUENCEFILE);
  org.apache.hadoop.hive.ql.metadata.Table hiveTable = store.getHiveTable(DB_NAME, REGION);
  assertEquals(descriptor.getInputFormat(), hiveTable.getSd().getInputFormat());
  assertEquals(descriptor.getOutputFormat(), hiveTable.getSd().getOutputFormat());

  TableDesc table1 = new TableDesc(store.getTable(DB_NAME, REGION));
  assertEquals(table.getName(), table1.getName());
  assertEquals(table.getUri(), table1.getUri());
  assertEquals(table.getSchema().size(), table1.getSchema().size());
  for (int i = 0; i < table.getSchema().size(); i++) {
    assertEquals(table.getSchema().getColumn(i).getSimpleName(), table1.getSchema().getColumn(i).getSimpleName());
  }

  assertEquals(StorageConstants.DEFAULT_BINARY_SERDE,
    table1.getMeta().getProperty(StorageConstants.SEQUENCEFILE_SERDE));
  store.dropTable(DB_NAME, REGION);
}
 
Example #15
Source File: TestHiveCatalogStore.java    From tajo with Apache License 2.0
@Test
public void testTableUsingSequenceFileWithTextSerde() throws Exception {
  KeyValueSet options = new KeyValueSet();
  options.set(StorageConstants.SEQUENCEFILE_SERDE, StorageConstants.DEFAULT_TEXT_SERDE);
  options.set(StorageConstants.TEXT_DELIMITER, "\u0001");
  options.set(StorageConstants.TEXT_NULL, NullDatum.DEFAULT_TEXT);
  TableMeta meta = new TableMeta(BuiltinStorages.SEQUENCE_FILE, options);

  org.apache.tajo.catalog.Schema schema = SchemaBuilder.builder()
      .add("r_regionkey", TajoDataTypes.Type.INT4)
      .add("r_name", TajoDataTypes.Type.TEXT)
      .add("r_comment", TajoDataTypes.Type.TEXT)
      .build();

  TableDesc table = new TableDesc(IdentifierUtil.buildFQName(DB_NAME, REGION), schema, meta,
      new Path(warehousePath, new Path(DB_NAME, REGION)).toUri());
  store.createTable(table.getProto());
  assertTrue(store.existTable(DB_NAME, REGION));

  StorageFormatDescriptor descriptor = formatFactory.get(IOConstants.SEQUENCEFILE);
  org.apache.hadoop.hive.ql.metadata.Table hiveTable = store.getHiveTable(DB_NAME, REGION);
  assertEquals(descriptor.getInputFormat(), hiveTable.getSd().getInputFormat());
  assertEquals(descriptor.getOutputFormat(), hiveTable.getSd().getOutputFormat());

  TableDesc table1 = new TableDesc(store.getTable(DB_NAME, REGION));
  assertEquals(table.getName(), table1.getName());
  assertEquals(table.getUri(), table1.getUri());
  assertEquals(table.getSchema().size(), table1.getSchema().size());
  for (int i = 0; i < table.getSchema().size(); i++) {
    assertEquals(table.getSchema().getColumn(i).getSimpleName(), table1.getSchema().getColumn(i).getSimpleName());
  }

  assertEquals(StorageConstants.DEFAULT_TEXT_SERDE, table1.getMeta().getProperty(StorageConstants.SEQUENCEFILE_SERDE));
  assertEquals("\u0001", StringEscapeUtils.unescapeJava(table1.getMeta().getProperty(StorageConstants
    .TEXT_DELIMITER)));
  assertEquals(NullDatum.DEFAULT_TEXT, table1.getMeta().getProperty(StorageConstants.TEXT_NULL));
  store.dropTable(DB_NAME, REGION);
}
 
Example #16
Source File: OrcFlowFileWriter.java    From nifi with Apache License 2.0
public OrcFlowFileWriter(OutputStream flowFileOutputStream,
                         Path path,
                         Configuration conf,
                         ObjectInspector inspector,
                         long stripeSize,
                         CompressionKind compress,
                         int bufferSize,
                         int rowIndexStride,
                         MemoryManager memoryManager,
                         boolean addBlockPadding,
                         OrcFile.Version version,
                         OrcFile.WriterCallback callback,
                         EncodingStrategy encodingStrategy,
                         CompressionStrategy compressionStrategy,
                         float paddingTolerance,
                         long blockSizeValue,
                         String bloomFilterColumnNames,
                         double bloomFilterFpp) throws IOException {
    this.flowFileOutputStream = flowFileOutputStream;
    this.path = path;
    this.conf = conf;
    this.callback = callback;
    callbackContext = (callback != null) ? () -> OrcFlowFileWriter.this : null;
    this.adjustedStripeSize = stripeSize;
    this.defaultStripeSize = stripeSize;
    this.version = version;
    this.encodingStrategy = encodingStrategy;
    this.compressionStrategy = compressionStrategy;
    this.addBlockPadding = addBlockPadding;
    this.blockSize = blockSizeValue;
    this.paddingTolerance = paddingTolerance;
    this.compress = compress;
    this.rowIndexStride = rowIndexStride;
    this.memoryManager = memoryManager;
    buildIndex = rowIndexStride > 0;
    codec = createCodec(compress);
    String allColumns = conf.get(IOConstants.COLUMNS);
    if (allColumns == null) {
        allColumns = getColumnNamesFromInspector(inspector);
    }
    this.bufferSize = getEstimatedBufferSize(allColumns, bufferSize);
    if (version == OrcFile.Version.V_0_11) {
        /* do not write bloom filters for ORC v11 */
        this.bloomFilterColumns =
                OrcUtils.includeColumns(null, allColumns, inspector);
    } else {
        this.bloomFilterColumns =
                OrcUtils.includeColumns(bloomFilterColumnNames, allColumns, inspector);
    }
    this.bloomFilterFpp = bloomFilterFpp;
    treeWriter = createTreeWriter(inspector, streamFactory, false);
    if (buildIndex && rowIndexStride < MIN_ROW_INDEX_STRIDE) {
        throw new IllegalArgumentException("Row stride must be at least " +
                MIN_ROW_INDEX_STRIDE);
    }

    // ensure that we are able to handle callbacks before we register ourselves
    memoryManager.addWriter(path, stripeSize, this);
}
 
Example #17
Source File: OrcFlowFileWriter.java    From nifi with Apache License 2.0
@VisibleForTesting
int getEstimatedBufferSize(int bs) {
    return getEstimatedBufferSize(conf.get(IOConstants.COLUMNS), bs);
}
 
Example #18
Source File: TestHiveCatalogStore.java    From tajo with Apache License 2.0
@Test
public void testTableWithNullValue() throws Exception {
  KeyValueSet options = new KeyValueSet();
  options.set(StorageConstants.TEXT_DELIMITER, StringEscapeUtils.escapeJava("\u0002"));
  options.set(StorageConstants.TEXT_NULL, StringEscapeUtils.escapeJava("\u0003"));
  TableMeta meta = new TableMeta(BuiltinStorages.TEXT, options);

  org.apache.tajo.catalog.Schema schema = SchemaBuilder.builder()
      .add("s_suppkey", TajoDataTypes.Type.INT4)
      .add("s_name", TajoDataTypes.Type.TEXT)
      .add("s_address", TajoDataTypes.Type.TEXT)
      .add("s_nationkey", TajoDataTypes.Type.INT4)
      .add("s_phone", TajoDataTypes.Type.TEXT)
      .add("s_acctbal", TajoDataTypes.Type.FLOAT8)
      .add("s_comment", TajoDataTypes.Type.TEXT)
      .build();

  TableDesc table = new TableDesc(IdentifierUtil.buildFQName(DB_NAME, SUPPLIER), schema, meta,
      new Path(warehousePath, new Path(DB_NAME, SUPPLIER)).toUri());

  store.createTable(table.getProto());
  assertTrue(store.existTable(DB_NAME, SUPPLIER));

  StorageFormatDescriptor descriptor = formatFactory.get(IOConstants.TEXTFILE);
  org.apache.hadoop.hive.ql.metadata.Table hiveTable = store.getHiveTable(DB_NAME, SUPPLIER);
  assertEquals(descriptor.getInputFormat(), hiveTable.getSd().getInputFormat());
  //IgnoreKeyTextOutputFormat was deprecated
  assertEquals(HiveIgnoreKeyTextOutputFormat.class.getName(), hiveTable.getSd().getOutputFormat());

  TableDesc table1 = new TableDesc(store.getTable(DB_NAME, SUPPLIER));
  assertEquals(table.getName(), table1.getName());
  assertEquals(table.getUri(), table1.getUri());
  assertEquals(table.getSchema().size(), table1.getSchema().size());
  for (int i = 0; i < table.getSchema().size(); i++) {
    assertEquals(table.getSchema().getColumn(i).getSimpleName(), table1.getSchema().getColumn(i).getSimpleName());
  }

  assertEquals(table.getMeta().getProperty(StorageConstants.TEXT_DELIMITER),
      table1.getMeta().getProperty(StorageConstants.TEXT_DELIMITER));

  assertEquals(table.getMeta().getProperty(StorageConstants.TEXT_NULL),
      table1.getMeta().getProperty(StorageConstants.TEXT_NULL));

  assertEquals(table1.getMeta().getProperty(StorageConstants.TEXT_DELIMITER),
      StringEscapeUtils.escapeJava("\u0002"));

  assertEquals(table1.getMeta().getProperty(StorageConstants.TEXT_NULL),
      StringEscapeUtils.escapeJava("\u0003"));

  Map<String, String> expected = getProperties(DB_NAME, SUPPLIER);
  Map<String, String> toSet = new ImmutableMap.Builder<String, String>()
          .put("key1", "value1")
          .put("key2", "value2")
          .build();
  expected.putAll(toSet);

  setProperty(DB_NAME, SUPPLIER, toSet);
  Map<String, String> actual = getProperties(DB_NAME, SUPPLIER);
  assertEquals(actual.get(StorageConstants.TEXT_DELIMITER), expected.get(StorageConstants.TEXT_DELIMITER));
  assertEquals(actual.get("key1"), expected.get("key1"));
  assertEquals(actual.get("key2"), expected.get("key2"));

  Set<String> toUnset = Sets.newHashSet("key2", "key3");
  for (String key : toUnset) {
    expected.remove(key);
  }
  unSetProperty(DB_NAME, SUPPLIER, toUnset);
  actual = getProperties(DB_NAME, SUPPLIER);
  assertEquals(actual.get(StorageConstants.TEXT_DELIMITER), expected.get(StorageConstants.TEXT_DELIMITER));
  assertEquals(actual.get("key1"), expected.get("key1"));
  assertNull(actual.get("key2"));

  store.dropTable(DB_NAME, SUPPLIER);

}
 
Example #19
Source File: TestHiveCatalogStore.java    From tajo with Apache License 2.0
@Test
public void testTableUsingRCFileWithTextSerde() throws Exception {
  KeyValueSet options = new KeyValueSet();
  options.set(StorageConstants.RCFILE_SERDE, StorageConstants.DEFAULT_TEXT_SERDE);
  TableMeta meta = new TableMeta(BuiltinStorages.RCFILE, options);

  org.apache.tajo.catalog.Schema schema = SchemaBuilder.builder()
      .add("r_regionkey", TajoDataTypes.Type.INT4)
      .add("r_name", TajoDataTypes.Type.TEXT)
      .add("r_comment", TajoDataTypes.Type.TEXT)
      .build();

  TableDesc table = new TableDesc(IdentifierUtil.buildFQName(DB_NAME, REGION), schema, meta,
      new Path(warehousePath, new Path(DB_NAME, REGION)).toUri());
  store.createTable(table.getProto());
  assertTrue(store.existTable(DB_NAME, REGION));

  StorageFormatDescriptor descriptor = formatFactory.get(IOConstants.RCFILE);
  org.apache.hadoop.hive.ql.metadata.Table hiveTable = store.getHiveTable(DB_NAME, REGION);
  assertEquals(descriptor.getInputFormat(), hiveTable.getSd().getInputFormat());
  assertEquals(descriptor.getOutputFormat(), hiveTable.getSd().getOutputFormat());

  TableDesc table1 = new TableDesc(store.getTable(DB_NAME, REGION));
  assertEquals(table.getName(), table1.getName());
  assertEquals(table.getUri(), table1.getUri());
  assertEquals(table.getSchema().size(), table1.getSchema().size());
  for (int i = 0; i < table.getSchema().size(); i++) {
    assertEquals(table.getSchema().getColumn(i).getSimpleName(), table1.getSchema().getColumn(i).getSimpleName());
  }

  assertEquals(StorageConstants.DEFAULT_TEXT_SERDE, table1.getMeta().getProperty(StorageConstants.RCFILE_SERDE));

  Map<String, String> expected = getProperties(DB_NAME, REGION);
  Map<String, String> toSet = new ImmutableMap.Builder<String, String>()
          .put("key1", "value1")
          .put("key2", "value2")
          .build();
  expected.putAll(toSet);

  setProperty(DB_NAME, REGION, toSet);
  Map<String, String> actual = getProperties(DB_NAME, REGION);
  assertEquals(actual.get(StorageConstants.TEXT_DELIMITER), expected.get(StorageConstants.TEXT_DELIMITER));
  assertEquals(actual.get("key1"), expected.get("key1"));
  assertEquals(actual.get("key2"), expected.get("key2"));

  Set<String> toUnset = Sets.newHashSet("key2", "key3");
  for (String key : toUnset) {
    expected.remove(key);
  }
  unSetProperty(DB_NAME, REGION, toUnset);
  actual = getProperties(DB_NAME, REGION);
  assertEquals(actual.get(StorageConstants.TEXT_DELIMITER), expected.get(StorageConstants.TEXT_DELIMITER));
  assertEquals(actual.get("key1"), expected.get("key1"));
  assertNull(actual.get("key2"));

  store.dropTable(DB_NAME, REGION);
}
 
Example #20
Source File: TestHiveCatalogStore.java    From tajo with Apache License 2.0
@Test
public void testTableUsingRCFileWithBinarySerde() throws Exception {
  KeyValueSet options = new KeyValueSet();
  options.set(StorageConstants.RCFILE_SERDE, StorageConstants.DEFAULT_BINARY_SERDE);
  TableMeta meta = new TableMeta(BuiltinStorages.RCFILE, options);

  org.apache.tajo.catalog.Schema schema = SchemaBuilder.builder()
      .add("r_regionkey", TajoDataTypes.Type.INT4)
      .add("r_name", TajoDataTypes.Type.TEXT)
      .add("r_comment", TajoDataTypes.Type.TEXT)
      .build();

  TableDesc table = new TableDesc(IdentifierUtil.buildFQName(DB_NAME, REGION), schema, meta,
      new Path(warehousePath, new Path(DB_NAME, REGION)).toUri());
  store.createTable(table.getProto());
  assertTrue(store.existTable(DB_NAME, REGION));

  StorageFormatDescriptor descriptor = formatFactory.get(IOConstants.RCFILE);
  org.apache.hadoop.hive.ql.metadata.Table hiveTable = store.getHiveTable(DB_NAME, REGION);
  assertEquals(descriptor.getInputFormat(), hiveTable.getSd().getInputFormat());
  assertEquals(descriptor.getOutputFormat(), hiveTable.getSd().getOutputFormat());

  TableDesc table1 = new TableDesc(store.getTable(DB_NAME, REGION));
  assertEquals(table.getName(), table1.getName());
  assertEquals(table.getUri(), table1.getUri());
  assertEquals(table.getSchema().size(), table1.getSchema().size());
  for (int i = 0; i < table.getSchema().size(); i++) {
    assertEquals(table.getSchema().getColumn(i).getSimpleName(), table1.getSchema().getColumn(i).getSimpleName());
  }

  assertEquals(StorageConstants.DEFAULT_BINARY_SERDE,
      table1.getMeta().getProperty(StorageConstants.RCFILE_SERDE));

  Map<String, String> expected = getProperties(DB_NAME, REGION);
  Map<String, String> toSet = new ImmutableMap.Builder<String, String>()
      .put("key1", "value1")
      .put("key2", "value2")
      .build();
  expected.putAll(toSet);

  setProperty(DB_NAME, REGION, toSet);
  Map<String, String> actual = getProperties(DB_NAME, REGION);
  assertEquals(actual.get(StorageConstants.TEXT_DELIMITER), expected.get(StorageConstants.TEXT_DELIMITER));
  assertEquals(actual.get("key1"), expected.get("key1"));
  assertEquals(actual.get("key2"), expected.get("key2"));

  Set<String> toUnset = Sets.newHashSet("key2", "key3");
  for (String key : toUnset) {
    expected.remove(key);
  }
  unSetProperty(DB_NAME, REGION, toUnset);
  actual = getProperties(DB_NAME, REGION);
  assertEquals(actual.get(StorageConstants.TEXT_DELIMITER), expected.get(StorageConstants.TEXT_DELIMITER));
  assertEquals(actual.get("key1"), expected.get("key1"));
  assertNull(actual.get("key2"));

  store.dropTable(DB_NAME, REGION);
}
 
Example #21
Source File: TestHiveCatalogStore.java    From tajo with Apache License 2.0
@Test
public void testTableUsingTextFile() throws Exception {
  TableMeta meta = new TableMeta(BuiltinStorages.TEXT, new KeyValueSet());

  org.apache.tajo.catalog.Schema schema = SchemaBuilder.builder()
      .add("c_custkey", TajoDataTypes.Type.INT4)
      .add("c_name", TajoDataTypes.Type.TEXT)
      .add("c_address", TajoDataTypes.Type.TEXT)
      .add("c_nationkey", TajoDataTypes.Type.INT4)
      .add("c_phone", TajoDataTypes.Type.TEXT)
      .add("c_acctbal", TajoDataTypes.Type.FLOAT8)
      .add("c_mktsegment", TajoDataTypes.Type.TEXT)
      .add("c_comment", TajoDataTypes.Type.TEXT)
      .build();

  TableDesc table = new TableDesc(IdentifierUtil.buildFQName(DB_NAME, CUSTOMER), schema, meta,
      new Path(warehousePath, new Path(DB_NAME, CUSTOMER)).toUri());
  store.createTable(table.getProto());
  assertTrue(store.existTable(DB_NAME, CUSTOMER));

  StorageFormatDescriptor descriptor = formatFactory.get(IOConstants.TEXTFILE);
  org.apache.hadoop.hive.ql.metadata.Table hiveTable = store.getHiveTable(DB_NAME, CUSTOMER);
  assertEquals(descriptor.getInputFormat(), hiveTable.getSd().getInputFormat());
  //IgnoreKeyTextOutputFormat was deprecated
  assertEquals(HiveIgnoreKeyTextOutputFormat.class.getName(), hiveTable.getSd().getOutputFormat());

  TableDesc table1 = new TableDesc(store.getTable(DB_NAME, CUSTOMER));
  assertEquals(table.getName(), table1.getName());
  assertEquals(table.getUri(), table1.getUri());
  assertEquals(table.getSchema().size(), table1.getSchema().size());
  for (int i = 0; i < table.getSchema().size(); i++) {
    assertEquals(table.getSchema().getColumn(i).getSimpleName(), table1.getSchema().getColumn(i).getSimpleName());
  }

  assertEquals(StringEscapeUtils.escapeJava(StorageConstants.DEFAULT_FIELD_DELIMITER),
      table1.getMeta().getProperty(StorageConstants.TEXT_DELIMITER));

  Map<String, String> expected = getProperties(DB_NAME, CUSTOMER);
  Map<String, String> toSet = new ImmutableMap.Builder<String, String>()
      .put("key1", "value1")
      .put("key2", "value2")
      .build();
  expected.putAll(toSet);

  setProperty(DB_NAME, CUSTOMER, toSet);
  Map<String, String> actual = getProperties(DB_NAME, CUSTOMER);
  assertEquals(actual.get(StorageConstants.TEXT_DELIMITER), expected.get(StorageConstants.TEXT_DELIMITER));
  assertEquals(actual.get("key1"), expected.get("key1"));
  assertEquals(actual.get("key2"), expected.get("key2"));

  Set<String> toUnset = Sets.newHashSet("key2", "key3");
  for (String key : toUnset) {
    expected.remove(key);
  }
  unSetProperty(DB_NAME, CUSTOMER, toUnset);
  actual = getProperties(DB_NAME, CUSTOMER);
  assertEquals(actual.get(StorageConstants.TEXT_DELIMITER), expected.get(StorageConstants.TEXT_DELIMITER));
  assertEquals(actual.get("key1"), expected.get("key1"));
  assertNull(actual.get("key2"));

  store.dropTable(DB_NAME, CUSTOMER);
}
 
Example #22
Source File: OrcFlowFileWriter.java    From localization_nifi with Apache License 2.0
@VisibleForTesting
int getEstimatedBufferSize(int bs) {
    return getEstimatedBufferSize(conf.get(IOConstants.COLUMNS), bs);
}
 
Example #23
Source File: OrcFlowFileWriter.java    From localization_nifi with Apache License 2.0
public OrcFlowFileWriter(OutputStream flowFileOutputStream,
                         Path path,
                         Configuration conf,
                         ObjectInspector inspector,
                         long stripeSize,
                         CompressionKind compress,
                         int bufferSize,
                         int rowIndexStride,
                         MemoryManager memoryManager,
                         boolean addBlockPadding,
                         OrcFile.Version version,
                         OrcFile.WriterCallback callback,
                         EncodingStrategy encodingStrategy,
                         CompressionStrategy compressionStrategy,
                         float paddingTolerance,
                         long blockSizeValue,
                         String bloomFilterColumnNames,
                         double bloomFilterFpp) throws IOException {
    this.flowFileOutputStream = flowFileOutputStream;
    this.path = path;
    this.conf = conf;
    this.callback = callback;
    callbackContext = (callback != null) ? () -> OrcFlowFileWriter.this : null;
    this.adjustedStripeSize = stripeSize;
    this.defaultStripeSize = stripeSize;
    this.version = version;
    this.encodingStrategy = encodingStrategy;
    this.compressionStrategy = compressionStrategy;
    this.addBlockPadding = addBlockPadding;
    this.blockSize = blockSizeValue;
    this.paddingTolerance = paddingTolerance;
    this.compress = compress;
    this.rowIndexStride = rowIndexStride;
    this.memoryManager = memoryManager;
    buildIndex = rowIndexStride > 0;
    codec = createCodec(compress);
    String allColumns = conf.get(IOConstants.COLUMNS);
    if (allColumns == null) {
        allColumns = getColumnNamesFromInspector(inspector);
    }
    this.bufferSize = getEstimatedBufferSize(allColumns, bufferSize);
    if (version == OrcFile.Version.V_0_11) {
        /* do not write bloom filters for ORC v11 */
        this.bloomFilterColumns =
                OrcUtils.includeColumns(null, allColumns, inspector);
    } else {
        this.bloomFilterColumns =
                OrcUtils.includeColumns(bloomFilterColumnNames, allColumns, inspector);
    }
    this.bloomFilterFpp = bloomFilterFpp;
    treeWriter = createTreeWriter(inspector, streamFactory, false);
    if (buildIndex && rowIndexStride < MIN_ROW_INDEX_STRIDE) {
        throw new IllegalArgumentException("Row stride must be at least " +
                MIN_ROW_INDEX_STRIDE);
    }

    // ensure that we are able to handle callbacks before we register ourselves
    memoryManager.addWriter(path, stripeSize, this);
}
 
Example #24
Source File: HiveUtil.java    From presto with Apache License 2.0
public static List<HiveType> getColumnTypes(Properties schema)
{
    return toHiveTypes(schema.getProperty(IOConstants.COLUMNS_TYPES, ""));
}
 
Example #25
Source File: HiveUtil.java    From presto with Apache License 2.0
public static List<String> getColumnNames(Properties schema)
{
    return COLUMN_NAMES_SPLITTER.splitToList(schema.getProperty(IOConstants.COLUMNS, ""));
}