org.apache.hadoop.hive.ql.io.orc.OrcInputFormat Java Examples
The following examples show how to use
org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.
Each snippet is taken from an open-source project; the source file, originating project, and license are noted above each example.
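Before the project-specific examples, here is a minimal, self-contained sketch of reading ORC data through the class's old mapred API (InputFormat<NullWritable, OrcStruct>). The input path and class name are placeholders, not taken from any of the projects below.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcStruct;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class OrcReadSketch {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf();
        // Illustrative path; point it at a directory of ORC files.
        FileInputFormat.setInputPaths(conf, new Path("/tmp/orc-data"));

        OrcInputFormat format = new OrcInputFormat();
        InputSplit[] splits = format.getSplits(conf, 1);

        // Read the first split row by row.
        RecordReader<NullWritable, OrcStruct> reader = format.getRecordReader(splits[0], conf, Reporter.NULL);
        NullWritable key = reader.createKey();
        OrcStruct value = reader.createValue();
        long rows = 0;
        while (reader.next(key, value)) {
            rows++;
        }
        reader.close();
        System.out.println("Rows in first split: " + rows);
    }
}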
Example #1
Source File: ProfileFactory.java From pxf with Apache License 2.0 | 6 votes |
/**
 * The method which returns optimal profile
 *
 * @param inputFormat input format of table/partition
 * @param hasComplexTypes whether record has complex types, see @EnumHiveToGpdbType
 * @param userProfileName profile name provided by user
 * @return name of optimal profile
 */
public static String get(InputFormat inputFormat, boolean hasComplexTypes, String userProfileName) {
    String profileName = null;
    if (HIVE_ORC_VECTORIZED_PROFILE.equals(userProfileName))
        return userProfileName;
    if (inputFormat instanceof TextInputFormat && !hasComplexTypes) {
        profileName = HIVE_TEXT_PROFILE;
    } else if (inputFormat instanceof RCFileInputFormat) {
        profileName = HIVE_RC_PROFILE;
    } else if (inputFormat instanceof OrcInputFormat) {
        profileName = HIVE_ORC_PROFILE;
    } else {
        // Default case
        profileName = HIVE_PROFILE;
    }
    return profileName;
}
Example #2
Source File: ORCFilterPushDownRule.java From dremio-oss with Apache License 2.0 | 6 votes |
@Override
public boolean matches(RelOptRuleCall call) {
    final HiveScanDrel scan = call.rel(1);
    if (scan.getFilter() != null) {
        return false;
    }

    try {
        final HiveTableXattr tableXattr = HiveTableXattr.parseFrom(
            scan.getTableMetadata().getReadDefinition().getExtendedProperty().asReadOnlyByteBuffer());
        final Optional<String> inputFormat = HiveReaderProtoUtil.getTableInputFormat(tableXattr);
        return inputFormat.isPresent()
            && inputFormat.get().equals(OrcInputFormat.class.getCanonicalName());
    } catch (InvalidProtocolBufferException e) {
        logger.warn("Failure while attempting to deserialize hive table attributes.", e);
    }

    return false;
}
Example #3
Source File: HiveSchemaConverter.java From dremio-oss with Apache License 2.0 | 6 votes |
private static boolean isTypeNotSupported(InputFormat<?, ?> format, Category category, boolean includeParquetComplexTypes) {
    // No restrictions on primitive types
    if (category.equals(PRIMITIVE)) {
        return false;
    }

    // Don't support map anywhere.
    if (category.equals(MAP)) {
        return true;
    }

    // All complex types supported in Orc
    if (format instanceof OrcInputFormat) {
        return false;
    }

    // Support only list and struct in Parquet along with primitive types.
    // MapRedParquetInputFormat, VectorizedParquetInputformat
    if (includeParquetComplexTypes && MapredParquetInputFormat.class.isAssignableFrom(format.getClass())
        && PARQUET_SUPPORTED_TYPES.contains(category)) {
        return false;
    }

    return true;
}
Example #4
Source File: HiveMetadataUtils.java From dremio-oss with Apache License 2.0 | 6 votes |
/**
 * When impersonation is not possible and when last modified times are not available,
 * {@link HiveReaderProto.FileSystemPartitionUpdateKey} should not be generated.
 *
 * @param hiveStorageCapabilities The capabilities of the storage mechanism.
 * @param format The file input format.
 * @return true if FSUpdateKeys should be generated. False if not.
 */
public static boolean shouldGenerateFileSystemUpdateKeys(final HiveStorageCapabilities hiveStorageCapabilities,
                                                         final InputFormat<?, ?> format) {

    if (!hiveStorageCapabilities.supportsImpersonation() && !hiveStorageCapabilities.supportsLastModifiedTime()) {
        return false;
    }

    // Files in a filesystem have last modified times and filesystem permissions. Generate
    // FileSystemPartitionUpdateKeys for formats representing files. Subclasses of FileInputFormat
    // as well as OrcInputFormat represent files.
    if ((format instanceof FileInputFormat) || (format instanceof OrcInputFormat)) {
        return true;
    }

    return false;
}
Example #5
Source File: SpliceOrcUtils.java From spliceengine with GNU Affero General Public License v3.0 | 6 votes |
public static List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
    List<OrcSplit> splits = OrcInputFormat.generateSplitsInfo(ShimLoader.getHadoopShims()
            .getConfiguration(jobContext));
    List<InputSplit> result = new ArrayList<InputSplit>(splits.size());
    // Filter Out Splits based on paths...
    for (OrcSplit split : splits) {
        result.add(new OrcNewSplit(split));
    }
    return result;
}
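As a hedged illustration of how this utility might be driven from the new mapreduce API, the sketch below passes a Job (which is a JobContext) to it; the job name and input path are assumptions, not Splice Machine code.

import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
// SpliceOrcUtils import omitted; use the package from the Splice Machine project.

public class OrcSplitsDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "orc-splits-demo");            // hypothetical job name
        FileInputFormat.addInputPath(job, new Path("/tmp/orc-data"));  // hypothetical input path
        // A Job instance is a JobContext, so it can be handed straight to the utility above.
        List<InputSplit> splits = SpliceOrcUtils.getSplits(job);
        System.out.println("Generated " + splits.size() + " ORC splits");
    }
}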
Example #6
Source File: ProfileFactoryTest.java From pxf with Apache License 2.0 | 5 votes |
@Test
public void get() throws Exception {
    // For TextInputFormat when table has no complex types, HiveText profile should be used
    String profileName = ProfileFactory.get(new TextInputFormat(), false);
    assertEquals("HiveText", profileName);

    // For TextInputFormat when table has complex types, Hive profile should be used, HiveText doesn't support complex types yet
    profileName = ProfileFactory.get(new TextInputFormat(), true);
    assertEquals("Hive", profileName);

    // For RCFileInputFormat when table has complex types, HiveRC profile should be used
    profileName = ProfileFactory.get(new RCFileInputFormat(), true);
    assertEquals("HiveRC", profileName);

    // For RCFileInputFormat when table has no complex types, HiveRC profile should be used
    profileName = ProfileFactory.get(new RCFileInputFormat(), false);
    assertEquals("HiveRC", profileName);

    // For OrcInputFormat when table has complex types, HiveORC profile should be used
    profileName = ProfileFactory.get(new OrcInputFormat(), true);
    assertEquals("HiveORC", profileName);

    // For OrcInputFormat when table has no complex types, HiveORC profile should be used
    profileName = ProfileFactory.get(new OrcInputFormat(), false);
    assertEquals("HiveORC", profileName);

    // For other formats Hive profile should be used
    profileName = ProfileFactory.get(new SequenceFileInputFilter(), false);
    assertEquals("Hive", profileName);
}
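Note that this test calls a two-argument overload, ProfileFactory.get(inputFormat, hasComplexTypes), whereas Example #1 shows the three-argument variant that additionally accepts a user-supplied profile name.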
Example #7
Source File: HiveMetadataUtils.java From dremio-oss with Apache License 2.0 | 5 votes |
/**
 * Find the rowcount based on stats in Hive metastore or estimate using filesize/filetype/recordSize/split size
 *
 * @param statsParams parameters controlling the stats calculations
 * @param statsFromMetastore
 * @param sizeRatio Ratio of this split contributing to all stats in given <i>statsFromMetastore</i>
 * @param splitSizeInBytes
 * @param format
 * @param estimatedRecordSize
 * @return
 */
public static long findRowCountInSplit(StatsEstimationParameters statsParams, HiveDatasetStats statsFromMetastore,
                                       final double sizeRatio, final long splitSizeInBytes, InputFormat<?, ?> format,
                                       final int estimatedRecordSize) {

    final Class<? extends InputFormat> inputFormat =
        format == null ? null : ((Class<? extends InputFormat>) format.getClass());

    double compressionFactor = 1.0;
    if (MapredParquetInputFormat.class.equals(inputFormat)) {
        compressionFactor = 30;
    } else if (OrcInputFormat.class.equals(inputFormat)) {
        compressionFactor = 30f;
    } else if (AvroContainerInputFormat.class.equals(inputFormat)) {
        compressionFactor = 10f;
    } else if (RCFileInputFormat.class.equals(inputFormat)) {
        compressionFactor = 10f;
    }

    final long estimatedRowCount = (long) Math.ceil(splitSizeInBytes * compressionFactor / estimatedRecordSize);

    // Metastore stats are for complete partition. Multiply it by the size ratio of this split
    final long metastoreRowCount = (long) Math.ceil(sizeRatio * statsFromMetastore.getRecordCount());

    logger.trace("Hive stats estimation: compression factor '{}', recordSize '{}', estimated '{}', from metastore '{}'",
        compressionFactor, estimatedRecordSize, estimatedRowCount, metastoreRowCount);

    if (statsParams.useMetastoreStats() && statsFromMetastore.hasContent()) {
        return metastoreRowCount;
    }

    // return the maximum of estimate and metastore count
    return Math.max(estimatedRowCount, metastoreRowCount);
}
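To make the estimate concrete, here is a quick back-of-the-envelope check of the ORC branch; the split size and record size below are assumed values for illustration, not Dremio defaults.

public class OrcRowCountEstimate {
    public static void main(String[] args) {
        long splitSizeInBytes = 128L * 1024 * 1024;  // assumed 128 MiB ORC split
        double compressionFactor = 30;               // ORC factor used in the method above
        int estimatedRecordSize = 100;               // assumed average row width in bytes
        long estimatedRowCount = (long) Math.ceil(splitSizeInBytes * compressionFactor / estimatedRecordSize);
        System.out.println(estimatedRowCount);       // 40265319, i.e. roughly 40 million rows
    }
}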
Example #8
Source File: HiveDialectITCase.java From flink with Apache License 2.0 | 5 votes |
@Test
public void testCreateTable() throws Exception {
    String location = warehouse + "/external_location";
    tableEnv.executeSql(String.format(
            "create external table tbl1 (d decimal(10,0),ts timestamp) partitioned by (p string) location '%s' tblproperties('k1'='v1')", location));
    Table hiveTable = hiveCatalog.getHiveTable(new ObjectPath("default", "tbl1"));
    assertEquals(TableType.EXTERNAL_TABLE.toString(), hiveTable.getTableType());
    assertEquals(1, hiveTable.getPartitionKeysSize());
    assertEquals(location, locationPath(hiveTable.getSd().getLocation()));
    assertEquals("v1", hiveTable.getParameters().get("k1"));
    assertFalse(hiveTable.getParameters().containsKey(SqlCreateHiveTable.TABLE_LOCATION_URI));

    tableEnv.executeSql("create table tbl2 (s struct<ts:timestamp,bin:binary>) stored as orc");
    hiveTable = hiveCatalog.getHiveTable(new ObjectPath("default", "tbl2"));
    assertEquals(TableType.MANAGED_TABLE.toString(), hiveTable.getTableType());
    assertEquals(OrcSerde.class.getName(), hiveTable.getSd().getSerdeInfo().getSerializationLib());
    assertEquals(OrcInputFormat.class.getName(), hiveTable.getSd().getInputFormat());
    assertEquals(OrcOutputFormat.class.getName(), hiveTable.getSd().getOutputFormat());

    tableEnv.executeSql("create table tbl3 (m map<timestamp,binary>) partitioned by (p1 bigint,p2 tinyint) " +
            "row format serde 'org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe'");
    hiveTable = hiveCatalog.getHiveTable(new ObjectPath("default", "tbl3"));
    assertEquals(2, hiveTable.getPartitionKeysSize());
    assertEquals(LazyBinarySerDe.class.getName(), hiveTable.getSd().getSerdeInfo().getSerializationLib());

    tableEnv.executeSql("create table tbl4 (x int,y smallint) row format delimited fields terminated by '|' lines terminated by '\n'");
    hiveTable = hiveCatalog.getHiveTable(new ObjectPath("default", "tbl4"));
    assertEquals("|", hiveTable.getSd().getSerdeInfo().getParameters().get(serdeConstants.FIELD_DELIM));
    assertEquals("|", hiveTable.getSd().getSerdeInfo().getParameters().get(serdeConstants.SERIALIZATION_FORMAT));
    assertEquals("\n", hiveTable.getSd().getSerdeInfo().getParameters().get(serdeConstants.LINE_DELIM));

    tableEnv.executeSql("create table tbl5 (m map<bigint,string>) row format delimited collection items terminated by ';' " +
            "map keys terminated by ':'");
    hiveTable = hiveCatalog.getHiveTable(new ObjectPath("default", "tbl5"));
    assertEquals(";", hiveTable.getSd().getSerdeInfo().getParameters().get(serdeConstants.COLLECTION_DELIM));
    assertEquals(":", hiveTable.getSd().getSerdeInfo().getParameters().get(serdeConstants.MAPKEY_DELIM));
}
Example #9
Source File: TestOrcPageSourceMemoryTracking.java From presto with Apache License 2.0 | 4 votes |
public TestPreparer(String tempFilePath, List<TestColumn> testColumns, int numRows, int stripeRows)
        throws Exception {
    OrcSerde serde = new OrcSerde();
    schema = new Properties();
    schema.setProperty("columns",
            testColumns.stream()
                    .map(TestColumn::getName)
                    .collect(Collectors.joining(",")));
    schema.setProperty("columns.types",
            testColumns.stream()
                    .map(TestColumn::getType)
                    .collect(Collectors.joining(",")));
    schema.setProperty(FILE_INPUT_FORMAT, OrcInputFormat.class.getName());
    schema.setProperty(SERIALIZATION_LIB, serde.getClass().getName());

    partitionKeys = testColumns.stream()
            .filter(TestColumn::isPartitionKey)
            .map(input -> new HivePartitionKey(input.getName(), (String) input.getWriteValue()))
            .collect(toList());

    partitonName = String.join("/", partitionKeys.stream()
            .map(partitionKey -> format("%s=%s", partitionKey.getName(), partitionKey.getValue()))
            .collect(toImmutableList()));

    ImmutableList.Builder<HiveColumnHandle> columnsBuilder = ImmutableList.builder();
    ImmutableList.Builder<Type> typesBuilder = ImmutableList.builder();
    int nextHiveColumnIndex = 0;
    for (int i = 0; i < testColumns.size(); i++) {
        TestColumn testColumn = testColumns.get(i);
        int columnIndex = testColumn.isPartitionKey() ? -1 : nextHiveColumnIndex++;

        ObjectInspector inspector = testColumn.getObjectInspector();
        HiveType hiveType = HiveType.valueOf(inspector.getTypeName());
        Type type = hiveType.getType(TYPE_MANAGER);

        columnsBuilder.add(createBaseColumn(testColumn.getName(), columnIndex, hiveType, type,
                testColumn.isPartitionKey() ? PARTITION_KEY : REGULAR, Optional.empty()));
        typesBuilder.add(type);
    }
    columns = columnsBuilder.build();
    types = typesBuilder.build();

    fileSplit = createTestFile(tempFilePath, serde, null, testColumns, numRows, stripeRows);
}
Example #10
Source File: HiveORCAccessor.java From pxf with Apache License 2.0 | 4 votes |
/**
 * Constructs a HiveORCFileAccessor.
 */
public HiveORCAccessor() {
    super(new OrcInputFormat());
}
Example #11
Source File: HiveMetaStoreUtilsTest.java From incubator-gobblin with Apache License 2.0 | 4 votes |
@Test
public void testGetTableOrc() {
    final String databaseName = "db";
    final String tableName = "tbl";

    HiveTable.Builder builder = new HiveTable.Builder();
    builder.withDbName(databaseName).withTableName(tableName);
    HiveTable hiveTable = builder.build();

    // SerDe props are
    State serdeProps = new State();
    serdeProps.setProp("columns", "timestamp,namespace,name,metadata");
    serdeProps.setProp("columns.types", "bigint,string,string,map<string,string>");

    hiveTable.getProps().addAll(serdeProps);
    hiveTable.setInputFormat(OrcInputFormat.class.getName());
    hiveTable.setOutputFormat(OrcOutputFormat.class.getName());
    hiveTable.setSerDeType(OrcSerde.class.getName());

    Table table = HiveMetaStoreUtils.getTable(hiveTable);
    Assert.assertEquals(table.getDbName(), databaseName);
    Assert.assertEquals(table.getTableName(), tableName);

    StorageDescriptor sd = table.getSd();
    Assert.assertEquals(sd.getInputFormat(), OrcInputFormat.class.getName());
    Assert.assertEquals(sd.getOutputFormat(), OrcOutputFormat.class.getName());
    Assert.assertNotNull(sd.getSerdeInfo());
    Assert.assertEquals(sd.getSerdeInfo().getSerializationLib(), OrcSerde.class.getName());

    // verify column name
    List<FieldSchema> fields = sd.getCols();
    Assert.assertTrue(fields != null && fields.size() == 4);
    FieldSchema fieldA = fields.get(0);
    Assert.assertEquals(fieldA.getName(), "timestamp");
    Assert.assertEquals(fieldA.getType(), "bigint");
    FieldSchema fieldB = fields.get(1);
    Assert.assertEquals(fieldB.getName(), "namespace");
    Assert.assertEquals(fieldB.getType(), "string");
    FieldSchema fieldC = fields.get(2);
    Assert.assertEquals(fieldC.getName(), "name");
    Assert.assertEquals(fieldC.getType(), "string");
    FieldSchema fieldD = fields.get(3);
    Assert.assertEquals(fieldD.getName(), "metadata");
    Assert.assertEquals(fieldD.getType(), "map<string,string>");
}