org.apache.hadoop.hive.ql.io.orc.OrcInputFormat Java Examples

The following examples show how to use org.apache.hadoop.hive.ql.io.orc.OrcInputFormat. They are taken from open-source projects; each example notes its source file, project and license.
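
Before the project-specific examples, here is a minimal, self-contained sketch of reading an ORC file directly through OrcInputFormat with the mapred API. It is not taken from any of the projects below: the input path is a placeholder, the class name is invented for illustration, and the NullWritable/OrcStruct key and value types assume a Hive 1.x/2.x OrcInputFormat.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcStruct;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class OrcInputFormatReadSketch {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf();
        FileInputFormat.setInputPaths(conf, new Path("/tmp/example.orc")); // placeholder path

        OrcInputFormat inputFormat = new OrcInputFormat();
        InputSplit[] splits = inputFormat.getSplits(conf, 1);

        long rows = 0;
        for (InputSplit split : splits) {
            RecordReader<NullWritable, OrcStruct> reader =
                    inputFormat.getRecordReader(split, conf, Reporter.NULL);
            NullWritable key = reader.createKey();
            OrcStruct value = reader.createValue();
            while (reader.next(key, value)) {
                rows++; // 'value' holds the current row as an OrcStruct
            }
            reader.close();
        }
        System.out.println("rows read: " + rows);
    }
}
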
Example #1
Source File: ProfileFactory.java    From pxf with Apache License 2.0
/**
 * Determines the name of the optimal profile for the given input format and column types.
 *
 * @param inputFormat input format of the table/partition
 * @param hasComplexTypes whether the record has complex types, see @EnumHiveToGpdbType
 * @param userProfileName profile name provided by the user
 * @return name of the optimal profile
 */
public static String get(InputFormat inputFormat, boolean hasComplexTypes, String userProfileName) {
    String profileName = null;
    if (HIVE_ORC_VECTORIZED_PROFILE.equals(userProfileName))
        return userProfileName;
    if (inputFormat instanceof TextInputFormat && !hasComplexTypes) {
        profileName = HIVE_TEXT_PROFILE;
    } else if (inputFormat instanceof RCFileInputFormat) {
        profileName = HIVE_RC_PROFILE;
    } else if (inputFormat instanceof OrcInputFormat) {
        profileName = HIVE_ORC_PROFILE;
    } else {
        //Default case
        profileName = HIVE_PROFILE;
    }
    return profileName;
}
 
Example #2
Source File: ORCFilterPushDownRule.java    From dremio-oss with Apache License 2.0
@Override
public boolean matches(RelOptRuleCall call) {
  final HiveScanDrel scan = call.rel(1);
  if (scan.getFilter() != null) {
    return false;
  }
  try {
    final HiveTableXattr tableXattr =
        HiveTableXattr.parseFrom(scan.getTableMetadata().getReadDefinition().getExtendedProperty().asReadOnlyByteBuffer());
    final Optional<String> inputFormat = HiveReaderProtoUtil.getTableInputFormat(tableXattr);
    return inputFormat.isPresent() && inputFormat.get().equals(OrcInputFormat.class.getCanonicalName());
  } catch (InvalidProtocolBufferException e) {
    logger.warn("Failure while attempting to deserialize hive table attributes.", e);
  }
  return false;
}
 
Example #3
Source File: HiveSchemaConverter.java    From dremio-oss with Apache License 2.0
private static boolean isTypeNotSupported(InputFormat<?,?> format, Category category, boolean includeParquetComplexTypes) {
  // No restrictions on primitive types
  if (category.equals(PRIMITIVE)) {
    return false;
  }

  // Don't support map anywhere.
  if (category.equals(MAP)) {
    return true;
  }

  // All complex types supported in Orc
  if (format instanceof OrcInputFormat) {
    return false;
  }

  // In Parquet, support only list and struct complex types in addition to primitives
  // (applies to MapredParquetInputFormat and subclasses such as VectorizedParquetInputFormat).
  if (includeParquetComplexTypes && MapredParquetInputFormat.class.isAssignableFrom(format.getClass()) && PARQUET_SUPPORTED_TYPES.contains(category)) {
    return false;
  }

  return true;
}
 
Example #4
Source File: HiveMetadataUtils.java    From dremio-oss with Apache License 2.0
/**
 * When impersonation is not possible and when last modified times are not available,
 * {@link HiveReaderProto.FileSystemPartitionUpdateKey} should not be generated.
 *
 * @param hiveStorageCapabilities The capabilities of the storage mechanism.
 * @param format                  The file input format.
 * @return true if FSUpdateKeys should be generated. False if not.
 */
public static boolean shouldGenerateFileSystemUpdateKeys(final HiveStorageCapabilities hiveStorageCapabilities,
                                                         final InputFormat<?, ?> format) {

  if (!hiveStorageCapabilities.supportsImpersonation() && !hiveStorageCapabilities.supportsLastModifiedTime()) {
    return false;
  }

  // Files in a filesystem have last modified times and filesystem permissions. Generate
  // FileSystemPartitionUpdateKeys for formats representing files. Subclasses of FileInputFormat
  // as well as OrcInputFormat represent files.
  if ((format instanceof FileInputFormat) || (format instanceof OrcInputFormat)) {
    return true;
  }

  return false;
}
 
Example #5
Source File: SpliceOrcUtils.java    From spliceengine with GNU Affero General Public License v3.0
public static List<InputSplit> getSplits(JobContext jobContext)
        throws IOException, InterruptedException {
    List<OrcSplit> splits =
            OrcInputFormat.generateSplitsInfo(ShimLoader.getHadoopShims()
                    .getConfiguration(jobContext));
    List<InputSplit> result = new ArrayList<InputSplit>(splits.size());
    // Wrap each OrcSplit in an OrcNewSplit so callers receive mapreduce-API InputSplits
    for (OrcSplit split : splits) {
        result.add(new OrcNewSplit(split));
    }

    return result;
}
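
The splits returned above are mapreduce-API OrcNewSplit instances. As a rough illustration only (not part of the Splice Machine source), they could be consumed directly with OrcNewInputFormat along these lines; the class name, the Configuration and the no-argument TaskAttemptID are placeholders.

import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.io.orc.OrcNewInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcStruct;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class OrcSplitConsumerSketch {
    public static long countRows(List<InputSplit> splits, Configuration conf) throws Exception {
        OrcNewInputFormat inputFormat = new OrcNewInputFormat();
        TaskAttemptContextImpl context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
        long rows = 0;
        for (InputSplit split : splits) {
            RecordReader<NullWritable, OrcStruct> reader =
                    inputFormat.createRecordReader(split, context);
            reader.initialize(split, context);
            while (reader.nextKeyValue()) {
                rows++; // reader.getCurrentValue() returns the current row as an OrcStruct
            }
            reader.close();
        }
        return rows;
    }
}
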
 
Example #6
Source File: ProfileFactoryTest.java    From pxf with Apache License 2.0
@Test
public void get() throws Exception {

    // For TextInputFormat when table has no complex types, HiveText profile should be used
    String profileName = ProfileFactory.get(new TextInputFormat(), false);
    assertEquals("HiveText", profileName);

    // For TextInputFormat when table has complex types, Hive profile should be used, HiveText doesn't support complex types yet
    profileName = ProfileFactory.get(new TextInputFormat(), true);
    assertEquals("Hive", profileName);

    // For RCFileInputFormat when table has complex types, HiveRC profile should be used
    profileName = ProfileFactory.get(new RCFileInputFormat(), true);
    assertEquals("HiveRC", profileName);

    // For RCFileInputFormat when table has no complex types, HiveRC profile should be used
    profileName = ProfileFactory.get(new RCFileInputFormat(), false);
    assertEquals("HiveRC", profileName);

    // For OrcInputFormat when table has complex types, HiveORC profile should be used
    profileName = ProfileFactory.get(new OrcInputFormat(), true);
    assertEquals("HiveORC", profileName);

    // For OrcInputFormat when table has no complex types, HiveORC profile should be used
    profileName = ProfileFactory.get(new OrcInputFormat(), false);
    assertEquals("HiveORC", profileName);

    // For other formats Hive profile should be used
    profileName = ProfileFactory.get(new SequenceFileInputFilter(), false);
    assertEquals("Hive", profileName);
}
 
Example #7
Source File: HiveMetadataUtils.java    From dremio-oss with Apache License 2.0
/**
 * Finds the row count for a split based on stats in the Hive metastore, or estimates it
 * from the split size, file type and record size.
 *
 * @param statsParams         parameters controlling the stats calculations
 * @param statsFromMetastore  stats recorded in the Hive metastore for the table/partition
 * @param sizeRatio           ratio of this split contributing to all stats in the given <i>statsFromMetastore</i>
 * @param splitSizeInBytes    size of this split in bytes
 * @param format              input format of the data in the split
 * @param estimatedRecordSize estimated size of a single record in bytes
 * @return estimated number of rows in the split
 */
public static long findRowCountInSplit(StatsEstimationParameters statsParams, HiveDatasetStats statsFromMetastore,
                                       final double sizeRatio, final long splitSizeInBytes, InputFormat<?, ?> format,
                                       final int estimatedRecordSize) {

  final Class<? extends InputFormat> inputFormat =
    format == null ? null : ((Class<? extends InputFormat>) format.getClass());

  double compressionFactor = 1.0;
  if (MapredParquetInputFormat.class.equals(inputFormat)) {
    compressionFactor = 30;
  } else if (OrcInputFormat.class.equals(inputFormat)) {
    compressionFactor = 30f;
  } else if (AvroContainerInputFormat.class.equals(inputFormat)) {
    compressionFactor = 10f;
  } else if (RCFileInputFormat.class.equals(inputFormat)) {
    compressionFactor = 10f;
  }

  final long estimatedRowCount = (long) Math.ceil(splitSizeInBytes * compressionFactor / estimatedRecordSize);

  // Metastore stats are for complete partition. Multiply it by the size ratio of this split
  final long metastoreRowCount = (long) Math.ceil(sizeRatio * statsFromMetastore.getRecordCount());

  logger.trace("Hive stats estimation: compression factor '{}', recordSize '{}', estimated '{}', from metastore '{}'",
    compressionFactor, estimatedRecordSize, estimatedRowCount, metastoreRowCount);

  if (statsParams.useMetastoreStats() && statsFromMetastore.hasContent()) {
    return metastoreRowCount;
  }

  // return the maximum of estimate and metastore count
  return Math.max(estimatedRowCount, metastoreRowCount);
}
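
To make the estimate concrete, here is the same arithmetic with illustrative values that are not taken from the Dremio source: a 128 MiB ORC split, an assumed 100-byte average record, and the compression factor of 30 used for ORC above.

// Illustrative values only, not from the source above
long splitSizeInBytes = 128L * 1024 * 1024;   // 134,217,728 bytes
double compressionFactor = 30;                // ORC compression factor from the method above
int estimatedRecordSize = 100;                // assumed average record size in bytes
long estimatedRowCount = (long) Math.ceil(splitSizeInBytes * compressionFactor / estimatedRecordSize);
// 134,217,728 * 30 / 100 = 40,265,318.4, so estimatedRowCount == 40,265,319
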
 
Example #8
Source File: HiveDialectITCase.java    From flink with Apache License 2.0
@Test
public void testCreateTable() throws Exception {
	String location = warehouse + "/external_location";
	tableEnv.executeSql(String.format(
			"create external table tbl1 (d decimal(10,0),ts timestamp) partitioned by (p string) location '%s' tblproperties('k1'='v1')", location));
	Table hiveTable = hiveCatalog.getHiveTable(new ObjectPath("default", "tbl1"));
	assertEquals(TableType.EXTERNAL_TABLE.toString(), hiveTable.getTableType());
	assertEquals(1, hiveTable.getPartitionKeysSize());
	assertEquals(location, locationPath(hiveTable.getSd().getLocation()));
	assertEquals("v1", hiveTable.getParameters().get("k1"));
	assertFalse(hiveTable.getParameters().containsKey(SqlCreateHiveTable.TABLE_LOCATION_URI));

	tableEnv.executeSql("create table tbl2 (s struct<ts:timestamp,bin:binary>) stored as orc");
	hiveTable = hiveCatalog.getHiveTable(new ObjectPath("default", "tbl2"));
	assertEquals(TableType.MANAGED_TABLE.toString(), hiveTable.getTableType());
	assertEquals(OrcSerde.class.getName(), hiveTable.getSd().getSerdeInfo().getSerializationLib());
	assertEquals(OrcInputFormat.class.getName(), hiveTable.getSd().getInputFormat());
	assertEquals(OrcOutputFormat.class.getName(), hiveTable.getSd().getOutputFormat());

	tableEnv.executeSql("create table tbl3 (m map<timestamp,binary>) partitioned by (p1 bigint,p2 tinyint) " +
			"row format serde 'org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe'");
	hiveTable = hiveCatalog.getHiveTable(new ObjectPath("default", "tbl3"));
	assertEquals(2, hiveTable.getPartitionKeysSize());
	assertEquals(LazyBinarySerDe.class.getName(), hiveTable.getSd().getSerdeInfo().getSerializationLib());

	tableEnv.executeSql("create table tbl4 (x int,y smallint) row format delimited fields terminated by '|' lines terminated by '\n'");
	hiveTable = hiveCatalog.getHiveTable(new ObjectPath("default", "tbl4"));
	assertEquals("|", hiveTable.getSd().getSerdeInfo().getParameters().get(serdeConstants.FIELD_DELIM));
	assertEquals("|", hiveTable.getSd().getSerdeInfo().getParameters().get(serdeConstants.SERIALIZATION_FORMAT));
	assertEquals("\n", hiveTable.getSd().getSerdeInfo().getParameters().get(serdeConstants.LINE_DELIM));

	tableEnv.executeSql("create table tbl5 (m map<bigint,string>) row format delimited collection items terminated by ';' " +
			"map keys terminated by ':'");
	hiveTable = hiveCatalog.getHiveTable(new ObjectPath("default", "tbl5"));
	assertEquals(";", hiveTable.getSd().getSerdeInfo().getParameters().get(serdeConstants.COLLECTION_DELIM));
	assertEquals(":", hiveTable.getSd().getSerdeInfo().getParameters().get(serdeConstants.MAPKEY_DELIM));
}
 
Example #9
Source File: TestOrcPageSourceMemoryTracking.java    From presto with Apache License 2.0
public TestPreparer(String tempFilePath, List<TestColumn> testColumns, int numRows, int stripeRows)
        throws Exception
{
    OrcSerde serde = new OrcSerde();
    schema = new Properties();
    schema.setProperty("columns",
            testColumns.stream()
                    .map(TestColumn::getName)
                    .collect(Collectors.joining(",")));
    schema.setProperty("columns.types",
            testColumns.stream()
                    .map(TestColumn::getType)
                    .collect(Collectors.joining(",")));
    schema.setProperty(FILE_INPUT_FORMAT, OrcInputFormat.class.getName());
    schema.setProperty(SERIALIZATION_LIB, serde.getClass().getName());

    partitionKeys = testColumns.stream()
            .filter(TestColumn::isPartitionKey)
            .map(input -> new HivePartitionKey(input.getName(), (String) input.getWriteValue()))
            .collect(toList());

    partitonName = String.join("/", partitionKeys.stream()
            .map(partitionKey -> format("%s=%s", partitionKey.getName(), partitionKey.getValue()))
            .collect(toImmutableList()));

    ImmutableList.Builder<HiveColumnHandle> columnsBuilder = ImmutableList.builder();
    ImmutableList.Builder<Type> typesBuilder = ImmutableList.builder();
    int nextHiveColumnIndex = 0;
    for (int i = 0; i < testColumns.size(); i++) {
        TestColumn testColumn = testColumns.get(i);
        int columnIndex = testColumn.isPartitionKey() ? -1 : nextHiveColumnIndex++;

        ObjectInspector inspector = testColumn.getObjectInspector();
        HiveType hiveType = HiveType.valueOf(inspector.getTypeName());
        Type type = hiveType.getType(TYPE_MANAGER);

        columnsBuilder.add(createBaseColumn(testColumn.getName(), columnIndex, hiveType, type, testColumn.isPartitionKey() ? PARTITION_KEY : REGULAR, Optional.empty()));
        typesBuilder.add(type);
    }
    columns = columnsBuilder.build();
    types = typesBuilder.build();

    fileSplit = createTestFile(tempFilePath, serde, null, testColumns, numRows, stripeRows);
}
 
Example #10
Source File: HiveORCAccessor.java    From pxf with Apache License 2.0
/**
 * Constructs a HiveORCAccessor.
 */
public HiveORCAccessor() {
    super(new OrcInputFormat());
}
 
Example #11
Source File: HiveMetaStoreUtilsTest.java    From incubator-gobblin with Apache License 2.0
@Test
public void testGetTableOrc() {
  final String databaseName = "db";
  final String tableName = "tbl";
  HiveTable.Builder builder = new HiveTable.Builder();
  builder.withDbName(databaseName).withTableName(tableName);

  HiveTable hiveTable = builder.build();

  // SerDe properties describing the table's columns and their types
  State serdeProps = new State();
  serdeProps.setProp("columns", "timestamp,namespace,name,metadata");
  serdeProps.setProp("columns.types", "bigint,string,string,map<string,string>");

  hiveTable.getProps().addAll(serdeProps);

  hiveTable.setInputFormat(OrcInputFormat.class.getName());
  hiveTable.setOutputFormat(OrcOutputFormat.class.getName());
  hiveTable.setSerDeType(OrcSerde.class.getName());

  Table table = HiveMetaStoreUtils.getTable(hiveTable);
  Assert.assertEquals(table.getDbName(), databaseName);
  Assert.assertEquals(table.getTableName(), tableName);

  StorageDescriptor sd = table.getSd();
  Assert.assertEquals(sd.getInputFormat(), OrcInputFormat.class.getName());
  Assert.assertEquals(sd.getOutputFormat(), OrcOutputFormat.class.getName());
  Assert.assertNotNull(sd.getSerdeInfo());
  Assert.assertEquals(sd.getSerdeInfo().getSerializationLib(), OrcSerde.class.getName());

  // verify column name
  List<FieldSchema> fields = sd.getCols();
  Assert.assertTrue(fields != null && fields.size() == 4);
  FieldSchema fieldA = fields.get(0);
  Assert.assertEquals(fieldA.getName(), "timestamp");
  Assert.assertEquals(fieldA.getType(), "bigint");

  FieldSchema fieldB = fields.get(1);
  Assert.assertEquals(fieldB.getName(), "namespace");
  Assert.assertEquals(fieldB.getType(), "string");

  FieldSchema fieldC = fields.get(2);
  Assert.assertEquals(fieldC.getName(), "name");
  Assert.assertEquals(fieldC.getType(), "string");


  FieldSchema fieldD = fields.get(3);
  Assert.assertEquals(fieldD.getName(), "metadata");
  Assert.assertEquals(fieldD.getType(), "map<string,string>");
}