org.apache.hadoop.hive.ql.io.orc.OrcInputFormat Java Examples
The following examples show how to use
org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.
Each snippet is taken from an open-source project; the source file, originating project, and license are noted above each example.
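Before the project-specific examples, here is a minimal, self-contained sketch of reading ORC data through the class's old mapred API (InputFormat<NullWritable, OrcStruct>). The input path and class name are placeholders, not taken from any of the projects below.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcStruct;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class OrcReadSketch {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf();
        // Illustrative path; point it at a directory of ORC files.
        FileInputFormat.setInputPaths(conf, new Path("/tmp/orc-data"));

        OrcInputFormat format = new OrcInputFormat();
        InputSplit[] splits = format.getSplits(conf, 1);

        // Read the first split row by row.
        RecordReader<NullWritable, OrcStruct> reader = format.getRecordReader(splits[0], conf, Reporter.NULL);
        NullWritable key = reader.createKey();
        OrcStruct value = reader.createValue();
        long rows = 0;
        while (reader.next(key, value)) {
            rows++;
        }
        reader.close();
        System.out.println("Rows in first split: " + rows);
    }
}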
Example #1
Source File: ProfileFactory.java From pxf with Apache License 2.0 | 6 votes |
/**
 * The method which returns optimal profile
 *
 * @param inputFormat input format of table/partition
 * @param hasComplexTypes whether record has complex types, see @EnumHiveToGpdbType
 * @param userProfileName profile name provided by user
 * @return name of optimal profile
 */
public static String get(InputFormat inputFormat, boolean hasComplexTypes, String userProfileName) {
    String profileName = null;
    if (HIVE_ORC_VECTORIZED_PROFILE.equals(userProfileName))
        return userProfileName;
    if (inputFormat instanceof TextInputFormat && !hasComplexTypes) {
        profileName = HIVE_TEXT_PROFILE;
    } else if (inputFormat instanceof RCFileInputFormat) {
        profileName = HIVE_RC_PROFILE;
    } else if (inputFormat instanceof OrcInputFormat) {
        profileName = HIVE_ORC_PROFILE;
    } else {
        // Default case
        profileName = HIVE_PROFILE;
    }
    return profileName;
}
Example #2
Source File: ORCFilterPushDownRule.java From dremio-oss with Apache License 2.0 | 6 votes |
@Override
public boolean matches(RelOptRuleCall call) {
    final HiveScanDrel scan = call.rel(1);
    if (scan.getFilter() != null) {
        return false;
    }

    try {
        final HiveTableXattr tableXattr = HiveTableXattr.parseFrom(
            scan.getTableMetadata().getReadDefinition().getExtendedProperty().asReadOnlyByteBuffer());
        final Optional<String> inputFormat = HiveReaderProtoUtil.getTableInputFormat(tableXattr);
        return inputFormat.isPresent()
            && inputFormat.get().equals(OrcInputFormat.class.getCanonicalName());
    } catch (InvalidProtocolBufferException e) {
        logger.warn("Failure while attempting to deserialize hive table attributes.", e);
    }

    return false;
}
Example #3
Source File: HiveSchemaConverter.java From dremio-oss with Apache License 2.0 | 6 votes |
private static boolean isTypeNotSupported(InputFormat<?, ?> format, Category category, boolean includeParquetComplexTypes) {
    // No restrictions on primitive types
    if (category.equals(PRIMITIVE)) {
        return false;
    }

    // Don't support map anywhere.
    if (category.equals(MAP)) {
        return true;
    }

    // All complex types supported in Orc
    if (format instanceof OrcInputFormat) {
        return false;
    }

    // Support only list and struct in Parquet along with primitive types.
    // MapRedParquetInputFormat, VectorizedParquetInputformat
    if (includeParquetComplexTypes && MapredParquetInputFormat.class.isAssignableFrom(format.getClass())
        && PARQUET_SUPPORTED_TYPES.contains(category)) {
        return false;
    }

    return true;
}
Example #4
Source File: HiveMetadataUtils.java From dremio-oss with Apache License 2.0 | 6 votes |
/**
 * When impersonation is not possible and when last modified times are not available,
 * {@link HiveReaderProto.FileSystemPartitionUpdateKey} should not be generated.
 *
 * @param hiveStorageCapabilities The capabilities of the storage mechanism.
 * @param format The file input format.
 * @return true if FSUpdateKeys should be generated. False if not.
 */
public static boolean shouldGenerateFileSystemUpdateKeys(final HiveStorageCapabilities hiveStorageCapabilities,
                                                         final InputFormat<?, ?> format) {

    if (!hiveStorageCapabilities.supportsImpersonation() && !hiveStorageCapabilities.supportsLastModifiedTime()) {
        return false;
    }

    // Files in a filesystem have last modified times and filesystem permissions. Generate
    // FileSystemPartitionUpdateKeys for formats representing files. Subclasses of FileInputFormat
    // as well as OrcInputFormat represent files.
    if ((format instanceof FileInputFormat) || (format instanceof OrcInputFormat)) {
        return true;
    }

    return false;
}
Example #5
Source File: SpliceOrcUtils.java From spliceengine with GNU Affero General Public License v3.0 | 6 votes |
public static List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
    List<OrcSplit> splits = OrcInputFormat.generateSplitsInfo(ShimLoader.getHadoopShims()
            .getConfiguration(jobContext));
    List<InputSplit> result = new ArrayList<InputSplit>(splits.size());
    // Filter Out Splits based on paths...
    for (OrcSplit split : splits) {
        result.add(new OrcNewSplit(split));
    }
    return result;
}
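As a hedged illustration of how this utility might be driven from the new mapreduce API, the sketch below passes a Job (which is a JobContext) to it; the job name and input path are assumptions, not Splice Machine code.

import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
// SpliceOrcUtils import omitted; use the package from the Splice Machine project.

public class OrcSplitsDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "orc-splits-demo");            // hypothetical job name
        FileInputFormat.addInputPath(job, new Path("/tmp/orc-data"));  // hypothetical input path
        // A Job instance is a JobContext, so it can be handed straight to the utility above.
        List<InputSplit> splits = SpliceOrcUtils.getSplits(job);
        System.out.println("Generated " + splits.size() + " ORC splits");
    }
}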
Example #6
Source File: ProfileFactoryTest.java From pxf with Apache License 2.0 | 5 votes |
@Test
public void get() throws Exception {
    // For TextInputFormat when table has no complex types, HiveText profile should be used
    String profileName = ProfileFactory.get(new TextInputFormat(), false);
    assertEquals("HiveText", profileName);

    // For TextInputFormat when table has complex types, Hive profile should be used, HiveText doesn't support complex types yet
    profileName = ProfileFactory.get(new TextInputFormat(), true);
    assertEquals("Hive", profileName);

    // For RCFileInputFormat when table has complex types, HiveRC profile should be used
    profileName = ProfileFactory.get(new RCFileInputFormat(), true);
    assertEquals("HiveRC", profileName);

    // For RCFileInputFormat when table has no complex types, HiveRC profile should be used
    profileName = ProfileFactory.get(new RCFileInputFormat(), false);
    assertEquals("HiveRC", profileName);

    // For OrcInputFormat when table has complex types, HiveORC profile should be used
    profileName = ProfileFactory.get(new OrcInputFormat(), true);
    assertEquals("HiveORC", profileName);

    // For OrcInputFormat when table has no complex types, HiveORC profile should be used
    profileName = ProfileFactory.get(new OrcInputFormat(), false);
    assertEquals("HiveORC", profileName);

    // For other formats Hive profile should be used
    profileName = ProfileFactory.get(new SequenceFileInputFilter(), false);
    assertEquals("Hive", profileName);
}
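Note that this test calls a two-argument overload, ProfileFactory.get(inputFormat, hasComplexTypes), whereas Example #1 shows the three-argument variant that additionally accepts a user-supplied profile name.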
Example #7
Source File: HiveMetadataUtils.java From dremio-oss with Apache License 2.0 | 5 votes |
/**
 * Find the rowcount based on stats in Hive metastore or estimate using filesize/filetype/recordSize/split size
 *
 * @param statsParams parameters controlling the stats calculations
 * @param statsFromMetastore
 * @param sizeRatio Ratio of this split contributing to all stats in given <i>statsFromMetastore</i>
 * @param splitSizeInBytes
 * @param format
 * @param estimatedRecordSize
 * @return
 */
public static long findRowCountInSplit(StatsEstimationParameters statsParams, HiveDatasetStats statsFromMetastore,
                                       final double sizeRatio, final long splitSizeInBytes, InputFormat<?, ?> format,
                                       final int estimatedRecordSize) {

    final Class<? extends InputFormat> inputFormat =
        format == null ? null : ((Class<? extends InputFormat>) format.getClass());

    double compressionFactor = 1.0;
    if (MapredParquetInputFormat.class.equals(inputFormat)) {
        compressionFactor = 30;
    } else if (OrcInputFormat.class.equals(inputFormat)) {
        compressionFactor = 30f;
    } else if (AvroContainerInputFormat.class.equals(inputFormat)) {
        compressionFactor = 10f;
    } else if (RCFileInputFormat.class.equals(inputFormat)) {
        compressionFactor = 10f;
    }

    final long estimatedRowCount = (long) Math.ceil(splitSizeInBytes * compressionFactor / estimatedRecordSize);

    // Metastore stats are for complete partition. Multiply it by the size ratio of this split
    final long metastoreRowCount = (long) Math.ceil(sizeRatio * statsFromMetastore.getRecordCount());

    logger.trace("Hive stats estimation: compression factor '{}', recordSize '{}', estimated '{}', from metastore '{}'",
        compressionFactor, estimatedRecordSize, estimatedRowCount, metastoreRowCount);

    if (statsParams.useMetastoreStats() && statsFromMetastore.hasContent()) {
        return metastoreRowCount;
    }

    // return the maximum of estimate and metastore count
    return Math.max(estimatedRowCount, metastoreRowCount);
}
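To make the estimate concrete, here is a quick back-of-the-envelope check of the ORC branch; the split size and record size below are assumed values for illustration, not Dremio defaults.

public class OrcRowCountEstimate {
    public static void main(String[] args) {
        long splitSizeInBytes = 128L * 1024 * 1024;  // assumed 128 MiB ORC split
        double compressionFactor = 30;               // ORC factor used in the method above
        int estimatedRecordSize = 100;               // assumed average row width in bytes
        long estimatedRowCount = (long) Math.ceil(splitSizeInBytes * compressionFactor / estimatedRecordSize);
        System.out.println(estimatedRowCount);       // 40265319, i.e. roughly 40 million rows
    }
}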
Example #8
Source File: HiveDialectITCase.java From flink with Apache License 2.0 | 5 votes |
@Test
public void testCreateTable() throws Exception {
    String location = warehouse + "/external_location";
    tableEnv.executeSql(String.format(
            "create external table tbl1 (d decimal(10,0),ts timestamp) partitioned by (p string) location '%s' tblproperties('k1'='v1')", location));
    Table hiveTable = hiveCatalog.getHiveTable(new ObjectPath("default", "tbl1"));
    assertEquals(TableType.EXTERNAL_TABLE.toString(), hiveTable.getTableType());
    assertEquals(1, hiveTable.getPartitionKeysSize());
    assertEquals(location, locationPath(hiveTable.getSd().getLocation()));
    assertEquals("v1", hiveTable.getParameters().get("k1"));
    assertFalse(hiveTable.getParameters().containsKey(SqlCreateHiveTable.TABLE_LOCATION_URI));

    tableEnv.executeSql("create table tbl2 (s struct<ts:timestamp,bin:binary>) stored as orc");
    hiveTable = hiveCatalog.getHiveTable(new ObjectPath("default", "tbl2"));
    assertEquals(TableType.MANAGED_TABLE.toString(), hiveTable.getTableType());
    assertEquals(OrcSerde.class.getName(), hiveTable.getSd().getSerdeInfo().getSerializationLib());
    assertEquals(OrcInputFormat.class.getName(), hiveTable.getSd().getInputFormat());
    assertEquals(OrcOutputFormat.class.getName(), hiveTable.getSd().getOutputFormat());

    tableEnv.executeSql("create table tbl3 (m map<timestamp,binary>) partitioned by (p1 bigint,p2 tinyint) " +
            "row format serde 'org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe'");
    hiveTable = hiveCatalog.getHiveTable(new ObjectPath("default", "tbl3"));
    assertEquals(2, hiveTable.getPartitionKeysSize());
    assertEquals(LazyBinarySerDe.class.getName(), hiveTable.getSd().getSerdeInfo().getSerializationLib());

    tableEnv.executeSql("create table tbl4 (x int,y smallint) row format delimited fields terminated by '|' lines terminated by '\n'");
    hiveTable = hiveCatalog.getHiveTable(new ObjectPath("default", "tbl4"));
    assertEquals("|", hiveTable.getSd().getSerdeInfo().getParameters().get(serdeConstants.FIELD_DELIM));
    assertEquals("|", hiveTable.getSd().getSerdeInfo().getParameters().get(serdeConstants.SERIALIZATION_FORMAT));
    assertEquals("\n", hiveTable.getSd().getSerdeInfo().getParameters().get(serdeConstants.LINE_DELIM));

    tableEnv.executeSql("create table tbl5 (m map<bigint,string>) row format delimited collection items terminated by ';' " +
            "map keys terminated by ':'");
    hiveTable = hiveCatalog.getHiveTable(new ObjectPath("default", "tbl5"));
    assertEquals(";", hiveTable.getSd().getSerdeInfo().getParameters().get(serdeConstants.COLLECTION_DELIM));
    assertEquals(":", hiveTable.getSd().getSerdeInfo().getParameters().get(serdeConstants.MAPKEY_DELIM));
}
Example #9
Source File: TestOrcPageSourceMemoryTracking.java From presto with Apache License 2.0 | 4 votes |
public TestPreparer(String tempFilePath, List<TestColumn> testColumns, int numRows, int stripeRows)
        throws Exception {
    OrcSerde serde = new OrcSerde();
    schema = new Properties();
    schema.setProperty("columns",
            testColumns.stream()
                    .map(TestColumn::getName)
                    .collect(Collectors.joining(",")));
    schema.setProperty("columns.types",
            testColumns.stream()
                    .map(TestColumn::getType)
                    .collect(Collectors.joining(",")));
    schema.setProperty(FILE_INPUT_FORMAT, OrcInputFormat.class.getName());
    schema.setProperty(SERIALIZATION_LIB, serde.getClass().getName());

    partitionKeys = testColumns.stream()
            .filter(TestColumn::isPartitionKey)
            .map(input -> new HivePartitionKey(input.getName(), (String) input.getWriteValue()))
            .collect(toList());

    partitonName = String.join("/", partitionKeys.stream()
            .map(partitionKey -> format("%s=%s", partitionKey.getName(), partitionKey.getValue()))
            .collect(toImmutableList()));

    ImmutableList.Builder<HiveColumnHandle> columnsBuilder = ImmutableList.builder();
    ImmutableList.Builder<Type> typesBuilder = ImmutableList.builder();
    int nextHiveColumnIndex = 0;
    for (int i = 0; i < testColumns.size(); i++) {
        TestColumn testColumn = testColumns.get(i);
        int columnIndex = testColumn.isPartitionKey() ? -1 : nextHiveColumnIndex++;

        ObjectInspector inspector = testColumn.getObjectInspector();
        HiveType hiveType = HiveType.valueOf(inspector.getTypeName());
        Type type = hiveType.getType(TYPE_MANAGER);

        columnsBuilder.add(createBaseColumn(testColumn.getName(), columnIndex, hiveType, type,
                testColumn.isPartitionKey() ? PARTITION_KEY : REGULAR, Optional.empty()));
        typesBuilder.add(type);
    }
    columns = columnsBuilder.build();
    types = typesBuilder.build();

    fileSplit = createTestFile(tempFilePath, serde, null, testColumns, numRows, stripeRows);
}
Example #10
Source File: HiveORCAccessor.java From pxf with Apache License 2.0 | 4 votes |
/**
 * Constructs a HiveORCFileAccessor.
 */
public HiveORCAccessor() {
    super(new OrcInputFormat());
}
Example #11
Source File: HiveMetaStoreUtilsTest.java From incubator-gobblin with Apache License 2.0 | 4 votes |
@Test
public void testGetTableOrc() {
    final String databaseName = "db";
    final String tableName = "tbl";

    HiveTable.Builder builder = new HiveTable.Builder();
    builder.withDbName(databaseName).withTableName(tableName);
    HiveTable hiveTable = builder.build();

    // SerDe props are
    State serdeProps = new State();
    serdeProps.setProp("columns", "timestamp,namespace,name,metadata");
    serdeProps.setProp("columns.types", "bigint,string,string,map<string,string>");

    hiveTable.getProps().addAll(serdeProps);
    hiveTable.setInputFormat(OrcInputFormat.class.getName());
    hiveTable.setOutputFormat(OrcOutputFormat.class.getName());
    hiveTable.setSerDeType(OrcSerde.class.getName());

    Table table = HiveMetaStoreUtils.getTable(hiveTable);
    Assert.assertEquals(table.getDbName(), databaseName);
    Assert.assertEquals(table.getTableName(), tableName);

    StorageDescriptor sd = table.getSd();
    Assert.assertEquals(sd.getInputFormat(), OrcInputFormat.class.getName());
    Assert.assertEquals(sd.getOutputFormat(), OrcOutputFormat.class.getName());
    Assert.assertNotNull(sd.getSerdeInfo());
    Assert.assertEquals(sd.getSerdeInfo().getSerializationLib(), OrcSerde.class.getName());

    // verify column name
    List<FieldSchema> fields = sd.getCols();
    Assert.assertTrue(fields != null && fields.size() == 4);
    FieldSchema fieldA = fields.get(0);
    Assert.assertEquals(fieldA.getName(), "timestamp");
    Assert.assertEquals(fieldA.getType(), "bigint");
    FieldSchema fieldB = fields.get(1);
    Assert.assertEquals(fieldB.getName(), "namespace");
    Assert.assertEquals(fieldB.getType(), "string");
    FieldSchema fieldC = fields.get(2);
    Assert.assertEquals(fieldC.getName(), "name");
    Assert.assertEquals(fieldC.getType(), "string");
    FieldSchema fieldD = fields.get(3);
    Assert.assertEquals(fieldD.getName(), "metadata");
    Assert.assertEquals(fieldD.getType(), "map<string,string>");
}