org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat Java Examples

The following examples show how to use org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat. They are drawn from open-source projects; the originating project and source file are noted above each example.
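As context for the examples, MapredParquetInputFormat is the mapred-API input format that Hive's Parquet support registers on a table's StorageDescriptor, together with MapredParquetOutputFormat and ParquetHiveSerDe. The following is a minimal sketch of wiring that up directly against the metastore API; the database, table, column, and location values are hypothetical placeholders.

import java.util.Collections;

import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.SerDeInfo;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat;
import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat;
import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe;

public class ParquetTableSketch {

  // Builds a metastore Table whose storage descriptor points at the Parquet
  // input/output formats and SerDe. All names and columns are placeholders.
  static Table newParquetTable(String db, String name, String location) {
    StorageDescriptor sd = new StorageDescriptor();
    sd.setCols(Collections.singletonList(new FieldSchema("id", "bigint", "")));
    sd.setLocation(location);
    sd.setInputFormat(MapredParquetInputFormat.class.getName());
    sd.setOutputFormat(MapredParquetOutputFormat.class.getName());

    SerDeInfo serde = new SerDeInfo();
    serde.setSerializationLib(ParquetHiveSerDe.class.getName());
    sd.setSerdeInfo(serde);

    Table table = new Table();
    table.setDbName(db);
    table.setTableName(name);
    table.setSd(sd);
    return table;
  }
}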
Example #1
Source File: IntegrationTestHelper.java    From circus-train with Apache License 2.0
Table createParquetPartitionedTable(
        URI tableUri,
        String database,
        String table,
        Schema schema,
        String fieldName,
        Object fieldData,
        int version) throws Exception {
  List<FieldSchema> columns = new ArrayList<>();
  AvroObjectInspectorGenerator schemaInspector = new AvroObjectInspectorGenerator(schema);
  for (int i = 0; i < schemaInspector.getColumnNames().size(); i++) {
    columns.add(new FieldSchema(
            schemaInspector.getColumnNames().get(i), schemaInspector.getColumnTypes().get(i).toString(), ""
    ));
  }
  List<FieldSchema> partitionKeys = Arrays.asList(new FieldSchema("hour", "string", ""));
  Table parquetTable = TestUtils
          .createPartitionedTable(metaStoreClient, database, table, tableUri, columns, partitionKeys,
                  "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe", MapredParquetInputFormat.class.getName(),
                  MapredParquetOutputFormat.class.getName());
  URI partition = createData(tableUri, schema, Integer.toString(version), version, fieldName, fieldData);
  metaStoreClient.add_partitions(Arrays.asList(newTablePartition(parquetTable,
          Arrays.asList(Integer.toString(version)), partition)));
  return metaStoreClient.getTable(database, table);
}
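A hypothetical call to the helper above, assuming the Avro schema and metastore client come from the surrounding test fixture (the URI, names, and field values are placeholders):

// Hypothetical usage; `schema` and `metaStoreClient` are provided by the test fixture.
Table table = createParquetPartitionedTable(
    new URI("hdfs://nameservice/warehouse/test_db/test_table"),
    "test_db", "test_table", schema, "message", "hello", 1);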
 
Example #2
Source File: HiveSchemaConverter.java    From dremio-oss with Apache License 2.0
private static boolean isTypeNotSupported(InputFormat<?,?> format, Category category, boolean includeParquetComplexTypes) {
  // No restrictions on primitive types
  if (category.equals(PRIMITIVE)) {
    return false;
  }

  // Don't support map anywhere.
  if (category.equals(MAP)) {
    return true;
  }

  // All complex types supported in Orc
  if (format instanceof OrcInputFormat) {
    return false;
  }

  // Support only list and struct complex types in Parquet, along with primitives
  // (applies to MapredParquetInputFormat and VectorizedParquetInputFormat).
  if (includeParquetComplexTypes && MapredParquetInputFormat.class.isAssignableFrom(format.getClass()) && PARQUET_SUPPORTED_TYPES.contains(category)) {
    return false;
  }

  return true;
}
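The decision table this encodes, written as hypothetical assertions as if the private method were visible (and assuming PARQUET_SUPPORTED_TYPES contains LIST and STRUCT, and Category is Hive's ObjectInspector.Category):

InputFormat<?, ?> parquet = new MapredParquetInputFormat();
InputFormat<?, ?> orc = new OrcInputFormat();
assert !isTypeNotSupported(parquet, Category.PRIMITIVE, false); // primitives always supported
assert isTypeNotSupported(parquet, Category.MAP, true);         // maps never supported
assert !isTypeNotSupported(orc, Category.STRUCT, false);        // ORC supports all complex types
assert isTypeNotSupported(parquet, Category.STRUCT, false);     // Parquet complex types disabled
assert !isTypeNotSupported(parquet, Category.LIST, true);       // Parquet complex types enabled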
 
Example #3
Source File: HiveScanBatchCreator.java    From dremio-oss with Apache License 2.0
private boolean isParquetSplit(final HiveTableXattr tableXattr, SplitAndPartitionInfo split, boolean isPartitioned) {
  if (tableXattr.getReaderType() == NATIVE_PARQUET) {
    return true;
  }

  Optional<String> tableInputFormat;
  if (isPartitioned) {
    final HiveReaderProto.PartitionXattr partitionXattr = HiveReaderProtoUtil.getPartitionXattr(split);
    tableInputFormat = HiveReaderProtoUtil.getPartitionInputFormat(tableXattr, partitionXattr);
  } else {
    tableInputFormat = HiveReaderProtoUtil.getTableInputFormat(tableXattr);
  }

  if (!tableInputFormat.isPresent()) {
    return false;
  }

  try {
    return MapredParquetInputFormat.class.isAssignableFrom(
      (Class<? extends InputFormat>) Class.forName(tableInputFormat.get()));
  } catch (ClassNotFoundException e) {
    return false;
  }
}
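Two details worth noting: isAssignableFrom makes the check subclass-aware, so custom input formats extending MapredParquetInputFormat also count as Parquet splits, and an unloadable class name is deliberately treated as "not Parquet" rather than as an error, falling back to the non-native reader.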
 
Example #4
Source File: HiveUtil.java    From presto with Apache License 2.0
@SuppressWarnings({"unchecked", "RedundantCast"})
private static Class<? extends InputFormat<?, ?>> getInputFormatClass(JobConf conf, String inputFormatName)
        throws ClassNotFoundException
{
    // CDH uses different names for Parquet
    if ("parquet.hive.DeprecatedParquetInputFormat".equals(inputFormatName) ||
            "parquet.hive.MapredParquetInputFormat".equals(inputFormatName)) {
        return MapredParquetInputFormat.class;
    }

    Class<?> clazz = conf.getClassByName(inputFormatName);
    return (Class<? extends InputFormat<?, ?>>) clazz.asSubclass(InputFormat.class);
}
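A sketch of the alias handling, again written as if the private method were visible; both legacy CDH names resolve to the bundled class before the classloader is consulted:

JobConf conf = new JobConf();
Class<? extends InputFormat<?, ?>> cls =
    getInputFormatClass(conf, "parquet.hive.DeprecatedParquetInputFormat");
// cls == MapredParquetInputFormat.class; conf.getClassByName() only runs for
// names that are not one of the two legacy aliases.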
 
Example #5
Source File: HiveMetadataUtils.java    From dremio-oss with Apache License 2.0
/**
 * Finds the row count for a split, either from stats in the Hive metastore or by estimating it
 * from the split size, file format, and estimated record size.
 *
 * @param statsParams         parameters controlling the stats calculations
 * @param statsFromMetastore  stats read from the Hive metastore
 * @param sizeRatio           ratio of this split's size to the total size covered by <i>statsFromMetastore</i>
 * @param splitSizeInBytes    size of this split in bytes
 * @param format              input format of the split, used to pick a compression factor
 * @param estimatedRecordSize estimated size of a single record, in bytes
 * @return the row count for this split
 */
public static long findRowCountInSplit(StatsEstimationParameters statsParams, HiveDatasetStats statsFromMetastore,
                                       final double sizeRatio, final long splitSizeInBytes, InputFormat<?, ?> format,
                                       final int estimatedRecordSize) {

  final Class<? extends InputFormat> inputFormat =
    format == null ? null : ((Class<? extends InputFormat>) format.getClass());

  // Rough expansion factor from compressed on-disk bytes to uncompressed records.
  double compressionFactor = 1.0;
  if (MapredParquetInputFormat.class.equals(inputFormat)) {
    compressionFactor = 30.0;
  } else if (OrcInputFormat.class.equals(inputFormat)) {
    compressionFactor = 30.0;
  } else if (AvroContainerInputFormat.class.equals(inputFormat)) {
    compressionFactor = 10.0;
  } else if (RCFileInputFormat.class.equals(inputFormat)) {
    compressionFactor = 10.0;
  }

  final long estimatedRowCount = (long) Math.ceil(splitSizeInBytes * compressionFactor / estimatedRecordSize);

  // Metastore stats are for complete partition. Multiply it by the size ratio of this split
  final long metastoreRowCount = (long) Math.ceil(sizeRatio * statsFromMetastore.getRecordCount());

  logger.trace("Hive stats estimation: compression factor '{}', recordSize '{}', estimated '{}', from metastore '{}'",
    compressionFactor, estimatedRecordSize, estimatedRowCount, metastoreRowCount);

  if (statsParams.useMetastoreStats() && statsFromMetastore.hasContent()) {
    return metastoreRowCount;
  }

  // return the maximum of estimate and metastore count
  return Math.max(estimatedRowCount, metastoreRowCount);
}
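To make the estimate concrete: a hypothetical 128 MB (134,217,728-byte) Parquet split with a 100-byte estimated record size yields ceil(134,217,728 * 30.0 / 100) = 40,265,319 estimated rows. If useMetastoreStats() is enabled and the metastore actually has stats, the scaled metastore count is returned outright; otherwise the larger of the two figures wins.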
 
Example #6
Source File: HiveMetadataUtils.java    From dremio-oss with Apache License 2.0
public static boolean isVarcharTruncateSupported(InputFormat<?, ?> format) {
  return MapredParquetInputFormat.class.isAssignableFrom(format.getClass());
}
 
Example #7
Source File: HiveMetadataUtils.java    From dremio-oss with Apache License 2.0
public static boolean allowParquetNative(boolean currentStatus, Class<? extends InputFormat> clazz) {
  return currentStatus && MapredParquetInputFormat.class.isAssignableFrom(clazz);
}
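The accumulator-style signature suggests allowParquetNative is meant to be folded across the input formats of every partition in a scan, so a single non-Parquet partition disables the native Parquet reader for the whole read; a sketch under that assumption (partitionInputFormats is a hypothetical list):

boolean nativeOk = true;
for (Class<? extends InputFormat> fmt : partitionInputFormats) { // hypothetical list
  nativeOk = allowParquetNative(nativeOk, fmt);
}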
 