org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat Java Examples

The following examples show how to use org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat. The examples are drawn from open source projects; the source file, project, and license are noted above each example.
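For orientation, three classes usually appear together when a Hive table is stored as Avro, and they recur throughout the examples below: AvroContainerInputFormat, AvroContainerOutputFormat, and AvroSerDe. Here is a minimal sketch, using only the Hive metastore API, of wiring that trio into a StorageDescriptor; the avroStorageDescriptor helper name is hypothetical:

import org.apache.hadoop.hive.metastore.api.SerDeInfo;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat;
import org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat;
import org.apache.hadoop.hive.serde2.avro.AvroSerDe;

// Hypothetical helper: wires the Avro input format, output format, and serde
// into a metastore StorageDescriptor for a table at the given location.
static StorageDescriptor avroStorageDescriptor(String location) {
  StorageDescriptor sd = new StorageDescriptor();
  sd.setLocation(location);
  sd.setInputFormat(AvroContainerInputFormat.class.getName());
  sd.setOutputFormat(AvroContainerOutputFormat.class.getName());
  SerDeInfo serdeInfo = new SerDeInfo();
  serdeInfo.setSerializationLib(AvroSerDe.class.getName());
  sd.setSerdeInfo(serdeInfo);
  return sd;
}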
Example #1
Source File: HiveUtil.java    From presto with Apache License 2.0
public static InputFormat<?, ?> getInputFormat(Configuration configuration, Properties schema, boolean symlinkTarget)
{
    String inputFormatName = getInputFormatName(schema);
    try {
        JobConf jobConf = toJobConf(configuration);
        configureCompressionCodecs(jobConf);

        Class<? extends InputFormat<?, ?>> inputFormatClass = getInputFormatClass(jobConf, inputFormatName);
        if (symlinkTarget && inputFormatClass == SymlinkTextInputFormat.class) {
            // Symlink targets are assumed to be TEXTFILE unless serde indicates otherwise.
            inputFormatClass = TextInputFormat.class;
            if (isDeserializerClass(schema, AvroSerDe.class)) {
                inputFormatClass = AvroContainerInputFormat.class;
            }
        }

        return ReflectionUtils.newInstance(inputFormatClass, jobConf);
    }
    catch (ClassNotFoundException | RuntimeException e) {
        throw new PrestoException(HIVE_UNSUPPORTED_FORMAT, "Unable to create input format " + inputFormatName, e);
    }
}
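A hypothetical call site for the method above. The property keys file.inputformat and serialization.lib are assumptions about what getInputFormatName and isDeserializerClass read; they are the standard Hive schema keys, but verify against the Presto source:

import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat;
import org.apache.hadoop.hive.serde2.avro.AvroSerDe;
import org.apache.hadoop.mapred.InputFormat;

Properties schema = new Properties();
schema.setProperty("file.inputformat", SymlinkTextInputFormat.class.getName());
schema.setProperty("serialization.lib", AvroSerDe.class.getName());

// With symlinkTarget = true and an Avro serde declared, the symlink fallback
// resolves to AvroContainerInputFormat instead of the TEXTFILE default.
InputFormat<?, ?> format = HiveUtil.getInputFormat(new Configuration(), schema, true);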
 
Example #2
Source File: HiveMetadataUtils.java    From dremio-oss with Apache License 2.0
/**
 * Finds the row count for a split, either from stats in the Hive metastore or by
 * estimating from the split size, file format, and estimated record size.
 *
 * @param statsParams         parameters controlling the stats calculation
 * @param statsFromMetastore  row-count stats read from the Hive metastore
 * @param sizeRatio           ratio of this split's size to the total size covered by <i>statsFromMetastore</i>
 * @param splitSizeInBytes    size of this split in bytes
 * @param format              input format of the split, used to pick a compression factor
 * @param estimatedRecordSize estimated size of a single record in bytes
 * @return the row count for this split
 */
public static long findRowCountInSplit(StatsEstimationParameters statsParams, HiveDatasetStats statsFromMetastore,
                                       final double sizeRatio, final long splitSizeInBytes, InputFormat<?, ?> format,
                                       final int estimatedRecordSize) {

  final Class<? extends InputFormat> inputFormat =
    format == null ? null : ((Class<? extends InputFormat>) format.getClass());

  double compressionFactor = 1.0;
  if (MapredParquetInputFormat.class.equals(inputFormat)) {
    compressionFactor = 30.0;
  } else if (OrcInputFormat.class.equals(inputFormat)) {
    compressionFactor = 30.0;
  } else if (AvroContainerInputFormat.class.equals(inputFormat)) {
    compressionFactor = 10.0;
  } else if (RCFileInputFormat.class.equals(inputFormat)) {
    compressionFactor = 10.0;
  }

  final long estimatedRowCount = (long) Math.ceil(splitSizeInBytes * compressionFactor / estimatedRecordSize);

  // Metastore stats are for complete partition. Multiply it by the size ratio of this split
  final long metastoreRowCount = (long) Math.ceil(sizeRatio * statsFromMetastore.getRecordCount());

  logger.trace("Hive stats estimation: compression factor '{}', recordSize '{}', estimated '{}', from metastore '{}'",
    compressionFactor, estimatedRecordSize, estimatedRowCount, metastoreRowCount);

  if (statsParams.useMetastoreStats() && statsFromMetastore.hasContent()) {
    return metastoreRowCount;
  }

  // return the maximum of estimate and metastore count
  return Math.max(estimatedRowCount, metastoreRowCount);
}
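To make the estimation concrete, here is a back-of-the-envelope run of the Avro branch with illustrative numbers:

// Illustrative values only: a 128 MB Avro split, ~100 bytes per record.
long splitSizeInBytes = 128L * 1024 * 1024;
int estimatedRecordSize = 100;
double compressionFactor = 10.0; // the AvroContainerInputFormat branch above

long estimatedRowCount =
    (long) Math.ceil(splitSizeInBytes * compressionFactor / estimatedRecordSize);
// estimatedRowCount == 13,421,773. This is compared against the
// metastore-derived count, and the larger of the two wins unless
// metastore stats are explicitly preferred.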
 
Example #3
Source File: HiveAvroCopyEntityHelper.java    From incubator-gobblin with Apache License 2.0
/**
 * Tells whether a Hive table is actually an Avro table.
 * @param table a Hive {@link Table}
 * @return true if the table's input format, output format, or serde indicates Avro
 */
public static boolean isHiveTableAvroType(Table table) {
  String serializationLib = table.getTTable().getSd().getSerdeInfo().getSerializationLib();
  String inputFormat = table.getTTable().getSd().getInputFormat();
  String outputFormat = table.getTTable().getSd().getOutputFormat();

  return inputFormat.endsWith(AvroContainerInputFormat.class.getSimpleName())
      || outputFormat.endsWith(AvroContainerOutputFormat.class.getSimpleName())
      || serializationLib.endsWith(AvroSerDe.class.getSimpleName());
}
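A minimal sketch of exercising this check, assuming a metastore table whose storage descriptor carries the Avro trio; the setup values are illustrative, and new Table(tTable) is assumed to be the ql.metadata wrapper over the Thrift table:

import org.apache.hadoop.hive.metastore.api.SerDeInfo;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.ql.metadata.Table;

org.apache.hadoop.hive.metastore.api.Table tTable =
    new org.apache.hadoop.hive.metastore.api.Table();
StorageDescriptor sd = new StorageDescriptor();
sd.setInputFormat(AvroContainerInputFormat.class.getName());
sd.setOutputFormat(AvroContainerOutputFormat.class.getName());
SerDeInfo serdeInfo = new SerDeInfo();
serdeInfo.setSerializationLib(AvroSerDe.class.getName());
sd.setSerdeInfo(serdeInfo);
tTable.setSd(sd);

// The suffix-based checks match the fully qualified Avro class names.
boolean isAvro = HiveAvroCopyEntityHelper.isHiveTableAvroType(new Table(tTable)); // true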
 
Example #4
Source File: HiveMetaStoreUtils.java    From incubator-gobblin with Apache License 2.0
private static StorageDescriptor getStorageDescriptor(HiveRegistrationUnit unit) {
  State props = unit.getStorageProps();
  StorageDescriptor sd = new StorageDescriptor();
  sd.setParameters(getParameters(props));
  // Treat AVRO and other formats differently. Details can be found in GOBBLIN-877
  if (unit.isRegisterSchema() ||
      (unit.getInputFormat().isPresent() && !unit.getInputFormat().get().equals(AvroContainerInputFormat.class.getName()))) {
    sd.setCols(getFieldSchemas(unit));
  }
  if (unit.getLocation().isPresent()) {
    sd.setLocation(unit.getLocation().get());
  }
  if (unit.getInputFormat().isPresent()) {
    sd.setInputFormat(unit.getInputFormat().get());
  }
  if (unit.getOutputFormat().isPresent()) {
    sd.setOutputFormat(unit.getOutputFormat().get());
  }
  if (unit.getIsCompressed().isPresent()) {
    sd.setCompressed(unit.getIsCompressed().get());
  }
  if (unit.getNumBuckets().isPresent()) {
    sd.setNumBuckets(unit.getNumBuckets().get());
  }
  if (unit.getBucketColumns().isPresent()) {
    sd.setBucketCols(unit.getBucketColumns().get());
  }
  if (unit.getIsStoredAsSubDirs().isPresent()) {
    sd.setStoredAsSubDirectories(unit.getIsStoredAsSubDirs().get());
  }
  sd.setSerdeInfo(getSerDeInfo(unit));
  return sd;
}
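The GOBBLIN-877 condition above deliberately leaves explicit columns off the StorageDescriptor for Avro units that do not register a schema, since the schema travels in the avro.schema.literal serde property instead (the tests below show getTable deriving columns from that literal). A sketch of the decision in isolation, with illustrative values:

// Illustrative-only: an Avro unit that does not explicitly register a schema.
boolean registerSchema = false;                      // unit.isRegisterSchema()
String inputFormat = AvroContainerInputFormat.class.getName();

boolean setExplicitCols =
    registerSchema || !inputFormat.equals(AvroContainerInputFormat.class.getName());
// setExplicitCols == false: Hive reads the schema from avro.schema.literal.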
 
Example #5
Source File: HiveMetaStoreUtilsTest.java    From incubator-gobblin with Apache License 2.0
@Test
public void testGetTableAvro() {
  final String databaseName = "testdb";
  final String tableName = "testtable";

  HiveTable.Builder builder = new HiveTable.Builder();

  builder.withDbName(databaseName).withTableName(tableName);

  State serdeProps = new State();
  serdeProps.setProp("avro.schema.literal", "{\"type\": \"record\", \"name\": \"TestEvent\","
      + " \"namespace\": \"test.namespace\", \"fields\": [{\"name\":\"a\"," + " \"type\": \"int\"}]}");
  builder.withSerdeProps(serdeProps);

  HiveTable hiveTable = builder.build();
  hiveTable.setInputFormat(AvroContainerInputFormat.class.getName());
  hiveTable.setOutputFormat(AvroContainerOutputFormat.class.getName());
  hiveTable.setSerDeType(AvroSerDe.class.getName());

  Table table = HiveMetaStoreUtils.getTable(hiveTable);
  Assert.assertEquals(table.getDbName(), databaseName);
  Assert.assertEquals(table.getTableName(), tableName);

  StorageDescriptor sd = table.getSd();
  Assert.assertEquals(sd.getInputFormat(), AvroContainerInputFormat.class.getName());
  Assert.assertEquals(sd.getOutputFormat(), AvroContainerOutputFormat.class.getName());
  Assert.assertNotNull(sd.getSerdeInfo());
  Assert.assertEquals(sd.getSerdeInfo().getSerializationLib(), AvroSerDe.class.getName());

  List<FieldSchema> fields = sd.getCols();
  Assert.assertTrue(fields != null && fields.size() == 1);
  FieldSchema fieldA = fields.get(0);
  Assert.assertEquals(fieldA.getName(), "a");
  Assert.assertEquals(fieldA.getType(), "int");
}
 
Example #6
Source File: HiveMetaStoreUtilsTest.java    From incubator-gobblin with Apache License 2.0
@Test
public void testGetTableAvroInvalidSchema() {
  final String databaseName = "testdb";
  final String tableName = "testtable";

  HiveTable.Builder builder = new HiveTable.Builder();

  builder.withDbName(databaseName).withTableName(tableName);

  State serdeProps = new State();
  serdeProps.setProp("avro.schema.literal", "invalid schema");
  builder.withSerdeProps(serdeProps);

  HiveTable hiveTable = builder.build();
  hiveTable.setInputFormat(AvroContainerInputFormat.class.getName());
  hiveTable.setOutputFormat(AvroContainerOutputFormat.class.getName());
  hiveTable.setSerDeType(AvroSerDe.class.getName());

  Table table = HiveMetaStoreUtils.getTable(hiveTable);
  Assert.assertEquals(table.getDbName(), databaseName);
  Assert.assertEquals(table.getTableName(), tableName);

  StorageDescriptor sd = table.getSd();
  Assert.assertEquals(sd.getInputFormat(), AvroContainerInputFormat.class.getName());
  Assert.assertEquals(sd.getOutputFormat(), AvroContainerOutputFormat.class.getName());
  Assert.assertNotNull(sd.getSerdeInfo());
  Assert.assertEquals(sd.getSerdeInfo().getSerializationLib(), AvroSerDe.class.getName());

  List<FieldSchema> fields = sd.getCols();
  Assert.assertTrue(fields != null && fields.size() == 0);
}
 
Example #7
Source File: HiveMetaStoreUtilsTest.java    From incubator-gobblin with Apache License 2.0
@Test
public void testGetHiveTable() throws Exception {
  final String databaseName = "testdb";
  final String tableName = "testtable";
  final String tableSdLoc = "/tmp/testtable";
  final String partitionName = "partitionName";

  State serdeProps = new State();
  serdeProps.setProp("avro.schema.literal", "{\"type\": \"record\", \"name\": \"TestEvent\","
          + " \"namespace\": \"test.namespace\", \"fields\": [{\"name\":\"testName\"," + " \"type\": \"int\"}]}");

  List<FieldSchema> fieldSchemas = new ArrayList<>();
  fieldSchemas.add(new FieldSchema("testName","int", "testContent"));
  SerDeInfo si = new SerDeInfo();
  si.setParameters(getParameters(serdeProps));
  si.setName(tableName);

  StorageDescriptor sd = new StorageDescriptor(fieldSchemas, tableSdLoc,
          AvroContainerInputFormat.class.getName(), AvroContainerOutputFormat.class.getName(),
          false, 0, si, null, Lists.<Order>newArrayList(), null);
  sd.setParameters(getParameters(serdeProps));
  Table table = new Table(tableName, databaseName, "testOwner", 0, 0, 0, sd,
          Lists.<FieldSchema>newArrayList(), Maps.<String,String>newHashMap(), "", "", "");
  table.addToPartitionKeys(new FieldSchema(partitionName, "string", "some comment"));

  HiveTable hiveTable = HiveMetaStoreUtils.getHiveTable(table);
  Assert.assertEquals(hiveTable.getDbName(), databaseName);
  Assert.assertEquals(hiveTable.getTableName(), tableName);

  Assert.assertTrue(hiveTable.getInputFormat().isPresent());
  Assert.assertTrue(hiveTable.getOutputFormat().isPresent());
  Assert.assertEquals(hiveTable.getInputFormat().get(), AvroContainerInputFormat.class.getName());
  Assert.assertEquals(hiveTable.getOutputFormat().get(), AvroContainerOutputFormat.class.getName());
  Assert.assertNotNull(hiveTable.getSerDeType());

  List<HiveRegistrationUnit.Column> fields = hiveTable.getColumns();
  Assert.assertTrue(fields != null && fields.size() == 1);
  HiveRegistrationUnit.Column fieldA = fields.get(0);
  Assert.assertEquals(fieldA.getName(), "testName");
  Assert.assertEquals(fieldA.getType(), "int");

}