org.apache.parquet.schema.PrimitiveType Java Examples

The following examples show how to use org.apache.parquet.schema.PrimitiveType. Each example is taken from an open source project; the source file, project name, and license are listed above it, and the links above each example lead back to the original project or source file.
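Before the project examples, here is a minimal, self-contained sketch of the class on its own: it builds a few PrimitiveType fields (directly and via the Types builder) and assembles them into a MessageType. The class name, schema name, and field names below are illustrative, not taken from any of the projects listed.

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type;
import org.apache.parquet.schema.Types;

public class PrimitiveTypeSketch {
  public static void main(String[] args) {
    // Direct constructor: repetition, physical type, field name
    PrimitiveType id = new PrimitiveType(Type.Repetition.REQUIRED, PrimitiveTypeName.INT32, "id");

    // Constructor with an OriginalType annotation (UTF8 marks the binary field as a string)
    PrimitiveType name = new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.BINARY, "name", OriginalType.UTF8);

    // The same kind of field built with the Types builder API
    PrimitiveType score = Types.optional(PrimitiveTypeName.DOUBLE).named("score");

    // Assemble the fields into a message (file) schema and print it
    MessageType schema = new MessageType("example_schema", id, name, score);
    System.out.println(schema);
  }
}

Running this prints the schema in Parquet's textual form, along the lines of: message example_schema { required int32 id; optional binary name (UTF8); optional double score; }.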
Example #1
Source File: Metadata_V2.java    From Bats with Apache License 2.0
@Override
public void serialize(ColumnMetadata_v2 value, JsonGenerator jgen, SerializerProvider provider)
    throws IOException, JsonProcessingException {
  jgen.writeStartObject();
  jgen.writeArrayFieldStart("name");
  for (String n : value.name) {
    jgen.writeString(n);
  }
  jgen.writeEndArray();
  if (value.mxValue != null) {
    Object val;
    if (value.primitiveType == PrimitiveType.PrimitiveTypeName.BINARY && value.mxValue != null) {
      val = new String(((Binary) value.mxValue).getBytes());
    } else {
      val = value.mxValue;
    }
    jgen.writeObjectField("mxValue", val);
  }
  if (value.nulls != null) {
    jgen.writeObjectField("nulls", value.nulls);
  }
  jgen.writeEndObject();
}
 
Example #2
Source File: ParquetResolverTest.java    From pxf with Apache License 2.0
private MessageType getParquetSchemaForPrimitiveTypes(Type.Repetition repetition, boolean readCase) {
    List<Type> fields = new ArrayList<>();

    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.BINARY, "s1", OriginalType.UTF8));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.BINARY, "s2", OriginalType.UTF8));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.INT32, "n1", null));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.DOUBLE, "d1", null));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, 16, "dc1", OriginalType.DECIMAL, new DecimalMetadata(38, 18), null));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.INT96, "tm", null));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.FLOAT, "f", null));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.INT64, "bg", null));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.BOOLEAN, "b", null));

    // GPDB has int16 but no int8 type, so when writing, tiny numbers (int8) are still treated as smallint (int16)
    OriginalType tinyType = readCase ? OriginalType.INT_8 : OriginalType.INT_16;
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.INT32, "tn", tinyType));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.INT32, "sml", OriginalType.INT_16));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.BINARY, "vc1", OriginalType.UTF8));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.BINARY, "c1", OriginalType.UTF8));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.BINARY, "bin", null));

    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.INT96, "tmtz", null));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.INT96, "tmtz2", null));

    return new MessageType("hive_schema", fields);
}
 
Example #3
Source File: AvroSchemaConverter190Int96Avro18.java    From datacollector with Apache License 2.0
private Schema addLogicalTypeToSchema(
    Schema schema,
    OriginalType annotation,
    PrimitiveType asPrimitive,
    PrimitiveType.PrimitiveTypeName parquetPrimitiveTypeName
) {
  LogicalType logicalType = convertOriginalTypeToLogicalType(
      annotation, asPrimitive.getDecimalMetadata());
  if (logicalType != null && (annotation != DECIMAL ||
      parquetPrimitiveTypeName == BINARY ||
      parquetPrimitiveTypeName == FIXED_LEN_BYTE_ARRAY)) {
    schema = logicalType.addToSchema(schema);
  }

  return schema;
}
 
Example #4
Source File: MessageTypeConverter.java    From presto with Apache License 2.0
private static org.apache.parquet.format.Type getType(PrimitiveType.PrimitiveTypeName type)
{
    switch (type) {
        case INT64:
            return Type.INT64;
        case INT32:
            return Type.INT32;
        case BOOLEAN:
            return Type.BOOLEAN;
        case BINARY:
            return Type.BYTE_ARRAY;
        case FLOAT:
            return Type.FLOAT;
        case DOUBLE:
            return Type.DOUBLE;
        case INT96:
            return Type.INT96;
        case FIXED_LEN_BYTE_ARRAY:
            return Type.FIXED_LEN_BYTE_ARRAY;
        default:
            throw new RuntimeException("Unknown primitive type " + type);
    }
}
 
Example #5
Source File: TestParquetPredicateUtils.java    From presto with Apache License 2.0
@Test
public void testParquetTupleDomainMap()
{
    MapType mapType = new MapType(
            INTEGER,
            INTEGER,
            methodHandle(TestParquetPredicateUtils.class, "throwUnsupportedOperationException"),
            methodHandle(TestParquetPredicateUtils.class, "throwUnsupportedOperationException"),
            methodHandle(TestParquetPredicateUtils.class, "throwUnsupportedOperationException"),
            methodHandle(TestParquetPredicateUtils.class, "throwUnsupportedOperationException"));

    HiveColumnHandle columnHandle = createBaseColumn("my_map", 0, HiveType.valueOf("map<int,int>"), mapType, REGULAR, Optional.empty());

    TupleDomain<HiveColumnHandle> domain = withColumnDomains(ImmutableMap.of(columnHandle, Domain.notNull(mapType)));

    MessageType fileSchema = new MessageType("hive_schema",
            new GroupType(OPTIONAL, "my_map",
                    new GroupType(REPEATED, "map",
                            new PrimitiveType(REQUIRED, INT32, "key"),
                            new PrimitiveType(OPTIONAL, INT32, "value"))));

    Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, fileSchema);
    TupleDomain<ColumnDescriptor> tupleDomain = getParquetTupleDomain(descriptorsByPath, domain, fileSchema, true);
    assertTrue(tupleDomain.isAll());
}
 
Example #6
Source File: Metadata.java    From Bats with Apache License 2.0
private ColTypeInfo getColTypeInfo(MessageType schema, Type type, String[] path, int depth) {
  if (type.isPrimitive()) {
    PrimitiveType primitiveType = (PrimitiveType) type;
    int precision = 0;
    int scale = 0;
    if (primitiveType.getDecimalMetadata() != null) {
      precision = primitiveType.getDecimalMetadata().getPrecision();
      scale = primitiveType.getDecimalMetadata().getScale();
    }

    int repetitionLevel = schema.getMaxRepetitionLevel(path);
    int definitionLevel = schema.getMaxDefinitionLevel(path);

    return new ColTypeInfo(type.getOriginalType(), precision, scale, repetitionLevel, definitionLevel);
  }
  Type t = ((GroupType) type).getType(path[depth]);
  return getColTypeInfo(schema, t, path, depth + 1);
}
 
Example #7
Source File: TestMetadataReader.java    From presto with Apache License 2.0
@Test(dataProvider = "allCreatedBy")
public void testReadStatsInt64(Optional<String> fileCreatedBy)
{
    Statistics statistics = new Statistics();
    statistics.setNull_count(13);
    statistics.setMin(fromHex("F6FFFFFFFFFFFFFF"));
    statistics.setMax(fromHex("3AA4000000000000"));
    assertThat(MetadataReader.readStats(fileCreatedBy, Optional.of(statistics), new PrimitiveType(OPTIONAL, INT64, "Test column")))
            .isInstanceOfSatisfying(LongStatistics.class, columnStatistics -> {
                assertEquals(columnStatistics.getNumNulls(), 13);
                assertEquals(columnStatistics.getMin(), -10);
                assertEquals(columnStatistics.getMax(), 42042);
                assertEquals(columnStatistics.genericGetMin(), (Long) (long) -10L);
                assertEquals(columnStatistics.genericGetMax(), (Long) 42042L);
            });
}
 
Example #8
Source File: TestParquetPredicateUtils.java    From presto with Apache License 2.0
@Test
public void testParquetTupleDomainStruct()
{
    RowType rowType = rowType(
            RowType.field("a", INTEGER),
            RowType.field("b", INTEGER));

    HiveColumnHandle columnHandle = createBaseColumn("my_struct", 0, HiveType.valueOf("struct<a:int,b:int>"), rowType, REGULAR, Optional.empty());
    TupleDomain<HiveColumnHandle> domain = withColumnDomains(ImmutableMap.of(columnHandle, Domain.notNull(rowType)));

    MessageType fileSchema = new MessageType("hive_schema",
            new GroupType(OPTIONAL, "my_struct",
                    new PrimitiveType(OPTIONAL, INT32, "a"),
                    new PrimitiveType(OPTIONAL, INT32, "b")));
    Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, fileSchema);
    TupleDomain<ColumnDescriptor> tupleDomain = getParquetTupleDomain(descriptorsByPath, domain, fileSchema, true);
    assertTrue(tupleDomain.isAll());
}
 
Example #9
Source File: TestParquetPredicateUtils.java    From presto with Apache License 2.0
@Test
public void testParquetTupleDomainPrimitive()
{
    HiveColumnHandle columnHandle = createBaseColumn("my_primitive", 0, HiveType.valueOf("bigint"), BIGINT, REGULAR, Optional.empty());
    Domain singleValueDomain = Domain.singleValue(BIGINT, 123L);
    TupleDomain<HiveColumnHandle> domain = withColumnDomains(ImmutableMap.of(columnHandle, singleValueDomain));

    MessageType fileSchema = new MessageType("hive_schema", new PrimitiveType(OPTIONAL, INT64, "my_primitive"));

    Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, fileSchema);
    TupleDomain<ColumnDescriptor> tupleDomain = getParquetTupleDomain(descriptorsByPath, domain, fileSchema, true);

    assertEquals(tupleDomain.getDomains().get().size(), 1);
    ColumnDescriptor descriptor = tupleDomain.getDomains().get().keySet().iterator().next();
    assertEquals(descriptor.getPath().length, 1);
    assertEquals(descriptor.getPath()[0], "my_primitive");

    Domain predicateDomain = Iterables.getOnlyElement(tupleDomain.getDomains().get().values());
    assertEquals(predicateDomain, singleValueDomain);
}
 
Example #10
Source File: TestColumnIO.java    From parquet-mr with Apache License 2.0
@Test
public void testReadUsingSchemaWithRequiredFieldThatWasOptional() {
  MessageType originalSchema = new MessageType("schema",
          new PrimitiveType(OPTIONAL, INT32, "e"));
  MemPageStore store = new MemPageStore(1);
  SimpleGroupFactory groupFactory = new SimpleGroupFactory(originalSchema);
  writeGroups(originalSchema, store, groupFactory.newGroup().append("e", 4));

  try {
    MessageType schemaWithRequiredFieldThatWasOptional = new MessageType("schema",
            new PrimitiveType(REQUIRED, INT32, "e")); // Incompatible schema: required when it was optional
    readGroups(store, originalSchema, schemaWithRequiredFieldThatWasOptional, 1);
    fail("should have thrown an incompatible schema exception");
  } catch (ParquetDecodingException e) {
    assertEquals("The requested schema is not compatible with the file schema. incompatible types: required int32 e != optional int32 e", e.getMessage());
  }
}
 
Example #11
Source File: HiveClientTest.java    From garmadon with Apache License 2.0
@Test
public void createTableWithoutIssue() throws SQLException {
    PrimitiveType appId = new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveType.PrimitiveTypeName.BINARY, "app_id");

    MessageType schema = new MessageType("fs", appId);

    String table = "fs";
    String location = "file:" + hdfsTemp + "/garmadon_database/fs";
    HiveClient hiveClient = new HiveClient(driverName, "jdbc:hive2://localhost:" + port, "garmadon",
        hdfsTemp + "/garmadon_database");
    hiveClient.createTableIfNotExist(table, schema, location);

    HashMap<String, String> result = getResultHashTableDesc(hiveClient, table);
    assertEquals(location, result.get("Location"));
    assertEquals("EXTERNAL_TABLE", result.get("Table Type").trim());
    assertEquals("string", result.get("day"));
    assertEquals("string", result.get("app_id"));
}
 
Example #12
Source File: HiveClientTest.java    From garmadon with Apache License 2.0
@Test
public void shouldProvideHiveTypeFromParquetType() throws Exception {
    HiveClient hiveClient = new HiveClient(driverName, "jdbc:hive2://localhost:" + port, "garmadon",
        hdfsTemp + "/garmadon_database");

    PrimitiveType string = new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveType.PrimitiveTypeName.BINARY, "name");
    assertEquals("string", hiveClient.inferHiveType(string));

    PrimitiveType array_string = new PrimitiveType(Type.Repetition.REPEATED, PrimitiveType.PrimitiveTypeName.BINARY, "name");
    assertEquals("array<string>", hiveClient.inferHiveType(array_string));

    PrimitiveType int32 = new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveType.PrimitiveTypeName.INT32, "name");
    assertEquals("int", hiveClient.inferHiveType(int32));

    PrimitiveType int64 = new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveType.PrimitiveTypeName.INT64, "name");
    assertEquals("bigint", hiveClient.inferHiveType(int64));

    PrimitiveType floatz = new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveType.PrimitiveTypeName.FLOAT, "name");
    assertEquals("float", hiveClient.inferHiveType(floatz));

    PrimitiveType doublez = new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveType.PrimitiveTypeName.DOUBLE, "name");
    assertEquals("double", hiveClient.inferHiveType(doublez));

    PrimitiveType booleanz = new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveType.PrimitiveTypeName.BOOLEAN, "name");
    assertEquals("boolean", hiveClient.inferHiveType(booleanz));
}
 
Example #13
Source File: ParquetMetricsRowGroupFilter.java    From iceberg with Apache License 2.0
private boolean eval(MessageType fileSchema, BlockMetaData rowGroup) {
  if (rowGroup.getRowCount() <= 0) {
    return ROWS_CANNOT_MATCH;
  }

  this.stats = Maps.newHashMap();
  this.valueCounts = Maps.newHashMap();
  this.conversions = Maps.newHashMap();
  for (ColumnChunkMetaData col : rowGroup.getColumns()) {
    PrimitiveType colType = fileSchema.getType(col.getPath().toArray()).asPrimitiveType();
    if (colType.getId() != null) {
      int id = colType.getId().intValue();
      stats.put(id, col.getStatistics());
      valueCounts.put(id, col.getValueCount());
      conversions.put(id, ParquetConversions.converterFromParquet(colType));
    }
  }

  return ExpressionVisitors.visitEvaluator(expr, this);
}
 
Example #14
Source File: ParquetRecordWriter.java    From Bats with Apache License 2.0
protected PrimitiveType getPrimitiveType(MaterializedField field) {
  MinorType minorType = field.getType().getMinorType();
  String name = field.getName();
  int length = ParquetTypeHelper.getLengthForMinorType(minorType);
  PrimitiveTypeName primitiveTypeName = ParquetTypeHelper.getPrimitiveTypeNameForMinorType(minorType);
  if (Types.isDecimalType(minorType)) {
    primitiveTypeName = logicalTypeForDecimals;
    if (usePrimitiveTypesForDecimals) {
      if (field.getPrecision() <= ParquetTypeHelper.getMaxPrecisionForPrimitiveType(PrimitiveTypeName.INT32)) {
        primitiveTypeName = PrimitiveTypeName.INT32;
      } else if (field.getPrecision() <= ParquetTypeHelper.getMaxPrecisionForPrimitiveType(PrimitiveTypeName.INT64)) {
        primitiveTypeName = PrimitiveTypeName.INT64;
      }
    }

    length = DecimalUtility.getMaxBytesSizeForPrecision(field.getPrecision());
  }

  Repetition repetition = ParquetTypeHelper.getRepetitionForDataMode(field.getDataMode());
  OriginalType originalType = ParquetTypeHelper.getOriginalTypeForMinorType(minorType);
  DecimalMetadata decimalMetadata = ParquetTypeHelper.getDecimalMetadataForField(field);
  return new PrimitiveType(repetition, primitiveTypeName, length, name, originalType, decimalMetadata, null);
}
 
Example #15
Source File: FixedLenBytesColumnReader.java    From flink with Apache License 2.0
public FixedLenBytesColumnReader(
		ColumnDescriptor descriptor,
		PageReader pageReader,
		int precision) throws IOException {
	super(descriptor, pageReader);
	checkTypeName(PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY);
	this.precision = precision;
}
 
Example #16
Source File: MetadataUtils.java    From parquet-mr with Apache License 2.0
private static void showDetails(PrettyPrintWriter out, Type type, int depth, MessageType container, List<String> cpath) {
  if (type instanceof GroupType) {
    showDetails(out, type.asGroupType(), depth, container, cpath);
    return;
  } else if (type instanceof PrimitiveType) {
    showDetails(out, type.asPrimitiveType(), depth, container, cpath);
    return;
  }
}
 
Example #17
Source File: ParquetColumnMetadata.java    From Bats with Apache License 2.0
/**
 * Returns the data type length for a given {@link ColumnDescriptor} and its corresponding
 * {@link SchemaElement}. Neither is enough information alone, as the max
 * repetition level (indicating if it is an array type) is in the ColumnDescriptor and
 * the length of a fixed width field is stored at the schema level.
 *
 * @return the length if fixed width, else <tt>UNDEFINED_LENGTH</tt> (-1)
 */
public int getDataTypeLength() {
  if (! isFixedLength()) {
    return UNDEFINED_LENGTH;
  } else if (isRepeated()) {
    return UNDEFINED_LENGTH;
  } else if (column.getType() == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
    return se.getType_length() * 8;
  } else {
    return getTypeLengthInBits(column.getType());
  }
}
 
Example #18
Source File: TupleConverter.java    From parquet-mr with Apache License 2.0
@Override
final public void start() {
  currentTuple = TF.newTuple(schemaSize);
  if (elephantBirdCompatible) {
    try {
      int i = 0;
      for (Type field : parquetSchema.getFields()) {
        if (field.isPrimitive() && field.isRepetition(Repetition.OPTIONAL)) {
          PrimitiveType primitiveType = field.asPrimitiveType();
          switch (primitiveType.getPrimitiveTypeName()) {
          case INT32:
            currentTuple.set(i, I32_ZERO);
            break;
          case INT64:
            currentTuple.set(i, I64_ZERO);
            break;
          case FLOAT:
            currentTuple.set(i, FLOAT_ZERO);
            break;
          case DOUBLE:
            currentTuple.set(i, DOUBLE_ZERO);
            break;
          case BOOLEAN:
            currentTuple.set(i, I32_ZERO);
            break;
          }
        }
        ++ i;
      }
    } catch (ExecException e) {
      throw new RuntimeException(e);
    }
  }
}
 
Example #19
Source File: FixedBinaryTestUtils.java    From parquet-mr with Apache License 2.0
public static Binary getFixedBinary(PrimitiveType type, BigInteger bigInt) {
  switch (type.getPrimitiveTypeName()) {
  case FIXED_LEN_BYTE_ARRAY:
    return getFixedBinary(type.getTypeLength(), bigInt);
  case INT96:
    return getFixedBinary(12, bigInt);
  case BINARY:
    return Binary.fromConstantByteArray(bigInt.toByteArray());
  default:
    throw new IllegalArgumentException("Type " + type + " cannot be represented by a Binary");
  }
}
 
Example #20
Source File: TestTupleDomainParquetPredicate.java    From presto with Apache License 2.0
@Test
public void testVarcharMatchesWithStatistics()
        throws ParquetCorruptionException
{
    String value = "Test";
    ColumnDescriptor columnDescriptor = new ColumnDescriptor(new String[] {"path"}, BINARY, 0, 0);
    RichColumnDescriptor column = new RichColumnDescriptor(columnDescriptor, new PrimitiveType(OPTIONAL, BINARY, "Test column"));
    TupleDomain<ColumnDescriptor> effectivePredicate = getEffectivePredicate(column, createVarcharType(255), utf8Slice(value));
    TupleDomainParquetPredicate parquetPredicate = new TupleDomainParquetPredicate(effectivePredicate, singletonList(column));
    Statistics<?> stats = getStatsBasedOnType(column.getPrimitiveType().getPrimitiveTypeName());
    stats.setNumNulls(1L);
    stats.setMinMaxFromBytes(value.getBytes(UTF_8), value.getBytes(UTF_8));
    assertTrue(parquetPredicate.matches(2, ImmutableMap.of(column, stats), ID, true));
}
 
Example #21
Source File: TestBinaryTruncator.java    From parquet-mr with Apache License 2.0
private void testTruncator(PrimitiveType type, boolean strict) {
  BinaryTruncator truncator = BinaryTruncator.getTruncator(type);
  Comparator<Binary> comparator = type.comparator();

  checkContract(truncator, comparator, Binary.fromString("aaaaaaaaaa"), strict, strict);
  checkContract(truncator, comparator, Binary.fromString("árvíztűrő tükörfúrógép"), strict, strict);
  checkContract(truncator, comparator, Binary.fromString("aaaaaaaaaa" + UTF8_3BYTES_MAX_CHAR), strict, strict);
  checkContract(truncator, comparator, Binary.fromString("a" + UTF8_3BYTES_MAX_CHAR + UTF8_1BYTE_MAX_CHAR), strict,
      strict);

  checkContract(truncator, comparator,
      Binary.fromConstantByteArray(new byte[] { (byte) 0xFE, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, (byte) 0xFF }), strict,
      strict);

  // Edge case: zero length -> unable to truncate
  checkContract(truncator, comparator, Binary.fromString(""), false, false);
  // Edge case: containing only UTF-8 max characters -> unable to truncate for max
  checkContract(truncator, comparator, Binary.fromString(
      UTF8_1BYTE_MAX_CHAR +
          UTF8_4BYTES_MAX_CHAR +
          UTF8_3BYTES_MAX_CHAR +
          UTF8_4BYTES_MAX_CHAR +
          UTF8_2BYTES_MAX_CHAR +
          UTF8_3BYTES_MAX_CHAR +
          UTF8_3BYTES_MAX_CHAR +
          UTF8_1BYTE_MAX_CHAR +
          UTF8_2BYTES_MAX_CHAR +
          UTF8_3BYTES_MAX_CHAR +
          UTF8_4BYTES_MAX_CHAR),
      strict, false);
  // Edge case: non-UTF-8; max bytes -> unable to truncate for max
  checkContract(
      truncator, comparator,
      binary(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF),
      strict, false);
}
 
Example #22
Source File: TestTupleDomainParquetPredicate.java    From presto with Apache License 2.0
@Test
public void testVarcharMatchesWithDictionaryDescriptor()
{
    ColumnDescriptor columnDescriptor = new ColumnDescriptor(new String[] {"path"}, BINARY, 0, 0);
    RichColumnDescriptor column = new RichColumnDescriptor(columnDescriptor, new PrimitiveType(OPTIONAL, BINARY, "Test column"));
    TupleDomain<ColumnDescriptor> effectivePredicate = getEffectivePredicate(column, createVarcharType(255), EMPTY_SLICE);
    TupleDomainParquetPredicate parquetPredicate = new TupleDomainParquetPredicate(effectivePredicate, singletonList(column));
    DictionaryPage page = new DictionaryPage(Slices.wrappedBuffer(new byte[] {0, 0, 0, 0}), 1, PLAIN_DICTIONARY);
    assertTrue(parquetPredicate.matches(new DictionaryDescriptor(column, Optional.of(page))));
}
 
Example #23
Source File: HiveClientTest.java    From garmadon with Apache License 2.0
@Test(expected = Exception.class)
public void shouldThrowExceptionForUnknownParquetType() throws Exception {
    HiveClient hiveClient = new HiveClient(driverName, "jdbc:hive2://localhost:" + port, "garmadon",
        hdfsTemp + "/garmadon_database");

    PrimitiveType unsupported = new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveType.PrimitiveTypeName.INT96, "unsupported");
    hiveClient.inferHiveType(unsupported);
}
 
Example #24
Source File: ParquetUtil.java    From iceberg with Apache License 2.0
public static boolean isIntType(PrimitiveType primitiveType) {
  if (primitiveType.getOriginalType() != null) {
    switch (primitiveType.getOriginalType()) {
      case INT_8:
      case INT_16:
      case INT_32:
      case DATE:
        return true;
      default:
        return false;
    }
  }
  return primitiveType.getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.INT32;
}
 
Example #25
Source File: TestMetadataReader.java    From presto with Apache License 2.0
@Test(dataProvider = "allCreatedBy")
public void testReadNullStats(Optional<String> fileCreatedBy)
{
    // integer
    assertThat(MetadataReader.readStats(fileCreatedBy, Optional.empty(), new PrimitiveType(OPTIONAL, INT32, "Test column")))
            .isInstanceOfSatisfying(
                    IntStatistics.class,
                    columnStatistics -> assertTrue(columnStatistics.isEmpty()));

    // bigint
    assertThat(MetadataReader.readStats(fileCreatedBy, Optional.empty(), new PrimitiveType(OPTIONAL, INT64, "Test column")))
            .isInstanceOfSatisfying(
                    LongStatistics.class,
                    columnStatistics -> assertTrue(columnStatistics.isEmpty()));

    // varchar
    assertThat(MetadataReader.readStats(fileCreatedBy, Optional.empty(), new PrimitiveType(OPTIONAL, BINARY, "Test column", OriginalType.UTF8)))
            .isInstanceOfSatisfying(
                    BinaryStatistics.class,
                    columnStatistics -> assertTrue(columnStatistics.isEmpty()));

    // varbinary
    assertThat(MetadataReader.readStats(fileCreatedBy, Optional.empty(), new PrimitiveType(OPTIONAL, BINARY, "Test column")))
            .isInstanceOfSatisfying(
                    BinaryStatistics.class,
                    columnStatistics -> assertTrue(columnStatistics.isEmpty()));
}
 
Example #26
Source File: FloatColumnIndexBuilder.java    From parquet-mr with Apache License 2.0
@Override
ColumnIndexBase<Float> createColumnIndex(PrimitiveType type) {
  if (invalid) {
    return null;
  }
  FloatColumnIndex columnIndex = new FloatColumnIndex(type);
  columnIndex.minValues = minValues.toFloatArray();
  columnIndex.maxValues = maxValues.toFloatArray();
  return columnIndex;
}
 
Example #27
Source File: ThriftSchemaConvertVisitor.java    From parquet-mr with Apache License 2.0
private ConvertedField visitPrimitiveType(PrimitiveTypeName type, LogicalTypeAnnotation orig, State state) {
  PrimitiveBuilder<PrimitiveType> b = primitive(type, state.repetition);

  if (orig != null) {
    b = b.as(orig);
  }

  if (fieldProjectionFilter.keep(state.path)) {
    return new Keep(state.path, b.named(state.name));
  } else {
    return new Drop(state.path);
  }
}
 
Example #28
Source File: DataWritableReadSupport.java    From parquet-mr with Apache License 2.0
/**
 *
 * It creates the readContext for Parquet side with the requested schema during the init phase.
 *
 * @param configuration needed to get the wanted columns
 * @param keyValueMetaData // unused
 * @param fileSchema parquet file schema
 * @return the parquet ReadContext
 */
@Override
public org.apache.parquet.hadoop.api.ReadSupport.ReadContext init(final Configuration configuration,
    final Map<String, String> keyValueMetaData, final MessageType fileSchema) {
  final String columns = configuration.get(IOConstants.COLUMNS);
  final Map<String, String> contextMetadata = new HashMap<String, String>();
  if (columns != null) {
    final List<String> listColumns = getColumns(columns);

    final List<Type> typeListTable = new ArrayList<Type>();
    for (final String col : listColumns) {
      // listColumns contains partition columns which are metadata only
      if (fileSchema.containsField(col)) {
        typeListTable.add(fileSchema.getType(col));
      } else {
        // below allows schema evolution
        typeListTable.add(new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, col));
      }
    }
    MessageType tableSchema = new MessageType(TABLE_SCHEMA, typeListTable);
    contextMetadata.put(HIVE_SCHEMA_KEY, tableSchema.toString());

    MessageType requestedSchemaByUser = tableSchema;
    final List<Integer> indexColumnsWanted = ColumnProjectionUtils.getReadColumnIDs(configuration);

    final List<Type> typeListWanted = new ArrayList<Type>();
    for (final Integer idx : indexColumnsWanted) {
      typeListWanted.add(tableSchema.getType(listColumns.get(idx)));
    }
    requestedSchemaByUser = resolveSchemaAccess(new MessageType(fileSchema.getName(),
            typeListWanted), fileSchema, configuration);

    return new ReadContext(requestedSchemaByUser, contextMetadata);
  } else {
    contextMetadata.put(HIVE_SCHEMA_KEY, fileSchema.toString());
    return new ReadContext(fileSchema, contextMetadata);
  }
}
 
Example #29
Source File: Metadata_V3.java    From Bats with Apache License 2.0
public ColumnMetadata_v3(String[] name, PrimitiveType.PrimitiveTypeName primitiveType, Object minValue, Object maxValue, Long nulls) {
  this.name = name;
  this.minValue = minValue;
  this.maxValue = maxValue;
  this.nulls = nulls;
  this.primitiveType = primitiveType;
}
 
Example #30
Source File: ColumnIOFactory.java    From parquet-mr with Apache License 2.0
@Override
public void visit(PrimitiveType primitiveType) {
  if (!currentRequestedType.isPrimitive() || 
          (this.strictTypeChecking && currentRequestedType.asPrimitiveType().getPrimitiveTypeName() != primitiveType.getPrimitiveTypeName())) {
    incompatibleSchema(primitiveType, currentRequestedType);
  }
  PrimitiveColumnIO newIO = new PrimitiveColumnIO(primitiveType, current, currentRequestedIndex, leaves.size());
  current.add(newIO);
  leaves.add(newIO);
}