Java Code Examples for org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName#BINARY

The following examples show how to use org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName#BINARY. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: MetadataReader.java    From presto with Apache License 2.0 6 votes vote down vote up
/**
 * Maps a {@code Type} constant onto the {@link PrimitiveTypeName} constant that
 * represents the same physical type.
 *
 * <p>All constants map to the identically named {@code PrimitiveTypeName},
 * except {@code BYTE_ARRAY}, which maps to {@code BINARY}.
 *
 * @param type the type constant to translate
 * @return the corresponding {@code PrimitiveTypeName}
 * @throws IllegalArgumentException if {@code type} is not one of the handled constants
 */
private static PrimitiveTypeName getTypeName(Type type)
{
    final PrimitiveTypeName typeName;
    switch (type) {
        case BOOLEAN:
            typeName = PrimitiveTypeName.BOOLEAN;
            break;
        case INT32:
            typeName = PrimitiveTypeName.INT32;
            break;
        case INT64:
            typeName = PrimitiveTypeName.INT64;
            break;
        case INT96:
            typeName = PrimitiveTypeName.INT96;
            break;
        case FLOAT:
            typeName = PrimitiveTypeName.FLOAT;
            break;
        case DOUBLE:
            typeName = PrimitiveTypeName.DOUBLE;
            break;
        case BYTE_ARRAY:
            // the only constant whose name differs between the two enums
            typeName = PrimitiveTypeName.BINARY;
            break;
        case FIXED_LEN_BYTE_ARRAY:
            typeName = PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
            break;
        default:
            throw new IllegalArgumentException("Unknown type " + type);
    }
    return typeName;
}
 
Example 2
Source File: ParquetMetadataConverter.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Converts a {@code Type} constant into the equivalent {@link PrimitiveTypeName}.
 *
 * @param type the type constant to convert
 * @return the {@code PrimitiveTypeName} with the same meaning; {@code BYTE_ARRAY}
 *         becomes {@code BINARY}, every other constant keeps its name
 * @throws RuntimeException if {@code type} is not one of the handled constants
 */
public PrimitiveTypeName getPrimitive(Type type) {
  final PrimitiveTypeName primitive;
  switch (type) {
    case BOOLEAN:
      primitive = PrimitiveTypeName.BOOLEAN;
      break;
    case INT32:
      primitive = PrimitiveTypeName.INT32;
      break;
    case INT64:
      primitive = PrimitiveTypeName.INT64;
      break;
    case INT96:
      primitive = PrimitiveTypeName.INT96;
      break;
    case FLOAT:
      primitive = PrimitiveTypeName.FLOAT;
      break;
    case DOUBLE:
      primitive = PrimitiveTypeName.DOUBLE;
      break;
    case BYTE_ARRAY: // TODO: rename BINARY and remove this switch
      primitive = PrimitiveTypeName.BINARY;
      break;
    case FIXED_LEN_BYTE_ARRAY:
      primitive = PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
      break;
    default:
      throw new RuntimeException("Unknown type " + type);
  }
  return primitive;
}
 
Example 3
Source File: ParquetReaderUtility.java    From Bats with Apache License 2.0 5 votes vote down vote up
/**
 * Collects the names of all columns whose primitive type is BINARY or
 * FIXED_LEN_BYTE_ARRAY, as reported by the column type info list of the
 * given table metadata.
 *
 * <p>When the metadata exposes no column type info list (the getter returns
 * {@code null}), an empty set is returned. The original documentation of this
 * method states that the list is available for metadata versions v2 and v3
 * (including minor versions).
 *
 * @param parquetTableMetadata table metadata the source of the columns to check
 * @return set of column names, each name expressed as a list of its parts
 */
private static Set<List<String>> getBinaryColumnsNames(ParquetTableMetadataBase parquetTableMetadata) {
  Set<List<String>> binaryColumns = new HashSet<>();
  List<? extends MetadataBase.ColumnTypeMetadata> typeInfos = parquetTableMetadata.getColumnTypeInfoList();
  if (typeInfos == null) {
    // nothing to inspect
    return binaryColumns;
  }
  for (MetadataBase.ColumnTypeMetadata typeInfo : typeInfos) {
    if (typeInfo.getPrimitiveType() != PrimitiveTypeName.BINARY
            && typeInfo.getPrimitiveType() != PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
      continue;
    }
    binaryColumns.add(Arrays.asList(typeInfo.getName()));
  }
  return binaryColumns;
}
 
Example 4
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a minimal {@code ColumnChunkMetaData} fixture: a BINARY column at
 * path "foo", GZIP codec, no encodings, fresh binary statistics and all
 * numeric arguments set to zero.
 *
 * @return the newly created column chunk metadata
 */
private ColumnChunkMetaData createColumnChunkMetaData() {
  Set<org.apache.parquet.column.Encoding> encodings = new HashSet<org.apache.parquet.column.Encoding>();
  ColumnPath path = ColumnPath.get("foo");
  BinaryStatistics stats = new BinaryStatistics();
  return ColumnChunkMetaData.get(
          path,
          PrimitiveTypeName.BINARY,
          CompressionCodecName.GZIP,
          encodings,
          stats,
          0, 0, 0, 0, 0);
}
 
Example 5
Source File: HiveSchemaConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Converts a Hive {@code TypeInfo} into the corresponding Parquet {@code Type}.
 *
 * <p>Primitive categories are mapped directly onto a {@code PrimitiveType};
 * LIST, STRUCT and MAP categories are delegated to their dedicated converters.
 *
 * @param name       name to give the resulting Parquet field
 * @param typeInfo   Hive type description to convert
 * @param repetition repetition applied to primitive results
 * @return the Parquet type for {@code typeInfo}
 * @throws UnsupportedOperationException for binary, timestamp, void, unknown and union types
 * @throws IllegalArgumentException if the type or its category is not recognised
 */
private static Type convertType(final String name, final TypeInfo typeInfo, final Repetition repetition) {
  if (typeInfo.getCategory().equals(Category.PRIMITIVE)) {
    if (typeInfo.equals(TypeInfoFactory.stringTypeInfo)) {
      return new PrimitiveType(repetition, PrimitiveTypeName.BINARY, name);
    }
    // int, short and byte all share the 32-bit integer representation
    if (typeInfo.equals(TypeInfoFactory.intTypeInfo)
        || typeInfo.equals(TypeInfoFactory.shortTypeInfo)
        || typeInfo.equals(TypeInfoFactory.byteTypeInfo)) {
      return new PrimitiveType(repetition, PrimitiveTypeName.INT32, name);
    }
    if (typeInfo.equals(TypeInfoFactory.longTypeInfo)) {
      return new PrimitiveType(repetition, PrimitiveTypeName.INT64, name);
    }
    if (typeInfo.equals(TypeInfoFactory.doubleTypeInfo)) {
      return new PrimitiveType(repetition, PrimitiveTypeName.DOUBLE, name);
    }
    if (typeInfo.equals(TypeInfoFactory.floatTypeInfo)) {
      return new PrimitiveType(repetition, PrimitiveTypeName.FLOAT, name);
    }
    if (typeInfo.equals(TypeInfoFactory.booleanTypeInfo)) {
      return new PrimitiveType(repetition, PrimitiveTypeName.BOOLEAN, name);
    }
    if (typeInfo.equals(TypeInfoFactory.binaryTypeInfo)) {
      // TODO : binaryTypeInfo is a byte array. Need to map it
      throw new UnsupportedOperationException("Binary type not implemented");
    }
    if (typeInfo.equals(TypeInfoFactory.timestampTypeInfo)) {
      throw new UnsupportedOperationException("Timestamp type not implemented");
    }
    if (typeInfo.equals(TypeInfoFactory.voidTypeInfo)) {
      throw new UnsupportedOperationException("Void type not implemented");
    }
    if (typeInfo.equals(TypeInfoFactory.unknownTypeInfo)) {
      throw new UnsupportedOperationException("Unknown type not implemented");
    }
    throw new IllegalArgumentException("Unknown type: " + typeInfo);
  }

  if (typeInfo.getCategory().equals(Category.LIST)) {
    return convertArrayType(name, (ListTypeInfo) typeInfo);
  }
  if (typeInfo.getCategory().equals(Category.STRUCT)) {
    return convertStructType(name, (StructTypeInfo) typeInfo);
  }
  if (typeInfo.getCategory().equals(Category.MAP)) {
    return convertMapType(name, (MapTypeInfo) typeInfo);
  }
  if (typeInfo.getCategory().equals(Category.UNION)) {
    throw new UnsupportedOperationException("Union type not implemented");
  }
  throw new IllegalArgumentException("Unknown type: " + typeInfo);
}
 
Example 6
Source File: ParquetRecordWriter.java    From Bats with Apache License 2.0 4 votes vote down vote up
/**
 * Initialises this writer from the supplied option map.
 *
 * <p>Reads the output location and prefix, obtains the file system for the
 * current configuration, parses the block/page/dictionary-page sizes, resolves
 * the compression codec and the primitive type used for decimals, and reads
 * the boolean writer flags. When a single file-system block is requested the
 * block size is rounded up to a multiple of {@code BLOCKSIZE_MULTIPLE}.
 *
 * <p>Option values for the codec and the decimal type are matched
 * case-insensitively; lower-casing uses {@code Locale.ROOT} so that the match
 * does not depend on the JVM default locale (e.g. Turkish dotless i).
 *
 * @param writerOptions option name to option value map
 * @throws IOException declared by this method; presumably raised while obtaining the file system
 * @throws UnsupportedOperationException if the compression type or the logical
 *         type for decimals is not one of the supported values
 */
@Override
public void init(Map<String, String> writerOptions) throws IOException {
  this.location = writerOptions.get("location");
  this.prefix = writerOptions.get("prefix");

  fs = FileSystem.get(conf);
  blockSize = Integer.parseInt(writerOptions.get(ExecConstants.PARQUET_BLOCK_SIZE));
  pageSize = Integer.parseInt(writerOptions.get(ExecConstants.PARQUET_PAGE_SIZE));
  dictionaryPageSize = Integer.parseInt(writerOptions.get(ExecConstants.PARQUET_DICT_PAGE_SIZE));

  // Locale.ROOT keeps the case labels below reachable regardless of the default locale.
  String codecName = writerOptions.get(ExecConstants.PARQUET_WRITER_COMPRESSION_TYPE)
      .toLowerCase(java.util.Locale.ROOT);
  switch (codecName) {
    case "snappy":
      codec = CompressionCodecName.SNAPPY;
      break;
    case "lzo":
      codec = CompressionCodecName.LZO;
      break;
    case "gzip":
      codec = CompressionCodecName.GZIP;
      break;
    case "none":
    case "uncompressed":
      codec = CompressionCodecName.UNCOMPRESSED;
      break;
    default:
      throw new UnsupportedOperationException(String.format("Unknown compression type: %s", codecName));
  }

  String logicalTypeNameForDecimals = writerOptions.get(ExecConstants.PARQUET_WRITER_LOGICAL_TYPE_FOR_DECIMALS)
      .toLowerCase(java.util.Locale.ROOT);
  switch (logicalTypeNameForDecimals) {
    case "fixed_len_byte_array":
      logicalTypeForDecimals = PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
      break;
    case "binary":
      logicalTypeForDecimals = PrimitiveTypeName.BINARY;
      break;
    default:
      // Report the rejected decimal type itself (previously the codec name was printed here).
      throw new UnsupportedOperationException(
          String.format(
              "Unsupported logical type for decimals: %s\n" +
              "Supported types: ['fixed_len_byte_array', 'binary']", logicalTypeNameForDecimals));
  }

  enableDictionary = Boolean.parseBoolean(writerOptions.get(ExecConstants.PARQUET_WRITER_ENABLE_DICTIONARY_ENCODING));
  useSingleFSBlock = Boolean.parseBoolean(writerOptions.get(ExecConstants.PARQUET_WRITER_USE_SINGLE_FS_BLOCK));
  usePrimitiveTypesForDecimals = Boolean.parseBoolean(writerOptions.get(ExecConstants.PARQUET_WRITER_USE_PRIMITIVE_TYPES_FOR_DECIMALS));

  if (useSingleFSBlock) {
    // Round up blockSize to multiple of 64K.
    blockSize = (int)ceil((double)blockSize/BLOCKSIZE_MULTIPLE) * BLOCKSIZE_MULTIPLE;
  }
}
 
Example 7
Source File: LocalDictionariesReader.java    From dremio-oss with Apache License 2.0 4 votes vote down vote up
/**
 * Tells whether the given primitive type is one of the two byte-array backed
 * types, {@code BINARY} or {@code FIXED_LEN_BYTE_ARRAY}.
 *
 * @param type the primitive type to test; {@code null} yields {@code false}
 * @return {@code true} for BINARY and FIXED_LEN_BYTE_ARRAY, {@code false} otherwise
 */
private static boolean isBinaryType(PrimitiveTypeName type) {
  if (type == PrimitiveTypeName.BINARY) {
    return true;
  }
  return type == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
}
 
Example 8
Source File: CorruptStatistics.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Determines whether column statistics written by the application named in
 * {@code createdBy} (the created_by field from parquet format) must be
 * discarded because they may be corrupt.
 *
 * <p>Only BINARY and FIXED_LEN_BYTE_ARRAY columns are affected. For those,
 * statistics are ignored when created_by is missing, cannot be parsed, has no
 * semantic version, or identifies a parquet-mr version older than the
 * PARQUET-251 fix that is also outside the CDH 5 range carrying the fix.
 *
 * @param createdBy the created-by string from a file footer
 * @param columnType the type of the column that this is checking
 * @return true if the statistics may be invalid and should be ignored, false otherwise
 */
public static boolean shouldIgnoreStatistics(String createdBy, PrimitiveTypeName columnType) {

  boolean binaryColumn = columnType == PrimitiveTypeName.BINARY
      || columnType == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
  if (!binaryColumn) {
    // the bug only applies to binary columns
    return false;
  }

  if (Strings.isNullOrEmpty(createdBy)) {
    // created_by is not populated, which could have been caused by
    // parquet-mr during the same time as PARQUET-251, see PARQUET-297
    warnOnce("Ignoring statistics because created_by is null or empty! See PARQUET-251 and PARQUET-297");
    return true;
  }

  try {
    ParsedVersion parsed = VersionParser.parse(createdBy);

    if (!"parquet-mr".equals(parsed.application)) {
      // assume other applications don't have this bug
      return false;
    }

    if (Strings.isNullOrEmpty(parsed.version)) {
      warnOnce("Ignoring statistics because created_by did not contain a semver (see PARQUET-251): " + createdBy);
      return true;
    }

    SemanticVersion writerVersion = SemanticVersion.parse(parsed.version);

    if (writerVersion.compareTo(PARQUET_251_FIXED_VERSION) >= 0) {
      // this file was created after the fix
      return false;
    }

    // older than the upstream fix: still trustworthy if it falls inside
    // the CDH 5 version range that already contains the fix
    if (writerVersion.compareTo(CDH_5_PARQUET_251_FIXED_START) >= 0
        && writerVersion.compareTo(CDH_5_PARQUET_251_FIXED_END) < 0) {
      return false;
    }

    warnOnce("Ignoring statistics because this file was created prior to "
        + PARQUET_251_FIXED_VERSION
        + ", see PARQUET-251");
    return true;
  } catch (RuntimeException | SemanticVersionParseException | VersionParseException e) {
    // couldn't parse the created_by field, log what went wrong, don't trust the stats,
    // but don't make this fatal.
    warnParseErrorOnce(createdBy, e);
    return true;
  }
}
 
Example 9
Source File: ParquetMetadataConverter.java    From parquet-mr with Apache License 2.0 3 votes vote down vote up
/**
 * Decides whether signed-order min and max values may be used for a type.
 * Signed ordering is safe for string data that contains only ASCII
 * characters (sign bit 0). The method therefore requires that
 * {@code useSignedStringMinMax} is set, that the primitive type is BINARY,
 * and that the logical type annotation is either absent or one of
 * {@code STRING_TYPES}.
 *
 * @param type a primitive type with a logical type annotation
 * @return true if signed order min/max can be used with this type
 */
private boolean overrideSortOrderToSigned(PrimitiveType type) {
  LogicalTypeAnnotation logicalType = type.getLogicalTypeAnnotation();
  // even if the override is set, only return stats for string-ish types
  if (!useSignedStringMinMax) {
    return false;
  }
  if (PrimitiveTypeName.BINARY != type.getPrimitiveTypeName()) {
    return false;
  }
  // a null type annotation is considered string-ish because some writers
  // failed to use the UTF8 annotation.
  if (logicalType == null) {
    return true;
  }
  return STRING_TYPES.contains(logicalType.getClass());
}