Java Code Examples for org.apache.parquet.schema.OriginalType#DATE

The following examples show how to use org.apache.parquet.schema.OriginalType#DATE. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: AvroSchemaConverter190Int96Avro17.java    From datacollector with Apache License 2.0 7 votes vote down vote up
private OriginalType convertLogicalTypeStr(String logicalType) {
    // Maps an Avro logical-type name to the equivalent Parquet OriginalType.
    // A null input or an unrecognized name yields null (no annotation).
    if (logicalType != null) {
      if (AvroTypeUtil.LOGICAL_TYPE_DECIMAL.equals(logicalType)) {
        return OriginalType.DECIMAL;
      }
      if (AvroTypeUtil.LOGICAL_TYPE_DATE.equals(logicalType)) {
        return OriginalType.DATE;
      }
      if (AvroTypeUtil.LOGICAL_TYPE_TIME_MILLIS.equals(logicalType)) {
        return OriginalType.TIME_MILLIS;
      }
      if (AvroTypeUtil.LOGICAL_TYPE_TIMESTAMP_MILLIS.equals(logicalType)) {
        return OriginalType.TIMESTAMP_MILLIS;
      }
      // TIME_MICROS and TIMESTAMP_MICROS are deliberately not mapped by this
      // converter; such names fall through and return null.
    }
    return null;
  }
 
Example 2
Source File: AvroSchemaConverter190Int96Avro18.java    From datacollector with Apache License 2.0 6 votes vote down vote up
private OriginalType convertLogicalType(LogicalType logicalType) {
  // Translates an Avro LogicalType instance into the matching Parquet
  // OriginalType annotation; null and unrecognized types map to null.
  if (logicalType == null) {
    return null;
  }
  if (logicalType instanceof LogicalTypes.Decimal) {
    return OriginalType.DECIMAL;
  }
  if (logicalType instanceof LogicalTypes.Date) {
    return OriginalType.DATE;
  }
  if (logicalType instanceof LogicalTypes.TimeMillis) {
    return OriginalType.TIME_MILLIS;
  }
  if (logicalType instanceof LogicalTypes.TimeMicros) {
    return OriginalType.TIME_MICROS;
  }
  if (logicalType instanceof LogicalTypes.TimestampMillis) {
    return OriginalType.TIMESTAMP_MILLIS;
  }
  if (logicalType instanceof LogicalTypes.TimestampMicros) {
    return OriginalType.TIMESTAMP_MICROS;
  }
  return null;
}
 
Example 3
Source File: AvroSchemaConverterLogicalTypesPre19.java    From datacollector with Apache License 2.0 6 votes vote down vote up
private OriginalType convertLogicalType(String logicalType) {
    // Resolves an Avro logical-type name to its Parquet OriginalType
    // counterpart. Unknown names (and null) resolve to null.
    if (logicalType == null) {
      return null;
    }
    if (LOGICAL_TYPE_DECIMAL.equals(logicalType)) {
      return OriginalType.DECIMAL;
    }
    if (LOGICAL_TYPE_DATE.equals(logicalType)) {
      return OriginalType.DATE;
    }
    if (LOGICAL_TYPE_TIME_MILLIS.equals(logicalType)) {
      return OriginalType.TIME_MILLIS;
    }
    if (LOGICAL_TYPE_TIMESTAMP_MILLIS.equals(logicalType)) {
      return OriginalType.TIMESTAMP_MILLIS;
    }
    // TIME_MICROS / TIMESTAMP_MICROS are intentionally unsupported here and
    // fall through to the null return.
    return null;
  }
 
Example 4
Source File: ParquetRecordFilterBuilder.java    From pxf with Apache License 2.0 5 votes vote down vote up
/**
 * Converts a filter operand into the INT32 representation Parquet stores.
 *
 * <p>For {@code DATE} columns the physical INT32 value is the number of days
 * since the Unix epoch (1970-01-01); for everything else the operand is parsed
 * as a plain integer.
 *
 * @param originalType the Parquet original (logical) type of the column
 * @param valueOperand the operand holding the literal value; may be null
 * @return the INT32 value, or null when {@code valueOperand} is null
 * @throws ArithmeticException if a date's epoch-day count overflows an int
 */
private static Integer getIntegerForINT32(OriginalType originalType, OperandNode valueOperand) {
    if (valueOperand == null) return null;
    if (originalType == OriginalType.DATE) {
        // LocalDate.toEpochDay() is exactly "days since epoch" — no need to
        // compute ChronoUnit.DAYS.between(epoch, date) manually. toIntExact
        // fails loudly instead of silently truncating on overflow.
        return Math.toIntExact(LocalDate.parse(valueOperand.toString()).toEpochDay());
    }
    return Integer.parseInt(valueOperand.toString());
}
 
Example 5
Source File: ParquetTableMetadataUtils.java    From Bats with Apache License 2.0 4 votes vote down vote up
/**
 * Handles passed value considering its type and specified {@code primitiveType} with {@code originalType}.
 *
 * @param value         value to handle
 * @param primitiveType primitive type of the column whose value should be handled
 * @param originalType  original type of the column whose value should be handled
 * @return handled value
 */
/**
 * Handles passed value considering its type and specified {@code primitiveType} with {@code originalType}.
 *
 * @param value         value to handle; a null input yields a null result
 * @param primitiveType primitive type of the column whose value should be handled
 * @param originalType  original type of the column whose value should be handled
 * @return handled value, or null for a null input or an unmapped primitive type
 */
public static Object getValue(Object value, PrimitiveType.PrimitiveTypeName primitiveType, OriginalType originalType) {
  if (value != null) {
    switch (primitiveType) {
      case BOOLEAN:
        return Boolean.parseBoolean(value.toString());

      case INT32:
        if (originalType == OriginalType.DATE) {
          return convertToDrillDateValue(getInt(value));
        } else if (originalType == OriginalType.DECIMAL) {
          return BigInteger.valueOf(getInt(value));
        }
        return getInt(value);

      case INT64:
        if (originalType == OriginalType.DECIMAL) {
          return BigInteger.valueOf(getLong(value));
        } else {
          return getLong(value);
        }

      case FLOAT:
        return getFloat(value);

      case DOUBLE:
        return getDouble(value);

      case INT96:
        // Fix: new String(byte[]) with no charset uses the platform default,
        // making results JVM-dependent; decode explicitly as UTF-8.
        return new String(getBytes(value), java.nio.charset.StandardCharsets.UTF_8);

      case BINARY:
      case FIXED_LEN_BYTE_ARRAY:
        if (originalType == OriginalType.DECIMAL) {
          // DECIMAL binaries are big-endian two's-complement unscaled values.
          return new BigInteger(getBytes(value));
        } else if (originalType == OriginalType.INTERVAL) {
          return getBytes(value);
        } else {
          // Same platform-charset fix as the INT96 branch above.
          return new String(getBytes(value), java.nio.charset.StandardCharsets.UTF_8);
        }
    }
  }
  return null;
}
 
Example 6
Source File: MetadataReader.java    From presto with Apache License 2.0 4 votes vote down vote up
/**
 * Translates a Thrift {@code ConvertedType} into the parquet-mr
 * {@code OriginalType} enum. Every known value is mapped explicitly;
 * any other value is a hard error.
 */
private static OriginalType getOriginalType(ConvertedType type)
{
    switch (type) {
        // String-like annotations
        case UTF8:
            return OriginalType.UTF8;
        case ENUM:
            return OriginalType.ENUM;
        case JSON:
            return OriginalType.JSON;
        case BSON:
            return OriginalType.BSON;
        // Nested-structure annotations
        case MAP:
            return OriginalType.MAP;
        case MAP_KEY_VALUE:
            return OriginalType.MAP_KEY_VALUE;
        case LIST:
            return OriginalType.LIST;
        // Numeric annotations
        case DECIMAL:
            return OriginalType.DECIMAL;
        case INT_8:
            return OriginalType.INT_8;
        case INT_16:
            return OriginalType.INT_16;
        case INT_32:
            return OriginalType.INT_32;
        case INT_64:
            return OriginalType.INT_64;
        case UINT_8:
            return OriginalType.UINT_8;
        case UINT_16:
            return OriginalType.UINT_16;
        case UINT_32:
            return OriginalType.UINT_32;
        case UINT_64:
            return OriginalType.UINT_64;
        // Temporal annotations
        case DATE:
            return OriginalType.DATE;
        case TIME_MILLIS:
            return OriginalType.TIME_MILLIS;
        case TIME_MICROS:
            return OriginalType.TIME_MICROS;
        case TIMESTAMP_MILLIS:
            return OriginalType.TIMESTAMP_MILLIS;
        case TIMESTAMP_MICROS:
            return OriginalType.TIMESTAMP_MICROS;
        case INTERVAL:
            return OriginalType.INTERVAL;
        default:
            throw new IllegalArgumentException("Unknown converted type " + type);
    }
}
 
Example 7
Source File: Metadata.java    From dremio-oss with Apache License 2.0 4 votes vote down vote up
/**
 * Reads the Parquet footer for {@code file} and builds per-row-group and
 * per-column metadata (types, equal min/max stats, null counts, affinity).
 *
 * @param file             the parquet file whose footer is read
 * @param currentNumSplits running split counter, incremented by this file's row-group count
 * @param maxSplits        hard cap on total splits; exceeding it aborts processing
 * @return the assembled file metadata
 * @throws IOException           on footer read failure
 * @throws TooManySplitsException when the running split count exceeds {@code maxSplits}
 */
private ParquetFileMetadata getParquetFileMetadata(FileAttributes file, AtomicInteger currentNumSplits, long maxSplits) throws IOException {
  final ParquetMetadata metadata =
    SingletonParquetFooterCache.readFooter(fs, file, ParquetMetadataConverter.NO_FILTER, maxFooterLength);
  final int numSplits = currentNumSplits.addAndGet(metadata.getBlocks().size());
  if (numSplits > maxSplits) {
    throw new TooManySplitsException(
      String.format("Too many splits encountered when processing parquet metadata at file %s, maximum is %d but encountered %d splits thus far.",
        file.getPath(), maxSplits, numSplits));
  }

  final MessageType schema = metadata.getFileMetaData().getSchema();

  // Record each column path's OriginalType so per-column metadata can carry it.
  // (Removed a stray no-op schema.getPaths() call that discarded its result.)
  Map<SchemaPath, OriginalType> originalTypeMap = Maps.newHashMap();
  for (String[] path : schema.getPaths()) {
    originalTypeMap.put(SchemaPath.getCompoundPath(path), getOriginalType(schema, path, 0));
  }

  List<RowGroupMetadata> rowGroupMetadataList = Lists.newArrayList();

  ArrayList<SchemaPath> ALL_COLS = new ArrayList<>();
  ALL_COLS.add(AbstractRecordReader.STAR_COLUMN);
  boolean autoCorrectCorruptDates = formatConfig.autoCorrectCorruptDates;
  ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(metadata, ALL_COLS, autoCorrectCorruptDates);
  if (logger.isDebugEnabled()) {
    logger.debug(containsCorruptDates.toString());
  }
  final Map<ColumnTypeMetadata.Key, ColumnTypeMetadata> columnTypeInfo = Maps.newHashMap();
  int rowGroupIdx = 0;
  for (BlockMetaData rowGroup : metadata.getBlocks()) {
    List<ColumnMetadata> columnMetadataList = Lists.newArrayList();
    long length = 0;
    for (ColumnChunkMetaData col : rowGroup.getColumns()) {
      ColumnMetadata columnMetadata;

      Statistics<?> stats = col.getStatistics();

      // statistics might just have the non-null counts with no min/max they might be
      // initialized to zero instead of null.
      // check statistics actually have non null values (or) column has all nulls.
      // Fix: the original grouped this as (nonNull && !empty && hasNonNullValue) || allNulls,
      // which evaluated the all-nulls arm — and NPE'd — when stats was null; the comment's
      // intent is nonNull && !empty && (hasNonNullValue || allNulls).
      boolean statsAvailable = stats != null && !stats.isEmpty()
        && (stats.hasNonNullValue() || stats.getNumNulls() == rowGroup.getRowCount());

      String[] columnName = col.getPath().toArray();
      SchemaPath columnSchemaName = SchemaPath.getCompoundPath(columnName);
      ColumnTypeMetadata columnTypeMetadata =
          new ColumnTypeMetadata(columnName, col.getType(), originalTypeMap.get(columnSchemaName));

      columnTypeInfo.put(new ColumnTypeMetadata.Key(columnTypeMetadata.name), columnTypeMetadata);
      if (statsAvailable) {
        // Write stats only if minVal==maxVal. Also, we then store only maxVal
        Object mxValue = null;
        if (stats.genericGetMax() != null && stats.genericGetMin() != null &&
            stats.genericGetMax().equals(stats.genericGetMin())) {
          mxValue = stats.genericGetMax();
          if (containsCorruptDates == ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION
              && columnTypeMetadata.originalType == OriginalType.DATE) {
            // Repair dates written with the known corrupt-epoch offset.
            mxValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) mxValue);
          }
        }
        columnMetadata =
            new ColumnMetadata(columnTypeMetadata.name, mxValue, stats.getNumNulls());
      } else {
        // log it under trace to avoid lot of log entries.
        logger.trace("Stats are not available for column {}, rowGroupIdx {}, file {}",
            columnSchemaName, rowGroupIdx, file.getPath());
        columnMetadata = new ColumnMetadata(columnTypeMetadata.name, null, null);
      }
      columnMetadataList.add(columnMetadata);
      length += col.getTotalSize();
    }

    RowGroupMetadata rowGroupMeta =
        new RowGroupMetadata(rowGroup.getStartingPos(), length, rowGroup.getRowCount(),
            getHostAffinity(fs, file, rowGroup.getStartingPos(), length), columnMetadataList);

    rowGroupMetadataList.add(rowGroupMeta);
    rowGroupIdx++;
  }

  return new ParquetFileMetadata(file, file.size(), rowGroupMetadataList, columnTypeInfo);
}
 
Example 8
Source File: ParquetTypeHelper.java    From dremio-oss with Apache License 2.0 4 votes vote down vote up
/**
 * Returns an arrow vector field for a parquet primitive field
 *
 * @param colPath       schema path of the column
 * @param primitiveType parquet primitive type
 * @param originalType  parquet original type
 * @param schemaHelper  schema helper used for type conversions
 * @return arrow vector field
 */
/**
 * Returns an arrow vector field for a parquet primitive field
 *
 * @param colPath       schema path of the column
 * @param primitiveType parquet primitive type
 * @param originalType  parquet original type
 * @param schemaHelper  schema helper used for type conversions
 * @return arrow vector field
 * @throws UserException for primitive/original type combinations with no arrow mapping
 */
public static Field createField(SchemaPath colPath,
                                PrimitiveType primitiveType,
                                OriginalType originalType,
                                SchemaDerivationHelper schemaHelper) {
  final String colName = colPath.getAsNamePart().getName();
  switch (primitiveType.getPrimitiveTypeName()) {
    case BINARY:
    case FIXED_LEN_BYTE_ARRAY:
      if (originalType == OriginalType.UTF8) {
        return CompleteType.VARCHAR.toField(colName);
      }
      if (originalType == OriginalType.DECIMAL) {
        return decimalField(primitiveType, colName);
      }
      if (schemaHelper.isVarChar(colPath)) {
        return CompleteType.VARCHAR.toField(colName);
      }
      return CompleteType.VARBINARY.toField(colName);
    case BOOLEAN:
      return CompleteType.BIT.toField(colName);
    case DOUBLE:
      return CompleteType.DOUBLE.toField(colName);
    case FLOAT:
      return CompleteType.FLOAT.toField(colName);
    case INT32:
      if (originalType == OriginalType.DATE) {
        return CompleteType.DATE.toField(colName);
      } else if (originalType == OriginalType.TIME_MILLIS) {
        return CompleteType.TIME.toField(colName);
      } else if (originalType == OriginalType.DECIMAL) {
        return decimalField(primitiveType, colName);
      }
      return CompleteType.INT.toField(colName);
    case INT64:
      if (originalType == OriginalType.TIMESTAMP_MILLIS) {
        return CompleteType.TIMESTAMP.toField(colName);
      } else if (originalType == OriginalType.DECIMAL) {
        return decimalField(primitiveType, colName);
      }
      return CompleteType.BIGINT.toField(colName);
    case INT96:
      if (schemaHelper.readInt96AsTimeStamp()) {
        return CompleteType.TIMESTAMP.toField(colName);
      }
      return CompleteType.VARBINARY.toField(colName);
    default:
      throw UserException.unsupportedError()
        .message("Parquet Primitive Type '%s', Original Type '%s' combination not supported. Column '%s'",
          primitiveType.toString(), originalType != null ? originalType : "Not Available", colName)
        .build();
  }
}

/**
 * Builds a decimal arrow field from the precision/scale metadata carried on
 * the parquet primitive type (shared by the BINARY, INT32 and INT64 cases).
 */
private static Field decimalField(PrimitiveType primitiveType, String colName) {
  return CompleteType.fromDecimalPrecisionScale(primitiveType.getDecimalMetadata()
    .getPrecision(), primitiveType.getDecimalMetadata().getScale()).toField(colName);
}
 
Example 9
Source File: PentahoParquetWriteSupport.java    From pentaho-hadoop-shims with Apache License 2.0 4 votes vote down vote up
/**
 * Maps an output-field description onto the parquet primitive type that will
 * store it, annotating with an OriginalType where one applies.
 *
 * @param f the output field (name, nullability, logical type, decimal precision/scale)
 * @return the parquet primitive type for the field
 * @throws RuntimeException for parquet types this writer does not support
 */
private PrimitiveType convertToPrimitiveType( IParquetOutputField f ) {
  Type.Repetition rep = f.getAllowNull() ? Type.Repetition.OPTIONAL : Type.Repetition.REQUIRED;
  String formatFieldName = f.getFormatFieldName();
  switch ( f.getParquetType() ) {
    case BINARY:
      return new PrimitiveType( rep, PrimitiveType.PrimitiveTypeName.BINARY, formatFieldName );
    case BOOLEAN:
      return new PrimitiveType( rep, PrimitiveType.PrimitiveTypeName.BOOLEAN, formatFieldName );
    case DOUBLE:
      return new PrimitiveType( rep, PrimitiveType.PrimitiveTypeName.DOUBLE, formatFieldName );
    case FLOAT:
      return new PrimitiveType( rep, PrimitiveType.PrimitiveTypeName.FLOAT, formatFieldName );
    case INT_32:
      return new PrimitiveType( rep, PrimitiveType.PrimitiveTypeName.INT32, formatFieldName );
    case UTF8:
      return new PrimitiveType( rep, PrimitiveType.PrimitiveTypeName.BINARY, formatFieldName, OriginalType.UTF8 );
    case INT_64:
      return new PrimitiveType( rep, PrimitiveType.PrimitiveTypeName.INT64, formatFieldName, OriginalType.INT_64 );
    case INT_96:
      return new PrimitiveType( rep, PrimitiveType.PrimitiveTypeName.INT96, formatFieldName );
    case DATE:
      // DATE has no dedicated physical type; it is INT32 days-since-epoch.
      return new PrimitiveType( rep, PrimitiveType.PrimitiveTypeName.INT32, formatFieldName, OriginalType.DATE );
    case DECIMAL:
      return convertToDecimalType( rep, PrimitiveType.PrimitiveTypeName.BINARY, f, formatFieldName );
    case DECIMAL_INT_32:
      return convertToDecimalType( rep, PrimitiveType.PrimitiveTypeName.INT32, f, formatFieldName );
    case DECIMAL_INT_64:
      return convertToDecimalType( rep, PrimitiveType.PrimitiveTypeName.INT64, f, formatFieldName );
    case TIMESTAMP_MILLIS:
      return new PrimitiveType( rep, PrimitiveType.PrimitiveTypeName.INT64, formatFieldName,
        OriginalType.TIMESTAMP_MILLIS );
    default:
      throw new RuntimeException( "Unsupported output type: " + f.getParquetType() );
  }
}

/**
 * Builds a DECIMAL-annotated primitive type, deduplicating the
 * optional/required builder pairs formerly repeated in each DECIMAL case.
 */
private PrimitiveType convertToDecimalType( Type.Repetition rep, PrimitiveType.PrimitiveTypeName typeName,
                                            IParquetOutputField f, String formatFieldName ) {
  Types.PrimitiveBuilder<PrimitiveType> builder =
    rep == Type.Repetition.OPTIONAL ? Types.optional( typeName ) : Types.required( typeName );
  return builder.as( OriginalType.DECIMAL )
    .precision( f.getPrecision() ).scale( f.getScale() ).named( formatFieldName );
}