org.apache.parquet.SemanticVersion Java Examples

The following examples show how to use org.apache.parquet.SemanticVersion. Each example is taken from an open-source project; the source file, project, and license are noted above the code.
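Before looking at the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the core API the examples build on: the static SemanticVersion.parse() factory, the public major/minor/patch fields, and compareTo(). The 1.9.0 threshold is only an illustrative value.

import org.apache.parquet.SemanticVersion;

public class SemanticVersionSketch {
  public static void main(String[] args) throws SemanticVersion.SemanticVersionParseException {
    // Parse a version string into its numeric components.
    SemanticVersion version = SemanticVersion.parse("1.8.0");
    System.out.println(version.major + "." + version.minor + "." + version.patch);

    // SemanticVersion is comparable, so feature gates can use compareTo
    // instead of comparing the major/minor fields by hand.
    if (version.compareTo(new SemanticVersion(1, 9, 0)) >= 0) {
      System.out.println("parquet-mr 1.9.0 or newer");
    } else {
      System.out.println("parquet-mr older than 1.9.0");
    }
  }
}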
Example #1
Source File: AvroParquetConvertCreator.java    From datacollector with Apache License 2.0
@Override
protected void addNecessaryJarsToJob(Configuration conf) {
  MapreduceUtils.addJarsToJob(conf,
      SemanticVersion.class,
      ParquetWriter.class,
      AvroParquetWriter.class,
      AvroParquetWriterBuilder190Int96.class,
      AvroSchemaConverter190Int96Avro18.class,
      FsInput.class,
      CompressionCodec.class,
      ParquetProperties.class,
      BytesInput.class,
      AvroToParquetConverterUtil.class,
      AvroLogicalTypeSupport.class
  );
}
 
Example #2
Source File: TestCorruptDeltaByteArrays.java    From parquet-mr with Apache License 2.0
@Test
public void testCorruptDeltaByteArrayVersions() {
  assertTrue(CorruptDeltaByteArrays.requiresSequentialReads("parquet-mr version 1.6.0 (build abcd)", Encoding.DELTA_BYTE_ARRAY));
  assertTrue(CorruptDeltaByteArrays.requiresSequentialReads((String) null, Encoding.DELTA_BYTE_ARRAY));
  assertTrue(CorruptDeltaByteArrays.requiresSequentialReads((ParsedVersion) null, Encoding.DELTA_BYTE_ARRAY));
  assertTrue(CorruptDeltaByteArrays.requiresSequentialReads((SemanticVersion) null, Encoding.DELTA_BYTE_ARRAY));
  assertTrue(CorruptDeltaByteArrays.requiresSequentialReads("parquet-mr version 1.8.0-SNAPSHOT (build abcd)", Encoding.DELTA_BYTE_ARRAY));
  assertFalse(CorruptDeltaByteArrays.requiresSequentialReads("parquet-mr version 1.6.0 (build abcd)", Encoding.DELTA_BINARY_PACKED));
  assertFalse(CorruptDeltaByteArrays.requiresSequentialReads((String) null, Encoding.DELTA_LENGTH_BYTE_ARRAY));
  assertFalse(CorruptDeltaByteArrays.requiresSequentialReads((ParsedVersion) null, Encoding.PLAIN));
  assertFalse(CorruptDeltaByteArrays.requiresSequentialReads((SemanticVersion) null, Encoding.RLE));
  assertFalse(CorruptDeltaByteArrays.requiresSequentialReads("parquet-mr version 1.8.0-SNAPSHOT (build abcd)", Encoding.RLE_DICTIONARY));
  assertFalse(CorruptDeltaByteArrays.requiresSequentialReads("parquet-mr version 1.8.0-SNAPSHOT (build abcd)", Encoding.PLAIN_DICTIONARY));
  assertFalse(CorruptDeltaByteArrays.requiresSequentialReads("parquet-mr version 1.8.0-SNAPSHOT (build abcd)", Encoding.BIT_PACKED));
  assertFalse(CorruptDeltaByteArrays.requiresSequentialReads("parquet-mr version 1.8.0 (build abcd)", Encoding.DELTA_BYTE_ARRAY));
}
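The same requiresSequentialReads() overloads exercised by this test can also be used outside of tests to decide how to read a file. A short sketch, assuming the created_by string comes from the file footer (e.g. footer.getFileMetaData().getCreatedBy()):

import org.apache.parquet.CorruptDeltaByteArrays;
import org.apache.parquet.column.Encoding;

public class DeltaByteArrayCheckSketch {
  // Returns true when DELTA_BYTE_ARRAY pages written by the given writer are
  // affected by the delta byte array corruption bug and must be read
  // sequentially (the test above shows this applies before parquet-mr 1.8.0).
  public static boolean needsSequentialRead(String createdBy) {
    return CorruptDeltaByteArrays.requiresSequentialReads(createdBy, Encoding.DELTA_BYTE_ARRAY);
  }

  public static void main(String[] args) {
    System.out.println(needsSequentialRead("parquet-mr version 1.6.0 (build abcd)")); // true
    System.out.println(needsSequentialRead("parquet-mr version 1.8.0 (build abcd)")); // false
  }
}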
 
Example #3
Source File: AvroToParquetConverterUtil.java    From datacollector with Apache License 2.0
private static ParquetWriter.Builder getParquetWriterBuilder(Path tempFile, Schema avroSchema, Configuration conf) {
  // Parquet Avro pre-1.9 doesn't work with logical types, so in that case we use custom Builder that injects our own
  // avro schema -> parquet schema generator class (which is a copy of the one that was provided in PARQUET-358).
  // Additionally, Parquet Avro 1.9.x does not support converting from Avro timestamps (logical types TIMESTAMP_MILLIS
  // and TIMESTAMP_MICROS) and so we have to extend Parquet Avro classes to support timestamps conversion.
  ParquetWriter.Builder builder = null;
  try {
    SemanticVersion parquetVersion = SemanticVersion.parse(Version.VERSION_NUMBER);
    if(parquetVersion.major > 1 || (parquetVersion.major == 1 && parquetVersion.minor >= 9)) {
      if (parquetVersion.major == 1 && parquetVersion.minor >= 9) {
        LOG.debug("Creating AvroParquetWriterBuilder190Int96");
        if (propertyDefined(conf, AvroParquetConstants.TIMEZONE)) {
          String timeZoneId = conf.get(AvroParquetConstants.TIMEZONE);
          builder = new AvroParquetWriterBuilder190Int96(tempFile, timeZoneId).withSchema(avroSchema);
        } else {
          builder = new AvroParquetWriterBuilder190Int96(tempFile).withSchema(avroSchema);
        }
      } else {
        LOG.debug("Creating AvroParquetWriter.builder");
        builder = AvroParquetWriter.builder(tempFile).withSchema(avroSchema);
      }
    } else {
      LOG.debug("Creating AvroParquetWriterBuilder");
      builder = new AvroParquetWriterBuilder(tempFile).withSchema(avroSchema);
    }
  } catch (SemanticVersion.SemanticVersionParseException e) {
    LOG.warn("Can't parse parquet version string: " + Version.VERSION_NUMBER, e);
    builder = new AvroParquetWriterBuilder(tempFile).withSchema(avroSchema);
  }
  return builder;
}
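Example #3 compares the major and minor fields by hand. Since SemanticVersion is comparable, the same gate could also be written with compareTo; a hedged sketch follows (the class and method names below are illustrative, not part of the project):

import org.apache.parquet.SemanticVersion;
import org.apache.parquet.Version;

public class ParquetVersionGate {
  private static final SemanticVersion PARQUET_1_9_0 = new SemanticVersion(1, 9, 0);

  // Returns true when the parquet-mr library on the classpath is 1.9.0 or newer.
  public static boolean isAtLeast190() {
    try {
      return SemanticVersion.parse(Version.VERSION_NUMBER).compareTo(PARQUET_1_9_0) >= 0;
    } catch (SemanticVersion.SemanticVersionParseException e) {
      // Mirror Example #3: if the version string cannot be parsed, fall back
      // to the conservative pre-1.9 behaviour.
      return false;
    }
  }
}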
 
Example #4
Source File: ParquetReaderUtility.java    From Bats with Apache License 2.0
/**
 * Check for corrupted dates in a parquet file. See DRILL-4203
 */
public static DateCorruptionStatus detectCorruptDates(ParquetMetadata footer,
                                         List<SchemaPath> columns,
                                         boolean autoCorrectCorruptDates) {
  // old drill files have "parquet-mr" as created by string, and no drill version, need to check min/max values to see
  // if they look corrupt
  //  - option to disable this auto-correction based on the date values, in case users are storing these
  //    dates intentionally

  // migrated parquet files have 1.8.1 parquet-mr version with drill-r0 in the part of the name usually containing "SNAPSHOT"

  // new parquet files generated with the "is.date.correct" property have no corrupted dates

  String createdBy = footer.getFileMetaData().getCreatedBy();
  String drillVersion = footer.getFileMetaData().getKeyValueMetaData().get(ParquetRecordWriter.DRILL_VERSION_PROPERTY);
  String writerVersionValue = footer.getFileMetaData().getKeyValueMetaData().get(ParquetRecordWriter.WRITER_VERSION_PROPERTY);
  // This flag can be present in parquet files which were generated with 1.9.0-SNAPSHOT and 1.9.0 drill versions.
  // If this flag is present it means that the version of the drill parquet writer is 2
  final String isDateCorrectFlag = "is.date.correct";
  String isDateCorrect = footer.getFileMetaData().getKeyValueMetaData().get(isDateCorrectFlag);
  if (drillVersion != null) {
    int writerVersion = 1;
    if (writerVersionValue != null) {
      writerVersion = Integer.parseInt(writerVersionValue);
    }
    else if (Boolean.valueOf(isDateCorrect)) {
      writerVersion = DRILL_WRITER_VERSION_STD_DATE_FORMAT;
    }
    return writerVersion >= DRILL_WRITER_VERSION_STD_DATE_FORMAT ? DateCorruptionStatus.META_SHOWS_NO_CORRUPTION
        // loop through parquet column metadata to find date columns, check for corrupt values
        : checkForCorruptDateValuesInStatistics(footer, columns, autoCorrectCorruptDates);
  } else {
    // Possibly an old, un-migrated Drill file, check the column statistics to see if min/max values look corrupt
    // only applies if there is a date column selected
    if (createdBy == null || createdBy.equals("parquet-mr")) {
      return checkForCorruptDateValuesInStatistics(footer, columns, autoCorrectCorruptDates);
    } else {
      // check the created by to see if it is a migrated Drill file
      try {
        VersionParser.ParsedVersion parsedCreatedByVersion = VersionParser.parse(createdBy);
        // check if this is a migrated Drill file, lacking a Drill version number, but with
        // "drill" in the parquet created-by string
        if (parsedCreatedByVersion.hasSemanticVersion()) {
          SemanticVersion semVer = parsedCreatedByVersion.getSemanticVersion();
          String pre = semVer.pre + "";
          if (semVer.major == 1 && semVer.minor == 8 && semVer.patch == 1 && pre.contains("drill")) {
            return checkForCorruptDateValuesInStatistics(footer, columns, autoCorrectCorruptDates);
          }
        }
        // written by a tool that wasn't Drill, the dates are not corrupted
        return DateCorruptionStatus.META_SHOWS_NO_CORRUPTION;
      } catch (VersionParser.VersionParseException e) {
        // If we couldn't parse "created by" field, check column metadata of date columns
        return checkForCorruptDateValuesInStatistics(footer, columns, autoCorrectCorruptDates);
      }
    }
  }
}
 
Example #5
Source File: ParquetReaderUtility.java    From dremio-oss with Apache License 2.0
/**
 * Check for corrupted dates in a parquet file. See DRILL-4203
 */
public static DateCorruptionStatus detectCorruptDates(ParquetMetadata footer,
                                         List<SchemaPath> columns,
                                         boolean autoCorrectCorruptDates) {
  // old drill files have "parquet-mr" as created by string, and no drill version, need to check min/max values to see
  // if they look corrupt
  //  - option to disable this auto-correction based on the date values, in case users are storing these
  //    dates intentionally

  // migrated parquet files have 1.8.1 parquet-mr version with drill-r0 in the part of the name usually containing "SNAPSHOT"

  // new parquet files generated with the "is.date.correct" property have no corrupted dates

  String createdBy = footer.getFileMetaData().getCreatedBy();
  String dremioVersion = footer.getFileMetaData().getKeyValueMetaData().get(ParquetRecordWriter.DREMIO_VERSION_PROPERTY);
  String drillVersion = footer.getFileMetaData().getKeyValueMetaData().get(ParquetRecordWriter.DRILL_VERSION_PROPERTY);
  String isDateCorrect = footer.getFileMetaData().getKeyValueMetaData().get(ParquetRecordWriter.IS_DATE_CORRECT_PROPERTY);
  String writerVersionValue = footer.getFileMetaData().getKeyValueMetaData().get(ParquetRecordWriter.WRITER_VERSION_PROPERTY);
  logger.debug("Detecting corrupt dates for file created by {}, dremio version {}, writer version value {}, auto correct dates {}",
    createdBy, dremioVersion, writerVersionValue, autoCorrectCorruptDates);
  if (dremioVersion != null || drillVersion != null) {
    // File is generated by either Drill >= 1.3.0 or Dremio (all versions)

    if (writerVersionValue != null && Integer.parseInt(writerVersionValue) >= 2) {
      // If Drill parquet writer version is >=2 -> No date corruption.
      //   1. All parquet files written by Drill version >= 1.10.0 (DRILL-4980)
      return DateCorruptionStatus.META_SHOWS_NO_CORRUPTION;
    }

    if (Boolean.valueOf(isDateCorrect)) {
      // If the footer contains "is.date.correct" -> No date corruption.
      //   1. File generated by Drill 1.9.0 (DRILL-4203) - This property got removed in 1.10.0 (DRILL-4980)
      //   2. All parquet files generated by Dremio
      return DateCorruptionStatus.META_SHOWS_NO_CORRUPTION;
    }

    // File is generated using Drill >= 1.3.0 and Drill <= 1.9.0
    return DateCorruptionStatus.META_SHOWS_CORRUPTION;
  } else {
    // Possibly an old, un-migrated Drill file, check the column statistics to see if min/max values look corrupt
    // only applies if there is a date column selected
    if (createdBy == null || createdBy.equals("parquet-mr")) {
      // loop through parquet column metadata to find date columns, check for corrupt values
      return checkForCorruptDateValuesInStatistics(footer, columns, autoCorrectCorruptDates);
    } else {
      // check the created by to see if it is a migrated Drill file
      try {
        VersionParser.ParsedVersion parsedCreatedByVersion = VersionParser.parse(createdBy);
        // check if this is a migrated Drill file, lacking a Drill version number, but with
        // "drill" in the parquet created-by string
        if (parsedCreatedByVersion.hasSemanticVersion()) {
          SemanticVersion semVer = parsedCreatedByVersion.getSemanticVersion();
          String pre = semVer.pre + "";
          if (semVer.major == 1 && semVer.minor == 8 && semVer.patch == 1 && pre.contains("drill")) {
            return DateCorruptionStatus.META_SHOWS_CORRUPTION;
          }
        }
        // written by a tool that wasn't Drill, the dates are not corrupted
        return DateCorruptionStatus.META_SHOWS_NO_CORRUPTION;
      } catch (VersionParser.VersionParseException e) {
        // If we couldn't parse "created by" field, check column metadata of date columns
        return checkForCorruptDateValuesInStatistics(footer, columns, autoCorrectCorruptDates);
      }
    }
  }
}
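Examples #4 and #5 share the same created_by inspection. A condensed sketch of just that part (the class and method names are illustrative; the 1.8.1/"drill" signature is the one both examples use to detect migrated Drill files):

import org.apache.parquet.SemanticVersion;
import org.apache.parquet.VersionParser;

public class CreatedByInspector {
  // Returns true when the created_by string matches the migrated-Drill signature:
  // parquet-mr 1.8.1 with "drill" somewhere in the pre-release part of the version.
  public static boolean looksLikeMigratedDrillFile(String createdBy) {
    try {
      VersionParser.ParsedVersion parsed = VersionParser.parse(createdBy);
      if (!parsed.hasSemanticVersion()) {
        return false;
      }
      SemanticVersion semVer = parsed.getSemanticVersion();
      String pre = semVer.pre + "";
      return semVer.major == 1 && semVer.minor == 8 && semVer.patch == 1 && pre.contains("drill");
    } catch (VersionParser.VersionParseException e) {
      // Unparseable created_by: the examples above fall back to inspecting
      // column statistics instead of deciding from the version alone.
      return false;
    }
  }
}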