org.apache.iceberg.FileScanTask Java Examples

The following examples show how to use org.apache.iceberg.FileScanTask. Each example is drawn from an open-source project; the source file, project, and license are listed above the code.
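Before the examples, here is a minimal, hedged sketch of where FileScanTask instances come from: planning a table scan yields one task per data-file slice, and each task exposes the data file, the byte range to read, and the residual filter expression. The class and method names in this sketch (FileScanTaskSketch, printTasks) are illustrative and not taken from any project below.

import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.Table;
import org.apache.iceberg.io.CloseableIterable;

public class FileScanTaskSketch {
  // Plan a scan and print the slice each task covers.
  static void printTasks(Table table) throws java.io.IOException {
    try (CloseableIterable<FileScanTask> tasks = table.newScan().planFiles()) {
      for (FileScanTask task : tasks) {
        // task.file() is the DataFile; start()/length() bound the split; residual() is the remaining filter
        System.out.printf("%s [%d, %d) residual=%s%n",
            task.file().path(), task.start(), task.start() + task.length(), task.residual());
      }
    }
  }
}
 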
Example #1
Source File: RowDataReader.java    From iceberg with Apache License 2.0
private CloseableIterable<InternalRow> newAvroIterable(
    InputFile location,
    FileScanTask task,
    Schema projection,
    Map<Integer, ?> idToConstant) {
  Avro.ReadBuilder builder = Avro.read(location)
      .reuseContainers()
      .project(projection)
      .split(task.start(), task.length())
      .createReaderFunc(readSchema -> new SparkAvroReader(projection, readSchema, idToConstant));

  if (nameMapping != null) {
    builder.withNameMapping(NameMappingParser.fromJson(nameMapping));
  }

  return builder.build();
}
 
Example #2
Source File: Util.java    From iceberg with Apache License 2.0
public static String[] blockLocations(CombinedScanTask task, Configuration conf) {
  Set<String> locationSets = Sets.newHashSet();
  for (FileScanTask f : task.files()) {
    Path path = new Path(f.file().path().toString());
    try {
      FileSystem fs = path.getFileSystem(conf);
      for (BlockLocation b : fs.getFileBlockLocations(path, f.start(), f.length())) {
        locationSets.addAll(Arrays.asList(b.getHosts()));
      }
    } catch (IOException ioe) {
      LOG.warn("Failed to get block locations for path {}", path, ioe);
    }
  }

  return locationSets.toArray(new String[0]);
}
 
Example #3
Source File: RewriteDataFilesAction.java    From iceberg with Apache License 2.0
private Map<StructLikeWrapper, Collection<FileScanTask>> groupTasksByPartition(
    CloseableIterator<FileScanTask> tasksIter) {
  ListMultimap<StructLikeWrapper, FileScanTask> tasksGroupedByPartition = Multimaps.newListMultimap(
      Maps.newHashMap(), Lists::newArrayList);

  try {
    tasksIter.forEachRemaining(task -> {
      StructLikeWrapper structLike = StructLikeWrapper.wrap(task.file().partition());
      tasksGroupedByPartition.put(structLike, task);
    });

  } finally {
    try {
      tasksIter.close();
    } catch (IOException ioe) {
      LOG.warn("Failed to close task iterator", ioe);
    }
  }

  return tasksGroupedByPartition.asMap();
}
 
Example #4
Source File: RowDataReader.java    From iceberg with Apache License 2.0
private CloseableIterable<InternalRow> newParquetIterable(
    InputFile location,
    FileScanTask task,
    Schema readSchema,
    Map<Integer, ?> idToConstant) {
  Parquet.ReadBuilder builder = Parquet.read(location)
      .split(task.start(), task.length())
      .project(readSchema)
      .createReaderFunc(fileSchema -> SparkParquetReaders.buildReader(readSchema, fileSchema, idToConstant))
      .filter(task.residual())
      .caseSensitive(caseSensitive);

  if (nameMapping != null) {
    builder.withNameMapping(NameMappingParser.fromJson(nameMapping));
  }

  return builder.build();
}
 
Example #5
Source File: IcebergTableWrapper.java    From dremio-oss with Apache License 2.0
private void buildPartitionsAndSplits() throws IOException {
  PartitionConverter partitionConverter = new PartitionConverter(schema);
  SplitConverter splitConverter = new SplitConverter(context, fs, schema, datasetColumnValueCounts);

  // map of distinct partition values.

  // iterate over all data files to get the partition values and add them to the map.
  // TODO ravindra: this iteration requires reading all of the manifest files. This should go via
  // the dremio wrappers.
  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    List<PartitionValue> partition = partitionConverter.from(task);
    DatasetSplit split = splitConverter.from(task);
    partitionChunkListing.put(partition, split);
    recordCount += task.file().recordCount();
  }
}
 
Example #6
Source File: TestHadoopCommits.java    From iceberg with Apache License 2.0
@Test
public void testCanReadOldCompressedManifestFiles() throws Exception {
  assertTrue("Should create v1 metadata",
      version(1).exists() && version(1).isFile());

  // do a file append
  table.newAppend()
      .appendFile(FILE_A)
      .commit();

  // since we don't generate old file extensions anymore, let's convert existing metadata to old .metadata.json.gz
  // to test backwards compatibility
  rewriteMetadataAsGzipWithOldExtension();

  List<File> metadataFiles = listMetadataJsonFiles();

  assertEquals("Should have two versions", 2, metadataFiles.size());
  assertTrue("Metadata should be compressed with old format.",
      metadataFiles.stream().allMatch(f -> f.getName().endsWith(".metadata.json.gz")));

  Table reloaded = TABLES.load(tableLocation);

  List<FileScanTask> tasks = Lists.newArrayList(reloaded.newScan().planFiles());
  Assert.assertEquals("Should scan 1 files", 1, tasks.size());
}
 
Example #7
Source File: TestHadoopCommits.java    From iceberg with Apache License 2.0
@Test
public void testMergeAppend() throws Exception {
  testFastAppend(); // create 2 compatible manifest files that will be merged

  // merge all manifests for this test
  table.updateProperties().set("commit.manifest.min-count-to-merge", "1").commit();

  // third append
  table.newAppend()
      .appendFile(FILE_C)
      .commit();

  List<FileScanTask> tasks = Lists.newArrayList(table.newScan().planFiles());
  Assert.assertEquals("Should scan 3 files", 3, tasks.size());

  Assert.assertEquals("Should contain 3 Avro manifest files",
      3, listManifestFiles().size());

  TableMetadata metadata = readMetadataVersion(5);
  Assert.assertEquals("Current snapshot should contain 1 merged manifest",
      1, metadata.currentSnapshot().allManifests().size());
}
 
Example #8
Source File: TestHadoopCommits.java    From iceberg with Apache License 2.0
@Test
public void testSchemaUpdate() throws Exception {
  Assert.assertTrue("Should create v1 metadata",
      version(1).exists() && version(1).isFile());
  Assert.assertFalse("Should not create v2 or newer versions",
      version(2).exists());

  table.updateSchema()
      .addColumn("n", Types.IntegerType.get())
      .commit();

  Assert.assertTrue("Should create v2 for the update",
      version(2).exists() && version(2).isFile());
  Assert.assertEquals("Should write the current version to the hint file",
      2, readVersionHint());

  Assert.assertEquals("Table schema should match schema with reassigned ids",
      UPDATED_SCHEMA.asStruct(), table.schema().asStruct());

  List<FileScanTask> tasks = Lists.newArrayList(table.newScan().planFiles());
  Assert.assertEquals("Should not create any scan tasks", 0, tasks.size());

  List<File> manifests = listManifestFiles();
  Assert.assertEquals("Should contain 0 Avro manifest files", 0, manifests.size());
}
 
Example #9
Source File: IcebergInputFormat.java    From iceberg with Apache License 2.0
private CloseableIterable<T> open(FileScanTask currentTask, Schema readSchema) {
  DataFile file = currentTask.file();
  // TODO we should make use of FileIO to create inputFile
  InputFile inputFile = HadoopInputFile.fromLocation(file.path(), context.getConfiguration());
  CloseableIterable<T> iterable;
  switch (file.format()) {
    case AVRO:
      iterable = newAvroIterable(inputFile, currentTask, readSchema);
      break;
    case ORC:
      iterable = newOrcIterable(inputFile, currentTask, readSchema);
      break;
    case PARQUET:
      iterable = newParquetIterable(inputFile, currentTask, readSchema);
      break;
    default:
      throw new UnsupportedOperationException(
          String.format("Cannot read %s file: %s", file.format().name(), file.path()));
  }

  return iterable;
}
 
Example #10
Source File: IcebergInputFormat.java    From iceberg with Apache License 2.0
private CloseableIterable<T> newAvroIterable(
    InputFile inputFile, FileScanTask task, Schema readSchema) {
  Avro.ReadBuilder avroReadBuilder = Avro.read(inputFile)
      .project(readSchema)
      .split(task.start(), task.length());
  if (reuseContainers) {
    avroReadBuilder.reuseContainers();
  }

  switch (inMemoryDataModel) {
    case PIG:
    case HIVE:
      //TODO implement value readers for Pig and Hive
      throw new UnsupportedOperationException("Avro support not yet supported for Pig and Hive");
    case GENERIC:
      avroReadBuilder.createReaderFunc(
          (expIcebergSchema, expAvroSchema) ->
              DataReader.create(expIcebergSchema, expAvroSchema,
                  constantsMap(task, IdentityPartitionConverters::convertConstant)));
  }
  return applyResidualFiltering(avroReadBuilder.build(), task.residual(), readSchema);
}
 
Example #11
Source File: IcebergInputFormat.java    From iceberg with Apache License 2.0
private CloseableIterable<T> newParquetIterable(InputFile inputFile, FileScanTask task, Schema readSchema) {
  Parquet.ReadBuilder parquetReadBuilder = Parquet.read(inputFile)
      .project(readSchema)
      .filter(task.residual())
      .caseSensitive(caseSensitive)
      .split(task.start(), task.length());
  if (reuseContainers) {
    parquetReadBuilder.reuseContainers();
  }

  switch (inMemoryDataModel) {
    case PIG:
    case HIVE:
      //TODO implement value readers for Pig and Hive
      throw new UnsupportedOperationException("Parquet support not yet supported for Pig and Hive");
    case GENERIC:
      parquetReadBuilder.createReaderFunc(
          fileSchema -> GenericParquetReaders.buildReader(
              readSchema, fileSchema, constantsMap(task, IdentityPartitionConverters::convertConstant)));
  }
  return applyResidualFiltering(parquetReadBuilder.build(), task.residual(), readSchema);
}
 
Example #12
Source File: IcebergInputFormat.java    From iceberg with Apache License 2.0
private CloseableIterable<T> newOrcIterable(InputFile inputFile, FileScanTask task, Schema readSchema) {
  ORC.ReadBuilder orcReadBuilder = ORC.read(inputFile)
      .project(readSchema)
      .filter(task.residual())
      .caseSensitive(caseSensitive)
      .split(task.start(), task.length());
  // ORC does not support reusing containers yet
  switch (inMemoryDataModel) {
    case PIG:
    case HIVE:
      //TODO: implement value readers for Pig and Hive
      throw new UnsupportedOperationException("ORC support not yet supported for Pig and Hive");
    case GENERIC:
      orcReadBuilder.createReaderFunc(
          fileSchema -> GenericOrcReader.buildReader(
              readSchema, fileSchema, constantsMap(task, IdentityPartitionConverters::convertConstant)));
  }

  return applyResidualFiltering(orcReadBuilder.build(), task.residual(), readSchema);
}
 
Example #13
Source File: RowDataReader.java    From iceberg with Apache License 2.0
@Override
CloseableIterator<InternalRow> open(FileScanTask task) {
  DataFile file = task.file();

  // update the current file for Spark's filename() function
  InputFileBlockHolder.set(file.path().toString(), task.start(), task.length());

  // schema of rows returned by readers
  PartitionSpec spec = task.spec();
  Set<Integer> idColumns = spec.identitySourceIds();
  Schema partitionSchema = TypeUtil.select(expectedSchema, idColumns);
  boolean projectsIdentityPartitionColumns = !partitionSchema.columns().isEmpty();

  if (projectsIdentityPartitionColumns) {
    return open(task, expectedSchema, PartitionUtil.constantsMap(task, RowDataReader::convertConstant))
        .iterator();
  }
  // return the base iterator
  return open(task, expectedSchema, ImmutableMap.of()).iterator();
}
 
Example #14
Source File: TestWriteMetricsConfig.java    From iceberg with Apache License 2.0
@Test
public void testNoMetricsCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertTrue(file.nullValueCounts().isEmpty());
    Assert.assertTrue(file.valueCounts().isEmpty());
    Assert.assertTrue(file.lowerBounds().isEmpty());
    Assert.assertTrue(file.upperBounds().isEmpty());
  }
}
 
Example #15
Source File: SparkBatchScan.java    From iceberg with Apache License 2.0
@Override
public Statistics estimateStatistics() {
  long sizeInBytes = 0L;
  long numRows = 0L;

  for (CombinedScanTask task : tasks()) {
    for (FileScanTask file : task.files()) {
      sizeInBytes += file.length();
      numRows += file.file().recordCount();
    }
  }

  return new Stats(sizeInBytes, numRows);
}
 
Example #16
Source File: TestWriteMetricsConfig.java    From iceberg with Apache License 2.0
@Test
public void testCustomMetricCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts");
  properties.put("write.metadata.metrics.column.id", "full");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  Schema schema = table.schema();
  Types.NestedField id = schema.findField("id");
  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertEquals(2, file.nullValueCounts().size());
    Assert.assertEquals(2, file.valueCounts().size());
    Assert.assertEquals(1, file.lowerBounds().size());
    Assert.assertTrue(file.lowerBounds().containsKey(id.fieldId()));
    Assert.assertEquals(1, file.upperBounds().size());
    Assert.assertTrue(file.upperBounds().containsKey(id.fieldId()));
  }
}
 
Example #17
Source File: TestWriteMetricsConfig.java    From iceberg with Apache License 2.0
@Test
public void testFullMetricsCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "full");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertEquals(2, file.nullValueCounts().size());
    Assert.assertEquals(2, file.valueCounts().size());
    Assert.assertEquals(2, file.lowerBounds().size());
    Assert.assertEquals(2, file.upperBounds().size());
  }
}
 
Example #18
Source File: TestWriteMetricsConfig.java    From iceberg with Apache License 2.0
@Test
public void testCountMetricsCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertEquals(2, file.nullValueCounts().size());
    Assert.assertEquals(2, file.valueCounts().size());
    Assert.assertTrue(file.lowerBounds().isEmpty());
    Assert.assertTrue(file.upperBounds().isEmpty());
  }
}
 
Example #19
Source File: TestDataSourceOptions.java    From iceberg with Apache License 2.0
@Test
public void testNoWriteFormatOption() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> options = Maps.newHashMap();
  options.put(TableProperties.DEFAULT_FILE_FORMAT, "avro");
  Table table = tables.create(SCHEMA, spec, options, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(tableLocation);

  try (CloseableIterable<FileScanTask> tasks = table.newScan().planFiles()) {
    tasks.forEach(task -> {
      FileFormat fileFormat = FileFormat.fromFileName(task.file().path());
      Assert.assertEquals(FileFormat.AVRO, fileFormat);
    });
  }
}
 
Example #20
Source File: Util.java    From iceberg with Apache License 2.0
public static String[] blockLocations(FileIO io, CombinedScanTask task) {
  Set<String> locations = Sets.newHashSet();
  for (FileScanTask f : task.files()) {
    InputFile in = io.newInputFile(f.file().path().toString());
    if (in instanceof HadoopInputFile) {
      Collections.addAll(locations, ((HadoopInputFile) in).getBlockLocations(f.start(), f.length()));
    }
  }

  return locations.toArray(HadoopInputFile.NO_LOCATION_PREFERENCE);
}
 
Example #21
Source File: TableScanUtil.java    From iceberg with Apache License 2.0
public static CloseableIterable<FileScanTask> splitFiles(CloseableIterable<FileScanTask> tasks, long splitSize) {
  Iterable<FileScanTask> splitTasks = FluentIterable
      .from(tasks)
      .transformAndConcat(input -> input.split(splitSize));
  // Capture manifests which can be closed after scan planning
  return CloseableIterable.combine(splitTasks, tasks);
}
 
Example #22
Source File: TableScanUtil.java    From iceberg with Apache License 2.0
public static CloseableIterable<CombinedScanTask> planTasks(CloseableIterable<FileScanTask> splitFiles,
                                                            long splitSize, int lookback, long openFileCost) {
  Function<FileScanTask, Long> weightFunc = file -> Math.max(file.length(), openFileCost);

  return CloseableIterable.transform(
      CloseableIterable.combine(
          new BinPacking.PackingIterable<>(splitFiles, splitSize, lookback, weightFunc, true),
          splitFiles),
      BaseCombinedScanTask::new);
}
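
A hedged usage sketch of how the two helpers in Examples #21 and #22 are typically chained when planning combined tasks by hand. The split size, lookback, and open-file cost values are illustrative rather than table defaults, and the class name ManualPlanningSketch is not from any project on this page.

import org.apache.iceberg.CombinedScanTask;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.Table;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.util.TableScanUtil;

class ManualPlanningSketch {
  static CloseableIterable<CombinedScanTask> plan(Table table) {
    long splitSize = 128L * 1024 * 1024;   // illustrative target split size
    int lookback = 10;                     // bin-packing lookback
    long openFileCost = 4L * 1024 * 1024;  // minimum weight charged per file

    // plan file tasks, split them to the target size, then bin-pack into combined tasks
    CloseableIterable<FileScanTask> files = table.newScan().planFiles();
    CloseableIterable<FileScanTask> splits = TableScanUtil.splitFiles(files, splitSize);
    return TableScanUtil.planTasks(splits, splitSize, lookback, openFileCost);
  }
}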
 
Example #23
Source File: TestHadoopCommits.java    From iceberg with Apache License 2.0
@Test
public void testCreateTable() throws Exception {
  PartitionSpec expectedSpec = PartitionSpec.builderFor(TABLE_SCHEMA)
      .bucket("data", 16)
      .build();

  Assert.assertEquals("Table schema should match schema with reassigned ids",
      TABLE_SCHEMA.asStruct(), table.schema().asStruct());
  Assert.assertEquals("Table partition spec should match with reassigned ids",
      expectedSpec, table.spec());

  List<FileScanTask> tasks = Lists.newArrayList(table.newScan().planFiles());
  Assert.assertEquals("Should not create any scan tasks", 0, tasks.size());

  Assert.assertTrue("Table location should exist",
      tableDir.exists());
  Assert.assertTrue("Should create metadata folder",
      metadataDir.exists() && metadataDir.isDirectory());
  Assert.assertTrue("Should create v1 metadata",
      version(1).exists() && version(1).isFile());
  Assert.assertFalse("Should not create v2 or newer versions",
      version(2).exists());
  Assert.assertTrue("Should create version hint file",
      versionHintFile.exists());
  Assert.assertEquals("Should write the current version to the hint file",
      1, readVersionHint());

  List<File> manifests = listManifestFiles();
  Assert.assertEquals("Should contain 0 Avro manifest files", 0, manifests.size());
}
 
Example #24
Source File: TestHadoopCommits.java    From iceberg with Apache License 2.0
@Test
public void testFastAppend() throws Exception {
  // first append
  table.newFastAppend()
      .appendFile(FILE_A)
      .commit();

  Assert.assertTrue("Should create v2 for the update",
      version(2).exists() && version(2).isFile());
  Assert.assertEquals("Should write the current version to the hint file",
      2, readVersionHint());

  List<FileScanTask> tasks = Lists.newArrayList(table.newScan().planFiles());
  Assert.assertEquals("Should scan 1 file", 1, tasks.size());

  List<File> manifests = listManifestFiles();
  Assert.assertEquals("Should contain only one Avro manifest file", 1, manifests.size());

  // second append
  table.newFastAppend()
      .appendFile(FILE_B)
      .commit();

  Assert.assertTrue("Should create v3 for the update",
      version(3).exists() && version(3).isFile());
  Assert.assertEquals("Should write the current version to the hint file",
      3, readVersionHint());

  tasks = Lists.newArrayList(table.newScan().planFiles());
  Assert.assertEquals("Should scan 2 files", 2, tasks.size());

  Assert.assertEquals("Should contain 2 Avro manifest files",
      2, listManifestFiles().size());

  TableMetadata metadata = readMetadataVersion(3);
  Assert.assertEquals("Current snapshot should contain 2 manifests",
      2, metadata.currentSnapshot().allManifests().size());
}
 
Example #25
Source File: TestInsertIntoTable.java    From dremio-oss with Apache License 2.0
private void checkSinglePartitionValue(File tableFolder, Class expectedClass, Object expectedValue) {
  Table table = new HadoopTables(new Configuration()).load(tableFolder.getPath());
  for (FileScanTask fileScanTask : table.newScan().planFiles()) {
    StructLike structLike = fileScanTask.file().partition();
    Assert.assertTrue(structLike.get(0, expectedClass).equals(expectedValue));
  }
}
 
Example #26
Source File: TestIcebergCTASWithPartition.java    From dremio-oss with Apache License 2.0
private void verifyPartitionValue(String tableFolder, Class expectedClass, Object expectedValue) {
  Table table = new HadoopTables(new Configuration()).load(tableFolder);
  for (FileScanTask fileScanTask : table.newScan().planFiles()) {
    StructLike structLike = fileScanTask.file().partition();
    Assert.assertEquals(structLike.get(0, expectedClass), expectedValue);
  }
}
 
Example #27
Source File: TestIcebergPartitionData.java    From dremio-oss with Apache License 2.0
private void verifyPartitionValue(PartitionSpec partitionSpec, IcebergPartitionData partitionData,
                                  String columnName, Class expectedClass, Object expectedValue) throws Exception {
  File tableFolder = new File(folder.getRoot(), "icebergPartitionTest");
  try {
    tableFolder.mkdir();
    File dataFile = new File(folder.getRoot(), "a.parquet");

    dataFile.createNewFile();

    DataFile d1 = DataFiles.builder(partitionSpec)
      .withInputFile(Files.localInput(dataFile))
      .withRecordCount(50)
      .withFormat(FileFormat.PARQUET)
      .withPartition(partitionData)
      .build();

    IcebergOpCommitter committer = IcebergOperation.getCreateTableCommitter(Path.of(tableFolder.toPath().toString()),
      (new SchemaConverter()).fromIceberg(schema), Lists.newArrayList(columnName), new Configuration());
    committer.consumeData(Lists.newArrayList(d1));
    committer.commit();

    Table table = new HadoopTables(new Configuration()).load(tableFolder.getPath());
    for (FileScanTask fileScanTask : table.newScan().planFiles()) {
      StructLike structLike = fileScanTask.file().partition();
      if (expectedClass == ByteBuffer.class) {
        Assert.assertEquals(structLike.get(0, expectedClass).hashCode(), ByteBuffer.wrap((byte[])expectedValue).hashCode());
      } else {
        Assert.assertTrue(structLike.get(0, expectedClass).equals(expectedValue));
      }
    }

  } finally {
    tableFolder.delete();
  }
}
 
Example #28
Source File: IcebergInputFormat.java    From iceberg with Apache License 2.0
private Map<Integer, ?> constantsMap(FileScanTask task, BiFunction<Type, Object, Object> converter) {
  PartitionSpec spec = task.spec();
  Set<Integer> idColumns = spec.identitySourceIds();
  Schema partitionSchema = TypeUtil.select(expectedSchema, idColumns);
  boolean projectsIdentityPartitionColumns = !partitionSchema.columns().isEmpty();
  if (projectsIdentityPartitionColumns) {
    return PartitionUtil.constantsMap(task, converter);
  } else {
    return Collections.emptyMap();
  }
}
 
Example #29
Source File: IcebergSplitSource.java    From presto with Apache License 2.0
private ConnectorSplit toIcebergSplit(FileScanTask task)
{
    // TODO: We should leverage residual expression and convert that to TupleDomain.
    //       The predicate here is used by readers for predicate push down at reader level,
    //       so when we do not use residual expression, we are just wasting CPU cycles
    //       on reader side evaluating a condition that we know will always be true.

    return new IcebergSplit(
            task.file().path().toString(),
            task.start(),
            task.length(),
            task.file().format(),
            ImmutableList.of(),
            getPartitionKeys(task));
}
 
Example #30
Source File: IcebergSplitSource.java    From presto with Apache License 2.0
private static Map<Integer, String> getPartitionKeys(FileScanTask scanTask)
{
    StructLike partition = scanTask.file().partition();
    PartitionSpec spec = scanTask.spec();
    Map<PartitionField, Integer> fieldToIndex = getIdentityPartitions(spec);
    Map<Integer, String> partitionKeys = new HashMap<>();

    fieldToIndex.forEach((field, index) -> {
        int id = field.sourceId();
        Type type = spec.schema().findType(id);
        Class<?> javaClass = type.typeId().javaClass();
        Object value = partition.get(index, javaClass);

        if (value == null) {
            partitionKeys.put(id, null);
        }
        else {
            String partitionValue;
            if (type.typeId() == FIXED || type.typeId() == BINARY) {
                // this is safe because Iceberg PartitionData directly wraps the byte array
                partitionValue = new String(((ByteBuffer) value).array(), UTF_8);
            }
            else {
                partitionValue = value.toString();
            }
            partitionKeys.put(id, partitionValue);
        }
    });

    return Collections.unmodifiableMap(partitionKeys);
}