Java Code Examples for org.apache.iceberg.FileScanTask

The following examples show how to use org.apache.iceberg.FileScanTask. They are extracted from open source projects; the source project, file, and license are noted above each example.
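Before working through the examples, it may help to see where FileScanTask instances typically come from. The minimal sketch below is not taken from any of the projects above; it assumes a Hadoop-backed table at a placeholder location and uses only calls that appear in the examples that follow (HadoopTables.load, planFiles(), and the task accessors file(), start(), length(), and residual()).

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.Table;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.io.CloseableIterable;

public class FileScanTaskPlanningSketch {
  public static void main(String[] args) throws IOException {
    // Placeholder location; point this at a real Hadoop table path.
    String tableLocation = "hdfs://warehouse/db/example_table";
    Table table = new HadoopTables(new Configuration()).load(tableLocation);

    // planFiles() yields one FileScanTask per data file (or split) the scan will read.
    try (CloseableIterable<FileScanTask> tasks = table.newScan().planFiles()) {
      for (FileScanTask task : tasks) {
        // Each task carries the data file, the byte range to scan, and the residual
        // filter expression that readers still need to apply.
        System.out.printf("file=%s start=%d length=%d residual=%s%n",
            task.file().path(), task.start(), task.length(), task.residual());
      }
    }
  }
}

Each of the examples below consumes tasks produced this way, either reading the referenced data file directly or converting the task into an engine-specific split.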
Example 1
Source Project: iceberg   Source File: IcebergInputFormat.java    License: Apache License 2.0
private CloseableIterable<T> open(FileScanTask currentTask, Schema readSchema) {
  DataFile file = currentTask.file();
  // TODO we should make use of FileIO to create inputFile
  InputFile inputFile = HadoopInputFile.fromLocation(file.path(), context.getConfiguration());
  CloseableIterable<T> iterable;
  switch (file.format()) {
    case AVRO:
      iterable = newAvroIterable(inputFile, currentTask, readSchema);
      break;
    case ORC:
      iterable = newOrcIterable(inputFile, currentTask, readSchema);
      break;
    case PARQUET:
      iterable = newParquetIterable(inputFile, currentTask, readSchema);
      break;
    default:
      throw new UnsupportedOperationException(
          String.format("Cannot read %s file: %s", file.format().name(), file.path()));
  }

  return iterable;
}
 
Example 2
Source Project: iceberg   Source File: IcebergInputFormat.java    License: Apache License 2.0
private CloseableIterable<T> newAvroIterable(
    InputFile inputFile, FileScanTask task, Schema readSchema) {
  Avro.ReadBuilder avroReadBuilder = Avro.read(inputFile)
      .project(readSchema)
      .split(task.start(), task.length());
  if (reuseContainers) {
    avroReadBuilder.reuseContainers();
  }

  switch (inMemoryDataModel) {
    case PIG:
    case HIVE:
      //TODO implement value readers for Pig and Hive
      throw new UnsupportedOperationException("Avro support not yet supported for Pig and Hive");
    case GENERIC:
      avroReadBuilder.createReaderFunc(
          (expIcebergSchema, expAvroSchema) ->
              DataReader.create(expIcebergSchema, expAvroSchema,
                  constantsMap(task, IdentityPartitionConverters::convertConstant)));
  }
  return applyResidualFiltering(avroReadBuilder.build(), task.residual(), readSchema);
}
 
Example 3
Source Project: iceberg   Source File: IcebergInputFormat.java    License: Apache License 2.0
private CloseableIterable<T> newParquetIterable(InputFile inputFile, FileScanTask task, Schema readSchema) {
  Parquet.ReadBuilder parquetReadBuilder = Parquet.read(inputFile)
      .project(readSchema)
      .filter(task.residual())
      .caseSensitive(caseSensitive)
      .split(task.start(), task.length());
  if (reuseContainers) {
    parquetReadBuilder.reuseContainers();
  }

  switch (inMemoryDataModel) {
    case PIG:
    case HIVE:
      //TODO implement value readers for Pig and Hive
      throw new UnsupportedOperationException("Parquet support not yet supported for Pig and Hive");
    case GENERIC:
      parquetReadBuilder.createReaderFunc(
          fileSchema -> GenericParquetReaders.buildReader(
              readSchema, fileSchema, constantsMap(task, IdentityPartitionConverters::convertConstant)));
  }
  return applyResidualFiltering(parquetReadBuilder.build(), task.residual(), readSchema);
}
 
Example 4
Source Project: iceberg   Source File: IcebergInputFormat.java    License: Apache License 2.0
private CloseableIterable<T> newOrcIterable(InputFile inputFile, FileScanTask task, Schema readSchema) {
  ORC.ReadBuilder orcReadBuilder = ORC.read(inputFile)
      .project(readSchema)
      .filter(task.residual())
      .caseSensitive(caseSensitive)
      .split(task.start(), task.length());
  // ORC does not support reuse containers yet
  switch (inMemoryDataModel) {
    case PIG:
    case HIVE:
      //TODO: implement value readers for Pig and Hive
      throw new UnsupportedOperationException("ORC support not yet supported for Pig and Hive");
    case GENERIC:
      orcReadBuilder.createReaderFunc(
          fileSchema -> GenericOrcReader.buildReader(
              readSchema, fileSchema, constantsMap(task, IdentityPartitionConverters::convertConstant)));
  }

  return applyResidualFiltering(orcReadBuilder.build(), task.residual(), readSchema);
}
 
Example 5
Source Project: iceberg   Source File: RewriteDataFilesAction.java    License: Apache License 2.0
private Map<StructLikeWrapper, Collection<FileScanTask>> groupTasksByPartition(
    CloseableIterator<FileScanTask> tasksIter) {
  ListMultimap<StructLikeWrapper, FileScanTask> tasksGroupedByPartition = Multimaps.newListMultimap(
      Maps.newHashMap(), Lists::newArrayList);

  try {
    tasksIter.forEachRemaining(task -> {
      StructLikeWrapper structLike = StructLikeWrapper.wrap(task.file().partition());
      tasksGroupedByPartition.put(structLike, task);
    });

  } finally {
    try {
      tasksIter.close();
    } catch (IOException ioe) {
      LOG.warn("Failed to close task iterator", ioe);
    }
  }

  return tasksGroupedByPartition.asMap();
}
 
Example 6
Source Project: iceberg   Source File: RowDataReader.java    License: Apache License 2.0
@Override
CloseableIterator<InternalRow> open(FileScanTask task) {
  DataFile file = task.file();

  // update the current file for Spark's filename() function
  InputFileBlockHolder.set(file.path().toString(), task.start(), task.length());

  // schema or rows returned by readers
  PartitionSpec spec = task.spec();
  Set<Integer> idColumns = spec.identitySourceIds();
  Schema partitionSchema = TypeUtil.select(expectedSchema, idColumns);
  boolean projectsIdentityPartitionColumns = !partitionSchema.columns().isEmpty();

  if (projectsIdentityPartitionColumns) {
    return open(task, expectedSchema, PartitionUtil.constantsMap(task, RowDataReader::convertConstant))
        .iterator();
  }
  // return the base iterator
  return open(task, expectedSchema, ImmutableMap.of()).iterator();
}
 
Example 7
Source Project: iceberg   Source File: RowDataReader.java    License: Apache License 2.0
private CloseableIterable<InternalRow> newAvroIterable(
    InputFile location,
    FileScanTask task,
    Schema projection,
    Map<Integer, ?> idToConstant) {
  Avro.ReadBuilder builder = Avro.read(location)
      .reuseContainers()
      .project(projection)
      .split(task.start(), task.length())
      .createReaderFunc(readSchema -> new SparkAvroReader(projection, readSchema, idToConstant));

  if (nameMapping != null) {
    builder.withNameMapping(NameMappingParser.fromJson(nameMapping));
  }

  return builder.build();
}
 
Example 8
Source Project: iceberg   Source File: RowDataReader.java    License: Apache License 2.0
private CloseableIterable<InternalRow> newParquetIterable(
    InputFile location,
    FileScanTask task,
    Schema readSchema,
    Map<Integer, ?> idToConstant) {
  Parquet.ReadBuilder builder = Parquet.read(location)
      .split(task.start(), task.length())
      .project(readSchema)
      .createReaderFunc(fileSchema -> SparkParquetReaders.buildReader(readSchema, fileSchema, idToConstant))
      .filter(task.residual())
      .caseSensitive(caseSensitive);

  if (nameMapping != null) {
    builder.withNameMapping(NameMappingParser.fromJson(nameMapping));
  }

  return builder.build();
}
 
Example 9
Source Project: iceberg   Source File: Util.java    License: Apache License 2.0
public static String[] blockLocations(CombinedScanTask task, Configuration conf) {
  Set<String> locationSets = Sets.newHashSet();
  for (FileScanTask f : task.files()) {
    Path path = new Path(f.file().path().toString());
    try {
      FileSystem fs = path.getFileSystem(conf);
      for (BlockLocation b : fs.getFileBlockLocations(path, f.start(), f.length())) {
        locationSets.addAll(Arrays.asList(b.getHosts()));
      }
    } catch (IOException ioe) {
      LOG.warn("Failed to get block locations for path {}", path, ioe);
    }
  }

  return locationSets.toArray(new String[0]);
}
 
Example 10
Source Project: iceberg   Source File: TestHadoopCommits.java    License: Apache License 2.0
@Test
public void testSchemaUpdate() throws Exception {
  Assert.assertTrue("Should create v1 metadata",
      version(1).exists() && version(1).isFile());
  Assert.assertFalse("Should not create v2 or newer versions",
      version(2).exists());

  table.updateSchema()
      .addColumn("n", Types.IntegerType.get())
      .commit();

  Assert.assertTrue("Should create v2 for the update",
      version(2).exists() && version(2).isFile());
  Assert.assertEquals("Should write the current version to the hint file",
      2, readVersionHint());

  Assert.assertEquals("Table schema should match schema with reassigned ids",
      UPDATED_SCHEMA.asStruct(), table.schema().asStruct());

  List<FileScanTask> tasks = Lists.newArrayList(table.newScan().planFiles());
  Assert.assertEquals("Should not create any scan tasks", 0, tasks.size());

  List<File> manifests = listManifestFiles();
  Assert.assertEquals("Should contain 0 Avro manifest files", 0, manifests.size());
}
 
Example 11
Source Project: iceberg   Source File: TestHadoopCommits.java    License: Apache License 2.0
@Test
public void testMergeAppend() throws Exception {
  testFastAppend(); // create 2 compatible manifest files that will be merged

  // merge all manifests for this test
  table.updateProperties().set("commit.manifest.min-count-to-merge", "1").commit();

  // third append
  table.newAppend()
      .appendFile(FILE_C)
      .commit();

  List<FileScanTask> tasks = Lists.newArrayList(table.newScan().planFiles());
  Assert.assertEquals("Should scan 3 files", 3, tasks.size());

  Assert.assertEquals("Should contain 3 Avro manifest files",
      3, listManifestFiles().size());

  TableMetadata metadata = readMetadataVersion(5);
  Assert.assertEquals("Current snapshot should contain 1 merged manifest",
      1, metadata.currentSnapshot().allManifests().size());
}
 
Example 12
Source Project: iceberg   Source File: TestHadoopCommits.java    License: Apache License 2.0
@Test
public void testCanReadOldCompressedManifestFiles() throws Exception {
  assertTrue("Should create v1 metadata",
      version(1).exists() && version(1).isFile());

  // do a file append
  table.newAppend()
      .appendFile(FILE_A)
      .commit();

  // since we don't generate old file extensions anymore, let's convert existing metadata to old .metadata.json.gz
  // to test backwards compatibility
  rewriteMetadataAsGzipWithOldExtension();

  List<File> metadataFiles = listMetadataJsonFiles();

  assertEquals("Should have two versions", 2, metadataFiles.size());
  assertTrue("Metadata should be compressed with old format.",
      metadataFiles.stream().allMatch(f -> f.getName().endsWith(".metadata.json.gz")));

  Table reloaded = TABLES.load(tableLocation);

  List<FileScanTask> tasks = Lists.newArrayList(reloaded.newScan().planFiles());
  Assert.assertEquals("Should scan 1 files", 1, tasks.size());
}
 
Example 13
Source Project: dremio-oss   Source File: IcebergTableWrapper.java    License: Apache License 2.0
private void buildPartitionsAndSplits() throws IOException {
  PartitionConverter partitionConverter = new PartitionConverter(schema);
  SplitConverter splitConverter = new SplitConverter(context, fs, schema, datasetColumnValueCounts);

  // map of distinct partition values.

  // iterate over all data files to get the partition values and add them to the map.
  // TODO ravindra: this iteration requires reading all of the manifest files. This should go via
  // the dremio wrappers.
  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    List<PartitionValue> partition = partitionConverter.from(task);
    DatasetSplit split = splitConverter.from(task);
    partitionChunkListing.put(partition, split);
    recordCount += task.file().recordCount();
  }
}
 
Example 14
Source Project: presto   Source File: IcebergSplitSource.java    License: Apache License 2.0
@Override
public CompletableFuture<ConnectorSplitBatch> getNextBatch(ConnectorPartitionHandle partitionHandle, int maxSize)
{
    // TODO: move this to a background thread
    List<ConnectorSplit> splits = new ArrayList<>();
    Iterator<FileScanTask> iterator = limit(fileScanIterator, maxSize);
    while (iterator.hasNext()) {
        FileScanTask task = iterator.next();
        splits.add(toIcebergSplit(task));
    }
    return completedFuture(new ConnectorSplitBatch(splits, isFinished()));
}
 
Example 15
Source Project: presto   Source File: IcebergSplitSource.java    License: Apache License 2.0
private ConnectorSplit toIcebergSplit(FileScanTask task)
{
    // TODO: We should leverage residual expression and convert that to TupleDomain.
    //       The predicate here is used by readers for predicate push down at reader level,
    //       so when we do not use residual expression, we are just wasting CPU cycles
    //       on reader side evaluating a condition that we know will always be true.

    return new IcebergSplit(
            task.file().path().toString(),
            task.start(),
            task.length(),
            task.file().format(),
            ImmutableList.of(),
            getPartitionKeys(task));
}
 
Example 16
Source Project: presto   Source File: IcebergSplitSource.java    License: Apache License 2.0
private static Map<Integer, String> getPartitionKeys(FileScanTask scanTask)
{
    StructLike partition = scanTask.file().partition();
    PartitionSpec spec = scanTask.spec();
    Map<PartitionField, Integer> fieldToIndex = getIdentityPartitions(spec);
    Map<Integer, String> partitionKeys = new HashMap<>();

    fieldToIndex.forEach((field, index) -> {
        int id = field.sourceId();
        Type type = spec.schema().findType(id);
        Class<?> javaClass = type.typeId().javaClass();
        Object value = partition.get(index, javaClass);

        if (value == null) {
            partitionKeys.put(id, null);
        }
        else {
            String partitionValue;
            if (type.typeId() == FIXED || type.typeId() == BINARY) {
                // this is safe because Iceberg PartitionData directly wraps the byte array
                partitionValue = new String(((ByteBuffer) value).array(), UTF_8);
            }
            else {
                partitionValue = value.toString();
            }
            partitionKeys.put(id, partitionValue);
        }
    });

    return Collections.unmodifiableMap(partitionKeys);
}
 
Example 17
Source Project: iceberg   Source File: Reader.java    License: Apache License 2.0
@Override
public Statistics estimateStatistics() {
  long sizeInBytes = 0L;
  long numRows = 0L;

  for (CombinedScanTask task : tasks()) {
    for (FileScanTask file : task.files()) {
      sizeInBytes += file.length();
      numRows += file.file().recordCount();
    }
  }

  return new Stats(sizeInBytes, numRows);
}
 
Example 18
Source Project: iceberg   Source File: TableScanIterable.java    License: Apache License 2.0
@Override
public boolean hasNext() {
  while (true) {
    if (currentIterator.hasNext()) {
      return true;

    } else if (tasks.hasNext()) {
      if (currentCloseable != null) {
        try {
          currentCloseable.close();
        } catch (IOException e) {
          throw new RuntimeIOException(e, "Failed to close task");
        }
      }

      FileScanTask task = tasks.next();
      CloseableIterable<Record> reader = open(task);
      this.currentCloseable = reader;

      if (task.residual() != null && task.residual() != Expressions.alwaysTrue()) {
        Evaluator filter = new Evaluator(projection.asStruct(), task.residual(), caseSensitive);
        this.currentIterator = Iterables.filter(reader,
            record -> filter.eval(recordWrapper.wrap(record))).iterator();
      } else {
        this.currentIterator = reader.iterator();
      }

    } else {
      return false;
    }
  }
}
 
Example 19
Source Project: iceberg   Source File: IcebergInputFormat.java    License: Apache License 2.0
private Map<Integer, ?> constantsMap(FileScanTask task, BiFunction<Type, Object, Object> converter) {
  PartitionSpec spec = task.spec();
  Set<Integer> idColumns = spec.identitySourceIds();
  Schema partitionSchema = TypeUtil.select(expectedSchema, idColumns);
  boolean projectsIdentityPartitionColumns = !partitionSchema.columns().isEmpty();
  if (projectsIdentityPartitionColumns) {
    return PartitionUtil.constantsMap(task, converter);
  } else {
    return Collections.emptyMap();
  }
}
 
Example 20
Source Project: iceberg   Source File: BatchDataReader.java    License: Apache License 2.0
@Override
CloseableIterator<ColumnarBatch> open(FileScanTask task) {
  CloseableIterable<ColumnarBatch> iter;
  InputFile location = getInputFile(task);
  Preconditions.checkNotNull(location, "Could not find InputFile associated with FileScanTask");
  if (task.file().format() == FileFormat.PARQUET) {
    Parquet.ReadBuilder builder = Parquet.read(location)
        .project(expectedSchema)
        .split(task.start(), task.length())
        .createBatchedReaderFunc(fileSchema -> VectorizedSparkParquetReaders.buildReader(expectedSchema,
            fileSchema, /* setArrowValidityVector */ NullCheckingForGet.NULL_CHECKING_ENABLED))
        .recordsPerBatch(batchSize)
        .filter(task.residual())
        .caseSensitive(caseSensitive)
        // Spark eagerly consumes the batches. So the underlying memory allocated could be reused
        // without worrying about subsequent reads clobbering over each other. This improves
        // read performance as every batch read doesn't have to pay the cost of allocating memory.
        .reuseContainers();

    if (nameMapping != null) {
      builder.withNameMapping(NameMappingParser.fromJson(nameMapping));
    }

    iter = builder.build();
  } else {
    throw new UnsupportedOperationException(
        "Format: " + task.file().format() + " not supported for batched reads");
  }
  return iter.iterator();
}
 
Example 21
Source Project: iceberg   Source File: RowDataReader.java    License: Apache License 2.0
private CloseableIterable<InternalRow> open(FileScanTask task, Schema readSchema, Map<Integer, ?> idToConstant) {
  CloseableIterable<InternalRow> iter;
  if (task.isDataTask()) {
    iter = newDataIterable(task.asDataTask(), readSchema);
  } else {
    InputFile location = getInputFile(task);
    Preconditions.checkNotNull(location, "Could not find InputFile associated with FileScanTask");

    switch (task.file().format()) {
      case PARQUET:
        iter = newParquetIterable(location, task, readSchema, idToConstant);
        break;

      case AVRO:
        iter = newAvroIterable(location, task, readSchema, idToConstant);
        break;

      case ORC:
        iter = newOrcIterable(location, task, readSchema, idToConstant);
        break;

      default:
        throw new UnsupportedOperationException(
            "Cannot read unknown format: " + task.file().format());
    }
  }

  return iter;
}
 
Example 22
Source Project: iceberg   Source File: RowDataReader.java    License: Apache License 2.0
private CloseableIterable<InternalRow> newOrcIterable(
    InputFile location,
    FileScanTask task,
    Schema readSchema,
    Map<Integer, ?> idToConstant) {
  return ORC.read(location)
      .project(readSchema)
      .split(task.start(), task.length())
      .createReaderFunc(readOrcSchema -> new SparkOrcReader(readSchema, readOrcSchema, idToConstant))
      .filter(task.residual())
      .caseSensitive(caseSensitive)
      .build();
}
 
Example 23
Source Project: iceberg   Source File: TestDataSourceOptions.java    License: Apache License 2.0
@Test
public void testWriteFormatOptionOverridesTableProperties() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> options = Maps.newHashMap();
  options.put(TableProperties.DEFAULT_FILE_FORMAT, "avro");
  Table table = tables.create(SCHEMA, spec, options, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data").write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  try (CloseableIterable<FileScanTask> tasks = table.newScan().planFiles()) {
    tasks.forEach(task -> {
      FileFormat fileFormat = FileFormat.fromFileName(task.file().path());
      Assert.assertEquals(FileFormat.PARQUET, fileFormat);
    });
  }
}
 
Example 24
Source Project: iceberg   Source File: TestDataSourceOptions.java    License: Apache License 2.0
@Test
public void testNoWriteFormatOption() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> options = Maps.newHashMap();
  options.put(TableProperties.DEFAULT_FILE_FORMAT, "avro");
  Table table = tables.create(SCHEMA, spec, options, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(tableLocation);

  try (CloseableIterable<FileScanTask> tasks = table.newScan().planFiles()) {
    tasks.forEach(task -> {
      FileFormat fileFormat = FileFormat.fromFileName(task.file().path());
      Assert.assertEquals(FileFormat.AVRO, fileFormat);
    });
  }
}
 
Example 25
Source Project: iceberg   Source File: TestWriteMetricsConfig.java    License: Apache License 2.0
@Test
public void testFullMetricsCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "full");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertEquals(2, file.nullValueCounts().size());
    Assert.assertEquals(2, file.valueCounts().size());
    Assert.assertEquals(2, file.lowerBounds().size());
    Assert.assertEquals(2, file.upperBounds().size());
  }
}
 
Example 26
Source Project: iceberg   Source File: TestWriteMetricsConfig.java    License: Apache License 2.0
@Test
public void testCountMetricsCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertEquals(2, file.nullValueCounts().size());
    Assert.assertEquals(2, file.valueCounts().size());
    Assert.assertTrue(file.lowerBounds().isEmpty());
    Assert.assertTrue(file.upperBounds().isEmpty());
  }
}
 
Example 27
Source Project: iceberg   Source File: TestWriteMetricsConfig.java    License: Apache License 2.0
@Test
public void testNoMetricsCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertTrue(file.nullValueCounts().isEmpty());
    Assert.assertTrue(file.valueCounts().isEmpty());
    Assert.assertTrue(file.lowerBounds().isEmpty());
    Assert.assertTrue(file.upperBounds().isEmpty());
  }
}
 
Example 28
Source Project: iceberg   Source File: TestWriteMetricsConfig.java    License: Apache License 2.0
@Test
public void testCustomMetricCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts");
  properties.put("write.metadata.metrics.column.id", "full");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  Schema schema = table.schema();
  Types.NestedField id = schema.findField("id");
  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertEquals(2, file.nullValueCounts().size());
    Assert.assertEquals(2, file.valueCounts().size());
    Assert.assertEquals(1, file.lowerBounds().size());
    Assert.assertTrue(file.lowerBounds().containsKey(id.fieldId()));
    Assert.assertEquals(1, file.upperBounds().size());
    Assert.assertTrue(file.upperBounds().containsKey(id.fieldId()));
  }
}
 
Example 29
Source Project: iceberg   Source File: SparkBatchScan.java    License: Apache License 2.0
@Override
public Statistics estimateStatistics() {
  long sizeInBytes = 0L;
  long numRows = 0L;

  for (CombinedScanTask task : tasks()) {
    for (FileScanTask file : task.files()) {
      sizeInBytes += file.length();
      numRows += file.file().recordCount();
    }
  }

  return new Stats(sizeInBytes, numRows);
}
 
Example 30
Source Project: iceberg   Source File: Util.java    License: Apache License 2.0
public static String[] blockLocations(FileIO io, CombinedScanTask task) {
  Set<String> locations = Sets.newHashSet();
  for (FileScanTask f : task.files()) {
    InputFile in = io.newInputFile(f.file().path().toString());
    if (in instanceof HadoopInputFile) {
      Collections.addAll(locations, ((HadoopInputFile) in).getBlockLocations(f.start(), f.length()));
    }
  }

  return locations.toArray(HadoopInputFile.NO_LOCATION_PREFERENCE);
}