Java Code Examples for org.apache.iceberg.FileScanTask#file()

The following examples show how to use org.apache.iceberg.FileScanTask#file(). Each example is taken from an open source project; the source file and license are noted above the code.
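Before the project examples, here is a minimal sketch of the typical call pattern: plan a table scan, then call file() on each FileScanTask to reach the underlying DataFile and its metadata. The Table instance and the printDataFiles helper are assumptions used for illustration only; they are not taken from the projects below.

import java.io.IOException;

import org.apache.iceberg.DataFile;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.Table;
import org.apache.iceberg.io.CloseableIterable;

public class ScanFileMetadata {
  // Plan the scan for an already-loaded table and print basic metadata of the
  // DataFile behind each FileScanTask.
  static void printDataFiles(Table table) throws IOException {
    try (CloseableIterable<FileScanTask> tasks = table.newScan().planFiles()) {
      for (FileScanTask task : tasks) {
        DataFile file = task.file();  // the data file backing this scan task
        System.out.printf("%s (%s): %d records, %d bytes%n",
            file.path(), file.format(), file.recordCount(), file.fileSizeInBytes());
      }
    }
  }
}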
Example 1
Source File: IcebergInputFormat.java    From iceberg with Apache License 2.0
private CloseableIterable<T> open(FileScanTask currentTask, Schema readSchema) {
  DataFile file = currentTask.file();
  // TODO we should make use of FileIO to create inputFile
  InputFile inputFile = HadoopInputFile.fromLocation(file.path(), context.getConfiguration());
  CloseableIterable<T> iterable;
  switch (file.format()) {
    case AVRO:
      iterable = newAvroIterable(inputFile, currentTask, readSchema);
      break;
    case ORC:
      iterable = newOrcIterable(inputFile, currentTask, readSchema);
      break;
    case PARQUET:
      iterable = newParquetIterable(inputFile, currentTask, readSchema);
      break;
    default:
      throw new UnsupportedOperationException(
          String.format("Cannot read %s file: %s", file.format().name(), file.path()));
  }

  return iterable;
}
 
Example 2
Source File: RowDataReader.java    From iceberg with Apache License 2.0
@Override
CloseableIterator<InternalRow> open(FileScanTask task) {
  DataFile file = task.file();

  // update the current file for Spark's filename() function
  InputFileBlockHolder.set(file.path().toString(), task.start(), task.length());

  // schema or rows returned by readers
  PartitionSpec spec = task.spec();
  Set<Integer> idColumns = spec.identitySourceIds();
  Schema partitionSchema = TypeUtil.select(expectedSchema, idColumns);
  boolean projectsIdentityPartitionColumns = !partitionSchema.columns().isEmpty();

  if (projectsIdentityPartitionColumns) {
    return open(task, expectedSchema, PartitionUtil.constantsMap(task, RowDataReader::convertConstant))
        .iterator();
  }
  // return the base iterator
  return open(task, expectedSchema, ImmutableMap.of()).iterator();
}
 
Example 3
Source File: TestWriteMetricsConfig.java    From iceberg with Apache License 2.0
@Test
public void testFullMetricsCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "full");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertEquals(2, file.nullValueCounts().size());
    Assert.assertEquals(2, file.valueCounts().size());
    Assert.assertEquals(2, file.lowerBounds().size());
    Assert.assertEquals(2, file.upperBounds().size());
  }
}
 
Example 4
Source File: TestWriteMetricsConfig.java    From iceberg with Apache License 2.0
@Test
public void testCountMetricsCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertEquals(2, file.nullValueCounts().size());
    Assert.assertEquals(2, file.valueCounts().size());
    Assert.assertTrue(file.lowerBounds().isEmpty());
    Assert.assertTrue(file.upperBounds().isEmpty());
  }
}
 
Example 5
Source File: TestWriteMetricsConfig.java    From iceberg with Apache License 2.0
@Test
public void testNoMetricsCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertTrue(file.nullValueCounts().isEmpty());
    Assert.assertTrue(file.valueCounts().isEmpty());
    Assert.assertTrue(file.lowerBounds().isEmpty());
    Assert.assertTrue(file.upperBounds().isEmpty());
  }
}
 
Example 6
Source File: TestWriteMetricsConfig.java    From iceberg with Apache License 2.0
@Test
public void testCustomMetricCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts");
  properties.put("write.metadata.metrics.column.id", "full");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  Schema schema = table.schema();
  Types.NestedField id = schema.findField("id");
  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertEquals(2, file.nullValueCounts().size());
    Assert.assertEquals(2, file.valueCounts().size());
    Assert.assertEquals(1, file.lowerBounds().size());
    Assert.assertTrue(file.lowerBounds().containsKey(id.fieldId()));
    Assert.assertEquals(1, file.upperBounds().size());
    Assert.assertTrue(file.upperBounds().containsKey(id.fieldId()));
  }
}
 
Example 7
Source File: TestWriteMetricsConfig.java    From iceberg with Apache License 2.0
@Test
public void testCustomMetricCollectionForNestedParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.builderFor(COMPLEX_SCHEMA)
      .identity("strCol")
      .build();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none");
  properties.put("write.metadata.metrics.column.longCol", "counts");
  properties.put("write.metadata.metrics.column.record.id", "full");
  properties.put("write.metadata.metrics.column.record.data", "truncate(2)");
  Table table = tables.create(COMPLEX_SCHEMA, spec, properties, tableLocation);

  Iterable<InternalRow> rows = RandomData.generateSpark(COMPLEX_SCHEMA, 10, 0);
  JavaRDD<InternalRow> rdd = sc.parallelize(Lists.newArrayList(rows));
  Dataset<Row> df = spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), convert(COMPLEX_SCHEMA), false);

  df.coalesce(1).write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  Schema schema = table.schema();
  Types.NestedField longCol = schema.findField("longCol");
  Types.NestedField recordId = schema.findField("record.id");
  Types.NestedField recordData = schema.findField("record.data");
  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();

    Map<Integer, Long> nullValueCounts = file.nullValueCounts();
    Assert.assertEquals(3, nullValueCounts.size());
    Assert.assertTrue(nullValueCounts.containsKey(longCol.fieldId()));
    Assert.assertTrue(nullValueCounts.containsKey(recordId.fieldId()));
    Assert.assertTrue(nullValueCounts.containsKey(recordData.fieldId()));

    Map<Integer, Long> valueCounts = file.valueCounts();
    Assert.assertEquals(3, valueCounts.size());
    Assert.assertTrue(valueCounts.containsKey(longCol.fieldId()));
    Assert.assertTrue(valueCounts.containsKey(recordId.fieldId()));
    Assert.assertTrue(valueCounts.containsKey(recordData.fieldId()));

    Map<Integer, ByteBuffer> lowerBounds = file.lowerBounds();
    Assert.assertEquals(2, lowerBounds.size());
    Assert.assertTrue(lowerBounds.containsKey(recordId.fieldId()));
    ByteBuffer recordDataLowerBound = lowerBounds.get(recordData.fieldId());
    Assert.assertEquals(2, ByteBuffers.toByteArray(recordDataLowerBound).length);

    Map<Integer, ByteBuffer> upperBounds = file.upperBounds();
    Assert.assertEquals(2, upperBounds.size());
    Assert.assertTrue(upperBounds.containsKey(recordId.fieldId()));
    ByteBuffer recordDataUpperBound = upperBounds.get(recordData.fieldId());
    Assert.assertEquals(2, ByteBuffers.toByteArray(recordDataUpperBound).length);
  }
}
 
Example 8
Source File: IcebergPigInputFormat.java    From iceberg with Apache License 2.0
@SuppressWarnings("unchecked")
private boolean advance() throws IOException {
  if (reader != null) {
    reader.close();
  }

  if (!tasks.hasNext()) {
    return false;
  }

  FileScanTask currentTask = tasks.next();

  Schema tableSchema = (Schema) ObjectSerializer.deserialize(context.getConfiguration().get(scope(ICEBERG_SCHEMA)));
  LOG.debug("[{}]: Task table schema: {}", signature, tableSchema);

  List<String> projectedFields =
      (List<String>) ObjectSerializer.deserialize(context.getConfiguration().get(scope(ICEBERG_PROJECTED_FIELDS)));
  LOG.debug("[{}]: Task projected fields: {}", signature, projectedFields);

  Schema projectedSchema = projectedFields != null ? SchemaUtil.project(tableSchema, projectedFields) : tableSchema;

  PartitionSpec spec = currentTask.asFileScanTask().spec();
  DataFile file = currentTask.file();
  InputFile inputFile = HadoopInputFile.fromLocation(file.path(), context.getConfiguration());

  Set<Integer> idColumns = spec.identitySourceIds();

  // schema needed for the projection and filtering
  boolean hasJoinedPartitionColumns = !idColumns.isEmpty();

  switch (file.format()) {
    case PARQUET:
      Map<Integer, Object> partitionValueMap = Maps.newHashMap();

      if (hasJoinedPartitionColumns) {

        Schema readSchema = TypeUtil.selectNot(projectedSchema, idColumns);
        Schema projectedPartitionSchema = TypeUtil.select(projectedSchema, idColumns);

        Map<String, Integer> partitionSpecFieldIndexMap = Maps.newHashMap();
        for (int i = 0; i < spec.fields().size(); i++) {
          partitionSpecFieldIndexMap.put(spec.fields().get(i).name(), i);
        }

        for (Types.NestedField field : projectedPartitionSchema.columns()) {
          int partitionIndex = partitionSpecFieldIndexMap.get(field.name());

          Object partitionValue = file.partition().get(partitionIndex, Object.class);
          partitionValueMap.put(field.fieldId(), convertPartitionValue(field.type(), partitionValue));
        }

        reader = Parquet.read(inputFile)
            .project(readSchema)
            .split(currentTask.start(), currentTask.length())
            .filter(currentTask.residual())
            .createReaderFunc(
                fileSchema -> PigParquetReader.buildReader(fileSchema, projectedSchema, partitionValueMap))
            .build();
      } else {
        reader = Parquet.read(inputFile)
            .project(projectedSchema)
            .split(currentTask.start(), currentTask.length())
            .filter(currentTask.residual())
            .createReaderFunc(
                fileSchema -> PigParquetReader.buildReader(fileSchema, projectedSchema, partitionValueMap))
            .build();
      }

      recordIterator = reader.iterator();

      break;
    default:
      throw new UnsupportedOperationException("Unsupported file format: " + file.format());
  }

  return true;
}
 
Example 9
Source File: IcebergTableWrapper.java    From dremio-oss with Apache License 2.0
DatasetSplit from(FileScanTask task) throws IOException {
  // TODO ravindra: iceberg does not track counts at a row-group level. We should fallback to
  // an alternate codepath for this.
  DataFile dataFile = task.file();
  if (dataFile.splitOffsets() != null && dataFile.splitOffsets().size() > 1) {
    throw new UnsupportedOperationException("iceberg does not support multiple row groups yet");
  }
  String pathString = dataFile.path().toString();
  FileAttributes fileAttributes = fs.getFileAttributes(Path.of(pathString));

  // Get the per-column value counts. The counts may not be present.
  List<ColumnValueCount> columnValueCounts = Lists.newArrayList();
  if (dataFile.valueCounts() != null) {
    for (Map.Entry<Integer, Long> entry : dataFile.valueCounts().entrySet()) {
      String columnName = schema.findColumnName(entry.getKey());
      Types.NestedField field = schema.findField(entry.getKey());
      if (field == null || columnName == null) {
        // we are not updating counts for list elements.
        continue;
      }

      Long totalCount = entry.getValue();
      Long nullValueCount = dataFile.nullValueCounts().get(entry.getKey());
      if (totalCount == null || nullValueCount == null) {
        continue;
      }

      long nonNullValueCount = totalCount - nullValueCount;
      columnValueCounts.add(
          ColumnValueCount.newBuilder()
              .setColumn(columnName)
              .setCount(nonNullValueCount)
              .build());
      // aggregate into the dataset level column-value counts.
      datasetColumnValueCounts.merge(
          columnName, nonNullValueCount, (x, y) -> y + nonNullValueCount);
    }
  }

  // Create the split-level xattr with details about the column counts.
  // TODO: not populating column bounds here since they are not used by dremio planner.
  ParquetDatasetSplitXAttr splitExtended =
      ParquetDatasetSplitXAttr.newBuilder()
          .setPath(pathString)
          .setStart(task.start())
          .setRowGroupIndex(0)
          .setUpdateKey(
              FileSystemCachedEntity.newBuilder()
                  .setPath(pathString)
                  .setLastModificationTime(fileAttributes.lastModifiedTime().toMillis())
                  .setLength(fileAttributes.size()))
          .addAllColumnValueCounts(columnValueCounts)
          .setLength(task.length())
          .build();

  // build the host affinity details for the split.
  Map<HostAndPort, Float> affinities =
    Metadata.getHostAffinity(fs, fileAttributes, task.start(), task.length());
  List<DatasetSplitAffinity> splitAffinities = new ArrayList<>();
  for (ObjectLongCursor<HostAndPort> item :
    ParquetGroupScanUtils.buildEndpointByteMap(activeHostMap,
      activeHostPortMap, affinities, task.length())) {
    splitAffinities.add(DatasetSplitAffinity.of(item.key.toString(), item.value));
  }

  return DatasetSplit.of(
      splitAffinities, task.length(), task.file().recordCount(), splitExtended::writeTo);
}