org.apache.iceberg.DataFile Java Examples

The following examples show how to use org.apache.iceberg.DataFile. Each example is taken from an open-source project; the source file and project are noted above the code.
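Before the individual examples, here is a minimal sketch of the pattern most of them share: describing a file that already exists on storage with DataFiles.builder and committing it to a table through an append operation. The class name, the table and spec parameters, and the path, file size, and record count below are illustrative placeholders, not values taken from any of the projects.

import org.apache.iceberg.DataFile;
import org.apache.iceberg.DataFiles;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Table;

public class DataFileAppendSketch {

  // Registers an existing file as a DataFile and appends it to the table,
  // producing a new snapshot. Path, size, and record count are placeholders.
  static void appendExistingFile(Table table, PartitionSpec spec) {
    DataFile dataFile = DataFiles.builder(spec)
        .withPath("/path/to/data-a.parquet")   // location of the existing data file
        .withFileSizeInBytes(1024L)            // its size in bytes
        .withRecordCount(100L)                 // number of rows it contains
        .build();

    table.newAppend()           // start an append operation
        .appendFile(dataFile)   // add the new data file
        .commit();              // committing creates a new table snapshot
  }
}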
Example #1
Source File: TestIcebergInputFormat.java    From iceberg with Apache License 2.0
@Test
public void testProjection() throws Exception {
  File location = temp.newFolder(format.name());
  Assert.assertTrue(location.delete());
  Schema projectedSchema = TypeUtil.select(SCHEMA, ImmutableSet.of(1));
  Table table = tables.create(SCHEMA, SPEC,
                              ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
                              location.toString());
  List<Record> inputRecords = RandomGenericData.generate(table.schema(), 1, 0L);
  DataFile dataFile = writeFile(table, Row.of("2020-03-20", 0), format, inputRecords);
  table.newAppend()
       .appendFile(dataFile)
       .commit();

  Job job = Job.getInstance(conf);
  IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
  configBuilder
      .readFrom(location.toString())
      .project(projectedSchema);
  List<Record> outputRecords = readRecords(job.getConfiguration());
  Assert.assertEquals(inputRecords.size(), outputRecords.size());
  Assert.assertEquals(projectedSchema.asStruct(), outputRecords.get(0).struct());
}
 
Example #2
Source File: WriterCommitterOperator.java    From dremio-oss with Apache License 2.0
@Override
public void consumeData(int records) throws Exception {
  project.consumeData(records);
  if (icebergTableCommitter) {
    List<DataFile> icebergDatafiles = new ArrayList<>();
    for (int i = 0; i < records; ++i) {
      DataFile dataFile = IcebergSerDe.deserializeDataFile(icebergMetadataVector.get(i));
      icebergDatafiles.add(dataFile);
    }
    if (icebergDatafiles.size() > 0) {
      try (AutoCloseable ac = OperatorStats.getWaitRecorder(context.getStats())) {
        icebergOpCommitter.consumeData(icebergDatafiles);
      }
    }
  }
  recordCount += records;
}
 
Example #3
Source File: HiveCreateReplaceTableTest.java    From iceberg with Apache License 2.0
@Test
public void testCreateTableTxnWithGlobalTableLocation() {
  Assert.assertFalse("Table should not exist", catalog.tableExists(TABLE_IDENTIFIER));

  Transaction txn = catalog.newCreateTableTransaction(
      TABLE_IDENTIFIER, SCHEMA, SPEC, "file:///" + tableLocation, Maps.newHashMap());
  txn.commitTransaction();

  Table table = catalog.loadTable(TABLE_IDENTIFIER);

  DataFile dataFile = DataFiles.builder(SPEC)
      .withPath("/path/to/data-a.parquet")
      .withFileSizeInBytes(0)
      .withRecordCount(1)
      .build();

  table.newAppend()
      .appendFile(dataFile)
      .commit();

  Assert.assertEquals("Write should succeed", 1, Iterables.size(table.snapshots()));
}
 
Example #4
Source File: HiveCreateReplaceTableTest.java    From iceberg with Apache License 2.0
@Test
public void testCreateTableTxnAndAppend() {
  Assert.assertFalse("Table should not exist", catalog.tableExists(TABLE_IDENTIFIER));

  Transaction txn = catalog.newCreateTableTransaction(
      TABLE_IDENTIFIER, SCHEMA, SPEC, tableLocation, Maps.newHashMap());

  AppendFiles append = txn.newAppend();
  DataFile dataFile = DataFiles.builder(SPEC)
      .withPath("/path/to/data-a.parquet")
      .withFileSizeInBytes(0)
      .withRecordCount(1)
      .build();
  append.appendFile(dataFile);
  append.commit();
  txn.commitTransaction();

  Table table = catalog.loadTable(TABLE_IDENTIFIER);
  Snapshot snapshot = table.currentSnapshot();
  Assert.assertTrue("Table should have one manifest file", snapshot.allManifests().size() == 1);
}
 
Example #5
Source File: BaseWriter.java    From iceberg with Apache License 2.0
protected void closeCurrent() throws IOException {
  if (currentAppender != null) {
    currentAppender.close();
    // metrics are only valid after the appender is closed
    Metrics metrics = currentAppender.metrics();
    long fileSizeInBytes = currentAppender.length();
    List<Long> splitOffsets = currentAppender.splitOffsets();
    this.currentAppender = null;

    if (metrics.recordCount() == 0L) {
      io.deleteFile(currentFile.encryptingOutputFile());
    } else {
      DataFile dataFile = DataFiles.builder(spec)
          .withEncryptionKeyMetadata(currentFile.keyMetadata())
          .withPath(currentFile.encryptingOutputFile().location())
          .withFileSizeInBytes(fileSizeInBytes)
          .withPartition(spec.fields().size() == 0 ? null : currentKey) // set null if unpartitioned
          .withMetrics(metrics)
          .withSplitOffsets(splitOffsets)
          .build();
      completedFiles.add(dataFile);
    }

    this.currentFile = null;
  }
}
 
Example #6
Source File: RowDataReader.java    From iceberg with Apache License 2.0
@Override
CloseableIterator<InternalRow> open(FileScanTask task) {
  DataFile file = task.file();

  // update the current file for Spark's filename() function
  InputFileBlockHolder.set(file.path().toString(), task.start(), task.length());

  // schema or rows returned by readers
  PartitionSpec spec = task.spec();
  Set<Integer> idColumns = spec.identitySourceIds();
  Schema partitionSchema = TypeUtil.select(expectedSchema, idColumns);
  boolean projectsIdentityPartitionColumns = !partitionSchema.columns().isEmpty();

  if (projectsIdentityPartitionColumns) {
    return open(task, expectedSchema, PartitionUtil.constantsMap(task, RowDataReader::convertConstant))
        .iterator();
  }
  // return the base iterator
  return open(task, expectedSchema, ImmutableMap.of()).iterator();
}
 
Example #7
Source File: SparkTableUtil.java    From iceberg with Apache License 2.0
private static Iterator<ManifestFile> buildManifest(SerializableConfiguration conf, PartitionSpec spec,
                                                    String basePath, Iterator<Tuple2<String, DataFile>> fileTuples) {
  if (fileTuples.hasNext()) {
    FileIO io = new HadoopFileIO(conf.get());
    TaskContext ctx = TaskContext.get();
    String suffix = String.format("stage-%d-task-%d-manifest", ctx.stageId(), ctx.taskAttemptId());
    Path location = new Path(basePath, suffix);
    String outputPath = FileFormat.AVRO.addExtension(location.toString());
    OutputFile outputFile = io.newOutputFile(outputPath);
    ManifestWriter<DataFile> writer = ManifestFiles.write(spec, outputFile);

    try (ManifestWriter<DataFile> writerRef = writer) {
      fileTuples.forEachRemaining(fileTuple -> writerRef.add(fileTuple._2));
    } catch (IOException e) {
      throw SparkExceptionUtil.toUncheckedException(e, "Unable to close the manifest writer: %s", outputPath);
    }

    ManifestFile manifestFile = writer.toManifestFile();
    return ImmutableList.of(manifestFile).iterator();
  } else {
    return Collections.emptyIterator();
  }
}
 
Example #8
Source File: RewriteDataFilesAction.java    From iceberg with Apache License 2.0
private void replaceDataFiles(Iterable<DataFile> deletedDataFiles, Iterable<DataFile> addedDataFiles) {
  try {
    RewriteFiles rewriteFiles = table.newRewrite();
    rewriteFiles.rewriteFiles(Sets.newHashSet(deletedDataFiles), Sets.newHashSet(addedDataFiles));
    commit(rewriteFiles);

  } catch (Exception e) {
    Tasks.foreach(Iterables.transform(addedDataFiles, f -> f.path().toString()))
        .noRetry()
        .suppressFailureWhenFinished()
        .onFailure((location, exc) -> LOG.warn("Failed to delete: {}", location, exc))
        .run(fileIO::deleteFile);

    throw e;
  }
}
 
Example #9
Source File: TestIcebergInputFormat.java    From iceberg with Apache License 2.0
@Test
public void testCustomCatalog() throws Exception {
  conf = new Configuration();
  conf.set("warehouse.location", temp.newFolder("hadoop_catalog").getAbsolutePath());

  Catalog catalog = new HadoopCatalogFunc().apply(conf);
  TableIdentifier tableIdentifier = TableIdentifier.of("db", "t");
  Table table = catalog.createTable(tableIdentifier, SCHEMA, SPEC,
                                    ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()));
  List<Record> expectedRecords = RandomGenericData.generate(table.schema(), 1, 0L);
  expectedRecords.get(0).set(2, "2020-03-20");
  DataFile dataFile = writeFile(table, Row.of("2020-03-20", 0), format, expectedRecords);
  table.newAppend()
       .appendFile(dataFile)
       .commit();

  Job job = Job.getInstance(conf);
  IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
  configBuilder
      .catalogFunc(HadoopCatalogFunc.class)
      .readFrom(tableIdentifier.toString());
  validate(job, expectedRecords);
}
 
Example #10
Source File: IcebergInputFormat.java    From iceberg with Apache License 2.0
private CloseableIterable<T> open(FileScanTask currentTask, Schema readSchema) {
  DataFile file = currentTask.file();
  // TODO we should make use of FileIO to create inputFile
  InputFile inputFile = HadoopInputFile.fromLocation(file.path(), context.getConfiguration());
  CloseableIterable<T> iterable;
  switch (file.format()) {
    case AVRO:
      iterable = newAvroIterable(inputFile, currentTask, readSchema);
      break;
    case ORC:
      iterable = newOrcIterable(inputFile, currentTask, readSchema);
      break;
    case PARQUET:
      iterable = newParquetIterable(inputFile, currentTask, readSchema);
      break;
    default:
      throw new UnsupportedOperationException(
          String.format("Cannot read %s file: %s", file.format().name(), file.path()));
  }

  return iterable;
}
 
Example #11
Source File: TestIcebergInputFormat.java    From iceberg with Apache License 2.0
@Test
public void testUnpartitionedTable() throws Exception {
  File location = temp.newFolder(format.name());
  Assert.assertTrue(location.delete());
  Table table = tables.create(SCHEMA, PartitionSpec.unpartitioned(),
                              ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
                              location.toString());
  List<Record> expectedRecords = RandomGenericData.generate(table.schema(), 1, 0L);
  DataFile dataFile = writeFile(table, null, format, expectedRecords);
  table.newAppend()
       .appendFile(dataFile)
       .commit();
  Job job = Job.getInstance(conf);
  IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
  configBuilder.readFrom(location.toString());
  validate(job, expectedRecords);
}
 
Example #12
Source File: TestIcebergInputFormat.java    From iceberg with Apache License 2.0
@Test
public void testPartitionedTable() throws Exception {
  File location = temp.newFolder(format.name());
  Assert.assertTrue(location.delete());
  Table table = tables.create(SCHEMA, SPEC,
                              ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
                              location.toString());
  List<Record> expectedRecords = RandomGenericData.generate(table.schema(), 1, 0L);
  expectedRecords.get(0).set(2, "2020-03-20");
  DataFile dataFile = writeFile(table, Row.of("2020-03-20", 0), format, expectedRecords);
  table.newAppend()
       .appendFile(dataFile)
       .commit();

  Job job = Job.getInstance(conf);
  IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
  configBuilder.readFrom(location.toString());
  validate(job, expectedRecords);
}
 
Example #13
Source File: TestIcebergInputFormat.java    From iceberg with Apache License 2.0
@Test
public void testFilterExp() throws Exception {
  File location = temp.newFolder(format.name());
  Assert.assertTrue(location.delete());
  Table table = tables.create(SCHEMA, SPEC,
                              ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
                              location.toString());
  List<Record> expectedRecords = RandomGenericData.generate(table.schema(), 2, 0L);
  expectedRecords.get(0).set(2, "2020-03-20");
  expectedRecords.get(1).set(2, "2020-03-20");
  DataFile dataFile1 = writeFile(table, Row.of("2020-03-20", 0), format, expectedRecords);
  DataFile dataFile2 = writeFile(table, Row.of("2020-03-21", 0), format,
                                 RandomGenericData.generate(table.schema(), 2, 0L));
  table.newAppend()
       .appendFile(dataFile1)
       .appendFile(dataFile2)
       .commit();
  Job job = Job.getInstance(conf);
  IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
  configBuilder.readFrom(location.toString())
               .filter(Expressions.equal("date", "2020-03-20"));
  validate(job, expectedRecords);
}
 
Example #14
Source File: TestWriteMetricsConfig.java    From iceberg with Apache License 2.0
@Test
public void testCustomMetricCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts");
  properties.put("write.metadata.metrics.column.id", "full");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  Schema schema = table.schema();
  Types.NestedField id = schema.findField("id");
  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertEquals(2, file.nullValueCounts().size());
    Assert.assertEquals(2, file.valueCounts().size());
    Assert.assertEquals(1, file.lowerBounds().size());
    Assert.assertTrue(file.lowerBounds().containsKey(id.fieldId()));
    Assert.assertEquals(1, file.upperBounds().size());
    Assert.assertTrue(file.upperBounds().containsKey(id.fieldId()));
  }
}
 
Example #15
Source File: TestWriteMetricsConfig.java    From iceberg with Apache License 2.0
@Test
public void testNoMetricsCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertTrue(file.nullValueCounts().isEmpty());
    Assert.assertTrue(file.valueCounts().isEmpty());
    Assert.assertTrue(file.lowerBounds().isEmpty());
    Assert.assertTrue(file.upperBounds().isEmpty());
  }
}
 
Example #16
Source File: TestIcebergTableDrop.java    From dremio-oss with Apache License 2.0
private DataFile createDataFile(File dir, String fileName) throws Exception {
  File dataFile = new File(dir, fileName);
  URI resource = Resources.getResource(
    "iceberg/nation/data/00000-1-a9e8d979-a183-40c5-af3d-a338ab62be8b-00000.parquet").toURI();
  Files.copy(Paths.get(resource), dataFile.toPath());

  return DataFiles.builder(PartitionSpec.builderFor(schema).build())
    .withInputFile(org.apache.iceberg.Files.localInput(dataFile))
    .withRecordCount(25)
    .withFormat(FileFormat.PARQUET)
    .build();
}
 
Example #17
Source File: TestIcebergManifests.java    From dremio-oss with Apache License 2.0
List<DataFile> getDataFiles(PartitionSpec partitionSpec, int partitionValueSize, int dataFilesCount, String columnName) {
  List<DataFile> dataFiles = new ArrayList<>();
  for (int i = 0; i < dataFilesCount; ++i) {
    String partitionValue = RandomStringUtils.randomAlphanumeric(partitionValueSize);
    String datafileName = RandomStringUtils.randomAlphanumeric(64);
    dataFiles.add(DataFiles.builder(partitionSpec)
      .withInputFile(Files.localInput(datafileName + ".parquet"))
      .withRecordCount(50)
      .withFormat(FileFormat.PARQUET)
      .withPartitionPath(columnName + "=" + partitionValue)
      .build());
  }
  return dataFiles;
}
 
Example #18
Source File: RowDataRewriter.java    From iceberg with Apache License 2.0
public List<DataFile> rewriteDataForTasks(JavaRDD<CombinedScanTask> taskRDD) {
  JavaRDD<TaskResult> taskCommitRDD = taskRDD.map(this::rewriteDataForTask);

  return taskCommitRDD.collect().stream()
      .flatMap(taskCommit -> Arrays.stream(taskCommit.files()))
      .collect(Collectors.toList());
}
 
Example #19
Source File: TestSparkDataFile.java    From iceberg with Apache License 2.0
private void checkSparkDataFile(Table table) throws IOException {
  Iterable<InternalRow> rows = RandomData.generateSpark(table.schema(), 200, 0);
  JavaRDD<InternalRow> rdd = sparkContext.parallelize(Lists.newArrayList(rows));
  Dataset<Row> df = spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), SparkSchemaUtil.convert(table.schema()), false);

  df.write().format("iceberg").mode("append").save(tableLocation);

  table.refresh();

  List<ManifestFile> manifests = table.currentSnapshot().allManifests();
  Assert.assertEquals("Should have 1 manifest", 1, manifests.size());

  List<DataFile> dataFiles = Lists.newArrayList();
  try (ManifestReader<DataFile> reader = ManifestFiles.read(manifests.get(0), table.io())) {
    reader.forEach(dataFile -> dataFiles.add(dataFile.copy()));
  }

  Dataset<Row> dataFileDF = spark.read().format("iceberg").load(tableLocation + "#files");

  // reorder columns to test arbitrary projections
  List<Column> columns = Arrays.stream(dataFileDF.columns())
      .map(ColumnName::new)
      .collect(Collectors.toList());
  Collections.shuffle(columns);

  List<Row> sparkDataFiles = dataFileDF
      .select(Iterables.toArray(columns, Column.class))
      .collectAsList();

  Assert.assertEquals("The number of files should match", dataFiles.size(), sparkDataFiles.size());

  Types.StructType dataFileType = DataFile.getType(table.spec().partitionType());
  StructType sparkDataFileType = sparkDataFiles.get(0).schema();
  SparkDataFile wrapper = new SparkDataFile(dataFileType, sparkDataFileType);

  for (int i = 0; i < dataFiles.size(); i++) {
    checkDataFile(dataFiles.get(i), wrapper.wrap(sparkDataFiles.get(i)));
  }
}
 
Example #20
Source File: TestSparkDataFile.java    From iceberg with Apache License 2.0
private void checkDataFile(DataFile expected, DataFile actual) {
  Assert.assertEquals("Path must match", expected.path(), actual.path());
  Assert.assertEquals("Format must match", expected.format(), actual.format());
  Assert.assertEquals("Record count must match", expected.recordCount(), actual.recordCount());
  Assert.assertEquals("Size must match", expected.fileSizeInBytes(), actual.fileSizeInBytes());
  Assert.assertEquals("Record value counts must match", expected.valueCounts(), actual.valueCounts());
  Assert.assertEquals("Record null value counts must match", expected.nullValueCounts(), actual.nullValueCounts());
  Assert.assertEquals("Lower bounds must match", expected.lowerBounds(), actual.lowerBounds());
  Assert.assertEquals("Upper bounds must match", expected.upperBounds(), actual.upperBounds());
  Assert.assertEquals("Key metadata must match", expected.keyMetadata(), actual.keyMetadata());
  Assert.assertEquals("Split offsets must match", expected.splitOffsets(), actual.splitOffsets());

  checkStructLike(expected.partition(), actual.partition());
}
 
Example #21
Source File: TestWriteMetricsConfig.java    From iceberg with Apache License 2.0
@Test
public void testFullMetricsCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "full");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertEquals(2, file.nullValueCounts().size());
    Assert.assertEquals(2, file.valueCounts().size());
    Assert.assertEquals(2, file.lowerBounds().size());
    Assert.assertEquals(2, file.upperBounds().size());
  }
}
 
Example #22
Source File: TestWriteMetricsConfig.java    From iceberg with Apache License 2.0
@Test
public void testCountMetricsCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertEquals(2, file.nullValueCounts().size());
    Assert.assertEquals(2, file.valueCounts().size());
    Assert.assertTrue(file.lowerBounds().isEmpty());
    Assert.assertTrue(file.upperBounds().isEmpty());
  }
}
 
Example #23
Source File: IcebergCatalog.java    From dremio-oss with Apache License 2.0
public void consumeData(List<DataFile> filesList) {
  Preconditions.checkState(transaction != null, "Transaction was not started");
  Preconditions.checkState(appendFiles != null, "Transaction was not started");

  filesList
    .stream()
    .forEach(x -> appendFiles.appendFile(x));

  // Adds the current update to the transaction, where it is marked as pending commit.
  // The final commit on the transaction in the end method makes these files part of the table.
}
 
Example #24
Source File: SparkBatchWrite.java    From iceberg with Apache License 2.0 5 votes vote down vote up
private void replacePartitions(WriterCommitMessage[] messages) {
  ReplacePartitions dynamicOverwrite = table.newReplacePartitions();

  int numFiles = 0;
  for (DataFile file : files(messages)) {
    numFiles += 1;
    dynamicOverwrite.addFile(file);
  }

  commitOperation(dynamicOverwrite, numFiles, "dynamic partition overwrite");
}
 
Example #25
Source File: TestRefresh.java    From dremio-oss with Apache License 2.0
private DataFile createDataFile(File dir, String fileName) throws Exception {
  File dataFile = new File(dir, fileName);
  URI resource = Resources.getResource(
    "iceberg/nation/data/00000-1-a9e8d979-a183-40c5-af3d-a338ab62be8b-00000.parquet").toURI();
  Files.copy(Paths.get(resource), dataFile.toPath());

  return DataFiles.builder(PartitionSpec.builderFor(schema).build())
    .withInputFile(org.apache.iceberg.Files.localInput(dataFile))
    .withRecordCount(25)
    .withFormat(FileFormat.PARQUET)
    .build();
}
 
Example #26
Source File: RewriteManifestsAction.java    From iceberg with Apache License 2.0 5 votes vote down vote up
private static ManifestFile writeManifest(
    List<Row> rows, int startIndex, int endIndex, Broadcast<FileIO> io,
    String location, int format, PartitionSpec spec, StructType sparkType) throws IOException {

  String manifestName = "optimized-m-" + UUID.randomUUID();
  Path manifestPath = new Path(location, manifestName);
  OutputFile outputFile = io.value().newOutputFile(FileFormat.AVRO.addExtension(manifestPath.toString()));

  Types.StructType dataFileType = DataFile.getType(spec.partitionType());
  SparkDataFile wrapper = new SparkDataFile(dataFileType, sparkType);

  ManifestWriter writer = ManifestFiles.write(format, spec, outputFile, null);

  try {
    for (int index = startIndex; index < endIndex; index++) {
      Row row = rows.get(index);
      long snapshotId = row.getLong(0);
      long sequenceNumber = row.getLong(1);
      Row file = row.getStruct(2);
      writer.existing(wrapper.wrap(file), snapshotId, sequenceNumber);
    }
  } finally {
    writer.close();
  }

  return writer.toManifestFile();
}
 
Example #27
Source File: SparkBatchWrite.java    From iceberg with Apache License 2.0
private void overwrite(WriterCommitMessage[] messages) {
  OverwriteFiles overwriteFiles = table.newOverwrite();
  overwriteFiles.overwriteByRowFilter(overwriteExpr);

  int numFiles = 0;
  for (DataFile file : files(messages)) {
    numFiles += 1;
    overwriteFiles.addFile(file);
  }

  commitOperation(overwriteFiles, numFiles, "overwrite by filter");
}
 
Example #28
Source File: SparkBatchWrite.java    From iceberg with Apache License 2.0 5 votes vote down vote up
protected Iterable<DataFile> files(WriterCommitMessage[] messages) {
  if (messages.length > 0) {
    return Iterables.concat(Iterables.transform(Arrays.asList(messages), message -> message != null ?
        ImmutableList.copyOf(((TaskCommit) message).files()) :
        ImmutableList.of()));
  }
  return ImmutableList.of();
}
 
Example #29
Source File: TestIcebergSerDe.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
@Test
public void testDataFileSerDe() throws Exception {
  File dataFile = new File(folder.getRoot(), "a.parquet");
  dataFile.createNewFile();

  PartitionSpec partitionSpec = PartitionSpec
    .builderFor(schema)
    .identity("i")
    .identity("data")
    .build();

  IcebergPartitionData icebergPartitionData = new IcebergPartitionData(partitionSpec.partitionType());
  icebergPartitionData.set(0, Integer.valueOf(10));
  icebergPartitionData.set(1, "def");

  DataFile d1 = DataFiles.builder(partitionSpec)
    .withInputFile(Files.localInput(dataFile))
    .withRecordCount(50)
    .withFormat(FileFormat.PARQUET)
    .withPartition(icebergPartitionData)
    .build();

  long d1RecordCount = d1.recordCount();
  byte[] dataFileBytes = IcebergSerDe.serializeDataFile(d1);
  DataFile d2 = IcebergSerDe.deserializeDataFile(dataFileBytes);
  long d2RecordCount = d2.recordCount();
  Assert.assertEquals(d1RecordCount, d2RecordCount);
  Assert.assertEquals((Integer)(d2.partition().get(0, Integer.class)), Integer.valueOf(10));
  Assert.assertEquals((String)(d2.partition().get(1, String.class)), "def");
}
 
Example #30
Source File: SparkTableUtil.java    From iceberg with Apache License 2.0 5 votes vote down vote up
private static List<DataFile> listAvroPartition(
    Map<String, String> partitionPath, String partitionUri, PartitionSpec spec, Configuration conf) {
  try {
    Path partition = new Path(partitionUri);
    FileSystem fs = partition.getFileSystem(conf);
    return Arrays.stream(fs.listStatus(partition, HIDDEN_PATH_FILTER))
        .filter(FileStatus::isFile)
        .map(stat -> {
          Metrics metrics = new Metrics(-1L, null, null, null);
          String partitionKey = spec.fields().stream()
              .map(PartitionField::name)
              .map(name -> String.format("%s=%s", name, partitionPath.get(name)))
              .collect(Collectors.joining("/"));

          return DataFiles.builder(spec)
              .withPath(stat.getPath().toString())
              .withFormat("avro")
              .withFileSizeInBytes(stat.getLen())
              .withMetrics(metrics)
              .withPartitionPath(partitionKey)
              .build();

        }).collect(Collectors.toList());
  } catch (IOException e) {
    throw SparkExceptionUtil.toUncheckedException(e, "Unable to list files in partition: %s", partitionUri);
  }
}