org.apache.iceberg.DataFiles Java Examples

The following examples show how to use org.apache.iceberg.DataFiles. Each example notes the source file it was taken from, the project it comes from, and that project's license.
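Most of the examples below follow the same pattern: obtain a DataFiles.Builder for a PartitionSpec, describe an already-written data file (path or input file, format, size, record count, metrics, partition), and commit the resulting DataFile to a table with an append operation. The sketch below is a minimal illustration of that pattern; it is not taken from any one project, and the table, spec, path, size, and record count are assumed placeholders.

import org.apache.iceberg.DataFile;
import org.apache.iceberg.DataFiles;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Table;

public class DataFilesUsageSketch {
  // Registers an existing Parquet file with an Iceberg table (placeholder values).
  static void appendExistingFile(Table table, PartitionSpec spec) {
    DataFile dataFile = DataFiles.builder(spec)
        .withPath("/path/to/data-a.parquet")   // location of the file that was already written
        .withFormat(FileFormat.PARQUET)        // file format of that file
        .withFileSizeInBytes(1024L)            // size recorded in table metadata
        .withRecordCount(100L)                 // row count recorded in table metadata
        .build();

    table.newAppend()                          // start an append operation
        .appendFile(dataFile)                  // add the new data file
        .commit();                             // produce a new table snapshot
  }
}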
Example #1
Source File: HiveCreateReplaceTableTest.java    From iceberg with Apache License 2.0
@Test
public void testCreateTableTxnAndAppend() {
  Assert.assertFalse("Table should not exist", catalog.tableExists(TABLE_IDENTIFIER));

  Transaction txn = catalog.newCreateTableTransaction(
      TABLE_IDENTIFIER, SCHEMA, SPEC, tableLocation, Maps.newHashMap());

  AppendFiles append = txn.newAppend();
  DataFile dataFile = DataFiles.builder(SPEC)
      .withPath("/path/to/data-a.parquet")
      .withFileSizeInBytes(0)
      .withRecordCount(1)
      .build();
  append.appendFile(dataFile);
  append.commit();
  txn.commitTransaction();

  Table table = catalog.loadTable(TABLE_IDENTIFIER);
  Snapshot snapshot = table.currentSnapshot();
  Assert.assertTrue("Table should have one manifest file", snapshot.allManifests().size() == 1);
}
 
Example #2
Source File: HiveCreateReplaceTableTest.java    From iceberg with Apache License 2.0
@Test
public void testCreateTableTxnWithGlobalTableLocation() {
  Assert.assertFalse("Table should not exist", catalog.tableExists(TABLE_IDENTIFIER));

  Transaction txn = catalog.newCreateTableTransaction(
      TABLE_IDENTIFIER, SCHEMA, SPEC, "file:///" + tableLocation, Maps.newHashMap());
  txn.commitTransaction();

  Table table = catalog.loadTable(TABLE_IDENTIFIER);

  DataFile dataFile = DataFiles.builder(SPEC)
      .withPath("/path/to/data-a.parquet")
      .withFileSizeInBytes(0)
      .withRecordCount(1)
      .build();

  table.newAppend()
      .appendFile(dataFile)
      .commit();

  Assert.assertEquals("Write should succeed", 1, Iterables.size(table.snapshots()));
}
 
Example #3
Source File: ParquetRecordWriter.java    From dremio-oss with Apache License 2.0
private byte[] getIcebergMetaData() throws IOException {
  if (!this.isIcebergWriter) {
    return null;
  }

  final long fileSize = parquetFileWriter.getPos();
  DataFiles.Builder dataFileBuilder =
    DataFiles.builder(IcebergCatalog.getIcebergPartitionSpec(this.batchSchema, this.partitionColumns))
      .withPath(path.toString())
      .withFileSizeInBytes(fileSize)
      .withRecordCount(recordCount)
      .withFormat(FileFormat.PARQUET);

  // add partition info
  if (partitionColumns != null) {
    dataFileBuilder = dataFileBuilder.withPartition(partition.getIcebergPartitionData());
  }

  // add column level metrics
  Metrics metrics = footerMetricsToIcebergMetrics(parquetFileWriter.getFooter(), batchSchema);
  dataFileBuilder = dataFileBuilder.withMetrics(metrics);
  return IcebergSerDe.serializeDataFile(dataFileBuilder.build());
}
 
Example #4
Source File: BaseWriter.java    From iceberg with Apache License 2.0
protected void closeCurrent() throws IOException {
  if (currentAppender != null) {
    currentAppender.close();
    // metrics are only valid after the appender is closed
    Metrics metrics = currentAppender.metrics();
    long fileSizeInBytes = currentAppender.length();
    List<Long> splitOffsets = currentAppender.splitOffsets();
    this.currentAppender = null;

    if (metrics.recordCount() == 0L) {
      io.deleteFile(currentFile.encryptingOutputFile());
    } else {
      DataFile dataFile = DataFiles.builder(spec)
          .withEncryptionKeyMetadata(currentFile.keyMetadata())
          .withPath(currentFile.encryptingOutputFile().location())
          .withFileSizeInBytes(fileSizeInBytes)
          .withPartition(spec.fields().size() == 0 ? null : currentKey) // set null if unpartitioned
          .withMetrics(metrics)
          .withSplitOffsets(splitOffsets)
          .build();
      completedFiles.add(dataFile);
    }

    this.currentFile = null;
  }
}
 
Example #5
Source File: TestIcebergManifests.java    From dremio-oss with Apache License 2.0
List<DataFile> getDataFiles(PartitionSpec partitionSpec, int partitionValueSize, int dataFilesCount, String columnName) {
  List<DataFile> dataFiles = new ArrayList<>();
  for( int i=0; i<dataFilesCount; ++i) {
    String partitionValue = RandomStringUtils.randomAlphanumeric(partitionValueSize);
    String datafileName = RandomStringUtils.randomAlphanumeric(64);
    dataFiles.add(DataFiles.builder(partitionSpec)
      .withInputFile(Files.localInput(datafileName+".parquet"))
      .withRecordCount(50)
      .withFormat(FileFormat.PARQUET)
      .withPartitionPath(columnName+"="+partitionValue)
      .build());
  }
  return dataFiles;
}
 
Example #6
Source File: IcebergMetadata.java    From presto with Apache License 2.0
@Override
public Optional<ConnectorOutputMetadata> finishInsert(ConnectorSession session, ConnectorInsertTableHandle insertHandle, Collection<Slice> fragments, Collection<ComputedStatistics> computedStatistics)
{
    IcebergWritableTableHandle table = (IcebergWritableTableHandle) insertHandle;
    org.apache.iceberg.Table icebergTable = transaction.table();

    List<CommitTaskData> commitTasks = fragments.stream()
            .map(slice -> commitTaskCodec.fromJson(slice.getBytes()))
            .collect(toImmutableList());

    Type[] partitionColumnTypes = icebergTable.spec().fields().stream()
            .map(field -> field.transform().getResultType(
                    icebergTable.schema().findType(field.sourceId())))
            .toArray(Type[]::new);

    AppendFiles appendFiles = transaction.newFastAppend();
    for (CommitTaskData task : commitTasks) {
        HdfsContext context = new HdfsContext(session, table.getSchemaName(), table.getTableName());
        Configuration configuration = hdfsEnvironment.getConfiguration(context, new Path(task.getPath()));

        DataFiles.Builder builder = DataFiles.builder(icebergTable.spec())
                .withInputFile(HadoopInputFile.fromLocation(task.getPath(), configuration))
                .withFormat(table.getFileFormat())
                .withMetrics(task.getMetrics().metrics());

        if (!icebergTable.spec().fields().isEmpty()) {
            String partitionDataJson = task.getPartitionDataJson()
                    .orElseThrow(() -> new VerifyException("No partition data for partitioned table"));
            builder.withPartition(PartitionData.fromJson(partitionDataJson, partitionColumnTypes));
        }

        appendFiles.appendFile(builder.build());
    }

    appendFiles.commit();
    transaction.commitTransaction();

    return Optional.of(new HiveWrittenPartitions(commitTasks.stream()
            .map(CommitTaskData::getPath)
            .collect(toImmutableList())));
}
 
Example #7
Source File: TestIcebergPartitions.java    From dremio-oss with Apache License 2.0
private DataFile createDataFile(File dir, String fileName, int idValue, String nameValue,
  int recordCount) throws IOException {
  File dataFile = new File(dir, fileName);
  dataFile.createNewFile();

  return DataFiles.builder(spec)
    .withInputFile(Files.localInput(dataFile))
    .withPartitionPath(ID + "=" + idValue + "/" + NAME + "=" + nameValue)
    .withRecordCount(recordCount)
    .withFormat(FileFormat.PARQUET)
    .build();
}
 
Example #8
Source File: TestIcebergPartitionData.java    From dremio-oss with Apache License 2.0
private void verifyPartitionValue(PartitionSpec partitionSpec, IcebergPartitionData partitionData,
                                  String columnName, Class expectedClass, Object expectedValue) throws Exception {
  File tableFolder = new File(folder.getRoot(), "icebergPartitionTest");
  try {
    tableFolder.mkdir();
    File dataFile = new File(folder.getRoot(), "a.parquet");

    dataFile.createNewFile();

    DataFile d1 = DataFiles.builder(partitionSpec)
      .withInputFile(Files.localInput(dataFile))
      .withRecordCount(50)
      .withFormat(FileFormat.PARQUET)
      .withPartition(partitionData)
      .build();

    IcebergOpCommitter committer = IcebergOperation.getCreateTableCommitter(Path.of(tableFolder.toPath().toString()),
      (new SchemaConverter()).fromIceberg(schema), Lists.newArrayList(columnName), new Configuration());
    committer.consumeData(Lists.newArrayList(d1));
    committer.commit();

    Table table = new HadoopTables(new Configuration()).load(tableFolder.getPath());
    for (FileScanTask fileScanTask : table.newScan().planFiles()) {
      StructLike structLike = fileScanTask.file().partition();
      if (expectedClass == ByteBuffer.class) {
        Assert.assertEquals(structLike.get(0, expectedClass).hashCode(), ByteBuffer.wrap((byte[])expectedValue).hashCode());
      } else {
        Assert.assertTrue(structLike.get(0, expectedClass).equals(expectedValue));
      }
    }

  } finally {
    tableFolder.delete();
  }
}
 
Example #9
Source File: TestRefresh.java    From dremio-oss with Apache License 2.0
private DataFile createDataFile(File dir, String fileName) throws Exception {
  File dataFile = new File(dir, fileName);
  URI resource = Resources.getResource(
    "iceberg/nation/data/00000-1-a9e8d979-a183-40c5-af3d-a338ab62be8b-00000.parquet").toURI();
  Files.copy(Paths.get(resource), dataFile.toPath());

  return DataFiles.builder(PartitionSpec.builderFor(schema).build())
    .withInputFile(org.apache.iceberg.Files.localInput(dataFile))
    .withRecordCount(25)
    .withFormat(FileFormat.PARQUET)
    .build();
}
 
Example #10
Source File: TestIcebergSerDe.java    From dremio-oss with Apache License 2.0
@Test
public void testDataFileSerDe() throws Exception {
  File dataFile = new File(folder.getRoot(), "a.parquet");
  dataFile.createNewFile();

  PartitionSpec partitionSpec = PartitionSpec
    .builderFor(schema)
    .identity("i")
    .identity("data")
    .build();

  IcebergPartitionData icebergPartitionData = new IcebergPartitionData(partitionSpec.partitionType());
  icebergPartitionData.set(0, Integer.valueOf(10));
  icebergPartitionData.set(1, "def");

  DataFile d1 = DataFiles.builder(partitionSpec)
    .withInputFile(Files.localInput(dataFile))
    .withRecordCount(50)
    .withFormat(FileFormat.PARQUET)
    .withPartition(icebergPartitionData)
    .build();

  long d1RecordCount = d1.recordCount();
  byte[] dataFileBytes = IcebergSerDe.serializeDataFile(d1);
  DataFile d2 = IcebergSerDe.deserializeDataFile(dataFileBytes);
  long d2RecordCount = d2.recordCount();
  Assert.assertEquals(d1RecordCount, d2RecordCount);
  Assert.assertEquals((Integer)(d2.partition().get(0, Integer.class)), Integer.valueOf(10));
  Assert.assertEquals((String)(d2.partition().get(1, String.class)), "def");
}
 
Example #11
Source File: TestIcebergTableDrop.java    From dremio-oss with Apache License 2.0
private DataFile createDataFile(File dir, String fileName) throws Exception {
  File dataFile = new File(dir, fileName);
  URI resource = Resources.getResource(
    "iceberg/nation/data/00000-1-a9e8d979-a183-40c5-af3d-a338ab62be8b-00000.parquet").toURI();
  Files.copy(Paths.get(resource), dataFile.toPath());

  return DataFiles.builder(PartitionSpec.builderFor(schema).build())
    .withInputFile(org.apache.iceberg.Files.localInput(dataFile))
    .withRecordCount(25)
    .withFormat(FileFormat.PARQUET)
    .build();
}
 
Example #12
Source File: SparkTableUtil.java    From iceberg with Apache License 2.0
private static List<DataFile> listAvroPartition(
    Map<String, String> partitionPath, String partitionUri, PartitionSpec spec, Configuration conf) {
  try {
    Path partition = new Path(partitionUri);
    FileSystem fs = partition.getFileSystem(conf);
    return Arrays.stream(fs.listStatus(partition, HIDDEN_PATH_FILTER))
        .filter(FileStatus::isFile)
        .map(stat -> {
          Metrics metrics = new Metrics(-1L, null, null, null);
          String partitionKey = spec.fields().stream()
              .map(PartitionField::name)
              .map(name -> String.format("%s=%s", name, partitionPath.get(name)))
              .collect(Collectors.joining("/"));

          return DataFiles.builder(spec)
              .withPath(stat.getPath().toString())
              .withFormat("avro")
              .withFileSizeInBytes(stat.getLen())
              .withMetrics(metrics)
              .withPartitionPath(partitionKey)
              .build();

        }).collect(Collectors.toList());
  } catch (IOException e) {
    throw SparkExceptionUtil.toUncheckedException(e, "Unable to list files in partition: %s", partitionUri);
  }
}
 
Example #13
Source File: HiveTableTest.java    From iceberg with Apache License 2.0
@Test
public void testDropWithoutPurgeLeavesTableData() throws IOException {
  Table table = catalog.loadTable(TABLE_IDENTIFIER);

  GenericRecordBuilder recordBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert(schema, "test"));
  List<GenericData.Record> records = Lists.newArrayList(
      recordBuilder.set("id", 1L).build(),
      recordBuilder.set("id", 2L).build(),
      recordBuilder.set("id", 3L).build()
  );

  String fileLocation = table.location().replace("file:", "") + "/data/file.avro";
  try (FileAppender<GenericData.Record> writer = Avro.write(Files.localOutput(fileLocation))
      .schema(schema)
      .named("test")
      .build()) {
    for (GenericData.Record rec : records) {
      writer.add(rec);
    }
  }

  DataFile file = DataFiles.builder(table.spec())
      .withRecordCount(3)
      .withPath(fileLocation)
      .withFileSizeInBytes(Files.localInput(fileLocation).getLength())
      .build();

  table.newAppend().appendFile(file).commit();

  String manifestListLocation = table.currentSnapshot().manifestListLocation().replace("file:", "");

  Assert.assertTrue("Drop should return true and drop the table",
      catalog.dropTable(TABLE_IDENTIFIER, false /* do not delete underlying files */));
  Assert.assertFalse("Table should not exist", catalog.tableExists(TABLE_IDENTIFIER));

  Assert.assertTrue("Table data files should exist",
      new File(fileLocation).exists());
  Assert.assertTrue("Table metadata files should exist",
      new File(manifestListLocation).exists());
}
 
Example #14
Source File: TestHiveTableConcurrency.java    From iceberg with Apache License 2.0
@Test
public synchronized void testConcurrentFastAppends() {
  Table icebergTable = catalog.loadTable(TABLE_IDENTIFIER);

  String fileName = UUID.randomUUID().toString();
  DataFile file = DataFiles.builder(icebergTable.spec())
      .withPath(FileFormat.PARQUET.addExtension(fileName))
      .withRecordCount(2)
      .withFileSizeInBytes(0)
      .build();

  ExecutorService executorService = MoreExecutors.getExitingExecutorService(
      (ThreadPoolExecutor) Executors.newFixedThreadPool(2));

  AtomicInteger barrier = new AtomicInteger(0);
  Tasks.range(2)
      .stopOnFailure().throwFailureWhenFinished()
      .executeWith(executorService)
      .run(index -> {
        for (int numCommittedFiles = 0; numCommittedFiles < 10; numCommittedFiles++) {
          while (barrier.get() < numCommittedFiles * 2) {
            try {
              Thread.sleep(10);
            } catch (InterruptedException e) {
              throw new RuntimeException(e);
            }
          }

          icebergTable.newFastAppend().appendFile(file).commit();
          barrier.incrementAndGet();
        }
      });

  icebergTable.refresh();
  Assert.assertEquals(20, icebergTable.currentSnapshot().allManifests().size());
}
 
Example #15
Source File: TestHiveTableConcurrency.java    From iceberg with Apache License 2.0
@Test
public synchronized void testConcurrentConnections() throws InterruptedException {
  Table icebergTable = catalog.loadTable(TABLE_IDENTIFIER);

  icebergTable.updateProperties()
      .set(COMMIT_NUM_RETRIES, "20")
      .set(COMMIT_MIN_RETRY_WAIT_MS, "25")
      .set(COMMIT_MAX_RETRY_WAIT_MS, "25")
      .commit();

  String fileName = UUID.randomUUID().toString();
  DataFile file = DataFiles.builder(icebergTable.spec())
      .withPath(FileFormat.PARQUET.addExtension(fileName))
      .withRecordCount(2)
      .withFileSizeInBytes(0)
      .build();

  ExecutorService executorService = MoreExecutors.getExitingExecutorService(
      (ThreadPoolExecutor) Executors.newFixedThreadPool(7));

  for (int i = 0; i < 7; i++) {
    executorService.submit(() -> icebergTable.newAppend().appendFile(file).commit());
  }

  executorService.shutdown();
  Assert.assertTrue("Timeout", executorService.awaitTermination(2, TimeUnit.MINUTES));
  Assert.assertEquals(7, Iterables.size(icebergTable.snapshots()));
}
 
Example #16
Source File: SparkTableUtil.java    From iceberg with Apache License 2.0
private static List<DataFile> listOrcPartition(
    Map<String, String> partitionPath, String partitionUri, PartitionSpec spec, Configuration conf) {
  try {
    Path partition = new Path(partitionUri);
    FileSystem fs = partition.getFileSystem(conf);

    return Arrays.stream(fs.listStatus(partition, HIDDEN_PATH_FILTER))
        .filter(FileStatus::isFile)
        .map(stat -> {
          Metrics metrics = OrcMetrics.fromInputFile(HadoopInputFile.fromPath(stat.getPath(), conf));
          String partitionKey = spec.fields().stream()
              .map(PartitionField::name)
              .map(name -> String.format("%s=%s", name, partitionPath.get(name)))
              .collect(Collectors.joining("/"));

          return DataFiles.builder(spec)
              .withPath(stat.getPath().toString())
              .withFormat("orc")
              .withFileSizeInBytes(stat.getLen())
              .withMetrics(metrics)
              .withPartitionPath(partitionKey)
              .build();

        }).collect(Collectors.toList());
  } catch (IOException e) {
    throw SparkExceptionUtil.toUncheckedException(e, "Unable to list files in partition: %s", partitionUri);
  }
}
 
Example #17
Source File: TestAvroScan.java    From iceberg with Apache License 2.0
@Override
protected void writeAndValidate(Schema schema) throws IOException {
  File parent = temp.newFolder("avro");
  File location = new File(parent, "test");
  File dataFolder = new File(location, "data");
  dataFolder.mkdirs();

  File avroFile = new File(dataFolder,
      FileFormat.AVRO.addExtension(UUID.randomUUID().toString()));

  HadoopTables tables = new HadoopTables(CONF);
  Table table = tables.create(schema, PartitionSpec.unpartitioned(), location.toString());

  // Important: use the table's schema for the rest of the test
  // When tables are created, the column ids are reassigned.
  Schema tableSchema = table.schema();

  List<Record> expected = RandomData.generateList(tableSchema, 100, 1L);

  try (FileAppender<Record> writer = Avro.write(localOutput(avroFile))
      .schema(tableSchema)
      .build()) {
    writer.addAll(expected);
  }

  DataFile file = DataFiles.builder(PartitionSpec.unpartitioned())
      .withRecordCount(100)
      .withFileSizeInBytes(avroFile.length())
      .withPath(avroFile.toString())
      .build();

  table.newAppend().appendFile(file).commit();

  Dataset<Row> df = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<Row> rows = df.collectAsList();
  Assert.assertEquals("Should contain 100 rows", 100, rows.size());

  for (int i = 0; i < expected.size(); i += 1) {
    TestHelpers.assertEqualsSafe(tableSchema.asStruct(), expected.get(i), rows.get(i));
  }
}
 
Example #18
Source File: TestPartitionValues.java    From iceberg with Apache License 2.0
@Test
public void testPartitionValueTypes() throws Exception {
  String[] columnNames = new String[] {
      "b", "i", "l", "f", "d", "date", "ts", "s", "bytes", "dec_9_0", "dec_11_2", "dec_38_10"
  };

  HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf());

  // create a table around the source data
  String sourceLocation = temp.newFolder("source_table").toString();
  Table source = tables.create(SUPPORTED_PRIMITIVES, sourceLocation);

  // write out an Avro data file with all of the data types for source data
  List<GenericData.Record> expected = RandomData.generateList(source.schema(), 2, 128735L);
  File avroData = temp.newFile("data.avro");
  Assert.assertTrue(avroData.delete());
  try (FileAppender<GenericData.Record> appender = Avro.write(Files.localOutput(avroData))
      .schema(source.schema())
      .build()) {
    appender.addAll(expected);
  }

  // add the Avro data file to the source table
  source.newAppend()
      .appendFile(DataFiles.fromInputFile(Files.localInput(avroData), 10))
      .commit();

  Dataset<Row> sourceDF = spark.read().format("iceberg").load(sourceLocation);

  for (String column : columnNames) {
    String desc = "partition_by_" + SUPPORTED_PRIMITIVES.findType(column).toString();

    File parent = temp.newFolder(desc);
    File location = new File(parent, "test");
    File dataFolder = new File(location, "data");
    Assert.assertTrue("mkdirs should succeed", dataFolder.mkdirs());

    PartitionSpec spec = PartitionSpec.builderFor(SUPPORTED_PRIMITIVES).identity(column).build();

    Table table = tables.create(SUPPORTED_PRIMITIVES, spec, location.toString());
    table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit();

    sourceDF.write()
        .format("iceberg")
        .mode("append")
        .save(location.toString());

    List<Row> actual = spark.read()
        .format("iceberg")
        .load(location.toString())
        .collectAsList();

    Assert.assertEquals("Number of rows should match", expected.size(), actual.size());

    for (int i = 0; i < expected.size(); i += 1) {
      TestHelpers.assertEqualsSafe(
          SUPPORTED_PRIMITIVES.asStruct(), expected.get(i), actual.get(i));
    }
  }
}
 
Example #19
Source File: TestPartitionValues.java    From iceberg with Apache License 2.0
@Test
public void testNestedPartitionValues() throws Exception {
  String[] columnNames = new String[] {
      "b", "i", "l", "f", "d", "date", "ts", "s", "bytes", "dec_9_0", "dec_11_2", "dec_38_10"
  };

  HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf());
  Schema nestedSchema = new Schema(optional(1, "nested", SUPPORTED_PRIMITIVES.asStruct()));

  // create a table around the source data
  String sourceLocation = temp.newFolder("source_table").toString();
  Table source = tables.create(nestedSchema, sourceLocation);

  // write out an Avro data file with all of the data types for source data
  List<GenericData.Record> expected = RandomData.generateList(source.schema(), 2, 128735L);
  File avroData = temp.newFile("data.avro");
  Assert.assertTrue(avroData.delete());
  try (FileAppender<GenericData.Record> appender = Avro.write(Files.localOutput(avroData))
      .schema(source.schema())
      .build()) {
    appender.addAll(expected);
  }

  // add the Avro data file to the source table
  source.newAppend()
      .appendFile(DataFiles.fromInputFile(Files.localInput(avroData), 10))
      .commit();

  Dataset<Row> sourceDF = spark.read().format("iceberg").load(sourceLocation);

  for (String column : columnNames) {
    String desc = "partition_by_" + SUPPORTED_PRIMITIVES.findType(column).toString();

    File parent = temp.newFolder(desc);
    File location = new File(parent, "test");
    File dataFolder = new File(location, "data");
    Assert.assertTrue("mkdirs should succeed", dataFolder.mkdirs());

    PartitionSpec spec = PartitionSpec.builderFor(nestedSchema).identity("nested." + column).build();

    Table table = tables.create(nestedSchema, spec, location.toString());
    table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit();

    sourceDF.write()
        .format("iceberg")
        .mode("append")
        .save(location.toString());

    List<Row> actual = spark.read()
        .format("iceberg")
        .load(location.toString())
        .collectAsList();

    Assert.assertEquals("Number of rows should match", expected.size(), actual.size());

    for (int i = 0; i < expected.size(); i += 1) {
      TestHelpers.assertEqualsSafe(
          nestedSchema.asStruct(), expected.get(i), actual.get(i));
    }
  }
}
 
Example #20
Source File: TestParquetScan.java    From iceberg with Apache License 2.0
@Override
protected void writeAndValidate(Schema schema) throws IOException {
  Assume.assumeTrue("Cannot handle non-string map keys in parquet-avro",
      null == TypeUtil.find(
          schema,
          type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get()));

  File parent = temp.newFolder("parquet");
  File location = new File(parent, "test");
  File dataFolder = new File(location, "data");
  dataFolder.mkdirs();

  File parquetFile = new File(dataFolder,
      FileFormat.PARQUET.addExtension(UUID.randomUUID().toString()));

  HadoopTables tables = new HadoopTables(CONF);
  Table table = tables.create(schema, PartitionSpec.unpartitioned(), location.toString());

  // Important: use the table's schema for the rest of the test
  // When tables are created, the column ids are reassigned.
  Schema tableSchema = table.schema();

  List<GenericData.Record> expected = RandomData.generateList(tableSchema, 100, 1L);

  try (FileAppender<GenericData.Record> writer = Parquet.write(localOutput(parquetFile))
      .schema(tableSchema)
      .build()) {
    writer.addAll(expected);
  }

  DataFile file = DataFiles.builder(PartitionSpec.unpartitioned())
      .withFileSizeInBytes(parquetFile.length())
      .withPath(parquetFile.toString())
      .withRecordCount(100)
      .build();

  table.newAppend().appendFile(file).commit();
  table.updateProperties().set(TableProperties.PARQUET_VECTORIZATION_ENABLED, String.valueOf(vectorized)).commit();

  Dataset<Row> df = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<Row> rows = df.collectAsList();
  Assert.assertEquals("Should contain 100 rows", 100, rows.size());

  for (int i = 0; i < expected.size(); i += 1) {
    TestHelpers.assertEqualsSafe(tableSchema.asStruct(), expected.get(i), rows.get(i));
  }
}
 
Example #21
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
@Before
public void writeUnpartitionedTable() throws IOException {
  this.parent = temp.newFolder("TestFilteredScan");
  this.unpartitioned = new File(parent, "unpartitioned");
  File dataFolder = new File(unpartitioned, "data");
  Assert.assertTrue("Mkdir should succeed", dataFolder.mkdirs());

  Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), unpartitioned.toString());
  Schema tableSchema = table.schema(); // use the table schema because ids are reassigned

  FileFormat fileFormat = FileFormat.valueOf(format.toUpperCase(Locale.ENGLISH));

  File testFile = new File(dataFolder, fileFormat.addExtension(UUID.randomUUID().toString()));

  // create records using the table's schema
  org.apache.avro.Schema avroSchema = AvroSchemaUtil.convert(tableSchema, "test");
  this.records = testRecords(avroSchema);

  switch (fileFormat) {
    case AVRO:
      try (FileAppender<Record> writer = Avro.write(localOutput(testFile))
          .schema(tableSchema)
          .build()) {
        writer.addAll(records);
      }
      break;

    case PARQUET:
      try (FileAppender<Record> writer = Parquet.write(localOutput(testFile))
          .schema(tableSchema)
          .build()) {
        writer.addAll(records);
      }
      break;
  }

  DataFile file = DataFiles.builder(PartitionSpec.unpartitioned())
      .withRecordCount(records.size())
      .withFileSizeInBytes(testFile.length())
      .withPath(testFile.toString())
      .build();

  table.newAppend().appendFile(file).commit();
}
 
Example #22
Source File: TestIcebergInputFormat.java    From iceberg with Apache License 2.0
private DataFile writeFile(
    Table table, StructLike partitionData, FileFormat fileFormat, List<Record> records) throws IOException {
  File file = temp.newFile();
  Assert.assertTrue(file.delete());
  FileAppender<Record> appender;
  switch (fileFormat) {
    case AVRO:
      appender = Avro.write(Files.localOutput(file))
          .schema(table.schema())
          .createWriterFunc(DataWriter::create)
          .named(fileFormat.name())
          .build();
      break;
    case PARQUET:
      appender = Parquet.write(Files.localOutput(file))
          .schema(table.schema())
          .createWriterFunc(GenericParquetWriter::buildWriter)
          .named(fileFormat.name())
          .build();
      break;
    case ORC:
      appender = ORC.write(Files.localOutput(file))
          .schema(table.schema())
          .createWriterFunc(GenericOrcWriter::buildWriter)
          .build();
      break;
    default:
      throw new UnsupportedOperationException("Cannot write format: " + fileFormat);
  }

  try {
    appender.addAll(records);
  } finally {
    appender.close();
  }

  DataFiles.Builder builder = DataFiles.builder(table.spec())
      .withPath(file.toString())
      .withFormat(fileFormat)
      .withFileSizeInBytes(file.length())
      .withMetrics(appender.metrics());
  if (partitionData != null) {
    builder.withPartition(partitionData);
  }
  return builder.build();
}
 
Example #23
Source File: TestCreateTable.java    From dremio-oss with Apache License 2.0
@Test
public void testDroppingOfMapTypeColumn() throws Exception {
  String table1 = "iceberg_map_test";
  try {
    File table1Folder = new File(getDfsTestTmpSchemaLocation(), table1);
    HadoopTables hadoopTables = new HadoopTables(new Configuration());

    Schema schema = new Schema(
      Types.NestedField.optional(1, "col1", Types.MapType.ofOptional(1, 2, Types.IntegerType.get(), Types.StringType.get())),
      Types.NestedField.optional(2, "col2", Types.IntegerType.get())
    );
    PartitionSpec spec = PartitionSpec
      .builderFor(schema)
      .build();
    Table table = hadoopTables.create(schema, spec, table1Folder.getPath());
    Transaction transaction = table.newTransaction();
    AppendFiles appendFiles = transaction.newAppend();
    final String testWorkingPath = TestTools.getWorkingPath() + "/src/test/resources/iceberg/mapTest";
    final String parquetFile = "iceberg_map_test.parquet";
    File dataFile = new File(testWorkingPath, parquetFile);
    appendFiles.appendFile(
      DataFiles.builder(spec)
        .withInputFile(Files.localInput(dataFile))
        .withRecordCount(1)
        .withFormat(FileFormat.PARQUET)
        .build()
    );
    appendFiles.commit();
    transaction.commitTransaction();

    testBuilder()
      .sqlQuery("select * from dfs_test.iceberg_map_test")
      .unOrdered()
      .baselineColumns("col2")
      .baselineValues(1)
      .build()
      .run();

    Thread.sleep(1001);
    String insertCommandSql = "insert into  dfs_test.iceberg_map_test select * from (values(2))";
    test(insertCommandSql);
    Thread.sleep(1001);

    testBuilder()
      .sqlQuery("select * from dfs_test.iceberg_map_test")
      .unOrdered()
      .baselineColumns("col2")
      .baselineValues(1)
      .baselineValues(2)
      .build()
      .run();
  } finally {
    FileUtils.deleteQuietly(new File(getDfsTestTmpSchemaLocation(), table1));
  }
}
 
Example #24
Source File: TestLocalScan.java    From iceberg with Apache License 2.0
private DataFile writeFile(String location, String filename, Schema schema, List<Record> records) throws IOException {
  Path path = new Path(location, filename);
  FileFormat fileFormat = FileFormat.fromFileName(filename);
  Preconditions.checkNotNull(fileFormat, "Cannot determine format for file: %s", filename);
  switch (fileFormat) {
    case AVRO:
      FileAppender<Record> avroAppender = Avro.write(fromPath(path, CONF))
          .schema(schema)
          .createWriterFunc(DataWriter::create)
          .named(fileFormat.name())
          .build();
      try {
        avroAppender.addAll(records);
      } finally {
        avroAppender.close();
      }

      return DataFiles.builder(PartitionSpec.unpartitioned())
          .withInputFile(HadoopInputFile.fromPath(path, CONF))
          .withMetrics(avroAppender.metrics())
          .build();

    case PARQUET:
      FileAppender<Record> parquetAppender = Parquet.write(fromPath(path, CONF))
          .schema(schema)
          .createWriterFunc(GenericParquetWriter::buildWriter)
          .build();
      try {
        parquetAppender.addAll(records);
      } finally {
        parquetAppender.close();
      }

      return DataFiles.builder(PartitionSpec.unpartitioned())
          .withInputFile(HadoopInputFile.fromPath(path, CONF))
          .withMetrics(parquetAppender.metrics())
          .build();

    case ORC:
      FileAppender<Record> orcAppender = ORC.write(fromPath(path, CONF))
          .schema(schema)
          .createWriterFunc(GenericOrcWriter::buildWriter)
          .build();
      try {
        orcAppender.addAll(records);
      } finally {
        orcAppender.close();
      }

      return DataFiles.builder(PartitionSpec.unpartitioned())
              .withInputFile(HadoopInputFile.fromPath(path, CONF))
              .withMetrics(orcAppender.metrics())
              .build();

    default:
      throw new UnsupportedOperationException("Cannot write format: " + fileFormat);
  }
}
 
Example #25
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
@Before
public void writeUnpartitionedTable() throws IOException {
  this.parent = temp.newFolder("TestFilteredScan");
  this.unpartitioned = new File(parent, "unpartitioned");
  File dataFolder = new File(unpartitioned, "data");
  Assert.assertTrue("Mkdir should succeed", dataFolder.mkdirs());

  Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), unpartitioned.toString());
  Schema tableSchema = table.schema(); // use the table schema because ids are reassigned

  FileFormat fileFormat = FileFormat.valueOf(format.toUpperCase(Locale.ENGLISH));

  File testFile = new File(dataFolder, fileFormat.addExtension(UUID.randomUUID().toString()));

  // create records using the table's schema
  this.records = testRecords(tableSchema);

  switch (fileFormat) {
    case AVRO:
      try (FileAppender<Record> writer = Avro.write(localOutput(testFile))
          .createWriterFunc(DataWriter::create)
          .schema(tableSchema)
          .build()) {
        writer.addAll(records);
      }
      break;

    case PARQUET:
      try (FileAppender<Record> writer = Parquet.write(localOutput(testFile))
          .createWriterFunc(GenericParquetWriter::buildWriter)
          .schema(tableSchema)
          .build()) {
        writer.addAll(records);
      }
      break;

    case ORC:
      try (FileAppender<Record> writer = ORC.write(localOutput(testFile))
          .createWriterFunc(GenericOrcWriter::buildWriter)
          .schema(tableSchema)
          .build()) {
        writer.addAll(records);
      }
      break;
  }

  DataFile file = DataFiles.builder(PartitionSpec.unpartitioned())
      .withRecordCount(records.size())
      .withFileSizeInBytes(testFile.length())
      .withPath(testFile.toString())
      .build();

  table.newAppend().appendFile(file).commit();
}
 
Example #26
Source File: TestNameMappingProjection.java    From iceberg with Apache License 2.0
@Test
public void testAvroReaderWithNameMapping() throws IOException {
  File avroFile = temp.newFile();
  org.apache.avro.Schema avroSchema = SchemaBuilder.record("TestRecord")
      .namespace("org.apache.iceberg.spark.data")
      .fields()
      .requiredInt("id")
      .requiredString("name")
      .endRecord();

  org.apache.avro.Schema avroSchemaWithoutIds = RemoveIds.removeIds(avroSchema);

  GenericRecord record1 = new GenericData.Record(avroSchemaWithoutIds);
  record1.put("id", 1);
  record1.put("name", "Bob");

  GenericRecord record2 = new GenericData.Record(avroSchemaWithoutIds);
  record2.put("id", 2);
  record2.put("name", "Alice");

  DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(avroSchemaWithoutIds);
  DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter);

  dataFileWriter.create(avroSchemaWithoutIds, avroFile);
  dataFileWriter.append(record1);
  dataFileWriter.append(record2);
  dataFileWriter.close();

  DataFile avroDataFile = DataFiles.builder(PartitionSpec.unpartitioned())
      .withFormat("avro")
      .withFileSizeInBytes(avroFile.length())
      .withPath(avroFile.getAbsolutePath())
      .withRecordCount(2)
      .build();

  Schema filteredSchema = new Schema(
      required(1, "name", Types.StringType.get())
  );
  NameMapping nameMapping = MappingUtil.create(filteredSchema);

  Schema tableSchema = new Schema(
      required(1, "name", Types.StringType.get()),
      optional(2, "id", Types.IntegerType.get())
  );

  Table table = catalog.createTable(
      org.apache.iceberg.catalog.TableIdentifier.of(DB_NAME, "avro_table"),
      tableSchema,
      PartitionSpec.unpartitioned());

  table.updateProperties()
      .set(DEFAULT_NAME_MAPPING, NameMappingParser.toJson(nameMapping))
      .commit();

  table.newFastAppend().appendFile(avroDataFile).commit();

  List<Row> actual = spark.read().format("iceberg")
      .load(DB_NAME + ".avro_table")
      .filter("name='Alice'")
      .collectAsList();

  Assert.assertEquals("Should project 1 record", 1, actual.size());
  Assert.assertEquals("Should equal to 'Alice'", "Alice", actual.get(0).getString(0));
  Assert.assertNull("should be null", actual.get(0).get(1));
}
 
Example #27
Source File: TestForwardCompatibility.java    From iceberg with Apache License 2.0
@Test
public void testSparkCanReadUnknownTransform() throws IOException {
  File parent = temp.newFolder("avro");
  File location = new File(parent, "test");
  File dataFolder = new File(location, "data");
  dataFolder.mkdirs();

  HadoopTables tables = new HadoopTables(CONF);
  Table table = tables.create(SCHEMA, UNKNOWN_SPEC, location.toString());

  // enable snapshot inheritance to avoid rewriting the manifest with an unknown transform
  table.updateProperties().set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true").commit();

  List<GenericData.Record> expected = RandomData.generateList(table.schema(), 100, 1L);

  File parquetFile = new File(dataFolder,
      FileFormat.PARQUET.addExtension(UUID.randomUUID().toString()));
  FileAppender<GenericData.Record> writer = Parquet.write(localOutput(parquetFile))
      .schema(table.schema())
      .build();
  try {
    writer.addAll(expected);
  } finally {
    writer.close();
  }

  DataFile file = DataFiles.builder(FAKE_SPEC)
      .withInputFile(localInput(parquetFile))
      .withMetrics(writer.metrics())
      .withPartitionPath("id_zero=0")
      .build();

  OutputFile manifestFile = localOutput(FileFormat.AVRO.addExtension(temp.newFile().toString()));
  ManifestWriter manifestWriter = ManifestFiles.write(FAKE_SPEC, manifestFile);
  try {
    manifestWriter.add(file);
  } finally {
    manifestWriter.close();
  }

  table.newFastAppend().appendManifest(manifestWriter.toManifestFile()).commit();

  Dataset<Row> df = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<Row> rows = df.collectAsList();
  Assert.assertEquals("Should contain 100 rows", 100, rows.size());

  for (int i = 0; i < expected.size(); i += 1) {
    TestHelpers.assertEqualsSafe(table.schema().asStruct(), expected.get(i), rows.get(i));
  }
}
 
Example #28
Source File: TestSparkReadProjection.java    From iceberg with Apache License 2.0
@Override
protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema,
                              Record record) throws IOException {
  File parent = temp.newFolder(desc);
  File location = new File(parent, "test");
  File dataFolder = new File(location, "data");
  Assert.assertTrue("mkdirs should succeed", dataFolder.mkdirs());

  File testFile = new File(dataFolder, format.addExtension(UUID.randomUUID().toString()));

  Table table = TestTables.create(location, desc, writeSchema, PartitionSpec.unpartitioned());
  try {
    // Important: use the table's schema for the rest of the test
    // When tables are created, the column ids are reassigned.
    Schema tableSchema = table.schema();

    switch (format) {
      case AVRO:
        try (FileAppender<Record> writer = Avro.write(localOutput(testFile))
            .createWriterFunc(DataWriter::create)
            .schema(tableSchema)
            .build()) {
          writer.add(record);
        }
        break;

      case PARQUET:
        try (FileAppender<Record> writer = Parquet.write(localOutput(testFile))
            .createWriterFunc(GenericParquetWriter::buildWriter)
            .schema(tableSchema)
            .build()) {
          writer.add(record);
        }
        break;

      case ORC:
        try (FileAppender<org.apache.iceberg.data.Record> writer = ORC.write(localOutput(testFile))
            .createWriterFunc(GenericOrcWriter::buildWriter)
            .schema(tableSchema)
            .build()) {
          writer.add(record);
        }
        break;
    }

    DataFile file = DataFiles.builder(PartitionSpec.unpartitioned())
        .withRecordCount(100)
        .withFileSizeInBytes(testFile.length())
        .withPath(testFile.toString())
        .build();

    table.newAppend().appendFile(file).commit();

    table.updateProperties().set(TableProperties.PARQUET_VECTORIZATION_ENABLED, String.valueOf(vectorized)).commit();

    // rewrite the read schema for the table's reassigned ids
    Map<Integer, Integer> idMapping = Maps.newHashMap();
    for (int id : allIds(writeSchema)) {
      // translate each id to the original schema's column name, then to the new schema's id
      String originalName = writeSchema.findColumnName(id);
      idMapping.put(id, tableSchema.findField(originalName).fieldId());
    }
    Schema expectedSchema = reassignIds(readSchema, idMapping);

    // Set the schema to the expected schema directly to simulate the table schema evolving
    TestTables.replaceMetadata(desc,
        TestTables.readMetadata(desc).updateSchema(expectedSchema, 100));

    Dataset<Row> df = spark.read()
        .format("org.apache.iceberg.spark.source.TestIcebergSource")
        .option("iceberg.table.name", desc)
        .load();

    return SparkValueConverter.convert(readSchema, df.collectAsList().get(0));

  } finally {
    TestTables.clearTables();
  }
}