Java Code Examples for org.apache.iceberg.DataFiles

The following examples show how to use org.apache.iceberg.DataFiles. They are extracted from open source projects; the source project, source file, and license are noted above each example.
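All of the examples below follow the same basic pattern: describe an existing data file with DataFiles.builder against a PartitionSpec, then append the resulting DataFile to a table. The following is a minimal sketch of that pattern, not taken from any of the projects below; it assumes the usual org.apache.iceberg imports, an existing Table, and a Parquet file that was already written. The path, file size, and record count are illustrative placeholders.
private DataFile appendExistingParquetFile(Table table) {
  // Describe a Parquet file that was written outside of Iceberg.
  // Path, size, and record count below are placeholders.
  DataFile dataFile = DataFiles.builder(table.spec())
      .withPath("/path/to/data-a.parquet")
      .withFormat(FileFormat.PARQUET)
      .withFileSizeInBytes(1024L)
      .withRecordCount(100L)
      .build();

  // Register the file with the table in a single append commit.
  table.newAppend()
      .appendFile(dataFile)
      .commit();

  return dataFile;
}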
Example 1
Source Project: iceberg    Source File: HiveCreateReplaceTableTest.java    License: Apache License 2.0
@Test
public void testCreateTableTxnAndAppend() {
  Assert.assertFalse("Table should not exist", catalog.tableExists(TABLE_IDENTIFIER));

  Transaction txn = catalog.newCreateTableTransaction(
      TABLE_IDENTIFIER, SCHEMA, SPEC, tableLocation, Maps.newHashMap());

  AppendFiles append = txn.newAppend();
  DataFile dataFile = DataFiles.builder(SPEC)
      .withPath("/path/to/data-a.parquet")
      .withFileSizeInBytes(0)
      .withRecordCount(1)
      .build();
  append.appendFile(dataFile);
  append.commit();
  txn.commitTransaction();

  Table table = catalog.loadTable(TABLE_IDENTIFIER);
  Snapshot snapshot = table.currentSnapshot();
  Assert.assertTrue("Table should have one manifest file", snapshot.allManifests().size() == 1);
}
 
Example 2
Source Project: iceberg    Source File: HiveCreateReplaceTableTest.java    License: Apache License 2.0
@Test
public void testCreateTableTxnWithGlobalTableLocation() {
  Assert.assertFalse("Table should not exist", catalog.tableExists(TABLE_IDENTIFIER));

  Transaction txn = catalog.newCreateTableTransaction(
      TABLE_IDENTIFIER, SCHEMA, SPEC, "file:///" + tableLocation, Maps.newHashMap());
  txn.commitTransaction();

  Table table = catalog.loadTable(TABLE_IDENTIFIER);

  DataFile dataFile = DataFiles.builder(SPEC)
      .withPath("/path/to/data-a.parquet")
      .withFileSizeInBytes(0)
      .withRecordCount(1)
      .build();

  table.newAppend()
      .appendFile(dataFile)
      .commit();

  Assert.assertEquals("Write should succeed", 1, Iterables.size(table.snapshots()));
}
 
Example 3
Source Project: iceberg    Source File: BaseWriter.java    License: Apache License 2.0
protected void closeCurrent() throws IOException {
  if (currentAppender != null) {
    currentAppender.close();
    // metrics are only valid after the appender is closed
    Metrics metrics = currentAppender.metrics();
    long fileSizeInBytes = currentAppender.length();
    List<Long> splitOffsets = currentAppender.splitOffsets();
    this.currentAppender = null;

    if (metrics.recordCount() == 0L) {
      io.deleteFile(currentFile.encryptingOutputFile());
    } else {
      DataFile dataFile = DataFiles.builder(spec)
          .withEncryptionKeyMetadata(currentFile.keyMetadata())
          .withPath(currentFile.encryptingOutputFile().location())
          .withFileSizeInBytes(fileSizeInBytes)
          .withPartition(spec.fields().size() == 0 ? null : currentKey) // set null if unpartitioned
          .withMetrics(metrics)
          .withSplitOffsets(splitOffsets)
          .build();
      completedFiles.add(dataFile);
    }

    this.currentFile = null;
  }
}
 
Example 4
Source Project: dremio-oss    Source File: ParquetRecordWriter.java    License: Apache License 2.0
private byte[] getIcebergMetaData() throws IOException {
  if (!this.isIcebergWriter) {
    return null;
  }

  final long fileSize = parquetFileWriter.getPos();
  DataFiles.Builder dataFileBuilder =
    DataFiles.builder(IcebergCatalog.getIcebergPartitionSpec(this.batchSchema, this.partitionColumns))
      .withPath(path.toString())
      .withFileSizeInBytes(fileSize)
      .withRecordCount(recordCount)
      .withFormat(FileFormat.PARQUET);

  // add partition info
  if (partitionColumns != null) {
    dataFileBuilder = dataFileBuilder.withPartition(partition.getIcebergPartitionData());
  }

  // add column level metrics
  Metrics metrics = footerMetricsToIcebergMetrics(parquetFileWriter.getFooter(), batchSchema);
  dataFileBuilder = dataFileBuilder.withMetrics(metrics);
  return IcebergSerDe.serializeDataFile(dataFileBuilder.build());
}
 
Example 5
Source Project: presto    Source File: IcebergMetadata.java    License: Apache License 2.0
@Override
public Optional<ConnectorOutputMetadata> finishInsert(ConnectorSession session, ConnectorInsertTableHandle insertHandle, Collection<Slice> fragments, Collection<ComputedStatistics> computedStatistics)
{
    IcebergWritableTableHandle table = (IcebergWritableTableHandle) insertHandle;
    org.apache.iceberg.Table icebergTable = transaction.table();

    List<CommitTaskData> commitTasks = fragments.stream()
            .map(slice -> commitTaskCodec.fromJson(slice.getBytes()))
            .collect(toImmutableList());

    Type[] partitionColumnTypes = icebergTable.spec().fields().stream()
            .map(field -> field.transform().getResultType(
                    icebergTable.schema().findType(field.sourceId())))
            .toArray(Type[]::new);

    AppendFiles appendFiles = transaction.newFastAppend();
    for (CommitTaskData task : commitTasks) {
        HdfsContext context = new HdfsContext(session, table.getSchemaName(), table.getTableName());
        Configuration configuration = hdfsEnvironment.getConfiguration(context, new Path(task.getPath()));

        DataFiles.Builder builder = DataFiles.builder(icebergTable.spec())
                .withInputFile(HadoopInputFile.fromLocation(task.getPath(), configuration))
                .withFormat(table.getFileFormat())
                .withMetrics(task.getMetrics().metrics());

        if (!icebergTable.spec().fields().isEmpty()) {
            String partitionDataJson = task.getPartitionDataJson()
                    .orElseThrow(() -> new VerifyException("No partition data for partitioned table"));
            builder.withPartition(PartitionData.fromJson(partitionDataJson, partitionColumnTypes));
        }

        appendFiles.appendFile(builder.build());
    }

    appendFiles.commit();
    transaction.commitTransaction();

    return Optional.of(new HiveWrittenPartitions(commitTasks.stream()
            .map(CommitTaskData::getPath)
            .collect(toImmutableList())));
}
 
Example 6
Source Project: iceberg    Source File: HiveTableTest.java    License: Apache License 2.0
@Test
public void testDropWithoutPurgeLeavesTableData() throws IOException {
  Table table = catalog.loadTable(TABLE_IDENTIFIER);

  GenericRecordBuilder recordBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert(schema, "test"));
  List<GenericData.Record> records = Lists.newArrayList(
      recordBuilder.set("id", 1L).build(),
      recordBuilder.set("id", 2L).build(),
      recordBuilder.set("id", 3L).build()
  );

  String fileLocation = table.location().replace("file:", "") + "/data/file.avro";
  try (FileAppender<GenericData.Record> writer = Avro.write(Files.localOutput(fileLocation))
      .schema(schema)
      .named("test")
      .build()) {
    for (GenericData.Record rec : records) {
      writer.add(rec);
    }
  }

  DataFile file = DataFiles.builder(table.spec())
      .withRecordCount(3)
      .withPath(fileLocation)
      .withFileSizeInBytes(Files.localInput(fileLocation).getLength())
      .build();

  table.newAppend().appendFile(file).commit();

  String manifestListLocation = table.currentSnapshot().manifestListLocation().replace("file:", "");

  Assert.assertTrue("Drop should return true and drop the table",
      catalog.dropTable(TABLE_IDENTIFIER, false /* do not delete underlying files */));
  Assert.assertFalse("Table should not exist", catalog.tableExists(TABLE_IDENTIFIER));

  Assert.assertTrue("Table data files should exist",
      new File(fileLocation).exists());
  Assert.assertTrue("Table metadata files should exist",
      new File(manifestListLocation).exists());
}
 
Example 7
Source Project: iceberg    Source File: TestHiveTableConcurrency.java    License: Apache License 2.0
@Test
public synchronized void testConcurrentFastAppends() {
  Table icebergTable = catalog.loadTable(TABLE_IDENTIFIER);

  String fileName = UUID.randomUUID().toString();
  DataFile file = DataFiles.builder(icebergTable.spec())
      .withPath(FileFormat.PARQUET.addExtension(fileName))
      .withRecordCount(2)
      .withFileSizeInBytes(0)
      .build();

  ExecutorService executorService = MoreExecutors.getExitingExecutorService(
      (ThreadPoolExecutor) Executors.newFixedThreadPool(2));

  AtomicInteger barrier = new AtomicInteger(0);
  Tasks.range(2)
      .stopOnFailure().throwFailureWhenFinished()
      .executeWith(executorService)
      .run(index -> {
        for (int numCommittedFiles = 0; numCommittedFiles < 10; numCommittedFiles++) {
          while (barrier.get() < numCommittedFiles * 2) {
            try {
              Thread.sleep(10);
            } catch (InterruptedException e) {
              throw new RuntimeException(e);
            }
          }

          icebergTable.newFastAppend().appendFile(file).commit();
          barrier.incrementAndGet();
        }
      });

  icebergTable.refresh();
  Assert.assertEquals(20, icebergTable.currentSnapshot().allManifests().size());
}
 
Example 8
Source Project: iceberg    Source File: TestHiveTableConcurrency.java    License: Apache License 2.0
@Test
public synchronized void testConcurrentConnections() throws InterruptedException {
  Table icebergTable = catalog.loadTable(TABLE_IDENTIFIER);

  icebergTable.updateProperties()
      .set(COMMIT_NUM_RETRIES, "20")
      .set(COMMIT_MIN_RETRY_WAIT_MS, "25")
      .set(COMMIT_MAX_RETRY_WAIT_MS, "25")
      .commit();

  String fileName = UUID.randomUUID().toString();
  DataFile file = DataFiles.builder(icebergTable.spec())
      .withPath(FileFormat.PARQUET.addExtension(fileName))
      .withRecordCount(2)
      .withFileSizeInBytes(0)
      .build();

  ExecutorService executorService = MoreExecutors.getExitingExecutorService(
      (ThreadPoolExecutor) Executors.newFixedThreadPool(7));

  for (int i = 0; i < 7; i++) {
    executorService.submit(() -> icebergTable.newAppend().appendFile(file).commit());
  }

  executorService.shutdown();
  Assert.assertTrue("Timeout", executorService.awaitTermination(2, TimeUnit.MINUTES));
  Assert.assertEquals(7, Iterables.size(icebergTable.snapshots()));
}
 
Example 9
Source Project: iceberg    Source File: SparkTableUtil.java    License: Apache License 2.0
private static List<DataFile> listAvroPartition(
    Map<String, String> partitionPath, String partitionUri, PartitionSpec spec, Configuration conf) {
  try {
    Path partition = new Path(partitionUri);
    FileSystem fs = partition.getFileSystem(conf);
    return Arrays.stream(fs.listStatus(partition, HIDDEN_PATH_FILTER))
        .filter(FileStatus::isFile)
        .map(stat -> {
          Metrics metrics = new Metrics(-1L, null, null, null);
          String partitionKey = spec.fields().stream()
              .map(PartitionField::name)
              .map(name -> String.format("%s=%s", name, partitionPath.get(name)))
              .collect(Collectors.joining("/"));

          return DataFiles.builder(spec)
              .withPath(stat.getPath().toString())
              .withFormat("avro")
              .withFileSizeInBytes(stat.getLen())
              .withMetrics(metrics)
              .withPartitionPath(partitionKey)
              .build();

        }).collect(Collectors.toList());
  } catch (IOException e) {
    throw SparkExceptionUtil.toUncheckedException(e, "Unable to list files in partition: %s", partitionUri);
  }
}
 
Example 10
Source Project: iceberg    Source File: SparkTableUtil.java    License: Apache License 2.0
private static List<DataFile> listOrcPartition(
    Map<String, String> partitionPath, String partitionUri, PartitionSpec spec, Configuration conf) {
  try {
    Path partition = new Path(partitionUri);
    FileSystem fs = partition.getFileSystem(conf);

    return Arrays.stream(fs.listStatus(partition, HIDDEN_PATH_FILTER))
        .filter(FileStatus::isFile)
        .map(stat -> {
          Metrics metrics = OrcMetrics.fromInputFile(HadoopInputFile.fromPath(stat.getPath(), conf));
          String partitionKey = spec.fields().stream()
              .map(PartitionField::name)
              .map(name -> String.format("%s=%s", name, partitionPath.get(name)))
              .collect(Collectors.joining("/"));

          return DataFiles.builder(spec)
              .withPath(stat.getPath().toString())
              .withFormat("orc")
              .withFileSizeInBytes(stat.getLen())
              .withMetrics(metrics)
              .withPartitionPath(partitionKey)
              .build();

        }).collect(Collectors.toList());
  } catch (IOException e) {
    throw SparkExceptionUtil.toUncheckedException(e, "Unable to list files in partition: %s", partitionUri);
  }
}
 
Example 11
Source Project: dremio-oss    Source File: TestIcebergTableDrop.java    License: Apache License 2.0
private DataFile createDataFile(File dir, String fileName) throws Exception {
  File dataFile = new File(dir, fileName);
  URI resource = Resources.getResource(
    "iceberg/nation/data/00000-1-a9e8d979-a183-40c5-af3d-a338ab62be8b-00000.parquet").toURI();
  Files.copy(Paths.get(resource), dataFile.toPath());

  return DataFiles.builder(PartitionSpec.builderFor(schema).build())
    .withInputFile(org.apache.iceberg.Files.localInput(dataFile))
    .withRecordCount(25)
    .withFormat(FileFormat.PARQUET)
    .build();
}
 
Example 12
Source Project: dremio-oss    Source File: TestIcebergSerDe.java    License: Apache License 2.0
@Test
public void testDataFileSerDe() throws Exception {
  File dataFile = new File(folder.getRoot(), "a.parquet");
  dataFile.createNewFile();

  PartitionSpec partitionSpec = PartitionSpec
    .builderFor(schema)
    .identity("i")
    .identity("data")
    .build();

  IcebergPartitionData icebergPartitionData = new IcebergPartitionData(partitionSpec.partitionType());
  icebergPartitionData.set(0, Integer.valueOf(10));
  icebergPartitionData.set(1, "def");

  DataFile d1 = DataFiles.builder(partitionSpec)
    .withInputFile(Files.localInput(dataFile))
    .withRecordCount(50)
    .withFormat(FileFormat.PARQUET)
    .withPartition(icebergPartitionData)
    .build();

  long d1RecordCount = d1.recordCount();
  byte[] dataFileBytes = IcebergSerDe.serializeDataFile(d1);
  DataFile d2 = IcebergSerDe.deserializeDataFile(dataFileBytes);
  long d2RecordCount = d2.recordCount();
  Assert.assertEquals(d1RecordCount, d2RecordCount);
  Assert.assertEquals((Integer)(d2.partition().get(0, Integer.class)), Integer.valueOf(10));
  Assert.assertEquals((String)(d2.partition().get(1, String.class)), "def");
}
 
Example 13
Source Project: dremio-oss    Source File: TestRefresh.java    License: Apache License 2.0
private DataFile createDataFile(File dir, String fileName) throws Exception {
  File dataFile = new File(dir, fileName);
  URI resource = Resources.getResource(
    "iceberg/nation/data/00000-1-a9e8d979-a183-40c5-af3d-a338ab62be8b-00000.parquet").toURI();
  Files.copy(Paths.get(resource), dataFile.toPath());

  return DataFiles.builder(PartitionSpec.builderFor(schema).build())
    .withInputFile(org.apache.iceberg.Files.localInput(dataFile))
    .withRecordCount(25)
    .withFormat(FileFormat.PARQUET)
    .build();
}
 
Example 14
Source Project: dremio-oss    Source File: TestIcebergPartitionData.java    License: Apache License 2.0
private void verifyPartitionValue(PartitionSpec partitionSpec, IcebergPartitionData partitionData,
                                  String columnName, Class expectedClass, Object expectedValue) throws Exception {
  File tableFolder = new File(folder.getRoot(), "icebergPartitionTest");
  try {
    tableFolder.mkdir();
    File dataFile = new File(folder.getRoot(), "a.parquet");

    dataFile.createNewFile();

    DataFile d1 = DataFiles.builder(partitionSpec)
      .withInputFile(Files.localInput(dataFile))
      .withRecordCount(50)
      .withFormat(FileFormat.PARQUET)
      .withPartition(partitionData)
      .build();

    IcebergOpCommitter committer = IcebergOperation.getCreateTableCommitter(Path.of(tableFolder.toPath().toString()),
      (new SchemaConverter()).fromIceberg(schema), Lists.newArrayList(columnName), new Configuration());
    committer.consumeData(Lists.newArrayList(d1));
    committer.commit();

    Table table = new HadoopTables(new Configuration()).load(tableFolder.getPath());
    for (FileScanTask fileScanTask : table.newScan().planFiles()) {
      StructLike structLike = fileScanTask.file().partition();
      if (expectedClass == ByteBuffer.class) {
        Assert.assertEquals(structLike.get(0, expectedClass).hashCode(), ByteBuffer.wrap((byte[])expectedValue).hashCode());
      } else {
        Assert.assertTrue(structLike.get(0, expectedClass).equals(expectedValue));
      }
    }

  } finally {
    tableFolder.delete();
  }
}
 
Example 15
Source Project: dremio-oss    Source File: TestIcebergPartitions.java    License: Apache License 2.0
private DataFile createDataFile(File dir, String fileName, int idValue, String nameValue,
  int recordCount) throws IOException {
  File dataFile = new File(dir, fileName);
  dataFile.createNewFile();

  return DataFiles.builder(spec)
    .withInputFile(Files.localInput(dataFile))
    .withPartitionPath(ID + "=" + idValue + "/" + NAME + "=" + nameValue)
    .withRecordCount(recordCount)
    .withFormat(FileFormat.PARQUET)
    .build();
}
 
Example 16
Source Project: dremio-oss    Source File: TestIcebergManifests.java    License: Apache License 2.0
List<DataFile> getDataFiles(PartitionSpec partitionSpec, int partitionValueSize, int dataFilesCount, String columnName) {
  List<DataFile> dataFiles = new ArrayList<>();
  for (int i = 0; i < dataFilesCount; ++i) {
    String partitionValue = RandomStringUtils.randomAlphanumeric(partitionValueSize);
    String datafileName = RandomStringUtils.randomAlphanumeric(64);
    dataFiles.add(DataFiles.builder(partitionSpec)
      .withInputFile(Files.localInput(datafileName + ".parquet"))
      .withRecordCount(50)
      .withFormat(FileFormat.PARQUET)
      .withPartitionPath(columnName + "=" + partitionValue)
      .build());
  }
  return dataFiles;
}
 
Example 17
Source Project: iceberg    Source File: TestNameMappingProjection.java    License: Apache License 2.0
@Test
public void testAvroReaderWithNameMapping() throws IOException {
  File avroFile = temp.newFile();
  org.apache.avro.Schema avroSchema = SchemaBuilder.record("TestRecord")
      .namespace("org.apache.iceberg.spark.data")
      .fields()
      .requiredInt("id")
      .requiredString("name")
      .endRecord();

  org.apache.avro.Schema avroSchemaWithoutIds = RemoveIds.removeIds(avroSchema);

  GenericRecord record1 = new GenericData.Record(avroSchemaWithoutIds);
  record1.put("id", 1);
  record1.put("name", "Bob");

  GenericRecord record2 = new GenericData.Record(avroSchemaWithoutIds);
  record2.put("id", 2);
  record2.put("name", "Alice");

  DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(avroSchemaWithoutIds);
  DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter);

  dataFileWriter.create(avroSchemaWithoutIds, avroFile);
  dataFileWriter.append(record1);
  dataFileWriter.append(record2);
  dataFileWriter.close();

  DataFile avroDataFile = DataFiles.builder(PartitionSpec.unpartitioned())
      .withFormat("avro")
      .withFileSizeInBytes(avroFile.length())
      .withPath(avroFile.getAbsolutePath())
      .withRecordCount(2)
      .build();

  Schema filteredSchema = new Schema(
      required(1, "name", Types.StringType.get())
  );
  NameMapping nameMapping = MappingUtil.create(filteredSchema);

  Schema tableSchema = new Schema(
      required(1, "name", Types.StringType.get()),
      optional(2, "id", Types.IntegerType.get())
  );

  Table table = catalog.createTable(
      org.apache.iceberg.catalog.TableIdentifier.of(DB_NAME, "avro_table"),
      tableSchema,
      PartitionSpec.unpartitioned());

  table.updateProperties()
      .set(DEFAULT_NAME_MAPPING, NameMappingParser.toJson(nameMapping))
      .commit();

  table.newFastAppend().appendFile(avroDataFile).commit();

  List<Row> actual = spark.read().format("iceberg")
      .load(DB_NAME + ".avro_table")
      .filter("name='Alice'")
      .collectAsList();

  Assert.assertEquals("Should project 1 record", 1, actual.size());
  Assert.assertEquals("Should equal to 'Alice'", "Alice", actual.get(0).getString(0));
  Assert.assertNull("should be null", actual.get(0).get(1));
}
 
Example 18
Source Project: iceberg    Source File: TestFilteredScan.java    License: Apache License 2.0
@Before
public void writeUnpartitionedTable() throws IOException {
  this.parent = temp.newFolder("TestFilteredScan");
  this.unpartitioned = new File(parent, "unpartitioned");
  File dataFolder = new File(unpartitioned, "data");
  Assert.assertTrue("Mkdir should succeed", dataFolder.mkdirs());

  Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), unpartitioned.toString());
  Schema tableSchema = table.schema(); // use the table schema because ids are reassigned

  FileFormat fileFormat = FileFormat.valueOf(format.toUpperCase(Locale.ENGLISH));

  File testFile = new File(dataFolder, fileFormat.addExtension(UUID.randomUUID().toString()));

  // create records using the table's schema
  this.records = testRecords(tableSchema);

  switch (fileFormat) {
    case AVRO:
      try (FileAppender<Record> writer = Avro.write(localOutput(testFile))
          .createWriterFunc(DataWriter::create)
          .schema(tableSchema)
          .build()) {
        writer.addAll(records);
      }
      break;

    case PARQUET:
      try (FileAppender<Record> writer = Parquet.write(localOutput(testFile))
          .createWriterFunc(GenericParquetWriter::buildWriter)
          .schema(tableSchema)
          .build()) {
        writer.addAll(records);
      }
      break;

    case ORC:
      try (FileAppender<Record> writer = ORC.write(localOutput(testFile))
          .createWriterFunc(GenericOrcWriter::buildWriter)
          .schema(tableSchema)
          .build()) {
        writer.addAll(records);
      }
      break;
  }

  DataFile file = DataFiles.builder(PartitionSpec.unpartitioned())
      .withRecordCount(records.size())
      .withFileSizeInBytes(testFile.length())
      .withPath(testFile.toString())
      .build();

  table.newAppend().appendFile(file).commit();
}
 
Example 19
Source Project: iceberg    Source File: TestLocalScan.java    License: Apache License 2.0
private DataFile writeFile(String location, String filename, Schema schema, List<Record> records) throws IOException {
  Path path = new Path(location, filename);
  FileFormat fileFormat = FileFormat.fromFileName(filename);
  Preconditions.checkNotNull(fileFormat, "Cannot determine format for file: %s", filename);
  switch (fileFormat) {
    case AVRO:
      FileAppender<Record> avroAppender = Avro.write(fromPath(path, CONF))
          .schema(schema)
          .createWriterFunc(DataWriter::create)
          .named(fileFormat.name())
          .build();
      try {
        avroAppender.addAll(records);
      } finally {
        avroAppender.close();
      }

      return DataFiles.builder(PartitionSpec.unpartitioned())
          .withInputFile(HadoopInputFile.fromPath(path, CONF))
          .withMetrics(avroAppender.metrics())
          .build();

    case PARQUET:
      FileAppender<Record> parquetAppender = Parquet.write(fromPath(path, CONF))
          .schema(schema)
          .createWriterFunc(GenericParquetWriter::buildWriter)
          .build();
      try {
        parquetAppender.addAll(records);
      } finally {
        parquetAppender.close();
      }

      return DataFiles.builder(PartitionSpec.unpartitioned())
          .withInputFile(HadoopInputFile.fromPath(path, CONF))
          .withMetrics(parquetAppender.metrics())
          .build();

    case ORC:
      FileAppender<Record> orcAppender = ORC.write(fromPath(path, CONF))
          .schema(schema)
          .createWriterFunc(GenericOrcWriter::buildWriter)
          .build();
      try {
        orcAppender.addAll(records);
      } finally {
        orcAppender.close();
      }

      return DataFiles.builder(PartitionSpec.unpartitioned())
          .withInputFile(HadoopInputFile.fromPath(path, CONF))
          .withMetrics(orcAppender.metrics())
          .build();

    default:
      throw new UnsupportedOperationException("Cannot write format: " + fileFormat);
  }
}
 
Example 20
Source Project: iceberg    Source File: TestIcebergInputFormat.java    License: Apache License 2.0
private DataFile writeFile(
    Table table, StructLike partitionData, FileFormat fileFormat, List<Record> records) throws IOException {
  File file = temp.newFile();
  Assert.assertTrue(file.delete());
  FileAppender<Record> appender;
  switch (fileFormat) {
    case AVRO:
      appender = Avro.write(Files.localOutput(file))
          .schema(table.schema())
          .createWriterFunc(DataWriter::create)
          .named(fileFormat.name())
          .build();
      break;
    case PARQUET:
      appender = Parquet.write(Files.localOutput(file))
          .schema(table.schema())
          .createWriterFunc(GenericParquetWriter::buildWriter)
          .named(fileFormat.name())
          .build();
      break;
    case ORC:
      appender = ORC.write(Files.localOutput(file))
          .schema(table.schema())
          .createWriterFunc(GenericOrcWriter::buildWriter)
          .build();
      break;
    default:
      throw new UnsupportedOperationException("Cannot write format: " + fileFormat);
  }

  try {
    appender.addAll(records);
  } finally {
    appender.close();
  }

  DataFiles.Builder builder = DataFiles.builder(table.spec())
      .withPath(file.toString())
      .withFormat(fileFormat)
      .withFileSizeInBytes(file.length())
      .withMetrics(appender.metrics());
  if (partitionData != null) {
    builder.withPartition(partitionData);
  }
  return builder.build();
}
 
Example 21
Source Project: iceberg    Source File: TestSparkReadProjection.java    License: Apache License 2.0
@Override
protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema,
                              Record record) throws IOException {
  File parent = temp.newFolder(desc);
  File location = new File(parent, "test");
  File dataFolder = new File(location, "data");
  Assert.assertTrue("mkdirs should succeed", dataFolder.mkdirs());

  File testFile = new File(dataFolder, format.addExtension(UUID.randomUUID().toString()));

  Table table = TestTables.create(location, desc, writeSchema, PartitionSpec.unpartitioned());
  try {
    // Important: use the table's schema for the rest of the test
    // When tables are created, the column ids are reassigned.
    Schema tableSchema = table.schema();

    switch (format) {
      case AVRO:
        try (FileAppender<Record> writer = Avro.write(localOutput(testFile))
            .createWriterFunc(DataWriter::create)
            .schema(tableSchema)
            .build()) {
          writer.add(record);
        }
        break;

      case PARQUET:
        try (FileAppender<Record> writer = Parquet.write(localOutput(testFile))
            .createWriterFunc(GenericParquetWriter::buildWriter)
            .schema(tableSchema)
            .build()) {
          writer.add(record);
        }
        break;

      case ORC:
        try (FileAppender<org.apache.iceberg.data.Record> writer = ORC.write(localOutput(testFile))
            .createWriterFunc(GenericOrcWriter::buildWriter)
            .schema(tableSchema)
            .build()) {
          writer.add(record);
        }
        break;
    }

    DataFile file = DataFiles.builder(PartitionSpec.unpartitioned())
        .withRecordCount(100)
        .withFileSizeInBytes(testFile.length())
        .withPath(testFile.toString())
        .build();

    table.newAppend().appendFile(file).commit();

    table.updateProperties().set(TableProperties.PARQUET_VECTORIZATION_ENABLED, String.valueOf(vectorized)).commit();

    // rewrite the read schema for the table's reassigned ids
    Map<Integer, Integer> idMapping = Maps.newHashMap();
    for (int id : allIds(writeSchema)) {
      // translate each id to the original schema's column name, then to the new schema's id
      String originalName = writeSchema.findColumnName(id);
      idMapping.put(id, tableSchema.findField(originalName).fieldId());
    }
    Schema expectedSchema = reassignIds(readSchema, idMapping);

    // Set the schema to the expected schema directly to simulate the table schema evolving
    TestTables.replaceMetadata(desc,
        TestTables.readMetadata(desc).updateSchema(expectedSchema, 100));

    Dataset<Row> df = spark.read()
        .format("org.apache.iceberg.spark.source.TestIcebergSource")
        .option("iceberg.table.name", desc)
        .load();

    return SparkValueConverter.convert(readSchema, df.collectAsList().get(0));

  } finally {
    TestTables.clearTables();
  }
}
 
Example 22
Source Project: iceberg    Source File: TestForwardCompatibility.java    License: Apache License 2.0
@Test
public void testSparkCanReadUnknownTransform() throws IOException {
  File parent = temp.newFolder("avro");
  File location = new File(parent, "test");
  File dataFolder = new File(location, "data");
  dataFolder.mkdirs();

  HadoopTables tables = new HadoopTables(CONF);
  Table table = tables.create(SCHEMA, UNKNOWN_SPEC, location.toString());

  // enable snapshot inheritance to avoid rewriting the manifest with an unknown transform
  table.updateProperties().set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true").commit();

  List<GenericData.Record> expected = RandomData.generateList(table.schema(), 100, 1L);

  File parquetFile = new File(dataFolder,
      FileFormat.PARQUET.addExtension(UUID.randomUUID().toString()));
  FileAppender<GenericData.Record> writer = Parquet.write(localOutput(parquetFile))
      .schema(table.schema())
      .build();
  try {
    writer.addAll(expected);
  } finally {
    writer.close();
  }

  DataFile file = DataFiles.builder(FAKE_SPEC)
      .withInputFile(localInput(parquetFile))
      .withMetrics(writer.metrics())
      .withPartitionPath("id_zero=0")
      .build();

  OutputFile manifestFile = localOutput(FileFormat.AVRO.addExtension(temp.newFile().toString()));
  ManifestWriter manifestWriter = ManifestFiles.write(FAKE_SPEC, manifestFile);
  try {
    manifestWriter.add(file);
  } finally {
    manifestWriter.close();
  }

  table.newFastAppend().appendManifest(manifestWriter.toManifestFile()).commit();

  Dataset<Row> df = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<Row> rows = df.collectAsList();
  Assert.assertEquals("Should contain 100 rows", 100, rows.size());

  for (int i = 0; i < expected.size(); i += 1) {
    TestHelpers.assertEqualsSafe(table.schema().asStruct(), expected.get(i), rows.get(i));
  }
}
 
Example 23
Source Project: iceberg    Source File: TestAvroScan.java    License: Apache License 2.0
@Override
protected void writeAndValidate(Schema schema) throws IOException {
  File parent = temp.newFolder("avro");
  File location = new File(parent, "test");
  File dataFolder = new File(location, "data");
  dataFolder.mkdirs();

  File avroFile = new File(dataFolder,
      FileFormat.AVRO.addExtension(UUID.randomUUID().toString()));

  HadoopTables tables = new HadoopTables(CONF);
  Table table = tables.create(schema, PartitionSpec.unpartitioned(), location.toString());

  // Important: use the table's schema for the rest of the test
  // When tables are created, the column ids are reassigned.
  Schema tableSchema = table.schema();

  List<Record> expected = RandomData.generateList(tableSchema, 100, 1L);

  try (FileAppender<Record> writer = Avro.write(localOutput(avroFile))
      .schema(tableSchema)
      .build()) {
    writer.addAll(expected);
  }

  DataFile file = DataFiles.builder(PartitionSpec.unpartitioned())
      .withRecordCount(100)
      .withFileSizeInBytes(avroFile.length())
      .withPath(avroFile.toString())
      .build();

  table.newAppend().appendFile(file).commit();

  Dataset<Row> df = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<Row> rows = df.collectAsList();
  Assert.assertEquals("Should contain 100 rows", 100, rows.size());

  for (int i = 0; i < expected.size(); i += 1) {
    TestHelpers.assertEqualsSafe(tableSchema.asStruct(), expected.get(i), rows.get(i));
  }
}
 
Example 24
Source Project: iceberg    Source File: TestPartitionValues.java    License: Apache License 2.0
@Test
public void testPartitionValueTypes() throws Exception {
  String[] columnNames = new String[] {
      "b", "i", "l", "f", "d", "date", "ts", "s", "bytes", "dec_9_0", "dec_11_2", "dec_38_10"
  };

  HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf());

  // create a table around the source data
  String sourceLocation = temp.newFolder("source_table").toString();
  Table source = tables.create(SUPPORTED_PRIMITIVES, sourceLocation);

  // write out an Avro data file with all of the data types for source data
  List<GenericData.Record> expected = RandomData.generateList(source.schema(), 2, 128735L);
  File avroData = temp.newFile("data.avro");
  Assert.assertTrue(avroData.delete());
  try (FileAppender<GenericData.Record> appender = Avro.write(Files.localOutput(avroData))
      .schema(source.schema())
      .build()) {
    appender.addAll(expected);
  }

  // add the Avro data file to the source table
  source.newAppend()
      .appendFile(DataFiles.fromInputFile(Files.localInput(avroData), 10))
      .commit();

  Dataset<Row> sourceDF = spark.read().format("iceberg").load(sourceLocation);

  for (String column : columnNames) {
    String desc = "partition_by_" + SUPPORTED_PRIMITIVES.findType(column).toString();

    File parent = temp.newFolder(desc);
    File location = new File(parent, "test");
    File dataFolder = new File(location, "data");
    Assert.assertTrue("mkdirs should succeed", dataFolder.mkdirs());

    PartitionSpec spec = PartitionSpec.builderFor(SUPPORTED_PRIMITIVES).identity(column).build();

    Table table = tables.create(SUPPORTED_PRIMITIVES, spec, location.toString());
    table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit();

    sourceDF.write()
        .format("iceberg")
        .mode("append")
        .save(location.toString());

    List<Row> actual = spark.read()
        .format("iceberg")
        .load(location.toString())
        .collectAsList();

    Assert.assertEquals("Number of rows should match", expected.size(), actual.size());

    for (int i = 0; i < expected.size(); i += 1) {
      TestHelpers.assertEqualsSafe(
          SUPPORTED_PRIMITIVES.asStruct(), expected.get(i), actual.get(i));
    }
  }
}
 
Example 25
Source Project: iceberg    Source File: TestPartitionValues.java    License: Apache License 2.0
@Test
public void testNestedPartitionValues() throws Exception {
  String[] columnNames = new String[] {
      "b", "i", "l", "f", "d", "date", "ts", "s", "bytes", "dec_9_0", "dec_11_2", "dec_38_10"
  };

  HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf());
  Schema nestedSchema = new Schema(optional(1, "nested", SUPPORTED_PRIMITIVES.asStruct()));

  // create a table around the source data
  String sourceLocation = temp.newFolder("source_table").toString();
  Table source = tables.create(nestedSchema, sourceLocation);

  // write out an Avro data file with all of the data types for source data
  List<GenericData.Record> expected = RandomData.generateList(source.schema(), 2, 128735L);
  File avroData = temp.newFile("data.avro");
  Assert.assertTrue(avroData.delete());
  try (FileAppender<GenericData.Record> appender = Avro.write(Files.localOutput(avroData))
      .schema(source.schema())
      .build()) {
    appender.addAll(expected);
  }

  // add the Avro data file to the source table
  source.newAppend()
      .appendFile(DataFiles.fromInputFile(Files.localInput(avroData), 10))
      .commit();

  Dataset<Row> sourceDF = spark.read().format("iceberg").load(sourceLocation);

  for (String column : columnNames) {
    String desc = "partition_by_" + SUPPORTED_PRIMITIVES.findType(column).toString();

    File parent = temp.newFolder(desc);
    File location = new File(parent, "test");
    File dataFolder = new File(location, "data");
    Assert.assertTrue("mkdirs should succeed", dataFolder.mkdirs());

    PartitionSpec spec = PartitionSpec.builderFor(nestedSchema).identity("nested." + column).build();

    Table table = tables.create(nestedSchema, spec, location.toString());
    table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit();

    sourceDF.write()
        .format("iceberg")
        .mode("append")
        .save(location.toString());

    List<Row> actual = spark.read()
        .format("iceberg")
        .load(location.toString())
        .collectAsList();

    Assert.assertEquals("Number of rows should match", expected.size(), actual.size());

    for (int i = 0; i < expected.size(); i += 1) {
      TestHelpers.assertEqualsSafe(
          nestedSchema.asStruct(), expected.get(i), actual.get(i));
    }
  }
}
 
Example 26
Source Project: iceberg    Source File: TestParquetScan.java    License: Apache License 2.0
@Override
protected void writeAndValidate(Schema schema) throws IOException {
  Assume.assumeTrue("Cannot handle non-string map keys in parquet-avro",
      null == TypeUtil.find(
          schema,
          type -> type.isMapType() && type.asMapType().keyType() != Types.StringType.get()));

  File parent = temp.newFolder("parquet");
  File location = new File(parent, "test");
  File dataFolder = new File(location, "data");
  dataFolder.mkdirs();

  File parquetFile = new File(dataFolder,
      FileFormat.PARQUET.addExtension(UUID.randomUUID().toString()));

  HadoopTables tables = new HadoopTables(CONF);
  Table table = tables.create(schema, PartitionSpec.unpartitioned(), location.toString());

  // Important: use the table's schema for the rest of the test
  // When tables are created, the column ids are reassigned.
  Schema tableSchema = table.schema();

  List<GenericData.Record> expected = RandomData.generateList(tableSchema, 100, 1L);

  try (FileAppender<GenericData.Record> writer = Parquet.write(localOutput(parquetFile))
      .schema(tableSchema)
      .build()) {
    writer.addAll(expected);
  }

  DataFile file = DataFiles.builder(PartitionSpec.unpartitioned())
      .withFileSizeInBytes(parquetFile.length())
      .withPath(parquetFile.toString())
      .withRecordCount(100)
      .build();

  table.newAppend().appendFile(file).commit();
  table.updateProperties().set(TableProperties.PARQUET_VECTORIZATION_ENABLED, String.valueOf(vectorized)).commit();

  Dataset<Row> df = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<Row> rows = df.collectAsList();
  Assert.assertEquals("Should contain 100 rows", 100, rows.size());

  for (int i = 0; i < expected.size(); i += 1) {
    TestHelpers.assertEqualsSafe(tableSchema.asStruct(), expected.get(i), rows.get(i));
  }
}
 
Example 27
Source Project: iceberg   Source File: TestFilteredScan.java    License: Apache License 2.0 4 votes vote down vote up
@Before
public void writeUnpartitionedTable() throws IOException {
  this.parent = temp.newFolder("TestFilteredScan");
  this.unpartitioned = new File(parent, "unpartitioned");
  File dataFolder = new File(unpartitioned, "data");
  Assert.assertTrue("Mkdir should succeed", dataFolder.mkdirs());

  Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), unpartitioned.toString());
  Schema tableSchema = table.schema(); // use the table schema because ids are reassigned

  FileFormat fileFormat = FileFormat.valueOf(format.toUpperCase(Locale.ENGLISH));

  File testFile = new File(dataFolder, fileFormat.addExtension(UUID.randomUUID().toString()));

  // create records using the table's schema
  org.apache.avro.Schema avroSchema = AvroSchemaUtil.convert(tableSchema, "test");
  this.records = testRecords(avroSchema);

  switch (fileFormat) {
    case AVRO:
      try (FileAppender<Record> writer = Avro.write(localOutput(testFile))
          .schema(tableSchema)
          .build()) {
        writer.addAll(records);
      }
      break;

    case PARQUET:
      try (FileAppender<Record> writer = Parquet.write(localOutput(testFile))
          .schema(tableSchema)
          .build()) {
        writer.addAll(records);
      }
      break;
  }

  DataFile file = DataFiles.builder(PartitionSpec.unpartitioned())
      .withRecordCount(records.size())
      .withFileSizeInBytes(testFile.length())
      .withPath(testFile.toString())
      .build();

  table.newAppend().appendFile(file).commit();
}
 
Example 28
Source Project: dremio-oss    Source File: TestCreateTable.java    License: Apache License 2.0
@Test
public void testDroppingOfMapTypeColumn() throws Exception {
  String table1 = "iceberg_map_test";
  try {
    File table1Folder = new File(getDfsTestTmpSchemaLocation(), table1);
    HadoopTables hadoopTables = new HadoopTables(new Configuration());

    Schema schema = new Schema(
      Types.NestedField.optional(1, "col1", Types.MapType.ofOptional(1, 2, Types.IntegerType.get(), Types.StringType.get())),
      Types.NestedField.optional(2, "col2", Types.IntegerType.get())
    );
    PartitionSpec spec = PartitionSpec
      .builderFor(schema)
      .build();
    Table table = hadoopTables.create(schema, spec, table1Folder.getPath());
    Transaction transaction = table.newTransaction();
    AppendFiles appendFiles = transaction.newAppend();
    final String testWorkingPath = TestTools.getWorkingPath() + "/src/test/resources/iceberg/mapTest";
    final String parquetFile = "iceberg_map_test.parquet";
    File dataFile = new File(testWorkingPath, parquetFile);
    appendFiles.appendFile(
      DataFiles.builder(spec)
        .withInputFile(Files.localInput(dataFile))
        .withRecordCount(1)
        .withFormat(FileFormat.PARQUET)
        .build()
    );
    appendFiles.commit();
    transaction.commitTransaction();

    testBuilder()
      .sqlQuery("select * from dfs_test.iceberg_map_test")
      .unOrdered()
      .baselineColumns("col2")
      .baselineValues(1)
      .build()
      .run();

    Thread.sleep(1001);
    String insertCommandSql = "insert into  dfs_test.iceberg_map_test select * from (values(2))";
    test(insertCommandSql);
    Thread.sleep(1001);

    testBuilder()
      .sqlQuery("select * from dfs_test.iceberg_map_test")
      .unOrdered()
      .baselineColumns("col2")
      .baselineValues(1)
      .baselineValues(2)
      .build()
      .run();
  } finally {
    FileUtils.deleteQuietly(new File(getDfsTestTmpSchemaLocation(), table1));
  }
}