org.apache.iceberg.FileFormat Java Examples

The following examples show how to use org.apache.iceberg.FileFormat. Each example is taken from an open-source project; its source file and license are noted above the snippet.
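Before the examples, here is a minimal, self-contained sketch of the FileFormat calls the snippets below rely on (addExtension, fromFileName, and valueOf on the enum). The class name and file path in the sketch are made up for illustration.

import java.util.Locale;
import org.apache.iceberg.FileFormat;

public class FileFormatSketch {
  public static void main(String[] args) {
    // Append the format-specific extension to a base path (see Examples #1, #3, and #20).
    String dataPath = FileFormat.PARQUET.addExtension("data/00000-0-demo");  // "data/00000-0-demo.parquet"

    // Recover the format from an existing file name (see Examples #27 and #28).
    FileFormat fromName = FileFormat.fromFileName(dataPath);                 // FileFormat.PARQUET

    // Resolve a format from a table property value, as Example #12 does.
    String formatString = "avro";  // e.g. the value of TableProperties.DEFAULT_FILE_FORMAT
    FileFormat fromProperty = FileFormat.valueOf(formatString.toUpperCase(Locale.ENGLISH));

    System.out.println(dataPath + " -> " + fromName + ", property -> " + fromProperty);
  }
}
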
Example #1
Source File: SparkTableUtil.java    From iceberg with Apache License 2.0
private static Iterator<ManifestFile> buildManifest(SerializableConfiguration conf, PartitionSpec spec,
                                                    String basePath, Iterator<Tuple2<String, DataFile>> fileTuples) {
  if (fileTuples.hasNext()) {
    FileIO io = new HadoopFileIO(conf.get());
    TaskContext ctx = TaskContext.get();
    String suffix = String.format("stage-%d-task-%d-manifest", ctx.stageId(), ctx.taskAttemptId());
    Path location = new Path(basePath, suffix);
    String outputPath = FileFormat.AVRO.addExtension(location.toString());
    OutputFile outputFile = io.newOutputFile(outputPath);
    ManifestWriter<DataFile> writer = ManifestFiles.write(spec, outputFile);

    // close the writer via a try-with-resources alias so toManifestFile() is only called after close
    try (ManifestWriter<DataFile> writerRef = writer) {
      fileTuples.forEachRemaining(fileTuple -> writerRef.add(fileTuple._2));
    } catch (IOException e) {
      throw SparkExceptionUtil.toUncheckedException(e, "Unable to close the manifest writer: %s", outputPath);
    }

    ManifestFile manifestFile = writer.toManifestFile();
    return ImmutableList.of(manifestFile).iterator();
  } else {
    return Collections.emptyIterator();
  }
}
 
Example #2
Source File: TestMetricsRowGroupFilter.java    From iceberg with Apache License 2.0
@Test
public void testZeroRecordFileParquet() {
  Assume.assumeTrue(format == FileFormat.PARQUET);
  BlockMetaData emptyBlock = new BlockMetaData();
  emptyBlock.setRowCount(0);

  Expression[] exprs = new Expression[] {
      lessThan("id", 5), lessThanOrEqual("id", 30), equal("id", 70), greaterThan("id", 78),
      greaterThanOrEqual("id", 90), notEqual("id", 101), isNull("some_nulls"),
      notNull("some_nulls")
  };

  for (Expression expr : exprs) {
    boolean shouldRead = shouldReadParquet(expr, true, parquetSchema, emptyBlock);
    Assert.assertFalse("Should never read 0-record file: " + expr, shouldRead);
  }
}
 
Example #3
Source File: ParquetWritingTestUtils.java    From iceberg with Apache License 2.0
static File writeRecords(
    TemporaryFolder temp,
    Schema schema, Map<String, String> properties,
    Function<MessageType, ParquetValueWriter<?>> createWriterFunc,
    GenericData.Record... records) throws IOException {
  File tmpFolder = temp.newFolder("parquet");
  String filename = UUID.randomUUID().toString();
  File file = new File(tmpFolder, FileFormat.PARQUET.addExtension(filename));
  try (FileAppender<GenericData.Record> writer = Parquet.write(localOutput(file))
      .schema(schema)
      .setAll(properties)
      .createWriterFunc(createWriterFunc)
      .build()) {
    writer.addAll(Lists.newArrayList(records));
  }
  return file;
}
 
Example #4
Source File: TestMetricsRowGroupFilter.java    From iceberg with Apache License 2.0
@Test
public void testAllNulls() {
  boolean shouldRead;

  // ORC-623: ORC does not skip a row group for a notNull predicate on a column with all nulls
  // boolean shouldRead = shouldRead(notNull("all_nulls"));
  if (format != FileFormat.ORC) {
    shouldRead = shouldRead(notNull("all_nulls"));
    Assert.assertFalse("Should skip: no non-null value in all null column", shouldRead);
  }

  shouldRead = shouldRead(notNull("some_nulls"));
  Assert.assertTrue("Should read: column with some nulls contains a non-null value", shouldRead);

  shouldRead = shouldRead(notNull("no_nulls"));
  Assert.assertTrue("Should read: non-null column contains a non-null value", shouldRead);

  shouldRead = shouldRead(notNull("map_not_null"));
  Assert.assertTrue("Should read: map type is not skipped", shouldRead);

  shouldRead = shouldRead(notNull("struct_not_null"));
  Assert.assertTrue("Should read: struct type is not skipped", shouldRead);
}
 
Example #5
Source File: TestIcebergSmoke.java    From presto with Apache License 2.0
private void testSchemaEvolution(Session session, FileFormat fileFormat)
{
    assertUpdate(session, "CREATE TABLE test_schema_evolution_drop_end (col0 INTEGER, col1 INTEGER, col2 INTEGER) WITH (format = '" + fileFormat + "')");
    assertUpdate(session, "INSERT INTO test_schema_evolution_drop_end VALUES (0, 1, 2)", 1);
    assertQuery(session, "SELECT * FROM test_schema_evolution_drop_end", "VALUES(0, 1, 2)");
    assertUpdate(session, "ALTER TABLE test_schema_evolution_drop_end DROP COLUMN col2");
    assertQuery(session, "SELECT * FROM test_schema_evolution_drop_end", "VALUES(0, 1)");
    assertUpdate(session, "ALTER TABLE test_schema_evolution_drop_end ADD COLUMN col2 INTEGER");
    assertUpdate(session, "INSERT INTO test_schema_evolution_drop_end VALUES (3, 4, 5)", 1);
    assertQuery(session, "SELECT * FROM test_schema_evolution_drop_end", "VALUES(0, 1, NULL), (3, 4, 5)");
    dropTable(session, "test_schema_evolution_drop_end");

    assertUpdate(session, "CREATE TABLE test_schema_evolution_drop_middle (col0 INTEGER, col1 INTEGER, col2 INTEGER) WITH (format = '" + fileFormat + "')");
    assertUpdate(session, "INSERT INTO test_schema_evolution_drop_middle VALUES (0, 1, 2)", 1);
    assertQuery(session, "SELECT * FROM test_schema_evolution_drop_middle", "VALUES(0, 1, 2)");
    assertUpdate(session, "ALTER TABLE test_schema_evolution_drop_middle DROP COLUMN col1");
    assertQuery(session, "SELECT * FROM test_schema_evolution_drop_middle", "VALUES(0, 2)");
    assertUpdate(session, "ALTER TABLE test_schema_evolution_drop_middle ADD COLUMN col1 INTEGER");
    assertUpdate(session, "INSERT INTO test_schema_evolution_drop_middle VALUES (3, 4, 5)", 1);
    assertQuery(session, "SELECT * FROM test_schema_evolution_drop_middle", "VALUES(0, 2, NULL), (3, 4, 5)");
    dropTable(session, "test_schema_evolution_drop_middle");
}
 
Example #6
Source File: SparkBatchScan.java    From iceberg with Apache License 2.0
@Override
public PartitionReaderFactory createReaderFactory() {
  boolean allParquetFileScanTasks =
      tasks().stream()
          .allMatch(combinedScanTask -> !combinedScanTask.isDataTask() && combinedScanTask.files()
              .stream()
              .allMatch(fileScanTask -> fileScanTask.file().format().equals(
                  FileFormat.PARQUET)));

  boolean atLeastOneColumn = expectedSchema.columns().size() > 0;

  boolean hasNoIdentityProjections = tasks().stream()
      .allMatch(combinedScanTask -> combinedScanTask.files()
          .stream()
          .allMatch(fileScanTask -> fileScanTask.spec().identitySourceIds().isEmpty()));

  boolean onlyPrimitives = expectedSchema.columns().stream().allMatch(c -> c.type().isPrimitiveType());

  boolean readUsingBatch = batchReadsEnabled && allParquetFileScanTasks && atLeastOneColumn &&
      hasNoIdentityProjections && onlyPrimitives;

  return new ReaderFactory(readUsingBatch ? batchSize : 0);
}
 
Example #7
Source File: ParquetRecordWriter.java    From dremio-oss with Apache License 2.0
private byte[] getIcebergMetaData() throws IOException {
  if (!this.isIcebergWriter) {
    return null;
  }

  final long fileSize = parquetFileWriter.getPos();
  DataFiles.Builder dataFileBuilder =
    DataFiles.builder(IcebergCatalog.getIcebergPartitionSpec(this.batchSchema, this.partitionColumns))
      .withPath(path.toString())
      .withFileSizeInBytes(fileSize)
      .withRecordCount(recordCount)
      .withFormat(FileFormat.PARQUET);

  // add partition info
  if (partitionColumns != null) {
    dataFileBuilder = dataFileBuilder.withPartition(partition.getIcebergPartitionData());
  }

  // add column level metrics
  Metrics metrics = footerMetricsToIcebergMetrics(parquetFileWriter.getFooter(), batchSchema);
  dataFileBuilder = dataFileBuilder.withMetrics(metrics);
  return IcebergSerDe.serializeDataFile(dataFileBuilder.build());
}
 
Example #8
Source File: TestIcebergSmoke.java    From presto with Apache License 2.0
private void testCreatePartitionedTableWithNestedTypes(Session session, FileFormat fileFormat)
{
    @Language("SQL") String createTable = "" +
            "CREATE TABLE test_partitioned_table_nested_type (" +
            "  _string VARCHAR" +
            ", _struct ROW(_field1 INT, _field2 VARCHAR)" +
            ", _date DATE" +
            ") " +
            "WITH (" +
            "format = '" + fileFormat + "', " +
            "partitioning = ARRAY['_date']" +
            ")";

    assertUpdate(session, createTable);

    dropTable(session, "test_partitioned_table_nested_type");
}
 
Example #9
Source File: IcebergFileWriterFactory.java    From presto with Apache License 2.0
public IcebergFileWriter createFileWriter(
        Path outputPath,
        Schema icebergSchema,
        List<IcebergColumnHandle> columns,
        JobConf jobConf,
        ConnectorSession session,
        FileFormat fileFormat)
{
    switch (fileFormat) {
        case PARQUET:
            return createParquetWriter(outputPath, icebergSchema, columns, jobConf, session);
        case ORC:
            return createOrcWriter(outputPath, icebergSchema, jobConf, session);
    }
    throw new PrestoException(NOT_SUPPORTED, "File format not supported for Iceberg: " + fileFormat);
}
 
Example #10
Source File: IcebergWritableTableHandle.java    From presto with Apache License 2.0
@JsonCreator
public IcebergWritableTableHandle(
        @JsonProperty("schemaName") String schemaName,
        @JsonProperty("tableName") String tableName,
        @JsonProperty("schemaAsJson") String schemaAsJson,
        @JsonProperty("partitionSpecAsJson") String partitionSpecAsJson,
        @JsonProperty("inputColumns") List<IcebergColumnHandle> inputColumns,
        @JsonProperty("outputPath") String outputPath,
        @JsonProperty("fileFormat") FileFormat fileFormat)
{
    this.schemaName = requireNonNull(schemaName, "schemaName is null");
    this.tableName = requireNonNull(tableName, "tableName is null");
    this.schemaAsJson = requireNonNull(schemaAsJson, "schemaAsJson is null");
    this.partitionSpecAsJson = requireNonNull(partitionSpecAsJson, "partitionSpecAsJson is null");
    this.inputColumns = ImmutableList.copyOf(requireNonNull(inputColumns, "inputColumns is null"));
    this.outputPath = requireNonNull(outputPath, "filePrefix is null");
    this.fileFormat = requireNonNull(fileFormat, "fileFormat is null");
}
 
Example #11
Source File: Reader.java    From iceberg with Apache License 2.0
@Override
public boolean enableBatchRead() {
  if (readUsingBatch == null) {
    boolean allParquetFileScanTasks =
        tasks().stream()
            .allMatch(combinedScanTask -> !combinedScanTask.isDataTask() && combinedScanTask.files()
                .stream()
                .allMatch(fileScanTask -> fileScanTask.file().format().equals(
                    FileFormat.PARQUET)));

    boolean atLeastOneColumn = lazySchema().columns().size() > 0;

    boolean hasNoIdentityProjections = tasks().stream()
        .allMatch(combinedScanTask -> combinedScanTask.files()
            .stream()
            .allMatch(fileScanTask -> fileScanTask.spec().identitySourceIds().isEmpty()));

    boolean onlyPrimitives = lazySchema().columns().stream().allMatch(c -> c.type().isPrimitiveType());

    this.readUsingBatch = batchReadsEnabled && allParquetFileScanTasks && atLeastOneColumn &&
        hasNoIdentityProjections && onlyPrimitives;
  }
  return readUsingBatch;
}
 
Example #12
Source File: RowDataRewriter.java    From iceberg with Apache License 2.0
public RowDataRewriter(Table table, PartitionSpec spec, boolean caseSensitive,
                       Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager) {
  this.schema = table.schema();
  this.spec = spec;
  this.locations = table.locationProvider();
  this.properties = table.properties();
  this.io = io;
  this.encryptionManager = encryptionManager;

  this.caseSensitive = caseSensitive;
  this.nameMapping = table.properties().get(DEFAULT_NAME_MAPPING);

  String formatString = table.properties().getOrDefault(
      TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT);
  this.format = FileFormat.valueOf(formatString.toUpperCase(Locale.ENGLISH));
}
 
Example #13
Source File: IcebergSplit.java    From presto with Apache License 2.0
@JsonCreator
public IcebergSplit(
        @JsonProperty("path") String path,
        @JsonProperty("start") long start,
        @JsonProperty("length") long length,
        @JsonProperty("fileFormat") FileFormat fileFormat,
        @JsonProperty("addresses") List<HostAddress> addresses,
        @JsonProperty("partitionKeys") Map<Integer, String> partitionKeys)
{
    this.path = requireNonNull(path, "path is null");
    this.start = start;
    this.length = length;
    this.fileFormat = requireNonNull(fileFormat, "fileFormat is null");
    this.addresses = ImmutableList.copyOf(requireNonNull(addresses, "addresses is null"));
    this.partitionKeys = Collections.unmodifiableMap(requireNonNull(partitionKeys, "partitionKeys is null"));
}
 
Example #14
Source File: SparkBatchWrite.java    From iceberg with Apache License 2.0
protected WriterFactory(PartitionSpec spec, FileFormat format, LocationProvider locations,
                        Map<String, String> properties, Broadcast<FileIO> io,
                        Broadcast<EncryptionManager> encryptionManager, long targetFileSize,
                        Schema writeSchema, StructType dsSchema) {
  this.spec = spec;
  this.format = format;
  this.locations = locations;
  this.properties = properties;
  this.io = io;
  this.encryptionManager = encryptionManager;
  this.targetFileSize = targetFileSize;
  this.writeSchema = writeSchema;
  this.dsSchema = dsSchema;
}
 
Example #15
Source File: TestParquetMetrics.java    From iceberg with Apache License 2.0
private InputFile writeRecords(Schema schema, Map<String, String> properties, Record... records) throws IOException {
  File tmpFolder = temp.newFolder("parquet");
  String filename = UUID.randomUUID().toString();
  OutputFile file = Files.localOutput(new File(tmpFolder, FileFormat.PARQUET.addExtension(filename)));
  try (FileAppender<Record> writer = Parquet.write(file)
      .schema(schema)
      .setAll(properties)
      .createWriterFunc(GenericParquetWriter::buildWriter)
      .build()) {
    writer.addAll(Lists.newArrayList(records));
  }
  return file.toInputFile();
}
 
Example #16
Source File: TestOrcMetrics.java    From iceberg with Apache License 2.0
private InputFile writeRecords(Schema schema, Map<String, String> properties, Record... records) throws IOException {
  File tmpFolder = temp.newFolder("orc");
  String filename = UUID.randomUUID().toString();
  OutputFile file = Files.localOutput(new File(tmpFolder, FileFormat.ORC.addExtension(filename)));
  try (FileAppender<Record> writer = ORC.write(file)
      .schema(schema)
      .setAll(properties)
      .createWriterFunc(GenericOrcWriter::buildWriter)
      .build()) {
    writer.addAll(Lists.newArrayList(records));
  }
  return file.toInputFile();
}
 
Example #17
Source File: TestRefresh.java    From dremio-oss with Apache License 2.0
private DataFile createDataFile(File dir, String fileName) throws Exception {
  File dataFile = new File(dir, fileName);
  URI resource = Resources.getResource(
    "iceberg/nation/data/00000-1-a9e8d979-a183-40c5-af3d-a338ab62be8b-00000.parquet").toURI();
  Files.copy(Paths.get(resource), dataFile.toPath());

  return DataFiles.builder(PartitionSpec.builderFor(schema).build())
    .withInputFile(org.apache.iceberg.Files.localInput(dataFile))
    .withRecordCount(25)
    .withFormat(FileFormat.PARQUET)
    .build();
}
 
Example #18
Source File: TestIcebergSmoke.java    From presto with Apache License 2.0
private void testPredicating(Session session, FileFormat fileFormat)
{
    assertUpdate(session, "CREATE TABLE test_predicating_on_real (col REAL) WITH (format = '" + fileFormat + "')");
    assertUpdate(session, "INSERT INTO test_predicating_on_real VALUES 1.2", 1);
    assertQuery(session, "SELECT * FROM test_predicating_on_real WHERE col = 1.2", "VALUES 1.2");
    dropTable(session, "test_predicating_on_real");
}
 
Example #19
Source File: IcebergPageSink.java    From presto with Apache License 2.0
public IcebergPageSink(
        Schema outputSchema,
        PartitionSpec partitionSpec,
        String outputPath,
        IcebergFileWriterFactory fileWriterFactory,
        PageIndexerFactory pageIndexerFactory,
        HdfsEnvironment hdfsEnvironment,
        HdfsContext hdfsContext,
        List<IcebergColumnHandle> inputColumns,
        JsonCodec<CommitTaskData> jsonCodec,
        ConnectorSession session,
        FileFormat fileFormat)
{
    requireNonNull(inputColumns, "inputColumns is null");
    this.outputSchema = requireNonNull(outputSchema, "outputSchema is null");
    this.partitionSpec = requireNonNull(partitionSpec, "partitionSpec is null");
    this.outputPath = requireNonNull(outputPath, "outputPath is null");
    this.fileWriterFactory = requireNonNull(fileWriterFactory, "fileWriterFactory is null");
    this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null");
    requireNonNull(hdfsContext, "hdfsContext is null");
    this.jobConf = toJobConf(hdfsEnvironment.getConfiguration(hdfsContext, new Path(outputPath)));
    this.jsonCodec = requireNonNull(jsonCodec, "jsonCodec is null");
    this.session = requireNonNull(session, "session is null");
    this.fileFormat = requireNonNull(fileFormat, "fileFormat is null");
    this.inputColumns = ImmutableList.copyOf(inputColumns);
    this.pagePartitioner = new PagePartitioner(pageIndexerFactory, toPartitionColumns(inputColumns, partitionSpec));
}
 
Example #20
Source File: TestHiveTableConcurrency.java    From iceberg with Apache License 2.0
@Test
public synchronized void testConcurrentFastAppends() {
  Table icebergTable = catalog.loadTable(TABLE_IDENTIFIER);

  String fileName = UUID.randomUUID().toString();
  DataFile file = DataFiles.builder(icebergTable.spec())
      .withPath(FileFormat.PARQUET.addExtension(fileName))
      .withRecordCount(2)
      .withFileSizeInBytes(0)
      .build();

  ExecutorService executorService = MoreExecutors.getExitingExecutorService(
      (ThreadPoolExecutor) Executors.newFixedThreadPool(2));

  AtomicInteger barrier = new AtomicInteger(0);
  Tasks.range(2)
      .stopOnFailure().throwFailureWhenFinished()
      .executeWith(executorService)
      .run(index -> {
        for (int numCommittedFiles = 0; numCommittedFiles < 10; numCommittedFiles++) {
          while (barrier.get() < numCommittedFiles * 2) {
            try {
              Thread.sleep(10);
            } catch (InterruptedException e) {
              throw new RuntimeException(e);
            }
          }

          icebergTable.newFastAppend().appendFile(file).commit();
          barrier.incrementAndGet();
        }
      });

  icebergTable.refresh();
  Assert.assertEquals(20, icebergTable.currentSnapshot().allManifests().size());
}
 
Example #21
Source File: TestMetricsRowGroupFilter.java    From iceberg with Apache License 2.0
@Test
public void testMissingStatsParquet() {
  Assume.assumeTrue(format == FileFormat.PARQUET);
  Expression[] exprs = new Expression[] {
      lessThan("no_stats_parquet", "a"), lessThanOrEqual("no_stats_parquet", "b"), equal("no_stats_parquet", "c"),
      greaterThan("no_stats_parquet", "d"), greaterThanOrEqual("no_stats_parquet", "e"),
      notEqual("no_stats_parquet", "f"), isNull("no_stats_parquet"), notNull("no_stats_parquet"),
      startsWith("no_stats_parquet", "a")
  };

  for (Expression expr : exprs) {
    boolean shouldRead = shouldRead(expr);
    Assert.assertTrue("Should read when missing stats for expr: " + expr, shouldRead);
  }
}
 
Example #22
Source File: TestMetricsRowGroupFilter.java    From iceberg with Apache License 2.0
@Test
public void testStringStartsWith() {
  Assume.assumeFalse("ORC row group filter does not support StringStartsWith", format == FileFormat.ORC);
  boolean shouldRead = shouldRead(startsWith("str", "1"));
  Assert.assertTrue("Should read: range matches", shouldRead);

  shouldRead = shouldRead(startsWith("str", "0st"));
  Assert.assertTrue("Should read: range matches", shouldRead);

  shouldRead = shouldRead(startsWith("str", "1str1"));
  Assert.assertTrue("Should read: range matches", shouldRead);

  shouldRead = shouldRead(startsWith("str", "1str1_xgd"));
  Assert.assertTrue("Should read: range matches", shouldRead);

  shouldRead = shouldRead(startsWith("str", "2str"));
  Assert.assertTrue("Should read: range matches", shouldRead);

  shouldRead = shouldRead(startsWith("str", "9xstr"));
  Assert.assertFalse("Should not read: range doesn't match", shouldRead);

  shouldRead = shouldRead(startsWith("str", "0S"));
  Assert.assertFalse("Should not read: range doesn't match", shouldRead);

  shouldRead = shouldRead(startsWith("str", "x"));
  Assert.assertFalse("Should not read: range doesn't match", shouldRead);

  shouldRead = shouldRead(startsWith("str", "9str9aaa"));
  Assert.assertFalse("Should not read: range doesn't match", shouldRead);
}
 
Example #23
Source File: BaseWriter.java    From iceberg with Apache License 2.0
public void writeInternal(InternalRow row) throws IOException {
  // TODO: the ORC appender cannot report its length before it is closed, so the target file size is not enforced for ORC
  if (!format.equals(FileFormat.ORC) &&
      currentRows % ROWS_DIVISOR == 0 && currentAppender.length() >= targetFileSize) {
    closeCurrent();
    openCurrent();
  }

  currentAppender.add(row);
  currentRows++;
}
 
Example #24
Source File: RewriteManifestsAction.java    From iceberg with Apache License 2.0
private static ManifestFile writeManifest(
    List<Row> rows, int startIndex, int endIndex, Broadcast<FileIO> io,
    String location, int format, PartitionSpec spec, StructType sparkType) throws IOException {

  String manifestName = "optimized-m-" + UUID.randomUUID();
  Path manifestPath = new Path(location, manifestName);
  OutputFile outputFile = io.value().newOutputFile(FileFormat.AVRO.addExtension(manifestPath.toString()));

  Types.StructType dataFileType = DataFile.getType(spec.partitionType());
  SparkDataFile wrapper = new SparkDataFile(dataFileType, sparkType);

  ManifestWriter writer = ManifestFiles.write(format, spec, outputFile, null);

  try {
    for (int index = startIndex; index < endIndex; index++) {
      Row row = rows.get(index);
      long snapshotId = row.getLong(0);
      long sequenceNumber = row.getLong(1);
      Row file = row.getStruct(2);
      writer.existing(wrapper.wrap(file), snapshotId, sequenceNumber);
    }
  } finally {
    writer.close();
  }

  return writer.toManifestFile();
}
 
Example #25
Source File: BatchDataReader.java    From iceberg with Apache License 2.0
@Override
CloseableIterator<ColumnarBatch> open(FileScanTask task) {
  CloseableIterable<ColumnarBatch> iter;
  InputFile location = getInputFile(task);
  Preconditions.checkNotNull(location, "Could not find InputFile associated with FileScanTask");
  if (task.file().format() == FileFormat.PARQUET) {
    Parquet.ReadBuilder builder = Parquet.read(location)
        .project(expectedSchema)
        .split(task.start(), task.length())
        .createBatchedReaderFunc(fileSchema -> VectorizedSparkParquetReaders.buildReader(expectedSchema,
            fileSchema, /* setArrowValidityVector */ NullCheckingForGet.NULL_CHECKING_ENABLED))
        .recordsPerBatch(batchSize)
        .filter(task.residual())
        .caseSensitive(caseSensitive)
        // Spark eagerly consumes the batches. So the underlying memory allocated could be reused
        // without worrying about subsequent reads clobbering over each other. This improves
        // read performance as every batch read doesn't have to pay the cost of allocating memory.
        .reuseContainers();

    if (nameMapping != null) {
      builder.withNameMapping(NameMappingParser.fromJson(nameMapping));
    }

    iter = builder.build();
  } else {
    throw new UnsupportedOperationException(
        "Format: " + task.file().format() + " not supported for batched reads");
  }
  return iter.iterator();
}
 
Example #26
Source File: OutputFileFactory.java    From iceberg with Apache License 2.0
OutputFileFactory(PartitionSpec spec, FileFormat format, LocationProvider locations, FileIO io,
                  EncryptionManager encryptionManager, int partitionId, long taskId) {
  this.spec = spec;
  this.format = format;
  this.locations = locations;
  this.io = io;
  this.encryptionManager = encryptionManager;
  this.partitionId = partitionId;
  this.taskId = taskId;
}
 
Example #27
Source File: TestDataSourceOptions.java    From iceberg with Apache License 2.0
@Test
public void testNoWriteFormatOption() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> options = Maps.newHashMap();
  options.put(TableProperties.DEFAULT_FILE_FORMAT, "avro");
  Table table = tables.create(SCHEMA, spec, options, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(tableLocation);

  try (CloseableIterable<FileScanTask> tasks = table.newScan().planFiles()) {
    tasks.forEach(task -> {
      FileFormat fileFormat = FileFormat.fromFileName(task.file().path());
      Assert.assertEquals(FileFormat.AVRO, fileFormat);
    });
  }
}
 
Example #28
Source File: TestDataSourceOptions.java    From iceberg with Apache License 2.0
@Test
public void testWriteFormatOptionOverridesTableProperties() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> options = Maps.newHashMap();
  options.put(TableProperties.DEFAULT_FILE_FORMAT, "avro");
  Table table = tables.create(SCHEMA, spec, options, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data").write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  try (CloseableIterable<FileScanTask> tasks = table.newScan().planFiles()) {
    tasks.forEach(task -> {
      FileFormat fileFormat = FileFormat.fromFileName(task.file().path());
      Assert.assertEquals(FileFormat.PARQUET, fileFormat);
    });
  }
}
 
Example #29
Source File: SparkAppenderFactory.java    From iceberg with Apache License 2.0
public FileAppender<InternalRow> newAppender(OutputFile file, FileFormat fileFormat) {
  MetricsConfig metricsConfig = MetricsConfig.fromProperties(properties);
  try {
    switch (fileFormat) {
      case PARQUET:
        return Parquet.write(file)
            .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(dsSchema, msgType))
            .setAll(properties)
            .metricsConfig(metricsConfig)
            .schema(writeSchema)
            .overwrite()
            .build();

      case AVRO:
        return Avro.write(file)
            .createWriterFunc(ignored -> new SparkAvroWriter(dsSchema))
            .setAll(properties)
            .schema(writeSchema)
            .overwrite()
            .build();

      case ORC:
        return ORC.write(file)
            .createWriterFunc(SparkOrcWriter::new)
            .setAll(properties)
            .schema(writeSchema)
            .overwrite()
            .build();

      default:
        throw new UnsupportedOperationException("Cannot write unknown format: " + fileFormat);
    }
  } catch (IOException e) {
    throw new RuntimeIOException(e);
  }
}
 
Example #30
Source File: BaseWriter.java    From iceberg with Apache License 2.0
BaseWriter(PartitionSpec spec, FileFormat format, SparkAppenderFactory appenderFactory,
           OutputFileFactory fileFactory, FileIO io, long targetFileSize) {
  this.spec = spec;
  this.format = format;
  this.appenderFactory = appenderFactory;
  this.fileFactory = fileFactory;
  this.io = io;
  this.targetFileSize = targetFileSize;
}