Java Code Examples for org.apache.iceberg.FileFormat

The following examples show how to use org.apache.iceberg.FileFormat, the enum Iceberg uses to identify a data file's on-disk format (for example PARQUET, ORC, or AVRO). The examples are extracted from open source projects; the source project and file named above each example identify where it comes from.
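Before the examples, here is a minimal, self-contained sketch of the FileFormat calls that recur throughout this page: the enum constants (PARQUET, ORC, AVRO), addExtension for producing a file name with the right suffix, and valueOf for turning a format name from configuration into the enum. The file name and format string below are illustrative placeholders, not values taken from any of the projects above.

import java.util.Locale;

import org.apache.iceberg.FileFormat;

public class FileFormatBasics {
  public static void main(String[] args) {
    // Append the format-specific suffix to a base name: "data-001" -> "data-001.parquet"
    String parquetName = FileFormat.PARQUET.addExtension("data-001");

    // Parse a format name (for example, one read from a table property) into the enum
    FileFormat format = FileFormat.valueOf("orc".toUpperCase(Locale.ENGLISH));

    // Dispatch on the format, as the writer factories in the examples below do
    switch (format) {
      case PARQUET:
      case ORC:
      case AVRO:
        System.out.println(parquetName + " / " + format);
        break;
      default:
        throw new UnsupportedOperationException("Unsupported format: " + format);
    }
  }
}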
Example 1
Source Project: presto   Source File: IcebergSplit.java    License: Apache License 2.0
@JsonCreator
public IcebergSplit(
        @JsonProperty("path") String path,
        @JsonProperty("start") long start,
        @JsonProperty("length") long length,
        @JsonProperty("fileFormat") FileFormat fileFormat,
        @JsonProperty("addresses") List<HostAddress> addresses,
        @JsonProperty("partitionKeys") Map<Integer, String> partitionKeys)
{
    this.path = requireNonNull(path, "path is null");
    this.start = start;
    this.length = length;
    this.fileFormat = requireNonNull(fileFormat, "fileFormat is null");
    this.addresses = ImmutableList.copyOf(requireNonNull(addresses, "addresses is null"));
    this.partitionKeys = Collections.unmodifiableMap(requireNonNull(partitionKeys, "partitionKeys is null"));
}
 
Example 2
Source Project: presto   Source File: IcebergWritableTableHandle.java    License: Apache License 2.0
@JsonCreator
public IcebergWritableTableHandle(
        @JsonProperty("schemaName") String schemaName,
        @JsonProperty("tableName") String tableName,
        @JsonProperty("schemaAsJson") String schemaAsJson,
        @JsonProperty("partitionSpecAsJson") String partitionSpecAsJson,
        @JsonProperty("inputColumns") List<IcebergColumnHandle> inputColumns,
        @JsonProperty("outputPath") String outputPath,
        @JsonProperty("fileFormat") FileFormat fileFormat)
{
    this.schemaName = requireNonNull(schemaName, "schemaName is null");
    this.tableName = requireNonNull(tableName, "tableName is null");
    this.schemaAsJson = requireNonNull(schemaAsJson, "schemaAsJson is null");
    this.partitionSpecAsJson = requireNonNull(partitionSpecAsJson, "partitionSpecAsJson is null");
    this.inputColumns = ImmutableList.copyOf(requireNonNull(inputColumns, "inputColumns is null"));
    this.outputPath = requireNonNull(outputPath, "outputPath is null");
    this.fileFormat = requireNonNull(fileFormat, "fileFormat is null");
}
 
Example 3
Source Project: presto   Source File: IcebergFileWriterFactory.java    License: Apache License 2.0
public IcebergFileWriter createFileWriter(
        Path outputPath,
        Schema icebergSchema,
        List<IcebergColumnHandle> columns,
        JobConf jobConf,
        ConnectorSession session,
        FileFormat fileFormat)
{
    switch (fileFormat) {
        case PARQUET:
            return createParquetWriter(outputPath, icebergSchema, columns, jobConf, session);
        case ORC:
            return createOrcWriter(outputPath, icebergSchema, jobConf, session);
    }
    throw new PrestoException(NOT_SUPPORTED, "File format not supported for Iceberg: " + fileFormat);
}
 
Example 4
Source Project: presto   Source File: TestIcebergSmoke.java    License: Apache License 2.0
private void testCreatePartitionedTableWithNestedTypes(Session session, FileFormat fileFormat)
{
    @Language("SQL") String createTable = "" +
            "CREATE TABLE test_partitioned_table_nested_type (" +
            "  _string VARCHAR" +
            ", _struct ROW(_field1 INT, _field2 VARCHAR)" +
            ", _date DATE" +
            ") " +
            "WITH (" +
            "format = '" + fileFormat + "', " +
            "partitioning = ARRAY['_date']" +
            ")";

    assertUpdate(session, createTable);

    dropTable(session, "test_partitioned_table_nested_type");
}
 
Example 5
Source Project: presto   Source File: TestIcebergSmoke.java    License: Apache License 2.0
private void testSchemaEvolution(Session session, FileFormat fileFormat)
{
    assertUpdate(session, "CREATE TABLE test_schema_evolution_drop_end (col0 INTEGER, col1 INTEGER, col2 INTEGER) WITH (format = '" + fileFormat + "')");
    assertUpdate(session, "INSERT INTO test_schema_evolution_drop_end VALUES (0, 1, 2)", 1);
    assertQuery(session, "SELECT * FROM test_schema_evolution_drop_end", "VALUES(0, 1, 2)");
    assertUpdate(session, "ALTER TABLE test_schema_evolution_drop_end DROP COLUMN col2");
    assertQuery(session, "SELECT * FROM test_schema_evolution_drop_end", "VALUES(0, 1)");
    assertUpdate(session, "ALTER TABLE test_schema_evolution_drop_end ADD COLUMN col2 INTEGER");
    assertUpdate(session, "INSERT INTO test_schema_evolution_drop_end VALUES (3, 4, 5)", 1);
    assertQuery(session, "SELECT * FROM test_schema_evolution_drop_end", "VALUES(0, 1, NULL), (3, 4, 5)");
    dropTable(session, "test_schema_evolution_drop_end");

    assertUpdate(session, "CREATE TABLE test_schema_evolution_drop_middle (col0 INTEGER, col1 INTEGER, col2 INTEGER) WITH (format = '" + fileFormat + "')");
    assertUpdate(session, "INSERT INTO test_schema_evolution_drop_middle VALUES (0, 1, 2)", 1);
    assertQuery(session, "SELECT * FROM test_schema_evolution_drop_middle", "VALUES(0, 1, 2)");
    assertUpdate(session, "ALTER TABLE test_schema_evolution_drop_middle DROP COLUMN col1");
    assertQuery(session, "SELECT * FROM test_schema_evolution_drop_middle", "VALUES(0, 2)");
    assertUpdate(session, "ALTER TABLE test_schema_evolution_drop_middle ADD COLUMN col1 INTEGER");
    assertUpdate(session, "INSERT INTO test_schema_evolution_drop_middle VALUES (3, 4, 5)", 1);
    assertQuery(session, "SELECT * FROM test_schema_evolution_drop_middle", "VALUES(0, 2, NULL), (3, 4, 5)");
    dropTable(session, "test_schema_evolution_drop_middle");
}
 
Example 6
Source Project: iceberg   Source File: Reader.java    License: Apache License 2.0
@Override
public boolean enableBatchRead() {
  if (readUsingBatch == null) {
    boolean allParquetFileScanTasks =
        tasks().stream()
            .allMatch(combinedScanTask -> !combinedScanTask.isDataTask() && combinedScanTask.files()
                .stream()
                .allMatch(fileScanTask -> fileScanTask.file().format().equals(
                    FileFormat.PARQUET)));

    boolean atLeastOneColumn = lazySchema().columns().size() > 0;

    boolean hasNoIdentityProjections = tasks().stream()
        .allMatch(combinedScanTask -> combinedScanTask.files()
            .stream()
            .allMatch(fileScanTask -> fileScanTask.spec().identitySourceIds().isEmpty()));

    boolean onlyPrimitives = lazySchema().columns().stream().allMatch(c -> c.type().isPrimitiveType());

    this.readUsingBatch = batchReadsEnabled && allParquetFileScanTasks && atLeastOneColumn &&
        hasNoIdentityProjections && onlyPrimitives;
  }
  return readUsingBatch;
}
 
Example 7
Source Project: iceberg   Source File: TestMetricsRowGroupFilter.java    License: Apache License 2.0
@Test
public void testAllNulls() {
  boolean shouldRead;

  // ORC-623: ORC does not skip a row group for a notNull predicate on a column with all nulls
  // boolean shouldRead = shouldRead(notNull("all_nulls"));
  if (format != FileFormat.ORC) {
    shouldRead = shouldRead(notNull("all_nulls"));
    Assert.assertFalse("Should skip: no non-null value in all null column", shouldRead);
  }

  shouldRead = shouldRead(notNull("some_nulls"));
  Assert.assertTrue("Should read: column with some nulls contains a non-null value", shouldRead);

  shouldRead = shouldRead(notNull("no_nulls"));
  Assert.assertTrue("Should read: non-null column contains a non-null value", shouldRead);

  shouldRead = shouldRead(notNull("map_not_null"));
  Assert.assertTrue("Should read: map type is not skipped", shouldRead);

  shouldRead = shouldRead(notNull("struct_not_null"));
  Assert.assertTrue("Should read: struct type is not skipped", shouldRead);
}
 
Example 8
Source Project: iceberg   Source File: TestMetricsRowGroupFilter.java    License: Apache License 2.0
@Test
public void testZeroRecordFileParquet() {
  Assume.assumeTrue(format == FileFormat.PARQUET);
  BlockMetaData emptyBlock = new BlockMetaData();
  emptyBlock.setRowCount(0);

  Expression[] exprs = new Expression[] {
      lessThan("id", 5), lessThanOrEqual("id", 30), equal("id", 70), greaterThan("id", 78),
      greaterThanOrEqual("id", 90), notEqual("id", 101), isNull("some_nulls"),
      notNull("some_nulls")
  };

  for (Expression expr : exprs) {
    boolean shouldRead = shouldReadParquet(expr, true, parquetSchema, emptyBlock);
    Assert.assertFalse("Should never read 0-record file: " + expr, shouldRead);
  }
}
 
Example 9
Source Project: iceberg   Source File: SparkTableUtil.java    License: Apache License 2.0
private static Iterator<ManifestFile> buildManifest(SerializableConfiguration conf, PartitionSpec spec,
                                                    String basePath, Iterator<Tuple2<String, DataFile>> fileTuples) {
  if (fileTuples.hasNext()) {
    FileIO io = new HadoopFileIO(conf.get());
    TaskContext ctx = TaskContext.get();
    String suffix = String.format("stage-%d-task-%d-manifest", ctx.stageId(), ctx.taskAttemptId());
    Path location = new Path(basePath, suffix);
    String outputPath = FileFormat.AVRO.addExtension(location.toString());
    OutputFile outputFile = io.newOutputFile(outputPath);
    ManifestWriter<DataFile> writer = ManifestFiles.write(spec, outputFile);

    try (ManifestWriter<DataFile> writerRef = writer) {
      fileTuples.forEachRemaining(fileTuple -> writerRef.add(fileTuple._2));
    } catch (IOException e) {
      throw SparkExceptionUtil.toUncheckedException(e, "Unable to close the manifest writer: %s", outputPath);
    }

    ManifestFile manifestFile = writer.toManifestFile();
    return ImmutableList.of(manifestFile).iterator();
  } else {
    return Collections.emptyIterator();
  }
}
 
Example 10
Source Project: iceberg   Source File: RowDataRewriter.java    License: Apache License 2.0
public RowDataRewriter(Table table, PartitionSpec spec, boolean caseSensitive,
                       Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager) {
  this.schema = table.schema();
  this.spec = spec;
  this.locations = table.locationProvider();
  this.properties = table.properties();
  this.io = io;
  this.encryptionManager = encryptionManager;

  this.caseSensitive = caseSensitive;
  this.nameMapping = table.properties().get(DEFAULT_NAME_MAPPING);

  String formatString = table.properties().getOrDefault(
      TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT);
  this.format = FileFormat.valueOf(formatString.toUpperCase(Locale.ENGLISH));
}
 
Example 11
Source Project: iceberg   Source File: SparkBatchScan.java    License: Apache License 2.0
@Override
public PartitionReaderFactory createReaderFactory() {
  boolean allParquetFileScanTasks =
      tasks().stream()
          .allMatch(combinedScanTask -> !combinedScanTask.isDataTask() && combinedScanTask.files()
              .stream()
              .allMatch(fileScanTask -> fileScanTask.file().format().equals(
                  FileFormat.PARQUET)));

  boolean atLeastOneColumn = expectedSchema.columns().size() > 0;

  boolean hasNoIdentityProjections = tasks().stream()
      .allMatch(combinedScanTask -> combinedScanTask.files()
          .stream()
          .allMatch(fileScanTask -> fileScanTask.spec().identitySourceIds().isEmpty()));

  boolean onlyPrimitives = expectedSchema.columns().stream().allMatch(c -> c.type().isPrimitiveType());

  boolean readUsingBatch = batchReadsEnabled && allParquetFileScanTasks && atLeastOneColumn &&
      hasNoIdentityProjections && onlyPrimitives;

  return new ReaderFactory(readUsingBatch ? batchSize : 0);
}
 
Example 12
Source Project: iceberg   Source File: ParquetWritingTestUtils.java    License: Apache License 2.0
static File writeRecords(
    TemporaryFolder temp,
    Schema schema, Map<String, String> properties,
    Function<MessageType, ParquetValueWriter<?>> createWriterFunc,
    GenericData.Record... records) throws IOException {
  File tmpFolder = temp.newFolder("parquet");
  String filename = UUID.randomUUID().toString();
  File file = new File(tmpFolder, FileFormat.PARQUET.addExtension(filename));
  try (FileAppender<GenericData.Record> writer = Parquet.write(localOutput(file))
      .schema(schema)
      .setAll(properties)
      .createWriterFunc(createWriterFunc)
      .build()) {
    writer.addAll(Lists.newArrayList(records));
  }
  return file;
}
 
Example 13
Source Project: dremio-oss   Source File: ParquetRecordWriter.java    License: Apache License 2.0
private byte[] getIcebergMetaData() throws IOException {
  if (!this.isIcebergWriter) {
    return null;
  }

  final long fileSize = parquetFileWriter.getPos();
  DataFiles.Builder dataFileBuilder =
    DataFiles.builder(IcebergCatalog.getIcebergPartitionSpec(this.batchSchema, this.partitionColumns))
      .withPath(path.toString())
      .withFileSizeInBytes(fileSize)
      .withRecordCount(recordCount)
      .withFormat(FileFormat.PARQUET);

  // add partition info
  if (partitionColumns != null) {
    dataFileBuilder = dataFileBuilder.withPartition(partition.getIcebergPartitionData());
  }

  // add column level metrics
  Metrics metrics = footerMetricsToIcebergMetrics(parquetFileWriter.getFooter(), batchSchema);
  dataFileBuilder = dataFileBuilder.withMetrics(metrics);
  return IcebergSerDe.serializeDataFile(dataFileBuilder.build());
}
 
Example 14
Source Project: presto   Source File: IcebergTableProperties.java    License: Apache License 2.0
@Inject
public IcebergTableProperties(IcebergConfig icebergConfig)
{
    tableProperties = ImmutableList.<PropertyMetadata<?>>builder()
            .add(enumProperty(
                    FILE_FORMAT_PROPERTY,
                    "File format for the table",
                    FileFormat.class,
                    icebergConfig.getFileFormat(),
                    false))
            .add(new PropertyMetadata<>(
                    PARTITIONING_PROPERTY,
                    "Partition transforms",
                    new ArrayType(VARCHAR),
                    List.class,
                    ImmutableList.of(),
                    false,
                    value -> ((Collection<?>) value).stream()
                            .map(name -> ((String) name).toLowerCase(ENGLISH))
                            .collect(toImmutableList()),
                    value -> value))
            .add(stringProperty(
                    LOCATION_PROPERTY,
                    "File system location URI for the table",
                    null,
                    false))
            .build();
}
 
Example 15
Source Project: presto   Source File: IcebergPageSink.java    License: Apache License 2.0
public IcebergPageSink(
        Schema outputSchema,
        PartitionSpec partitionSpec,
        String outputPath,
        IcebergFileWriterFactory fileWriterFactory,
        PageIndexerFactory pageIndexerFactory,
        HdfsEnvironment hdfsEnvironment,
        HdfsContext hdfsContext,
        List<IcebergColumnHandle> inputColumns,
        JsonCodec<CommitTaskData> jsonCodec,
        ConnectorSession session,
        FileFormat fileFormat)
{
    requireNonNull(inputColumns, "inputColumns is null");
    this.outputSchema = requireNonNull(outputSchema, "outputSchema is null");
    this.partitionSpec = requireNonNull(partitionSpec, "partitionSpec is null");
    this.outputPath = requireNonNull(outputPath, "outputPath is null");
    this.fileWriterFactory = requireNonNull(fileWriterFactory, "fileWriterFactory is null");
    this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null");
    requireNonNull(hdfsContext, "hdfsContext is null");
    this.jobConf = toJobConf(hdfsEnvironment.getConfiguration(hdfsContext, new Path(outputPath)));
    this.jsonCodec = requireNonNull(jsonCodec, "jsonCodec is null");
    this.session = requireNonNull(session, "session is null");
    this.fileFormat = requireNonNull(fileFormat, "fileFormat is null");
    this.inputColumns = ImmutableList.copyOf(inputColumns);
    this.pagePartitioner = new PagePartitioner(pageIndexerFactory, toPartitionColumns(inputColumns, partitionSpec));
}
 
Example 16
Source Project: presto   Source File: TestIcebergSmoke.java    License: Apache License 2.0
private void testCreatePartitionedTableAs(Session session, FileFormat fileFormat)
{
    @Language("SQL") String createTable = "" +
            "CREATE TABLE test_create_partitioned_table_as " +
            "WITH (" +
            "format = '" + fileFormat + "', " +
            "partitioning = ARRAY['ORDER_STATUS', 'Ship_Priority', 'Bucket(order_key,9)']" +
            ") " +
            "AS " +
            "SELECT orderkey AS order_key, shippriority AS ship_priority, orderstatus AS order_status " +
            "FROM tpch.tiny.orders";

    assertUpdate(session, createTable, "SELECT count(*) from orders");

    String createTableSql = format("" +
                    "CREATE TABLE %s.%s.%s (\n" +
                    "   order_key bigint,\n" +
                    "   ship_priority integer,\n" +
                    "   order_status varchar\n" +
                    ")\n" +
                    "WITH (\n" +
                    "   format = '" + fileFormat + "',\n" +
                    "   partitioning = ARRAY['order_status','ship_priority','bucket(order_key, 9)']\n" +
                    ")",
            getSession().getCatalog().get(),
            getSession().getSchema().get(),
            "test_create_partitioned_table_as");

    MaterializedResult actualResult = computeActual("SHOW CREATE TABLE test_create_partitioned_table_as");
    assertEquals(getOnlyElement(actualResult.getOnlyColumnAsSet()), createTableSql);

    // assertEquals(partitions.size(), 3);

    assertQuery(session, "SELECT * from test_create_partitioned_table_as", "SELECT orderkey, shippriority, orderstatus FROM orders");

    dropTable(session, "test_create_partitioned_table_as");
}
 
Example 17
Source Project: presto   Source File: TestIcebergSmoke.java    License: Apache License 2.0
private void testPredicating(Session session, FileFormat fileFormat)
{
    assertUpdate(session, "CREATE TABLE test_predicating_on_real (col REAL) WITH (format = '" + fileFormat + "')");
    assertUpdate(session, "INSERT INTO test_predicating_on_real VALUES 1.2", 1);
    assertQuery(session, "SELECT * FROM test_predicating_on_real WHERE col = 1.2", "VALUES 1.2");
    dropTable(session, "test_predicating_on_real");
}
 
Example 18
Source Project: iceberg   Source File: TestHiveTableConcurrency.java    License: Apache License 2.0
@Test
public synchronized void testConcurrentFastAppends() {
  Table icebergTable = catalog.loadTable(TABLE_IDENTIFIER);

  String fileName = UUID.randomUUID().toString();
  DataFile file = DataFiles.builder(icebergTable.spec())
      .withPath(FileFormat.PARQUET.addExtension(fileName))
      .withRecordCount(2)
      .withFileSizeInBytes(0)
      .build();

  ExecutorService executorService = MoreExecutors.getExitingExecutorService(
      (ThreadPoolExecutor) Executors.newFixedThreadPool(2));

  AtomicInteger barrier = new AtomicInteger(0);
  Tasks.range(2)
      .stopOnFailure().throwFailureWhenFinished()
      .executeWith(executorService)
      .run(index -> {
        for (int numCommittedFiles = 0; numCommittedFiles < 10; numCommittedFiles++) {
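          // wait until both workers have committed the previous rounds before appending again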
          while (barrier.get() < numCommittedFiles * 2) {
            try {
              Thread.sleep(10);
            } catch (InterruptedException e) {
              throw new RuntimeException(e);
            }
          }

          icebergTable.newFastAppend().appendFile(file).commit();
          barrier.incrementAndGet();
        }
      });

  icebergTable.refresh();
  Assert.assertEquals(20, icebergTable.currentSnapshot().allManifests().size());
}
 
Example 19
Source Project: iceberg   Source File: TestHiveTableConcurrency.java    License: Apache License 2.0
@Test
public synchronized void testConcurrentConnections() throws InterruptedException {
  Table icebergTable = catalog.loadTable(TABLE_IDENTIFIER);

  icebergTable.updateProperties()
      .set(COMMIT_NUM_RETRIES, "20")
      .set(COMMIT_MIN_RETRY_WAIT_MS, "25")
      .set(COMMIT_MAX_RETRY_WAIT_MS, "25")
      .commit();

  String fileName = UUID.randomUUID().toString();
  DataFile file = DataFiles.builder(icebergTable.spec())
      .withPath(FileFormat.PARQUET.addExtension(fileName))
      .withRecordCount(2)
      .withFileSizeInBytes(0)
      .build();

  ExecutorService executorService = MoreExecutors.getExitingExecutorService(
      (ThreadPoolExecutor) Executors.newFixedThreadPool(7));

  for (int i = 0; i < 7; i++) {
    executorService.submit(() -> icebergTable.newAppend().appendFile(file).commit());
  }

  executorService.shutdown();
  Assert.assertTrue("Timeout", executorService.awaitTermination(2, TimeUnit.MINUTES));
  Assert.assertEquals(7, Iterables.size(icebergTable.snapshots()));
}
 
Example 20
Source Project: iceberg   Source File: Writer.java    License: Apache License 2.0
WriterFactory(PartitionSpec spec, FileFormat format, LocationProvider locations,
              Map<String, String> properties, Broadcast<FileIO> io,
              Broadcast<EncryptionManager> encryptionManager, long targetFileSize,
              Schema writeSchema, StructType dsSchema) {
  this.spec = spec;
  this.format = format;
  this.locations = locations;
  this.properties = properties;
  this.io = io;
  this.encryptionManager = encryptionManager;
  this.targetFileSize = targetFileSize;
  this.writeSchema = writeSchema;
  this.dsSchema = dsSchema;
}
 
Example 21
Source Project: iceberg   Source File: TestOrcMetrics.java    License: Apache License 2.0
private InputFile writeRecords(Schema schema, Map<String, String> properties, Record... records) throws IOException {
  File tmpFolder = temp.newFolder("orc");
  String filename = UUID.randomUUID().toString();
  OutputFile file = Files.localOutput(new File(tmpFolder, FileFormat.ORC.addExtension(filename)));
  try (FileAppender<Record> writer = ORC.write(file)
      .schema(schema)
      .setAll(properties)
      .createWriterFunc(GenericOrcWriter::buildWriter)
      .build()) {
    writer.addAll(Lists.newArrayList(records));
  }
  return file.toInputFile();
}
 
Example 22
Source Project: iceberg   Source File: TestMetricsRowGroupFilter.java    License: Apache License 2.0
@Test
public void testMissingStatsParquet() {
  Assume.assumeTrue(format == FileFormat.PARQUET);
  Expression[] exprs = new Expression[] {
      lessThan("no_stats_parquet", "a"), lessThanOrEqual("no_stats_parquet", "b"), equal("no_stats_parquet", "c"),
      greaterThan("no_stats_parquet", "d"), greaterThanOrEqual("no_stats_parquet", "e"),
      notEqual("no_stats_parquet", "f"), isNull("no_stats_parquet"), notNull("no_stats_parquet"),
      startsWith("no_stats_parquet", "a")
  };

  for (Expression expr : exprs) {
    boolean shouldRead = shouldRead(expr);
    Assert.assertTrue("Should read when missing stats for expr: " + expr, shouldRead);
  }
}
 
Example 23
Source Project: iceberg   Source File: TestMetricsRowGroupFilter.java    License: Apache License 2.0
@Test
public void testStringStartsWith() {
  Assume.assumeFalse("ORC row group filter does not support StringStartsWith", format == FileFormat.ORC);
  boolean shouldRead = shouldRead(startsWith("str", "1"));
  Assert.assertTrue("Should read: range matches", shouldRead);

  shouldRead = shouldRead(startsWith("str", "0st"));
  Assert.assertTrue("Should read: range matches", shouldRead);

  shouldRead = shouldRead(startsWith("str", "1str1"));
  Assert.assertTrue("Should read: range matches", shouldRead);

  shouldRead = shouldRead(startsWith("str", "1str1_xgd"));
  Assert.assertTrue("Should read: range matches", shouldRead);

  shouldRead = shouldRead(startsWith("str", "2str"));
  Assert.assertTrue("Should read: range matches", shouldRead);

  shouldRead = shouldRead(startsWith("str", "9xstr"));
  Assert.assertFalse("Should not read: range doesn't match", shouldRead);

  shouldRead = shouldRead(startsWith("str", "0S"));
  Assert.assertFalse("Should not read: range doesn't match", shouldRead);

  shouldRead = shouldRead(startsWith("str", "x"));
  Assert.assertFalse("Should not read: range doesn't match", shouldRead);

  shouldRead = shouldRead(startsWith("str", "9str9aaa"));
  Assert.assertFalse("Should not read: range doesn't match", shouldRead);
}
 
Example 24
Source Project: iceberg   Source File: TestParquetMetrics.java    License: Apache License 2.0
private InputFile writeRecords(Schema schema, Map<String, String> properties, Record... records) throws IOException {
  File tmpFolder = temp.newFolder("parquet");
  String filename = UUID.randomUUID().toString();
  OutputFile file = Files.localOutput(new File(tmpFolder, FileFormat.PARQUET.addExtension(filename)));
  try (FileAppender<Record> writer = Parquet.write(file)
      .schema(schema)
      .setAll(properties)
      .createWriterFunc(GenericParquetWriter::buildWriter)
      .build()) {
    writer.addAll(Lists.newArrayList(records));
  }
  return file.toInputFile();
}
 
Example 25
Source Project: iceberg   Source File: RewriteManifestsAction.java    License: Apache License 2.0
private static ManifestFile writeManifest(
    List<Row> rows, int startIndex, int endIndex, Broadcast<FileIO> io,
    String location, int format, PartitionSpec spec, StructType sparkType) throws IOException {

  String manifestName = "optimized-m-" + UUID.randomUUID();
  Path manifestPath = new Path(location, manifestName);
  OutputFile outputFile = io.value().newOutputFile(FileFormat.AVRO.addExtension(manifestPath.toString()));

  Types.StructType dataFileType = DataFile.getType(spec.partitionType());
  SparkDataFile wrapper = new SparkDataFile(dataFileType, sparkType);

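  // note: the 'format' argument below is the table format version for the manifest writer, not a FileFormat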
  ManifestWriter writer = ManifestFiles.write(format, spec, outputFile, null);

  try {
    for (int index = startIndex; index < endIndex; index++) {
      Row row = rows.get(index);
      long snapshotId = row.getLong(0);
      long sequenceNumber = row.getLong(1);
      Row file = row.getStruct(2);
      writer.existing(wrapper.wrap(file), snapshotId, sequenceNumber);
    }
  } finally {
    writer.close();
  }

  return writer.toManifestFile();
}
 
Example 26
Source Project: iceberg   Source File: BatchDataReader.java    License: Apache License 2.0
@Override
CloseableIterator<ColumnarBatch> open(FileScanTask task) {
  CloseableIterable<ColumnarBatch> iter;
  InputFile location = getInputFile(task);
  Preconditions.checkNotNull(location, "Could not find InputFile associated with FileScanTask");
  if (task.file().format() == FileFormat.PARQUET) {
    Parquet.ReadBuilder builder = Parquet.read(location)
        .project(expectedSchema)
        .split(task.start(), task.length())
        .createBatchedReaderFunc(fileSchema -> VectorizedSparkParquetReaders.buildReader(expectedSchema,
            fileSchema, /* setArrowValidityVector */ NullCheckingForGet.NULL_CHECKING_ENABLED))
        .recordsPerBatch(batchSize)
        .filter(task.residual())
        .caseSensitive(caseSensitive)
        // Spark eagerly consumes the batches. So the underlying memory allocated could be reused
        // without worrying about subsequent reads clobbering over each other. This improves
        // read performance as every batch read doesn't have to pay the cost of allocating memory.
        .reuseContainers();

    if (nameMapping != null) {
      builder.withNameMapping(NameMappingParser.fromJson(nameMapping));
    }

    iter = builder.build();
  } else {
    throw new UnsupportedOperationException(
        "Format: " + task.file().format() + " not supported for batched reads");
  }
  return iter.iterator();
}
 
Example 27
Source Project: iceberg   Source File: OutputFileFactory.java    License: Apache License 2.0
OutputFileFactory(PartitionSpec spec, FileFormat format, LocationProvider locations, FileIO io,
                  EncryptionManager encryptionManager, int partitionId, long taskId) {
  this.spec = spec;
  this.format = format;
  this.locations = locations;
  this.io = io;
  this.encryptionManager = encryptionManager;
  this.partitionId = partitionId;
  this.taskId = taskId;
}
 
Example 28
Source Project: iceberg   Source File: SparkAppenderFactory.java    License: Apache License 2.0
public FileAppender<InternalRow> newAppender(OutputFile file, FileFormat fileFormat) {
  MetricsConfig metricsConfig = MetricsConfig.fromProperties(properties);
  try {
    switch (fileFormat) {
      case PARQUET:
        return Parquet.write(file)
            .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(dsSchema, msgType))
            .setAll(properties)
            .metricsConfig(metricsConfig)
            .schema(writeSchema)
            .overwrite()
            .build();

      case AVRO:
        return Avro.write(file)
            .createWriterFunc(ignored -> new SparkAvroWriter(dsSchema))
            .setAll(properties)
            .schema(writeSchema)
            .overwrite()
            .build();

      case ORC:
        return ORC.write(file)
            .createWriterFunc(SparkOrcWriter::new)
            .setAll(properties)
            .schema(writeSchema)
            .overwrite()
            .build();

      default:
        throw new UnsupportedOperationException("Cannot write unknown format: " + fileFormat);
    }
  } catch (IOException e) {
    throw new RuntimeIOException(e);
  }
}
 
Example 29
Source Project: iceberg   Source File: BaseWriter.java    License: Apache License 2.0
BaseWriter(PartitionSpec spec, FileFormat format, SparkAppenderFactory appenderFactory,
           OutputFileFactory fileFactory, FileIO io, long targetFileSize) {
  this.spec = spec;
  this.format = format;
  this.appenderFactory = appenderFactory;
  this.fileFactory = fileFactory;
  this.io = io;
  this.targetFileSize = targetFileSize;
}
 
Example 30
Source Project: iceberg   Source File: BaseWriter.java    License: Apache License 2.0
public void writeInternal(InternalRow row) throws IOException {
  // TODO: ORC appenders do not expose their current length before being closed,
  // so the target-file-size rollover check is skipped for ORC
  if (!format.equals(FileFormat.ORC) &&
      currentRows % ROWS_DIVISOR == 0 && currentAppender.length() >= targetFileSize) {
    closeCurrent();
    openCurrent();
  }

  currentAppender.add(row);
  currentRows++;
}