Java Code Examples for org.apache.iceberg.DataFile

The following examples show how to use org.apache.iceberg.DataFile. They are extracted from open source projects; the originating project and source file are listed above each example.
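
As a quick orientation before the examples, here is a minimal sketch of the two interactions the snippets below rely on most: describing an existing data file with DataFiles.builder and reading the metadata that a DataFile exposes. The partition spec and file path are illustrative assumptions, not taken from any of the projects listed here.

import org.apache.iceberg.DataFile;
import org.apache.iceberg.DataFiles;
import org.apache.iceberg.PartitionSpec;

public class DataFileSketch {
  public static void main(String[] args) {
    // Unpartitioned spec, assumed for illustration only.
    PartitionSpec spec = PartitionSpec.unpartitioned();

    // Describe an existing file so it can be appended to a table; the format
    // is inferred from the .parquet extension of the (hypothetical) path.
    DataFile dataFile = DataFiles.builder(spec)
        .withPath("/path/to/data-a.parquet")
        .withFileSizeInBytes(1024L)
        .withRecordCount(100L)
        .build();

    // A DataFile is metadata about the underlying file, not the data itself.
    System.out.println(dataFile.path());            // file location
    System.out.println(dataFile.format());          // PARQUET, AVRO, or ORC
    System.out.println(dataFile.recordCount());     // number of rows
    System.out.println(dataFile.fileSizeInBytes()); // size on disk
  }
}

The same builder pattern appears throughout the appending examples below, while Example 19 iterates a snapshot's added files and reads the same accessors.
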
Example 1
Source Project: iceberg   Source File: HiveCreateReplaceTableTest.java    License: Apache License 2.0
@Test
public void testCreateTableTxnAndAppend() {
  Assert.assertFalse("Table should not exist", catalog.tableExists(TABLE_IDENTIFIER));

  Transaction txn = catalog.newCreateTableTransaction(
      TABLE_IDENTIFIER, SCHEMA, SPEC, tableLocation, Maps.newHashMap());

  AppendFiles append = txn.newAppend();
  DataFile dataFile = DataFiles.builder(SPEC)
      .withPath("/path/to/data-a.parquet")
      .withFileSizeInBytes(0)
      .withRecordCount(1)
      .build();
  append.appendFile(dataFile);
  append.commit();
  txn.commitTransaction();

  Table table = catalog.loadTable(TABLE_IDENTIFIER);
  Snapshot snapshot = table.currentSnapshot();
  Assert.assertTrue("Table should have one manifest file", snapshot.allManifests().size() == 1);
}
 
Example 2
Source Project: iceberg   Source File: HiveCreateReplaceTableTest.java    License: Apache License 2.0
@Test
public void testCreateTableTxnWithGlobalTableLocation() {
  Assert.assertFalse("Table should not exist", catalog.tableExists(TABLE_IDENTIFIER));

  Transaction txn = catalog.newCreateTableTransaction(
      TABLE_IDENTIFIER, SCHEMA, SPEC, "file:///" + tableLocation, Maps.newHashMap());
  txn.commitTransaction();

  Table table = catalog.loadTable(TABLE_IDENTIFIER);

  DataFile dataFile = DataFiles.builder(SPEC)
      .withPath("/path/to/data-a.parquet")
      .withFileSizeInBytes(0)
      .withRecordCount(1)
      .build();

  table.newAppend()
      .appendFile(dataFile)
      .commit();

  Assert.assertEquals("Write should succeed", 1, Iterables.size(table.snapshots()));
}
 
Example 3
Source Project: iceberg   Source File: IcebergInputFormat.java    License: Apache License 2.0
private CloseableIterable<T> open(FileScanTask currentTask, Schema readSchema) {
  DataFile file = currentTask.file();
  // TODO we should make use of FileIO to create inputFile
  InputFile inputFile = HadoopInputFile.fromLocation(file.path(), context.getConfiguration());
  CloseableIterable<T> iterable;
  switch (file.format()) {
    case AVRO:
      iterable = newAvroIterable(inputFile, currentTask, readSchema);
      break;
    case ORC:
      iterable = newOrcIterable(inputFile, currentTask, readSchema);
      break;
    case PARQUET:
      iterable = newParquetIterable(inputFile, currentTask, readSchema);
      break;
    default:
      throw new UnsupportedOperationException(
          String.format("Cannot read %s file: %s", file.format().name(), file.path()));
  }

  return iterable;
}
 
Example 4
Source Project: iceberg   Source File: TestIcebergInputFormat.java    License: Apache License 2.0
@Test
public void testUnpartitionedTable() throws Exception {
  File location = temp.newFolder(format.name());
  Assert.assertTrue(location.delete());
  Table table = tables.create(SCHEMA, PartitionSpec.unpartitioned(),
                              ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
                              location.toString());
  List<Record> expectedRecords = RandomGenericData.generate(table.schema(), 1, 0L);
  DataFile dataFile = writeFile(table, null, format, expectedRecords);
  table.newAppend()
       .appendFile(dataFile)
       .commit();
  Job job = Job.getInstance(conf);
  IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
  configBuilder.readFrom(location.toString());
  validate(job, expectedRecords);
}
 
Example 5
Source Project: iceberg   Source File: TestIcebergInputFormat.java    License: Apache License 2.0
@Test
public void testPartitionedTable() throws Exception {
  File location = temp.newFolder(format.name());
  Assert.assertTrue(location.delete());
  Table table = tables.create(SCHEMA, SPEC,
                              ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
                              location.toString());
  List<Record> expectedRecords = RandomGenericData.generate(table.schema(), 1, 0L);
  expectedRecords.get(0).set(2, "2020-03-20");
  DataFile dataFile = writeFile(table, Row.of("2020-03-20", 0), format, expectedRecords);
  table.newAppend()
       .appendFile(dataFile)
       .commit();

  Job job = Job.getInstance(conf);
  IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
  configBuilder.readFrom(location.toString());
  validate(job, expectedRecords);
}
 
Example 6
Source Project: iceberg   Source File: TestIcebergInputFormat.java    License: Apache License 2.0
@Test
public void testFilterExp() throws Exception {
  File location = temp.newFolder(format.name());
  Assert.assertTrue(location.delete());
  Table table = tables.create(SCHEMA, SPEC,
                              ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
                              location.toString());
  List<Record> expectedRecords = RandomGenericData.generate(table.schema(), 2, 0L);
  expectedRecords.get(0).set(2, "2020-03-20");
  expectedRecords.get(1).set(2, "2020-03-20");
  DataFile dataFile1 = writeFile(table, Row.of("2020-03-20", 0), format, expectedRecords);
  DataFile dataFile2 = writeFile(table, Row.of("2020-03-21", 0), format,
                                 RandomGenericData.generate(table.schema(), 2, 0L));
  table.newAppend()
       .appendFile(dataFile1)
       .appendFile(dataFile2)
       .commit();
  Job job = Job.getInstance(conf);
  IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
  configBuilder.readFrom(location.toString())
               .filter(Expressions.equal("date", "2020-03-20"));
  validate(job, expectedRecords);
}
 
Example 7
Source Project: iceberg   Source File: TestIcebergInputFormat.java    License: Apache License 2.0
@Test
public void testProjection() throws Exception {
  File location = temp.newFolder(format.name());
  Assert.assertTrue(location.delete());
  Schema projectedSchema = TypeUtil.select(SCHEMA, ImmutableSet.of(1));
  Table table = tables.create(SCHEMA, SPEC,
                              ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
                              location.toString());
  List<Record> inputRecords = RandomGenericData.generate(table.schema(), 1, 0L);
  DataFile dataFile = writeFile(table, Row.of("2020-03-20", 0), format, inputRecords);
  table.newAppend()
       .appendFile(dataFile)
       .commit();

  Job job = Job.getInstance(conf);
  IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
  configBuilder
      .readFrom(location.toString())
      .project(projectedSchema);
  List<Record> outputRecords = readRecords(job.getConfiguration());
  Assert.assertEquals(inputRecords.size(), outputRecords.size());
  Assert.assertEquals(projectedSchema.asStruct(), outputRecords.get(0).struct());
}
 
Example 8
Source Project: iceberg   Source File: TestIcebergInputFormat.java    License: Apache License 2.0
@Test
public void testCustomCatalog() throws Exception {
  conf = new Configuration();
  conf.set("warehouse.location", temp.newFolder("hadoop_catalog").getAbsolutePath());

  Catalog catalog = new HadoopCatalogFunc().apply(conf);
  TableIdentifier tableIdentifier = TableIdentifier.of("db", "t");
  Table table = catalog.createTable(tableIdentifier, SCHEMA, SPEC,
                                    ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()));
  List<Record> expectedRecords = RandomGenericData.generate(table.schema(), 1, 0L);
  expectedRecords.get(0).set(2, "2020-03-20");
  DataFile dataFile = writeFile(table, Row.of("2020-03-20", 0), format, expectedRecords);
  table.newAppend()
       .appendFile(dataFile)
       .commit();

  Job job = Job.getInstance(conf);
  IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
  configBuilder
      .catalogFunc(HadoopCatalogFunc.class)
      .readFrom(tableIdentifier.toString());
  validate(job, expectedRecords);
}
 
Example 9
Source Project: iceberg   Source File: RewriteDataFilesAction.java    License: Apache License 2.0
private void replaceDataFiles(Iterable<DataFile> deletedDataFiles, Iterable<DataFile> addedDataFiles) {
  try {
    RewriteFiles rewriteFiles = table.newRewrite();
    rewriteFiles.rewriteFiles(Sets.newHashSet(deletedDataFiles), Sets.newHashSet(addedDataFiles));
    commit(rewriteFiles);

  } catch (Exception e) {
    Tasks.foreach(Iterables.transform(addedDataFiles, f -> f.path().toString()))
        .noRetry()
        .suppressFailureWhenFinished()
        .onFailure((location, exc) -> LOG.warn("Failed to delete: {}", location, exc))
        .run(fileIO::deleteFile);

    throw e;
  }
}
 
Example 10
Source Project: iceberg   Source File: SparkTableUtil.java    License: Apache License 2.0
private static Iterator<ManifestFile> buildManifest(SerializableConfiguration conf, PartitionSpec spec,
                                                    String basePath, Iterator<Tuple2<String, DataFile>> fileTuples) {
  if (fileTuples.hasNext()) {
    FileIO io = new HadoopFileIO(conf.get());
    TaskContext ctx = TaskContext.get();
    String suffix = String.format("stage-%d-task-%d-manifest", ctx.stageId(), ctx.taskAttemptId());
    Path location = new Path(basePath, suffix);
    String outputPath = FileFormat.AVRO.addExtension(location.toString());
    OutputFile outputFile = io.newOutputFile(outputPath);
    ManifestWriter<DataFile> writer = ManifestFiles.write(spec, outputFile);

    try (ManifestWriter<DataFile> writerRef = writer) {
      fileTuples.forEachRemaining(fileTuple -> writerRef.add(fileTuple._2));
    } catch (IOException e) {
      throw SparkExceptionUtil.toUncheckedException(e, "Unable to close the manifest writer: %s", outputPath);
    }

    ManifestFile manifestFile = writer.toManifestFile();
    return ImmutableList.of(manifestFile).iterator();
  } else {
    return Collections.emptyIterator();
  }
}
 
Example 11
Source Project: iceberg   Source File: RowDataReader.java    License: Apache License 2.0
@Override
CloseableIterator<InternalRow> open(FileScanTask task) {
  DataFile file = task.file();

  // update the current file for Spark's filename() function
  InputFileBlockHolder.set(file.path().toString(), task.start(), task.length());

  // schema of rows returned by readers
  PartitionSpec spec = task.spec();
  Set<Integer> idColumns = spec.identitySourceIds();
  Schema partitionSchema = TypeUtil.select(expectedSchema, idColumns);
  boolean projectsIdentityPartitionColumns = !partitionSchema.columns().isEmpty();

  if (projectsIdentityPartitionColumns) {
    return open(task, expectedSchema, PartitionUtil.constantsMap(task, RowDataReader::convertConstant))
        .iterator();
  }
  // return the base iterator
  return open(task, expectedSchema, ImmutableMap.of()).iterator();
}
 
Example 12
Source Project: iceberg   Source File: BaseWriter.java    License: Apache License 2.0
protected void closeCurrent() throws IOException {
  if (currentAppender != null) {
    currentAppender.close();
    // metrics are only valid after the appender is closed
    Metrics metrics = currentAppender.metrics();
    long fileSizeInBytes = currentAppender.length();
    List<Long> splitOffsets = currentAppender.splitOffsets();
    this.currentAppender = null;

    if (metrics.recordCount() == 0L) {
      io.deleteFile(currentFile.encryptingOutputFile());
    } else {
      DataFile dataFile = DataFiles.builder(spec)
          .withEncryptionKeyMetadata(currentFile.keyMetadata())
          .withPath(currentFile.encryptingOutputFile().location())
          .withFileSizeInBytes(fileSizeInBytes)
          .withPartition(spec.fields().size() == 0 ? null : currentKey) // set null if unpartitioned
          .withMetrics(metrics)
          .withSplitOffsets(splitOffsets)
          .build();
      completedFiles.add(dataFile);
    }

    this.currentFile = null;
  }
}
 
Example 13
Source Project: dremio-oss   Source File: WriterCommitterOperator.java    License: Apache License 2.0
@Override
public void consumeData(int records) throws Exception {
  project.consumeData(records);
  if (icebergTableCommitter) {
    List<DataFile> icebergDatafiles = new ArrayList<>();
    for (int i = 0; i < records; ++i) {
      DataFile dataFile = IcebergSerDe.deserializeDataFile(icebergMetadataVector.get(i));
      icebergDatafiles.add(dataFile);
    }
    if (icebergDatafiles.size() > 0) {
      try (AutoCloseable ac = OperatorStats.getWaitRecorder(context.getStats())) {
        icebergOpCommitter.consumeData(icebergDatafiles);
      }
    }
  }
  recordCount += records;
}
 
Example 14
Source Project: iceberg   Source File: HiveTableTest.java    License: Apache License 2.0
@Test
public void testDropWithoutPurgeLeavesTableData() throws IOException {
  Table table = catalog.loadTable(TABLE_IDENTIFIER);

  GenericRecordBuilder recordBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert(schema, "test"));
  List<GenericData.Record> records = Lists.newArrayList(
      recordBuilder.set("id", 1L).build(),
      recordBuilder.set("id", 2L).build(),
      recordBuilder.set("id", 3L).build()
  );

  String fileLocation = table.location().replace("file:", "") + "/data/file.avro";
  try (FileAppender<GenericData.Record> writer = Avro.write(Files.localOutput(fileLocation))
      .schema(schema)
      .named("test")
      .build()) {
    for (GenericData.Record rec : records) {
      writer.add(rec);
    }
  }

  DataFile file = DataFiles.builder(table.spec())
      .withRecordCount(3)
      .withPath(fileLocation)
      .withFileSizeInBytes(Files.localInput(fileLocation).getLength())
      .build();

  table.newAppend().appendFile(file).commit();

  String manifestListLocation = table.currentSnapshot().manifestListLocation().replace("file:", "");

  Assert.assertTrue("Drop should return true and drop the table",
      catalog.dropTable(TABLE_IDENTIFIER, false /* do not delete underlying files */));
  Assert.assertFalse("Table should not exist", catalog.tableExists(TABLE_IDENTIFIER));

  Assert.assertTrue("Table data files should exist",
      new File(fileLocation).exists());
  Assert.assertTrue("Table metadata files should exist",
      new File(manifestListLocation).exists());
}
 
Example 15
Source Project: iceberg   Source File: TestHiveTableConcurrency.java    License: Apache License 2.0
@Test
public synchronized void testConcurrentFastAppends() {
  Table icebergTable = catalog.loadTable(TABLE_IDENTIFIER);

  String fileName = UUID.randomUUID().toString();
  DataFile file = DataFiles.builder(icebergTable.spec())
      .withPath(FileFormat.PARQUET.addExtension(fileName))
      .withRecordCount(2)
      .withFileSizeInBytes(0)
      .build();

  ExecutorService executorService = MoreExecutors.getExitingExecutorService(
      (ThreadPoolExecutor) Executors.newFixedThreadPool(2));

  AtomicInteger barrier = new AtomicInteger(0);
  Tasks.range(2)
      .stopOnFailure().throwFailureWhenFinished()
      .executeWith(executorService)
      .run(index -> {
        for (int numCommittedFiles = 0; numCommittedFiles < 10; numCommittedFiles++) {
          while (barrier.get() < numCommittedFiles * 2) {
            try {
              Thread.sleep(10);
            } catch (InterruptedException e) {
              throw new RuntimeException(e);
            }
          }

          icebergTable.newFastAppend().appendFile(file).commit();
          barrier.incrementAndGet();
        }
      });

  icebergTable.refresh();
  Assert.assertEquals(20, icebergTable.currentSnapshot().allManifests().size());
}
 
Example 16
Source Project: iceberg   Source File: TestHiveTableConcurrency.java    License: Apache License 2.0
@Test
public synchronized void testConcurrentConnections() throws InterruptedException {
  Table icebergTable = catalog.loadTable(TABLE_IDENTIFIER);

  icebergTable.updateProperties()
      .set(COMMIT_NUM_RETRIES, "20")
      .set(COMMIT_MIN_RETRY_WAIT_MS, "25")
      .set(COMMIT_MAX_RETRY_WAIT_MS, "25")
      .commit();

  String fileName = UUID.randomUUID().toString();
  DataFile file = DataFiles.builder(icebergTable.spec())
      .withPath(FileFormat.PARQUET.addExtension(fileName))
      .withRecordCount(2)
      .withFileSizeInBytes(0)
      .build();

  ExecutorService executorService = MoreExecutors.getExitingExecutorService(
      (ThreadPoolExecutor) Executors.newFixedThreadPool(7));

  for (int i = 0; i < 7; i++) {
    executorService.submit(() -> icebergTable.newAppend().appendFile(file).commit());
  }

  executorService.shutdown();
  Assert.assertTrue("Timeout", executorService.awaitTermination(2, TimeUnit.MINUTES));
  Assert.assertEquals(7, Iterables.size(icebergTable.snapshots()));
}
 
Example 17
Source Project: iceberg   Source File: Writer.java    License: Apache License 2.0
private void replacePartitions(WriterCommitMessage[] messages) {
  ReplacePartitions dynamicOverwrite = table.newReplacePartitions();

  int numFiles = 0;
  for (DataFile file : files(messages)) {
    numFiles += 1;
    dynamicOverwrite.addFile(file);
  }

  commitOperation(dynamicOverwrite, numFiles, "dynamic partition overwrite");
}
 
Example 18
Source Project: iceberg   Source File: Writer.java    License: Apache License 2.0
protected Iterable<DataFile> files(WriterCommitMessage[] messages) {
  if (messages.length > 0) {
    return Iterables.concat(Iterables.transform(Arrays.asList(messages), message -> message != null ?
        ImmutableList.copyOf(((TaskResult) message).files()) :
        ImmutableList.of()));
  }
  return ImmutableList.of();
}
 
Example 19
Source Project: iceberg   Source File: SnapshotFunctionalityTest.java    License: Apache License 2.0
@Test
public void getInfoAboutFilesAddedFromSnapshot() {
  Snapshot snapshot = table.currentSnapshot();
  Iterable<DataFile> addedFiles = snapshot.addedFiles();

  for (DataFile dataFile : addedFiles) {
    log.info("File path: " + dataFile.path());
    log.info("File format: " + dataFile.format());
    log.info("File size in bytes: " + dataFile.fileSizeInBytes());
    log.info("Record count: " + dataFile.recordCount());
  }
}
 
Example 20
Source Project: dremio-oss   Source File: TestIcebergPartitions.java    License: Apache License 2.0
private DataFile createDataFile(File dir, String fileName, int idValue, String nameValue,
  int recordCount) throws IOException {
  File dataFile = new File(dir, fileName);
  dataFile.createNewFile();

  return DataFiles.builder(spec)
    .withInputFile(Files.localInput(dataFile))
    .withPartitionPath(ID + "=" + idValue + "/" + NAME + "=" + nameValue)
    .withRecordCount(recordCount)
    .withFormat(FileFormat.PARQUET)
    .build();
}
 
Example 21
Source Project: iceberg   Source File: TestStrictMetricsEvaluator.java    License: Apache License 2.0
@Test
public void testZeroRecordFile() {
  DataFile empty = new TestDataFile("file.parquet", Row.of(), 0);

  Expression[] exprs = new Expression[] {
      lessThan("id", 5), lessThanOrEqual("id", 30), equal("id", 70), greaterThan("id", 78),
      greaterThanOrEqual("id", 90), notEqual("id", 101), isNull("some_nulls"),
      notNull("some_nulls")
  };

  for (Expression expr : exprs) {
    boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, expr).eval(empty);
    Assert.assertTrue("Should always match 0-record file: " + expr, shouldRead);
  }
}
 
Example 22
Source Project: iceberg   Source File: TestInclusiveMetricsEvaluator.java    License: Apache License 2.0
@Test
public void testMissingStats() {
  DataFile missingStats = new TestDataFile("file.parquet", Row.of(), 50);

  Expression[] exprs = new Expression[] {
      lessThan("no_stats", 5), lessThanOrEqual("no_stats", 30), equal("no_stats", 70),
      greaterThan("no_stats", 78), greaterThanOrEqual("no_stats", 90), notEqual("no_stats", 101),
      isNull("no_stats"), notNull("no_stats")
  };

  for (Expression expr : exprs) {
    boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, expr).eval(missingStats);
    Assert.assertTrue("Should read when missing stats for expr: " + expr, shouldRead);
  }
}
 
Example 23
Source Project: iceberg   Source File: TestInclusiveMetricsEvaluator.java    License: Apache License 2.0
@Test
public void testZeroRecordFile() {
  DataFile empty = new TestDataFile("file.parquet", Row.of(), 0);

  Expression[] exprs = new Expression[] {
      lessThan("id", 5), lessThanOrEqual("id", 30), equal("id", 70), greaterThan("id", 78),
      greaterThanOrEqual("id", 90), notEqual("id", 101), isNull("some_nulls"),
      notNull("some_nulls")
  };

  for (Expression expr : exprs) {
    boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, expr).eval(empty);
    Assert.assertFalse("Should never read 0-record file: " + expr, shouldRead);
  }
}
 
Example 24
Source Project: iceberg   Source File: TestLocalScan.java    License: Apache License 2.0
@Test
public void testFilterWithDateAndTimestamp() throws IOException {
  // TODO: Add multiple timestamp tests - there's an issue with ORC caching TZ in ThreadLocal, so it's not possible
  //   to change TZ and test with ORC as they will produce incompatible values.
  Schema schema = new Schema(
      required(1, "timestamp_with_zone", Types.TimestampType.withZone()),
      required(2, "timestamp_without_zone", Types.TimestampType.withoutZone()),
      required(3, "date", Types.DateType.get()),
      required(4, "time", Types.TimeType.get())
  );

  File tableLocation = temp.newFolder("complex_filter_table");
  Assert.assertTrue(tableLocation.delete());

  Table table = TABLES.create(
      schema, PartitionSpec.unpartitioned(),
      ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
      tableLocation.getAbsolutePath());

  List<Record> expected = RandomGenericData.generate(schema, 100, 435691832918L);
  DataFile file = writeFile(tableLocation.toString(), format.addExtension("record-file"), schema, expected);
  table.newFastAppend().appendFile(file).commit();

  for (Record r : expected) {
    Iterable<Record> filterResult = IcebergGenerics.read(table)
        .where(equal("timestamp_with_zone", r.getField("timestamp_with_zone").toString()))
        .where(equal("timestamp_without_zone", r.getField("timestamp_without_zone").toString()))
        .where(equal("date", r.getField("date").toString()))
        .where(equal("time", r.getField("time").toString()))
        .build();

    Assert.assertTrue(filterResult.iterator().hasNext());
    Record readRecord = filterResult.iterator().next();
    Assert.assertEquals(r.getField("timestamp_with_zone"), readRecord.getField("timestamp_with_zone"));
  }
}
 
Example 25
Source Project: iceberg   Source File: TestIcebergInputFormat.java    License: Apache License 2.0
@Test
public void testResiduals() throws Exception {
  File location = temp.newFolder(format.name());
  Assert.assertTrue(location.delete());
  Table table = tables.create(SCHEMA, SPEC,
                              ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
                              location.toString());
  List<Record> writeRecords = RandomGenericData.generate(table.schema(), 2, 0L);
  writeRecords.get(0).set(1, 123L);
  writeRecords.get(0).set(2, "2020-03-20");
  writeRecords.get(1).set(1, 456L);
  writeRecords.get(1).set(2, "2020-03-20");

  List<Record> expectedRecords = new ArrayList<>();
  expectedRecords.add(writeRecords.get(0));

  DataFile dataFile1 = writeFile(table, Row.of("2020-03-20", 0), format, writeRecords);
  DataFile dataFile2 = writeFile(table, Row.of("2020-03-21", 0), format,
      RandomGenericData.generate(table.schema(), 2, 0L));
  table.newAppend()
       .appendFile(dataFile1)
       .appendFile(dataFile2)
       .commit();
  Job job = Job.getInstance(conf);
  IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
  configBuilder.readFrom(location.toString())
      .filter(Expressions.and(
          Expressions.equal("date", "2020-03-20"),
          Expressions.equal("id", 123)));
  validate(job, expectedRecords);

  // skip residual filtering
  job = Job.getInstance(conf);
  configBuilder = IcebergInputFormat.configure(job);
  configBuilder.skipResidualFiltering().readFrom(location.toString())
      .filter(Expressions.and(
          Expressions.equal("date", "2020-03-20"),
          Expressions.equal("id", 123)));
  validate(job, writeRecords);
}
 
Example 26
Source Project: iceberg   Source File: TestIcebergInputFormat.java    License: Apache License 2.0
@Test
public void testFailedResidualFiltering() throws Exception {
  File location = temp.newFolder(format.name());
  Assert.assertTrue(location.delete());
  Table table = tables.create(SCHEMA, SPEC,
      ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
      location.toString());
  List<Record> expectedRecords = RandomGenericData.generate(table.schema(), 2, 0L);
  expectedRecords.get(0).set(2, "2020-03-20");
  expectedRecords.get(1).set(2, "2020-03-20");

  DataFile dataFile1 = writeFile(table, Row.of("2020-03-20", 0), format, expectedRecords);
  table.newAppend()
      .appendFile(dataFile1)
      .commit();

  Job jobShouldFail1 = Job.getInstance(conf);
  IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(jobShouldFail1);
  configBuilder.useHiveRows().readFrom(location.toString())
      .filter(Expressions.and(
          Expressions.equal("date", "2020-03-20"),
          Expressions.equal("id", 0)));
  AssertHelpers.assertThrows(
      "Residuals are not evaluated today for Iceberg Generics In memory model of HIVE",
      UnsupportedOperationException.class, "Filter expression ref(name=\"id\") == 0 is not completely satisfied.",
      () -> validate(jobShouldFail1, expectedRecords));

  Job jobShouldFail2 = Job.getInstance(conf);
  configBuilder = IcebergInputFormat.configure(jobShouldFail2);
  configBuilder.usePigTuples().readFrom(location.toString())
      .filter(Expressions.and(
          Expressions.equal("date", "2020-03-20"),
          Expressions.equal("id", 0)));
  AssertHelpers.assertThrows(
      "Residuals are not evaluated today for Iceberg Generics In memory model of PIG",
      UnsupportedOperationException.class, "Filter expression ref(name=\"id\") == 0 is not completely satisfied.",
      () -> validate(jobShouldFail2, expectedRecords));
}
 
Example 27
Source Project: iceberg   Source File: RewriteManifestsAction.java    License: Apache License 2.0
private static ManifestFile writeManifest(
    List<Row> rows, int startIndex, int endIndex, Broadcast<FileIO> io,
    String location, int format, PartitionSpec spec, StructType sparkType) throws IOException {

  String manifestName = "optimized-m-" + UUID.randomUUID();
  Path manifestPath = new Path(location, manifestName);
  OutputFile outputFile = io.value().newOutputFile(FileFormat.AVRO.addExtension(manifestPath.toString()));

  Types.StructType dataFileType = DataFile.getType(spec.partitionType());
  SparkDataFile wrapper = new SparkDataFile(dataFileType, sparkType);

  ManifestWriter<DataFile> writer = ManifestFiles.write(format, spec, outputFile, null);

  try {
    for (int index = startIndex; index < endIndex; index++) {
      Row row = rows.get(index);
      long snapshotId = row.getLong(0);
      long sequenceNumber = row.getLong(1);
      Row file = row.getStruct(2);
      writer.existing(wrapper.wrap(file), snapshotId, sequenceNumber);
    }
  } finally {
    writer.close();
  }

  return writer.toManifestFile();
}
 
Example 28
Source Project: dremio-oss   Source File: TestRefresh.java    License: Apache License 2.0
private DataFile createDataFile(File dir, String fileName) throws Exception {
  File dataFile = new File(dir, fileName);
  URI resource = Resources.getResource(
    "iceberg/nation/data/00000-1-a9e8d979-a183-40c5-af3d-a338ab62be8b-00000.parquet").toURI();
  Files.copy(Paths.get(resource), dataFile.toPath());

  return DataFiles.builder(PartitionSpec.builderFor(schema).build())
    .withInputFile(org.apache.iceberg.Files.localInput(dataFile))
    .withRecordCount(25)
    .withFormat(FileFormat.PARQUET)
    .build();
}
 
Example 29
Source Project: iceberg   Source File: SparkTableUtil.java    License: Apache License 2.0
private static List<DataFile> listAvroPartition(
    Map<String, String> partitionPath, String partitionUri, PartitionSpec spec, Configuration conf) {
  try {
    Path partition = new Path(partitionUri);
    FileSystem fs = partition.getFileSystem(conf);
    return Arrays.stream(fs.listStatus(partition, HIDDEN_PATH_FILTER))
        .filter(FileStatus::isFile)
        .map(stat -> {
          Metrics metrics = new Metrics(-1L, null, null, null);
          String partitionKey = spec.fields().stream()
              .map(PartitionField::name)
              .map(name -> String.format("%s=%s", name, partitionPath.get(name)))
              .collect(Collectors.joining("/"));

          return DataFiles.builder(spec)
              .withPath(stat.getPath().toString())
              .withFormat("avro")
              .withFileSizeInBytes(stat.getLen())
              .withMetrics(metrics)
              .withPartitionPath(partitionKey)
              .build();

        }).collect(Collectors.toList());
  } catch (IOException e) {
    throw SparkExceptionUtil.toUncheckedException(e, "Unable to list files in partition: %s", partitionUri);
  }
}
 
Example 30
Source Project: dremio-oss   Source File: TestIcebergTableDrop.java    License: Apache License 2.0
private DataFile createDataFile(File dir, String fileName) throws Exception {
  File dataFile = new File(dir, fileName);
  URI resource = Resources.getResource(
    "iceberg/nation/data/00000-1-a9e8d979-a183-40c5-af3d-a338ab62be8b-00000.parquet").toURI();
  Files.copy(Paths.get(resource), dataFile.toPath());

  return DataFiles.builder(PartitionSpec.builderFor(schema).build())
    .withInputFile(org.apache.iceberg.Files.localInput(dataFile))
    .withRecordCount(25)
    .withFormat(FileFormat.PARQUET)
    .build();
}