Java Code Examples for org.apache.iceberg.io.FileIO

The following examples show how to use org.apache.iceberg.io.FileIO. These examples are extracted from open source projects; the source project, source file, and license are noted above each example.
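
Before diving into the examples, it may help to see the FileIO surface itself: the interface exposes newInputFile, newOutputFile, and deleteFile, and the examples below mostly construct an implementation (often HadoopFileIO) or ship one to Spark executors as a broadcast variable. The snippet below is a minimal sketch rather than code from any of the projects listed; the method name and file path are made up for illustration, and it assumes the usual org.apache.iceberg.io, org.apache.iceberg.hadoop, org.apache.hadoop.conf, java.io, and java.nio.charset imports.

// A minimal sketch (not from the projects below): write, read back, and delete a
// file through FileIO. The path and method name are hypothetical.
static void fileIoRoundTrip() throws IOException {
  FileIO io = new HadoopFileIO(new Configuration());

  OutputFile outputFile = io.newOutputFile("file:/tmp/iceberg-example/data.txt");
  try (PositionOutputStream out = outputFile.createOrOverwrite()) {
    out.write("hello".getBytes(StandardCharsets.UTF_8));
  }

  InputFile inputFile = io.newInputFile(outputFile.location());
  try (SeekableInputStream in = inputFile.newStream()) {
    byte[] data = new byte[(int) inputFile.getLength()];
    new DataInputStream(in).readFully(data);  // read the whole file back
  }

  io.deleteFile(inputFile.location());
}
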
Example 1
Source Project: iceberg   Source File: Writer.java    License: Apache License 2.0
Writer(Table table, Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager,
       DataSourceOptions options, boolean replacePartitions, String applicationId, String wapId,
       Schema writeSchema, StructType dsSchema) {
  this.table = table;
  this.format = getFileFormat(table.properties(), options);
  this.io = io;
  this.encryptionManager = encryptionManager;
  this.replacePartitions = replacePartitions;
  this.applicationId = applicationId;
  this.wapId = wapId;
  this.writeSchema = writeSchema;
  this.dsSchema = dsSchema;

  long tableTargetFileSize = PropertyUtil.propertyAsLong(
      table.properties(), WRITE_TARGET_FILE_SIZE_BYTES, WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT);
  this.targetFileSize = options.getLong("target-file-size-bytes", tableTargetFileSize);
}
 
Example 2
Source Project: iceberg   Source File: IcebergSource.java    License: Apache License 2.0
@Override
public DataSourceReader createReader(StructType readSchema, DataSourceOptions options) {
  Configuration conf = new Configuration(lazyBaseConf());
  Table table = getTableAndResolveHadoopConfiguration(options, conf);
  String caseSensitive = lazySparkSession().conf().get("spark.sql.caseSensitive");

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

  Reader reader = new Reader(table, io, encryptionManager, Boolean.parseBoolean(caseSensitive), options);
  if (readSchema != null) {
    // convert() will fail if readSchema contains fields not in table.schema()
    SparkSchemaUtil.convert(table.schema(), readSchema);
    reader.pruneColumns(readSchema);
  }

  return reader;
}
 
Example 3
Source Project: iceberg   Source File: IcebergSource.java    License: Apache License 2.0
@Override
public Optional<DataSourceWriter> createWriter(String jobId, StructType dsStruct, SaveMode mode,
                                               DataSourceOptions options) {
  Preconditions.checkArgument(mode == SaveMode.Append || mode == SaveMode.Overwrite,
      "Save mode %s is not supported", mode);
  Configuration conf = new Configuration(lazyBaseConf());
  Table table = getTableAndResolveHadoopConfiguration(options, conf);
  Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsStruct);
  TypeUtil.validateWriteSchema(table.schema(), writeSchema, checkNullability(options), checkOrdering(options));
  SparkUtil.validatePartitionTransforms(table.spec());
  String appId = lazySparkSession().sparkContext().applicationId();
  String wapId = lazySparkSession().conf().get("spark.wap.id", null);
  boolean replacePartitions = mode == SaveMode.Overwrite;

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

  return Optional.of(new Writer(
      table, io, encryptionManager, options, replacePartitions, appId, wapId, writeSchema, dsStruct));
}
 
Example 4
Source Project: iceberg   Source File: IcebergSource.java    License: Apache License 2.0
@Override
public StreamWriter createStreamWriter(String runId, StructType dsStruct,
                                       OutputMode mode, DataSourceOptions options) {
  Preconditions.checkArgument(
      mode == OutputMode.Append() || mode == OutputMode.Complete(),
      "Output mode %s is not supported", mode);
  Configuration conf = new Configuration(lazyBaseConf());
  Table table = getTableAndResolveHadoopConfiguration(options, conf);
  Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsStruct);
  TypeUtil.validateWriteSchema(table.schema(), writeSchema, checkNullability(options), checkOrdering(options));
  SparkUtil.validatePartitionTransforms(table.spec());
  // Spark 2.4.x passes runId to createStreamWriter instead of real queryId,
  // so we fetch it directly from sparkContext to make writes idempotent
  String queryId = lazySparkSession().sparkContext().getLocalProperty(StreamExecution.QUERY_ID_KEY());
  String appId = lazySparkSession().sparkContext().applicationId();

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

  return new StreamingWriter(table, io, encryptionManager, options, queryId, mode, appId, writeSchema, dsStruct);
}
 
Example 5
Source Project: iceberg   Source File: RewriteManifestsAction.java    License: Apache License 2.0
private List<ManifestFile> writeManifestsForUnpartitionedTable(Dataset<Row> manifestEntryDF, int numManifests) {
  Broadcast<FileIO> io = sparkContext.broadcast(fileIO);
  StructType sparkType = (StructType) manifestEntryDF.schema().apply("data_file").dataType();

  // we rely only on the target number of manifests for unpartitioned tables
  // as we should not worry about having too much metadata per partition
  long maxNumManifestEntries = Long.MAX_VALUE;

  return manifestEntryDF
      .repartition(numManifests)
      .mapPartitions(
          toManifests(io, maxNumManifestEntries, stagingLocation, formatVersion, spec, sparkType),
          manifestEncoder
      )
      .collectAsList();
}
 
Example 6
Source Project: iceberg   Source File: RewriteManifestsAction.java    License: Apache License 2.0
private List<ManifestFile> writeManifestsForPartitionedTable(
    Dataset<Row> manifestEntryDF, int numManifests,
    int targetNumManifestEntries) {

  Broadcast<FileIO> io = sparkContext.broadcast(fileIO);
  StructType sparkType = (StructType) manifestEntryDF.schema().apply("data_file").dataType();

  // we allow the actual size of manifests to be 10% higher if the estimation is not precise enough
  long maxNumManifestEntries = (long) (1.1 * targetNumManifestEntries);

  return withReusableDS(manifestEntryDF, df -> {
    Column partitionColumn = df.col("data_file.partition");
    return df.repartitionByRange(numManifests, partitionColumn)
        .sortWithinPartitions(partitionColumn)
        .mapPartitions(
            toManifests(io, maxNumManifestEntries, stagingLocation, formatVersion, spec, sparkType),
            manifestEncoder
        )
        .collectAsList();
  });
}
 
Example 7
Source Project: iceberg   Source File: RewriteManifestsAction.java    License: Apache License 2.0
private static MapPartitionsFunction<Row, ManifestFile> toManifests(
    Broadcast<FileIO> io, long maxNumManifestEntries, String location,
    int format, PartitionSpec spec, StructType sparkType) {

  return (MapPartitionsFunction<Row, ManifestFile>) rows -> {
    List<Row> rowsAsList = Lists.newArrayList(rows);

    if (rowsAsList.isEmpty()) {
      return Collections.emptyIterator();
    }

    List<ManifestFile> manifests = Lists.newArrayList();
    if (rowsAsList.size() <= maxNumManifestEntries) {
      manifests.add(writeManifest(rowsAsList, 0, rowsAsList.size(), io, location, format, spec, sparkType));
    } else {
      int midIndex = rowsAsList.size() / 2;
      manifests.add(writeManifest(rowsAsList, 0, midIndex, io, location, format, spec, sparkType));
      manifests.add(writeManifest(rowsAsList, midIndex, rowsAsList.size(), io, location, format, spec, sparkType));
    }

    return manifests.iterator();
  };
}
 
Example 8
Source Project: iceberg   Source File: SparkTableUtil.java    License: Apache License 2.0
private static Iterator<ManifestFile> buildManifest(SerializableConfiguration conf, PartitionSpec spec,
                                                    String basePath, Iterator<Tuple2<String, DataFile>> fileTuples) {
  if (fileTuples.hasNext()) {
    FileIO io = new HadoopFileIO(conf.get());
    TaskContext ctx = TaskContext.get();
    String suffix = String.format("stage-%d-task-%d-manifest", ctx.stageId(), ctx.taskAttemptId());
    Path location = new Path(basePath, suffix);
    String outputPath = FileFormat.AVRO.addExtension(location.toString());
    OutputFile outputFile = io.newOutputFile(outputPath);
    ManifestWriter<DataFile> writer = ManifestFiles.write(spec, outputFile);

    try (ManifestWriter<DataFile> writerRef = writer) {
      fileTuples.forEachRemaining(fileTuple -> writerRef.add(fileTuple._2));
    } catch (IOException e) {
      throw SparkExceptionUtil.toUncheckedException(e, "Unable to close the manifest writer: %s", outputPath);
    }

    ManifestFile manifestFile = writer.toManifestFile();
    return ImmutableList.of(manifestFile).iterator();
  } else {
    return Collections.emptyIterator();
  }
}
 
Example 9
Source Project: iceberg   Source File: RowDataRewriter.java    License: Apache License 2.0
public RowDataRewriter(Table table, PartitionSpec spec, boolean caseSensitive,
                       Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager) {
  this.schema = table.schema();
  this.spec = spec;
  this.locations = table.locationProvider();
  this.properties = table.properties();
  this.io = io;
  this.encryptionManager = encryptionManager;

  this.caseSensitive = caseSensitive;
  this.nameMapping = table.properties().get(DEFAULT_NAME_MAPPING);

  String formatString = table.properties().getOrDefault(
      TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT);
  this.format = FileFormat.valueOf(formatString.toUpperCase(Locale.ENGLISH));
}
 
Example 10
Source Project: iceberg   Source File: SparkBatchWrite.java    License: Apache License 2.0
SparkBatchWrite(Table table, Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager,
                CaseInsensitiveStringMap options, boolean overwriteDynamic, boolean overwriteByFilter,
                Expression overwriteExpr, String applicationId, String wapId, Schema writeSchema,
                StructType dsSchema) {
  this.table = table;
  this.format = getFileFormat(table.properties(), options);
  this.io = io;
  this.encryptionManager = encryptionManager;
  this.overwriteDynamic = overwriteDynamic;
  this.overwriteByFilter = overwriteByFilter;
  this.overwriteExpr = overwriteExpr;
  this.applicationId = applicationId;
  this.wapId = wapId;
  this.genieId = options.get("genie-id");
  this.writeSchema = writeSchema;
  this.dsSchema = dsSchema;

  long tableTargetFileSize = PropertyUtil.propertyAsLong(
      table.properties(), WRITE_TARGET_FILE_SIZE_BYTES, WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT);
  this.targetFileSize = options.getLong("target-file-size-bytes", tableTargetFileSize);
}
 
Example 11
Source Project: iceberg   Source File: SparkBatchScan.java    License: Apache License 2.0
ReadTask(CombinedScanTask task, String tableSchemaString, String expectedSchemaString, String nameMappingString,
         Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager, boolean caseSensitive,
         boolean localityPreferred) {
  this.task = task;
  this.tableSchemaString = tableSchemaString;
  this.expectedSchemaString = expectedSchemaString;
  this.nameMappingString = nameMappingString;
  this.io = io;
  this.encryptionManager = encryptionManager;
  this.caseSensitive = caseSensitive;
  if (localityPreferred) {
    this.preferredLocations = Util.blockLocations(io.value(), task);
  } else {
    this.preferredLocations = HadoopInputFile.NO_LOCATION_PREFERENCE;
  }
}
 
Example 12
Source Project: iceberg   Source File: SparkWriteBuilder.java    License: Apache License 2.0
@Override
public BatchWrite buildForBatch() {
  // Validate
  Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsSchema);
  TypeUtil.validateWriteSchema(table.schema(), writeSchema,
      checkNullability(spark, options), checkOrdering(spark, options));
  SparkUtil.validatePartitionTransforms(table.spec());

  // Get application id
  String appId = spark.sparkContext().applicationId();

  // Get write-audit-publish id
  String wapId = spark.conf().get("spark.wap.id", null);

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

  return new SparkBatchWrite(
      table, io, encryptionManager, options, overwriteDynamic, overwriteByFilter, overwriteExpr, appId, wapId,
      writeSchema, dsSchema);
}
 
Example 13
Source Project: iceberg   Source File: SparkWriteBuilder.java    License: Apache License 2.0
@Override
public StreamingWrite buildForStreaming() {
  // Validate
  Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsSchema);
  TypeUtil.validateWriteSchema(table.schema(), writeSchema,
      checkNullability(spark, options), checkOrdering(spark, options));
  SparkUtil.validatePartitionTransforms(table.spec());

  // Change to streaming write if it is just append
  Preconditions.checkState(!overwriteDynamic,
      "Unsupported streaming operation: dynamic partition overwrite");
  Preconditions.checkState(!overwriteByFilter || overwriteExpr == Expressions.alwaysTrue(),
      "Unsupported streaming operation: overwrite by filter: %s", overwriteExpr);

  // Get application id
  String appId = spark.sparkContext().applicationId();

  // Get write-audit-publish id
  String wapId = spark.conf().get("spark.wap.id", null);

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

  return new SparkStreamingWrite(
      table, io, encryptionManager, options, overwriteByFilter, writeQueryId, appId, wapId, writeSchema, dsSchema);
}
 
Example 14
Source Project: iceberg   Source File: BaseSnapshot.java    License: Apache License 2.0
BaseSnapshot(FileIO io,
             long sequenceNumber,
             long snapshotId,
             Long parentId,
             long timestampMillis,
             String operation,
             Map<String, String> summary,
             String manifestList) {
  this.io = io;
  this.sequenceNumber = sequenceNumber;
  this.snapshotId = snapshotId;
  this.parentId = parentId;
  this.timestampMillis = timestampMillis;
  this.operation = operation;
  this.summary = summary;
  this.manifestListLocation = manifestList;
}
 
Example 15
Source Project: presto   Source File: HiveTableOperations.java    License: Apache License 2.0
private HiveTableOperations(FileIO fileIo, HiveMetastore metastore, HiveIdentity identity, String database, String table, Optional<String> owner, Optional<String> location)
{
    this.fileIo = requireNonNull(fileIo, "fileIo is null");
    this.metastore = requireNonNull(metastore, "metastore is null");
    this.identity = requireNonNull(identity, "identity is null");
    this.database = requireNonNull(database, "database is null");
    this.tableName = requireNonNull(table, "table is null");
    this.owner = requireNonNull(owner, "owner is null");
    this.location = requireNonNull(location, "location is null");
}
 
Example 16
Source Project: iceberg   Source File: HiveTableOperations.java    License: Apache License 2.0
@Override
public FileIO io() {
  if (fileIO == null) {
    fileIO = new HadoopFileIO(conf);
  }

  return fileIO;
}
 
Example 17
Source Project: iceberg   Source File: StreamingWriter.java    License: Apache License 2.0
StreamingWriter(Table table, Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager,
                DataSourceOptions options, String queryId, OutputMode mode, String applicationId,
                Schema writeSchema, StructType dsSchema) {
  super(table, io, encryptionManager, options, false, applicationId, writeSchema, dsSchema);
  this.queryId = queryId;
  this.mode = mode;
}
 
Example 18
Source Project: iceberg   Source File: Writer.java    License: Apache License 2.0
WriterFactory(PartitionSpec spec, FileFormat format, LocationProvider locations,
              Map<String, String> properties, Broadcast<FileIO> io,
              Broadcast<EncryptionManager> encryptionManager, long targetFileSize,
              Schema writeSchema, StructType dsSchema) {
  this.spec = spec;
  this.format = format;
  this.locations = locations;
  this.properties = properties;
  this.io = io;
  this.encryptionManager = encryptionManager;
  this.targetFileSize = targetFileSize;
  this.writeSchema = writeSchema;
  this.dsSchema = dsSchema;
}
 
Example 19
Source Project: iceberg   Source File: Reader.java    License: Apache License 2.0
private ReadTask(CombinedScanTask task, String tableSchemaString, String expectedSchemaString,
                 String nameMappingString, Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager,
                 boolean caseSensitive, boolean localityPreferred, ReaderFactory<T> readerFactory) {
  this.task = task;
  this.tableSchemaString = tableSchemaString;
  this.expectedSchemaString = expectedSchemaString;
  this.io = io;
  this.encryptionManager = encryptionManager;
  this.caseSensitive = caseSensitive;
  this.localityPreferred = localityPreferred;
  this.preferredLocations = getPreferredLocations();
  this.readerFactory = readerFactory;
  this.nameMappingString = nameMappingString;
}
 
Example 20
Source Project: iceberg   Source File: RewriteManifestsAction.java    License: Apache License 2.0
private static ManifestFile writeManifest(
    List<Row> rows, int startIndex, int endIndex, Broadcast<FileIO> io,
    String location, int format, PartitionSpec spec, StructType sparkType) throws IOException {

  String manifestName = "optimized-m-" + UUID.randomUUID();
  Path manifestPath = new Path(location, manifestName);
  OutputFile outputFile = io.value().newOutputFile(FileFormat.AVRO.addExtension(manifestPath.toString()));

  Types.StructType dataFileType = DataFile.getType(spec.partitionType());
  SparkDataFile wrapper = new SparkDataFile(dataFileType, sparkType);

  ManifestWriter<DataFile> writer = ManifestFiles.write(format, spec, outputFile, null);

  try {
    for (int index = startIndex; index < endIndex; index++) {
      Row row = rows.get(index);
      long snapshotId = row.getLong(0);
      long sequenceNumber = row.getLong(1);
      Row file = row.getStruct(2);
      writer.existing(wrapper.wrap(file), snapshotId, sequenceNumber);
    }
  } finally {
    writer.close();
  }

  return writer.toManifestFile();
}
 
Example 21
Source Project: iceberg   Source File: BatchDataReader.java    License: Apache License 2.0
BatchDataReader(
    CombinedScanTask task, Schema expectedSchema, String nameMapping, FileIO fileIo,
    EncryptionManager encryptionManager, boolean caseSensitive, int size) {
  super(task, fileIo, encryptionManager);
  this.expectedSchema = expectedSchema;
  this.nameMapping = nameMapping;
  this.caseSensitive = caseSensitive;
  this.batchSize = size;
}
 
Example 22
Source Project: iceberg   Source File: OutputFileFactory.java    License: Apache License 2.0
OutputFileFactory(PartitionSpec spec, FileFormat format, LocationProvider locations, FileIO io,
                  EncryptionManager encryptionManager, int partitionId, long taskId) {
  this.spec = spec;
  this.format = format;
  this.locations = locations;
  this.io = io;
  this.encryptionManager = encryptionManager;
  this.partitionId = partitionId;
  this.taskId = taskId;
}
 
Example 23
Source Project: iceberg   Source File: BaseDataReader.java    License: Apache License 2.0
BaseDataReader(CombinedScanTask task, FileIO fileIo, EncryptionManager encryptionManager) {
  this.fileIo = fileIo;
  this.tasks = task.files().iterator();
  Iterable<InputFile> decryptedFiles = encryptionManager.decrypt(Iterables.transform(
      task.files(),
      fileScanTask ->
          EncryptedFiles.encryptedInput(
              this.fileIo.newInputFile(fileScanTask.file().path().toString()),
              fileScanTask.file().keyMetadata())));
  ImmutableMap.Builder<String, InputFile> inputFileBuilder = ImmutableMap.builder();
  decryptedFiles.forEach(decrypted -> inputFileBuilder.put(decrypted.location(), decrypted));
  this.inputFiles = inputFileBuilder.build();
  this.currentIterator = CloseableIterator.empty();
}
 
Example 24
Source Project: iceberg   Source File: RowDataReader.java    License: Apache License 2.0
RowDataReader(
    CombinedScanTask task, Schema tableSchema, Schema expectedSchema, String nameMapping, FileIO fileIo,
    EncryptionManager encryptionManager, boolean caseSensitive) {
  super(task, fileIo, encryptionManager);
  this.tableSchema = tableSchema;
  this.expectedSchema = expectedSchema;
  this.nameMapping = nameMapping;
  this.caseSensitive = caseSensitive;
}
 
Example 25
Source Project: iceberg   Source File: BaseWriter.java    License: Apache License 2.0
BaseWriter(PartitionSpec spec, FileFormat format, SparkAppenderFactory appenderFactory,
           OutputFileFactory fileFactory, FileIO io, long targetFileSize) {
  this.spec = spec;
  this.format = format;
  this.appenderFactory = appenderFactory;
  this.fileFactory = fileFactory;
  this.io = io;
  this.targetFileSize = targetFileSize;
}
 
Example 26
Source Project: iceberg   Source File: SparkUtil.java    License: Apache License 2.0
public static FileIO serializableFileIO(Table table) {
  if (table.io() instanceof HadoopFileIO) {
    // we need to use Spark's SerializableConfiguration to avoid issues with Kryo serialization
    SerializableConfiguration conf = new SerializableConfiguration(((HadoopFileIO) table.io()).conf());
    return new HadoopFileIO(conf::value);
  } else {
    return table.io();
  }
}
 
Example 27
Source Project: iceberg   Source File: Spark3Util.java    License: Apache License 2.0
public static boolean isLocalityEnabled(FileIO io, String location, CaseInsensitiveStringMap readOptions) {
  InputFile in = io.newInputFile(location);
  if (in instanceof HadoopInputFile) {
    String scheme = ((HadoopInputFile) in).getFileSystem().getScheme();
    return readOptions.getBoolean("locality", LOCALITY_WHITELIST_FS.contains(scheme));
  }
  return false;
}
 
Example 28
Source Project: iceberg   Source File: SparkBatchWrite.java    License: Apache License 2.0
protected WriterFactory(PartitionSpec spec, FileFormat format, LocationProvider locations,
                        Map<String, String> properties, Broadcast<FileIO> io,
                        Broadcast<EncryptionManager> encryptionManager, long targetFileSize,
                        Schema writeSchema, StructType dsSchema) {
  this.spec = spec;
  this.format = format;
  this.locations = locations;
  this.properties = properties;
  this.io = io;
  this.encryptionManager = encryptionManager;
  this.targetFileSize = targetFileSize;
  this.writeSchema = writeSchema;
  this.dsSchema = dsSchema;
}
 
Example 29
Source Project: iceberg   Source File: SparkBatchScan.java    License: Apache License 2.0
SparkBatchScan(Table table, Broadcast<FileIO> io, Broadcast<EncryptionManager> encryption, boolean caseSensitive,
               Schema expectedSchema, List<Expression> filters, CaseInsensitiveStringMap options) {
  this.table = table;
  this.io = io;
  this.encryptionManager = encryption;
  this.caseSensitive = caseSensitive;
  this.expectedSchema = expectedSchema;
  this.filterExpressions = filters;
  this.snapshotId = Spark3Util.propertyAsLong(options, "snapshot-id", null);
  this.asOfTimestamp = Spark3Util.propertyAsLong(options, "as-of-timestamp", null);

  if (snapshotId != null && asOfTimestamp != null) {
    throw new IllegalArgumentException(
        "Cannot scan using both snapshot-id and as-of-timestamp to select the table snapshot");
  }

  this.startSnapshotId = Spark3Util.propertyAsLong(options, "start-snapshot-id", null);
  this.endSnapshotId = Spark3Util.propertyAsLong(options, "end-snapshot-id", null);
  if (snapshotId != null || asOfTimestamp != null) {
    if (startSnapshotId != null || endSnapshotId != null) {
      throw new IllegalArgumentException(
          "Cannot specify start-snapshot-id and end-snapshot-id to do incremental scan when either snapshot-id or " +
              "as-of-timestamp is specified");
    }
  } else if (startSnapshotId == null && endSnapshotId != null) {
    throw new IllegalArgumentException("Cannot only specify option end-snapshot-id to do incremental scan");
  }

  // look for split behavior overrides in options
  this.splitSize = Spark3Util.propertyAsLong(options, "split-size", null);
  this.splitLookback = Spark3Util.propertyAsInt(options, "lookback", null);
  this.splitOpenFileCost = Spark3Util.propertyAsLong(options, "file-open-cost", null);
  this.localityPreferred = Spark3Util.isLocalityEnabled(io.value(), table.location(), options);
  this.batchReadsEnabled = Spark3Util.isVectorizationEnabled(table.properties(), options);
  this.batchSize = Spark3Util.batchSize(table.properties(), options);
}
 
Example 30
Source Project: iceberg   Source File: SparkStreamingWrite.java    License: Apache License 2.0
SparkStreamingWrite(Table table, Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager,
                    CaseInsensitiveStringMap options, boolean truncateBatches, String queryId,
                    String applicationId, String wapId, Schema writeSchema, StructType dsSchema) {
  super(
      table, io, encryptionManager, options, false, truncateBatches, Expressions.alwaysTrue(), applicationId, wapId,
      writeSchema, dsSchema);
  this.truncateBatches = truncateBatches;
  this.queryId = queryId;
}