org.apache.spark.util.SerializableConfiguration Java Examples

The following examples show how to use org.apache.spark.util.SerializableConfiguration. Each example is taken from an open source project; the source file and license are noted above the snippet.
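Before the project examples, here is a minimal, self-contained sketch of the pattern every snippet below relies on: Hadoop's Configuration is not java.io.Serializable, so it is wrapped in SerializableConfiguration, broadcast from the driver, and unwrapped on executors with two value() calls. This sketch is illustrative and not taken from any of the projects below; the class name and the fs.defaultFS lookup are placeholders.

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.util.SerializableConfiguration;

public class SerializableConfigurationSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("serializable-configuration-sketch")
        .master("local[2]")
        .getOrCreate();
    JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

    // Hadoop's Configuration is not Serializable; wrap it before it leaves the driver.
    SerializableConfiguration wrapped =
        new SerializableConfiguration(spark.sparkContext().hadoopConfiguration());

    // Broadcast the wrapper once rather than capturing a raw Configuration in a closure.
    Broadcast<SerializableConfiguration> confBroadcast = jsc.broadcast(wrapped);

    List<String> results = jsc.parallelize(Arrays.asList("a", "b"), 2)
        .map(item -> {
          // Two unwraps: Broadcast.value() returns the SerializableConfiguration,
          // and its value() returns the usable Hadoop Configuration.
          Configuration conf = confBroadcast.value().value();
          return item + " -> " + conf.get("fs.defaultFS");
        })
        .collect();

    results.forEach(System.out::println);
    spark.stop();
  }
}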
Example #1
Source File: RemoveOrphanFilesAction.java    From iceberg with Apache License 2.0
private Dataset<Row> buildActualFileDF() {
  List<String> subDirs = Lists.newArrayList();
  List<String> matchingFiles = Lists.newArrayList();

  Predicate<FileStatus> predicate = file -> file.getModificationTime() < olderThanTimestamp;

  // on the driver, list at most 3 levels deep and only dirs that have fewer than 10 direct sub-dirs
  listDirRecursively(location, predicate, hadoopConf.value(), 3, 10, subDirs, matchingFiles);

  JavaRDD<String> matchingFileRDD = sparkContext.parallelize(matchingFiles, 1);

  if (subDirs.isEmpty()) {
    return spark.createDataset(matchingFileRDD.rdd(), Encoders.STRING()).toDF("file_path");
  }

  int parallelism = Math.min(subDirs.size(), partitionDiscoveryParallelism);
  JavaRDD<String> subDirRDD = sparkContext.parallelize(subDirs, parallelism);

  // broadcast the serializable wrapper so executors can rebuild the Hadoop configuration
  Broadcast<SerializableConfiguration> conf = sparkContext.broadcast(hadoopConf);
  JavaRDD<String> matchingLeafFileRDD = subDirRDD.mapPartitions(listDirsRecursively(conf, olderThanTimestamp));

  JavaRDD<String> completeMatchingFileRDD = matchingFileRDD.union(matchingLeafFileRDD);
  return spark.createDataset(completeMatchingFileRDD.rdd(), Encoders.STRING()).toDF("file_path");
}
 
Example #2
Source File: RemoveOrphanFilesAction.java    From iceberg with Apache License 2.0
private static FlatMapFunction<Iterator<String>, String> listDirsRecursively(
    Broadcast<SerializableConfiguration> conf,
    long olderThanTimestamp) {

  return (FlatMapFunction<Iterator<String>, String>) dirs -> {
    List<String> subDirs = Lists.newArrayList();
    List<String> files = Lists.newArrayList();

    Predicate<FileStatus> predicate = file -> file.getModificationTime() < olderThanTimestamp;

    int maxDepth = 2000;
    int maxDirectSubDirs = Integer.MAX_VALUE;

    dirs.forEachRemaining(dir -> {
      // conf.value() unwraps the broadcast; the inner value() yields the Hadoop Configuration
      listDirRecursively(dir, predicate, conf.value().value(), maxDepth, maxDirectSubDirs, subDirs, files);
    });

    if (!subDirs.isEmpty()) {
      throw new RuntimeException("Could not list subdirectories, reached maximum subdirectory depth: " + maxDepth);
    }

    return files.iterator();
  };
}
 
Example #3
Source File: RemoveOrphanFilesAction.java    From iceberg with Apache License 2.0
RemoveOrphanFilesAction(SparkSession spark, Table table) {
  this.spark = spark;
  this.sparkContext = new JavaSparkContext(spark.sparkContext());
  // wrap the Hadoop conf once so it can later be broadcast to executors (see Example #1)
  this.hadoopConf = new SerializableConfiguration(spark.sessionState().newHadoopConf());
  this.partitionDiscoveryParallelism = spark.sessionState().conf().parallelPartitionDiscoveryParallelism();
  this.table = table;
  this.ops = ((HasTableOperations) table).operations();
  this.location = table.location();
}
 
Example #4
Source File: SparkUtil.java    From iceberg with Apache License 2.0
public static FileIO serializableFileIO(Table table) {
  if (table.io() instanceof HadoopFileIO) {
    // we need to use Spark's SerializableConfiguration to avoid issues with Kryo serialization
    SerializableConfiguration conf = new SerializableConfiguration(((HadoopFileIO) table.io()).conf());
    return new HadoopFileIO(conf::value);
  } else {
    return table.io();
  }
}
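A note on the conf::value method reference above: Iceberg's HadoopFileIO can be constructed from a supplier of Configuration, and a method reference bound to the SerializableConfiguration wrapper is itself serializable, so the resulting FileIO never captures a raw, non-serializable Configuration.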
 
Example #5
Source File: Writer.java    From iceberg with Apache License 2.0
WriterFactory(PartitionSpec spec, FileFormat format, String dataLocation,
              Map<String, String> properties, Configuration conf) {
  this.spec = spec;
  this.format = format;
  this.dataLocation = dataLocation;
  this.properties = properties;
  this.conf = new SerializableConfiguration(conf);
}
 
Example #6
Source File: Reader.java    From iceberg with Apache License 2.0
private ReadTask(CombinedScanTask task, String tableSchemaString, String expectedSchemaString,
                 SerializableConfiguration conf) {
  this.task = task;
  this.tableSchemaString = tableSchemaString;
  this.expectedSchemaString = expectedSchemaString;
  this.conf = conf;
}
 
Example #7
Source File: GryoSerializer.java    From tinkerpop with Apache License 2.0
private SparkIoRegistry() {
    try {
        super.register(GryoIo.class, Tuple2.class, new Tuple2Serializer());
        super.register(GryoIo.class, Tuple2[].class, null);
        super.register(GryoIo.class, Tuple3.class, new Tuple3Serializer());
        super.register(GryoIo.class, Tuple3[].class, null);
        super.register(GryoIo.class, CompactBuffer.class, new CompactBufferSerializer());
        super.register(GryoIo.class, CompactBuffer[].class, null);
        super.register(GryoIo.class, CompressedMapStatus.class, null);
        super.register(GryoIo.class, BlockManagerId.class, null);
        super.register(GryoIo.class, HighlyCompressedMapStatus.class, new ExternalizableSerializer());  // implements Externalizable, so it's okay
        super.register(GryoIo.class, TorrentBroadcast.class, null);
        super.register(GryoIo.class, PythonBroadcast.class, null);
        super.register(GryoIo.class, BoxedUnit.class, null);
        super.register(GryoIo.class, Class.forName("scala.reflect.ClassTag$$anon$1"), new JavaSerializer());
        super.register(GryoIo.class, Class.forName("scala.reflect.ManifestFactory$$anon$1"), new JavaSerializer());
        super.register(GryoIo.class, Class.forName("org.apache.spark.internal.io.FileCommitProtocol$TaskCommitMessage"), new JavaSerializer());
        super.register(GryoIo.class, Class.forName("org.apache.spark.internal.io.FileCommitProtocol$EmptyTaskCommitMessage$"), new JavaSerializer());
        super.register(GryoIo.class, Class.forName("scala.collection.immutable.Map$EmptyMap$"), new JavaSerializer());
        super.register(GryoIo.class, Class.forName("scala.collection.immutable.Map"), new JavaSerializer());
        super.register(GryoIo.class, Class.forName("scala.None$"), new JavaSerializer());
        super.register(GryoIo.class, Class.forName("scala.Some$"), new JavaSerializer());
        super.register(GryoIo.class, Class.forName("scala.Some"), new JavaSerializer());
        super.register(GryoIo.class, WrappedArray.ofRef.class, new WrappedArraySerializer());
        super.register(GryoIo.class, MessagePayload.class, null);
        super.register(GryoIo.class, ViewIncomingPayload.class, null);
        super.register(GryoIo.class, ViewOutgoingPayload.class, null);
        super.register(GryoIo.class, ViewPayload.class, null);
        super.register(GryoIo.class, SerializableConfiguration.class, new JavaSerializer());
        super.register(GryoIo.class, VertexWritable.class, new VertexWritableSerializer());
        super.register(GryoIo.class, ObjectWritable.class, new ObjectWritableSerializer());
    } catch (final ClassNotFoundException e) {
        throw new IllegalStateException(e);
    }
}
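Note the registration of SerializableConfiguration with a JavaSerializer: the class handles its serialization through Java's writeObject/readObject hooks around the wrapped Configuration, so routing it through Java serialization in this Kryo-based Gryo setup sidesteps the Kryo issues mentioned in Example #4's comment.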
 
Example #8
Source File: Reader.java    From iceberg with Apache License 2.0
Reader(Table table, Configuration conf) {
  this.table = table;
  this.conf = new SerializableConfiguration(conf);
  this.schema = table.schema();
}