Java Code Examples for org.apache.spark.api.java.function.MapPartitionsFunction

The following examples show how to use org.apache.spark.api.java.function.MapPartitionsFunction. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may want to check out the right sidebar which shows the related API usage.
Example 1
Source Project: iceberg   Source File: RewriteManifestsAction.java    License: Apache License 2.0 6 votes vote down vote up
private static MapPartitionsFunction<Row, ManifestFile> toManifests(
    Broadcast<FileIO> io, long maxNumManifestEntries, String location,
    int format, PartitionSpec spec, StructType sparkType) {

  return (MapPartitionsFunction<Row, ManifestFile>) rows -> {
    List<Row> rowsAsList = Lists.newArrayList(rows);

    if (rowsAsList.isEmpty()) {
      return Collections.emptyIterator();
    }

    List<ManifestFile> manifests = Lists.newArrayList();
    if (rowsAsList.size() <= maxNumManifestEntries) {
      manifests.add(writeManifest(rowsAsList, 0, rowsAsList.size(), io, location, format, spec, sparkType));
    } else {
      int midIndex = rowsAsList.size() / 2;
      manifests.add(writeManifest(rowsAsList, 0, midIndex, io, location, format, spec, sparkType));
      manifests.add(writeManifest(rowsAsList,  midIndex, rowsAsList.size(), io, location, format, spec, sparkType));
    }

    return manifests.iterator();
  };
}
 
Example 2
Source Project: sylph   Source File: StructuredNodeLoader.java    License: Apache License 2.0 6 votes vote down vote up
private static TransForm<Dataset<Row>> loadRealTimeTransForm(RealTimeTransForm realTimeTransForm)
{
    return stream -> {
        //spark2.x 要对dataSet 进行map操作必须要加上下面一句类型映射 即必须要指明返回的schema
        //implicit val matchError:org.apache.spark.sql.Encoder[Row] = org.apache.spark.sql.Encoders.kryo[Row]
        //      import collection.JavaConverters._
        //      val mapRowSchema = realTimeTransForm.getRowSchema.getFields.asScala.map(filed => {
        //        StructField(filed.getName, SparkRow.SparkRowParser.parserType(filed.getJavaType), true)
        //      })
        //      RowEncoder.apply(StructType(mapRowSchema))

        //implicit val mapenc = RowEncoder.apply(rddSchema)  //此处无法注册 原因是必须是sql基本类型   //Encoders.STRING
        Dataset<Row> transStream = stream.mapPartitions(
                (MapPartitionsFunction<Row, Row>) partition -> StreamNodeLoader.transFunction(partition, realTimeTransForm),
                Encoders.kryo(Row.class));
        //或者使用 transStream.as()
        return transStream;
    };
}
 
Example 3
Source Project: incubator-nemo   Source File: Dataset.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public <U> Dataset<U> mapPartitions(final MapPartitionsFunction<T, U> f, final Encoder<U> encoder) {
  final boolean userTriggered = initializeFunction(f, encoder);
  final Dataset<U> result = from(super.mapPartitions(f, encoder));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 4
Source Project: nemo   Source File: Dataset.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public <U> Dataset<U> mapPartitions(final MapPartitionsFunction<T, U> f, final Encoder<U> encoder) {
  final boolean userTriggered = initializeFunction(f, encoder);
  final Dataset<U> result = from(super.mapPartitions(f, encoder));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 5
Source Project: iceberg   Source File: SparkTableUtil.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Import files from given partitions to an Iceberg table.
 *
 * @param spark a Spark session
 * @param partitions partitions to import
 * @param targetTable an Iceberg table where to import the data
 * @param spec a partition spec
 * @param stagingDir a staging directory to store temporary manifest files
 */
public static void importSparkPartitions(
    SparkSession spark, List<SparkPartition> partitions, Table targetTable, PartitionSpec spec, String stagingDir) {
  Configuration conf = spark.sessionState().newHadoopConf();
  SerializableConfiguration serializableConf = new SerializableConfiguration(conf);
  int parallelism = Math.min(partitions.size(), spark.sessionState().conf().parallelPartitionDiscoveryParallelism());
  int numShufflePartitions = spark.sessionState().conf().numShufflePartitions();
  MetricsConfig metricsConfig = MetricsConfig.fromProperties(targetTable.properties());

  JavaSparkContext sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext());
  JavaRDD<SparkPartition> partitionRDD = sparkContext.parallelize(partitions, parallelism);

  Dataset<SparkPartition> partitionDS = spark.createDataset(
      partitionRDD.rdd(),
      Encoders.javaSerialization(SparkPartition.class));

  List<ManifestFile> manifests = partitionDS
      .flatMap((FlatMapFunction<SparkPartition, DataFile>) sparkPartition ->
              listPartition(sparkPartition, spec, serializableConf, metricsConfig).iterator(),
          Encoders.javaSerialization(DataFile.class))
      .repartition(numShufflePartitions)
      .map((MapFunction<DataFile, Tuple2<String, DataFile>>) file ->
              Tuple2.apply(file.path().toString(), file),
          Encoders.tuple(Encoders.STRING(), Encoders.javaSerialization(DataFile.class)))
      .orderBy(col("_1"))
      .mapPartitions(
          (MapPartitionsFunction<Tuple2<String, DataFile>, ManifestFile>) fileTuple ->
              buildManifest(serializableConf, spec, stagingDir, fileTuple),
          Encoders.javaSerialization(ManifestFile.class))
      .collectAsList();

  try {
    boolean snapshotIdInheritanceEnabled = PropertyUtil.propertyAsBoolean(
        targetTable.properties(),
        TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED,
        TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT);

    AppendFiles append = targetTable.newAppend();
    manifests.forEach(append::appendManifest);
    append.commit();

    if (!snapshotIdInheritanceEnabled) {
      // delete original manifests as they were rewritten before the commit
      deleteManifests(targetTable.io(), manifests);
    }
  } catch (Throwable e) {
    deleteManifests(targetTable.io(), manifests);
    throw e;
  }
}