org.apache.spark.api.java.function.MapPartitionsFunction Java Examples

The following examples show how to use org.apache.spark.api.java.function.MapPartitionsFunction. Each example lists the project and source file it was taken from.
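Before the project examples, here is a minimal, self-contained sketch of the general pattern, assuming a local SparkSession; the class name, sample data, and master setting are illustrative and not taken from any project below. A MapPartitionsFunction<T, U> is handed an Iterator<T> over one partition and returns an Iterator<U>, and Dataset.mapPartitions additionally requires an Encoder for the output type.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.spark.api.java.function.MapPartitionsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

public class MapPartitionsFunctionSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("MapPartitionsFunctionSketch")
        .master("local[*]")
        .getOrCreate();

    Dataset<String> words = spark.createDataset(
        Arrays.asList("spark", "map", "partitions"), Encoders.STRING());

    // The function receives an Iterator over all rows of one partition and
    // must return an Iterator over the output rows; the second argument
    // tells Spark how to encode the output type.
    Dataset<Integer> lengths = words.mapPartitions(
        (MapPartitionsFunction<String, Integer>) partition -> {
          List<Integer> out = new ArrayList<>();
          while (partition.hasNext()) {
            out.add(partition.next().length());
          }
          return out.iterator();
        },
        Encoders.INT());

    lengths.show();
    spark.stop();
  }
}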
Example #1
Source File: RewriteManifestsAction.java    From iceberg with Apache License 2.0
private static MapPartitionsFunction<Row, ManifestFile> toManifests(
    Broadcast<FileIO> io, long maxNumManifestEntries, String location,
    int format, PartitionSpec spec, StructType sparkType) {

  return (MapPartitionsFunction<Row, ManifestFile>) rows -> {
    List<Row> rowsAsList = Lists.newArrayList(rows);

    if (rowsAsList.isEmpty()) {
      return Collections.emptyIterator();
    }

    List<ManifestFile> manifests = Lists.newArrayList();
    if (rowsAsList.size() <= maxNumManifestEntries) {
      manifests.add(writeManifest(rowsAsList, 0, rowsAsList.size(), io, location, format, spec, sparkType));
    } else {
      int midIndex = rowsAsList.size() / 2;
      manifests.add(writeManifest(rowsAsList, 0, midIndex, io, location, format, spec, sparkType));
      manifests.add(writeManifest(rowsAsList, midIndex, rowsAsList.size(), io, location, format, spec, sparkType));
    }

    return manifests.iterator();
  };
}
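The function returned by toManifests is intended to be passed to Dataset.mapPartitions together with an encoder for ManifestFile. The following is a hedged sketch of such a call site, not code from RewriteManifestsAction; the helper name writeNewManifests, the manifestEntryDF dataset, and targetNumManifests are assumptions for illustration.

// Hypothetical helper (not part of RewriteManifestsAction): applies the function
// returned by toManifests(...) to a Dataset<Row> of manifest entries.
private static List<ManifestFile> writeNewManifests(
    Dataset<Row> manifestEntryDF, Broadcast<FileIO> io, long maxNumManifestEntries,
    String location, int format, PartitionSpec spec, StructType sparkType,
    int targetNumManifests) {
  Encoder<ManifestFile> manifestEncoder = Encoders.javaSerialization(ManifestFile.class);
  return manifestEntryDF
      .repartition(targetNumManifests)
      .mapPartitions(
          toManifests(io, maxNumManifestEntries, location, format, spec, sparkType),
          manifestEncoder)
      .collectAsList();
}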
 
Example #2
Source File: StructuredNodeLoader.java    From sylph with Apache License 2.0
private static TransForm<Dataset<Row>> loadRealTimeTransForm(RealTimeTransForm realTimeTransForm)
{
    return stream -> {
        // In Spark 2.x, mapping over a Dataset requires the type mapping below, i.e. the schema/encoder of the returned rows must be specified explicitly
        //implicit val matchError:org.apache.spark.sql.Encoder[Row] = org.apache.spark.sql.Encoders.kryo[Row]
        //      import collection.JavaConverters._
        //      val mapRowSchema = realTimeTransForm.getRowSchema.getFields.asScala.map(filed => {
        //        StructField(filed.getName, SparkRow.SparkRowParser.parserType(filed.getJavaType), true)
        //      })
        //      RowEncoder.apply(StructType(mapRowSchema))

        //implicit val mapenc = RowEncoder.apply(rddSchema)  // cannot be registered here because only SQL primitive types are supported   //Encoders.STRING
        Dataset<Row> transStream = stream.mapPartitions(
                (MapPartitionsFunction<Row, Row>) partition -> StreamNodeLoader.transFunction(partition, realTimeTransForm),
                Encoders.kryo(Row.class));
        // alternatively, use transStream.as()
        return transStream;
    };
}
 
Example #3
Source File: Dataset.java    From incubator-nemo with Apache License 2.0
@Override
public <U> Dataset<U> mapPartitions(final MapPartitionsFunction<T, U> f, final Encoder<U> encoder) {
  final boolean userTriggered = initializeFunction(f, encoder);
  final Dataset<U> result = from(super.mapPartitions(f, encoder));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example #4
Source File: Dataset.java    From nemo with Apache License 2.0
@Override
public <U> Dataset<U> mapPartitions(final MapPartitionsFunction<T, U> f, final Encoder<U> encoder) {
  final boolean userTriggered = initializeFunction(f, encoder);
  final Dataset<U> result = from(super.mapPartitions(f, encoder));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example #5
Source File: SparkTableUtil.java    From iceberg with Apache License 2.0
/**
 * Import files from given partitions to an Iceberg table.
 *
 * @param spark a Spark session
 * @param partitions partitions to import
 * @param targetTable an Iceberg table where to import the data
 * @param spec a partition spec
 * @param stagingDir a staging directory to store temporary manifest files
 */
public static void importSparkPartitions(
    SparkSession spark, List<SparkPartition> partitions, Table targetTable, PartitionSpec spec, String stagingDir) {
  Configuration conf = spark.sessionState().newHadoopConf();
  SerializableConfiguration serializableConf = new SerializableConfiguration(conf);
  int parallelism = Math.min(partitions.size(), spark.sessionState().conf().parallelPartitionDiscoveryParallelism());
  int numShufflePartitions = spark.sessionState().conf().numShufflePartitions();
  MetricsConfig metricsConfig = MetricsConfig.fromProperties(targetTable.properties());

  JavaSparkContext sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext());
  JavaRDD<SparkPartition> partitionRDD = sparkContext.parallelize(partitions, parallelism);

  Dataset<SparkPartition> partitionDS = spark.createDataset(
      partitionRDD.rdd(),
      Encoders.javaSerialization(SparkPartition.class));

  List<ManifestFile> manifests = partitionDS
      .flatMap((FlatMapFunction<SparkPartition, DataFile>) sparkPartition ->
              listPartition(sparkPartition, spec, serializableConf, metricsConfig).iterator(),
          Encoders.javaSerialization(DataFile.class))
      .repartition(numShufflePartitions)
      .map((MapFunction<DataFile, Tuple2<String, DataFile>>) file ->
              Tuple2.apply(file.path().toString(), file),
          Encoders.tuple(Encoders.STRING(), Encoders.javaSerialization(DataFile.class)))
      .orderBy(col("_1"))
      .mapPartitions(
          (MapPartitionsFunction<Tuple2<String, DataFile>, ManifestFile>) fileTuple ->
              buildManifest(serializableConf, spec, stagingDir, fileTuple),
          Encoders.javaSerialization(ManifestFile.class))
      .collectAsList();

  try {
    boolean snapshotIdInheritanceEnabled = PropertyUtil.propertyAsBoolean(
        targetTable.properties(),
        TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED,
        TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT);

    AppendFiles append = targetTable.newAppend();
    manifests.forEach(append::appendManifest);
    append.commit();

    if (!snapshotIdInheritanceEnabled) {
      // delete original manifests as they were rewritten before the commit
      deleteManifests(targetTable.io(), manifests);
    }
  } catch (Throwable e) {
    deleteManifests(targetTable.io(), manifests);
    throw e;
  }
}
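A hedged sketch of how this utility might be invoked follows; the SparkSession, the discoveredPartitions list, the targetTable handle, and the staging path are assumptions for illustration, not code from SparkTableUtil.

// Hypothetical call site, assuming a SparkSession `spark`, an Iceberg Table
// `targetTable`, and a List<SparkPartition> `discoveredPartitions` already exist.
String stagingDir = "/tmp/iceberg-import-staging";  // illustrative staging location
SparkTableUtil.importSparkPartitions(
    spark,
    discoveredPartitions,
    targetTable,
    targetTable.spec(),  // use the table's own partition spec
    stagingDir);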