org.apache.spark.api.java.function.PairFlatMapFunction Java Examples

The following examples show how to use org.apache.spark.api.java.function.PairFlatMapFunction, drawn from several open-source projects; each example notes its original source file and license.
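PairFlatMapFunction<T, K, V> is the functional interface that Spark's Java API expects for flatMapToPair and mapPartitionsToPair: its single call(T) method turns one input value into an iterator of scala.Tuple2<K, V> key/value pairs. Before the project examples, here is a minimal self-contained sketch of a direct use (the data and app name are made up for illustration; it assumes Spark 2.x or later, where call returns an Iterator rather than an Iterable):

import java.util.Arrays;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFlatMapFunction;

import scala.Tuple2;

public class PairFlatMapFunctionSketch {
  public static void main(String[] args) {
    JavaSparkContext sc = new JavaSparkContext("local[2]", "pair-flat-map-sketch");
    JavaRDD<String> lines = sc.parallelize(Arrays.asList("a b", "c d e"));

    // Each input line becomes several (word, 1) pairs.
    JavaPairRDD<String, Integer> pairs = lines.flatMapToPair(
        (PairFlatMapFunction<String, String, Integer>) line ->
            Arrays.stream(line.split(" "))
                .map(word -> new Tuple2<>(word, 1))
                .iterator());

    pairs.collect().forEach(t -> System.out.println(t._1() + " -> " + t._2()));
    sc.stop();
  }
}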
Example #1
Source File: CleanActionExecutor.java    From hudi with Apache License 2.0
private static PairFlatMapFunction<Iterator<Tuple2<String, String>>, String, PartitionCleanStat> deleteFilesFunc(
    HoodieTable table) {
  return (PairFlatMapFunction<Iterator<Tuple2<String, String>>, String, PartitionCleanStat>) iter -> {
    Map<String, PartitionCleanStat> partitionCleanStatMap = new HashMap<>();

    FileSystem fs = table.getMetaClient().getFs();
    Path basePath = new Path(table.getMetaClient().getBasePath());
    while (iter.hasNext()) {
      Tuple2<String, String> partitionDelFileTuple = iter.next();
      String partitionPath = partitionDelFileTuple._1();
      String delFileName = partitionDelFileTuple._2();
      Path deletePath = FSUtils.getPartitionPath(FSUtils.getPartitionPath(basePath, partitionPath), delFileName);
      String deletePathStr = deletePath.toString();
      Boolean deletedFileResult = deleteFileAndGetResult(fs, deletePathStr);
      if (!partitionCleanStatMap.containsKey(partitionPath)) {
        partitionCleanStatMap.put(partitionPath, new PartitionCleanStat(partitionPath));
      }
      PartitionCleanStat partitionCleanStat = partitionCleanStatMap.get(partitionPath);
      partitionCleanStat.addDeleteFilePatterns(deletePath.getName());
      partitionCleanStat.addDeletedFileResult(deletePath.getName(), deletedFileResult);
    }
    return partitionCleanStatMap.entrySet().stream().map(e -> new Tuple2<>(e.getKey(), e.getValue()))
        .collect(Collectors.toList()).iterator();
  };
}
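A hedged sketch of how a cleaner job might wire this function in (the filesToDelete RDD, the cleanerParallelism setting, and the collect step are assumptions for illustration, not taken from Hudi's source): the input is an RDD of (partitionPath, fileName) tuples, and each Spark partition deletes its share of files.

// Hypothetical caller: filesToDelete is a JavaRDD<Tuple2<String, String>> of (partitionPath, fileName).
JavaPairRDD<String, PartitionCleanStat> partitionStats =
    filesToDelete
        .repartition(cleanerParallelism)               // assumed parallelism setting
        .mapPartitionsToPair(deleteFilesFunc(table));  // each partition deletes its files

List<Tuple2<String, PartitionCleanStat>> cleanStats = partitionStats.collect();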
 
Example #2
Source File: TransformTranslator.java    From beam with Apache License 2.0
private static <K, V, OutputT>
    PairFlatMapFunction<Iterator<Tuple2<ByteArray, byte[]>>, TupleTag<?>, WindowedValue<?>>
        wrapDoFnFromSortedRDD(
            MultiDoFnFunction<KV<K, V>, OutputT> doFnFunction,
            Coder<K> keyCoder,
            Coder<WindowedValue<V>> wvCoder) {

  return (Iterator<Tuple2<ByteArray, byte[]>> in) -> {
    Iterator<Iterator<Tuple2<TupleTag<?>, WindowedValue<?>>>> mappedGroups;
    mappedGroups =
        Iterators.transform(
            splitBySameKey(in, keyCoder, wvCoder),
            group -> {
              try {
                return doFnFunction.call(group);
              } catch (Exception ex) {
                throw new RuntimeException(ex);
              }
            });
    return flatten(mappedGroups);
  };
}
 
Example #3
Source File: UserVisitSessionAnalyzeSpark.java    From BigDataPlatform with GNU General Public License v3.0
/**
	 * Get the RDD that maps each sessionid to its user visit action data.
	 * @param actionRDD the RDD of user visit action rows
	 * @return a JavaPairRDD keyed by sessionid (row column 2), with the full row as the value
	 */
	public static JavaPairRDD<String, Row> getSessionid2ActionRDD(JavaRDD<Row> actionRDD) {
//		return actionRDD.mapToPair(new PairFunction<Row, String, Row>() {
//
//			private static final long serialVersionUID = 1L;
//
//			@Override
//			public Tuple2<String, Row> call(Row row) throws Exception {
//				return new Tuple2<String, Row>(row.getString(2), row);
//			}
//
//		});

		return actionRDD.mapPartitionsToPair(new PairFlatMapFunction<Iterator<Row>, String, Row>() {

			private static final long serialVersionUID = 1L;

			@Override
			public Iterator<Tuple2<String, Row>> call(Iterator<Row> iterator)
					throws Exception {
				List<Tuple2<String, Row>> list = new ArrayList<Tuple2<String, Row>>();

				while(iterator.hasNext()) {
					Row row = iterator.next();
					list.add(new Tuple2<String, Row>(row.getString(2), row));
				}

				return list.iterator();
			}

		});
	}
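Since the body only wraps each row in a (sessionid, row) tuple, the same partition-wise mapping can be written more compactly with Guava's Iterators.transform, which avoids buffering the whole partition in a list. A sketch of that alternative (not part of the original project):

// Requires com.google.common.collect.Iterators.
return actionRDD.mapPartitionsToPair(
    (PairFlatMapFunction<Iterator<Row>, String, Row>) iterator ->
        // Lazily wrap each row as (sessionid, row).
        Iterators.transform(iterator, row -> new Tuple2<>(row.getString(2), row)));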
 
Example #4
Source File: TranslationUtils.java    From beam with Apache License 2.0
/**
 * A utility method that adapts {@link PairFunction} to a {@link PairFlatMapFunction} with an
 * {@link Iterator} input. This is particularly useful because it allows functions written for
 * mapToPair to be reused in flatMapToPair.
 *
 * @param pairFunction the {@link PairFunction} to adapt.
 * @param <T> the input type.
 * @param <K> the output key type.
 * @param <V> the output value type.
 * @return a {@link PairFlatMapFunction} that accepts an {@link Iterator} as an input and applies
 *     the {@link PairFunction} on every element.
 */
public static <T, K, V> PairFlatMapFunction<Iterator<T>, K, V> pairFunctionToPairFlatMapFunction(
    final PairFunction<T, K, V> pairFunction) {
  return itr ->
      Iterators.transform(
          itr,
          t -> {
            try {
              return pairFunction.call(t);
            } catch (Exception e) {
              throw new RuntimeException(e);
            }
          });
}
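A short usage sketch of the adapter above (the words RDD and the PairFunction are hypothetical, not taken from Beam's runner code): a per-element PairFunction written for mapToPair is reused inside mapPartitionsToPair without writing a separate flat-map implementation.

// A plain per-element PairFunction...
PairFunction<String, String, Integer> toLengthPair =
    word -> new Tuple2<>(word, word.length());

// ...applied partition-wise through the adapter.
JavaPairRDD<String, Integer> wordLengths =
    words.mapPartitionsToPair(
        TranslationUtils.pairFunctionToPairFlatMapFunction(toLengthPair));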
 
Example #5
Source File: JoinReadsWithVariants.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Joins each read of an RDD<GATKRead> with overlapping variants from a list of variants files.
 *
 * @param reads the RDD of reads, in coordinate-sorted order
 * @param variantsFileNames the names of the variants files added via {@code SparkContext#addFile()}
 * @return an RDD that contains each read along with the overlapping variants
 */
public static JavaPairRDD<GATKRead, Iterable<GATKVariant>> join(final JavaRDD<GATKRead> reads, final List<String> variantsFileNames) {
    return reads.mapPartitionsToPair((PairFlatMapFunction<Iterator<GATKRead>, GATKRead, Iterable<GATKVariant>>) gatkReadIterator -> {
        List<FeatureDataSource<VariantContext>> variantSources = variantsFileNames.stream().map(fileName -> openFeatureSource(SparkFiles.get(fileName))).collect(Collectors.toList());
        Iterator<Tuple2<GATKRead, Iterable<GATKVariant>>> iterator = Iterators.transform(gatkReadIterator, read -> getVariantsOverlappingRead(read, variantSources));
        return new CloseAtEndIterator<>(iterator, new AutoCloseableCollection(variantSources)); // close FeatureDataSource at end of iteration
    });
}
 
Example #6
Source File: SparkCubingByLayer.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String hiveTable = optionsHelper.getOptionValue(OPTION_INPUT_TABLE);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);

    Class[] kryoClassArray = new Class[] { Class.forName("scala.reflect.ClassTag$$anon$1") };

    SparkConf conf = new SparkConf().setAppName("Cubing for:" + cubeName + " segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    JavaSparkContext sc = new JavaSparkContext(conf);
    sc.sc().addSparkListener(jobListener);
    HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));
    SparkUtil.modifySparkHadoopConfiguration(sc.sc()); // set dfs.replication=2 and enable compress
    final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration());
    KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

    final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);
    final CubeDesc cubeDesc = cubeInstance.getDescriptor();
    final CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);

    logger.info("RDD input path: {}", inputPath);
    logger.info("RDD Output path: {}", outputPath);

    final Job job = Job.getInstance(sConf.get());
    SparkUtil.setHadoopConfForCuboid(job, cubeSegment, metaUrl);

    int countMeasureIndex = 0;
    for (MeasureDesc measureDesc : cubeDesc.getMeasures()) {
        if (measureDesc.getFunction().isCount() == true) {
            break;
        } else {
            countMeasureIndex++;
        }
    }
    final CubeStatsReader cubeStatsReader = new CubeStatsReader(cubeSegment, envConfig);
    boolean[] needAggr = new boolean[cubeDesc.getMeasures().size()];
    boolean allNormalMeasure = true;
    for (int i = 0; i < cubeDesc.getMeasures().size(); i++) {
        needAggr[i] = !cubeDesc.getMeasures().get(i).getFunction().getMeasureType().onlyAggrInBaseCuboid();
        allNormalMeasure = allNormalMeasure && needAggr[i];
    }
    logger.info("All measure are normal (agg on all cuboids) ? : " + allNormalMeasure);
    StorageLevel storageLevel = StorageLevel.fromString(envConfig.getSparkStorageLevel());

    boolean isSequenceFile = JoinedFlatTable.SEQUENCEFILE.equalsIgnoreCase(envConfig.getFlatTableStorageFormat());

    final JavaPairRDD<ByteArray, Object[]> encodedBaseRDD = SparkUtil
            .hiveRecordInputRDD(isSequenceFile, sc, inputPath, hiveTable)
            .mapToPair(new EncodeBaseCuboid(cubeName, segmentId, metaUrl, sConf));

    Long totalCount = 0L;
    if (envConfig.isSparkSanityCheckEnabled()) {
        totalCount = encodedBaseRDD.count();
    }

    final BaseCuboidReducerFunction2 baseCuboidReducerFunction = new BaseCuboidReducerFunction2(cubeName, metaUrl,
            sConf);
    BaseCuboidReducerFunction2 reducerFunction2 = baseCuboidReducerFunction;
    if (allNormalMeasure == false) {
        reducerFunction2 = new CuboidReducerFunction2(cubeName, metaUrl, sConf, needAggr);
    }

    final int totalLevels = cubeSegment.getCuboidScheduler().getBuildLevel();
    JavaPairRDD<ByteArray, Object[]>[] allRDDs = new JavaPairRDD[totalLevels + 1];
    int level = 0;
    int partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);

    // aggregate to calculate base cuboid
    allRDDs[0] = encodedBaseRDD.reduceByKey(baseCuboidReducerFunction, partition).persist(storageLevel);

    saveToHDFS(allRDDs[0], metaUrl, cubeName, cubeSegment, outputPath, 0, job, envConfig);

    PairFlatMapFunction flatMapFunction = new CuboidFlatMap(cubeName, segmentId, metaUrl, sConf);
    // aggregate to ND cuboids
    for (level = 1; level <= totalLevels; level++) {
        partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);

        allRDDs[level] = allRDDs[level - 1].flatMapToPair(flatMapFunction).reduceByKey(reducerFunction2, partition)
                .persist(storageLevel);
        allRDDs[level - 1].unpersist(false);
        if (envConfig.isSparkSanityCheckEnabled() == true) {
            sanityCheck(allRDDs[level], totalCount, level, cubeStatsReader, countMeasureIndex);
        }
        saveToHDFS(allRDDs[level], metaUrl, cubeName, cubeSegment, outputPath, level, job, envConfig);
    }
    allRDDs[totalLevels].unpersist(false);
    logger.info("Finished on calculating all level cuboids.");
    logger.info("HDFS: Number of bytes written=" + jobListener.metrics.getBytesWritten());
    //HadoopUtil.deleteHDFSMeta(metaUrl);
}
 
Example #7
Source File: TranslationUtils.java    From beam with Apache License 2.0
/** {@link KV} to pair flatmap function. */
public static <K, V> PairFlatMapFunction<Iterator<KV<K, V>>, K, V> toPairFlatMapFunction() {
  return itr -> Iterators.transform(itr, kv -> new Tuple2<>(kv.getKey(), kv.getValue()));
}
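A minimal usage sketch (the kvPartitions RDD of Beam KV values is assumed for illustration):

// Hypothetical: kvPartitions is a JavaRDD<KV<String, Long>>.
JavaPairRDD<String, Long> pairs =
    kvPartitions.mapPartitionsToPair(
        TranslationUtils.<String, Long>toPairFlatMapFunction());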
 
Example #8
Source File: SparkCubingByLayer.java    From kylin with Apache License 2.0
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String hiveTable = optionsHelper.getOptionValue(OPTION_INPUT_TABLE);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);

    Class[] kryoClassArray = new Class[] { Class.forName("scala.reflect.ClassTag$$anon$1") };

    SparkConf conf = new SparkConf().setAppName("Cubing for:" + cubeName + " segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    JavaSparkContext sc = new JavaSparkContext(conf);
    sc.sc().addSparkListener(jobListener);
    HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));
    SparkUtil.modifySparkHadoopConfiguration(sc.sc(), AbstractHadoopJob.loadKylinConfigFromHdfs(new SerializableConfiguration(sc.hadoopConfiguration()), metaUrl)); // set dfs.replication and enable compress
    final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration());
    KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

    final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);
    final CubeDesc cubeDesc = cubeInstance.getDescriptor();
    final CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);

    logger.info("RDD input path: {}", inputPath);
    logger.info("RDD Output path: {}", outputPath);

    final Job job = Job.getInstance(sConf.get());
    SparkUtil.setHadoopConfForCuboid(job, cubeSegment, metaUrl);

    int countMeasureIndex = 0;
    for (MeasureDesc measureDesc : cubeDesc.getMeasures()) {
        if (measureDesc.getFunction().isCount() == true) {
            break;
        } else {
            countMeasureIndex++;
        }
    }
    final CubeStatsReader cubeStatsReader = new CubeStatsReader(cubeSegment, envConfig);
    boolean[] needAggr = new boolean[cubeDesc.getMeasures().size()];
    boolean allNormalMeasure = true;
    for (int i = 0; i < cubeDesc.getMeasures().size(); i++) {
        needAggr[i] = !cubeDesc.getMeasures().get(i).getFunction().getMeasureType().onlyAggrInBaseCuboid();
        allNormalMeasure = allNormalMeasure && needAggr[i];
    }
    logger.info("All measure are normal (agg on all cuboids) ? : " + allNormalMeasure);
    StorageLevel storageLevel = StorageLevel.fromString(envConfig.getSparkStorageLevel());

    boolean isSequenceFile = JoinedFlatTable.SEQUENCEFILE.equalsIgnoreCase(envConfig.getFlatTableStorageFormat());

    final JavaPairRDD<ByteArray, Object[]> encodedBaseRDD = SparkUtil
            .hiveRecordInputRDD(isSequenceFile, sc, inputPath, hiveTable)
            .mapToPair(new EncodeBaseCuboid(cubeName, segmentId, metaUrl, sConf));

    Long totalCount = 0L;
    if (envConfig.isSparkSanityCheckEnabled()) {
        totalCount = encodedBaseRDD.count();
    }

    final BaseCuboidReducerFunction2 baseCuboidReducerFunction = new BaseCuboidReducerFunction2(cubeName, metaUrl,
            sConf);
    BaseCuboidReducerFunction2 reducerFunction2 = baseCuboidReducerFunction;
    if (allNormalMeasure == false) {
        reducerFunction2 = new CuboidReducerFunction2(cubeName, metaUrl, sConf, needAggr);
    }

    final int totalLevels = cubeSegment.getCuboidScheduler().getBuildLevel();
    JavaPairRDD<ByteArray, Object[]>[] allRDDs = new JavaPairRDD[totalLevels + 1];
    int level = 0;
    int partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);

    // aggregate to calculate base cuboid
    allRDDs[0] = encodedBaseRDD.reduceByKey(baseCuboidReducerFunction, partition).persist(storageLevel);

    saveToHDFS(allRDDs[0], metaUrl, cubeName, cubeSegment, outputPath, 0, job, envConfig);

    PairFlatMapFunction flatMapFunction = new CuboidFlatMap(cubeName, segmentId, metaUrl, sConf);
    // aggregate to ND cuboids
    for (level = 1; level <= totalLevels; level++) {
        partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);

        allRDDs[level] = allRDDs[level - 1].flatMapToPair(flatMapFunction).reduceByKey(reducerFunction2, partition)
                .persist(storageLevel);
        allRDDs[level - 1].unpersist(false);
        if (envConfig.isSparkSanityCheckEnabled() == true) {
            sanityCheck(allRDDs[level], totalCount, level, cubeStatsReader, countMeasureIndex);
        }
        saveToHDFS(allRDDs[level], metaUrl, cubeName, cubeSegment, outputPath, level, job, envConfig);
    }
    allRDDs[totalLevels].unpersist(false);
    logger.info("Finished on calculating all level cuboids.");
    logger.info("HDFS: Number of bytes written=" + jobListener.metrics.getBytesWritten());
    //HadoopUtil.deleteHDFSMeta(metaUrl);
}
 
Example #9
Source File: TieredSpatialJoin.java    From geowave with Apache License 2.0
private JavaPairRDD<ByteArray, Tuple2<GeoWaveInputKey, Geometry>> reprojectToTier(
    final JavaRDD<Tuple2<GeoWaveInputKey, Geometry>> higherTiers,
    final byte targetTierId,
    final Broadcast<TieredSFCIndexStrategy> broadcastStrategy,
    final double bufferDistance,
    final HashPartitioner partitioner) {
  return higherTiers.flatMapToPair(
      (PairFlatMapFunction<Tuple2<GeoWaveInputKey, Geometry>, ByteArray, Tuple2<GeoWaveInputKey, Geometry>>) t -> {
        final TieredSFCIndexStrategy index = broadcastStrategy.value();
        final SubStrategy[] strategies = index.getSubStrategies();
        SingleTierSubStrategy useStrat = null;
        for (final SubStrategy strat : strategies) {
          final SingleTierSubStrategy tierStrat =
              (SingleTierSubStrategy) strat.getIndexStrategy();
          if (targetTierId == tierStrat.tier) {
            useStrat = tierStrat;
            break;
          }
        }
        final Geometry geom = t._2;
        final Envelope internalEnvelope = geom.getEnvelopeInternal();
        internalEnvelope.expandBy(bufferDistance);
        final MultiDimensionalNumericData boundsRange =
            GeometryUtils.getBoundsFromEnvelope(internalEnvelope);

        InsertionIds insertIds = useStrat.getInsertionIds(boundsRange, 80);

        if (bufferDistance == 0.0) {
          insertIds = RDDUtils.trimIndexIds(insertIds, geom, index);
        }

        final List<Tuple2<ByteArray, Tuple2<GeoWaveInputKey, Geometry>>> reprojected =
            Lists.newArrayListWithCapacity(insertIds.getSize());
        for (final byte[] id : insertIds.getCompositeInsertionIds()) {
          final Tuple2<ByteArray, Tuple2<GeoWaveInputKey, Geometry>> indexPair =
              new Tuple2<>(new ByteArray(id), t);
          reprojected.add(indexPair);
        }
        return reprojected.iterator();
      }).partitionBy(partitioner).persist(StorageLevel.MEMORY_AND_DISK_SER());
}
 
Example #10
Source File: GeoWaveIndexedRDD.java    From geowave with Apache License 2.0
public JavaPairRDD<ByteArray, Tuple2<GeoWaveInputKey, SimpleFeature>> getIndexedFeatureRDD(
    final double bufferAmount) {
  verifyParameters();
  if (!geowaveRDD.isLoaded()) {
    LOGGER.error("Must provide a loaded RDD.");
    return null;
  }
  if (rawFeatureRDD == null) {
    final JavaPairRDD<ByteArray, Tuple2<GeoWaveInputKey, SimpleFeature>> indexedData =
        geowaveRDD.getRawRDD().flatMapToPair(
            new PairFlatMapFunction<Tuple2<GeoWaveInputKey, SimpleFeature>, ByteArray, Tuple2<GeoWaveInputKey, SimpleFeature>>() {
              /**
               *
               */
              private static final long serialVersionUID = 1L;

              @Override
              public Iterator<Tuple2<ByteArray, Tuple2<GeoWaveInputKey, SimpleFeature>>> call(
                  final Tuple2<GeoWaveInputKey, SimpleFeature> t) throws Exception {

                // Flattened output array.
                final List<Tuple2<ByteArray, Tuple2<GeoWaveInputKey, SimpleFeature>>> result =
                    new ArrayList<>();

                // Pull feature to index from tuple
                final SimpleFeature inputFeature = t._2;
                // If we are dealing with null or empty
                // geometry we can't properly compare this
                // feature.
                final Geometry geom = (Geometry) inputFeature.getDefaultGeometry();
                if (geom == null) {
                  return Collections.emptyIterator();
                }

                final Envelope internalEnvelope = geom.getEnvelopeInternal();
                if (internalEnvelope.isNull()) {
                  return Collections.emptyIterator();
                }
                // If we have to buffer geometry for
                // predicate expand bounds
                internalEnvelope.expandBy(bufferAmount);

                // Get data range from expanded envelope
                final MultiDimensionalNumericData boundsRange =
                    GeometryUtils.getBoundsFromEnvelope(internalEnvelope);

                final NumericIndexStrategy index = indexStrategy.value();
                InsertionIds insertIds = index.getInsertionIds(boundsRange, 80);

                // If we didn't expand the envelope for
                // buffering we can trim the indexIds by the
                // geometry
                if (bufferAmount == 0.0) {
                  insertIds = RDDUtils.trimIndexIds(insertIds, geom, index);
                }

                for (final Iterator<byte[]> iter =
                    insertIds.getCompositeInsertionIds().iterator(); iter.hasNext();) {
                  final byte[] id = iter.next();

                  final Tuple2<GeoWaveInputKey, SimpleFeature> valuePair =
                      new Tuple2<>(t._1, inputFeature);
                  final Tuple2<ByteArray, Tuple2<GeoWaveInputKey, SimpleFeature>> indexPair =
                      new Tuple2<>(new ByteArray(id), valuePair);
                  result.add(indexPair);
                }

                return result.iterator();
              }
            });
    rawFeatureRDD = indexedData;
  }

  return rawFeatureRDD;
}