org.apache.spark.HashPartitioner Java Examples

The following examples show how to use org.apache.spark.HashPartitioner. They are drawn from several open-source projects; each example lists its source file, originating project, and license.
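
Before the project examples, here is a minimal, standalone sketch of the core pattern they all share: build a HashPartitioner with a fixed partition count and pass it to partitionBy on a JavaPairRDD. The class name and application name below are illustrative only; HashPartitioner assigns each key to key.hashCode() modulo the partition count (made non-negative), so equal keys always land in the same partition.

import java.util.Arrays;

import org.apache.spark.HashPartitioner;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class HashPartitionerSketch {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext("local[*]", "hash-partitioner-sketch");
        JavaPairRDD<String, Integer> pairs = sc.parallelizePairs(Arrays.asList(
                new Tuple2<>("a", 1), new Tuple2<>("b", 2), new Tuple2<>("c", 3)));

        // Route each key to partition (key.hashCode() mod 4), made non-negative.
        HashPartitioner partitioner = new HashPartitioner(4);
        JavaPairRDD<String, Integer> partitioned = pairs.partitionBy(partitioner);

        System.out.println(partitioned.getNumPartitions()); // 4
        System.out.println(partitioner.getPartition("a"));  // the partition index chosen for key "a"
        sc.stop();
    }
}
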
Example #1
Source File: NGlobalDictionaryV2Test.java    From kylin-on-parquet-v2 with Apache License 2.0
private void runWithLocalBuildGlobalDict(NGlobalDictionaryV2 dict, List<String> stringSet) throws IOException {
    KylinConfig config = KylinConfig.getInstanceFromEnv();
    dict.prepareWrite();
    HashPartitioner partitioner = new HashPartitioner(BUCKET_SIZE);
    Map<Integer, List<String>> vmap = new HashMap<>();
    for (int i = 0; i < BUCKET_SIZE; i++) {
        vmap.put(i, Lists.newArrayList());
    }
    for (String string : stringSet) {
        int bucketId = partitioner.getPartition(string);
        vmap.get(bucketId).add(string);
    }

    for (Map.Entry<Integer, List<String>> entry : vmap.entrySet()) {
        NBucketDictionary bucketDict = dict.loadBucketDictionary(entry.getKey());
        for (String s : entry.getValue()) {
            bucketDict.addRelativeValue(s);
        }
        bucketDict.saveBucketDict(entry.getKey());
    }

    dict.writeMetaDict(BUCKET_SIZE, config.getGlobalDictV2MaxVersions(), config.getGlobalDictV2VersionTTL());
}
 
Example #2
Source File: InstancePartitioner.java    From rdf2x with Apache License 2.0
/**
 * Partition instances by the specified partitioning (e.g. by instance type)
 *
 * @param instances RDD of instances to partition
 * @return partitioned RDD if requested, original RDD if no partitioning is specified
 */
public JavaRDD<Instance> partition(JavaRDD<Instance> instances) {
    if (!config.isRepartitionByType()) {
        return instances;
    }
    log.info("Getting counts by type hash");
    Map<Integer, Long> typeCounts = getApproximateTypeHashCounts(instances);
    int numPartitions = instances.getNumPartitions();
    long totalInstances = instances.count();
    long instancesPerPartition = totalInstances / numPartitions + 1;

    JavaPairRDD<Integer, Instance> instanceWithPartitions = instances.mapToPair(instance -> {
        int typeHash = getTypeHash(instance);
        int splitIncrement = getSplitIncrement(instance.getId(), typeCounts.get(typeHash), instancesPerPartition);
        return new Tuple2<>(typeHash + splitIncrement, instance);
    });

    log.info("Partitioning instances by type");
    return instanceWithPartitions
            .partitionBy(new HashPartitioner(numPartitions))
            .values();
}
 
Example #3
Source File: CoverageModelEMWorkspace.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
/**
 * Instantiate compute block(s). If Spark is disabled, a single {@link CoverageModelEMComputeBlock} is
 * instantiated. Otherwise, a {@link JavaPairRDD} of compute nodes will be created.
 */
private void instantiateWorkers() {
    if (sparkContextIsAvailable) {
        /* initialize the RDD */
        logger.info("Initializing an RDD of compute blocks");
        computeRDD = ctx.parallelizePairs(targetBlockStream()
                .map(tb -> new Tuple2<>(tb, new CoverageModelEMComputeBlock(tb, numSamples, numLatents, ardEnabled)))
                .collect(Collectors.toList()), numTargetBlocks)
                .partitionBy(new HashPartitioner(numTargetBlocks))
                .cache();
    } else {
        logger.info("Initializing a local compute block");
        localComputeBlock = new CoverageModelEMComputeBlock(targetBlocks.get(0), numSamples, numLatents, ardEnabled);
    }
    prevCheckpointedComputeRDD = null;
    cacheCallCounter = 0;
}
 
Example #4
Source File: NGlobalDictionaryV2Test.java    From kylin-on-parquet-v2 with Apache License 2.0
private void runWithSparkBuildGlobalDict(NGlobalDictionaryV2 dict, List<String> stringSet) throws IOException {
    KylinConfig config = KylinConfig.getInstanceFromEnv();
    dict.prepareWrite();
    List<Row> rowList = Lists.newLinkedList();
    for (String str : stringSet) {
        rowList.add(RowFactory.create(str));
    }
    Dataset<Row> ds = ss.createDataFrame(rowList,
            new StructType(new StructField[] { DataTypes.createStructField("col1", DataTypes.StringType, true) }));
    ds.toJavaRDD().mapToPair((PairFunction<Row, String, String>) row -> {
        if (row.get(0) == null)
            return new Tuple2<>(null, null);
        return new Tuple2<>(row.get(0).toString(), null);
    }).sortByKey().partitionBy(new HashPartitioner(BUCKET_SIZE)).mapPartitionsWithIndex(
            (Function2<Integer, Iterator<Tuple2<String, String>>, Iterator<Object>>) (bucketId, tuple2Iterator) -> {
                NBucketDictionary bucketDict = dict.loadBucketDictionary(bucketId);
                while (tuple2Iterator.hasNext()) {
                    Tuple2<String, String> tuple2 = tuple2Iterator.next();
                    bucketDict.addRelativeValue(tuple2._1);
                }
                bucketDict.saveBucketDict(bucketId);
                return Lists.newArrayList().iterator();
            }, true).count();

    dict.writeMetaDict(BUCKET_SIZE, config.getGlobalDictV2MaxVersions(), config.getGlobalDictV2VersionTTL());
}
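
The Spark path above relies on the fact that, after partitionBy(new HashPartitioner(BUCKET_SIZE)), the partition index handed to mapPartitionsWithIndex equals the bucket id that getPartition returns for every key in that partition. Below is a minimal standalone sketch of that pattern, with illustrative keys and class names rather than Kylin code:

import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;

import org.apache.spark.HashPartitioner;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import scala.Tuple2;

public class BucketByPartitionSketch {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext("local[*]", "bucket-by-partition-sketch");
        JavaPairRDD<String, String> keyed = sc.parallelizePairs(Arrays.asList(
                new Tuple2<>("a", ""), new Tuple2<>("b", ""), new Tuple2<>("c", "")));

        keyed.partitionBy(new HashPartitioner(3))
                .mapPartitionsWithIndex(
                        (Function2<Integer, Iterator<Tuple2<String, String>>, Iterator<String>>) (bucketId, it) -> {
                            int n = 0;
                            while (it.hasNext()) {
                                it.next();
                                n++;
                            }
                            // bucketId here is the same value HashPartitioner.getPartition would return.
                            System.out.println("bucket " + bucketId + " holds " + n + " keys");
                            return Collections.emptyIterator();
                        }, true)
                .count(); // force evaluation
        sc.stop();
    }
}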
 
Example #5
Source File: CoverageModelEMWorkspace.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
/**
 * Creates a {@link JavaPairRDD} of (sample index, emission data list)
 * @return a {@link JavaPairRDD} of (sample index, emission data list) pairs
 */
private JavaPairRDD<Integer, List<CoverageModelCopyRatioEmissionData>> fetchCopyRatioEmissionDataSpark() {
    final int numSamples = this.numSamples;

    return computeRDD
            /* flat map each worker to a list of [sample index, [target block, emission data on target block]] pairs */
            .flatMapToPair(tuple -> {
                final LinearlySpacedIndexBlock tb = tuple._1;
                final CoverageModelEMComputeBlock cb = tuple._2;
                final List<List<CoverageModelCopyRatioEmissionData>> el = cb.getSampleCopyRatioLatentPosteriorData();
                return IntStream.range(0, numSamples)
                        .mapToObj(si -> new Tuple2<>(si, new Tuple2<>(tb, el.get(si))))
                        .iterator();
            })
            /* combine elements with the same sample index */
            .combineByKey(
                    /* create a new list */
                    Collections::singletonList,
                    /* recipe to add an element to the list */
                    (list, element) -> Stream.concat(list.stream(), Stream.of(element))
                            .collect(Collectors.toList()),
                    /* recipe to concatenate two lists */
                    (list1, list2) -> Stream.concat(list1.stream(), list2.stream()).collect(Collectors.toList()),
                    /* repartition with respect to sample indices */
                    new HashPartitioner(numSamples))
            /* sort the [target block, emission data on target block] chunks for each sample into a single list */
            .mapValues(emissionBlocksList -> emissionBlocksList.stream() /* for each partition ... */
                    /* sort the data blocks */
                    .sorted(Comparator.comparingInt(t -> t._1.getBegIndex()))
                    /* remove the LinearlySpacedIndexBlock keys from the sorted emissionBlocksList */
                    .map(p -> p._2)
                    /* flatten */
                    .flatMap(List::stream)
                    /* collect as a single list */
                    .collect(Collectors.toList()));
}
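
The combineByKey call above follows the usual create/merge/combine recipe, with a HashPartitioner as the fourth argument so that all chunks for the same sample index end up in one partition. Here is a stripped-down standalone sketch of that recipe, with illustrative names rather than GATK code, grouping values into per-key lists:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.spark.HashPartitioner;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class CombineByKeySketch {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext("local[*]", "combine-by-key-sketch");
        JavaPairRDD<Integer, String> pairs = sc.parallelizePairs(Arrays.asList(
                new Tuple2<>(0, "a"), new Tuple2<>(1, "b"), new Tuple2<>(0, "c")));

        JavaPairRDD<Integer, List<String>> grouped = pairs.combineByKey(
                /* create a new list from the first value seen for a key */
                v -> new ArrayList<>(Arrays.asList(v)),
                /* add a value to an existing list */
                (list, v) -> { list.add(v); return list; },
                /* concatenate two partial lists */
                (l1, l2) -> { l1.addAll(l2); return l1; },
                /* repartition by key hash */
                new HashPartitioner(2));

        grouped.collect().forEach(System.out::println); // e.g. (0,[a, c]) and (1,[b])
        sc.stop();
    }
}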
 
Example #6
Source File: SparkUtils.java    From deeplearning4j with Apache License 2.0
/**
 * Randomly shuffle the examples in each DataSet object, and recombine them into new DataSet objects
 * with the specified batch size
 *
 * @param rdd DataSets to shuffle/recombine
 * @param newBatchSize New batch size for the DataSet objects, after shuffling/recombining
 * @param numPartitions Number of partitions to use when splitting/recombining
 * @return A new {@link JavaRDD<DataSet>}, with the examples shuffled and recombined into DataSet objects of the new batch size
 */
public static JavaRDD<DataSet> shuffleExamples(JavaRDD<DataSet> rdd, int newBatchSize, int numPartitions) {
    //Step 1: split into individual examples, mapping to a pair RDD (random key in range 0 to numPartitions)

    JavaPairRDD<Integer, DataSet> singleExampleDataSets =
                    rdd.flatMapToPair(new SplitDataSetExamplesPairFlatMapFunction(numPartitions));

    //Step 2: repartition according to the random keys
    singleExampleDataSets = singleExampleDataSets.partitionBy(new HashPartitioner(numPartitions));

    //Step 3: Recombine
    return singleExampleDataSets.values().mapPartitions(new BatchDataSetsFunction(newBatchSize));
}
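
A hedged usage sketch for the utility above. The toy DataSet objects, the sizes, and the org.deeplearning4j.spark.util package path assumed for SparkUtils are illustrative and may differ from your setup:

import java.util.Arrays;
import java.util.List;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.deeplearning4j.spark.util.SparkUtils; // assumed package path for the class above
import org.nd4j.linalg.dataset.DataSet;
import org.nd4j.linalg.factory.Nd4j;

public class ShuffleExamplesSketch {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext("local[*]", "shuffle-examples-sketch");

        // Two toy DataSet objects with 8 examples each (3 features, 2 labels per example).
        List<DataSet> data = Arrays.asList(
                new DataSet(Nd4j.rand(8, 3), Nd4j.rand(8, 2)),
                new DataSet(Nd4j.rand(8, 3), Nd4j.rand(8, 2)));
        JavaRDD<DataSet> rdd = sc.parallelize(data);

        // Re-batch the 16 examples into DataSets of (at most) 4, spread over 2 partitions.
        JavaRDD<DataSet> rebatched = SparkUtils.shuffleExamples(rdd, 4, 2);
        System.out.println(rebatched.count()); // typically 4-6 DataSets, depending on the random split
        sc.stop();
    }
}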
 
Example #7
Source File: TieredSpatialJoin.java    From geowave with Apache License 2.0
private JavaPairRDD<GeoWaveInputKey, ByteArray> joinAndCompareTiers(
    final JavaPairRDD<ByteArray, Tuple2<GeoWaveInputKey, Geometry>> leftTier,
    final JavaPairRDD<ByteArray, Tuple2<GeoWaveInputKey, Geometry>> rightTier,
    final Broadcast<GeomFunction> geomPredicate,
    final int highestPartitionCount,
    final HashPartitioner partitioner) {
  // Cogroup groups on same tier ByteArrayId and pairs them into Iterable
  // sets.
  JavaPairRDD<ByteArray, Tuple2<Iterable<Tuple2<GeoWaveInputKey, Geometry>>, Iterable<Tuple2<GeoWaveInputKey, Geometry>>>> joinedTiers =
      leftTier.cogroup(rightTier, partitioner);

  // Filter only the pairs that have data on both sides, bucket strategy
  // should have been accounted for by this point.
  // We need to go through the pairs and test each feature against each
  // other
  // End with a combined RDD for that tier.
  joinedTiers =
      joinedTiers.filter(t -> t._2._1.iterator().hasNext() && t._2._2.iterator().hasNext());

  final JavaPairRDD<GeoWaveInputKey, ByteArray> finalMatches =
      joinedTiers.flatMapValues(
          (Function<Tuple2<Iterable<Tuple2<GeoWaveInputKey, Geometry>>, Iterable<Tuple2<GeoWaveInputKey, Geometry>>>, Iterable<GeoWaveInputKey>>) t -> {
            final GeomFunction predicate = geomPredicate.value();

            final HashSet<GeoWaveInputKey> results = Sets.newHashSet();
            for (final Tuple2<GeoWaveInputKey, Geometry> leftTuple : t._1) {
              for (final Tuple2<GeoWaveInputKey, Geometry> rightTuple : t._2) {
                if (predicate.call(leftTuple._2, rightTuple._2)) {
                  results.add(leftTuple._1);
                  results.add(rightTuple._1);
                }
              }
            }
            return results;
          }).mapToPair(Tuple2::swap).reduceByKey(partitioner, (id1, id2) -> id1).persist(
              StorageLevel.MEMORY_ONLY_SER());

  return finalMatches;
}
 
Example #8
Source File: SourceRDD.java    From beam with Apache License 2.0
public Unbounded(
    SparkContext sc,
    SerializablePipelineOptions options,
    MicrobatchSource<T, CheckpointMarkT> microbatchSource,
    int initialNumPartitions) {
  super(sc, NIL, JavaSparkContext$.MODULE$.fakeClassTag());
  this.options = options;
  this.microbatchSource = microbatchSource;
  this.partitioner = new HashPartitioner(initialNumPartitions);
}
 
Example #9
Source File: SparkBatchPortablePipelineTranslator.java    From beam with Apache License 2.0
@Nullable
private static Partitioner getPartitioner(SparkTranslationContext context) {
  Long bundleSize =
      context.serializablePipelineOptions.get().as(SparkPipelineOptions.class).getBundleSize();
  return (bundleSize > 0)
      ? null
      : new HashPartitioner(context.getSparkContext().defaultParallelism());
}
 
Example #10
Source File: TransformTranslator.java    From beam with Apache License 2.0
@Nullable
private static Partitioner getPartitioner(EvaluationContext context) {
  Long bundleSize =
      context.getSerializableOptions().get().as(SparkPipelineOptions.class).getBundleSize();
  return (bundleSize > 0)
      ? null
      : new HashPartitioner(context.getSparkContext().defaultParallelism());
}
 
Example #11
Source File: TestPartitionerFactory.java    From envelope with Apache License 2.0
@Test
public void testHash() {
  Map<String, Object> configMap = Maps.newHashMap();
  configMap.put("type", "hash");
  
  JavaPairRDD<Row, Row> base = getDummyRDD(10);
  Config config = ConfigFactory.parseMap(configMap);
  Partitioner p = PartitionerFactory.create(config, base);
  
  assertTrue(p instanceof HashPartitioner);
  assertEquals(p.numPartitions(), 10);
}
 
Example #12
Source File: GroupNonMergingWindowsFunctions.java    From beam with Apache License 2.0
private static <K, V> Partitioner getPartitioner(
    Partitioner partitioner, JavaRDD<WindowedValue<KV<K, V>>> rdd) {
  return partitioner == null ? new HashPartitioner(rdd.getNumPartitions()) : partitioner;
}
 
Example #13
Source File: CoverageModelEMWorkspace.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
/**
 * A generic function for dispatching a blockified list of objects to their corresponding compute nodes
 *
 * If Spark is enabled:
 *
 *      Joins an instance of {@code List<Tuple2<LinearlySpacedIndexBlock, V>>} with {@link #computeRDD}, calls the provided
 *      mapper {@code mapper} on the joined RDD, and replaces the reference to the old RDD with the new RDD.
 *
 * If Spark is disabled:
 *
 *      Only a single target-space block is assumed, such that {@code data} is a singleton. The map function
 *      {@code mapper} will be called on the value contained in {@code data} and {@link #localComputeBlock}, and
 *      the old instance of {@link CoverageModelEMComputeBlock} is replaced with the new instance returned
 *      by {@code mapper}.
 *
 * @param data the list to be joined and mapped together with the compute block(s)
 * @param mapper a binary map function that takes a compute block together with an object of type {@code V} and
 *               returns a new compute block
 * @param <V> the type of the object to be broadcast
 */
@UpdatesRDD
private <V> void joinWithWorkersAndMap(@Nonnull final List<Tuple2<LinearlySpacedIndexBlock, V>> data,
                                       @Nonnull final Function<Tuple2<CoverageModelEMComputeBlock, V>, CoverageModelEMComputeBlock> mapper) {
    if (sparkContextIsAvailable) {
        final JavaPairRDD<LinearlySpacedIndexBlock, V> newRDD =
                ctx.parallelizePairs(data, numTargetBlocks).partitionBy(new HashPartitioner(numTargetBlocks));
        computeRDD = computeRDD.join(newRDD).mapValues(mapper);
    } else {
        try {
            Utils.validateArg(data.size() == 1, "Only a single data block is expected in the local mode");
            localComputeBlock = mapper.call(new Tuple2<>(localComputeBlock, data.get(0)._2));
        } catch (Exception e) {
            throw new RuntimeException("Can not apply the map function to the local compute block: " + e.getMessage());
        }
    }
}
 
Example #14
Source File: PSFilter.java    From gatk with BSD 3-Clause "New" or "Revised" License
static JavaRDD<GATKRead> repartitionReadsByName(final JavaRDD<GATKRead> reads, final int numPartitions) {
    //Shuffle reads into partitions by read name hash code
    return reads.mapToPair(read -> new Tuple2<>(read.getName(), read))
            .partitionBy(new HashPartitioner(numPartitions))
            .map(Tuple2::_2);
}
 
Example #15
Source File: TieredSpatialJoin.java    From geowave with Apache License 2.0
private JavaPairRDD<ByteArray, Tuple2<GeoWaveInputKey, Geometry>> reprojectToTier(
    final JavaRDD<Tuple2<GeoWaveInputKey, Geometry>> higherTiers,
    final byte targetTierId,
    final Broadcast<TieredSFCIndexStrategy> broadcastStrategy,
    final double bufferDistance,
    final HashPartitioner partitioner) {
  return higherTiers.flatMapToPair(
      (PairFlatMapFunction<Tuple2<GeoWaveInputKey, Geometry>, ByteArray, Tuple2<GeoWaveInputKey, Geometry>>) t -> {
        final TieredSFCIndexStrategy index = broadcastStrategy.value();
        final SubStrategy[] strategies = index.getSubStrategies();
        SingleTierSubStrategy useStrat = null;
        for (final SubStrategy strat : strategies) {
          final SingleTierSubStrategy tierStrat =
              (SingleTierSubStrategy) strat.getIndexStrategy();
          if (targetTierId == tierStrat.tier) {
            useStrat = tierStrat;
            break;
          }
        }
        final Geometry geom = t._2;
        final Envelope internalEnvelope = geom.getEnvelopeInternal();
        internalEnvelope.expandBy(bufferDistance);
        final MultiDimensionalNumericData boundsRange =
            GeometryUtils.getBoundsFromEnvelope(internalEnvelope);

        InsertionIds insertIds = useStrat.getInsertionIds(boundsRange, 80);

        if (bufferDistance == 0.0) {
          insertIds = RDDUtils.trimIndexIds(insertIds, geom, index);
        }

        final List<Tuple2<ByteArray, Tuple2<GeoWaveInputKey, Geometry>>> reprojected =
            Lists.newArrayListWithCapacity(insertIds.getSize());
        for (final byte[] id : insertIds.getCompositeInsertionIds()) {
          final Tuple2<ByteArray, Tuple2<GeoWaveInputKey, Geometry>> indexPair =
              new Tuple2<>(new ByteArray(id), t);
          reprojected.add(indexPair);
        }
        return reprojected.iterator();
      }).partitionBy(partitioner).persist(StorageLevel.MEMORY_AND_DISK_SER());
}
 
Example #16
Source File: HashingBalancedPartitionerTest.java    From deeplearning4j with Apache License 2.0
@Test
public void hashingBalancedPartitionerDoesBalance() {
    // partitionWeightsByClass = [[1.714, .429, .857], [0.9, 0.6, 1.5]]
    List<Double> reds = Arrays.asList(1.714D, 0.429D, .857D);
    List<Double> blues = Arrays.asList(0.9D, 0.6D, 1.5D);
    List<List<Double>> partitionWeights = Arrays.asList(reds, blues);

    HashingBalancedPartitioner hbp = new HashingBalancedPartitioner(partitionWeights);
    List<Tuple2<Integer, String>> l = new ArrayList<>();

    for (int i = 0; i < 4; i++) {
        l.add(new Tuple2<Integer, String>(0, "red"));
    }
    for (int i = 0; i < 3; i++) {
        l.add(new Tuple2<Integer, String>(0, "blue"));
    }
    for (int i = 0; i < 1; i++) {
        l.add(new Tuple2<Integer, String>(1, "red"));
    }
    for (int i = 0; i < 2; i++) {
        l.add(new Tuple2<Integer, String>(1, "blue"));
    }
    for (int i = 0; i < 2; i++) {
        l.add(new Tuple2<Integer, String>(2, "red"));
    }
    for (int i = 0; i < 5; i++) {
        l.add(new Tuple2<Integer, String>(2, "blue"));
    }
    // This should give exactly the sought distribution
    JavaPairRDD<Integer, String> rdd =
            JavaPairRDD.fromJavaRDD(sc.parallelize(l)).partitionBy(new HashPartitioner(3));

    // Let's reproduce UIDs
    JavaPairRDD<Tuple2<Long, Integer>, String> indexedRDD = rdd.zipWithUniqueId().mapToPair(
            new PairFunction<Tuple2<Tuple2<Integer, String>, Long>, Tuple2<Long, Integer>, String>() {
                @Override
                public Tuple2<Tuple2<Long, Integer>, String> call(
                        Tuple2<Tuple2<Integer, String>, Long> payLoadNuid) {
                    Long uid = payLoadNuid._2();
                    String value = payLoadNuid._1()._2();
                    Integer elemClass = value.equals("red") ? 0 : 1;
                    return new Tuple2<Tuple2<Long, Integer>, String>(
                            new Tuple2<Long, Integer>(uid, elemClass), value);
                }
            });

    List<Tuple2<Tuple2<Long, Integer>, String>> testList = indexedRDD.collect();

    int[][] colorCountsByPartition = new int[3][2];
    for (final Tuple2<Tuple2<Long, Integer>, String> val : testList) {
        Integer partition = hbp.getPartition(val._1());

        if (val._2().equals("red"))
            colorCountsByPartition[partition][0] += 1;
        else
            colorCountsByPartition[partition][1] += 1;
    }

    for (int i = 0; i < 3; i++) {
        // avg red per partition : 2.33
        assertTrue(colorCountsByPartition[i][0] >= 1 && colorCountsByPartition[i][0] < 4);
        // avg blue per partition : 3.33
        assertTrue(colorCountsByPartition[i][1] >= 2 && colorCountsByPartition[i][1] < 5);
    }
}
 
Example #17
Source File: SparkUtils.java    From systemds with Apache License 2.0
/**
 * Indicates if the input RDD is hash partitioned, i.e., it has a partitioner
 * of type {@code org.apache.spark.HashPartitioner}.
 * 
 * @param in input JavaPairRDD
 * @return true if input is hash partitioned
 */
public static boolean isHashPartitioned(JavaPairRDD<?,?> in) {
	return !in.rdd().partitioner().isEmpty()
		&& in.rdd().partitioner().get() instanceof HashPartitioner;
}
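
A short standalone usage sketch for the check above (not from systemds); the import path assumed for SparkUtils may differ between versions:

import java.util.Arrays;

import org.apache.spark.HashPartitioner;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.sysds.runtime.instructions.spark.utils.SparkUtils; // assumed package path for the class above
import scala.Tuple2;

public class IsHashPartitionedSketch {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext("local[*]", "is-hash-partitioned-sketch");
        JavaPairRDD<Integer, String> pairs = sc.parallelizePairs(Arrays.asList(
                new Tuple2<>(1, "a"), new Tuple2<>(2, "b")));

        // No partitioner yet -> false
        System.out.println(SparkUtils.isHashPartitioned(pairs));
        // Explicitly hash partitioned -> true
        System.out.println(SparkUtils.isHashPartitioned(pairs.partitionBy(new HashPartitioner(2))));
        sc.stop();
    }
}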
 