org.apache.spark.storage.StorageLevel Java Examples

The following examples show how to use org.apache.spark.storage.StorageLevel, the class that describes how an RDD, DStream, or Dataset is cached: in memory and/or on disk, serialized or deserialized, and with or without replication. Each example is an excerpt from an open-source project; the source file, project, and license are noted above each snippet.
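Before the project excerpts, here is a minimal, self-contained sketch (not taken from any of the projects below; the class name is illustrative) showing the basic pattern: pick a StorageLevel constant, or parse one from its name with StorageLevel.fromString, pass it to persist(), and call unpersist() when the cached data is no longer needed.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.storage.StorageLevel;

public class StorageLevelQuickStart {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("storage-level-demo").setMaster("local[2]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaRDD<Integer> numbers = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));

            // Persist with an explicit level instead of cache()'s MEMORY_ONLY default.
            numbers.persist(StorageLevel.MEMORY_AND_DISK_SER());

            // Levels can also be parsed from their constant names.
            StorageLevel diskOnly = StorageLevel.fromString("DISK_ONLY");
            System.out.println("Parsed level: " + diskOnly);

            // Running an action materializes the cached data.
            System.out.println(numbers.count() + " elements cached at " + numbers.getStorageLevel());

            numbers.unpersist();
        }
    }
}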
Example #1
Source File: GarmadonSparkStorageStatusListenerIntegrationTest.java    From garmadon with Apache License 2.0
@Test
public void SparkStorageStatusListener_should_track_rdd_storage_status() throws InterruptedException {
    assertTrue(rdds.isEmpty());
    assertTrue(executors.isEmpty());

    //Memory
    JavaRDD rddMemory = makeRDD("MemRDD", StorageLevel.MEMORY_ONLY());
    rddMemory.collect();

    checkRddStorage(rddMemory.name(), equalTo(0L), greaterThan(0L), equalTo(0L));

    //Disk
    JavaRDD rddDisk = makeRDD("DiskRDD", StorageLevel.DISK_ONLY());
    rddDisk.collect();

    checkRddStorage(rddDisk.name(), equalTo(0L), equalTo(0L), greaterThan(0L));

    //OffHeap
    JavaRDD rddOffHeap = makeRDD("OffHeapRDD", StorageLevel.OFF_HEAP());
    rddOffHeap.collect();

    checkRddStorage(rddOffHeap.name(), greaterThan(0L), equalTo(0L), equalTo(0L));
}
 
Example #2
Source File: UnboundedDataset.java    From beam with Apache License 2.0
@Override
@SuppressWarnings("unchecked")
public void cache(String storageLevel, Coder<?> coder) {
  // we "force" MEMORY storage level in streaming
  if (!StorageLevel.fromString(storageLevel).equals(StorageLevel.MEMORY_ONLY_SER())) {
    LOG.warn(
        "Provided StorageLevel: {} is ignored for streams, using the default level: {}",
        storageLevel,
        StorageLevel.MEMORY_ONLY_SER());
  }
  // Caching can cause serialization, so we encode to bytes using the coder
  // more details in https://issues.apache.org/jira/browse/BEAM-2669
  Coder<WindowedValue<T>> wc = (Coder<WindowedValue<T>>) coder;
  this.dStream =
      dStream.map(CoderHelpers.toByteFunction(wc)).cache().map(CoderHelpers.fromByteFunction(wc));
}
 
Example #3
Source File: SparkStreamingPulsarReceiverTest.java    From pulsar with Apache License 2.0
@Test(dataProvider = "ServiceUrls")
public void testDefaultSettingsOfReceiver(String serviceUrl) {
    ConsumerConfigurationData<byte[]> consConf = new ConsumerConfigurationData<>();

    Set<String> set = new HashSet<>();
    set.add(TOPIC);
    consConf.setTopicNames(set);
    consConf.setSubscriptionName(SUBS);

    SparkStreamingPulsarReceiver receiver = new SparkStreamingPulsarReceiver(
        serviceUrl,
        consConf,
        new AuthenticationDisabled());

    assertEquals(receiver.storageLevel(), StorageLevel.MEMORY_AND_DISK_2());
    assertNotNull(consConf.getMessageListener());
}
 
Example #4
Source File: PersistedOutputRDD.java    From tinkerpop with Apache License 2.0
@Override
public void writeGraphRDD(final Configuration configuration, final JavaPairRDD<Object, VertexWritable> graphRDD) {
    if (!configuration.getBoolean(Constants.GREMLIN_SPARK_PERSIST_CONTEXT, false))
        LOGGER.warn("The SparkContext should be persisted in order for the RDD to persist across jobs. To do so, set " + Constants.GREMLIN_SPARK_PERSIST_CONTEXT + " to true");
    if (!configuration.containsKey(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION))
        throw new IllegalArgumentException("There is no provided " + Constants.GREMLIN_HADOOP_OUTPUT_LOCATION + " to write the persisted RDD to");
    SparkContextStorage.open(configuration).rm(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION));  // this might be bad because it unpersists the job RDD
    // determine which storage level to persist the RDD as with MEMORY_ONLY being the default cache()
    final StorageLevel storageLevel = StorageLevel.fromString(configuration.getString(Constants.GREMLIN_SPARK_PERSIST_STORAGE_LEVEL, "MEMORY_ONLY"));
    if (!configuration.getBoolean(Constants.GREMLIN_HADOOP_GRAPH_WRITER_HAS_EDGES, true))
        graphRDD.mapValues(vertex -> {
            vertex.get().dropEdges(Direction.BOTH);
            return vertex;
        }).setName(Constants.getGraphLocation(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION))).persist(storageLevel)
                // call action to eager store rdd
                .count();
    else
        graphRDD.setName(Constants.getGraphLocation(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION))).persist(storageLevel)
                // call action to eager store rdd
                .count();
    Spark.refresh(); // necessary to do really fast so the Spark GC doesn't clear out the RDD
}
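The comment above notes that MEMORY_ONLY is the default used by cache(). A small illustrative sketch of that equivalence (not part of the TinkerPop code; the class name is an assumption): for RDDs, cache() is shorthand for persist(StorageLevel.MEMORY_ONLY()), and a storage level can normally only be assigned once, so re-persisting at a different level fails.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.storage.StorageLevel;

public class CacheVersusPersist {
    public static void main(String[] args) {
        try (JavaSparkContext sc = new JavaSparkContext(
                new SparkConf().setAppName("cache-vs-persist").setMaster("local[2]"))) {
            JavaRDD<String> lines = sc.parallelize(Arrays.asList("a", "b", "c"));

            lines.cache(); // same as lines.persist(StorageLevel.MEMORY_ONLY())
            System.out.println(lines.getStorageLevel());

            try {
                // An RDD's storage level can only be set once; changing it throws.
                lines.persist(StorageLevel.DISK_ONLY());
            } catch (UnsupportedOperationException e) {
                System.out.println("Cannot change the storage level: " + e.getMessage());
            }
        }
    }
}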
 
Example #5
Source File: PathSeqBwaSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Writes RDD of reads to path. Note writeReads() is not used because there are separate paired/unpaired outputs.
 * Header sequence dictionary is reduced to only those that were aligned to.
 */
private void writeBam(final JavaRDD<GATKRead> reads, final String inputBamPath, final boolean isPaired,
                      final JavaSparkContext ctx, SAMFileHeader header) {

    //Only retain header sequences that were aligned to.
    //This invokes an action and therefore the reads must be cached.
    reads.persist(StorageLevel.MEMORY_AND_DISK_SER());
    header = PSBwaUtils.removeUnmappedHeaderSequences(header, reads, logger);

    final String outputPath = isPaired ? outputPaired : outputUnpaired;
    try {
        ReadsSparkSink.writeReads(ctx, outputPath, null, reads, header,
                shardedOutput ? ReadsWriteFormat.SHARDED : ReadsWriteFormat.SINGLE,
                PSUtils.pathseqGetRecommendedNumReducers(inputBamPath, numReducers, getTargetPartitionSize()), shardedPartsDir, true, splittingIndexGranularity);
    } catch (final IOException e) {
        throw new UserException.CouldNotCreateOutputFile(outputPath, "Writing failed", e);
    }
}
 
Example #6
Source File: StorageLevelSerializer.java    From deeplearning4j with Apache License 2.0
private static Map<StorageLevel, String> initMap() {
    Map<StorageLevel, String> map = new HashMap<>();
    map.put(StorageLevel.NONE(), "NONE");
    map.put(StorageLevel.DISK_ONLY(), "DISK_ONLY");
    map.put(StorageLevel.DISK_ONLY_2(), "DISK_ONLY_2");
    map.put(StorageLevel.MEMORY_ONLY(), "MEMORY_ONLY");
    map.put(StorageLevel.MEMORY_ONLY_2(), "MEMORY_ONLY_2");
    map.put(StorageLevel.MEMORY_ONLY_SER(), "MEMORY_ONLY_SER");
    map.put(StorageLevel.MEMORY_ONLY_SER_2(), "MEMORY_ONLY_SER_2");
    map.put(StorageLevel.MEMORY_AND_DISK(), "MEMORY_AND_DISK");
    map.put(StorageLevel.MEMORY_AND_DISK_2(), "MEMORY_AND_DISK_2");
    map.put(StorageLevel.MEMORY_AND_DISK_SER(), "MEMORY_AND_DISK_SER");
    map.put(StorageLevel.MEMORY_AND_DISK_SER_2(), "MEMORY_AND_DISK_SER_2");
    map.put(StorageLevel.OFF_HEAP(), "OFF_HEAP");
    return map;
}
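The hand-maintained map above exists because StorageLevel does not carry its constant name and its toString() output is not accepted by StorageLevel.fromString() (see also Example #12 below). For the reverse direction, a companion Jackson deserializer can simply delegate to fromString(); the sketch below is illustrative only, with an assumed class name rather than the actual deeplearning4j deserializer.

import java.io.IOException;

import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.databind.DeserializationContext;
import com.fasterxml.jackson.databind.JsonDeserializer;
import org.apache.spark.storage.StorageLevel;

public class StorageLevelDeserializerSketch extends JsonDeserializer<StorageLevel> {
    @Override
    public StorageLevel deserialize(JsonParser p, DeserializationContext ctxt) throws IOException {
        String name = p.getValueAsString();
        if (name == null || name.isEmpty()) {
            return null;
        }
        // fromString understands the constant names written by the serializer, e.g. "MEMORY_ONLY".
        return StorageLevel.fromString(name);
    }
}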
 
Example #7
Source File: ALSUpdate.java    From oryx with Apache License 2.0
private static RDD<Tuple2<Object,double[]>> readAndConvertFeatureRDD(
    JavaPairRDD<String,float[]> javaRDD,
    Broadcast<? extends Map<String,Integer>> bIdToIndex) {

  RDD<Tuple2<Integer,double[]>> scalaRDD = javaRDD.mapToPair(t ->
      new Tuple2<>(bIdToIndex.value().get(t._1()), t._2())
  ).mapValues(f -> {
      double[] d = new double[f.length];
      for (int i = 0; i < d.length; i++) {
        d[i] = f[i];
      }
      return d;
    }
  ).rdd();

  // This mimics the persistence level established by ALS training methods
  scalaRDD.persist(StorageLevel.MEMORY_AND_DISK());

  @SuppressWarnings("unchecked")
  RDD<Tuple2<Object,double[]>> objKeyRDD = (RDD<Tuple2<Object,double[]>>) (RDD<?>) scalaRDD;
  return objKeyRDD;
}
 
Example #8
Source File: TestJsonYaml.java    From deeplearning4j with Apache License 2.0
@Test
public void testJsonYaml() {
    TrainingMaster tm = new ParameterAveragingTrainingMaster.Builder(2).batchSizePerWorker(32)
            .exportDirectory("hdfs://SomeDirectory/").saveUpdater(false).averagingFrequency(3)
            .storageLevel(StorageLevel.MEMORY_ONLY_SER_2()).storageLevelStreams(StorageLevel.DISK_ONLY())
            .build();

    String json = tm.toJson();
    String yaml = tm.toYaml();

    TrainingMaster fromJson = ParameterAveragingTrainingMaster.fromJson(json);
    TrainingMaster fromYaml = ParameterAveragingTrainingMaster.fromYaml(yaml);

    assertEquals(tm, fromJson);
    assertEquals(tm, fromYaml);
}
 
Example #9
Source File: PersistedInputOutputRDDIntegrateTest.java    From tinkerpop with Apache License 2.0
@Test
public void shouldPersistRDDBasedOnStorageLevel() throws Exception {
    Spark.create("local[4]");
    int counter = 0;
    for (final String storageLevel : Arrays.asList("MEMORY_ONLY", "DISK_ONLY", "MEMORY_ONLY_SER", "MEMORY_AND_DISK_SER")) {
        assertEquals(counter, Spark.getRDDs().size());
        assertEquals(counter, Spark.getContext().getPersistentRDDs().size());
        counter++;
        final String rddName = TestHelper.makeTestDataDirectory(PersistedInputOutputRDDIntegrateTest.class, UUID.randomUUID().toString());
        final Configuration configuration = super.getBaseConfiguration();
        configuration.setProperty(Constants.GREMLIN_HADOOP_INPUT_LOCATION, SparkHadoopGraphProvider.PATHS.get("tinkerpop-modern-v3d0.kryo"));
        configuration.setProperty(Constants.GREMLIN_HADOOP_GRAPH_READER, GryoInputFormat.class.getCanonicalName());
        configuration.setProperty(Constants.GREMLIN_HADOOP_GRAPH_WRITER, PersistedOutputRDD.class.getCanonicalName());
        configuration.setProperty(Constants.GREMLIN_SPARK_PERSIST_STORAGE_LEVEL, storageLevel);
        configuration.setProperty(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION, rddName);
        configuration.setProperty(Constants.GREMLIN_SPARK_PERSIST_CONTEXT, true);
        Graph graph = GraphFactory.open(configuration);
        graph.compute(SparkGraphComputer.class)
                .result(GraphComputer.ResultGraph.NEW)
                .persist(GraphComputer.Persist.EDGES)
                .program(TraversalVertexProgram.build()
                        .traversal(graph.traversal().withComputer(SparkGraphComputer.class),
                                "gremlin-groovy",
                                "g.V().groupCount('m').by('name').out()").create(graph)).submit().get();
        ////////
        assertTrue(Spark.hasRDD(Constants.getGraphLocation(rddName)));
        assertEquals(StorageLevel.fromString(storageLevel), Spark.getRDD(Constants.getGraphLocation(rddName)).getStorageLevel());
        assertEquals(counter, Spark.getRDDs().size());
        assertEquals(counter, Spark.getContext().getPersistentRDDs().size());
    }
    Spark.close();
}
 
Example #10
Source File: Dataset.java    From nemo with Apache License 2.0
@Override
public Dataset<T> persist(final StorageLevel newLevel) {
  final boolean userTriggered = initializeFunction(newLevel);
  final Dataset<T> result = from(super.persist(newLevel));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example #11
Source File: KafkaRangeReceiver.java    From kafka-spark-consumer with Apache License 2.0
public KafkaRangeReceiver(
        KafkaConfig config,
        Set<Integer> partitionSet,
        StorageLevel storageLevel,
        KafkaMessageHandler<E> messageHandler) {
  super(storageLevel);
  this.kafkaConfig = config;
  _partitionSet = partitionSet;
  _messageHandler = messageHandler;
}
 
Example #12
Source File: Checkpoint.java    From systemds with Apache License 2.0
/**
 * This is a utility method because Spark's StorageLevel.toString() is incompatible with its own
 * fromString() method.
 * 
 * @param level RDD storage level
 * @return storage level as a string
 */
public static String getStorageLevelString( StorageLevel level)
{
	if( StorageLevel.NONE().equals(level) )
		return "NONE";
	else if( StorageLevel.MEMORY_ONLY().equals(level) )
		return "MEMORY_ONLY";
	else if( StorageLevel.MEMORY_ONLY_2().equals(level) )
		return "MEMORY_ONLY_2";
	else if( StorageLevel.MEMORY_ONLY_SER().equals(level) )
		return "MEMORY_ONLY_SER";
	else if( StorageLevel.MEMORY_ONLY_SER_2().equals(level) )
		return "MEMORY_ONLY_SER_2";
	else if( StorageLevel.MEMORY_AND_DISK().equals(level) )
		return "MEMORY_AND_DISK";
	else if( StorageLevel.MEMORY_AND_DISK_2().equals(level) )
		return "MEMORY_AND_DISK_2";
	else if( StorageLevel.MEMORY_AND_DISK_SER().equals(level) )
		return "MEMORY_AND_DISK_SER";
	else if( StorageLevel.MEMORY_AND_DISK_SER_2().equals(level) )
		return "MEMORY_AND_DISK_SER_2";
	else if( StorageLevel.DISK_ONLY().equals(level) )
		return "DISK_ONLY";
	else if( StorageLevel.DISK_ONLY_2().equals(level) )
		return "DISK_ONLY_2";
	
	return "INVALID";
}
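A short illustrative demo of the incompatibility the Javadoc describes (the class name is hypothetical): toString() produces a human-readable description, while fromString() only accepts the constant names that the helper above returns.

import org.apache.spark.storage.StorageLevel;

public class StorageLevelNamingDemo {
    public static void main(String[] args) {
        StorageLevel level = StorageLevel.MEMORY_AND_DISK();

        // Human-readable, but not parseable by fromString():
        System.out.println(level);

        // fromString() expects the constant name, which is why the mapping above is kept by hand.
        System.out.println(StorageLevel.fromString("MEMORY_AND_DISK").equals(level)); // true
    }
}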
 
Example #13
Source File: CheckpointSPInstruction.java    From systemds with Apache License 2.0
public static CheckpointSPInstruction parseInstruction ( String str ) {
	String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
	InstructionUtils.checkNumFields(parts, 3);
	
	String opcode = parts[0];
	CPOperand in = new CPOperand(parts[1]);
	CPOperand out = new CPOperand(parts[2]);
	StorageLevel level = StorageLevel.fromString(parts[3]);

	return new CheckpointSPInstruction(null, in, out, level, opcode, str);
}
 
Example #14
Source File: RP_DBSCAN.java    From RP-DBSCAN with Apache License 2.0
/**
 * Phase II : local clustering for RP-DBSCAN.
 * Phase II-1 (Core Marking) and Phase II-2 (Subgraph Building)
 */
public void phaseII()
{
	/**
	 * Phase II-1: Core Marking
	 */
	
	//Mark core cells and core points with the (eps,rho)-region query.
	JavaPairRDD<Long, ApproximatedCell> coreCells = dataset.mapPartitionsToPair(new Methods.FindCorePointsWithApproximation(Conf.dim, Conf.epsilon, Conf.minPts, conf, metaPaths)).persist(StorageLevel.MEMORY_AND_DISK_SER());
	
	//Count the number of core cells
	List<Tuple2<Integer, Long>> numOfCores = coreCells.mapToPair(new Methods.CountCorePts()).reduceByKey(new Methods.AggregateCount()).collect();
	numOfCorePoints = numOfCores.get(0)._2;
	
	//Broadcast core cell ids to all workers for updating the status of edges in cell subgraphs.
	try {
		corePaths = FileIO.broadCastData(sc, conf, Conf.coreInfoFolder);
	} catch (IOException e) {
		// TODO Auto-generated catch block
		e.printStackTrace();
	}

	/**
	 * Phase II-2: Subgraph Building
	 */
	// Build cell subgraph
	edgeSet = coreCells.mapPartitionsToPair(new Methods.FindDirectDensityReachableEdgesWithApproximation(Conf.dim, Conf.epsilon, Conf.minPts, conf, metaPaths, corePaths ,Conf.numOfPartitions)).repartition(Conf.numOfPartitions/2);
	
}
 
Example #15
Source File: PSPairedUnpairedSplitterSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Gets RDDs of the paired and unpaired reads. Option to cache the repartitioned RDD.
 */
public PSPairedUnpairedSplitterSpark(final JavaRDD<GATKRead> reads, final int readsPerPartitionGuess, final boolean cacheReads) {

    //Repartition reads then map each partition to a pair of lists, one containing the paired reads and the
    // other the unpaired reads
    repartitionedReads = PSFilter.repartitionReadsByName(reads)
            .mapPartitions(iter -> mapPartitionsToPairedAndUnpairedLists(iter, readsPerPartitionGuess));
    shouldBeCached = cacheReads;
    if (cacheReads) {
        repartitionedReads.persist(StorageLevel.MEMORY_AND_DISK_SER());
        isCached = true;
    }
}
 
Example #16
Source File: Dataset.java    From incubator-nemo with Apache License 2.0
@Override
public Dataset<T> persist(final StorageLevel newLevel) {
  final boolean userTriggered = initializeFunction(newLevel);
  final Dataset<T> result = from(super.persist(newLevel));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example #17
Source File: SparkContextStorageCheck.java    From tinkerpop with Apache License 2.0
@Test
public void shouldSupportDirectoryFileDistinction() throws Exception {
    final Storage storage = SparkContextStorage.open(graph.configuration());
    for (int i = 0; i < 10; i++) {
        JavaSparkContext.fromSparkContext(Spark.getContext()).emptyRDD().setName("directory1/file1-" + i + ".txt.bz").persist(StorageLevel.DISK_ONLY());
    }
    for (int i = 0; i < 5; i++) {
        JavaSparkContext.fromSparkContext(Spark.getContext()).emptyRDD().setName("directory2/file2-" + i + ".txt.bz").persist(StorageLevel.DISK_ONLY());
    }
    super.checkFileDirectoryDistinction(storage, "directory1", "directory2");
}
 
Example #18
Source File: GlobalWatermarkHolder.java    From beam with Apache License 2.0
private static Map<Integer, SparkWatermarks> initWatermarks(final BlockManager blockManager) {

    final Map<Integer, SparkWatermarks> watermarks = fetchSparkWatermarks(blockManager);

    if (watermarks == null) {
      final HashMap<Integer, SparkWatermarks> empty = Maps.newHashMap();
      blockManager.putSingle(
          WATERMARKS_BLOCK_ID, empty, StorageLevel.MEMORY_ONLY(), true, WATERMARKS_TAG);
      return empty;
    } else {
      return watermarks;
    }
}
 
Example #19
Source File: ParameterAveragingTrainingMaster.java    From deeplearning4j with Apache License 2.0
public ParameterAveragingTrainingMaster(boolean saveUpdater, Integer numWorkers, int rddDataSetNumExamples,
                int batchSizePerWorker, int averagingFrequency, int aggregationDepth, int prefetchNumBatches,
                Repartition repartition, RepartitionStrategy repartitionStrategy, StorageLevel storageLevel,
                boolean collectTrainingStats) {
    checkArgument(numWorkers > 0, "Invalid number of workers: " + numWorkers + " (must be >= 1)");
    checkArgument(rddDataSetNumExamples > 0,
                    "Invalid rdd data set size: " + rddDataSetNumExamples + " (must be >= 1)");
    checkArgument(averagingFrequency > 0, "Invalid input: averaging frequency must be >= 1");
    checkArgument(aggregationDepth > 0, "Invalid input: tree aggregation channels must be >= 1");

    this.saveUpdater = saveUpdater;
    this.numWorkers = numWorkers;
    this.rddDataSetNumExamples = rddDataSetNumExamples;
    this.batchSizePerWorker = batchSizePerWorker;
    this.averagingFrequency = averagingFrequency;
    this.aggregationDepth = aggregationDepth;
    this.prefetchNumBatches = prefetchNumBatches;
    this.collectTrainingStats = collectTrainingStats;
    this.repartition = repartition;
    this.repartitionStrategy = repartitionStrategy;
    this.storageLevel = storageLevel;
    if (collectTrainingStats)
        stats = new ParameterAveragingTrainingMasterStats.ParameterAveragingTrainingMasterStatsHelper();

    String jvmuid = UIDProvider.getJVMUID();
    this.trainingMasterUID =
                    System.currentTimeMillis() + "_" + (jvmuid.length() <= 8 ? jvmuid : jvmuid.substring(0, 8));
    this.rng = new Random();
}
 
Example #20
Source File: HoodieBloomIndex.java    From hudi with Apache License 2.0
@Override
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, JavaSparkContext jsc,
                                            HoodieTable<T> hoodieTable) {

  // Step 0: cache the input record RDD
  if (config.getBloomIndexUseCaching()) {
    recordRDD.persist(SparkConfigUtils.getBloomIndexInputStorageLevel(config.getProps()));
  }

  // Step 1: Extract out thinner JavaPairRDD of (partitionPath, recordKey)
  JavaPairRDD<String, String> partitionRecordKeyPairRDD =
      recordRDD.mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey()));

  // Lookup indexes for all the partition/recordkey pair
  JavaPairRDD<HoodieKey, HoodieRecordLocation> keyFilenamePairRDD =
      lookupIndex(partitionRecordKeyPairRDD, jsc, hoodieTable);

  // Cache the result, for subsequent stages.
  if (config.getBloomIndexUseCaching()) {
    keyFilenamePairRDD.persist(StorageLevel.MEMORY_AND_DISK_SER());
  }
  if (LOG.isDebugEnabled()) {
    long totalTaggedRecords = keyFilenamePairRDD.count();
    LOG.debug("Number of update records (ones tagged with a fileID): " + totalTaggedRecords);
  }

  // Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys
  // Cost: 4 sec.
  JavaRDD<HoodieRecord<T>> taggedRecordRDD = tagLocationBacktoRecords(keyFilenamePairRDD, recordRDD);

  if (config.getBloomIndexUseCaching()) {
    recordRDD.unpersist(); // unpersist the input Record RDD
    keyFilenamePairRDD.unpersist();
  }
  return taggedRecordRDD;
}
 
Example #21
Source File: BaseCommitActionExecutor.java    From hudi with Apache License 2.0
public HoodieWriteMetadata execute(JavaRDD<HoodieRecord<T>> inputRecordsRDD) {
  HoodieWriteMetadata result = new HoodieWriteMetadata();
  // Cache the tagged records, so we don't end up computing both
  // TODO: Consistent contract in HoodieWriteClient regarding preppedRecord storage level handling
  if (inputRecordsRDD.getStorageLevel() == StorageLevel.NONE()) {
    inputRecordsRDD.persist(StorageLevel.MEMORY_AND_DISK_SER());
  } else {
    LOG.info("RDD PreppedRecords was persisted at: " + inputRecordsRDD.getStorageLevel());
  }

  WorkloadProfile profile = null;
  if (isWorkloadProfileNeeded()) {
    profile = new WorkloadProfile(inputRecordsRDD);
    LOG.info("Workload profile :" + profile);
    saveWorkloadProfileMetadataToInflight(profile, instantTime);
  }

  // partition using the insert partitioner
  final Partitioner partitioner = getPartitioner(profile);
  JavaRDD<HoodieRecord<T>> partitionedRecords = partition(inputRecordsRDD, partitioner);
  JavaRDD<WriteStatus> writeStatusRDD = partitionedRecords.mapPartitionsWithIndex((partition, recordItr) -> {
    if (WriteOperationType.isChangingRecords(operationType)) {
      return handleUpsertPartition(instantTime, partition, recordItr, partitioner);
    } else {
      return handleInsertPartition(instantTime, partition, recordItr, partitioner);
    }
  }, true).flatMap(List::iterator);

  updateIndexAndCommitIfNeeded(writeStatusRDD, result);
  return result;
}
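The guard above (persist only when getStorageLevel() is still NONE) is a reusable idiom for avoiding double-caching. Below is a hedged sketch of it as a small helper; the class and method names are hypothetical and not part of Hudi.

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.storage.StorageLevel;

public final class PersistUtils {

    private PersistUtils() { }

    // Persist the RDD only if no storage level has been assigned yet, mirroring the guard above.
    public static <T> JavaRDD<T> persistIfUnpersisted(JavaRDD<T> rdd, StorageLevel level) {
        if (StorageLevel.NONE().equals(rdd.getStorageLevel())) {
            rdd.persist(level);
        }
        return rdd;
    }
}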
 
Example #22
Source File: StorageLevelSerializer.java    From deeplearning4j with Apache License 2.0
@Override
public void serialize(StorageLevel storageLevel, JsonGenerator jsonGenerator, SerializerProvider serializerProvider)
                throws IOException, JsonProcessingException {
    //This is a little ugly, but Spark doesn't provide many options here...
    String s = null;
    if (storageLevel != null) {
        s = map.get(storageLevel);
    }
    jsonGenerator.writeString(s);
}
 
Example #23
Source File: DataStep.java    From envelope with Apache License 2.0
public boolean isCached() {
  if (data == null) {
    return false;
  }

  return data.storageLevel() != StorageLevel.NONE();
}
 
Example #24
Source File: SparkStreamingPulsarReceiver.java    From pulsar with Apache License 2.0
public SparkStreamingPulsarReceiver(StorageLevel storageLevel,
    String serviceUrl,
    ConsumerConfigurationData<byte[]> conf,
    Authentication authentication) {
    super(storageLevel);

    checkNotNull(serviceUrl, "serviceUrl must not be null");
    checkNotNull(conf, "ConsumerConfigurationData must not be null");
    checkArgument(conf.getTopicNames().size() > 0, "TopicNames must be set a value.");
    checkNotNull(conf.getSubscriptionName(), "SubscriptionName must not be null");

    this.serviceUrl = serviceUrl;
    this.authentication = authentication;

    if (conf.getMessageListener() == null) {
        conf.setMessageListener((MessageListener<byte[]> & Serializable) (consumer, msg) -> {
            try {
                store(msg.getData());
                consumer.acknowledgeAsync(msg);
            } catch (Exception e) {
                LOG.error("Failed to store a message : {}", e.getMessage());
                consumer.negativeAcknowledge(msg);
            }
        });
    }
    this.conf = conf;
}
 
Example #25
Source File: TieredSpatialJoin.java    From geowave with Apache License 2.0
private JavaPairRDD<GeoWaveInputKey, ByteArray> joinAndCompareTiers(
    final JavaPairRDD<ByteArray, Tuple2<GeoWaveInputKey, Geometry>> leftTier,
    final JavaPairRDD<ByteArray, Tuple2<GeoWaveInputKey, Geometry>> rightTier,
    final Broadcast<GeomFunction> geomPredicate,
    final int highestPartitionCount,
    final HashPartitioner partitioner) {
  // Cogroup groups on same tier ByteArrayId and pairs them into Iterable
  // sets.
  JavaPairRDD<ByteArray, Tuple2<Iterable<Tuple2<GeoWaveInputKey, Geometry>>, Iterable<Tuple2<GeoWaveInputKey, Geometry>>>> joinedTiers =
      leftTier.cogroup(rightTier, partitioner);

  // Filter only the pairs that have data on both sides, bucket strategy
  // should have been accounted for by this point.
  // We need to go through the pairs and test each feature against each
  // other
  // End with a combined RDD for that tier.
  joinedTiers =
      joinedTiers.filter(t -> t._2._1.iterator().hasNext() && t._2._2.iterator().hasNext());

  final JavaPairRDD<GeoWaveInputKey, ByteArray> finalMatches =
      joinedTiers.flatMapValues(
          (Function<Tuple2<Iterable<Tuple2<GeoWaveInputKey, Geometry>>, Iterable<Tuple2<GeoWaveInputKey, Geometry>>>, Iterable<GeoWaveInputKey>>) t -> {
            final GeomFunction predicate = geomPredicate.value();

            final HashSet<GeoWaveInputKey> results = Sets.newHashSet();
            for (final Tuple2<GeoWaveInputKey, Geometry> leftTuple : t._1) {
              for (final Tuple2<GeoWaveInputKey, Geometry> rightTuple : t._2) {
                if (predicate.call(leftTuple._2, rightTuple._2)) {
                  results.add(leftTuple._1);
                  results.add(rightTuple._1);
                }
              }
            }
            return results;
          }).mapToPair(Tuple2::swap).reduceByKey(partitioner, (id1, id2) -> id1).persist(
              StorageLevel.MEMORY_ONLY_SER());

  return finalMatches;
}
 
Example #26
Source File: StringListReceiver.java    From cxf with Apache License 2.0
public StringListReceiver(List<String> inputStrings) {
    super(StorageLevel.MEMORY_ONLY());
    this.inputStrings = inputStrings;
}
 
Example #27
Source File: Throughput.java    From flink-perf with Apache License 2.0
public Source(StorageLevel storageLevel) {
	super(storageLevel);
	payload = new byte[12];
}
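The two receiver constructors above only forward a StorageLevel to the Receiver superclass. For context, here is a minimal custom receiver sketch (illustrative, not from either project; the class name is an assumption) showing where that level takes effect: blocks handed to store() are kept by Spark Streaming at the level passed to super().

import org.apache.spark.storage.StorageLevel;
import org.apache.spark.streaming.receiver.Receiver;

public class CountingReceiver extends Receiver<Long> {

    public CountingReceiver(StorageLevel storageLevel) {
        super(storageLevel);
    }

    @Override
    public void onStart() {
        new Thread(() -> {
            long i = 0;
            while (!isStopped()) {
                store(i++); // stored using the StorageLevel passed to the constructor
            }
        }, "counting-receiver").start();
    }

    @Override
    public void onStop() {
        // Nothing to clean up; the loop in onStart() exits once isStopped() returns true.
    }
}

Such a receiver would be wired into a stream with something like jssc.receiverStream(new CountingReceiver(StorageLevel.MEMORY_ONLY())), assuming an existing JavaStreamingContext jssc.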
 
Example #28
Source File: SampleConsumer.java    From kafka-spark-consumer with Apache License 2.0
@SuppressWarnings("deprecation")
private void run() {

  Properties props = new Properties();
  props.put("zookeeper.hosts", "zkhost");
  props.put("zookeeper.port", "2181");
  props.put("kafka.topic", "topicA,topicB,topicC");
  props.put("kafka.consumer.id", "kafka-consumer");
  // Optional Properties
  props.put("zookeeper.broker.path", "/brokers");
  props.put("zookeeper.consumer.path", "/consumers");
  props.put("consumer.forcefromstart", "false");
  props.put("max.poll.records", "10");
  props.put("consumer.fillfreqms", "500");
  props.put("consumer.backpressure.enabled", "true");
  //Kafka properties
  props.put("bootstrap.servers", "kafkahost-1:6667,"
          + "kafkahost-2:6667,"
          + "kafkahost-3:6667,"
          + "kafkahost-4:6667");
  props.put("security.protocol", "SSL");
  props.put("ssl.truststore.location","~/kafka-securitykafka.server.truststore.jks");
  props.put("ssl.truststore.password", "test1234");

  SparkConf _sparkConf = new SparkConf();
  JavaStreamingContext jsc = new JavaStreamingContext(_sparkConf, Durations.seconds(30));
  // Specify number of Receivers you need.
  int numberOfReceivers = 6;

  JavaDStream<MessageAndMetadata<byte[]>> unionStreams = ReceiverLauncher.launch(
      jsc, props, numberOfReceivers, StorageLevel.MEMORY_ONLY());

  unionStreams.foreachRDD(new VoidFunction<JavaRDD<MessageAndMetadata<byte[]>>>() {
    @Override
    public void call(JavaRDD<MessageAndMetadata<byte[]>> rdd) throws Exception {
      //Start Application Logic
      rdd.foreachPartition(new VoidFunction<Iterator<MessageAndMetadata<byte[]>>>() {
          @Override
          public void call(Iterator<MessageAndMetadata<byte[]>> mmItr) throws Exception {
              int countTopicA = 0;
              int countTopicB = 0;
              int countTopicC = 0;
              while(mmItr.hasNext()) {
                  MessageAndMetadata<byte[]> mm = mmItr.next();
                  if(mm.getTopic().equals("topicA")) {
                      countTopicA++;
                  }
                  else if (mm.getTopic().equals("topicB")) {
                      countTopicB++;
                  }
                  else if (mm.getTopic().equals("topicC")) {
                      countTopicC++;
                  }
              }
              System.out.println("topicA count " + countTopicA);
              System.out.println("topicB count " + countTopicB);
              System.out.println("topicC count " + countTopicC);
          }
      });
      System.out.println("RDD count " + rdd.count());
      //End Application Logic
      //commit offset
      System.out.println("Commiting Offset");
      ProcessedOffsetManager.persistsPartition(rdd, props);
    }
  });

  try {
    jsc.start();
    jsc.awaitTermination();
  }catch (Exception ex ) {
    jsc.ssc().sc().cancelAllJobs();
    jsc.stop(true, false);
    System.exit(-1);
  }
}
 
Example #29
Source File: ReceiverLauncher.java    From kafka-spark-consumer with Apache License 2.0
public static DStream<MessageAndMetadata<byte[]>> launch(
    StreamingContext ssc, Properties pros, int numberOfReceivers, StorageLevel storageLevel) {
  JavaStreamingContext jsc = new JavaStreamingContext(ssc);
  return createStream(jsc, pros, numberOfReceivers, storageLevel, new IdentityMessageHandler()).dstream();
}