Java Code Examples for org.apache.spark.storage.StorageLevel

The following examples show how to use org.apache.spark.storage.StorageLevel. They are extracted from open source projects; the source project, file, and license are listed above each example where available.
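
Before the project examples, here is a minimal standalone sketch of the two most common ways to obtain and apply a StorageLevel in Java: the static factory methods and StorageLevel.fromString. It is not taken from any of the projects below; the master URL, application name, and sample data are placeholders.

import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.storage.StorageLevel;

public class StorageLevelSketch {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext("local[*]", "storage-level-sketch");
        JavaRDD<Integer> numbers = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));

        // Persist with an explicit level instead of cache(), whose RDD default is MEMORY_ONLY.
        numbers.persist(StorageLevel.MEMORY_AND_DISK_SER());
        numbers.count();     // an action materializes (and therefore caches) the data
        numbers.unpersist();

        // Levels can also be resolved from configuration strings such as "MEMORY_ONLY".
        StorageLevel fromConfig = StorageLevel.fromString("MEMORY_ONLY");
        numbers.persist(fromConfig);
        numbers.count();

        sc.close();
    }
}

Note that an RDD is only materialized into storage when an action such as count() runs, which is why several of the examples below call an action immediately after persist().
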
Example 1
@Test
public void SparkStorageStatusListener_should_track_rdd_storage_status() throws InterruptedException {
    assertTrue(rdds.isEmpty());
    assertTrue(executors.isEmpty());

    //Memory
    JavaRDD rddMemory = makeRDD("MemRDD", StorageLevel.MEMORY_ONLY());
    rddMemory.collect();

    checkRddStorage(rddMemory.name(), equalTo(0L), greaterThan(0L), equalTo(0L));

    //Disk
    JavaRDD rddDisk = makeRDD("DiskRDD", StorageLevel.DISK_ONLY());
    rddDisk.collect();

    checkRddStorage(rddDisk.name(), equalTo(0L), equalTo(0L), greaterThan(0L));

    //OffHeap
    JavaRDD rddOffHeap = makeRDD("OffHeapRDD", StorageLevel.OFF_HEAP());
    rddOffHeap.collect();

    checkRddStorage(rddOffHeap.name(), greaterThan(0L), equalTo(0L), equalTo(0L));
}
 
Example 2
Source Project: deeplearning4j   Source File: StorageLevelSerializer.java    License: Apache License 2.0
private static Map<StorageLevel, String> initMap() {
    Map<StorageLevel, String> map = new HashMap<>();
    map.put(StorageLevel.NONE(), "NONE");
    map.put(StorageLevel.DISK_ONLY(), "DISK_ONLY");
    map.put(StorageLevel.DISK_ONLY_2(), "DISK_ONLY_2");
    map.put(StorageLevel.MEMORY_ONLY(), "MEMORY_ONLY");
    map.put(StorageLevel.MEMORY_ONLY_2(), "MEMORY_ONLY_2");
    map.put(StorageLevel.MEMORY_ONLY_SER(), "MEMORY_ONLY_SER");
    map.put(StorageLevel.MEMORY_ONLY_SER_2(), "MEMORY_ONLY_SER_2");
    map.put(StorageLevel.MEMORY_AND_DISK(), "MEMORY_AND_DISK");
    map.put(StorageLevel.MEMORY_AND_DISK_2(), "MEMORY_AND_DISK_2");
    map.put(StorageLevel.MEMORY_AND_DISK_SER(), "MEMORY_AND_DISK_SER");
    map.put(StorageLevel.MEMORY_AND_DISK_SER_2(), "MEMORY_AND_DISK_SER_2");
    map.put(StorageLevel.OFF_HEAP(), "OFF_HEAP");
    return map;
}
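
The map above covers the serialization direction only. For deserialization, a hypothetical counterpart (not part of the deeplearning4j sources shown here) could simply delegate to Spark's own StorageLevel.fromString, which accepts the same names used as map values:

private static StorageLevel fromName(String name) {
    // Hypothetical helper: null means no storage level was serialized.
    if (name == null) {
        return null;
    }
    // fromString resolves names such as "MEMORY_ONLY_SER_2" or "NONE" back to StorageLevel instances.
    return StorageLevel.fromString(name);
}
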
 
Example 3
Source Project: beam   Source File: UnboundedDataset.java    License: Apache License 2.0
@Override
@SuppressWarnings("unchecked")
public void cache(String storageLevel, Coder<?> coder) {
  // we "force" MEMORY storage level in streaming
  if (!StorageLevel.fromString(storageLevel).equals(StorageLevel.MEMORY_ONLY_SER())) {
    LOG.warn(
        "Provided StorageLevel: {} is ignored for streams, using the default level: {}",
        storageLevel,
        StorageLevel.MEMORY_ONLY_SER());
  }
  // Caching can cause serialization, so we use the coder to encode the values to bytes
  // more details in https://issues.apache.org/jira/browse/BEAM-2669
  Coder<WindowedValue<T>> wc = (Coder<WindowedValue<T>>) coder;
  this.dStream =
      dStream.map(CoderHelpers.toByteFunction(wc)).cache().map(CoderHelpers.fromByteFunction(wc));
}
 
Example 4
Source Project: pulsar   Source File: SparkStreamingPulsarReceiverTest.java    License: Apache License 2.0
@Test(dataProvider = "ServiceUrls")
public void testDefaultSettingsOfReceiver(String serviceUrl) {
    ConsumerConfigurationData<byte[]> consConf = new ConsumerConfigurationData<>();

    Set<String> set = new HashSet<>();
    set.add(TOPIC);
    consConf.setTopicNames(set);
    consConf.setSubscriptionName(SUBS);

    SparkStreamingPulsarReceiver receiver = new SparkStreamingPulsarReceiver(
        serviceUrl,
        consConf,
        new AuthenticationDisabled());

    assertEquals(receiver.storageLevel(), StorageLevel.MEMORY_AND_DISK_2());
    assertNotNull(consConf.getMessageListener());
}
 
Example 5
Source Project: tinkerpop   Source File: PersistedOutputRDD.java    License: Apache License 2.0
@Override
public void writeGraphRDD(final Configuration configuration, final JavaPairRDD<Object, VertexWritable> graphRDD) {
    if (!configuration.getBoolean(Constants.GREMLIN_SPARK_PERSIST_CONTEXT, false))
        LOGGER.warn("The SparkContext should be persisted in order for the RDD to persist across jobs. To do so, set " + Constants.GREMLIN_SPARK_PERSIST_CONTEXT + " to true");
    if (!configuration.containsKey(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION))
        throw new IllegalArgumentException("There is no provided " + Constants.GREMLIN_HADOOP_OUTPUT_LOCATION + " to write the persisted RDD to");
    SparkContextStorage.open(configuration).rm(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION));  // this might be bad cause it unpersists the job RDD
    // determine which storage level to persist the RDD as with MEMORY_ONLY being the default cache()
    final StorageLevel storageLevel = StorageLevel.fromString(configuration.getString(Constants.GREMLIN_SPARK_PERSIST_STORAGE_LEVEL, "MEMORY_ONLY"));
    if (!configuration.getBoolean(Constants.GREMLIN_HADOOP_GRAPH_WRITER_HAS_EDGES, true))
        graphRDD.mapValues(vertex -> {
            vertex.get().dropEdges(Direction.BOTH);
            return vertex;
        }).setName(Constants.getGraphLocation(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION))).persist(storageLevel)
                // call action to eager store rdd
                .count();
    else
        graphRDD.setName(Constants.getGraphLocation(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION))).persist(storageLevel)
                // call action to eager store rdd
                .count();
    Spark.refresh(); // necessary to do really fast so the Spark GC doesn't clear out the RDD
}
 
Example 6
Source Project: deeplearning4j   Source File: TestJsonYaml.java    License: Apache License 2.0
@Test
public void testJsonYaml() {
    TrainingMaster tm = new ParameterAveragingTrainingMaster.Builder(2).batchSizePerWorker(32)
                    .exportDirectory("hdfs://SomeDirectory/").saveUpdater(false).averagingFrequency(3)
                    .storageLevel(StorageLevel.MEMORY_ONLY_SER_2()).storageLevelStreams(StorageLevel.DISK_ONLY())
                    .build();

    String json = tm.toJson();
    String yaml = tm.toYaml();

    // System.out.println(json);

    TrainingMaster fromJson = ParameterAveragingTrainingMaster.fromJson(json);
    TrainingMaster fromYaml = ParameterAveragingTrainingMaster.fromYaml(yaml);

    assertEquals(tm, fromJson);
    assertEquals(tm, fromYaml);
}
 
Example 7
Source Project: gatk   Source File: PathSeqBwaSpark.java    License: BSD 3-Clause "New" or "Revised" License
/**
 * Writes RDD of reads to path. Note writeReads() is not used because there are separate paired/unpaired outputs.
 * Header sequence dictionary is reduced to only those that were aligned to.
 */
private void writeBam(final JavaRDD<GATKRead> reads, final String inputBamPath, final boolean isPaired,
                      final JavaSparkContext ctx, SAMFileHeader header) {

    //Only retain header sequences that were aligned to.
    //This invokes an action and therefore the reads must be cached.
    reads.persist(StorageLevel.MEMORY_AND_DISK_SER());
    header = PSBwaUtils.removeUnmappedHeaderSequences(header, reads, logger);

    final String outputPath = isPaired ? outputPaired : outputUnpaired;
    try {
        ReadsSparkSink.writeReads(ctx, outputPath, null, reads, header,
                shardedOutput ? ReadsWriteFormat.SHARDED : ReadsWriteFormat.SINGLE,
                PSUtils.pathseqGetRecommendedNumReducers(inputBamPath, numReducers, getTargetPartitionSize()), shardedPartsDir, true, splittingIndexGranularity);
    } catch (final IOException e) {
        throw new UserException.CouldNotCreateOutputFile(outputPath, "Writing failed", e);
    }
}
 
Example 8
Source Project: oryx   Source File: ALSUpdate.java    License: Apache License 2.0
private static RDD<Tuple2<Object,double[]>> readAndConvertFeatureRDD(
    JavaPairRDD<String,float[]> javaRDD,
    Broadcast<? extends Map<String,Integer>> bIdToIndex) {

  RDD<Tuple2<Integer,double[]>> scalaRDD = javaRDD.mapToPair(t ->
      new Tuple2<>(bIdToIndex.value().get(t._1()), t._2())
  ).mapValues(f -> {
      double[] d = new double[f.length];
      for (int i = 0; i < d.length; i++) {
        d[i] = f[i];
      }
      return d;
    }
  ).rdd();

  // This mimics the persistence level established by ALS training methods
  scalaRDD.persist(StorageLevel.MEMORY_AND_DISK());

  @SuppressWarnings("unchecked")
  RDD<Tuple2<Object,double[]>> objKeyRDD = (RDD<Tuple2<Object,double[]>>) (RDD<?>) scalaRDD;
  return objKeyRDD;
}
 
Example 9
Source Project: RP-DBSCAN   Source File: RP_DBSCAN.java    License: Apache License 2.0
/**
 * Phase II : local clustering for RP-DBSCAN.
 * Phase II-1 (Core Marking) and Phase II-2 (Subgraph Building)
 */
public void phaseII()
{
	/**
	 * Phase II-1: Core Marking
	 */
	
	//Mark core cells and core points with the (eps,rho)-region query.
	JavaPairRDD<Long, ApproximatedCell> coreCells = dataset.mapPartitionsToPair(new Methods.FindCorePointsWithApproximation(Conf.dim, Conf.epsilon, Conf.minPts, conf, metaPaths)).persist(StorageLevel.MEMORY_AND_DISK_SER());
	
	//Count the number of core points
	List<Tuple2<Integer, Long>> numOfCores = coreCells.mapToPair(new Methods.CountCorePts()).reduceByKey(new Methods.AggregateCount()).collect();
	numOfCorePoints = numOfCores.get(0)._2;
	
	//Broadcast core cell ids to every worker for updating the status of edges in cell subgraphs.
	try {
		corePaths = FileIO.broadCastData(sc, conf, Conf.coreInfoFolder);
	} catch (IOException e) {
		// TODO Auto-generated catch block
		e.printStackTrace();
	}

	/**
	 * Phase II-2: Subgraph Building
	 */
	// Build cell subgraph
	edgeSet = coreCells.mapPartitionsToPair(new Methods.FindDirectDensityReachableEdgesWithApproximation(Conf.dim, Conf.epsilon, Conf.minPts, conf, metaPaths, corePaths ,Conf.numOfPartitions)).repartition(Conf.numOfPartitions/2);
	
}
 
Example 10
Source Project: systemds   Source File: CheckpointSPInstruction.java    License: Apache License 2.0
public static CheckpointSPInstruction parseInstruction ( String str ) {
	String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
	InstructionUtils.checkNumFields(parts, 3);
	
	String opcode = parts[0];
	CPOperand in = new CPOperand(parts[1]);
	CPOperand out = new CPOperand(parts[2]);
	StorageLevel level = StorageLevel.fromString(parts[3]);

	return new CheckpointSPInstruction(null, in, out, level, opcode, str);
}
 
Example 11
Source Project: incubator-nemo   Source File: Dataset.java    License: Apache License 2.0
@Override
public Dataset<T> persist(final StorageLevel newLevel) {
  final boolean userTriggered = initializeFunction(newLevel);
  final Dataset<T> result = from(super.persist(newLevel));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 12
Source Project: nemo   Source File: Dataset.java    License: Apache License 2.0
@Override
public Dataset<T> persist(final StorageLevel newLevel) {
  final boolean userTriggered = initializeFunction(newLevel);
  final Dataset<T> result = from(super.persist(newLevel));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 13
public ParameterAveragingTrainingMaster(boolean saveUpdater, Integer numWorkers, int rddDataSetNumExamples,
                int batchSizePerWorker, int averagingFrequency, int aggregationDepth, int prefetchNumBatches,
                Repartition repartition, RepartitionStrategy repartitionStrategy, StorageLevel storageLevel,
                boolean collectTrainingStats) {
    checkArgument(numWorkers > 0, "Invalid number of workers: " + numWorkers + " (must be >= 1)");
    checkArgument(rddDataSetNumExamples > 0,
                    "Invalid rdd data set size: " + rddDataSetNumExamples + " (must be >= 1)");
    checkArgument(averagingFrequency > 0, "Invalid input: averaging frequency must be >= 1");
    checkArgument(aggregationDepth > 0, "Invalid input: tree aggregation channels must be >= 1");

    this.saveUpdater = saveUpdater;
    this.numWorkers = numWorkers;
    this.rddDataSetNumExamples = rddDataSetNumExamples;
    this.batchSizePerWorker = batchSizePerWorker;
    this.averagingFrequency = averagingFrequency;
    this.aggregationDepth = aggregationDepth;
    this.prefetchNumBatches = prefetchNumBatches;
    this.collectTrainingStats = collectTrainingStats;
    this.repartition = repartition;
    this.repartitionStrategy = repartitionStrategy;
    this.storageLevel = storageLevel;
    if (collectTrainingStats)
        stats = new ParameterAveragingTrainingMasterStats.ParameterAveragingTrainingMasterStatsHelper();

    String jvmuid = UIDProvider.getJVMUID();
    this.trainingMasterUID =
                    System.currentTimeMillis() + "_" + (jvmuid.length() <= 8 ? jvmuid : jvmuid.substring(0, 8));
    this.rng = new Random();
}
 
Example 14
Source Project: hudi   Source File: HoodieBloomIndex.java    License: Apache License 2.0
@Override
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, JavaSparkContext jsc,
                                            HoodieTable<T> hoodieTable) {

  // Step 0: cache the input record RDD
  if (config.getBloomIndexUseCaching()) {
    recordRDD.persist(SparkConfigUtils.getBloomIndexInputStorageLevel(config.getProps()));
  }

  // Step 1: Extract out thinner JavaPairRDD of (partitionPath, recordKey)
  JavaPairRDD<String, String> partitionRecordKeyPairRDD =
      recordRDD.mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey()));

  // Lookup indexes for all the partition/recordkey pair
  JavaPairRDD<HoodieKey, HoodieRecordLocation> keyFilenamePairRDD =
      lookupIndex(partitionRecordKeyPairRDD, jsc, hoodieTable);

  // Cache the result, for subsequent stages.
  if (config.getBloomIndexUseCaching()) {
    keyFilenamePairRDD.persist(StorageLevel.MEMORY_AND_DISK_SER());
  }
  if (LOG.isDebugEnabled()) {
    long totalTaggedRecords = keyFilenamePairRDD.count();
    LOG.debug("Number of update records (ones tagged with a fileID): " + totalTaggedRecords);
  }

  // Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys
  // Cost: 4 sec.
  JavaRDD<HoodieRecord<T>> taggedRecordRDD = tagLocationBacktoRecords(keyFilenamePairRDD, recordRDD);

  if (config.getBloomIndexUseCaching()) {
    recordRDD.unpersist(); // unpersist the input Record RDD
    keyFilenamePairRDD.unpersist();
  }
  return taggedRecordRDD;
}
 
Example 15
Source Project: hudi   Source File: BaseCommitActionExecutor.java    License: Apache License 2.0
public HoodieWriteMetadata execute(JavaRDD<HoodieRecord<T>> inputRecordsRDD) {
  HoodieWriteMetadata result = new HoodieWriteMetadata();
  // Cache the tagged records, so we don't end up computing both
  // TODO: Consistent contract in HoodieWriteClient regarding preppedRecord storage level handling
  if (inputRecordsRDD.getStorageLevel() == StorageLevel.NONE()) {
    inputRecordsRDD.persist(StorageLevel.MEMORY_AND_DISK_SER());
  } else {
    LOG.info("RDD PreppedRecords was persisted at: " + inputRecordsRDD.getStorageLevel());
  }

  WorkloadProfile profile = null;
  if (isWorkloadProfileNeeded()) {
    profile = new WorkloadProfile(inputRecordsRDD);
    LOG.info("Workload profile :" + profile);
    saveWorkloadProfileMetadataToInflight(profile, instantTime);
  }

  // partition using the insert partitioner
  final Partitioner partitioner = getPartitioner(profile);
  JavaRDD<HoodieRecord<T>> partitionedRecords = partition(inputRecordsRDD, partitioner);
  JavaRDD<WriteStatus> writeStatusRDD = partitionedRecords.mapPartitionsWithIndex((partition, recordItr) -> {
    if (WriteOperationType.isChangingRecords(operationType)) {
      return handleUpsertPartition(instantTime, partition, recordItr, partitioner);
    } else {
      return handleInsertPartition(instantTime, partition, recordItr, partitioner);
    }
  }, true).flatMap(List::iterator);

  updateIndexAndCommitIfNeeded(writeStatusRDD, result);
  return result;
}
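
The getStorageLevel() check above is a common guard against persisting an RDD twice. Extracted into a standalone helper (a sketch, not part of the Hudi code), the idiom looks like this:

static <T> JavaRDD<T> persistIfNotPersisted(JavaRDD<T> rdd, StorageLevel fallback) {
    // StorageLevel.NONE() marks an RDD that has no storage level set yet.
    if (rdd.getStorageLevel().equals(StorageLevel.NONE())) {
        rdd.persist(fallback);
    }
    return rdd;
}
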
 
Example 16
Source Project: envelope   Source File: DataStep.java    License: Apache License 2.0
public boolean isCached() {
  if (data == null) {
    return false;
  }

  return data.storageLevel() != StorageLevel.NONE();
}
 
Example 17
Source Project: geowave   Source File: TieredSpatialJoin.java    License: Apache License 2.0
private JavaPairRDD<GeoWaveInputKey, ByteArray> joinAndCompareTiers(
    final JavaPairRDD<ByteArray, Tuple2<GeoWaveInputKey, Geometry>> leftTier,
    final JavaPairRDD<ByteArray, Tuple2<GeoWaveInputKey, Geometry>> rightTier,
    final Broadcast<GeomFunction> geomPredicate,
    final int highestPartitionCount,
    final HashPartitioner partitioner) {
  // Cogroup groups on same tier ByteArrayId and pairs them into Iterable
  // sets.
  JavaPairRDD<ByteArray, Tuple2<Iterable<Tuple2<GeoWaveInputKey, Geometry>>, Iterable<Tuple2<GeoWaveInputKey, Geometry>>>> joinedTiers =
      leftTier.cogroup(rightTier, partitioner);

  // Filter only the pairs that have data on both sides, bucket strategy
  // should have been accounted for by this point.
  // We need to go through the pairs and test each feature against each
  // other
  // End with a combined RDD for that tier.
  joinedTiers =
      joinedTiers.filter(t -> t._2._1.iterator().hasNext() && t._2._2.iterator().hasNext());

  final JavaPairRDD<GeoWaveInputKey, ByteArray> finalMatches =
      joinedTiers.flatMapValues(
          (Function<Tuple2<Iterable<Tuple2<GeoWaveInputKey, Geometry>>, Iterable<Tuple2<GeoWaveInputKey, Geometry>>>, Iterable<GeoWaveInputKey>>) t -> {
            final GeomFunction predicate = geomPredicate.value();

            final HashSet<GeoWaveInputKey> results = Sets.newHashSet();
            for (final Tuple2<GeoWaveInputKey, Geometry> leftTuple : t._1) {
              for (final Tuple2<GeoWaveInputKey, Geometry> rightTuple : t._2) {
                if (predicate.call(leftTuple._2, rightTuple._2)) {
                  results.add(leftTuple._1);
                  results.add(rightTuple._1);
                }
              }
            }
            return results;
          }).mapToPair(Tuple2::swap).reduceByKey(partitioner, (id1, id2) -> id1).persist(
              StorageLevel.MEMORY_ONLY_SER());

  return finalMatches;
}
 
Example 18
Source Project: beam   Source File: GlobalWatermarkHolder.java    License: Apache License 2.0
private static Map<Integer, SparkWatermarks> initWatermarks(final BlockManager blockManager) {

    final Map<Integer, SparkWatermarks> watermarks = fetchSparkWatermarks(blockManager);

    if (watermarks == null) {
      final HashMap<Integer, SparkWatermarks> empty = Maps.newHashMap();
      blockManager.putSingle(
          WATERMARKS_BLOCK_ID, empty, StorageLevel.MEMORY_ONLY(), true, WATERMARKS_TAG);
      return empty;
    } else {
      return watermarks;
    }
  }
 
Example 19
Source Project: pulsar   Source File: SparkStreamingPulsarReceiver.java    License: Apache License 2.0
public SparkStreamingPulsarReceiver(StorageLevel storageLevel,
    String serviceUrl,
    ConsumerConfigurationData<byte[]> conf,
    Authentication authentication) {
    super(storageLevel);

    checkNotNull(serviceUrl, "serviceUrl must not be null");
    checkNotNull(conf, "ConsumerConfigurationData must not be null");
    checkArgument(conf.getTopicNames().size() > 0, "TopicNames must be set a value.");
    checkNotNull(conf.getSubscriptionName(), "SubscriptionName must not be null");

    this.serviceUrl = serviceUrl;
    this.authentication = authentication;

    if (conf.getMessageListener() == null) {
        conf.setMessageListener((MessageListener<byte[]> & Serializable) (consumer, msg) -> {
            try {
                store(msg.getData());
                consumer.acknowledgeAsync(msg);
            } catch (Exception e) {
                LOG.error("Failed to store a message : {}", e.getMessage());
                consumer.negativeAcknowledge(msg);
            }
        });
    }
    this.conf = conf;
}
 
Example 20
Source Project: deeplearning4j   Source File: StorageLevelSerializer.java    License: Apache License 2.0
@Override
public void serialize(StorageLevel storageLevel, JsonGenerator jsonGenerator, SerializerProvider serializerProvider)
                throws IOException, JsonProcessingException {
    //This is a little ugly, but Spark doesn't provide many options here...
    String s = null;
    if (storageLevel != null) {
        s = map.get(storageLevel);
    }
    jsonGenerator.writeString(s);
}
 
Example 21
Source Project: tinkerpop   Source File: SparkContextStorageCheck.java    License: Apache License 2.0
@Test
public void shouldSupportDirectoryFileDistinction() throws Exception {
    final Storage storage = SparkContextStorage.open(graph.configuration());
    for (int i = 0; i < 10; i++) {
        JavaSparkContext.fromSparkContext(Spark.getContext()).emptyRDD().setName("directory1/file1-" + i + ".txt.bz").persist(StorageLevel.DISK_ONLY());
    }
    for (int i = 0; i < 5; i++) {
        JavaSparkContext.fromSparkContext(Spark.getContext()).emptyRDD().setName("directory2/file2-" + i + ".txt.bz").persist(StorageLevel.DISK_ONLY());
    }
    super.checkFileDirectoryDistinction(storage, "directory1", "directory2");
}
 
Example 22
@Test
public void shouldPersistRDDBasedOnStorageLevel() throws Exception {
    Spark.create("local[4]");
    int counter = 0;
    for (final String storageLevel : Arrays.asList("MEMORY_ONLY", "DISK_ONLY", "MEMORY_ONLY_SER", "MEMORY_AND_DISK_SER")) {
        assertEquals(counter, Spark.getRDDs().size());
        assertEquals(counter, Spark.getContext().getPersistentRDDs().size());
        counter++;
        final String rddName = TestHelper.makeTestDataDirectory(PersistedInputOutputRDDIntegrateTest.class, UUID.randomUUID().toString());
        final Configuration configuration = super.getBaseConfiguration();
        configuration.setProperty(Constants.GREMLIN_HADOOP_INPUT_LOCATION, SparkHadoopGraphProvider.PATHS.get("tinkerpop-modern-v3d0.kryo"));
        configuration.setProperty(Constants.GREMLIN_HADOOP_GRAPH_READER, GryoInputFormat.class.getCanonicalName());
        configuration.setProperty(Constants.GREMLIN_HADOOP_GRAPH_WRITER, PersistedOutputRDD.class.getCanonicalName());
        configuration.setProperty(Constants.GREMLIN_SPARK_PERSIST_STORAGE_LEVEL, storageLevel);
        configuration.setProperty(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION, rddName);
        configuration.setProperty(Constants.GREMLIN_SPARK_PERSIST_CONTEXT, true);
        Graph graph = GraphFactory.open(configuration);
        graph.compute(SparkGraphComputer.class)
                .result(GraphComputer.ResultGraph.NEW)
                .persist(GraphComputer.Persist.EDGES)
                .program(TraversalVertexProgram.build()
                        .traversal(graph.traversal().withComputer(SparkGraphComputer.class),
                                "gremlin-groovy",
                                "g.V().groupCount('m').by('name').out()").create(graph)).submit().get();
        ////////
        assertTrue(Spark.hasRDD(Constants.getGraphLocation(rddName)));
        assertEquals(StorageLevel.fromString(storageLevel), Spark.getRDD(Constants.getGraphLocation(rddName)).getStorageLevel());
        assertEquals(counter, Spark.getRDDs().size());
        assertEquals(counter, Spark.getContext().getPersistentRDDs().size());
    }
    Spark.close();
}
 
Example 23
Source Project: kafka-spark-consumer   Source File: KafkaRangeReceiver.java    License: Apache License 2.0
public KafkaRangeReceiver(
        KafkaConfig config,
        Set<Integer> partitionSet,
        StorageLevel storageLevel,
        KafkaMessageHandler<E> messageHandler) {
  super(storageLevel);
  this.kafkaConfig = config;
  _partitionSet = partitionSet;
  _messageHandler = messageHandler;
}
 
Example 24
Source Project: systemds   Source File: Checkpoint.java    License: Apache License 2.0
/**
 * This is a utility method because Spark's StorageLevel.toString() is incompatible with its own
 * fromString() method.
 * 
 * @param level RDD storage level
 * @return storage level as a string
 */
public static String getStorageLevelString( StorageLevel level)
{
	if( StorageLevel.NONE().equals(level) )
		return "NONE";
	else if( StorageLevel.MEMORY_ONLY().equals(level) )
		return "MEMORY_ONLY";
	else if( StorageLevel.MEMORY_ONLY_2().equals(level) )
		return "MEMORY_ONLY_2";
	else if( StorageLevel.MEMORY_ONLY_SER().equals(level) )
		return "MEMORY_ONLY_SER";
	else if( StorageLevel.MEMORY_ONLY_SER_2().equals(level) )
		return "MEMORY_ONLY_SER_2";
	else if( StorageLevel.MEMORY_AND_DISK().equals(level) )
		return "MEMORY_AND_DISK";
	else if( StorageLevel.MEMORY_AND_DISK_2().equals(level) )
		return "MEMORY_AND_DISK_2";
	else if( StorageLevel.MEMORY_AND_DISK_SER().equals(level) )
		return "MEMORY_AND_DISK_SER";
	else if( StorageLevel.MEMORY_AND_DISK_SER_2().equals(level) )
		return "MEMORY_AND_DISK_SER_2";
	else if( StorageLevel.DISK_ONLY().equals(level) )
		return "DISK_ONLY";
	else if( StorageLevel.DISK_ONLY_2().equals(level) )
		return "DISK_ONLY_2";
	
	return "INVALID";
}
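
Because of the toString()/fromString() mismatch described in the Javadoc, a round trip through a configuration string should use the helper above rather than toString(). A brief usage sketch, assuming the Checkpoint class above is on the classpath:

StorageLevel original = StorageLevel.MEMORY_AND_DISK_SER_2();
String asText = Checkpoint.getStorageLevelString(original);   // "MEMORY_AND_DISK_SER_2"
StorageLevel restored = StorageLevel.fromString(asText);      // restored.equals(original) is true
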
 
Example 25
/**
 * Gets RDDs of the paired and unpaired reads. Option to cache the repartitioned RDD.
 */
public PSPairedUnpairedSplitterSpark(final JavaRDD<GATKRead> reads, final int readsPerPartitionGuess, final boolean cacheReads) {

    //Repartition reads then map each partition to a pair of lists, one containing the paired reads and the
    // other the unpaired reads
    repartitionedReads = PSFilter.repartitionReadsByName(reads)
            .mapPartitions(iter -> mapPartitionsToPairedAndUnpairedLists(iter, readsPerPartitionGuess));
    shouldBeCached = cacheReads;
    if (cacheReads) {
        repartitionedReads.persist(StorageLevel.MEMORY_AND_DISK_SER());
        isCached = true;
    }
}
 
Example 26
Source Project: kylin-on-parquet-v2   Source File: SparkCubingByLayer.java    License: Apache License 2.0
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String hiveTable = optionsHelper.getOptionValue(OPTION_INPUT_TABLE);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);

    Class[] kryoClassArray = new Class[] { Class.forName("scala.reflect.ClassTag$$anon$1") };

    SparkConf conf = new SparkConf().setAppName("Cubing for:" + cubeName + " segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    JavaSparkContext sc = new JavaSparkContext(conf);
    sc.sc().addSparkListener(jobListener);
    HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));
    SparkUtil.modifySparkHadoopConfiguration(sc.sc()); // set dfs.replication=2 and enable compress
    final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration());
    KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

    final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);
    final CubeDesc cubeDesc = cubeInstance.getDescriptor();
    final CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);

    logger.info("RDD input path: {}", inputPath);
    logger.info("RDD Output path: {}", outputPath);

    final Job job = Job.getInstance(sConf.get());
    SparkUtil.setHadoopConfForCuboid(job, cubeSegment, metaUrl);

    int countMeasureIndex = 0;
    for (MeasureDesc measureDesc : cubeDesc.getMeasures()) {
        if (measureDesc.getFunction().isCount() == true) {
            break;
        } else {
            countMeasureIndex++;
        }
    }
    final CubeStatsReader cubeStatsReader = new CubeStatsReader(cubeSegment, envConfig);
    boolean[] needAggr = new boolean[cubeDesc.getMeasures().size()];
    boolean allNormalMeasure = true;
    for (int i = 0; i < cubeDesc.getMeasures().size(); i++) {
        needAggr[i] = !cubeDesc.getMeasures().get(i).getFunction().getMeasureType().onlyAggrInBaseCuboid();
        allNormalMeasure = allNormalMeasure && needAggr[i];
    }
    logger.info("All measure are normal (agg on all cuboids) ? : " + allNormalMeasure);
    StorageLevel storageLevel = StorageLevel.fromString(envConfig.getSparkStorageLevel());

    boolean isSequenceFile = JoinedFlatTable.SEQUENCEFILE.equalsIgnoreCase(envConfig.getFlatTableStorageFormat());

    final JavaPairRDD<ByteArray, Object[]> encodedBaseRDD = SparkUtil
            .hiveRecordInputRDD(isSequenceFile, sc, inputPath, hiveTable)
            .mapToPair(new EncodeBaseCuboid(cubeName, segmentId, metaUrl, sConf));

    Long totalCount = 0L;
    if (envConfig.isSparkSanityCheckEnabled()) {
        totalCount = encodedBaseRDD.count();
    }

    final BaseCuboidReducerFunction2 baseCuboidReducerFunction = new BaseCuboidReducerFunction2(cubeName, metaUrl,
            sConf);
    BaseCuboidReducerFunction2 reducerFunction2 = baseCuboidReducerFunction;
    if (allNormalMeasure == false) {
        reducerFunction2 = new CuboidReducerFunction2(cubeName, metaUrl, sConf, needAggr);
    }

    final int totalLevels = cubeSegment.getCuboidScheduler().getBuildLevel();
    JavaPairRDD<ByteArray, Object[]>[] allRDDs = new JavaPairRDD[totalLevels + 1];
    int level = 0;
    int partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);

    // aggregate to calculate base cuboid
    allRDDs[0] = encodedBaseRDD.reduceByKey(baseCuboidReducerFunction, partition).persist(storageLevel);

    saveToHDFS(allRDDs[0], metaUrl, cubeName, cubeSegment, outputPath, 0, job, envConfig);

    PairFlatMapFunction flatMapFunction = new CuboidFlatMap(cubeName, segmentId, metaUrl, sConf);
    // aggregate to ND cuboids
    for (level = 1; level <= totalLevels; level++) {
        partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);

        allRDDs[level] = allRDDs[level - 1].flatMapToPair(flatMapFunction).reduceByKey(reducerFunction2, partition)
                .persist(storageLevel);
        allRDDs[level - 1].unpersist(false);
        if (envConfig.isSparkSanityCheckEnabled() == true) {
            sanityCheck(allRDDs[level], totalCount, level, cubeStatsReader, countMeasureIndex);
        }
        saveToHDFS(allRDDs[level], metaUrl, cubeName, cubeSegment, outputPath, level, job, envConfig);
    }
    allRDDs[totalLevels].unpersist(false);
    logger.info("Finished on calculating all level cuboids.");
    logger.info("HDFS: Number of bytes written=" + jobListener.metrics.getBytesWritten());
    //HadoopUtil.deleteHDFSMeta(metaUrl);
}
 
Example 27
Source Project: OpenDL   Source File: dATest.java    License: Apache License 2.0
public static void main(String[] args) {
    try {
        int x_feature = 784;
        int y_feature = 10;
        int n_hidden = 160;
        List<SampleVector> samples = DataInput.readMnist("mnist_784_1000.txt", x_feature, y_feature);

        List<SampleVector> trainList = new ArrayList<SampleVector>();
        List<SampleVector> testList = new ArrayList<SampleVector>();
        DataInput.splitList(samples, trainList, testList, 0.7);

        JavaSparkContext context = SparkContextBuild.getContext(args);
        JavaRDD<SampleVector> rdds = context.parallelize(trainList);
        rdds.count();
        logger.info("RDD ok.");

        AutoEncoder da = new AutoEncoder(x_feature, n_hidden);
        SGDTrainConfig config = new SGDTrainConfig();
        config.setUseCG(true);
        config.setDoCorruption(true);
        config.setCorruption_level(0.25);
        config.setCgEpochStep(50);
        config.setCgTolerance(0);
        config.setCgMaxIterations(10);
        config.setMaxEpochs(50);
        config.setNbrModelReplica(4);
        config.setMinLoss(0.01);
        config.setUseRegularization(true);
        config.setMrDataStorage(StorageLevel.MEMORY_ONLY());
        config.setPrintLoss(true);
        config.setLossCalStep(3);

        logger.info("Start to train dA.");
        DownpourSGDTrain.train(da, rdds, config);

        double[] reconstruct_x = new double[x_feature];
        double totalError = 0;
        for (SampleVector test : testList) {
            da.reconstruct(test.getX(), reconstruct_x);
            totalError += ClassVerify.squaredError(test.getX(), reconstruct_x);
        }
        logger.info("Mean square error is " + totalError / testList.size());
    } catch (Throwable e) {
        logger.error("", e);
    }
}
 
Example 28
Source Project: OpenDL   Source File: RBMTest.java    License: Apache License 2.0
public static void main(String[] args) {
    try {
        int x_feature = 784;
        int y_feature = 10;
        int n_hidden = 160;
        List<SampleVector> samples = DataInput.readMnist("mnist_784_1000.txt", x_feature, y_feature);

        List<SampleVector> trainList = new ArrayList<SampleVector>();
        List<SampleVector> testList = new ArrayList<SampleVector>();
        DataInput.splitList(samples, trainList, testList, 0.7);

        JavaSparkContext context = SparkContextBuild.getContext(args);
        JavaRDD<SampleVector> rdds = context.parallelize(trainList);
        rdds.count();
        logger.info("RDD ok.");

        RBM rbm = new RBM(x_feature, n_hidden);
        SGDTrainConfig config = new SGDTrainConfig();
        config.setUseCG(true);
        config.setCgEpochStep(50);
        config.setCgTolerance(0);
        config.setCgMaxIterations(10);
        config.setMaxEpochs(50);
        config.setNbrModelReplica(4);
        config.setMinLoss(0.01);
        config.setMrDataStorage(StorageLevel.MEMORY_ONLY());
        config.setPrintLoss(true);
        config.setLossCalStep(3);

        logger.info("Start to train RBM.");
        DownpourSGDTrain.train(rbm, rdds, config);

        double[] reconstruct_x = new double[x_feature];
        double totalError = 0;
        for (SampleVector test : testList) {
            rbm.reconstruct(test.getX(), reconstruct_x);
            totalError += ClassVerify.squaredError(test.getX(), reconstruct_x);
        }
        logger.info("Mean square error is " + totalError / testList.size());
    } catch (Throwable e) {
        logger.error("", e);
    }
}
 
Example 29
public static void main(String[] args) throws InterruptedException {

        System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

        final SparkConf conf = new SparkConf()
            .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
            .setAppName(APPLICATION_NAME);

        JavaSparkContext javaSparkContext = new JavaSparkContext(conf);        
        
        List<Tuple2<Object, String>> listOfVertex = new ArrayList<>();
        listOfVertex.add(new Tuple2<>(1L, "James"));
        listOfVertex.add(new Tuple2<>(2L, "Andy"));
        listOfVertex.add(new Tuple2<>(3L, "Ed"));
        listOfVertex.add(new Tuple2<>(4L, "Roger"));
        listOfVertex.add(new Tuple2<>(5L, "Tony"));

        List<Edge<String>> listOfEdge = new ArrayList<>();
        listOfEdge.add(new Edge<>(2, 1, "Friend"));
        listOfEdge.add(new Edge<>(3, 1, "Friend"));
        listOfEdge.add(new Edge<>(3, 2, "Colleague"));    
        listOfEdge.add(new Edge<>(3, 5, "Partner"));
        listOfEdge.add(new Edge<>(4, 3, "Boss"));        
        listOfEdge.add(new Edge<>(5, 2, "Partner"));       
    
        JavaRDD<Tuple2<Object, String>> vertexRDD = javaSparkContext.parallelize(listOfVertex);
        JavaRDD<Edge<String>> edgeRDD = javaSparkContext.parallelize(listOfEdge);

        ClassTag<String> stringTag = scala.reflect.ClassTag$.MODULE$.apply(String.class);

        Graph<String, String> graph = Graph.apply(
            vertexRDD.rdd(),
            edgeRDD.rdd(),
            "",
            StorageLevel.MEMORY_ONLY(),
            StorageLevel.MEMORY_ONLY(),
            stringTag,
            stringTag);

        //apply specific algorithms, such as PageRank

        graph.vertices()
            .saveAsTextFile(VERTICES_FOLDER_PATH);

        graph.edges()
            .saveAsTextFile(EDGES_FOLDER_PATH);

        javaSparkContext.close();
    }