Java Code Examples for org.apache.spark.api.java.JavaRDD#unpersist()

The following examples show how to use org.apache.spark.api.java.JavaRDD#unpersist(). Each example is taken from an open-source project; the source file and license are noted above the code.
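For orientation, here is a minimal, self-contained sketch of the persist/unpersist lifecycle that the examples below follow. It is not taken from any of the projects; the class name, data, and storage level are illustrative only.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.storage.StorageLevel;

public class UnpersistSketch {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setMaster("local").setAppName("UnpersistSketch");
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
      // Persist, run an action so the blocks are actually materialized, then release them.
      JavaRDD<Integer> numbers = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5))
          .persist(StorageLevel.MEMORY_ONLY());
      long count = numbers.count(); // materializes the cached blocks
      numbers.unpersist();          // marks the RDD non-persistent and removes its blocks
      System.out.println("count = " + count);
    }
  }
}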
Example 1
Source File: HoodieSimpleIndex.java    From hudi with Apache License 2.0
/**
 * Tags records location for incoming records.
 *
 * @param inputRecordRDD {@link JavaRDD} of incoming records
 * @param jsc            instance of {@link JavaSparkContext} to use
 * @param hoodieTable    instance of {@link HoodieTable} to use
 * @return {@link JavaRDD} of records with record locations set
 */
protected JavaRDD<HoodieRecord<T>> tagLocationInternal(JavaRDD<HoodieRecord<T>> inputRecordRDD, JavaSparkContext jsc,
                                                       HoodieTable<T> hoodieTable) {
  if (config.getSimpleIndexUseCaching()) {
    inputRecordRDD.persist(SparkConfigUtils.getSimpleIndexInputStorageLevel(config.getProps()));
  }

  JavaPairRDD<HoodieKey, HoodieRecord<T>> keyedInputRecordRDD = inputRecordRDD.mapToPair(record -> new Tuple2<>(record.getKey(), record));
  JavaPairRDD<HoodieKey, HoodieRecordLocation> existingLocationsOnTable = fetchRecordLocationsForAffectedPartitions(keyedInputRecordRDD.keys(), jsc, hoodieTable,
      config.getSimpleIndexParallelism());

  JavaRDD<HoodieRecord<T>> taggedRecordRDD = keyedInputRecordRDD.leftOuterJoin(existingLocationsOnTable)
      .map(entry -> {
        final HoodieRecord<T> untaggedRecord = entry._2._1;
        final Option<HoodieRecordLocation> location = Option.ofNullable(entry._2._2.orNull());
        return HoodieIndexUtils.getTaggedRecord(untaggedRecord, location);
      });

  if (config.getSimpleIndexUseCaching()) {
    inputRecordRDD.unpersist();
  }
  return taggedRecordRDD;
}
 
Example 2
Source File: AssemblyContigAlignmentsConfigPicker.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Parses input alignments into custom {@link AlignmentInterval} format, and
 * performs a primitive filtering implemented in
 * {@link #notDiscardForBadMQ(AlignedContig)} that
 * gets rid of contigs with no good alignments.
 *
 * It's important to remember that this step doesn't select alignments,
 * but only parses alignments and either keeps the whole contig or drops it completely.
 */
private static JavaRDD<AlignedContig> convertRawAlignmentsToAlignedContigAndFilterByQuality(final JavaRDD<GATKRead> assemblyAlignments,
                                                                                            final SAMFileHeader header,
                                                                                            final Logger toolLogger) {
    assemblyAlignments.cache();
    toolLogger.info( "Processing " + assemblyAlignments.count() + " raw alignments from " +
                     assemblyAlignments.map(GATKRead::getName).distinct().count() + " contigs.");

    final JavaRDD<AlignedContig> parsedContigAlignments =
            new SvDiscoverFromLocalAssemblyContigAlignmentsSpark.SAMFormattedContigAlignmentParser(assemblyAlignments, header, false)
                    .getAlignedContigs()
                    .filter(AssemblyContigAlignmentsConfigPicker::notDiscardForBadMQ).cache();
    assemblyAlignments.unpersist();
    toolLogger.info( "Filtering on MQ left " + parsedContigAlignments.count() + " contigs.");
    return parsedContigAlignments;
}
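A detail worth keeping in mind with the cache-then-unpersist pattern above: unpersist() on a parent RDD removes its cached blocks, so if a derived RDD has only had cache() called on it but has not yet been materialized by an action, its first action recomputes the full lineage from the original input rather than from the parent's cache. Below is a minimal sketch of an ordering that keeps the parent's cache useful while the derived RDD is built; it assumes an existing JavaSparkContext named sc, and the names and data are illustrative, not from the project above.

import java.util.Arrays;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

JavaRDD<Integer> parent = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5)).cache();
parent.count();                                            // materialize the parent's cache
JavaRDD<Integer> derived = parent.map(x -> x * 2).cache();
derived.count();                                           // materialize the derived RDD first ...
parent.unpersist();                                        // ... then drop the parent's blocks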
 
Example 3
Source File: GarmadonSparkStorageStatusListenerIntegrationTest.java    From garmadon with Apache License 2.0
@Test
public void SparkStorageStatusListener_should_track_executor_storage_status() throws InterruptedException {
    assertTrue(rdds.isEmpty());
    assertTrue(executors.isEmpty());

    //Memory
    JavaRDD rddMemory = makeRDD("MemRDD", StorageLevel.MEMORY_ONLY());
    rddMemory.collect();

    checkExecutorRDDStorage("driver", equalTo(0L), greaterThan(0L), equalTo(0L));

    //Disk
    JavaRDD rddDisk = makeRDD("DiskRDD", StorageLevel.DISK_ONLY());
    rddDisk.collect();

    checkExecutorRDDStorage("driver", equalTo(0L), greaterThan(0L), greaterThan(0L));

    //OffHeap
    JavaRDD rddOffHeap = makeRDD("OffHeapRDD", StorageLevel.OFF_HEAP());
    rddOffHeap.collect();

    checkExecutorRDDStorage("driver", greaterThan(0L), greaterThan(0L), greaterThan(0L));

    rddMemory.unpersist(true);
    //wait for the EventBus to fire the unpersistRDD event
    Thread.sleep(1000);
    checkExecutorRDDStorage("driver", greaterThan(0L), equalTo(0L), greaterThan(0L));

    rddDisk.unpersist(true);
    //wait for the EventBus to fire the unpersistRDD event
    Thread.sleep(1000);
    checkExecutorRDDStorage("driver", greaterThan(0L), equalTo(0L), equalTo(0L));

    rddOffHeap.unpersist(true);
    //wait for the EventBus to fire the unpersistRDD event
    Thread.sleep(1000);
    checkExecutorRDDStorage("driver", equalTo(0L), equalTo(0L), equalTo(0L));
}
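The test above calls unpersist(true), which asks Spark to block until the cached blocks have actually been removed; unpersist(false) returns immediately and the blocks are dropped asynchronously. In either case the unpersistRDD event reaches listeners through Spark's asynchronous event bus, which is why the test still sleeps briefly before asserting. A minimal sketch of the two overloads, assuming an existing JavaSparkContext named sc; the data and storage level are illustrative.

import java.util.Arrays;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.storage.StorageLevel;

JavaRDD<String> words = sc.parallelize(Arrays.asList("a", "b", "c"))
    .persist(StorageLevel.MEMORY_ONLY());
words.count();             // materialize the cached blocks
words.unpersist(true);     // blocks until the cached blocks have been removed
// words.unpersist(false)  // would return immediately; removal happens asynchronously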
 
Example 4
Source File: HoodieBloomIndex.java    From hudi with Apache License 2.0
@Override
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, JavaSparkContext jsc,
                                            HoodieTable<T> hoodieTable) {

  // Step 0: cache the input record RDD
  if (config.getBloomIndexUseCaching()) {
    recordRDD.persist(SparkConfigUtils.getBloomIndexInputStorageLevel(config.getProps()));
  }

  // Step 1: Extract out thinner JavaPairRDD of (partitionPath, recordKey)
  JavaPairRDD<String, String> partitionRecordKeyPairRDD =
      recordRDD.mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey()));

  // Lookup indexes for all the partition/recordkey pair
  JavaPairRDD<HoodieKey, HoodieRecordLocation> keyFilenamePairRDD =
      lookupIndex(partitionRecordKeyPairRDD, jsc, hoodieTable);

  // Cache the result, for subsequent stages.
  if (config.getBloomIndexUseCaching()) {
    keyFilenamePairRDD.persist(StorageLevel.MEMORY_AND_DISK_SER());
  }
  if (LOG.isDebugEnabled()) {
    long totalTaggedRecords = keyFilenamePairRDD.count();
    LOG.debug("Number of update records (ones tagged with a fileID): " + totalTaggedRecords);
  }

  // Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys
  // Cost: 4 sec.
  JavaRDD<HoodieRecord<T>> taggedRecordRDD = tagLocationBacktoRecords(keyFilenamePairRDD, recordRDD);

  if (config.getBloomIndexUseCaching()) {
    recordRDD.unpersist(); // unpersist the input Record RDD
    keyFilenamePairRDD.unpersist();
  }
  return taggedRecordRDD;
}
 
Example 5
Source File: SegmentedCpxVariantSimpleVariantExtractor.java    From gatk with BSD 3-Clause "New" or "Revised" License
public static ExtractedSimpleVariants extract(final JavaRDD<VariantContext> complexVariants,
                                              final SvDiscoveryInputMetaData svDiscoveryInputMetaData,
                                              final JavaRDD<GATKRead> assemblyRawAlignments) {

    final Broadcast<ReferenceMultiSparkSource> referenceBroadcast = svDiscoveryInputMetaData.getReferenceData().getReferenceBroadcast();

    // still does an inefficient 2-pass on the input RDD: one pass for zero- and one-segment calls, the other for multi-segment calls
    // this is due to restrictions on how multi-segment calls have to be re-interpreted
    final ZeroAndOneSegmentCpxVariantExtractor zeroAndOneSegmentCpxVariantExtractor = new ZeroAndOneSegmentCpxVariantExtractor();
    final JavaRDD<VariantContext> zeroOrOneSegmentComplexVariants = complexVariants
            .filter(vc -> SVUtils.getAttributeAsStringList(vc, CPX_SV_REF_SEGMENTS).size() < 2)
            .cache();
    final List<VariantContext> reInterpretedZeroAndOneSegmentCalls =
            zeroOrOneSegmentComplexVariants
                    .flatMap(vc -> zeroAndOneSegmentCpxVariantExtractor.extract(vc, referenceBroadcast.getValue()).iterator())
                    .collect();
    zeroOrOneSegmentComplexVariants.unpersist(false);

    final JavaRDD<VariantContext> multiSegmentCalls =
            complexVariants.filter(vc -> SVUtils.getAttributeAsStringList(vc, CPX_SV_REF_SEGMENTS).size() > 1)
                    .cache();

    final MultiSegmentsCpxVariantExtractor multiSegmentsCpxVariantExtractor = new MultiSegmentsCpxVariantExtractor();
    final List<VariantContext> sourceWithLessAnnotations = multiSegmentCalls
            .flatMap(vc -> multiSegmentsCpxVariantExtractor.extract(vc, referenceBroadcast.getValue()).iterator()).collect();

    final List<VariantContext> sourceWithMoreAnnotations =
            reInterpretMultiSegmentComplexVarThroughAlignmentPairIteration(multiSegmentCalls,
                    svDiscoveryInputMetaData, assemblyRawAlignments);

    final List<VariantContext> reInterpretMultiSegmentsCalls = removeDuplicates(sourceWithLessAnnotations, sourceWithMoreAnnotations);
    multiSegmentCalls.unpersist(false);

    return new ExtractedSimpleVariants(reInterpretedZeroAndOneSegmentCalls, reInterpretMultiSegmentsCalls);
}
 
Example 6
Source File: MiniBatchTests.java    From deeplearning4j with Apache License 2.0
@Test
public void testMiniBatches() throws Exception {
    log.info("Setting up Spark Context...");
    JavaRDD<String> lines = sc.textFile(new ClassPathResource("svmLight/iris_svmLight_0.txt")
                    .getTempFileFromArchive().toURI().toString()).cache();
    long count = lines.count();
    assertEquals(300, count);
    // map each line to a DataSet (Matrix/INDArray)
    RecordReader rr = new SVMLightRecordReader();
    Configuration c = new Configuration();
    c.set(SVMLightRecordReader.NUM_FEATURES, "5");
    rr.setConf(c);
    JavaRDD<DataSet> points = lines.map(new RecordReaderFunction(rr, 4, 3)).cache();
    count = points.count();
    assertEquals(300, count);

    List<DataSet> collect = points.collect();

    points = points.repartition(1);
    JavaRDD<DataSet> miniBatches = new RDDMiniBatches(10, points).miniBatchesJava();
    count = miniBatches.count();
    List<DataSet> list = miniBatches.collect();
    assertEquals(30, count);    //Expect exactly 30 from 1 partition... could be more for multiple input partitions

    lines.unpersist();
    points.unpersist();
    miniBatches.map(new DataSetAssertionFunction());
}
 
Example 7
Source File: PersistExample.java    From Apache-Spark-2x-for-Java-Developers with MIT License
/**
	 * @param args
	 */
public static void main(String[] args) {
    // C:\Users\sumit.kumar\Downloads\bin\warehouse
    // System.setProperty("hadoop.home.dir", "C:\\Users\\sumit.kumar\\Downloads");
    String logFile = "src/main/resources/Apology_by_Plato.txt"; // Should be some file on your system
    Logger rootLogger = LogManager.getRootLogger();
    rootLogger.setLevel(Level.WARN);
    SparkConf conf = new SparkConf().setMaster("local").setAppName("ActionExamples").set("spark.hadoop.validateOutputSpecs", "false");
    JavaSparkContext sparkContext = new JavaSparkContext(conf);
    JavaRDD<Integer> rdd = sparkContext.parallelize(Arrays.asList(1, 2, 3, 4, 5), 3).cache();
    JavaRDD<Integer> evenRDD = rdd.filter(new org.apache.spark.api.java.function.Function<Integer, Boolean>() {
        @Override
        public Boolean call(Integer v1) throws Exception {
            return (v1 % 2) == 0;
        }
    });

    evenRDD.persist(StorageLevel.MEMORY_AND_DISK());
    evenRDD.foreach(new VoidFunction<Integer>() {
        @Override
        public void call(Integer t) throws Exception {
            System.out.println("The value of RDD are :" + t);
        }
    });

    // unpersisting the RDDs once they are no longer needed
    evenRDD.unpersist();
    rdd.unpersist();

    /* JavaRDD<String> lines = spark.read().textFile(logFile).javaRDD().cache();
    System.out.println("DEBUG: \n" + lines.toDebugString());
    long word = lines.count();
    JavaRDD<String> distinctLines = lines.distinct();
    System.out.println("DEBUG: \n" + distinctLines.toDebugString());
    JavaRDD<String> finalRdd = lines.subtract(distinctLines);

    System.out.println("DEBUG: \n" + finalRdd.toDebugString());
    System.out.println("The count is " + word);
    System.out.println("The count is " + distinctLines.count());
    System.out.println("The count is " + finalRdd.count());

    finalRdd.foreach(new VoidFunction<String>() {
        @Override
        public void call(String t) throws Exception {
            // TODO Auto-generated method stub
            System.out.println(t);
        }
    }); */

    /* SparkConf conf = new SparkConf().setAppName("Simple Application");
    JavaSparkContext sc = new JavaSparkContext(conf);
    StorageLevel newLevel;
    JavaRDD<String> logData = sc.textFile(logFile).cache();

    long numAs = logData.filter(new Function(logFile, logFile, logFile, logFile, false) {
        public Boolean call(String s) { return s.contains("a"); }
    }).count();

    long numBs = logData.filter(new Function(logFile, logFile, logFile, logFile, false) {
        public Boolean call(String s) { return s.contains("b"); }
    }).count();

    System.out.println("Lines with a: " + numAs + ", lines with b: " + numBs);

    sc.stop(); */
}
 
Example 8
Source File: ALSUpdate.java    From oryx with Apache License 2.0
@Override
public PMML buildModel(JavaSparkContext sparkContext,
                       JavaRDD<String> trainData,
                       List<?> hyperParameters,
                       Path candidatePath) {
  int features = (Integer) hyperParameters.get(0);
  double lambda = (Double) hyperParameters.get(1);
  double alpha = (Double) hyperParameters.get(2);
  double epsilon = Double.NaN;
  if (logStrength) {
    epsilon = (Double) hyperParameters.get(3);
  }
  Preconditions.checkArgument(features > 0);
  Preconditions.checkArgument(lambda >= 0.0);
  Preconditions.checkArgument(alpha > 0.0);
  if (logStrength) {
    Preconditions.checkArgument(epsilon > 0.0);
  }

  JavaRDD<String[]> parsedRDD = trainData.map(MLFunctions.PARSE_FN);
  parsedRDD.cache();

  Map<String,Integer> userIDIndexMap = buildIDIndexMapping(parsedRDD, true);
  Map<String,Integer> itemIDIndexMap = buildIDIndexMapping(parsedRDD, false);

  log.info("Broadcasting ID-index mappings for {} users, {} items",
           userIDIndexMap.size(), itemIDIndexMap.size());

  Broadcast<Map<String,Integer>> bUserIDToIndex = sparkContext.broadcast(userIDIndexMap);
  Broadcast<Map<String,Integer>> bItemIDToIndex = sparkContext.broadcast(itemIDIndexMap);

  JavaRDD<Rating> trainRatingData = parsedToRatingRDD(parsedRDD, bUserIDToIndex, bItemIDToIndex);
  trainRatingData = aggregateScores(trainRatingData, epsilon);
  ALS als = new ALS()
      .setRank(features)
      .setIterations(iterations)
      .setLambda(lambda)
      .setCheckpointInterval(5);
  if (implicit) {
    als = als.setImplicitPrefs(true).setAlpha(alpha);
  }

  RDD<Rating> trainingRatingDataRDD = trainRatingData.rdd();
  trainingRatingDataRDD.cache();
  MatrixFactorizationModel model = als.run(trainingRatingDataRDD);
  trainingRatingDataRDD.unpersist(false);

  bUserIDToIndex.unpersist();
  bItemIDToIndex.unpersist();

  parsedRDD.unpersist();

  Broadcast<Map<Integer,String>> bUserIndexToID = sparkContext.broadcast(invertMap(userIDIndexMap));
  Broadcast<Map<Integer,String>> bItemIndexToID = sparkContext.broadcast(invertMap(itemIDIndexMap));

  PMML pmml = mfModelToPMML(model,
                            features,
                            lambda,
                            alpha,
                            epsilon,
                            implicit,
                            logStrength,
                            candidatePath,
                            bUserIndexToID,
                            bItemIndexToID);
  unpersist(model);

  bUserIndexToID.unpersist();
  bItemIndexToID.unpersist();

  return pmml;
}
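Note that the example above mixes two different kinds of unpersist: JavaRDD#unpersist() (and RDD#unpersist(false) on the underlying RDD) releases cached RDD blocks, whereas Broadcast#unpersist() only deletes the executor-side copies of a broadcast variable; the value stays on the driver and is re-sent if the broadcast is used again, and Broadcast#destroy() would remove it everywhere. A small sketch of the broadcast side, assuming an existing JavaSparkContext named sparkContext; the map contents are illustrative.

import java.util.HashMap;
import java.util.Map;
import org.apache.spark.broadcast.Broadcast;

Map<String, Integer> idToIndex = new HashMap<>();
idToIndex.put("user-1", 0);
Broadcast<Map<String, Integer>> bIdToIndex = sparkContext.broadcast(idToIndex);
// ... use bIdToIndex.value() inside RDD transformations ...
bIdToIndex.unpersist();    // drop executor copies; the broadcast can still be reused
// bIdToIndex.destroy();   // would remove the broadcast everywhere, irreversibly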
 
Example 9
Source File: ALSUpdate.java    From oryx with Apache License 2.0
@Override
public double evaluate(JavaSparkContext sparkContext,
                       PMML model,
                       Path modelParentPath,
                       JavaRDD<String> testData,
                       JavaRDD<String> trainData) {

  JavaRDD<String[]> parsedTestRDD = testData.map(MLFunctions.PARSE_FN);
  parsedTestRDD.cache();

  Map<String,Integer> userIDToIndex = buildIDIndexOneWayMap(model, parsedTestRDD, true);
  Map<String,Integer> itemIDToIndex = buildIDIndexOneWayMap(model, parsedTestRDD, false);

  log.info("Broadcasting ID-index mappings for {} users, {} items",
           userIDToIndex.size(), itemIDToIndex.size());

  Broadcast<Map<String,Integer>> bUserIDToIndex = sparkContext.broadcast(userIDToIndex);
  Broadcast<Map<String,Integer>> bItemIDToIndex = sparkContext.broadcast(itemIDToIndex);

  JavaRDD<Rating> testRatingData = parsedToRatingRDD(parsedTestRDD, bUserIDToIndex, bItemIDToIndex);
  double epsilon = Double.NaN;
  if (logStrength) {
    epsilon = Double.parseDouble(AppPMMLUtils.getExtensionValue(model, "epsilon"));
  }
  testRatingData = aggregateScores(testRatingData, epsilon);

  MatrixFactorizationModel mfModel =
      pmmlToMFModel(sparkContext, model, modelParentPath, bUserIDToIndex, bItemIDToIndex);

  parsedTestRDD.unpersist();

  double eval;
  if (implicit) {
    double auc = Evaluation.areaUnderCurve(sparkContext, mfModel, testRatingData);
    log.info("AUC: {}", auc);
    eval = auc;
  } else {
    double rmse = Evaluation.rmse(mfModel, testRatingData);
    log.info("RMSE: {}", rmse);
    eval = -rmse;
  }
  unpersist(mfModel);

  bUserIDToIndex.unpersist();
  bItemIDToIndex.unpersist();

  return eval;
}
 
Example 10
Source File: MLUpdate.java    From oryx with Apache License 2.0
@Override
public void runUpdate(JavaSparkContext sparkContext,
                      long timestamp,
                      JavaPairRDD<Object,M> newKeyMessageData,
                      JavaPairRDD<Object,M> pastKeyMessageData,
                      String modelDirString,
                      TopicProducer<String,String> modelUpdateTopic)
    throws IOException, InterruptedException {

  Objects.requireNonNull(newKeyMessageData);

  JavaRDD<M> newData = newKeyMessageData.values();
  JavaRDD<M> pastData = pastKeyMessageData == null ? null : pastKeyMessageData.values();

  if (newData != null) {
    newData.cache();
    // This forces caching of the RDD. This shouldn't be necessary but we see some freezes
    // when many workers try to materialize the RDDs at once. Hence the workaround.
    newData.foreachPartition(p -> {});
  }
  if (pastData != null) {
    pastData.cache();
    pastData.foreachPartition(p -> {});
  }

  List<List<?>> hyperParameterCombos = HyperParams.chooseHyperParameterCombos(
      getHyperParameterValues(), hyperParamSearch, candidates);

  Path modelDir = new Path(modelDirString);
  Path tempModelPath = new Path(modelDir, ".temporary");
  Path candidatesPath = new Path(tempModelPath, Long.toString(System.currentTimeMillis()));

  FileSystem fs = FileSystem.get(modelDir.toUri(), sparkContext.hadoopConfiguration());
  fs.mkdirs(candidatesPath);

  Path bestCandidatePath = findBestCandidatePath(
      sparkContext, newData, pastData, hyperParameterCombos, candidatesPath);

  Path finalPath = new Path(modelDir, Long.toString(System.currentTimeMillis()));
  if (bestCandidatePath == null) {
    log.info("Unable to build any model");
  } else {
    // Move best model into place
    fs.rename(bestCandidatePath, finalPath);
  }
  // Then delete everything else
  fs.delete(candidatesPath, true);

  if (modelUpdateTopic == null) {
    log.info("No update topic configured, not publishing models to a topic");
  } else {
    // Push PMML model onto update topic, if it exists
    Path bestModelPath = new Path(finalPath, MODEL_FILE_NAME);
    if (fs.exists(bestModelPath)) {
      FileStatus bestModelPathFS = fs.getFileStatus(bestModelPath);
      PMML bestModel = null;
      boolean modelNeededForUpdates = canPublishAdditionalModelData();
      boolean modelNotTooLarge = bestModelPathFS.getLen() <= maxMessageSize;
      if (modelNeededForUpdates || modelNotTooLarge) {
        // Either the model is required for publishAdditionalModelData, or required because it's going to
        // be serialized to Kafka
        try (InputStream in = fs.open(bestModelPath)) {
          bestModel = PMMLUtils.read(in);
        }
      }

      if (modelNotTooLarge) {
        modelUpdateTopic.send("MODEL", PMMLUtils.toString(bestModel));
      } else {
        modelUpdateTopic.send("MODEL-REF", fs.makeQualified(bestModelPath).toString());
      }

      if (modelNeededForUpdates) {
        publishAdditionalModelData(
            sparkContext, bestModel, newData, pastData, finalPath, modelUpdateTopic);
      }
    }
  }

  if (newData != null) {
    newData.unpersist();
  }
  if (pastData != null) {
    pastData.unpersist();
  }
}