Java Code Examples for org.apache.spark.api.java.JavaPairRDD#filter()

The following examples show how to use org.apache.spark.api.java.JavaPairRDD#filter(). The examples are taken from open-source projects; the source file, project, and license are noted above each example.
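
Before the project examples, here is a minimal, self-contained sketch of how JavaPairRDD#filter() behaves: the predicate receives each key/value pair as a Tuple2, and the returned RDD keeps only the pairs for which the predicate returns true. This sketch is not taken from any of the projects below; the class name, local master setting, and sample data are illustrative assumptions.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class JavaPairRddFilterSketch {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("JavaPairRddFilterSketch");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // build a small pair RDD of (word, count) tuples
        JavaPairRDD<String, Integer> counts = sc.parallelizePairs(Arrays.asList(
                new Tuple2<>("spark", 3),
                new Tuple2<>("filter", 1),
                new Tuple2<>("rdd", 5)));

        // keep only the pairs whose value is at least 2; filter() applies the
        // predicate to each Tuple2 and returns a new JavaPairRDD of the same type
        JavaPairRDD<String, Integer> frequent = counts.filter(kv -> kv._2() >= 2);

        System.out.println(frequent.collect()); // [(spark,3), (rdd,5)]

        sc.close();
    }
}

The same call shape appears throughout the examples below, whether the predicate is a lambda, an explicitly typed Function, or a reusable filter class such as Pisces or FilterDiagMatrixBlocksFunction.
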
Example 1
Source File: BroadCastParam.java    From sparkResearch with Apache License 2.0
/**
 * Broadcast variable test
 * @param args
 */
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder()
            .master("local[4]").appName("AttackFind").getOrCreate();
    // initialize the JavaSparkContext
    JavaSparkContext javaSparkContext = JavaSparkContext.fromSparkContext(sparkSession.sparkContext());
    // define the data for a broadcast variable here
    // as mentioned earlier, broadcast variables are read-only
    final List<String> broadcastList = Arrays.asList("190099HJLL","98392QUEYY","561788LLKK");
    // create the broadcast variable and send it to the executors
    final Broadcast<List<String>> broadcast = javaSparkContext.broadcast(broadcastList);
    // define the data
    JavaPairRDD<String,String> pairRDD = javaSparkContext.parallelizePairs(Arrays.asList(new Tuple2<>("000", "000")));
    JavaPairRDD<String,String> resultPairRDD = pairRDD.filter((Function<Tuple2<String, String>, Boolean>) v1 -> broadcast.value().contains(v1._2));
    resultPairRDD.foreach((VoidFunction<Tuple2<String, String>>) System.out::println);
}
 
Example 2
Source File: WordCount.java    From spark-on-spring-boot with Apache License 2.0
public void count() {

        JavaRDD<String> tokenized = javaSparkContext.textFile(inputFile).flatMap((s1) -> Arrays.asList(s1.split(" ")));

        // count the occurrence of each word
        JavaPairRDD<String, Integer> counts = tokenized
                .mapToPair(s -> new Tuple2<>(s, 1))
                .reduceByKey((i1, i2) -> i1 + i2);

        // filter out words with less than threshold occurrences
        JavaPairRDD<String, Integer> filtered = counts.filter(tup -> tup._2() >= threshold);

        // count characters
        JavaPairRDD<Character, Integer> charCounts = filtered.flatMap(
                s -> {
                    Collection<Character> chars = new ArrayList<>(s._1().length());
                    for (char c : s._1().toCharArray()) {
                        chars.add(c);
                    }
                    return chars;
                }
        ).mapToPair(c -> new Tuple2<>(c, 1))
                .reduceByKey((i1, i2) -> i1 + i2);

        System.out.println(charCounts.collect());
    }
 
Example 3
Source File: InteractionAnalysisSimple.java    From mmtf-spark with Apache License 2.0
/**
 * @param args no input arguments
 * @throws IOException if MmtfReader fails
 */
public static void main(String[] args) throws IOException {

	String path = MmtfReader.getMmtfFullPath();
    
    long start = System.nanoTime();
    
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(InteractionAnalysisSimple.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    
    // read PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);
    
    // use only representative structures
    int sequenceIdentity = 40;
    double resolution = 2.5;
    pdb = pdb.filter(new Pisces(sequenceIdentity, resolution));
    
    GroupInteractionExtractor finder = new GroupInteractionExtractor("ZN", 3);
    Dataset<Row> interactions = finder.getDataset(pdb).cache();
    
    // show the data schema of the dataset and some data
    interactions.printSchema();
    interactions.show(20);

    System.out.println("# interactions: " + interactions.count());

    // list the top 10 residue types that interact with Zn
    interactions
            .groupBy(col("residue2"))
            .count()
            .sort(col("count").desc())
            .show(10);

    long end = System.nanoTime();

    System.out.println("Time:     " + (end-start)/1E9 + " sec.");
    
    sc.close();
}
 
Example 4
Source File: ALSUpdate.java    From oryx with Apache License 2.0
/**
 * @param parsedRDD parsed input as {@code String[]}
 * @param bUserIDToIndex broadcast mapping from user IDs to indices
 * @param bItemIDToIndex broadcast mapping from item IDs to indices
 * @return {@link Rating}s ordered by timestamp
 */
private JavaRDD<Rating> parsedToRatingRDD(JavaRDD<String[]> parsedRDD,
                                          Broadcast<? extends Map<String,Integer>> bUserIDToIndex,
                                          Broadcast<? extends Map<String,Integer>> bItemIDToIndex) {
  JavaPairRDD<Long,Rating> timestampRatingRDD = parsedRDD.mapToPair(tokens -> {
    try {
      return new Tuple2<>(
          Long.valueOf(tokens[3]),
          new Rating(bUserIDToIndex.value().get(tokens[0]),
                     bItemIDToIndex.value().get(tokens[1]),
                     // Empty value means 'delete'; propagate as NaN
                     tokens[2].isEmpty() ? Double.NaN : Double.parseDouble(tokens[2])));
    } catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
      log.warn("Bad input: {}", Arrays.toString(tokens));
      throw e;
    }
  });

  if (decayFactor < 1.0) {
    double factor = decayFactor;
    long now = System.currentTimeMillis();
    timestampRatingRDD = timestampRatingRDD.mapToPair(timestampRating -> {
        long timestamp = timestampRating._1();
        return new Tuple2<>(timestamp, decayRating(timestampRating._2(), timestamp, now, factor));
      });
  }

  if (decayZeroThreshold > 0.0) {
    double theThreshold = decayZeroThreshold;
    timestampRatingRDD = timestampRatingRDD.filter(timestampRating -> timestampRating._2().rating() > theThreshold);
  }

  return timestampRatingRDD.sortByKey().values();
}
 
Example 5
Source File: ALSUpdate.java    From oryx with Apache License 2.0
/**
 * Combines {@link Rating}s with the same user/item into one. For implicit data the
 * score is the sum of all scores; otherwise the last rating wins.
 */
private JavaRDD<Rating> aggregateScores(JavaRDD<? extends Rating> original, double epsilon) {
  JavaPairRDD<Tuple2<Integer,Integer>,Double> tuples =
      original.mapToPair(rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating()));

  JavaPairRDD<Tuple2<Integer,Integer>,Double> aggregated;
  if (implicit) {
    // TODO can we avoid groupByKey? reduce, combine, fold don't seem viable since
    // they don't guarantee the delete elements are properly handled
    aggregated = tuples.groupByKey().mapValues(MLFunctions.SUM_WITH_NAN);
  } else {
    // For non-implicit, last wins.
    aggregated = tuples.foldByKey(Double.NaN, (current, next) -> next);
  }

  JavaPairRDD<Tuple2<Integer,Integer>,Double> noNaN =
      aggregated.filter(kv -> !Double.isNaN(kv._2()));

  if (logStrength) {
    return noNaN.map(userProductScore -> new Rating(
        userProductScore._1()._1(),
        userProductScore._1()._2(),
        Math.log1p(userProductScore._2() / epsilon)));
  } else {
    return noNaN.map(userProductScore -> new Rating(
        userProductScore._1()._1(),
        userProductScore._1()._2(),
        userProductScore._2()));
  }
}
 
Example 6
Source File: TieredSpatialJoin.java    From geowave with Apache License 2.0
private JavaPairRDD<GeoWaveInputKey, ByteArray> joinAndCompareTiers(
    final JavaPairRDD<ByteArray, Tuple2<GeoWaveInputKey, Geometry>> leftTier,
    final JavaPairRDD<ByteArray, Tuple2<GeoWaveInputKey, Geometry>> rightTier,
    final Broadcast<GeomFunction> geomPredicate,
    final int highestPartitionCount,
    final HashPartitioner partitioner) {
  // Cogroup groups on the same tier ByteArrayId and pairs them into Iterable sets.
  JavaPairRDD<ByteArray, Tuple2<Iterable<Tuple2<GeoWaveInputKey, Geometry>>, Iterable<Tuple2<GeoWaveInputKey, Geometry>>>> joinedTiers =
      leftTier.cogroup(rightTier, partitioner);

  // Filter only the pairs that have data on both sides; the bucket strategy
  // should have been accounted for by this point. We need to go through the
  // pairs and test each feature against each other, ending with a combined
  // RDD for that tier.
  joinedTiers =
      joinedTiers.filter(t -> t._2._1.iterator().hasNext() && t._2._2.iterator().hasNext());

  final JavaPairRDD<GeoWaveInputKey, ByteArray> finalMatches =
      joinedTiers.flatMapValues(
          (Function<Tuple2<Iterable<Tuple2<GeoWaveInputKey, Geometry>>, Iterable<Tuple2<GeoWaveInputKey, Geometry>>>, Iterable<GeoWaveInputKey>>) t -> {
            final GeomFunction predicate = geomPredicate.value();

            final HashSet<GeoWaveInputKey> results = Sets.newHashSet();
            for (final Tuple2<GeoWaveInputKey, Geometry> leftTuple : t._1) {
              for (final Tuple2<GeoWaveInputKey, Geometry> rightTuple : t._2) {
                if (predicate.call(leftTuple._2, rightTuple._2)) {
                  results.add(leftTuple._1);
                  results.add(rightTuple._1);
                }
              }
            }
            return results;
          }).mapToPair(Tuple2::swap).reduceByKey(partitioner, (id1, id2) -> id1).persist(
              StorageLevel.MEMORY_ONLY_SER());

  return finalMatches;
}
 
Example 7
Source File: AggregateUnarySPInstruction.java    From systemds with Apache License 2.0
private void processMatrixAggregate(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	DataCharacteristics mc = sec.getDataCharacteristics(input1.getName());

	//get input
	JavaPairRDD<MatrixIndexes,MatrixBlock> in = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
	JavaPairRDD<MatrixIndexes,MatrixBlock> out = in;

	//filter input blocks for trace
	if( getOpcode().equalsIgnoreCase("uaktrace") )
		out = out.filter(new FilterDiagMatrixBlocksFunction());

	//execute unary aggregate operation
	AggregateUnaryOperator auop = (AggregateUnaryOperator)_optr;
	AggregateOperator aggop = _aop;

	//perform aggregation if necessary and put output into symbol table
	if( _aggtype == SparkAggType.SINGLE_BLOCK )
	{
		if( auop.sparseSafe )
			out = out.filter(new FilterNonEmptyBlocksFunction());

		JavaRDD<MatrixBlock> out2 = out.map(
				new RDDUAggFunction2(auop, mc.getBlocksize()));
		MatrixBlock out3 = RDDAggregateUtils.aggStable(out2, aggop);

		//drop correction after aggregation
		out3.dropLastRowsOrColumns(aggop.correction);

		//put output block into symbol table (no lineage because single block)
		//this also includes implicit maintenance of matrix characteristics
		sec.setMatrixOutput(output.getName(), out3);
	}
	else //MULTI_BLOCK or NONE
	{
		if( _aggtype == SparkAggType.NONE ) {
			//in case of no block aggregation, we always drop the correction as well as
			//use a partitioning-preserving mapvalues
			out = out.mapValues(new RDDUAggValueFunction(auop, mc.getBlocksize()));
		}
		else if( _aggtype == SparkAggType.MULTI_BLOCK ) {
			//in case of multi-block aggregation, we always keep the correction
			out = out.mapToPair(new RDDUAggFunction(auop, mc.getBlocksize()));
			out = RDDAggregateUtils.aggByKeyStable(out, aggop, false);

			//drop correction after aggregation if required (aggbykey creates
			//partitioning, drop correction via partitioning-preserving mapvalues)
			if( auop.aggOp.existsCorrection() )
				out = out.mapValues( new AggregateDropCorrectionFunction(aggop) );
		}

		//put output RDD handle into symbol table
		updateUnaryAggOutputDataCharacteristics(sec, auop.indexFn);
		sec.setRDDHandleForVariable(output.getName(), out);
		sec.addLineageRDD(output.getName(), input1.getName());
	}
}
 
Example 8
Source File: InteractionAnalysisAdvanced.java    From mmtf-spark with Apache License 2.0
/**
 * @param args no input arguments
 * @throws IOException if MmtfReader fails
 */
public static void main(String[] args) throws IOException {

	String path = MmtfReader.getMmtfFullPath();
     
    long start = System.nanoTime();
    
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(InteractionAnalysisAdvanced.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    
    // read PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);
   
    // get non-redundant subset
    pdb = pdb.filter(new Pisces(40, 2.5));
    
    // find Zinc interactions within 3 Angstroms
    GroupInteractionExtractor finder = new GroupInteractionExtractor("ZN", 3);
    Dataset<Row> interactions = finder.getDataset(pdb).cache();
    
    // show the data schema of the dataset and some data
    interactions.printSchema();
    interactions.show(20);

    long n = interactions.count();
    System.out.println("# interactions: " + n);

    System.out.println("Top interacting groups");

    Dataset<Row> topGroups = interactions
            .groupBy("residue2")
            .count();

    topGroups
            .sort(col("count").desc()) // sort descending by count
            .show(10);

    System.out.println("Top interacting group/atoms types");

    Dataset<Row> topGroupsAndAtoms = interactions
            .filter("element2 != 'C'") // exclude carbon interactions
            .groupBy("residue2", "atom2")
            .count();

    topGroupsAndAtoms
            .withColumn("frequency", col("count").divide(n)) // add column with frequency of occurrence
            .filter("frequency > 0.01") // filter out occurrences < 1 %
            .sort(col("frequency").desc()) // sort descending
            .show(20);

    // print the top 10 interacting elements
    System.out.println("Top interacting elements");
    Dataset<Row> topElements = interactions
            .filter("element2 != 'C'") // exclude carbon interactions
            .groupBy("element2")
            .count();

    topElements
            .withColumn("frequency", col("count").divide(n))
            .filter("frequency > 0.01") // filter out occurrences < 1 %
            .sort(col("frequency").desc()) // sort descending
            .show(10);

    interactions
            .groupBy("element2")
            .avg("distance")
            .sort("avg(distance)")
            .show(10);

    // aggregate multiple statistics
    // Note: import static org.apache.spark.sql.functions.* is required,
    // e.g., org.apache.spark.sql.functions.avg; see that class for all available functions
    interactions
            .groupBy("element2")
            .agg(count("distance"), avg("distance"), min("distance"), max("distance"), kurtosis("distance"))
            .show(10);

    long end = System.nanoTime();

    System.out.println("Time:     " + (end-start)/1E9 + " sec.");
    
    sc.close();
}
 
Example 9
Source File: AtpInteractionAnalysis.java    From mmtf-spark with Apache License 2.0
/**
 * @param args input arguments
 * @throws IOException if MmtfReader fails
 */
public static void main(String[] args) throws IOException {

	String path = MmtfReader.getMmtfFullPath();
     
    long start = System.nanoTime();
    
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(AtpInteractionAnalysis.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    
    // read PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);
   
    // filter by sequence identity subset
    int sequenceIdentity = 20;
    double resolution = 2.0;
    pdb = pdb.filter(new Pisces(sequenceIdentity, resolution));
    
    // find ATP interactions within 3 Angstroms
    GroupInteractionExtractor finder = new GroupInteractionExtractor("ATP", 3);
    Dataset<Row> interactions = finder.getDataset(pdb).cache();
    
    // only analyze interactions with the oxygens in the
    // terminal phosphate group of ATP (O1G, O2G, O3G)
    interactions = interactions.filter("atom1 LIKE('O%G')");

    // show the data schema of the dataset and some data
    interactions.printSchema();
    interactions.show(20);

    long n = interactions.count();
    System.out.println("# interactions: " + n);

    System.out.println("Top interacting groups");

    Dataset<Row> topGroups = interactions
            .groupBy("residue2")
            .count();

    topGroups
            .sort(col("count").desc()) // sort descending by count
            .show(10);

    System.out.println("Top interacting group/atoms types");

    Dataset<Row> topGroupsAndAtoms = interactions
            .groupBy("residue2", "atom2")
            .count();

    topGroupsAndAtoms
            .withColumn("frequency", col("count").divide(n)) // add column with frequency of occurrence
            .sort(col("frequency").desc()) // sort descending
            .show(10);

    long end = System.nanoTime();

    System.out.println("Time:     " + (end-start)/1E9 + " sec.");
    
    sc.close();
}
 
Example 10
Source File: FilterBySequenceRegex.java    From mmtf-spark with Apache License 2.0
/**
 * @param args
 * @throws FileNotFoundException 
 */
public static void main(String[] args) throws FileNotFoundException {

	String path = MmtfReader.getMmtfReducedPath();
    
    long start = System.nanoTime();
    
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(FilterBySequenceRegex.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
	 
    // read PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path,  sc);

    // find structures containing a Zinc finger motif
    pdb = pdb.filter(new ContainsSequenceRegex("C.{2,4}C.{12}H.{3,5}H"));
    
    System.out.println("Number of PDB entries containing a Zinc finger motif: " + pdb.count());
  
    long end = System.nanoTime();
    
    System.out.println("Time: " + (end-start)/1E9 + " sec.");
    
    sc.close();
}
 
Example 11
Source File: FilterByResolution.java    From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws FileNotFoundException {

		String path = MmtfReader.getMmtfReducedPath();
	    
	    long start = System.nanoTime();
	    
	    // instantiate Spark. Each Spark application needs these two lines of code.
	    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(FilterByResolution.class.getSimpleName());
	    JavaSparkContext sc = new JavaSparkContext(conf);

	    // read entire PDB in MMTF format
	    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path,  sc);

	    // filter PDB entries by resolution. Entries without resolution values,
	    // e.g., NMR structures, will be filtered out as well.
	    pdb = pdb.filter(new Resolution(0.0, 2.0));
	    
	    System.out.println("# structures: " + pdb.count());
	   
	    // close Spark
	    sc.close();
	    
	    long end = System.nanoTime();
	    System.out.println((end-start)/1E9 + " sec.");    
	}
 
Example 12
Source File: TieredSpatialJoin.java    From geowave with Apache License 2.0
private JavaPairRDD<ByteArray, Tuple2<GeoWaveInputKey, Geometry>> filterTier(
    final JavaPairRDD<ByteArray, Tuple2<GeoWaveInputKey, Geometry>> indexedRDD,
    final byte tierId) {
  return indexedRDD.filter(v1 -> v1._1().getBytes()[0] == tierId);
}