Java Code Examples for org.apache.spark.api.java.JavaPairRDD#coalesce()

The following examples show how to use org.apache.spark.api.java.JavaPairRDD#coalesce(). Each example lists its original project and source file above the code.
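Before the project examples, here is a minimal, self-contained sketch of the basic call pattern: coalesce(n) narrows an existing JavaPairRDD down to at most n partitions without a full shuffle, which is typically done before writing output so that fewer, larger files are produced. The class name, local master setting, sample data, and partition counts below are illustrative assumptions and are not taken from any of the projects.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class CoalesceSketch {

    public static void main(String[] args) {
        // local Spark context, for illustration only
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("CoalesceSketch");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // a small pair RDD spread over 8 partitions (hypothetical sample data)
        JavaPairRDD<String, Integer> pairs = sc.parallelizePairs(
                Arrays.asList(new Tuple2<>("a", 1), new Tuple2<>("b", 2), new Tuple2<>("c", 3)), 8);

        // narrow the RDD down to 2 partitions; no shuffle is performed
        JavaPairRDD<String, Integer> coalesced = pairs.coalesce(2);

        System.out.println("partitions before: " + pairs.getNumPartitions());
        System.out.println("partitions after:  " + coalesced.getNumPartitions());

        sc.close();
    }
}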
Example 1
Source File: CreateRepresentativeSet.java    From mmtf-spark with Apache License 2.0
/**
 * Creates a representative set of protein chains and saves it as an MMTF Hadoop Sequence file.
 * @throws IOException if the input path cannot be resolved or the output cannot be written
 */
public static void main(String[] args) throws IOException {
    
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(CreateRepresentativeSet.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // filter by representative protein chains at 40% sequence identity
    // and 2.5 A resolution using the Pisces filter. Any pair of protein
    // chains in the representative set will have <= 40% sequence identity.
    int sequenceIdentity = 40;
    double resolution = 2.5;
    
    // read PDB, split entries into polymer chains, and filter by Pisces filter
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
            .readReducedSequenceFile(sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));
   
    System.out.println("# representative chains: " + pdb.count());
    
    // coalesce partitions to avoid saving many small files
    pdb = pdb.coalesce(12);
    
    // save representative set
    String path = MmtfReader.getMmtfReducedPath();
    MmtfWriter.writeSequenceFile(path + "_representatives_i40_r2.5", sc, pdb);

    sc.close();
}
 
Example 2
Source File: WriteMmtfCustom.java    From mmtf-spark with Apache License 2.0
/**
 * Writes a high-resolution X-ray subset of the PDB to an MMTF Hadoop Sequence file.
 * @param args no command line arguments are used
 * @throws FileNotFoundException if the MMTF file path cannot be resolved
 */
public static void main(String[] args) throws FileNotFoundException {

	String path = MmtfReader.getMmtfFullPath();
    
    long start = System.nanoTime();
    
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(WriteMmtfCustom.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
	 
    // read a 20% random sample of the PDB
    double fraction = 0.2;
    long seed = 123;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, fraction, seed, sc);

    // retain high resolution X-ray structures
    pdb = pdb
            .filter(new ExperimentalMethods(ExperimentalMethods.X_RAY_DIFFRACTION))
            .filter(new Resolution(0, 2.0))
            .filter(new Rfree(0, 0.2));
   
    // coalesce this into 8 partitions to avoid creating many small files
    pdb = pdb.coalesce(8);
    
    // save this subset in a Hadoop Sequence file
    MmtfWriter.writeSequenceFile(path + "_xray", sc, pdb);
    
    System.out.println("# structures in custom set: " + pdb.count());
  
    long end = System.nanoTime();
    
    System.out.println("Time: " + (end-start)/1E9 + "sec.");
    
    sc.close();
}
 
Example 3
Source File: SparkExecutionContext.java    From systemds with Apache License 2.0
@SuppressWarnings("unchecked")
public void repartitionAndCacheMatrixObject( String var ) {
	MatrixObject mo = getMatrixObject(var);
	DataCharacteristics dcIn = mo.getDataCharacteristics();

	//double check size to avoid unnecessary spark context creation
	if( !OptimizerUtils.exceedsCachingThreshold(mo.getNumColumns(),
			OptimizerUtils.estimateSizeExactSparsity(dcIn)) )
		return;

	//get input rdd and default storage level
	JavaPairRDD<MatrixIndexes,MatrixBlock> in = (JavaPairRDD<MatrixIndexes, MatrixBlock>)
			getRDDHandleForMatrixObject(mo, InputInfo.BinaryBlockInputInfo);

	//avoid unnecessary caching of input in order to reduce memory pressure
	if( mo.getRDDHandle().allowsShortCircuitRead()
		&& isRDDMarkedForCaching(in.id()) && !isRDDCached(in.id()) ) {
		in = (JavaPairRDD<MatrixIndexes,MatrixBlock>)
				((RDDObject)mo.getRDDHandle().getLineageChilds().get(0)).getRDD();

		//investigate issue of unnecessarily large number of partitions
		int numPartitions = SparkUtils.getNumPreferredPartitions(dcIn, in);
		if( numPartitions < in.getNumPartitions() )
			in = in.coalesce( numPartitions );
	}

	//repartition rdd (force creation of a shuffled rdd via merge); note: no deep copy is needed
	//even though this executes on the original data, because there are no key duplicates and
	//hence no actual merge
	JavaPairRDD<MatrixIndexes,MatrixBlock> out = RDDAggregateUtils.mergeByKey(in, false);

	//convert mcsr into memory-efficient csr if potentially sparse
	if( OptimizerUtils.checkSparseBlockCSRConversion(dcIn) ) {
		out = out.mapValues(new CreateSparseBlockFunction(SparseBlock.Type.CSR));
	}

	//persist rdd in default storage level
	out.persist( Checkpoint.DEFAULT_STORAGE_LEVEL )
	   .count(); //trigger caching to prevent contention

	//create new rdd handle, in-place of current matrix object
	RDDObject inro =  mo.getRDDHandle();  //guaranteed to exist (see above)
	RDDObject outro = new RDDObject(out); //create new rdd object
	outro.setCheckpointRDD(true);         //mark as checkpointed
	outro.addLineageChild(inro);          //keep lineage to prevent cycles on cleanup
	mo.setRDDHandle(outro);
}
 
Example 4
Source File: MergeFastq.java    From ViraPipe with MIT License
public static void main(String[] args) throws IOException {

        // all three arguments (input path, output path, partition count) are required
        if (args.length < 3) {
            System.err.println("Usage: MergeFastq <input path> <output path> <number of partitions>");
            System.exit(1);
        }

        SparkConf conf = new SparkConf().setAppName("MergeFastq");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // read the FASTQ input as (read name, sequenced fragment) pairs
        JavaPairRDD<Text, SequencedFragment> fastqRDD = sc.newAPIHadoopFile(args[0], FastqInputFormat.class, Text.class, SequencedFragment.class, sc.hadoopConfiguration());

        // coalesce the input into the requested number of partitions to merge many small files into fewer, larger ones
        JavaPairRDD<Text, SequencedFragment> coalesced = fastqRDD.coalesce(Integer.valueOf(args[2]));

        // write one output file per partition
        coalesced.saveAsNewAPIHadoopFile(args[1], Text.class, SequencedFragment.class, FastqOutputFormat.class, sc.hadoopConfiguration());

        sc.stop();
    }
 
Example 5
Source File: SparkExecutionContext.java    From systemds with Apache License 2.0
@SuppressWarnings("unchecked")
public void repartitionAndCacheMatrixObject( String var ) {
	MatrixObject mo = getMatrixObject(var);
	DataCharacteristics dcIn = mo.getDataCharacteristics();

	//double check size to avoid unnecessary spark context creation
	if( !OptimizerUtils.exceedsCachingThreshold(mo.getNumColumns(),
			OptimizerUtils.estimateSizeExactSparsity(dcIn)) )
		return;

	//get input rdd and default storage level
	JavaPairRDD<MatrixIndexes,MatrixBlock> in = (JavaPairRDD<MatrixIndexes, MatrixBlock>)
		getRDDHandleForMatrixObject(mo, FileFormat.BINARY);

	//avoid unnecessary caching of input in order to reduce memory pressure
	if( mo.getRDDHandle().allowsShortCircuitRead()
		&& isRDDMarkedForCaching(in.id()) && !isRDDCached(in.id()) ) {
		in = (JavaPairRDD<MatrixIndexes,MatrixBlock>)
			((RDDObject)mo.getRDDHandle().getLineageChilds().get(0)).getRDD();

		//investigate issue of unnecessarily large number of partitions
		int numPartitions = SparkUtils.getNumPreferredPartitions(dcIn, in);
		if( numPartitions < in.getNumPartitions() )
			in = in.coalesce( numPartitions );
	}

	//repartition rdd (force creation of a shuffled rdd via merge); note: no deep copy is needed
	//even though this executes on the original data, because there are no key duplicates and
	//hence no actual merge
	JavaPairRDD<MatrixIndexes,MatrixBlock> out = RDDAggregateUtils.mergeByKey(in, false);

	//convert mcsr into memory-efficient csr if potentially sparse
	if( OptimizerUtils.checkSparseBlockCSRConversion(dcIn) ) {
		out = out.mapValues(new CreateSparseBlockFunction(SparseBlock.Type.CSR));
	}

	//persist rdd in default storage level
	out.persist( Checkpoint.DEFAULT_STORAGE_LEVEL )
		.count(); //trigger caching to prevent contention

	//create new rdd handle, in-place of current matrix object
	RDDObject inro =  mo.getRDDHandle();  //guaranteed to exist (see above)
	RDDObject outro = new RDDObject(out); //create new rdd object
	outro.setCheckpointRDD(true);         //mark as checkpointed
	outro.addLineageChild(inro);          //keep lineage to prevent cycles on cleanup
	mo.setRDDHandle(outro);
}
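All of the examples above use the single-argument coalesce(n), which can only reduce the number of partitions; without a shuffle, requesting more partitions than currently exist leaves the partition count unchanged. When the data actually needs to be spread over more partitions, the two-argument overload with shuffle = true (or repartition) is required. A minimal fragment, reusing the illustrative pairs RDD (8 partitions) from the orientation sketch at the top of this page:

        // assumes the 'pairs' RDD with 8 partitions from the orientation sketch above

        // no shuffle: the partition count can only shrink, so this stays at 8 partitions
        JavaPairRDD<String, Integer> unchanged = pairs.coalesce(16);

        // shuffle = true: data is redistributed and the partition count can grow to 16
        JavaPairRDD<String, Integer> expanded = pairs.coalesce(16, true);

        System.out.println(unchanged.getNumPartitions()); // 8
        System.out.println(expanded.getNumPartitions());  // 16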