Java Code Examples for org.apache.spark.sql.Dataset#coalesce()

The following examples show how to use org.apache.spark.sql.Dataset#coalesce(). The examples are taken from open source projects; the source file and project license are noted above each example.
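
As quick orientation before the examples: coalesce(n) narrows a Dataset to at most n partitions by merging existing ones, without a full shuffle, while repartition(n) redistributes all rows across exactly n partitions. Below is a minimal, self-contained sketch of the difference; the class name and output path are illustrative and not taken from any of the projects that follow.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class CoalesceSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local[*]")
                .appName(CoalesceSketch.class.getSimpleName())
                .getOrCreate();

        // start with 8 partitions
        Dataset<Row> df = spark.range(1000).toDF("id").repartition(8);

        // coalesce(2) merges existing partitions; no shuffle is triggered
        System.out.println(df.coalesce(2).rdd().getNumPartitions()); // 2

        // coalesce(1) is the pattern used throughout the examples below:
        // it produces a single output file per write
        df.coalesce(1).write().mode("overwrite").json("/tmp/coalesce-sketch");

        spark.stop();
    }
}
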
Example 1
Source File: BatchStep.java    From envelope with Apache License 2.0
// Applies optional repartitioning and/or coalescing to the dataset,
// driven by step configuration properties.
private Dataset<Row> repartition(Dataset<Row> data) {
  int numPartitions = 0;
  List<String> colPartitions = null;

  if (config.hasPath(REPARTITION_NUM_PARTITIONS_PROPERTY)) {
    numPartitions = config.getInt(REPARTITION_NUM_PARTITIONS_PROPERTY);
  }

  if (config.hasPath(REPARTITION_COLUMNS_PROPERTY)) {
    colPartitions = config.getStringList(REPARTITION_COLUMNS_PROPERTY);
  }

  // repartition by count, by columns, or by both (full shuffle)
  if (numPartitions > 0 && null != colPartitions) {
    data = data.repartition(numPartitions, RowUtils.toColumnArray(colPartitions));
  } else if (numPartitions > 0) {
    data = data.repartition(numPartitions);
  } else if (null != colPartitions) {
    data = data.repartition(RowUtils.toColumnArray(colPartitions));
  }

  // coalesce last, if configured (narrows partitions without a shuffle)
  if (config.hasPath(COALESCE_NUM_PARTITIONS_PROPERTY)) {
    numPartitions = config.getInt(COALESCE_NUM_PARTITIONS_PROPERTY);
    data = data.coalesce(numPartitions);
  }

  return data;
}
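
One detail worth noting in this example: the coalesce is applied after any configured repartition, so a coalesce to fewer partitions immediately narrows what the repartition just produced. A small sketch of that interaction (assuming spark is an existing SparkSession; the dataset is illustrative):

Dataset<Row> df = spark.range(100).toDF("id");

// repartition shuffles to 16 partitions; the subsequent coalesce
// narrows to 2 without a second shuffle
int n = df.repartition(16).coalesce(2).rdd().getNumPartitions();
System.out.println(n); // 2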
 
Example 2
Source File: SecondaryStructureWord2VecEncoder.java    From mmtf-spark with Apache License 2.0
/**
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet)
 * @throws IOException
 */
public static void main(String[] args) throws IOException {

	String path = MmtfReader.getMmtfReducedPath();
    
	if (args.length != 2) {
		System.err.println("Usage: " + SecondaryStructureWord2VecEncoder.class.getSimpleName() + " <outputFilePath> + <fileFormat>");
		System.exit(1);
	}

	long start = System.nanoTime();

	SparkConf conf = new SparkConf()
			.setMaster("local[*]")
			.setAppName(SecondaryStructureWord2VecEncoder.class.getSimpleName());
	JavaSparkContext sc = new JavaSparkContext(conf);
	
	// read MMTF Hadoop sequence file and create a non-redundant Pisces
	// subset (<=20% seq. identity) of L-protein chains
	int sequenceIdentity = 20;
	double resolution = 3.0;
	
	JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
			.readSequenceFile(path, sc)
			.flatMapToPair(new StructureToPolymerChains())
			.filter(new Pisces(sequenceIdentity, resolution));
	
	// get content
	int segmentLength = 11; 
	Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength);

	// add Word2Vec encoded feature vector
	ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
	int n = 2;
	int windowSize = (segmentLength-1)/2;
	int vectorSize = 50;
	data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize);	
	
	data.printSchema();
	data.show(25, false);
	
	if (args[1].equals("json")) {
		// coalesce data into a single file
		data = data.coalesce(1);
	}
	data.write().mode("overwrite").format(args[1]).save(args[0]);
	
	long end = System.nanoTime();

	System.out.println(TimeUnit.NANOSECONDS.toSeconds(end-start) + " sec.");
}
 
Example 3
Source File: SecondaryStructureOneHotEncoder.java    From mmtf-spark with Apache License 2.0
/**
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet)
 * @throws IOException
 */
public static void main(String[] args) throws IOException {

	String path = MmtfReader.getMmtfReducedPath();
    
	if (args.length < 2) {
		System.err.println("Usage: " + SecondaryStructureOneHotEncoder.class.getSimpleName() + " <outputFilePath> + <fileFormat> + [<modelFileName>]");
		System.exit(1);
	}

	long start = System.nanoTime();

	SparkConf conf = new SparkConf()
			.setMaster("local[*]")
			.setAppName(SecondaryStructureOneHotEncoder.class.getSimpleName());
	JavaSparkContext sc = new JavaSparkContext(conf);
	
	// read MMTF Hadoop sequence file and create a non-redundant Pisces
	// subset (<=20% seq. identity) of L-protein chains
	int sequenceIdentity = 20;
	double resolution = 3.0;
	
	JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
			.readSequenceFile(path, sc)
			.flatMapToPair(new StructureToPolymerChains())
			.filter(new Pisces(sequenceIdentity, resolution));
	
	// get content
	int segmentLength = 11;
	Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength).cache();

	System.out.println("original data     : " + data.count());
	data = data.dropDuplicates("labelQ3", "sequence").cache();
	System.out.println("- duplicate Q3/seq: " + data.count());
	data = data.dropDuplicates("sequence").cache();
	System.out.println("- duplicate seq   : " + data.count());
	
	// add one-hot encoded sequence feature vector to dataset
	ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
	data = encoder.oneHotEncode();
	
	data.printSchema();
	data.show(25, false);
	
	if (args[1].equals("json")) {
		// coalesce data into a single file
		data = data.coalesce(1);
	}
	data.write().mode("overwrite").format(args[1]).save(args[0]);
	
	long end = System.nanoTime();

	System.out.println(TimeUnit.NANOSECONDS.toSeconds(end-start) + " sec.");
}
 
Example 4
Source File: SecondaryStructureShiftedWord2VecEncoder.java    From mmtf-spark with Apache License 2.0
/**
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet)
 * @throws IOException
 */
public static void main(String[] args) throws IOException {

	String path = MmtfReader.getMmtfReducedPath();
    
	if (args.length != 2) {
		System.err.println("Usage: " + SecondaryStructureShiftedWord2VecEncoder.class.getSimpleName() + " <outputFilePath> + <fileFormat>");
		System.exit(1);
	}

	long start = System.nanoTime();

	SparkConf conf = new SparkConf()
			.setMaster("local[*]")
			.setAppName(SecondaryStructureShiftedWord2VecEncoder.class.getSimpleName());
	JavaSparkContext sc = new JavaSparkContext(conf);
	
	// read MMTF Hadoop sequence file and create a non-redundant set (<=20% seq. identity)
	// of L-protein chains
	int sequenceIdentity = 20;
	double resolution = 3.0;
	
	JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
			.readSequenceFile(path, sc)
			.flatMapToPair(new StructureToPolymerChains())
			.filter(new Pisces(sequenceIdentity, resolution));

	// get content
	int segmentLength = 11; 
	Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength);

	// create a Word2Vector representation of the protein sequences
	ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
	int windowSize = (segmentLength-1)/2;
	int vectorSize = 50; // dimension of the feature vector
	data = encoder.shifted3GramWord2VecEncode(windowSize, vectorSize).cache();

	data.printSchema();
	data.show(25, false);
	
	if (args[1].equals("json")) {
		// coalesce data into a single file
		data = data.coalesce(1);
	}
	data.write().mode("overwrite").format(args[1]).save(args[0]);
	
	long end = System.nanoTime();

	System.out.println(TimeUnit.NANOSECONDS.toSeconds(end-start) + " sec.");
}
 
Example 5
Source File: SecondaryStructureElementsWord2VecEncoder.java    From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws IOException {

		String path = MmtfReader.getMmtfReducedPath();
	    
		if (args.length != 0 && args.length != 2) {
			System.err.println("Usage: " + SecondaryStructureElementsWord2VecEncoder.class.getSimpleName() + " [<outputFilePath> + <fileFormat>]");
			System.exit(1);
		}


		long start = System.nanoTime();

		SparkConf conf = new SparkConf()
				.setMaster("local[*]")
				.setAppName(SecondaryStructureElementsWord2VecEncoder.class.getSimpleName());
		JavaSparkContext sc = new JavaSparkContext(conf);
		
		// read MMTF Hadoop sequence file and create a non-redundant Pisces
		// subset (<=20% seq. identity) of L-protein chains
		int sequenceIdentity = 20;
		double resolution = 3.0;
		
		JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
				.readSequenceFile(path, sc)
				.flatMapToPair(new StructureToPolymerChains())
				.filter(new Pisces(sequenceIdentity, resolution));
			
		int segmentLength = 11;
		
		// extract helical sequence segments
		Dataset<Row> data = SecondaryStructureElementExtractor.getDataset(pdb, "H", segmentLength);
		System.out.println(data.count());
		data.show(10,false);
		
		// add Word2Vec encoded feature vector
		ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
		int n = 2;
		int windowSize = (segmentLength-1)/2;
		int vectorSize = 50;
		data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize);	
		data.show(50,false);
		
		// optionally, save results
		if (args.length > 0) {
			if (args[1].equals("json")) {
				// coalesce data into a single file
				data = data.coalesce(1);
			}
			data.write().mode("overwrite").format(args[1]).save(args[0]);
		}
		
		long end = System.nanoTime();

		System.out.println(TimeUnit.NANOSECONDS.toSeconds(end-start) + " sec.");
	}
 
Example 6
Source File: SecondaryStructureWord2VecModelEncoder.java    From mmtf-spark with Apache License 2.0
/**
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet), args[2] word2VecModelFile
 * @throws IOException
 */
public static void main(String[] args) throws IOException {

	String path = MmtfReader.getMmtfFullPath();
    
	if (args.length != 3) {
		System.err.println("Usage: " + SecondaryStructureWord2VecModelEncoder.class.getSimpleName() + " <outputFilePath> + <fileFormat> + <word2VecModelFile>");
		System.exit(1);
	}

	long start = System.nanoTime();

	SparkConf conf = new SparkConf()
			.setMaster("local[*]")
			.setAppName(SecondaryStructureWord2VecModelEncoder.class.getSimpleName());
	JavaSparkContext sc = new JavaSparkContext(conf);
	
	// read MMTF Hadoop sequence file and create a non-redundant Pisces
	// subset (<=20% seq. identity) of L-protein chains
	int sequenceIdentity = 20;
	double resolution = 3.0;
	
	JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
			.readSequenceFile(path, sc)
			.flatMapToPair(new StructureToPolymerChains())
			.filter(new Pisces(sequenceIdentity, resolution));
	
	// get content
	int segmentLength = 11; 
	Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength);

	// add Word2Vec encoded feature vector using
	// a pre-trained Word2Vec model read from file
	ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
	int n = 2;
	String modelFileName = args[2];
	data = encoder.overlappingNgramWord2VecEncode(modelFileName, n).cache();
	
	data.printSchema();
	data.show(25, false);
	
	if (args[1].equals("json")) {
		// coalesce data into a single file
		data = data.coalesce(1);
	}
	data.write().mode("overwrite").format(args[1]).save(args[0]);
	
	long end = System.nanoTime();

	System.out.println(TimeUnit.NANOSECONDS.toSeconds(end-start) + " sec.");
}
 
Example 7
Source File: SecondaryStructureBlosum62Encoder.java    From mmtf-spark with Apache License 2.0
/**
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet)
 * @throws IOException
 */
public static void main(String[] args) throws IOException {

	String path = MmtfReader.getMmtfReducedPath();
    
	if (args.length != 2) {
		System.err.println("Usage: " + SecondaryStructureBlosum62Encoder.class.getSimpleName() + " <outputFilePath> + <fileFormat>");
		System.exit(1);
	}

	long start = System.nanoTime();

	SparkConf conf = new SparkConf()
			.setMaster("local[*]")
			.setAppName(SecondaryStructureBlosum62Encoder.class.getSimpleName());
	JavaSparkContext sc = new JavaSparkContext(conf);
	
	// read MMTF Hadoop sequence file and create a non-redundant Pisces
	// subset (<=20% seq. identity) of L-protein chains
	int sequenceIdentity = 20;
	double resolution = 3.0;
	
	JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
			.readSequenceFile(path, sc)
			.flatMapToPair(new StructureToPolymerChains())
			.filter(new Pisces(sequenceIdentity, resolution));
	
	int segmentLength = 11;
	Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength).cache();

	System.out.println("original data     : " + data.count());
	data = data.dropDuplicates("labelQ3", "sequence").cache();
	System.out.println("- duplicate Q3/seq: " + data.count());
	data = data.dropDuplicates("sequence").cache();
	System.out.println("- duplicate seq   : " + data.count());
	
	// add a BLOSUM62-encoded feature vector
	ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
	data = encoder.blosum62Encode();
	
	data.printSchema();
	data.show(25, false);
	
	if (args[1].equals("json")) {
		// coalesce data into a single file
		data = data.coalesce(1);
	}
	data.write().mode("overwrite").format(args[1]).save(args[0]);
	
	long end = System.nanoTime();

	System.out.println(TimeUnit.NANOSECONDS.toSeconds(end-start) + " sec.");
}
 
Example 8
Source File: SecondaryStructurePropertyEncoder.java    From mmtf-spark with Apache License 2.0
/**
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet)
 * @throws IOException
 */
public static void main(String[] args) throws IOException {

	String path = MmtfReader.getMmtfReducedPath();
    
	if (args.length != 2) {
		System.err.println("Usage: " + SecondaryStructurePropertyEncoder.class.getSimpleName() + " <outputFilePath> + <fileFormat>");
		System.exit(1);
	}

	long start = System.nanoTime();

	SparkConf conf = new SparkConf()
			.setMaster("local[*]")
			.setAppName(SecondaryStructurePropertyEncoder.class.getSimpleName());
	JavaSparkContext sc = new JavaSparkContext(conf);
	
	// read MMTF Hadoop sequence file and create a non-redundant Pisces
	// subset (<=20% seq. identity) of L-protein chains
	int sequenceIdentity = 20;
	double resolution = 3.0;
	
	JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
			.readSequenceFile(path, sc)
			.flatMapToPair(new StructureToPolymerChains())
			.filter(new Pisces(sequenceIdentity, resolution));
	
	// get content
	int segmentLength = 11;
	Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength).cache();

	System.out.println("original data     : " + data.count());
	data = data.dropDuplicates("labelQ3", "sequence").cache();
	System.out.println("- duplicate Q3/seq: " + data.count());
	data = data.dropDuplicates("sequence").cache();
	System.out.println("- duplicate seq   : " + data.count());
	
	// add a property encoded feature vector
	ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
	data = encoder.propertyEncode();
	
	data.printSchema();
	data.show(25, false);
	
	if (args[1].equals("json")) {
		// coalesce data into a single file
		data = data.coalesce(1);
	}
	data.write().mode("overwrite").format(args[1]).save(args[0]);
	
	long end = System.nanoTime();

	System.out.println(TimeUnit.NANOSECONDS.toSeconds(end-start) + " sec.");
}