Java Code Examples for org.apache.spark.api.java.JavaRDD#first()

The following examples show how to use org.apache.spark.api.java.JavaRDD#first(). The original project and source file are noted above each example.
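Before the project examples, here is a minimal, self-contained sketch of what first() does: it is an action that triggers a small job and returns the first element of the RDD (it fails if the RDD is empty). The class name and sample data below are illustrative only and not taken from any of the projects.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class FirstExample {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("firstExample");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaRDD<String> lines = sc.parallelize(Arrays.asList("hello world", "hi"));
            // first() is an action: it evaluates only as much of the RDD as needed for one element
            String first = lines.first(); // "hello world"
            System.out.println(first);
        }
    }
}
 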
Example 1
Source File: Chapter4.java    From sparkResearch with Apache License 2.0
/**
 * Splits strings with flatMap.
 */
public void flatMap(JavaSparkContext sparkContext){
    JavaRDD<String> lines = sparkContext.parallelize(Arrays.asList("hello world", "hi"));

    JavaRDD<String> flatMapResult  = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String s) throws Exception {
            return Arrays.asList(PATTERN.split(s)).iterator();
        }
    });

    flatMapResult.first();

    //Result: hello
}
 
Example 2
Source File: FrameRDDConverterUtils.java    From systemds with Apache License 2.0
public static JavaPairRDD<Long, FrameBlock> csvToBinaryBlock(JavaSparkContext sc,
	JavaPairRDD<LongWritable, Text> input, DataCharacteristics mc, ValueType[] schema,
	boolean hasHeader, String delim, boolean fill, double fillValue)
{
	//determine unknown dimensions and sparsity if required
	if( !mc.dimsKnown() ) { //nnz irrelevant here
		JavaRDD<String> tmp = input.values()
			.map(new TextToStringFunction());
		String tmpStr = tmp.first();
		boolean metaHeader = tmpStr.startsWith(TfUtils.TXMTD_MVPREFIX) 
				|| tmpStr.startsWith(TfUtils.TXMTD_NDPREFIX);
		tmpStr = (metaHeader) ? tmpStr.substring(tmpStr.indexOf(delim)+1) : tmpStr;
		long rlen = tmp.count() - (hasHeader ? 1 : 0) - (metaHeader ? 2 : 0);
		long clen = IOUtilFunctions.splitCSV(tmpStr, delim).length;
		mc.set(rlen, clen, mc.getBlocksize(), -1);
	}
	
	//prepare csv w/ row indexes (sorted by filenames)
	JavaPairRDD<Text,Long> prepinput = input.values()
			.zipWithIndex(); //zip row index
	
	//prepare default schema if needed
	if( schema == null || schema.length==1 )
		schema = UtilFunctions.nCopies((int)mc.getCols(), ValueType.STRING);

	//convert csv rdd to binary block rdd (w/ partial blocks)
	JavaPairRDD<Long, FrameBlock> out = prepinput.mapPartitionsToPair(
			new CSVToBinaryBlockFunction(mc, schema, hasHeader, delim));
	
	return out;
}
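 
Here first() is used only to peek at the first CSV line, so the number of columns (and whether a metadata header is present) can be derived before the full conversion runs. Below is a simplified sketch of that peek-at-the-header pattern using plain Spark types; the class name and CSV values are invented and no SystemDS types are involved.

import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class CsvPeek {
    /** Derives (rows, cols) of a header-bearing CSV RDD by looking only at its first line. */
    public static long[] dims(JavaSparkContext sc) {
        JavaRDD<String> csv = sc.parallelize(Arrays.asList(
                "id,name,score",   // header line
                "1,alice,0.9",
                "2,bob,0.7"));

        String header = csv.first();              // cheap: reads a single element
        long cols = header.split(",").length;     // 3 columns
        long rows = csv.count() - 1;              // full pass; subtract the header row
        return new long[] { rows, cols };
    }
}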
 
Example 3
Source File: ActionRDD.java    From hui-bigdata-spark with Apache License 2.0
/**
 * Takes the first element.
 *
 * @since hui_project 1.0.0
 */
public void testFirst() {
    SparkConf sparkConf = new SparkConf().setMaster("local[4]").setAppName("test");
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
    JavaRDD<String> stringJavaRDD = sparkContext.textFile(FILE_PATH);
    String first = stringJavaRDD.first();
    System.out.println(first);
}
 
Example 4
Source File: ActionRDDTest.java    From hui-bigdata-spark with Apache License 2.0
/**
 * Takes the first element.
 *
 * @since hui_project 1.0.0
 */
@Test
public void testFirst() {
    JavaRDD<String> stringJavaRDD = sparkContext.textFile(FILE_PATH);
    String first = stringJavaRDD.first();
    System.out.println(first);
}
 
Example 5
Source File: JavaRDDToDataset.java    From mmtf-spark with Apache License 2.0
/**
 * Converts a JavaRDD<Row> to a Dataset<Row>. This method only
 * supports simple data types, and all values must be non-null.
 * 
 * @param data JavaRDD of Row objects
 * @param colNames names of the columns in a row
 * @return Dataset of Row objects with an inferred schema
 */
public static Dataset<Row> getDataset(JavaRDD<Row> data, String...colNames) {
	// create the schema for the dataset
	Row row = data.first();
	int length = row.length();
	
	if (length != colNames.length) {
		throw new IllegalArgumentException("colNames length does not match row length");
	}
	
	StructField[] sf = new StructField[length];
	
	for (int i = 0; i < row.size(); i++) {
		Object o = row.get(i);

		// TODO add more types
		if (o instanceof String) {
			sf[i] = DataTypes.createStructField(colNames[i], DataTypes.StringType, false);
		} else if (o instanceof Integer) {
			sf[i] = DataTypes.createStructField(colNames[i], DataTypes.IntegerType, false);
		} else if (o instanceof Long) {
			sf[i] = DataTypes.createStructField(colNames[i], DataTypes.LongType, false);
		} else if (o instanceof Float) {
			sf[i] = DataTypes.createStructField(colNames[i], DataTypes.FloatType, false);
		} else if (o instanceof Double) {
			sf[i] = DataTypes.createStructField(colNames[i], DataTypes.DoubleType, false);
		} else if (o instanceof Boolean) {
			sf[i] = DataTypes.createStructField(colNames[i], DataTypes.BooleanType, false);
		} else {
			System.out.println("Data type not implemented yet");
		}
	}
	StructType schema = new StructType(sf);

	// convert JavaRDD to Dataset
	SparkSession spark = SparkSession.builder().getOrCreate();
	return spark.createDataFrame(data, schema);
}
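 
A possible call site for getDataset(), shown only as a rough sketch: the rows must be non-empty and type-consistent, because the schema is inferred from data.first(). The column names and row values below are invented for illustration.

import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;

public class JavaRDDToDatasetDemo {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local[2]").appName("rddToDataset").getOrCreate();
        JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

        // Two rows with a String and an Integer column; values are invented
        JavaRDD<Row> rows = jsc.parallelize(Arrays.asList(
                RowFactory.create("2ONX", 1200),
                RowFactory.create("1J6T", 3400)));

        // getDataset() is the helper shown above (import omitted); it calls rows.first()
        // to infer the schema, so the RDD must not be empty
        Dataset<Row> ds = JavaRDDToDataset.getDataset(rows, "structureId", "atomCount");
        ds.show();

        spark.stop();
    }
}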
 
Example 6
Source File: Tokenizer.java    From vn.vitk with GNU General Public License v3.0
/**
 * Tokenizes a line. 
 * @param line a line of text
 * @return the tokenized line as a single text string
 */
public String tokenizeOneLine(String line) {
	List<String> list = new ArrayList<String>();
	list.add(line);
	JavaRDD<String> input = jsc.parallelize(list);
	JavaRDD<String> output = tokenize(input);
	return output.first();
}
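 
The method wraps a single line in a one-element RDD, runs it through the tokenization pipeline, and uses first() to pull the single result back to the driver. The same wrap-transform-unwrap pattern as a standalone sketch, with a plain map() standing in for Vitk's tokenize():

import java.util.Collections;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class OneLinePipeline {
    /** Runs a single string through an RDD transformation and fetches the result with first(). */
    public static String processOneLine(JavaSparkContext jsc, String line) {
        JavaRDD<String> input = jsc.parallelize(Collections.singletonList(line));
        JavaRDD<String> output = input.map(String::toUpperCase); // stand-in for tokenize(input)
        return output.first(); // exactly one element, so first() returns the processed line
    }
}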
 
Example 7
Source File: DeepSparkContext.java    From deep-spark with Apache License 2.0
/**
 * Creates a DataFrame (JavaSchemaRDD) from a Deep ExtractorConfig of Cells.
 * @param config Specific Deep ExtractorConfig.
 * @return A DataFrame built from Cells.
 * @throws UnsupportedDataTypeException
 */
public DataFrame createJavaSchemaRDD(ExtractorConfig<Cells> config) throws UnsupportedDataTypeException, UnsupportedOperationException {
    JavaRDD<Cells> cellsRDD = createJavaRDD(config);
    JavaRDD<Row> rowsRDD = DeepSparkContext.createJavaRowRDD(cellsRDD);
    try {
        Cells firstCells = cellsRDD.first();
        StructType schema = CellsUtils.getStructTypeFromCells(firstCells);
        return sqlContext.applySchema(rowsRDD, schema);
    } catch(UnsupportedOperationException e) {
        throw new UnsupportedOperationException("Cannot infer schema from empty data RDD", e);
    }
}
 
Example 8
Source File: PSScorerTest.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Test(dataProvider = "mapPairs", groups = "spark")
public void testMapGroupedReadsToTax(final int readLength, final List<Integer> NM1, final List<Integer> NM2,
                                     final List<Integer> clip1, final List<Integer> clip2,
                                     final List<Integer> insert1, final List<Integer> insert2,
                                     final List<Integer> delete1, final List<Integer> delete2,
                                     final List<String> contig1, final List<String> contig2,
                                     final List<Integer> truthTax) {

    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    final Broadcast<PSTaxonomyDatabase> taxonomyDatabaseBroadcast = ctx.broadcast(taxonomyDatabase);

    //Test with alternate alignments assigned to the XA tag
    final List<Iterable<GATKRead>> readListXA = new ArrayList<>();
    readListXA.add(generateReadPair(readLength, NM1, NM2, clip1, clip2, insert1, insert2, delete1, delete2, contig1, contig2, "XA"));
    final JavaRDD<Iterable<GATKRead>> pairsXA = ctx.parallelize(readListXA);
    final JavaRDD<Tuple2<Iterable<GATKRead>, PSPathogenAlignmentHit>> resultXA = PSScorer.mapGroupedReadsToTax(pairsXA,
            MIN_IDENT, IDENT_MARGIN, taxonomyDatabaseBroadcast);
    final PSPathogenAlignmentHit infoXA = resultXA.first()._2;

    Assert.assertNotNull(infoXA);
    Assert.assertEquals(infoXA.taxIDs.size(), truthTax.size());
    Assert.assertTrue(infoXA.taxIDs.containsAll(truthTax));
    Assert.assertEquals(infoXA.numMates, 2);

    //Test SA tag
    final List<Iterable<GATKRead>> readListSA = new ArrayList<>();
    readListSA.add(generateReadPair(readLength, NM1, NM2, clip1, clip2, insert1, insert2, delete1, delete2, contig1, contig2, "SA"));
    final JavaRDD<Iterable<GATKRead>> pairsSA = ctx.parallelize(readListSA);
    final JavaRDD<Tuple2<Iterable<GATKRead>, PSPathogenAlignmentHit>> resultSA = PSScorer.mapGroupedReadsToTax(pairsSA,
            MIN_IDENT, IDENT_MARGIN, taxonomyDatabaseBroadcast);
    final PSPathogenAlignmentHit infoSA = resultSA.first()._2;

    Assert.assertNotNull(infoSA);
    Assert.assertEquals(infoSA.taxIDs.size(), truthTax.size());
    Assert.assertTrue(infoSA.taxIDs.containsAll(truthTax));
    Assert.assertEquals(infoSA.numMates, 2);
}
 
Example 9
Source File: PSScorerTest.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Test(dataProvider = "mapUnpaired", groups = "spark")
public void testMapGroupedReadsToTaxUnpaired(final int readLength, final List<Integer> NM, final List<Integer> clip,
                                             final List<Integer> insert, final List<Integer> delete,
                                             final List<String> contig, final List<Integer> truthTax) {

    if (!(NM.size() == clip.size() && NM.size() == insert.size() && NM.size() == delete.size() && NM.size() == contig.size())) {
        throw new TestException("Input lists for read must be of uniform length");
    }

    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    final Broadcast<PSTaxonomyDatabase> taxonomyDatabaseBroadcast = ctx.broadcast(taxonomyDatabase);

    //Test with alternate alignments assigned to the XA tag
    final List<Iterable<GATKRead>> readListXA = new ArrayList<>();
    readListXA.add(generateUnpairedRead(readLength, NM, clip, insert, delete, contig, "XA"));
    final JavaRDD<Iterable<GATKRead>> pairsXA = ctx.parallelize(readListXA);
    final JavaRDD<Tuple2<Iterable<GATKRead>, PSPathogenAlignmentHit>> resultXA = PSScorer.mapGroupedReadsToTax(pairsXA,
            MIN_IDENT, IDENT_MARGIN, taxonomyDatabaseBroadcast);
    final PSPathogenAlignmentHit infoXA = resultXA.first()._2;

    Assert.assertNotNull(infoXA);
    Assert.assertEquals(infoXA.taxIDs.size(), truthTax.size());
    Assert.assertTrue(infoXA.taxIDs.containsAll(truthTax));
    Assert.assertEquals(infoXA.numMates, 1);

    //Test SA tag
    final List<Iterable<GATKRead>> readListSA = new ArrayList<>();
    readListSA.add(generateUnpairedRead(readLength, NM, clip, insert, delete, contig, "SA"));
    final JavaRDD<Iterable<GATKRead>> pairsSA = ctx.parallelize(readListSA);
    final JavaRDD<Tuple2<Iterable<GATKRead>, PSPathogenAlignmentHit>> resultSA = PSScorer.mapGroupedReadsToTax(pairsSA,
            MIN_IDENT, IDENT_MARGIN, taxonomyDatabaseBroadcast);
    final PSPathogenAlignmentHit infoSA = resultSA.first()._2;

    Assert.assertNotNull(infoSA);
    Assert.assertEquals(infoSA.taxIDs.size(), truthTax.size());
    Assert.assertTrue(infoSA.taxIDs.containsAll(truthTax));
    Assert.assertEquals(infoSA.numMates, 1);
}
 
Example 10
Source File: CassandraCellExtractorFT.java    From deep-spark with Apache License 2.0
@Override
protected void initDataSetDivineComedy(DeepSparkContext context) {
    JavaRDD<String> stringJavaRDD;

    //Divine Comedy
    List<String> lineas = readFile("/simpleDivineComedy.json");

    stringJavaRDD = context.parallelize(lineas);

    JavaRDD javaRDD = transformRDD(stringJavaRDD, Cells.class);

    originBook = javaRDD.first();

    DeepSparkContext.saveRDD(javaRDD.rdd(), getWriteExtractorConfig(BOOK_INPUT,
            Cells.class));
}
 
Example 11
Source File: CassandraEntityExtractorFT.java    From deep-spark with Apache License 2.0
@Override
protected void initDataSetDivineComedy(DeepSparkContext context) {
    JavaRDD<String> stringJavaRDD;

    //Divine Comedy
    List<String> lineas = readFile("/simpleDivineComedy.json");

    stringJavaRDD = context.parallelize(lineas);

    JavaRDD javaRDD = transformRDD(stringJavaRDD, SimpleBookEntity.class);

    originBook = javaRDD.first();

    DeepSparkContext.saveRDD(javaRDD.rdd(), getWriteExtractorConfig(BOOK_INPUT,
            SimpleBookEntity.class));
}
 
Example 12
Source File: ExtractorTest.java    From deep-spark with Apache License 2.0
protected void initDataSetDivineComedy(DeepSparkContext context) {
    JavaRDD<String> stringJavaRDD;

    //Divine Comedy
    List<String> lineas = readFile(DATA_TEST_DIVINE_COMEDY);

    stringJavaRDD = context.parallelize(lineas);

    JavaRDD<T> javaRDD = transformRDD(stringJavaRDD, configEntity);

    originBook = javaRDD.first();

    DeepSparkContext.saveRDD(javaRDD.rdd(), (ExtractorConfig<T>) getWriteExtractorConfig(BOOK_INPUT,
            configEntity));
}