Java Code Examples for org.apache.spark.api.java.JavaPairRDD#map()

The following examples show how to use org.apache.spark.api.java.JavaPairRDD#map(). Each example is taken from an open-source project; the source file, originating project, and license are noted above it.
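Before the project examples, here is a minimal, self-contained sketch of the call itself (the class name, local-mode context, and sample data are illustrative and not taken from any of the projects below). The point it shows: map() on a JavaPairRDD receives the whole key/value Tuple2 and returns a plain JavaRDD, whereas mapValues() keeps the key and transforms only the value.

import java.util.Arrays;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class JavaPairRDDMapSketch {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext("local[*]", "JavaPairRDDMapSketch");

        // Build a small pair RDD of (word, length) tuples.
        JavaPairRDD<String, Integer> pairs = sc
                .parallelize(Arrays.asList("spark", "java", "rdd"))
                .mapToPair(word -> new Tuple2<>(word, word.length()));

        // map() sees the whole Tuple2 and may change both type and structure,
        // producing a JavaRDD<String> rather than another pair RDD.
        JavaRDD<String> formatted = pairs.map(tuple -> tuple._1() + "=" + tuple._2());

        formatted.collect().forEach(System.out::println);
        sc.stop();
    }
}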
Example 1
Source File: ChronixRDD.java    From chronix.spark with Apache License 2.0
/**
 * Transformation: Joins the time series according to their identity.
 *
 * @return joined time series
 */
public ChronixRDD joinChunks() {
    JavaPairRDD<MetricTimeSeriesKey, Iterable<MetricTimeSeries>> groupRdd
            = this.groupBy(MetricTimeSeriesKey::new);

    JavaPairRDD<MetricTimeSeriesKey, MetricTimeSeries> joinedRdd
            = groupRdd.mapValues((Function<Iterable<MetricTimeSeries>, MetricTimeSeries>) mtsIt -> {
        MetricTimeSeriesOrdering ordering = new MetricTimeSeriesOrdering();
        List<MetricTimeSeries> orderedChunks = ordering.immutableSortedCopy(mtsIt);
        MetricTimeSeries result = null;
        for (MetricTimeSeries mts : orderedChunks) {
            if (result == null) {
                result = new MetricTimeSeries
                        .Builder(mts.getMetric())
                        .attributes(mts.attributes()).build();
            }
            result.addAll(mts.getTimestampsAsArray(), mts.getValuesAsArray());
        }
        return result;
    });

    JavaRDD<MetricTimeSeries> resultJavaRdd =
            joinedRdd.map((Tuple2<MetricTimeSeriesKey, MetricTimeSeries> mtTuple) -> mtTuple._2);

    return new ChronixRDD(resultJavaRdd);
}
 
Example 2
Source File: TsmmSPInstruction.java    From systemds with Apache License 2.0
@Override
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	
	//get input
	JavaPairRDD<MatrixIndexes,MatrixBlock> in = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
	
	//execute tsmm instruction (always produce exactly one output block)
	//(this formulation with values() requires --conf spark.driver.maxResultSize=0)
	JavaRDD<MatrixBlock> tmp = in.map(new RDDTSMMFunction(_type));
	MatrixBlock out = RDDAggregateUtils.sumStable(tmp);

	//put output block into symbol table (no lineage because single block)
	//this also includes implicit maintenance of matrix characteristics
	sec.setMatrixOutput(output.getName(), out);
}
 
Example 3
Source File: BatchTrafficDataProcessor.java    From lambda-arch with Apache License 2.0
/**
 * Method to get the vehicles which are within the radius of a POI, along with their distance from the POI.
 *
 * @param nonFilteredIotDataStream original IoT data stream
 * @param broadcastPOIValues       broadcast variable containing the POI coordinates, route, and vehicle type to monitor.
 */
public void processPOIData(
        JavaRDD<IoTData> nonFilteredIotDataStream,
        Broadcast<Tuple3<POIData, String, String>> broadcastPOIValues
) {
    // Filter by routeId, vehicleType, and POI range
    JavaRDD<IoTData> iotDataStreamFiltered = filterVehicleInPOIRange(nonFilteredIotDataStream, broadcastPOIValues);

    // pair with poi
    JavaPairRDD<IoTData, POIData> poiDStreamPair = iotDataStreamFiltered.mapToPair(
            iot -> new Tuple2<>(iot, broadcastPOIValues.value()._1())
    );

    // Transform to an RDD of POITrafficData
    JavaRDD<POITrafficData> trafficDStream = poiDStreamPair.map(poiTrafficDataFunc);
    persistPOI(trafficDStream);
}
 
Example 4
Source File: VariantsSparkSink.java    From gatk with BSD 3-Clause "New" or "Revised" License
private static JavaRDD<VariantContext> sortVariants(final JavaRDD<VariantContext> variants, final VCFHeader header, final int numReducers) {
    // Turn into key-value pairs so we can sort (by key). Values are null so there is no overhead in the amount
    // of data going through the shuffle.
    final JavaPairRDD<VariantContext, Void> rddVariantPairs = variants.mapToPair(variant -> new Tuple2<>(variant, (Void) null));

    // do a total sort so that all the records in partition i are less than those in partition i+1
    final Comparator<VariantContext> comparator = header.getVCFRecordComparator();
    final JavaPairRDD<VariantContext, Void> variantVoidPairs;
    if (comparator == null){
        variantVoidPairs = rddVariantPairs; //no sort
    } else if (numReducers > 0) {
        variantVoidPairs = rddVariantPairs.sortByKey(comparator, true, numReducers);
    } else {
        variantVoidPairs = rddVariantPairs.sortByKey(comparator);
    }

    return variantVoidPairs.map(Tuple2::_1);
}
 
Example 5
Source File: CopybookSparkExample.java    From CopybookInputFormat with Apache License 2.0
public static void main(String[] args) {
	if (args.length < 4) {
		System.out.println("CopybookSparkExample {master} {copybookInputPath} {dataFileInputPath} {outputFolder}");
		return;
	}

	String master = args[0];
	String copybookInputPath = args[1];
	String dataFileInputPath = args[2];
	String outputPath = args[3];

	JavaSparkContext jsc = new JavaSparkContext(master,
			"UniqueSeqGenerator", null, "SparkCopybookExample.jar");

	Configuration config = new Configuration();
	config.addResource(new Path("/etc/hadoop/conf/hdfs-site.xml"));
	config.addResource(new Path("/etc/hadoop/conf/mapred-site.xml"));
	config.addResource(new Path("/etc/hadoop/conf/yarn-site.xml"));
	config.addResource(new Path("/etc/hadoop/conf/core-site.xml"));
	CopybookInputFormat.setCopybookHdfsPath(config, copybookInputPath);
	
	JavaPairRDD<LongWritable, Text> rdd = jsc.newAPIHadoopFile(dataFileInputPath, CopybookInputFormat.class, LongWritable.class, Text.class, config);
	JavaRDD<String> pipeDelimiter = rdd.map(new MapFunction());

	pipeDelimiter.saveAsTextFile(outputPath);
}
 
Example 6
Source File: ALSUpdate.java    From oryx with Apache License 2.0
/**
 * Combines {@link Rating}s with the same user/item into one, with score as the sum of
 * all of the scores.
 */
private JavaRDD<Rating> aggregateScores(JavaRDD<? extends Rating> original, double epsilon) {
  JavaPairRDD<Tuple2<Integer,Integer>,Double> tuples =
      original.mapToPair(rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating()));

  JavaPairRDD<Tuple2<Integer,Integer>,Double> aggregated;
  if (implicit) {
    // TODO can we avoid groupByKey? reduce, combine, fold don't seem viable since
    // they don't guarantee the delete elements are properly handled
    aggregated = tuples.groupByKey().mapValues(MLFunctions.SUM_WITH_NAN);
  } else {
    // For non-implicit, last wins.
    aggregated = tuples.foldByKey(Double.NaN, (current, next) -> next);
  }

  JavaPairRDD<Tuple2<Integer,Integer>,Double> noNaN =
      aggregated.filter(kv -> !Double.isNaN(kv._2()));

  if (logStrength) {
    return noNaN.map(userProductScore -> new Rating(
        userProductScore._1()._1(),
        userProductScore._1()._2(),
        Math.log1p(userProductScore._2() / epsilon)));
  } else {
    return noNaN.map(userProductScore -> new Rating(
        userProductScore._1()._1(),
        userProductScore._1()._2(),
        userProductScore._2()));
  }
}
 
Example 7
Source File: FileSystemInput.java    From envelope with Apache License 2.0
private Dataset<Row> getEncodedRowsFromInputFormat(String path, Class<? extends InputFormat> inputFormatClass) {
  JavaSparkContext context = new JavaSparkContext(Contexts.getSparkSession().sparkContext());
  JavaPairRDD rawRDD = context.newAPIHadoopFile(
      path, inputFormatClass, convertToClass(getKeyDataType()), convertToClass(getValueDataType()),
      new Configuration());

  boolean useKey = getKeyDataType() != null;
  JavaRDD<Row> encodedRDD = rawRDD.map(new EncodeRecordAsKeyValueFunction(useKey));

  return Contexts.getSparkSession().createDataFrame(encodedRDD, getProvidingSchema());
}
 
Example 8
Source File: BatchHeatMapProcessor.java    From lambda-arch with Apache License 2.0
private JavaRDD<HeatMapData> getCountInArea(JavaPairRDD<Coordinate, Integer> tuples, Date day) throws IOException {
    JavaRDD<HeatMapData> map = tuples.map(tuple -> {
                Coordinate coordinate = tuple._1();
                return new HeatMapData(coordinate.getLatitude(), coordinate.getLongitude(), tuple._2(), day);
            }
    );
    return map;
}
 
Example 9
Source File: AvroDataSupplier.java    From tablasco with Apache License 2.0
@Override
public DistributedTable get()
{
    JavaPairRDD<AvroWrapper, NullWritable> avroRdd = this.sparkContext.hadoopFile(this.dataPath.toString(), AvroInputFormat.class, AvroWrapper.class, NullWritable.class);
    LOGGER.info("data location: {}", this.dataPath);
    List<String> headers = avroRdd.keys().map(new AvroHeadersFunction()).first();
    LOGGER.info("data headers: {}", headers);
    JavaRDD<List<Object>> rows = avroRdd.map(new AvroRowsFunction(headers));
    return new DistributedTable(headers, rows);
}
 
Example 10
Source File: SQLQueryBAM.java    From ViraPipe with MIT License
public static void main(String[] args) throws IOException {
  SparkConf conf = new SparkConf().setAppName("SQLQueryBAM");

  JavaSparkContext sc = new JavaSparkContext(conf);
  SQLContext sqlContext = new HiveContext(sc.sc());

  Options options = new Options();
  Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." );
  Option queryOpt = new Option( "query", true, "SQL query string." );
  Option baminOpt = new Option( "in", true, "" );

  options.addOption( opOpt );
  options.addOption( queryOpt );
  options.addOption( baminOpt );
  CommandLineParser parser = new BasicParser();
  CommandLine cmd = null;
  try {
    cmd = parser.parse( options, args );
  }
  catch( ParseException exp ) {
    System.err.println( "Parsing failed.  Reason: " + exp.getMessage() );
    return; // cmd would be null below
  }

  String bwaOutDir = cmd.hasOption("out") ? cmd.getOptionValue("out") : null;
  String query = cmd.hasOption("query") ? cmd.getOptionValue("query") : null;
  String bamin = cmd.hasOption("in") ? cmd.getOptionValue("in") : null;

  sc.hadoopConfiguration().setBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, true);

  //Read BAM/SAM from HDFS
  JavaPairRDD<LongWritable, SAMRecordWritable> bamPairRDD = sc.newAPIHadoopFile(bamin, AnySAMInputFormat.class, LongWritable.class, SAMRecordWritable.class, sc.hadoopConfiguration());
  //Map to SAMRecord RDD
  JavaRDD<SAMRecord> samRDD = bamPairRDD.map(v1 -> v1._2().get());
  JavaRDD<MyAlignment> rdd = samRDD.map(bam -> new MyAlignment(bam.getReadName(), bam.getStart(), bam.getReferenceName(), bam.getReadLength(), new String(bam.getReadBases(), StandardCharsets.UTF_8), bam.getCigarString(), bam.getReadUnmappedFlag(), bam.getDuplicateReadFlag()));

  Dataset<Row> samDF = sqlContext.createDataFrame(rdd, MyAlignment.class);
  samDF.registerTempTable(tablename);
  if(query!=null) {

    //Save as parquet file
    Dataset df2 = sqlContext.sql(query);
    df2.show(100,false);

    if(bwaOutDir!=null)
      df2.write().parquet(bwaOutDir);

  }else{
    if(bwaOutDir!=null)
      samDF.write().parquet(bwaOutDir);
  }

  sc.stop();

}
 
Example 11
Source File: AggregateUnarySPInstruction.java    From systemds with Apache License 2.0
private void processTensorAggregate(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;

	//get input
	// TODO support DataTensor
	JavaPairRDD<TensorIndexes, TensorBlock> in = sec.getBinaryTensorBlockRDDHandleForVariable( input1.getName() );
	JavaPairRDD<TensorIndexes, TensorBlock> out = in;

	// TODO: filter input blocks for trace
	//execute unary aggregate operation
	AggregateUnaryOperator auop = (AggregateUnaryOperator)_optr;
	AggregateOperator aggop = _aop;

	//perform aggregation if necessary and put output into symbol table
	if( _aggtype == SparkAggType.SINGLE_BLOCK )
	{
		// TODO filter non empty blocks if sparse safe
		JavaRDD<TensorBlock> out2 = out.map(new RDDUTensorAggFunction2(auop));
		TensorBlock out3 = RDDAggregateUtils.aggStableTensor(out2, aggop);

		//put output block into symbol table (no lineage because single block)
		//this also includes implicit maintenance of data characteristics
		// TODO generalize to drop depending on location of correction
		// TODO support DataTensor
		TensorBlock out4 = new TensorBlock(out3.getValueType(), new int[]{1, 1});
		out4.set(0, 0, out3.get(0, 0));
		sec.setTensorOutput(output.getName(), out4);
	}
	else //MULTI_BLOCK or NONE
	{
		if( _aggtype == SparkAggType.NONE ) {
			//in case of no block aggregation, we always drop the correction as well as
			//use a partitioning-preserving mapvalues
			out = out.mapValues(new RDDUTensorAggValueFunction(auop));
		}
		else if( _aggtype == SparkAggType.MULTI_BLOCK ) {
			// TODO MULTI_BLOCK
			throw new DMLRuntimeException("Multi block spark aggregations are not supported for tensors yet.");
			/*
			//in case of multi-block aggregation, we always keep the correction
			out = out.mapToPair(new RDDUTensorAggFunction(auop, dc.getBlocksize(), dc.getBlocksize()));
			out = RDDAggregateUtils.aggByKeyStable(out, aggop, false);

			//drop correction after aggregation if required (aggbykey creates
			//partitioning, drop correction via partitioning-preserving mapvalues)
			if( auop.aggOp.correctionExists )
				out = out.mapValues( new AggregateDropCorrectionFunction(aggop) );
			 */
		}

		//put output RDD handle into symbol table
		updateUnaryAggOutputDataCharacteristics(sec, auop.indexFn);
		sec.setRDDHandleForVariable(output.getName(), out);
		sec.addLineageRDD(output.getName(), input1.getName());
	}
}
 
Example 12
Source File: SecondaryStructureExtractor.java    From mmtf-spark with Apache License 2.0
public static JavaRDD<Row> getJavaRDD(JavaPairRDD<String, StructureDataInterface> structure) {
	return structure.map(t -> getSecStructFractions(t));
}
 
Example 13
Source File: TestSequenceRecordReaderBytesFunction.java    From deeplearning4j with Apache License 2.0
@Test
public void testRecordReaderBytesFunction() throws Exception {

    //Local file path
    File f = testDir.newFolder();
    new ClassPathResource("datavec-spark/video/").copyDirectory(f);
    String path = f.getAbsolutePath() + "/*";

    //Load binary data from local file system, convert to a sequence file:
    //Load and convert
    JavaPairRDD<String, PortableDataStream> origData = sc.binaryFiles(path);
    JavaPairRDD<Text, BytesWritable> filesAsBytes = origData.mapToPair(new FilesAsBytesFunction());
    //Write the sequence file:
    Path p = Files.createTempDirectory("dl4j_rrbytesTest");
    p.toFile().deleteOnExit();
    String outPath = p.toString() + "/out";
    filesAsBytes.saveAsNewAPIHadoopFile(outPath, Text.class, BytesWritable.class, SequenceFileOutputFormat.class);

    //Load data from sequence file, parse via SequenceRecordReader:
    JavaPairRDD<Text, BytesWritable> fromSeqFile = sc.sequenceFile(outPath, Text.class, BytesWritable.class);
    SequenceRecordReader seqRR = new CodecRecordReader();
    Configuration conf = new Configuration();
    conf.set(CodecRecordReader.RAVEL, "true");
    conf.set(CodecRecordReader.START_FRAME, "0");
    conf.set(CodecRecordReader.TOTAL_FRAMES, "25");
    conf.set(CodecRecordReader.ROWS, "64");
    conf.set(CodecRecordReader.COLUMNS, "64");
    Configuration confCopy = new Configuration(conf);
    seqRR.setConf(conf);
    JavaRDD<List<List<Writable>>> dataVecData = fromSeqFile.map(new SequenceRecordReaderBytesFunction(seqRR));



    //Next: do the same thing locally, and compare the results
    InputSplit is = new FileSplit(f, new String[] {"mp4"}, true);
    SequenceRecordReader srr = new CodecRecordReader();
    srr.initialize(is);
    srr.setConf(confCopy);

    List<List<List<Writable>>> list = new ArrayList<>(4);
    while (srr.hasNext()) {
        list.add(srr.sequenceRecord());
    }
    assertEquals(4, list.size());

    List<List<List<Writable>>> fromSequenceFile = dataVecData.collect();

    assertEquals(4, list.size());
    assertEquals(4, fromSequenceFile.size());

    boolean[] found = new boolean[4];
    for (int i = 0; i < 4; i++) {
        int foundIndex = -1;
        List<List<Writable>> collection = fromSequenceFile.get(i);
        for (int j = 0; j < 4; j++) {
            if (collection.equals(list.get(j))) {
                if (foundIndex != -1)
                    fail(); //Already found this value -> suggests this spark value equals two or more of local version? (Shouldn't happen)
                foundIndex = j;
                if (found[foundIndex])
                    fail(); //One of the other spark values was equal to this one -> suggests duplicates in Spark list
                found[foundIndex] = true; //mark this one as seen before
            }
        }
    }
    int count = 0;
    for (boolean b : found)
        if (b)
            count++;
    assertEquals(4, count); //Expect all 4 and exactly 4 pairwise matches between spark and local versions
}
 
Example 14
Source File: AreaTop3ProductSpark.java    From BigDataPlatform with GNU General Public License v3.0
/**
 * Generate the temporary table of basic clicked-product information
 * @param sqlContext
 * @param cityid2clickActionRDD
 * @param cityid2cityInfoRDD
 */
private static void generateTempClickProductBasicTable(
		SQLContext sqlContext,
		JavaPairRDD<Long, Row> cityid2clickActionRDD,
		JavaPairRDD<Long, Row> cityid2cityInfoRDD) {
	// Perform the join to associate click-action data with city data
	JavaPairRDD<Long, Tuple2<Row, Row>> joinedRDD =
			cityid2clickActionRDD.join(cityid2cityInfoRDD);
	
	// Convert the JavaPairRDD above into a JavaRDD<Row> (required before the RDD can be converted to a Dataset<Row>)
	JavaRDD<Row> mappedRDD = joinedRDD.map(
			
			new Function<Tuple2<Long,Tuple2<Row,Row>>, Row>() {

				private static final long serialVersionUID = 1L;

				@Override
				public Row call(Tuple2<Long, Tuple2<Row, Row>> tuple)
						throws Exception {
					Long cityid = tuple._1;
					Row clickAction = tuple._2._1;
					Row cityInfo = tuple._2._2;
					
					Long productid = clickAction.getLong(1);
					String cityName = cityInfo.getString(1);
					String area = cityInfo.getString(2);
					
					return RowFactory.create(cityid, cityName, area, productid);  
				}
				
			});
	
	// With the data in JavaRDD<Row> form, it can be converted to a Dataset<Row>
	List<StructField> structFields = new ArrayList<StructField>();
	structFields.add(DataTypes.createStructField("city_id", DataTypes.LongType, true));
	structFields.add(DataTypes.createStructField("city_name", DataTypes.StringType, true));
	structFields.add(DataTypes.createStructField("area", DataTypes.StringType, true));
	structFields.add(DataTypes.createStructField("product_id", DataTypes.LongType, true));
	
	// 1 Beijing
	// 2 Shanghai
	// 1 Beijing
	// group by area,product_id
	// 1: Beijing, 2: Shanghai
	
	// Two functions
	// UDF: concat2(), concatenates two fields with a given separator
	// UDAF: group_concat_distinct(), concatenates the values of a field within a group into a comma-separated, de-duplicated string
	
	StructType schema = DataTypes.createStructType(structFields);

	Dataset<Row> df = sqlContext.createDataFrame(mappedRDD, schema);
	System.out.println("tmp_click_product_basic: " + df.count());  
	
	// Register the data in the Dataset<Row> as a temporary table (tmp_click_product_basic)
	df.registerTempTable("tmp_click_product_basic");  
}
 
Example 15
Source File: SQLQueryFastq.java    From ViraPipe with MIT License
public static void main(String[] args) throws IOException {
  SparkConf conf = new SparkConf().setAppName("SQLQueryFastq");

  JavaSparkContext sc = new JavaSparkContext(conf);
  SQLContext sqlContext = new SQLContext(sc);

  Options options = new Options();

  Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." );
  Option queryOpt = new Option( "query", true, "SQL query string." );
  Option samOpt = new Option( "format", true, "parquet or fastq" );
  Option baminOpt = new Option( "in", true, "" );
  options.addOption( new Option( "tablename", true, "Default sql table name is 'records'"));

  options.addOption( opOpt );
  options.addOption( queryOpt );
  options.addOption( samOpt );
  options.addOption( baminOpt );
  CommandLineParser parser = new BasicParser();
  CommandLine cmd = null;
  try {
    // parse the command line arguments
    cmd = parser.parse( options, args );
  }
  catch( ParseException exp ) {
    // oops, something went wrong
    System.err.println( "Parsing failed.  Reason: " + exp.getMessage() );
    return; // cmd would be null below
  }

  String outDir = cmd.hasOption("out") ? cmd.getOptionValue("out") : null;
  String query = cmd.hasOption("query") ? cmd.getOptionValue("query") : null;
  String format = cmd.hasOption("format") ? cmd.getOptionValue("format") : "fastq";
  String in = cmd.hasOption("in") ? cmd.getOptionValue("in") : null;
  tablename = cmd.hasOption("tablename") ? cmd.getOptionValue("tablename") : "records";

  sc.hadoopConfiguration().setBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, true);

  JavaPairRDD<Text, SequencedFragment> fastqRDD = sc.newAPIHadoopFile(in, FastqInputFormat.class, Text.class, SequencedFragment.class, sc.hadoopConfiguration());

  JavaRDD<MyRead> rdd = fastqRDD.map(record -> {
    MyRead read = new MyRead();
    read.setKey(record._1.toString());
    read.setSequence(record._2.getSequence().toString());
    read.setRead(record._2.getRead());
    read.setQuality(record._2.getQuality().toString());

    read.setTile(record._2.getTile());
    read.setXpos(record._2.getXpos());
    read.setYpos(record._2.getYpos());
    read.setRunNumber(record._2.getRunNumber());
    read.setInstrument(record._2.getInstrument());
    read.setFlowcellId(record._2.getFlowcellId());
    read.setLane(record._2.getLane());
    read.setControlNumber(record._2.getControlNumber());
    read.setFilterPassed(record._2.getFilterPassed());

    return read;
  });

  Dataset df = sqlContext.createDataFrame(rdd, MyRead.class);
  df.registerTempTable(tablename);
  //e.g. count duplicates: "SELECT count(DISTINCT(sequence)) FROM records"
  //"SELECT key,LEN(sequence) as l FROM records where l<100;"
  if(query!=null) {

    //JavaRDD<MyAlignment> rdd = samRDD.map(bam -> new MyAlignment(bam.getReadName(), bam.getStart(), bam.getReferenceName(), bam.getReadLength(), new String(bam.getReadBases(), StandardCharsets.UTF_8), bam.getCigarString(), bam.getReadUnmappedFlag(), bam.getDuplicateReadFlag(), bam));
    //Save as parquet file
    Dataset<Row> resultDF = sqlContext.sql(query);
    resultDF.show(100, false);

    if(outDir!=null){
      if(format.equals("fastq")){
        JavaPairRDD<Text, SequencedFragment> resultRDD = dfToFastqRDD(resultDF);
        resultRDD.saveAsNewAPIHadoopFile(outDir, Text.class, SequencedFragment.class, FastqOutputFormat.class, sc.hadoopConfiguration());
      }
      else
        resultDF.write().parquet(outDir);
    }
  }
  sc.stop();

}
 
Example 16
Source File: SamToFastq.java    From ViraPipe with MIT License
public static void main(String[] args) throws IOException {
  SparkConf conf = new SparkConf().setAppName("SamToFastq");
  sc = new JavaSparkContext(conf);

  String in = args[0];
  String out = args[1];

  JavaPairRDD<LongWritable, SAMRecordWritable> bamPairRDD = sc.newAPIHadoopFile(in, AnySAMInputFormat.class, LongWritable.class, SAMRecordWritable.class, sc.hadoopConfiguration());
  //Map to SAMRecord RDD
  JavaRDD<SAMRecord> samRDD = bamPairRDD.map(v1 -> v1._2().get());

  JavaPairRDD<Text, SequencedFragment> fastqrdd = mapSAMRecordsToFastq(samRDD);

  fastqrdd.saveAsNewAPIHadoopFile(out, Text.class, SequencedFragment.class, FastqOutputFormat.class, sc.hadoopConfiguration());

  sc.stop();

}
 
Example 17
Source File: SparkUtils.java    From deeplearning4j with Apache License 2.0
public static <T> JavaRDD<T> repartitionApproximateBalance(JavaRDD<T> rdd, Repartition repartition,
                int numPartitions) {
    int origNumPartitions = rdd.partitions().size();
    switch (repartition) {
        case Never:
            return rdd;
        case NumPartitionsWorkersDiffers:
            if (origNumPartitions == numPartitions)
                return rdd;
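            // intentional fall-through into Always when the partition count differs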
        case Always:
            // Count each partition...
            List<Integer> partitionCounts =
                            rdd.mapPartitionsWithIndex(new Function2<Integer, Iterator<T>, Iterator<Integer>>() {
                                @Override
                                public Iterator<Integer> call(Integer integer, Iterator<T> tIterator)
                                                throws Exception {
                                    int count = 0;
                                    while (tIterator.hasNext()) {
                                        tIterator.next();
                                        count++;
                                    }
                                    return Collections.singletonList(count).iterator();
                                }
                            }, true).collect();

            Integer totalCount = 0;
            for (Integer i : partitionCounts)
                totalCount += i;
            List<Double> partitionWeights = new ArrayList<>(Math.max(numPartitions, origNumPartitions));
            Double ideal = (double) totalCount / numPartitions;
            // partitions in the initial set and not in the final one get -1 => elements always jump
            // partitions in the final set not in the initial one get 0 => aim to receive the average amount
            for (int i = 0; i < Math.min(origNumPartitions, numPartitions); i++) {
                partitionWeights.add((double) partitionCounts.get(i) / ideal);
            }
            for (int i = Math.min(origNumPartitions, numPartitions); i < Math.max(origNumPartitions,
                            numPartitions); i++) {
                // we shrink the # of partitions
                if (i >= numPartitions)
                    partitionWeights.add(-1D);
                // we enlarge the # of partitions
                else
                    partitionWeights.add(0D);
            }

            // this method won't trigger a spark job, which is different from {@link org.apache.spark.rdd.RDD#zipWithIndex}

            JavaPairRDD<Tuple2<Long, Integer>, T> indexedRDD = rdd.zipWithUniqueId()
                            .mapToPair(new PairFunction<Tuple2<T, Long>, Tuple2<Long, Integer>, T>() {
                                @Override
                                public Tuple2<Tuple2<Long, Integer>, T> call(Tuple2<T, Long> tLongTuple2) {
                                    return new Tuple2<>(
                                                    new Tuple2<Long, Integer>(tLongTuple2._2(), 0),
                                                    tLongTuple2._1());
                                }
                            });

            HashingBalancedPartitioner hbp =
                            new HashingBalancedPartitioner(Collections.singletonList(partitionWeights));
            JavaPairRDD<Tuple2<Long, Integer>, T> partitionedRDD = indexedRDD.partitionBy(hbp);

            return partitionedRDD.map(new Function<Tuple2<Tuple2<Long, Integer>, T>, T>() {
                @Override
                public T call(Tuple2<Tuple2<Long, Integer>, T> indexNPayload) {
                    return indexNPayload._2();
                }
            });
        default:
            throw new RuntimeException("Unknown setting for repartition: " + repartition);
    }
}
 
Example 18
Source File: AggregateUnarySPInstruction.java    From systemds with Apache License 2.0
private void processMatrixAggregate(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	DataCharacteristics mc = sec.getDataCharacteristics(input1.getName());

	//get input
	JavaPairRDD<MatrixIndexes,MatrixBlock> in = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
	JavaPairRDD<MatrixIndexes,MatrixBlock> out = in;

	//filter input blocks for trace
	if( getOpcode().equalsIgnoreCase("uaktrace") )
		out = out.filter(new FilterDiagMatrixBlocksFunction());

	//execute unary aggregate operation
	AggregateUnaryOperator auop = (AggregateUnaryOperator)_optr;
	AggregateOperator aggop = _aop;

	//perform aggregation if necessary and put output into symbol table
	if( _aggtype == SparkAggType.SINGLE_BLOCK )
	{
		if( auop.sparseSafe )
			out = out.filter(new FilterNonEmptyBlocksFunction());

		JavaRDD<MatrixBlock> out2 = out.map(
				new RDDUAggFunction2(auop, mc.getBlocksize()));
		MatrixBlock out3 = RDDAggregateUtils.aggStable(out2, aggop);

		//drop correction after aggregation
		out3.dropLastRowsOrColumns(aggop.correction);

		//put output block into symbol table (no lineage because single block)
		//this also includes implicit maintenance of matrix characteristics
		sec.setMatrixOutput(output.getName(), out3);
	}
	else //MULTI_BLOCK or NONE
	{
		if( _aggtype == SparkAggType.NONE ) {
			//in case of no block aggregation, we always drop the correction as well as
			//use a partitioning-preserving mapvalues
			out = out.mapValues(new RDDUAggValueFunction(auop, mc.getBlocksize()));
		}
		else if( _aggtype == SparkAggType.MULTI_BLOCK ) {
			//in case of multi-block aggregation, we always keep the correction
			out = out.mapToPair(new RDDUAggFunction(auop, mc.getBlocksize()));
			out = RDDAggregateUtils.aggByKeyStable(out, aggop, false);

			//drop correction after aggregation if required (aggbykey creates
			//partitioning, drop correction via partitioning-preserving mapvalues)
			if( auop.aggOp.existsCorrection() )
				out = out.mapValues( new AggregateDropCorrectionFunction(aggop) );
		}

		//put output RDD handle into symbol table
		updateUnaryAggOutputDataCharacteristics(sec, auop.indexFn);
		sec.setRDDHandleForVariable(output.getName(), out);
		sec.addLineageRDD(output.getName(), input1.getName());
	}
}