Java Code Examples for org.apache.spark.api.java.JavaRDD#filter()

The following examples show how to use org.apache.spark.api.java.JavaRDD#filter(). Each example is drawn from an open-source project; the source file, originating project, and license are noted above the code.
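Before the project-specific examples, here is a minimal, self-contained sketch of the call itself (the application name, input values, and variable names are illustrative, not taken from any of the projects below). filter() keeps the elements for which the predicate returns Boolean.TRUE; the predicate can be a Java 8 lambda, a method reference, or an anonymous org.apache.spark.api.java.function.Function, as the examples further down show.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

public class FilterSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("filter-sketch").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> lines = sc.parallelize(Arrays.asList("spark", "hadoop", "spark sql", "flink"));

        // Java 8 lambda form: keep only the elements containing "spark"
        JavaRDD<String> sparkLines = lines.filter(line -> line.contains("spark"));

        // Equivalent pre-Java-8 anonymous Function form
        JavaRDD<String> sparkLinesOld = lines.filter(new Function<String, Boolean>() {
            @Override
            public Boolean call(String line) {
                return line.contains("spark");
            }
        });

        List<String> kept = sparkLines.collect(); // ["spark", "spark sql"]
        System.out.println(kept + " / " + sparkLinesOld.count());
        sc.stop();
    }
}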
Example 1
Source File: AbstractJavaEsSparkTest.java    From elasticsearch-hadoop with Apache License 2.0
public void testEsRDDZReadJson() throws Exception {
    String target = "spark-test-java-basic-json-read/data";

    RestUtils.touch("spark-test-java-basic-json-read");
    RestUtils.postData(target, "{\"message\" : \"Hello World\",\"message_date\" : \"2014-05-25\"}".getBytes());
    RestUtils.postData(target, "{\"message\" : \"Goodbye World\",\"message_date\" : \"2014-05-25\"}".getBytes());
    RestUtils.refresh("spark-test*");

    JavaRDD<String> esRDD = JavaEsSpark.esJsonRDD(sc, target).values();
    System.out.println(esRDD.collect());
    JavaRDD<String> messages = esRDD.filter(new Function<String, Boolean>() {
        @Override
        public Boolean call(String string) throws Exception {
            return string.contains("message");
        }
    });

    // Java 8+ lambda equivalent:
    //esRDD.filter(m -> m.contains("message"));

    assertThat((int) messages.count(), is(2));
    System.out.println(messages.take(10));
    System.out.println(messages);
}
 
Example 2
Source File: MarkDuplicatesSparkUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Test(dataProvider = "md", groups = "spark")
public void markDupesTest(final String input, final long totalExpected, final long dupsExpected) {
    final GATKPath inputPathSpec = new GATKPath(input);
    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();

    ReadsSparkSource readSource = new ReadsSparkSource(ctx);
    JavaRDD<GATKRead> reads = readSource.getParallelReads(inputPathSpec, null);
    Assert.assertEquals(reads.count(), totalExpected);

    SAMFileHeader header = readSource.getHeader(inputPathSpec, null);
    OpticalDuplicatesArgumentCollection opticalDuplicatesArgumentCollection = new OpticalDuplicatesArgumentCollection();
    final OpticalDuplicateFinder finder = opticalDuplicatesArgumentCollection.READ_NAME_REGEX != null ?
            new OpticalDuplicateFinder(opticalDuplicatesArgumentCollection.READ_NAME_REGEX, opticalDuplicatesArgumentCollection.OPTICAL_DUPLICATE_PIXEL_DISTANCE, null) : null;
    JavaRDD<GATKRead> markedReads = MarkDuplicatesSpark.mark(reads, header, MarkDuplicatesScoringStrategy.SUM_OF_BASE_QUALITIES, finder, 1, false, MarkDuplicates.DuplicateTaggingPolicy.DontTag);

    Assert.assertEquals(markedReads.count(), totalExpected);
    JavaRDD<GATKRead> dupes = markedReads.filter(GATKRead::isDuplicate);

    Assert.assertEquals(dupes.count(), dupsExpected);
}
 
Example 3
Source File: Algorithm.java    From predictionio-template-java-ecom-recommender with Apache License 2.0
private JavaRDD<ItemScore> validScores(JavaRDD<ItemScore> all, final Set<String> whitelist, final Set<String> blacklist, final Set<String> categories, final Map<String, Item> items, String userEntityId) {
    final Set<String> seenItemEntityIds = seenItemEntityIds(userEntityId);
    final Set<String> unavailableItemEntityIds = unavailableItemEntityIds();

    return all.filter(new Function<ItemScore, Boolean>() {
        @Override
        public Boolean call(ItemScore itemScore) throws Exception {
            Item item = items.get(itemScore.getItemEntityId());

            return (item != null
                    && passWhitelistCriteria(whitelist, item.getEntityId())
                    && passBlacklistCriteria(blacklist, item.getEntityId())
                    && passCategoryCriteria(categories, item)
                    && passUnseenCriteria(seenItemEntityIds, item.getEntityId())
                    && passAvailabilityCriteria(unavailableItemEntityIds, item.getEntityId()));
        }
    });
}
 
Example 4
Source File: DeleteHelper.java    From hudi with Apache License 2.0
public static <T extends HoodieRecordPayload<T>> HoodieWriteMetadata execute(String instantTime,
                                                                             JavaRDD<HoodieKey> keys, JavaSparkContext jsc, HoodieWriteConfig config, HoodieTable<T> table,
                                                                             CommitActionExecutor<T> deleteExecutor) {
  try {
    HoodieWriteMetadata result = null;
    // De-dupe/merge if needed
    JavaRDD<HoodieKey> dedupedKeys = config.shouldCombineBeforeDelete() ? deduplicateKeys(keys, table) : keys;

    JavaRDD<HoodieRecord<T>> dedupedRecords =
        dedupedKeys.map(key -> new HoodieRecord(key, new EmptyHoodieRecordPayload()));
    Instant beginTag = Instant.now();
    // perform index lookup to get the existing location of records
    JavaRDD<HoodieRecord<T>> taggedRecords =
        ((HoodieTable<T>)table).getIndex().tagLocation(dedupedRecords, jsc, (HoodieTable<T>)table);
    Duration tagLocationDuration = Duration.between(beginTag, Instant.now());

    // filter out non-existent keys/records
    JavaRDD<HoodieRecord<T>> taggedValidRecords = taggedRecords.filter(HoodieRecord::isCurrentLocationKnown);
    if (!taggedValidRecords.isEmpty()) {
      result = deleteExecutor.execute(taggedValidRecords);
      result.setIndexLookupDuration(tagLocationDuration);
    } else {
      // if the entire set of keys is non-existent
      deleteExecutor.saveWorkloadProfileMetadataToInflight(new WorkloadProfile(jsc.emptyRDD()), instantTime);
      result = new HoodieWriteMetadata();
      result.setWriteStatuses(jsc.emptyRDD());
      deleteExecutor.commitOnAutoCommit(result);
    }
    return result;
  } catch (Throwable e) {
    if (e instanceof HoodieUpsertException) {
      throw (HoodieUpsertException) e;
    }
    throw new HoodieUpsertException("Failed to delete for commit time " + instantTime, e);
  }
}
 
Example 5
Source File: CollectBaseDistributionByCycleSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Computes the base distribution by cycle and creates a metrics file with the relevant histograms.
 */
public MetricsFile<BaseDistributionByCycleMetrics, Integer> calculateBaseDistributionByCycle(final JavaRDD<GATKRead> reads){
    final MetricsReadFilter metricsFilter =
        new MetricsReadFilter(this.pfReadsOnly, this.alignedReadsOnly);
    final JavaRDD<GATKRead> filteredReads = reads.filter(read -> metricsFilter.test(read));
    final HistogramGenerator hist = filteredReads.aggregate(new HistogramGenerator(),
            (hgp, read) -> hgp.addRead(read),
            (hgp1, hgp2) -> hgp1.merge(hgp2));

    final MetricsFile<BaseDistributionByCycleMetrics, Integer> metricsFile = getMetricsFile();
    hist.addToMetricsFile(metricsFile);
    return metricsFile;
}
 
Example 6
Source File: Grep.java    From flink-perf with Apache License 2.0
public static void main(String[] args) {
	String master = args[0];
	String inFile = args[1];
	String outFile = args[2];

	String patterns[] = new String[args.length-3];
	System.arraycopy(args,3,patterns,0,args.length-3);
	System.err.println("Starting spark with master="+master+" in="+inFile);
	System.err.println("Using patterns: "+ Arrays.toString(patterns));

	SparkConf conf = new SparkConf().setAppName("Grep job").setMaster(master).set("spark.hadoop.validateOutputSpecs", "false");
	JavaSparkContext sc = new JavaSparkContext(conf);

	JavaRDD<String> file = sc.textFile(inFile);
	for(int p = 0; p < patterns.length; p++) {
		final String pattern = patterns[p];
		JavaRDD<String> res = file.filter(new Function<String, Boolean>() {
			private static final long serialVersionUID = 1L;
			Pattern p = Pattern.compile(pattern);

			@Override
			public Boolean call(String value) throws Exception {
				if (value == null || value.length() == 0) {
					return false;
				}
				final Matcher m = p.matcher(value);
				if (m.find()) {
					return true;
				}
				return false;
			}
		});
		res.saveAsTextFile(outFile+"_"+pattern);
	}
}
 
Example 7
Source File: ComputeResponse.java    From incubator-retired-pirk with Apache License 2.0
/**
 * Method to read the data from Elasticsearch, filter it, and return an RDD of MapWritable data elements
 */
@SuppressWarnings("unchecked")
public JavaRDD<MapWritable> readDataES() throws IOException, PIRException
{
  logger.info("Reading data ");

  JavaRDD<MapWritable> jsonRDD;

  Job job = Job.getInstance();
  String jobName = "pirSpark_ES_" + esQuery + "_" + System.currentTimeMillis();
  job.setJobName(jobName);
  job.getConfiguration().set("es.nodes", SystemConfiguration.getProperty("es.nodes"));
  job.getConfiguration().set("es.port", SystemConfiguration.getProperty("es.port"));
  job.getConfiguration().set("es.resource", esResource);
  job.getConfiguration().set("es.query", esQuery);

  jsonRDD = sc.newAPIHadoopRDD(job.getConfiguration(), EsInputFormat.class, Text.class, MapWritable.class).values().coalesce(numDataPartitions);

  // Filter out by the provided stopListFile entries
  if (qSchema.getFilter() != null)
  {
    return jsonRDD.filter(new FilterData(accum, bVars));
  }
  else
  {
    logger.info("qSchema.getFilter() is null");
    return jsonRDD;
  }
}
 
Example 8
Source File: QuadUtils.java    From rdf2x with Apache License 2.0
/**
 * Get quads with specified subjects filtered out, computed by querying an in-memory set of subjects
 *
 * @param quads            RDD of quads to filter
 * @param subjectBlacklist set of requested subject URIs to be filtered out
 * @return filtered RDD with only those quads whose subject is NOT in subjectBlacklist
 */
public static JavaRDD<Quad> filterQuadsByForbiddenSubjects(JavaRDD<Quad> quads, Set<String> subjectBlacklist) {
    if (subjectBlacklist.isEmpty()) {
        return quads;
    }
    return quads.filter(quad -> !quad.getSubject().isURI() ||
            !subjectBlacklist.contains(quad.getSubject().getURI())
    );
}
 
Example 9
Source File: RddChannel.java    From rheem with Apache License 2.0
public void accept(JavaRDD<?> rdd, SparkExecutor sparkExecutor) throws RheemException {
    if (this.isMarkedForInstrumentation() && !this.isRddCached()) {
        final Accumulator<Integer> accumulator = sparkExecutor.sc.accumulator(0);
        this.rdd = rdd.filter(dataQuantum -> {
            accumulator.add(1);
            return true;
        });
        this.accumulator = accumulator;
    } else {
        this.rdd = rdd;
    }
}
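The pass-through predicate in this example is an instrumentation trick: every element is kept (the predicate always returns true), while the accumulator counts how many elements flowed through the channel. Below is a compact, self-contained sketch of the same idea using the same Spark 1.x Accumulator API as above; the class and variable names are illustrative. Keep in mind that the count is only meaningful after an action has executed the stage, and that accumulator updates made inside transformations can be applied more than once if tasks are retried.

import java.util.Arrays;

import org.apache.spark.Accumulator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class PassThroughCountSketch {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(
                new SparkConf().setAppName("pass-through-count").setMaster("local[*]"));

        JavaRDD<String> lines = sc.parallelize(Arrays.asList("a", "b", "c"));

        // Keep every element, but count them as a side effect of the filter.
        Accumulator<Integer> counter = sc.accumulator(0);
        JavaRDD<String> instrumented = lines.filter(x -> {
            counter.add(1);
            return true;
        });

        instrumented.count();  // run an action so the accumulator is populated
        System.out.println("elements seen: " + counter.value());
        sc.stop();
    }
}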
 
Example 10
Source File: AnalyzeSpark.java    From deeplearning4j with Apache License 2.0
/**
 * Randomly sample a set of invalid values from a specified column.
 * Values are considered invalid according to the Schema / ColumnMetaData
 *
 * @param numToSample    Maximum number of invalid values to sample
 * @param columnName     Name of the column from which to sample invalid values
 * @param schema         Data schema
 * @param data           Data
 * @param ignoreMissing  If true: ignore missing values (NullWritable or empty/null string) when sampling. If false: include missing values in sampling
 * @return               List of invalid examples
 */
public static List<Writable> sampleInvalidFromColumn(int numToSample, String columnName, Schema schema,
                JavaRDD<List<Writable>> data, boolean ignoreMissing) {
    //First: filter out all valid entries, to leave only invalid entries
    int colIdx = schema.getIndexOfColumn(columnName);
    JavaRDD<Writable> ithColumn = data.map(new SelectColumnFunction(colIdx));

    ColumnMetaData meta = schema.getMetaData(columnName);

    JavaRDD<Writable> invalid = ithColumn.filter(new FilterWritablesBySchemaFunction(meta, false, ignoreMissing));

    return invalid.takeSample(false, numToSample);
}
 
Example 11
Source File: TransformationRDDTest.java    From hui-bigdata-spark with Apache License 2.0
/**
 * Union of two RDDs.
 * Purpose of this demo: find all records whose entry station is 广州南站 or 天河客运站.
 * @since hui_project 1.0.0
 */
@Test
public void testUnionAndFilter() {
    JavaRDD<String> textRDD = sparkContext.textFile(FILE_PATH);
    JavaRDD<String> result = textRDD.filter(x -> x.contains("广州南站"));
    JavaRDD<String> result1 = textRDD.filter(x -> x.contains("天河客运站"));
    JavaRDD<String> union = result.union(result1);
    System.out.println("-------" + union.count() + "-------");
    checkResult(union.collect());
}
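The same query can be expressed in a single pass: one filter with a logical OR replaces the two filters plus union. A sketch, reusing the textRDD variable from the test above; the only behavioral difference is that a line containing both station names would appear twice in the union but only once here.

// Single-pass alternative to filter + filter + union (reuses textRDD from the test above)
JavaRDD<String> either = textRDD.filter(x -> x.contains("广州南站") || x.contains("天河客运站"));
System.out.println("-------" + either.count() + "-------");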
 
Example 12
Source File: SparkGenomeReadCounts.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
private void collectReads() {
    if ( readArguments.getReadFilesNames().size() != 1 ) {
        throw new UserException("This tool only accepts a single bam/sam/cram as input");
    }

    final SampleCollection sampleCollection = new SampleCollection(getHeaderForReads());
    if(sampleCollection.sampleCount()>1){
        throw new UserException.BadInput("We do not support bams with more than one sample.");
    }
    final String sampleName = sampleCollection.sampleIds().get(0);
    final String[] commentsForRawCoverage = {"##fileFormat  = tsv",
            "##commandLine = " + getCommandLine(),
            String.format("##title = Coverage counts in %d base bins for WGS", binsize)};

    final ReadFilter filter = makeGenomeReadFilter();
    final SAMSequenceDictionary sequenceDictionary = getReferenceSequenceDictionary();

    logger.info("Starting Spark coverage collection...");
    final long coverageCollectionStartTime = System.currentTimeMillis();
    final JavaRDD<GATKRead> rawReads = getReads();
    final JavaRDD<GATKRead> reads = rawReads.filter(read -> filter.test(read));

    //Note: using a field inside a closure will pull in the whole enclosing object to serialization
    // (which leads to bad performance and can blow up if some objects in the fields are not
    // Serializable - closures always use java Serializable and not Kryo)
    //Solution here is to use a temp variable for binsize because it's just an int.
    final int binsize_tmp = binsize;
    final JavaRDD<SimpleInterval> readIntervals = reads
            .filter(read -> sequenceDictionary.getSequence(read.getContig()) != null)
            .map(read -> SparkGenomeReadCounts.createKey(read, sequenceDictionary, binsize_tmp));
    final Map<SimpleInterval, Long> byKey = readIntervals.countByValue();
    final Set<SimpleInterval> readIntervalKeySet = byKey.keySet();
    final long totalReads = byKey.values().stream().mapToLong(v -> v).sum();
    final long coverageCollectionEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished the spark coverage collection with %d targets and %d reads. Elapse of %d seconds",
            readIntervalKeySet.size(), totalReads, (coverageCollectionEndTime - coverageCollectionStartTime) / 1000));

    final String[] commentsForProportionalCoverage = {commentsForRawCoverage[0], commentsForRawCoverage[1],
            String.format("##title = Proportional coverage counts in %d base bins for WGS (total reads: %d)",
                    binsize, totalReads)};

    logger.info("Creating full genome bins...");
    final long createGenomeBinsStartTime = System.currentTimeMillis();
    final List<SimpleInterval> fullGenomeBins = createFullGenomeBins(binsize);
    List<Target> fullGenomeTargetCollection = createTargetListFromSimpleInterval(fullGenomeBins);
    TargetWriter.writeTargetsToFile(new File(outputFile.getAbsolutePath() + ".targets.tsv"), fullGenomeTargetCollection);
    final long createGenomeBinsEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished creating genome bins. Elapse of %d seconds",
            (createGenomeBinsEndTime - createGenomeBinsStartTime) / 1000));

    logger.info("Creating missing genome bins...");
    final long createMissingGenomeBinsStartTime = System.currentTimeMillis();
    logger.info("Creating missing genome bins: Creating a mutable mapping...");
    final Map<SimpleInterval, Long> byKeyMutable = new HashMap<>();
    byKeyMutable.putAll(byKey);

    logger.info("Creating missing genome bins: Populating mutable mapping with zero counts for empty regions...");
    fullGenomeBins.stream().forEach(b -> byKeyMutable.putIfAbsent(b, 0l));

    final long createMissingGenomeBinsEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished creating missing genome bins. Elapse of %d seconds",
            (createMissingGenomeBinsEndTime - createMissingGenomeBinsStartTime) / 1000));

    logger.info("Creating final map...");
    final long createFinalMapStartTime = System.currentTimeMillis();
    final SortedMap<SimpleInterval, Long> byKeySorted = new TreeMap<>(IntervalUtils.LEXICOGRAPHICAL_ORDER_COMPARATOR);
    byKeySorted.putAll(byKeyMutable);
    final long createFinalMapEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished creating final map. Elapse of %d seconds",
            (createFinalMapEndTime - createFinalMapStartTime) / 1000));

    logger.info("Creating proportional coverage... ");
    final long pCovFileStartTime = System.currentTimeMillis();
    final SortedMap<SimpleInterval, Double> byKeyProportionalSorted = new TreeMap<>(IntervalUtils.LEXICOGRAPHICAL_ORDER_COMPARATOR);
    byKeySorted.entrySet().stream().forEach(e -> byKeyProportionalSorted.put(e.getKey(), (double) e.getValue() / totalReads));
    final long pCovFileEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished creating proportional coverage map. Elapse of %d seconds",
            (pCovFileEndTime - pCovFileStartTime) / 1000));

    logger.info("Writing raw coverage file ...");
    final long writingCovFileStartTime = System.currentTimeMillis();
    ReadCountCollectionUtils.writeReadCountsFromSimpleInterval(new File(outputFile.getAbsolutePath() + RAW_COV_OUTPUT_EXTENSION), sampleName, byKeySorted, commentsForRawCoverage);
    final long writingCovFileEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished writing coverage file. Elapse of %d seconds",
            (writingCovFileEndTime - writingCovFileStartTime) / 1000));

    logger.info("Writing proportional coverage file ...");
    final long writingPCovFileStartTime = System.currentTimeMillis();
    ReadCountCollectionUtils.writeReadCountsFromSimpleInterval(outputFile, sampleName, byKeyProportionalSorted,
            commentsForProportionalCoverage);
    final long writingPCovFileEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished writing proportional coverage file. Elapse of %d seconds",
            (writingPCovFileEndTime - writingPCovFileStartTime) / 1000));
}
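The binsize_tmp copy in collectReads() above is a general Spark-on-Java pattern worth isolating. Here is a minimal hypothetical sketch (the class, field, and method names are made up, not taken from gatk-protected): referencing an instance field inside a lambda implicitly captures this, so the whole enclosing object must be serialized with the closure; copying the field into a local variable first means only that primitive value is captured.

import java.io.Serializable;

import org.apache.spark.api.java.JavaRDD;

// Hypothetical illustration of the closure-capture pattern used in collectReads() above.
public class BinningJob implements Serializable {
    private final int binSize = 1000;

    public JavaRDD<Integer> binStarts(JavaRDD<Integer> positions) {
        // Anti-pattern: `binSize` here really means `this.binSize`, so the whole
        // BinningJob instance would be pulled into the serialized closure:
        //   positions.map(pos -> (pos / binSize) * binSize);

        // Preferred: copy the field into a local variable; only the int is captured.
        final int binSizeLocal = binSize;
        return positions.map(pos -> (pos / binSizeLocal) * binSizeLocal);
    }
}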
 
Example 13
Source File: MarkDuplicatesSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Override
protected void runTool(final JavaSparkContext ctx) {
    final SAMFileHeader mergedHeader = getHeaderForReads();

    // If multiple inputs are used, check that their headers are all in the correct query-grouped ordering; if so, set the aggregate header to reflect this
    if (readArguments.getReadPathSpecifiers().size() > 1) {
        final Optional<GATKPath> badlySorted = readArguments.getReadPathSpecifiers().stream()
                .filter(spec -> !treatAsReadGroupOrdered(getHeaderForReadsInput(spec), treatUnsortedAsOrdered))
                .findFirst();

        if(badlySorted.isPresent()) {
            if (allowMultipleSortOrders) {
                //don't set an ordering, the files will all be sorted downstream
                logger.info("Input files are not all grouped by read name so they will be sorted together.");
            } else {
                throw new UserException(
                        "Multiple inputs to MarkDuplicatesSpark detected. MarkDuplicatesSpark requires all inputs to be queryname sorted " +
                                "or querygroup-sorted for multi-input processing but input " + badlySorted.get() + " was sorted in " +
                                getHeaderForReadsInput(badlySorted.get()) + " order");
            }
        } else {
            // The default sort order for merged input files is unsorted, so this will be fed to the tool to be sorted
            if (!allowMultipleSortOrders) {
                mergedHeader.setGroupOrder(SAMFileHeader.GroupOrder.query);
            }
        }

    // If there is only one file and we are in treatUnsortedAsOrdered mode, then set its group order accordingly.
    } else {
        if (treatUnsortedAsOrdered && (mergedHeader.getSortOrder().equals(SAMFileHeader.SortOrder.unknown) || mergedHeader.getSortOrder().equals(SAMFileHeader.SortOrder.unsorted))) {
            logger.warn("Input bam was marked as " + mergedHeader.getSortOrder().toString() + " but " + TREAT_UNSORTED_AS_ORDERED + " is specified so it's being treated as read name grouped");
            mergedHeader.setGroupOrder(SAMFileHeader.GroupOrder.query);
        }
    }

    JavaRDD<GATKRead> reads = getReads();
    final OpticalDuplicateFinder finder = opticalDuplicatesArgumentCollection.READ_NAME_REGEX != null ?
            new OpticalDuplicateFinder(opticalDuplicatesArgumentCollection.READ_NAME_REGEX, opticalDuplicatesArgumentCollection.OPTICAL_DUPLICATE_PIXEL_DISTANCE, null) : null;
    // If we need to remove optical duplicates, set the engine to mark optical duplicates using the DT tag.
    if (markDuplicatesSparkArgumentCollection.removeSequencingDuplicates && markDuplicatesSparkArgumentCollection.taggingPolicy == MarkDuplicates.DuplicateTaggingPolicy.DontTag) {
        markDuplicatesSparkArgumentCollection.taggingPolicy = MarkDuplicates.DuplicateTaggingPolicy.OpticalOnly;
    }

    final JavaRDD<GATKRead> finalReadsForMetrics = mark(reads, mergedHeader, finder, markDuplicatesSparkArgumentCollection, getRecommendedNumReducers());

    if (metricsFile != null) {
        final JavaPairRDD<String, GATKDuplicationMetrics> metricsByLibrary = MarkDuplicatesSparkUtils.generateMetrics(
                mergedHeader, finalReadsForMetrics);
        final MetricsFile<GATKDuplicationMetrics, Double> resultMetrics = getMetricsFile();
        MarkDuplicatesSparkUtils.saveMetricsRDD(resultMetrics, mergedHeader, metricsByLibrary, metricsFile);
    }
    JavaRDD<GATKRead> readsForWriting = finalReadsForMetrics;
    // Filter out the duplicates if instructed to do so
    if (markDuplicatesSparkArgumentCollection.removeAllDuplicates) {
        readsForWriting = readsForWriting.filter(r -> !r.isDuplicate());
    } else if (markDuplicatesSparkArgumentCollection.removeSequencingDuplicates) {
        readsForWriting = readsForWriting.filter(r -> !MarkDuplicates.DUPLICATE_TYPE_SEQUENCING.equals(r.getAttributeAsString(MarkDuplicates.DUPLICATE_TYPE_TAG)));
    }

    mergedHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
    writeReads(ctx, output, readsForWriting, mergedHeader, true);
}
 
Example 14
Source File: PSUtils.java    From gatk with BSD 3-Clause "New" or "Revised" License
public static JavaRDD<GATKRead> primaryReads(final JavaRDD<GATKRead> reads) {
    return reads.filter(read -> !(read.isSecondaryAlignment() || read.isSupplementaryAlignment()));
}
 
Example 15
Source File: ReadsPipelineSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Override
protected void runTool(final JavaSparkContext ctx) {
    String referenceFileName = addReferenceFilesForSpark(ctx, referenceArguments.getReferencePath());
    List<String> localKnownSitesFilePaths = addVCFsForSpark(ctx, knownVariants);

    final JavaRDD<GATKRead> alignedReads;
    final SAMFileHeader header;
    final BwaSparkEngine bwaEngine;
    if (align) {
        bwaEngine = new BwaSparkEngine(ctx, referenceArguments.getReferenceFileName(), bwaArgs.indexImageFile, getHeaderForReads(), getReferenceSequenceDictionary());
        if (bwaArgs.singleEndAlignment) {
            alignedReads = bwaEngine.alignUnpaired(getReads());
        } else {
            // filter reads after alignment in the case of paired reads since filtering does not know about pairs
            final ReadFilter filter = makeReadFilter(bwaEngine.getHeader());
            alignedReads = bwaEngine.alignPaired(getUnfilteredReads()).filter(filter::test);
        }
        header = bwaEngine.getHeader();
    } else {
        bwaEngine = null;
        alignedReads = getReads();
        header = getHeaderForReads();
    }

    final JavaRDD<GATKRead> markedReads = MarkDuplicatesSpark.mark(alignedReads, header, new OpticalDuplicateFinder(), markDuplicatesSparkArgumentCollection, getRecommendedNumReducers());

    // always coordinate-sort reads so BQSR can use queryLookaheadBases in FeatureDataSource
    final SAMFileHeader readsHeader = header.clone();
    readsHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
    final JavaRDD<GATKRead> sortedMarkedReads = SparkUtils.sortReadsAccordingToHeader(markedReads, readsHeader, numReducers);

    // The markedReads have already had the WellformedReadFilter applied to them, which
    // is all the filtering that MarkDupes and ApplyBQSR want. BQSR itself wants additional
    // filtering performed, so we do that here.
    //NOTE: this doesn't honor enabled/disabled commandline filters
    final ReadFilter bqsrReadFilter = ReadFilter.fromList(BaseRecalibrator.getBQSRSpecificReadFilterList(), header);

    JavaRDD<GATKRead> markedFilteredReadsForBQSR = sortedMarkedReads.filter(bqsrReadFilter::test);

    JavaPairRDD<GATKRead, Iterable<GATKVariant>> readsWithVariants = JoinReadsWithVariants.join(markedFilteredReadsForBQSR, localKnownSitesFilePaths);
    final RecalibrationReport bqsrReport = BaseRecalibratorSparkFn.apply(readsWithVariants, getHeaderForReads(), referenceFileName, bqsrArgs);

    final Broadcast<RecalibrationReport> reportBroadcast = ctx.broadcast(bqsrReport);
    final JavaRDD<GATKRead> finalReads = ApplyBQSRSparkFn.apply(sortedMarkedReads, reportBroadcast, getHeaderForReads(), applyBqsrArgs.toApplyBQSRArgumentCollection(bqsrArgs));

    if (outputBam != null) { // only write output of BQSR if output BAM is specified
        writeReads(ctx, outputBam, finalReads, header, true);
    }

    // Run Haplotype Caller
    final ReadFilter hcReadFilter = ReadFilter.fromList(HaplotypeCallerEngine.makeStandardHCReadFilters(), header);
    final JavaRDD<GATKRead> filteredReadsForHC = finalReads.filter(hcReadFilter::test);
    SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary();
    final List<SimpleInterval> intervals = hasUserSuppliedIntervals() ? getIntervals() : IntervalUtils.getAllIntervalsForReference(sequenceDictionary);

    List<ShardBoundary> intervalShards = intervals.stream()
            .flatMap(interval -> Shard.divideIntervalIntoShards(interval, shardingArgs.readShardSize, shardingArgs.readShardPadding, sequenceDictionary).stream())
            .collect(Collectors.toList());

    HaplotypeCallerSpark.callVariantsWithHaplotypeCallerAndWriteOutput(ctx, filteredReadsForHC, readsHeader, sequenceDictionary, referenceArguments.getReferenceFileName(), intervalShards, hcArgs, shardingArgs, assemblyRegionArgs, output, makeVariantAnnotations(), logger, strict, createOutputVariantIndex);

    if (bwaEngine != null) {
        bwaEngine.close();
    }
}
 
Example 16
Source File: GrepCaching.java    From flink-perf with Apache License 2.0
public static void main(String[] args) {
	String master = args[0];
	String inFile = args[1];
	String outFile = args[2];
	String storageLevel = args[3];

	String patterns[] = new String[args.length-4];
	System.arraycopy(args, 4, patterns, 0, args.length - 4);
	System.err.println("Starting spark with master="+master+" in="+inFile);
	System.err.println("Using patterns: "+ Arrays.toString(patterns));

	SparkConf conf = new SparkConf().setAppName("Grep job").setMaster(master).set("spark.hadoop.validateOutputSpecs", "false");
	JavaSparkContext sc = new JavaSparkContext(conf);

	StorageLevel sl;
	switch(storageLevel) {
		case "MEMORY_ONLY":
			sl = StorageLevel.MEMORY_ONLY(); break;
		case "MEMORY_AND_DISK":
			sl = StorageLevel.MEMORY_AND_DISK(); break;
		case "MEMORY_ONLY_SER":
			sl = StorageLevel.MEMORY_ONLY_SER(); break;
		case "MEMORY_AND_DISK_SER":
			sl = StorageLevel.MEMORY_AND_DISK_SER(); break;
		case "NONE":
			sl = StorageLevel.NONE(); break;
		default:
			throw new RuntimeException("Unknown storage level "+storageLevel);
	}

	JavaRDD<String> file = sc.textFile(inFile).persist(sl);
	for(int p = 0; p < patterns.length; p++) {
		final String pattern = patterns[p];
		JavaRDD<String> res = file.filter(new Function<String, Boolean>() {
			private static final long serialVersionUID = 1L;
			Pattern p = Pattern.compile(pattern);

			@Override
			public Boolean call(String value) throws Exception {
				if (value == null || value.length() == 0) {
					return false;
				}
				final Matcher m = p.matcher(value);
				if (m.find()) {
					return true;
				}
				return false;
			}
		});
		res.saveAsTextFile(outFile+"_"+pattern);
	}
}
 
Example 17
Source File: PSFilter.java    From gatk with BSD 3-Clause "New" or "Revised" License
@SuppressWarnings("unchecked")
private static JavaRDD<GATKRead> doKmerFiltering(final JavaRDD<GATKRead> reads, final String kmerLibPath,
                                                   final int countThresh) {

    return reads.filter(new ContainsKmerReadFilterSpark(kmerLibPath, countThresh));
}
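Example 17 (and Example 18 below) passes named, serializable predicate classes such as ContainsKmerReadFilterSpark and ReadFilterSparkifier to filter() instead of inline lambdas, which keeps the predicates reusable and unit-testable. A minimal hypothetical equivalent of that style follows; the class name and threshold are illustrative only.

import org.apache.spark.api.java.function.Function;

// Hypothetical named predicate in the style of the filter classes used above.
public class MinLengthFilter implements Function<String, Boolean> {
    private static final long serialVersionUID = 1L;
    private final int minLength;

    public MinLengthFilter(int minLength) {
        this.minLength = minLength;
    }

    @Override
    public Boolean call(String value) {
        return value != null && value.length() >= minLength;
    }
}

// Usage (sketch): JavaRDD<String> longEnough = lines.filter(new MinLengthFilter(20));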
 
Example 18
Source File: PSFilter.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Main PathSeq filtering method. See PathSeqFilterSpark for an overview.
 * Returns a tuple containing the paired reads and unpaired reads as separate RDDs.
 * If metricsFile is null, read count metrics will not be collected.
 */
public Tuple2<JavaRDD<GATKRead>, JavaRDD<GATKRead>> doFilter(JavaRDD<GATKRead> reads, final PSFilterLogger filterLogger) {

    Utils.nonNull(reads, "Input reads cannot be null");
    reads = PSUtils.primaryReads(reads);
    filterLogger.logPrimaryReads(reads);

    if (filterArgs.alignedInput) {
        final Set<String> contigsToIgnoreSet = Collections.unmodifiableSet(new HashSet<>(filterArgs.alignmentContigsToIgnore));
        reads = reads.filter(new ReadFilterSparkifier(new HostAlignmentReadFilter(filterArgs.minIdentity, contigsToIgnoreSet)));
    }
    filterLogger.logReadsAfterPrealignedHostFilter(reads);

    //Clear alignment data from the reads
    reads = clearAllAlignments(reads, header);

    //Remove /1 and /2 from read names
    reads = reads.map(new ReadTransformerSparkifier(new StripMateNumberTransformer()));

    if (!filterArgs.skipFilters) {

        //Adapter trimming
        reads = reads.map(new ReadTransformerSparkifier(new AdapterTrimTransformer(filterArgs.maxAdapterMismatches, filterArgs.minAdapterLength, ADAPTER_SEQUENCES)));

        //Apply simple repeat masking
        //See "Low-complexity DNA and simple repeats" at http://www.repeatmasker.org/webrepeatmaskerhelp.html
        reads = reads.map(new ReadTransformerSparkifier(new SimpleRepeatMaskTransformer(MAX_AT_CONTENT_1, MAX_GC_CONTENT_1, REPEAT_WINDOW_SIZE_1)));
        reads = reads.map(new ReadTransformerSparkifier(new SimpleRepeatMaskTransformer(MAX_AT_CONTENT_2, MAX_GC_CONTENT_2, REPEAT_WINDOW_SIZE_2)));

        //Apply DUST masking
        reads = reads.map(new ReadTransformerSparkifier(new DUSTReadTransformer(filterArgs.dustMask, filterArgs.dustW, filterArgs.dustT)));

        //Apply base quality hard clipping
        reads = reads.map(new ReadTransformerSparkifier(new BaseQualityClipReadTransformer(filterArgs.readTrimThresh)));

        //Filter reads with less than minReadLength bases
        reads = reads.filter(new ReadFilterSparkifier(new ReadLengthReadFilter(filterArgs.minReadLength, Integer.MAX_VALUE)));

        //Change low-quality bases to 'N'
        reads = reads.map(new ReadTransformerSparkifier(new BaseQualityReadTransformer(filterArgs.qualPhredThresh)));

        //Filter reads with too many 'N's
        reads = reads.filter(new ReadFilterSparkifier(new AmbiguousBaseReadFilter(filterArgs.maxAmbiguousBases)));
    }
    filterLogger.logReadsAfterQualityFilter(reads);

    //Kmer filtering
    if (filterArgs.kmerFilePath != null) {
        reads = doKmerFiltering(reads, filterArgs.kmerFilePath, filterArgs.hostKmerThresh);
    }

    //Redistribute reads
    if (!filterArgs.skipPreBwaRepartition) {
        reads = repartitionReadsByName(reads);
    }

    //Bwa host alignment filtering
    if (filterArgs.indexImageFile != null) {
        reads = doBwaFilter(reads, filterArgs.indexImageFile, filterArgs.minSeedLength,
                filterArgs.bwaThreads, filterArgs.minIdentity);
    }
    filterLogger.logReadsAfterHostFilter(reads);

    //Filter duplicates
    if (filterArgs.filterDuplicates) {
        reads = setPairFlags(reads, filterArgs.filterReadsPerPartition);
        reads = filterDuplicateSequences(reads);
    }
    filterLogger.logReadsAfterDeduplication(reads);

    //Sets pairedness flags properly
    reads = setPairFlags(reads, filterArgs.filterReadsPerPartition);
    reads = clearAllAlignments(reads, header);

    //Unset paired read flags for reads that are not paired
    final PSPairedUnpairedSplitterSpark splitter = new PSPairedUnpairedSplitterSpark(reads, filterArgs.filterReadsPerPartition, false);
    final JavaRDD<GATKRead> pairedReads = splitter.getPairedReads();
    final JavaRDD<GATKRead> unpairedReads = splitter.getUnpairedReads();
    filterLogger.logFinalPairedReads(pairedReads);

    return new Tuple2<>(pairedReads, unpairedReads);
}
 
Example 19
Source File: BatchHeatMapProcessor.java    From lambda-arch with Apache License 2.0
/**
 * Filter the measurements in a given time period
 *
 * @param measurements | The dataset of measurements
 * @param start        | Start of the time period
 * @param end          | End of the time period
 * @return A set of measurements in the given time period
 */
private JavaRDD<Measurement> filterByTime(JavaRDD<Measurement> measurements, Date start, Date end) {
    return measurements.filter(measurement -> (
                    measurement.getTimestamp().equals(start) || measurement.getTimestamp().after(start)
            ) && measurement.getTimestamp().before(end)
    );
}
 
Example 20
Source File: HoodieReadClient.java    From hudi with Apache License 2.0
/**
 * Filter out HoodieRecords that already exist in the output folder. This is useful for deduplication.
 *
 * @param hoodieRecords Input RDD of Hoodie records.
 * @return A subset of hoodieRecords RDD, with existing records filtered out.
 */
public JavaRDD<HoodieRecord<T>> filterExists(JavaRDD<HoodieRecord<T>> hoodieRecords) {
  JavaRDD<HoodieRecord<T>> recordsWithLocation = tagLocation(hoodieRecords);
  return recordsWithLocation.filter(v1 -> !v1.isCurrentLocationKnown());
}