Java Code Examples for org.apache.spark.api.java.JavaRDD#aggregate()

The following examples show how to use org.apache.spark.api.java.JavaRDD#aggregate(). You can vote up the examples you like or vote down the ones you don't, and go to the original project or source file by following the links above each example. Related API usage is listed in the sidebar.
Example 1
Source File: AnalyzeSpark.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
/**
 * Analyzes {@code data} in two aggregation passes and returns the resulting per-column analysis.
 *
 * <p>The first pass aggregates a list of {@code AnalysisCounter}s, which is converted to
 * {@code ColumnAnalysis} entries. The second pass aggregates {@code HistogramCounter}s, which are
 * then merged into those entries.
 *
 * @param schema              schema of the data; also supplies the column types
 * @param data                data to analyze (cached by this method)
 * @param maxHistogramBuckets value handed to the histogram add function
 * @return a {@code DataAnalysis} built from {@code schema} and the merged column analyses
 */
public static DataAnalysis analyze(Schema schema, JavaRDD<List<Writable>> data, int maxHistogramBuckets) {
    // The RDD is traversed by two separate aggregate() calls below, so cache it up front.
    data.cache();
    /*
     * TODO: Some care should be given to add histogramBuckets and histogramBucketCounts to this in the future
     */

    List<ColumnType> types = schema.getColumnTypes();

    // Pass 1: general per-column counters.
    AnalysisAddFunction counterAdd = new AnalysisAddFunction(schema);
    AnalysisCombineFunction counterCombine = new AnalysisCombineFunction();
    List<AnalysisCounter> columnCounters = data.aggregate(null, counterAdd, counterCombine);

    // One {min, max} slot per counter; presumably populated by convertCounters - TODO confirm.
    double[][] ranges = new double[columnCounters.size()][2];
    List<ColumnAnalysis> columnAnalyses = DataVecAnalysisUtils.convertCounters(columnCounters, ranges, types);

    // Pass 2: histogram counters, which take the ranges array from pass 1 as input.
    HistogramAddFunction histogramAdd = new HistogramAddFunction(maxHistogramBuckets, schema, ranges);
    HistogramCombineFunction histogramCombine = new HistogramCombineFunction();
    List<HistogramCounter> histograms = data.aggregate(null, histogramAdd, histogramCombine);

    DataVecAnalysisUtils.mergeCounters(columnAnalyses, histograms);
    return new DataAnalysis(schema, columnAnalyses);
}
 
Example 2
Source File: FlagStatSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Aggregates all reads into a single {@code FlagStatus} and prints it to standard output.
 * When {@code out} is non-null, the result is additionally printed to the stream obtained
 * from {@code BucketUtils.createFile(out)}.
 *
 * @param ctx Spark context passed in by the caller; not referenced in this method
 */
@Override
protected void runTool(final JavaSparkContext ctx) {
    final FlagStatus flagStats = getReads().aggregate(new FlagStatus(), FlagStatus::add, FlagStatus::merge);
    System.out.println(flagStats);

    // No output target configured: the console report above is all there is to do.
    if (out == null) {
        return;
    }

    try (final PrintStream sink = new PrintStream(BucketUtils.createFile(out))) {
        sink.print(flagStats);
    }
}
 
Example 3
Source File: MeanQualityByCycleSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Computes the mean-quality-by-cycle metrics for the given reads.
 *
 * <p>Reads are first filtered with a {@code MetricsReadFilter} configured from
 * {@code pfReadsOnly} and {@code alignedReadsOnly}; the survivors are folded into a
 * {@code HistogramGeneratorPair}, whose two generators are handed to {@code finish}.
 *
 * @param reads reads to collect metrics from
 * @return the metrics file produced by {@code finish}
 */
public MetricsFile<?, Integer> calculateMeanQualityByCycle(final JavaRDD<GATKRead> reads){
    final MetricsReadFilter readFilter = new MetricsReadFilter(this.pfReadsOnly, this.alignedReadsOnly);

    final HistogramGeneratorPair histograms = reads
            .filter(candidate -> readFilter.test(candidate))
            .aggregate(new HistogramGeneratorPair(),
                    (accumulator, candidate) -> accumulator.addRead(candidate),
                    (left, right) -> left.merge(right));

    return finish(histograms.useQuals, histograms.useOrigQuals);
}
 
Example 4
Source File: QualityScoreDistributionSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Aggregates quality-score {@code Counts} over the filtered reads and saves the resulting metrics.
 *
 * <p>Reads are filtered with a {@code MetricsReadFilter} configured from {@code pfReadsOnly}
 * and {@code alignedReadsOnly} before being counted.
 *
 * @param ctx Spark context passed in by the caller; not referenced in this method
 */
@Override
protected void runTool(final JavaSparkContext ctx) {
    final JavaRDD<GATKRead> allReads = getReads();
    final MetricsReadFilter readFilter = new MetricsReadFilter(this.pfReadsOnly, this.alignedReadsOnly);

    final Counts totals = allReads
            .filter(candidate -> readFilter.test(candidate))
            .aggregate(new Counts(includeNoCalls),
                    (accumulator, candidate) -> accumulator.addRead(candidate),
                    (left, right) -> left.merge(right));

    final MetricsFile<?, Byte> metricsFile = makeMetrics(totals);
    saveResults(metricsFile, getHeaderForReads(), getReadSourceName());
}
 
Example 5
Source File: CollectBaseDistributionByCycleSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Computes the base-distribution-by-cycle metrics for the given reads.
 *
 * <p>Reads are filtered with a {@code MetricsReadFilter} configured from {@code pfReadsOnly}
 * and {@code alignedReadsOnly}, folded into a single {@code HistogramGenerator}, and that
 * generator then adds its contents to the metrics file obtained from {@code getMetricsFile()}.
 *
 * @param reads reads to collect metrics from
 * @return the metrics file after the aggregated histogram has been added to it
 */
public MetricsFile<BaseDistributionByCycleMetrics, Integer> calculateBaseDistributionByCycle(final JavaRDD<GATKRead> reads){
    final MetricsReadFilter readFilter = new MetricsReadFilter(this.pfReadsOnly, this.alignedReadsOnly);

    final HistogramGenerator histogram = reads
            .filter(candidate -> readFilter.test(candidate))
            .aggregate(new HistogramGenerator(),
                    (accumulator, candidate) -> accumulator.addRead(candidate),
                    (left, right) -> left.merge(right));

    final MetricsFile<BaseDistributionByCycleMetrics, Integer> result = getMetricsFile();
    histogram.addToMetricsFile(result);
    return result;
}
 
Example 6
Source File: AnalyzeSpark.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Runs a data-quality analysis over {@code data} and reports the quality of each column.
 *
 * <p>A single aggregate pass produces a list of {@code QualityAnalysisState}s; the
 * {@code ColumnQuality} of each state is collected, in order, into the returned analysis.
 *
 * @param schema schema describing the data
 * @param data   data to analyze
 * @return a {@code DataQualityAnalysis} built from {@code schema} and the collected column qualities
 */
public static DataQualityAnalysis analyzeQuality(final Schema schema, final JavaRDD<List<Writable>> data) {
    final int columnCount = schema.numColumns();

    final List<QualityAnalysisState> aggregatedStates = data.aggregate(
            null,
            new BiFunctionAdapter<>(new QualityAnalysisAddFunction(schema)),
            new BiFunctionAdapter<>(new QualityAnalysisCombineFunction()));

    // Pre-size the result to the schema's column count.
    final List<ColumnQuality> columnQualities = new ArrayList<>(columnCount);
    for (final QualityAnalysisState state : aggregatedStates) {
        columnQualities.add(state.getColumnQuality());
    }

    return new DataQualityAnalysis(schema, columnQualities);
}
 
Example 7
Source File: AnalyzeSpark.java    From DataVec with Apache License 2.0 4 votes vote down vote up
/**
 * Runs a data-quality analysis over {@code data} and reports the quality of each column.
 *
 * <p>A single aggregate pass produces a list of {@code QualityAnalysisState}s; the
 * {@code ColumnQuality} of each state is collected, in order, into the returned analysis.
 *
 * @param schema schema describing the data
 * @param data   data to analyze (cached by this method)
 * @return a {@code DataQualityAnalysis} built from {@code schema} and the collected column qualities
 */
public static DataQualityAnalysis analyzeQuality(final Schema schema, final JavaRDD<List<Writable>> data) {
    // NOTE(review): only one aggregate pass is made below, so this method itself does not appear
    // to benefit from caching; the call is retained so callers observe the same side effect.
    data.cache();
    int nColumns = schema.numColumns();

    List<QualityAnalysisState> states = data.aggregate(null, new QualityAnalysisAddFunction(schema),
                    new QualityAnalysisCombineFunction());

    // Pre-size the result to the schema's column count.
    List<ColumnQuality> list = new ArrayList<>(nColumns);
    for (QualityAnalysisState qualityState : states) {
        list.add(qualityState.getColumnQuality());
    }

    return new DataQualityAnalysis(schema, list);
}
 
Example 8
Source File: AnalyzeSpark.java    From DataVec with Apache License 2.0 3 votes vote down vote up
/**
 * Get a list of unique values from each of the specified columns.
 * For sequence data, use {@link #getUniqueSequence(String, Schema, JavaRDD)}
 *
 * @param columnNames   Names of the columns to get unique values from
 * @param schema        Data schema
 * @param data          Data to get unique values from
 * @return              Map with one entry per key of the aggregated result, each holding the
 *                      unique values for that key copied into a new list
 */
public static Map<String,List<Writable>> getUnique(List<String> columnNames, Schema schema, JavaRDD<List<Writable>> data){
    Map<String,Set<Writable>> m = data.aggregate(null, new UniqueAddFunction(columnNames, schema), new UniqueMergeFunction());
    Map<String,List<Writable>> out = new HashMap<>();
    // Iterate over entries directly instead of keySet() + get(), which costs a second lookup per key.
    for(Map.Entry<String,Set<Writable>> entry : m.entrySet()){
        out.put(entry.getKey(), new ArrayList<>(entry.getValue()));
    }
    return out;
}
 
Example 9
Source File: AnalyzeSpark.java    From deeplearning4j with Apache License 2.0 3 votes vote down vote up
/**
 * Get a list of unique values from the specified columns.
 * For sequence data, use {@link #getUniqueSequence(String, Schema, JavaRDD)}
 *
 * @param columnNames   Names of the columns to get unique values from
 * @param schema        Data schema
 * @param data          Data to get unique values from
 * @return              Map with one entry per key of the aggregated result, each holding the
 *                      unique values for that key copied into a new list
 */
public static Map<String,List<Writable>> getUnique(List<String> columnNames, Schema schema, JavaRDD<List<Writable>> data){
    Map<String,Set<Writable>> uniquesByColumn = data.aggregate(null, new UniqueAddFunction(columnNames, schema), new UniqueMergeFunction());
    Map<String,List<Writable>> out = new HashMap<>();
    // Walk the entry set so each key/value pair is read once, rather than keySet() followed by get().
    for(Map.Entry<String,Set<Writable>> columnEntry : uniquesByColumn.entrySet()){
        out.put(columnEntry.getKey(), new ArrayList<>(columnEntry.getValue()));
    }
    return out;
}