Java Code Examples for org.datavec.api.transform.schema.Schema#getColumnTypes()

The following examples show how to use org.datavec.api.transform.schema.Schema#getColumnTypes(). You can vote up the examples you like or vote down the ones you don't, and go to the original project or source file by following the links above each example. You can also check out related API usage in the sidebar.
Example 1
Source File: AnalyzeSpark.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
/**
 * Analyzes the given data set in two aggregation passes and returns the combined result.
 *
 * <p>The first pass aggregates per-column {@code AnalysisCounter}s, which are converted to
 * {@code ColumnAnalysis} objects. The second pass aggregates {@code HistogramCounter}s, which are
 * then merged into those column analyses.
 *
 * @param schema              schema describing the columns of {@code data}
 * @param data                records to analyze; {@code cache()} is called on it before aggregating
 * @param maxHistogramBuckets value handed to {@code HistogramAddFunction} for the histogram pass
 * @return a {@code DataAnalysis} built from {@code schema} and the per-column analyses
 */
public static DataAnalysis analyze(Schema schema, JavaRDD<List<Writable>> data, int maxHistogramBuckets) {
    // The RDD is aggregated twice below, so cache it up front.
    data.cache();
    /*
     * TODO: Some care should be given to add histogramBuckets and histogramBucketCounts to this in the future
     */

    final List<ColumnType> types = schema.getColumnTypes();

    // Pass 1: per-column counters.
    final AnalysisAddFunction counterAdd = new AnalysisAddFunction(schema);
    final AnalysisCombineFunction counterCombine = new AnalysisCombineFunction();
    final List<AnalysisCounter> perColumnCounters = data.aggregate(null, counterAdd, counterCombine);

    // One [2]-element row per counter; passed to convertCounters and then to the histogram pass.
    // NOTE(review): presumably convertCounters fills this with each column's min/max - confirm.
    final double[][] minsMaxes = new double[perColumnCounters.size()][2];
    final List<ColumnAnalysis> columnAnalyses =
                    DataVecAnalysisUtils.convertCounters(perColumnCounters, minsMaxes, types);

    // Pass 2: histogram counters.
    final HistogramAddFunction histogramAdd = new HistogramAddFunction(maxHistogramBuckets, schema, minsMaxes);
    final HistogramCombineFunction histogramCombine = new HistogramCombineFunction();
    final List<HistogramCounter> histograms = data.aggregate(null, histogramAdd, histogramCombine);

    DataVecAnalysisUtils.mergeCounters(columnAnalyses, histograms);
    return new DataAnalysis(schema, columnAnalyses);
}
 
Example 2
Source File: AnalyzeLocal.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Analyse the specified data - returns a DataAnalysis object with summary information about each column
 *
 * <p>The reader is consumed once to accumulate per-column counters. If the reader reports that
 * reset is supported, it is reset and consumed a second time to collect histogram values, which
 * are merged into the column analyses. If reset is not supported, the histogram pass is skipped.
 *
 * @param schema              Schema for data
 * @param rr                  Data to analyze
 * @param maxHistogramBuckets value handed to {@code HistogramAddFunction} for the histogram pass
 * @return DataAnalysis for data
 * @throws IllegalStateException if {@code rr} provides no records, so no counters could be built
 */
public static DataAnalysis analyze(Schema schema, RecordReader rr, int maxHistogramBuckets){
    AnalysisAddFunction addFn = new AnalysisAddFunction(schema);
    List<AnalysisCounter> counters = null;
    while(rr.hasNext()){
        counters = addFn.apply(counters, rr.next());
    }

    // Without this guard an empty reader would fail below with an uninformative NullPointerException.
    if(counters == null){
        throw new IllegalStateException("Cannot analyze data: RecordReader did not provide any records");
    }

    // One [2]-element row per counter; passed to convertCounters and then to the histogram pass.
    double[][] minsMaxes = new double[counters.size()][2];

    List<ColumnType> columnTypes = schema.getColumnTypes();
    List<ColumnAnalysis> list = DataVecAnalysisUtils.convertCounters(counters, minsMaxes, columnTypes);


    //Do another pass collecting histogram values:
    List<HistogramCounter> histogramCounters = null;
    HistogramAddFunction add = new HistogramAddFunction(maxHistogramBuckets, schema, minsMaxes);
    if(rr.resetSupported()){
        rr.reset();
        while(rr.hasNext()){
            histogramCounters = add.apply(histogramCounters, rr.next());
        }

        DataVecAnalysisUtils.mergeCounters(list, histogramCounters);
    }

    return new DataAnalysis(schema, list);
}
 
Example 3
Source File: AnalyzeSpark.java    From DataVec with Apache License 2.0 4 votes vote down vote up
/**
 * Aggregates a quality-analysis state per column over the data set and collects each state's
 * {@code ColumnQuality} into a {@code DataQualityAnalysis}.
 *
 * @param schema schema describing the columns of {@code data}
 * @param data   records to analyze; {@code cache()} is called on it before aggregating
 * @return a {@code DataQualityAnalysis} built from {@code schema} and the per-column quality results
 */
public static DataQualityAnalysis analyzeQuality(final Schema schema, final JavaRDD<List<Writable>> data) {
    data.cache();
    int nColumns = schema.numColumns();

    List<QualityAnalysisState> states = data.aggregate(null, new QualityAnalysisAddFunction(schema),
                    new QualityAnalysisCombineFunction());

    // Pre-sized to the schema's column count; one ColumnQuality is added per aggregated state.
    List<ColumnQuality> list = new ArrayList<>(nColumns);

    for (QualityAnalysisState qualityState : states) {
        list.add(qualityState.getColumnQuality());
    }

    return new DataQualityAnalysis(schema, list);

}