org.datavec.api.transform.analysis.DataAnalysis Java Examples

The following examples show how to use org.datavec.api.transform.analysis.DataAnalysis. They are drawn from open-source projects; the source file, project, and license are listed above each example.
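Taken together, the examples below cover the typical DataAnalysis workflow: build a Schema, run AnalyzeLocal or AnalyzeSpark over the data, serialize the result as JSON/YAML, and feed it back into a TransformProcess for normalization. As a minimal orientation sketch (the file path and column names here are placeholder assumptions, not from any of the projects below):

// Sketch of the round trip, assembled from the examples on this page.
Schema schema = new Schema.Builder()
        .addColumnsDouble("0", "1", "2", "3")
        .addColumnInteger("label")
        .build();

RecordReader rr = new CSVRecordReader();
rr.initialize(new FileSplit(new File("/path/to/iris.txt")));

DataAnalysis analysis = AnalyzeLocal.analyze(schema, rr);  // per-column summary statistics
String json = analysis.toJson();                           // serialize for later reuse
DataAnalysis restored = DataAnalysis.fromJson(json);       // round-trip back to an object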
Example #1
Source File: AnalyzeSpark.java    From deeplearning4j with Apache License 2.0
public static DataAnalysis analyze(Schema schema, JavaRDD<List<Writable>> data, int maxHistogramBuckets) {
    data.cache();
    /*
     * TODO: Some care should be given to add histogramBuckets and histogramBucketCounts to this in the future
     */

    List<ColumnType> columnTypes = schema.getColumnTypes();
    List<AnalysisCounter> counters =
                    data.aggregate(null, new AnalysisAddFunction(schema), new AnalysisCombineFunction());

    double[][] minsMaxes = new double[counters.size()][2];
    List<ColumnAnalysis> list = DataVecAnalysisUtils.convertCounters(counters, minsMaxes, columnTypes);

    List<HistogramCounter> histogramCounters =
                    data.aggregate(null, new HistogramAddFunction(maxHistogramBuckets, schema, minsMaxes),
                                    new HistogramCombineFunction());

    DataVecAnalysisUtils.mergeCounters(list, histogramCounters);
    return new DataAnalysis(schema, list);
}
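
For orientation, a minimal usage sketch of this method (the Spark setup, file path, and schema are placeholder assumptions; StringToWritablesFunction is DataVec's standard adapter from CSV lines to records):

SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("datavec-analysis");
JavaSparkContext sc = new JavaSparkContext(conf);

Schema schema = new Schema.Builder()
        .addColumnsDouble("0", "1", "2", "3")
        .addColumnInteger("label")
        .build();

// Parse CSV lines into DataVec records, then analyze with at most 20 histogram buckets
JavaRDD<String> lines = sc.textFile("file:///path/to/iris.txt");
JavaRDD<List<Writable>> records = lines.map(new StringToWritablesFunction(new CSVRecordReader()));
DataAnalysis analysis = AnalyzeSpark.analyze(schema, records, 20);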
 
Example #2
Source File: TestJsonYaml.java    From deeplearning4j with Apache License 2.0
@Test
public void testJsonYamlAnalysis() throws Exception {
    Schema s = new Schema.Builder().addColumnsDouble("first", "second").addColumnString("third")
                    .addColumnCategorical("fourth", "cat0", "cat1").build();

    DoubleAnalysis d1 = new DoubleAnalysis.Builder().min(-1).max(1).countPositive(10).mean(3.0).build();
    DoubleAnalysis d2 = new DoubleAnalysis.Builder().min(-5).max(5).countPositive(4).mean(2.0).build();
    StringAnalysis sa = new StringAnalysis.Builder().minLength(0).maxLength(10).build();
    Map<String, Long> countMap = new HashMap<>();
    countMap.put("cat0", 100L);
    countMap.put("cat1", 200L);
    CategoricalAnalysis ca = new CategoricalAnalysis(countMap);

    DataAnalysis da = new DataAnalysis(s, Arrays.asList(d1, d2, sa, ca));

    String strJson = da.toJson();
    String strYaml = da.toYaml();
    //        System.out.println(str);

    DataAnalysis daFromJson = DataAnalysis.fromJson(strJson);
    DataAnalysis daFromYaml = DataAnalysis.fromYaml(strYaml);
    //        System.out.println(da2);

    assertEquals(da.getColumnAnalysis(), daFromJson.getColumnAnalysis());
    assertEquals(da.getColumnAnalysis(), daFromYaml.getColumnAnalysis());
}
 
Example #3
Source File: TestJsonYaml.java    From DataVec with Apache License 2.0
@Test
public void testJsonYamlAnalysis() throws Exception {
    Schema s = new Schema.Builder().addColumnsDouble("first", "second").addColumnString("third")
                    .addColumnCategorical("fourth", "cat0", "cat1").build();

    DoubleAnalysis d1 = new DoubleAnalysis.Builder().min(-1).max(1).countPositive(10).mean(3.0).build();
    DoubleAnalysis d2 = new DoubleAnalysis.Builder().min(-5).max(5).countPositive(4).mean(2.0).build();
    StringAnalysis sa = new StringAnalysis.Builder().minLength(0).maxLength(10).build();
    Map<String, Long> countMap = new HashMap<>();
    countMap.put("cat0", 100L);
    countMap.put("cat1", 200L);
    CategoricalAnalysis ca = new CategoricalAnalysis(countMap);

    DataAnalysis da = new DataAnalysis(s, Arrays.asList(d1, d2, sa, ca));

    String strJson = da.toJson();
    String strYaml = da.toYaml();
    //        System.out.println(str);

    DataAnalysis daFromJson = DataAnalysis.fromJson(strJson);
    DataAnalysis daFromYaml = DataAnalysis.fromYaml(strYaml);
    //        System.out.println(da2);

    assertEquals(da.getColumnAnalysis(), daFromJson.getColumnAnalysis());
    assertEquals(da.getColumnAnalysis(), daFromYaml.getColumnAnalysis());
}
 
Example #4
Source File: CreateInferenceTransformDescription.java    From SKIL_Examples with Apache License 2.0
private void entryPoint(String... args) throws Exception {
    JCommander jcmdr = new JCommander(this);
    try {
        jcmdr.parse(args);
    } catch (ParameterException e) {
        System.out.println(e);
        jcmdr.usage();
        System.exit(1);
    }

    DataAnalysis analysis = DataAnalysis.fromJson(FileUtils.readFileToString(dataAnalysis));

    TransformProcess tp = IrisData.inferenceTransform(analysis);

    FileUtils.writeStringToFile(new File(outputPath + File.separator + "iris-inference-transform.json"), tp.toJson());
}
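
Note that dataAnalysis and outputPath are fields of the enclosing class (presumably JCommander @Parameter fields) that are not shown in this snippet.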
 
Example #5
Source File: AnalyzeLocal.java    From deeplearning4j with Apache License 2.0
/**
 * Analyse the specified data - returns a DataAnalysis object with summary information about each column
 *
 * @param schema              Schema for data
 * @param rr                  Data to analyze
 * @param maxHistogramBuckets Maximum number of histogram buckets for numerical columns
 * @return DataAnalysis for data
 */
public static DataAnalysis analyze(Schema schema, RecordReader rr, int maxHistogramBuckets){
    AnalysisAddFunction addFn = new AnalysisAddFunction(schema);
    List<AnalysisCounter> counters = null;
    while(rr.hasNext()){
        counters = addFn.apply(counters, rr.next());
    }

    double[][] minsMaxes = new double[counters.size()][2];

    List<ColumnType> columnTypes = schema.getColumnTypes();
    List<ColumnAnalysis> list = DataVecAnalysisUtils.convertCounters(counters, minsMaxes, columnTypes);


    //Do another pass collecting histogram values:
    List<HistogramCounter> histogramCounters = null;
    HistogramAddFunction add = new HistogramAddFunction(maxHistogramBuckets, schema, minsMaxes);
    if(rr.resetSupported()){
        rr.reset();
        while(rr.hasNext()){
            histogramCounters = add.apply(histogramCounters, rr.next());
        }

        DataVecAnalysisUtils.mergeCounters(list, histogramCounters);
    }

    return new DataAnalysis(schema, list);
}
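
Note that this method makes two passes over the data: one for summary statistics and a second for histograms. If the RecordReader does not support reset(), the histogram pass is skipped and the returned DataAnalysis simply omits histogram information.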
 
Example #6
Source File: TestAnalysis.java    From deeplearning4j with Apache License 2.0
@Test
public void testAnalysisAllColTypes(){

    Schema s = new Schema.Builder()
            .addColumn(new BinaryMetaData("binary"))
            .addColumn(new BooleanMetaData("boolean"))
            .addColumnCategorical("categorical", "a", "b")
            .addColumnDouble("double")
            .addColumnFloat("float")
            .addColumnInteger("integer")
            .addColumnLong("long")
            .addColumnNDArray("ndarray", new long[]{1,4})
            .addColumnString("string")
            .addColumnTime("time", TimeZone.getDefault())
            .build();

    List<List<Writable>> data = Arrays.asList(
            Arrays.asList(new BytesWritable(new byte[3]), new BooleanWritable(true), new Text("a"),
                    new DoubleWritable(1.0), new FloatWritable(1.0f), new IntWritable(1),
                    new LongWritable(1L), new NDArrayWritable(Nd4j.create(DataType.FLOAT, 1, 4)), new Text("text"),
                    new LongWritable(100L)),
            Arrays.asList(new BytesWritable(new byte[3]), new BooleanWritable(false), new Text("b"),
                    new DoubleWritable(0.0), new FloatWritable(0.0f), new IntWritable(0),
                    new LongWritable(0L), new NDArrayWritable(Nd4j.create(DataType.FLOAT, 1, 4)), new Text("text2"),
                    new LongWritable(101L)));

    JavaRDD<List<Writable>> rdd = sc.parallelize(data);
    DataAnalysis da = AnalyzeSpark.analyze(s, rdd);
//    System.out.println(da);
    da.toString();
    da.toJson();
}
 
Example #7
Source File: SparkUtils.java    From deeplearning4j with Apache License 2.0
/**
 * Write a DataAnalysis to HDFS (or locally) as an HTML file
 *
 * @param outputPath      Output path
 * @param dataAnalysis    Analysis to generate HTML file for
 * @param sc              Spark context
 */
public static void writeAnalysisHTMLToFile(String outputPath, DataAnalysis dataAnalysis, JavaSparkContext sc) {
    try {
        String analysisAsHtml = HtmlAnalysis.createHtmlAnalysisString(dataAnalysis);
        writeStringToFile(outputPath, analysisAsHtml, sc);
    } catch (Exception e) {
        throw new RuntimeException("Error generating or writing HTML analysis file (normalized data)", e);
    }
}
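
A one-line usage sketch (the HDFS path is a placeholder assumption):

SparkUtils.writeAnalysisHTMLToFile("hdfs:///output/analysis.html", dataAnalysis, sc);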
 
Example #8
Source File: TestAnalyzeLocal.java    From deeplearning4j with Apache License 2.0
@Test
public void testAnalysisBasic() throws Exception {

    RecordReader rr = new CSVRecordReader();
    rr.initialize(new FileSplit(new ClassPathResource("iris.txt").getFile()));

    Schema s = new Schema.Builder()
            .addColumnsDouble("0", "1", "2", "3")
            .addColumnInteger("label")
            .build();

    DataAnalysis da = AnalyzeLocal.analyze(s, rr);

    System.out.println(da);

    //Compare:
    List<List<Writable>> list = new ArrayList<>();
    rr.reset();
    while(rr.hasNext()){
        list.add(rr.next());
    }

    INDArray arr = RecordConverter.toMatrix(DataType.DOUBLE, list);
    INDArray mean = arr.mean(0);
    INDArray std = arr.std(0);

    for( int i=0; i<5; i++ ){
        double m = ((NumericalColumnAnalysis)da.getColumnAnalysis().get(i)).getMean();
        double stddev = ((NumericalColumnAnalysis)da.getColumnAnalysis().get(i)).getSampleStdev();
        assertEquals(mean.getDouble(i), m, 1e-3);
        assertEquals(std.getDouble(i), stddev, 1e-3);
    }

}
 
Example #9
Source File: SparkUtils.java    From DataVec with Apache License 2.0
/**
 * Write a DataAnalysis to HDFS (or locally) as an HTML file
 *
 * @param outputPath      Output path
 * @param dataAnalysis    Analysis to generate HTML file for
 * @param sc              Spark context
 */
public static void writeAnalysisHTMLToFile(String outputPath, DataAnalysis dataAnalysis, JavaSparkContext sc) {
    try {
        String analysisAsHtml = HtmlAnalysis.createHtmlAnalysisString(dataAnalysis);
        writeStringToFile(outputPath, analysisAsHtml, sc);
    } catch (Exception e) {
        throw new RuntimeException("Error generating or writing HTML analysis file (normalized data)", e);
    }
}
 
Example #10
Source File: IrisData.java    From SKIL_Examples with Apache License 2.0
public static TransformProcess inferenceTransform(DataAnalysis analysis) {
    return new TransformProcess.Builder(IrisData.SCHEMA)
            .removeColumns(IrisData.COL5_LABEL)
            .normalize(IrisData.COL1_SEPAL_LENGTH, Normalize.Standardize, analysis)
            .normalize(IrisData.COL2_SEPAL_WIDTH, Normalize.Standardize, analysis)
            .normalize(IrisData.COL3_PETAL_LENGTH, Normalize.Standardize, analysis)
            .normalize(IrisData.COL4_PETAL_WIDTH, Normalize.Standardize, analysis)
            .build();
}
 
Example #11
Source File: IrisData.java    From SKIL_Examples with Apache License 2.0
public static TransformProcess trainTransform(DataAnalysis analysis) {
    return new TransformProcess.Builder(IrisData.SCHEMA)
            .normalize(IrisData.COL1_SEPAL_LENGTH, Normalize.Standardize, analysis)
            .normalize(IrisData.COL2_SEPAL_WIDTH, Normalize.Standardize, analysis)
            .normalize(IrisData.COL3_PETAL_LENGTH, Normalize.Standardize, analysis)
            .normalize(IrisData.COL4_PETAL_WIDTH, Normalize.Standardize, analysis)
            .build();
}
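
A hypothetical usage sketch for these two factory methods (the analysis JSON string, the raw values, and the assumption that the label column is an integer are all placeholders, not from the project):

DataAnalysis analysis = DataAnalysis.fromJson(analysisJson);
TransformProcess tp = IrisData.trainTransform(analysis);

// Standardize a single raw iris record (sepal/petal measurements plus label)
List<Writable> raw = Arrays.asList(
        new DoubleWritable(5.1), new DoubleWritable(3.5),
        new DoubleWritable(1.4), new DoubleWritable(0.2),
        new IntWritable(0));
List<Writable> standardized = tp.execute(raw);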
 
Example #12
Source File: AnalyzeSpark.java    From DataVec with Apache License 2.0
public static DataAnalysis analyze(Schema schema, JavaRDD<List<Writable>> data) {
    return analyze(schema, data, DEFAULT_HISTOGRAM_BUCKETS);
}
 
Example #13
Source File: TransformProcess.java    From deeplearning4j with Apache License 2.0
/**
 * Normalize the specified column with a given type of normalization
 *
 * @param column Column to normalize
 * @param type   Type of normalization to apply
 * @param da     DataAnalysis object
 */
public Builder normalize(String column, Normalize type, DataAnalysis da) {

    ColumnAnalysis ca = da.getColumnAnalysis(column);
    if (!(ca instanceof NumericalColumnAnalysis))
        throw new IllegalStateException(
                "Column \"" + column + "\" analysis is not numerical. " + "Column is not numerical?");

    NumericalColumnAnalysis nca = (NumericalColumnAnalysis) ca;
    double min = nca.getMinDouble();
    double max = nca.getMaxDouble();
    double mean = nca.getMean();
    double sigma = nca.getSampleStdev();

    switch (type) {
        case MinMax:
            return transform(new MinMaxNormalizer(column, min, max));
        case MinMax2:
            return transform(new MinMaxNormalizer(column, min, max, -1, 1));
        case Standardize:
            return transform(new StandardizeNormalizer(column, mean, sigma));
        case SubtractMean:
            return transform(new SubtractMeanNormalizer(column, mean));
        case Log2Mean:
            return transform(new Log2Normalizer(column, mean, min, 0.5));
        case Log2MeanExcludingMin:
            long countMin = nca.getCountMinValue();

            //mean including min value: (sum/totalCount)
            //mean excluding min value: (sum - countMin*min)/(totalCount - countMin)
            double meanExMin;
            if (ca.getCountTotal() - countMin == 0) {
                if (ca.getCountTotal() == 0) {
                    log.warn("Normalizing with Log2MeanExcludingMin but 0 records present in analysis");
                } else {
                    log.warn("Normalizing with Log2MeanExcludingMin but all records are the same value");
                }
                meanExMin = mean;
            } else {
                meanExMin = (mean * ca.getCountTotal() - countMin * min) / (ca.getCountTotal() - countMin);
            }
            return transform(new Log2Normalizer(column, meanExMin, min, 0.5));
        default:
            throw new RuntimeException("Unknown/not implemented normalization type: " + type);
    }
}
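
As a usage sketch, wiring this builder method into a pipeline looks like the following (the schema, column name, and analysis variable are assumptions). Standardize, for instance, resolves to a StandardizeNormalizer applying (x - mean) / sigma, as the switch above shows:

TransformProcess tp = new TransformProcess.Builder(schema)
        .normalize("myNumericColumn", Normalize.Standardize, dataAnalysis)
        .build();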
 
Example #14
Source File: TransformProcess.java    From DataVec with Apache License 2.0
/**
 * Normalize the specified column with a given type of normalization
 *
 * @param column Column to normalize
 * @param type   Type of normalization to apply
 * @param da     DataAnalysis object
 */
public Builder normalize(String column, Normalize type, DataAnalysis da) {

    ColumnAnalysis ca = da.getColumnAnalysis(column);
    if (!(ca instanceof NumericalColumnAnalysis))
        throw new IllegalStateException(
                "Column \"" + column + "\" analysis is not numerical. " + "Column is not numerical?");

    NumericalColumnAnalysis nca = (NumericalColumnAnalysis) ca;
    double min = nca.getMinDouble();
    double max = nca.getMaxDouble();
    double mean = nca.getMean();
    double sigma = nca.getSampleStdev();

    switch (type) {
        case MinMax:
            return transform(new MinMaxNormalizer(column, min, max));
        case MinMax2:
            return transform(new MinMaxNormalizer(column, min, max, -1, 1));
        case Standardize:
            return transform(new StandardizeNormalizer(column, mean, sigma));
        case SubtractMean:
            return transform(new SubtractMeanNormalizer(column, mean));
        case Log2Mean:
            return transform(new Log2Normalizer(column, mean, min, 0.5));
        case Log2MeanExcludingMin:
            long countMin = nca.getCountMinValue();

            //mean including min value: (sum/totalCount)
            //mean excluding min value: (sum - countMin*min)/(totalCount - countMin)
            double meanExMin;
            if (ca.getCountTotal() - countMin == 0) {
                if (ca.getCountTotal() == 0) {
                    log.warn("Normalizing with Log2MeanExcludingMin but 0 records present in analysis");
                } else {
                    log.warn("Normalizing with Log2MeanExcludingMin but all records are the same value");
                }
                meanExMin = mean;
            } else {
                meanExMin = (mean * ca.getCountTotal() - countMin * min) / (ca.getCountTotal() - countMin);
            }
            return transform(new Log2Normalizer(column, meanExMin, min, 0.5));
        default:
            throw new RuntimeException("Unknown/not implemented normalization type: " + type);
    }
}
 
Example #15
Source File: TestAnalysis.java    From deeplearning4j with Apache License 2.0
@Test
public void testAnalysisVsLocal() throws Exception {

    Schema s = new Schema.Builder()
            .addColumnsDouble("%d", 0, 3)
            .addColumnInteger("label")
            .build();

    RecordReader rr = new CSVRecordReader();
    rr.initialize(new FileSplit(new ClassPathResource("iris.txt").getFile()));

    List<List<Writable>> toParallelize = new ArrayList<>();
    while(rr.hasNext()){
        toParallelize.add(rr.next());
    }

    JavaRDD<List<Writable>> rdd = sc.parallelize(toParallelize).coalesce(1);


    rr.reset();
    DataAnalysis local = AnalyzeLocal.analyze(s, rr);
    DataAnalysis spark = AnalyzeSpark.analyze(s, rdd);

//    assertEquals(local.toJson(), spark.toJson());
    assertEquals(local, spark);


    //Also quality analysis:
    rr.reset();
    DataQualityAnalysis localQ = AnalyzeLocal.analyzeQuality(s, rr);
    DataQualityAnalysis sparkQ = AnalyzeSpark.analyzeQuality(s, rdd);

    assertEquals(localQ, sparkQ);


    //And, check unique etc:
    rr.reset();
    Map<String,Set<Writable>> mapLocal = AnalyzeLocal.getUnique(s.getColumnNames(), s, rr);
    Map<String,List<Writable>> mapSpark = AnalyzeSpark.getUnique(s.getColumnNames(), s, rdd);

    assertEquals(mapLocal.keySet(), mapSpark.keySet());
    for( String k : mapLocal.keySet()){
        assertEquals(mapLocal.get(k), new HashSet<Writable>(mapSpark.get(k)));
    }
}
 
Example #16
Source File: AnalyzeLocal.java    From deeplearning4j with Apache License 2.0
/**
 * Analyse the specified data - returns a DataAnalysis object with summary information about each column
 *
 * @param schema Schema for data
 * @param rr     Data to analyze
 * @return DataAnalysis for data
 */
public static DataAnalysis analyze(Schema schema, RecordReader rr) {
    return analyze(schema, rr, DEFAULT_MAX_HISTOGRAM_BUCKETS);
}
 
Example #17
Source File: AnalyzeSpark.java    From deeplearning4j with Apache License 2.0
/**
 * Analyse the specified data - returns a DataAnalysis object with summary information about each column
 *
 * @param schema Schema for data
 * @param data   Data to analyze
 * @return       DataAnalysis for data
 */
public static DataAnalysis analyze(Schema schema, JavaRDD<List<Writable>> data) {
    return analyze(schema, data, DEFAULT_HISTOGRAM_BUCKETS);
}
 
Example #18
Source File: HtmlAnalysis.java    From deeplearning4j with Apache License 2.0
/**
 * Render a data analysis object as an HTML file. This will produce a summary table, along with charts for
 * numerical columns
 *
 * @param dataAnalysis Data analysis object to render
 * @param output       Output file (should have extension .html)
 */
public static void createHtmlAnalysisFile(DataAnalysis dataAnalysis, File output) throws Exception {

    String str = createHtmlAnalysisString(dataAnalysis);

    FileUtils.writeStringToFile(output, str, StandardCharsets.UTF_8);
}
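
A minimal usage sketch (the schema, record reader, and output path are placeholder assumptions):

DataAnalysis analysis = AnalyzeLocal.analyze(schema, recordReader);
HtmlAnalysis.createHtmlAnalysisFile(analysis, new File("analysis.html"));  // open in a browser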
 
Example #19
Source File: HtmlAnalysis.java    From DataVec with Apache License 2.0
/**
 * Render a data analysis object as an HTML file. This will produce a summary table, along with charts for
 * numerical columns
 *
 * @param dataAnalysis Data analysis object to render
 * @param output       Output file (should have extension .html)
 */
public static void createHtmlAnalysisFile(DataAnalysis dataAnalysis, File output) throws Exception {

    String str = createHtmlAnalysisString(dataAnalysis);

    FileUtils.writeStringToFile(output, str);
}