Java Code Examples for org.deeplearning4j.spark.util.SparkUtils

The following examples show how to use org.deeplearning4j.spark.util.SparkUtils. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: Java-Deep-Learning-Cookbook   Source File: PreprocessSpark.java    License: MIT License 6 votes vote down vote up
/**
 * Entry point of the preprocessing job. Parses {@code args} with JCommander against this object,
 * starts a local Spark context and creates file batches for the {@code /train} and {@code /test}
 * sub-directories of {@code sourceDir}.
 *
 * @param args command line arguments handed to JCommander
 * @throws Exception if listing the source paths or creating the file batches fails
 */
protected void entryPoint(String[] args) throws Exception {
    JCommander jcmdr = new JCommander(this);
    jcmdr.parse(args);

    SparkConf conf = new SparkConf();
    conf.setMaster("local[*]");
    conf.setAppName("DL4JTinyImageNetSparkPreproc");

    // JavaSparkContext is Closeable: try-with-resources stops the context even when preprocessing fails,
    // instead of leaking it until JVM exit.
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // NOTE(review): train and test batches are both written to saveDir - confirm this is intended.

        //Create training set
        JavaRDD<String> filePathsTrain = SparkUtils.listPaths(sc, sourceDir + "/train", true, NativeImageLoader.ALLOWED_FORMATS);
        SparkDataUtils.createFileBatchesSpark(filePathsTrain, saveDir, batchSize, sc);

        //Create test set
        JavaRDD<String> filePathsTest = SparkUtils.listPaths(sc, sourceDir + "/test", true, NativeImageLoader.ALLOWED_FORMATS);
        SparkDataUtils.createFileBatchesSpark(filePathsTest, saveDir, batchSize, sc);
    }

    System.out.println("----- Data Preprocessing Complete -----");
}
 
Example 2
Source Project: Java-Deep-Learning-Cookbook   Source File: PreprocessSpark.java    License: MIT License 6 votes vote down vote up
/**
 * Entry point of the preprocessing job. Parses {@code args} with JCommander against this object,
 * starts a local Spark context and creates file batches for the {@code /train} and {@code /test}
 * sub-directories of {@code sourceDir}.
 *
 * @param args command line arguments handed to JCommander
 * @throws Exception if listing the source paths or creating the file batches fails
 */
protected void entryPoint(String[] args) throws Exception {
    JCommander jcmdr = new JCommander(this);
    jcmdr.parse(args);

    SparkConf conf = new SparkConf();
    conf.setMaster("local[*]");
    conf.setAppName("DL4JTinyImageNetSparkPreproc");

    // JavaSparkContext is Closeable: try-with-resources stops the context even when preprocessing fails,
    // instead of leaking it until JVM exit.
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // NOTE(review): train and test batches are both written to saveDir - confirm this is intended.

        //Create training set
        JavaRDD<String> filePathsTrain = SparkUtils.listPaths(sc, sourceDir + "/train", true, NativeImageLoader.ALLOWED_FORMATS);
        SparkDataUtils.createFileBatchesSpark(filePathsTrain, saveDir, batchSize, sc);

        //Create test set
        JavaRDD<String> filePathsTest = SparkUtils.listPaths(sc, sourceDir + "/test", true, NativeImageLoader.ALLOWED_FORMATS);
        SparkDataUtils.createFileBatchesSpark(filePathsTest, saveDir, batchSize, sc);
    }

    System.out.println("----- Data Preprocessing Complete -----");
}
 
Example 3
/**
 * Runs one training iteration for a multi-layer network over a single split of {@link DataSet} objects:
 * repartitions the split via {@link SparkUtils}, executes the worker function on every partition and
 * passes the per-partition results to {@code processResults}. Timing hooks on {@code stats} are only
 * invoked when {@code collectTrainingStats} is set.
 *
 * @param network   network to train
 * @param split     data for this split
 * @param splitNum  number of this split (used for logging and result processing)
 * @param numSplits total number of splits (used for logging and result processing)
 */
protected void doIteration(SparkDl4jMultiLayer network, JavaRDD<DataSet> split, int splitNum, int numSplits) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, averagingFreq={}, Configured for {} workers",
                    splitNum, numSplits, batchSizePerWorker, averagingFrequency, numWorkers);
    if (collectTrainingStats) {
        stats.logMapPartitionsStart();
        stats.logRepartitionStart();
    }

    final JavaRDD<DataSet> repartitioned = SparkUtils.repartition(split, repartition, repartitionStrategy,
                    numObjectsEachWorker(rddDataSetNumExamples), numWorkers);
    final int partitionCount = repartitioned.partitions().size();
    // The repartition end event is only recorded when repartitioning is not disabled
    if (collectTrainingStats && Repartition.Never != repartition) {
        stats.logRepartitionEnd();
    }

    FlatMapFunction<Iterator<DataSet>, ParameterAveragingTrainingResult> workerFn =
                    new ExecuteWorkerFlatMap<>(getWorkerInstance(network));
    JavaRDD<ParameterAveragingTrainingResult> workerResults = repartitioned.mapPartitions(workerFn);
    processResults(network, null, workerResults, splitNum, numSplits);

    if (collectTrainingStats) {
        stats.logMapPartitionsEnd(partitionCount);
    }
}
 
Example 4
/**
 * Runs one training iteration for a computation graph over a single split of {@link MultiDataSet}
 * objects: repartitions the split via {@link SparkUtils}, executes the worker function on every
 * partition and passes the per-partition results to {@code processResults}. Timing hooks on
 * {@code stats} are only invoked when {@code collectTrainingStats} is set.
 *
 * @param graph     computation graph to train
 * @param split     data for this split
 * @param splitNum  number of this split (used for logging and result processing)
 * @param numSplits total number of splits (used for logging and result processing)
 */
protected void doIteration(SparkComputationGraph graph, JavaRDD<MultiDataSet> split, int splitNum, int numSplits) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, averagingFreq={}, Configured for {} workers",
                    splitNum, numSplits, batchSizePerWorker, averagingFrequency, numWorkers);
    if (collectTrainingStats)
        stats.logMapPartitionsStart();

    // Record repartition timing the same way the other doIteration* variants do
    if (collectTrainingStats)
        stats.logRepartitionStart();
    JavaRDD<MultiDataSet> splitData = SparkUtils.repartition(split, repartition, repartitionStrategy,
                    numObjectsEachWorker(rddDataSetNumExamples), numWorkers);
    // Count the partitions of the repartitioned data (what mapPartitions runs on), not of the input split
    int nPartitions = splitData.partitions().size();
    if (collectTrainingStats && repartition != Repartition.Never)
        stats.logRepartitionEnd();

    FlatMapFunction<Iterator<MultiDataSet>, ParameterAveragingTrainingResult> function =
                    new ExecuteWorkerMultiDataSetFlatMap<>(getWorkerInstance(graph));
    JavaRDD<ParameterAveragingTrainingResult> result = splitData.mapPartitions(function);
    processResults(null, graph, result, splitNum, numSplits);

    if (collectTrainingStats)
        stats.logMapPartitionsEnd(nPartitions);
}
 
Example 5
/**
 * Runs one training iteration for a computation graph over a single split of
 * {@link PortableDataStream} objects: repartitions the split via {@link SparkUtils}, executes the
 * worker function on every partition and passes the per-partition results to {@code processResults}.
 * Timing hooks on {@code stats} are only invoked when {@code collectTrainingStats} is set.
 *
 * @param graph     computation graph to train
 * @param split     data streams for this split
 * @param splitNum  number of this split (used for logging and result processing)
 * @param numSplits total number of splits (used for logging and result processing)
 */
protected void doIterationPDS_MDS(SparkComputationGraph graph, JavaRDD<PortableDataStream> split, int splitNum,
                int numSplits) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, averagingFreq={}, Configured for {} workers",
                    splitNum, numSplits, batchSizePerWorker, averagingFrequency, numWorkers);
    if (collectTrainingStats) {
        stats.logMapPartitionsStart();
        stats.logRepartitionStart();
    }

    final JavaRDD<PortableDataStream> repartitioned = SparkUtils.repartition(split, repartition,
                    repartitionStrategy, numObjectsEachWorker(rddDataSetNumExamples), numWorkers);
    final int partitionCount = repartitioned.partitions().size();
    // The repartition end event is only recorded when repartitioning is not disabled
    if (collectTrainingStats && Repartition.Never != repartition) {
        stats.logRepartitionEnd();
    }

    FlatMapFunction<Iterator<PortableDataStream>, ParameterAveragingTrainingResult> workerFn =
                    new ExecuteWorkerPDSMDSFlatMap<>(getWorkerInstance(graph));
    JavaRDD<ParameterAveragingTrainingResult> workerResults = repartitioned.mapPartitions(workerFn);
    processResults(null, graph, workerResults, splitNum, numSplits);

    if (collectTrainingStats) {
        stats.logMapPartitionsEnd(partitionCount);
    }
}
 
Example 6
Source Project: deeplearning4j   Source File: SharedTrainingMaster.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Runs one shared-training iteration for a multi-layer network over a single split of {@link DataSet}
 * objects. The split is repartitioned with the configured {@code repartitioner} when one is set,
 * otherwise with {@link SparkUtils#repartitionEqually}; the worker function is then applied to every
 * partition and the results are passed to {@code processResults}.
 *
 * @param network   network to train
 * @param split     data for this split
 * @param splitNum  number of this split (used for logging only)
 * @param numSplits total number of splits (used for logging only)
 */
protected void doIteration(SparkDl4jMultiLayer network, JavaRDD<DataSet> split, int splitNum, int numSplits) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, thresholdAlgorithm={}, Configured for {} workers",
                    splitNum, numSplits, batchSizePerWorker, thresholdAlgorithm, numWorkers);

    if (collectTrainingStats) {
        stats.logMapPartitionsStart();
        stats.logRepartitionStart();
    }

    final JavaRDD<DataSet> repartitioned;
    if (repartitioner == null) {
        log.info("Repartitioning training data using SparkUtils repartitioner");
        repartitioned = SparkUtils.repartitionEqually(split, repartition, numWorkers);
    } else {
        log.info("Repartitioning training data using repartitioner: {}", repartitioner);
        // Integer division, clamped so that every worker is asked for at least one object
        final int minObjectsPerWorker = Math.max(1, batchSizePerWorker / rddDataSetNumExamples);
        repartitioned = repartitioner.repartition(split, minObjectsPerWorker, numWorkers);
    }
    final int partitionCount = repartitioned.partitions().size();

    if (collectTrainingStats && Repartition.Never != repartition) {
        stats.logRepartitionEnd();
    }

    FlatMapFunction<Iterator<DataSet>, SharedTrainingResult> workerFn =
                    new SharedFlatMapDataSet<>(getWorkerInstance(network));
    JavaRDD<SharedTrainingResult> workerResults = repartitioned.mapPartitions(workerFn);
    processResults(network, null, workerResults);

    if (collectTrainingStats) {
        stats.logMapPartitionsEnd(partitionCount);
    }
}
 
Example 7
Source Project: deeplearning4j   Source File: SharedTrainingMaster.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Runs one shared-training iteration for a computation graph over a single split of
 * {@link MultiDataSet} objects. The split is repartitioned with the configured {@code repartitioner}
 * when one is set, otherwise with {@link SparkUtils#repartitionEqually}; the worker function is then
 * applied to every partition and the results are passed to {@code processResults}.
 *
 * @param network   computation graph to train
 * @param split     data for this split
 * @param splitNum  number of this split (used for logging only)
 * @param numSplits total number of splits (used for logging only)
 */
protected void doIterationMDS(SparkComputationGraph network, JavaRDD<MultiDataSet> split, int splitNum,
                int numSplits) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, thresholdAlgorithm={}, Configured for {} workers",
                    splitNum, numSplits, batchSizePerWorker, thresholdAlgorithm, numWorkers);

    if (collectTrainingStats) {
        stats.logMapPartitionsStart();
        stats.logRepartitionStart();
    }

    final JavaRDD<MultiDataSet> repartitioned;
    if (repartitioner == null) {
        log.info("Repartitioning training data using SparkUtils repartitioner");
        repartitioned = SparkUtils.repartitionEqually(split, repartition, numWorkers);
    } else {
        log.info("Repartitioning training data using repartitioner: {}", repartitioner);
        // Integer division, clamped so that every worker is asked for at least one object
        final int minObjectsPerWorker = Math.max(1, batchSizePerWorker / rddDataSetNumExamples);
        repartitioned = repartitioner.repartition(split, minObjectsPerWorker, numWorkers);
    }
    final int partitionCount = repartitioned.partitions().size();

    if (collectTrainingStats && Repartition.Never != repartition) {
        stats.logRepartitionEnd();
    }

    FlatMapFunction<Iterator<MultiDataSet>, SharedTrainingResult> workerFn =
                    new SharedFlatMapMultiDataSet<>(getWorkerInstance(network));
    JavaRDD<SharedTrainingResult> workerResults = repartitioned.mapPartitions(workerFn);
    processResults(null, network, workerResults);

    if (collectTrainingStats) {
        stats.logMapPartitionsEnd(partitionCount);
    }
}
 
Example 8
Source Project: deeplearning4j   Source File: SharedTrainingMaster.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Runs one shared-training iteration for a computation graph over a single split of {@link DataSet}
 * objects. The data is repartitioned with the configured {@code repartitioner} when one is set,
 * otherwise with {@link SparkUtils#repartitionEqually}; the worker function is then applied to every
 * partition and the results are passed to {@code processResults}.
 *
 * @param network   computation graph to train
 * @param data      data for this split
 * @param splitNum  number of this split (used for logging only)
 * @param numSplits total number of splits (used for logging only)
 */
protected void doIteration(SparkComputationGraph network, JavaRDD<DataSet> data, int splitNum, int numSplits) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, thresholdAlgorithm={}, Configured for {} workers",
                    splitNum, numSplits, batchSizePerWorker, thresholdAlgorithm, numWorkers);

    if (collectTrainingStats) {
        stats.logMapPartitionsStart();
        stats.logRepartitionStart();
    }

    final JavaRDD<DataSet> repartitioned;
    if (repartitioner == null) {
        log.info("Repartitioning training data using SparkUtils repartitioner");
        repartitioned = SparkUtils.repartitionEqually(data, repartition, numWorkers);
    } else {
        log.info("Repartitioning training data using repartitioner: {}", repartitioner);
        // Integer division, clamped so that every worker is asked for at least one object
        final int minObjectsPerWorker = Math.max(1, batchSizePerWorker / rddDataSetNumExamples);
        repartitioned = repartitioner.repartition(data, minObjectsPerWorker, numWorkers);
    }
    final int partitionCount = repartitioned.partitions().size();

    if (collectTrainingStats && Repartition.Never != repartition) {
        stats.logRepartitionEnd();
    }

    FlatMapFunction<Iterator<DataSet>, SharedTrainingResult> workerFn =
                    new SharedFlatMapDataSet<>(getWorkerInstance(network));
    JavaRDD<SharedTrainingResult> workerResults = repartitioned.mapPartitions(workerFn);
    processResults(null, network, workerResults);

    if (collectTrainingStats) {
        stats.logMapPartitionsEnd(partitionCount);
    }
}
 
Example 9
Source Project: deeplearning4j   Source File: StatsUtils.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Writes the given events to {@code outputPath} as delimited text, one event per line. The header
 * line is taken from the first event in the list; an empty list results in an empty string being
 * written.
 *
 * @param list       events to export
 * @param outputPath destination passed to {@link SparkUtils#writeStringToFile}
 * @param delimiter  column delimiter passed to each event
 * @param sc         Spark context passed to {@link SparkUtils#writeStringToFile}
 * @throws IOException if writing the file fails
 */
public static void exportStats(List<EventStats> list, String outputPath, String delimiter, SparkContext sc)
                throws IOException {
    final StringBuilder content = new StringBuilder();
    boolean headerWritten = false;
    for (EventStats event : list) {
        if (!headerWritten) {
            content.append(event.getStringHeader(delimiter)).append("\n");
            headerWritten = true;
        }
        content.append(event.asString(delimiter)).append("\n");
    }
    SparkUtils.writeStringToFile(outputPath, content.toString(), sc);
}
 
Example 10
Source Project: deeplearning4j   Source File: SparkDataValidation.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Lists the files under {@code path}, applies {@code ValidateDataSetFn} to every path and reduces
 * the per-file results into a single {@link ValidationResult}.
 *
 * @param sc            Spark context used to list the paths
 * @param path          directory to list
 * @param recursive     passed to {@link SparkUtils#listPaths}
 * @param deleteInvalid passed to the validation function
 * @param featuresShape passed to the validation function
 * @param labelsShape   passed to the validation function
 * @return the reduced validation result
 * @throws RuntimeException wrapping the {@link IOException} if listing the paths fails
 */
protected static ValidationResult validateDataSets(JavaSparkContext sc, String path, boolean recursive, boolean deleteInvalid,
                                            int[] featuresShape, int[] labelsShape) {
    final JavaRDD<String> filePaths;
    try {
        filePaths = SparkUtils.listPaths(sc, path, recursive);
    } catch (IOException ioe) {
        throw new RuntimeException("Error listing paths in directory", ioe);
    }

    final JavaRDD<ValidationResult> perFile =
            filePaths.map(new ValidateDataSetFn(deleteInvalid, featuresShape, labelsShape));
    return perFile.reduce(new ValidationResultReduceFn());
}
 
Example 11
Source Project: deeplearning4j   Source File: SparkDataValidation.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Lists the files under {@code path}, applies {@code ValidateMultiDataSetFn} to every path and
 * reduces the per-file results into a single {@link ValidationResult}.
 *
 * @param sc               Spark context used to list the paths
 * @param path             directory to list
 * @param recursive        passed to {@link SparkUtils#listPaths}
 * @param deleteInvalid    passed to the validation function
 * @param numFeatureArrays passed to the validation function
 * @param numLabelArrays   passed to the validation function
 * @param featuresShape    passed to the validation function
 * @param labelsShape      passed to the validation function
 * @return the reduced validation result
 * @throws RuntimeException wrapping the {@link IOException} if listing the paths fails
 */
protected static ValidationResult validateMultiDataSets(JavaSparkContext sc, String path, boolean recursive, boolean deleteInvalid,
                                                 int numFeatureArrays, int numLabelArrays,
                                                 List<int[]> featuresShape, List<int[]> labelsShape) {
    final JavaRDD<String> filePaths;
    try {
        filePaths = SparkUtils.listPaths(sc, path, recursive);
    } catch (IOException ioe) {
        throw new RuntimeException("Error listing paths in directory", ioe);
    }

    final JavaRDD<ValidationResult> perFile = filePaths.map(
            new ValidateMultiDataSetFn(deleteInvalid, numFeatureArrays, numLabelArrays, featuresShape, labelsShape));
    return perFile.reduce(new ValidationResultReduceFn());
}
 
Example 12
/**
 * Splits the pair RDD of training data into multiple RDDs using
 * {@link SparkUtils#balancedRandomSplit}, seeded from {@code rng}. Split timing is recorded on
 * {@code stats} when {@code collectTrainingStats} is set.
 *
 * @param trainingData            data to split
 * @param totalDataSetObjectCount total number of objects in {@code trainingData}
 * @return the resulting splits
 */
protected <T, Repr> JavaPairRDD<T, Repr>[] getSplitRDDs(JavaPairRDD<T, Repr> trainingData,
                int totalDataSetObjectCount) {
    final int objectsPerSplit = getNumDataSetObjectsPerSplit(rddDataSetNumExamples);

    if (collectTrainingStats) {
        stats.logSplitStart();
    }
    final JavaPairRDD<T, Repr>[] result = SparkUtils.balancedRandomSplit(totalDataSetObjectCount,
                    objectsPerSplit, trainingData, rng.nextLong());
    if (collectTrainingStats) {
        stats.logSplitEnd();
    }
    return result;
}
 
Example 13
/**
 * Splits the RDD of training data into multiple RDDs using {@link SparkUtils#balancedRandomSplit},
 * seeded from {@code rng}. Split timing is recorded on {@code stats} when
 * {@code collectTrainingStats} is set.
 *
 * @param trainingData             data to split
 * @param totalDataSetObjectCount  total number of objects in {@code trainingData}
 * @param examplesPerDataSetObject used to compute the number of objects per split
 * @return the resulting splits
 */
protected <T> JavaRDD<T>[] getSplitRDDs(JavaRDD<T> trainingData, int totalDataSetObjectCount,
                int examplesPerDataSetObject) {
    final int objectsPerSplit = getNumDataSetObjectsPerSplit(examplesPerDataSetObject);

    if (collectTrainingStats) {
        stats.logSplitStart();
    }
    final JavaRDD<T>[] result = SparkUtils.balancedRandomSplit(totalDataSetObjectCount, objectsPerSplit,
                    trainingData, rng.nextLong());
    if (collectTrainingStats) {
        stats.logSplitEnd();
    }
    return result;
}
 
Example 14
/**
 * Runs one training iteration over a single split of {@link PortableDataStream} objects, for either
 * a multi-layer network or a computation graph. The worker is created from {@code network} when it
 * is non-null, otherwise from {@code graph}.
 *
 * @param network   multi-layer network to train, or null to train {@code graph}
 * @param graph     computation graph to train when {@code network} is null
 * @param split     data streams for this split
 * @param splitNum  number of this split (used for logging and result processing)
 * @param numSplits total number of splits (used for logging and result processing)
 */
@Deprecated
protected void doIterationPDS(SparkDl4jMultiLayer network, SparkComputationGraph graph,
                JavaRDD<PortableDataStream> split, int splitNum, int numSplits) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, averagingFreq={}, Configured for {} workers",
                    splitNum, numSplits, batchSizePerWorker, averagingFrequency, numWorkers);
    if (collectTrainingStats) {
        stats.logMapPartitionsStart();
        stats.logRepartitionStart();
    }

    final JavaRDD<PortableDataStream> repartitioned = SparkUtils.repartition(split, repartition,
                    repartitionStrategy, numObjectsEachWorker(rddDataSetNumExamples), numWorkers);
    final int partitionCount = repartitioned.partitions().size();
    // The repartition end event is only recorded when repartitioning is not disabled
    if (collectTrainingStats && Repartition.Never != repartition) {
        stats.logRepartitionEnd();
    }

    final FlatMapFunction<Iterator<PortableDataStream>, ParameterAveragingTrainingResult> workerFn;
    if (network == null) {
        workerFn = new ExecuteWorkerPDSFlatMap<>(getWorkerInstance(graph));
    } else {
        workerFn = new ExecuteWorkerPDSFlatMap<>(getWorkerInstance(network));
    }

    JavaRDD<ParameterAveragingTrainingResult> workerResults = repartitioned.mapPartitions(workerFn);
    processResults(network, graph, workerResults, splitNum, numSplits);

    if (collectTrainingStats) {
        stats.logMapPartitionsEnd(partitionCount);
    }
}
 
Example 15
/**
 * Runs one training iteration over a single split of file paths, for either a multi-layer network
 * or a computation graph. The worker is created from {@code network} when it is non-null, otherwise
 * from {@code graph}; the data is loaded with {@code dsLoader} when it is non-null, otherwise with
 * {@code mdsLoader}.
 *
 * @param network                  multi-layer network to train, or null to train {@code graph}
 * @param graph                    computation graph to train when {@code network} is null
 * @param split                    paths for this split
 * @param splitNum                 number of this split (used for logging and result processing)
 * @param numSplits                total number of splits (used for logging and result processing)
 * @param dataSetObjectNumExamples used to compute the number of objects each worker receives
 * @param dsLoader                 DataSet loader, or null to use {@code mdsLoader}
 * @param mdsLoader                MultiDataSet loader used when {@code dsLoader} is null
 */
protected void doIterationPaths(SparkDl4jMultiLayer network, SparkComputationGraph graph, JavaRDD<String> split,
                int splitNum, int numSplits, int dataSetObjectNumExamples, DataSetLoader dsLoader, MultiDataSetLoader mdsLoader) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, averagingFreq={}, Configured for {} workers",
                    splitNum, numSplits, batchSizePerWorker, averagingFrequency, numWorkers);
    if (collectTrainingStats) {
        stats.logMapPartitionsStart();
        stats.logRepartitionStart();
    }

    final JavaRDD<String> repartitioned = SparkUtils.repartition(split, repartition, repartitionStrategy,
                    numObjectsEachWorker(dataSetObjectNumExamples), numWorkers);
    final int partitionCount = repartitioned.partitions().size();
    // The repartition end event is only recorded when repartitioning is not disabled
    if (collectTrainingStats && Repartition.Never != repartition) {
        stats.logRepartitionEnd();
    }

    final JavaSparkContext sc;
    if (network != null) {
        sc = network.getSparkContext();
    } else {
        sc = graph.getSparkContext();
    }

    // Four combinations: (DataSet loader | MultiDataSet loader) x (network | graph)
    final FlatMapFunction<Iterator<String>, ParameterAveragingTrainingResult> workerFn;
    if (dsLoader != null) {
        if (network != null) {
            workerFn = new ExecuteWorkerPathFlatMap<>(getWorkerInstance(network), dsLoader, BroadcastHadoopConfigHolder.get(sc));
        } else {
            workerFn = new ExecuteWorkerPathFlatMap<>(getWorkerInstance(graph), dsLoader, BroadcastHadoopConfigHolder.get(sc));
        }
    } else {
        if (network != null) {
            workerFn = new ExecuteWorkerPathMDSFlatMap<>(getWorkerInstance(network), mdsLoader, BroadcastHadoopConfigHolder.get(sc));
        } else {
            workerFn = new ExecuteWorkerPathMDSFlatMap<>(getWorkerInstance(graph), mdsLoader, BroadcastHadoopConfigHolder.get(sc));
        }
    }

    JavaRDD<ParameterAveragingTrainingResult> workerResults = repartitioned.mapPartitions(workerFn);
    processResults(network, graph, workerResults, splitNum, numSplits);

    if (collectTrainingStats) {
        stats.logMapPartitionsEnd(partitionCount);
    }
}
 
Example 16
Source Project: deeplearning4j   Source File: SparkDl4jMultiLayer.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates a Spark wrapper around an existing {@link MultiLayerNetwork}. A clone of the network's
 * layer-wise configuration is stored, the network is initialized if {@code init()} has not been
 * called on it yet, and the Kryo configuration of the Spark context is checked.
 *
 * @param javaSparkContext Spark context to use
 * @param network          network to wrap
 * @param trainingMaster   training master to use
 */
public SparkDl4jMultiLayer(JavaSparkContext javaSparkContext, MultiLayerNetwork network,
                TrainingMaster<?, ?> trainingMaster) {
    this.sc = javaSparkContext;
    this.conf = network.getLayerWiseConfigurations().clone();
    this.network = network;
    if (!network.isInitCalled()) {
        network.init();
    }
    this.trainingMaster = trainingMaster;

    // Verify the Kryo setup of the supplied context
    SparkUtils.checkKryoConfiguration(javaSparkContext, log);
}
 
Example 17
Source Project: deeplearning4j   Source File: SparkDl4jMultiLayer.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Fits the SparkDl4jMultiLayer network on a directory of serialized DataSet objects.
 * The directory is assumed to contain a number of {@link DataSet} objects, each serialized using
 * {@link DataSet#save(OutputStream)}.
 *
 * @param path Path to the directory containing the serialized DataSet objects
 * @return The MultiLayerNetwork after training
 * @throws RuntimeException wrapping the {@link IOException} if listing the paths fails
 */
public MultiLayerNetwork fit(String path) {
    // Flush pending operations first when a grid executioner is in use
    if (Nd4j.getExecutioner() instanceof GridExecutioner) {
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
    }

    final JavaRDD<String> dataSetPaths;
    try {
        dataSetPaths = SparkUtils.listPaths(sc, path);
    } catch (IOException ioe) {
        throw new RuntimeException("Error listing paths in directory", ioe);
    }

    return fitPaths(dataSetPaths);
}
 
Example 18
Source Project: deeplearning4j   Source File: SparkDl4jMultiLayer.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Evaluate on a directory containing a set of DataSet objects to be loaded with a {@link DataSetLoader}.
 *
 * @param path      Path/URI to the directory containing the datasets to load
 * @param batchSize Batch size passed to {@code doEvaluation}
 * @param loader    Loader used to load each DataSet from its path
 * @return Evaluation
 * @throws RuntimeException wrapping the {@link IOException} if listing the paths fails
 */
public <T extends Evaluation> T evaluate(String path, int batchSize, DataSetLoader loader){
    final JavaRDD<String> dataSetPaths;
    try {
        dataSetPaths = SparkUtils.listPaths(sc, path);
    } catch (IOException ioe) {
        throw new RuntimeException("Error listing paths in directory", ioe);
    }

    final RemoteFileSourceFactory sourceFactory = new RemoteFileSourceFactory(BroadcastHadoopConfigHolder.get(sc));
    final JavaRDD<DataSet> dataSets = dataSetPaths.map(new LoadDataSetFunction(loader, sourceFactory));
    // Only the first (and only) evaluation instance is returned
    return (T)doEvaluation(dataSets, batchSize, new org.deeplearning4j.eval.Evaluation())[0];
}
 
Example 19
Source Project: deeplearning4j   Source File: SparkDl4jMultiLayer.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Evaluates the network on data referenced by an RDD of paths: every partition is mapped through an
 * {@code IEvaluateMDSPathsFlatMapFunction} and the per-partition evaluations are merged with
 * {@code treeAggregate}.
 *
 * @param data             paths of the data to evaluate on
 * @param evalNumWorkers   number of evaluation workers; must be at least 1
 * @param evalBatchSize    evaluation batch size
 * @param loader           DataSet loader passed to the evaluation function
 * @param mdsLoader        MultiDataSet loader passed to the evaluation function
 * @param emptyEvaluations evaluation instances passed to the evaluation function
 * @return the aggregated evaluations
 */
protected IEvaluation[] doEvaluation(JavaRDD<String> data, int evalNumWorkers, int evalBatchSize, DataSetLoader loader, MultiDataSetLoader mdsLoader, IEvaluation... emptyEvaluations){
    // Validate before broadcasting anything, so an invalid argument does not create broadcast variables
    Preconditions.checkArgument(evalNumWorkers > 0, "Invalid number of evaulation workers: require at least 1 - got %s", evalNumWorkers);
    IEvaluateMDSPathsFlatMapFunction evalFn = new IEvaluateMDSPathsFlatMapFunction(sc.broadcast(conf.toJson()),
            SparkUtils.asByteArrayBroadcast(sc, network.params()), evalNumWorkers, evalBatchSize, loader, mdsLoader,
            BroadcastHadoopConfigHolder.get(sc), emptyEvaluations);
    JavaRDD<IEvaluation[]> evaluations = data.mapPartitions(evalFn);
    return evaluations.treeAggregate(null, new IEvaluateAggregateFunction<>(), new IEvaluateAggregateFunction<>());
}
 
Example 20
Source Project: deeplearning4j   Source File: SparkComputationGraph.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates a Spark wrapper around an existing {@link ComputationGraph}. A clone of the graph's
 * configuration is stored, {@code init()} is called on the graph, and the Kryo configuration of the
 * Spark context is checked.
 *
 * @param javaSparkContext Spark context to use
 * @param network          computation graph to wrap
 * @param trainingMaster   training master to use
 */
public SparkComputationGraph(JavaSparkContext javaSparkContext, ComputationGraph network,
                TrainingMaster trainingMaster) {
    this.sc = javaSparkContext;
    this.trainingMaster = trainingMaster;
    this.conf = network.getConfiguration().clone();
    this.network = network;
    this.network.init();

    // Verify the Kryo setup of the supplied context
    SparkUtils.checkKryoConfiguration(javaSparkContext, log);
}
 
Example 21
Source Project: deeplearning4j   Source File: SparkComputationGraph.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates a Spark wrapper around a new {@link ComputationGraph} built from the given configuration.
 * The stored configuration is a clone of {@code conf}, while the graph itself is constructed from
 * {@code conf} directly and then initialized. The Kryo configuration of the Spark context is checked.
 *
 * @param sparkContext   Spark context to use
 * @param conf           configuration of the computation graph to create
 * @param trainingMaster training master to use
 */
public SparkComputationGraph(JavaSparkContext sparkContext, ComputationGraphConfiguration conf,
                TrainingMaster trainingMaster) {
    this.sc = sparkContext;
    this.trainingMaster = trainingMaster;
    this.conf = conf.clone();
    this.network = new ComputationGraph(conf);
    this.network.init();

    // Verify the Kryo setup of the supplied context
    SparkUtils.checkKryoConfiguration(sparkContext, log);
}
 
Example 22
Source Project: deeplearning4j   Source File: SparkComputationGraph.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Fits the SparkComputationGraph network on a directory of serialized DataSet objects.
 * The directory is assumed to contain a number of {@link DataSet} objects, each serialized using
 * {@link DataSet#save(OutputStream)}.
 *
 * @param path Path to the directory containing the serialized DataSet objects
 * @return The ComputationGraph after training
 * @throws RuntimeException wrapping the {@link IOException} if listing the paths fails
 */
public ComputationGraph fit(String path) {
    // Flush pending operations first when a grid executioner is in use
    if (Nd4j.getExecutioner() instanceof GridExecutioner) {
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
    }

    final JavaRDD<String> dataSetPaths;
    try {
        dataSetPaths = SparkUtils.listPaths(sc, path);
    } catch (IOException ioe) {
        throw new RuntimeException("Error listing paths in directory", ioe);
    }

    return fitPaths(dataSetPaths);
}
 
Example 23
Source Project: deeplearning4j   Source File: SparkComputationGraph.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Fits the SparkComputationGraph network on a directory of serialized MultiDataSet objects.
 * The directory is assumed to contain a number of serialized {@link MultiDataSet} objects.
 *
 * @param path Path to the directory containing the serialized MultiDataSet objects
 * @return The ComputationGraph after training
 * @throws RuntimeException wrapping the {@link IOException} if listing the paths fails
 */
public ComputationGraph fitMultiDataSet(String path) {
    // Flush pending operations first when a grid executioner is in use
    if (Nd4j.getExecutioner() instanceof GridExecutioner) {
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();
    }

    final JavaRDD<String> multiDataSetPaths;
    try {
        multiDataSetPaths = SparkUtils.listPaths(sc, path);
    } catch (IOException ioe) {
        throw new RuntimeException("Error listing paths in directory", ioe);
    }

    return fitPathsMultiDataSet(multiDataSetPaths);
}
 
Example 24
Source Project: deeplearning4j   Source File: SparkComputationGraph.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Evaluate the single-output network on a directory containing a set of DataSet objects to be loaded
 * with a {@link DataSetLoader}. Uses {@link #DEFAULT_EVAL_WORKERS} workers and the default batch size
 * of {@link #DEFAULT_EVAL_SCORE_BATCH_SIZE}.
 *
 * @param path   Path/URI to the directory containing the datasets to load
 * @param loader Loader used to load each DataSet from its path
 * @return Evaluation
 * @throws RuntimeException wrapping the {@link IOException} if listing the paths fails
 */
public Evaluation evaluate(String path, DataSetLoader loader){
    final JavaRDD<String> dataSetPaths;
    try {
        dataSetPaths = SparkUtils.listPaths(sc, path);
    } catch (IOException ioe){
        throw new RuntimeException("Error listing files for evaluation of files at path: " + path, ioe);
    }
    // The explicit cast on null selects the paths-based doEvaluation overload
    final IEvaluation[] evaluated = doEvaluation(dataSetPaths, DEFAULT_EVAL_WORKERS, DEFAULT_EVAL_SCORE_BATCH_SIZE,
            loader, (MultiDataSetLoader)null, new Evaluation());
    return (Evaluation) evaluated[0];
}
 
Example 25
Source Project: deeplearning4j   Source File: SparkComputationGraph.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Evaluate the single-output network on a directory containing a set of MultiDataSet objects to be
 * loaded with a {@link MultiDataSetLoader}. Uses {@link #DEFAULT_EVAL_WORKERS} workers and the default
 * batch size of {@link #DEFAULT_EVAL_SCORE_BATCH_SIZE}.
 *
 * @param path   Path/URI to the directory containing the datasets to load
 * @param loader Loader used to load each MultiDataSet from its path
 * @return Evaluation
 * @throws RuntimeException wrapping the {@link IOException} if listing the paths fails
 */
public Evaluation evaluate(String path, MultiDataSetLoader loader){
    final JavaRDD<String> multiDataSetPaths;
    try {
        multiDataSetPaths = SparkUtils.listPaths(sc, path);
    } catch (IOException ioe){
        throw new RuntimeException("Error listing files for evaluation of files at path: " + path, ioe);
    }
    final IEvaluation[] evaluated = doEvaluation(multiDataSetPaths, DEFAULT_EVAL_WORKERS,
            DEFAULT_EVAL_SCORE_BATCH_SIZE, null, loader, new Evaluation());
    return (Evaluation) evaluated[0];
}
 
Example 26
Source Project: deeplearning4j   Source File: SparkComputationGraph.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Performs distributed evaluation on an RDD of {@link MultiDataSet} objects: every partition is
 * mapped through an {@code IEvaluateMDSFlatMapFunction} and the per-partition evaluations are merged
 * with {@code treeAggregate}.
 *
 * @param data             data to evaluate on
 * @param evalNumWorkers   number of evaluation workers; must be at least 1
 * @param evalBatchSize    evaluation batch size
 * @param emptyEvaluations evaluation instances passed to the evaluation function
 * @param <T>              type of evaluation instance to return
 * @return the aggregated evaluations
 */
public <T extends IEvaluation> T[] doEvaluationMDS(JavaRDD<MultiDataSet> data, int evalNumWorkers, int evalBatchSize, T... emptyEvaluations) {
    Preconditions.checkArgument(evalNumWorkers > 0, "Invalid number of evaulation workers: require at least 1 - got %s", evalNumWorkers);

    // The network configuration (as JSON) and parameters are broadcast to the workers
    final IEvaluateMDSFlatMapFunction<T> partitionEvaluator = new IEvaluateMDSFlatMapFunction<>(
                    sc.broadcast(conf.toJson()),
                    SparkUtils.asByteArrayBroadcast(sc, network.params()),
                    evalNumWorkers, evalBatchSize, emptyEvaluations);
    final JavaRDD<T[]> perPartition = data.mapPartitions(partitionEvaluator);
    return perPartition.treeAggregate(null, new IEvaluateAggregateFunction<T>(), new IEvaluateAggregateFunction<T>());
}
 
Example 27
Source Project: deeplearning4j   Source File: SparkComputationGraph.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Evaluates the network on data referenced by an RDD of paths: every partition is mapped through an
 * {@code IEvaluateMDSPathsFlatMapFunction} and the per-partition evaluations are merged with
 * {@code treeAggregate}.
 *
 * @param data             paths of the data to evaluate on
 * @param evalNumWorkers   number of evaluation workers; must be at least 1
 * @param evalBatchSize    evaluation batch size
 * @param loader           DataSet loader passed to the evaluation function
 * @param mdsLoader        MultiDataSet loader passed to the evaluation function
 * @param emptyEvaluations evaluation instances passed to the evaluation function
 * @return the aggregated evaluations
 */
protected IEvaluation[] doEvaluation(JavaRDD<String> data, int evalNumWorkers, int evalBatchSize, DataSetLoader loader, MultiDataSetLoader mdsLoader, IEvaluation... emptyEvaluations){
    // Validate before broadcasting anything, so an invalid argument does not create broadcast variables
    Preconditions.checkArgument(evalNumWorkers > 0, "Invalid number of evaulation workers: require at least 1 - got %s", evalNumWorkers);
    IEvaluateMDSPathsFlatMapFunction evalFn = new IEvaluateMDSPathsFlatMapFunction(sc.broadcast(conf.toJson()),
            SparkUtils.asByteArrayBroadcast(sc, network.params()), evalNumWorkers, evalBatchSize, loader, mdsLoader,
            BroadcastHadoopConfigHolder.get(sc), emptyEvaluations);
    JavaRDD<IEvaluation[]> evaluations = data.mapPartitions(evalFn);
    return evaluations.treeAggregate(null, new IEvaluateAggregateFunction<>(), new IEvaluateAggregateFunction<>());
}
 
Example 28
Source Project: deeplearning4j   Source File: SharedTrainingMaster.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Runs one shared-training iteration over a single split of file paths, for either a multi-layer
 * network or a computation graph. The worker is created from {@code network} when it is non-null,
 * otherwise from {@code graph}; the data is loaded with {@code dsLoader} when it is non-null,
 * otherwise with {@code mdsLoader}.
 *
 * @param network                  multi-layer network to train, or null to train {@code graph}
 * @param graph                    computation graph to train when {@code network} is null
 * @param data                     paths for this split
 * @param splitNum                 number of this split (used for logging only)
 * @param numSplits                total number of splits (used for logging only)
 * @param dsLoader                 DataSet loader, or null to use {@code mdsLoader}
 * @param mdsLoader                MultiDataSet loader used when {@code dsLoader} is null
 * @param dataSetObjectNumExamples divisor used to derive the minimum number of objects per worker
 * @throws DL4JInvalidConfigException if both {@code network} and {@code graph} are null
 */
protected void doIterationPaths(SparkDl4jMultiLayer network, SparkComputationGraph graph, JavaRDD<String> data,
                int splitNum, int numSplits, DataSetLoader dsLoader, MultiDataSetLoader mdsLoader, int dataSetObjectNumExamples) {
    if (network == null && graph == null) {
        throw new DL4JInvalidConfigException("Both MLN & CompGraph are NULL");
    }

    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, thresholdAlgorithm={}, Configured for {} workers",
                    splitNum, numSplits, batchSizePerWorker, thresholdAlgorithm, numWorkers);

    if (collectTrainingStats) {
        stats.logMapPartitionsStart();
        stats.logRepartitionStart();
    }

    final JavaRDD<String> repartitioned;
    if (repartitioner == null) {
        log.info("Repartitioning training data using SparkUtils repartitioner");
        repartitioned = SparkUtils.repartitionEqually(data, repartition, numWorkers);
    } else {
        log.info("Repartitioning training data using repartitioner: {}", repartitioner);
        // Integer division, clamped so that every worker is asked for at least one object
        final int minObjectsPerWorker = Math.max(1, batchSizePerWorker / dataSetObjectNumExamples);
        repartitioned = repartitioner.repartition(data, minObjectsPerWorker, numWorkers);
    }

    final int partitionCount = repartitioned.partitions().size();
    if (collectTrainingStats && Repartition.Never != repartition) {
        stats.logRepartitionEnd();
    }

    final JavaSparkContext sc;
    if (network != null) {
        sc = network.getSparkContext();
    } else {
        sc = graph.getSparkContext();
    }

    final FlatMapFunction<Iterator<String>, SharedTrainingResult> workerFn;
    if (dsLoader == null) {
        workerFn = new SharedFlatMapPathsMDS<>(
                network != null ? getWorkerInstance(network) : getWorkerInstance(graph), mdsLoader, BroadcastHadoopConfigHolder.get(sc));
    } else {
        workerFn = new SharedFlatMapPaths<>(
                network != null ? getWorkerInstance(network) : getWorkerInstance(graph), dsLoader, BroadcastHadoopConfigHolder.get(sc));
    }

    JavaRDD<SharedTrainingResult> workerResults = repartitioned.mapPartitions(workerFn);
    processResults(network, graph, workerResults);

    if (collectTrainingStats) {
        stats.logMapPartitionsEnd(partitionCount);
    }
}
 
Example 29
Source Project: deeplearning4j   Source File: SparkDl4jMultiLayer.java    License: Apache License 2.0 3 votes vote down vote up
/**
 * Perform distributed evaluation of any type of {@link IEvaluation} - or multiple IEvaluation instances.
 * Distributed equivalent of {@link MultiLayerNetwork#doEvaluation(DataSetIterator, IEvaluation[])}
 *
 * @param data             Data to evaluate on
 * @param evalNumWorkers   Number of workers (copies of the MultiLayerNetwork) model to use. Generally this should
 *                         be smaller than the number of threads - 2 to 4 is often good enough. If using CUDA GPUs,
 *                         this should ideally be set to the number of GPUs on each node (i.e., 1 for a single GPU node)
 * @param evalBatchSize    Evaluation batch size
 * @param emptyEvaluations Empty evaluation instances. Starting point (serialized/duplicated, then merged)
 * @param <T>              Type of evaluation instance to return
 * @return IEvaluation instances
 */
public <T extends IEvaluation> T[] doEvaluation(JavaRDD<DataSet> data, int evalNumWorkers, int evalBatchSize, T... emptyEvaluations) {
    // The network configuration (as JSON) and parameters are broadcast to the workers
    final IEvaluateFlatMapFunction<T> partitionEvaluator = new IEvaluateFlatMapFunction<>(
                    false,
                    sc.broadcast(conf.toJson()),
                    SparkUtils.asByteArrayBroadcast(sc, network.params()),
                    evalNumWorkers, evalBatchSize, emptyEvaluations);
    final JavaRDD<T[]> perPartition = data.mapPartitions(partitionEvaluator);
    return perPartition.treeAggregate(null, new IEvaluateAggregateFunction<T>(), new IEvaluationReduceFunction<T>());
}
 
Example 30
Source Project: deeplearning4j   Source File: SparkComputationGraph.java    License: Apache License 2.0 3 votes vote down vote up
/**
 * Perform distributed evaluation on a <i>single output</i> ComputationGraph from DataSet objects using Spark.
 * Can be used to perform multiple evaluations on this single output (for example, {@link Evaluation} and
 * {@link ROC}) at the same time.<br>
 *
 * @param data             Data to evaluate
 * @param evalNumWorkers   Number of worker threads (per machine) to use for evaluation. May want this to be less than
 *                         the number of Spark threads per machine/JVM to reduce memory requirements
 * @param evalBatchSize    Minibatch size for evaluation
 * @param emptyEvaluations Evaluations to perform
 * @return                 Evaluations
 */
public <T extends IEvaluation> T[] doEvaluation(JavaRDD<DataSet> data, int evalNumWorkers, int evalBatchSize, T... emptyEvaluations) {
    // The network configuration (as JSON) and parameters are broadcast to the workers
    final IEvaluateFlatMapFunction<T> partitionEvaluator = new IEvaluateFlatMapFunction<>(
            true,
            sc.broadcast(conf.toJson()),
            SparkUtils.asByteArrayBroadcast(sc, network.params()),
            evalNumWorkers, evalBatchSize, emptyEvaluations);
    final JavaRDD<T[]> perPartition = data.mapPartitions(partitionEvaluator);
    return perPartition.treeAggregate(null, new IEvaluateAggregateFunction<T>(), new IEvaluateAggregateFunction<T>());
}