org.deeplearning4j.spark.util.SparkUtils Java Examples

The following examples show how to use org.deeplearning4j.spark.util.SparkUtils, drawn from open-source projects. The source file, originating project, and license are noted above each example.
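
Most of the examples revolve around a handful of SparkUtils helpers: listPaths(...) to enumerate data files into a JavaRDD<String>, repartition(...) and repartitionEqually(...) to balance an RDD across workers before training, asByteArrayBroadcast(...) to broadcast serialized network parameters for evaluation, and balancedRandomSplit(...) to carve training data into per-iteration splits. As a minimal, self-contained sketch of the listing helper (the local master setting and the hdfs:/// path are illustrative assumptions, not taken from any example below):

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.deeplearning4j.spark.util.SparkUtils;

public class SparkUtilsSketch {
    public static void main(String[] args) throws Exception {
        JavaSparkContext sc = new JavaSparkContext(
                new SparkConf().setMaster("local[*]").setAppName("SparkUtilsSketch"));

        // Recursively list every file under the directory as an RDD of path strings
        JavaRDD<String> paths = SparkUtils.listPaths(sc, "hdfs:///data/train", true);
        System.out.println("Found " + paths.count() + " files");

        sc.stop();
    }
}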
Example #1
Source File: ParameterAveragingTrainingMaster.java    From deeplearning4j with Apache License 2.0
protected void doIteration(SparkComputationGraph graph, JavaRDD<MultiDataSet> split, int splitNum, int numSplits) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, averagingFreq={}, Configured for {} workers",
                    splitNum, numSplits, batchSizePerWorker, averagingFrequency, numWorkers);
    if (collectTrainingStats)
        stats.logMapPartitionsStart();

    JavaRDD<MultiDataSet> splitData = split;

    splitData = SparkUtils.repartition(splitData, repartition, repartitionStrategy,
                    numObjectsEachWorker(rddDataSetNumExamples), numWorkers);
    int nPartitions = splitData.partitions().size();

    FlatMapFunction<Iterator<MultiDataSet>, ParameterAveragingTrainingResult> function =
                    new ExecuteWorkerMultiDataSetFlatMap<>(getWorkerInstance(graph));
    JavaRDD<ParameterAveragingTrainingResult> result = splitData.mapPartitions(function);
    processResults(null, graph, result, splitNum, numSplits);

    if (collectTrainingStats)
        stats.logMapPartitionsEnd(nPartitions);
}
 
Example #2
Source File: PreprocessSpark.java    From Java-Deep-Learning-Cookbook with MIT License
protected void entryPoint(String[] args) throws Exception {
    JCommander jcmdr = new JCommander(this);
    jcmdr.parse(args);
    //JCommanderUtils.parseArgs(this, args);
    SparkConf conf = new SparkConf();
    conf.setMaster("local[*]");
    conf.setAppName("DL4JTinyImageNetSparkPreproc");
    JavaSparkContext sc = new JavaSparkContext(conf);

    //Create training set
    JavaRDD<String> filePathsTrain = SparkUtils.listPaths(sc, sourceDir + "/train", true, NativeImageLoader.ALLOWED_FORMATS);
    SparkDataUtils.createFileBatchesSpark(filePathsTrain, saveDir, batchSize, sc);

    //Create test set
    JavaRDD<String> filePathsTest = SparkUtils.listPaths(sc, sourceDir + "/test", true, NativeImageLoader.ALLOWED_FORMATS);
    SparkDataUtils.createFileBatchesSpark(filePathsTest, saveDir, batchSize, sc);


    System.out.println("----- Data Preprocessing Complete -----");
}
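
The entryPoint above reads sourceDir, saveDir, and batchSize from fields that JCommander populates from the command line; those fields are not part of this snippet. A plausible sketch of them (the flag names, descriptions, and default value are assumptions):

// Fields on the enclosing class; @Parameter is com.beust.jcommander.Parameter
@Parameter(names = "--sourceDir", description = "Directory containing the train/ and test/ subdirectories", required = true)
private String sourceDir;

@Parameter(names = "--saveDir", description = "Directory to write the preprocessed file batches to", required = true)
private String saveDir;

@Parameter(names = "--batchSize", description = "Number of source files per batch")
private int batchSize = 32;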
 
Example #3
Source File: ParameterAveragingTrainingMaster.java    From deeplearning4j with Apache License 2.0
protected void doIterationPDS_MDS(SparkComputationGraph graph, JavaRDD<PortableDataStream> split, int splitNum,
                int numSplits) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, averagingFreq={}, Configured for {} workers",
                    splitNum, numSplits, batchSizePerWorker, averagingFrequency, numWorkers);
    if (collectTrainingStats)
        stats.logMapPartitionsStart();

    JavaRDD<PortableDataStream> splitData = split;
    if (collectTrainingStats)
        stats.logRepartitionStart();
    splitData = SparkUtils.repartition(splitData, repartition, repartitionStrategy,
                    numObjectsEachWorker(rddDataSetNumExamples), numWorkers);
    int nPartitions = splitData.partitions().size();
    if (collectTrainingStats && repartition != Repartition.Never)
        stats.logRepartitionEnd();

    FlatMapFunction<Iterator<PortableDataStream>, ParameterAveragingTrainingResult> function =
                    new ExecuteWorkerPDSMDSFlatMap<>(getWorkerInstance(graph));

    JavaRDD<ParameterAveragingTrainingResult> result = splitData.mapPartitions(function);
    processResults(null, graph, result, splitNum, numSplits);

    if (collectTrainingStats)
        stats.logMapPartitionsEnd(nPartitions);
}
 
Example #4
Source File: ParameterAveragingTrainingMaster.java    From deeplearning4j with Apache License 2.0
protected void doIteration(SparkDl4jMultiLayer network, JavaRDD<DataSet> split, int splitNum, int numSplits) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, averagingFreq={}, Configured for {} workers",
                    splitNum, numSplits, batchSizePerWorker, averagingFrequency, numWorkers);
    if (collectTrainingStats)
        stats.logMapPartitionsStart();

    JavaRDD<DataSet> splitData = split;
    if (collectTrainingStats)
        stats.logRepartitionStart();
    splitData = SparkUtils.repartition(splitData, repartition, repartitionStrategy,
                    numObjectsEachWorker(rddDataSetNumExamples), numWorkers);
    int nPartitions = splitData.partitions().size();
    if (collectTrainingStats && repartition != Repartition.Never)
        stats.logRepartitionEnd();


    FlatMapFunction<Iterator<DataSet>, ParameterAveragingTrainingResult> function =
                    new ExecuteWorkerFlatMap<>(getWorkerInstance(network));
    JavaRDD<ParameterAveragingTrainingResult> result = splitData.mapPartitions(function);
    processResults(network, null, result, splitNum, numSplits);

    if (collectTrainingStats)
        stats.logMapPartitionsEnd(nPartitions);
}
 
Example #5
Source File: SparkDl4jMultiLayer.java    From deeplearning4j with Apache License 2.0
/**
 * Fit the SparkDl4jMultiLayer network using a directory of serialized DataSet objects.
 * The assumption here is that the directory contains a number of {@link DataSet} objects, each serialized using
 * {@link DataSet#save(OutputStream)}.
 *
 * @param path Path to the directory containing the serialized DataSet objects
 * @return The MultiLayerNetwork after training
 */
public MultiLayerNetwork fit(String path) {
    if (Nd4j.getExecutioner() instanceof GridExecutioner)
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();

    JavaRDD<String> paths;
    try {
        paths = SparkUtils.listPaths(sc, path);
    } catch (IOException e) {
        throw new RuntimeException("Error listing paths in directory", e);
    }

    return fitPaths(paths);
}
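
For fit(path) to succeed, the directory must contain DataSet objects serialized with DataSet.save, as the Javadoc above notes. A minimal sketch of producing such a directory from a DataSetIterator (the local directory and file-naming scheme are assumptions):

import java.io.File;
import org.nd4j.linalg.dataset.DataSet;
import org.nd4j.linalg.dataset.api.iterator.DataSetIterator;

public static void saveDataSets(DataSetIterator iter, File dir) {
    int i = 0;
    while (iter.hasNext()) {
        DataSet ds = iter.next();
        // DataSet#save(File) wraps DataSet#save(OutputStream)
        ds.save(new File(dir, "dataset_" + (i++) + ".bin"));
    }
}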
 
Example #6
Source File: SparkComputationGraph.java    From deeplearning4j with Apache License 2.0
protected IEvaluation[] doEvaluation(JavaRDD<String> data, int evalNumWorkers, int evalBatchSize, DataSetLoader loader, MultiDataSetLoader mdsLoader, IEvaluation... emptyEvaluations){
    Preconditions.checkArgument(evalNumWorkers > 0, "Invalid number of evaluation workers: require at least 1 - got %s", evalNumWorkers);
    IEvaluateMDSPathsFlatMapFunction evalFn = new IEvaluateMDSPathsFlatMapFunction(sc.broadcast(conf.toJson()),
            SparkUtils.asByteArrayBroadcast(sc, network.params()), evalNumWorkers, evalBatchSize, loader, mdsLoader,
            BroadcastHadoopConfigHolder.get(sc), emptyEvaluations);
    JavaRDD<IEvaluation[]> evaluations = data.mapPartitions(evalFn);
    return evaluations.treeAggregate(null, new IEvaluateAggregateFunction<>(), new IEvaluateAggregateFunction<>());
}
 
Example #7
Source File: SparkComputationGraph.java    From deeplearning4j with Apache License 2.0
public <T extends IEvaluation> T[] doEvaluationMDS(JavaRDD<MultiDataSet> data, int evalNumWorkers, int evalBatchSize, T... emptyEvaluations) {
    Preconditions.checkArgument(evalNumWorkers > 0, "Invalid number of evaluation workers: require at least 1 - got %s", evalNumWorkers);
    IEvaluateMDSFlatMapFunction<T> evalFn = new IEvaluateMDSFlatMapFunction<>(sc.broadcast(conf.toJson()),
                    SparkUtils.asByteArrayBroadcast(sc, network.params()), evalNumWorkers, evalBatchSize, emptyEvaluations);
    JavaRDD<T[]> evaluations = data.mapPartitions(evalFn);
    return evaluations.treeAggregate(null, new IEvaluateAggregateFunction<T>(),
                    new IEvaluateAggregateFunction<T>());
}
 
Example #8
Source File: SparkComputationGraph.java    From deeplearning4j with Apache License 2.0
/**
 * Evaluate the single-output network on a directory containing a set of MultiDataSet objects to be loaded with a {@link MultiDataSetLoader}.
 * Uses default batch size of {@link #DEFAULT_EVAL_SCORE_BATCH_SIZE}
 * @param path Path/URI to the directory containing the datasets to load
 * @return Evaluation
 */
public Evaluation evaluate(String path, MultiDataSetLoader loader){
    JavaRDD<String> data;
    try {
        data = SparkUtils.listPaths(sc, path);
    } catch (IOException e){
        throw new RuntimeException("Error listing files for evaluation of files at path: " + path, e);
    }
    return (Evaluation) doEvaluation(data, DEFAULT_EVAL_WORKERS, DEFAULT_EVAL_SCORE_BATCH_SIZE, null, loader, new Evaluation())[0];
}
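
A typical call, assuming the directory holds MultiDataSet objects serialized with MultiDataSet.save, and that SerializedMultiDataSetLoader (an off-the-shelf loader assumed to be present in your deeplearning4j-spark version; any MultiDataSetLoader that deserializes your files works) is used:

// sparkGraph is the SparkComputationGraph instance
Evaluation eval = sparkGraph.evaluate("hdfs:///data/eval", new SerializedMultiDataSetLoader());
System.out.println(eval.stats());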
 
Example #9
Source File: SparkComputationGraph.java    From deeplearning4j with Apache License 2.0
/**
 * Evaluate the single-output network on a directory containing a set of DataSet objects to be loaded with a {@link DataSetLoader}.
 * Uses default batch size of {@link #DEFAULT_EVAL_SCORE_BATCH_SIZE}
 * @param path Path/URI to the directory containing the datasets to load
 * @return Evaluation
 */
public Evaluation evaluate(String path, DataSetLoader loader){
    JavaRDD<String> data;
    try {
        data = SparkUtils.listPaths(sc, path);
    } catch (IOException e){
        throw new RuntimeException("Error listing files for evaluation of files at path: " + path, e);
    }
    return (Evaluation) doEvaluation(data, DEFAULT_EVAL_WORKERS, DEFAULT_EVAL_SCORE_BATCH_SIZE, loader, (MultiDataSetLoader)null, new Evaluation())[0];
}
 
Example #10
Source File: SparkComputationGraph.java    From deeplearning4j with Apache License 2.0
/**
 * Fit the SparkComputationGraph network using a directory of serialized MultiDataSet objects.
 * The assumption here is that the directory contains a number of serialized {@link MultiDataSet} objects.
 *
 * @param path Path to the directory containing the serialized MultiDataSet objects
 * @return The ComputationGraph after training
 */
public ComputationGraph fitMultiDataSet(String path) {
    if (Nd4j.getExecutioner() instanceof GridExecutioner)
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();

    JavaRDD<String> paths;
    try {
        paths = SparkUtils.listPaths(sc, path);
    } catch (IOException e) {
        throw new RuntimeException("Error listing paths in directory", e);
    }

    return fitPathsMultiDataSet(paths);
}
 
Example #11
Source File: SparkComputationGraph.java    From deeplearning4j with Apache License 2.0
/**
 * Fit the SparkComputationGraph network using a directory of serialized DataSet objects.
 * The assumption here is that the directory contains a number of {@link DataSet} objects, each serialized using
 * {@link DataSet#save(OutputStream)}.
 *
 * @param path Path to the directory containing the serialized DataSet objects
 * @return The ComputationGraph after training
 */
public ComputationGraph fit(String path) {
    if (Nd4j.getExecutioner() instanceof GridExecutioner)
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueue();

    JavaRDD<String> paths;
    try {
        paths = SparkUtils.listPaths(sc, path);
    } catch (IOException e) {
        throw new RuntimeException("Error listing paths in directory", e);
    }

    return fitPaths(paths);
}
 
Example #12
Source File: SparkComputationGraph.java    From deeplearning4j with Apache License 2.0
public SparkComputationGraph(JavaSparkContext sparkContext, ComputationGraphConfiguration conf,
                TrainingMaster trainingMaster) {
    sc = sparkContext;
    this.trainingMaster = trainingMaster;
    this.conf = conf.clone();
    this.network = new ComputationGraph(conf);
    this.network.init();

    //Check if kryo configuration is correct:
    SparkUtils.checkKryoConfiguration(sparkContext, log);
}
 
Example #13
Source File: SparkComputationGraph.java    From deeplearning4j with Apache License 2.0
public SparkComputationGraph(JavaSparkContext javaSparkContext, ComputationGraph network,
                TrainingMaster trainingMaster) {
    sc = javaSparkContext;
    this.trainingMaster = trainingMaster;
    this.conf = network.getConfiguration().clone();
    this.network = network;
    this.network.init();

    //Check if kryo configuration is correct:
    SparkUtils.checkKryoConfiguration(javaSparkContext, log);
}
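
Both constructors end by calling SparkUtils.checkKryoConfiguration, which logs a warning if Kryo serialization is enabled without the ND4J registrator (INDArrays do not round-trip correctly under vanilla Kryo). A sketch of the configuration the check expects, assuming the matching nd4j-kryo artifact is on the classpath:

SparkConf conf = new SparkConf();
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
// Register ND4J types so INDArrays serialize correctly under Kryo
conf.set("spark.kryo.registrator", "org.nd4j.kryo.Nd4jRegistrator");
JavaSparkContext sc = new JavaSparkContext(conf);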
 
Example #14
Source File: SparkDl4jMultiLayer.java    From deeplearning4j with Apache License 2.0
protected IEvaluation[] doEvaluation(JavaRDD<String> data, int evalNumWorkers, int evalBatchSize, DataSetLoader loader, MultiDataSetLoader mdsLoader, IEvaluation... emptyEvaluations){
    Preconditions.checkArgument(evalNumWorkers > 0, "Invalid number of evaluation workers: require at least 1 - got %s", evalNumWorkers);
    IEvaluateMDSPathsFlatMapFunction evalFn = new IEvaluateMDSPathsFlatMapFunction(sc.broadcast(conf.toJson()),
            SparkUtils.asByteArrayBroadcast(sc, network.params()), evalNumWorkers, evalBatchSize, loader, mdsLoader,
            BroadcastHadoopConfigHolder.get(sc), emptyEvaluations);
    JavaRDD<IEvaluation[]> evaluations = data.mapPartitions(evalFn);
    return evaluations.treeAggregate(null, new IEvaluateAggregateFunction<>(), new IEvaluateAggregateFunction<>());
}
 
Example #15
Source File: SparkDl4jMultiLayer.java    From deeplearning4j with Apache License 2.0
/**
 * Evaluate on a directory containing a set of DataSet objects to be loaded with a {@link DataSetLoader}.
 *
 * @param path      Path/URI to the directory containing the datasets to load
 * @param batchSize Evaluation batch size
 * @param loader    Loader used to deserialize each saved DataSet
 * @return Evaluation
 */
public <T extends Evaluation> T evaluate(String path, int batchSize, DataSetLoader loader){
    JavaRDD<String> paths;
    try {
        paths = SparkUtils.listPaths(sc, path);
    } catch (IOException e) {
        throw new RuntimeException("Error listing paths in directory", e);
    }

    JavaRDD<DataSet> rdd = paths.map(new LoadDataSetFunction(loader, new RemoteFileSourceFactory(BroadcastHadoopConfigHolder.get(sc))));
    return (T)doEvaluation(rdd, batchSize, new org.deeplearning4j.eval.Evaluation())[0];
}
 
Example #16
Source File: SharedTrainingMaster.java    From deeplearning4j with Apache License 2.0
protected void doIteration(SparkComputationGraph network, JavaRDD<DataSet> data, int splitNum, int numSplits) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, thresholdAlgorithm={}, Configured for {} workers",
                    splitNum, numSplits, batchSizePerWorker, thresholdAlgorithm, numWorkers);

    if (collectTrainingStats)
        stats.logMapPartitionsStart();

    if (collectTrainingStats)
        stats.logRepartitionStart();

    if(repartitioner != null){
        log.info("Repartitioning training data using repartitioner: {}", repartitioner);
        int minPerWorker = Math.max(1, batchSizePerWorker/rddDataSetNumExamples);
        data = repartitioner.repartition(data, minPerWorker, numWorkers);
    } else {
        log.info("Repartitioning training data using SparkUtils repartitioner");
        data = SparkUtils.repartitionEqually(data, repartition, numWorkers);
    }
    int nPartitions = data.partitions().size();

    if (collectTrainingStats && repartition != Repartition.Never)
        stats.logRepartitionEnd();


    FlatMapFunction<Iterator<DataSet>, SharedTrainingResult> function =
                    new SharedFlatMapDataSet<>(getWorkerInstance(network));

    JavaRDD<SharedTrainingResult> result = data.mapPartitions(function);

    processResults(null, network, result);

    if (collectTrainingStats)
        stats.logMapPartitionsEnd(nPartitions);
}
 
Example #17
Source File: SparkDl4jMultiLayer.java    From deeplearning4j with Apache License 2.0
public SparkDl4jMultiLayer(JavaSparkContext javaSparkContext, MultiLayerNetwork network,
                TrainingMaster<?, ?> trainingMaster) {
    sc = javaSparkContext;
    this.conf = network.getLayerWiseConfigurations().clone();
    this.network = network;
    if (!network.isInitCalled())
        network.init();
    this.trainingMaster = trainingMaster;

    //Check if kryo configuration is correct:
    SparkUtils.checkKryoConfiguration(javaSparkContext, log);
}
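
This constructor is typically the last step in wiring up distributed training: build a TrainingMaster, wrap the network, then fit. A sketch using ParameterAveragingTrainingMaster (the numbers are illustrative, and trainingData is an assumed JavaRDD<DataSet>):

TrainingMaster tm = new ParameterAveragingTrainingMaster.Builder(32)   // examples per DataSet object in the RDD
                .batchSizePerWorker(32)     // minibatch size each worker trains on
                .averagingFrequency(5)      // average parameters every 5 minibatches
                .build();

SparkDl4jMultiLayer sparkNet = new SparkDl4jMultiLayer(sc, network, tm);
sparkNet.fit(trainingData);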
 
Example #18
Source File: SharedTrainingMaster.java    From deeplearning4j with Apache License 2.0
protected void doIteration(SparkDl4jMultiLayer network, JavaRDD<DataSet> split, int splitNum, int numSplits) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, thresholdAlgorithm={}, Configured for {} workers",
                    splitNum, numSplits, batchSizePerWorker, thresholdAlgorithm, numWorkers);

    if (collectTrainingStats)
        stats.logMapPartitionsStart();

    JavaRDD<DataSet> splitData = split;

    if (collectTrainingStats)
        stats.logRepartitionStart();

    if(repartitioner != null){
        log.info("Repartitioning training data using repartitioner: {}", repartitioner);
        int minPerWorker = Math.max(1, batchSizePerWorker/rddDataSetNumExamples);
        splitData = repartitioner.repartition(splitData, minPerWorker, numWorkers);
    } else {
        log.info("Repartitioning training data using SparkUtils repartitioner");
        splitData = SparkUtils.repartitionEqually(splitData, repartition, numWorkers);
    }
    int nPartitions = splitData.partitions().size();

    if (collectTrainingStats && repartition != Repartition.Never)
        stats.logRepartitionEnd();


    FlatMapFunction<Iterator<DataSet>, SharedTrainingResult> function =
                    new SharedFlatMapDataSet<>(getWorkerInstance(network));

    JavaRDD<SharedTrainingResult> result = splitData.mapPartitions(function);

    processResults(network, null, result);

    if (collectTrainingStats)
        stats.logMapPartitionsEnd(nPartitions);
}
 
Example #19
Source File: ParameterAveragingTrainingMaster.java    From deeplearning4j with Apache License 2.0
protected void doIterationPaths(SparkDl4jMultiLayer network, SparkComputationGraph graph, JavaRDD<String> split,
                int splitNum, int numSplits, int dataSetObjectNumExamples, DataSetLoader dsLoader, MultiDataSetLoader mdsLoader) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, averagingFreq={}, Configured for {} workers",
                    splitNum, numSplits, batchSizePerWorker, averagingFrequency, numWorkers);
    if (collectTrainingStats)
        stats.logMapPartitionsStart();

    JavaRDD<String> splitData = split;
    if (collectTrainingStats)
        stats.logRepartitionStart();
    splitData = SparkUtils.repartition(splitData, repartition, repartitionStrategy,
                    numObjectsEachWorker(dataSetObjectNumExamples), numWorkers);
    int nPartitions = splitData.partitions().size();
    if (collectTrainingStats && repartition != Repartition.Never)
        stats.logRepartitionEnd();

    JavaSparkContext sc = (network != null ? network.getSparkContext() : graph.getSparkContext());
    FlatMapFunction<Iterator<String>, ParameterAveragingTrainingResult> function;
    if (network != null) {
        if(dsLoader != null){
            function = new ExecuteWorkerPathFlatMap<>(getWorkerInstance(network), dsLoader, BroadcastHadoopConfigHolder.get(sc));
        } else {
            function = new ExecuteWorkerPathMDSFlatMap<>(getWorkerInstance(network), mdsLoader, BroadcastHadoopConfigHolder.get(sc));
        }
    } else {
        if(dsLoader != null){
            function = new ExecuteWorkerPathFlatMap<>(getWorkerInstance(graph), dsLoader, BroadcastHadoopConfigHolder.get(sc));
        } else {
            function = new ExecuteWorkerPathMDSFlatMap<>(getWorkerInstance(graph), mdsLoader, BroadcastHadoopConfigHolder.get(sc));
        }
    }

    JavaRDD<ParameterAveragingTrainingResult> result = splitData.mapPartitions(function);
    processResults(network, graph, result, splitNum, numSplits);

    if (collectTrainingStats)
        stats.logMapPartitionsEnd(nPartitions);
}
 
Example #20
Source File: ParameterAveragingTrainingMaster.java    From deeplearning4j with Apache License 2.0
@Deprecated
protected void doIterationPDS(SparkDl4jMultiLayer network, SparkComputationGraph graph,
                JavaRDD<PortableDataStream> split, int splitNum, int numSplits) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, averagingFreq={}, Configured for {} workers",
                    splitNum, numSplits, batchSizePerWorker, averagingFrequency, numWorkers);
    if (collectTrainingStats)
        stats.logMapPartitionsStart();

    JavaRDD<PortableDataStream> splitData = split;
    if (collectTrainingStats)
        stats.logRepartitionStart();
    splitData = SparkUtils.repartition(splitData, repartition, repartitionStrategy,
                    numObjectsEachWorker(rddDataSetNumExamples), numWorkers);
    int nPartitions = splitData.partitions().size();
    if (collectTrainingStats && repartition != Repartition.Never)
        stats.logRepartitionEnd();

    FlatMapFunction<Iterator<PortableDataStream>, ParameterAveragingTrainingResult> function;
    if (network != null)
        function = new ExecuteWorkerPDSFlatMap<>(getWorkerInstance(network));
    else
        function = new ExecuteWorkerPDSFlatMap<>(getWorkerInstance(graph));

    JavaRDD<ParameterAveragingTrainingResult> result = splitData.mapPartitions(function);
    processResults(network, graph, result, splitNum, numSplits);

    if (collectTrainingStats)
        stats.logMapPartitionsEnd(nPartitions);
}
 
Example #21
Source File: SharedTrainingMaster.java    From deeplearning4j with Apache License 2.0
protected void doIterationMDS(SparkComputationGraph network, JavaRDD<MultiDataSet> split, int splitNum,
                int numSplits) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, thresholdAlgorithm={}, Configured for {} workers",
                    splitNum, numSplits, batchSizePerWorker, thresholdAlgorithm, numWorkers);

    if (collectTrainingStats)
        stats.logMapPartitionsStart();

    JavaRDD<MultiDataSet> splitData = split;

    if (collectTrainingStats)
        stats.logRepartitionStart();

    if(repartitioner != null){
        log.info("Repartitioning training data using repartitioner: {}", repartitioner);
        int minPerWorker = Math.max(1, batchSizePerWorker/rddDataSetNumExamples);
        splitData = repartitioner.repartition(splitData, minPerWorker, numWorkers);
    } else {
        log.info("Repartitioning training data using SparkUtils repartitioner");
        splitData = SparkUtils.repartitionEqually(splitData, repartition, numWorkers);
    }
    int nPartitions = splitData.partitions().size();

    if (collectTrainingStats && repartition != Repartition.Never)
        stats.logRepartitionEnd();


    FlatMapFunction<Iterator<MultiDataSet>, SharedTrainingResult> function =
                    new SharedFlatMapMultiDataSet<>(getWorkerInstance(network));

    JavaRDD<SharedTrainingResult> result = splitData.mapPartitions(function);

    processResults(null, network, result);

    if (collectTrainingStats)
        stats.logMapPartitionsEnd(nPartitions);
}
 
Example #22
Source File: ParameterAveragingTrainingMaster.java    From deeplearning4j with Apache License 2.0
protected <T> JavaRDD<T>[] getSplitRDDs(JavaRDD<T> trainingData, int totalDataSetObjectCount,
                int examplesPerDataSetObject) {
    int dataSetObjectsPerSplit = getNumDataSetObjectsPerSplit(examplesPerDataSetObject);

    if (collectTrainingStats)
        stats.logSplitStart();
    JavaRDD<T>[] splits = SparkUtils.balancedRandomSplit(totalDataSetObjectCount, dataSetObjectsPerSplit,
                    trainingData, rng.nextLong());
    if (collectTrainingStats)
        stats.logSplitEnd();
    return splits;
}
 
Example #23
Source File: ParameterAveragingTrainingMaster.java    From deeplearning4j with Apache License 2.0
protected <T, Repr> JavaPairRDD<T, Repr>[] getSplitRDDs(JavaPairRDD<T, Repr> trainingData,
                int totalDataSetObjectCount) {
    int dataSetObjectsPerSplit = getNumDataSetObjectsPerSplit(rddDataSetNumExamples);

    if (collectTrainingStats)
        stats.logSplitStart();
    JavaPairRDD<T, Repr>[] splits = SparkUtils.balancedRandomSplit(totalDataSetObjectCount, dataSetObjectsPerSplit,
                    trainingData, rng.nextLong());
    if (collectTrainingStats)
        stats.logSplitEnd();
    return splits;
}
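
Both overloads delegate the actual splitting to SparkUtils.balancedRandomSplit, which is designed to produce random splits of near-equal size (plain RDD.randomSplit gives only approximate split sizes). A usage sketch with illustrative counts and seed:

// Split ~10,000 DataSet objects into splits of ~2,000 each, reproducibly via the seed
JavaRDD<DataSet>[] splits = SparkUtils.balancedRandomSplit(10000, 2000, trainingData, 12345L);
for (JavaRDD<DataSet> split : splits) {
    // each split becomes one training "iteration" in the TrainingMaster
}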
 
Example #24
Source File: SparkDataValidation.java    From deeplearning4j with Apache License 2.0
protected static ValidationResult validateMultiDataSets(JavaSparkContext sc, String path, boolean recursive, boolean deleteInvalid,
                                                 int numFeatureArrays, int numLabelArrays,
                                                 List<int[]> featuresShape, List<int[]> labelsShape) {
    JavaRDD<String> paths;
    try {
        paths = SparkUtils.listPaths(sc, path, recursive);
    } catch (IOException e) {
        throw new RuntimeException("Error listing paths in directory", e);
    }

    JavaRDD<ValidationResult> results = paths.map(new ValidateMultiDataSetFn(deleteInvalid, numFeatureArrays, numLabelArrays,
            featuresShape, labelsShape));

    return results.reduce(new ValidationResultReduceFn());
}
 
Example #25
Source File: SparkDataValidation.java    From deeplearning4j with Apache License 2.0
protected static ValidationResult validateDataSets(JavaSparkContext sc, String path, boolean recursive, boolean deleteInvalid,
                                            int[] featuresShape, int[] labelsShape) {
    JavaRDD<String> paths;
    try {
        paths = SparkUtils.listPaths(sc, path, recursive);
    } catch (IOException e) {
        throw new RuntimeException("Error listing paths in directory", e);
    }

    JavaRDD<ValidationResult> results = paths.map(new ValidateDataSetFn(deleteInvalid, featuresShape, labelsShape));

    return results.reduce(new ValidationResultReduceFn());
}
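
These protected helpers back the public validation entry points in SparkDataValidation. A typical call (this simpler public overload, taking just a context and a path, is an assumption; the path is illustrative):

ValidationResult result = SparkDataValidation.validateDataSets(sc, "hdfs:///data/train");
System.out.println(result);  // counts of valid and invalid files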
 
Example #26
Source File: StatsUtils.java    From deeplearning4j with Apache License 2.0
public static void exportStats(List<EventStats> list, String outputPath, String delimiter, SparkContext sc)
                throws IOException {
    StringBuilder sb = new StringBuilder();
    boolean first = true;
    for (EventStats e : list) {
        if (first)
            sb.append(e.getStringHeader(delimiter)).append("\n");
        sb.append(e.asString(delimiter)).append("\n");
        first = false;
    }
    SparkUtils.writeStringToFile(outputPath, sb.toString(), sc);
}
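
exportStats is normally reached through the training statistics a TrainingMaster collects. A sketch of the surrounding workflow, following the DL4J training-stats documentation (tm and sparkNet are assumed to be an already-configured TrainingMaster and SparkDl4jMultiLayer):

tm.setCollectTrainingStats(true);        // enable collection before training
sparkNet.fit(trainingData);
SparkTrainingStats stats = sparkNet.getSparkTrainingStats();
StatsUtils.exportStatsAsHtml(stats, "SparkStats.html", sc);   // interactive HTML timeline/charts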
 
Example #27
Source File: SharedTrainingMaster.java    From deeplearning4j with Apache License 2.0
protected void doIterationPaths(SparkDl4jMultiLayer network, SparkComputationGraph graph, JavaRDD<String> data,
                int splitNum, int numSplits, DataSetLoader dsLoader, MultiDataSetLoader mdsLoader, int dataSetObjectNumExamples) {
    if (network == null && graph == null)
        throw new DL4JInvalidConfigException("Both MLN & CompGraph are NULL");

    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, thresholdAlgorithm={}, Configured for {} workers",
                    splitNum, numSplits, batchSizePerWorker, thresholdAlgorithm, numWorkers);

    if (collectTrainingStats)
        stats.logMapPartitionsStart();

    if (collectTrainingStats)
        stats.logRepartitionStart();

    if(repartitioner != null){
        log.info("Repartitioning training data using repartitioner: {}", repartitioner);
        int minPerWorker = Math.max(1, batchSizePerWorker/dataSetObjectNumExamples);
        data = repartitioner.repartition(data, minPerWorker, numWorkers);
    } else {
        log.info("Repartitioning training data using SparkUtils repartitioner");
        data = SparkUtils.repartitionEqually(data, repartition, numWorkers);
    }

    int nPartitions = data.partitions().size();
    if (collectTrainingStats && repartition != Repartition.Never)
        stats.logRepartitionEnd();

    JavaSparkContext sc = (network != null ? network.getSparkContext() : graph.getSparkContext());
    FlatMapFunction<Iterator<String>, SharedTrainingResult> function;
    if(dsLoader != null){
        function = new SharedFlatMapPaths<>(
                network != null ? getWorkerInstance(network) : getWorkerInstance(graph), dsLoader, BroadcastHadoopConfigHolder.get(sc));
    } else {
        function = new SharedFlatMapPathsMDS<>(
                network != null ? getWorkerInstance(network) : getWorkerInstance(graph), mdsLoader, BroadcastHadoopConfigHolder.get(sc));
    }


    JavaRDD<SharedTrainingResult> result = data.mapPartitions(function);

    processResults(network, graph, result);

    if (collectTrainingStats)
        stats.logMapPartitionsEnd(nPartitions);
}
 
Example #28
Source File: SparkDl4jMultiLayer.java    From deeplearning4j with Apache License 2.0
/**
 * Perform distributed evaluation of any type of {@link IEvaluation} - or multiple IEvaluation instances.
 * Distributed equivalent of {@link MultiLayerNetwork#doEvaluation(DataSetIterator, IEvaluation[])}
 *
 * @param data             Data to evaluate on
 * @param emptyEvaluations Empty evaluation instances. Starting point (serialized/duplicated, then merged)
 * @param evalNumWorkers   Number of workers (copies of the MultiLayerNetwork model) to use. Generally this should
 *                         be smaller than the number of threads - 2 to 4 is often good enough. If using CUDA GPUs,
 *                         this should ideally be set to the number of GPUs on each node (i.e., 1 for a single GPU node)
 * @param evalBatchSize    Evaluation batch size
 * @param <T>              Type of evaluation instance to return
 * @return IEvaluation instances
 */
public <T extends IEvaluation> T[] doEvaluation(JavaRDD<DataSet> data, int evalNumWorkers, int evalBatchSize, T... emptyEvaluations) {
    IEvaluateFlatMapFunction<T> evalFn = new IEvaluateFlatMapFunction<>(false, sc.broadcast(conf.toJson()),
                    SparkUtils.asByteArrayBroadcast(sc, network.params()), evalNumWorkers, evalBatchSize, emptyEvaluations);
    JavaRDD<T[]> evaluations = data.mapPartitions(evalFn);
    return evaluations.treeAggregate(null, new IEvaluateAggregateFunction<T>(), new IEvaluationReduceFunction<T>());
}
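
A typical call, evaluating with 4 worker copies of the network and evaluation minibatches of 64 (testData is an assumed JavaRDD<DataSet>):

Evaluation eval = sparkNet.doEvaluation(testData, 4, 64, new Evaluation())[0];
System.out.println(eval.stats());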
 
Example #29
Source File: SparkComputationGraph.java    From deeplearning4j with Apache License 2.0
/**
 * Perform distributed evaluation on a <i>single output</i> ComputationGraph from DataSet objects using Spark.
 * Can be used to perform multiple evaluations on this single output (for example, {@link Evaluation} and
 * {@link ROC}) at the same time.<br>
 *
 * @param data             Data to evaluate
 * @param evalNumWorkers   Number of worker threads (per machine) to use for evaluation. You may want this to be less than
 *                         the number of Spark threads per machine/JVM to reduce memory requirements
 * @param evalBatchSize    Minibatch size for evaluation
 * @param emptyEvaluations Evaluations to perform
 * @return                 Evaluations
 */
public <T extends IEvaluation> T[] doEvaluation(JavaRDD<DataSet> data, int evalNumWorkers, int evalBatchSize, T... emptyEvaluations) {
    IEvaluateFlatMapFunction<T> evalFn = new IEvaluateFlatMapFunction<>(true, sc.broadcast(conf.toJson()),
            SparkUtils.asByteArrayBroadcast(sc, network.params()), evalNumWorkers, evalBatchSize, emptyEvaluations);
    JavaRDD<T[]> evaluations = data.mapPartitions(evalFn);
    return evaluations.treeAggregate(null, new IEvaluateAggregateFunction<T>(),
                    new IEvaluateAggregateFunction<T>());
}