Java Code Examples for org.apache.spark.api.java.JavaRDD#count()

The following examples show how to use org.apache.spark.api.java.JavaRDD#count(). You can go to the original project or source file by following the link above each example.
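Before the project examples, here is a minimal, self-contained sketch of the API: count() is an action, so it triggers evaluation of the RDD and returns the number of elements as a long. The class name, application name, and local[*] master below are placeholders rather than part of any of the projects that follow.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class CountExample {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("CountExample").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> words = sc.parallelize(Arrays.asList("spark", "rdd", "count"));
        long total = words.count();                                        // 3
        long startingWithS = words.filter(w -> w.startsWith("s")).count(); // 1

        System.out.println("total=" + total + ", startingWithS=" + startingWithS);
        sc.stop();
    }
}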
Example 1
Source File: LogError.java    From sparkResearch with Apache License 2.0
/**
 * Apply a transformation and an action to the log RDD.
 */
public void log(JavaSparkContext sparkContext) {
    JavaRDD<String> inputRDD = sparkContext.textFile("/usr/local/log");
    JavaRDD<String> errorRDD = inputRDD.filter(new Function<String, Boolean>() {
        @Override
        public Boolean call(String v1) throws Exception {
            // Keep only the lines that contain an error marker.
            return v1.contains("error");
        }
    });

    long errorRDDCount = errorRDD.count();
    System.out.println("errorRDD count is " + errorRDDCount);
    for (String rddLine : errorRDD.take(10)) {
        System.out.println("errorRDD 数据is " + rddLine);
    }
}
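For comparison, the filter-and-count pattern from Example 1 can be written more compactly with a Java 8 lambda. This is a sketch that keeps the same input path and assumes log lines mark failures with the literal string "error":

public void log(JavaSparkContext sparkContext) {
    JavaRDD<String> inputRDD = sparkContext.textFile("/usr/local/log");
    // Keep only the lines that contain an error marker, then count and sample them.
    JavaRDD<String> errorRDD = inputRDD.filter(line -> line.contains("error"));
    System.out.println("errorRDD count is " + errorRDD.count());
    errorRDD.take(10).forEach(line -> System.out.println("errorRDD data is " + line));
}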
 
Example 2
Source File: TestSuite.java    From stocator with Apache License 2.0
public void test4(SparkSession spark, String outText1) throws Exception {
  try {
    System.out.println("*********************************");
    System.out.println("T4: Create collection and store it as text file in " + outText1);
    List<Integer> data = Arrays.asList(1, 2, 3, 4, 5);
    JavaRDD<Integer> distData = new JavaSparkContext(spark.sparkContext()).parallelize(data);
    distData.saveAsTextFile(outText1);
    JavaRDD<String> txtRes = spark.read().textFile(outText1).javaRDD();
    long baseCount = txtRes.count();
    countAndCompare(baseCount, distData.count(), "T4", baseCount);
  } catch (Exception e) {
    throw e;
  } finally {
    deleteData(outText1, spark.sparkContext().hadoopConfiguration(), true);
  }

}
 
Example 3
Source File: SparkTableChecker.java    From spliceengine with GNU Affero General Public License v3.0
/**
 * Check the index for duplicate entries and report (or optionally fix) them.
 * @param table the base table data set
 * @param index the index data set to check for duplicates
 * @return messages describing the duplicate index entries that were found
 * @throws StandardException if the check cannot be completed
 */
private List<String> checkDuplicateIndexes(PairDataSet table, PairDataSet index) throws StandardException {
    try {
        SpliceSpark.pushScope(String.format("Check duplicates in index %s.%s", schemaName, indexName));
        JavaPairRDD duplicateIndexRdd = ((SparkPairDataSet) index).rdd
                .combineByKey(new CreateCombiner(), new MergeValue(), new MergeCombiners())
                .filter(new DuplicateIndexFilter());

        JavaPairRDD joinedRdd = duplicateIndexRdd
                .join(((SparkPairDataSet) table).rdd);

        JavaRDD duplicateIndex = joinedRdd
                .mapPartitions(new SparkFlatMapFunction<>(new DeleteDuplicateIndexFunction<>(conglomerate, txn, tentativeIndex, baseColumnMap, fix)));

        Iterator it = duplicateIndex.toLocalIterator();
        long count = duplicateIndex.count();
        return reportDuplicateIndexes(it, count, fix);
    } catch (Exception e) {
        throw StandardException.plainWrapException(e);
    } finally {
        SpliceSpark.popScope();
    }
}
 
Example 4
Source File: RDDConverterUtils.java    From systemds with Apache License 2.0
public static JavaPairRDD<MatrixIndexes, MatrixBlock> csvToBinaryBlock(JavaSparkContext sc,
		JavaPairRDD<LongWritable, Text> input, DataCharacteristics mc,
		boolean hasHeader, String delim, boolean fill, double fillValue) {
	//determine unknown dimensions and sparsity if required
	//(w/ robustness for mistakenly counted header in nnz)
	if( !mc.dimsKnown(true) ) {
		LongAccumulator aNnz = sc.sc().longAccumulator("nnz");
		JavaRDD<String> tmp = input.values()
			.map(new CSVAnalysisFunction(aNnz, delim));
		long rlen = tmp.count() - (hasHeader ? 1 : 0);
		long clen = tmp.first().split(delim).length;
		long nnz = Math.min(rlen*clen, UtilFunctions.toLong(aNnz.value()));
		mc.set(rlen, clen, mc.getBlocksize(), nnz);
	}
	
	//prepare csv w/ row indexes (sorted by filenames)
	JavaPairRDD<Text,Long> prepinput = input.values()
		.zipWithIndex(); //zip row index
	
	//convert csv rdd to binary block rdd (w/ partial blocks)
	boolean sparse = requiresSparseAllocation(prepinput, mc);
	JavaPairRDD<MatrixIndexes, MatrixBlock> out = 
		prepinput.mapPartitionsToPair(new CSVToBinaryBlockFunction(
			mc, sparse, hasHeader, delim, fill, fillValue));
	
	//aggregate partial matrix blocks (w/ preferred number of output 
	//partitions as the data is likely smaller in binary block format,
	//but also to bound the size of partitions for compressed inputs)
	int parts = SparkUtils.getNumPreferredPartitions(mc, out);
	return RDDAggregateUtils.mergeByKey(out, parts, false); 
}
 
Example 5
Source File: RDDConverterUtils.java    From systemds with Apache License 2.0
public static JavaPairRDD<MatrixIndexes, MatrixBlock> dataFrameToBinaryBlock(JavaSparkContext sc,
	Dataset<Row> df, DataCharacteristics mc, boolean containsID, boolean isVector)
{
	//determine unknown dimensions and sparsity if required
	if( !mc.dimsKnown(true) ) {
		LongAccumulator aNnz = sc.sc().longAccumulator("nnz");
		JavaRDD<Row> tmp = df.javaRDD().map(new DataFrameAnalysisFunction(aNnz, containsID, isVector));
		long rlen = tmp.count();
		long clen = !isVector ? df.columns().length - (containsID?1:0) : 
				((Vector) tmp.first().get(containsID?1:0)).size();
		long nnz = UtilFunctions.toLong(aNnz.value());
		mc.set(rlen, clen, mc.getBlocksize(), nnz);
	}
	
	//ensure valid blocksizes
	if( mc.getBlocksize()<=1 )
		mc.setBlocksize(ConfigurationManager.getBlocksize());
	
	//construct or reuse row ids
	JavaPairRDD<Row, Long> prepinput = containsID ?
			df.javaRDD().mapToPair(new DataFrameExtractIDFunction(
				df.schema().fieldIndex(DF_ID_COLUMN))) :
			df.javaRDD().zipWithIndex(); //zip row index
	
	//convert row rdd to binary block rdd (w/ partial blocks)
	boolean sparse = requiresSparseAllocation(prepinput, mc);
	JavaPairRDD<MatrixIndexes, MatrixBlock> out = 
			prepinput.mapPartitionsToPair(
				new DataFrameToBinaryBlockFunction(mc, sparse, containsID, isVector));
	
	//aggregate partial matrix blocks (w/ preferred number of output 
	//partitions as the data is likely smaller in binary block format,
	//but also to bound the size of partitions for compressed inputs)
	int parts = SparkUtils.getNumPreferredPartitions(mc, out);
	return RDDAggregateUtils.mergeByKey(out, parts, false); 
}
 
Example 6
Source File: MockMLUpdate.java    From oryx with Apache License 2.0
@Override
public double evaluate(JavaSparkContext sparkContext,
                       PMML model,
                       Path modelParentPath,
                       JavaRDD<String> testData,
                       JavaRDD<String> trainData) {
  long testDataCount = testData.count();
  testCounts.add((int) testDataCount);
  log.info("Returning eval {}", testDataCount);
  return testDataCount;
}
 
Example 7
Source File: FrameRDDConverterUtils.java    From systemds with Apache License 2.0
public static JavaPairRDD<Long, FrameBlock> csvToBinaryBlock(JavaSparkContext sc,
	JavaPairRDD<LongWritable, Text> input, DataCharacteristics mc, ValueType[] schema,
	boolean hasHeader, String delim, boolean fill, double fillValue)
{
	//determine unknown dimensions and sparsity if required
	if( !mc.dimsKnown() ) { //nnz irrelevant here
		JavaRDD<String> tmp = input.values()
			.map(new TextToStringFunction());
		String tmpStr = tmp.first();
		boolean metaHeader = tmpStr.startsWith(TfUtils.TXMTD_MVPREFIX) 
				|| tmpStr.startsWith(TfUtils.TXMTD_NDPREFIX);
		tmpStr = (metaHeader) ? tmpStr.substring(tmpStr.indexOf(delim)+1) : tmpStr;
		long rlen = tmp.count() - (hasHeader ? 1 : 0) - (metaHeader ? 2 : 0);
		long clen = IOUtilFunctions.splitCSV(tmpStr, delim).length;
		mc.set(rlen, clen, mc.getBlocksize(), -1);
	}
	
	//prepare csv w/ row indexes (sorted by filenames)
	JavaPairRDD<Text,Long> prepinput = input.values()
			.zipWithIndex(); //zip row index
	
	//prepare default schema if needed
	if( schema == null || schema.length==1 )
		schema = UtilFunctions.nCopies((int)mc.getCols(), ValueType.STRING);

	//convert csv rdd to binary block rdd (w/ partial blocks)
	JavaPairRDD<Long, FrameBlock> out = prepinput.mapPartitionsToPair(
			new CSVToBinaryBlockFunction(mc, schema, hasHeader, delim));
	
	return out;
}
 
Example 8
Source File: ActionRDDTest.java    From hui-bigdata-spark with Apache License 2.0
/**
 * Count the number of elements in the RDD.
 *
 * @since hui_project 1.0.0
 */
@Test
public void testCount() {
    JavaRDD<String> stringJavaRDD = sparkContext.textFile(FILE_PATH);
    long count = stringJavaRDD.count();
    System.out.println(count);
}
 
Example 9
Source File: TextPipeline.java    From deeplearning4j with Apache License 2.0
public JavaRDD<Pair<List<String>, AtomicLong>> updateAndReturnAccumulatorVal(JavaRDD<List<String>> tokenizedRDD) {
    // Update the 2 accumulators
    UpdateWordFreqAccumulatorFunction accumulatorClassFunction =
                    new UpdateWordFreqAccumulatorFunction(stopWordBroadCast, wordFreqAcc);
    JavaRDD<Pair<List<String>, AtomicLong>> sentenceWordsCountRDD = tokenizedRDD.map(accumulatorClassFunction);

    // Force every element through the map above so the accumulators get updated;
    // count() does this job as an action (verified). See the sketch after this method.
    sentenceWordsCountRDD.count();

    return sentenceWordsCountRDD;
}
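The comment in updateAndReturnAccumulatorVal relies on count() being an action: Spark transformations are lazy, so the map that updates the accumulators only runs when an action such as count() is invoked. A minimal sketch of that pattern, assuming a JavaSparkContext named sc and an illustrative accumulator (org.apache.spark.util.LongAccumulator, Spark 2.x):

LongAccumulator seen = sc.sc().longAccumulator("seen");
JavaRDD<String> mapped = sc.parallelize(Arrays.asList("a", "b", "c"))
        .map(s -> { seen.add(1); return s.toUpperCase(); });
// Nothing has executed yet, so the accumulator still reads 0.
mapped.count();                 // the action forces the map to run
long processed = seen.value();  // 3 once the job completes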
 
Example 10
Source File: JavaNaiveBayesExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf sparkConf = new SparkConf().setAppName("JavaNaiveBayesExample");
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);
  // $example on$
  String path = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> inputData = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD();
  JavaRDD<LabeledPoint>[] tmp = inputData.randomSplit(new double[]{0.6, 0.4});
  JavaRDD<LabeledPoint> training = tmp[0]; // training set
  JavaRDD<LabeledPoint> test = tmp[1]; // test set
  final NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0);
  JavaPairRDD<Double, Double> predictionAndLabel =
    test.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
      @Override
      public Tuple2<Double, Double> call(LabeledPoint p) {
        return new Tuple2<>(model.predict(p.features()), p.label());
      }
    });
  double accuracy = predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
    @Override
    public Boolean call(Tuple2<Double, Double> pl) {
      return pl._1().equals(pl._2());
    }
  }).count() / (double) test.count();

  // Save and load model
  model.save(jsc.sc(), "target/tmp/myNaiveBayesModel");
  NaiveBayesModel sameModel = NaiveBayesModel.load(jsc.sc(), "target/tmp/myNaiveBayesModel");
  // $example off$

  jsc.stop();
}
 
Example 11
Source File: MiniBatchTests.java    From deeplearning4j with Apache License 2.0
@Test
public void testMiniBatches() throws Exception {
    log.info("Setting up Spark Context...");
    JavaRDD<String> lines = sc.textFile(new ClassPathResource("svmLight/iris_svmLight_0.txt")
                    .getTempFileFromArchive().toURI().toString()).cache();
    long count = lines.count();
    assertEquals(300, count);
    // gotta map this to a Matrix/INDArray
    RecordReader rr = new SVMLightRecordReader();
    Configuration c = new Configuration();
    c.set(SVMLightRecordReader.NUM_FEATURES, "5");
    rr.setConf(c);
    JavaRDD<DataSet> points = lines.map(new RecordReaderFunction(rr, 4, 3)).cache();
    count = points.count();
    assertEquals(300, count);

    List<DataSet> collect = points.collect();

    points = points.repartition(1);
    JavaRDD<DataSet> miniBatches = new RDDMiniBatches(10, points).miniBatchesJava();
    count = miniBatches.count();
    List<DataSet> list = miniBatches.collect();
    assertEquals(30, count);    //Expect exactly 30 from 1 partition... could be more for multiple input partitions

    lines.unpersist();
    points.unpersist();
    miniBatches.map(new DataSetAssertionFunction());
}
 
Example 12
Source File: PSFilterFileLogger.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Override
public void logReadsAfterQualityFilter(final JavaRDD<GATKRead> reads) {
    Utils.nonNull(reads, "Filter logging parameter reads cannot be null");
    metrics.READS_AFTER_QUALITY_AND_COMPLEXITY_FILTER = reads.count();
}
 
Example 13
Source File: dATest.java    From OpenDL with Apache License 2.0
public static void main(String[] args) {
	try {
		int x_feature = 784;
		int y_feature = 10;
		int n_hidden = 160;
		List<SampleVector> samples = DataInput.readMnist("mnist_784_1000.txt", x_feature, y_feature);
		
		List<SampleVector> trainList = new ArrayList<SampleVector>();
		List<SampleVector> testList = new ArrayList<SampleVector>();
		DataInput.splitList(samples, trainList, testList, 0.7);
		
		JavaSparkContext context = SparkContextBuild.getContext(args);
		JavaRDD<SampleVector> rdds = context.parallelize(trainList);
		rdds.count();
		logger.info("RDD ok.");
		
		AutoEncoder da = new AutoEncoder(x_feature, n_hidden);
		SGDTrainConfig config = new SGDTrainConfig();
		config.setUseCG(true);
		config.setDoCorruption(true);
		config.setCorruption_level(0.25);
		config.setCgEpochStep(50);
		config.setCgTolerance(0);
		config.setCgMaxIterations(10);
		config.setMaxEpochs(50);
		config.setNbrModelReplica(4);
		config.setMinLoss(0.01);
		config.setUseRegularization(true);
		config.setMrDataStorage(StorageLevel.MEMORY_ONLY());
		config.setPrintLoss(true);
		config.setLossCalStep(3);

		logger.info("Start to train dA.");
		DownpourSGDTrain.train(da, rdds, config);

		double[] reconstruct_x = new double[x_feature];
		double totalError = 0;
		for(SampleVector test : testList) {
			da.reconstruct(test.getX(), reconstruct_x);
			totalError += ClassVerify.squaredError(test.getX(), reconstruct_x);
		}
		logger.info("Mean square error is " + totalError / testList.size());
	} catch(Throwable e) {
		logger.error("", e);
	}
}
 
Example 14
Source File: RBMTest.java    From OpenDL with Apache License 2.0
public static void main(String[] args) {
	try {
		int x_feature = 784;
		int y_feature = 10;
		int n_hidden = 160;
		List<SampleVector> samples = DataInput.readMnist("mnist_784_1000.txt", x_feature, y_feature);
		
		List<SampleVector> trainList = new ArrayList<SampleVector>();
		List<SampleVector> testList = new ArrayList<SampleVector>();
		DataInput.splitList(samples, trainList, testList, 0.7);
		
		JavaSparkContext context = SparkContextBuild.getContext(args);
		JavaRDD<SampleVector> rdds = context.parallelize(trainList);
		rdds.count();
		logger.info("RDD ok.");
		
		RBM rbm = new RBM(x_feature, n_hidden);
		SGDTrainConfig config = new SGDTrainConfig();
		config.setUseCG(true);
		config.setCgEpochStep(50);
		config.setCgTolerance(0);
		config.setCgMaxIterations(10);
		config.setMaxEpochs(50);
		config.setNbrModelReplica(4);
		config.setMinLoss(0.01);
		config.setMrDataStorage(StorageLevel.MEMORY_ONLY());
		config.setPrintLoss(true);
		config.setLossCalStep(3);

		logger.info("Start to train RBM.");
		DownpourSGDTrain.train(rbm, rdds, config);

		double[] reconstruct_x = new double[x_feature];
		double totalError = 0;
		for(SampleVector test : testList) {
			rbm.reconstruct(test.getX(), reconstruct_x);
			totalError += ClassVerify.squaredError(test.getX(), reconstruct_x);
		}
		logger.info("Mean square error is " + totalError / testList.size());
	} catch(Throwable e) {
		logger.error("", e);
	}
}
 
Example 15
Source File: JavaGradientBoostingRegressionExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  // $example on$
  SparkConf sparkConf = new SparkConf()
    .setAppName("JavaGradientBoostedTreesRegressionExample");
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);
  // Load and parse the data file.
  String datapath = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
  // Split the data into training and test sets (30% held out for testing)
  JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
  JavaRDD<LabeledPoint> trainingData = splits[0];
  JavaRDD<LabeledPoint> testData = splits[1];

  // Train a GradientBoostedTrees model.
  // The defaultParams for Regression use SquaredError by default.
  BoostingStrategy boostingStrategy = BoostingStrategy.defaultParams("Regression");
  boostingStrategy.setNumIterations(3); // Note: Use more iterations in practice.
  boostingStrategy.getTreeStrategy().setMaxDepth(5);
  // Empty categoricalFeaturesInfo indicates all features are continuous.
  Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
  boostingStrategy.treeStrategy().setCategoricalFeaturesInfo(categoricalFeaturesInfo);

  final GradientBoostedTreesModel model =
    GradientBoostedTrees.train(trainingData, boostingStrategy);

  // Evaluate model on test instances and compute test error
  JavaPairRDD<Double, Double> predictionAndLabel =
    testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
      @Override
      public Tuple2<Double, Double> call(LabeledPoint p) {
        return new Tuple2<>(model.predict(p.features()), p.label());
      }
    });
  Double testMSE =
    predictionAndLabel.map(new Function<Tuple2<Double, Double>, Double>() {
      @Override
      public Double call(Tuple2<Double, Double> pl) {
        Double diff = pl._1() - pl._2();
        return diff * diff;
      }
    }).reduce(new Function2<Double, Double, Double>() {
      @Override
      public Double call(Double a, Double b) {
        return a + b;
      }
    }) / data.count();
  System.out.println("Test Mean Squared Error: " + testMSE);
  System.out.println("Learned regression GBT model:\n" + model.toDebugString());

  // Save and load model
  model.save(jsc.sc(), "target/tmp/myGradientBoostingRegressionModel");
  GradientBoostedTreesModel sameModel = GradientBoostedTreesModel.load(jsc.sc(),
    "target/tmp/myGradientBoostingRegressionModel");
  // $example off$

  jsc.stop();
}
 
Example 16
Source File: CollectMultipleMetricsSparkIntegrationTest.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Override
public void collectMetrics(JavaRDD<GATKRead> filteredReads, SAMFileHeader samHeader) {
    count = filteredReads.count();
}
 
Example 17
Source File: JavaRandomForestClassificationExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  // $example on$
  SparkConf sparkConf = new SparkConf().setAppName("JavaRandomForestClassificationExample");
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);
  // Load and parse the data file.
  String datapath = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
  // Split the data into training and test sets (30% held out for testing)
  JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
  JavaRDD<LabeledPoint> trainingData = splits[0];
  JavaRDD<LabeledPoint> testData = splits[1];

  // Train a RandomForest model.
  // Empty categoricalFeaturesInfo indicates all features are continuous.
  Integer numClasses = 2;
  HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
  Integer numTrees = 3; // Use more in practice.
  String featureSubsetStrategy = "auto"; // Let the algorithm choose.
  String impurity = "gini";
  Integer maxDepth = 5;
  Integer maxBins = 32;
  Integer seed = 12345;

  final RandomForestModel model = RandomForest.trainClassifier(trainingData, numClasses,
    categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins,
    seed);

  // Evaluate model on test instances and compute test error
  JavaPairRDD<Double, Double> predictionAndLabel =
    testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
      @Override
      public Tuple2<Double, Double> call(LabeledPoint p) {
        return new Tuple2<>(model.predict(p.features()), p.label());
      }
    });
  Double testErr =
    1.0 * predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
      @Override
      public Boolean call(Tuple2<Double, Double> pl) {
        return !pl._1().equals(pl._2());
      }
    }).count() / testData.count();
  System.out.println("Test Error: " + testErr);
  System.out.println("Learned classification forest model:\n" + model.toDebugString());

  // Save and load model
  model.save(jsc.sc(), "target/tmp/myRandomForestClassificationModel");
  RandomForestModel sameModel = RandomForestModel.load(jsc.sc(),
    "target/tmp/myRandomForestClassificationModel");
  // $example off$

  jsc.stop();
}
 
Example 18
Source File: PSFilterFileLogger.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Override
public void logPrimaryReads(final JavaRDD<GATKRead> reads) {
    Utils.nonNull(reads, "Filter logging parameter reads cannot be null");
    metrics.PRIMARY_READS = reads.count();
}
 
Example 19
Source File: AbstractJavaEsSparkTest.java    From elasticsearch-hadoop with Apache License 2.0
public void testNoResourceSpecified() throws Exception {
    JavaRDD<Map<String, Object>> rdd = JavaEsSpark.esRDD(sc).values();
    rdd.count();
}
 
Example 20
Source File: PSFilterFileLogger.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Override
public void logReadsAfterDeduplication(final JavaRDD<GATKRead> reads) {
    Utils.nonNull(reads, "Filter logging parameter reads cannot be null");
    metrics.READS_AFTER_DEDUPLICATION = reads.count();
}