Java Code Examples for org.apache.spark.api.java.JavaRDD#count()
The following examples show how to use org.apache.spark.api.java.JavaRDD#count().
Each example notes its original project, source file, and license above the code.
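For orientation, here is a minimal, self-contained sketch of the basic pattern before the project examples (the class name, app name, and local[*] master below are placeholders, not taken from any of the projects that follow). count() is an action: it triggers evaluation of the RDD and returns the number of elements as a long.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class JavaRDDCountSketch {
    public static void main(String[] args) {
        // Local mode is used only so the sketch runs without a cluster.
        SparkConf conf = new SparkConf().setAppName("JavaRDDCountSketch").setMaster("local[*]");
        JavaSparkContext jsc = new JavaSparkContext(conf);

        JavaRDD<Integer> numbers = jsc.parallelize(Arrays.asList(1, 2, 3, 4, 5));

        // count() runs the job and returns the number of elements as a long.
        long total = numbers.count();
        System.out.println("RDD contains " + total + " elements");

        jsc.stop();
    }
}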
Example 1
Source File: LogError.java From sparkResearch with Apache License 2.0
/**
 * Apply a transformation and actions to the log.
 */
public void log(JavaSparkContext sparkContext) {
    JavaRDD<String> inputRDD = sparkContext.textFile("/usr/local/log");
    JavaRDD<String> errorRDD = inputRDD.filter(new Function<String, Boolean>() {
        @Override
        public Boolean call(String v1) throws Exception {
            // Keep only lines that mention "error". (This predicate is an assumption;
            // the published stub returned null, which would fail at runtime.)
            return v1.contains("error");
        }
    });
    long errorRDDCount = errorRDD.count();
    System.out.println("errorRDD count is " + errorRDDCount);
    for (String rddLine : errorRDD.take(10)) {
        System.out.println("errorRDD data is " + rddLine);
    }
}
Example 2
Source File: TestSuite.java From stocator with Apache License 2.0
public void test4(SparkSession spark, String outText1) throws Exception {
    try {
        System.out.println("*********************************");
        System.out.println("T4: Create collection and store it as text file in " + outText1);
        List<Integer> data = Arrays.asList(1, 2, 3, 4, 5);
        JavaRDD<Integer> distData = new JavaSparkContext(spark.sparkContext()).parallelize(data);
        distData.saveAsTextFile(outText1);
        JavaRDD<String> txtRes = spark.read().textFile(outText1).javaRDD();
        long baseCount = txtRes.count();
        countAndCompare(baseCount, distData.count(), "T4", baseCount);
    } catch (Exception e) {
        throw e;
    } finally {
        deleteData(outText1, spark.sparkContext().hadoopConfiguration(), true);
    }
}
Example 3
Source File: SparkTableChecker.java From spliceengine with GNU Affero General Public License v3.0
/**
 * Check for duplicate indexes
 * @param index
 * @return
 * @throws StandardException
 * @throws InterruptedException
 * @throws ExecutionException
 */
private List<String> checkDuplicateIndexes(PairDataSet table, PairDataSet index) throws StandardException {
    try {
        SpliceSpark.pushScope(String.format("Check duplicates in index %s.%s", schemaName, indexName));

        JavaPairRDD duplicateIndexRdd = ((SparkPairDataSet) index).rdd
                .combineByKey(new CreateCombiner(), new MergeValue(), new MergeCombiners())
                .filter(new DuplicateIndexFilter());

        JavaPairRDD joinedRdd = duplicateIndexRdd
                .join(((SparkPairDataSet) table).rdd);

        JavaRDD duplicateIndex = joinedRdd
                .mapPartitions(new SparkFlatMapFunction<>(new DeleteDuplicateIndexFunction<>(
                        conglomerate, txn, tentativeIndex, baseColumnMap, fix)));

        Iterator it = duplicateIndex.toLocalIterator();
        long count = duplicateIndex.count();
        return reportDuplicateIndexes(it, count, fix);
    } catch (Exception e) {
        throw StandardException.plainWrapException(e);
    } finally {
        SpliceSpark.popScope();
    }
}
Example 4
Source File: RDDConverterUtils.java From systemds with Apache License 2.0
public static JavaPairRDD<MatrixIndexes, MatrixBlock> csvToBinaryBlock(JavaSparkContext sc,
        JavaPairRDD<LongWritable, Text> input, DataCharacteristics mc,
        boolean hasHeader, String delim, boolean fill, double fillValue) {
    //determine unknown dimensions and sparsity if required
    //(w/ robustness for mistakenly counted header in nnz)
    if( !mc.dimsKnown(true) ) {
        LongAccumulator aNnz = sc.sc().longAccumulator("nnz");
        JavaRDD<String> tmp = input.values()
            .map(new CSVAnalysisFunction(aNnz, delim));
        long rlen = tmp.count() - (hasHeader ? 1 : 0);
        long clen = tmp.first().split(delim).length;
        long nnz = Math.min(rlen*clen, UtilFunctions.toLong(aNnz.value()));
        mc.set(rlen, clen, mc.getBlocksize(), nnz);
    }

    //prepare csv w/ row indexes (sorted by filenames)
    JavaPairRDD<Text,Long> prepinput = input.values()
        .zipWithIndex(); //zip row index

    //convert csv rdd to binary block rdd (w/ partial blocks)
    boolean sparse = requiresSparseAllocation(prepinput, mc);
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = prepinput.mapPartitionsToPair(
        new CSVToBinaryBlockFunction(mc, sparse, hasHeader, delim, fill, fillValue));

    //aggregate partial matrix blocks (w/ preferred number of output
    //partitions as the data is likely smaller in binary block format,
    //but also to bound the size of partitions for compressed inputs)
    int parts = SparkUtils.getNumPreferredPartitions(mc, out);
    return RDDAggregateUtils.mergeByKey(out, parts, false);
}
Example 5
Source File: RDDConverterUtils.java From systemds with Apache License 2.0
public static JavaPairRDD<MatrixIndexes, MatrixBlock> dataFrameToBinaryBlock(JavaSparkContext sc,
        Dataset<Row> df, DataCharacteristics mc, boolean containsID, boolean isVector) {
    //determine unknown dimensions and sparsity if required
    if( !mc.dimsKnown(true) ) {
        LongAccumulator aNnz = sc.sc().longAccumulator("nnz");
        JavaRDD<Row> tmp = df.javaRDD().map(new DataFrameAnalysisFunction(aNnz, containsID, isVector));
        long rlen = tmp.count();
        long clen = !isVector ? df.columns().length - (containsID?1:0) :
            ((Vector) tmp.first().get(containsID?1:0)).size();
        long nnz = UtilFunctions.toLong(aNnz.value());
        mc.set(rlen, clen, mc.getBlocksize(), nnz);
    }

    //ensure valid blocksizes
    if( mc.getBlocksize()<=1 )
        mc.setBlocksize(ConfigurationManager.getBlocksize());

    //construct or reuse row ids
    JavaPairRDD<Row, Long> prepinput = containsID ?
        df.javaRDD().mapToPair(new DataFrameExtractIDFunction(
            df.schema().fieldIndex(DF_ID_COLUMN))) :
        df.javaRDD().zipWithIndex(); //zip row index

    //convert csv rdd to binary block rdd (w/ partial blocks)
    boolean sparse = requiresSparseAllocation(prepinput, mc);
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = prepinput.mapPartitionsToPair(
        new DataFrameToBinaryBlockFunction(mc, sparse, containsID, isVector));

    //aggregate partial matrix blocks (w/ preferred number of output
    //partitions as the data is likely smaller in binary block format,
    //but also to bound the size of partitions for compressed inputs)
    int parts = SparkUtils.getNumPreferredPartitions(mc, out);
    return RDDAggregateUtils.mergeByKey(out, parts, false);
}
Example 6
Source File: MockMLUpdate.java From oryx with Apache License 2.0
@Override
public double evaluate(JavaSparkContext sparkContext,
                       PMML model,
                       Path modelParentPath,
                       JavaRDD<String> testData,
                       JavaRDD<String> trainData) {
    long testDataCount = testData.count();
    testCounts.add((int) testDataCount);
    log.info("Returning eval {}", testDataCount);
    return testDataCount;
}
Example 7
Source File: FrameRDDConverterUtils.java From systemds with Apache License 2.0
public static JavaPairRDD<Long, FrameBlock> csvToBinaryBlock(JavaSparkContext sc,
        JavaPairRDD<LongWritable, Text> input, DataCharacteristics mc, ValueType[] schema,
        boolean hasHeader, String delim, boolean fill, double fillValue) {
    //determine unknown dimensions and sparsity if required
    if( !mc.dimsKnown() ) { //nnz irrelevant here
        JavaRDD<String> tmp = input.values()
            .map(new TextToStringFunction());
        String tmpStr = tmp.first();
        boolean metaHeader = tmpStr.startsWith(TfUtils.TXMTD_MVPREFIX)
            || tmpStr.startsWith(TfUtils.TXMTD_NDPREFIX);
        tmpStr = (metaHeader) ? tmpStr.substring(tmpStr.indexOf(delim)+1) : tmpStr;
        long rlen = tmp.count() - (hasHeader ? 1 : 0) - (metaHeader ? 2 : 0);
        long clen = IOUtilFunctions.splitCSV(tmpStr, delim).length;
        mc.set(rlen, clen, mc.getBlocksize(), -1);
    }

    //prepare csv w/ row indexes (sorted by filenames)
    JavaPairRDD<Text,Long> prepinput = input.values()
        .zipWithIndex(); //zip row index

    //prepare default schema if needed
    if( schema == null || schema.length==1 )
        schema = UtilFunctions.nCopies((int)mc.getCols(), ValueType.STRING);

    //convert csv rdd to binary block rdd (w/ partial blocks)
    JavaPairRDD<Long, FrameBlock> out = prepinput.mapPartitionsToPair(
        new CSVToBinaryBlockFunction(mc, schema, hasHeader, delim));

    return out;
}
Example 8
Source File: ActionRDDTest.java From hui-bigdata-spark with Apache License 2.0
/**
 * Number of elements in the collection.
 *
 * @since hui_project 1.0.0
 */
@Test
public void testCount() {
    JavaRDD<String> stringJavaRDD = sparkContext.textFile(FILE_PATH);
    long count = stringJavaRDD.count();
    System.out.println(count);
}
Example 9
Source File: TextPipeline.java From deeplearning4j with Apache License 2.0
public JavaRDD<Pair<List<String>, AtomicLong>> updateAndReturnAccumulatorVal(JavaRDD<List<String>> tokenizedRDD) {
    // Update the 2 accumulators
    UpdateWordFreqAccumulatorFunction accumulatorClassFunction =
        new UpdateWordFreqAccumulatorFunction(stopWordBroadCast, wordFreqAcc);
    JavaRDD<Pair<List<String>, AtomicLong>> sentenceWordsCountRDD = tokenizedRDD.map(accumulatorClassFunction);

    // Loop through each element to update accumulator. Count does the same job (verified).
    sentenceWordsCountRDD.count();

    return sentenceWordsCountRDD;
}
Example 10
Source File: JavaNaiveBayesExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("JavaNaiveBayesExample");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    // $example on$
    String path = "data/mllib/sample_libsvm_data.txt";
    JavaRDD<LabeledPoint> inputData = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD();
    JavaRDD<LabeledPoint>[] tmp = inputData.randomSplit(new double[]{0.6, 0.4});
    JavaRDD<LabeledPoint> training = tmp[0]; // training set
    JavaRDD<LabeledPoint> test = tmp[1]; // test set
    final NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0);
    JavaPairRDD<Double, Double> predictionAndLabel =
        test.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
            @Override
            public Tuple2<Double, Double> call(LabeledPoint p) {
                return new Tuple2<>(model.predict(p.features()), p.label());
            }
        });
    double accuracy = predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
        @Override
        public Boolean call(Tuple2<Double, Double> pl) {
            return pl._1().equals(pl._2());
        }
    }).count() / (double) test.count();

    // Save and load model
    model.save(jsc.sc(), "target/tmp/myNaiveBayesModel");
    NaiveBayesModel sameModel = NaiveBayesModel.load(jsc.sc(), "target/tmp/myNaiveBayesModel");
    // $example off$

    jsc.stop();
}
Example 11
Source File: MiniBatchTests.java From deeplearning4j with Apache License 2.0
@Test
public void testMiniBatches() throws Exception {
    log.info("Setting up Spark Context...");
    JavaRDD<String> lines = sc.textFile(new ClassPathResource("svmLight/iris_svmLight_0.txt")
            .getTempFileFromArchive().toURI().toString()).cache();
    long count = lines.count();
    assertEquals(300, count);

    // gotta map this to a Matrix/INDArray
    RecordReader rr = new SVMLightRecordReader();
    Configuration c = new Configuration();
    c.set(SVMLightRecordReader.NUM_FEATURES, "5");
    rr.setConf(c);

    JavaRDD<DataSet> points = lines.map(new RecordReaderFunction(rr, 4, 3)).cache();
    count = points.count();
    assertEquals(300, count);

    List<DataSet> collect = points.collect();

    points = points.repartition(1);
    JavaRDD<DataSet> miniBatches = new RDDMiniBatches(10, points).miniBatchesJava();
    count = miniBatches.count();
    List<DataSet> list = miniBatches.collect();
    assertEquals(30, count); //Expect exactly 30 from 1 partition... could be more for multiple input partitions

    lines.unpersist();
    points.unpersist();
    miniBatches.map(new DataSetAssertionFunction());
}
Example 12
Source File: PSFilterFileLogger.java From gatk with BSD 3-Clause "New" or "Revised" License
@Override
public void logReadsAfterQualityFilter(final JavaRDD<GATKRead> reads) {
    Utils.nonNull(reads, "Filter logging parameter reads cannot be null");
    metrics.READS_AFTER_QUALITY_AND_COMPLEXITY_FILTER = reads.count();
}
Example 13
Source File: dATest.java From OpenDL with Apache License 2.0
public static void main(String[] args) {
    try {
        int x_feature = 784;
        int y_feature = 10;
        int n_hidden = 160;
        List<SampleVector> samples = DataInput.readMnist("mnist_784_1000.txt", x_feature, y_feature);

        List<SampleVector> trainList = new ArrayList<SampleVector>();
        List<SampleVector> testList = new ArrayList<SampleVector>();
        DataInput.splitList(samples, trainList, testList, 0.7);

        JavaSparkContext context = SparkContextBuild.getContext(args);
        JavaRDD<SampleVector> rdds = context.parallelize(trainList);
        rdds.count();
        logger.info("RDD ok.");

        AutoEncoder da = new AutoEncoder(x_feature, n_hidden);

        SGDTrainConfig config = new SGDTrainConfig();
        config.setUseCG(true);
        config.setDoCorruption(true);
        config.setCorruption_level(0.25);
        config.setCgEpochStep(50);
        config.setCgTolerance(0);
        config.setCgMaxIterations(10);
        config.setMaxEpochs(50);
        config.setNbrModelReplica(4);
        config.setMinLoss(0.01);
        config.setUseRegularization(true);
        config.setMrDataStorage(StorageLevel.MEMORY_ONLY());
        config.setPrintLoss(true);
        config.setLossCalStep(3);

        logger.info("Start to train dA.");
        DownpourSGDTrain.train(da, rdds, config);

        double[] reconstruct_x = new double[x_feature];
        double totalError = 0;
        for(SampleVector test : testList) {
            da.reconstruct(test.getX(), reconstruct_x);
            totalError += ClassVerify.squaredError(test.getX(), reconstruct_x);
        }
        logger.info("Mean square error is " + totalError / testList.size());
    } catch(Throwable e) {
        logger.error("", e);
    }
}
Example 14
Source File: RBMTest.java From OpenDL with Apache License 2.0
public static void main(String[] args) {
    try {
        int x_feature = 784;
        int y_feature = 10;
        int n_hidden = 160;
        List<SampleVector> samples = DataInput.readMnist("mnist_784_1000.txt", x_feature, y_feature);

        List<SampleVector> trainList = new ArrayList<SampleVector>();
        List<SampleVector> testList = new ArrayList<SampleVector>();
        DataInput.splitList(samples, trainList, testList, 0.7);

        JavaSparkContext context = SparkContextBuild.getContext(args);
        JavaRDD<SampleVector> rdds = context.parallelize(trainList);
        rdds.count();
        logger.info("RDD ok.");

        RBM rbm = new RBM(x_feature, n_hidden);

        SGDTrainConfig config = new SGDTrainConfig();
        config.setUseCG(true);
        config.setCgEpochStep(50);
        config.setCgTolerance(0);
        config.setCgMaxIterations(10);
        config.setMaxEpochs(50);
        config.setNbrModelReplica(4);
        config.setMinLoss(0.01);
        config.setMrDataStorage(StorageLevel.MEMORY_ONLY());
        config.setPrintLoss(true);
        config.setLossCalStep(3);

        logger.info("Start to train RBM.");
        DownpourSGDTrain.train(rbm, rdds, config);

        double[] reconstruct_x = new double[x_feature];
        double totalError = 0;
        for(SampleVector test : testList) {
            rbm.reconstruct(test.getX(), reconstruct_x);
            totalError += ClassVerify.squaredError(test.getX(), reconstruct_x);
        }
        logger.info("Mean square error is " + totalError / testList.size());
    } catch(Throwable e) {
        logger.error("", e);
    }
}
Example 15
Source File: JavaGradientBoostingRegressionExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    // $example on$
    SparkConf sparkConf = new SparkConf()
        .setAppName("JavaGradientBoostedTreesRegressionExample");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    // Load and parse the data file.
    String datapath = "data/mllib/sample_libsvm_data.txt";
    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
    // Split the data into training and test sets (30% held out for testing)
    JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
    JavaRDD<LabeledPoint> trainingData = splits[0];
    JavaRDD<LabeledPoint> testData = splits[1];

    // Train a GradientBoostedTrees model.
    // The defaultParams for Regression use SquaredError by default.
    BoostingStrategy boostingStrategy = BoostingStrategy.defaultParams("Regression");
    boostingStrategy.setNumIterations(3); // Note: Use more iterations in practice.
    boostingStrategy.getTreeStrategy().setMaxDepth(5);
    // Empty categoricalFeaturesInfo indicates all features are continuous.
    Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
    boostingStrategy.treeStrategy().setCategoricalFeaturesInfo(categoricalFeaturesInfo);

    final GradientBoostedTreesModel model =
        GradientBoostedTrees.train(trainingData, boostingStrategy);

    // Evaluate model on test instances and compute test error
    JavaPairRDD<Double, Double> predictionAndLabel =
        testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
            @Override
            public Tuple2<Double, Double> call(LabeledPoint p) {
                return new Tuple2<>(model.predict(p.features()), p.label());
            }
        });
    Double testMSE =
        predictionAndLabel.map(new Function<Tuple2<Double, Double>, Double>() {
            @Override
            public Double call(Tuple2<Double, Double> pl) {
                Double diff = pl._1() - pl._2();
                return diff * diff;
            }
        }).reduce(new Function2<Double, Double, Double>() {
            @Override
            public Double call(Double a, Double b) {
                return a + b;
            }
        }) / data.count();
    System.out.println("Test Mean Squared Error: " + testMSE);
    System.out.println("Learned regression GBT model:\n" + model.toDebugString());

    // Save and load model
    model.save(jsc.sc(), "target/tmp/myGradientBoostingRegressionModel");
    GradientBoostedTreesModel sameModel = GradientBoostedTreesModel.load(jsc.sc(),
        "target/tmp/myGradientBoostingRegressionModel");
    // $example off$

    jsc.stop();
}
Example 16
Source File: CollectMultipleMetricsSparkIntegrationTest.java From gatk with BSD 3-Clause "New" or "Revised" License
@Override
public void collectMetrics(JavaRDD<GATKRead> filteredReads, SAMFileHeader samHeader) {
    count = filteredReads.count();
}
Example 17
Source File: JavaRandomForestClassificationExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    // $example on$
    SparkConf sparkConf = new SparkConf().setAppName("JavaRandomForestClassificationExample");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    // Load and parse the data file.
    String datapath = "data/mllib/sample_libsvm_data.txt";
    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
    // Split the data into training and test sets (30% held out for testing)
    JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
    JavaRDD<LabeledPoint> trainingData = splits[0];
    JavaRDD<LabeledPoint> testData = splits[1];

    // Train a RandomForest model.
    // Empty categoricalFeaturesInfo indicates all features are continuous.
    Integer numClasses = 2;
    HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
    Integer numTrees = 3; // Use more in practice.
    String featureSubsetStrategy = "auto"; // Let the algorithm choose.
    String impurity = "gini";
    Integer maxDepth = 5;
    Integer maxBins = 32;
    Integer seed = 12345;

    final RandomForestModel model = RandomForest.trainClassifier(trainingData, numClasses,
        categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins,
        seed);

    // Evaluate model on test instances and compute test error
    JavaPairRDD<Double, Double> predictionAndLabel =
        testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
            @Override
            public Tuple2<Double, Double> call(LabeledPoint p) {
                return new Tuple2<>(model.predict(p.features()), p.label());
            }
        });
    Double testErr =
        1.0 * predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
            @Override
            public Boolean call(Tuple2<Double, Double> pl) {
                return !pl._1().equals(pl._2());
            }
        }).count() / testData.count();
    System.out.println("Test Error: " + testErr);
    System.out.println("Learned classification forest model:\n" + model.toDebugString());

    // Save and load model
    model.save(jsc.sc(), "target/tmp/myRandomForestClassificationModel");
    RandomForestModel sameModel = RandomForestModel.load(jsc.sc(),
        "target/tmp/myRandomForestClassificationModel");
    // $example off$

    jsc.stop();
}
Example 18
Source File: PSFilterFileLogger.java From gatk with BSD 3-Clause "New" or "Revised" License
@Override
public void logPrimaryReads(final JavaRDD<GATKRead> reads) {
    Utils.nonNull(reads, "Filter logging parameter reads cannot be null");
    metrics.PRIMARY_READS = reads.count();
}
Example 19
Source File: AbstractJavaEsSparkTest.java From elasticsearch-hadoop with Apache License 2.0
public void testNoResourceSpecified() throws Exception {
    JavaRDD<Map<String, Object>> rdd = JavaEsSpark.esRDD(sc).values();
    rdd.count();
}
Example 20
Source File: PSFilterFileLogger.java From gatk with BSD 3-Clause "New" or "Revised" License
@Override
public void logReadsAfterDeduplication(final JavaRDD<GATKRead> reads) {
    Utils.nonNull(reads, "Filter logging parameter reads cannot be null");
    metrics.READS_AFTER_DEDUPLICATION = reads.count();
}