Java Code Examples for org.apache.spark.api.java.JavaRDD#count()
The following examples show how to use org.apache.spark.api.java.JavaRDD#count().
Each example notes its original project, source file, and license above the code.
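For orientation, here is a minimal, self-contained sketch of the basic pattern before the project examples (the class name, app name, and local[*] master below are placeholders, not taken from any of the projects that follow). count() is an action: it triggers evaluation of the RDD and returns the number of elements as a long.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class JavaRDDCountSketch {
    public static void main(String[] args) {
        // Local mode is used only so the sketch runs without a cluster.
        SparkConf conf = new SparkConf().setAppName("JavaRDDCountSketch").setMaster("local[*]");
        JavaSparkContext jsc = new JavaSparkContext(conf);

        JavaRDD<Integer> numbers = jsc.parallelize(Arrays.asList(1, 2, 3, 4, 5));

        // count() runs the job and returns the number of elements as a long.
        long total = numbers.count();
        System.out.println("RDD contains " + total + " elements");

        jsc.stop();
    }
}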
Example 1
Source File: LogError.java From sparkResearch with Apache License 2.0
/**
 * Apply a transformation and actions to the log.
 */
public void log(JavaSparkContext sparkContext) {
    JavaRDD<String> inputRDD = sparkContext.textFile("/usr/local/log");
    JavaRDD<String> errorRDD = inputRDD.filter(new Function<String, Boolean>() {
        @Override
        public Boolean call(String v1) throws Exception {
            // Keep only lines that mention "error". (This predicate is an assumption;
            // the published stub returned null, which would fail at runtime.)
            return v1.contains("error");
        }
    });
    long errorRDDCount = errorRDD.count();
    System.out.println("errorRDD count is " + errorRDDCount);
    for (String rddLine : errorRDD.take(10)) {
        System.out.println("errorRDD data is " + rddLine);
    }
}
Example 2
Source File: TestSuite.java From stocator with Apache License 2.0
public void test4(SparkSession spark, String outText1) throws Exception {
    try {
        System.out.println("*********************************");
        System.out.println("T4: Create collection and store it as text file in " + outText1);
        List<Integer> data = Arrays.asList(1, 2, 3, 4, 5);
        JavaRDD<Integer> distData = new JavaSparkContext(spark.sparkContext()).parallelize(data);
        distData.saveAsTextFile(outText1);
        JavaRDD<String> txtRes = spark.read().textFile(outText1).javaRDD();
        long baseCount = txtRes.count();
        countAndCompare(baseCount, distData.count(), "T4", baseCount);
    } catch (Exception e) {
        throw e;
    } finally {
        deleteData(outText1, spark.sparkContext().hadoopConfiguration(), true);
    }
}
Example 3
Source File: SparkTableChecker.java From spliceengine with GNU Affero General Public License v3.0
/**
 * Check for duplicate indexes
 * @param index
 * @return
 * @throws StandardException
 * @throws InterruptedException
 * @throws ExecutionException
 */
private List<String> checkDuplicateIndexes(PairDataSet table, PairDataSet index) throws StandardException {
    try {
        SpliceSpark.pushScope(String.format("Check duplicates in index %s.%s", schemaName, indexName));

        JavaPairRDD duplicateIndexRdd = ((SparkPairDataSet) index).rdd
                .combineByKey(new CreateCombiner(), new MergeValue(), new MergeCombiners())
                .filter(new DuplicateIndexFilter());

        JavaPairRDD joinedRdd = duplicateIndexRdd
                .join(((SparkPairDataSet) table).rdd);

        JavaRDD duplicateIndex = joinedRdd
                .mapPartitions(new SparkFlatMapFunction<>(new DeleteDuplicateIndexFunction<>(
                        conglomerate, txn, tentativeIndex, baseColumnMap, fix)));

        Iterator it = duplicateIndex.toLocalIterator();
        long count = duplicateIndex.count();
        return reportDuplicateIndexes(it, count, fix);
    } catch (Exception e) {
        throw StandardException.plainWrapException(e);
    } finally {
        SpliceSpark.popScope();
    }
}
Example 4
Source File: RDDConverterUtils.java From systemds with Apache License 2.0
public static JavaPairRDD<MatrixIndexes, MatrixBlock> csvToBinaryBlock(JavaSparkContext sc,
        JavaPairRDD<LongWritable, Text> input, DataCharacteristics mc,
        boolean hasHeader, String delim, boolean fill, double fillValue) {
    //determine unknown dimensions and sparsity if required
    //(w/ robustness for mistakenly counted header in nnz)
    if( !mc.dimsKnown(true) ) {
        LongAccumulator aNnz = sc.sc().longAccumulator("nnz");
        JavaRDD<String> tmp = input.values()
            .map(new CSVAnalysisFunction(aNnz, delim));
        long rlen = tmp.count() - (hasHeader ? 1 : 0);
        long clen = tmp.first().split(delim).length;
        long nnz = Math.min(rlen*clen, UtilFunctions.toLong(aNnz.value()));
        mc.set(rlen, clen, mc.getBlocksize(), nnz);
    }

    //prepare csv w/ row indexes (sorted by filenames)
    JavaPairRDD<Text,Long> prepinput = input.values()
        .zipWithIndex(); //zip row index

    //convert csv rdd to binary block rdd (w/ partial blocks)
    boolean sparse = requiresSparseAllocation(prepinput, mc);
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = prepinput.mapPartitionsToPair(
        new CSVToBinaryBlockFunction(mc, sparse, hasHeader, delim, fill, fillValue));

    //aggregate partial matrix blocks (w/ preferred number of output
    //partitions as the data is likely smaller in binary block format,
    //but also to bound the size of partitions for compressed inputs)
    int parts = SparkUtils.getNumPreferredPartitions(mc, out);
    return RDDAggregateUtils.mergeByKey(out, parts, false);
}
Example 5
Source File: RDDConverterUtils.java From systemds with Apache License 2.0
public static JavaPairRDD<MatrixIndexes, MatrixBlock> dataFrameToBinaryBlock(JavaSparkContext sc,
        Dataset<Row> df, DataCharacteristics mc, boolean containsID, boolean isVector) {
    //determine unknown dimensions and sparsity if required
    if( !mc.dimsKnown(true) ) {
        LongAccumulator aNnz = sc.sc().longAccumulator("nnz");
        JavaRDD<Row> tmp = df.javaRDD().map(new DataFrameAnalysisFunction(aNnz, containsID, isVector));
        long rlen = tmp.count();
        long clen = !isVector ? df.columns().length - (containsID?1:0) :
            ((Vector) tmp.first().get(containsID?1:0)).size();
        long nnz = UtilFunctions.toLong(aNnz.value());
        mc.set(rlen, clen, mc.getBlocksize(), nnz);
    }

    //ensure valid blocksizes
    if( mc.getBlocksize()<=1 )
        mc.setBlocksize(ConfigurationManager.getBlocksize());

    //construct or reuse row ids
    JavaPairRDD<Row, Long> prepinput = containsID ?
        df.javaRDD().mapToPair(new DataFrameExtractIDFunction(
            df.schema().fieldIndex(DF_ID_COLUMN))) :
        df.javaRDD().zipWithIndex(); //zip row index

    //convert csv rdd to binary block rdd (w/ partial blocks)
    boolean sparse = requiresSparseAllocation(prepinput, mc);
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = prepinput.mapPartitionsToPair(
        new DataFrameToBinaryBlockFunction(mc, sparse, containsID, isVector));

    //aggregate partial matrix blocks (w/ preferred number of output
    //partitions as the data is likely smaller in binary block format,
    //but also to bound the size of partitions for compressed inputs)
    int parts = SparkUtils.getNumPreferredPartitions(mc, out);
    return RDDAggregateUtils.mergeByKey(out, parts, false);
}
Example 6
Source File: MockMLUpdate.java From oryx with Apache License 2.0
@Override
public double evaluate(JavaSparkContext sparkContext,
                       PMML model,
                       Path modelParentPath,
                       JavaRDD<String> testData,
                       JavaRDD<String> trainData) {
    long testDataCount = testData.count();
    testCounts.add((int) testDataCount);
    log.info("Returning eval {}", testDataCount);
    return testDataCount;
}
Example 7
Source File: FrameRDDConverterUtils.java From systemds with Apache License 2.0
public static JavaPairRDD<Long, FrameBlock> csvToBinaryBlock(JavaSparkContext sc,
        JavaPairRDD<LongWritable, Text> input, DataCharacteristics mc, ValueType[] schema,
        boolean hasHeader, String delim, boolean fill, double fillValue) {
    //determine unknown dimensions and sparsity if required
    if( !mc.dimsKnown() ) { //nnz irrelevant here
        JavaRDD<String> tmp = input.values()
            .map(new TextToStringFunction());
        String tmpStr = tmp.first();
        boolean metaHeader = tmpStr.startsWith(TfUtils.TXMTD_MVPREFIX)
            || tmpStr.startsWith(TfUtils.TXMTD_NDPREFIX);
        tmpStr = (metaHeader) ? tmpStr.substring(tmpStr.indexOf(delim)+1) : tmpStr;
        long rlen = tmp.count() - (hasHeader ? 1 : 0) - (metaHeader ? 2 : 0);
        long clen = IOUtilFunctions.splitCSV(tmpStr, delim).length;
        mc.set(rlen, clen, mc.getBlocksize(), -1);
    }

    //prepare csv w/ row indexes (sorted by filenames)
    JavaPairRDD<Text,Long> prepinput = input.values()
        .zipWithIndex(); //zip row index

    //prepare default schema if needed
    if( schema == null || schema.length==1 )
        schema = UtilFunctions.nCopies((int)mc.getCols(), ValueType.STRING);

    //convert csv rdd to binary block rdd (w/ partial blocks)
    JavaPairRDD<Long, FrameBlock> out = prepinput.mapPartitionsToPair(
        new CSVToBinaryBlockFunction(mc, schema, hasHeader, delim));

    return out;
}
Example 8
Source File: ActionRDDTest.java From hui-bigdata-spark with Apache License 2.0
/**
 * Number of elements in the collection.
 *
 * @since hui_project 1.0.0
 */
@Test
public void testCount() {
    JavaRDD<String> stringJavaRDD = sparkContext.textFile(FILE_PATH);
    long count = stringJavaRDD.count();
    System.out.println(count);
}
Example 9
Source File: TextPipeline.java From deeplearning4j with Apache License 2.0
public JavaRDD<Pair<List<String>, AtomicLong>> updateAndReturnAccumulatorVal(JavaRDD<List<String>> tokenizedRDD) {
    // Update the 2 accumulators
    UpdateWordFreqAccumulatorFunction accumulatorClassFunction =
        new UpdateWordFreqAccumulatorFunction(stopWordBroadCast, wordFreqAcc);
    JavaRDD<Pair<List<String>, AtomicLong>> sentenceWordsCountRDD = tokenizedRDD.map(accumulatorClassFunction);

    // Loop through each element to update accumulator. Count does the same job (verified).
    sentenceWordsCountRDD.count();

    return sentenceWordsCountRDD;
}
Example 10
Source File: JavaNaiveBayesExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("JavaNaiveBayesExample");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    // $example on$
    String path = "data/mllib/sample_libsvm_data.txt";
    JavaRDD<LabeledPoint> inputData = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD();
    JavaRDD<LabeledPoint>[] tmp = inputData.randomSplit(new double[]{0.6, 0.4});
    JavaRDD<LabeledPoint> training = tmp[0]; // training set
    JavaRDD<LabeledPoint> test = tmp[1]; // test set
    final NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0);
    JavaPairRDD<Double, Double> predictionAndLabel =
        test.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
            @Override
            public Tuple2<Double, Double> call(LabeledPoint p) {
                return new Tuple2<>(model.predict(p.features()), p.label());
            }
        });
    double accuracy = predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
        @Override
        public Boolean call(Tuple2<Double, Double> pl) {
            return pl._1().equals(pl._2());
        }
    }).count() / (double) test.count();

    // Save and load model
    model.save(jsc.sc(), "target/tmp/myNaiveBayesModel");
    NaiveBayesModel sameModel = NaiveBayesModel.load(jsc.sc(), "target/tmp/myNaiveBayesModel");
    // $example off$

    jsc.stop();
}
Example 11
Source File: MiniBatchTests.java From deeplearning4j with Apache License 2.0
@Test
public void testMiniBatches() throws Exception {
    log.info("Setting up Spark Context...");
    JavaRDD<String> lines = sc.textFile(new ClassPathResource("svmLight/iris_svmLight_0.txt")
            .getTempFileFromArchive().toURI().toString()).cache();
    long count = lines.count();
    assertEquals(300, count);

    // gotta map this to a Matrix/INDArray
    RecordReader rr = new SVMLightRecordReader();
    Configuration c = new Configuration();
    c.set(SVMLightRecordReader.NUM_FEATURES, "5");
    rr.setConf(c);

    JavaRDD<DataSet> points = lines.map(new RecordReaderFunction(rr, 4, 3)).cache();
    count = points.count();
    assertEquals(300, count);

    List<DataSet> collect = points.collect();

    points = points.repartition(1);
    JavaRDD<DataSet> miniBatches = new RDDMiniBatches(10, points).miniBatchesJava();
    count = miniBatches.count();
    List<DataSet> list = miniBatches.collect();
    assertEquals(30, count); //Expect exactly 30 from 1 partition... could be more for multiple input partitions

    lines.unpersist();
    points.unpersist();
    miniBatches.map(new DataSetAssertionFunction());
}
Example 12
Source File: PSFilterFileLogger.java From gatk with BSD 3-Clause "New" or "Revised" License
@Override
public void logReadsAfterQualityFilter(final JavaRDD<GATKRead> reads) {
    Utils.nonNull(reads, "Filter logging parameter reads cannot be null");
    metrics.READS_AFTER_QUALITY_AND_COMPLEXITY_FILTER = reads.count();
}
Example 13
Source File: dATest.java From OpenDL with Apache License 2.0
public static void main(String[] args) {
    try {
        int x_feature = 784;
        int y_feature = 10;
        int n_hidden = 160;
        List<SampleVector> samples = DataInput.readMnist("mnist_784_1000.txt", x_feature, y_feature);

        List<SampleVector> trainList = new ArrayList<SampleVector>();
        List<SampleVector> testList = new ArrayList<SampleVector>();
        DataInput.splitList(samples, trainList, testList, 0.7);

        JavaSparkContext context = SparkContextBuild.getContext(args);
        JavaRDD<SampleVector> rdds = context.parallelize(trainList);
        rdds.count();
        logger.info("RDD ok.");

        AutoEncoder da = new AutoEncoder(x_feature, n_hidden);

        SGDTrainConfig config = new SGDTrainConfig();
        config.setUseCG(true);
        config.setDoCorruption(true);
        config.setCorruption_level(0.25);
        config.setCgEpochStep(50);
        config.setCgTolerance(0);
        config.setCgMaxIterations(10);
        config.setMaxEpochs(50);
        config.setNbrModelReplica(4);
        config.setMinLoss(0.01);
        config.setUseRegularization(true);
        config.setMrDataStorage(StorageLevel.MEMORY_ONLY());
        config.setPrintLoss(true);
        config.setLossCalStep(3);

        logger.info("Start to train dA.");
        DownpourSGDTrain.train(da, rdds, config);

        double[] reconstruct_x = new double[x_feature];
        double totalError = 0;
        for(SampleVector test : testList) {
            da.reconstruct(test.getX(), reconstruct_x);
            totalError += ClassVerify.squaredError(test.getX(), reconstruct_x);
        }
        logger.info("Mean square error is " + totalError / testList.size());
    } catch(Throwable e) {
        logger.error("", e);
    }
}
Example 14
Source File: RBMTest.java From OpenDL with Apache License 2.0
public static void main(String[] args) {
    try {
        int x_feature = 784;
        int y_feature = 10;
        int n_hidden = 160;
        List<SampleVector> samples = DataInput.readMnist("mnist_784_1000.txt", x_feature, y_feature);

        List<SampleVector> trainList = new ArrayList<SampleVector>();
        List<SampleVector> testList = new ArrayList<SampleVector>();
        DataInput.splitList(samples, trainList, testList, 0.7);

        JavaSparkContext context = SparkContextBuild.getContext(args);
        JavaRDD<SampleVector> rdds = context.parallelize(trainList);
        rdds.count();
        logger.info("RDD ok.");

        RBM rbm = new RBM(x_feature, n_hidden);

        SGDTrainConfig config = new SGDTrainConfig();
        config.setUseCG(true);
        config.setCgEpochStep(50);
        config.setCgTolerance(0);
        config.setCgMaxIterations(10);
        config.setMaxEpochs(50);
        config.setNbrModelReplica(4);
        config.setMinLoss(0.01);
        config.setMrDataStorage(StorageLevel.MEMORY_ONLY());
        config.setPrintLoss(true);
        config.setLossCalStep(3);

        logger.info("Start to train RBM.");
        DownpourSGDTrain.train(rbm, rdds, config);

        double[] reconstruct_x = new double[x_feature];
        double totalError = 0;
        for(SampleVector test : testList) {
            rbm.reconstruct(test.getX(), reconstruct_x);
            totalError += ClassVerify.squaredError(test.getX(), reconstruct_x);
        }
        logger.info("Mean square error is " + totalError / testList.size());
    } catch(Throwable e) {
        logger.error("", e);
    }
}
Example 15
Source File: JavaGradientBoostingRegressionExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    // $example on$
    SparkConf sparkConf = new SparkConf()
        .setAppName("JavaGradientBoostedTreesRegressionExample");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    // Load and parse the data file.
    String datapath = "data/mllib/sample_libsvm_data.txt";
    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
    // Split the data into training and test sets (30% held out for testing)
    JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
    JavaRDD<LabeledPoint> trainingData = splits[0];
    JavaRDD<LabeledPoint> testData = splits[1];

    // Train a GradientBoostedTrees model.
    // The defaultParams for Regression use SquaredError by default.
    BoostingStrategy boostingStrategy = BoostingStrategy.defaultParams("Regression");
    boostingStrategy.setNumIterations(3); // Note: Use more iterations in practice.
    boostingStrategy.getTreeStrategy().setMaxDepth(5);
    // Empty categoricalFeaturesInfo indicates all features are continuous.
    Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
    boostingStrategy.treeStrategy().setCategoricalFeaturesInfo(categoricalFeaturesInfo);

    final GradientBoostedTreesModel model =
        GradientBoostedTrees.train(trainingData, boostingStrategy);

    // Evaluate model on test instances and compute test error
    JavaPairRDD<Double, Double> predictionAndLabel =
        testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
            @Override
            public Tuple2<Double, Double> call(LabeledPoint p) {
                return new Tuple2<>(model.predict(p.features()), p.label());
            }
        });
    Double testMSE =
        predictionAndLabel.map(new Function<Tuple2<Double, Double>, Double>() {
            @Override
            public Double call(Tuple2<Double, Double> pl) {
                Double diff = pl._1() - pl._2();
                return diff * diff;
            }
        }).reduce(new Function2<Double, Double, Double>() {
            @Override
            public Double call(Double a, Double b) {
                return a + b;
            }
        }) / data.count();
    System.out.println("Test Mean Squared Error: " + testMSE);
    System.out.println("Learned regression GBT model:\n" + model.toDebugString());

    // Save and load model
    model.save(jsc.sc(), "target/tmp/myGradientBoostingRegressionModel");
    GradientBoostedTreesModel sameModel = GradientBoostedTreesModel.load(jsc.sc(),
        "target/tmp/myGradientBoostingRegressionModel");
    // $example off$

    jsc.stop();
}
Example 16
Source File: CollectMultipleMetricsSparkIntegrationTest.java From gatk with BSD 3-Clause "New" or "Revised" License
@Override
public void collectMetrics(JavaRDD<GATKRead> filteredReads, SAMFileHeader samHeader) {
    count = filteredReads.count();
}
Example 17
Source File: JavaRandomForestClassificationExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    // $example on$
    SparkConf sparkConf = new SparkConf().setAppName("JavaRandomForestClassificationExample");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    // Load and parse the data file.
    String datapath = "data/mllib/sample_libsvm_data.txt";
    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
    // Split the data into training and test sets (30% held out for testing)
    JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
    JavaRDD<LabeledPoint> trainingData = splits[0];
    JavaRDD<LabeledPoint> testData = splits[1];

    // Train a RandomForest model.
    // Empty categoricalFeaturesInfo indicates all features are continuous.
    Integer numClasses = 2;
    HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
    Integer numTrees = 3; // Use more in practice.
    String featureSubsetStrategy = "auto"; // Let the algorithm choose.
    String impurity = "gini";
    Integer maxDepth = 5;
    Integer maxBins = 32;
    Integer seed = 12345;

    final RandomForestModel model = RandomForest.trainClassifier(trainingData, numClasses,
        categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins,
        seed);

    // Evaluate model on test instances and compute test error
    JavaPairRDD<Double, Double> predictionAndLabel =
        testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
            @Override
            public Tuple2<Double, Double> call(LabeledPoint p) {
                return new Tuple2<>(model.predict(p.features()), p.label());
            }
        });
    Double testErr =
        1.0 * predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
            @Override
            public Boolean call(Tuple2<Double, Double> pl) {
                return !pl._1().equals(pl._2());
            }
        }).count() / testData.count();
    System.out.println("Test Error: " + testErr);
    System.out.println("Learned classification forest model:\n" + model.toDebugString());

    // Save and load model
    model.save(jsc.sc(), "target/tmp/myRandomForestClassificationModel");
    RandomForestModel sameModel = RandomForestModel.load(jsc.sc(),
        "target/tmp/myRandomForestClassificationModel");
    // $example off$

    jsc.stop();
}
Example 18
Source File: PSFilterFileLogger.java From gatk with BSD 3-Clause "New" or "Revised" License
@Override
public void logPrimaryReads(final JavaRDD<GATKRead> reads) {
    Utils.nonNull(reads, "Filter logging parameter reads cannot be null");
    metrics.PRIMARY_READS = reads.count();
}
Example 19
Source File: AbstractJavaEsSparkTest.java From elasticsearch-hadoop with Apache License 2.0
public void testNoResourceSpecified() throws Exception {
    JavaRDD<Map<String, Object>> rdd = JavaEsSpark.esRDD(sc).values();
    rdd.count();
}
Example 20
Source File: PSFilterFileLogger.java From gatk with BSD 3-Clause "New" or "Revised" License
@Override
public void logReadsAfterDeduplication(final JavaRDD<GATKRead> reads) {
    Utils.nonNull(reads, "Filter logging parameter reads cannot be null");
    metrics.READS_AFTER_DEDUPLICATION = reads.count();
}