Java Code Examples for org.apache.spark.api.java.JavaRDD#sample()

The following examples show how to use org.apache.spark.api.java.JavaRDD#sample(). You can vote up the examples you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: SampleAndTake.java    From SparkDemo with MIT License 6 votes vote down vote up
/**
 * Demonstrates random sampling of an RDD and prints the sampled elements.
 *
 * sample(withReplacement, fraction, seed):
 * withReplacement = true  -> an element may be drawn more than once;
 * withReplacement = false -> each element is drawn at most once;
 * fraction is the expected sampling ratio; seed initializes the RNG.
 */
static void sample(JavaSparkContext sc) {
	List<Integer> numbers = Arrays.asList(1, 2, 3, 7, 4, 5, 8);

	JavaRDD<Integer> numberRDD = sc.parallelize(numbers);

	// Sample roughly half the elements without replacement, seeding the
	// generator from the current time so each run draws differently.
	JavaRDD<Integer> sampled = numberRDD.sample(false, 0.5, System.currentTimeMillis());

	// TODO numberRDD.takeSample(false, 3);
	// TODO numberRDD.take(3)

	sampled.foreach(new VoidFunction<Integer>() {
		@Override
		public void call(Integer value) throws Exception {
			System.out.println(value);
		}
	});

	sc.close();
}
 
Example 2
Source File: TransformationRDD.java    From hui-bigdata-spark with Apache License 2.0 5 votes vote down vote up
/**
 * 元素采样 (element sampling).
 * true: elements may be sampled more than once (sampling with replacement).
 *
 * @since hui_project 1.0.0
 */
public void testSample() {
    SparkConf sparkConf = new SparkConf().setMaster("local[4]").setAppName("test");
    // JavaSparkContext implements Closeable; try-with-resources guarantees the
    // context is stopped even if checkResult throws. The original version never
    // stopped the context, leaking it on every invocation.
    try (JavaSparkContext sparkContext = new JavaSparkContext(sparkConf)) {
        JavaRDD<String> textRDD = sparkContext.textFile(FILE_PATH);
        // withReplacement = true: the same line may appear multiple times in the
        // sample; fraction 0.001 is the expected sampling ratio, 100 the RNG seed.
        JavaRDD<String> sample = textRDD.sample(true, 0.001, 100);
        checkResult(sample.collect());
    }
}
 
Example 3
Source File: TransformationRDD.java    From hui-bigdata-spark with Apache License 2.0 5 votes vote down vote up
/**
 * 元素采样 (element sampling).
 * false: elements cannot be sampled more than once (sampling without replacement).
 *
 * @since hui_project 1.0.0
 */
public void testSample2() {
    SparkConf sparkConf = new SparkConf().setMaster("local[4]").setAppName("test");
    // JavaSparkContext implements Closeable; try-with-resources guarantees the
    // context is stopped even if checkResult throws. The original version never
    // stopped the context, leaking it on every invocation.
    try (JavaSparkContext sparkContext = new JavaSparkContext(sparkConf)) {
        JavaRDD<String> textRDD = sparkContext.textFile(FILE_PATH);
        // withReplacement = false: each line appears at most once in the sample;
        // fraction 0.001 is the expected sampling ratio, 100 the RNG seed.
        JavaRDD<String> sample = textRDD.sample(false, 0.001, 100);
        checkResult(sample.collect());
    }
}
 
Example 4
Source File: TransformationRDDTest.java    From hui-bigdata-spark with Apache License 2.0 5 votes vote down vote up
/**
 * Samples RDD elements with replacement: with withReplacement = true
 * the same element may be drawn more than once.
 * @since hui_project 1.0.0
 */
@Test
public void testSample() {
    JavaRDD<String> lines = sparkContext.textFile(FILE_PATH);
    // sample(withReplacement = true, fraction = 0.001, seed = 100)
    JavaRDD<String> sampledLines = lines.sample(true, 0.001, 100);
    checkResult(sampledLines.collect());
}
 
Example 5
Source File: TransformationRDDTest.java    From hui-bigdata-spark with Apache License 2.0 5 votes vote down vote up
/**
 * Samples RDD elements without replacement: with withReplacement = false
 * each element is drawn at most once.
 * @since hui_project 1.0.0
 */
@Test
public void testSample2() {
    JavaRDD<String> lines = sparkContext.textFile(FILE_PATH);
    // sample(withReplacement = false, fraction = 0.001, seed = 100)
    JavaRDD<String> sampledLines = lines.sample(false, 0.001, 100);
    checkResult(sampledLines.collect());
}
 
Example 6
Source File: SilhouetteCoefficient.java    From oryx with Apache License 2.0 5 votes vote down vote up
/**
 * Caps the evaluation data at MAX_SAMPLE_SIZE elements.
 * If the RDD holds more than MAX_SAMPLE_SIZE vectors, returns a
 * without-replacement sample whose expected size is MAX_SAMPLE_SIZE;
 * otherwise the input RDD is returned unchanged.
 */
static JavaRDD<Vector> fetchSampleData(JavaRDD<Vector> evalData) {
  long total = evalData.count();
  if (total <= MAX_SAMPLE_SIZE) {
    return evalData;
  }
  // Expected fraction that shrinks the RDD to roughly MAX_SAMPLE_SIZE elements.
  double fraction = (double) MAX_SAMPLE_SIZE / total;
  return evalData.sample(false, fraction);
}
 
Example 7
Source File: JavaSVMWithSGDExample.java    From SparkDemo with MIT License 4 votes vote down vote up
// Trains a linear SVM via SGD on the libsvm sample data, evaluates it on a
// held-out split using area under the ROC curve, then round-trips the model
// through save/load.
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("JavaSVMWithSGDExample");
  SparkContext sc = new SparkContext(conf);
  // $example on$
  String path = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD();

  // Roughly 60% of the data becomes the training set (fixed seed for
  // reproducibility); everything not sampled is held out for testing.
  JavaRDD<LabeledPoint> training = data.sample(false, 0.6, 11L);
  training.cache();
  JavaRDD<LabeledPoint> test = data.subtract(training);

  // Fit the SVM with a fixed number of SGD iterations.
  int numIterations = 100;
  final SVMModel model = SVMWithSGD.train(training.rdd(), numIterations);

  // Clearing the threshold makes predict() return raw margins rather than
  // hard 0/1 labels; BinaryClassificationMetrics needs continuous scores.
  model.clearThreshold();

  // Pair each test point's raw score with its true label.
  JavaRDD<Tuple2<Object, Object>> scoreAndLabels = test.map(
    new Function<LabeledPoint, Tuple2<Object, Object>>() {
      public Tuple2<Object, Object> call(LabeledPoint point) {
        return new Tuple2<Object, Object>(model.predict(point.features()), point.label());
      }
    }
  );

  // Evaluate: area under the ROC curve over the held-out set.
  BinaryClassificationMetrics metrics =
    new BinaryClassificationMetrics(JavaRDD.toRDD(scoreAndLabels));
  double auROC = metrics.areaUnderROC();

  System.out.println("Area under ROC = " + auROC);

  // Persist the model to disk and load it back.
  model.save(sc, "target/tmp/javaSVMWithSGDModel");
  SVMModel sameModel = SVMModel.load(sc, "target/tmp/javaSVMWithSGDModel");
  // $example off$

  sc.stop();
}