Java Code Examples for org.apache.spark.api.java.JavaPairRDD#cache()

The following examples show how to use org.apache.spark.api.java.JavaPairRDD#cache(). You can go to the original project or source file by following the link above each example.
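Before the project examples, here is a minimal, self-contained sketch of the typical usage pattern: cache() marks a JavaPairRDD for in-memory storage so that several actions can reuse it without recomputing its lineage. The class name, master setting, and toy data below are illustrative, not taken from any of the projects that follow.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class JavaPairRddCacheSketch {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaPairRddCacheSketch").setMaster("local[*]");
    JavaSparkContext sc = new JavaSparkContext(conf);

    // Build a small <word, 1> pair RDD
    JavaPairRDD<String, Integer> pairs = sc
        .parallelize(Arrays.asList("a", "b", "a"))
        .mapToPair(w -> new Tuple2<>(w, 1));

    // cache() is shorthand for persist(StorageLevel.MEMORY_ONLY);
    // the pairs are computed once and reused by the two actions below
    pairs.cache();

    System.out.println("distinct keys: " + pairs.countByKey().size());
    System.out.println("total pairs:   " + pairs.count());

    sc.stop();
  }
}

Note that cache() is lazy: nothing is actually stored until the first action (here countByKey) forces the RDD to be computed.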
Example 1
Source File: PageOneStepConvertRateSpark.java    From BigDataPlatform with GNU General Public License v3.0
public static void main(String[] args) {
	// 1. Build the Spark context
	SparkConf conf = new SparkConf()
			.setAppName(Constants.SPARK_APP_NAME_PAGE);
	SparkUtils.setMaster(conf);
	
	JavaSparkContext sc = new JavaSparkContext(conf);
	SQLContext sqlContext = SparkUtils.getSQLContext(sc.sc());
	
	// 2. Generate mock data
	SparkUtils.mockData(sc, sqlContext);  
	
	// 3. Look up the task and get its parameters
	Long taskid = ParamUtils.getTaskIdFromArgs(args, Constants.SPARK_LOCAL_TASKID_PAGE);
	
	ITaskDAO taskDAO = DAOFactory.getTaskDAO();
	Task task = taskDAO.findById(taskid);
	if(task == null) {
		System.out.println(new Date() + ": cannot find this task with id [" + taskid + "].");  
		return;
	}
	
	JSONObject taskParam = JSONObject.parseObject(task.getTaskParam());
	
	// 4. Query the user action data within the specified date range
	JavaRDD<Row> actionRDD = SparkUtils.getActionRDDByDateRange(
			sqlContext, taskParam);
	
	// Map the user action data into <sessionid, action> pairs.
	// Page-visit slices have to be generated from each individual session's actions;
	// slices built across session boundaries are meaningless.
	// For example, suppose user A visits pages 3 and 5, user B visits pages 4 and 6,
	// and the specified page-flow filter is 3->4->7.
	// We cannot stitch A's page 3 and B's page 4 together into a 3->4 slice and count it,
	// so slice generation must be done at session granularity.
	
	JavaPairRDD<String, Row> sessionid2actionRDD = getSessionid2actionRDD(actionRDD);
	sessionid2actionRDD = sessionid2actionRDD.cache(); // persist(StorageLevel.MEMORY_ONLY)
	
	// Run a groupByKey on the <sessionid, action> RDD,
	// because we need each session's full set of actions before we can generate its slices
	JavaPairRDD<String, Iterable<Row>> sessionid2actionsRDD = sessionid2actionRDD.groupByKey();
	
	// The core step: generate each session's single-hop page slices and match them against the target page flow
	JavaPairRDD<String, Integer> pageSplitRDD = generateAndMatchPageSplit(
			sc, sessionid2actionsRDD, taskParam);
	Map<String, Long> pageSplitPvMap = pageSplitRDD.countByKey();

	// If the specified page flow is 3,2,5,8,6, the pageSplitPvMap obtained here
	// holds the PV counts for the slices 3->2, 2->5, 5->8 and 8->6
	Long startPagePv = getStartPagePv(taskParam, sessionid2actionsRDD);
	
	// Compute the conversion rate of each page slice in the target page flow
	Map<String, Double> convertRateMap = computePageSplitConvertRate(
			taskParam, pageSplitPvMap, startPagePv);
	
	// Persist the page-slice conversion rates
	persistConvertRate(taskid, convertRateMap);  
}
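As the inline comment above notes, cache() is shorthand for persist(StorageLevel.MEMORY_ONLY). If the session-level action data were too large to hold purely in memory, a different storage level could be substituted. A minimal sketch of that variant follows; the MEMORY_AND_DISK choice is an assumption, not part of the original job.

import org.apache.spark.storage.StorageLevel;

// Same intent as sessionid2actionRDD.cache(), but partitions that do not fit
// in memory are spilled to disk instead of being recomputed on reuse.
sessionid2actionRDD = sessionid2actionRDD.persist(StorageLevel.MEMORY_AND_DISK());

Like cache(), persist() returns the RDD itself, so it can be reassigned exactly as in the original code.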
 
Example 2
Source File: JavaLatentDirichletAllocationExample.java    From SparkDemo with MIT License
public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaLatentDirichletAllocationExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    // Load and parse the data
    String path = "data/mllib/sample_lda_data.txt";
    JavaRDD<String> data = jsc.textFile(path);
    JavaRDD<Vector> parsedData = data.map(
      new Function<String, Vector>() {
        public Vector call(String s) {
          String[] sarray = s.trim().split(" ");
          double[] values = new double[sarray.length];
          for (int i = 0; i < sarray.length; i++) {
            values[i] = Double.parseDouble(sarray[i]);
          }
          return Vectors.dense(values);
        }
      }
    );
    // Index documents with unique IDs
    JavaPairRDD<Long, Vector> corpus =
      JavaPairRDD.fromJavaRDD(parsedData.zipWithIndex().map(
        new Function<Tuple2<Vector, Long>, Tuple2<Long, Vector>>() {
          public Tuple2<Long, Vector> call(Tuple2<Vector, Long> doc_id) {
            return doc_id.swap();
          }
        }
      )
    );
    corpus.cache();

    // Cluster the documents into three topics using LDA
    LDAModel ldaModel = new LDA().setK(3).run(corpus);

    // Output topics. Each is a distribution over words (matching word count vectors)
    System.out.println("Learned topics (as distributions over vocab of " + ldaModel.vocabSize()
      + " words):");
    Matrix topics = ldaModel.topicsMatrix();
    for (int topic = 0; topic < 3; topic++) {
      System.out.print("Topic " + topic + ":");
      for (int word = 0; word < ldaModel.vocabSize(); word++) {
        System.out.print(" " + topics.apply(word, topic));
      }
      System.out.println();
    }

    ldaModel.save(jsc.sc(),
      "target/org/apache/spark/JavaLatentDirichletAllocationExample/LDAModel");
    DistributedLDAModel sameModel = DistributedLDAModel.load(jsc.sc(),
      "target/org/apache/spark/JavaLatentDirichletAllocationExample/LDAModel");
    // $example off$

    jsc.stop();
  }
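On Java 8 and later, the anonymous Function classes above can be collapsed into lambdas and method references. The following is a sketch of an equivalent form of the parsing, indexing, and caching steps, not how the original example is written; it additionally assumes a java.util.Arrays import.

// Parse each line into a dense vector, index documents with unique IDs,
// and cache the corpus before the iterative LDA training reuses it.
JavaRDD<Vector> parsedData = data.map(
    s -> Vectors.dense(Arrays.stream(s.trim().split(" "))
        .mapToDouble(Double::parseDouble).toArray()));

JavaPairRDD<Long, Vector> corpus =
    parsedData.zipWithIndex().mapToPair(Tuple2::swap).cache();

The behavior is identical; caching the corpus matters here because LDA makes multiple passes over it during training.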
 
Example 3
Source File: ThresholdClusterer.java    From ensemble-clustering with MIT License
	@Override
	public SparkClusterResult doCluster(DataSet ds) {
		// SparkDataSet needs to be passed in
		SparkDataSet rdd = (SparkDataSet)ds;
		
		// cache dataset in memory
//		rdd.getRDD().cache();
		
		distFunc = new DistanceFunction(this.typeDefs);
		ClusterFactory clusterFactory = new ClusterFactory(this.typeDefs, this.onlineUpdate);
		
		log.info("Starting threshold clusterer with threshold {}", threshold);
		
		// TODO look at using a reduce function 
		// Idea is the first step is a map<Instance, List<Instance>> that converts each instance to a single "cluster"
		// second step is a reduce where input is a List<Instances> and produces a List<Instances>
		// this step would merge clusters within threshold
		
		// fetch the <id, Instance> pair RDD and cache it for the best-cluster assignment below
		JavaPairRDD<String, Instance> instances = rdd.getRDD();
		instances.cache();
		
		// convert each instance into a singleton cluster
		JavaRDD<Map<String, Instance>> singletons = rdd.getRDD().map( new InstanceToClusterFunction(clusterFactory) );
		//singletons.cache();
		
		log.info("Generated initial singleton clusters");
		
		// merge clusters together
		Map<String, Instance> clusters = singletons.reduce( new AggregateClusterFunction(distFunc, threshold) );
		
		log.info("Merging clusters completed with {} clusters", clusters.size());
		
		// find the best cluster for each instance
		JavaPairRDD<String, Instance> bestCluster = instances.mapToPair( new BestClusterFunction(distFunc, clusters) );
		
		log.info("Output results");
		
		if (clusters != null && centroidsPath != null) {
			rdd.getContext()
				.parallelize(new ArrayList<Instance>(clusters.values()))
				.saveAsTextFile(centroidsPath);
		}
		
		if (bestCluster != null && clustersPath != null) {
			bestCluster.saveAsTextFile(clustersPath);
		}
		
		log.info("Threshold clusterer completed");
		
		// return the cluster membership rdd
		return new SparkClusterResult(bestCluster);
	}
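If the cached instances RDD is no longer needed once the results above have been written, its storage can be released explicitly. This is a sketch of an optional cleanup step placed before the return, not something the original doCluster does.

// Release the executor memory held by the cached instance pair RDD once
// saveAsTextFile has materialized the best-cluster assignments.
instances.unpersist();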