org.apache.spark.Accumulator Java Examples

The following examples show how to use org.apache.spark.Accumulator.
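
Before the project examples, here is a minimal, self-contained sketch of the basic workflow: create a named Accumulator on the driver, add to it from tasks inside an action, and read it back on the driver once that action has run. The class name and the input data are illustrative only; the sketch targets the same pre-2.0 Java API used throughout the examples below.

import java.util.Arrays;

import org.apache.spark.Accumulator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class AccumulatorSketch {
    public static void main(String[] args) {
        // Local context, just for the sketch
        SparkConf conf = new SparkConf().setAppName("AccumulatorSketch").setMaster("local[2]");
        JavaSparkContext jsc = new JavaSparkContext(conf);

        // Create a named integer accumulator on the driver
        Accumulator<Integer> evenCount = jsc.accumulator(0, "EVEN_COUNT");

        // Tasks may only add to the accumulator; only the driver may read its value
        jsc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6)).foreach(n -> {
            if (n % 2 == 0) {
                evenCount.add(1);
            }
        });

        // foreach is an action, so the accumulated value is available here
        System.out.println("Even numbers seen: " + evenCount.value());

        jsc.close();
    }
}
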
Example #1
Source File: EntitySalienceFeatureExtractorSpark.java    From ambiverse-nlu with Apache License 2.0
/**
 * Extract a DataFrame ready for training or testing.
 * @param jsc the Java Spark context
 * @param documents the input documents, each wrapped in a SCAS
 * @param sqlContext the SQL context used to create the DataFrame
 * @return a DataFrame with one row (docId, entityId, label, features) per training instance
 * @throws ResourceInitializationException if a UIMA resource cannot be initialized
 */
public DataFrame extract(JavaSparkContext jsc, JavaRDD<SCAS> documents, SQLContext sqlContext) throws ResourceInitializationException {
    Accumulator<Integer> TOTAL_DOCS = jsc.accumulator(0, "TOTAL_DOCS");
    Accumulator<Integer> SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "SALIENT_ENTITY_INSTANCES");
    Accumulator<Integer> NON_SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "NON_SALIENT_ENTITY_INSTANCES");

    TrainingSettings trainingSettings = getTrainingSettings();

    FeatureExtractor fe = new NYTEntitySalienceFeatureExtractor();
    final int featureVectorSize = FeatureSetFactory.createFeatureSet(TrainingSettings.FeatureExtractor.ENTITY_SALIENCE).getFeatureVectorSize();

    JavaRDD<TrainingInstance> trainingInstances =
            documents.flatMap(s -> {
                TOTAL_DOCS.add(1);
                return fe.getTrainingInstances(s.getJCas(),
                        trainingSettings.getFeatureExtractor(),
                        trainingSettings.getPositiveInstanceScalingFactor());
            });

    StructType schema = new StructType(new StructField[]{
            new StructField("docId", DataTypes.StringType, false, Metadata.empty() ),
            new StructField("entityId", DataTypes.StringType, false, Metadata.empty() ),
            new StructField("label", DataTypes.DoubleType, false, Metadata.empty() ),
            new StructField("features", new VectorUDT(), false, Metadata.empty())
    });

    JavaRDD<Row> withFeatures = trainingInstances.map(ti -> {
        if (ti.getLabel() == 1.0) {
            SALIENT_ENTITY_INSTANCES.add(1);
        } else {
            NON_SALIENT_ENTITY_INSTANCES.add(1);
        }
        Vector vei = FeatureValueInstanceUtils.convertToSparkMLVector(ti, featureVectorSize);
        return RowFactory.create(ti.getDocId(), ti.getEntityId(), ti.getLabel(), vei);
    });

    return sqlContext.createDataFrame(withFeatures, schema);
}
 
Example #2
Source File: AccumulatorValue.java    From SparkDemo with MIT License
public static void main(String[] args) {
	JavaSparkContext sc = SparkUtils.getLocalSparkContext(AccumulatorValue.class);

	// Create the accumulator
	final Accumulator<Integer> accumulator = sc.accumulator(0, "My Accumulator");

	List<Integer> list = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
	JavaRDD<Integer> listRDD = sc.parallelize(list);

	listRDD.foreach(new VoidFunction<Integer>() {
		@Override
		public void call(Integer n) throws Exception {
			accumulator.add(n);
			// The value cannot be read inside a task; calling value() here would throw an exception
			// System.out.println(accumulator.value());
		}
	});

	// The accumulator value can only be read on the driver
	System.out.println(accumulator.value());
	
	try {
		// Sleep for a long time so the Spark web UI (http://192.168.68.1:4040) can be inspected;
		// the long literal avoids silent int overflow in the multiplication
		Thread.sleep(5000L * 5000 * 5000);
	} catch (Exception e) {
		e.printStackTrace();
	}

	sc.close();
}
 
Example #3
Source File: RddChannel.java    From rheem with Apache License 2.0
public void accept(JavaRDD<?> rdd, SparkExecutor sparkExecutor) throws RheemException {
    if (this.isMarkedForInstrumentation() && !this.isRddCached()) {
        final Accumulator<Integer> accumulator = sparkExecutor.sc.accumulator(0);
        this.rdd = rdd.filter(dataQuantum -> {
            accumulator.add(1);
            return true;
        });
        this.accumulator = accumulator;
    } else {
        this.rdd = rdd;
    }
}
 
Example #4
Source File: UtilHelpers.java    From hudi with Apache License 2.0
public static int handleErrors(JavaSparkContext jsc, String instantTime, JavaRDD<WriteStatus> writeResponse) {
  Accumulator<Integer> errors = jsc.accumulator(0);
  writeResponse.foreach(writeStatus -> {
    if (writeStatus.hasErrors()) {
      errors.add(1);
      LOG.error(String.format("Error processing records :writeStatus:%s", writeStatus.getStat().toString()));
    }
  });
  if (errors.value() == 0) {
    LOG.info(String.format("Table imported into hoodie with %s instant time.", instantTime));
    return 0;
  }
  LOG.error(String.format("Import failed with %d errors.", errors.value()));
  return -1;
}
 
Example #5
Source File: CountCumSum.java    From deeplearning4j with Apache License 2.0
public void cumSumWithinPartition() {

    // Accumulator to get the max of the cumulative sum in each partition
    final Accumulator<Counter<Integer>> maxPerPartitionAcc =
            sc.accumulator(new Counter<Integer>(), new MaxPerPartitionAccumulator());
    // Partition mapping to fold within partition
    foldWithinPartitionRDD = sentenceCountRDD
            .mapPartitionsWithIndex(new FoldWithinPartitionFunction(maxPerPartitionAcc), true).cache();
    actionForMapPartition(foldWithinPartitionRDD);

    // Broadcast the counter (partition index : sum of count) to all workers
    broadcastedMaxPerPartitionCounter = sc.broadcast(maxPerPartitionAcc.value());
}
 
Example #6
Source File: TextPipeline.java    From deeplearning4j with Apache License 2.0
public Accumulator<Counter<String>> getWordFreqAcc() {
    if (wordFreqAcc != null) {
        return wordFreqAcc;
    } else {
        throw new IllegalStateException("wordFreqAcc not set at TextPipeline.");
    }
}
 
Example #7
Source File: CountFunction.java    From deeplearning4j with Apache License 2.0
public CountFunction(@NonNull Broadcast<VectorsConfiguration> vectorsConfigurationBroadcast,
                @NonNull Broadcast<VoidConfiguration> voidConfigurationBroadcast,
                @NonNull Accumulator<Counter<Long>> accumulator, boolean fetchLabels) {
    this.accumulator = accumulator;
    this.fetchLabels = fetchLabels;
    this.voidConfigurationBroadcast = voidConfigurationBroadcast;
    this.vectorsConfigurationBroadcast = vectorsConfigurationBroadcast;
}
 
Example #8
Source File: EntitySalienceFeatureExtractorSpark.java    From ambiverse-nlu with Apache License 2.0
/**
 * Extract features from a set of documents. The documents are already annotated with entities.
 * @param jsc the Java Spark context
 * @param documents the input documents, each wrapped in a SCAS
 * @return an RDD of LabeledPoints, one per training instance
 * @throws ResourceInitializationException if a UIMA resource cannot be initialized
 */
public JavaRDD<LabeledPoint> extract (JavaSparkContext jsc, JavaRDD<SCAS> documents) throws ResourceInitializationException {
    Accumulator<Integer> TOTAL_DOCS = jsc.accumulator(0, "TOTAL_DOCS");
    Accumulator<Integer> SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "SALIENT_ENTITY_INSTANCES");
    Accumulator<Integer> NON_SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "NON_SALIENT_ENTITY_INSTANCES");

    TrainingSettings trainingSettings = getTrainingSettings();

    FeatureExtractor fe = new NYTEntitySalienceFeatureExtractor();
    final int featureVectorSize = FeatureSetFactory.createFeatureSet(trainingSettings.getFeatureExtractor()).getFeatureVectorSize();

    JavaRDD<TrainingInstance> trainingInstances =
            documents.flatMap(s -> {
                TOTAL_DOCS.add(1);
                return fe.getTrainingInstances(s.getJCas(),
                        trainingSettings.getFeatureExtractor(),
                        trainingSettings.getPositiveInstanceScalingFactor());
            });

    // Create a LabeledPoint for each TrainingInstance
    JavaRDD<LabeledPoint> labeledPoints = trainingInstances
            .map(ti -> {
                if (ti.getLabel() == 1.0) {
                    SALIENT_ENTITY_INSTANCES.add(1);
                } else {
                    NON_SALIENT_ENTITY_INSTANCES.add(1);
                }
                return FeatureValueInstanceUtils.convertToSparkMLLabeledPoint(ti, featureVectorSize);
            });

    return labeledPoints;
}
 
Example #9
Source File: EntitySalienceAnnotatorAndFeatureExtractorSpark.java    From ambiverse-nlu with Apache License 2.0
/**
 * Annotate the documents with entities and extract features from the set of documents.
 * @param jsc the Java Spark context
 * @param documents the input documents, each wrapped in a SCAS
 * @return an RDD of LabeledPoints, one per training instance
 * @throws ResourceInitializationException if a UIMA resource cannot be initialized
 */
public JavaRDD<LabeledPoint> extract (JavaSparkContext jsc, JavaRDD<SCAS> documents) throws ResourceInitializationException {
    Accumulator<Integer> TOTAL_DOCS = jsc.accumulator(0, "TOTAL_DOCS");
    Accumulator<Integer> SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "SALIENT_ENTITY_INSTANCES");
    Accumulator<Integer> NON_SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "NON_SALIENT_ENTITY_INSTANCES");

    TrainingSettings trainingSettings = getTrainingSettings();

    final SparkSerializableAnalysisEngine ae = EntitySalienceFactory.createEntitySalienceEntityAnnotator(trainingSettings.getEntitySalienceEntityAnnotator());
    FeatureExtractor fe = new NYTEntitySalienceFeatureExtractor();
    final int featureVectorSize = FeatureSetFactory.createFeatureSet(trainingSettings.getFeatureExtractor()).getFeatureVectorSize();

    JavaRDD<TrainingInstance> trainingInstances =
            documents
                    .map(s -> {
                        TOTAL_DOCS.add(1);
                        Logger tmpLogger = LoggerFactory.getLogger(EntitySalienceFeatureExtractorSpark.class);
                        String docId = JCasUtil.selectSingle(s.getJCas(), DocumentMetaData.class).getDocumentId();
                        tmpLogger.info("Processing document {}.", docId);
                        // Before processing the document through the Disambiguation Pipeline,
                        // add the AIDA settings to each document.
                        SparkUimaUtils.addSettingsToJCas(s.getJCas(),
                                trainingSettings.getDocumentCoherent(),
                                trainingSettings.getDocumentConfidenceThreshold());
                        return ae.process(s);
                    })
                    .flatMap(s -> fe.getTrainingInstances(s.getJCas(),
                            trainingSettings.getFeatureExtractor(),
                            trainingSettings.getPositiveInstanceScalingFactor()));

    // Create a LabeledPoint for each TrainingInstance
    JavaRDD<LabeledPoint> labeledPoints = trainingInstances
            .map(ti -> {
                if (ti.getLabel() == 1.0) {
                    SALIENT_ENTITY_INSTANCES.add(1);
                } else {
                    NON_SALIENT_ENTITY_INSTANCES.add(1);
                }
                return FeatureValueInstanceUtils.convertToSparkMLLabeledPoint(ti, featureVectorSize);
            });

    return labeledPoints;
}
 
Example #10
Source File: EntitySalienceAnnotatorAndFeatureExtractorSpark.java    From ambiverse-nlu with Apache License 2.0
/**
 * Extract a DataFrame ready for training or testing.
 * @param jsc the Java Spark context
 * @param documents the input documents, each wrapped in a SCAS
 * @param sqlContext the SQL context used to create the DataFrame
 * @return a DataFrame with one row (docId, entity, label, features) per training instance
 * @throws ResourceInitializationException if a UIMA resource cannot be initialized
 */
public DataFrame extract(JavaSparkContext jsc, JavaRDD<SCAS> documents, SQLContext sqlContext) throws ResourceInitializationException {
    Accumulator<Integer> TOTAL_DOCS = jsc.accumulator(0, "TOTAL_DOCS");
    Accumulator<Integer> SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "SALIENT_ENTITY_INSTANCES");
    Accumulator<Integer> NON_SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "NON_SALIENT_ENTITY_INSTANCES");

    TrainingSettings trainingSettings = getTrainingSettings();

    final SparkSerializableAnalysisEngine ae = EntitySalienceFactory.createEntitySalienceEntityAnnotator(trainingSettings.getEntitySalienceEntityAnnotator());
    FeatureExtractor fe = new NYTEntitySalienceFeatureExtractor();
    final int featureVectorSize = FeatureSetFactory.createFeatureSet(TrainingSettings.FeatureExtractor.ENTITY_SALIENCE).getFeatureVectorSize();

    JavaRDD<TrainingInstance> trainingInstances =
            documents
                    .map(s -> {
                        TOTAL_DOCS.add(1);
                        Logger tmpLogger = LoggerFactory.getLogger(EntitySalienceFeatureExtractorSpark.class);
                        String docId = JCasUtil.selectSingle(s.getJCas(), DocumentMetaData.class).getDocumentId();
                        tmpLogger.info("Processing document {}.", docId);
                        // Before processing the document through the Disambiguation Pipeline,
                        // add the AIDA settings to each document.
                        SparkUimaUtils.addSettingsToJCas(s.getJCas(),
                                trainingSettings.getDocumentCoherent(),
                                trainingSettings.getDocumentConfidenceThreshold());
                        return ae.process(s);
                    })
                    .flatMap(s -> fe.getTrainingInstances(s.getJCas(),
                            trainingSettings.getFeatureExtractor(),
                            trainingSettings.getPositiveInstanceScalingFactor()));

    StructType schema = new StructType(new StructField[]{
            new StructField("docId", DataTypes.StringType, false, Metadata.empty() ),
            new StructField("entity", DataTypes.StringType, false, Metadata.empty() ),
            new StructField("label", DataTypes.DoubleType, false, Metadata.empty() ),
            new StructField("features", new VectorUDT(), false, Metadata.empty())
    });

    JavaRDD<Row> withFeatures = trainingInstances.map(ti -> {
        if (ti.getLabel() == 1.0) {
            SALIENT_ENTITY_INSTANCES.add(1);
        } else {
            NON_SALIENT_ENTITY_INSTANCES.add(1);
        }
        Vector vei = FeatureValueInstanceUtils.convertToSparkMLVector(ti, featureVectorSize);
        return RowFactory.create(ti.getDocId(), ti.getEntityId(), ti.getLabel(), vei);
    });

    return sqlContext.createDataFrame(withFeatures, schema);
}
 
Example #11
Source File: UserVisitAnalyze.java    From UserActionAnalyzePlatform with Apache License 2.0
public static void main(String[] args)
{
    args=new String[]{"1"};
    /**
     * Build the Spark context
     */
    SparkConf conf=new SparkConf().setAppName(Constants.APP_NAME_SESSION).setMaster("local[3]");
    JavaSparkContext context=new JavaSparkContext(conf);
    SQLContext sc=getSQLContext(context.sc());
    // Generate mock data
    mock(context,sc);

    // Get the corresponding DAO component
    TaskDao dao= DaoFactory.getTaskDao();
    // Get the task id from the externally passed arguments
    Long taskId=ParamUtils.getTaskIdFromArgs(args);
    // Look up the corresponding task in the database
    Task task=dao.findTaskById(taskId);
    JSONObject jsonObject=JSONObject.parseObject(task.getTaskParam());

    // Get the sessions within the specified range
    JavaRDD<Row> sessionRangeDate=getActionRDD(sc,jsonObject);

    // A new method is added here, mainly to do the mapping
    JavaPairRDD<String,Row> sessionInfoPairRDD=getSessonInfoPairRDD(sessionRangeDate);
    // Persist RDDs that are reused
    sessionInfoPairRDD.persist(StorageLevel.DISK_ONLY());
    // The two RDDs above are reused later
    // Aggregate by session
    JavaPairRDD<String,String> sesssionAggregateInfoRDD=aggregateBySessionId(sc,sessionInfoPairRDD);

    // Filter the RDD by the given conditions
    // Refactored: filter and aggregate the statistics in the same pass
    Accumulator<String> sessionAggrStatAccumulator=context.accumulator("",new SessionAggrStatAccumulator());


    // An action must run before the accumulator is read, otherwise its value will be empty
    JavaPairRDD<String,String> filteredSessionRDD=filterSessionAndAggrStat(sesssionAggregateInfoRDD,jsonObject,sessionAggrStatAccumulator);
    // Persist RDDs that are reused
    filteredSessionRDD.persist(StorageLevel.DISK_ONLY());
    // Get the common RDD with full information for the sessions matching the filter conditions
    JavaPairRDD<String, Row> commonFullClickInfoRDD=getFilterFullInfoRDD(filteredSessionRDD,sessionInfoPairRDD);

    // Persist RDDs that are reused
    commonFullClickInfoRDD.persist(StorageLevel.DISK_ONLY());
    // Session aggregate statistics: compute the proportion of sessions in each visit-duration and step-length range
    /**
     * Refactoring approach:
     * 1. Do not generate any new RDDs
     * 2. Do not traverse the session data a second time
     * 3. The visit duration and step length of a session can be computed directly while the data is being aggregated
     * 4. By building on the existing aggregation and adding a custom Accumulator, everything can be solved in a single pass
     * Practical guidelines for Spark development:
     * 1. Generate as few RDDs as possible
     * 2. Apply as few operators to an RDD as possible; where possible, implement several requirements inside one operator
     * 3. Use as few shuffle operators as possible, e.g. groupByKey, reduceByKey, sortByKey
     *         Shuffles cause massive disk I/O and severely degrade performance;
     *         operators with a shuffle can be dramatically slower than operators without one,
     *         and shuffles easily cause data skew, which is a performance killer
     * 4. Whatever the feature, performance comes first
     *         In big data projects performance matters most: the nature of big data means programs run relatively slowly,
     *         and ignoring performance can make a job run for many hours, which is a disaster for the user experience.
     */

    /**
     * Use the countByKey operator to implement the random extraction feature
     */
    randomExtractSession(taskId,filteredSessionRDD,sessionInfoPairRDD);

    // An action operator must run before the Accumulator is read, otherwise the value is empty; the random extraction above already triggers one
    //filteredSessionRDD.count();
    // Compute the proportion for each session range and write it to MySQL
    calculateAndPersist(sessionAggrStatAccumulator.value(),taskId);
    // Get the top 10 popular categories
    List<Tuple2<CategorySortKey,String>> top10CategoryIds=getTop10Category(taskId,commonFullClickInfoRDD);
    // Get the top 10 sessions by clicks for each of the top categories
    getTop10Session(context,taskId,sessionInfoPairRDD,top10CategoryIds);
    // Close the Spark context
    context.close();
}
 
Example #12
Source File: UpdateWordFreqAccumulatorFunction.java    From deeplearning4j with Apache License 2.0
public UpdateWordFreqAccumulatorFunction(Broadcast<List<String>> stopWords,
                Accumulator<Counter<String>> wordFreqAcc) {
    this.wordFreqAcc = wordFreqAcc;
    this.stopWords = stopWords;
}
 
Example #13
Source File: FoldWithinPartitionFunction.java    From deeplearning4j with Apache License 2.0
public FoldWithinPartitionFunction(Accumulator<Counter<Integer>> maxPartitionAcc) {
    this.maxPerPartitionAcc = maxPartitionAcc;
}
 
Example #14
Source File: ExtraCountFunction.java    From deeplearning4j with Apache License 2.0
public ExtraCountFunction(@NonNull Accumulator<ExtraCounter<Long>> accumulator, boolean fetchLabels) {
    this.accumulator = accumulator;
    this.fetchLabels = fetchLabels;
}