/*
 * Copyright (c) 2014, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */

package com.cloudera.oryx.example.batch;

import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

import com.cloudera.oryx.api.TopicProducer;
import com.cloudera.oryx.api.batch.BatchLayerUpdate;

/**
 * Input keys are ignored. Values are treated as lines of space-separated text. The job
 * counts, for each word, the number of distinct other words that co-occur in some line
 * of text in the input. These are written as a "MODEL" update, where the word-count mapping
 * is written as a JSON string.
 */
public final class ExampleBatchLayerUpdate implements BatchLayerUpdate<String,String,String> {

  public void runUpdate(JavaSparkContext sparkContext,
                        long timestamp,
                        JavaPairRDD<String,String> newData,
                        JavaPairRDD<String,String> pastData,
                        String modelDirString,
                        TopicProducer<String,String> modelUpdateTopic) throws IOException {
    JavaPairRDD<String,String> allData = pastData == null ? newData : newData.union(pastData);
    String modelString;
    try {
      modelString = new ObjectMapper().writeValueAsString(countDistinctOtherWords(allData));
    } catch (JsonProcessingException jpe) {
      throw new IOException(jpe);
    modelUpdateTopic.send("MODEL", modelString);

  public static Map<String,Integer> countDistinctOtherWords(JavaPairRDD<String,String> data) {
    return data.values().flatMapToPair(line -> {
      Set<String> distinctTokens = new HashSet<>(Arrays.asList(line.split(" ")));
      return distinctTokens.stream().flatMap(a ->
        distinctTokens.stream().filter(b -> !a.equals(b)).map(b -> new Tuple2<>(a, b))
    }).distinct().mapValues(a -> 1).reduceByKey((c1, c2) -> c1 + c2).collectAsMap();