Java Code Examples for org.apache.spark.api.java.JavaSparkContext.broadcast()

The following are Java code examples showing how to use the broadcast() method of the org.apache.spark.api.java.JavaSparkContext class. Broadcasting ships a read-only value to every executor once, so tasks can read it locally instead of receiving a copy with each task closure.
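
Before the project examples, here is a minimal, self-contained sketch of the pattern they all share (class and variable names here are illustrative, not taken from any of the projects below): a small driver-side map is broadcast once with JavaSparkContext.broadcast(), read inside a transformation via Broadcast.value(), and released with unpersist() when no longer needed.

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;

public class BroadcastSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("BroadcastSketch");
        JavaSparkContext jsc = new JavaSparkContext(conf);

        // Driver-side lookup table, shipped to each executor exactly once.
        Map<String, String> countryByCode = new HashMap<>();
        countryByCode.put("DE", "Germany");
        countryByCode.put("IN", "India");
        Broadcast<Map<String, String>> countriesBC = jsc.broadcast(countryByCode);

        JavaRDD<String> codes = jsc.parallelize(Arrays.asList("DE", "IN", "DE"));
        // Tasks read the executor-local copy via value().
        JavaRDD<String> names = codes.map(code -> countriesBC.value().get(code));
        System.out.println(names.collect());

        countriesBC.unpersist(); // release executor-side copies
        jsc.close();
    }
}
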
Example 1
Project: MinoanER   File: CNPNeighborsUnnormalized.java
/**
 * 
 * @param topKvalueCandidates the topK results per entity, acquired from value similarity
 * @param rawTriples1 the rdf triples of the first entity collection
 * @param rawTriples2 the rdf triples of the second entity collection
 * @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples1 and rawTriples2 files
 * @param entityIds1 the mapping of entity urls to entity ids, as it was used in blocking
 * @param entityIds2 the mapping of entity urls to entity ids for the second entity collection, as it was used in blocking
 * @param MIN_SUPPORT_THRESHOLD the minimum support threshold, below which, relations are discarded from top relations
 * @param K the K for topK candidate matches
 * @param N the N for topN rdf relations (and neighbors)
 * @param jsc the java spark context used to load files and broadcast variables
 * @return topK neighbor candidates per entity
 */
public JavaPairRDD<Integer, IntArrayList> run(JavaPairRDD<Integer,Int2FloatLinkedOpenHashMap> topKvalueCandidates, 
        JavaRDD<String> rawTriples1, 
        JavaRDD<String> rawTriples2,             
        String SEPARATOR, 
        JavaRDD<String> entityIds1, 
        JavaRDD<String> entityIds2, 
        float MIN_SUPPORT_THRESHOLD,
        int K,
        int N, 
        JavaSparkContext jsc) {
    
    Map<Integer,IntArrayList> inNeighbors = new HashMap<>(new RelationsRank().run(rawTriples1, SEPARATOR, entityIds1, MIN_SUPPORT_THRESHOLD, N, true, jsc));
    inNeighbors.putAll(new RelationsRank().run(rawTriples2, SEPARATOR, entityIds2, MIN_SUPPORT_THRESHOLD, N, false, jsc));
    
    Broadcast<Map<Integer,IntArrayList>> inNeighbors_BV = jsc.broadcast(inNeighbors);

    //JavaPairRDD<Integer, IntArrayList> topKneighborCandidates =  getTopKNeighborSims(topKvalueCandidates, inNeighbors_BV, K);        
    JavaPairRDD<Integer, IntArrayList> topKneighborCandidates =  getTopKNeighborSimsSUM(topKvalueCandidates, inNeighbors_BV, K);        
    return topKneighborCandidates;
}
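
The inNeighbors map is built on the driver from both collections and broadcast once, so getTopKNeighborSimsSUM can read it on every executor without the map being serialized into each task closure.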
 
Example 2
Project: MinoanER   File: CNPARCS.java
/**
 * 
 * @param topKvalueCandidates the topK results per entity, acquired from value similarity
 * @param rawTriples1 the rdf triples of the first entity collection
 * @param rawTriples2 the rdf triples of the second entity collection
 * @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples1 and rawTriples2 files
 * @param entityIds1 the mapping of entity urls to entity ids, as it was used in blocking
 * @param entityIds2 the mapping of entity urls to entity ids for the second entity collection, as it was used in blocking
 * @param MIN_SUPPORT_THRESHOLD the minimum support threshold, below which, relations are discarded from top relations
 * @param K the K for topK candidate matches
 * @param N the N for topN rdf relations (and neighbors)
 * @param jsc the java spark context used to load files and broadcast variables
 * @return topK neighbor candidates per entity
 */
public JavaPairRDD<Integer, Int2FloatLinkedOpenHashMap> run2(JavaPairRDD<Integer,Int2FloatLinkedOpenHashMap> topKvalueCandidates, 
        JavaRDD<String> rawTriples1, 
        JavaRDD<String> rawTriples2,             
        String SEPARATOR, 
        JavaRDD<String> entityIds1, 
        JavaRDD<String> entityIds2, 
        float MIN_SUPPORT_THRESHOLD,
        int K,
        int N, 
        JavaSparkContext jsc) {
    
    Map<Integer,IntArrayList> inNeighbors = new HashMap<>(new RelationsRank().run(rawTriples1, SEPARATOR, entityIds1, MIN_SUPPORT_THRESHOLD, N, true, jsc));
    inNeighbors.putAll(new RelationsRank().run(rawTriples2, SEPARATOR, entityIds2, MIN_SUPPORT_THRESHOLD, N, false, jsc));
    
    Broadcast<Map<Integer,IntArrayList>> inNeighbors_BV = jsc.broadcast(inNeighbors);             
    JavaPairRDD<Integer, Int2FloatLinkedOpenHashMap> topKneighborCandidates =  getTopKNeighborSimsSUMWithScores(topKvalueCandidates, inNeighbors_BV, K);        
    return topKneighborCandidates;
}
 
Example 3
Project: MinoanER   File: CNPNeighbors.java
/**
 * 
 * @param topKvalueCandidates the topK results per entity, acquired from value similarity
 * @param rawTriples1 the rdf triples of the first entity collection
 * @param rawTriples2 the rdf triples of the second entity collection
 * @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples1 and rawTriples2 files
 * @param entityIds1 the mapping of entity urls to entity ids, as it was used in blocking
 * @param entityIds2 the mapping of entity urls to entity ids for the second entity collection, as it was used in blocking
 * @param MIN_SUPPORT_THRESHOLD the minimum support threshold, below which, relations are discarded from top relations
 * @param K the K for topK candidate matches
 * @param N the N for topN rdf relations (and neighbors)
 * @param jsc the java spark context used to load files and broadcast variables
 * @return topK neighbor candidates per entity
 */
public JavaPairRDD<Integer, IntArrayList> run(JavaPairRDD<Integer,Int2FloatLinkedOpenHashMap> topKvalueCandidates, 
        JavaRDD<String> rawTriples1, 
        JavaRDD<String> rawTriples2,             
        String SEPARATOR, 
        JavaRDD<String> entityIds1, 
        JavaRDD<String> entityIds2, 
        float MIN_SUPPORT_THRESHOLD,
        int K,
        int N, 
        JavaSparkContext jsc) {
    
    Map<Integer,IntArrayList> inNeighbors = new HashMap<>(new RelationsRank().run(rawTriples1, SEPARATOR, entityIds1, MIN_SUPPORT_THRESHOLD, N, true, jsc));
    inNeighbors.putAll(new RelationsRank().run(rawTriples2, SEPARATOR, entityIds2, MIN_SUPPORT_THRESHOLD, N, false, jsc));
    
    Broadcast<Map<Integer,IntArrayList>> inNeighbors_BV = jsc.broadcast(inNeighbors);
    
    //JavaPairRDD<Tuple2<Integer, Integer>, Float> neighborSims = getNeighborSims(topKvalueCandidates, inNeighbors_BV);        
    //JavaPairRDD<Integer, IntArrayList> topKneighborCandidates =  getTopKNeighborSimsOld(neighborSims, K);        
    JavaPairRDD<Integer, IntArrayList> topKneighborCandidates =  getTopKNeighborSims(topKvalueCandidates, inNeighbors_BV, K);        
    return topKneighborCandidates;
}
 
Example 4
Project: BLASpark   File: OtherOperations.java
private static CoordinateMatrix GetD_COORD(CoordinateMatrix A, boolean inverseValues, JavaSparkContext jsc) {

    JavaRDD<MatrixEntry> rows = A.entries().toJavaRDD().cache();

    final Broadcast<Boolean> inverseValuesBC = jsc.broadcast(inverseValues);

    JavaRDD<MatrixEntry> LUEntries = rows.mapPartitions(new FlatMapFunction<Iterator<MatrixEntry>, MatrixEntry>() {
        @Override
        public Iterator<MatrixEntry> call(Iterator<MatrixEntry> matrixEntryIterator) throws Exception {
            List<MatrixEntry> newLowerEntries = new ArrayList<MatrixEntry>();

            boolean inverseValuesValue = inverseValuesBC.getValue().booleanValue();

            while (matrixEntryIterator.hasNext()) {
                MatrixEntry currentEntry = matrixEntryIterator.next();

                if (currentEntry.i() == currentEntry.j()) {
                    if (inverseValuesValue) {
                        newLowerEntries.add(new MatrixEntry(currentEntry.i(), currentEntry.j(), 1.0 / currentEntry.value()));
                    } else {
                        newLowerEntries.add(currentEntry);
                    }
                } else {
                    newLowerEntries.add(new MatrixEntry(currentEntry.i(), currentEntry.j(), 0.0));
                }
            }

            return newLowerEntries.iterator();
        }
    });

    CoordinateMatrix newMatrix = new CoordinateMatrix(LUEntries.rdd());

    return newMatrix;
}
 
Example 5
Project: Apache-Spark-2x-for-Java-Developers   File: MapSideJoinBroadcast.java
public static void main(String[] args) {

    SparkSession sparkSession = SparkSession.builder().master("local").appName("My App")
            .config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse").getOrCreate();

    JavaSparkContext jsc = new JavaSparkContext(sparkSession.sparkContext());

    JavaPairRDD<String, String> userIdToCityId = jsc.parallelizePairs(
            Arrays.asList(new Tuple2<String, String>("1", "101"), new Tuple2<String, String>("2", "102"),
                    new Tuple2<String, String>("3", "107"), new Tuple2<String, String>("4", "103"),
                    new Tuple2<String, String>("11", "101"), new Tuple2<String, String>("12", "102"),
                    new Tuple2<String, String>("13", "107"), new Tuple2<String, String>("14", "103")));

    JavaPairRDD<String, String> cityIdToCityName = jsc.parallelizePairs(
            Arrays.asList(new Tuple2<String, String>("101", "India"), new Tuple2<String, String>("102", "UK"),
                    new Tuple2<String, String>("103", "Germany"), new Tuple2<String, String>("107", "USA")));

    Broadcast<Map<String, String>> citiesBroadcasted = jsc.broadcast(cityIdToCityName.collectAsMap());

    JavaRDD<Tuple3<String, String, String>> joined = userIdToCityId.map(
            v1 -> new Tuple3<String, String, String>(v1._1(), v1._2(), citiesBroadcasted.value().get(v1._2())));

    System.out.println(joined.collect());
}
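
Because cityIdToCityName is small, it can be collected to the driver and broadcast, turning the join into a map-side lookup that avoids shuffling the larger userIdToCityId RDD.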
 
Example 6
Project: mutantpdb   File: App.java
public static void main( String[] args )
{
    Dataset<Row> mutations = DataProvider.getMutationsToStructures();
    List<String> pdbIds = mutations.select(col("pdbId"))
            .distinct().toJavaRDD().map(t -> t.getString(0)).collect();

    List<Row> broadcasted = mutations.select("pdbId", "chainId", "pdbAtomPos").collectAsList();
    SaprkUtils.stopSparkSession();

    JavaSparkContext sc = SaprkUtils.getSparkContext();
    Broadcast<List<Row>> bcmut = sc.broadcast(broadcasted);

    MmtfReader//.readSequenceFile("/pdb/2017/full", pdbIds, sc)
            .downloadMmtfFiles(Arrays.asList("5IRC"), sc)
            .flatMapToPair(new StructureToPolymerChains())
            .flatMapToPair(new AddResidueToKey(bcmut))
            .mapValues(new StructureToBioJava())
            .mapToPair(new FilterResidue())
            .filter(t -> t._2!=null).keys()
            .map(t -> t.replace(".", ","))
            .saveAsTextFile("/Users/yana/git/mutantpdb/src/main/resources/pdb_residues");
    sc.close();
}
 
Example 7
Project: bunsen   File: ValueSetUdfs.java
/**
 * Pushes an "in_valueset" UDF that uses the given {@link BroadcastableValueSets} for its content.
 *
 * @param spark the spark session
 * @param valueSets the valuesets to use in the UDF
 */
public static synchronized void pushUdf(SparkSession spark, BroadcastableValueSets valueSets) {

  JavaSparkContext ctx = new JavaSparkContext(spark.sparkContext());

  Broadcast<BroadcastableValueSets> broadcast = ctx.broadcast(valueSets);

  spark.udf()
      .register("in_valueset",
          new InValuesetUdf(broadcast),
          DataTypes.BooleanType);

  // Push the broadcast variable
  valueSetStack.push(broadcast);
}
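
Registering the UDF with the broadcast variable means every invocation of in_valueset in Spark SQL reads the executor-local copy of the value sets instead of serializing them with each query.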
 
Example 8
Project: BLASpark   File: OtherOperations.java
private static IndexedRowMatrix GetD_IRW(IndexedRowMatrix A, boolean inverseValues, JavaSparkContext jsc) {

    JavaRDD<IndexedRow> rows = A.rows().toJavaRDD().cache();

    final Broadcast<Boolean> inverseValuesBC = jsc.broadcast(inverseValues);
    JavaRDD<IndexedRow> LURows = rows.map(new Function<IndexedRow, IndexedRow>() {

        @Override
        public IndexedRow call(IndexedRow indexedRow) throws Exception {
            long index = indexedRow.index();
            DenseVector vect = indexedRow.vector().toDense();

            boolean inverseValuesValue = inverseValuesBC.getValue().booleanValue();

            double[] newValues = new double[vect.size()];

            for (int i = 0; i < vect.size(); i++) {
                if (i == index) {
                    if (inverseValuesValue) {
                        newValues[i] = 1.0 / vect.apply(i);
                    } else {
                        newValues[i] = vect.apply(i);
                    }
                } else {
                    newValues[i] = 0.0;
                }
            }

            DenseVector newVector = new DenseVector(newValues);

            return new IndexedRow(index, newVector);
        }
    });

    IndexedRowMatrix newMatrix = new IndexedRowMatrix(LURows.rdd());

    return newMatrix;
}
 
Example 9
Project: s3-inventory-usage-examples   File: ReducedRedundancyLocatorExampleMain.java
public static void main(String[] args) throws Exception{
    String srcBucketName;
    String scrBucketKey;
    String destBucketName;
    String destPrefix;
    ArgumentParser argumentParser = new ArgumentParser();
    AmazonS3 s3Client = new AmazonS3Client();

    try {
        BucketKey location = argumentParser.parseArguments(args);
        srcBucketName = location.getSrcBucket();
        scrBucketKey = location.getSrcKey();
        destBucketName = location.getDestBucket();
        destPrefix = location.getDestPrefix();
    } catch (ParseException e) {
        LOG.info(PARSE_ERROR_MSG);
        throw new IllegalArgumentException("Parser throw a parse Exception", e);
    }

    // Obtain the original manifest files
    InventoryManifestRetriever inventoryManifestRetriever =
            new InventoryManifestRetriever(s3Client, srcBucketName, scrBucketKey);
    InventoryManifest manifest = inventoryManifestRetriever.getInventoryManifest();

    // Check if the inventory report includes the StorageClass column
    String fileSchema = manifest.getFileSchema();
    String filterColumn = "storageClass";
    if (!StringUtils.containsIgnoreCase(fileSchema, filterColumn)) {
        throw new StorageClassNotIncludedException();
    }

    //Create Spark Context
    SparkConf sparkConf = new SparkConf();
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    Broadcast<CachedS3ClientFactory> clientFactory = sc.broadcast(new CachedS3ClientFactory());

    // Get the inventory report, split it into lines, parse each line to a POJO,
    // Filter, and write new csv file to S3
    JavaRDD<InventoryManifest.Locator> locatorRDD = sc.parallelize(manifest.getLocators());
    List<InventoryManifest.Locator> newLocatorList = locatorRDD
            .map(new InventoryReportLineRetriever(clientFactory, manifest))
            .flatMap(new InventoryReportMapper(manifest))
            .filter(new ReducedRedundancyStorageClassFilter())
            .mapPartitions(new WriteNewInventoryReportFunc(clientFactory, srcBucketName, manifest,
                    destBucketName, destPrefix))
            .collect();

    // Generate new manifest files including new locators, and send them back to S3
    new ManifestWriter(s3Client, destBucketName, destPrefix, srcBucketName, manifest)
            .writeManifest(newLocatorList);

    sc.close();
}
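
Broadcasting a CachedS3ClientFactory rather than a client is the notable choice here: the S3 client itself is presumably not serializable, so each executor obtains (and caches) its own client from the broadcast factory.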
 
Example 10
Project: MinoanER   File: RelationsRank.java
/**
 * Returns a map of the topN neighbors per entity, reversed so that each key entity points to the in-neighbors (values) that have the key entity as their top out-neighbor.
 * @param rawTriples the rdf triples of an entity collection
 * @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples file
 * @param entityIdsRDD the mapping of entity urls to entity ids, as it was used in blocking
 * @param MIN_SUPPORT_THRESHOLD the minimum support threshold, below which, relations are discarded from top relations
 * @param N topN neighbors per entity
 * @param positiveIds true when processing the first entity collection, false for the second
 * @param jsc the java spark context used to broadcast variables
 * @return a map of topN in-neighbors per entity
 */
public Map<Integer,IntArrayList> run(JavaRDD<String> rawTriples, String SEPARATOR, JavaRDD<String> entityIdsRDD, float MIN_SUPPORT_THRESHOLD, int N, boolean positiveIds, JavaSparkContext jsc) {
    //rawTriples.persist(StorageLevel.MEMORY_AND_DISK_SER());        
    
    //List<String> subjects = Utils.getEntityUrlsFromEntityRDDInOrder(rawTriples, SEPARATOR); //a list of (distinct) subject URLs, keeping insertion order (from original triples file)        
    //Object2IntOpenHashMap<String> subjects = Utils.getEntityIdsMapping(rawTriples, SEPARATOR);
    Object2IntOpenHashMap<String> entityIds = Utils.readEntityIdsMapping(entityIdsRDD, positiveIds);
    System.out.println("Found "+entityIds.size()+" entities in collection "+ (positiveIds?"1":"2"));
    
    long numEntitiesSquared = (long)entityIds.keySet().size();
    numEntitiesSquared *= numEntitiesSquared;
    
    Broadcast<Object2IntOpenHashMap<String>> entityIds_BV = jsc.broadcast(entityIds);
     
    JavaPairRDD<String,List<Tuple2<Integer, Integer>>> relationIndex = getRelationIndex(rawTriples, SEPARATOR, entityIds_BV); //a list of (s,o) for each predicate      
    
    //rawTriples.unpersist();        
    relationIndex.persist(StorageLevel.MEMORY_AND_DISK_SER());                
                    
    List<String> relationsRank = getRelationsRank(relationIndex, MIN_SUPPORT_THRESHOLD, numEntitiesSquared);      
    System.out.println("Top-5 relations in collection "+(positiveIds?"1: ":"2: ")+Arrays.toString(relationsRank.subList(0, Math.min(5,relationsRank.size())).toArray()));
    
    JavaPairRDD<Integer, IntArrayList> topOutNeighbors = getTopOutNeighborsPerEntity(relationIndex, relationsRank, N, positiveIds); //action
    
    relationIndex.unpersist(); 
    
    //reverse the outNeighbors, to get in neighbors
    Map<Integer, IntArrayList> inNeighbors =
    topOutNeighbors.flatMapToPair(x -> { //reverse the neighbor pairs from (in,[out1,out2,out3]) to (out1,in), (out2,in), (out3,in)
                List<Tuple2<Integer,Integer>> inNeighbs = new ArrayList<>();
                for (int outNeighbor : x._2()) {
                    inNeighbs.add(new Tuple2<>(outNeighbor, x._1()));
                }
                return inNeighbs.iterator();
            })
            .aggregateByKey(new IntOpenHashSet(), 
                    (x,y) -> {x.add(y); return x;}, 
                    (x,y) -> {x.addAll(y); return x;})
            .mapValues(x-> new IntArrayList(x))
            .collectAsMap();
    
    return inNeighbors;
}
 
Example 11
Project: bunsen   File: BroadcastableMappings.java
/**
 * Broadcast mappings stored in the given conceptMaps instance that match the given
 * conceptMapUris.
 *
 * @param conceptMaps the {@link ConceptMaps} instance with the content to broadcast
 * @param conceptMapUriToVersion map of the concept map URIs to broadcast to their versions.
 * @return a broadcast variable containing a mappings object usable in UDFs.
 */
public static Broadcast<BroadcastableMappings> broadcast(ConceptMaps conceptMaps,
    Map<String,String> conceptMapUriToVersion) {

  Map<String,ConceptMap> mapsToLoad = conceptMaps.getMaps()
      .collectAsList()
      .stream()
      .filter(conceptMap ->
          conceptMap.getVersion().equals(conceptMapUriToVersion.get(conceptMap.getUrl())))
      .collect(Collectors.toMap(ConceptMap::getUrl, Function.identity()));

  // Expand the concept maps to load and sort them so dependencies are before
  // their dependents in the list.
  List<String> sortedMapsToLoad = sortMapsToLoad(conceptMapUriToVersion.keySet(), mapsToLoad);

  // Since this is used to map from one system to another, we use only targets
  // that don't introduce inaccurate meanings. (For instance, we can't map
  // general condition code to a more specific type, since that is not
  // representative of the source data.)
  Dataset<Mapping> mappings = conceptMaps.getMappings(conceptMapUriToVersion)
      .filter("equivalence in ('equivalent', 'equals', 'wider', 'subsumes')");

  // Group mappings by their concept map URI
  Map<String, List<Mapping>> groupedMappings =  mappings
      .collectAsList()
      .stream()
      .collect(Collectors.groupingBy(Mapping::getConceptMapUri));

  Map<String, BroadcastableConceptMap> broadcastableMaps = new HashMap<>();

  for (String conceptMapUri: sortedMapsToLoad) {

    ConceptMap map = mapsToLoad.get(conceptMapUri);

    Set<String> children = getMapChildren(map);

    List<BroadcastableConceptMap> childMaps = children.stream()
        .map(child -> broadcastableMaps.get(child))
        .collect(Collectors.toList());

    BroadcastableConceptMap broadcastableConceptMap = new BroadcastableConceptMap(conceptMapUri,
        groupedMappings.getOrDefault(conceptMapUri, Collections.emptyList()),
        childMaps);

    broadcastableMaps.put(conceptMapUri, broadcastableConceptMap);
  }

  JavaSparkContext ctx = new JavaSparkContext(conceptMaps.getMaps()
      .sparkSession()
      .sparkContext());

  return ctx.broadcast(new BroadcastableMappings(broadcastableMaps));
}
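
Note that wrapping the session's SparkContext in a new JavaSparkContext, as done here and in several other examples, does not start a second context; JavaSparkContext is only a thin Java-friendly wrapper around the existing one.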
 
Example 12
Project: oryx2   File: ALSUpdate.java
@Override
public PMML buildModel(JavaSparkContext sparkContext,
                       JavaRDD<String> trainData,
                       List<?> hyperParameters,
                       Path candidatePath) {
  int features = (Integer) hyperParameters.get(0);
  double lambda = (Double) hyperParameters.get(1);
  double alpha = (Double) hyperParameters.get(2);
  double epsilon = Double.NaN;
  if (logStrength) {
    epsilon = (Double) hyperParameters.get(3);
  }
  Preconditions.checkArgument(features > 0);
  Preconditions.checkArgument(lambda >= 0.0);
  Preconditions.checkArgument(alpha > 0.0);
  if (logStrength) {
    Preconditions.checkArgument(epsilon > 0.0);
  }

  JavaRDD<String[]> parsedRDD = trainData.map(MLFunctions.PARSE_FN);
  parsedRDD.cache();

  Map<String,Integer> userIDIndexMap = buildIDIndexMapping(parsedRDD, true);
  Map<String,Integer> itemIDIndexMap = buildIDIndexMapping(parsedRDD, false);

  log.info("Broadcasting ID-index mappings for {} users, {} items",
           userIDIndexMap.size(), itemIDIndexMap.size());

  Broadcast<Map<String,Integer>> bUserIDToIndex = sparkContext.broadcast(userIDIndexMap);
  Broadcast<Map<String,Integer>> bItemIDToIndex = sparkContext.broadcast(itemIDIndexMap);

  JavaRDD<Rating> trainRatingData = parsedToRatingRDD(parsedRDD, bUserIDToIndex, bItemIDToIndex);
  trainRatingData = aggregateScores(trainRatingData, epsilon);
  ALS als = new ALS()
      .setRank(features)
      .setIterations(iterations)
      .setLambda(lambda)
      .setCheckpointInterval(5);
  if (implicit) {
    als = als.setImplicitPrefs(true).setAlpha(alpha);
  }

  RDD<Rating> trainingRatingDataRDD = trainRatingData.rdd();
  trainingRatingDataRDD.cache();
  MatrixFactorizationModel model = als.run(trainingRatingDataRDD);
  trainingRatingDataRDD.unpersist(false);

  bUserIDToIndex.unpersist();
  bItemIDToIndex.unpersist();

  parsedRDD.unpersist();

  Broadcast<Map<Integer,String>> bUserIndexToID = sparkContext.broadcast(invertMap(userIDIndexMap));
  Broadcast<Map<Integer,String>> bItemIndexToID = sparkContext.broadcast(invertMap(itemIDIndexMap));

  PMML pmml = mfModelToPMML(model,
                            features,
                            lambda,
                            alpha,
                            epsilon,
                            implicit,
                            logStrength,
                            candidatePath,
                            bUserIndexToID,
                            bItemIndexToID);
  unpersist(model);

  bUserIndexToID.unpersist();
  bItemIndexToID.unpersist();

  return pmml;
}
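
Each broadcast variable is explicitly unpersisted as soon as the model no longer needs it, freeing executor memory between the training and PMML-export phases.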
 
Example 13
Project: oryx2   File: ALSUpdate.java
@Override
public double evaluate(JavaSparkContext sparkContext,
                       PMML model,
                       Path modelParentPath,
                       JavaRDD<String> testData,
                       JavaRDD<String> trainData) {

  JavaRDD<String[]> parsedTestRDD = testData.map(MLFunctions.PARSE_FN);
  parsedTestRDD.cache();

  Map<String,Integer> userIDToIndex = buildIDIndexOneWayMap(model, parsedTestRDD, true);
  Map<String,Integer> itemIDToIndex = buildIDIndexOneWayMap(model, parsedTestRDD, false);

  log.info("Broadcasting ID-index mappings for {} users, {} items",
           userIDToIndex.size(), itemIDToIndex.size());

  Broadcast<Map<String,Integer>> bUserIDToIndex = sparkContext.broadcast(userIDToIndex);
  Broadcast<Map<String,Integer>> bItemIDToIndex = sparkContext.broadcast(itemIDToIndex);

  JavaRDD<Rating> testRatingData = parsedToRatingRDD(parsedTestRDD, bUserIDToIndex, bItemIDToIndex);
  double epsilon = Double.NaN;
  if (logStrength) {
    epsilon = Double.parseDouble(AppPMMLUtils.getExtensionValue(model, "epsilon"));
  }
  testRatingData = aggregateScores(testRatingData, epsilon);

  MatrixFactorizationModel mfModel =
      pmmlToMFModel(sparkContext, model, modelParentPath, bUserIDToIndex, bItemIDToIndex);

  parsedTestRDD.unpersist();

  double eval;
  if (implicit) {
    double auc = Evaluation.areaUnderCurve(sparkContext, mfModel, testRatingData);
    log.info("AUC: {}", auc);
    eval = auc;
  } else {
    double rmse = Evaluation.rmse(mfModel, testRatingData);
    log.info("RMSE: {}", rmse);
    eval = -rmse;
  }
  unpersist(mfModel);

  bUserIDToIndex.unpersist();
  bItemIDToIndex.unpersist();

  return eval;
}
 
Example 14
Project: BLASpark   File: L3.java
private static IndexedRowMatrix SCAL_IRW(double alpha, IndexedRowMatrix A, IndexedRowMatrix B, JavaSparkContext jsc) {

    JavaRDD<IndexedRow> rows = A.rows().toJavaRDD();

    final Broadcast<Double> alphaBC = jsc.broadcast(alpha);

    JavaRDD<IndexedRow> newRows = rows.map(new Function<IndexedRow, IndexedRow>() {
        @Override
        public IndexedRow call(IndexedRow indexedRow) throws Exception {

            double alphaValue = alphaBC.getValue().doubleValue();

            long index = indexedRow.index();

            double[] values = new double[indexedRow.vector().size()];

            for (int i = 0; i < values.length; i++) {
                values[i] = indexedRow.vector().apply(i) * alphaValue;
            }

            return new IndexedRow(index, new DenseVector(values));
        }
    });

    B = new IndexedRowMatrix(newRows.rdd());

    return B;
}
 
Example 15
Project: BLASpark   File: L3.java
private static BlockMatrix SCAL_BCK(double alpha, BlockMatrix A, BlockMatrix B, JavaSparkContext jsc) {

    JavaRDD<Tuple2<Tuple2<Object, Object>, Matrix>> blocks = A.blocks().toJavaRDD();

    final Broadcast<Double> alphaBC = jsc.broadcast(alpha);

    JavaRDD<Tuple2<Tuple2<Object, Object>, Matrix>> newBlocks = blocks.map(new Function<Tuple2<Tuple2<Object, Object>, Matrix>, Tuple2<Tuple2<Object, Object>, Matrix>>() {
        @Override
        public Tuple2<Tuple2<Object, Object>, Matrix> call(Tuple2<Tuple2<Object, Object>, Matrix> block) throws Exception {

            double alphaBCRec = alphaBC.getValue().doubleValue();

            Integer row = (Integer) block._1._1;
            Integer col = (Integer) block._1._2;
            Matrix matrixBlock = block._2;

            for (int i = 0; i < matrixBlock.numRows(); i++) {
                for (int j = 0; j < matrixBlock.numCols(); j++) {
                    matrixBlock.update(i, j, matrixBlock.apply(i, j) * alphaBCRec);
                }
            }

            return new Tuple2<Tuple2<Object, Object>, Matrix>(new Tuple2<Object, Object>(row, col), matrixBlock);
        }
    });

    B = new BlockMatrix(newBlocks.rdd(), A.rowsPerBlock(), A.colsPerBlock());

    return B;
}
 
Example 16
Project: BLASpark   File: L2.java
private static DenseVector DGEMV_IRW(IndexedRowMatrix matrix, double alpha, DenseVector vector, JavaSparkContext jsc) {

    final Broadcast<DenseVector> BC = jsc.broadcast(vector);
    final Broadcast<Double> AlphaBC = jsc.broadcast(alpha);

    JavaRDD<IndexedRow> rows = matrix.rows().toJavaRDD();
    List<Tuple2<Long, Double>> returnValues = rows.mapToPair(new PairFunction<IndexedRow, Long, Double>() {

        @Override
        public Tuple2<Long, Double> call(IndexedRow row) {
            DenseVector vect = BC.getValue();
            double alphaBCRec = AlphaBC.getValue().doubleValue();

            DenseVector tmp = row.vector().copy().toDense();

            BLAS.scal(alphaBCRec, tmp);

            return new Tuple2<Long, Double>(row.index(), BLAS.dot(tmp, vect));
        }

    }).collect();

    double[] stockArr = new double[returnValues.size()];

    for (Tuple2<Long, Double> item : returnValues) {
        stockArr[item._1().intValue()] = item._2();
    }

    return new DenseVector(stockArr);
}
 
Example 17
Project: BLASpark   File: L2.java
private static IndexedRowMatrix DGER_IRW(IndexedRowMatrix A, double alpha, DenseVector x, DenseVector y, JavaSparkContext jsc) {

    final Broadcast<Double> AlphaBC = jsc.broadcast(alpha);
    final Broadcast<DenseVector> BCVector_X = jsc.broadcast(x);
    final Broadcast<DenseVector> BCVector_Y = jsc.broadcast(y);

    JavaRDD<IndexedRow> rows = A.rows().toJavaRDD();

    JavaRDD<IndexedRow> resultRows = rows.map(new Function<IndexedRow, IndexedRow>() {
        @Override
        public IndexedRow call(IndexedRow indexedRow) throws Exception {

            DenseVector Vector_X = BCVector_X.getValue();
            DenseVector Vector_Y = BCVector_Y.getValue();
            double alphaBCRec = AlphaBC.getValue().doubleValue();

            DenseVector row = indexedRow.vector().toDense();

            double[] resultArray = new double[row.size()];

            long i = indexedRow.index();

            for (int j = 0; j < Vector_Y.size(); j++) {
                resultArray[j] = alphaBCRec * Vector_X.apply((int) i) * Vector_Y.apply(j) + row.apply(j);
            }

            DenseVector result = new DenseVector(resultArray);

            return new IndexedRow(indexedRow.index(), result);
        }
    });

    IndexedRowMatrix newMatrix = new IndexedRowMatrix(resultRows.rdd(), x.size(), y.size());

    return newMatrix;
}
 
Example 18
Project: ViraPipe   File: HDFSWriter.java
public HDFSWriter(JavaSparkContext sc, String inputpath, boolean broadcastHeader) throws IOException {

    if (broadcastHeader) {
        SAMFileHeader header = SAMHeaderReader.readSAMHeaderFrom(new Path(inputpath), sc.hadoopConfiguration());
        // The broadcast header is local to this block in the original source;
        // a complete writer would keep a reference to it for the tasks that write records.
        final Broadcast<SAMFileHeader> headerBc = sc.broadcast(header);
    }
}
 
Example 19
Project: bunsen   File: ValueSets.java
/**
 * Returns a dataset with the values for each element in the map of uri to version.
 *
 * @param uriToVersion a map of value set URI to the version to load
 * @return a dataset of values for the given URIs and versions.
 */
public Dataset<Value> getValues(Map<String,String> uriToVersion) {

  JavaSparkContext context = new JavaSparkContext(this.spark.sparkContext());

  Broadcast<Map<String,String>> broadcastUrisToVersion = context.broadcast(uriToVersion);

  return this.values.filter((FilterFunction<Value>) value -> {

    String latestVersion = broadcastUrisToVersion.getValue().get(value.getValueSetUri());

    return latestVersion != null && latestVersion.equals(value.getValueSetVersion());
  });
}
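
Examples 19 and 20 apply the same pattern to two datasets: broadcast a small URI-to-version map and consult it inside a FilterFunction, so the filter runs without a join against a second dataset.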
 
Example 20
Project: bunsen   File: ConceptMaps.java
/**
 * Returns a dataset with the mappings for each uri and version.
 *
 * @param uriToVersion a map of concept map URI to the version to load
 * @return a dataset of mappings for the given URIs and versions.
 */
public Dataset<Mapping> getMappings(Map<String,String> uriToVersion) {

  JavaSparkContext context = new JavaSparkContext(this.spark.sparkContext());

  Broadcast<Map<String,String>> broadcastMaps = context.broadcast(uriToVersion);

  return this.mappings.filter((FilterFunction<Mapping>) mapping -> {

    String latestVersion = broadcastMaps.getValue().get(mapping.getConceptMapUri());

    return latestVersion != null && latestVersion.equals(mapping.getConceptMapVersion());
  });
}