Java Code Examples for org.apache.spark.api.java.JavaRDD

The following examples show how to use org.apache.spark.api.java.JavaRDD. They are extracted from open source projects.
Example 1
Project: ViraPipe   File: InterleaveMulti.java
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {

    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
    JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);

    zips.foreach( splits ->  {
      Path path = splits._1.getPath();
      FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), splits._1);
      FastqRecordReader fqreader2 = new FastqRecordReader(new Configuration(), splits._2);
      writeInterleavedSplits(fqreader, fqreader2, new Configuration(), splitDir+"/"+path.getParent().getName()+"_"+splits._1.getStart()+".fq");
    });
  }
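
A minimal caller sketch for the method above (the paths and split length are hypothetical, and writeInterleavedSplits is assumed to be available as in the ViraPipe project):

JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("interleave"));
FileSystem fs = FileSystem.get(sc.hadoopConfiguration());
FileStatus forward = fs.getFileStatus(new Path("reads_1.fq")); // hypothetical forward-read file
FileStatus reverse = fs.getFileStatus(new Path("reads_2.fq")); // hypothetical reverse-read file
interleaveSplitFastq(forward, reverse, "splits", 4000000, sc);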
 
Example 2
Project: gcp   File: Spark2Streaming.java
public static void main(String[] args) throws InterruptedException {
  SparkConf sc = new SparkConf().setAppName("POC-Streaming");
  try(JavaStreamingContext jsc = new JavaStreamingContext(sc, new Duration(2000))) {
    //JavaDStream<SampleXML> records = jsc.textFileStream("input/").map(new ParseXML());
    //textFileStream processes files line by line, so each XML document would have to fit on one line; the wholeTextFiles alternative below avoids that

    JavaRDD<String> files = jsc.sparkContext().wholeTextFiles("input/").map(tuple -> tuple._2());
    Queue<JavaRDD<String>> rddQueue = new LinkedList<>();
    rddQueue.add(files);
    JavaDStream<String> records = jsc.queueStream(rddQueue);

    records.foreachRDD(rdd -> System.out.printf("Number of XMLs: %d\n", rdd.count()));

    jsc.start();
    jsc.awaitTermination();
  }
}
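
Note that queueStream consumes one RDD from the queue per batch interval, so the single wholeTextFiles RDD above is drained in exactly one micro-batch. A sketch that feeds several batches instead (hypothetical data):

Queue<JavaRDD<String>> rddQueue = new LinkedList<>();
for (int i = 0; i < 3; i++) {
  rddQueue.add(jsc.sparkContext().parallelize(Arrays.asList("<doc id=\"" + i + "\"/>")));
}
JavaDStream<String> records = jsc.queueStream(rddQueue); // one queued RDD per 2000 ms batch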
 
Example 3
Project: ViraPipe   File: HDFSWriter.java
public static JavaRDD<SAMRecord> setPartitionHeaders(final JavaRDD<SAMRecord> reads, final Broadcast<SAMFileHeader> header) {
    return reads.mapPartitions(records -> {
        //set the broadcast SAM header once per partition, before the records are consumed
        BAMHeaderOutputFormat.setHeader(header.getValue());
        return records;
    });
}
 
Example 4
Project: MinoanER   File: CNPARCS.java
/**
 * 
 * @param topKvalueCandidates the topK results per entity, acquired from value similarity
 * @param rawTriples1 the rdf triples of the first entity collection
 * @param rawTriples2 the rdf triples of the second entity collection
 * @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples1 and rawTriples2 files
 * @param entityIds1 the mapping of entity urls to entity ids, as it was used in blocking
 * @param entityIds2
 * @param MIN_SUPPORT_THRESHOLD the minimum support threshold, below which, relations are discarded from top relations
 * @param K the K for topK candidate matches
 * @param N the N for topN rdf relations (and neighbors)
 * @param jsc the java spark context used to load files and broadcast variables
 * @return topK neighbor candidates per entity
 */
public JavaPairRDD<Integer, IntArrayList> run(JavaPairRDD<Integer,Int2FloatLinkedOpenHashMap> topKvalueCandidates, 
        JavaRDD<String> rawTriples1, 
        JavaRDD<String> rawTriples2,             
        String SEPARATOR, 
        JavaRDD<String> entityIds1, 
        JavaRDD<String> entityIds2, 
        float MIN_SUPPORT_THRESHOLD,
        int K,
        int N, 
        JavaSparkContext jsc) {
    
    Map<Integer,IntArrayList> inNeighbors = new HashMap<>(new RelationsRank().run(rawTriples1, SEPARATOR, entityIds1, MIN_SUPPORT_THRESHOLD, N, true, jsc));
    inNeighbors.putAll(new RelationsRank().run(rawTriples2, SEPARATOR, entityIds2, MIN_SUPPORT_THRESHOLD, N, false, jsc));
    
    Broadcast<Map<Integer,IntArrayList>> inNeighbors_BV = jsc.broadcast(inNeighbors);             
    JavaPairRDD<Integer, IntArrayList> topKneighborCandidates =  getTopKNeighborSimsSUM(topKvalueCandidates, inNeighbors_BV, K);        
    return topKneighborCandidates;
}
 
Example 5
Project: Apache-Spark-2x-for-Java-Developers   File: SparkWordCount.java
public static void main(String[] args) throws Exception {
	System.out.println(System.getProperty("hadoop.home.dir"));
	String inputPath = args[0];
	String outputPath = args[1];
	FileUtils.deleteQuietly(new File(outputPath));

	JavaSparkContext sc = new JavaSparkContext("local", "sparkwordcount");

	JavaRDD<String> rdd = sc.textFile(inputPath);

	JavaPairRDD<String, Integer> counts = rdd
			.flatMap(x -> Arrays.asList(x.split(" ")).iterator())
			.mapToPair(x -> new Tuple2<>(x, 1))
			.reduceByKey((x, y) -> x + y);

	counts.saveAsTextFile(outputPath);
	sc.close();
}
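
When the distinct words fit comfortably on the driver, the mapToPair/reduceByKey pair can be replaced by the countByValue action; a sketch (note that the resulting map is collected to the driver rather than saved as a distributed file):

	Map<String, Long> counts = rdd
			.flatMap(x -> Arrays.asList(x.split(" ")).iterator())
			.countByValue();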
 
Example 6
Project: oryx2   File: ALSUpdate.java
/**
 * Implementation which splits based solely on time. It will return approximately
 * the earliest {@link #getTestFraction()} of input, ordered by timestamp, as new training
 * data and the rest as test data.
 */
@Override
protected Pair<JavaRDD<String>,JavaRDD<String>> splitNewDataToTrainTest(JavaRDD<String> newData) {
  // Rough approximation; assumes timestamps are fairly evenly distributed
  StatCounter maxMin = newData.mapToDouble(line -> MLFunctions.TO_TIMESTAMP_FN.call(line).doubleValue()).stats();

  long minTime = (long) maxMin.min();
  long maxTime = (long) maxMin.max();
  log.info("New data timestamp range: {} - {}", minTime, maxTime);
  long approxTestTrainBoundary = (long) (maxTime - getTestFraction() * (maxTime - minTime));
  log.info("Splitting at timestamp {}", approxTestTrainBoundary);

  JavaRDD<String> newTrainData = newData.filter(
      line -> MLFunctions.TO_TIMESTAMP_FN.call(line) < approxTestTrainBoundary);
  JavaRDD<String> testData = newData.filter(
      line -> MLFunctions.TO_TIMESTAMP_FN.call(line) >= approxTestTrainBoundary);

  return new Pair<>(newTrainData, testData);
}
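
newData is traversed three times above (once for the stats, once per filter), so its lineage is recomputed on each pass unless the RDD is cached; a minimal sketch of the usual remedy:

  newData.cache();     // keep the parsed input around for the stats pass and both filters
  // ... compute stats, newTrainData and testData as above ...
  newData.unpersist(); // release it once the train/test split has been materialized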
 
Example 7
Project: incubator-sdap-mudrod   File: RDDUtil.java
/**
 * getAllWordsInDoc: Extracts all unique terms from all docs.
 *
 * @param docwordRDD Pair RDD, each key is a doc, and value is term list extracted from
 *                   that doc.
 * @return unique term list
 */
public static JavaRDD<String> getAllWordsInDoc(JavaPairRDD<String, List<String>> docwordRDD) {
  JavaRDD<String> wordRDD = docwordRDD.values().flatMap(new FlatMapFunction<List<String>, String>() {
    /**
     *
     */
    private static final long serialVersionUID = 1L;

    @Override
    public Iterator<String> call(List<String> list) {
      return list.iterator();
    }
  }).distinct();

  return wordRDD;
}
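
Under Spark 2.x, where FlatMapFunction.call returns an Iterator, the anonymous class above collapses to a method reference; an equivalent sketch:

public static JavaRDD<String> getAllWordsInDoc(JavaPairRDD<String, List<String>> docwordRDD) {
  return docwordRDD.values()
      .flatMap(List::iterator) // flatten each doc's term list
      .distinct();             // keep each term once
}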
 
Example 8
Project: oryx2   File: KMeansUpdate.java
/**
 * @param sparkContext    active Spark Context
 * @param trainData       training data on which to build a model
 * @param hyperParameters ordered list of hyper parameter values to use in building model
 * @param candidatePath   directory where additional model files can be written
 * @return a {@link PMML} representation of a model trained on the given data
 */
@Override
public PMML buildModel(JavaSparkContext sparkContext,
                       JavaRDD<String> trainData,
                       List<?> hyperParameters,
                       Path candidatePath) {
  int numClusters = (Integer) hyperParameters.get(0);
  Preconditions.checkArgument(numClusters > 1);
  log.info("Building KMeans Model with {} clusters", numClusters);

  JavaRDD<Vector> trainingData = parsedToVectorRDD(trainData.map(MLFunctions.PARSE_FN));
  KMeansModel kMeansModel = KMeans.train(trainingData.rdd(), numClusters, maxIterations,
                                         numberOfRuns, initializationStrategy);

  return kMeansModelToPMML(kMeansModel, fetchClusterCountsFromModel(trainingData, kMeansModel));
}
 
Example 9
Project: movie-recommender   File: ModelFinder.java
public TrainedModel findBestModel(JavaRDD<Rating> ratings) {
    double[] weights = {6, 2, 2};
    JavaRDD<Rating>[] randomRatings = ratings.randomSplit(weights, 0L);
    JavaRDD<Rating> trainingRdd = randomRatings[0];
    JavaRDD<Rating> validationRdd = randomRatings[1];
    JavaRDD<Rating> testRdd = randomRatings[2];
    TrainConfig trainConfig = findBestTrainingParameters(trainingRdd, validationRdd);

    TrainedModel model = ModelFactory.create(trainingRdd, testRdd, trainConfig.getRankNr(),
            trainConfig.getIterationsNr());
    logger.info("best model have RMSE = " + model.getError());

    return model;
}
 
Example 10
Project: MinoanER   File: BlockFilteringAdvanced.java
public JavaPairRDD<Integer,IntArrayList> parseBlockCollection(JavaRDD<String> blockingInput) {
    System.out.println("Parsing the blocking collection...");
    return blockingInput
        .map(line -> line.split("\t")) //split to [blockId, [entityIds]]
        .filter(line -> line.length == 2) //only keep lines of this format
        .mapToPair(pair -> {                
            int blockId = Integer.parseInt(pair[0]);
            String[] entities = pair[1].replaceFirst(";", "").split("#");
            if (entities == null || entities.length == 0) {
                return null;
            }
            List<Integer> outputEntities = new ArrayList<>(); //possible (but not really probable) cause of OOM (memory errors) if huge blocks exist
            for (String entity : entities) {
                if (entity.isEmpty()) continue; //in case the last entityId finishes with '#'
                Integer entityId = Integer.parseInt(entity);			                    
                outputEntities.add(entityId);
            }
            return new Tuple2<>(blockId, new IntArrayList(outputEntities.stream().mapToInt(i->i).toArray()));
        })
        .filter(x -> x != null);
}
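
For reference, each input line is expected to be a tab-separated pair of a block id and a '#'-delimited entity id list (optionally prefixed with ';'). A hypothetical line and its parse, assuming a JavaSparkContext jsc:

JavaRDD<String> blockingInput = jsc.parallelize(Arrays.asList("42\t;123#456#789"));
// parseBlockCollection(blockingInput) yields a single pair: (42, IntArrayList[123, 456, 789])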
 
Example 11
Project: ViraPipe   File: InterleaveMulti.java
private static void splitFastq(FileStatus fst, String fqPath, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
  Path fqpath = new Path(fqPath);
  String fqname = fqpath.getName();
  String[] ns = fqname.split("\\.");
  //TODO: Handle also compressed files
  List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);

  JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);

  splitRDD.foreach( split ->  {

    FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split);
    writeFastqFile(fqreader, new Configuration(), splitDir + "/split_" + split.getStart() + "." + ns[1]);

   });
}
 
Example 12
Project: ViraPipe   File: NormalizeRDD.java
private static JavaRDD<String> getUniqueKmers(JavaPairRDD<Text, SequencedFragment> fastqRDD, int k) {
  JavaRDD<String> rdd = fastqRDD.mapPartitions(records -> {

    HashSet<String> umer_set = new HashSet<String>();
    while (records.hasNext()) {
      Tuple2<Text, SequencedFragment> fastq = records.next();
      String seq = fastq._2.getSequence().toString();
      //HashSet<String> umer_in_seq = new HashSet<String>();
      for (int i = 0; i <= seq.length() - k; i++) { // slide over every k-length window of the read
        String kmer = seq.substring(i, i + k);
        umer_set.add(kmer);
      }
    }
    return umer_set.iterator();
  });

  JavaRDD<String> umersRDD = rdd.distinct();
  //umersRDD.sortBy(s -> s, true, 4);
  return umersRDD;
}
 
Example 13
Project: ViraPipe   File: SamToFastq.java
private static JavaPairRDD<Text, SequencedFragment> mapSAMRecordsToFastq(JavaRDD<SAMRecord> bamRDD) {

    JavaPairRDD<Text, SequencedFragment> fastqRDD = bamRDD.mapToPair(read -> {

      String name = read.getReadName();
      if(read.getReadPairedFlag()){
        if(read.getFirstOfPairFlag())
          name = name+"/1";
        if(read.getSecondOfPairFlag())
          name = name+"/2";
      }

      //TODO: check values
      Text t = new Text(name);
      SequencedFragment sf = new SequencedFragment();
      sf.setSequence(new Text(read.getReadString()));
      sf.setQuality(new Text(read.getBaseQualityString()));

      return new Tuple2<Text, SequencedFragment>(t, sf);
    });
    return fastqRDD;
  }
 
Example 14
Project: rdf2x   File: InstanceFilterTest.java
@Test
public void testFilterWithoutIgnoreTypes() {
    InstanceFilterConfig config = new InstanceFilterConfig()
            .setIgnoreOtherTypes(false)
            .setTypes(Arrays.asList("http://example.com/a/type", "http://example.com/b/type"));
    InstanceFilter filter = new InstanceFilter(config);
    JavaRDD<Instance> result = filter.filter(testRDD, typeIndex);

    List<Instance> rdd = new ArrayList<>();

    Instance one = new Instance();
    one.addType(typeIndex.getIndex("http://example.com/a/type"));
    one.setUri("http://example.com/one");
    rdd.add(one);

    Instance two = new Instance();
    two.addType(typeIndex.getIndex("http://example.com/b/type"));
    two.addType(typeIndex.getIndex("http://example.com/c/type"));
    two.setUri("http://example.com/two");
    rdd.add(two);

    JavaRDD<Instance> expected = jsc().parallelize(rdd);

    assertRDDEquals("Expected instances without restricted types were filtered out", expected, result);
}
 
Example 15
Project: MinoanER   File: Utils.java
/**
 * Maps an entity url to its entity id, which is also used by blocking.
 * @deprecated use {@link #readEntityIdsMapping(JavaRDD)} instead, to get the entity mappings used in blocking
 * @param rawTriples
 * @param SEPARATOR
 * @return a map from an entity url to its entity id, which is also used by blocking.
 */
public static Object2IntOpenHashMap<String> getEntityIdsMapping(JavaRDD<String> rawTriples, String SEPARATOR) {        
    LinkedHashSet<String> subjectsSet =                  
        new LinkedHashSet<>(rawTriples
        .map(line -> line.split(SEPARATOR)[0])
        .collect()                
        ); //convert list to set (to remove duplicates)
    
    Object2IntOpenHashMap<String> result = new Object2IntOpenHashMap<>(subjectsSet.size());
    result.defaultReturnValue(-1);
    int index = 0;
    for (String subject : subjectsSet) {
        result.put(subject, index++);
    }
    return result;
}
 
Example 16
Project: bunsen   File: ConceptMaps.java
/**
 * Returns the latest versions of a given set of concept maps.
 *
 * @param urls a set of URLs to retrieve the latest version for, or null to load them all.
 * @param includeExperimental flag to include concept maps marked as experimental
 *
 * @return a map of concept map URLs to the latest version for them.
 */
public Map<String,String> getLatestVersions(final Set<String> urls,
    boolean includeExperimental) {

  // Reduce by the concept map URI to return only the latest version
  // per concept map. Spark's provided max aggregation function
  // only works on numeric types, so we jump into RDDs and perform
  // the reduce by hand.
  JavaRDD<UrlAndVersion> changes = this.conceptMaps.select(col("url"),
      col("version"),
      col("experimental"))
      .toJavaRDD()
      .filter(row -> (urls == null || urls.contains(row.getString(0)))
          && (includeExperimental || row.isNullAt(2) || !row.getBoolean(2)))
      .mapToPair(row -> new Tuple2<>(row.getString(0), row.getString(1)))
      .reduceByKey((leftVersion, rightVersion) ->
          leftVersion.compareTo(rightVersion) > 0 ? leftVersion : rightVersion)
      .map(tuple -> new UrlAndVersion(tuple._1, tuple._2));

  return this.spark.createDataset(changes.rdd(), URL_AND_VERSION_ENCODER)
      .collectAsList()
      .stream()
      .collect(Collectors.toMap(UrlAndVersion::getUrl,
          UrlAndVersion::getVersion));
}
 
Example 17
Project: oryx2   File: DunnIndex.java
/**
 * @param evalData data for evaluation
 * @return the Dunn Index of a given clustering
 *  (https://en.wikipedia.org/wiki/Cluster_analysis#Internal_evaluation); higher is better
 */
@Override
double evaluate(JavaRDD<Vector> evalData) {
  // Intra-cluster distance is mean distance to centroid
  double maxIntraClusterDistance =
      fetchClusterMetrics(evalData).values().mapToDouble(ClusterMetric::getMeanDist).max();

  // Inter-cluster distance is distance between centroids
  double minInterClusterDistance = Double.POSITIVE_INFINITY;
  List<ClusterInfo> clusters = new ArrayList<>(getClustersByID().values());
  DistanceFn<double[]> distanceFn = getDistanceFn();
  for (int i = 0; i < clusters.size(); i++) {
    double[] centerI = clusters.get(i).getCenter();
    // Distances are symmetric, hence d(i,j) == d(j,i)
    for (int j = i + 1; j < clusters.size(); j++) {
      double[] centerJ = clusters.get(j).getCenter();
      minInterClusterDistance = Math.min(minInterClusterDistance, distanceFn.applyAsDouble(centerI, centerJ));
    }
  }

  return minInterClusterDistance / maxIntraClusterDistance;
}
 
Example 18
Project: ViraPipe   File: HDFSWriter.java
private static JavaRDD<SAMRecord> alignmentsToSAM(JavaRDD<String> alignmentRDD, SAMFileHeader header) {
    return alignmentRDD.mapPartitions(alns -> {

        List<SAMRecord> records = new ArrayList<SAMRecord>();

        final SAMLineParser samLP = new SAMLineParser(new DefaultSAMRecordFactory(), ValidationStringency.SILENT, header, null, null);
        while (alns.hasNext()) {

            String aln = alns.next().replace("\r\n", "").replace("\n", "").replace(System.lineSeparator(), "");
            SAMRecord record = null;
            try{
                record = samLP.parseLine(aln);
                records.add(record);
            }catch(SAMFormatException e){
                System.out.println(e.getMessage());
            }
        }
        return records.iterator();
    });
}
 
Example 19
Project: ViraPipe   File: HDFSWriter.java
public static JavaPairRDD<SAMRecord, SAMRecordWritable> readsToWritable(JavaRDD<SAMRecord> records, Broadcast<SAMFileHeader> header) {
    return records.mapToPair(read -> {

        //The sequence dictionary must be set here because the alignment does not provide it in a header file
        //(it is set during the alignment-to-SAM map phase)
        if(header.getValue().getSequenceDictionary()==null) header.getValue().setSequenceDictionary(new SAMSequenceDictionary());
        if(header.getValue().getSequenceDictionary().getSequence(read.getReferenceName())==null)
            header.getValue().getSequenceDictionary().addSequence(new SAMSequenceRecord(read.getReferenceName()));

        //read.setHeader(read.getHeader());
        read.setHeaderStrict(header.getValue());
        final SAMRecordWritable samRecordWritable = new SAMRecordWritable();
        samRecordWritable.set(read);
        return new Tuple2<>(read, samRecordWritable);
    });
}
 
Example 20
Project: MinoanER   File: LabelMatchingHeuristic.java
/**
 * Return an RDD with keys: label objects, and values: entity ids from a single collection, having this label
 * @param inputTriples
 * @param labelAtts
 * @param entityIds
 * @param SEPARATOR
 * @param positiveIds
 * @return 
 */
private JavaPairRDD<String,Integer> getLabelBlocks(JavaRDD<String> inputTriples, Set<String> labelAtts, JavaRDD<String> entityIds, String SEPARATOR, boolean positiveIds) {
    Object2IntOpenHashMap<String> urls1 = Utils.readEntityIdsMapping(entityIds, positiveIds);
    return inputTriples.mapToPair(line -> {
        String[] spo = line.toLowerCase().replaceAll(" \\.$", "").split(SEPARATOR); //drop the trailing " ." of valid .nt lines
        if (spo.length < 3) {
            return null;
        }
        if (labelAtts.contains(spo[1])) {
            String labelValue = line.substring(line.indexOf(spo[1])+spo[1].length()+SEPARATOR.length())
                    .toLowerCase().replaceAll("[^a-z0-9 ]", "").trim();
            int subjectId = urls1.getInt(Utils.encodeURIinUTF8(spo[0])); //replace subject url with entity id
            if (!positiveIds) {
                subjectId = -subjectId;
            }
            return new Tuple2<>(labelValue, subjectId);
        } else {
            return null;
        }
    })
    .filter(x -> x != null)
    .distinct();
}
 
Example 21
Project: net.jgp.labs.spark.datasources   File: ExifDirectoryRelation.java
@Override
public RDD<Row> buildScan() {
    log.debug("-> buildScan()");
    schema();

    // I have isolated the work to a method to keep the plumbing code as simple as
    // possible.
    List<PhotoMetadata> table = collectData();

    @SuppressWarnings("resource")
    JavaSparkContext sparkContext = new JavaSparkContext(sqlContext.sparkContext());
    JavaRDD<Row> rowRDD = sparkContext.parallelize(table)
            .map(photo -> SparkBeanUtils.getRowFromBean(schema, photo));

    return rowRDD.rdd();
}
 
Example 22
Project: oryx2   File: MLUpdate.java
private Pair<JavaRDD<M>,JavaRDD<M>> splitTrainTest(JavaRDD<M> newData, JavaRDD<M> pastData) {
  Objects.requireNonNull(newData);
  if (testFraction <= 0.0) {
    return new Pair<>(pastData == null ? newData : newData.union(pastData), null);
  }
  if (testFraction >= 1.0) {
    return new Pair<>(pastData, newData);
  }
  if (empty(newData)) {
    return new Pair<>(pastData, null);
  }
  Pair<JavaRDD<M>,JavaRDD<M>> newTrainTest = splitNewDataToTrainTest(newData);
  JavaRDD<M> newTrainData = newTrainTest.getFirst();
  return new Pair<>(pastData == null ? newTrainData : newTrainData.union(pastData),
                    newTrainTest.getSecond());
}
 
Example 23
Project: incubator-sdap-mudrod   File: SessionExtractor.java
/**
 * loadClickStremFromTxt: load click stream from a txt file
 *
 * @param clickthroughFile
 *          txt file
 * @param sc
 *          the spark context
 * @return clickstream list in JavaRDD format {@link ClickStream}
 */
public JavaRDD<ClickStream> loadClickStremFromTxt(String clickthroughFile, JavaSparkContext sc) {
  return sc.textFile(clickthroughFile).flatMap(new FlatMapFunction<String, ClickStream>() {
    /**
     *
     */
    private static final long serialVersionUID = 1L;

    @SuppressWarnings("unchecked")
    @Override
    public Iterator<ClickStream> call(String line) throws Exception {
      List<ClickStream> clickthroughs = (List<ClickStream>) ClickStream.parseFromTextLine(line);
      return clickthroughs.iterator(); // flatMap expects an Iterator, not the List itself
    }
  });
}
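
Assuming parseFromTextLine does return a List<ClickStream>, as the cast above implies, the whole anonymous class reduces to a lambda; a sketch:

public JavaRDD<ClickStream> loadClickStremFromTxt(String clickthroughFile, JavaSparkContext sc) {
  return sc.textFile(clickthroughFile)
      .flatMap(line -> ((List<ClickStream>) ClickStream.parseFromTextLine(line)).iterator());
}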
 
Example 24
Project: incubator-sdap-mudrod   File: SessionGenerator.java
public void combineShortSessionsInParallel(int timeThres) throws InterruptedException, IOException {

    JavaRDD<String> userRDD = getUserRDD(this.cleanupType);

    userRDD.foreachPartition(new VoidFunction<Iterator<String>>() {
      /**
       *
       */
      private static final long serialVersionUID = 1L;

      @Override
      public void call(Iterator<String> arg0) throws Exception {
        ESDriver tmpES = new ESDriver(props);
        tmpES.createBulkProcessor();
        while (arg0.hasNext()) {
          String s = arg0.next();
          combineShortSessions(tmpES, s, timeThres);
        }
        tmpES.destroyBulkProcessor();
        tmpES.close();
      }
    });
  }
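
The point of foreachPartition here is to open one heavyweight ESDriver client per partition instead of one per element. The same pattern reads more compactly as a lambda; a sketch assuming the same props, timeThres and combineShortSessions:

userRDD.foreachPartition(users -> {
  ESDriver tmpES = new ESDriver(props);   // one client per partition
  tmpES.createBulkProcessor();
  while (users.hasNext()) {
    combineShortSessions(tmpES, users.next(), timeThres);
  }
  tmpES.destroyBulkProcessor();
  tmpES.close();
});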
 
Example 25
Project: oryx2   File: RDFUpdate.java
private Map<Integer,Collection<String>> getDistinctValues(JavaRDD<String[]> parsedRDD) {
  int[] categoricalIndices = IntStream.range(0, inputSchema.getNumFeatures()).
      filter(inputSchema::isCategorical).toArray();

  return parsedRDD.mapPartitions(data -> {
      Map<Integer,Collection<String>> categoryValues = new HashMap<>();
      for (int i : categoricalIndices) {
        categoryValues.put(i, new HashSet<>());
      }
      data.forEachRemaining(datum ->
        categoryValues.forEach((category, values) -> values.add(datum[category]))
      );
      return Collections.singleton(categoryValues).iterator();
    }).reduce((v1, v2) -> {
      // Assumes both have the same key set
      v1.forEach((category, values) -> values.addAll(v2.get(category)));
      return v1;
    });
}
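
The mapPartitions/reduce combination above gathers the distinct values of every categorical column in a single pass over the data. A per-column alternative is simpler but rescans the RDD once per column, which is why the one-pass version is preferred; a sketch for one column:

int col = 0; // index of a single categorical column (illustrative)
List<String> valuesOfColumn = parsedRDD
    .map(datum -> datum[col])
    .distinct()
    .collect();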
 
Example 26
Project: incubator-sdap-mudrod   File: LogAbstract.java
protected void checkUserPartition(JavaRDD<String> userRDD) {
  List<Partition> partitions = userRDD.partitions();
  System.out.println(partitions.size());
  int[] partitionIds = new int[partitions.size()];
  for (int i = 0; i < partitions.size(); i++) {
    partitionIds[i] = partitions.get(i).index();
  }

  List<String>[] userIPs = userRDD.collectPartitions(partitionIds);
  for (int i = 0; i < userIPs.length; i++) {
    System.out.println(i + " partition");
    System.out.println(userIPs[i].toString());
  }
}
 
Example 27
Project: incubator-sdap-mudrod   File: SVDAnalyzer.java
/**
 * GetSVDMatrix: Create SVD matrix csv file from original csv file.
 *
 * @param csvFileName       each row is a term, and each column is a document.
 * @param svdDimention      Dimension of SVD matrix
 * @param svdMatrixFileName CSV file name of SVD matrix
 */
public void getSVDMatrix(String csvFileName, int svdDimention, String svdMatrixFileName) {

  JavaPairRDD<String, Vector> importRDD = MatrixUtil.loadVectorFromCSV(spark, csvFileName, 1);
  JavaRDD<Vector> vectorRDD = importRDD.values();
  RowMatrix wordDocMatrix = new RowMatrix(vectorRDD.rdd());
  RowMatrix tfidfMatrix = MatrixUtil.createTFIDFMatrix(wordDocMatrix);
  RowMatrix svdMatrix = MatrixUtil.buildSVDMatrix(tfidfMatrix, svdDimention);

  List<String> rowKeys = importRDD.keys().collect();
  List<String> colKeys = new ArrayList<>();
  for (int i = 0; i < svdDimention; i++) {
    colKeys.add("dimension" + i);
  }
  MatrixUtil.exportToCSV(svdMatrix, rowKeys, colKeys, svdMatrixFileName);
}
 
Example 28
Project: Deep-Learning-with-Hadoop   File: TestSparkMultiLayerParameterAveraging.java
@Test
public void testSmallAmountOfData(){

    MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
            .updater(Updater.RMSPROP)
            .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT).iterations(1)
            .list()
            .layer(0, new org.deeplearning4j.nn.conf.layers.DenseLayer.Builder()
                    .nIn(nIn).nOut(3)
                    .activation("tanh").build())
            .layer(1, new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(LossFunctions.LossFunction.MSE)
                    .nIn(3).nOut(nOut)
                    .activation("softmax")
                    .build())
            .build();

    SparkDl4jMultiLayer sparkNet = new SparkDl4jMultiLayer(sc,conf,new ParameterAveragingTrainingMaster(true,numExecutors(),1,10,1,0));

    Nd4j.getRandom().setSeed(12345);
    DataSet d1 = new DataSet(Nd4j.rand(1,nIn),Nd4j.rand(1,nOut));
    DataSet d2 = new DataSet(Nd4j.rand(1,nIn),Nd4j.rand(1,nOut));

    JavaRDD<DataSet> rddData = sc.parallelize(Arrays.asList(d1,d2));

    sparkNet.fit(rddData);

}
 
Example 29
Project: BLASpark   File: OtherOperations.java
private static CoordinateMatrix GetLU_COORD(CoordinateMatrix A) {

        JavaRDD<MatrixEntry> rows = A.entries().toJavaRDD().cache();

        JavaRDD<MatrixEntry> LUEntries = rows.mapPartitions(new FlatMapFunction<Iterator<MatrixEntry>, MatrixEntry>() {
            @Override
            public Iterator<MatrixEntry> call(Iterator<MatrixEntry> matrixEntryIterator) throws Exception {
                List<MatrixEntry> newLowerEntries = new ArrayList<MatrixEntry>();


                while(matrixEntryIterator.hasNext()) {
                    MatrixEntry currentEntry = matrixEntryIterator.next();

                    if(currentEntry.i() != currentEntry.j()) {
                        newLowerEntries.add(currentEntry);
                    }
                    else {
                        newLowerEntries.add(new MatrixEntry(currentEntry.i(), currentEntry.j(), 0.0));
                    }

                }

                return newLowerEntries.iterator();
            }
        });

        CoordinateMatrix newMatrix = new CoordinateMatrix(LUEntries.rdd());

        return newMatrix;
    }
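
Since each entry is transformed independently, the mapPartitions plumbing above can be replaced by a plain map with identical behavior (off-diagonal entries kept, diagonal entries zeroed); a sketch using the same matrix A:

JavaRDD<MatrixEntry> luEntries = A.entries().toJavaRDD().map(e ->
        e.i() == e.j() ? new MatrixEntry(e.i(), e.j(), 0.0) : e);
CoordinateMatrix newMatrix = new CoordinateMatrix(luEntries.rdd());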
 
Example 30
Project: movie-recommender   File: ModelFactory.java
public static TrainedModel create(JavaRDD<Rating> trainingRdd, JavaRDD<Rating> testRdd, int rank, int iterationsNr) {
    logger.info(String.format("Train with parameters -> iterations: %d, rank :%d", iterationsNr, rank));
    JavaRDD<Tuple2<Object, Object>> testForPredict = testRdd.map(rating ->
        new Tuple2<>(rating.user(), rating.product())
    );
    TimeKeeper timeKeeper = new TimeKeeper();
    timeKeeper.start();

    MatrixFactorizationModel model = ALS.train(JavaRDD.toRDD(trainingRdd), rank, iterationsNr, 0.1);
    timeKeeper.end().print(logger, "als model trained in ").reset();

    Double error = getError(testRdd, rank, iterationsNr, testForPredict, timeKeeper, model);

    return new TrainedModel(error, model);
}
 
Example 31
Project: betleopard   File: LiveBetMain.java
/**
 * Loads in historical data and stores in Hazelcast IMDG. This is mostly to 
 * provide a source of horses for the bet simulation.
 * 
 * @throws IOException 
 */
public void loadHistoricalRaces() throws IOException {
    filePath = Utils.unpackDataToTmp("historical_races.json");

    final JavaRDD<String> eventsText = sc.textFile(filePath.toString());
    final JavaRDD<Event> events
            = eventsText.map(s -> JSONSerializable.parse(s, Event::parseBag));

    final JavaPairRDD<Horse, Integer> winners
            = events.mapToPair(e -> new Tuple2<>(e.getRaces().get(0).getWinner().orElse(Horse.PALE), 1))
            .reduceByKey((a, b) -> a + b);

    final HazelcastRDDFunctions accessToHC = javaPairRddFunctions(winners);
    accessToHC.saveToHazelcastMap("winners");
}
 
Example 32
Project: ViraPipe   File: Decompress.java
private static void splitFastq(FileStatus fst, String fqPath, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
  Path fqpath = new Path(fqPath);
  String fqname = fqpath.getName();
  String[] ns = fqname.split("\\.");
  List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);

  JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);

  splitRDD.foreach( split ->  {

    FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split);
    writeFastqFile(fqreader, new Configuration(), splitDir + "/split_" + split.getStart() + "." + ns[1]);

   });
}
 
Example 33
Project: movie-recommender   File: RecommendationEngine.java
private TrainedModel createAlsModel(JavaRDD<Rating> ratings, TrainConfig trainConfig) {
    double[] weights = {8, 2};
    JavaRDD<Rating>[] randomRatings = ratings.randomSplit(weights, 0L);

    return ModelFactory.create(randomRatings[0],
            randomRatings[1],
            trainConfig.getRankNr(),
            trainConfig.getIterationsNr()
    );
}
 
Example 34
Project: rdf2x   File: QuadUtils.java
/**
 * Get quads that specify type of a resource
 *
 * @param quads             RDD of quads to filter
 * @param typePredicateURIs additional type predicates to use together with rdf:type
 * @return RDD of quads that specify type of a resource
 */
public static JavaRDD<Quad> filterTypeQuads(JavaRDD<Quad> quads, List<String> typePredicateURIs) {
    String typePredicateURI = RDF.TYPE.toString();

    return quads.filter(quad -> {
        if (!quad.getPredicate().isURI() || !quad.getObject().isURI()) {
            return false;
        }
        String uri = quad.getPredicate().getURI();
        return uri.equals(typePredicateURI) || typePredicateURIs.contains(uri);
    });
}
 
Example 35
Project: ViraPipe   File: SQLQueryBlast.java
private static JavaRDD<String> dfToTabDelimited(Dataset<Row> df) {
  return df.toJavaRDD().map(row ->  {
    //qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore

    String output = row.getAs("qseqid")+"\t"+row.getAs("sseqid")+"\t"+row.getAs("pident")+"\t"
            +row.getAs("length") +"\t"+row.getAs("mismatch")+"\t"+row.getAs("gapopen")
            +"\t"+row.getAs("qstart")+"\t"+row.getAs("qend")+"\t"+row.getAs("sstart")
            +"\t"+row.getAs("send")+"\t"+row.getAs("evalue")+"\t"+row.getAs("bitscore");

    return output;
  });
}
 
Example 36
Project: ViraPipe   File: SQLQueryBlast.java
private static JavaRDD<String> dfToCSV(Dataset<Row> df) {
  return df.toJavaRDD().map(row ->  {
    //qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore

    String output = row.getAs("qseqid")+","+row.getAs("sseqid")+","+row.getAs("pident")+","
            +row.getAs("length") +","+row.getAs("mismatch")+","+row.getAs("gapopen")
            +","+row.getAs("qstart")+","+row.getAs("qend")+","+row.getAs("sstart")
            +","+row.getAs("send")+","+row.getAs("evalue")+","+row.getAs("bitscore");

    return output;
  });
}
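
dfToTabDelimited and dfToCSV differ only in the delimiter, so both can be folded into one helper; a hedged sketch (dfToDelimited is a hypothetical name, and the column list mirrors the BLAST fields above):

private static JavaRDD<String> dfToDelimited(Dataset<Row> df, String sep) {
  String[] cols = {"qseqid", "sseqid", "pident", "length", "mismatch", "gapopen",
                   "qstart", "qend", "sstart", "send", "evalue", "bitscore"};
  return df.toJavaRDD().map(row -> {
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < cols.length; i++) {
      if (i > 0) sb.append(sep);
      sb.append((Object) row.getAs(cols[i])); // look up each column value by name
    }
    return sb.toString();
  });
}

The two originals then become dfToDelimited(df, "\t") and dfToDelimited(df, ",").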
 
Example 37
Project: ViraPipe   File: Interleave.java
private static void splitFastq(FileStatus fst, String fqPath, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
  Path fqpath = new Path(fqPath);
  String fqname = fqpath.getName();
  String[] ns = fqname.split("\\.");
  List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);

  JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);

  splitRDD.foreach( split ->  {

    FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split);
    writeFastqFile(fqreader, new Configuration(), splitDir + "/split_" + split.getStart() + "." + ns[1]);

   });
}
 
Example 38
Project: MinoanER   File: LabelMatchingHeuristic.java
/**
 * Returns the pairs of entities that satisfy both of the following conditions:
 * 1. the entities of this pair have the same label
 * 2. only this pair of entities has this label
 * @param inputTriples1
 * @param inputTriples2
 * @param entityIds1
 * @param entityIds2
 * @param SEPARATOR
 * @return 
 */
public JavaPairRDD<Integer,Integer> getMatchesFromLabels(JavaRDD<String> inputTriples1, JavaRDD<String> inputTriples2, JavaRDD<String> entityIds1, JavaRDD<String> entityIds2, String SEPARATOR, Set<String> labelAtts1, Set<String> labelAtts2) {                
    JavaPairRDD<String,Integer> labelBlocks1 = getLabelBlocks(inputTriples1, labelAtts1, entityIds1, SEPARATOR, true);
    JavaPairRDD<String,Integer> labelBlocks2 = getLabelBlocks(inputTriples2, labelAtts2, entityIds2, SEPARATOR, false);
    
    return labelBlocks2.join(labelBlocks1) //get blocks from labels existing in both collections (inner join) (first D2, to keep negative ids first)
            .reduceByKey((x,y) -> x != null && x.equals(y) ? x : null) //if the block has more than two (one pair of) entities, skip this block
            .filter(x-> x._2() != null)
            .mapToPair(pair -> new Tuple2<>(pair._2()._1(), pair._2()._2()))
            .distinct()
            .reduceByKey((x,y) -> null) //if the entity is matched to more than one entities, skip this entity (this could happen when this entity has > 1 labels, one of them same with one entity, and the other same with another entity)
            .filter(x-> x._2() != null);
            
}
 
Example 39
Project: bunsen   File: Bundles.java
/**
 * Returns an RDD of bundles loaded from the given path.
 *
 * @param spark the spark session
 * @param path a path to a directory of FHIR Bundles
 * @param minPartitions a suggested value for the minimal number of partitions
 * @return an RDD of FHIR Bundles
 */
public static JavaRDD<Bundle> loadFromDirectory(SparkSession spark,
    String path,
    int minPartitions) {

  return spark.sparkContext()
      .wholeTextFiles(path, minPartitions)
      .toJavaRDD()
      .map(new ToBundle());
}
 
Example 40
Project: ViraPipe   File: HDFSWriter.java
public static JavaPairRDD<SAMRecord, SAMRecordWritable> readsToWritableNoHeader(JavaRDD<SAMRecord> records) {
    return records.mapToPair(read -> {
        final SAMRecordWritable samRecordWritable = new SAMRecordWritable();
        samRecordWritable.set(read);
        return new Tuple2<>(read, samRecordWritable);
    });
}