Java Code Examples for org.apache.spark.api.java.JavaSparkContext

The following are examples showing how to use org.apache.spark.api.java.JavaSparkContext, extracted from open source projects.
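Most of the examples below follow the same lifecycle: build a SparkConf, create a JavaSparkContext from it, run some RDD operations, and close the context. As a minimal sketch of that pattern (the app name and master URL here are placeholders; in a real deployment the master is usually supplied by spark-submit rather than hard-coded):

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class MinimalContext {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("MinimalContext")   // placeholder app name
                .setMaster("local[*]");         // placeholder master
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            System.out.println("Default parallelism: " + sc.defaultParallelism());
        } finally {
            sc.close(); // JavaSparkContext implements java.io.Closeable
        }
    }
}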
Example 1
Project: big-data-benchmark   File: SparkWordCount.java
public static void main(String[] args) {
    if (args.length != 2) {
        System.err.println("Usage:");
        System.err.println("  SparkWordCount <sourceFile> <targetFile>");
        System.exit(1);
    }

    SparkConf conf = new SparkConf()
            .setAppName("Word Count");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<String> textFile = sc.textFile(args[0]);
    JavaRDD<String> words = textFile.flatMap(LineIterator::new);
    JavaPairRDD<String, Long> pairs =
            words.mapToPair(s -> new Tuple2<>(s, 1L));
    JavaPairRDD<String, Long> counts =
            pairs.reduceByKey((a, b) -> a + b);

    System.out.println("Starting task..");
    long t = System.currentTimeMillis();
    counts.saveAsTextFile(args[1] + "_" + t);
    System.out.println("Time=" + (System.currentTimeMillis() - t));
}
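Example 1 passes LineIterator::new to flatMap, relying on a project-specific LineIterator class that is not shown. For flatMap to accept the method reference, the class must be constructible from a line and implement Iterator<String>; a minimal sketch of what it might look like (an assumption, the real class may tokenize differently):

import java.util.Arrays;
import java.util.Iterator;

// Hypothetical stand-in for the project's LineIterator: maps one input
// line to an iterator over its whitespace-separated words.
public class LineIterator implements Iterator<String> {
    private final Iterator<String> words;

    public LineIterator(String line) {
        this.words = Arrays.asList(line.split("\\s+")).iterator();
    }

    @Override
    public boolean hasNext() { return words.hasNext(); }

    @Override
    public String next() { return words.next(); }
}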
 
Example 2
Project: Apache-Spark-2x-for-Java-Developers   File: WordCount.java
public static void wordCountJava8( String filename )
{
    // Define a configuration to use to interact with Spark
    SparkConf conf = new SparkConf().setMaster("local").setAppName("Word Count App");

    // Create a Java version of the Spark Context from the configuration
    JavaSparkContext sc = new JavaSparkContext(conf);

    // Load the input data, which is a text file read from the command line
    JavaRDD<String> input = sc.textFile( filename );

    // Java 8 with lambdas: split the input string into words
    JavaRDD<String> words = input.flatMap(s -> Arrays.asList(s.split(" ")).iterator());

    // Java 8 with lambdas: transform the collection of words into pairs (word, 1) and then count them
    JavaPairRDD<String, Integer> counts = words.mapToPair(t -> new Tuple2<>(t, 1)).reduceByKey((x, y) -> x + y);

    // Save the word count back out to a text file, causing evaluation.
    counts.saveAsTextFile( "output" );
}
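Note that wordCountJava8 never closes the context. Since JavaSparkContext implements java.io.Closeable (Example 25 below relies on this), the body can be wrapped in try-with-resources; a sketch of the same logic with that change:

    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        JavaRDD<String> input = sc.textFile(filename);
        JavaRDD<String> words = input.flatMap(s -> Arrays.asList(s.split(" ")).iterator());
        JavaPairRDD<String, Integer> counts =
                words.mapToPair(t -> new Tuple2<>(t, 1)).reduceByKey((x, y) -> x + y);
        counts.saveAsTextFile("output");
    }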
 
Example 3
Project: Apache-Spark-2x-for-Java-Developers   File: SparkWordCount.java
public static void main(String[] args) throws Exception {
	System.out.println(System.getProperty("hadoop.home.dir"));
	String inputPath = args[0];
	String outputPath = args[1];
	FileUtils.deleteQuietly(new File(outputPath));

	JavaSparkContext sc = new JavaSparkContext("local", "sparkwordcount");

	JavaRDD<String> rdd = sc.textFile(inputPath);

	JavaPairRDD<String, Integer> counts = rdd
			.flatMap(x -> Arrays.asList(x.split(" ")).iterator())
			.mapToPair(x -> new Tuple2<>(x, 1))
			.reduceByKey((x, y) -> x + y);

	counts.saveAsTextFile(outputPath);
	sc.close();
}
 
Example 4
Project: Sempala   File: Spark.java
/**
 * Initializes a Spark connection. Use it afterwards for execution of Spark
 * SQL queries.
 * 
 * @param appName
 *            the name of the app that will be used with this Spark
 *            connection
 * @param database
 *            name of the database that will be used with this Spark
 *            connection
 */
public Spark(String appName, String database) {

	// TODO check what will happen if the same app name is already in use
	this.sparkConfiguration = new SparkConf().setAppName(appName);
	this.javaContext = new JavaSparkContext(sparkConfiguration);
	this.hiveContext = new HiveContext(javaContext);
	// TODO check what kind of exception can be thrown here if there is a problem with the spark connection

	this.hiveContext.sql(String.format("CREATE DATABASE %s", database));
	// TODO check what kind of exception is thrown if the database already exists

	// use the created database
	this.hiveContext.sql(String.format("USE %s", database));
}
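The TODO about a pre-existing database can be resolved in HiveQL itself: CREATE DATABASE IF NOT EXISTS is a no-op when the database is already there, so no exception handling is needed for that case. A sketch of the two statements with the guard added:

	this.hiveContext.sql(String.format("CREATE DATABASE IF NOT EXISTS %s", database));
	this.hiveContext.sql(String.format("USE %s", database));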
 
Example 5
Project: oryx2   File: KMeansUpdate.java
/**
 * @param sparkContext    active Spark Context
 * @param trainData       training data on which to build a model
 * @param hyperParameters ordered list of hyperparameter values to use in building the model
 * @param candidatePath   directory where additional model files can be written
 * @return a {@link PMML} representation of a model trained on the given data
 */
@Override
public PMML buildModel(JavaSparkContext sparkContext,
                       JavaRDD<String> trainData,
                       List<?> hyperParameters,
                       Path candidatePath) {
  int numClusters = (Integer) hyperParameters.get(0);
  Preconditions.checkArgument(numClusters > 1);
  log.info("Building KMeans Model with {} clusters", numClusters);

  JavaRDD<Vector> trainingData = parsedToVectorRDD(trainData.map(MLFunctions.PARSE_FN));
  KMeansModel kMeansModel = KMeans.train(trainingData.rdd(), numClusters, maxIterations,
                                         numberOfRuns, initializationStrategy);

  return kMeansModelToPMML(kMeansModel, fetchClusterCountsFromModel(trainingData, kMeansModel));
}
 
Example 6
Project: BLASpark   File: OtherOperations.java
public static DistributedMatrix GetLU(DistributedMatrix A, JavaSparkContext jsc) {

    DistributedMatrix returnedMatrix;

    if (A.getClass() == IndexedRowMatrix.class) {
        returnedMatrix = OtherOperations.GetLU_IRW((IndexedRowMatrix) A);
    } else if (A.getClass() == CoordinateMatrix.class) {
        returnedMatrix = OtherOperations.GetLU_COORD((CoordinateMatrix) A);
    } else if (A.getClass() == BlockMatrix.class) {
        // TODO: Implement this operation
        //returnedMatrices = OtherOperations.GetLU_BCK((BlockMatrix) A, diagonalInL, diagonalInU, jsc);
        returnedMatrix = null;
    } else {
        returnedMatrix = null;
    }

    return returnedMatrix;
}
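Returning null for the BlockMatrix and unknown cases defers the failure to whatever dereferences the result. If fast failure is preferred (an assumption about the intended contract, not the project's stated design), the last two branches could throw instead:

    } else if (A.getClass() == BlockMatrix.class) {
        // TODO: Implement this operation; fail fast in the meantime
        throw new UnsupportedOperationException("GetLU is not implemented for BlockMatrix");
    } else {
        throw new IllegalArgumentException("Unsupported matrix type: " + A.getClass().getName());
    }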
 
Example 7
Project: BLASpark   File: OtherOperations.java
public static DistributedMatrix GetD(DistributedMatrix A, boolean inverseValues, JavaSparkContext jsc) {

    DistributedMatrix returnedMatrix;

    if (A.getClass() == IndexedRowMatrix.class) {
        returnedMatrix = OtherOperations.GetD_IRW((IndexedRowMatrix) A, inverseValues, jsc);
    } else if (A.getClass() == CoordinateMatrix.class) {
        returnedMatrix = OtherOperations.GetD_COORD((CoordinateMatrix) A, inverseValues, jsc);
    } else if (A.getClass() == BlockMatrix.class) {
        // TODO: Implement this operation
        //returnedMatrix = OtherOperations.GetD_BCK((BlockMatrix) A, inverseValues, jsc);
        returnedMatrix = null;
    } else {
        returnedMatrix = null;
    }

    return returnedMatrix;
}
 
Example 8
Project: ViraPipe   File: InterleaveMulti.java
private static void splitFastq(FileStatus fst, String fqPath, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
  Path fqpath = new Path(fqPath);
  String fqname = fqpath.getName();
  String[] ns = fqname.split("\\.");
  //TODO: Handle also compressed files
  List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);

  JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);

  splitRDD.foreach(split -> {
    FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split);
    writeFastqFile(fqreader, new Configuration(), splitDir + "/split_" + split.getStart() + "." + ns[1]);
  });
}
 
Example 9
Project: ViraPipe   File: Decompress.java
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {

    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
    JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);

    zips.foreach(splits -> {
      Path path = splits._1.getPath();
      FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), splits._1);
      FastqRecordReader fqreader2 = new FastqRecordReader(new Configuration(), splits._2);

      writeInterleavedSplits(fqreader, fqreader2, new Configuration(), splitDir + "/" + path.getParent().getName() + "_" + splits._1.getStart() + ".fq");
    });
}
 
Example 10
Project: incubator-sdap-mudrod   File: SessionExtractor.java
/**
 * loadClickStremFromTxt: load click stream from a txt file
 *
 * @param clickthroughFile
 *          txt file
 * @param sc
 *          the spark context
 * @return clickstream list in JavaRDD format {@link ClickStream}
 */
public JavaRDD<ClickStream> loadClickStremFromTxt(String clickthroughFile, JavaSparkContext sc) {
  return sc.textFile(clickthroughFile).flatMap(new FlatMapFunction<String, ClickStream>() {
    private static final long serialVersionUID = 1L;

    @SuppressWarnings("unchecked")
    @Override
    public Iterator<ClickStream> call(String line) throws Exception {
      List<ClickStream> clickthroughs = (List<ClickStream>) ClickStream.parseFromTextLine(line);
      return clickthroughs.iterator();
    }
  });
}
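With Java 8, the anonymous FlatMapFunction can be collapsed into a lambda. A sketch, assuming ClickStream.parseFromTextLine returns a List<ClickStream> as the cast above suggests:

public JavaRDD<ClickStream> loadClickStremFromTxt(String clickthroughFile, JavaSparkContext sc) {
  return sc.textFile(clickthroughFile)
      .flatMap(line -> ((List<ClickStream>) ClickStream.parseFromTextLine(line)).iterator());
}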
 
Example 11
Project: incubator-sdap-mudrod   File: SparkDriver.java
public SparkDriver(Properties props) {
  SparkConf conf = new SparkConf().setAppName(props.getProperty(MudrodConstants.SPARK_APP_NAME, "MudrodSparkApp")).setIfMissing("spark.master", props.getProperty(MudrodConstants.SPARK_MASTER))
      .set("spark.hadoop.validateOutputSpecs", "false").set("spark.files.overwrite", "true");

  String esHost = props.getProperty(MudrodConstants.ES_UNICAST_HOSTS);
  String esPort = props.getProperty(MudrodConstants.ES_HTTP_PORT);

  if (!"".equals(esHost)) {
    conf.set("es.nodes", esHost);
  }

  if (!"".equals(esPort)) {
    conf.set("es.port", esPort);
  }

  conf.set("spark.serializer", KryoSerializer.class.getName());
  conf.set("es.batch.size.entries", "1500");

  sc = new JavaSparkContext(conf);
  sqlContext = new SQLContext(sc);
}
 
Example 12
Project: betleopard   File: LiveBetMain.java
private void init() throws IOException {
    final ClientConfig config = new ClientConfig();
    client = HazelcastClient.newHazelcastClient(config);

    final SparkConf conf = new SparkConf()
            .set("hazelcast.server.addresses", "127.0.0.1:5701")
            .set("hazelcast.server.groupName", "dev")
            .set("hazelcast.server.groupPass", "dev-pass")
            .set("hazelcast.spark.valueBatchingEnabled", "true")
            .set("hazelcast.spark.readBatchSize", "5000")
            .set("hazelcast.spark.writeBatchSize", "5000");

    sc = new JavaSparkContext("local", "appname", conf);

    loadHistoricalRaces();
    createRandomUsers();
    createFutureEvent();
}
 
Example 13
Project: MinoanER   File: CNPNeighborsUnnormalized.java
/**
 * 
 * @param topKvalueCandidates the topK results per entity, acquired from value similarity
 * @param rawTriples1 the rdf triples of the first entity collection
 * @param rawTriples2 the rdf triples of the second entity collection
 * @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples1 and rawTriples2 files
 * @param entityIds1 the mapping of entity urls to entity ids, as it was used in blocking
 * @param entityIds2 the mapping of entity urls to entity ids of the second entity collection, as it was used in blocking
 * @param MIN_SUPPORT_THRESHOLD the minimum support threshold, below which relations are discarded from top relations
 * @param K the K for topK candidate matches
 * @param N the N for topN rdf relations (and neighbors)
 * @param jsc the java spark context used to load files and broadcast variables
 * @return topK neighbor candidates per entity
 */
public JavaPairRDD<Integer, IntArrayList> run(JavaPairRDD<Integer,Int2FloatLinkedOpenHashMap> topKvalueCandidates, 
        JavaRDD<String> rawTriples1, 
        JavaRDD<String> rawTriples2,             
        String SEPARATOR, 
        JavaRDD<String> entityIds1, 
        JavaRDD<String> entityIds2, 
        float MIN_SUPPORT_THRESHOLD,
        int K,
        int N, 
        JavaSparkContext jsc) {
    
    Map<Integer,IntArrayList> inNeighbors = new HashMap<>(new RelationsRank().run(rawTriples1, SEPARATOR, entityIds1, MIN_SUPPORT_THRESHOLD, N, true, jsc));
    inNeighbors.putAll(new RelationsRank().run(rawTriples2, SEPARATOR, entityIds2, MIN_SUPPORT_THRESHOLD, N, false, jsc));
    
    Broadcast<Map<Integer,IntArrayList>> inNeighbors_BV = jsc.broadcast(inNeighbors);

    //JavaPairRDD<Integer, IntArrayList> topKneighborCandidates =  getTopKNeighborSims(topKvalueCandidates, inNeighbors_BV, K);        
    JavaPairRDD<Integer, IntArrayList> topKneighborCandidates =  getTopKNeighborSimsSUM(topKvalueCandidates, inNeighbors_BV, K);        
    return topKneighborCandidates;
}
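Broadcast variables pin their payload in executor memory. After the jobs that use inNeighbors_BV have run, the executor-side copies can be released; a sketch of the cleanup (unpersist drops the cached copies, and the value is re-broadcast from the driver if it is used again):

    inNeighbors_BV.unpersist();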
 
Example 14
Project: MinoanER   File: CNPARCS.java
/**
 * 
 * @param topKvalueCandidates the topK results per entity, acquired from value similarity
 * @param rawTriples1 the rdf triples of the first entity collection
 * @param rawTriples2 the rdf triples of the second entity collection
 * @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples1 and rawTriples2 files
 * @param entityIds1 the mapping of entity urls to entity ids, as it was used in blocking
 * @param entityIds2 the mapping of entity urls to entity ids of the second entity collection, as it was used in blocking
 * @param MIN_SUPPORT_THRESHOLD the minimum support threshold, below which relations are discarded from top relations
 * @param K the K for topK candidate matches
 * @param N the N for topN rdf relations (and neighbors)
 * @param jsc the java spark context used to load files and broadcast variables
 * @return topK neighbor candidates per entity
 */
public JavaPairRDD<Integer, Int2FloatLinkedOpenHashMap> run2(JavaPairRDD<Integer,Int2FloatLinkedOpenHashMap> topKvalueCandidates, 
        JavaRDD<String> rawTriples1, 
        JavaRDD<String> rawTriples2,             
        String SEPARATOR, 
        JavaRDD<String> entityIds1, 
        JavaRDD<String> entityIds2, 
        float MIN_SUPPORT_THRESHOLD,
        int K,
        int N, 
        JavaSparkContext jsc) {
    
    Map<Integer,IntArrayList> inNeighbors = new HashMap<>(new RelationsRank().run(rawTriples1, SEPARATOR, entityIds1, MIN_SUPPORT_THRESHOLD, N, true, jsc));
    inNeighbors.putAll(new RelationsRank().run(rawTriples2, SEPARATOR, entityIds2, MIN_SUPPORT_THRESHOLD, N, false, jsc));
    
    Broadcast<Map<Integer,IntArrayList>> inNeighbors_BV = jsc.broadcast(inNeighbors);             
    JavaPairRDD<Integer, Int2FloatLinkedOpenHashMap> topKneighborCandidates =  getTopKNeighborSimsSUMWithScores(topKvalueCandidates, inNeighbors_BV, K);        
    return topKneighborCandidates;
}
 
Example 15
Project: MinoanER   File: CNPNeighbors.java
/**
 * 
 * @param topKvalueCandidates the topK results per entity, acquired from value similarity
 * @param rawTriples1 the rdf triples of the first entity collection
 * @param rawTriples2 the rdf triples of the second entity collection
 * @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples1 and rawTriples2 files
 * @param entityIds1 the mapping of entity urls to entity ids, as it was used in blocking
 * @param entityIds2 the mapping of entity urls to entity ids of the second entity collection, as it was used in blocking
 * @param MIN_SUPPORT_THRESHOLD the minimum support threshold, below which relations are discarded from top relations
 * @param K the K for topK candidate matches
 * @param N the N for topN rdf relations (and neighbors)
 * @param jsc the java spark context used to load files and broadcast variables
 * @return topK neighbor candidates per entity
 */
public JavaPairRDD<Integer, IntArrayList> run(JavaPairRDD<Integer,Int2FloatLinkedOpenHashMap> topKvalueCandidates, 
        JavaRDD<String> rawTriples1, 
        JavaRDD<String> rawTriples2,             
        String SEPARATOR, 
        JavaRDD<String> entityIds1, 
        JavaRDD<String> entityIds2, 
        float MIN_SUPPORT_THRESHOLD,
        int K,
        int N, 
        JavaSparkContext jsc) {
    
    Map<Integer,IntArrayList> inNeighbors = new HashMap<>(new RelationsRank().run(rawTriples1, SEPARATOR, entityIds1, MIN_SUPPORT_THRESHOLD, N, true, jsc));
    inNeighbors.putAll(new RelationsRank().run(rawTriples2, SEPARATOR, entityIds2, MIN_SUPPORT_THRESHOLD, N, false, jsc));
    
    Broadcast<Map<Integer,IntArrayList>> inNeighbors_BV = jsc.broadcast(inNeighbors);
    
    //JavaPairRDD<Tuple2<Integer, Integer>, Float> neighborSims = getNeighborSims(topKvalueCandidates, inNeighbors_BV);        
    //JavaPairRDD<Integer, IntArrayList> topKneighborCandidates =  getTopKNeighborSimsOld(neighborSims, K);        
    JavaPairRDD<Integer, IntArrayList> topKneighborCandidates =  getTopKNeighborSims(topKvalueCandidates, inNeighbors_BV, K);        
    return topKneighborCandidates;
}
 
Example 16
Project: oryx2   File: ExampleBatchLayerUpdate.java
@Override
public void runUpdate(JavaSparkContext sparkContext,
                      long timestamp,
                      JavaPairRDD<String,String> newData,
                      JavaPairRDD<String,String> pastData,
                      String modelDirString,
                      TopicProducer<String,String> modelUpdateTopic) throws IOException {
  JavaPairRDD<String,String> allData = pastData == null ? newData : newData.union(pastData);
  String modelString;
  try {
    modelString = new ObjectMapper().writeValueAsString(countDistinctOtherWords(allData));
  } catch (JsonProcessingException jpe) {
    throw new IOException(jpe);
  }
  modelUpdateTopic.send("MODEL", modelString);
}
 
Example 17
Project: MinoanER   File: BlocksFromEntityIndexTest.java
@Before
public void setUp() {
    System.setProperty("hadoop.home.dir", "C:\\Users\\VASILIS\\Documents\\hadoop_home"); //only for local mode
    
    spark = SparkSession.builder()
        .appName("test") 
        .config("spark.sql.warehouse.dir", "/file:/tmp")                
        .config("spark.executor.instances", 1)
        .config("spark.executor.cores", 1)
        .config("spark.executor.memory", "1G")            
        .config("spark.driver.maxResultSize", "1g")
        .config("spark.master", "local")
        .getOrCreate();

    jsc = JavaSparkContext.fromSparkContext(spark.sparkContext());
}
 
Example 18
Project: MinoanER   File: BlockFilteringAdvancedTest.java
@Before
public void setUp() {        
    System.setProperty("hadoop.home.dir", "C:\\Users\\VASILIS\\Documents\\hadoop_home"); //only for local mode
    
    spark = SparkSession.builder()
        .appName("test") 
        .config("spark.sql.warehouse.dir", "/file:/tmp")                
        .config("spark.executor.instances", 1)
        .config("spark.executor.cores", 1)
        .config("spark.executor.memory", "1G")            
        .config("spark.driver.maxResultSize", "1g")
        .config("spark.master", "local")
        .getOrCreate();

    jsc = JavaSparkContext.fromSparkContext(spark.sparkContext());
}
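Both test fixtures create a session but never stop it, which can leak contexts across test classes. A matching tearDown sketch (assuming jsc and spark are the fields assigned in setUp, and that org.junit.After is on the classpath):

@After
public void tearDown() {
    if (jsc != null) {
        jsc.close();
    }
    if (spark != null) {
        spark.stop();
    }
}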
 
Example 19
Project: ParquetUtils   File: ParquetRepartTest.java
@BeforeClass
public static void createContext() throws IOException {

	Configuration hdfsConfig = HDFSUtils.getConfiguration();
	SparkConf config = new SparkConf();
	config.setMaster("local[*]");
	config.setAppName("my JUnit running Spark");
	sc = new JavaSparkContext(config);
	fileSystem = FileSystem.get(hdfsConfig);
	sqlContext = new SQLContext(sc);
	engine = new ParquetRepartEngine(fileSystem, sqlContext);
}
 
Example 20
Project: oryx2   File: ALSUpdate.java
private static MatrixFactorizationModel pmmlToMFModel(JavaSparkContext sparkContext,
                                                      PMML pmml,
                                                      Path modelParentPath,
                                                      Broadcast<Map<String,Integer>> bUserIDToIndex,
                                                      Broadcast<Map<String,Integer>> bItemIDToIndex) {
  String xPathString = AppPMMLUtils.getExtensionValue(pmml, "X");
  String yPathString = AppPMMLUtils.getExtensionValue(pmml, "Y");
  JavaPairRDD<String,float[]> userRDD = readFeaturesRDD(sparkContext, new Path(modelParentPath, xPathString));
  JavaPairRDD<String,float[]> productRDD = readFeaturesRDD(sparkContext, new Path(modelParentPath, yPathString));
  int rank = userRDD.first()._2().length;
  return new MatrixFactorizationModel(
      rank,
      readAndConvertFeatureRDD(userRDD, bUserIDToIndex),
      readAndConvertFeatureRDD(productRDD, bItemIDToIndex));
}
 
Example 21
Project: mutantpdb   File: App.java
public static void main( String[] args )
{
    Dataset<Row> mutations = DataProvider.getMutationsToStructures();
    List<String> pdbIds = mutations.select(col("pdbId"))
            .distinct().toJavaRDD().map(t -> t.getString(0)).collect();

    List<Row> broadcasted = mutations.select("pdbId", "chainId", "pdbAtomPos").collectAsList();
    SaprkUtils.stopSparkSession();

    JavaSparkContext sc = SaprkUtils.getSparkContext();
    Broadcast<List<Row>> bcmut = sc.broadcast(broadcasted);

    MmtfReader//.readSequenceFile("/pdb/2017/full", pdbIds, sc)
            .downloadMmtfFiles(Arrays.asList("5IRC"), sc)
            .flatMapToPair(new StructureToPolymerChains())
            .flatMapToPair(new AddResidueToKey(bcmut))
            .mapValues(new StructureToBioJava())
            .mapToPair(new FilterResidue())
            .filter(t -> t._2!=null).keys()
            .map(t -> t.replace(".", ","))
            .saveAsTextFile("/Users/yana/git/mutantpdb/src/main/resources/pdb_residues");
    sc.close();
}
 
Example 22
Project: BLASpark   File: OtherOperations.java
public static DistributedMatrix[] GetLU(DistributedMatrix A, boolean diagonalInL, boolean diagonalInU, JavaSparkContext jsc) {

    if ((diagonalInL && diagonalInU) || (!diagonalInL && !diagonalInU)) {
        LOG.error("Diagonal values must be in either upper or lower matrices");
        System.exit(-1);
    }

    DistributedMatrix[] returnedMatrices;

    if (A.getClass() == IndexedRowMatrix.class) {
        returnedMatrices = OtherOperations.GetLU_IRW((IndexedRowMatrix) A, diagonalInL, diagonalInU, jsc);
    } else if (A.getClass() == CoordinateMatrix.class) {
        returnedMatrices = OtherOperations.GetLU_COORD((CoordinateMatrix) A, diagonalInL, diagonalInU, jsc);
    } else if (A.getClass() == BlockMatrix.class) {
        // TODO: Implement this operation
        //returnedMatrices = OtherOperations.GetLU_BCK((BlockMatrix) A, diagonalInL, diagonalInU, jsc);
        returnedMatrices = null;
    } else {
        returnedMatrices = null;
    }

    return returnedMatrices;
}
 
Example 23
Project: BLASpark   File: L2.java
public static DenseVector DGEMV(double alpha, DistributedMatrix A, DenseVector x, double beta, DenseVector y, JavaSparkContext jsc) {

    // First form  y := beta*y.
    if (beta != 1.0) {
        if (beta == 0.0) {
            y = Vectors.zeros(y.size()).toDense();
        } else {
            BLAS.scal(beta, y);
        }
    }

    if (alpha == 0.0) {
        return y;
    }

    DenseVector tmpVector;

    // Form  y := alpha*A*x + y.
    // Case of IndexedRowMatrix
    if (A.getClass() == IndexedRowMatrix.class) {
        tmpVector = L2.DGEMV_IRW((IndexedRowMatrix) A, alpha, x, jsc);
    } else if (A.getClass() == CoordinateMatrix.class) {
        tmpVector = L2.DGEMV_COORD((CoordinateMatrix) A, alpha, x, jsc);
    } else if (A.getClass() == BlockMatrix.class) {
        tmpVector = L2.DGEMV_BCK((BlockMatrix) A, alpha, x, jsc);
    } else {
        tmpVector = null;
    }

    BLAS.axpy(1.0, tmpVector, y);

    return y;
}
 
Example 24
Project: BLASpark   File: L2.java
private static DenseVector DGEMV_COORD(CoordinateMatrix matrix, double alpha, DenseVector vector, JavaSparkContext jsc) {

    JavaRDD<MatrixEntry> items = matrix.entries().toJavaRDD();
    DenseVector result = items.mapPartitions(new MatrixEntriesMultiplication(vector, alpha))
            .reduce(new MatrixEntriesMultiplicationReducer());

    return result;
}
 
Example 25
Project: gcp   File: Spark1Batch.java
public static void main(String[] args) {
  SparkConf sc = new SparkConf().setAppName("POC-Batch");
  try(JavaSparkContext jsc = new JavaSparkContext(sc)) {

    JavaRDD<ExampleXML> records = jsc.wholeTextFiles("input/")
        .map(t -> t._2())
        .map(new ParseXML());

    System.out.printf("Amount of XMLs: %d\n", records.count());
  }
}
 
Example 26
Project: athena   File: MachineLearningManagerImpl.java
public DecisionTreeValidationSummary validateDecisionTreeAthenaFeatures(JavaSparkContext sc,
                                                                        FeatureConstraint featureConstraint,
                                                                        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
                                                                        DecisionTreeDetectionModel decisionTreeDetectionModel,
                                                                        Indexing indexing, Marking marking) {
    long start = System.nanoTime(); // <-- start

    JavaPairRDD<Object, BSONObject> mongoRDD;
    mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,            // Configuration
            MongoInputFormat.class,   // InputFormat: read from a live cluster.
            Object.class,             // Key class
            BSONObject.class          // Value class
    );

    DecisionTreeDetectionAlgorithm decisionTreeDetectionAlgorithm = (DecisionTreeDetectionAlgorithm) decisionTreeDetectionModel.getDetectionAlgorithm();

    DecisionTreeValidationSummary decisionTreeValidationSummary =
            new DecisionTreeValidationSummary(sc.sc(), decisionTreeDetectionAlgorithm.getNumClasses(), indexing, marking);

    DecisionTreeDistJob decisionTreeDistJob = new DecisionTreeDistJob();

    decisionTreeDistJob.validate(mongoRDD,
            athenaMLFeatureConfiguration,
            decisionTreeDetectionModel,
            decisionTreeValidationSummary);


    long end = System.nanoTime(); // <-- end
    long time = end - start;

    decisionTreeValidationSummary.setTotalValidationTime(time);
    return decisionTreeValidationSummary;
}
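The same five-line newAPIHadoopRDD setup recurs in every validate* method of this class (see Examples 27, 29, 30, 35, 36, and 39 below). It could be factored into a small private helper; a sketch, assuming mongodbConfig is the field used above:

private JavaPairRDD<Object, BSONObject> mongoRDD(JavaSparkContext sc) {
    return sc.newAPIHadoopRDD(
            mongodbConfig,            // Configuration
            MongoInputFormat.class,   // InputFormat: read from a live cluster
            Object.class,             // Key class
            BSONObject.class);        // Value class
}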
 
Example 27
Project: athena   File: MachineLearningManagerImpl.java
public GaussianMixtureValidationSummary validateGaussianMixtureAthenaFeatures(JavaSparkContext sc,
                                                                              FeatureConstraint featureConstraint,
                                                                              AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
                                                                              GaussianMixtureDetectionModel gaussianMixtureDetectionModel,
                                                                              Indexing indexing,
                                                                              Marking marking) {
    long start = System.nanoTime(); // <-- start

    JavaPairRDD<Object, BSONObject> mongoRDD;
    mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,            // Configuration
            MongoInputFormat.class,   // InputFormat: read from a live cluster.
            Object.class,             // Key class
            BSONObject.class          // Value class
    );

    GaussianMixtureDetectionAlgorithm gaussianMixtureDetectionAlgorithm = (GaussianMixtureDetectionAlgorithm) gaussianMixtureDetectionModel.getDetectionAlgorithm();
    GaussianMixtureValidationSummary gaussianMixtureValidationSummary =
            new GaussianMixtureValidationSummary(sc.sc(), gaussianMixtureDetectionAlgorithm.getK(), indexing, marking);


    GaussianMixtureDistJob gaussianMixtureDistJob = new GaussianMixtureDistJob();

    gaussianMixtureDistJob.validate(mongoRDD,
            athenaMLFeatureConfiguration,
            gaussianMixtureDetectionModel,
            gaussianMixtureValidationSummary);


    long end = System.nanoTime(); // <-- end
    long time = end - start;

    gaussianMixtureValidationSummary.setTotalValidationTime(time);
    return gaussianMixtureValidationSummary;
}
 
Example 28
Project: Java-Data-Science-Cookbook   File: ScalaTest.java
public static void main( String[] args ){
	String inputFile = "data/dummy.txt";
	SparkConf configuration = new SparkConf().setMaster("local[4]").setAppName("My App");
	JavaSparkContext sparkContext = new JavaSparkContext(configuration);
	JavaRDD<String> logData = sparkContext.textFile(inputFile).cache();

	long emptyLines = logData.filter(new Function<String, Boolean>() {
		private static final long serialVersionUID = 1L;
		public Boolean call(String s) {
			return s.length() == 0;
		}
	}).count();
	sparkContext.close();
	System.out.println("Empty Lines: " + emptyLines);
}
 
Example 29
Project: athena   File: MachineLearningManagerImpl.java
public LassoValidationSummary validateLassoAthenaFeatures(JavaSparkContext sc,
                                                          FeatureConstraint featureConstraint,
                                                          AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
                                                          LassoDetectionModel lassoDetectionModel,
                                                          Indexing indexing, Marking marking) {
    long start = System.nanoTime(); // <-- start

    JavaPairRDD<Object, BSONObject> mongoRDD;
    mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,            // Configuration
            MongoInputFormat.class,   // InputFormat: read from a live cluster.
            Object.class,             // Key class
            BSONObject.class          // Value class
    );

    LassoDetectionAlgorithm lassoDetectionAlgorithm =
            (LassoDetectionAlgorithm) lassoDetectionModel.getDetectionAlgorithm();

    LassoValidationSummary lassoValidationSummary = new LassoValidationSummary();
    lassoValidationSummary.setLassoDetectionAlgorithm(lassoDetectionAlgorithm);
    LassoDistJob lassoDistJob = new LassoDistJob();

    lassoDistJob.validate(mongoRDD,
            athenaMLFeatureConfiguration,
            lassoDetectionModel,
            lassoValidationSummary);


    long end = System.nanoTime(); // <-- end
    long time = end - start;
    lassoValidationSummary.setValidationTime(time);

    return lassoValidationSummary;
}
 
Example 30
Project: athena   File: MachineLearningManagerImpl.java
public RidgeRegressionValidationSummary validateRidgeRegressionAthenaFeatures(JavaSparkContext sc,
                                                                              FeatureConstraint featureConstraint,
                                                                              AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
                                                                              RidgeRegressionDetectionModel ridgeRegressionDetectionModel,
                                                                              Indexing indexing, Marking marking) {
    long start = System.nanoTime(); // <-- start

    JavaPairRDD<Object, BSONObject> mongoRDD;
    mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,            // Configuration
            MongoInputFormat.class,   // InputFormat: read from a live cluster.
            Object.class,             // Key class
            BSONObject.class          // Value class
    );

    RidgeRegressionDetectionAlgorithm ridgeRegressionDetectionAlgorithm =
            (RidgeRegressionDetectionAlgorithm) ridgeRegressionDetectionModel.getDetectionAlgorithm();

    RidgeRegressionValidationSummary ridgeRegressionValidationSummary = new RidgeRegressionValidationSummary();
    ridgeRegressionValidationSummary.setRidgeRegressionDetectionAlgorithm(ridgeRegressionDetectionAlgorithm);
    RidgeRegressionDistJob ridgeRegressionDistJob = new RidgeRegressionDistJob();

    ridgeRegressionDistJob.validate(mongoRDD,
            athenaMLFeatureConfiguration,
            ridgeRegressionDetectionModel,
            ridgeRegressionValidationSummary);


    long end = System.nanoTime(); // <-- end
    long time = end - start;
    ridgeRegressionValidationSummary.setValidationTime(time);
    return ridgeRegressionValidationSummary;
}
 
Example 31
Project: ViraPipe   File: Interleave.java
private static void splitFastq(FileStatus fst, String fqPath, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
  Path fqpath = new Path(fqPath);
  String fqname = fqpath.getName();
  String[] ns = fqname.split("\\.");
  List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);

  JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);

  splitRDD.foreach(split -> {
    FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split);
    writeFastqFile(fqreader, new Configuration(), splitDir + "/split_" + split.getStart() + "." + ns[1]);
  });
}
 
Example 32
Project: oryx2   File: AbstractSparkIT.java
@BeforeClass
public static void setUp() {
  SparkConf sparkConf = new SparkConf().setMaster("local[*]").setAppName("SparkIT");
  javaSparkContext = JavaSparkContext.fromSparkContext(SparkContext.getOrCreate(sparkConf));
}
 
Example 33
Project: Explainer   File: SparkUtils.java   Source Code and License 5 votes vote down vote up
public static SparkUtils getInstance() {
  SparkUtils result = instance;
  if (result == null) {
    synchronized (SparkUtils.class) {
      result = instance;
      if (result == null) {
        instance = result = new SparkUtils();
        SparkConf sparkConf = new SparkConf().setMaster("local").setAppName("Explainer");
        jsc = new JavaSparkContext(sparkConf);
        sqlContext = new SQLContext(jsc);
      }
    }
  }
  return result;
}
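Double-checked locking is only safe under the Java memory model if the field is volatile; without it, another thread may observe a partially constructed SparkUtils. The declaration is not shown above, but it would need to look like this (an assumption about the surrounding class):

// Assumption: declared in SparkUtils alongside jsc and sqlContext
private static volatile SparkUtils instance;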
 
Example 34
Project: oryx2   File: ALSUpdate.java
@Override
public void publishAdditionalModelData(JavaSparkContext sparkContext,
                                       PMML pmml,
                                       JavaRDD<String> newData,
                                       JavaRDD<String> pastData,
                                       Path modelParentPath,
                                       TopicProducer<String, String> modelUpdateTopic) {
  // Send item updates first, before users. That way, user-based endpoints like /recommend
  // may return 404 for longer, but when they do respond, the results will be more complete.
  log.info("Sending item / Y data as model updates");
  String yPathString = AppPMMLUtils.getExtensionValue(pmml, "Y");
  JavaPairRDD<String,float[]> productRDD = readFeaturesRDD(sparkContext, new Path(modelParentPath, yPathString));

  String updateBroker = modelUpdateTopic.getUpdateBroker();
  String topic = modelUpdateTopic.getTopic();

  // For now, there is no use in sending known users for each item
  productRDD.foreachPartition(new EnqueueFeatureVecsFn("Y", updateBroker, topic));

  log.info("Sending user / X data as model updates");
  String xPathString = AppPMMLUtils.getExtensionValue(pmml, "X");
  JavaPairRDD<String,float[]> userRDD = readFeaturesRDD(sparkContext, new Path(modelParentPath, xPathString));

  if (noKnownItems) {
    userRDD.foreachPartition(new EnqueueFeatureVecsFn("X", updateBroker, topic));
  } else {
    log.info("Sending known item data with model updates");
    JavaRDD<String[]> allData =
        (pastData == null ? newData : newData.union(pastData)).map(MLFunctions.PARSE_FN);
    JavaPairRDD<String,Collection<String>> knownItems = knownsRDD(allData, true);
    userRDD.join(knownItems).foreachPartition(
        new EnqueueFeatureVecsAndKnownItemsFn("X", updateBroker, topic));
  }
}
 
Example 35
Project: athena   File: MachineLearningManagerImpl.java
public NaiveBayesValidationSummary validateNaiveBayesAthenaFeatures(JavaSparkContext sc,
                                                                    FeatureConstraint featureConstraint,
                                                                    AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
                                                                    NaiveBayesDetectionModel naiveBayesDetectionModel,
                                                                    Indexing indexing, Marking marking) {
    long start = System.nanoTime(); // <-- start

    JavaPairRDD<Object, BSONObject> mongoRDD;
    mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,            // Configuration
            MongoInputFormat.class,   // InputFormat: read from a live cluster.
            Object.class,             // Key class
            BSONObject.class          // Value class
    );

    NaiveBayesDetectionAlgorithm naiveBayesDetectionAlgorithm = (NaiveBayesDetectionAlgorithm) naiveBayesDetectionModel.getDetectionAlgorithm();

    NaiveBayesValidationSummary naiveBayesValidationSummary =
            new NaiveBayesValidationSummary(sc.sc(), naiveBayesDetectionAlgorithm.getNumClasses(), indexing, marking);

    NaiveBayesDistJob naiveBayesDistJob = new NaiveBayesDistJob();


    naiveBayesDistJob.validate(mongoRDD,
            athenaMLFeatureConfiguration,
            naiveBayesDetectionModel,
            naiveBayesValidationSummary);


    long end = System.nanoTime(); // <-- end
    long time = end - start;

    naiveBayesValidationSummary.setTotalValidationTime(time);
    return naiveBayesValidationSummary;
}
 
Example 36
Project: athena   File: MachineLearningManagerImpl.java
public GradientBoostedTreesValidationSummary validateGradientBoostedTreesAthenaFeatures(JavaSparkContext sc,
                                                                                        FeatureConstraint featureConstraint,
                                                                                        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
                                                                                        GradientBoostedTreesDetectionModel gradientBoostedTreesDetectionModel,
                                                                                        Indexing indexing, Marking marking) {
    long start = System.nanoTime(); // <-- start

    JavaPairRDD<Object, BSONObject> mongoRDD;
    mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,            // Configuration
            MongoInputFormat.class,   // InputFormat: read from a live cluster.
            Object.class,             // Key class
            BSONObject.class          // Value class
    );

    GradientBoostedTreesDetectionAlgorithm gradientBoostedTreesDetectionAlgorithm =
            (GradientBoostedTreesDetectionAlgorithm) gradientBoostedTreesDetectionModel.getDetectionAlgorithm();

    GradientBoostedTreesValidationSummary gradientBoostedTreesValidationSummary =
            new GradientBoostedTreesValidationSummary(sc.sc(),
                    gradientBoostedTreesDetectionAlgorithm.getNumClasses(), indexing, marking);

    GradientBoostedTreesDistJob gradientBoostedTreesDistJob = new GradientBoostedTreesDistJob();

    gradientBoostedTreesDistJob.validate(mongoRDD,
            athenaMLFeatureConfiguration,
            gradientBoostedTreesDetectionModel,
            gradientBoostedTreesValidationSummary);


    long end = System.nanoTime(); // <-- end
    long time = end - start;

    gradientBoostedTreesValidationSummary.setTotalValidationTime(time);
    return gradientBoostedTreesValidationSummary;
}
 
Example 37
Project: movie-recommender   File: SparkModule.java
@Provides
CassandraIo<RawRating> providesCassandraRatingIO(JavaSparkContext sparkContext) {
    if (ratingCassandraIo != null) {
        return ratingCassandraIo;
    }
    ratingCassandraIo = new CassandraIo<>(RawRating.class, "dev", "ratings");
    ratingCassandraIo.setSparkContext(sparkContext);

    return ratingCassandraIo;
}
 
Example 38
Project: metadata-qa-marc   File: ParallelValidator.java
public static void main(String[] args) throws ParseException {

		final Validator validator = new Validator(args);
		ValidatorParameters params = validator.getParameters();
		validator.setDoPrintInProcessRecord(false);

		logger.info("Input file is " + params.getArgs());
		SparkConf conf = new SparkConf().setAppName("MarcCompletenessCount");
		JavaSparkContext context = new JavaSparkContext(conf);

		System.err.println(validator.getParameters().formatParameters());

		JavaRDD<String> inputFile = context.textFile(validator.getParameters().getArgs()[0]);

		JavaRDD<String> baseCountsRDD = inputFile
			.flatMap(content -> {
				MarcReader reader = ReadMarc.getMarcStringReader(content);
				Record marc4jRecord = reader.next();
				MarcRecord marcRecord = MarcFactory.createFromMarc4j(
					marc4jRecord, params.getDefaultRecordType(), params.getMarcVersion(), params.fixAlephseq());
				validator.processRecord(marcRecord, 1);
				return ValidationErrorFormatter
					.formatForSummary(marcRecord.getValidationErrors(), params.getFormat())
					.iterator();
			}
		);
		baseCountsRDD.saveAsTextFile(validator.getParameters().getFileName());
	}
 
Example 39
Project: athena   File: MachineLearningManagerImpl.java
public LinearRegressionValidationSummary validateLinearRegressionAthenaFeatures(JavaSparkContext sc,
                                                                                FeatureConstraint featureConstraint,
                                                                                AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
                                                                                LinearRegressionDetectionModel linearRegressionDetectionModel,
                                                                                Indexing indexing, Marking marking) {
    long start = System.nanoTime(); // <-- start

    JavaPairRDD<Object, BSONObject> mongoRDD;
    mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,            // Configuration
            MongoInputFormat.class,   // InputFormat: read from a live cluster.
            Object.class,             // Key class
            BSONObject.class          // Value class
    );

    LinearRegressionDetectionAlgorithm linearRegressionDetectionAlgorithm =
            (LinearRegressionDetectionAlgorithm) linearRegressionDetectionModel.getDetectionAlgorithm();

    LinearRegressionValidationSummary linearRegressionValidationSummary =
            new LinearRegressionValidationSummary();
    linearRegressionValidationSummary.setLinearRegressionDetectionAlgorithm(linearRegressionDetectionAlgorithm);

    LinearRegressionDistJob linearRegressionDistJob = new LinearRegressionDistJob();

    linearRegressionDistJob.validate(mongoRDD,
            athenaMLFeatureConfiguration,
            linearRegressionDetectionModel,
            linearRegressionValidationSummary);


    long end = System.nanoTime(); // <-- end
    long time = end - start;
    linearRegressionValidationSummary.setValidationTime(time);


    return linearRegressionValidationSummary;
}
 
Example 40
Project: rdf2x   File: TestUtils.java
/**
 * Parse RDF file from resources folder
 * @param sc spark context to use for parsing
 * @param fileName name of the file to parse
 * @return RDD of quads from the requested file
 */
public static JavaRDD<Quad> getQuadsRDD(JavaSparkContext sc, String fileName) {
    QuadParser parser = new ElephasQuadParser(
            new QuadParserConfig()
                    .setBatchSize(2),
            sc
    );
    String path = TestUtils.getDatasetPath(fileName);
    return parser.parseQuads(path);
}
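A hypothetical usage of this helper from a test (the file name and master URL are placeholders, not taken from the project):

SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("rdf2x-test");
try (JavaSparkContext sc = new JavaSparkContext(conf)) {
    JavaRDD<Quad> quads = TestUtils.getQuadsRDD(sc, "sample.nq");
    System.out.println("Parsed quads: " + quads.count());
}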