Java Code Examples for org.apache.spark.api.java.JavaSparkContext#sequenceFile()

The following examples show how to use org.apache.spark.api.java.JavaSparkContext#sequenceFile(). They are drawn from open-source projects; the originating project and source file are noted above each example.
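Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) showing the two sequenceFile overloads: one takes only a path and the key/value Writable classes, the other additionally takes a minimum partition count. The HDFS path is a placeholder; also note that Hadoop reuses Writable instances between records, so values are usually copied into plain Java types before being collected or cached.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class SequenceFileReadSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("sequenceFile-sketch").setMaster("local[*]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            // Basic overload: path, key class, value class (the path is a placeholder)
            JavaPairRDD<Text, IntWritable> raw =
                    sc.sequenceFile("hdfs:///tmp/word-counts", Text.class, IntWritable.class);

            // Overload with a minimum number of partitions (here 8), as used in Example 4 below
            JavaPairRDD<Text, IntWritable> rawPartitioned =
                    sc.sequenceFile("hdfs:///tmp/word-counts", Text.class, IntWritable.class, 8);
            System.out.println("partitions: " + rawPartitioned.getNumPartitions());

            // Hadoop Writables are reused between records, so copy them into plain Java types
            // before collecting, caching, or comparing
            JavaPairRDD<String, Integer> counts =
                    raw.mapToPair(t -> new Tuple2<>(t._1().toString(), t._2().get()));

            counts.take(10).forEach(t -> System.out.println(t._1() + " -> " + t._2()));
        }
    }
}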
Example 1
Source File: SparkUtil.java    From kylin-on-parquet-v2 with Apache License 2.0
/**
 * Reads the given path as a JavaPairRDD of SequenceFile key/value pairs; the path may contain a second level of sub-folders.
 * @param inputPath the input path to read
 * @param fs the FileSystem used to list the input path
 * @param sc the JavaSparkContext used to create the RDD
 * @param keyClass the SequenceFile key class
 * @param valueClass the SequenceFile value class
 * @return a JavaPairRDD over the input path, or over its first-level sub-folders if any exist
 * @throws IOException if listing the input path fails
 */
public static JavaPairRDD parseInputPath(String inputPath, FileSystem fs, JavaSparkContext sc, Class keyClass,
        Class valueClass) throws IOException {
    List<String> inputFolders = Lists.newArrayList();
    Path inputHDFSPath = new Path(inputPath);
    FileStatus[] fileStatuses = fs.listStatus(inputHDFSPath);
    boolean hasDir = false;
    for (FileStatus stat : fileStatuses) {
        // Collect first-level sub-folders, skipping "_"-prefixed metadata folders such as "_temporary"
        if (stat.isDirectory() && !stat.getPath().getName().startsWith("_")) {
            hasDir = true;
            inputFolders.add(stat.getPath().toString());
        }
    }

    // No sub-folders: read the input path itself
    if (!hasDir) {
        return sc.sequenceFile(inputHDFSPath.toString(), keyClass, valueClass);
    }

    // Otherwise read all first-level sub-folders as a comma-separated list of paths
    return sc.sequenceFile(StringUtil.join(inputFolders, ","), keyClass, valueClass);
}
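For context, a hypothetical call site (not part of the Kylin sources) might look like the sketch below; the HDFS path and the Text/Text key and value classes are placeholders for illustration.

// Hypothetical usage sketch; the path and the key/value classes are placeholders
public static void readWithParseInputPath() throws IOException {
    Configuration hadoopConf = new Configuration();
    FileSystem fs = FileSystem.get(hadoopConf);
    JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("parseInputPath-sketch").setMaster("local[*]"));
    JavaPairRDD rdd = SparkUtil.parseInputPath("hdfs:///some/input/path", fs, sc, Text.class, Text.class);
    System.out.println("record count: " + rdd.count());
    sc.stop();
}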
 
Example 2
Source File: SparkUtil.java    From kylin with Apache License 2.0
/**
 * Reads the given path as a JavaPairRDD of SequenceFile key/value pairs; the path may contain a second level of sub-folders.
 * @param inputPath the input path to read
 * @param fs the FileSystem used to list the input path
 * @param sc the JavaSparkContext used to create the RDD
 * @param keyClass the SequenceFile key class
 * @param valueClass the SequenceFile value class
 * @return a JavaPairRDD over the input path, or over its first-level sub-folders if any exist
 * @throws IOException if listing the input path fails
 */
public static JavaPairRDD parseInputPath(String inputPath, FileSystem fs, JavaSparkContext sc, Class keyClass,
        Class valueClass) throws IOException {
    List<String> inputFolders = Lists.newArrayList();
    Path inputHDFSPath = new Path(inputPath);
    FileStatus[] fileStatuses = fs.listStatus(inputHDFSPath);
    boolean hasDir = false;
    for (FileStatus stat : fileStatuses) {
        // Collect first-level sub-folders, skipping "_"-prefixed metadata folders such as "_temporary"
        if (stat.isDirectory() && !stat.getPath().getName().startsWith("_")) {
            hasDir = true;
            inputFolders.add(stat.getPath().toString());
        }
    }

    // No sub-folders: read the input path itself
    if (!hasDir) {
        return sc.sequenceFile(inputHDFSPath.toString(), keyClass, valueClass);
    }

    // Otherwise read all first-level sub-folders as a comma-separated list of paths
    return sc.sequenceFile(StringUtil.join(inputFolders, ","), keyClass, valueClass);
}
 
Example 3
Source File: SequenceFile.java    From sparkResearch with Apache License 2.0
protected static void run(JavaSparkContext sparkContext) {
    // Read a SequenceFile of Text keys and IntWritable values
    JavaPairRDD<Text, IntWritable> javaPairRDD = sparkContext.sequenceFile("url", Text.class, IntWritable.class);
    // Convert the Hadoop Writable types to plain Java types
    JavaPairRDD<String, Integer> pairRDD = javaPairRDD.mapToPair(new sequenceToConvert());
    // Write: convert back to Writable types first, since the key/value classes passed to
    // saveAsHadoopFile must match the RDD's actual key/value types
    pairRDD.mapToPair(t -> new Tuple2<>(new Text(t._1()), new IntWritable(t._2())))
            .saveAsHadoopFile("url", Text.class, IntWritable.class, SequenceFileOutputFormat.class);
}
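The sequenceToConvert function used above is not shown in this example; a minimal sketch of such a conversion, assuming it only unwraps the Hadoop Writable types into plain Java types, could look like this:

// Hypothetical sketch of the conversion function referenced above (not part of the original source)
static class sequenceToConvert implements PairFunction<Tuple2<Text, IntWritable>, String, Integer> {
    @Override
    public Tuple2<String, Integer> call(Tuple2<Text, IntWritable> tuple) {
        // Copy out of the reused Writable instances into plain Java types
        return new Tuple2<>(tuple._1().toString(), tuple._2().get());
    }
}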
 
Example 4
Source File: EntitySalienceTestingSparkRunner.java    From ambiverse-nlu with Apache License 2.0
    @Override
    protected int run() throws Exception {

        SparkConf sparkConf = new SparkConf()
                .setAppName("EntitySalienceTrainingSparkRunner")
                .set("spark.hadoop.validateOutputSpecs", "false")
                //.set("spark.yarn.executor.memoryOverhead", "4096")
                .set("spark.rdd.compress", "true")
                .set("spark.core.connection.ack.wait.timeout", "600")
                .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
                //.set("spark.kryo.registrationRequired", "true")
                .registerKryoClasses(new Class[] {SCAS.class, LabeledPoint.class, SparseVector.class, int[].class, double[].class,
                        InternalRow[].class, GenericInternalRow.class, Object[].class, GenericArrayData.class,
                        VectorIndexer.class})
                ;//setMaster("local"); //Remove this if you run it on the server.

        TrainingSettings trainingSettings = new TrainingSettings();

        if(defaultConf != null) {
            trainingSettings.setAidaDefaultConf(defaultConf);
        }


        JavaSparkContext sc = new JavaSparkContext(sparkConf);

        int totalCores = Integer.parseInt(sc.getConf().get("spark.executor.instances"))
                * Integer.parseInt(sc.getConf().get("spark.executor.cores"));

//        int totalCores = 2;

        //trainingSettings.setClassificationMethod(TrainingSettings.ClassificationMethod.LOG_REG);

        trainingSettings.setPositiveInstanceScalingFactor(1);
        if(trainingSettings.getFeatureExtractor().equals(TrainingSettings.FeatureExtractor.ANNOTATE_AND_ENTITY_SALIENCE)) {
            sc.addFile(trainingSettings.getBigramCountCache());
            sc.addFile(trainingSettings.getKeywordCountCache());
            sc.addFile(trainingSettings.getWordContractionsCache());
            sc.addFile(trainingSettings.getWordExpansionsCache());
            if (trainingSettings.getAidaDefaultConf().equals("db")) {
                sc.addFile(trainingSettings.getDatabaseAida());
            } else {
                sc.addFile(trainingSettings.getCassandraConfig());
            }
        }

        SQLContext sqlContext = new SQLContext(sc);


        int partitionNumber = 3 * totalCores;
        //Read training documents serialized as SCAS
        JavaPairRDD<Text, SCAS> documents = sc.sequenceFile(input, Text.class, SCAS.class, partitionNumber);

        //Instantiate a training spark runner
        TrainingSparkRunner trainingSparkRunner = new TrainingSparkRunner();


        PipelineModel trainingModel = (PipelineModel) sc.objectFile(model).first();

        //Evaluate the model and write out the evaluation metrics.
        trainingSparkRunner.evaluate(sc, sqlContext, documents, trainingModel, trainingSettings, output+"/"+sc.getConf().getAppId()+"/");

        return 0;
    }
 
Example 5
Source File: TestPairSequenceRecordReaderBytesFunction.java    From DataVec with Apache License 2.0
@Test
public void test() throws Exception {
    //Goal: combine separate files into a Hadoop SequenceFile, for later parsing by a SequenceRecordReader
    //For example: use this to combine input and label data from separate files for training an RNN
    JavaSparkContext sc = getContext();

    ClassPathResource cpr = new ClassPathResource("/video/shapes_0.mp4");
    String path = cpr.getFile().getAbsolutePath();
    //Strip the 12-character file name ("shapes_0.mp4") to get the containing folder
    String folder = path.substring(0, path.length() - 12);
    path = folder + "*";

    PathToKeyConverter pathConverter = new PathToKeyConverterFilename();
    JavaPairRDD<Text, BytesPairWritable> toWrite =
                    DataVecSparkUtil.combineFilesForSequenceFile(sc, path, path, pathConverter);

    Path p = Files.createTempDirectory("dl4j_rrbytesPairOut");
    p.toFile().deleteOnExit();
    String outPath = p.toString() + "/out";
    new File(outPath).deleteOnExit();
    toWrite.saveAsNewAPIHadoopFile(outPath, Text.class, BytesPairWritable.class, SequenceFileOutputFormat.class);

    //Load back into memory:
    JavaPairRDD<Text, BytesPairWritable> fromSeq = sc.sequenceFile(outPath, Text.class, BytesPairWritable.class);

    SequenceRecordReader srr1 = getReader();
    SequenceRecordReader srr2 = getReader();
    PairSequenceRecordReaderBytesFunction psrbf = new PairSequenceRecordReaderBytesFunction(srr1, srr2);

    JavaRDD<Tuple2<List<List<Writable>>, List<List<Writable>>>> writables = fromSeq.map(psrbf);
    List<Tuple2<List<List<Writable>>, List<List<Writable>>>> fromSequenceFile = writables.collect();

    //Load manually (single copy) and compare:
    InputSplit is = new FileSplit(new File(folder), new String[] {"mp4"}, true);
    SequenceRecordReader srr = getReader();
    srr.initialize(is);

    List<List<List<Writable>>> list = new ArrayList<>(4);
    while (srr.hasNext()) {
        list.add(srr.sequenceRecord());
    }

    assertEquals(4, list.size());
    assertEquals(4, fromSequenceFile.size());

    boolean[] found = new boolean[4];
    for (int i = 0; i < 4; i++) {
        int foundIndex = -1;
        Tuple2<List<List<Writable>>, List<List<Writable>>> tuple2 = fromSequenceFile.get(i);
        List<List<Writable>> seq1 = tuple2._1();
        List<List<Writable>> seq2 = tuple2._2();
        assertEquals(seq1, seq2);

        for (int j = 0; j < 4; j++) {
            if (seq1.equals(list.get(j))) {
                if (foundIndex != -1)
                    fail(); //Already matched another local value -> this Spark value equals two or more local values (shouldn't happen)
                foundIndex = j;
                if (found[foundIndex])
                    fail(); //A different Spark value already matched this local value -> duplicates in the Spark list
                found[foundIndex] = true; //mark this local value as matched
            }
        }
    }
    int count = 0;
    for (boolean b : found)
        if (b)
            count++;
    assertEquals(4, count); //Expect all 4 and exactly 4 pairwise matches between spark and local versions

}
 
Example 6
Source File: TestPairSequenceRecordReaderBytesFunction.java    From deeplearning4j with Apache License 2.0
@Test
public void test() throws Exception {
    //Goal: combine separate files into a Hadoop SequenceFile, for later parsing by a SequenceRecordReader
    //For example: use this to combine input and label data from separate files for training an RNN
    JavaSparkContext sc = getContext();

    File f = testDir.newFolder();
    new ClassPathResource("datavec-spark/video/").copyDirectory(f);
    String path = f.getAbsolutePath() + "/*";

    PathToKeyConverter pathConverter = new PathToKeyConverterFilename();
    JavaPairRDD<Text, BytesPairWritable> toWrite =
                    DataVecSparkUtil.combineFilesForSequenceFile(sc, path, path, pathConverter);

    Path p = Files.createTempDirectory("dl4j_rrbytesPairOut");
    p.toFile().deleteOnExit();
    String outPath = p.toString() + "/out";
    new File(outPath).deleteOnExit();
    toWrite.saveAsNewAPIHadoopFile(outPath, Text.class, BytesPairWritable.class, SequenceFileOutputFormat.class);

    //Load back into memory:
    JavaPairRDD<Text, BytesPairWritable> fromSeq = sc.sequenceFile(outPath, Text.class, BytesPairWritable.class);

    SequenceRecordReader srr1 = getReader();
    SequenceRecordReader srr2 = getReader();
    PairSequenceRecordReaderBytesFunction psrbf = new PairSequenceRecordReaderBytesFunction(srr1, srr2);

    JavaRDD<Tuple2<List<List<Writable>>, List<List<Writable>>>> writables = fromSeq.map(psrbf);
    List<Tuple2<List<List<Writable>>, List<List<Writable>>>> fromSequenceFile = writables.collect();

    //Load manually (single copy) and compare:
    InputSplit is = new FileSplit(f, new String[] {"mp4"}, true);
    SequenceRecordReader srr = getReader();
    srr.initialize(is);

    List<List<List<Writable>>> list = new ArrayList<>(4);
    while (srr.hasNext()) {
        list.add(srr.sequenceRecord());
    }

    assertEquals(4, list.size());
    assertEquals(4, fromSequenceFile.size());

    boolean[] found = new boolean[4];
    for (int i = 0; i < 4; i++) {
        int foundIndex = -1;
        Tuple2<List<List<Writable>>, List<List<Writable>>> tuple2 = fromSequenceFile.get(i);
        List<List<Writable>> seq1 = tuple2._1();
        List<List<Writable>> seq2 = tuple2._2();
        assertEquals(seq1, seq2);

        for (int j = 0; j < 4; j++) {
            if (seq1.equals(list.get(j))) {
                if (foundIndex != -1)
                    fail(); //Already matched another local value -> this Spark value equals two or more local values (shouldn't happen)
                foundIndex = j;
                if (found[foundIndex])
                    fail(); //A different Spark value already matched this local value -> duplicates in the Spark list
                found[foundIndex] = true; //mark this local value as matched
            }
        }
    }
    int count = 0;
    for (boolean b : found)
        if (b)
            count++;
    assertEquals(4, count); //Expect all 4 and exactly 4 pairwise matches between spark and local versions

}
 
Example 7
Source File: TestDataVecDataSetFunctions.java    From deeplearning4j with Apache License 2.0
@Test
public void testDataVecSequencePairDataSetFunction() throws Exception {
    JavaSparkContext sc = getContext();

    File f = testDir.newFolder();
    ClassPathResource cpr = new ClassPathResource("dl4j-spark/csvsequence/");
    cpr.copyDirectory(f);
    String path = f.getAbsolutePath() + "/*";

    PathToKeyConverter pathConverter = new PathToKeyConverterFilename();
    JavaPairRDD<Text, BytesPairWritable> toWrite =
                    DataVecSparkUtil.combineFilesForSequenceFile(sc, path, path, pathConverter);

    Path p = testDir.newFolder("dl4j_testSeqPairFn").toPath();
    p.toFile().deleteOnExit();
    String outPath = p.toString() + "/out";
    new File(outPath).deleteOnExit();
    toWrite.saveAsNewAPIHadoopFile(outPath, Text.class, BytesPairWritable.class, SequenceFileOutputFormat.class);

    //Load from sequence file:
    JavaPairRDD<Text, BytesPairWritable> fromSeq = sc.sequenceFile(outPath, Text.class, BytesPairWritable.class);

    SequenceRecordReader srr1 = new CSVSequenceRecordReader(1, ",");
    SequenceRecordReader srr2 = new CSVSequenceRecordReader(1, ",");
    PairSequenceRecordReaderBytesFunction psrbf = new PairSequenceRecordReaderBytesFunction(srr1, srr2);
    JavaRDD<Tuple2<List<List<Writable>>, List<List<Writable>>>> writables = fromSeq.map(psrbf);

    //Map to DataSet:
    DataVecSequencePairDataSetFunction pairFn = new DataVecSequencePairDataSetFunction();
    JavaRDD<DataSet> data = writables.map(pairFn);
    List<DataSet> sparkData = data.collect();


    //Now: do the same thing locally (SequenceRecordReaderDataSetIterator) and compare
    String featuresPath = FilenameUtils.concat(f.getAbsolutePath(), "csvsequence_%d.txt");

    SequenceRecordReader featureReader = new CSVSequenceRecordReader(1, ",");
    SequenceRecordReader labelReader = new CSVSequenceRecordReader(1, ",");
    featureReader.initialize(new NumberedFileInputSplit(featuresPath, 0, 2));
    labelReader.initialize(new NumberedFileInputSplit(featuresPath, 0, 2));

    SequenceRecordReaderDataSetIterator iter =
                    new SequenceRecordReaderDataSetIterator(featureReader, labelReader, 1, -1, true);

    List<DataSet> localData = new ArrayList<>(3);
    while (iter.hasNext())
        localData.add(iter.next());

    assertEquals(3, sparkData.size());
    assertEquals(3, localData.size());

    for (int i = 0; i < 3; i++) {
        //Check shapes etc.; data set order may differ between Spark and local
        DataSet dsSpark = sparkData.get(i);
        DataSet dsLocal = localData.get(i);

        assertNull(dsSpark.getFeaturesMaskArray());
        assertNull(dsSpark.getLabelsMaskArray());

        INDArray fSpark = dsSpark.getFeatures();
        INDArray fLocal = dsLocal.getFeatures();
        INDArray lSpark = dsSpark.getLabels();
        INDArray lLocal = dsLocal.getLabels();

        val s = new long[] {1, 3, 4}; //1 example, 3 values, 4 time steps
        assertArrayEquals(s, fSpark.shape());
        assertArrayEquals(s, fLocal.shape());
        assertArrayEquals(s, lSpark.shape());
        assertArrayEquals(s, lLocal.shape());
    }


    //Check that results are the same (ignoring order)
    boolean[] found = new boolean[3];
    for (int i = 0; i < 3; i++) {
        int foundIndex = -1;
        DataSet ds = sparkData.get(i);
        for (int j = 0; j < 3; j++) {
            if (ds.equals(localData.get(j))) {
                if (foundIndex != -1)
                    fail(); //Already matched another local value -> this Spark value equals two or more local values (shouldn't happen)
                foundIndex = j;
                if (found[foundIndex])
                    fail(); //A different Spark value already matched this local value -> duplicates in the Spark list
                found[foundIndex] = true; //mark this local value as matched
            }
        }
    }
    int count = 0;
    for (boolean b : found)
        if (b)
            count++;
    assertEquals(3, count); //Expect all 3 and exactly 3 pairwise matches between spark and local versions
}