Java Code Examples for org.apache.spark.api.java.JavaSparkContext#addFile()

The following examples show how to use org.apache.spark.api.java.JavaSparkContext#addFile(). Each example is taken from an open-source project; the source file, originating project, and license are listed above it.
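For orientation: addFile() registers a file with the driver so that Spark copies it to every executor, and a task then looks up its local copy by bare file name with SparkFiles.get(). Below is a minimal, hedged sketch of that round trip; the file path and RDD contents are placeholders, not taken from the projects that follow.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.SparkFiles;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class AddFileSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("AddFileSketch");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            // Driver side: register the file once; Spark ships a copy to every executor.
            sc.addFile("hdfs:///data/lookup.txt"); // hypothetical path

            // Executor side: tasks resolve their local copy by file name, not by the original path.
            JavaRDD<String> ids = sc.parallelize(Arrays.asList("a", "b", "c"));
            JavaRDD<String> resolved = ids.map(id -> id + " -> " + SparkFiles.get("lookup.txt"));
            resolved.collect().forEach(System.out::println);
        }
    }
}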
Example 1
Source File: FileIO.java    From RP-DBSCAN with Apache License 2.0
public static List<String> broadCastData(JavaSparkContext sc, Configuration conf, String dirPath) throws IOException
{
	FileSystem fs = FileSystem.get(conf);
	FileStatus[] status = fs.listStatus(new Path(dirPath));
	List<String> metaPaths = new ArrayList<String>();

	long size = 0;
	
	// Register each file under dirPath with the Spark context so Spark copies it to every executor,
	// and collect the bare file names for later lookup on the executors (e.g. via SparkFiles.get()).
	for(int i=0; i<status.length; i++)
	{
		String path = status[i].getPath().toString();
		String fileName = status[i].getPath().getName();
		sc.addFile(path);
		metaPaths.add(fileName);
		
		size += status[i].getLen();
	}

	System.out.println("size : " + size);
	return metaPaths;
}
 
Example 2
Source File: GATKSparkTool.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Register the reference file (and associated dictionary and index) to be downloaded to every node using Spark's
 * copying mechanism ({@code SparkContext#addFile()}).
 * @param ctx the Spark context
 * @param referencePath the reference file, can be a local file or a remote path
 * @return the reference file name; the absolute path of the file can be found by a Spark task using {@code SparkFiles#get()}
 */
protected static String addReferenceFilesForSpark(JavaSparkContext ctx, Path referencePath) {
    if (referencePath == null) {
        return null;
    }
    Path indexPath = ReferenceSequenceFileFactory.getFastaIndexFileName(referencePath);
    Path dictPath = ReferenceSequenceFileFactory.getDefaultDictionaryForReferenceSequence(referencePath);
    Path gziPath = GZIIndex.resolveIndexNameForBgzipFile(referencePath);

    ctx.addFile(referencePath.toUri().toString());
    if (Files.exists(indexPath)) {
        ctx.addFile(indexPath.toUri().toString());
    }
    if (Files.exists(dictPath)) {
        ctx.addFile(dictPath.toUri().toString());
    }
    if (Files.exists(gziPath)) {
        ctx.addFile(gziPath.toUri().toString());
    }

    return referencePath.getFileName().toString();
}
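As a follow-up, a hedged sketch of the executor-side counterpart: a task turns the bare name returned by addReferenceFilesForSpark() back into a local absolute path with SparkFiles.get(), as the Javadoc above notes. The helper below and its readNames RDD are illustrative only, not GATK API, and assume org.apache.spark.SparkFiles is imported.

// Hypothetical usage (not part of GATK): distribute the reference, then let each task resolve its local copy.
protected static JavaRDD<String> tagReadsWithLocalReference(JavaSparkContext ctx, Path referencePath, JavaRDD<String> readNames) {
    final String refName = addReferenceFilesForSpark(ctx, referencePath); // method shown above
    return readNames.map(read -> read + "\t" + SparkFiles.get(refName));  // absolute path on this executor
}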
 
Example 3
Source File: GATKSparkTool.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Register the VCF file (and associated index) to be downloaded to every node using Spark's copying mechanism
 * ({@code SparkContext#addFile()}).
 * @param ctx the Spark context
 * @param vcfFileNames the VCF files, can be local files or remote paths
 * @return the VCF file names; the absolute paths of the files can be found by a Spark task using {@code SparkFiles#get()}
 */
protected static List<String> addVCFsForSpark(JavaSparkContext ctx, List<String> vcfFileNames) {
    for (String vcfFileName : vcfFileNames) {
        String vcfIndexFileName;
        if (vcfFileName.endsWith(FileExtensions.VCF)) {
            vcfIndexFileName = vcfFileName + FileExtensions.VCF_INDEX;
        } else if (vcfFileName.endsWith(FileExtensions.COMPRESSED_VCF)) {
            vcfIndexFileName = vcfFileName + FileExtensions.COMPRESSED_VCF_INDEX;
        } else {
            throw new IllegalArgumentException("Unrecognized known sites file extension. Must be .vcf or .vcf.gz");
        }
        ctx.addFile(vcfFileName);
        if (Files.exists(IOUtils.getPath(vcfIndexFileName))) {
            ctx.addFile(vcfIndexFileName);
        }
    }
    return vcfFileNames.stream().map(name -> IOUtils.getPath(name).getFileName().toString()).collect(Collectors.toList());
}
 
Example 4
Source File: BwaSparkEngine.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * @param ctx           the Spark context
 * @param referenceFile the path to the reference file named <i>_prefix_.fa</i>, which is used to find the image file with name <i>_prefix_.fa.img</i>.
 *                      Can be <code>null</code> if the indexFileName is provided.
 * @param indexFileName the index image file name that already exists, or <code>null</code> to have the image file automatically distributed.
 * @param inputHeader   the SAM file header to use for reads
 * @param refDictionary the sequence dictionary to use for reads if the SAM file header doesn't have one (or it's empty)
 */
public BwaSparkEngine(final JavaSparkContext ctx,
                      final String referenceFile,
                      final String indexFileName,
                      SAMFileHeader inputHeader,
                      final SAMSequenceDictionary refDictionary) {
    Utils.nonNull(referenceFile);
    Utils.nonNull(inputHeader);
    this.ctx = ctx;
    if (indexFileName != null) {
        this.indexFileName = indexFileName;
        this.resolveIndexFileName = false;
    } else {
        String indexFile = referenceFile + REFERENCE_INDEX_IMAGE_FILE_SUFFIX;
        ctx.addFile(indexFile); // distribute index file to all executors
        this.indexFileName = IOUtils.getPath(indexFile).getFileName().toString();
        this.resolveIndexFileName = true;
    }

    if (inputHeader.getSequenceDictionary() == null || inputHeader.getSequenceDictionary().isEmpty()) {
        Utils.nonNull(refDictionary);
        inputHeader = inputHeader.clone();
        inputHeader.setSequenceDictionary(refDictionary);
    }
    broadcastHeader = ctx.broadcast(inputHeader);
}
 
Example 5
Source File: SparkSegmentGenerationJobRunner.java    From incubator-pinot with Apache License 2.0
protected void packPluginsToDistributedCache(JavaSparkContext sparkContext) {
  String pluginsRootDir = PluginManager.get().getPluginsRootDir();
  if (pluginsRootDir == null) {
    LOGGER.warn("Local Pinot plugins directory is null, skip packaging...");
    return;
  }
  if (new File(pluginsRootDir).exists()) {
    File pluginsTarGzFile = new File(PINOT_PLUGINS_TAR_GZ);
    try {
      TarGzCompressionUtils.createTarGzOfDirectory(pluginsRootDir, pluginsTarGzFile.getPath());
    } catch (IOException e) {
      LOGGER.error("Failed to tar plugins directory", e);
    }
    sparkContext.addFile(pluginsTarGzFile.getAbsolutePath());
    String pluginsIncludes = System.getProperty(PLUGINS_INCLUDE_PROPERTY_NAME);
    if (pluginsIncludes != null) {
      sparkContext.getConf().set(PLUGINS_INCLUDE_PROPERTY_NAME, pluginsIncludes);
    }
  } else {
    LOGGER.warn("Cannot find local Pinot plugins directory at [{}]", pluginsRootDir);
  }
}
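For completeness, a rough sketch of the worker side, assuming the PINOT_PLUGINS_TAR_GZ constant used above and an org.apache.spark.SparkFiles import; extracting the archive and loading plugins from it is Pinot-specific and not shown.

// Hypothetical helper (not part of Pinot): resolve the shipped plugins tarball on an executor.
protected static String localPluginsTarGzPath() {
  // addFile() keys the distributed copy by its file name, so look it up by name only.
  return SparkFiles.get(new File(PINOT_PLUGINS_TAR_GZ).getName());
}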
 
Example 6
Source File: EntitySalienceTrainingSparkRunner.java    From ambiverse-nlu with Apache License 2.0
@Override
    protected int run() throws Exception {

        SparkConf sparkConf = new SparkConf()
                .setAppName("EntitySalienceTrainingSparkRunner")
                .set("spark.hadoop.validateOutputSpecs", "false")
                .set("spark.yarn.executor.memoryOverhead", "3072")
                .set("spark.rdd.compress", "true")
                .set("spark.core.connection.ack.wait.timeout", "600")
                .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
                //.set("spark.kryo.registrationRequired", "true")
                .registerKryoClasses(new Class[] {SCAS.class, LabeledPoint.class, SparseVector.class, int[].class, double[].class,
                        InternalRow[].class, GenericInternalRow.class, Object[].class, GenericArrayData.class,
                        VectorIndexer.class})
                ;//.setMaster("local[4]"); //Remove this if you run it on the server.

        TrainingSettings trainingSettings = new TrainingSettings();

        if(folds != null) {
            trainingSettings.setNumFolds(folds);
        }
        if(method != null) {
            trainingSettings.setClassificationMethod(TrainingSettings.ClassificationMethod.valueOf(method));
        }
        if(defaultConf != null) {
            trainingSettings.setAidaDefaultConf(defaultConf);
        }

        if(scalingFactor != null) {
            trainingSettings.setPositiveInstanceScalingFactor(scalingFactor);
        }

        JavaSparkContext sc = new JavaSparkContext(sparkConf);
        int totalCores = Integer.parseInt(sc.getConf().get("spark.executor.instances"))
                * Integer.parseInt(sc.getConf().get("spark.executor.cores"));

//        int totalCores = 4;
////        trainingSettings.setFeatureExtractor(TrainingSettings.FeatureExtractor.ANNOTATE_AND_ENTITY_SALIENCE);
////        trainingSettings.setAidaDefaultConf("db");
//        //trainingSettings.setClassificationMethod(TrainingSettings.ClassificationMethod.LOG_REG);
//        trainingSettings.setPositiveInstanceScalingFactor(1);

        //Add the cache files to each node only if annotation is required.
        //The input documents could already be annotated, and in this case no caches are needed.
        if(trainingSettings.getFeatureExtractor().equals(TrainingSettings.FeatureExtractor.ANNOTATE_AND_ENTITY_SALIENCE)) {
            sc.addFile(trainingSettings.getBigramCountCache());
            sc.addFile(trainingSettings.getKeywordCountCache());
            sc.addFile(trainingSettings.getWordContractionsCache());
            sc.addFile(trainingSettings.getWordExpansionsCache());
            if (trainingSettings.getAidaDefaultConf().equals("db")) {
                sc.addFile(trainingSettings.getDatabaseAida());
            } else {
                sc.addFile(trainingSettings.getCassandraConfig());
            }
        }

        SQLContext sqlContext = new SQLContext(sc);


        FileSystem fs = FileSystem.get(new Configuration());

        int partitionNumber = 3 * totalCores;
        if(partitions != null) {
            partitionNumber = partitions;
        }

        //Read training documents serialized as SCAS
        JavaRDD<SCAS> documents = sc.sequenceFile(input, Text.class, SCAS.class, partitionNumber).values();

        //Instantiate a training Spark runner
        TrainingSparkRunner trainingSparkRunner = new TrainingSparkRunner();

        //Train a model
        CrossValidatorModel model = trainingSparkRunner.crossValidate(sc, sqlContext, documents, trainingSettings);


        //Create the model path
        String modelPath = output+"/"+sc.getConf().getAppId()+"/model_"+trainingSettings.getClassificationMethod();

        //Delete the old model if there is one
        fs.delete(new Path(modelPath), true);

        //Save the new model
        List<Model> models = new ArrayList<>();
        models.add(model.bestModel());
        sc.parallelize(models, 1).saveAsObjectFile(modelPath);

        //Save the model stats
        SparkClassificationModel.saveStats(model, trainingSettings, output+"/"+sc.getConf().getAppId()+"/");


        return 0;
    }
 
Example 7
Source File: EntitySalienceTestingSparkRunner.java    From ambiverse-nlu with Apache License 2.0
@Override
    protected int run() throws Exception {

        SparkConf sparkConf = new SparkConf()
                .setAppName("EntitySalienceTrainingSparkRunner")
                .set("spark.hadoop.validateOutputSpecs", "false")
                //.set("spark.yarn.executor.memoryOverhead", "4096")
                .set("spark.rdd.compress", "true")
                .set("spark.core.connection.ack.wait.timeout", "600")
                .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
                //.set("spark.kryo.registrationRequired", "true")
                .registerKryoClasses(new Class[] {SCAS.class, LabeledPoint.class, SparseVector.class, int[].class, double[].class,
                        InternalRow[].class, GenericInternalRow.class, Object[].class, GenericArrayData.class,
                        VectorIndexer.class})
                ;//setMaster("local"); //Remove this if you run it on the server.

        TrainingSettings trainingSettings = new TrainingSettings();

        if(defaultConf != null) {
            trainingSettings.setAidaDefaultConf(defaultConf);
        }


        JavaSparkContext sc = new JavaSparkContext(sparkConf);

        int totalCores = Integer.parseInt(sc.getConf().get("spark.executor.instances"))
                * Integer.parseInt(sc.getConf().get("spark.executor.cores"));

//        int totalCores = 2;

        //trainingSettings.setClassificationMethod(TrainingSettings.ClassificationMethod.LOG_REG);

        trainingSettings.setPositiveInstanceScalingFactor(1);
        if(trainingSettings.getFeatureExtractor().equals(TrainingSettings.FeatureExtractor.ANNOTATE_AND_ENTITY_SALIENCE)) {
            sc.addFile(trainingSettings.getBigramCountCache());
            sc.addFile(trainingSettings.getKeywordCountCache());
            sc.addFile(trainingSettings.getWordContractionsCache());
            sc.addFile(trainingSettings.getWordExpansionsCache());
            if (trainingSettings.getAidaDefaultConf().equals("db")) {
                sc.addFile(trainingSettings.getDatabaseAida());
            } else {
                sc.addFile(trainingSettings.getCassandraConfig());
            }
        }

        SQLContext sqlContext = new SQLContext(sc);


        int partitionNumber = 3 * totalCores;
        //Read training documents serialized as SCAS
        JavaPairRDD<Text, SCAS> documents = sc.sequenceFile(input, Text.class, SCAS.class, partitionNumber);

        //Instantiate a training Spark runner
        TrainingSparkRunner trainingSparkRunner = new TrainingSparkRunner();


        PipelineModel trainingModel = (PipelineModel) sc.objectFile(model).first();

        //Evaluate the model and write out the evaluation metrics.
        trainingSparkRunner.evaluate(sc, sqlContext, documents, trainingModel, trainingSettings, output+"/"+sc.getConf().getAppId()+"/");

        return 0;
    }