org.apache.spark.mllib.linalg.SparseVector Java Examples

The following examples show how to use org.apache.spark.mllib.linalg.SparseVector. Each example comes from an open-source project; the source file and license are listed above the code.
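Before the project examples, here is a minimal, self-contained sketch of how a SparseVector is typically constructed and read back. It is not taken from any of the projects below; the vector size, indices, and values are purely illustrative.

import org.apache.spark.mllib.linalg.SparseVector;

public class SparseVectorSketch {
    public static void main(String[] args) {
        // A vector of logical size 5 with non-zero entries at positions 1 and 3.
        SparseVector v = new SparseVector(5, new int[]{1, 3}, new double[]{2.0, 4.0});

        // indices() and values() expose only the non-zero entries...
        int[] indices = v.indices();    // {1, 3}
        double[] values = v.values();   // {2.0, 4.0}

        // ...while toArray() materializes the full dense representation.
        double[] dense = v.toArray();   // {0.0, 2.0, 0.0, 4.0, 0.0}

        System.out.println("size=" + v.size()
                + ", dense=" + java.util.Arrays.toString(dense));
    }
}
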
Example #1
Source File: MultilabelPoint.java    From sparkboost with Apache License 2.0
public MultilabelPoint(int pointID, SparseVector features, int[] labels) {
    if (features == null)
        throw new NullPointerException("The set of features is 'null'");
    if (labels == null)
        throw new NullPointerException("The set of labels is 'null'");
    this.pointID = pointID;
    this.features = features;
    this.labels = labels;
}
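
A brief usage sketch for this constructor; the point ID, feature indices, values, and labels below are illustrative, not taken from the project:

SparseVector features = new SparseVector(10, new int[]{0, 4, 7}, new double[]{1.0, 0.5, 2.0});
MultilabelPoint p = new MultilabelPoint(42, features, new int[]{1, 3});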
 
Example #2
Source File: DataUtils.java    From sparkboost with Apache License 2.0
public static JavaRDD<FeatureDocuments> getFeatureDocuments(JavaRDD<MultilabelPoint> documents) {
    // For each document, emit one (featureID, FeatureDocuments) pair per non-zero feature,
    // then merge the per-feature entries across documents.
    return documents.flatMapToPair(doc -> {
        SparseVector feats = doc.getFeatures();
        int[] indices = feats.indices();
        ArrayList<Tuple2<Integer, FeatureDocuments>> ret = new ArrayList<>();
        for (int i = 0; i < indices.length; i++) {
            int featureID = indices[i];
            int[] docs = new int[]{doc.getPointID()};
            int[][] labels = new int[1][];
            labels[0] = doc.getLabels();
            ret.add(new Tuple2<>(featureID, new FeatureDocuments(featureID, docs, labels)));
        }
        return ret;
    }).reduceByKey((f1, f2) -> {
        int numDocs = f1.getDocuments().length + f2.getDocuments().length;
        int[] docsMerged = new int[numDocs];
        int[][] labelsMerged = new int[numDocs][];
        // Copy documents and labels coming from the first feature.
        for (int idx = 0; idx < f1.getDocuments().length; idx++) {
            docsMerged[idx] = f1.getDocuments()[idx];
            labelsMerged[idx] = f1.getLabels()[idx];
        }

        // Append documents and labels coming from the second feature.
        for (int idx = f1.getDocuments().length; idx < numDocs; idx++) {
            docsMerged[idx] = f2.getDocuments()[idx - f1.getDocuments().length];
            labelsMerged[idx] = f2.getLabels()[idx - f1.getDocuments().length];
        }
        return new FeatureDocuments(f1.featureID, docsMerged, labelsMerged);
    }).map(item -> item._2());
}
 
Example #3
Source File: Data2CoNLL.java    From ambiverse-nlu with Apache License 2.0
@Override
protected int run() throws Exception {

  SparkConf sparkConf = new SparkConf()
      .setAppName("Data2CoNLL")
      .set("spark.hadoop.validateOutputSpecs", "false")
      .set("spark.yarn.executor.memoryOverhead", "3072")
      .set("spark.rdd.compress", "true")
      .set("spark.core.connection.ack.wait.timeout", "600")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      //.set("spark.kryo.registrationRequired", "true")
      .registerKryoClasses(new Class[] {SCAS.class, LabeledPoint.class, SparseVector.class, int[].class, double[].class,
          InternalRow[].class, GenericInternalRow.class, Object[].class, GenericArrayData.class,
          VectorIndexer.class})
      ;//.setMaster("local[4]"); // Uncomment setMaster for local runs; remove it when running on the cluster.


  JavaSparkContext sc = new JavaSparkContext(sparkConf);
  int totalCores = Integer.parseInt(sc.getConf().get("spark.executor.instances"))
      * Integer.parseInt(sc.getConf().get("spark.executor.cores"));

  FileSystem fs = FileSystem.get(new Configuration());

  int partitionNumber = 3 * totalCores;
  if(partitions != null) {
    partitionNumber = partitions;
  }

  //Read training documents serialized as SCAS
  JavaRDD<SCAS> documents = sc.sequenceFile(input, Text.class, SCAS.class, partitionNumber).values();

  // Convert each annotated document into CoNLL-style text: a -DOCSTART- header,
  // one lowercased token per line, and a blank line between sentences.
  JavaRDD<String> docStrings = documents.map( s -> {
    JCas jCas = s.getJCas();
    NYTArticleMetaData metadata = JCasUtil.selectSingle(jCas, NYTArticleMetaData.class);

    StringJoiner docBuilder = new StringJoiner("\n");

    docBuilder.add("-DOCSTART- (" +  metadata.getGuid() + ")");
    docBuilder.add("");

    Collection<Sentence> sentences = JCasUtil.select(jCas, Sentence.class);
    for(Sentence sentence: sentences) {
      List<Token> tokens = JCasUtil.selectCovered(jCas, Token.class, sentence);
      for(Token token: tokens) {
        CoreLabel taggedWord = CoreNlpUtils.tokenToWord(token);
        StringJoiner lineBuilder = new StringJoiner("\t");
        lineBuilder.add(taggedWord.word().toLowerCase());
        docBuilder.add(lineBuilder.toString());
      }
      docBuilder.add("");
    }
    return docBuilder.toString();
  });

  docStrings.saveAsTextFile(output);
  sc.stop();
  return 0;
}
 
Example #4
Source File: EntitySalienceTrainingSparkRunner.java    From ambiverse-nlu with Apache License 2.0
@Override
    protected int run() throws Exception {

        SparkConf sparkConf = new SparkConf()
                .setAppName("EntitySalienceTrainingSparkRunner")
                .set("spark.hadoop.validateOutputSpecs", "false")
                .set("spark.yarn.executor.memoryOverhead", "3072")
                .set("spark.rdd.compress", "true")
                .set("spark.core.connection.ack.wait.timeout", "600")
                .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
                //.set("spark.kryo.registrationRequired", "true")
                .registerKryoClasses(new Class[] {SCAS.class, LabeledPoint.class, SparseVector.class, int[].class, double[].class,
                        InternalRow[].class, GenericInternalRow.class, Object[].class, GenericArrayData.class,
                        VectorIndexer.class})
                ;//.setMaster("local[4]"); // Uncomment setMaster for local runs; remove it when running on the cluster.

        TrainingSettings trainingSettings = new TrainingSettings();

        if(folds != null) {
            trainingSettings.setNumFolds(folds);
        }
        if(method != null) {
            trainingSettings.setClassificationMethod(TrainingSettings.ClassificationMethod.valueOf(method));
        }
        if(defaultConf != null) {
            trainingSettings.setAidaDefaultConf(defaultConf);
        }

        if(scalingFactor != null) {
            trainingSettings.setPositiveInstanceScalingFactor(scalingFactor);
        }

        JavaSparkContext sc = new JavaSparkContext(sparkConf);
        int totalCores = Integer.parseInt(sc.getConf().get("spark.executor.instances"))
                * Integer.parseInt(sc.getConf().get("spark.executor.cores"));

//        int totalCores = 4;
////        trainingSettings.setFeatureExtractor(TrainingSettings.FeatureExtractor.ANNOTATE_AND_ENTITY_SALIENCE);
////        trainingSettings.setAidaDefaultConf("db");
//        //trainingSettings.setClassificationMethod(TrainingSettings.ClassificationMethod.LOG_REG);
//        trainingSettings.setPositiveInstanceScalingFactor(1);

        //Add the cache files to each node only if annotation is required.
        //The input documents may already be annotated, in which case no caches are needed.
        if(trainingSettings.getFeatureExtractor().equals(TrainingSettings.FeatureExtractor.ANNOTATE_AND_ENTITY_SALIENCE)) {
            sc.addFile(trainingSettings.getBigramCountCache());
            sc.addFile(trainingSettings.getKeywordCountCache());
            sc.addFile(trainingSettings.getWordContractionsCache());
            sc.addFile(trainingSettings.getWordExpansionsCache());
            if (trainingSettings.getAidaDefaultConf().equals("db")) {
                sc.addFile(trainingSettings.getDatabaseAida());
            } else {
                sc.addFile(trainingSettings.getCassandraConfig());
            }
        }

        SQLContext sqlContext = new SQLContext(sc);


        FileSystem fs = FileSystem.get(new Configuration());

        int partitionNumber = 3 * totalCores;
        if(partitions != null) {
            partitionNumber = partitions;
        }

        //Read training documents serialized as SCAS
        JavaRDD<SCAS> documents = sc.sequenceFile(input, Text.class, SCAS.class, partitionNumber).values();

        //Instantiate a training Spark runner
        TrainingSparkRunner trainingSparkRunner = new TrainingSparkRunner();

        //Train a model
        CrossValidatorModel model = trainingSparkRunner.crossValidate(sc, sqlContext, documents, trainingSettings);


        //Create the model path
        String modelPath = output+"/"+sc.getConf().getAppId()+"/model_"+trainingSettings.getClassificationMethod();

        //Delete the old model if there is one
        fs.delete(new Path(modelPath), true);

        //Save the new model
        List<Model> models = new ArrayList<>();
        models.add(model.bestModel());
        sc.parallelize(models, 1).saveAsObjectFile(modelPath);

        //Save the model stats
        SparkClassificationModel.saveStats(model, trainingSettings, output+"/"+sc.getConf().getAppId()+"/");


        return 0;
    }
 
Example #5
Source File: EntitySalienceTestingSparkRunner.java    From ambiverse-nlu with Apache License 2.0
@Override
    protected int run() throws Exception {

        SparkConf sparkConf = new SparkConf()
                .setAppName("EntitySalienceTrainingSparkRunner")
                .set("spark.hadoop.validateOutputSpecs", "false")
                //.set("spark.yarn.executor.memoryOverhead", "4096")
                .set("spark.rdd.compress", "true")
                .set("spark.core.connection.ack.wait.timeout", "600")
                .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
                //.set("spark.kryo.registrationRequired", "true")
                .registerKryoClasses(new Class[] {SCAS.class, LabeledPoint.class, SparseVector.class, int[].class, double[].class,
                        InternalRow[].class, GenericInternalRow.class, Object[].class, GenericArrayData.class,
                        VectorIndexer.class})
                ;//.setMaster("local"); // Uncomment setMaster for local runs; remove it when running on the cluster.

        TrainingSettings trainingSettings = new TrainingSettings();

        if(defaultConf != null) {
            trainingSettings.setAidaDefaultConf(defaultConf);
        }


        JavaSparkContext sc = new JavaSparkContext(sparkConf);

        int totalCores = Integer.parseInt(sc.getConf().get("spark.executor.instances"))
                * Integer.parseInt(sc.getConf().get("spark.executor.cores"));

//        int totalCores = 2;

        //trainingSettings.setClassificationMethod(TrainingSettings.ClassificationMethod.LOG_REG);

        trainingSettings.setPositiveInstanceScalingFactor(1);
        if(trainingSettings.getFeatureExtractor().equals(TrainingSettings.FeatureExtractor.ANNOTATE_AND_ENTITY_SALIENCE)) {
            sc.addFile(trainingSettings.getBigramCountCache());
            sc.addFile(trainingSettings.getKeywordCountCache());
            sc.addFile(trainingSettings.getWordContractionsCache());
            sc.addFile(trainingSettings.getWordExpansionsCache());
            if (trainingSettings.getAidaDefaultConf().equals("db")) {
                sc.addFile(trainingSettings.getDatabaseAida());
            } else {
                sc.addFile(trainingSettings.getCassandraConfig());
            }
        }

        SQLContext sqlContext = new SQLContext(sc);


        int partitionNumber = 3 * totalCores;
        //Read training documents serialized as SCAS
        JavaPairRDD<Text, SCAS> documents = sc.sequenceFile(input, Text.class, SCAS.class, partitionNumber);

        //Instantiate a training Spark runner
        TrainingSparkRunner trainingSparkRunner = new TrainingSparkRunner();


        PipelineModel trainingModel = (PipelineModel) sc.objectFile(model).first();

        //Evaluate the model and write out the evaluation metrics.
        trainingSparkRunner.evaluate(sc, sqlContext, documents, trainingModel, trainingSettings, output+"/"+sc.getConf().getAppId()+"/");

        return 0;
    }
 
Example #6
Source File: VectorBinarizerBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testVectorBinarizerSparse() {
    // prepare data

    int[] sparseArray1 = {5, 6, 11, 4, 7, 9, 8, 14, 13};
    double[] sparseArray1Values = {-5d, 7d, 1d, -2d, -4d, -1d, 31d, -1d, -3d};

    int[] sparseArray2 = {2, 6, 1};
    double[] sparseArray2Values = {1d, 11d, 2d};

    int[] sparseArray3 = {4, 6, 1};
    double[] sparseArray3Values = {52d, 71d, 11d};

    int[] sparseArray4 = {4, 1, 2};
    double[] sparseArray4Values = {17d, 7d, 9d};

    JavaRDD<Row> jrdd = sc.parallelize(Arrays.asList(
            RowFactory.create(3d, 4d, new SparseVector(20, sparseArray1, sparseArray1Values)),
            RowFactory.create(4d, 5d, new SparseVector(20, sparseArray2, sparseArray2Values)),
            RowFactory.create(5d, 5d, new SparseVector(20, sparseArray3, sparseArray3Values)),
            RowFactory.create(6d, 5d, new SparseVector(20, sparseArray4, sparseArray4Values))
    ));

    StructType schema = new StructType(new StructField[]{
            new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("value1", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("vector1", new VectorUDT(), false, Metadata.empty())
    });

    DataFrame df = sqlContext.createDataFrame(jrdd, schema);
    VectorBinarizer vectorBinarizer = new VectorBinarizer()
            .setInputCol("vector1")
            .setOutputCol("binarized");


    //Export this model
    byte[] exportedModel = ModelExporter.export(vectorBinarizer, null);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);
    //compare predictions
    Row[] sparkOutput = vectorBinarizer.transform(df).orderBy("id").select("id", "value1", "vector1", "binarized").collect();
    for (Row row : sparkOutput) {

        Map<String, Object> data = new HashMap<>();
        data.put(vectorBinarizer.getInputCol(), ((SparseVector) row.get(2)).toArray());
        transformer.transform(data);
        double[] output = (double[]) data.get(vectorBinarizer.getOutputCol());
        assertArrayEquals(output, ((SparseVector)row.get(3)).toArray(), 0d);
    }
}
 
Example #7
Source File: MultilabelPoint.java    From sparkboost with Apache License 2.0
/**
 * Get the set of features of this point.
 *
 * @return The set of features of this point.
 */
public SparseVector getFeatures() {
    return features;
}
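
For reference, a short usage sketch of this getter (the MultilabelPoint variable p is hypothetical), mirroring how Example #2 consumes the returned vector:

SparseVector feats = p.getFeatures();
int[] featureIDs = feats.indices();   // IDs of the non-zero features
double[] weights = feats.values();    // corresponding feature values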