org.apache.spark.mllib.linalg.VectorUDT Java Examples

The following examples show how to use org.apache.spark.mllib.linalg.VectorUDT. Each example is drawn from an open-source project; the source file and license are noted above each snippet.
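Before the project examples, a minimal sketch of the type in isolation: VectorUDT is the SQL user-defined type that lets an org.apache.spark.mllib.linalg.Vector be stored as a DataFrame column. The snippet below is illustrative only (the column names and values are invented for this page), assuming a Spark 1.x-style SQLContext as used by most of the examples that follow.

// Minimal sketch: declare a vector column and build a DataFrame around it.
// Vectors and VectorUDT come from org.apache.spark.mllib.linalg.
StructType schema = new StructType(new StructField[]{
        new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
        new StructField("features", new VectorUDT(), false, Metadata.empty())
});

JavaRDD<Row> rows = sc.parallelize(Arrays.asList(
        RowFactory.create(0d, Vectors.dense(1.0, 0.5)),
        RowFactory.create(1d, Vectors.sparse(2, new int[]{0}, new double[]{2.0}))
));

DataFrame df = sqlContext.createDataFrame(rows, schema);
df.printSchema(); // the "features" column is reported with type "vector"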
Example #1
Source File: EntitySalienceFeatureExtractorSpark.java    From ambiverse-nlu with Apache License 2.0
/**
 * Extract a DataFrame ready for training or testing.
 * @param jsc the Spark context, used to register the counting accumulators
 * @param documents the input documents (serialized CASes) to extract training instances from
 * @param sqlContext the SQL context used to build the resulting DataFrame
 * @return a DataFrame with one row per training instance: docId, entityId, label, features
 * @throws ResourceInitializationException if the feature extractor cannot be initialized
 */
public DataFrame extract(JavaSparkContext jsc, JavaRDD<SCAS> documents, SQLContext sqlContext) throws ResourceInitializationException {
    Accumulator<Integer> TOTAL_DOCS = jsc.accumulator(0, "TOTAL_DOCS");
    Accumulator<Integer> SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "SALIENT_ENTITY_INSTANCES");
    Accumulator<Integer> NON_SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "NON_SALIENT_ENTITY_INSTANCES");

    TrainingSettings trainingSettings = getTrainingSettings();

    FeatureExtractor fe = new NYTEntitySalienceFeatureExtractor();
    final int featureVectorSize = FeatureSetFactory.createFeatureSet(TrainingSettings.FeatureExtractor.ENTITY_SALIENCE).getFeatureVectorSize();

    JavaRDD<TrainingInstance> trainingInstances =
            documents.flatMap(s -> {
                TOTAL_DOCS.add(1);
                return fe.getTrainingInstances(s.getJCas(),
                        trainingSettings.getFeatureExtractor(),
                        trainingSettings.getPositiveInstanceScalingFactor());
            });

    StructType schema = new StructType(new StructField[]{
            new StructField("docId", DataTypes.StringType, false, Metadata.empty() ),
            new StructField("entityId", DataTypes.StringType, false, Metadata.empty() ),
            new StructField("label", DataTypes.DoubleType, false, Metadata.empty() ),
            new StructField("features", new VectorUDT(), false, Metadata.empty())
    });

    JavaRDD<Row> withFeatures = trainingInstances.map(ti -> {
        if (ti.getLabel() == 1.0) {
            SALIENT_ENTITY_INSTANCES.add(1);
        } else {
            NON_SALIENT_ENTITY_INSTANCES.add(1);
        }
        Vector vei = FeatureValueInstanceUtils.convertToSparkMLVector(ti, featureVectorSize);
        return RowFactory.create(ti.getDocId(), ti.getEntityId(), ti.getLabel(), vei);
    });

    return sqlContext.createDataFrame(withFeatures, schema);
}
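The DataFrame produced above pairs each (docId, entityId) with a label and a feature vector, which is the shape a spark.ml estimator expects. A hedged usage sketch follows; it is illustrative only and not the project's actual training code, and the choice of RandomForestClassifier is an assumption.

// Hypothetical downstream usage (not in the original file).
DataFrame training = extract(jsc, documents, sqlContext);
RandomForestClassifier rf = new RandomForestClassifier()
        .setLabelCol("label")
        .setFeaturesCol("features");
RandomForestClassificationModel model = rf.fit(training);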
 
Example #2
Source File: FirstPrediction.java    From net.jgp.labs.spark with Apache License 2.0
private void start() {
  SparkSession spark = SparkSession.builder().appName("First Prediction")
      .master("local").getOrCreate();

  StructType schema = new StructType(new StructField[] {
      new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
      new StructField("features", new VectorUDT(), false, Metadata.empty())
  });

  // TODO this example is not working yet
}
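The TODO is in the original source: the schema is built but never used. One plausible reason the example does not work is that Spark 2.x spark.ml estimators expect org.apache.spark.ml.linalg vectors rather than the mllib VectorUDT declared above. A hedged sketch of a working completion (the rows and the LogisticRegression usage are assumptions, not part of the original file):

private void start() {
  SparkSession spark = SparkSession.builder().appName("First Prediction")
      .master("local").getOrCreate();

  // Spark 2.x ML estimators require the ml (not mllib) vector type.
  StructType schema = new StructType(new StructField[] {
      new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
      new StructField("features", new org.apache.spark.ml.linalg.VectorUDT(),
          false, Metadata.empty()) });

  List<Row> rows = Arrays.asList(
      RowFactory.create(0.0, org.apache.spark.ml.linalg.Vectors.dense(1.0, 0.1)),
      RowFactory.create(1.0, org.apache.spark.ml.linalg.Vectors.dense(0.0, 2.3)));
  Dataset<Row> df = spark.createDataFrame(rows, schema);

  // Fit a simple classifier and make a first prediction on the same data.
  LogisticRegressionModel model = new LogisticRegression().fit(df);
  model.transform(df).select("features", "prediction").show();
}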
 
Example #3
Source File: SchemaExporter.java    From spark-transformers with Apache License 2.0
public static String exportToJson(Set<String> columns, StructType dfSchema) {
    // Builds a list of (column name, type) entries for the selected columns
    // of the DataFrame schema and serializes it to JSON.

    List<Field> schema = new ArrayList<>();

    for (String column : columns) {
        StructField field = dfSchema.fields()[dfSchema.fieldIndex(column)];

        if (field.dataType() instanceof StringType) {
            schema.add(new Field(field.name(), STRING));
        } else if (field.dataType() instanceof BooleanType) {
            schema.add(new Field(field.name(), BOOLEAN));
        } else if (field.dataType() instanceof VectorUDT) {
            schema.add(new Field(field.name(), DOUBLE_ARRAY));
        } else if (field.dataType() instanceof DoubleType || field.dataType() instanceof DecimalType || field.dataType() instanceof FloatType ||
                field.dataType() instanceof IntegerType || field.dataType() instanceof LongType || field.dataType() instanceof ShortType) {
            schema.add(new Field(field.name(), DOUBLE));
        } else if (field.dataType() instanceof ArrayType) {
            if (((ArrayType) field.dataType()).elementType() instanceof StringType) {
                schema.add(new Field(field.name(), STRING_ARRAY));
            } else if (((ArrayType) field.dataType()).elementType() instanceof DoubleType) {
                schema.add(new Field(field.name(), DOUBLE_ARRAY));
            } else {
                throw new UnsupportedOperationException("Cannot support data of type " + field.dataType());
            }
        } else {
            throw new UnsupportedOperationException("Cannot support data of type " + field.dataType());
        }
    }
    return gson.toJson(schema);
}
 
Example #4
Source File: SchemaExporter.java    From spark-transformers with Apache License 2.0
public static String exportSchemaToJson(StructType dfSchema) {
    // Builds a list of (column name, type) entries for every column of the
    // DataFrame schema and serializes it to JSON.

    List<Field> schema = new ArrayList<>();

    for (StructField field : dfSchema.fields()) {
        if (field.dataType() instanceof StringType) {
            schema.add(new Field(field.name(), STRING));
        } else if (field.dataType() instanceof BooleanType) {
            schema.add(new Field(field.name(), BOOLEAN));
        } else if (field.dataType() instanceof VectorUDT) {
            schema.add(new Field(field.name(), DOUBLE_ARRAY));
        } else if (field.dataType() instanceof DoubleType || field.dataType() instanceof DecimalType || field.dataType() instanceof FloatType ||
                field.dataType() instanceof IntegerType || field.dataType() instanceof LongType || field.dataType() instanceof ShortType) {
            schema.add(new Field(field.name(), DOUBLE));
        } else if (field.dataType() instanceof ArrayType) {
            if (((ArrayType) field.dataType()).elementType() instanceof StringType) {
                schema.add(new Field(field.name(), STRING_ARRAY));
            } else if (((ArrayType) field.dataType()).elementType() instanceof DoubleType) {
                schema.add(new Field(field.name(), DOUBLE_ARRAY));
            } else {
                throw new UnsupportedOperationException("Cannot support data of type " + field.dataType());
            }
        } else {
            throw new UnsupportedOperationException("Cannot support data of type " + field.dataType());
        }
    }
    return gson.toJson(schema);
}
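Both exporter methods depend on a Field DTO and type-name constants (STRING, BOOLEAN, DOUBLE, STRING_ARRAY, DOUBLE_ARRAY) defined elsewhere in spark-transformers. Below is a minimal reconstruction of what those helpers plausibly look like, inferred from the JSON output in the tests that follow; the serialized keys ("name", "datatype") and the "double"/"double []" spellings match that output, while the remaining values are assumptions.

// Hypothetical reconstruction of the helpers the exporters assume.
private static final Gson gson = new Gson();

static final String STRING = "string";
static final String BOOLEAN = "boolean";
static final String DOUBLE = "double";
static final String STRING_ARRAY = "string []";
static final String DOUBLE_ARRAY = "double []";

static class Field {
    private final String name;      // serialized as "name"
    private final String datatype;  // serialized as "datatype"

    Field(String name, String datatype) {
        this.name = name;
        this.datatype = datatype;
    }
}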
 
Example #5
Source File: VectorBinarizerBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testVectorBinarizerDense() {
    // prepare data

    JavaRDD<Row> jrdd = sc.parallelize(Arrays.asList(
            RowFactory.create(0d, 1d, new DenseVector(new double[]{-2d, -3d, -4d, -1d, 6d, -7d, 8d, 0d, 0d, 0d, 0d, 0d})),
            RowFactory.create(1d, 2d, new DenseVector(new double[]{4d, -5d, 6d, 7d, -8d, 9d, -10d, 0d, 0d, 0d, 0d, 0d})),
            RowFactory.create(2d, 3d, new DenseVector(new double[]{-5d, 6d, -8d, 9d, 10d, 11d, 12d, 0d, 0d, 0d, 0d, 0d}))
    ));

    StructType schema = new StructType(new StructField[]{
            new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("value1", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("vector1", new VectorUDT(), false, Metadata.empty())
    });

    DataFrame df = sqlContext.createDataFrame(jrdd, schema);
    VectorBinarizer vectorBinarizer = new VectorBinarizer()
            .setInputCol("vector1")
            .setOutputCol("binarized")
            .setThreshold(2d);


    //Export this model
    byte[] exportedModel = ModelExporter.export(vectorBinarizer, df);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);
    //compare predictions
    Row[] sparkOutput = vectorBinarizer.transform(df).orderBy("id").select("id", "value1", "vector1", "binarized").collect();
    for (Row row : sparkOutput) {

        Map<String, Object> data = new HashMap<>();
        data.put(vectorBinarizer.getInputCol(), ((DenseVector) row.get(2)).toArray());
        transformer.transform(data);
        double[] output = (double[]) data.get(vectorBinarizer.getOutputCol());
        assertArrayEquals(output, ((DenseVector) row.get(3)).toArray(), 0d);
    }
}
 
Example #6
Source File: SchemaExporterTest.java    From spark-transformers with Apache License 2.0
/**
 * Output:
 * [{"name":"id","datatype":"double"},{"name":"label","datatype":"double"},{"name":"features","datatype":"double []"}]
 */
@Test
public void testSchema1() {
    StructType schema = new StructType(new StructField[]{
            new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("features", new VectorUDT(), false, Metadata.empty())
    });
    System.out.println(SchemaExporter.exportSchemaToJson(schema));
}
 
Example #7
Source File: SchemaExporterTest.java    From spark-transformers with Apache License 2.0
/**
 * Output:
 * [{"name":"id","datatype":"double"},{"name":"value1","datatype":"double"},{"name":"vector1","datatype":"double []"}]
 */
@Test
public void testSchema3() {
    StructType schema = new StructType(new StructField[]{
            new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("value1", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("vector1", new VectorUDT(), false, Metadata.empty())
    });
    System.out.println(SchemaExporter.exportSchemaToJson(schema));
}
 
Example #8
Source File: SchemaExporterTest.java    From spark-transformers with Apache License 2.0
/**
 * Output:
 * [{"name":"features","datatype":"double []"},{"name":"id","datatype":"double"}]
 */
@Test
public void testColumnExport1() {
    StructType schema = new StructType(new StructField[]{
            new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("features", new VectorUDT(), false, Metadata.empty())
    });
    System.out.println(SchemaExporter.exportToJson(new HashSet<String>(Arrays.asList("id", "features")), schema));
}
 
Example #9
Source File: SchemaExporterTest.java    From spark-transformers with Apache License 2.0
/**
 * Output:
 * [{"name":"id","datatype":"double"},{"name":"vector1","datatype":"double []"}]
 */
@Test
public void testColumnExport3() {
    StructType schema = new StructType(new StructField[]{
            new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("value1", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("vector1", new VectorUDT(), false, Metadata.empty())
    });
    System.out.println(SchemaExporter.exportToJson(new HashSet<String>(Arrays.asList("id", "vector1")), schema));
}
 
Example #10
Source File: EntitySalienceAnnotatorAndFeatureExtractorSpark.java    From ambiverse-nlu with Apache License 2.0
/**
 * Annotate the documents and extract a DataFrame ready for training or testing.
 * @param jsc the Spark context, used to register the counting accumulators
 * @param documents the input documents (serialized CASes) to annotate and extract training instances from
 * @param sqlContext the SQL context used to build the resulting DataFrame
 * @return a DataFrame with one row per training instance: docId, entity, label, features
 * @throws ResourceInitializationException if the annotator or feature extractor cannot be initialized
 */
public DataFrame extract(JavaSparkContext jsc, JavaRDD<SCAS> documents, SQLContext sqlContext) throws ResourceInitializationException {
    Accumulator<Integer> TOTAL_DOCS = jsc.accumulator(0, "TOTAL_DOCS");
    Accumulator<Integer> SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "SALIENT_ENTITY_INSTANCES");
    Accumulator<Integer> NON_SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "NON_SALIENT_ENTITY_INSTANCES");

    TrainingSettings trainingSettings = getTrainingSettings();

    final SparkSerializableAnalysisEngine ae = EntitySalienceFactory.createEntitySalienceEntityAnnotator(trainingSettings.getEntitySalienceEntityAnnotator());
    FeatureExtractor fe = new NYTEntitySalienceFeatureExtractor();
    final int featureVectorSize = FeatureSetFactory.createFeatureSet(TrainingSettings.FeatureExtractor.ENTITY_SALIENCE).getFeatureVectorSize();

    JavaRDD<TrainingInstance> trainingInstances =
            documents
                    .map(s -> {
                        TOTAL_DOCS.add(1);
                        Logger tmpLogger = LoggerFactory.getLogger(EntitySalienceFeatureExtractorSpark.class);
                        String docId = JCasUtil.selectSingle(s.getJCas(), DocumentMetaData.class).getDocumentId();
                        tmpLogger.info("Processing document {}.", docId);
                        //Before processing the document through the Disambiguation Pipeline, add the AIDA settings
                        // in each document.
                        SparkUimaUtils.addSettingsToJCas(s.getJCas(),
                                trainingSettings.getDocumentCoherent(),
                                trainingSettings.getDocumentConfidenceThreshold());
                        return ae.process(s);
                    })
                    .flatMap(s -> fe.getTrainingInstances(s.getJCas(),
                            trainingSettings.getFeatureExtractor(),
                            trainingSettings.getPositiveInstanceScalingFactor()));

    StructType schema = new StructType(new StructField[]{
            new StructField("docId", DataTypes.StringType, false, Metadata.empty() ),
            new StructField("entity", DataTypes.StringType, false, Metadata.empty() ),
            new StructField("label", DataTypes.DoubleType, false, Metadata.empty() ),
            new StructField("features", new VectorUDT(), false, Metadata.empty())
    });

    JavaRDD<Row> withFeatures = trainingInstances.map(ti -> {
        if (ti.getLabel() == 1.0) {
            SALIENT_ENTITY_INSTANCES.add(1);
        } else {
            NON_SALIENT_ENTITY_INSTANCES.add(1);
        }
        Vector vei = FeatureValueInstanceUtils.convertToSparkMLVector(ti, featureVectorSize);
        return RowFactory.create(ti.getDocId(), ti.getEntityId(), ti.getLabel(), vei);
    });

    return sqlContext.createDataFrame(withFeatures, schema);
}
 
Example #11
Source File: VectorBinarizerBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testVectorBinarizerSparse() {
    // prepare data

    int[] sparseArray1 = {5, 6, 11, 4, 7, 9, 8, 14, 13};
    double[] sparseArray1Values = {-5d, 7d, 1d, -2d, -4d, -1d, 31d, -1d, -3d};

    int[] sparseArray2 = {2, 6, 1};
    double[] sparseArray2Values = {1d, 11d, 2d};

    int[] sparseArray3 = {4, 6, 1};
    double[] sparseArray3Values = {52d, 71d, 11d};

    int[] sparseArray4 = {4, 1, 2};
    double[] sparseArray4Values = {17d, 7d, 9d};

    JavaRDD<Row> jrdd = sc.parallelize(Arrays.asList(
            RowFactory.create(3d, 4d, new SparseVector(20, sparseArray1, sparseArray1Values)),
            RowFactory.create(4d, 5d, new SparseVector(20, sparseArray2, sparseArray2Values)),
            RowFactory.create(5d, 5d, new SparseVector(20, sparseArray3, sparseArray3Values)),
            RowFactory.create(6d, 5d, new SparseVector(20, sparseArray4, sparseArray4Values))
    ));

    StructType schema = new StructType(new StructField[]{
            new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("value1", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("vector1", new VectorUDT(), false, Metadata.empty())
    });

    DataFrame df = sqlContext.createDataFrame(jrdd, schema);
    VectorBinarizer vectorBinarizer = new VectorBinarizer()
            .setInputCol("vector1")
            .setOutputCol("binarized");


    //Export this model
    byte[] exportedModel = ModelExporter.export(vectorBinarizer, null);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);
    //compare predictions
    Row[] sparkOutput = vectorBinarizer.transform(df).orderBy("id").select("id", "value1", "vector1", "binarized").collect();
    for (Row row : sparkOutput) {

        Map<String, Object> data = new HashMap<>();
        data.put(vectorBinarizer.getInputCol(), ((SparseVector) row.get(2)).toArray());
        transformer.transform(data);
        double[] output = (double[]) data.get(vectorBinarizer.getOutputCol());
        assertArrayEquals(output, ((SparseVector)row.get(3)).toArray(), 0d);
    }
}
 
Example #12
Source File: VectorAssemblerBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testVectorAssembler() {
    // prepare data

    JavaRDD<Row> jrdd = sc.parallelize(Arrays.asList(
            RowFactory.create(0d, 1d, new DenseVector(new double[]{2d, 3d})),
            RowFactory.create(1d, 2d, new DenseVector(new double[]{3d, 4d})),
            RowFactory.create(2d, 3d, new DenseVector(new double[]{4d, 5d})),
            RowFactory.create(3d, 4d, new DenseVector(new double[]{5d, 6d})),
            RowFactory.create(4d, 5d, new DenseVector(new double[]{6d, 7d}))
    ));

    StructType schema = new StructType(new StructField[]{
            new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("value1", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("vector1", new VectorUDT(), false, Metadata.empty())
    });

    DataFrame df = sqlContext.createDataFrame(jrdd, schema);
    VectorAssembler vectorAssembler = new VectorAssembler()
            .setInputCols(new String[]{"value1", "vector1"})
            .setOutputCol("feature");


    //Export this model
    byte[] exportedModel = ModelExporter.export(vectorAssembler, null);

    String exportedModelJson = new String(exportedModel);
    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);
    //compare predictions
    Row[] sparkOutput = vectorAssembler.transform(df).orderBy("id").select("id", "value1", "vector1", "feature").collect();
    for (Row row : sparkOutput) {

        Map<String, Object> data = new HashMap<>();
        data.put(vectorAssembler.getInputCols()[0], row.get(1));
        data.put(vectorAssembler.getInputCols()[1], ((DenseVector) row.get(2)).toArray());
        transformer.transform(data);
        double[] output = (double[]) data.get(vectorAssembler.getOutputCol());
        assertArrayEquals(output, ((DenseVector) row.get(3)).toArray(), 0d);
    }
}
 
Example #13
Source File: ChiSqSelectorBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testChiSqSelector() {
    // prepare data

    JavaRDD<Row> jrdd = sc.parallelize(Arrays.asList(
            RowFactory.create(0d, 0d, new DenseVector(new double[]{8d, 7d, 0d})),
            RowFactory.create(1d, 1d, new DenseVector(new double[]{0d, 9d, 6d})),
            RowFactory.create(2d, 1d, new DenseVector(new double[]{0.0d, 9.0d, 8.0d})),
            RowFactory.create(3d, 2d, new DenseVector(new double[]{8.0d, 9.0d, 5.0d}))
    ));

    double[] preFilteredData = {0.0d, 6.0d, 8.0d, 5.0d};

    StructType schema = new StructType(new StructField[]{
            new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("features", new VectorUDT(), false, Metadata.empty())
    });

    DataFrame df = sqlContext.createDataFrame(jrdd, schema);
    ChiSqSelector chiSqSelector = new ChiSqSelector();
    chiSqSelector.setNumTopFeatures(1);
    chiSqSelector.setFeaturesCol("features");
    chiSqSelector.setLabelCol("label");
    chiSqSelector.setOutputCol("output");

    ChiSqSelectorModel chiSqSelectorModel = chiSqSelector.fit(df);

    //Export this model
    byte[] exportedModel = ModelExporter.export(chiSqSelectorModel, null);

    String exportedModelJson = new String(exportedModel);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //compare predictions
    Row[] sparkOutput = chiSqSelectorModel.transform(df).orderBy("id").select("id", "label", "features", "output").collect();
    for (Row row : sparkOutput) {
        Map<String, Object> data = new HashMap<>();
        data.put(chiSqSelectorModel.getFeaturesCol(), ((DenseVector) row.get(2)).toArray());
        transformer.transform(data);
        double[] output = (double[]) data.get(chiSqSelectorModel.getOutputCol());
        System.out.println(Arrays.toString(output));
        assertArrayEquals(output, ((DenseVector) row.get(3)).toArray(), 0d);
    }
}
 
Example #14
Source File: ProbabilityColumnProducer.java    From jpmml-evaluator-spark with GNU Affero General Public License v3.0
@Override
public StructField init(Evaluator evaluator){
	return DataTypes.createStructField(getColumnName(), new VectorUDT(), false);
}
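Because the column is declared with VectorUDT, downstream code reads the probability distribution back as an mllib Vector. A brief hedged sketch of consuming such a column follows; the DataFrame variable and the column name "probability" are assumptions, since the original only shows the schema declaration.

// Hypothetical consumer of the VectorUDT column declared above.
for (Row row : resultDf.select("probability").collectAsList()) {
    Vector probabilities = (Vector) row.get(0);
    // argmax() gives the index of the most likely class.
    System.out.println(probabilities.apply(probabilities.argmax()));
}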