org.apache.spark.sql.types.Metadata Java Examples

The following examples show how to use org.apache.spark.sql.types.Metadata. Each example is drawn from an open-source project, identified in the Source File line above it.
Example #1
Source File: JavaStopWordsRemoverExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaStopWordsRemoverExample")
    .getOrCreate();

  // $example on$
  StopWordsRemover remover = new StopWordsRemover()
    .setInputCol("raw")
    .setOutputCol("filtered");

  List<Row> data = Arrays.asList(
    RowFactory.create(Arrays.asList("I", "saw", "the", "red", "balloon")),
    RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb"))
  );

  StructType schema = new StructType(new StructField[]{
    new StructField(
      "raw", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty())
  });

  Dataset<Row> dataset = spark.createDataFrame(data, schema);
  remover.transform(dataset).show(false);
  // $example off$
  spark.stop();
}
 
Example #2
Source File: DefinitionToSparkVisitor.java    From bunsen with Apache License 2.0
@Override
public HapiConverter visitContained(String elementPath,
    String elementTypeUrl,
    Map<String, StructureField<HapiConverter<DataType>>> contained) {

  StructField[] fields = contained.values()
      .stream()
      .map(containedEntry -> new StructField(containedEntry.fieldName(),
          containedEntry.result().getDataType(),
          true,
          Metadata.empty()))
      .toArray(StructField[]::new);

  ArrayType container = new ArrayType(new StructType(fields), true);

  return new HapiContainedToSparkConverter(contained, container);
}
 
Example #3
Source File: DataFrames.java    From DataVec with Apache License 2.0
/**
 * Converts a DataVec schema to a Spark StructType.
 *
 * @param schema the schema to convert
 * @return the equivalent Spark StructType
 */
public static StructType fromSchema(Schema schema) {
    StructField[] structFields = new StructField[schema.numColumns()];
    for (int i = 0; i < structFields.length; i++) {
        switch (schema.getColumnTypes().get(i)) {
            case Double:
                structFields[i] = new StructField(schema.getName(i), DataTypes.DoubleType, false, Metadata.empty());
                break;
            case Integer:
                structFields[i] =
                                new StructField(schema.getName(i), DataTypes.IntegerType, false, Metadata.empty());
                break;
            case Long:
                structFields[i] = new StructField(schema.getName(i), DataTypes.LongType, false, Metadata.empty());
                break;
            case Float:
                structFields[i] = new StructField(schema.getName(i), DataTypes.FloatType, false, Metadata.empty());
                break;
            default:
                throw new IllegalStateException(
                                "This API should not be used with strings, binary data or ndarrays. This is only for columnar data");
        }
    }
    return new StructType(structFields);
}
 
Example #4
Source File: TestFlatteningTransformer.java    From hudi with Apache License 2.0
@Test
public void testFlatten() {
  FlatteningTransformer transformer = new FlatteningTransformer();

  // Init
  StructField[] nestedStructFields =
      new StructField[] {new StructField("nestedIntColumn", DataTypes.IntegerType, true, Metadata.empty()),
          new StructField("nestedStringColumn", DataTypes.StringType, true, Metadata.empty()),};

  StructField[] structFields =
      new StructField[] {new StructField("intColumn", DataTypes.IntegerType, true, Metadata.empty()),
          new StructField("stringColumn", DataTypes.StringType, true, Metadata.empty()),
          new StructField("nestedStruct", DataTypes.createStructType(nestedStructFields), true, Metadata.empty())};

  StructType schema = new StructType(structFields);
  String flattenedSql = transformer.flattenSchema(schema, null);

  assertEquals("intColumn as intColumn,stringColumn as stringColumn,"
      + "nestedStruct.nestedIntColumn as nestedStruct_nestedIntColumn,"
      + "nestedStruct.nestedStringColumn as nestedStruct_nestedStringColumn", flattenedSql);
}
 
Example #5
Source File: TestDecisionStep.java    From envelope with Apache License 2.0
@Test
public void testPruneByStepValueFalse() {
  StructType schema = new StructType(new StructField[] {
      new StructField("outcome", DataTypes.BooleanType, false, Metadata.empty())
  });
  List<Row> rows = Lists.newArrayList(
      RowFactory.create(false)
  );
  Dataset<Row> ds = Contexts.getSparkSession().createDataFrame(rows, schema);
  step1.setData(ds);

  Map<String, Object> step2ConfigMap = Maps.newHashMap();
  step2ConfigMap.put(Step.DEPENDENCIES_CONFIG, Lists.newArrayList("step1"));
  step2ConfigMap.put(DecisionStep.IF_TRUE_STEP_NAMES_PROPERTY, Lists.newArrayList("step3", "step7"));
  step2ConfigMap.put(DecisionStep.DECISION_METHOD_PROPERTY, DecisionStep.STEP_BY_VALUE_DECISION_METHOD);
  step2ConfigMap.put(DecisionStep.STEP_BY_VALUE_STEP_PROPERTY, "step1");
  Config step2Config = ConfigFactory.parseMap(step2ConfigMap);
  RefactorStep step2 = new DecisionStep("step2");
  step2.configure(step2Config);
  steps.add(step2);

  Set<Step> refactored = step2.refactor(steps);

  assertEquals(refactored, Sets.newHashSet(step1, step2, step5, step6));
}
 
Example #6
Source File: TestDecisionStep.java    From envelope with Apache License 2.0
@Test
public void testPruneByStepValueTrue() {
  StructType schema = new StructType(new StructField[] {
      new StructField("outcome", DataTypes.BooleanType, false, Metadata.empty())
  });
  List<Row> rows = Lists.newArrayList(
      RowFactory.create(true)
  );
  Dataset<Row> ds = Contexts.getSparkSession().createDataFrame(rows, schema);
  step1.setData(ds);

  Map<String, Object> step2ConfigMap = Maps.newHashMap();
  step2ConfigMap.put(Step.DEPENDENCIES_CONFIG, Lists.newArrayList("step1"));
  step2ConfigMap.put(DecisionStep.IF_TRUE_STEP_NAMES_PROPERTY, Lists.newArrayList("step3", "step7"));
  step2ConfigMap.put(DecisionStep.DECISION_METHOD_PROPERTY, DecisionStep.STEP_BY_VALUE_DECISION_METHOD);
  step2ConfigMap.put(DecisionStep.STEP_BY_VALUE_STEP_PROPERTY, "step1");
  Config step2Config = ConfigFactory.parseMap(step2ConfigMap);
  RefactorStep step2 = new DecisionStep("step2");
  step2.configure(step2Config);
  steps.add(step2);

  Set<Step> refactored = step2.refactor(steps);

  assertEquals(refactored, Sets.newHashSet(step1, step2, step3, step4, step7, step8));
}
 
Example #7
Source File: Tagger.java    From vn.vitk with GNU General Public License v3.0
/**
 * Tags a list of sentences and returns a list of tagged sequences.
 * @param sentences
 * @return a list of tagged sequences.
 */
public List<String> tag(List<String> sentences) {
	List<Row> rows = new LinkedList<Row>();
	for (String sentence : sentences) {
		rows.add(RowFactory.create(sentence));
	}
	StructType schema = new StructType(new StructField[]{
		new StructField("sentence", DataTypes.StringType, false, Metadata.empty())	
	});
	SQLContext sqlContext = new SQLContext(jsc);
	DataFrame input = sqlContext.createDataFrame(rows, schema);
	if (cmmModel != null) {
		DataFrame output = cmmModel.transform(input).repartition(1);
		return output.javaRDD().map(new RowToStringFunction(1)).collect();
	} else {
		System.err.println("Tagging model is null. You need to create or load a model first.");
		return null;
	}
}
 
Example #8
Source File: TestRangeRowRule.java    From envelope with Apache License 2.0
@Test
public void testRangeDataTypes() throws Exception {
  Config config = ConfigUtils.configFromResource("/dq/dq-range-rules.conf").getConfig("steps");
  StructType schema = new StructType(new StructField[] {
    new StructField("fa", DataTypes.LongType, false, Metadata.empty()),
    new StructField("fi", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("fl", DataTypes.LongType, false, Metadata.empty()),
    new StructField("ff", DataTypes.FloatType, false, Metadata.empty()),
    new StructField("fe", DataTypes.DoubleType, false, Metadata.empty()),
    new StructField("fd", DataTypes.createDecimalType(), false, Metadata.empty())
  });
  Row row = new RowWithSchema(schema, 2L, 2, 2L, 2.0f, 2.0, new BigDecimal("2.0"));

  ConfigObject rro = config.getObject("dq1.deriver.rules");
  for (String rulename : rro.keySet()) {
    Config rrc = rro.toConfig().getConfig(rulename);
    RangeRowRule rrr = new RangeRowRule();
    rrr.configure(rrc);
    rrr.configureName(rulename);
    assertTrue("Row should pass rule " + rulename, rrr.check(row));
  }
}
 
Example #9
Source File: TestRangeRowRule.java    From envelope with Apache License 2.0
@Test
public void testDontIgnoreNulls() {
  StructType schema = new StructType(new StructField[] {
      new StructField("name", DataTypes.StringType, false, Metadata.empty()),
      new StructField("nickname", DataTypes.StringType, false, Metadata.empty()),
      new StructField("age", DataTypes.IntegerType, false, Metadata.empty()),
      new StructField("candycrushscore", DataTypes.createDecimalType(), false, Metadata.empty())
  });

  Map<String, Object> configMap = new HashMap<>();
  configMap.put(RangeRowRule.FIELDS_CONFIG, Lists.newArrayList("age"));
  configMap.put(RangeRowRule.FIELD_TYPE_CONFIG, "int");
  configMap.put(RangeRowRule.RANGE_CONFIG, Lists.newArrayList(0,105));
  Config config = ConfigFactory.parseMap(configMap);

  RangeRowRule rule = new RangeRowRule();
  assertNoValidationFailures(rule, config);
  rule.configure(config);
  rule.configureName("agerange");

  Row row1 = new RowWithSchema(schema, "Ian", "Ian", null, new BigDecimal("0.00"));
  assertFalse("Row should not pass rule", rule.check(row1));
}
 
Example #10
Source File: JavaBinarizerExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaBinarizerExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(0, 0.1),
    RowFactory.create(1, 0.8),
    RowFactory.create(2, 0.2)
  );
  StructType schema = new StructType(new StructField[]{
    new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("feature", DataTypes.DoubleType, false, Metadata.empty())
  });
  Dataset<Row> continuousDataFrame = spark.createDataFrame(data, schema);

  Binarizer binarizer = new Binarizer()
    .setInputCol("feature")
    .setOutputCol("binarized_feature")
    .setThreshold(0.5);

  Dataset<Row> binarizedDataFrame = binarizer.transform(continuousDataFrame);

  System.out.println("Binarizer output with Threshold = " + binarizer.getThreshold());
  binarizedDataFrame.show();
  // $example off$

  spark.stop();
}
 
Example #11
Source File: DependencyParser.java    From vn.vitk with GNU General Public License v3.0
/**
 * Parses a list of PoS-tagged sentences, each on a line, and writes the result to an output
 * file in a specified output format.
 * @param jsc
 * @param sentences
 * @param outputFileName
 * @param outputFormat
 */
public void parse(JavaSparkContext jsc, List<String> sentences, String outputFileName, OutputFormat outputFormat) {
	JavaRDD<String> input = jsc.parallelize(sentences);
	JavaRDD<Sentence> sents = input.map(new TaggedLineToSentenceFunction());
	JavaRDD<DependencyGraph> graphs = sents.map(new ParsingFunction());
	JavaRDD<Row> rows = graphs.map(new Function<DependencyGraph, Row>() {
		private static final long serialVersionUID = -812004521983071103L;
		public Row call(DependencyGraph graph) {
			return RowFactory.create(graph.getSentence().toString(), graph.dependencies());
		}
	});
	StructType schema = new StructType(new StructField[]{
		new StructField("sentence", DataTypes.StringType, false, Metadata.empty()),	
		new StructField("dependency", DataTypes.StringType, false, Metadata.empty())
	});
	SQLContext sqlContext = new SQLContext(jsc);
	DataFrame df = sqlContext.createDataFrame(rows, schema);
	
	if (outputFormat == OutputFormat.TEXT)  
		df.select("dependency").write().text(outputFileName);
	else 
		df.repartition(1).write().json(outputFileName);
}
 
Example #12
Source File: NGramBuilder.java    From vn.vitk with GNU General Public License v3.0
/**
 * Creates an n-gram data frame from text lines.
 * @param lines
 * @return an n-gram data frame.
 */
DataFrame createNGramDataFrame(JavaRDD<String> lines) {
	JavaRDD<Row> rows = lines.map(new Function<String, Row>(){
		private static final long serialVersionUID = -4332903997027358601L;
		
		@Override
		public Row call(String line) throws Exception {
			return RowFactory.create(Arrays.asList(line.split("\\s+")));
		}
	});
	StructType schema = new StructType(new StructField[] {
			new StructField("words",
					DataTypes.createArrayType(DataTypes.StringType), false,
					Metadata.empty()) });
	DataFrame wordDF = new SQLContext(jsc).createDataFrame(rows, schema);
	// build a bigram language model
	NGram transformer = new NGram().setInputCol("words")
			.setOutputCol("ngrams").setN(2);
	DataFrame ngramDF = transformer.transform(wordDF);
	ngramDF.show(10, false);
	return ngramDF;
}
 
Example #13
Source File: FirstPrediction.java    From net.jgp.labs.spark with Apache License 2.0
private void start() {
  SparkSession spark = SparkSession.builder().appName("First Prediction")
      .master("local").getOrCreate();

  StructType schema = new StructType(new StructField[] {
      new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
      new StructField("features", new VectorUDT(), false, Metadata.empty())
  });

  // TODO this example is not working yet
}
 
Example #14
Source File: CMMModel.java    From vn.vitk with GNU General Public License v3.0
@Override
public CMMModel load(String path) {
	org.apache.spark.ml.util.DefaultParamsReader.Metadata metadata = DefaultParamsReader.loadMetadata(path, sc(), CMMModel.class.getName());
	String pipelinePath = new Path(path, "pipelineModel").toString();
	PipelineModel pipelineModel = PipelineModel.load(pipelinePath);
	String dataPath = new Path(path, "data").toString();
	DataFrame df = sqlContext().read().format("parquet").load(dataPath);
	Row row = df.select("markovOrder", "weights", "tagDictionary").head();
	// load the Markov order
	MarkovOrder order = MarkovOrder.values()[row.getInt(0)-1];
	// load the weight vector
	Vector w = row.getAs(1);
	// load the tag dictionary
	@SuppressWarnings("unchecked")
	scala.collection.immutable.HashMap<String, WrappedArray<Integer>> td = (scala.collection.immutable.HashMap<String, WrappedArray<Integer>>)row.get(2);
	Map<String, Set<Integer>> tagDict = new HashMap<String, Set<Integer>>();
	Iterator<Tuple2<String, WrappedArray<Integer>>> iterator = td.iterator();
	while (iterator.hasNext()) {
		Tuple2<String, WrappedArray<Integer>> tuple = iterator.next();
		Set<Integer> labels = new HashSet<Integer>();
		scala.collection.immutable.List<Integer> list = tuple._2().toList();
		for (int i = 0; i < list.size(); i++)
			labels.add(list.apply(i));
		tagDict.put(tuple._1(), labels);
	}
	// build a CMM model
	CMMModel model = new CMMModel(pipelineModel, w, order, tagDict);
	DefaultParamsReader.getAndSetParams(model, metadata);
	return model;
}
 
Example #15
Source File: SimplePredictionFromTextFile.java    From net.jgp.labs.spark with Apache License 2.0
private void start() {
  SparkSession spark = SparkSession.builder().appName(
      "Simple prediction from Text File").master("local").getOrCreate();

  spark.udf().register("vectorBuilder", new VectorBuilder(), new VectorUDT());

  String filename = "data/tuple-data-file.csv";
  StructType schema = new StructType(new StructField[] {
      new StructField("_c0", DataTypes.DoubleType, false, Metadata.empty()),
      new StructField("_c1", DataTypes.DoubleType, false, Metadata.empty()),
      new StructField("features", new VectorUDT(), true, Metadata.empty())
  });

  Dataset<Row> df = spark.read().format("csv")
      .schema(schema)
      .option("header", "false")
      .load(filename);
  df = df.withColumn("valuefeatures", df.col("_c0")).drop("_c0");
  df = df.withColumn("label", df.col("_c1")).drop("_c1");
  df.printSchema();

  df = df.withColumn("features", callUDF("vectorBuilder", df.col(
      "valuefeatures")));
  df.printSchema();
  df.show();

  LinearRegression lr = new LinearRegression().setMaxIter(20);// .setRegParam(1).setElasticNetParam(1);

  // Fit the model to the data.
  LinearRegressionModel model = lr.fit(df);

  // Given a dataset, predict each point's label, and show the results.
  model.transform(df).show();

  LinearRegressionTrainingSummary trainingSummary = model.summary();
  System.out.println("numIterations: " + trainingSummary.totalIterations());
  System.out.println("objectiveHistory: " + Vectors.dense(trainingSummary
      .objectiveHistory()));
  trainingSummary.residuals().show();
  System.out.println("RMSE: " + trainingSummary.rootMeanSquaredError());
  System.out.println("r2: " + trainingSummary.r2());

  double intercept = model.intercept();
  System.out.println("Intercept: " + intercept);
  double regParam = model.getRegParam();
  System.out.println("Regularization parameter: " + regParam);
  double tol = model.getTol();
  System.out.println("Tol: " + tol);
  Double feature = 7.0;
  Vector features = Vectors.dense(feature);
  double p = model.predict(features);

  System.out.println("Prediction for feature " + feature + " is " + p);
  System.out.println(8 * regParam + intercept);
}
 
Example #16
Source File: ColumnUtils.java    From net.jgp.labs.spark with Apache License 2.0
public static Metadata getMetadata(Dataset<Row> df, String colName) {
  StructType schema = df.schema();
  StructField[] fields = schema.fields();
  for (StructField field : fields) {
    // TODO check on case
    if (field.name().compareTo(colName) == 0) {
      return field.metadata();
    }
  }
  return null;
}
 
Example #17
Source File: DataframeUtils.java    From net.jgp.labs.spark with Apache License 2.0
public static Dataset<Row> addMetadata(Dataset<Row> df, String colName,
    String key, String value) {
  Metadata metadata = new MetadataBuilder()
      .withMetadata(ColumnUtils.getMetadata(df, colName))
      .putString(key, value)
      .build();
  Column col = col(colName);
  return df.withColumn(colName, col, metadata);
}
 
Example #18
Source File: TestDecisionStep.java    From envelope with Apache License 2.0
@Test
public void testPruneByStepKeyFalse() {
  StructType schema = new StructType(new StructField[] {
      new StructField("name", DataTypes.StringType, false, Metadata.empty()),
      new StructField("result", DataTypes.BooleanType, false, Metadata.empty())
  });
  List<Row> rows = Lists.newArrayList(
      RowFactory.create("namecheck", false),
      RowFactory.create("agerange", true)
  );
  Dataset<Row> ds = Contexts.getSparkSession().createDataFrame(rows, schema);
  step1.setData(ds);

  Map<String, Object> step2ConfigMap = Maps.newHashMap();
  step2ConfigMap.put(Step.DEPENDENCIES_CONFIG, Lists.newArrayList("step1"));
  step2ConfigMap.put(DecisionStep.IF_TRUE_STEP_NAMES_PROPERTY, Lists.newArrayList("step3", "step7"));
  step2ConfigMap.put(DecisionStep.DECISION_METHOD_PROPERTY, DecisionStep.STEP_BY_KEY_DECISION_METHOD);
  step2ConfigMap.put(DecisionStep.STEP_BY_KEY_STEP_PROPERTY, "step1");
  step2ConfigMap.put(DecisionStep.STEP_BY_KEY_KEY_PROPERTY, "namecheck");
  Config step2Config = ConfigFactory.parseMap(step2ConfigMap);
  RefactorStep step2 = new DecisionStep("step2");
  step2.configure(step2Config);
  steps.add(step2);

  Set<Step> refactored = step2.refactor(steps);

  assertEquals(refactored, Sets.newHashSet(step1, step2, step5, step6));
}
 
Example #19
Source File: TestRangeRowRule.java    From envelope with Apache License 2.0
@Test
public void testAgeRangeLong() {
  StructType schema = new StructType(new StructField[] {
      new StructField("name", DataTypes.StringType, false, Metadata.empty()),
      new StructField("nickname", DataTypes.StringType, false, Metadata.empty()),
      new StructField("age", DataTypes.LongType, false, Metadata.empty()),
      new StructField("candycrushscore", DataTypes.createDecimalType(), false, Metadata.empty())
  });

  Map<String, Object> configMap = new HashMap<>();
  configMap.put(RangeRowRule.FIELDS_CONFIG, Lists.newArrayList("age"));
  configMap.put(RangeRowRule.RANGE_CONFIG, Lists.newArrayList(0L, 105L));
  Config config = ConfigFactory.parseMap(configMap);

  RangeRowRule rule = new RangeRowRule();
  assertNoValidationFailures(rule, config);
  rule.configure(config);
  rule.configureName("agerange");

  Row row1 = new RowWithSchema(schema, "Ian", "Ian", 34L, new BigDecimal("0.00"));
  assertTrue("Row should pass rule", rule.check(row1));

  Row row2 = new RowWithSchema(schema, "Webster1", "Websta1", 110L, new BigDecimal("450.10"));
  assertFalse("Row should not pass rule", rule.check(row2));

  Row row3 = new RowWithSchema(schema, "", "Ian1", 110L, new BigDecimal("450.10"));
  assertFalse("Row should not pass rule", rule.check(row3));

  Row row4 = new RowWithSchema(schema, "First Last", "Ian Last", 100L, new BigDecimal("450.10"));
  assertTrue("Row should pass rule", rule.check(row4));
}
 
Example #20
Source File: EntitySalienceFeatureExtractorSpark.java    From ambiverse-nlu with Apache License 2.0
/**
 * Extracts a DataFrame ready for training or testing.
 * @param jsc
 * @param documents
 * @param sqlContext
 * @return a DataFrame of (docId, entityId, label, features) rows
 * @throws ResourceInitializationException
 */
public DataFrame extract(JavaSparkContext jsc, JavaRDD<SCAS> documents, SQLContext sqlContext) throws ResourceInitializationException {
    Accumulator<Integer> TOTAL_DOCS = jsc.accumulator(0, "TOTAL_DOCS");
    Accumulator<Integer> SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "SALIENT_ENTITY_INSTANCES");
    Accumulator<Integer> NON_SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "NON_SALIENT_ENTITY_INSTANCES");

    TrainingSettings trainingSettings = getTrainingSettings();

    FeatureExtractor fe = new NYTEntitySalienceFeatureExtractor();
    final int featureVectorSize = FeatureSetFactory.createFeatureSet(TrainingSettings.FeatureExtractor.ENTITY_SALIENCE).getFeatureVectorSize();

    JavaRDD<TrainingInstance> trainingInstances =
            documents.flatMap(s -> {
                TOTAL_DOCS.add(1);
                return fe.getTrainingInstances(s.getJCas(),
                        trainingSettings.getFeatureExtractor(),
                        trainingSettings.getPositiveInstanceScalingFactor());
            });

    StructType schema = new StructType(new StructField[]{
            new StructField("docId", DataTypes.StringType, false, Metadata.empty()),
            new StructField("entityId", DataTypes.StringType, false, Metadata.empty()),
            new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("features", new VectorUDT(), false, Metadata.empty())
    });

    JavaRDD<Row> withFeatures = trainingInstances.map(ti -> {
        if (ti.getLabel() == 1.0) {
            SALIENT_ENTITY_INSTANCES.add(1);
        } else {
            NON_SALIENT_ENTITY_INSTANCES.add(1);
        }
        Vector vei = FeatureValueInstanceUtils.convertToSparkMLVector(ti, featureVectorSize);
        return RowFactory.create(ti.getDocId(), ti.getEntityId(), ti.getLabel(), vei);
    });

    return sqlContext.createDataFrame(withFeatures, schema);
}
 
Example #21
Source File: TestRangeRowRule.java    From envelope with Apache License 2.0
@Test
public void testAgeRangeInt() {
  StructType schema = new StructType(new StructField[] {
      new StructField("name", DataTypes.StringType, false, Metadata.empty()),
      new StructField("nickname", DataTypes.StringType, false, Metadata.empty()),
      new StructField("age", DataTypes.IntegerType, false, Metadata.empty()),
      new StructField("candycrushscore", DataTypes.createDecimalType(), false, Metadata.empty())
  });

  Map<String, Object> configMap = new HashMap<>();
  configMap.put(RangeRowRule.FIELDS_CONFIG, Lists.newArrayList("age"));
  configMap.put(RangeRowRule.FIELD_TYPE_CONFIG, "int");
  configMap.put(RangeRowRule.RANGE_CONFIG, Lists.newArrayList(0,105));
  Config config = ConfigFactory.parseMap(configMap);

  RangeRowRule rule = new RangeRowRule();
  assertNoValidationFailures(rule, config);
  rule.configure(config);
  rule.configureName("agerange");

  Row row1 = new RowWithSchema(schema, "Ian", "Ian", 34, new BigDecimal("0.00"));
  assertTrue("Row should pass rule", rule.check(row1));

  Row row2 = new RowWithSchema(schema, "Webster1", "Websta1", 110, new BigDecimal("450.10"));
  assertFalse("Row should not pass rule", rule.check(row2));

  Row row3 = new RowWithSchema(schema, "", "Ian1", 106, new BigDecimal("450.10"));
  assertFalse("Row should not pass rule", rule.check(row3));

  Row row4 = new RowWithSchema(schema, "First Last", "Ian Last", 105, new BigDecimal("450.10"));
  assertTrue("Row should pass rule", rule.check(row4));
}
 
Example #22
Source File: MinMaxScalerBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testMinMaxScaler() {
    //prepare data
    JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
            RowFactory.create(1.0, Vectors.dense(data[0])),
            RowFactory.create(2.0, Vectors.dense(data[1])),
            RowFactory.create(3.0, Vectors.dense(data[2])),
            RowFactory.create(4.0, Vectors.dense(data[3]))
    ));

    StructType schema = new StructType(new StructField[]{
            new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("features", new VectorUDT(), false, Metadata.empty())
    });

    Dataset<Row> df = spark.createDataFrame(jrdd, schema);

    //train model in spark
    MinMaxScalerModel sparkModel = new MinMaxScaler()
            .setInputCol("features")
            .setOutputCol("scaled")
            .setMin(-5)
            .setMax(5)
            .fit(df);


    //Export model, import it back and get transformer
    byte[] exportedModel = ModelExporter.export(sparkModel);
    final Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //compare predictions
    List<Row> sparkOutput = sparkModel.transform(df).orderBy("label").select("features", "scaled").collectAsList();
    assertCorrectness(sparkOutput, expected, transformer);
}
 
Example #23
Source File: TestRangeRowRule.java    From envelope with Apache License 2.0
@Test
public void testAgeRangeDecimal() {
  StructType schema = new StructType(new StructField[] {
      new StructField("name", DataTypes.StringType, false, Metadata.empty()),
      new StructField("nickname", DataTypes.StringType, false, Metadata.empty()),
      new StructField("age", DataTypes.DoubleType, false, Metadata.empty()),
      new StructField("candycrushscore", DataTypes.createDecimalType(), false, Metadata.empty())
  });

  Map<String, Object> configMap = new HashMap<>();
  configMap.put(RangeRowRule.FIELDS_CONFIG, Lists.newArrayList("candycrushscore"));
  configMap.put(RangeRowRule.FIELD_TYPE_CONFIG, "decimal");
  configMap.put(RangeRowRule.RANGE_CONFIG, Lists.newArrayList("-1.56","400.45"));
  Config config = ConfigFactory.parseMap(configMap);

  RangeRowRule rule = new RangeRowRule();
  assertNoValidationFailures(rule, config);
  rule.configure(config);
  rule.configureName("agerange");

  Row row1 = new RowWithSchema(schema, "Ian", "Ian", 34.0, new BigDecimal("-1.00"));
  assertTrue("Row should pass rule", rule.check(row1));

  Row row2 = new RowWithSchema(schema, "Webster1", "Websta1", 110.0, new BigDecimal("-1.57"));
  assertFalse("Row should not pass rule", rule.check(row2));

  Row row3 = new RowWithSchema(schema, "", "Ian1", 110.0, new BigDecimal("450.10"));
  assertFalse("Row should not pass rule", rule.check(row3));

  Row row4 = new RowWithSchema(schema, "First Last", "Ian Last", 100.0, new BigDecimal("400.45"));
  assertTrue("Row should pass rule", rule.check(row4));
}
 
Example #24
Source File: DataQualityDeriver.java    From envelope with Apache License 2.0
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
  if (dependencies.size() > 1 && dataset.isEmpty()) {
    throw new RuntimeException("Must specify dataset on which to conduct data quality tests when more than one dependency");
  }
  Dataset<Row> theDataset;
  Dataset<Row> theResults = null;
  if (dependencies.size() == 1) {
    theDataset = dependencies.values().iterator().next();
  } else {
    theDataset = dependencies.get(dataset);
  }
  if (scope == Scope.DATASET) {
    // The checks are run at a dataset level and we are simply returning a DS of <name, boolean> Rows
    for (DatasetRule rule : datasetRules.values()) {
      if (theResults == null) {
        theResults = rule.check(theDataset, dependencies);
      } else {
        theResults = theResults.union(rule.check(theDataset, dependencies));
      }
    }
  } else {
    if (theDataset.schema().getFieldIndex(resultsField).isDefined()) {
      throw new RuntimeException("The field [" + resultsField + "] already exists in the dataset schema. Use the " +
          RESULTS_FIELD_CONFIG + " configuration parameter to customize the data quality check field name");
    }
    List<StructField> checkField = Lists.newArrayList(
        new StructField(resultsField,
            DataTypes.createMapType(DataTypes.StringType, DataTypes.BooleanType),
            false, Metadata.empty()));
    theResults = theDataset.map(new CheckRowRules(rowRules, resultsField),
        RowEncoder.apply(SchemaUtils.appendFields(theDataset.schema(), checkField)));
  }

  return theResults;
}
 
Example #25
Source File: CMMModel.java    From vn.vitk with GNU General Public License v3.0
@Override
public DataFrame transform(DataFrame dataset) {
	JavaRDD<Row> output = dataset.javaRDD().map(new DecodeFunction());
	StructType schema = new StructType(new StructField[]{
		new StructField("sentence", DataTypes.StringType, false, Metadata.empty()),
		new StructField("prediction", DataTypes.StringType, false, Metadata.empty())
	});
	return dataset.sqlContext().createDataFrame(output, schema);
}
 
Example #26
Source File: Tagger.java    From vn.vitk with GNU General Public License v3.0
/**
 * Tags a distributed list of sentences and writes the result to an output file with 
 * a desired output format.
 * @param sentences
 * @param outputFileName
 * @param outputFormat
 */
public void tag(JavaRDD<Row> sentences, String outputFileName, OutputFormat outputFormat) {
	StructType schema = new StructType(new StructField[]{
		new StructField("sentence", DataTypes.StringType, false, Metadata.empty())	
	});
	SQLContext sqlContext = new SQLContext(jsc);
	DataFrame input = sqlContext.createDataFrame(sentences, schema);
	tag(input, outputFileName, outputFormat);
}
 
Example #27
Source File: Tagger.java    From vn.vitk with GNU General Public License v3.0
/**
 * Tags a list of sequences and writes the result to an output file with a
 * desired output format.
 * 
 * @param sentences
 * @param outputFileName
 * @param outputFormat
 */
public void tag(List<String> sentences, String outputFileName, OutputFormat outputFormat) {
	List<Row> rows = new LinkedList<Row>();
	for (String sentence : sentences) {
		rows.add(RowFactory.create(sentence));
	}
	StructType schema = new StructType(new StructField[]{
		new StructField("sentence", DataTypes.StringType, false, Metadata.empty())	
	});
	SQLContext sqlContext = new SQLContext(jsc);
	DataFrame input = sqlContext.createDataFrame(rows, schema);
	tag(input, outputFileName, outputFormat);
}
 
Example #28
Source File: DataFrames.java    From DataVec with Apache License 2.0
/**
 * Convert the DataVec sequence schema to a StructType for Spark, for example for use in
 * {@link #toDataFrameSequence(Schema, JavaRDD)}.
 * <b>Note</b>: as per {@link #toDataFrameSequence(Schema, JavaRDD)}, the StructType has two additional columns added to it:<br>
 * - Column 0: Sequence UUID (name: {@link #SEQUENCE_UUID_COLUMN}) - a UUID for the original sequence<br>
 * - Column 1: Sequence index (name: {@link #SEQUENCE_INDEX_COLUMN}) - an index (integer, starting at 0) for the position
 * of this record in the original time series.<br>
 * These two columns are required if the data is to be converted back into a sequence at a later point, for example
 * using {@link #toRecordsSequence(DataRowsFacade)}.
 *
 * @param schema Schema to convert
 * @return StructType for the schema
 */
public static StructType fromSchemaSequence(Schema schema) {
    StructField[] structFields = new StructField[schema.numColumns() + 2];

    structFields[0] = new StructField(SEQUENCE_UUID_COLUMN, DataTypes.StringType, false, Metadata.empty());
    structFields[1] = new StructField(SEQUENCE_INDEX_COLUMN, DataTypes.IntegerType, false, Metadata.empty());

    for (int i = 0; i < schema.numColumns(); i++) {
        switch (schema.getColumnTypes().get(i)) {
            case Double:
                structFields[i + 2] =
                                new StructField(schema.getName(i), DataTypes.DoubleType, false, Metadata.empty());
                break;
            case Integer:
                structFields[i + 2] =
                                new StructField(schema.getName(i), DataTypes.IntegerType, false, Metadata.empty());
                break;
            case Long:
                structFields[i + 2] =
                                new StructField(schema.getName(i), DataTypes.LongType, false, Metadata.empty());
                break;
            case Float:
                structFields[i + 2] =
                                new StructField(schema.getName(i), DataTypes.FloatType, false, Metadata.empty());
                break;
            default:
                throw new IllegalStateException(
                                "This API should not be used with strings, binary data or ndarrays. This is only for columnar data");
        }
    }
    return new StructType(structFields);
}
 
Example #29
Source File: DefinitionToSparkVisitor.java    From bunsen with Apache License 2.0
@Override
public HapiConverter<DataType> visitComposite(String elementName,
    String elementPath, String baseType,
    String elementTypeUrl, List<StructureField<HapiConverter<DataType>>> children) {

  String recordName = DefinitionVisitorsUtil.recordNameFor(elementPath);
  String recordNamespace = DefinitionVisitorsUtil.namespaceFor(basePackage, elementTypeUrl);
  String fullName = recordNamespace + "." + recordName;

  HapiConverter<DataType> converter = visitedConverters.get(fullName);

  if (converter == null) {
    StructField[] fields = children.stream()
        .map(entry -> new StructField(entry.fieldName(),
            entry.result().getDataType(),
            true,
            Metadata.empty()))
        .toArray(StructField[]::new);

    converter = new HapiCompositeToSparkConverter(baseType,
        children, new StructType(fields), fhirSupport);

    visitedConverters.put(fullName, converter);
  }

  return converter;

}
 
Example #30
Source File: JavaOneHotEncoderExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaOneHotEncoderExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(0, "a"),
    RowFactory.create(1, "b"),
    RowFactory.create(2, "c"),
    RowFactory.create(3, "a"),
    RowFactory.create(4, "a"),
    RowFactory.create(5, "c")
  );

  StructType schema = new StructType(new StructField[]{
    new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("category", DataTypes.StringType, false, Metadata.empty())
  });

  Dataset<Row> df = spark.createDataFrame(data, schema);

  StringIndexerModel indexer = new StringIndexer()
    .setInputCol("category")
    .setOutputCol("categoryIndex")
    .fit(df);
  Dataset<Row> indexed = indexer.transform(df);

  OneHotEncoder encoder = new OneHotEncoder()
    .setInputCol("categoryIndex")
    .setOutputCol("categoryVec");

  Dataset<Row> encoded = encoder.transform(indexed);
  encoded.show();
  // $example off$

  spark.stop();
}