Java Code Examples for org.apache.spark.sql.DataFrame

The following examples show how to use org.apache.spark.sql.DataFrame. These examples are extracted from open source projects; the original project and source file are noted above each example where available.
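Before the project-specific examples, here is a minimal, self-contained sketch of the Spark 1.x DataFrame API in Java. The class name, input path, and column names are illustrative only and do not come from any of the projects below.

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;

public class DataFrameQuickStart {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("DataFrameQuickStart").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);

        // Load a DataFrame from a JSON file (one JSON object per line; path is illustrative).
        DataFrame people = sqlContext.read().json("people.json");

        people.printSchema();
        // Column-based filtering and projection.
        people.filter(people.col("age").gt(21)).select("name", "age").show();

        sc.stop();
    }
}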
Example 1
Source Project: DDF   Source File: SparkDataSourceManager.java    License: Apache License 2.0
@Override
public DDF loadFromJDBC(JDBCDataSourceDescriptor dataSource) throws DDFException {
    SparkDDFManager sparkDDFManager = (SparkDDFManager)mDDFManager;
    HiveContext sqlContext = sparkDDFManager.getHiveContext();

    JDBCDataSourceCredentials cred = (JDBCDataSourceCredentials)dataSource.getDataSourceCredentials();
    String fullURL = dataSource.getDataSourceUri().getUri().toString();
    if (cred.getUsername() != null &&  !cred.getUsername().equals("")) {
        fullURL += String.format("?user=%s&password=%s", cred.getUsername(), cred.getPassword());
    }

    Map<String, String> options = new HashMap<String, String>();
    options.put("url", fullURL);
    options.put("dbtable", dataSource.getDbTable());
    DataFrame df = sqlContext.load("jdbc", options);

    DDF ddf = sparkDDFManager.newDDF(sparkDDFManager, df, new Class<?>[]{DataFrame.class},
        null, SparkUtils.schemaFromDataFrame(df));
    // TODO?
    ddf.getRepresentationHandler().get(RDD.class, Row.class);
    ddf.getMetaDataHandler().setDataSourceDescriptor(dataSource);
    return ddf;
}
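Note that sqlContext.load("jdbc", options) was deprecated later in the Spark 1.x line. A hedged sketch of the equivalent call through the DataFrameReader API, reusing the options map built above, would be:

    // Equivalent DataFrameReader call (Spark 1.4+), reusing the options map from above.
    DataFrame df = sqlContext.read().format("jdbc").options(options).load();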
 
Example 2
Source Project: spark-transformers   Source File: IfZeroVectorBridgeTest.java    License: Apache License 2.0
@Test
public void testIfZeroVectorSparse() {
    IfZeroVector sparkModel = new IfZeroVector()
            .setInputCol("vectorized_count")
            .setOutputCol("product_title_filtered")
            .setThenSetValue("others")
            .setElseSetCol("product_title");
    System.out.println(sparseOrderDF.schema());
    DataFrame transformed = sparkModel.transform(sparseOrderDF).orderBy("product_title");
    System.out.println(transformed.schema());
    //compare predictions
    Row[] sparkOutput = transformed.select("product_title_filtered").collect();
    assertEquals("others", sparkOutput[0].get(0));
    assertEquals("Nike Airmax 2015", sparkOutput[1].get(0));
    assertEquals("Xiaomi Redmi Note", sparkOutput[2].get(0));
}
 
Example 3
Source Project: spark-ts-examples   Source File: JavaStocks.java    License: Apache License 2.0
private static DataFrame loadObservations(JavaSparkContext sparkContext, SQLContext sqlContext,
    String path) {
  JavaRDD<Row> rowRdd = sparkContext.textFile(path).map((String line) -> {
      String[] tokens = line.split("\t");
      ZonedDateTime dt = ZonedDateTime.of(Integer.parseInt(tokens[0]),
          Integer.parseInt(tokens[1]), Integer.parseInt(tokens[2]), 0, 0, 0, 0,
          ZoneId.systemDefault());
      String symbol = tokens[3];
      double price = Double.parseDouble(tokens[5]);
      return RowFactory.create(Timestamp.from(dt.toInstant()), symbol, price);
  });
  List<StructField> fields = new ArrayList<StructField>();
  fields.add(DataTypes.createStructField("timestamp", DataTypes.TimestampType, true));
  fields.add(DataTypes.createStructField("symbol", DataTypes.StringType, true));
  fields.add(DataTypes.createStructField("price", DataTypes.DoubleType, true));
  StructType schema = DataTypes.createStructType(fields);
  return sqlContext.createDataFrame(rowRdd, schema);
}
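As a usage sketch that is not part of the original project (assuming a JavaSparkContext and SQLContext are in scope, and with a hypothetical input path), the returned DataFrame could be registered as a temporary table and queried with SQL:

// Illustrative only: aggregate the observations produced by loadObservations().
DataFrame observations = loadObservations(sparkContext, sqlContext, "data/ticker.tsv");
observations.registerTempTable("observations");
sqlContext.sql("SELECT symbol, AVG(price) AS avg_price FROM observations GROUP BY symbol").show();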
 
Example 4
@Override
public IfZeroVectorModelInfo getModelInfo(final IfZeroVector from, DataFrame df) {
    IfZeroVectorModelInfo modelInfo = new IfZeroVectorModelInfo();

    Set<String> inputKeys = new LinkedHashSet<String>();
    inputKeys.add(from.getInputCol());
    modelInfo.setInputKeys(inputKeys);

    Set<String> outputKeys = new LinkedHashSet<String>();
    outputKeys.add(from.getOutputCol());
    modelInfo.setOutputKeys(outputKeys);

    modelInfo.setThenSetValue(from.getThenSetValue());
    modelInfo.setElseSetCol(from.getElseSetCol());

    return modelInfo;
}
 
Example 5
@Test
public void shouldBehaveExactlyAsSparkNAFillerForAllSupportedDataTypes() {

    DataFrame df = getDataFrame();
    DataFrame df1 = df.na().fill( getFillNAMap() );

    FillNAValuesTransformer fillNAValuesTransformer = new FillNAValuesTransformer();
    fillNAValuesTransformer.setNAValueMap( getFillNAMap() );
    DataFrame df2 = fillNAValuesTransformer.transform(df);

    Row[] data1 = df1.orderBy("id").select("id", "a", "b", "c", "d").collect();
    Row[] data2 = df2.orderBy("id").select("id", "a", "b", "c", "d").collect();

    for (int i = 0; i < data1.length; i++) {
        for (int j = 1; j <= 4; j++) {
            assertEquals(data1[i].get(j), data2[i].get(j));
        }
    }
}
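The helper getFillNAMap() is not shown; presumably it returns a column-to-replacement-value map accepted by DataFrame.na().fill(Map). A hypothetical stand-in might look like this (the column names and values are assumptions, not taken from the project):

// Hypothetical stand-in for getFillNAMap(): replacement values keyed by column name.
// In Spark 1.x, na().fill(Map) accepts Integer, Long, Float, Double and String values.
Map<String, Object> fillNAMap = new HashMap<String, Object>();
fillNAMap.put("a", 0.0);
fillNAMap.put("b", 1L);
fillNAMap.put("c", "unknown");
fillNAMap.put("d", 0);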
 
Example 6
@Override
public OneHotEncoderModelInfo getModelInfo(final CustomOneHotEncoderModel from, DataFrame df) {
    OneHotEncoderModelInfo modelInfo = new OneHotEncoderModelInfo();

    modelInfo.setNumTypes(from.vectorSize());

    Set<String> inputKeys = new LinkedHashSet<String>();
    inputKeys.add(from.getInputCol());
    modelInfo.setInputKeys(inputKeys);

    Set<String> outputKeys = new LinkedHashSet<String>();
    outputKeys.add(from.getOutputCol());
    modelInfo.setOutputKeys(outputKeys);

    return modelInfo;
}
 
Example 7
@Override
public LogisticRegressionModelInfo getModelInfo(final LogisticRegressionModel sparkLRModel, DataFrame df) {
    final LogisticRegressionModelInfo logisticRegressionModelInfo = new LogisticRegressionModelInfo();
    logisticRegressionModelInfo.setWeights(sparkLRModel.coefficients().toArray());
    logisticRegressionModelInfo.setIntercept(sparkLRModel.intercept());
    logisticRegressionModelInfo.setNumClasses(sparkLRModel.numClasses());
    logisticRegressionModelInfo.setNumFeatures(sparkLRModel.numFeatures());
    logisticRegressionModelInfo.setThreshold(sparkLRModel.getThreshold());
    logisticRegressionModelInfo.setProbabilityKey(sparkLRModel.getProbabilityCol());

    Set<String> inputKeys = new LinkedHashSet<String>();
    inputKeys.add(sparkLRModel.getFeaturesCol());
    logisticRegressionModelInfo.setInputKeys(inputKeys);

    Set<String> outputKeys = new LinkedHashSet<String>();
    outputKeys.add(sparkLRModel.getPredictionCol());
    outputKeys.add(sparkLRModel.getProbabilityCol());
    logisticRegressionModelInfo.setOutputKeys(outputKeys);

    return logisticRegressionModelInfo;
}
 
Example 8
Source Project: vn.vitk   Source File: Tagger.java    License: GNU General Public License v3.0
/**
 * Tags a list of sentences and returns a list of tag sequences.
 * @param sentences a list of sentences to tag
 * @return a list of tagged sequences.
 */
public List<String> tag(List<String> sentences) {
	List<Row> rows = new LinkedList<Row>();
	for (String sentence : sentences) {
		rows.add(RowFactory.create(sentence));
	}
	StructType schema = new StructType(new StructField[]{
		new StructField("sentence", DataTypes.StringType, false, Metadata.empty())	
	});
	SQLContext sqlContext = new SQLContext(jsc);
	DataFrame input = sqlContext.createDataFrame(rows, schema);
	if (cmmModel != null) {
		DataFrame output = cmmModel.transform(input).repartition(1);
		return output.javaRDD().map(new RowToStringFunction(1)).collect();
	} else {
		System.err.println("Tagging model is null. You need to create or load a model first.");
		return null;
	}
}
 
Example 9
@Override
public LogisticRegressionModelInfo getModelInfo(final LogisticRegressionModel sparkLRModel, DataFrame df) {
    final LogisticRegressionModelInfo logisticRegressionModelInfo = new LogisticRegressionModelInfo();
    logisticRegressionModelInfo.setWeights(sparkLRModel.weights().toArray());
    logisticRegressionModelInfo.setIntercept(sparkLRModel.intercept());
    logisticRegressionModelInfo.setNumClasses(sparkLRModel.numClasses());
    logisticRegressionModelInfo.setNumFeatures(sparkLRModel.numFeatures());
    logisticRegressionModelInfo.setThreshold((double) sparkLRModel.getThreshold().get());

    Set<String> inputKeys = new LinkedHashSet<String>();
    inputKeys.add("features");
    logisticRegressionModelInfo.setInputKeys(inputKeys);

    Set<String> outputKeys = new LinkedHashSet<String>();
    outputKeys.add("prediction");
    outputKeys.add("probability");
    logisticRegressionModelInfo.setOutputKeys(outputKeys);

    return logisticRegressionModelInfo;
}
 
Example 10
Source Project: vn.vitk   Source File: Tagger.java    License: GNU General Public License v3.0
void testRandomSplit(String inputFileName, int numFeatures, String modelFileName) {
	CMMParams params = new CMMParams()
		.setMaxIter(600)
		.setRegParam(1E-6)
		.setMarkovOrder(2)
		.setNumFeatures(numFeatures);
	
	JavaRDD<String> lines = jsc.textFile(inputFileName);
	DataFrame dataset = createDataFrame(lines.collect());
	DataFrame[] splits = dataset.randomSplit(new double[]{0.9, 0.1}); 
	DataFrame trainingData = splits[0];
	System.out.println("Number of training sequences = " + trainingData.count());
	DataFrame testData = splits[1];
	System.out.println("Number of test sequences = " + testData.count());
	// train and save a model on the training data
	cmmModel = train(trainingData, modelFileName, params);
	// test the model on the test data
	System.out.println("Test accuracy:");
	evaluate(testData); 
	// test the model on the training data
	System.out.println("Training accuracy:");
	evaluate(trainingData);
}
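Since randomSplit is non-deterministic by default, a variant with an explicit seed (not in the original code) makes the train/test split reproducible:

	// Reproducible 90/10 split; the seed value is arbitrary.
	DataFrame[] splits = dataset.randomSplit(new double[]{0.9, 0.1}, 42L);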
 
Example 11
@Override
public ProbabilityTransformModelInfo getModelInfo(final ProbabilityTransformModel from, DataFrame df) {
    ProbabilityTransformModelInfo modelInfo = new ProbabilityTransformModelInfo();

    modelInfo.setActualClickProportion(from.getActualClickProportion());
    modelInfo.setUnderSampledClickProportion(from.getUnderSampledClickProportion());
    modelInfo.setProbIndex(from.getProbIndex());

    Set<String> inputKeys = new LinkedHashSet<String>();
    inputKeys.add(from.getInputCol());
    modelInfo.setInputKeys(inputKeys);

    Set<String> outputKeys = new LinkedHashSet<String>();
    outputKeys.add(from.getOutputCol());
    modelInfo.setOutputKeys(outputKeys);
    return modelInfo;
}
 
Example 12
@Override
public CountVectorizerModelInfo getModelInfo(final CountVectorizerModel from, final DataFrame df) {
    final CountVectorizerModelInfo modelInfo = new CountVectorizerModelInfo();
    modelInfo.setMinTF(from.getMinTF());
    modelInfo.setVocabulary(from.vocabulary());

    Set<String> inputKeys = new LinkedHashSet<String>();
    inputKeys.add(from.getInputCol());
    modelInfo.setInputKeys(inputKeys);

    Set<String> outputKeys = new LinkedHashSet<String>();
    outputKeys.add(from.getOutputCol());
    modelInfo.setOutputKeys(outputKeys);

    return modelInfo;
}
 
Example 13
/**
 * Extract a DataFrame ready for training or testing.
 * @param jsc the Java Spark context
 * @param documents the documents (SCAS) to extract training instances from
 * @param sqlContext the SQL context used to create the DataFrame
 * @return a DataFrame of (docId, entityId, label, features) rows
 * @throws ResourceInitializationException
 */
public DataFrame extract(JavaSparkContext jsc, JavaRDD<SCAS> documents, SQLContext sqlContext) throws ResourceInitializationException {
    Accumulator<Integer> TOTAL_DOCS = jsc.accumulator(0, "TOTAL_DOCS");
    Accumulator<Integer> SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "SALIENT_ENTITY_INSTANCES");
    Accumulator<Integer> NON_SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "NON_SALIENT_ENTITY_INSTANCES");

    TrainingSettings trainingSettings = getTrainingSettings();

    FeatureExtractor fe = new NYTEntitySalienceFeatureExtractor();
    final int featureVectorSize = FeatureSetFactory.createFeatureSet(TrainingSettings.FeatureExtractor.ENTITY_SALIENCE).getFeatureVectorSize();

    JavaRDD<TrainingInstance> trainingInstances =
            documents.flatMap(s -> {
                TOTAL_DOCS.add(1);
                return fe.getTrainingInstances(s.getJCas(),
                        trainingSettings.getFeatureExtractor(),
                        trainingSettings.getPositiveInstanceScalingFactor());
            });

    StructType schema = new StructType(new StructField[]{
            new StructField("docId", DataTypes.StringType, false, Metadata.empty() ),
            new StructField("entityId", DataTypes.StringType, false, Metadata.empty() ),
            new StructField("label", DataTypes.DoubleType, false, Metadata.empty() ),
            new StructField("features", new VectorUDT(), false, Metadata.empty())
    });

    JavaRDD<Row> withFeatures = trainingInstances.map(ti -> {
        if (ti.getLabel() == 1.0) {
            SALIENT_ENTITY_INSTANCES.add(1);
        } else {
            NON_SALIENT_ENTITY_INSTANCES.add(1);
        }
        Vector vei = FeatureValueInstanceUtils.convertToSparkMLVector(ti, featureVectorSize);
        return RowFactory.create(ti.getDocId(), ti.getEntityId(), ti.getLabel(), vei);
    });

    return sqlContext.createDataFrame(withFeatures, schema);
}
 
Example 14
/**
 * Gets the user action data within the specified date range.
 * @param sc the SQL context
 * @param taskParam task parameters containing the start and end dates
 * @return a JavaRDD of the matching rows
 */
private static JavaRDD<Row> getActionRDD(SQLContext sc, JSONObject taskParam)
{
    String startTime = ParamUtils.getParam(taskParam, Constants.PARAM_STARTTIME);
    String endTime = ParamUtils.getParam(taskParam, Constants.PARAM_ENDTIME);
    String sql = "select * from user_visit_action where date>='" + startTime + "' and date<='" + endTime + "'";
    DataFrame df = sc.sql(sql);
    return df.javaRDD();
}
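Because the query is built by string concatenation, the date bounds are interpolated directly into the SQL text. A hedged alternative sketch using column expressions on the same table (it requires a static import of org.apache.spark.sql.functions.col) would be:

    // Illustrative alternative: filter with column expressions instead of concatenated SQL.
    DataFrame df = sc.sql("select * from user_visit_action")
            .where(col("date").geq(startTime).and(col("date").leq(endTime)));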
 
Example 15
Source Project: rdf2x   Source File: RelationSchemaCollector.java    License: Apache License 2.0
/**
 * Get names of relation predicates
 *
 * @param relations    DataFrame of relations (predicateIndex, fromTypeIndex, instanceID, toTypeIndex, relatedID)
 * @param entitySchema schema storing entity tables with names and other information
 * @return map of predicate URI - predicate name
 */
private Map<String, String> getPredicateNames(DataFrame relations, EntitySchema entitySchema) {
    // get all occurring pairs of related types
    Set<String> predicateURIs = getUniquePredicateURIs(relations);
    // format relation names
    Set<String> tableNames = new HashSet<>();
    if (config.isEntityNamesForbidden()) {
        tableNames.addAll(entitySchema.getTableNames().values());
    }
    return formatter.getRelationNames(predicateURIs, rdfSchema.getUriLabels(), tableNames);
}
 
Example 16
@Test
public void testEsdataFrame1Write() throws Exception {
	DataFrame dataFrame = artistsAsDataFrame();

	String target = resource("sparksql-test-scala-basic-write", "data", version);
	JavaEsSparkSQL.saveToEs(dataFrame, target);
	assertTrue(RestUtils.exists(target));
	assertThat(RestUtils.get(target + "/_search?"), containsString("345"));
}
 
Example 17
Source Project: rdf2x   Source File: RelationSchemaCollector.java    License: Apache License 2.0
/**
 * Get all unique predicate URIs used in relations.
 *
 * @param relations DataFrame of relations (predicateIndex, fromTypeIndex, instanceID, toTypeIndex, relatedID)
 * @return all unique predicate URIs used in relations
 */
private Set<String> getUniquePredicateURIs(DataFrame relations) {
    return relations
            .select("predicateIndex")
            .distinct()
            .collectAsList().stream()
            .map(predicateIndex -> rdfSchema.getPredicateIndex().getValue(predicateIndex.getInt(0)))
            .collect(Collectors.toSet());
}
 
Example 18
@Override
public ChiSqSelectorModelInfo getModelInfo(final ChiSqSelectorModel from, DataFrame df) {
    ChiSqSelectorModelInfo modelInfo = new ChiSqSelectorModelInfo();
    modelInfo.setSelectedFeatures(from.selectedFeatures());

    Set<String> inputKeys = new LinkedHashSet<String>();
    inputKeys.add(from.getFeaturesCol());
    modelInfo.setInputKeys(inputKeys);

    Set<String> outputKeys = new LinkedHashSet<String>();
    outputKeys.add(from.getOutputCol());
    modelInfo.setOutputKeys(outputKeys);

    return modelInfo;
}
 
Example 19
Source Project: deep-spark   Source File: DeepSparkContext.java    License: Apache License 2.0
/**
 * Creates a DataFrame (formerly JavaSchemaRDD) from a Deep ExtractorConfig.
 * @param config Specific Deep ExtractorConfig.
 * @return a DataFrame built from Cells.
 * @throws UnsupportedDataTypeException
 */
public DataFrame createJavaSchemaRDD(ExtractorConfig<Cells> config) throws UnsupportedDataTypeException, UnsupportedOperationException {
    JavaRDD<Cells> cellsRDD = createJavaRDD(config);
    JavaRDD<Row> rowsRDD = DeepSparkContext.createJavaRowRDD(cellsRDD);
    try {
        Cells firstCells = cellsRDD.first();
        StructType schema = CellsUtils.getStructTypeFromCells(firstCells);
        return sqlContext.applySchema(rowsRDD, schema);
    } catch(UnsupportedOperationException e) {
        throw new UnsupportedOperationException("Cannot infer schema from empty data RDD", e);
    }
}
 
Example 20
Source Project: rdf2x   Source File: DbPersistorSQLServer.java    License: Apache License 2.0
@Override
public void writeDataFrame(String name, DataFrame df) {
    for (StructField field : df.schema().fields()) {
        String column = field.name();
        // convert booleans to integers to avoid error in Spark 1.6.2
        // "Cannot specify a column width on data type bit."
        if (field.dataType() == DataTypes.BooleanType) {
            df = df.withColumn(column + TMP_SUFFIX, df.col(column).cast(DataTypes.IntegerType))
                    .drop(column)
                    .withColumnRenamed(column + TMP_SUFFIX, column);
        }
    }
    super.writeDataFrame(name, df);
}
 
Example 21
Source Project: rdf2x   Source File: JSONPersistor.java    License: Apache License 2.0
/**
 * Write a {@link DataFrame} to the specified output
 *
 * @param name name of output table
 * @param df   dataframe containing the data
 */
@Override
public void writeDataFrame(String name, DataFrame df) {
    String outputFolder = config.getOutputFolder();
    String outputPath = Paths.get(outputFolder, name).toString();
    log.info("Writing JSON files to folder {}", outputPath);
    df.write().mode(saveMode).json(outputPath);
}
 
Example 22
Source Project: rdf2x   Source File: ElasticSearchPersistor.java    License: Apache License 2.0
/**
 * Write a {@link DataFrame} to the specified output
 *
 * @param name name of output table
 * @param df   dataframe containing the data
 */
@Override
public void writeDataFrame(String name, DataFrame df) {
    Map<String, String> props = config.getProperties(name);
    log.info("Writing to ElasticSearch: {}", props);
    JavaEsSparkSQL.saveToEs(df, props);
}
 
Example 23
Source Project: rdf2x   Source File: DbPersistor.java    License: Apache License 2.0
@Override
public void writeDataFrame(String name, DataFrame df) {
    String fullTableName = getFullTableName(name);

    Properties properties = config.getProperties();
    log.info("Writing to database table {} using batched inserts", fullTableName);
    df.write().mode(saveMode).jdbc(config.getUrl(), fullTableName, properties);
}
 
Example 24
Source Project: rdf2x   Source File: CSVPersistor.java    License: Apache License 2.0
/**
 * Write a {@link DataFrame} to the specified output
 *
 * @param name name of output table
 * @param df   dataframe containing the data
 */
@Override
public void writeDataFrame(String name, DataFrame df) {
    String outputFolder = config.getOutputFolder();
    String outputPath = Paths.get(outputFolder, name).toString();
    log.info("Writing CSV files to folder {}", outputPath);
    df.write().mode(saveMode)
            .format("com.databricks.spark.csv")
            .option("header", "true")
            .save(outputPath);
}
 
Example 25
Source Project: spark-transformers   Source File: MinMaxScalerBridgeTest.java    License: Apache License 2.0
@Test
public void testMinMaxScaler() {
    //prepare data
    List<LabeledPoint> localTraining = Arrays.asList(
            new LabeledPoint(1.0, Vectors.dense(data[0])),
            new LabeledPoint(2.0, Vectors.dense(data[1])),
            new LabeledPoint(3.0, Vectors.dense(data[2])),
            new LabeledPoint(3.0, Vectors.dense(data[3])));
    DataFrame df = sqlContext.createDataFrame(sc.parallelize(localTraining), LabeledPoint.class);

    //train model in spark
    MinMaxScalerModel sparkModel = new MinMaxScaler()
            .setInputCol("features")
            .setOutputCol("scaled")
            .setMin(-5)
            .setMax(5)
            .fit(df);


    //Export model, import it back and get transformer
    byte[] exportedModel = ModelExporter.export(sparkModel, df);
    final Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //compare predictions
    Row[] sparkOutput = sparkModel.transform(df).orderBy("label").select("features", "scaled").collect();
    assertCorrectness(sparkOutput, expected, transformer);
}
 
Example 26
Source Project: SparkApps   Source File: Main.java    License: Apache License 2.0
public static void main(String[] args) {
    //Sample data-frame loaded from a JSON file
    DataFrame usersDf = sqlContext.jsonFile("spark-save-to-db/src/main/resources/users.json");

    //Save data-frame to MySQL (or any other JDBC-supported database)
    //Choose one of the two options depending on your requirement (not both).

    //Option 1: Create new table and insert all records.
    usersDf.createJDBCTable(MYSQL_CONNECTION_URL, "users", true);

    //Option 2: Insert all records to an existing table.
    usersDf.insertIntoJDBC(MYSQL_CONNECTION_URL, "users", false);
}
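createJDBCTable and insertIntoJDBC were deprecated during the 1.x line in favor of the DataFrameWriter API. A hedged sketch of the equivalent write (the connection properties below are illustrative, not from the project):

    // Equivalent write through DataFrameWriter (Spark 1.4+); requires org.apache.spark.sql.SaveMode.
    java.util.Properties connectionProperties = new java.util.Properties();
    connectionProperties.put("user", "root");       // illustrative credentials
    connectionProperties.put("password", "secret");
    usersDf.write().mode(SaveMode.Append).jdbc(MYSQL_CONNECTION_URL, "users", connectionProperties);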
 
Example 27
@Test
public void testLogisticRegression() {
    //prepare data
    String datapath = "src/test/resources/binary_classification_test.libsvm";

    DataFrame trainingData = sqlContext.read().format("libsvm").load(datapath);

    //Train model in spark
    LogisticRegressionModel lrmodel = new LogisticRegression().fit(trainingData);

    //Export this model
    byte[] exportedModel = ModelExporter.export(lrmodel, trainingData);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //validate predictions
    List<LabeledPoint> testPoints = MLUtils.loadLibSVMFile(sc.sc(), datapath).toJavaRDD().collect();
    for (LabeledPoint i : testPoints) {
        Vector v = i.features();
        double actual = lrmodel.predict(v);

        Map<String, Object> data = new HashMap<String, Object>();
        data.put("features", v.toArray());
        transformer.transform(data);
        double predicted = (double) data.get("prediction");

        assertEquals(actual, predicted, EPSILON);
    }
}
 
Example 28
Source Project: rdf2x   Source File: InstanceRelationWriterTest.java    License: Apache License 2.0
@Test
public void testWriteRelationTablesWithoutPredicateIndex() throws IOException {
    InstanceRelationWriter writer = new InstanceRelationWriter(config
            .setStorePredicate(false), jsc(), persistor, rdfSchema);
    writer.writeRelationTables(getTestRelationSchema(), getTestRelations());

    List<Row> rows = new ArrayList<>();
    rows.add(RowFactory.create(1L, 3L));
    rows.add(RowFactory.create(2L, 3L));

    DataFrame result = this.result.values().iterator().next();
    assertEquals("Expected schema of A_B was extracted", getExpectedSchemaOfAB(false, false), result.schema());
    assertRDDEquals("Expected rows of A_B were extracted", jsc().parallelize(rows), result.toJavaRDD());
}
 
Example 29
Source Project: rdf2x   Source File: InstanceRelationWriterTest.java    License: Apache License 2.0
private DataFrame getTestRelations() {
    List<Row> rows = new ArrayList<>();

    rows.add(RowFactory.create(
            uriIndex.getIndex("http://example.com/knows"),
            uriIndex.getIndex("http://example.com/a"),
            1L,
            uriIndex.getIndex("http://example.com/b"),
            3L
    ));

    rows.add(RowFactory.create(
            uriIndex.getIndex("http://example.com/likes"),
            uriIndex.getIndex("http://example.com/a"),
            2L,
            uriIndex.getIndex("http://example.com/b"),
            3L
    ));

    return sql.createDataFrame(rows, new StructType()
            .add("predicateIndex", DataTypes.IntegerType, false)
            .add("fromTypeIndex", DataTypes.IntegerType, false)
            .add("fromID", DataTypes.LongType, false)
            .add("toTypeIndex", DataTypes.IntegerType, false)
            .add("toID", DataTypes.LongType, false)
    );
}
 
Example 30
Source Project: vn.vitk   Source File: Tagger.java    License: GNU General Public License v3.0
/**
 * Trains a tagger with data specified in a data frame. The data frame has
 * two columns: "sentence", which contains a word sequence, and "partOfSpeech",
 * which contains the corresponding tag sequence. Each row of the data frame
 * specifies a tagged sequence in the training set.
 * @param dataset the training data frame
 * @param modelFileName the file name to save the trained model to
 * @param params the CMM training parameters
 * @return a {@link CMMModel}
 */
public CMMModel train(DataFrame dataset, String modelFileName, CMMParams params) {
	CMM cmm = new CMM(params).setVerbose(verbose);
	cmmModel = cmm.fit(dataset);
	try {
		cmmModel.write().overwrite().save(modelFileName);
	} catch (IOException e) {
		e.printStackTrace();
	}
	return cmmModel;
}