Java Code Examples for org.apache.spark.sql.RowFactory

The following examples show how to use org.apache.spark.sql.RowFactory. They are extracted from open source projects; the source project, source file, and license are listed above each example where available.
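For orientation before the project examples, here is a minimal, self-contained sketch of the usual RowFactory pattern: build untyped Row objects with RowFactory.create, define a matching StructType, and combine the two with createDataFrame. This snippet is not taken from any of the projects below; the class name and column names are illustrative only.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class RowFactoryBasicExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
      .appName("RowFactoryBasicExample")
      .master("local[1]")
      .getOrCreate();

    // one Row per call; values are positional and untyped at this point
    List<Row> data = Arrays.asList(
      RowFactory.create("alice", 34),
      RowFactory.create("bob", 45));

    // the schema gives each position a name and a type
    StructType schema = new StructType(new StructField[]{
      new StructField("name", DataTypes.StringType, false, Metadata.empty()),
      new StructField("age", DataTypes.IntegerType, false, Metadata.empty())
    });

    // pair rows and schema to obtain a typed Dataset<Row>
    Dataset<Row> df = spark.createDataFrame(data, schema);
    df.show();

    spark.stop();
  }
}

Note that RowFactory.create itself performs no schema validation, so the values must line up positionally with the StructFields; mismatches only surface when the Dataset is created or evaluated.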
Example 1
Source Project: systemds   Source File: FrameRDDConverterUtils.java    License: Apache License 2.0
@Override
public Iterator<Row> call(Tuple2<Long, FrameBlock> arg0)
	throws Exception 
{
	long rowIndex = arg0._1();
	FrameBlock blk = arg0._2();
	ArrayList<Row> ret = new ArrayList<>();

	//handle Frame block data
	int rows = blk.getNumRows();
	int cols = blk.getNumColumns();
	for( int i=0; i<rows; i++ ) {
		Object[] row = new Object[cols+1];
		row[0] = (double)rowIndex++;
		for( int j=0; j<cols; j++ )
			row[j+1] = blk.get(i, j);
		ret.add(RowFactory.create(row));
	}
	
	return ret.iterator();
}
 
Example 2
Source Project: mmtf-spark   Source File: AtomInteraction.java    License: Apache License 2.0
/**
 * Returns rows of pairwise interactions with the central atom.
 * 
 * @return rows of pairwise interactions with the central atom
 */
public List<Row> getPairInteractionsAsRows() {
	List<Row> rows = new ArrayList<>(neighbors.size());

	int length = InteractionCenter.getLength();
	
	calcCoordinationGeometry(neighbors.size());

	// copy data of the interacting atoms
	for (int i = 0; i < neighbors.size(); i++) {
		Object[] data = new Object[2 * length + 2];
		int index = 0;
		data[index++] = structureId;
		System.arraycopy(center.getAsObject(), 0, data, index, length);
		index += length;
		System.arraycopy(neighbors.get(i).getAsObject(), 0, data, index, length);
		index += length;
		data[index++] = distances[i];
		rows.add(RowFactory.create(data));
	}

	return rows;
}
 
Example 3
@Override
public Iterator<Row> call(Row t) throws Exception {
	//get information from the input Row
	String structureChainId = t.getString(0);
	String sequence = t.getString(1);
	String dsspQ8 = t.getString(5);
	String dsspQ3 = t.getString(6);
	
	int numSegments = Math.max(0, sequence.length() - length);
	List<Row> sequences = new ArrayList<>(numSegments);
	
	for (int i = 0; i < sequence.length() - length; i++)
	{
		String currSeq = sequence.substring(i, i+length);
		String labelQ8 = dsspQ8.substring(i + length/2,i + length/2 + 1);
		String labelQ3 = dsspQ3.substring(i + length/2,i + length/2 + 1);
		if ( !labelQ8.equals("X") && !labelQ3.equals("X"))
		{
			sequences.add( RowFactory.create(structureChainId, currSeq, labelQ8, labelQ3) );
		}
	}
	return sequences.iterator();
}
 
Example 4
Source Project: mmtf-spark   Source File: DatasetBalancerTest.java    License: Apache License 2.0
@Test
public void test() {
	List<Row> rows = Arrays.asList(
			RowFactory.create("a", 1), RowFactory.create("a", 2), 
			RowFactory.create("b", 1), RowFactory.create("b", 2), RowFactory.create("b", 3), 
			RowFactory.create("c", 1), RowFactory.create("c", 2), RowFactory.create("c", 3), RowFactory.create("c", 4));

	SparkSession spark = SparkSession.builder().master("local[1]").getOrCreate();

	StructType schema = new StructType(
			new StructField[] { DataTypes.createStructField("key", DataTypes.StringType, false),
					DataTypes.createStructField("value", DataTypes.IntegerType, false) });

	Dataset<Row> data = spark.createDataFrame(rows, schema);

	long seed = 19;
	Dataset<Row> balancedData = DatasetBalancer.downsample(data, "key", seed);
	assertTrue(balancedData.count() > 0);
	
    spark.close();
}
 
Example 5
Source Project: SparkDemo   Source File: JavaStopWordsRemoverExample.java    License: MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaStopWordsRemoverExample")
    .getOrCreate();

  // $example on$
  StopWordsRemover remover = new StopWordsRemover()
    .setInputCol("raw")
    .setOutputCol("filtered");

  List<Row> data = Arrays.asList(
    RowFactory.create(Arrays.asList("I", "saw", "the", "red", "balloon")),
    RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb"))
  );

  StructType schema = new StructType(new StructField[]{
    new StructField(
      "raw", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty())
  });

  Dataset<Row> dataset = spark.createDataFrame(data, schema);
  remover.transform(dataset).show(false);
  // $example off$
  spark.stop();
}
 
Example 6
Source Project: SparkDemo   Source File: JavaSQLTransformerExample.java    License: MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaSQLTransformerExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(0, 1.0, 3.0),
    RowFactory.create(2, 2.0, 5.0)
  );
  StructType schema = new StructType(new StructField [] {
    new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("v1", DataTypes.DoubleType, false, Metadata.empty()),
    new StructField("v2", DataTypes.DoubleType, false, Metadata.empty())
  });
  Dataset<Row> df = spark.createDataFrame(data, schema);

  SQLTransformer sqlTrans = new SQLTransformer().setStatement(
    "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__");

  sqlTrans.transform(df).show();
  // $example off$

  spark.stop();
}
 
Example 7
Source Project: rdf2x   Source File: RelationExtractor.java    License: Apache License 2.0
/**
 * Map a {@link Instance} into an Iterable of all of its relations
 * represented as rows of (related URI, predicate index, type index, instance ID)
 *
 * @param instance the requested {@link Instance}
 * @return an Iterable of all of its relations represented as rows of (related URI, predicate index, type index, instance ID)
 */
private Iterable<Row> getRelatedTypeIDs(Instance instance) {
    // typeIDs representing references to the instance in each table (or a single one, if instance has a single type)
    final Long id = instance.getId();

    final List<Tuple2<Integer, Long>> instanceTypeIDs = getRelationEntityTypes(instance)
            .map(typeIndex -> new Tuple2<>(typeIndex, id))
            .collect(Collectors.toList());

    return instance.getRelations().stream()
            .flatMap(relation ->
                    instanceTypeIDs.stream()
                            .map(instanceTypeID -> RowFactory.create(
                                    relation.getObjectURI(),
                                    relation.getPredicateIndex(),
                                    instanceTypeID._1(),
                                    instanceTypeID._2()
                            ))
            ).collect(Collectors.toList());
}
 
Example 8
Source Project: BigDataPlatform   Source File: IfTest.java    License: GNU General Public License v3.0
public static void main(String[] args) {
	SparkConf conf = new SparkConf()
			.setMaster("local") 
			.setAppName("IfTest");
	JavaSparkContext sc = new JavaSparkContext(conf);
	SQLContext sqlContext = new SQLContext(sc.sc());
	
	List<Integer> grades = Arrays.asList(85, 90, 60, 73);
	JavaRDD<Integer> gradesRDD = sc.parallelize(grades);
	JavaRDD<Row> gradeRowsRDD = gradesRDD.map(new Function<Integer, Row>() {

		private static final long serialVersionUID = 1L;

		@Override
		public Row call(Integer grade) throws Exception {
			return RowFactory.create(grade);
		}
		
	});
	
	StructType schema = DataTypes.createStructType(Arrays.asList(
			DataTypes.createStructField("grade", DataTypes.IntegerType, true)));
	Dataset<Row> gradesDF = sqlContext.createDataFrame(gradeRowsRDD, schema);
	gradesDF.registerTempTable("grades");

	Dataset<Row> gradeLevelDF = sqlContext.sql(
			"SELECT IF(grade>=80,'GOOD','BAD') gradeLevel "  
			+ "FROM grades");
	
	gradeLevelDF.show();
	
	sc.close(); 
}
 
Example 9
Source Project: sparkResearch   Source File: CustomDataFrame.java    License: Apache License 2.0
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder()
            .master("local")
            .appName("spark app")
            .getOrCreate();

    // create an ordinary JavaRDD
    JavaRDD<String> javaRDD = sparkSession.sparkContext().textFile("URL", 1).toJavaRDD();
    // the schema encoded as a string
    String schema = "name age";

    // generate the schema from the schema string
    List<StructField> structFieldList = new ArrayList<>();
    for (String fieldName : schema.split(" ")) {
        StructField structField = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
        structFieldList.add(structField);
    }
    StructType structType = DataTypes.createStructType(structFieldList);

    JavaRDD<Row> rowJavaRDD = javaRDD.map(new Function<String, Row>() {
        @Override
        public Row call(String v1) {
            String[] attirbutes = v1.split(",");
            return RowFactory.create(attirbutes[0], attirbutes[1].trim());
        }
    });

    // apply the schema to the RDD
    Dataset<Row> dataset = sparkSession.createDataFrame(rowJavaRDD, structType);

    // create a temporary view
    dataset.createOrReplaceTempView("user");
    Dataset<Row> result = sparkSession.sql("select * from user");
    result.show();
}
 
Example 10
Source Project: kylin-on-parquet-v2   Source File: SparkCubingJobTest.java    License: Apache License 2.0
private Dataset<Row> dsConvertToOriginal(Dataset<Row> layoutDs, LayoutEntity entity) {
    Map<Integer, FunctionDesc> orderedMeasures = entity.getOrderedMeasures();

    for (final Map.Entry<Integer, FunctionDesc> entry : orderedMeasures.entrySet()) {
        FunctionDesc functionDesc = entry.getValue();
        if (functionDesc != null) {
            final String[] columns = layoutDs.columns();
            String functionName = functionDesc.returnType().dataType();

            if ("bitmap".equals(functionName)) {
                final int finalIndex = convertOutSchema(layoutDs, entry.getKey().toString(), DataTypes.LongType);
                PreciseCountDistinct preciseCountDistinct = new PreciseCountDistinct(null);
                layoutDs = layoutDs.map((MapFunction<Row, Row>) value -> {
                    Object[] ret = new Object[value.size()];
                    for (int i = 0; i < columns.length; i++) {
                        if (i == finalIndex) {
                            byte[] bytes = (byte[]) value.get(i);
                            Roaring64NavigableMap bitmapCounter = preciseCountDistinct.deserialize(bytes);
                            ret[i] = bitmapCounter.getLongCardinality();
                        } else {
                            ret[i] = value.get(i);
                        }
                    }
                    return RowFactory.create(ret);
                }, RowEncoder.apply(OUT_SCHEMA));
            }
        }
    }
    return layoutDs;
}
 
Example 11
Source Project: rdf2x   Source File: MetadataWriterTest.java    License: Apache License 2.0
private JavaRDD<Row> getExpectedRowsOfMetaProperties() {
    List<Row> rows = new ArrayList<>();
    rows.add(RowFactory.create("a", predicateIndex.getIndex("http://example.com/name"), "name", true, true, true, "STRING", null, 0.5, null));
    rows.add(RowFactory.create("b", predicateIndex.getIndex("http://example.com/age"), "age", true, true, false, "INTEGER", null, 0.5, null));
    rows.add(RowFactory.create("b", predicateIndex.getIndex("http://example.com/name"), "name", true, true, false, "STRING", "en", 0.5, null));
    return jsc().parallelize(rows);
}
 
Example 12
private Dataset<Row> dsConvertToOriginal(Dataset<Row> layoutDs, LayoutEntity entity) {
    Map<Integer, FunctionDesc> orderedMeasures = entity.getOrderedMeasures();

    for (final Map.Entry<Integer, FunctionDesc> entry : orderedMeasures.entrySet()) {
        FunctionDesc functionDesc = entry.getValue();
        if (functionDesc != null) {
            final String[] columns = layoutDs.columns();
            String functionName = functionDesc.returnType().dataType();

            if ("bitmap".equals(functionName)) {
                final int finalIndex = convertOutSchema(layoutDs, entry.getKey().toString(), DataTypes.LongType);
                PreciseCountDistinct preciseCountDistinct = new PreciseCountDistinct(null);
                layoutDs = layoutDs.map((MapFunction<Row, Row>) value -> {
                    Object[] ret = new Object[value.size()];
                    for (int i = 0; i < columns.length; i++) {
                        if (i == finalIndex) {
                            byte[] bytes = (byte[]) value.get(i);
                            Roaring64NavigableMap bitmapCounter = preciseCountDistinct.deserialize(bytes);
                            ret[i] = bitmapCounter.getLongCardinality();
                        } else {
                            ret[i] = value.get(i);
                        }
                    }
                    return RowFactory.create(ret);
                }, RowEncoder.apply(OUT_SCHEMA));
            }
        }
    }
    return layoutDs;
}
 
Example 13
/**
 * Extract a DataFrame ready for training or testing.
 * @param jsc the Spark context
 * @param documents the documents to extract training instances from
 * @param sqlContext the SQL context used to create the DataFrame
 * @return a DataFrame of (docId, entityId, label, features) rows
 * @throws ResourceInitializationException if the feature extractor cannot be initialized
 */
public DataFrame extract(JavaSparkContext jsc, JavaRDD<SCAS> documents, SQLContext sqlContext) throws ResourceInitializationException {
    Accumulator<Integer> TOTAL_DOCS = jsc.accumulator(0, "TOTAL_DOCS");
    Accumulator<Integer> SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "SALIENT_ENTITY_INSTANCES");
    Accumulator<Integer> NON_SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "NON_SALIENT_ENTITY_INSTANCES");

    TrainingSettings trainingSettings = getTrainingSettings();

    FeatureExtractor fe = new NYTEntitySalienceFeatureExtractor();
    final int featureVectorSize = FeatureSetFactory.createFeatureSet(TrainingSettings.FeatureExtractor.ENTITY_SALIENCE).getFeatureVectorSize();

    JavaRDD<TrainingInstance> trainingInstances =
            documents.flatMap(s -> {
                TOTAL_DOCS.add(1);
                return fe.getTrainingInstances(s.getJCas(),
                        trainingSettings.getFeatureExtractor(),
                        trainingSettings.getPositiveInstanceScalingFactor());
            });

    StructType schema = new StructType(new StructField[]{
            new StructField("docId", DataTypes.StringType, false, Metadata.empty() ),
            new StructField("entityId", DataTypes.StringType, false, Metadata.empty() ),
            new StructField("label", DataTypes.DoubleType, false, Metadata.empty() ),
            new StructField("features", new VectorUDT(), false, Metadata.empty())
    });

    JavaRDD<Row> withFeatures = trainingInstances.map(ti -> {
        if (ti.getLabel() == 1.0) {
            SALIENT_ENTITY_INSTANCES.add(1);
        } else {
            NON_SALIENT_ENTITY_INSTANCES.add(1);
        }
        Vector vei = FeatureValueInstanceUtils.convertToSparkMLVector(ti, featureVectorSize);
        return RowFactory.create(ti.getDocId(), ti.getEntityId(), ti.getLabel(), vei);
    });

    return sqlContext.createDataFrame(withFeatures, schema);
}
 
Example 14
Source Project: systemds   Source File: RDDConverterUtilsExt.java    License: Apache License 2.0
@Override
public Row call(Tuple2<Row, Long> arg0) throws Exception {
	int oldNumCols = arg0._1.length();
	Object [] fields = new Object[oldNumCols + 1];
	for(int i = 0; i < oldNumCols; i++) {
		fields[i] = arg0._1.get(i);
	}
	fields[oldNumCols] = new Double(arg0._2 + 1);
	return RowFactory.create(fields);
}
 
Example 15
Source Project: systemds   Source File: FrameRDDConverterUtils.java    License: Apache License 2.0
@Override
public Row call(String record) throws Exception {
      String[] fields = IOUtilFunctions.splitCSV(record, _delim);
      Object[] objects = new Object[fields.length]; 
      for (int i=0; i<fields.length; i++) {
	      objects[i] = UtilFunctions.stringToObject(_schema[i], fields[i]);
      }
      return RowFactory.create(objects);
}
 
Example 16
Source Project: systemds   Source File: MLContextFrameTest.java    License: Apache License 2.0
@Test
public void testInputFrameAndMatrixOutputMatrixAndFrame() {
	System.out.println("MLContextFrameTest - input frame and matrix, output matrix and frame");
	
	Row[] rowsA = {RowFactory.create("Doc1", "Feat1", 10), RowFactory.create("Doc1", "Feat2", 20), RowFactory.create("Doc2", "Feat1", 31)};

	JavaRDD<Row> javaRddRowA = sc. parallelize( Arrays.asList(rowsA)); 

	List<StructField> fieldsA = new ArrayList<>();
	fieldsA.add(DataTypes.createStructField("myID", DataTypes.StringType, true));
	fieldsA.add(DataTypes.createStructField("FeatureName", DataTypes.StringType, true));
	fieldsA.add(DataTypes.createStructField("FeatureValue", DataTypes.IntegerType, true));
	StructType schemaA = DataTypes.createStructType(fieldsA);
	Dataset<Row> dataFrameA = spark.createDataFrame(javaRddRowA, schemaA);

	String dmlString = "[tA, tAM] = transformencode (target = A, spec = \"{ids: false ,recode: [ myID, FeatureName ]}\");";

	Script script = dml(dmlString)
			.in("A", dataFrameA,
					new FrameMetadata(FrameFormat.CSV, dataFrameA.count(), (long) dataFrameA.columns().length))
			.out("tA").out("tAM");
	MLResults results = ml.execute(script);

	double[][] matrixtA = results.getMatrixAs2DDoubleArray("tA");
	Assert.assertEquals(10.0, matrixtA[0][2], 0.0);
	Assert.assertEquals(20.0, matrixtA[1][2], 0.0);
	Assert.assertEquals(31.0, matrixtA[2][2], 0.0);

	Dataset<Row> dataFrame_tA = results.getMatrix("tA").toDF();
	System.out.println("Number of matrix tA rows = " + dataFrame_tA.count());
	dataFrame_tA.printSchema();
	dataFrame_tA.show();
	
	Dataset<Row> dataFrame_tAM = results.getFrame("tAM").toDF();
	System.out.println("Number of frame tAM rows = " + dataFrame_tAM.count());
	dataFrame_tAM.printSchema();
	dataFrame_tAM.show();
}
 
Example 17
Source Project: systemds   Source File: MLContextFrameTest.java    License: Apache License 2.0
@Test
public void testTransform() {
	System.out.println("MLContextFrameTest - transform");
	
	Row[] rowsA = {RowFactory.create("\"`@(\"(!&",2,"20news-bydate-train/comp.os.ms-windows.misc/9979"),
			RowFactory.create("\"`@(\"\"(!&\"",3,"20news-bydate-train/comp.os.ms-windows.misc/9979")};

	JavaRDD<Row> javaRddRowA = sc. parallelize( Arrays.asList(rowsA)); 

	List<StructField> fieldsA = new ArrayList<>();
	fieldsA.add(DataTypes.createStructField("featureName", DataTypes.StringType, true));
	fieldsA.add(DataTypes.createStructField("featureValue", DataTypes.IntegerType, true));
	fieldsA.add(DataTypes.createStructField("id", DataTypes.StringType, true));
	StructType schemaA = DataTypes.createStructType(fieldsA);
	Dataset<Row> dataFrameA = spark.createDataFrame(javaRddRowA, schemaA);

	String dmlString = "[tA, tAM] = transformencode (target = A, spec = \"{ids: false ,recode: [ featureName, id ]}\");";

	Script script = dml(dmlString)
			.in("A", dataFrameA,
					new FrameMetadata(FrameFormat.CSV, dataFrameA.count(), (long) dataFrameA.columns().length))
			.out("tA").out("tAM");
	ml.setExplain(true);
	ml.setExplainLevel(ExplainLevel.RECOMPILE_HOPS);
	MLResults results = ml.execute(script);

	double[][] matrixtA = results.getMatrixAs2DDoubleArray("tA");
	Assert.assertEquals(1.0, matrixtA[0][2], 0.0);

	Dataset<Row> dataFrame_tA = results.getMatrix("tA").toDF();
	System.out.println("Number of matrix tA rows = " + dataFrame_tA.count());
	dataFrame_tA.printSchema();
	dataFrame_tA.show();
	
	Dataset<Row> dataFrame_tAM = results.getFrame("tAM").toDF();
	System.out.println("Number of frame tAM rows = " + dataFrame_tAM.count());
	dataFrame_tAM.printSchema();
	dataFrame_tAM.show();
}
 
Example 18
Source Project: systemds   Source File: MLContextTest.java    License: Apache License 2.0
@Override
public Row call(String str) throws Exception {
	String[] strings = str.split(",");
	Double[] doubles = new Double[strings.length];
	for (int i = 0; i < strings.length; i++) {
		doubles[i] = Double.parseDouble(strings[i]);
	}
	return RowFactory.create((Object[]) doubles);
}
 
Example 19
/**
 * This method outputs the result of the number of places helps are needed for each
 * repository/package combination to the given BigQuery table.
 */
private void outputGoPackageHelpsTable(
    JavaPairRDD<Tuple2<String, String>, Integer> packagesNeedingHelp, String outputTableId) {
  Dataset<Row> dataset = this.sqlContext.createDataFrame(packagesNeedingHelp
          .map(tuple -> RowFactory.create(tuple._1()._1(), tuple._1()._2(), tuple._2()))
          .rdd(),
      GO_PACKAGES_HELPS_TABLE_SCHEMA);
  BigQueryDataFrame bigQueryDataFrame = new BigQueryDataFrame(dataset);
  bigQueryDataFrame.saveAsBigQueryTable(outputTableId, CreateDisposition.CREATE_IF_NEEDED(),
      WriteDisposition.WRITE_EMPTY());
}
 
Example 20
/**
 * This method outputs the result of the number of times a package is imported in other
 * repositories to the given BigQuery table.
 */
private void outputGoPackageImportsTable(
    JavaPairRDD<Tuple2<String, String>, Integer> packageImports, String outputTableId) {
  Dataset<Row> dataset = this.sqlContext.createDataFrame(
      packageImports
          .map(tuple -> RowFactory.create(tuple._1()._1(), tuple._1()._2(), tuple._2()))
          .rdd(),
      GO_PACKAGE_IMPORTS_TABLE_SCHEMA);
  BigQueryDataFrame bigQueryDataFrame = new BigQueryDataFrame(dataset);
  bigQueryDataFrame
      .saveAsBigQueryTable(outputTableId, CreateDisposition.CREATE_IF_NEEDED(),
          WriteDisposition.WRITE_EMPTY());
}
 
Example 21
Source Project: mmtf-spark   Source File: AtomInteraction.java    License: Apache License 2.0
/**
 * Returns interactions and geometric information in a single row.
 * 
 * @return row of interactions and geometric information
 */
public Row getMultipleInteractionsAsRow(int maxInteractions) {
	// pad interaction centers and distances with nulls, if necessary,
	// since each row must be of fixed length
	while (getNumInteractions() < maxInteractions) {
		neighbors.add(new InteractionCenter());
	}

	int length = InteractionCenter.getLength();

	Object[] data = new Object[getNumColumns(maxInteractions)];

	int index = 0;
	data[index++] = structureId;
	data[index++] = getNumberOfPolymerChains();
	
	calcCoordinationGeometry(maxInteractions);
	data[index++] = q3;
	data[index++] = q4;
	data[index++] = q5;
	data[index++] = q6;
	

	// copy data for query atom
	System.arraycopy(center.getAsObject(), 0, data, index, length);
	index += length;

	// copy data for interacting atoms
	for (int i = 0; i < neighbors.size(); i++) {
		System.arraycopy(neighbors.get(i).getAsObject(), 0, data, index, length);
		index += length;
		data[index++] = distances[i];
	}

	// copy angles
	System.arraycopy(angles, 0, data, index, angles.length);
	index += length;

	return RowFactory.create(data);
}
 
Example 22
Source Project: mmtf-spark   Source File: BiojavaAligner.java    License: Apache License 2.0
/**
	 * Calculates a structural alignment and returns alignment metrics.
	 * 
	 * @param alignmentAlgorithm name of the algorithm
	 * @param key unique identifier for protein chain pair
	 * @param points1 C-alpha positions of chain 1
	 * @param points2 C-alpha positions of chain 2
	 * @return alignment metrics as a list containing a single row
	 */
	public static List<Row> getAlignment(String alignmentAlgorithm, String key, Point3d[] points1, Point3d[] points2) {
		// create input for BioJava alignment method
		Atom[] ca1 = getCAAtoms(points1);
		Atom[] ca2 = getCAAtoms(points2);
		
		// calculate the alignment
		AFPChain afp = null;
		try {
			StructureAlignment algorithm  = StructureAlignmentFactory.getAlgorithm(alignmentAlgorithm);
			afp = algorithm.align(ca1,ca2);
			double tmScore = AFPChainScorer.getTMScore(afp, ca1, ca2);
			afp.setTMScore(tmScore);
		} catch (StructureException e) {
			e.printStackTrace();
			return Collections.emptyList();
		} 
		
		// TODO add alignments as arrays to results
//		int[][] alignment = afp.getAfpIndex();
//		for (int i = 0; i < alignment.length; i++) {
//			System.out.println(alignment[i][0] + " - " + alignment[i][1]);
//		}

		// record the alignment metrics
		Row row = RowFactory.create(key, afp.getOptLength(), afp.getCoverage1(), 
				afp.getCoverage2(), (float) afp.getTotalRmsdOpt(), (float) afp.getTMScore());

		return Collections.singletonList(row);
	}
 
Example 23
@Override
public Iterator<Row> call(Row t) throws Exception {
	// get information from the input Row
	String sequence = t.getString(1);
	String dsspQ3 = t.getString(6);

	int currLength = 0;
	String currSequence = "";
	int j;
	List<Row> sequences = new ArrayList<>();

	for (int i = 0; i < sequence.length(); i++) {
		currLength = 0;
		currSequence = "";
		for (j = i; j < sequence.length(); j++) {
			if (dsspQ3.substring(j, j + 1).equals(label)) {
				currLength++;
				currSequence = currSequence.concat(sequence.substring(j, j + 1));
			} else
				break;
		}
		i += currLength;
		if (currLength >= minLength) {
			sequences.add(RowFactory.create(currSequence, label));
		}
	}
	return sequences.iterator();
}
 
Example 24
Source Project: mmtf-spark   Source File: StructureToInteractingResidues.java    License: Apache License 2.0
private List<Row> getDistanceProfile(String structureId, List<Integer> matches, int index, List<Integer> groupIndices, List<String> groupNames, StructureDataInterface structure) {
       double cutoffDistanceSq = cutoffDistance * cutoffDistance;
	
	float[] x = structure.getxCoords();
	float[] y = structure.getyCoords();
	float[] z = structure.getzCoords();
	
	int first = groupIndices.get(index);
	int last = groupIndices.get(index+1);
	
	List<Row> rows = new ArrayList<>();
	for (int i: matches) {
		if (i == index) {
			continue;
		}
		double minDSq = Double.MAX_VALUE;
		int minIndex = -1;
		for (int j = groupIndices.get(i); j < groupIndices.get(i+1); j++) {
			
			for (int k = first; k < last; k++) {
				double dx = (x[j] - x[k]);
				double dy = (y[j] - y[k]);
				double dz = (z[j] - z[k]);
				double dSq = dx*dx + dy*dy + dz*dz;
				if (dSq <= cutoffDistanceSq && dSq < minDSq) {
					minDSq = Math.min(minDSq, dSq);
					minIndex = i;
				}
			}
		}
		if (minIndex >= 0) {
			// TODO add unique group (and atom?) for each group?
			Row row = RowFactory.create(structureId, groupNames.get(index), index, groupNames.get(minIndex), minIndex, (float)Math.sqrt(minDSq));
			rows.add(row);
		}
	}
	return rows;
}
 
Example 25
Source Project: mmtf-spark   Source File: QuaternaryStructureDataset.java    License: Apache License 2.0
private static Iterator<Row> getQuaternaryStructure(Tuple2<String, StructureDataInterface> t) throws Exception {
	List<Row> rows = new ArrayList<>();
    String key = t._1;
	StructureDataInterface structure = t._2;
	ColumnarStructure cs = new ColumnarStructure(structure, true);
	String[] chainEntityTypes = cs.getChainEntityTypes();
	int[] chainToEntityIndex = cs.getChainToEntityIndices();
	
	for (int i = 0; i < structure.getNumBioassemblies(); i++) {
	    List<Integer> proteinIndices = new ArrayList<>();
	    List<Integer> dnaIndices = new ArrayList<>();
	    List<Integer> rnaIndices = new ArrayList<>();
	   
	    for (int j = 0; j < structure.getNumTransInBioassembly(i); j++) {
	        for (int chainIndex : structure.getChainIndexListForTransform(i, j)) {
	            int entityIndex = chainToEntityIndex[chainIndex];
	            String type = chainEntityTypes[chainIndex];
	            if (type.equals("PRO")) {
	                proteinIndices.add(entityIndex);
	            } else if (type.equals("DNA")) {
	                dnaIndices.add(entityIndex);
	            } else if (type.equals("RNA")) {
	                rnaIndices.add(entityIndex);
	            }
	        }
	    }
	    
	    String proStoich = stoichiometry(coefficients(proteinIndices));
        String dnaStoich = stoichiometry(coefficients(dnaIndices));
        String rnaStoich = stoichiometry(coefficients(rnaIndices));
	    rows.add(RowFactory.create(key, structure.getBioassemblyName(i), proStoich, dnaStoich, rnaStoich));
	}

	return rows.iterator();
}
 
Example 26
Source Project: SparkDemo   Source File: JavaRFormulaExample.java    License: MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaRFormulaExample")
    .getOrCreate();

  // $example on$
  StructType schema = createStructType(new StructField[]{
    createStructField("id", IntegerType, false),
    createStructField("country", StringType, false),
    createStructField("hour", IntegerType, false),
    createStructField("clicked", DoubleType, false)
  });

  List<Row> data = Arrays.asList(
    RowFactory.create(7, "US", 18, 1.0),
    RowFactory.create(8, "CA", 12, 0.0),
    RowFactory.create(9, "NZ", 15, 0.0)
  );

  Dataset<Row> dataset = spark.createDataFrame(data, schema);
  RFormula formula = new RFormula()
    .setFormula("clicked ~ country + hour")
    .setFeaturesCol("features")
    .setLabelCol("label");
  Dataset<Row> output = formula.fit(dataset).transform(dataset);
  output.select("features", "label").show();
  // $example off$
  spark.stop();
}
 
Example 27
Source Project: SparkDemo   Source File: JavaBucketizerExample.java    License: MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaBucketizerExample")
    .getOrCreate();

  // $example on$
  double[] splits = {Double.NEGATIVE_INFINITY, -0.5, 0.0, 0.5, Double.POSITIVE_INFINITY};

  List<Row> data = Arrays.asList(
    RowFactory.create(-999.9),
    RowFactory.create(-0.5),
    RowFactory.create(-0.3),
    RowFactory.create(0.0),
    RowFactory.create(0.2),
    RowFactory.create(999.9)
  );
  StructType schema = new StructType(new StructField[]{
    new StructField("features", DataTypes.DoubleType, false, Metadata.empty())
  });
  Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

  Bucketizer bucketizer = new Bucketizer()
    .setInputCol("features")
    .setOutputCol("bucketedFeatures")
    .setSplits(splits);

  // Transform original data into its bucket index.
  Dataset<Row> bucketedData = bucketizer.transform(dataFrame);

  System.out.println("Bucketizer output with " + (bucketizer.getSplits().length-1) + " buckets");
  bucketedData.show();
  // $example off$

  spark.stop();
}
 
Example 28
Source Project: SparkDemo   Source File: JavaVectorAssemblerExample.java    License: MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaVectorAssemblerExample")
    .getOrCreate();

  // $example on$
  StructType schema = createStructType(new StructField[]{
    createStructField("id", IntegerType, false),
    createStructField("hour", IntegerType, false),
    createStructField("mobile", DoubleType, false),
    createStructField("userFeatures", new VectorUDT(), false),
    createStructField("clicked", DoubleType, false)
  });
  Row row = RowFactory.create(0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0);
  Dataset<Row> dataset = spark.createDataFrame(Arrays.asList(row), schema);

  VectorAssembler assembler = new VectorAssembler()
    .setInputCols(new String[]{"hour", "mobile", "userFeatures"})
    .setOutputCol("features");

  Dataset<Row> output = assembler.transform(dataset);
  System.out.println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column " +
      "'features'");
  output.select("features", "clicked").show(false);
  // $example off$

  spark.stop();
}
 
Example 29
Source Project: SparkDemo   Source File: JavaMinHashLSHExample.java    License: MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaMinHashLSHExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(0, Vectors.sparse(6, new int[]{0, 1, 2}, new double[]{1.0, 1.0, 1.0})),
    RowFactory.create(1, Vectors.sparse(6, new int[]{2, 3, 4}, new double[]{1.0, 1.0, 1.0})),
    RowFactory.create(2, Vectors.sparse(6, new int[]{0, 2, 4}, new double[]{1.0, 1.0, 1.0}))
  );

  StructType schema = new StructType(new StructField[]{
    new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("keys", new VectorUDT(), false, Metadata.empty())
  });
  Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

  MinHashLSH mh = new MinHashLSH()
    .setNumHashTables(1)
    .setInputCol("keys")
    .setOutputCol("values");

  MinHashLSHModel model = mh.fit(dataFrame);
  model.transform(dataFrame).show();
  // $example off$

  spark.stop();
}
 
Example 30
Source Project: rdf2x   Source File: InstanceRelationWriterTest.java    License: Apache License 2.0
private DataFrame getTestRelations() {
    List<Row> rows = new ArrayList<>();

    rows.add(RowFactory.create(
            uriIndex.getIndex("http://example.com/knows"),
            uriIndex.getIndex("http://example.com/a"),
            1L,
            uriIndex.getIndex("http://example.com/b"),
            3L
    ));

    rows.add(RowFactory.create(
            uriIndex.getIndex("http://example.com/likes"),
            uriIndex.getIndex("http://example.com/a"),
            2L,
            uriIndex.getIndex("http://example.com/b"),
            3L
    ));

    return sql.createDataFrame(rows, new StructType()
            .add("predicateIndex", DataTypes.IntegerType, false)
            .add("fromTypeIndex", DataTypes.IntegerType, false)
            .add("fromID", DataTypes.LongType, false)
            .add("toTypeIndex", DataTypes.IntegerType, false)
            .add("toID", DataTypes.LongType, false)
    );
}