Java Code Examples for org.apache.spark.sql.Dataset#count()

The following examples show how to use org.apache.spark.sql.Dataset#count(). Each example is taken from an open source project; the source file, project, and license are noted above each snippet.
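Before the project examples, here is a minimal, self-contained sketch of the call itself. The file name and columns (people.csv, age) are hypothetical; the point is simply that count() is an action that triggers a Spark job and returns the number of rows as a long.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class CountExample {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local[*]")
                .appName("CountExample")
                .getOrCreate();

        // Hypothetical input: a CSV file with a header and a numeric "age" column.
        Dataset<Row> people = spark.read()
                .option("header", "true")
                .option("inferSchema", "true")
                .csv("people.csv");

        long total = people.count();                      // counts all rows
        long adults = people.filter("age >= 18").count(); // counts rows matching a predicate

        System.out.println("Total rows: " + total + ", adults: " + adults);
        spark.stop();
    }
}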
Example 1
Source File: DataFilterStep.java    From bpmn.ai with BSD 3-Clause "New" or "Revised" License
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataset, Map<String, Object> parameters, SparkRunnerConfig config) {

    if (parameters == null || parameters.size() == 0) {
        BpmnaiLogger.getInstance().writeWarn("No parameters found for the DataFilterStep");
        return dataset;
    }

    String query = (String) parameters.get("query");
    BpmnaiLogger.getInstance().writeInfo("Filtering data with filter query: " + query + ".");
    dataset = dataset.filter(query);

    // Cache before counting so the filtered data is not recomputed when the returned dataset is used later.
    dataset.cache();
    if (dataset.count() == 0) {
        BpmnaiLogger.getInstance().writeInfo("Filtering resulted in zero lines of data. Aborting. Please check your filter query.");
        System.exit(1);
    }

    return dataset;
}
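The count() above is used only to detect an empty result. On Spark 2.4 and later, Dataset#isEmpty() answers the emptiness question without counting every row; a minimal sketch of that alternative (the helper class is illustrative, not part of bpmn.ai):

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

final class EmptinessCheck {
    private EmptinessCheck() {}

    /** True if the dataset has no rows; avoids a full count (equivalent to takeAsList(1).isEmpty()). */
    static boolean hasNoRows(Dataset<Row> dataset) {
        return dataset.isEmpty(); // available since Spark 2.4
    }
}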
 
Example 2
Source File: TestSuite.java    From stocator with Apache License 2.0
public void test14(SparkSession spark, Dataset<Row> schemaFlights, String containerOut, String type)
    throws Exception {
  System.out.println("*********************************");
  System.out.println("T14: Append mode " + containerOut);
  String o1 = containerOut + "myData";
  try {
    createAppendObject("T14 - first append", schemaFlights, o1, type);
    long baseCount = schemaFlights.count();
    System.out
        .println("***T14-1 : Reading " + o1 + " from " + containerOut + ", base unit " + baseCount + " type " + type);
    readAndTest("T14-1-" + type, type, o1, spark, baseCount, 1);
    createAppendObject("T14 - second append", schemaFlights, o1, type);
    baseCount = schemaFlights.count();
    System.out
        .println("***T14-2 : Reading " + o1 + " from " + containerOut + ", base unit " + baseCount + " type " + type);
    readAndTest("T14-2-" + type, type, o1, spark, baseCount, 2);
  } finally {
    deleteData(o1, spark.sparkContext().hadoopConfiguration(), true);
  }
}
 
Example 3
Source File: LoopStep.java    From envelope with Apache License 2.0
private List<Row> getRowsFromStep(Set<Step> steps) {
  String stepName = config.getString(STEP_PROPERTY);
  Optional<Step> optionalStep = StepUtils.getStepForName(stepName, steps);
  
  if (!optionalStep.isPresent()) {
    throw new RuntimeException("Step source for loop step '" + getName() + "' does not exist.");
  }
  
  Step step = optionalStep.get();
  
  if (!(step instanceof DataStep)) {
    throw new RuntimeException("Step source for loop step '" + getName() + "' is not a data step.");
  }
  
  Dataset<Row> stepRows = ((DataStep)step).getData();
  
  if (stepRows.count() > 1000) {
    throw new RuntimeException("Step source for loop step '" + getName() + "' can not provide more than 1000 values to loop over");
  }

  return stepRows.collectAsList();
}
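When only an upper bound matters, as in this loop guard, counting the entire dataset can be avoided by counting a limited subset, so at most the threshold plus one rows are counted. A minimal sketch of that variation (the helper class is illustrative, not part of Envelope):

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

final class ThresholdCheck {
    private ThresholdCheck() {}

    /** True if the dataset has more than maxRows rows, counting at most maxRows + 1 of them. */
    static boolean exceeds(Dataset<Row> dataset, int maxRows) {
        return dataset.limit(maxRows + 1).count() > maxRows;
    }
}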
 
Example 4
Source File: TestSuite.java    From stocator with Apache License 2.0
public void test13(SparkSession spark, Dataset<Row> schemaFlights, String containerOut, String type)
    throws Exception {
  System.out.println("*********************************");
  System.out.println("T13: Going to create nested structures and check globber on " + containerOut);
  String o1 = containerOut + "Dir/result/jobid=21274501-57a1-4690-9a84-9d2294fcf64d";
  try {
    if (dataCreate) {
      createObject("T13", schemaFlights, o1, type);
    }
    long baseCount = schemaFlights.count();
    String path = containerOut + "Dir/result/jobid=21274501-57a1-4690-9a84-9d2294fcf64";

    System.out.println(
        "***T13-1 : Reading " + path + " from " + containerOut + ", base unit " + baseCount + " type " + type);
    readAndTest("T13-1-" + type, type, path, spark, baseCount, 1);
  } finally {
    deleteData(o1, spark.sparkContext().hadoopConfiguration(), dataCreate);
  }
}
 
Example 5
Source File: CountDatasetRule.java    From envelope with Apache License 2.0
@Override
public Dataset<Row> check(Dataset<Row> dataset, Map<String, Dataset<Row>> stepDependencies) {
  if (isDependency()) {
    Dataset<Row> expectedDependency = stepDependencies.get(dependency);
    if (expectedDependency.count() == 1 && expectedDependency.schema().fields().length == 1
        && expectedDependency.schema().apply(0).dataType() == DataTypes.LongType) {
      expected = expectedDependency.collectAsList().get(0).getLong(0);
    } else {
      throw new RuntimeException("Step dependency for count rule must have one row with a single field of long type");
    }
  }
  if (expected < 0) {
    throw new RuntimeException("Failed to determine expected count: must be specified either as literal or step dependency");
  }
  return dataset.groupBy().count().map(new CheckCount(expected, name), RowEncoder.apply(SCHEMA));
}
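A detail worth noting in this rule: dataset.count() is an action that returns a long to the driver, whereas dataset.groupBy().count() is a transformation that yields a one-row Dataset<Row> with a single "count" column, which is what lets the rule map the result through a RowEncoder. A minimal sketch of the distinction (class and application names are hypothetical):

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class CountFormsExample {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local[*]")
                .appName("CountFormsExample")
                .getOrCreate();

        Dataset<Row> df = spark.range(5).toDF("id");

        long n = df.count();                          // action: 5 is returned to the driver
        Dataset<Row> counted = df.groupBy().count();  // transformation: one row with a "count" column

        counted.show();
        System.out.println("count() returned " + n);
        spark.stop();
    }
}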
 
Example 6
Source File: ConceptMaps.java    From bunsen with Apache License 2.0
@Override
public ConceptMaps withConceptMaps(Dataset<Row> conceptMaps) {

  Dataset<UrlAndVersion> newMembers = getUrlAndVersions(conceptMaps);

  if (hasDuplicateUrlAndVersions(newMembers) || conceptMaps.count() != newMembers.count()) {

    throw new IllegalArgumentException(
        "Cannot add concept maps having duplicate conceptMapUri and conceptMapVersion");
  }

  // Remove the concept contents for persistence. This is most easily done in the ConceptMap
  // object by setting the group to an empty list.
  JavaRDD<Row> withoutConceptsRdd =
      conceptMaps.javaRDD().map(new ConceptMapRemover());

  Dataset<Row> withoutConcepts = spark.createDataFrame(withoutConceptsRdd,
      conceptMapRowConverter.getSchema());

  Dataset<Mapping> newMappings = conceptMaps.flatMap((Row row) -> expandMappingsIterator(row),
      MAPPING_ENCODER);

  return withConceptMaps(withoutConcepts, newMappings);
}
 
Example 7
Source File: ValueSets.java    From bunsen with Apache License 2.0
/**
 * Returns a new ValueSets instance that includes the given value sets.
 *
 * @param valueSets the value sets to add to the returned collection.
 * @return a new ValueSets instance with the added value sets.
 */
@Override
public ValueSets withValueSets(Dataset<Row> valueSets) {

  Dataset<UrlAndVersion> newMembers = getUrlAndVersions(valueSets);

  // Ensure that there are no duplicates among the value sets
  if (hasDuplicateUrlAndVersions(newMembers) || valueSets.count() != newMembers.count()) {

    throw new IllegalArgumentException(
        "Cannot add value sets having duplicate valueSetUri and valueSetVersion");
  }

  JavaRDD<Row> valueSetsRdd = valueSets.javaRDD();

  // The value set concepts will be stored in the values table for persistence, so we remove
  // them from the individual value sets. This can be done most easily by setting concepts to an
  // empty list.
  JavaRDD<Row> withoutConceptsRdd = valueSetsRdd.map(new RemoveConcepts(fhirVersion));

  Dataset<Row> withoutConcepts = spark.createDataFrame(withoutConceptsRdd,
      valueSetRowConverter.getSchema());

  JavaRDD<Value> newValuesRdd = valueSetsRdd.flatMap(new ExtractValues(fhirVersion));

  Dataset<Value> newValues = spark.createDataset(newValuesRdd.rdd(), getValueEncoder());

  return withValueSets(withoutConcepts, newValues);
}
 
Example 8
Source File: TestPerformanceRegression.java    From chronix.spark with Apache License 2.0
public static void main(String[] args) throws SolrServerException, IOException {

        ChronixSparkLoader loader = new ChronixSparkLoader();

        ChronixSparkContext chronixSparkContext = loader.createChronixSparkContext();
        SQLContext sqlContext = new SQLContext(chronixSparkContext.getSparkContext());

        // BENCHMARK START ...............................
        long start = System.currentTimeMillis();
        for (int i = 0; i < LOOPS; i++) {

            //Load data into ChronixRDD
            ChronixRDD rdd = loader.createChronixRDD(chronixSparkContext);

            //Some actions
            double mean = rdd.mean();
            double approxMean = rdd.approxMean();
            long observationCount = rdd.countObservations();
            double max = rdd.max();
            double min = rdd.min();
            Iterator<MetricTimeSeries> it = rdd.iterator();
            while (it.hasNext()) {
                MetricTimeSeries mts = it.next();
                System.out.print(".");
            }

            //DataFrame operations
            Dataset<MetricObservation> ds = rdd.toObservationsDataset(sqlContext);
            ds.count();
        }
        long stop = System.currentTimeMillis();
        // BENCHMARK STOP ...................................
        System.out.println("\nBenchmark duration: " + (stop - start) + " ms");

        chronixSparkContext.getSparkContext().close();
    }
 
Example 9
Source File: TestSuite.java    From stocator with Apache License 2.0
private void countAndCompare(Dataset<Row> inSpark, long readRecords, String msg) throws Exception {
  long totalInSpark = inSpark.count();
  if (totalInSpark != readRecords) {
    System.out.println("*********************************");
    System.out.println(msg + ": Records that were written into the object store don't match");
    System.out.println(msg + ": Read from object store: " + readRecords + ", expected: " + totalInSpark);
    throw new Exception(msg + ": Read from object store: " + readRecords + ", expected: " + totalInSpark);
  } else {
    System.out.println(
        msg + " Completed successfully. Read from object store: " + readRecords + ", expected: " + totalInSpark);
  }
}
 
Example 10
Source File: DecisionStep.java    From envelope with Apache License 2.0
private boolean evaluateStepByKeyDecision(Set<Step> steps) {
  Optional<Step> optionalStep = StepUtils.getStepForName(stepByKeyStepName, steps);
  
  if (!optionalStep.isPresent()) {
    throw new RuntimeException("Unknown decision step's key step: " + stepByValueStepName);
  }
  
  if (!(optionalStep.get() instanceof DataStep)) {
    throw new RuntimeException("Decision step's key step is not a data step: " + optionalStep.get().getName());
  }
  
  Dataset<Row> keyDataset = ((DataStep)optionalStep.get()).getData();
  
  if (keyDataset.schema().fields().length != 2 ||
      keyDataset.schema().fields()[0].dataType() != DataTypes.StringType ||
      keyDataset.schema().fields()[1].dataType() != DataTypes.BooleanType)
  {
    throw new RuntimeException("Decision step's key step must contain a string column and then a boolean column");
  }
  
  String keyColumnName = keyDataset.schema().fieldNames()[0];
  String whereClause = keyColumnName + " = '" + stepByKeyKey + "'";
  Dataset<Row> decisionDataset = keyDataset.where(whereClause);
  
  if (decisionDataset.count() != 1) {
    throw new RuntimeException("Decision step's key step must contain a single record for the given key");
  }
  
  boolean decision = decisionDataset.collectAsList().get(0).getBoolean(1);
  
  return decision;
}
 
Example 11
Source File: ConceptMaps.java    From bunsen with Apache License 2.0
@Override
public ConceptMaps withConceptMaps(Dataset<ConceptMap> conceptMaps) {

  Dataset<UrlAndVersion> newMembers = getUrlAndVersions(conceptMaps);

  if (hasDuplicateUrlAndVersions(newMembers) || conceptMaps.count() != newMembers.count()) {

    throw new IllegalArgumentException(
        "Cannot add concept maps having duplicate conceptMapUri and conceptMapVersion");
  }

  // Remove the concept contents for persistence. This is most easily done in the ConceptMap
  // object by setting the group to an empty list.
  Dataset<ConceptMap> withoutConcepts = conceptMaps
      .map((MapFunction<ConceptMap,ConceptMap>) conceptMap -> {

        // Remove the elements rather than the groups to preserve the
        // "unmapped" structure in a group that can refer to other
        // concept maps.
        ConceptMap withoutElements = conceptMap.copy();

        List<ConceptMapGroupComponent> updatedGroups = new ArrayList<>();

        for (ConceptMapGroupComponent group: withoutElements.getGroup()) {

          group.setElement(new ArrayList<>());
          updatedGroups.add(group);
        }

        withoutElements.setGroup(updatedGroups);

        return withoutElements;
      }, CONCEPT_MAP_ENCODER);

  Dataset<Mapping> newMappings = conceptMaps.flatMap(ConceptMaps::expandMappingsIterator,
      MAPPING_ENCODER);

  return withConceptMaps(withoutConcepts, newMappings);
}
 
Example 12
Source File: ValueSets.java    From bunsen with Apache License 2.0
/**
 * Returns a new ValueSets instance that includes the given value sets.
 *
 * @param valueSets the value sets to add to the returned collection.
 * @return a new ValueSets instance with the added value sets.
 */
@Override
public ValueSets withValueSets(Dataset<ValueSet> valueSets) {

  Dataset<UrlAndVersion> newMembers = getUrlAndVersions(valueSets);

  // Ensure that there are no duplicates among the value sets
  if (hasDuplicateUrlAndVersions(newMembers) || valueSets.count() != newMembers.count()) {

    throw new IllegalArgumentException(
        "Cannot add value sets having duplicate valueSetUri and valueSetVersion");
  }

  // The value set concepts will be stored in the values table for persistence, so we remove
  // them from the individual value sets. This can be done most easily by setting concepts to an
  // empty list.
  Dataset<ValueSet> withoutConcepts = valueSets.map((MapFunction<ValueSet,ValueSet>) valueSet -> {
    ValueSet valueSetWithoutConcepts = valueSet.copy();

    List<ConceptSetComponent> updatedInclusions = new ArrayList<>();

    for (ConceptSetComponent inclusion: valueSet.getCompose().getInclude()) {

      ConceptSetComponent inclusionWithoutConcepts = inclusion.copy();

      inclusionWithoutConcepts.setConcept(new ArrayList<>());
      updatedInclusions.add(inclusionWithoutConcepts);
    }

    valueSetWithoutConcepts.getCompose().setInclude(updatedInclusions);

    return valueSetWithoutConcepts;
  }, VALUE_SET_ENCODER);

  Dataset<Value> newValues = valueSets.flatMap(ValueSets::expandValuesIterator,
      getValueEncoder());

  return withValueSets(withoutConcepts, newValues);
}
 
Example 13
Source File: MetroAnalysisJob.java    From hui-bigdata-spark with Apache License 2.0
/**
 * Core data processing logic.
 *
 * @param sparkContext the Spark context
 * @param inPutPath    input file path
 * @param outPutPath   output file path
 */
private void deal(JavaSparkContext sparkContext, String inPutPath, String outPutPath) {
    SparkJobUtil.checkFileExists(inPutPath);

    SQLContext sqlContext = new SQLContext(sparkContext);
    //sqlContext.setConf("spark.sql.parquet.binaryAsString","true");

    // create a temporary table over a snapshot of the JSON input
    Dataset<Row> dataset = sqlContext.read().json(inPutPath);
    dataset.registerTempTable("hui_metro_testjson");
    dataset.show(10);

    Dataset<Row> resultFrame = sqlContext.sql(SQL);

    if (resultFrame.count() > 0) {
        resultFrame.repartition(3).write()
                .mode(SaveMode.Append).json(outPutPath);
    }

    resultFrame.show(10);

    // write the results to the database
    MySQLJdbcConfig jdbcConfig = new MySQLJdbcConfig();
    jdbcConfig.init();
    resultFrame.write().mode("append")
            .jdbc(jdbcConfig.getUrl(), "hui_metro_test", jdbcConfig.getConnectionProperties());
}
 
Example 14
Source File: SparkCubingJobTest.java    From kylin-on-parquet-v2 with Apache License 2.0
private void queryTest(CubeSegment segment) {
    // Result cmp: Parquet vs Spark SQL
    for (LayoutEntity entity : MetadataConverter.extractEntityList2JavaList(segment.getCubeInstance())) {
        // Parquet result
        Dataset<Row> layoutDataset = StorageFactory.createEngineAdapter(new IStorageAware() { // Hardcode
            @Override
            public int getStorageType() {
                return 4;
            }
        }, NSparkCubingEngine.NSparkCubingStorage.class)
                .getFrom(PathManager.getParquetStoragePath(segment.getConfig(), segment.getCubeInstance().getName(),
                        segment.getName(), segment.getStorageLocationIdentifier(), String.valueOf(entity.getId())), ss);

        Set<Integer> measures = new HashSet<Integer>();
        Set<Integer> rowKeys = entity.getOrderedDimensions().keySet();
        for (Map.Entry<Integer, FunctionDesc> entry : entity.getOrderedMeasures().entrySet()) {
            String type = entry.getValue().returnType().dataType();
            if (type.equals("hllc") || type.equals("topn") || type.equals("percentile")) {
                continue;
            }
            measures.add(entry.getKey());
        }
        layoutDataset = layoutDataset.select(NSparkCubingUtil.getColumns(rowKeys, measures))
                .sort(NSparkCubingUtil.getColumns(rowKeys));
        System.out.println("Query cuboid ------------ " + entity.getId());
        layoutDataset = dsConvertToOriginal(layoutDataset, entity);
        layoutDataset.show(10);

        // Spark sql
        Dataset<Row> ds = initFlatTable(segment);
        if (!entity.isTableIndex()) {
            ds = CuboidAggregator.agg(ss, ds, entity.getOrderedDimensions().keySet(), entity.getOrderedMeasures(),
                    null, true);
        }
        Dataset<Row> exceptDs = ds.select(NSparkCubingUtil.getColumns(rowKeys, measures))
                .sort(NSparkCubingUtil.getColumns(rowKeys));
        System.out.println("Spark sql ------------ ");
        exceptDs.show(10);
        long layoutCount = layoutDataset.count();
        long expectCount = exceptDs.count();
        Assert.assertEquals(layoutCount, expectCount);

        String msg = SparkQueryTest.checkAnswer(layoutDataset, exceptDs, false);
        Assert.assertNull(msg);
    }
}
 
Example 15
Source File: GeoWaveSparkSQLIT.java    From geowave with Apache License 2.0
@Test
public void testCreateDataFrame() throws Exception {
  // Set up Spark
  final SparkSession session = SparkTestEnvironment.getInstance().getDefaultSession();
  final SparkContext context = session.sparkContext();

  // ingest test points
  TestUtils.testLocalIngest(dataStore, DimensionalityType.SPATIAL, HAIL_SHAPEFILE_FILE, 1);

  final SqlQueryRunner queryRunner = new SqlQueryRunner();
  queryRunner.setSparkSession(session);

  try {
    // Load RDD from datastore, no filters
    final GeoWaveRDD newRDD = GeoWaveRDDLoader.loadRDD(context, dataStore, new RDDOptions());
    final JavaPairRDD<GeoWaveInputKey, SimpleFeature> javaRdd = newRDD.getRawRDD();

    final long count = javaRdd.count();
    LOGGER.warn("DataStore loaded into RDD with " + count + " features.");

    queryRunner.addInputStore(dataStore, null, "features");

    final String bbox = "POLYGON ((-94 34, -93 34, -93 35, -94 35, -94 34))";

    queryRunner.setSql(
        "SELECT * FROM features WHERE GeomContains(GeomFromWKT('" + bbox + "'), geom)");

    Dataset<Row> results = queryRunner.run();
    final long containsCount = results.count();
    LOGGER.warn("Got " + containsCount + " for GeomContains test");

    queryRunner.setSql(
        "SELECT * FROM features WHERE GeomWithin(geom, GeomFromWKT('" + bbox + "'))");
    results = queryRunner.run();
    final long withinCount = results.count();
    LOGGER.warn("Got " + withinCount + " for GeomWithin test");

    Assert.assertTrue("Within and Contains counts should be equal", containsCount == withinCount);

    // Test the output writer
    final SqlResultsWriter sqlResultsWriter = new SqlResultsWriter(results, dataStore);

    sqlResultsWriter.writeResults("sqltest");

    queryRunner.removeAllStores();

    // Test other spatial UDFs
    final String line1 = "LINESTRING(0 0, 10 10)";
    final String line2 = "LINESTRING(0 10, 10 0)";
    queryRunner.setSql(
        "SELECT GeomIntersects(GeomFromWKT('" + line1 + "'), GeomFromWKT('" + line2 + "'))");
    Row result = queryRunner.run().head();

    final boolean intersect = result.getBoolean(0);
    LOGGER.warn("GeomIntersects returned " + intersect);

    Assert.assertTrue("Lines should intersect", intersect);

    queryRunner.setSql(
        "SELECT GeomDisjoint(GeomFromWKT('" + line1 + "'), GeomFromWKT('" + line2 + "'))");
    result = queryRunner.run().head();

    final boolean disjoint = result.getBoolean(0);
    LOGGER.warn("GeomDisjoint returned " + disjoint);

    Assert.assertFalse("Lines should not be disjoint", disjoint);

  } catch (final Exception e) {
    e.printStackTrace();
    TestUtils.deleteAll(dataStore);
    Assert.fail(
        "Error occurred while testing a bounding box query of spatial index: '"
            + e.getLocalizedMessage()
            + "'");
  }

  // Clean up
  TestUtils.deleteAll(dataStore);
}
 
Example 16
Source File: PdbToUniProt.java    From mmtf-spark with Apache License 2.0
/**
 * Returns an up-to-date dataset of PDB to UniProt 
 * residue-level mappings for a list of ids.
 * Valid ids are either a list of pdbIds (e.g. 1XYZ) or pdbId.chainId (e.g., 1XYZ.A).
 * This method reads a cached file and downloads updates.
 * 
 * @param ids list of pdbIds or pdbId.chainIds
 * @return dataset of PDB to UniProt residue-level mappings
 * @throws IOException
 */
public static Dataset<Row> getResidueMappings(List<String> ids) throws IOException {
    SparkSession spark = SparkSession.builder().getOrCreate();
    
    boolean withChainId = ids.size() > 0 && ids.get(0).length() > 4;
    
    // create dataset of ids
    Dataset<Row> df = spark.createDataset(ids, Encoders.STRING()).toDF("id");
    // get cached mappings
    Dataset<Row> mapping = getCachedResidueMappings();  
    
    // dataset for non-cached mappings
    Dataset<Row> notCached = null;
    // dataset with PDB Ids to be downloaded
    Dataset<Row> toDownload = null; 
    
    if (withChainId) {
        // get subset of requested ids from cached dataset
        mapping = mapping.join(df, mapping.col("structureChainId").equalTo(df.col("id"))).drop("id");
        // get ids that are not in the cached dataset
        notCached = df.join(mapping, df.col("id").equalTo(mapping.col("structureChainId")), "left_anti").cache(); 
        // create dataset of PDB Ids to be downloaded
        toDownload = notCached.withColumn("id", col("id").substr(0, 4)).distinct().cache();
    } else {
        // get subset of requested ids from cached dataset
        mapping = mapping.withColumn("pdbId", col("structureChainId").substr(0, 4));
        mapping = mapping.join(df, mapping.col("pdbId").equalTo(df.col("id"))).drop("id");
        // create dataset of PDB Ids to be downloaded
        toDownload = df.join(mapping, df.col("id").equalTo(mapping.col("pdbId")), "left_anti").distinct().cache();
        mapping = mapping.drop("pdbId");
    }
    
    toDownload = toDownload.distinct().cache();
        
    // download data that are not in the cache
    if (toDownload.count() > 0) {
        Dataset<Row> unpData = getChainMappings().select("structureId").distinct();
        toDownload = toDownload.join(unpData, toDownload.col("id").equalTo(unpData.col("structureId"))).drop("structureId").cache();
        System.out.println("Downloading mapping for " + toDownload.count() + " PDB structures.");
        Dataset<Row> downloadedData = downloadData(toDownload);
  
        // since data are downloaded for all chains in structure, make sure to only include the requested chains.
        if (withChainId) {
            downloadedData = downloadedData.join(notCached, downloadedData.col("structureChainId").equalTo(notCached.col("id"))).drop("id");
        }
        mapping = mapping.union(downloadedData);
    }
    
    return mapping;
}
 
Example 17
Source File: AtpInteractionAnalysis.java    From mmtf-spark with Apache License 2.0
/**
 * @param args input arguments
 * @throws IOException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfFullPath();

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(AtpInteractionAnalysis.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);

    // filter by sequence identity subset
    int sequenceIdentity = 20;
    double resolution = 2.0;
    pdb = pdb.filter(new Pisces(sequenceIdentity, resolution));

    // find ATP interactions within 3 Angstroms
    GroupInteractionExtractor finder = new GroupInteractionExtractor("ATP", 3);
    Dataset<Row> interactions = finder.getDataset(pdb).cache();

    // TODO add a line to only analyze interactions
    // with the oxygens in the terminal phosphate group of ATP
    // (O1G, O2G, O3G)
    // Tip: Google SQL LIKE
    interactions = interactions.filter("atom1 LIKE('O%G')");

    // show the data schema of the dataset and some data
    interactions.printSchema();
    interactions.show(20);

    long n = interactions.count();
    System.out.println("# interactions: " + n);

    System.out.println("Top interacting groups");

    Dataset<Row> topGroups = interactions
            .groupBy("residue2")
            .count();

    topGroups
            .sort(col("count").desc()) // sort descending by count
            .show(10);

    System.out.println("Top interacting group/atoms types");

    Dataset<Row> topGroupsAndAtoms = interactions
            .groupBy("residue2", "atom2")
            .count();

    topGroupsAndAtoms
            .withColumn("frequency", col("count").divide(n)) // add column with frequency of occurrence
            .sort(col("frequency").desc()) // sort descending
            .show(10);

    long end = System.nanoTime();

    System.out.println("Time:     " + (end - start) / 1E9 + "sec.");

    sc.close();
}
 
Example 18
Source File: InteractionAnalysisAdvanced.java    From mmtf-spark with Apache License 2.0
/**
 * @param args no input arguments
 * @throws IOException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfFullPath();

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(InteractionAnalysisAdvanced.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);

    // get non-redundant subset
    pdb = pdb.filter(new Pisces(40, 2.5));

    // find Zinc interactions within 3 Angstroms
    GroupInteractionExtractor finder = new GroupInteractionExtractor("ZN", 3);
    Dataset<Row> interactions = finder.getDataset(pdb).cache();

    // show the data schema of the dataset and some data
    interactions.printSchema();
    interactions.show(20);

    long n = interactions.count();
    System.out.println("# interactions: " + n);

    System.out.println("Top interacting groups");

    Dataset<Row> topGroups = interactions
            .groupBy("residue2")
            .count();

    topGroups
            .sort(col("count").desc()) // sort descending by count
            .show(10);

    System.out.println("Top interacting group/atoms types");

    Dataset<Row> topGroupsAndAtoms = interactions
            .filter("element2 != 'C'") // exclude carbon interactions
            .groupBy("residue2", "atom2")
            .count();

    topGroupsAndAtoms
            .withColumn("frequency", col("count").divide(n)) // add column with frequency of occurrence
            .filter("frequency > 0.01") // filter out occurrences < 1 %
            .sort(col("frequency").desc()) // sort descending
            .show(20);

    // TODO print the top 10 interacting elements
    System.out.println("Top interacting elements");
    Dataset<Row> topElements = interactions
            .filter("element2 != 'C'") // exclude carbon interactions
            .groupBy("element2")
            .count();

    topElements.withColumn("frequency", col("count").divide(n))
            .filter("frequency > 0.01") // filter out occurrences < 1 %
            .sort(col("frequency").desc()) // sort descending
            .show(10);

    interactions
            .groupBy("element2")
            .avg("distance")
            .sort("avg(distance)")
            .show(10);

    // Aggregate multiple statistics
    // Note: import static org.apache.spark.sql.functions.* required!
    // e.g. org.apache.spark.sql.functions.avg
    // for a list of all available functions
    interactions
            .groupBy("element2")
            .agg(count("distance"), avg("distance"), min("distance"), max("distance"), kurtosis("distance"))
            .show(10);

    long end = System.nanoTime();

    System.out.println("Time:     " + (end - start) / 1E9 + "sec.");

    sc.close();
}
 
Example 19
Source File: DatasetFileConverter.java    From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws IOException {

        // process command line options (defaults are provided)
        CommandLine cmd = getCommandLine(args);
        String inputFile = cmd.getOptionValue("input-file");
        int partitions = Integer.parseInt(cmd.getOptionValue("partitions", "0"));
        String fileFormat = cmd.getOptionValue("file-format", "");
        String compressionCodec = cmd.getOptionValue("compression-codec", "");

        SparkSession spark = SparkSession.builder().master("local[*]")
                .appName(DatasetFileConverter.class.getSimpleName())
                .getOrCreate();

        spark.conf().set("spark.sql.orc.impl", "native");

        // encode options in file name
        String outputFile = getFileExtension(inputFile);
        if (partitions > 1) {
            outputFile += "." + partitions;
        }
        outputFile += "." + fileFormat;
        if (!compressionCodec.isEmpty()) {
            outputFile += "." + compressionCodec;
        }

        System.out.println("Input file : " + inputFile);
        System.out.println("Output file: " + outputFile);

        long t1 = System.nanoTime();

        Dataset<Row> dataset = null;
        
        // read dataset
        if (inputFile.contains("orc")) {
            dataset = spark.read().format("orc").load(inputFile);
        } else if (inputFile.contains("csv")) {
            dataset = spark.read().format("csv").load(inputFile);
        } else {
            dataset = spark.read().format("parquet").load(inputFile);
        }
        
        long records = dataset.count();

        // write reformatted dataset
        saveDataset(dataset, partitions, fileFormat, compressionCodec, outputFile);

        long t2 = System.nanoTime();

        System.out.println(records + " records reformatted in " + (t2-t1)/1E9 + " sec.");

        spark.stop();
    }
 
Example 20
Source File: CreatePdbToUniProtMappingFile.java    From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws IOException, InterruptedException {
    
    // process command line options (defaults are provided)
    CommandLine cmd = getCommandLine(args);
    String outputFile = cmd.getOptionValue("output-file");
    boolean build = cmd.hasOption("build");
    boolean update = cmd.hasOption("update");
    
    // these default options for fileFormat and compressionCodec 
    // provide the best compression
    String fileFormat = cmd.getOptionValue("file-format", "orc");
    String compressionCodec = cmd.getOptionValue("compression-codec", "lzo");
   
    SparkSession spark = SparkSession.builder()
            .master("local[*]")
            .appName(CreatePdbToUniProtMappingFile.class.getSimpleName())
            .getOrCreate();
    
    String timeStamp = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date());

    long t1 = System.nanoTime();
    
    String dirName = outputFile + "_" + timeStamp + "_tmp";
    String fileName = outputFile + "_" + timeStamp + "." + fileFormat + "." + compressionCodec;
    
    if (build) {
        // create a new mapping file from scratch
        PdbToUniProt.buildDataset(dirName, "orc", "lzo");
    } else if (update) {
        // create an updated mapping file from the cached version
        PdbToUniProt.updateDataset(dirName, "orc", "lzo");
    }

    long t2 = System.nanoTime();
    System.out.println("Time to build/update dataset: " + (t2-t1)/1E9 + " sec.");
           
    // By default, spark creates a directory of files. 
    // For convenience, coalesce the data into a single file.
    Dataset<Row> ds = spark.read().orc(dirName);
    long count = ds.count();
    
    int partitions = 1;
    DatasetFileConverter.saveDataset(ds, partitions, fileFormat, compressionCodec, fileName);
    FileUtils.deleteDirectory(new File(dirName));
    
    System.out.println(count + " records saved to: " + fileName);
    
    long t3 = System.nanoTime();
    System.out.println("Time to reformat data: " + (t3-t2)/1E9 + " sec.");

    spark.stop();
}