Java Code Examples for org.apache.spark.sql.Dataset#show()
The following examples show how to use org.apache.spark.sql.Dataset#show(). Each example is taken from an open-source project; the source file, project, and license are noted above the code.
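As a quick reference before the examples: Dataset.show() prints the first rows of a Dataset to the console in tabular form, and it has overloads for limiting the number of rows and for disabling the default 20-character truncation of long cell values. The sketch below is not taken from any of the projects listed here; it is a minimal standalone illustration, and the input path data/people.json is an assumed sample file.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ShowDemo {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("ShowDemo")
        .master("local[*]")
        .getOrCreate();

    // assumed sample file with one JSON object per line
    Dataset<Row> df = spark.read().json("data/people.json");

    df.show();          // first 20 rows, long values truncated to 20 characters
    df.show(5);         // first 5 rows
    df.show(50, false); // first 50 rows, without truncating long values

    spark.stop();
  }
}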
Example 1
Source File: TestWebServiceGet.java From quetzal with Eclipse Public License 2.0
public static void main( String[] args ) {
    // SparkConf conf = new SparkConf().setAppName("App-mt").setMaster("local[2]");
    // SparkConf conf = new SparkConf().setAppName("App-mt").setMaster("spark://Kavithas-MBP.home:7077");
    SparkConf conf = new SparkConf().setAppName("App-mt").setMaster("spark://kavithas-mbp.watson.ibm.com:7077");
    JavaSparkContext sc = new JavaSparkContext(conf);
    HiveContext sqlContext = new HiveContext(sc.sc());
    Dataset urls = sqlContext.read().json("/tmp/urls.json");

    urls.registerTempTable("urls");
    Dataset<Row> temp = sqlContext.sql("select * from urls");
    temp.show();

    sqlContext.sql("add jar /tmp/quetzal.jar");
    sqlContext.sql("create temporary function webservice as 'com.ibm.research.rdf.store.utilities.WebServiceGetUDTF'");
    Dataset<Row> drugs = sqlContext.sql("select webservice(\"drug,id,action\", \"url\", \"\", \"GET\", \"xs=http://www.w3.org/2001/XMLSchema\", \"//row\",\"drug\",\"./drug\","
            + " \"<string>\", \"id\", \"./id\",\"<string>\", \"action\", \"./action\", \"<string>\", url) as (drug, drug_typ, id, id_typ, action, action_typ) from urls");
    drugs.show();
    System.out.println("Num rows:" + drugs.count());
}
Example 2
Source File: CsvToDatasetCompatibleWithSparkv1x.java From net.jgp.labs.spark with Apache License 2.0
private void start() {
    SparkSession spark = SparkSession.builder()
        .appName("CSV to Dataset")
        .master("local")
        .getOrCreate();

    String filename = "data/tuple-data-file.csv";
    Dataset<Row> df = spark.read().format("csv")
        .option("inferSchema", "true")
        .option("header", "false")
        .load(filename);
    df.show();

    // To ensure compatibility between Spark 2.0.0 and Spark 1.6.x
    int count = df.columns().length;
    for (int i = 0; i < count; i++) {
        String oldColName = "_c" + i;
        String newColName = "C" + i;
        df = df.withColumn(newColName, df.col(oldColName)).drop(oldColName);
    }
    df.show();
}
Example 3
Source File: JavaBinarizerExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaBinarizerExample")
        .getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(
        RowFactory.create(0, 0.1),
        RowFactory.create(1, 0.8),
        RowFactory.create(2, 0.2)
    );
    StructType schema = new StructType(new StructField[]{
        new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
        new StructField("feature", DataTypes.DoubleType, false, Metadata.empty())
    });
    Dataset<Row> continuousDataFrame = spark.createDataFrame(data, schema);

    Binarizer binarizer = new Binarizer()
        .setInputCol("feature")
        .setOutputCol("binarized_feature")
        .setThreshold(0.5);

    Dataset<Row> binarizedDataFrame = binarizer.transform(continuousDataFrame);

    System.out.println("Binarizer output with Threshold = " + binarizer.getThreshold());
    binarizedDataFrame.show();
    // $example off$

    spark.stop();
}
Example 4
Source File: SecondaryStructureSegmentDemo.java From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws IOException {

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(CustomReportDemo.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    List<String> pdbIds = Arrays.asList("1STP"); // single protein chain
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadReducedMmtfFiles(pdbIds, sc).cache();

    pdb = pdb
        .flatMapToPair(new StructureToPolymerChains())
        .filter(new ContainsLProteinChain());

    int segmentLength = 25;
    Dataset<Row> ds = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength);

    // show the top 50 rows of this dataset
    ds.show(50, false);

    long end = System.nanoTime();

    System.out.println("Time: " + TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");

    sc.close();
}
Example 5
Source File: PolymerInteractionFingerprintDemo.java From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws Exception {
    SparkSession spark = SparkSession.builder().master("local[*]")
            .appName(PolymerInteractionFingerprintDemo.class.getSimpleName()).getOrCreate();
    JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

    List<String> pdbIds = Arrays.asList("1OHR");
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc);

    // find ASP-ARG salt bridges
    InteractionFilter filter = new InteractionFilter();
    filter.setDistanceCutoff(3.5);
    filter.setMinInteractions(1);
    filter.setQueryGroups(true, "ASP");
    filter.setQueryAtomNames(true, "OD1", "OD2");
    filter.setTargetGroups(true, "ARG");
    filter.setTargetAtomNames(true, "NH1", "NH2");

    Dataset<Row> interactions = InteractionFingerprinter.getPolymerInteractions(pdb, filter).cache();
    interactions.show(false);

    sc.close();
}
Example 6
Source File: QuotedCsvWithHeaderToDataset.java From net.jgp.labs.spark with Apache License 2.0
private void start() {
    SparkSession spark = SparkSession.builder().appName("CSV to Dataset")
        .master("local").getOrCreate();

    String filename = "data/csv-quoted.txt";
    Dataset<Row> df = spark.read().option("inferSchema", "true").option(
        "header", "true").csv(filename);
    df.show();
    df.printSchema();
}
Example 7
Source File: DataFrameOperation.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("DataFrameOperation").setMaster("local");
    JavaSparkContext sc = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(sc);

    // Read the data source into a DataFrame, which can be thought of as a table
    // holding both the data and its schema information
    Dataset<Row> dataset = sqlContext.read().json(Constant.LOCAL_FILE_PREX + "/data/resources/people.json");

    // Print the table in a formatted way
    dataset.show();

    // Print the schema (structure metadata)
    dataset.printSchema();

    // Select columns and compute on them
    dataset.select("name").show();
    dataset.select(dataset.col("name"), dataset.col("age").plus(1)).show();

    // Filter rows
    dataset.filter(dataset.col("age").gt(20)).show();

    // Group by a column and count
    dataset.groupBy("age").count().show();

    sc.close();
}
Example 8
Source File: MetroAnalysisJob.java From hui-bigdata-spark with Apache License 2.0
/**
 * Core data processing logic
 * @param sparkContext
 * @param inPutPath
 * @param outPutPath
 */
private void deal(JavaSparkContext sparkContext, String inPutPath, String outPutPath) {
    SparkJobUtil.checkFileExists(inPutPath);

    SQLContext sqlContext = new SQLContext(sparkContext);
    // sqlContext.setConf("spark.sql.parquet.binaryAsString","true");

    // create a temporary snapshot table
    Dataset<Row> dataset = sqlContext.read().json(inPutPath);
    dataset.registerTempTable("hui_metro_testjson");
    dataset.show(10);

    Dataset<Row> resultFrame = sqlContext.sql(SQL);

    if (resultFrame.count() > 0) {
        resultFrame.repartition(3).write()
                .mode(SaveMode.Append).json(outPutPath);
    }

    resultFrame.show(10);

    // write the results to the database
    MySQLJdbcConfig jdbcConfig = new MySQLJdbcConfig();
    jdbcConfig.init();
    resultFrame.write().mode("append")
            .jdbc(jdbcConfig.getUrl(), "hui_metro_test", jdbcConfig.getConnectionProperties());
}
Example 9
Source File: JavaVectorIndexerExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaVectorIndexerExample")
        .getOrCreate();

    // $example on$
    Dataset<Row> data = spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");

    VectorIndexer indexer = new VectorIndexer()
        .setInputCol("features")
        .setOutputCol("indexed")
        .setMaxCategories(10);
    VectorIndexerModel indexerModel = indexer.fit(data);

    Map<Integer, Map<Double, Integer>> categoryMaps = indexerModel.javaCategoryMaps();
    System.out.print("Chose " + categoryMaps.size() + " categorical features:");

    for (Integer feature : categoryMaps.keySet()) {
        System.out.print(" " + feature);
    }
    System.out.println();

    // Create new column "indexed" with categorical values transformed to indices
    Dataset<Row> indexedData = indexerModel.transform(data);
    indexedData.show();
    // $example off$

    spark.stop();
}
Example 10
Source File: G2SDataset.java From mmtf-spark with Apache License 2.0
/**
 * Downloads PDB residue mappings for a list of genomic variations.
 * @param variationIds genomic variation ids (e.g. chr7:g.140449103A>C)
 * @param structureId specific PDB structure used for mapping
 * @param chainId specific chain used for mapping
 * @return dataset with PDB mapping information
 * @throws IOException
 */
private static Dataset<Row> getDataset(List<String> variationIds, String structureId, String chainId) throws IOException {
    // get a spark context
    SparkSession spark = SparkSession.builder().getOrCreate();
    @SuppressWarnings("resource") // sc will be closed elsewhere
    JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

    // download data in parallel
    JavaRDD<String> data = sc.parallelize(variationIds).flatMap(m -> getData(m, structureId, chainId));

    // convert from JavaRDD to Dataset
    Dataset<String> jsonData = spark.createDataset(JavaRDD.toRDD(data), Encoders.STRING());

    // parse json strings and return as a dataset
    Dataset<Row> dataset = spark.read().json(jsonData);
    dataset.show();

    // return null if dataset is empty
    if (dataset.columns().length == 0) {
        System.out.println("G2SDataset: no matches found");
        return null;
    }

    dataset = standardizeData(dataset);

    return flattenDataset(dataset);
}
Example 11
Source File: InteractionAnalysisSimple.java From mmtf-spark with Apache License 2.0
/**
 * @param args no input arguments
 * @throws IOException if MmtfReader fails
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfFullPath();

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(InteractionAnalysisSimple.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);

    // use only representative structures
    int sequenceIdentity = 40;
    double resolution = 2.5;
    pdb = pdb.filter(new Pisces(sequenceIdentity, resolution));

    GroupInteractionExtractor finder = new GroupInteractionExtractor("ZN", 3);
    Dataset<Row> interactions = finder.getDataset(pdb).cache();

    // list the top residue types that interact with Zn
    interactions.printSchema();
    interactions.show(20);

    System.out.println("# interactions: " + interactions.count());

    // show the top 10 interacting groups
    interactions
        .groupBy(col("residue2"))
        .count()
        .sort(col("count").desc())
        .show(10);

    long end = System.nanoTime();

    System.out.println("Time: " + (end - start) / 1E9 + " sec.");

    sc.close();
}
Example 12
Source File: JavaBean.java From learning-spark-with-java with MIT License
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("Dataset-JavaBean")
        .master("local[4]")
        .getOrCreate();

    //
    // The Java API requires you to explicitly instantiate an encoder for
    // any JavaBean you want to use for schema inference
    //
    Encoder<Number> numberEncoder = Encoders.bean(Number.class);

    //
    // Create a container of the JavaBean instances
    //
    List<Number> data = Arrays.asList(
        new Number(1, "one", "un"),
        new Number(2, "two", "deux"),
        new Number(3, "three", "trois"));

    //
    // Use the encoder and the container of JavaBean instances to create a
    // Dataset
    //
    Dataset<Number> ds = spark.createDataset(data, numberEncoder);

    System.out.println("*** here is the schema inferred from the bean");
    ds.printSchema();

    System.out.println("*** here is the data");
    ds.show();

    // Use the convenient bean-inferred column names to query
    System.out.println("*** filter by one column and fetch others");
    ds.where(col("i").gt(2)).select(col("english"), col("french")).show();

    spark.stop();
}
Example 13
Source File: JavaChiSqSelectorExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaChiSqSelectorExample")
        .getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(
        RowFactory.create(7, Vectors.dense(0.0, 0.0, 18.0, 1.0), 1.0),
        RowFactory.create(8, Vectors.dense(0.0, 1.0, 12.0, 0.0), 0.0),
        RowFactory.create(9, Vectors.dense(1.0, 0.0, 15.0, 0.1), 0.0)
    );
    StructType schema = new StructType(new StructField[]{
        new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
        new StructField("features", new VectorUDT(), false, Metadata.empty()),
        new StructField("clicked", DataTypes.DoubleType, false, Metadata.empty())
    });

    Dataset<Row> df = spark.createDataFrame(data, schema);

    ChiSqSelector selector = new ChiSqSelector()
        .setNumTopFeatures(1)
        .setFeaturesCol("features")
        .setLabelCol("clicked")
        .setOutputCol("selectedFeatures");

    Dataset<Row> result = selector.fit(df).transform(df);

    System.out.println("ChiSqSelector output with top " + selector.getNumTopFeatures()
        + " features selected");
    result.show();

    // $example off$
    spark.stop();
}
Example 14
Source File: JavaSQLDataSourceExample.java From SparkDemo with MIT License
private static void runBasicParquetExample(SparkSession spark) {
    // $example on:basic_parquet_example$
    Dataset<Row> peopleDF = spark.read().json(Constant.LOCAL_FILE_PREX + "/data/resources/people.json");

    // DataFrames can be saved as Parquet files, maintaining the schema information
    peopleDF.write().parquet("people.parquet");

    // Read in the Parquet file created above.
    // Parquet files are self-describing so the schema is preserved
    // The result of loading a parquet file is also a DataFrame
    Dataset<Row> parquetFileDF = spark.read().parquet("people.parquet");

    // Parquet files can also be used to create a temporary view and then used in SQL statements
    parquetFileDF.createOrReplaceTempView("parquetFile");
    Dataset<Row> namesDF = spark.sql("SELECT name FROM parquetFile WHERE age BETWEEN 13 AND 19");
    Dataset<String> namesDS = namesDF.map(new MapFunction<Row, String>() {
        public String call(Row row) {
            return "Name: " + row.getString(0);
        }
    }, Encoders.STRING());
    namesDS.show();
    // +------------+
    // |       value|
    // +------------+
    // |Name: Justin|
    // +------------+
    // $example off:basic_parquet_example$
}
Example 15
Source File: SecondaryStructurePropertyEncoder.java From mmtf-spark with Apache License 2.0
/**
 * @param args outputFilePath outputFormat (json|parquet)
 * @throws IOException
 * @throws StructureException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 2) {
        System.err.println("Usage: " + SecondaryStructurePropertyEncoder.class.getSimpleName() + " <outputFilePath> + <fileFormat>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf()
            .setMaster("local[*]")
            .setAppName(SecondaryStructurePropertyEncoder.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant Pisces
    // subset set (<=20% seq. identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;

    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
            .readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));

    // get content
    int segmentLength = 11;
    Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength).cache();
    System.out.println("original data : " + data.count());

    data = data.dropDuplicates("labelQ3", "sequence").cache();
    System.out.println("- duplicate Q3/seq: " + data.count());

    data = data.dropDuplicates("sequence").cache();
    System.out.println("- duplicate seq : " + data.count());

    // add a property encoded feature vector
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    data = encoder.propertyEncode();

    data.printSchema();
    data.show(25, false);

    if (args[1].equals("json")) {
        // coalesce data into a single file
        data = data.coalesce(1);
    }
    data.write().mode("overwrite").format(args[1]).save(args[0]);

    long end = System.nanoTime();

    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
}
Example 16
Source File: JavaStringIndexerExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaStringIndexerExample")
        .getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(
        RowFactory.create(0, "a"),
        RowFactory.create(1, "b"),
        RowFactory.create(2, "c"),
        RowFactory.create(3, "a"),
        RowFactory.create(4, "a"),
        RowFactory.create(5, "c")
    );
    StructType schema = new StructType(new StructField[]{
        createStructField("id", IntegerType, false),
        createStructField("category", StringType, false)
    });
    Dataset<Row> df = spark.createDataFrame(data, schema);

    StringIndexer indexer = new StringIndexer()
        .setInputCol("category")
        .setOutputCol("categoryIndex");

    Dataset<Row> indexed = indexer.fit(df).transform(df);
    indexed.show();
    // $example off$

    spark.stop();
}
Example 17
Source File: StringSanitizerBridgeTest.java From spark-transformers with Apache License 2.0
@Test
public void testStringSanitizer() {
    //prepare data
    JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(
            RowFactory.create(1, "Jyoti complex near Sananda clothes store; English Bazar; Malda;WB;India,"),
            RowFactory.create(2, "hallalli vinayaka tent road c/o B K vishwanath Mandya"),
            RowFactory.create(3, "M.sathish S/o devudu Lakshmi opticals Gokavaram bus stand Rajhamundry 9494954476")
    ));
    StructType schema = new StructType(new StructField[]{
            new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
            new StructField("rawText", DataTypes.StringType, false, Metadata.empty())
    });
    Dataset<Row> dataset = spark.createDataFrame(rdd, schema);
    dataset.show();

    //train model in spark
    StringSanitizer sparkModel = new StringSanitizer()
            .setInputCol("rawText")
            .setOutputCol("token");

    //Export this model
    byte[] exportedModel = ModelExporter.export(sparkModel);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    List<Row> pairs = sparkModel.transform(dataset).select("rawText", "token").collectAsList();

    for (Row row : pairs) {
        Map<String, Object> data = new HashMap<String, Object>();
        data.put(sparkModel.getInputCol(), row.getString(0));
        transformer.transform(data);

        String[] actual = (String[]) data.get(sparkModel.getOutputCol());
        List<String> actualList = Arrays.asList(actual);

        List<String> expected = row.getList(1);
        assertTrue("both should be same", actualList.equals(expected));
    }
}
Example 18
Source File: UnionApp.java From net.jgp.labs.spark with Apache License 2.0
/**
 * The processing code.
 *
 * @throws ParseException
 */
private void start() throws ParseException {
    // Creates a session on a local master
    SparkSession spark = SparkSession.builder()
        .appName("expr()")
        .master("local")
        .getOrCreate();

    // DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss", Locale.ENGLISH);

    // Data
    StructType dataSchema = DataTypes.createStructType(new StructField[] {
        DataTypes.createStructField("NAME", DataTypes.StringType, false),
        DataTypes.createStructField("START_DATE", DataTypes.DateType, false),
        DataTypes.createStructField("END_DATE", DataTypes.DateType, false),
        DataTypes.createStructField("STATUS", DataTypes.StringType, false) });
    List<Row> dataRows = new ArrayList<Row>();
    dataRows.add(RowFactory.create("Alex", toDate("2018-01-01 00:00:00"), toDate("2018-02-01 00:00:00"), "OUT"));
    dataRows.add(RowFactory.create("Bob", toDate("2018-02-01 00:00:00"), toDate("2018-02-05 00:00:00"), "IN"));
    dataRows.add(RowFactory.create("Mark", toDate("2018-02-01 00:00:00"), toDate("2018-03-01 00:00:00"), "IN"));
    dataRows.add(RowFactory.create("Mark", toDate("2018-05-01 00:00:00"), toDate("2018-08-01 00:00:00"), "OUT"));
    dataRows.add(RowFactory.create("Meggy", toDate("2018-02-01 00:00:00"), toDate("2018-02-01 00:00:00"), "OUT"));
    Dataset<Row> dataDf = spark.createDataFrame(dataRows, dataSchema);
    dataDf.show();
    dataDf.printSchema();

    // Header
    StructType headerSchema = DataTypes.createStructType(new StructField[] {
        DataTypes.createStructField("_c1", DataTypes.StringType, false),
        DataTypes.createStructField("_c2", DataTypes.StringType, false),
        DataTypes.createStructField("_c3", DataTypes.StringType, false),
        DataTypes.createStructField("_c4", DataTypes.StringType, false) });
    List<Row> headerRows = new ArrayList<Row>();
    headerRows.add(RowFactory.create("REQUEST_DATE", format.format(new java.util.Date()), "", ""));
    headerRows.add(RowFactory.create("USER", "Kate", "", ""));
    headerRows.add(RowFactory.create("SEARCH_TYPE", "Global", "", ""));
    headerRows.add(RowFactory.create("", "", "", ""));
    headerRows.add(RowFactory.create("NAME", "START_DATE", "END_DATE", "STATUS"));
    Dataset<Row> headerDf = spark.createDataFrame(headerRows, headerSchema);
    headerDf.show(false);
    headerDf.printSchema();

    // Transition
    Dataset<Row> transitionDf = dataDf
        .withColumn("_c1", dataDf.col("NAME"))
        .withColumn("_c2", dataDf.col("START_DATE").cast(DataTypes.StringType))
        .withColumn("_c3", dataDf.col("END_DATE").cast(DataTypes.StringType))
        .withColumn("_c4", dataDf.col("STATUS").cast(DataTypes.StringType))
        .drop("NAME")
        .drop("START_DATE")
        .drop("END_DATE")
        .drop("STATUS");
    transitionDf.show(false);
    transitionDf.printSchema();

    // Union
    Dataset<Row> unionDf = headerDf.unionByName(transitionDf);
    unionDf.show(false);
    unionDf.printSchema();
}
Example 19
Source File: JsonFileOperations.java From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) {

    System.setProperty("hadoop.home.dir", "E:\\sumitK\\Hadoop");
    Logger rootLogger = LogManager.getRootLogger();
    rootLogger.setLevel(Level.WARN);

    SparkSession sparkSession = SparkSession
            .builder()
            .master("local")
            .config("spark.sql.warehouse.dir", "file:///E:/sumitK/Hadoop/warehouse")
            .appName("JavaALSExample")
            .getOrCreate();

    RDD<String> textFile = sparkSession.sparkContext().textFile("C:/Users/sumit.kumar/git/learning/src/main/resources/pep_json.json", 2);

    JavaRDD<PersonDetails> mapParser = textFile.toJavaRDD().map(v1 -> new ObjectMapper().readValue(v1, PersonDetails.class));

    mapParser.foreach(t -> System.out.println(t));

    Dataset<Row> anotherPeople = sparkSession.read().json(textFile);
    anotherPeople.printSchema();
    anotherPeople.show();

    Dataset<Row> json_rec = sparkSession.read().json("C:/Users/sumit.kumar/git/learning/src/main/resources/pep_json.json");
    json_rec.printSchema();
    json_rec.show();

    StructType schema = new StructType(new StructField[] {
            DataTypes.createStructField("cid", DataTypes.IntegerType, true),
            DataTypes.createStructField("county", DataTypes.StringType, true),
            DataTypes.createStructField("firstName", DataTypes.StringType, true),
            DataTypes.createStructField("sex", DataTypes.StringType, true),
            DataTypes.createStructField("year", DataTypes.StringType, true),
            DataTypes.createStructField("dateOfBirth", DataTypes.TimestampType, true) });

    /* StructType pep = new StructType(new StructField[] {
            new StructField("Count", DataTypes.StringType, true, Metadata.empty()),
            new StructField("County", DataTypes.StringType, true, Metadata.empty()),
            new StructField("First Name", DataTypes.StringType, true, Metadata.empty()),
            new StructField("Sex", DataTypes.StringType, true, Metadata.empty()),
            new StructField("Year", DataTypes.StringType, true, Metadata.empty()),
            new StructField("timestamp", DataTypes.TimestampType, true, Metadata.empty()) }); */

    Dataset<Row> person_mod = sparkSession.read().schema(schema).json(textFile);
    person_mod.printSchema();
    person_mod.show();

    person_mod.write().format("json").mode("overwrite").save("C:/Users/sumit.kumar/git/learning/src/main/resources/pep_out.json");
}
Example 20
Source File: SecondaryStructureShiftedWord2VecEncoder.java From mmtf-spark with Apache License 2.0
/**
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet)
 * @throws IOException
 * @throws StructureException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 2) {
        System.err.println("Usage: " + SecondaryStructureShiftedWord2VecEncoder.class.getSimpleName() + " <outputFilePath> + <fileFormat>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf()
            .setMaster("local[*]")
            .setAppName(SecondaryStructureShiftedWord2VecEncoder.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant set (<=20% seq. identity)
    // of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;

    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
            .readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));

    // get content
    int segmentLength = 11;
    Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength);

    // create a Word2Vector representation of the protein sequences
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    int windowSize = (segmentLength - 1) / 2;
    int vectorSize = 50; // dimension of feature vector (50)
    data = encoder.shifted3GramWord2VecEncode(windowSize, vectorSize).cache();

    data.printSchema();
    data.show(25, false);

    if (args[1].equals("json")) {
        // coalesce data into a single file
        data = data.coalesce(1);
    }
    data.write().mode("overwrite").format(args[1]).save(args[0]);

    long end = System.nanoTime();

    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
}