Java Code Examples for org.apache.spark.sql.Dataset#show()
The following examples show how to use org.apache.spark.sql.Dataset#show(). Each example is taken from an open-source project; the source file, project, and license are noted above the code.
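As a quick reference before the examples: Dataset.show() prints the first rows of a Dataset to the console in tabular form, and it has overloads for limiting the number of rows and for disabling the default 20-character truncation of long cell values. The sketch below is not taken from any of the projects listed here; it is a minimal standalone illustration, and the input path data/people.json is an assumed sample file.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ShowDemo {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("ShowDemo")
        .master("local[*]")
        .getOrCreate();

    // assumed sample file with one JSON object per line
    Dataset<Row> df = spark.read().json("data/people.json");

    df.show();          // first 20 rows, long values truncated to 20 characters
    df.show(5);         // first 5 rows
    df.show(50, false); // first 50 rows, without truncating long values

    spark.stop();
  }
}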
Example 1
Source File: TestWebServiceGet.java From quetzal with Eclipse Public License 2.0
public static void main( String[] args ) {
    // SparkConf conf = new SparkConf().setAppName("App-mt").setMaster("local[2]");
    // SparkConf conf = new SparkConf().setAppName("App-mt").setMaster("spark://Kavithas-MBP.home:7077");
    SparkConf conf = new SparkConf().setAppName("App-mt").setMaster("spark://kavithas-mbp.watson.ibm.com:7077");
    JavaSparkContext sc = new JavaSparkContext(conf);
    HiveContext sqlContext = new HiveContext(sc.sc());
    Dataset urls = sqlContext.read().json("/tmp/urls.json");

    urls.registerTempTable("urls");
    Dataset<Row> temp = sqlContext.sql("select * from urls");
    temp.show();

    sqlContext.sql("add jar /tmp/quetzal.jar");
    sqlContext.sql("create temporary function webservice as 'com.ibm.research.rdf.store.utilities.WebServiceGetUDTF'");
    Dataset<Row> drugs = sqlContext.sql("select webservice(\"drug,id,action\", \"url\", \"\", \"GET\", \"xs=http://www.w3.org/2001/XMLSchema\", \"//row\",\"drug\",\"./drug\","
            + " \"<string>\", \"id\", \"./id\",\"<string>\", \"action\", \"./action\", \"<string>\", url) as (drug, drug_typ, id, id_typ, action, action_typ) from urls");
    drugs.show();
    System.out.println("Num rows:" + drugs.count());
}
Example 2
Source File: CsvToDatasetCompatibleWithSparkv1x.java From net.jgp.labs.spark with Apache License 2.0
private void start() {
    SparkSession spark = SparkSession.builder()
        .appName("CSV to Dataset")
        .master("local")
        .getOrCreate();

    String filename = "data/tuple-data-file.csv";
    Dataset<Row> df = spark.read().format("csv")
        .option("inferSchema", "true")
        .option("header", "false")
        .load(filename);
    df.show();

    // To ensure compatibility between Spark 2.0.0 and Spark 1.6.x
    int count = df.columns().length;
    for (int i = 0; i < count; i++) {
        String oldColName = "_c" + i;
        String newColName = "C" + i;
        df = df.withColumn(newColName, df.col(oldColName)).drop(oldColName);
    }
    df.show();
}
Example 3
Source File: JavaBinarizerExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaBinarizerExample")
        .getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(
        RowFactory.create(0, 0.1),
        RowFactory.create(1, 0.8),
        RowFactory.create(2, 0.2)
    );
    StructType schema = new StructType(new StructField[]{
        new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
        new StructField("feature", DataTypes.DoubleType, false, Metadata.empty())
    });
    Dataset<Row> continuousDataFrame = spark.createDataFrame(data, schema);

    Binarizer binarizer = new Binarizer()
        .setInputCol("feature")
        .setOutputCol("binarized_feature")
        .setThreshold(0.5);

    Dataset<Row> binarizedDataFrame = binarizer.transform(continuousDataFrame);

    System.out.println("Binarizer output with Threshold = " + binarizer.getThreshold());
    binarizedDataFrame.show();
    // $example off$

    spark.stop();
}
Example 4
Source File: SecondaryStructureSegmentDemo.java From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws IOException {

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(CustomReportDemo.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    List<String> pdbIds = Arrays.asList("1STP"); // single protein chain
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadReducedMmtfFiles(pdbIds, sc).cache();

    pdb = pdb
        .flatMapToPair(new StructureToPolymerChains())
        .filter(new ContainsLProteinChain());

    int segmentLength = 25;
    Dataset<Row> ds = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength);

    // show the top 50 rows of this dataset
    ds.show(50, false);

    long end = System.nanoTime();

    System.out.println("Time: " + TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");

    sc.close();
}
Example 5
Source File: PolymerInteractionFingerprintDemo.java From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws Exception {
    SparkSession spark = SparkSession.builder().master("local[*]")
            .appName(PolymerInteractionFingerprintDemo.class.getSimpleName()).getOrCreate();
    JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

    List<String> pdbIds = Arrays.asList("1OHR");
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc);

    // find ASP-ARG salt bridges
    InteractionFilter filter = new InteractionFilter();
    filter.setDistanceCutoff(3.5);
    filter.setMinInteractions(1);
    filter.setQueryGroups(true, "ASP");
    filter.setQueryAtomNames(true, "OD1", "OD2");
    filter.setTargetGroups(true, "ARG");
    filter.setTargetAtomNames(true, "NH1", "NH2");

    Dataset<Row> interactions = InteractionFingerprinter.getPolymerInteractions(pdb, filter).cache();
    interactions.show(false);

    sc.close();
}
Example 6
Source File: QuotedCsvWithHeaderToDataset.java From net.jgp.labs.spark with Apache License 2.0
private void start() {
    SparkSession spark = SparkSession.builder().appName("CSV to Dataset")
        .master("local").getOrCreate();

    String filename = "data/csv-quoted.txt";
    Dataset<Row> df = spark.read().option("inferSchema", "true").option(
        "header", "true").csv(filename);
    df.show();
    df.printSchema();
}
Example 7
Source File: DataFrameOperation.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("DataFrameOperation").setMaster("local");
    JavaSparkContext sc = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(sc);

    // Read the data source into a DataFrame, which can be thought of as a table
    // holding both the data and its schema information
    Dataset<Row> dataset = sqlContext.read().json(Constant.LOCAL_FILE_PREX + "/data/resources/people.json");

    // Print the table in a formatted way
    dataset.show();

    // Print the schema (structure metadata)
    dataset.printSchema();

    // Select columns and compute on them
    dataset.select("name").show();
    dataset.select(dataset.col("name"), dataset.col("age").plus(1)).show();

    // Filter rows
    dataset.filter(dataset.col("age").gt(20)).show();

    // Group by a column and count
    dataset.groupBy("age").count().show();

    sc.close();
}
Example 8
Source File: MetroAnalysisJob.java From hui-bigdata-spark with Apache License 2.0
/**
 * Core data processing logic
 * @param sparkContext
 * @param inPutPath
 * @param outPutPath
 */
private void deal(JavaSparkContext sparkContext, String inPutPath, String outPutPath) {
    SparkJobUtil.checkFileExists(inPutPath);

    SQLContext sqlContext = new SQLContext(sparkContext);
    // sqlContext.setConf("spark.sql.parquet.binaryAsString","true");

    // create a temporary snapshot table
    Dataset<Row> dataset = sqlContext.read().json(inPutPath);
    dataset.registerTempTable("hui_metro_testjson");
    dataset.show(10);

    Dataset<Row> resultFrame = sqlContext.sql(SQL);

    if (resultFrame.count() > 0) {
        resultFrame.repartition(3).write()
                .mode(SaveMode.Append).json(outPutPath);
    }

    resultFrame.show(10);

    // write the results to the database
    MySQLJdbcConfig jdbcConfig = new MySQLJdbcConfig();
    jdbcConfig.init();
    resultFrame.write().mode("append")
            .jdbc(jdbcConfig.getUrl(), "hui_metro_test", jdbcConfig.getConnectionProperties());
}
Example 9
Source File: JavaVectorIndexerExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaVectorIndexerExample")
        .getOrCreate();

    // $example on$
    Dataset<Row> data = spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");

    VectorIndexer indexer = new VectorIndexer()
        .setInputCol("features")
        .setOutputCol("indexed")
        .setMaxCategories(10);
    VectorIndexerModel indexerModel = indexer.fit(data);

    Map<Integer, Map<Double, Integer>> categoryMaps = indexerModel.javaCategoryMaps();
    System.out.print("Chose " + categoryMaps.size() + " categorical features:");

    for (Integer feature : categoryMaps.keySet()) {
        System.out.print(" " + feature);
    }
    System.out.println();

    // Create new column "indexed" with categorical values transformed to indices
    Dataset<Row> indexedData = indexerModel.transform(data);
    indexedData.show();
    // $example off$

    spark.stop();
}
Example 10
Source File: G2SDataset.java From mmtf-spark with Apache License 2.0
/**
 * Downloads PDB residue mappings for a list of genomic variations.
 * @param variationIds genomic variation ids (e.g. chr7:g.140449103A>C)
 * @param structureId specific PDB structure used for mapping
 * @param chainId specific chain used for mapping
 * @return dataset with PDB mapping information
 * @throws IOException
 */
private static Dataset<Row> getDataset(List<String> variationIds, String structureId, String chainId) throws IOException {
    // get a spark context
    SparkSession spark = SparkSession.builder().getOrCreate();
    @SuppressWarnings("resource") // sc will be closed elsewhere
    JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

    // download data in parallel
    JavaRDD<String> data = sc.parallelize(variationIds).flatMap(m -> getData(m, structureId, chainId));

    // convert from JavaRDD to Dataset
    Dataset<String> jsonData = spark.createDataset(JavaRDD.toRDD(data), Encoders.STRING());

    // parse json strings and return as a dataset
    Dataset<Row> dataset = spark.read().json(jsonData);
    dataset.show();

    // return null if dataset is empty
    if (dataset.columns().length == 0) {
        System.out.println("G2SDataset: no matches found");
        return null;
    }

    dataset = standardizeData(dataset);

    return flattenDataset(dataset);
}
Example 11
Source File: InteractionAnalysisSimple.java From mmtf-spark with Apache License 2.0
/**
 * @param args no input arguments
 * @throws IOException if MmtfReader fails
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfFullPath();

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(InteractionAnalysisSimple.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);

    // use only representative structures
    int sequenceIdentity = 40;
    double resolution = 2.5;
    pdb = pdb.filter(new Pisces(sequenceIdentity, resolution));

    GroupInteractionExtractor finder = new GroupInteractionExtractor("ZN", 3);
    Dataset<Row> interactions = finder.getDataset(pdb).cache();

    // list the top residue types that interact with Zn
    interactions.printSchema();
    interactions.show(20);

    System.out.println("# interactions: " + interactions.count());

    // show the top 10 interacting groups
    interactions
        .groupBy(col("residue2"))
        .count()
        .sort(col("count").desc())
        .show(10);

    long end = System.nanoTime();

    System.out.println("Time: " + (end - start) / 1E9 + " sec.");

    sc.close();
}
Example 12
Source File: JavaBean.java From learning-spark-with-java with MIT License
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("Dataset-JavaBean")
        .master("local[4]")
        .getOrCreate();

    //
    // The Java API requires you to explicitly instantiate an encoder for
    // any JavaBean you want to use for schema inference
    //
    Encoder<Number> numberEncoder = Encoders.bean(Number.class);

    //
    // Create a container of the JavaBean instances
    //
    List<Number> data = Arrays.asList(
        new Number(1, "one", "un"),
        new Number(2, "two", "deux"),
        new Number(3, "three", "trois"));

    //
    // Use the encoder and the container of JavaBean instances to create a
    // Dataset
    //
    Dataset<Number> ds = spark.createDataset(data, numberEncoder);

    System.out.println("*** here is the schema inferred from the bean");
    ds.printSchema();

    System.out.println("*** here is the data");
    ds.show();

    // Use the convenient bean-inferred column names to query
    System.out.println("*** filter by one column and fetch others");
    ds.where(col("i").gt(2)).select(col("english"), col("french")).show();

    spark.stop();
}
Example 13
Source File: JavaChiSqSelectorExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaChiSqSelectorExample")
        .getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(
        RowFactory.create(7, Vectors.dense(0.0, 0.0, 18.0, 1.0), 1.0),
        RowFactory.create(8, Vectors.dense(0.0, 1.0, 12.0, 0.0), 0.0),
        RowFactory.create(9, Vectors.dense(1.0, 0.0, 15.0, 0.1), 0.0)
    );
    StructType schema = new StructType(new StructField[]{
        new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
        new StructField("features", new VectorUDT(), false, Metadata.empty()),
        new StructField("clicked", DataTypes.DoubleType, false, Metadata.empty())
    });

    Dataset<Row> df = spark.createDataFrame(data, schema);

    ChiSqSelector selector = new ChiSqSelector()
        .setNumTopFeatures(1)
        .setFeaturesCol("features")
        .setLabelCol("clicked")
        .setOutputCol("selectedFeatures");

    Dataset<Row> result = selector.fit(df).transform(df);

    System.out.println("ChiSqSelector output with top " + selector.getNumTopFeatures()
        + " features selected");
    result.show();

    // $example off$
    spark.stop();
}
Example 14
Source File: JavaSQLDataSourceExample.java From SparkDemo with MIT License
private static void runBasicParquetExample(SparkSession spark) {
    // $example on:basic_parquet_example$
    Dataset<Row> peopleDF = spark.read().json(Constant.LOCAL_FILE_PREX + "/data/resources/people.json");

    // DataFrames can be saved as Parquet files, maintaining the schema information
    peopleDF.write().parquet("people.parquet");

    // Read in the Parquet file created above.
    // Parquet files are self-describing so the schema is preserved
    // The result of loading a parquet file is also a DataFrame
    Dataset<Row> parquetFileDF = spark.read().parquet("people.parquet");

    // Parquet files can also be used to create a temporary view and then used in SQL statements
    parquetFileDF.createOrReplaceTempView("parquetFile");
    Dataset<Row> namesDF = spark.sql("SELECT name FROM parquetFile WHERE age BETWEEN 13 AND 19");
    Dataset<String> namesDS = namesDF.map(new MapFunction<Row, String>() {
        public String call(Row row) {
            return "Name: " + row.getString(0);
        }
    }, Encoders.STRING());
    namesDS.show();
    // +------------+
    // |       value|
    // +------------+
    // |Name: Justin|
    // +------------+
    // $example off:basic_parquet_example$
}
Example 15
Source File: SecondaryStructurePropertyEncoder.java From mmtf-spark with Apache License 2.0
/**
 * @param args outputFilePath outputFormat (json|parquet)
 * @throws IOException
 * @throws StructureException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 2) {
        System.err.println("Usage: " + SecondaryStructurePropertyEncoder.class.getSimpleName() + " <outputFilePath> + <fileFormat>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf()
            .setMaster("local[*]")
            .setAppName(SecondaryStructurePropertyEncoder.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant Pisces
    // subset set (<=20% seq. identity) of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;

    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
            .readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));

    // get content
    int segmentLength = 11;
    Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength).cache();
    System.out.println("original data : " + data.count());

    data = data.dropDuplicates("labelQ3", "sequence").cache();
    System.out.println("- duplicate Q3/seq: " + data.count());

    data = data.dropDuplicates("sequence").cache();
    System.out.println("- duplicate seq : " + data.count());

    // add a property encoded feature vector
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    data = encoder.propertyEncode();

    data.printSchema();
    data.show(25, false);

    if (args[1].equals("json")) {
        // coalesce data into a single file
        data = data.coalesce(1);
    }
    data.write().mode("overwrite").format(args[1]).save(args[0]);

    long end = System.nanoTime();

    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
}
Example 16
Source File: JavaStringIndexerExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaStringIndexerExample")
        .getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(
        RowFactory.create(0, "a"),
        RowFactory.create(1, "b"),
        RowFactory.create(2, "c"),
        RowFactory.create(3, "a"),
        RowFactory.create(4, "a"),
        RowFactory.create(5, "c")
    );
    StructType schema = new StructType(new StructField[]{
        createStructField("id", IntegerType, false),
        createStructField("category", StringType, false)
    });
    Dataset<Row> df = spark.createDataFrame(data, schema);

    StringIndexer indexer = new StringIndexer()
        .setInputCol("category")
        .setOutputCol("categoryIndex");

    Dataset<Row> indexed = indexer.fit(df).transform(df);
    indexed.show();
    // $example off$

    spark.stop();
}
Example 17
Source File: StringSanitizerBridgeTest.java From spark-transformers with Apache License 2.0
@Test
public void testStringSanitizer() {
    //prepare data
    JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(
            RowFactory.create(1, "Jyoti complex near Sananda clothes store; English Bazar; Malda;WB;India,"),
            RowFactory.create(2, "hallalli vinayaka tent road c/o B K vishwanath Mandya"),
            RowFactory.create(3, "M.sathish S/o devudu Lakshmi opticals Gokavaram bus stand Rajhamundry 9494954476")
    ));
    StructType schema = new StructType(new StructField[]{
            new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
            new StructField("rawText", DataTypes.StringType, false, Metadata.empty())
    });
    Dataset<Row> dataset = spark.createDataFrame(rdd, schema);
    dataset.show();

    //train model in spark
    StringSanitizer sparkModel = new StringSanitizer()
            .setInputCol("rawText")
            .setOutputCol("token");

    //Export this model
    byte[] exportedModel = ModelExporter.export(sparkModel);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    List<Row> pairs = sparkModel.transform(dataset).select("rawText", "token").collectAsList();

    for (Row row : pairs) {
        Map<String, Object> data = new HashMap<String, Object>();
        data.put(sparkModel.getInputCol(), row.getString(0));
        transformer.transform(data);

        String[] actual = (String[]) data.get(sparkModel.getOutputCol());
        List<String> actualList = Arrays.asList(actual);

        List<String> expected = row.getList(1);
        assertTrue("both should be same", actualList.equals(expected));
    }
}
Example 18
Source File: UnionApp.java From net.jgp.labs.spark with Apache License 2.0
/**
 * The processing code.
 *
 * @throws ParseException
 */
private void start() throws ParseException {
    // Creates a session on a local master
    SparkSession spark = SparkSession.builder()
        .appName("expr()")
        .master("local")
        .getOrCreate();

    // DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss", Locale.ENGLISH);

    // Data
    StructType dataSchema = DataTypes.createStructType(new StructField[] {
        DataTypes.createStructField("NAME", DataTypes.StringType, false),
        DataTypes.createStructField("START_DATE", DataTypes.DateType, false),
        DataTypes.createStructField("END_DATE", DataTypes.DateType, false),
        DataTypes.createStructField("STATUS", DataTypes.StringType, false) });
    List<Row> dataRows = new ArrayList<Row>();
    dataRows.add(RowFactory.create("Alex", toDate("2018-01-01 00:00:00"), toDate("2018-02-01 00:00:00"), "OUT"));
    dataRows.add(RowFactory.create("Bob", toDate("2018-02-01 00:00:00"), toDate("2018-02-05 00:00:00"), "IN"));
    dataRows.add(RowFactory.create("Mark", toDate("2018-02-01 00:00:00"), toDate("2018-03-01 00:00:00"), "IN"));
    dataRows.add(RowFactory.create("Mark", toDate("2018-05-01 00:00:00"), toDate("2018-08-01 00:00:00"), "OUT"));
    dataRows.add(RowFactory.create("Meggy", toDate("2018-02-01 00:00:00"), toDate("2018-02-01 00:00:00"), "OUT"));
    Dataset<Row> dataDf = spark.createDataFrame(dataRows, dataSchema);
    dataDf.show();
    dataDf.printSchema();

    // Header
    StructType headerSchema = DataTypes.createStructType(new StructField[] {
        DataTypes.createStructField("_c1", DataTypes.StringType, false),
        DataTypes.createStructField("_c2", DataTypes.StringType, false),
        DataTypes.createStructField("_c3", DataTypes.StringType, false),
        DataTypes.createStructField("_c4", DataTypes.StringType, false) });
    List<Row> headerRows = new ArrayList<Row>();
    headerRows.add(RowFactory.create("REQUEST_DATE", format.format(new java.util.Date()), "", ""));
    headerRows.add(RowFactory.create("USER", "Kate", "", ""));
    headerRows.add(RowFactory.create("SEARCH_TYPE", "Global", "", ""));
    headerRows.add(RowFactory.create("", "", "", ""));
    headerRows.add(RowFactory.create("NAME", "START_DATE", "END_DATE", "STATUS"));
    Dataset<Row> headerDf = spark.createDataFrame(headerRows, headerSchema);
    headerDf.show(false);
    headerDf.printSchema();

    // Transition
    Dataset<Row> transitionDf = dataDf
        .withColumn("_c1", dataDf.col("NAME"))
        .withColumn("_c2", dataDf.col("START_DATE").cast(DataTypes.StringType))
        .withColumn("_c3", dataDf.col("END_DATE").cast(DataTypes.StringType))
        .withColumn("_c4", dataDf.col("STATUS").cast(DataTypes.StringType))
        .drop("NAME")
        .drop("START_DATE")
        .drop("END_DATE")
        .drop("STATUS");
    transitionDf.show(false);
    transitionDf.printSchema();

    // Union
    Dataset<Row> unionDf = headerDf.unionByName(transitionDf);
    unionDf.show(false);
    unionDf.printSchema();
}
Example 19
Source File: JsonFileOperations.java From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) {

    System.setProperty("hadoop.home.dir", "E:\\sumitK\\Hadoop");
    Logger rootLogger = LogManager.getRootLogger();
    rootLogger.setLevel(Level.WARN);

    SparkSession sparkSession = SparkSession
            .builder()
            .master("local")
            .config("spark.sql.warehouse.dir", "file:///E:/sumitK/Hadoop/warehouse")
            .appName("JavaALSExample")
            .getOrCreate();

    RDD<String> textFile = sparkSession.sparkContext().textFile("C:/Users/sumit.kumar/git/learning/src/main/resources/pep_json.json", 2);

    JavaRDD<PersonDetails> mapParser = textFile.toJavaRDD().map(v1 -> new ObjectMapper().readValue(v1, PersonDetails.class));

    mapParser.foreach(t -> System.out.println(t));

    Dataset<Row> anotherPeople = sparkSession.read().json(textFile);
    anotherPeople.printSchema();
    anotherPeople.show();

    Dataset<Row> json_rec = sparkSession.read().json("C:/Users/sumit.kumar/git/learning/src/main/resources/pep_json.json");
    json_rec.printSchema();
    json_rec.show();

    StructType schema = new StructType(new StructField[] {
            DataTypes.createStructField("cid", DataTypes.IntegerType, true),
            DataTypes.createStructField("county", DataTypes.StringType, true),
            DataTypes.createStructField("firstName", DataTypes.StringType, true),
            DataTypes.createStructField("sex", DataTypes.StringType, true),
            DataTypes.createStructField("year", DataTypes.StringType, true),
            DataTypes.createStructField("dateOfBirth", DataTypes.TimestampType, true) });

    /* StructType pep = new StructType(new StructField[] {
            new StructField("Count", DataTypes.StringType, true, Metadata.empty()),
            new StructField("County", DataTypes.StringType, true, Metadata.empty()),
            new StructField("First Name", DataTypes.StringType, true, Metadata.empty()),
            new StructField("Sex", DataTypes.StringType, true, Metadata.empty()),
            new StructField("Year", DataTypes.StringType, true, Metadata.empty()),
            new StructField("timestamp", DataTypes.TimestampType, true, Metadata.empty()) }); */

    Dataset<Row> person_mod = sparkSession.read().schema(schema).json(textFile);
    person_mod.printSchema();
    person_mod.show();

    person_mod.write().format("json").mode("overwrite").save("C:/Users/sumit.kumar/git/learning/src/main/resources/pep_out.json");
}
Example 20
Source File: SecondaryStructureShiftedWord2VecEncoder.java From mmtf-spark with Apache License 2.0
/**
 * @param args args[0] outputFilePath, args[1] outputFormat (json|parquet)
 * @throws IOException
 * @throws StructureException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfReducedPath();

    if (args.length != 2) {
        System.err.println("Usage: " + SecondaryStructureShiftedWord2VecEncoder.class.getSimpleName() + " <outputFilePath> + <fileFormat>");
        System.exit(1);
    }

    long start = System.nanoTime();

    SparkConf conf = new SparkConf()
            .setMaster("local[*]")
            .setAppName(SecondaryStructureShiftedWord2VecEncoder.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read MMTF Hadoop sequence file and create a non-redundant set (<=20% seq. identity)
    // of L-protein chains
    int sequenceIdentity = 20;
    double resolution = 3.0;

    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
            .readSequenceFile(path, sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));

    // get content
    int segmentLength = 11;
    Dataset<Row> data = SecondaryStructureSegmentExtractor.getDataset(pdb, segmentLength);

    // create a Word2Vector representation of the protein sequences
    ProteinSequenceEncoder encoder = new ProteinSequenceEncoder(data);
    int windowSize = (segmentLength - 1) / 2;
    int vectorSize = 50; // dimension of feature vector (50)
    data = encoder.shifted3GramWord2VecEncode(windowSize, vectorSize).cache();

    data.printSchema();
    data.show(25, false);

    if (args[1].equals("json")) {
        // coalesce data into a single file
        data = data.coalesce(1);
    }
    data.write().mode("overwrite").format(args[1]).save(args[0]);

    long end = System.nanoTime();

    System.out.println(TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");
}