Java Code Examples for org.apache.spark.sql.SQLContext#createDataFrame()

The following examples show how to use org.apache.spark.sql.SQLContext#createDataFrame(). Each example is taken from an open-source project; the source file and license are noted above each snippet.
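
Before diving into the project code, here is a minimal, self-contained sketch of the most common overload, createDataFrame(JavaRDD<Row>, StructType), written against the Spark 1.x API (the class name, column names and values are illustrative only):

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class CreateDataFrameSketch {
  public static void main(String[] args) {
    JavaSparkContext jsc = new JavaSparkContext(
        new SparkConf().setMaster("local[*]").setAppName("CreateDataFrameSketch"));
    SQLContext sqlContext = new SQLContext(jsc);

    // Build an RDD of Rows and describe its columns with an explicit schema
    JavaRDD<Row> rows = jsc.parallelize(Arrays.asList(
        RowFactory.create("alice", 30),
        RowFactory.create("bob", 25)));
    StructType schema = new StructType(new StructField[]{
        new StructField("name", DataTypes.StringType, false, Metadata.empty()),
        new StructField("age", DataTypes.IntegerType, false, Metadata.empty())
    });

    // createDataFrame pairs the row data with the schema
    DataFrame people = sqlContext.createDataFrame(rows, schema);
    people.show();

    jsc.stop();
  }
}

The other overload used below is the bean-class variant (an RDD of JavaBeans plus the bean's Class), which infers the schema by reflection; see the sketch after Example 15.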
Example 1
Source File: JavaStocks.java    From spark-ts-examples with Apache License 2.0
private static DataFrame loadObservations(JavaSparkContext sparkContext, SQLContext sqlContext,
    String path) {
  JavaRDD<Row> rowRdd = sparkContext.textFile(path).map((String line) -> {
      String[] tokens = line.split("\t");
      ZonedDateTime dt = ZonedDateTime.of(Integer.parseInt(tokens[0]),
          Integer.parseInt(tokens[1]), Integer.parseInt(tokens[2]), 0, 0, 0, 0,
          ZoneId.systemDefault());
      String symbol = tokens[3];
      double price = Double.parseDouble(tokens[5]);
      return RowFactory.create(Timestamp.from(dt.toInstant()), symbol, price);
  });
  List<StructField> fields = new ArrayList<>();
  fields.add(DataTypes.createStructField("timestamp", DataTypes.TimestampType, true));
  fields.add(DataTypes.createStructField("symbol", DataTypes.StringType, true));
  fields.add(DataTypes.createStructField("price", DataTypes.DoubleType, true));
  StructType schema = DataTypes.createStructType(fields);
  return sqlContext.createDataFrame(rowRdd, schema);
}
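
A hypothetical call site for this helper (the path and the tab-separated column layout are assumptions, not part of the original project):

  JavaSparkContext sparkContext = new JavaSparkContext(
      new SparkConf().setMaster("local[*]").setAppName("JavaStocks"));
  SQLContext sqlContext = new SQLContext(sparkContext);
  // Assumes a TSV file whose first three columns are year, month and day,
  // with the symbol in column 4 and the price in column 6.
  DataFrame observations = loadObservations(sparkContext, sqlContext, "data/ticker.tsv");
  observations.printSchema();
  observations.show(5);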
 
Example 2
Source File: DependencyParser.java    From vn.vitk with GNU General Public License v3.0
/**
 * Parses a list of PoS-tagged sentences, one sentence per line, and writes the result to an
 * output file in the specified output format.
 * @param jsc
 * @param sentences
 * @param outputFileName
 * @param outputFormat
 */
public void parse(JavaSparkContext jsc, List<String> sentences, String outputFileName, OutputFormat outputFormat) {
	JavaRDD<String> input = jsc.parallelize(sentences);
	JavaRDD<Sentence> sents = input.map(new TaggedLineToSentenceFunction());
	JavaRDD<DependencyGraph> graphs = sents.map(new ParsingFunction());
	JavaRDD<Row> rows = graphs.map(new Function<DependencyGraph, Row>() {
		private static final long serialVersionUID = -812004521983071103L;
		public Row call(DependencyGraph graph) {
			return RowFactory.create(graph.getSentence().toString(), graph.dependencies());
		}
	});
	StructType schema = new StructType(new StructField[]{
		new StructField("sentence", DataTypes.StringType, false, Metadata.empty()),	
		new StructField("dependency", DataTypes.StringType, false, Metadata.empty())
	});
	SQLContext sqlContext = new SQLContext(jsc);
	DataFrame df = sqlContext.createDataFrame(rows, schema);
	
	if (outputFormat == OutputFormat.TEXT)  
		df.select("dependency").write().text(outputFileName);
	else 
		df.repartition(1).write().json(outputFileName);
}
 
Example 3
Source File: Tagger.java    From vn.vitk with GNU General Public License v3.0
/**
 * Tags a list of sequences and returns a list of tag sequences.
 * @param sentences
 * @return a list of tagged sequences.
 */
public List<String> tag(List<String> sentences) {
	List<Row> rows = new LinkedList<Row>();
	for (String sentence : sentences) {
		rows.add(RowFactory.create(sentence));
	}
	StructType schema = new StructType(new StructField[]{
		new StructField("sentence", DataTypes.StringType, false, Metadata.empty())	
	});
	SQLContext sqlContext = new SQLContext(jsc);
	DataFrame input = sqlContext.createDataFrame(rows, schema);
	if (cmmModel != null) {
		DataFrame output = cmmModel.transform(input).repartition(1);
		return output.javaRDD().map(new RowToStringFunction(1)).collect();
	} else {
		System.err.println("Tagging model is null. You need to create or load a model first.");
		return null;
	}
}
 
Example 4
Source File: RelationSchemaCollectorTest.java    From rdf2x with Apache License 2.0
private DataFrame getTestRDD() {
    SQLContext sql = new SQLContext(jsc());
    List<Row> rdd = new ArrayList<>();

    // cycle one -> two -> three -> one
    rdd.add(RowFactory.create(0, uriIndex.getIndex("http://example.com/a"), 1L, uriIndex.getIndex("http://example.com/a"), 2L));
    rdd.add(RowFactory.create(0, uriIndex.getIndex("http://example.com/a"), 2L, uriIndex.getIndex("http://example.com/a"), 3L));
    rdd.add(RowFactory.create(0, uriIndex.getIndex("http://example.com/a"), 3L, uriIndex.getIndex("http://example.com/a"), 1L));

    // one -> four, four -> one
    rdd.add(RowFactory.create(0, uriIndex.getIndex("http://example.com/a"), 1L, uriIndex.getIndex("http://example.com/b"), 4L));
    rdd.add(RowFactory.create(0, uriIndex.getIndex("http://example.com/b"), 4L, uriIndex.getIndex("http://example.com/a"), 1L));

    // five -> one
    rdd.add(RowFactory.create(0, uriIndex.getIndex("http://example.com/c"), 5L, uriIndex.getIndex("http://example.com/a"), 1L));

    return sql.createDataFrame(rdd, new StructType()
            .add("predicateIndex", DataTypes.IntegerType, false)
            .add("fromTypeIndex", DataTypes.IntegerType, false)
            .add("fromID", DataTypes.LongType, false)
            .add("toTypeIndex", DataTypes.IntegerType, false)
            .add("toID", DataTypes.LongType, false)
    );
}
 
Example 5
Source File: DbPersistorPostgres.java    From rdf2x with Apache License 2.0
@Override
public void writeDataFrame(String name, DataFrame df) {
    if (!config.isBulkLoad()) {
        super.writeDataFrame(name, df);
        return;
    }
    String fullTableName = getFullTableName(name);
    Properties properties = config.getProperties();

    // create table schema by persisting empty dataframe
    log.info("Creating schema of table {}", fullTableName);
    SQLContext sql = df.sqlContext();
    DataFrame emptyDf = sql.createDataFrame(sql.sparkContext().emptyRDD(ClassTag$.MODULE$.apply(Row.class)), df.schema());
    emptyDf.write().mode(saveMode).jdbc(config.getUrl(), fullTableName, properties);

    final Function0<Connection> connectionFactory = JdbcUtils.createConnectionFactory(config.getUrl(), properties);
    log.info("Writing to database table {} using PostgreSQL COPY", fullTableName);
    int batchSize = config.getBatchSize();
    df.toJavaRDD().foreachPartition(rows -> {
        Connection connection = connectionFactory.apply();
        copyRows(fullTableName, rows, connection, batchSize);
        try {
            connection.close();
        } catch (SQLException e) {
            log.debug("Unexpected exception when closing database connection: {}", e);
        }
    });

}
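
A side note on the empty-DataFrame trick above: when a JavaSparkContext is available, the same schema-only table can be produced without touching the Scala ClassTag machinery. A hedged alternative sketch (assuming a JavaSparkContext named jsc alongside the DataFrame df):

    // createDataFrame(JavaRDD<Row>, StructType) accepts an empty RDD,
    // so the resulting DataFrame keeps df's schema but holds no rows.
    JavaRDD<Row> emptyRows = jsc.emptyRDD();
    DataFrame emptyDf = df.sqlContext().createDataFrame(emptyRows, df.schema());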
 
Example 6
Source File: DataSparkFromRDD.java    From toolbox with Apache License 2.0
@Override
public DataFrame getDataFrame(SQLContext sql) {

    // Obtain the schema
    StructType schema = SchemaConverter.getSchema(attributes);

    // Transform the RDD
    JavaRDD<Row> rowRDD = DataFrameOps.toRowRDD(amidstRDD, attributes);

    // Create the DataFrame
    return sql.createDataFrame(rowRDD, schema);
}
 
Example 7
Source File: Tagger.java    From vn.vitk with GNU General Public License v3.0
/**
 * Tags a distributed list of sentences and writes the result to an output file with 
 * a desired output format.
 * @param sentences
 * @param outputFileName
 * @param outputFormat
 */
public void tag(JavaRDD<Row> sentences, String outputFileName, OutputFormat outputFormat) {
	StructType schema = new StructType(new StructField[]{
		new StructField("sentence", DataTypes.StringType, false, Metadata.empty())	
	});
	SQLContext sqlContext = new SQLContext(jsc);
	DataFrame input = sqlContext.createDataFrame(sentences, schema);
	tag(input, outputFileName, outputFormat);
}
 
Example 8
Source File: Tagger.java    From vn.vitk with GNU General Public License v3.0
/**
 * Tags a list of sequences and writes the result to an output file with a
 * desired output format.
 * 
 * @param sentences
 * @param outputFileName
 * @param outputFormat
 */
public void tag(List<String> sentences, String outputFileName, OutputFormat outputFormat) {
	List<Row> rows = new LinkedList<Row>();
	for (String sentence : sentences) {
		rows.add(RowFactory.create(sentence));
	}
	StructType schema = new StructType(new StructField[]{
		new StructField("sentence", DataTypes.StringType, false, Metadata.empty())	
	});
	SQLContext sqlContext = new SQLContext(jsc);
	DataFrame input = sqlContext.createDataFrame(rows, schema);
	tag(input, outputFileName, outputFormat);
}
 
Example 9
Source File: RelationExtractorTest.java    From rdf2x with Apache License 2.0
/**
 * Test if the expected directed relations are collected from an RDD of Instances
 */
@Test
public void testCollectRelations() {
    SQLContext sql = new SQLContext(jsc());

    RelationExtractor collector = new RelationExtractor(
            new RelationConfig(),
            jsc(),
            new ClassGraph()
    );

    List<Row> rdd = new ArrayList<>();

    // cycle one -> two -> three -> one
    rdd.add(RowFactory.create(0, 1, 1L, 1, 2L));
    rdd.add(RowFactory.create(0, 1, 2L, 1, 3L));
    rdd.add(RowFactory.create(0, 1, 3L, 1, 1L));

    // one -> four, four -> one
    rdd.add(RowFactory.create(0, 2, 4L, 1, 1L));
    rdd.add(RowFactory.create(0, 1, 1L, 2, 4L));

    // five -> one
    rdd.add(RowFactory.create(0, 3, 5L, 1, 1L));

    DataFrame expected = sql.createDataFrame(rdd, new StructType()
            .add("predicateIndex", DataTypes.IntegerType, false)
            .add("fromTypeIndex", DataTypes.IntegerType, false)
            .add("fromID", DataTypes.LongType, false)
            .add("toTypeIndex", DataTypes.IntegerType, false)
            .add("toID", DataTypes.LongType, false)
    );

    // (predicateIndex, fromTypeIndex, instanceID, toTypeIndex, relatedID)
    DataFrame result = collector.extractRelations(getTestRDD());

    assertEquals("Expected relation row schema is collected", expected.schema(), result.schema());
    assertRDDEquals("Expected relation rows are collected", expected.javaRDD(), result.javaRDD());
}
 
Example 10
Source File: EntitySalienceFeatureExtractorSpark.java    From ambiverse-nlu with Apache License 2.0
/**
 * Extract a DataFrame ready for training or testing.
 * @param jsc
 * @param documents
 * @param sqlContext
 * @return
 * @throws ResourceInitializationException
 */
public DataFrame extract(JavaSparkContext jsc, JavaRDD<SCAS> documents, SQLContext sqlContext) throws ResourceInitializationException {
    Accumulator<Integer> TOTAL_DOCS = jsc.accumulator(0, "TOTAL_DOCS");
    Accumulator<Integer> SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "SALIENT_ENTITY_INSTANCES");
    Accumulator<Integer> NON_SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "NON_SALIENT_ENTITY_INSTANCES");

    TrainingSettings trainingSettings = getTrainingSettings();

    FeatureExtractor fe = new NYTEntitySalienceFeatureExtractor();
    final int featureVectorSize = FeatureSetFactory.createFeatureSet(TrainingSettings.FeatureExtractor.ENTITY_SALIENCE).getFeatureVectorSize();

    JavaRDD<TrainingInstance> trainingInstances =
            documents.flatMap(s -> {
                TOTAL_DOCS.add(1);
                return fe.getTrainingInstances(s.getJCas(),
                        trainingSettings.getFeatureExtractor(),
                        trainingSettings.getPositiveInstanceScalingFactor());
            });

    StructType schema = new StructType(new StructField[]{
            new StructField("docId", DataTypes.StringType, false, Metadata.empty() ),
            new StructField("entityId", DataTypes.StringType, false, Metadata.empty() ),
            new StructField("label", DataTypes.DoubleType, false, Metadata.empty() ),
            new StructField("features", new VectorUDT(), false, Metadata.empty())
    });

    JavaRDD<Row> withFeatures = trainingInstances.map(ti -> {
        if (ti.getLabel() == 1.0) {
            SALIENT_ENTITY_INSTANCES.add(1);
        } else {
            NON_SALIENT_ENTITY_INSTANCES.add(1);
        }
        Vector vei = FeatureValueInstanceUtils.convertToSparkMLVector(ti, featureVectorSize);
        return RowFactory.create(ti.getDocId(), ti.getEntityId(), ti.getLabel(), vei);
    });

    return sqlContext.createDataFrame(withFeatures, schema);
}
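
Because the returned DataFrame follows the usual Spark ML (label, features) column convention, it could be handed straight to an estimator. A hedged sketch of such a training step (the choice of org.apache.spark.ml.classification.LogisticRegression and the variable names extractor, jsc, documents and sqlContext are illustrative, not taken from the original project):

    // Hypothetical training step on the extracted DataFrame
    DataFrame trainingData = extractor.extract(jsc, documents, sqlContext);
    LogisticRegressionModel model = new LogisticRegression()
            .setLabelCol("label")
            .setFeaturesCol("features")
            .fit(trainingData);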
 
Example 11
Source File: IfTest.java    From BigDataPlatform with GNU General Public License v3.0
public static void main(String[] args) {
	SparkConf conf = new SparkConf()
			.setMaster("local") 
			.setAppName("IfTest");
	JavaSparkContext sc = new JavaSparkContext(conf);
	SQLContext sqlContext = new SQLContext(sc.sc());
	
	List<Integer> grades = Arrays.asList(85, 90, 60, 73);
	JavaRDD<Integer> gradesRDD = sc.parallelize(grades);
	JavaRDD<Row> gradeRowsRDD = gradesRDD.map(new Function<Integer, Row>() {

		private static final long serialVersionUID = 1L;

		@Override
		public Row call(Integer grade) throws Exception {
			return RowFactory.create(grade);
		}
		
	});
	
	StructType schema = DataTypes.createStructType(Arrays.asList(
			DataTypes.createStructField("grade", DataTypes.IntegerType, true)));
	Dataset<Row> gradesDF = sqlContext.createDataFrame(gradeRowsRDD, schema);
	gradesDF.registerTempTable("grades");

	Dataset<Row> gradeLevelDF = sqlContext.sql(
			"SELECT IF(grade>=80,'GOOD','BAD') gradeLevel "  
			+ "FROM grades");
	
	gradeLevelDF.show();
	
	sc.close(); 
}
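
The same IF logic can be expressed with the DataFrame API instead of a SQL string; a brief alternative sketch (not part of the original example), using the static helpers from org.apache.spark.sql.functions:

	Dataset<Row> gradeLevelDF2 = gradesDF.select(
			functions.when(functions.col("grade").geq(80), "GOOD")
					.otherwise("BAD").alias("gradeLevel"));
	gradeLevelDF2.show();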
 
Example 12
Source File: CaseWhenTest.java    From BigDataPlatform with GNU General Public License v3.0
public static void main(String[] args) {
	SparkConf conf = new SparkConf()
			.setMaster("local") 
			.setAppName("CaseWhenTest");
	JavaSparkContext sc = new JavaSparkContext(conf);
	SQLContext sqlContext = new SQLContext(sc.sc());
	
	List<Integer> grades = Arrays.asList(85, 90, 60, 73);
	JavaRDD<Integer> gradesRDD = sc.parallelize(grades);
	JavaRDD<Row> gradeRowsRDD = gradesRDD.map(new Function<Integer, Row>() {

		private static final long serialVersionUID = 1L;

		@Override
		public Row call(Integer grade) throws Exception {
			return RowFactory.create(grade);
		}
		
	});
	
	StructType schema = DataTypes.createStructType(Arrays.asList(
			DataTypes.createStructField("grade", DataTypes.IntegerType, true)));
	Dataset<Row> gradesDF = sqlContext.createDataFrame(gradeRowsRDD, schema);
	gradesDF.registerTempTable("grades");

	Dataset<Row>  gradeLevelDF = sqlContext.sql(
			"SELECT CASE "
				+ "WHEN grade>=90 THEN 'A' "
				+ "WHEN grade>=80 THEN 'B' "
				+ "WHEN grade>=70 THEN 'C' "
				+ "WHEN grade>=60 THEN 'D' "
				+ "ELSE 'E' "
				+ "END gradeLevel "
			+ "FROM grades");
	
	gradeLevelDF.show();
	
	sc.close(); 
}
 
Example 13
Source File: AreaTop3ProductSpark.java    From BigDataPlatform with GNU General Public License v3.0
/**
 * Generates the temporary table of basic clicked-product information.
 * @param sqlContext
 * @param cityid2clickActionRDD
 * @param cityid2cityInfoRDD
 */
private static void generateTempClickProductBasicTable(
		SQLContext sqlContext,
		JavaPairRDD<Long, Row> cityid2clickActionRDD,
		JavaPairRDD<Long, Row> cityid2cityInfoRDD) {
	// Join the click action data with the city data
	JavaPairRDD<Long, Tuple2<Row, Row>> joinedRDD =
			cityid2clickActionRDD.join(cityid2cityInfoRDD);
	
	// Convert the JavaPairRDD above into a JavaRDD<Row> (needed before it can be turned into a Dataset<Row>)
	JavaRDD<Row> mappedRDD = joinedRDD.map(
			
			new Function<Tuple2<Long,Tuple2<Row,Row>>, Row>() {

				private static final long serialVersionUID = 1L;

				@Override
				public Row call(Tuple2<Long, Tuple2<Row, Row>> tuple)
						throws Exception {
					Long cityid = tuple._1;
					Row clickAction = tuple._2._1;
					Row cityInfo = tuple._2._2;
					
					Long productid = clickAction.getLong(1);
					String cityName = cityInfo.getString(1);
					String area = cityInfo.getString(2);
					
					return RowFactory.create(cityid, cityName, area, productid);  
				}
				
			});
	
	// Now that the data is a JavaRDD<Row>, it can be converted into a Dataset<Row>
	List<StructField> structFields = new ArrayList<StructField>();
	structFields.add(DataTypes.createStructField("city_id", DataTypes.LongType, true));
	structFields.add(DataTypes.createStructField("city_name", DataTypes.StringType, true));
	structFields.add(DataTypes.createStructField("area", DataTypes.StringType, true));
	structFields.add(DataTypes.createStructField("product_id", DataTypes.LongType, true));
	
	// 1 Beijing
	// 2 Shanghai
	// 1 Beijing
	// group by area, product_id
	// 1: Beijing, 2: Shanghai
	
	// Two functions:
	// UDF: concat2(), concatenates two fields using a specified separator
	// UDAF: group_concat_distinct(), concatenates the distinct values of a field within a group, separated by commas
	
	StructType schema = DataTypes.createStructType(structFields);

	Dataset<Row> df = sqlContext.createDataFrame(mappedRDD, schema);
	System.out.println("tmp_click_product_basic: " + df.count());  
	
	// Register the Dataset<Row> as a temporary table (tmp_click_product_basic)
	df.registerTempTable("tmp_click_product_basic");  
}
 
Example 14
Source File: RDD2DataFrameReflection.java    From SparkDemo with MIT License
public static void main(String[] args) {
	JavaSparkContext sc = SparkUtils.getLocalSparkContext(RDD2DataFrameReflection.class);

	SQLContext sqlContext = new SQLContext(sc);

	JavaRDD<String> lineRDD = sc.textFile(Constant.LOCAL_FILE_PREX +"/data/resources/people.txt");

	JavaRDD<Row> rowsRDD = lineRDD.map(new Function<String, Row>() {

		@Override
		public Row call(String line) throws Exception {
			String[] lineSplited = line.split(",");

			return RowFactory.create(lineSplited[0], Integer.valueOf(lineSplited[1]));
		}
	});

	// Dynamically construct the metadata (schema)
	// If the columns are not known in advance, they need to be loaded from a database or a configuration file
	List<StructField> fields = new ArrayList<StructField>();
	fields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
	fields.add(DataTypes.createStructField("age", DataTypes.IntegerType, true));

	StructType schema = DataTypes.createStructType(fields);

	// Create a DataFrame from the row data and the schema, then register it as a temporary table
	// Since Spark 2.0, DataFrame and Dataset are unified into a single, higher-level Dataset with two API flavors:
	// 1. Untyped: Dataset[Row] is a collection of generic Row objects; DataFrame is its alias
	// 2. Strongly typed: Dataset[T] is a collection of typed objects, such as classes defined in Scala or Java
	Dataset<Row> dataset = sqlContext.createDataFrame(rowsRDD, schema);
	dataset.registerTempTable("person");

	Dataset<Row> personDataSet = sqlContext.sql("select * from person");

	List<Row> list = personDataSet.javaRDD().collect();

	// one row per record
	for (Row r : list) {
		System.out.println(r);
	}

	sc.close();
}
 
Example 15
Source File: SQLQueryBAM.java    From ViraPipe with MIT License
public static void main(String[] args) throws IOException {
  SparkConf conf = new SparkConf().setAppName("SQLQueryBAM");

  JavaSparkContext sc = new JavaSparkContext(conf);
  SQLContext sqlContext = new HiveContext(sc.sc());

  Options options = new Options();
  Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." );
  Option queryOpt = new Option( "query", true, "SQL query string." );
  Option baminOpt = new Option( "in", true, "" );

  options.addOption( opOpt );
  options.addOption( queryOpt );
  options.addOption( baminOpt );
  CommandLineParser parser = new BasicParser();
  CommandLine cmd = null;
  try {
    cmd = parser.parse( options, args );

  }
  catch( ParseException exp ) {
    System.err.println( "Parsing failed.  Reason: " + exp.getMessage() );
  }

  String bwaOutDir = (cmd.hasOption("out")==true)? cmd.getOptionValue("out"):null;
  String query = (cmd.hasOption("query")==true)? cmd.getOptionValue("query"):null;
  String bamin = (cmd.hasOption("in")==true)? cmd.getOptionValue("in"):null;

  sc.hadoopConfiguration().setBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, true);

  //Read BAM/SAM from HDFS
  JavaPairRDD<LongWritable, SAMRecordWritable> bamPairRDD = sc.newAPIHadoopFile(bamin, AnySAMInputFormat.class, LongWritable.class, SAMRecordWritable.class, sc.hadoopConfiguration());
  //Map to SAMRecord RDD
  JavaRDD<SAMRecord> samRDD = bamPairRDD.map(v1 -> v1._2().get());
  JavaRDD<MyAlignment> rdd = samRDD.map(bam -> new MyAlignment(bam.getReadName(), bam.getStart(), bam.getReferenceName(), bam.getReadLength(), new String(bam.getReadBases(), StandardCharsets.UTF_8), bam.getCigarString(), bam.getReadUnmappedFlag(), bam.getDuplicateReadFlag()));

  Dataset<Row> samDF = sqlContext.createDataFrame(rdd, MyAlignment.class);
  samDF.registerTempTable(tablename);
  if(query!=null) {

    //Save as parquet file
    Dataset df2 = sqlContext.sql(query);
    df2.show(100,false);

    if(bwaOutDir!=null)
      df2.write().parquet(bwaOutDir);

  }else{
    if(bwaOutDir!=null)
      samDF.write().parquet(bwaOutDir);
  }

  sc.stop();

}
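
This example, like Examples 16, 17 and 19 below, uses the bean-class overload of createDataFrame, which infers the schema by reflection over a JavaBean's getters and setters. A minimal illustration of what such a bean looks like (a simplified stand-in, not the actual MyAlignment class from ViraPipe):

  import java.io.Serializable;

  // Each getter/setter pair becomes a column: readName, start, referenceName.
  public class SimpleAlignment implements Serializable {
    private String readName;
    private int start;
    private String referenceName;

    public SimpleAlignment() { }

    public String getReadName() { return readName; }
    public void setReadName(String readName) { this.readName = readName; }

    public int getStart() { return start; }
    public void setStart(int start) { this.start = start; }

    public String getReferenceName() { return referenceName; }
    public void setReferenceName(String referenceName) { this.referenceName = referenceName; }
  }

A call such as sqlContext.createDataFrame(rdd, SimpleAlignment.class) then derives the column names and types from those bean properties.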
 
Example 16
Source File: SQLQueryFastq.java    From ViraPipe with MIT License
public static void main(String[] args) throws IOException {
  SparkConf conf = new SparkConf().setAppName("SQLQueryFastq");

  JavaSparkContext sc = new JavaSparkContext(conf);
  SQLContext sqlContext = new SQLContext(sc);

  Options options = new Options();

  Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." );
  Option queryOpt = new Option( "query", true, "SQL query string." );
  Option samOpt = new Option( "format", true, "parquet or fastq" );
  Option baminOpt = new Option( "in", true, "" );
  options.addOption( new Option( "tablename", true, "Default sql table name is 'records'"));

  options.addOption( opOpt );
  options.addOption( queryOpt );
  options.addOption( samOpt );
  options.addOption( baminOpt );
  CommandLineParser parser = new BasicParser();
  CommandLine cmd = null;
  try {
    // parse the command line arguments
    cmd = parser.parse( options, args );

  }
  catch( ParseException exp ) {
    // oops, something went wrong
    System.err.println( "Parsing failed.  Reason: " + exp.getMessage() );
  }

  String outDir = (cmd.hasOption("out")==true)? cmd.getOptionValue("out"):null;
  String query = (cmd.hasOption("query")==true)? cmd.getOptionValue("query"):null;
  String format = (cmd.hasOption("format")==true)? cmd.getOptionValue("format"):"fastq";
  String in = (cmd.hasOption("in")==true)? cmd.getOptionValue("in"):null;
  tablename = (cmd.hasOption("tablename")==true)? cmd.getOptionValue("tablename"):"records";

  sc.hadoopConfiguration().setBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, true);

  JavaPairRDD<Text, SequencedFragment> fastqRDD = sc.newAPIHadoopFile(in, FastqInputFormat.class, Text.class, SequencedFragment.class, sc.hadoopConfiguration());

  JavaRDD<MyRead> rdd = fastqRDD.map(record -> {
    MyRead read = new MyRead();
    read.setKey(record._1.toString());
    read.setSequence(record._2.getSequence().toString());
    read.setRead(record._2.getRead());
    read.setQuality(record._2.getQuality().toString());

    read.setTile(record._2.getTile());
    read.setXpos(record._2.getXpos());
    read.setYpos(record._2.getYpos());
    read.setRunNumber(record._2.getRunNumber());
    read.setInstrument(record._2.getInstrument());
    read.setFlowcellId(record._2.getFlowcellId());
    read.setLane(record._2.getLane());
    read.setControlNumber(record._2.getControlNumber());
    read.setFilterPassed(record._2.getFilterPassed());

    return read;
  });

  Dataset df = sqlContext.createDataFrame(rdd, MyRead.class);
  df.registerTempTable(tablename);
  // e.g. count duplicates: "SELECT count(DISTINCT(sequence)) FROM records"
  //"SELECT key,LEN(sequence) as l FROM records where l<100;"
  if(query!=null) {

    //JavaRDD<MyAlignment> rdd = samRDD.map(bam -> new MyAlignment(bam.getReadName(), bam.getStart(), bam.getReferenceName(), bam.getReadLength(), new String(bam.getReadBases(), StandardCharsets.UTF_8), bam.getCigarString(), bam.getReadUnmappedFlag(), bam.getDuplicateReadFlag(), bam));
    //Save as parquet file
    Dataset<Row> resultDF = sqlContext.sql(query);
    resultDF.show(100, false);

    if(outDir!=null){
      if(format.equals("fastq")){
        JavaPairRDD<Text, SequencedFragment> resultRDD = dfToFastqRDD(resultDF);
        resultRDD.saveAsNewAPIHadoopFile(outDir, Text.class, SequencedFragment.class, FastqOutputFormat.class, sc.hadoopConfiguration());
      }
      else
        resultDF.write().parquet(outDir);
    }
  }
  sc.stop();

}
 
Example 17
Source File: JavaALSExampleByMl.java    From Spark_ALS with MIT License
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaALSExample").setMaster("local");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(jsc);

    JavaRDD<Rating> ratingsRDD = jsc.textFile("data/sample_movielens_ratings.txt")
            .map(new Function<String, Rating>() {
                public Rating call(String str) {
                    return Rating.parseRating(str);
                }
            });
    Dataset<Row> ratings = sqlContext.createDataFrame(ratingsRDD, Rating.class);
    Dataset<Row>[] splits = ratings.randomSplit(new double[]{0.8, 0.2}); // Split the data: 80% for training, the rest for testing.
    Dataset<Row> training = splits[0];
    Dataset<Row> test = splits[1];

    // Build the recommendation model using ALS on the training data
    ALS als = new ALS().setMaxIter(5) // set the number of iterations
            .setRegParam(0.01) // regularization parameter to smooth the iterations; 0.1 seems to give a lower error rate on this dataset
            .setUserCol("userId").setItemCol("movieId")
            .setRatingCol("rating");
    ALSModel model = als.fit(training); // run the algorithm to train the model


    Dataset<Row> itemFactors = model.itemFactors();
    itemFactors.show(1500);
    Dataset<Row> userFactors = model.userFactors();
    userFactors.show();

    // Evaluate the model by computing the RMSE on the test data
    Dataset<Row> rawPredictions = model.transform(test); // make predictions on the test data
    Dataset<Row> predictions = rawPredictions
            .withColumn("rating", rawPredictions.col("rating").cast(DataTypes.DoubleType))
            .withColumn("prediction", rawPredictions.col("prediction").cast(DataTypes.DoubleType));

    RegressionEvaluator evaluator = new RegressionEvaluator().setMetricName("rmse").setLabelCol("rating")
            .setPredictionCol("prediction");
    Double rmse = evaluator.evaluate(predictions);
    log.info("Root-mean-square error = {} ", rmse);

    jsc.stop();
}
 
Example 18
Source File: EntitySalienceAnnotatorAndFeatureExtractorSpark.java    From ambiverse-nlu with Apache License 2.0
/**
 * Extract a DataFrame ready for training or testing.
 * @param jsc
 * @param documents
 * @param sqlContext
 * @return
 * @throws ResourceInitializationException
 */
public DataFrame extract(JavaSparkContext jsc, JavaRDD<SCAS> documents, SQLContext sqlContext) throws ResourceInitializationException {
    Accumulator<Integer> TOTAL_DOCS = jsc.accumulator(0, "TOTAL_DOCS");
    Accumulator<Integer> SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "SALIENT_ENTITY_INSTANCES");
    Accumulator<Integer> NON_SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "NON_SALIENT_ENTITY_INSTANCES");

    TrainingSettings trainingSettings = getTrainingSettings();

    final SparkSerializableAnalysisEngine ae = EntitySalienceFactory.createEntitySalienceEntityAnnotator(trainingSettings.getEntitySalienceEntityAnnotator());
    FeatureExtractor fe = new NYTEntitySalienceFeatureExtractor();
    final int featureVectorSize = FeatureSetFactory.createFeatureSet(TrainingSettings.FeatureExtractor.ENTITY_SALIENCE).getFeatureVectorSize();

    JavaRDD<TrainingInstance> trainingInstances =
            documents
                    .map(s -> {
                        TOTAL_DOCS.add(1);
                        Logger tmpLogger = LoggerFactory.getLogger(EntitySalienceFeatureExtractorSpark.class);
                        String docId = JCasUtil.selectSingle(s.getJCas(), DocumentMetaData.class).getDocumentId();
                        tmpLogger.info("Processing document {}.", docId);
                        //Before processing the document through the Disambiguation Pipeline, add the AIDA settings
                        // in each document.
                        SparkUimaUtils.addSettingsToJCas(s.getJCas(),
                                trainingSettings.getDocumentCoherent(),
                                trainingSettings.getDocumentConfidenceThreshold());
                        return ae.process(s);
                    })
                    .flatMap(s -> fe.getTrainingInstances(s.getJCas(),
                            trainingSettings.getFeatureExtractor(),
                            trainingSettings.getPositiveInstanceScalingFactor()));

    StructType schema = new StructType(new StructField[]{
            new StructField("docId", DataTypes.StringType, false, Metadata.empty() ),
            new StructField("entity", DataTypes.StringType, false, Metadata.empty() ),
            new StructField("label", DataTypes.DoubleType, false, Metadata.empty() ),
            new StructField("features", new VectorUDT(), false, Metadata.empty())
    });

    JavaRDD<Row> withFeatures = trainingInstances.map(ti -> {
        if (ti.getLabel() == 1.0) {
            SALIENT_ENTITY_INSTANCES.add(1);
        } else {
            NON_SALIENT_ENTITY_INSTANCES.add(1);
        }
        Vector vei = FeatureValueInstanceUtils.convertToSparkMLVector(ti, featureVectorSize);
        return RowFactory.create(ti.getDocId(), ti.getEntityId(), ti.getLabel(), vei);
    });

    return sqlContext.createDataFrame(withFeatures, schema);
}
 
Example 19
Source File: ChronixRDD.java    From chronix.spark with Apache License 2.0
/**
 * Transformation: Derives a Spark SQL DataFrame from a ChronixRDD.
 * <p>
 * The DataFrame contains the following columns:
 * <ul>
 * <li>for each dimension (@see: de.qaware.chronix.storage.solr.timeseries.metric.MetricDimension) one column</li>
 * <li>one column for the observations' timestamp</li>
 * <li>one column for the measurement value at the observation timestamp</li>
 * </ul>
 *
 * @param sqlContext an open SQLContext
 * @return a DataFrame containing the ChronixRDD data
 */
public Dataset<Row> toDataFrame(SQLContext sqlContext) {
    return sqlContext.createDataFrame(
            this.toObservations(),
            MetricObservation.class
    );
}