Java Code Examples for org.apache.spark.sql.SQLContext

The following examples show how to use org.apache.spark.sql.SQLContext. They are extracted from open source projects; where available, the originating project, source file, and license are noted above each example.
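Before the individual examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the pattern most of them share: build an SQLContext on top of a SparkContext, load a DataFrame, register it as a temporary view, and query it with SQL. It assumes Spark 2.x and a placeholder people.json file.

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;

public class SQLContextQuickstart {
  public static void main(String[] args) {
    // Local setup; on a cluster the master would be configured differently.
    SparkConf conf = new SparkConf().setAppName("SQLContextQuickstart").setMaster("local");
    JavaSparkContext sc = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(sc);

    // Load a JSON file into a Dataset<Row>; "people.json" is a placeholder path.
    Dataset<Row> people = sqlContext.read().json("people.json");

    // Expose the data as a temporary view and query it with SQL.
    people.createOrReplaceTempView("people");
    Dataset<Row> adults = sqlContext.sql("SELECT name FROM people WHERE age >= 18");
    adults.show();

    sc.close();
  }
}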
Example 1
Source Project: hudi   Source File: HoodieClientTestUtils.java    License: Apache License 2.0
public static Dataset<Row> readCommit(String basePath, SQLContext sqlContext, HoodieTimeline commitTimeline,
                                      String instantTime) {
  HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, instantTime);
  if (!commitTimeline.containsInstant(commitInstant)) {
    throw new HoodieException("No commit exists at " + instantTime);
  }
  try {
    HashMap<String, String> paths =
        getLatestFileIDsToFullPath(basePath, commitTimeline, Arrays.asList(commitInstant));
    LOG.info("Path :" + paths.values());
    return sqlContext.read().parquet(paths.values().toArray(new String[paths.size()]))
        .filter(String.format("%s ='%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, instantTime));
  } catch (Exception e) {
    throw new HoodieException("Error reading commit " + instantTime, e);
  }
}
 
Example 2
Source Project: deep-spark   Source File: DeepSparkContextTest.java    License: Apache License 2.0
@Test
public void textFileS3Test() throws Exception {
    deepSparkContext = createDeepSparkContext();
    DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
    SQLContext sqlContext = mock(SQLContext.class);
    Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
    Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);
    RDD<Cells> result = mock(RDD.class);

    ExtractorConfig<Cells> config = createS3DeepJobConfig();
    PowerMockito.doReturn(result).when(deepSparkContextSpy).createS3RDD(config);
    deepSparkContextSpy.textFile(config);

    verify(deepSparkContextSpy, times(1)).createS3RDD(config);

}
 
Example 3
Source Project: deep-spark   Source File: DeepSparkContextTest.java    License: Apache License 2.0
@Test
public void createHDFSRDDTest() throws Exception {

    deepSparkContext = createDeepSparkContext();
    DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
    SQLContext sqlContext = mock(SQLContext.class);
    Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
    Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);

    RDD<String> rdd = mock(RDD.class);
    JavaRDD<String> javaRdd = mock(JavaRDD.class);
    when(deepSparkContextSpy.sc().textFile(anyString(), anyInt())).thenReturn(rdd);
    doReturn(javaRdd).when(deepSparkContextSpy).textFile(anyString());
    when(rdd.toJavaRDD()).thenReturn(javaRdd);
    when(rdd.toJavaRDD().map(any(Function.class))).thenReturn(singleRdd);

    ExtractorConfig<Cells> config = createHDFSDeepJobConfig();

    RDD rddReturn = deepSparkContextSpy.createHDFSRDD(config);

    verify(deepSparkContextSpy.sc(), times(1)).textFile(anyString(), anyInt());

    verify(javaRdd, times(1)).map(any(Function.class));

}
 
Example 4
@Override
public BaseRelation createRelation(SQLContext arg0, Map<String,
    String> arg1) {
  log.debug("-> createRelation()");

  java.util.Map<String, String> javaMap = scala.collection.JavaConverters
      .mapAsJavaMapConverter(arg1).asJava();

  SubStringCounterRelation br = new SubStringCounterRelation();
  br.setSqlContext(arg0);

  for (java.util.Map.Entry<String, String> entry : javaMap.entrySet()) {
    String key = entry.getKey();
    String value = entry.getValue();
    log.debug("[{}] --> [{}]", key, value);
    if (key.compareTo(K.PATH) == 0) {
      br.setFilename(value);
    } else if (key.startsWith(K.COUNT)) {
      br.addCriteria(value);
    }
  }

  return br;
}
 
Example 5
private static BigQuerySQLContext createBigQuerySQLContext(String[] args) {
  String projectId = args[0];
  Preconditions.checkArgument(!Strings.isNullOrEmpty(projectId),
      "GCP project ID must not be empty");
  String gcsBucket = args[1];
  Preconditions.checkArgument(!Strings.isNullOrEmpty(gcsBucket),
      "GCS bucket must not be empty");

  String serviceAccountJsonKeyFilePath = System.getenv(APPLICATION_CREDENTIALS_ENV);
  Preconditions.checkArgument(!Strings.isNullOrEmpty(serviceAccountJsonKeyFilePath),
      APPLICATION_CREDENTIALS_ENV + " must be set");

  SQLContext sqlContext = SQLContext.getOrCreate(SparkContext.getOrCreate());
  BigQuerySQLContext bigQuerySQLContext = new BigQuerySQLContext(sqlContext);
  bigQuerySQLContext.setBigQueryProjectId(projectId);
  bigQuerySQLContext.setBigQueryGcsBucket(gcsBucket);
  bigQuerySQLContext.setGcpJsonKeyFile(serviceAccountJsonKeyFilePath);

  return bigQuerySQLContext;
}
 
Example 6
private NeedingHelpGoPackageFinder(
    String projectId,
    String bigQueryDataset,
    String gcsBucket,
    boolean useSampleTables) {
  Preconditions.checkArgument(!Strings.isNullOrEmpty(projectId),
      "GCP project ID must not be empty");
  Preconditions.checkArgument(!Strings.isNullOrEmpty(bigQueryDataset),
      "BigQuery dataset name must not be empty");
  Preconditions.checkArgument(!Strings.isNullOrEmpty(gcsBucket),
      "GCS bucket must not be empty");

  this.projectId = projectId;
  this.bigQueryDataset = bigQueryDataset;

  this.sqlContext = SQLContext.getOrCreate(SparkContext.getOrCreate());
  this.bigQuerySQLContext = new BigQuerySQLContext(this.sqlContext);
  this.bigQuerySQLContext.setBigQueryProjectId(projectId);
  this.bigQuerySQLContext.setBigQueryGcsBucket(gcsBucket);

  this.useSampleTables = useSampleTables;
}
 
Example 7
Source Project: toolbox   Source File: DataStreamLoaderExample.java    License: Apache License 2.0
public static void main(String[] args) throws Exception {
    SparkConf conf = new SparkConf().setAppName("SLink!").setMaster("local");
    SparkContext sc = new SparkContext(conf);
    SQLContext sqlContext = new SQLContext(sc);

    // Path to the dataset
    String path = "datasets/simulated/WI_samples.json";

    // Create an AMIDST object for managing the data
    DataSpark dataSpark = DataSparkLoader.open(sqlContext, path);

    // Print all the instances in the dataset
    dataSpark.collectDataStream()
            .forEach(dataInstance -> System.out.println(dataInstance));
}
 
Example 8
Source Project: hudi   Source File: HoodieSnapshotExporter.java    License: Apache License 2.0
private void exportAsNonHudi(JavaSparkContext jsc, Config cfg, List<String> partitions, String latestCommitTimestamp) {
  Partitioner defaultPartitioner = dataset -> {
    Dataset<Row> hoodieDroppedDataset = dataset.drop(JavaConversions.asScalaIterator(HoodieRecord.HOODIE_META_COLUMNS.iterator()).toSeq());
    return StringUtils.isNullOrEmpty(cfg.outputPartitionField)
        ? hoodieDroppedDataset.write()
        : hoodieDroppedDataset.repartition(new Column(cfg.outputPartitionField)).write().partitionBy(cfg.outputPartitionField);
  };

  Partitioner partitioner = StringUtils.isNullOrEmpty(cfg.outputPartitioner)
      ? defaultPartitioner
      : ReflectionUtils.loadClass(cfg.outputPartitioner);

  final BaseFileOnlyView fsView = getBaseFileOnlyView(jsc, cfg);
  Iterator<String> exportingFilePaths = jsc
      .parallelize(partitions, partitions.size())
      .flatMap(partition -> fsView
          .getLatestBaseFilesBeforeOrOn(partition, latestCommitTimestamp)
          .map(HoodieBaseFile::getPath).iterator())
      .toLocalIterator();

  Dataset<Row> sourceDataset = new SQLContext(jsc).read().parquet(JavaConversions.asScalaIterator(exportingFilePaths).toSeq());
  partitioner.partition(sourceDataset)
      .format(cfg.outputFormat)
      .mode(SaveMode.Overwrite)
      .save(cfg.targetOutputPath);
}
 
Example 9
Source Project: deep-spark   Source File: DeepSparkContextTest.java    License: Apache License 2.0
@Test(expected = UnsupportedOperationException.class)
public void createJavaSchemaFromEmptyRDDTest() throws Exception {
    deepSparkContext = createDeepSparkContext();
    DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
    SQLContext sqlContext = mock(SQLContext.class);
    ExtractorConfig config = createDeepJobConfig();
    Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
    Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);
    PowerMockito.doReturn(singleRdd).when(deepSparkContextSpy).createJavaRDD(config);
    JavaRDD<Row> rowRDD = mock(JavaRDD.class);
    mockStatic(DeepSparkContext.class);
    when(DeepSparkContext.createJavaRowRDD(singleRdd)).thenReturn(rowRDD);
    when(singleRdd.first()).thenThrow(new UnsupportedOperationException());

    deepSparkContextSpy.createJavaSchemaRDD(config);
}
 
Example 10
Source Project: hudi   Source File: HoodieClientTestUtils.java    License: Apache License 2.0
/**
 * Obtain all new data written into the Hoodie table since the given timestamp.
 */
public static Dataset<Row> readSince(String basePath, SQLContext sqlContext,
                                     HoodieTimeline commitTimeline, String lastCommitTime) {
  List<HoodieInstant> commitsToReturn =
      commitTimeline.findInstantsAfter(lastCommitTime, Integer.MAX_VALUE).getInstants().collect(Collectors.toList());
  try {
    // Go over the commit metadata, and obtain the new files that need to be read.
    HashMap<String, String> fileIdToFullPath = getLatestFileIDsToFullPath(basePath, commitTimeline, commitsToReturn);
    String[] paths = fileIdToFullPath.values().toArray(new String[fileIdToFullPath.size()]);
    Dataset<Row> rows = null;
    if (paths[0].endsWith(HoodieFileFormat.PARQUET.getFileExtension())) {
      rows = sqlContext.read().parquet(paths);
    }

    return rows.filter(String.format("%s >'%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTime));
  } catch (IOException e) {
    throw new HoodieException("Error pulling data incrementally from commitTimestamp :" + lastCommitTime, e);
  }
}
 
Example 11
Source Project: vn.vitk   Source File: Tagger.java    License: GNU General Public License v3.0
/**
 * Tags a list of sequences and returns a list of tag sequences.
 * @param sentences
 * @return a list of tagged sequences.
 */
public List<String> tag(List<String> sentences) {
	List<Row> rows = new LinkedList<Row>();
	for (String sentence : sentences) {
		rows.add(RowFactory.create(sentence));
	}
	StructType schema = new StructType(new StructField[]{
		new StructField("sentence", DataTypes.StringType, false, Metadata.empty())	
	});
	SQLContext sqlContext = new SQLContext(jsc);
	DataFrame input = sqlContext.createDataFrame(rows, schema);
	if (cmmModel != null) {
		DataFrame output = cmmModel.transform(input).repartition(1);
		return output.javaRDD().map(new RowToStringFunction(1)).collect();
	} else {
		System.err.println("Tagging model is null. You need to create or load a model first.");
		return null;
	}
}
 
Example 12
Source Project: vn.vitk   Source File: NGramBuilder.java    License: GNU General Public License v3.0
/**
 * Creates a n-gram data frame from text lines.
 * @param lines
 * @return a n-gram data frame.
 */
DataFrame createNGramDataFrame(JavaRDD<String> lines) {
	JavaRDD<Row> rows = lines.map(new Function<String, Row>(){
		private static final long serialVersionUID = -4332903997027358601L;
		
		@Override
		public Row call(String line) throws Exception {
			return RowFactory.create(Arrays.asList(line.split("\\s+")));
		}
	});
	StructType schema = new StructType(new StructField[] {
			new StructField("words",
					DataTypes.createArrayType(DataTypes.StringType), false,
					Metadata.empty()) });
	DataFrame wordDF = new SQLContext(jsc).createDataFrame(rows, schema);
	// build a bigram language model
	NGram transformer = new NGram().setInputCol("words")
			.setOutputCol("ngrams").setN(2);
	DataFrame ngramDF = transformer.transform(wordDF);
	ngramDF.show(10, false);
	return ngramDF;
}
 
Example 13
/**
 * Query click-action data within the specified date range.
 * @param sqlContext
 * @param startDate start date
 * @param endDate end date
 * @return click-action data
 */
private static JavaPairRDD<Long, Row> getcityid2ClickActionRDDByDate(
		SQLContext sqlContext, String startDate, String endDate) {
	// Query user visit-action data from user_visit_action.
	// First constraint: click_product_id must not be null, which marks the row as a click action.
	// Second constraint: the row must fall within the user-specified date range.
	
	String sql = 
			"SELECT "
				+ "city_id,"
				+ "click_product_id product_id "
			+ "FROM user_visit_action "
			+ "WHERE click_product_id IS NOT NULL "			
			+ "AND day>='" + startDate + "' "
			+ "AND day<='" + endDate + "'";
	
	Dataset<Row> clickActionDF = sqlContext.sql(sql);

	JavaRDD<Row> clickActionRDD = clickActionDF.javaRDD();

	JavaPairRDD<Long, Row> cityid2clickActionRDD = clickActionRDD.mapToPair(
			
			new PairFunction<Row, Long, Row>() {

				private static final long serialVersionUID = 1L;

				@Override
				public Tuple2<Long, Row> call(Row row) throws Exception {
					Long cityid = row.getLong(0);
					return new Tuple2<Long, Row>(cityid, row);
				}
				
			});
	
	return cityid2clickActionRDD;
}
 
Example 14
Source Project: zeppelin   Source File: SparkKotlinReceiver.java    License: Apache License 2.0
public SparkKotlinReceiver(Object spark,
                           JavaSparkContext sc,
                           SQLContext sqlContext,
                           ZeppelinContext z) {
  this._sparkObject = spark;
  this.sc = sc;
  this.sqlContext = sqlContext;
  this.z = z;
}
 
Example 15
Source Project: toolbox   Source File: DataStreamWriterExample.java    License: Apache License 2.0
public static void main(String[] args) throws Exception {

	// Set up Spark
	SparkConf conf = new SparkConf().setAppName("SparkLink!").setMaster("local");
	JavaSparkContext jsc = new JavaSparkContext(conf);
	SQLContext sqlContext = new SQLContext(jsc);

	// Generate random data
	int seed = 1234;
	int nInstances = 1000;
	int nDiscreteAtts = 3;
	int nContinuousAttributes = 2;

	DataSpark data = DataSetGenerator.generate(jsc, seed, nInstances, nDiscreteAtts, nContinuousAttributes);

	// Save it as a JSON and a Parquet file
	DataSparkWriter.writeDataToFolder(data, "datasets/simulated/randomData.json", sqlContext);
	DataSparkWriter.writeDataToFolder(data, "datasets/simulated/randomData.parquet", sqlContext);
}
 
Example 16
/**
 * Get an SQLContext.
 * In a local test environment, return a plain SQLContext;
 * in a production environment, return a HiveContext instead.
 * @param sc SparkContext
 * @return SQLContext
 */
private static SQLContext getSQLContext(SparkContext sc) {
	boolean local = ConfigurationManager.getBoolean(Constants.SPARK_LOCAL);
	if(local) {
		return new SQLContext(sc);
	} else {
		return new HiveContext(sc);
	}
}
 
Example 17
/**
 * Generate mock data (only generated in local mode).
 * @param sc
 * @param sqlContext
 */
private static void mockData(JavaSparkContext sc, SQLContext sqlContext) {
	boolean local = ConfigurationManager.getBoolean(Constants.SPARK_LOCAL);
	if(local) {
		MockData.mock(sc, sqlContext);
	}
}
 
Example 18
Source Project: BigDataPlatform   Source File: SparkUtils.java    License: GNU General Public License v3.0
/**
 * Generate mock data.
 * Mock data is generated only if the spark.local configuration is set to true; otherwise nothing is generated.
 * @param sc
 * @param sqlContext
 */
public static void mockData(JavaSparkContext sc, SQLContext sqlContext) {
	boolean local = ConfigurationManager.getBoolean(Constants.SPARK_LOCAL);
	if(local) {
		MockData.mock(sc, sqlContext);
	}
}
 
Example 19
Source Project: BigDataPlatform   Source File: CaseWhenTest.java    License: GNU General Public License v3.0
public static void main(String[] args) {
	SparkConf conf = new SparkConf()
			.setMaster("local") 
			.setAppName("CaseWhenTest");
	JavaSparkContext sc = new JavaSparkContext(conf);
	SQLContext sqlContext = new SQLContext(sc.sc());
	
	List<Integer> grades = Arrays.asList(85, 90, 60, 73);
	JavaRDD<Integer> gradesRDD = sc.parallelize(grades);
	JavaRDD<Row> gradeRowsRDD = gradesRDD.map(new Function<Integer, Row>() {

		private static final long serialVersionUID = 1L;

		@Override
		public Row call(Integer grade) throws Exception {
			return RowFactory.create(grade);
		}
		
	});
	
	StructType schema = DataTypes.createStructType(Arrays.asList(
			DataTypes.createStructField("grade", DataTypes.IntegerType, true)));
	Dataset<Row> gradesDF = sqlContext.createDataFrame(gradeRowsRDD, schema);
	gradesDF.registerTempTable("grades");

	Dataset<Row>  gradeLevelDF = sqlContext.sql(
			"SELECT CASE "
				+ "WHEN grade>=90 THEN 'A' "
				+ "WHEN grade>=80 THEN 'B' "
				+ "WHEN grade>=70 THEN 'C' "
				+ "WHEN grade>=60 THEN 'D' "
				+ "ELSE 'E' "
				+ "END gradeLevel "
			+ "FROM grades");
	
	gradeLevelDF.show();
	
	sc.close(); 
}
 
Example 20
/**
 * Extract a DataFrame ready for training or testing.
 * @param jsc
 * @param documents
 * @param sqlContext
 * @return
 * @throws ResourceInitializationException
 */
public DataFrame extract(JavaSparkContext jsc, JavaRDD<SCAS> documents, SQLContext sqlContext) throws ResourceInitializationException {
    Accumulator<Integer> TOTAL_DOCS = jsc.accumulator(0, "TOTAL_DOCS");
    Accumulator<Integer> SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "SALIENT_ENTITY_INSTANCES");
    Accumulator<Integer> NON_SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "NON_SALIENT_ENTITY_INSTANCES");

    TrainingSettings trainingSettings = getTrainingSettings();

    FeatureExtractor fe = new NYTEntitySalienceFeatureExtractor();
    final int featureVectorSize = FeatureSetFactory.createFeatureSet(TrainingSettings.FeatureExtractor.ENTITY_SALIENCE).getFeatureVectorSize();

    JavaRDD<TrainingInstance> trainingInstances =
            documents.flatMap(s -> {
                TOTAL_DOCS.add(1);
                return fe.getTrainingInstances(s.getJCas(),
                        trainingSettings.getFeatureExtractor(),
                        trainingSettings.getPositiveInstanceScalingFactor());
            });

    StructType schema = new StructType(new StructField[]{
            new StructField("docId", DataTypes.StringType, false, Metadata.empty() ),
            new StructField("entityId", DataTypes.StringType, false, Metadata.empty() ),
            new StructField("label", DataTypes.DoubleType, false, Metadata.empty() ),
            new StructField("features", new VectorUDT(), false, Metadata.empty())
    });

    JavaRDD<Row> withFeatures = trainingInstances.map(ti -> {
        if (ti.getLabel() == 1.0) {
            SALIENT_ENTITY_INSTANCES.add(1);
        } else {
            NON_SALIENT_ENTITY_INSTANCES.add(1);
        }
        Vector vei = FeatureValueInstanceUtils.convertToSparkMLVector(ti, featureVectorSize);
        return RowFactory.create(ti.getDocId(), ti.getEntityId(), ti.getLabel(), vei);
    });

    return sqlContext.createDataFrame(withFeatures, schema);
}
 
Example 21
Source Project: iceberg   Source File: TestSparkTableUtil.java    License: Apache License 2.0
@After
public void after() throws IOException {
  // Drop the hive table.
  SQLContext sc = new SQLContext(TestSparkTableUtil.spark);
  sc.sql(String.format("DROP TABLE IF EXISTS %s", qualifiedTableName));

  // Delete the data corresponding to the table.
  tableLocationPath.getFileSystem(CONF).delete(tableLocationPath, true);
}
 
Example 22
Source Project: chronix.spark   Source File: TestPerformanceRegression.java    License: Apache License 2.0
public static void main(String[] args) throws SolrServerException, IOException {

    ChronixSparkLoader loader = new ChronixSparkLoader();

    ChronixSparkContext chronixSparkContext = loader.createChronixSparkContext();
    SQLContext sqlContext = new SQLContext(chronixSparkContext.getSparkContext());

    // BENCHMARK START ...............................
    long start = System.currentTimeMillis();
    for (int i = 0; i < LOOPS; i++) {

        // Load data into ChronixRDD
        ChronixRDD rdd = loader.createChronixRDD(chronixSparkContext);

        // Some actions
        double mean = rdd.mean();
        double approxMean = rdd.approxMean();
        long observationCount = rdd.countObservations();
        double max = rdd.max();
        double min = rdd.min();
        Iterator<MetricTimeSeries> it = rdd.iterator();
        while (it.hasNext()) {
            MetricTimeSeries mts = it.next();
            System.out.print(".");
        }

        // DataFrame operations
        Dataset<MetricObservation> ds = rdd.toObservationsDataset(sqlContext);
        ds.count();
    }
    long stop = System.currentTimeMillis();
    // BENCHMARK STOP ...................................
    System.out.println("\nBenchmark duration: " + (stop - start) + " ms");

    chronixSparkContext.getSparkContext().close();
}
 
Example 23
/**
 * Returns a plain SQLContext when running locally, or a HiveContext when running in a production environment.
 * @param sc
 * @return
 */
public static SQLContext getSQLContext(SparkContext sc)
{
    boolean local= ConfigurationManager.getBoolean(Constants.SPARK_LOCAL);
    if(local)
    {
        return new SQLContext(sc);
    }
    return new HiveContext(sc);
}
 
Example 24
private static void mock(JavaSparkContext context,SQLContext sc)
{
    boolean local= ConfigurationManager.getBoolean(Constants.SPARK_LOCAL);
    if(local)
    {
        MockData.mock(context,sc);
    }

}
 
Example 25
Source Project: toolbox   Source File: DataSparkLoader.java    License: Apache License 2.0
public static DataSpark open(SQLContext sqlContext, String path, String formatFile) throws Exception {

    // Load the data and store it into an object of class DataFrame
    DataFrame df = sqlContext.read().format(formatFile).load(path);

    // Create an AMIDST object for managing the data
    return loadSparkDataFrame(df);
}
 
Example 26
Source Project: lambda-arch   Source File: BatchProcessor.java    License: Apache License 2.0
public static void main(String[] args) throws Exception {
    Properties prop = PropertyFileReader.readPropertyFile("iot-spark.properties");
    String file = prop.getProperty("com.iot.app.hdfs") + "iot-data-parque";
    String[] jars = {prop.getProperty("com.iot.app.jar")};

    JavaSparkContext sparkContext = getSparkContext(prop, jars);
    SQLContext sqlContext = new SQLContext(sparkContext);
    Dataset<Row> dataFrame = getDataFrame(sqlContext, file);
    JavaRDD<IoTData> rdd = dataFrame.javaRDD().map(getRowIoTDataFunction());
    BatchHeatMapProcessor processor = new BatchHeatMapProcessor();
    processor.processHeatMap(rdd);
    sparkContext.close();
    sparkContext.stop();
}
 
Example 27
Source Project: SparkDemo   Source File: SaveModelDemo.java    License: MIT License
public static void main(String[] args) {
	SparkConf conf = new SparkConf().setAppName("SaveModelDemo").setMaster("local");
	JavaSparkContext sc = new JavaSparkContext(conf);
	// Create an SQLContext for reading JSON into a DataFrame
	SQLContext sqlContext = new SQLContext(sc);

	Dataset<Row> dataset = sqlContext.read().format("json").load(Constant.LOCAL_FILE_PREX +"/data/resources/people.json");

	dataset.write().mode(SaveMode.ErrorIfExists).save("tmp/people2.json"); // fail with an error if the target already exists
	dataset.write().mode(SaveMode.Append).save("tmp/people2.json"); // append to the existing data
	dataset.write().mode(SaveMode.Ignore).save("tmp/people2.json"); // silently skip the write if the target exists
	dataset.write().mode(SaveMode.Overwrite).save("tmp/people2.json"); // overwrite the existing data

	sc.close();
}
 
Example 28
Source Project: deep-spark   Source File: DeepSparkContextTest.java    License: Apache License 2.0
@Test
public void textFileHDFSTest() throws Exception {
    deepSparkContext = createDeepSparkContext();
    DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
    SQLContext sqlContext = mock(SQLContext.class);
    Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
    Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);
    RDD<Cells> result = mock(RDD.class);

    ExtractorConfig<Cells> config = createHDFSDeepJobConfig();
    PowerMockito.doReturn(result).when(deepSparkContextSpy).createHDFSRDD(config);
    deepSparkContextSpy.textFile(config);

    verify(deepSparkContextSpy, times(1)).createHDFSRDD(config);
}
 
Example 29
Source Project: rdf2x   Source File: MetadataWriter.java    License: Apache License 2.0
/**
 * @param sc        spark context to be used
 * @param persistor output persistor
 * @param rdfSchema schema storing information about classes and properties
 */
public MetadataWriter(JavaSparkContext sc, Persistor persistor, RdfSchema rdfSchema) {
    this.sql = new SQLContext(sc);
    this.persistor = persistor;
    this.rdfSchema = rdfSchema;
    this.stats = new ArrayList<>();
}
 
Example 30
Source Project: rdf2x   Source File: RelationExtractorTest.java    License: Apache License 2.0
/**
 * Test if expected directed relations are collected from a RDD of Instances
 */
@Test
public void testCollectRelations() {
    SQLContext sql = new SQLContext(jsc());

    RelationExtractor collector = new RelationExtractor(
            new RelationConfig(),
            jsc(),
            new ClassGraph()
    );

    List<Row> rdd = new ArrayList<>();

    // cycle one -> two -> three -> one
    rdd.add(RowFactory.create(0, 1, 1L, 1, 2L));
    rdd.add(RowFactory.create(0, 1, 2L, 1, 3L));
    rdd.add(RowFactory.create(0, 1, 3L, 1, 1L));

    // one -> four, four -> one
    rdd.add(RowFactory.create(0, 2, 4L, 1, 1L));
    rdd.add(RowFactory.create(0, 1, 1L, 2, 4L));

    // five -> one
    rdd.add(RowFactory.create(0, 3, 5L, 1, 1L));

    DataFrame expected = sql.createDataFrame(rdd, new StructType()
            .add("predicateIndex", DataTypes.IntegerType, false)
            .add("fromTypeIndex", DataTypes.IntegerType, false)
            .add("fromID", DataTypes.LongType, false)
            .add("toTypeIndex", DataTypes.IntegerType, false)
            .add("toID", DataTypes.LongType, false)
    );

    // (predicateIndex, fromTypeIndex, instanceID, toTypeIndex, relatedID)
    DataFrame result = collector.extractRelations(getTestRDD());

    assertEquals("Expected relation row schema is collected", expected.schema(), result.schema());
    assertRDDEquals("Expected relation rows are collected", expected.javaRDD(), result.javaRDD());
}