org.apache.spark.sql.SQLContext Java Examples

The following examples show how to use org.apache.spark.sql.SQLContext. Each example is taken from an open-source project; the source file, project, and license are noted above it.
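Before the project examples, here is a minimal, self-contained sketch of the pattern most of them share: wrap a Spark context in an SQLContext, load data into a DataFrame/Dataset, and query it with SQL. This is only an illustrative sketch assuming a Spark 2.x classpath (where SQLContext is still available alongside SparkSession); the class name, application name, file path, view name, and query are placeholders, not taken from any of the projects below.

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;

public class SqlContextQuickStart {
  public static void main(String[] args) {
    // Placeholder app name and local master; adjust for a real cluster.
    SparkConf conf = new SparkConf().setAppName("SqlContextQuickStart").setMaster("local[*]");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // Wrap the JavaSparkContext to get the DataFrame/SQL API.
    SQLContext sqlContext = new SQLContext(jsc);

    // Load a JSON file into a Dataset<Row> (the path is a placeholder).
    Dataset<Row> people = sqlContext.read().json("data/people.json");

    // Register a temporary view and run a SQL query against it.
    people.createOrReplaceTempView("people");
    Dataset<Row> adults = sqlContext.sql("SELECT name FROM people WHERE age >= 18");
    adults.show();

    jsc.close();
  }
}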
Example #1
Source File: HoodieSnapshotExporter.java    From hudi with Apache License 2.0
private void exportAsNonHudi(JavaSparkContext jsc, Config cfg, List<String> partitions, String latestCommitTimestamp) {
  Partitioner defaultPartitioner = dataset -> {
    Dataset<Row> hoodieDroppedDataset = dataset.drop(JavaConversions.asScalaIterator(HoodieRecord.HOODIE_META_COLUMNS.iterator()).toSeq());
    return StringUtils.isNullOrEmpty(cfg.outputPartitionField)
        ? hoodieDroppedDataset.write()
        : hoodieDroppedDataset.repartition(new Column(cfg.outputPartitionField)).write().partitionBy(cfg.outputPartitionField);
  };

  Partitioner partitioner = StringUtils.isNullOrEmpty(cfg.outputPartitioner)
      ? defaultPartitioner
      : ReflectionUtils.loadClass(cfg.outputPartitioner);

  final BaseFileOnlyView fsView = getBaseFileOnlyView(jsc, cfg);
  Iterator<String> exportingFilePaths = jsc
      .parallelize(partitions, partitions.size())
      .flatMap(partition -> fsView
          .getLatestBaseFilesBeforeOrOn(partition, latestCommitTimestamp)
          .map(HoodieBaseFile::getPath).iterator())
      .toLocalIterator();

  Dataset<Row> sourceDataset = new SQLContext(jsc).read().parquet(JavaConversions.asScalaIterator(exportingFilePaths).toSeq());
  partitioner.partition(sourceDataset)
      .format(cfg.outputFormat)
      .mode(SaveMode.Overwrite)
      .save(cfg.targetOutputPath);
}
 
Example #2
Source File: DeepSparkContextTest.java    From deep-spark with Apache License 2.0
@Test
public void createHDFSRDDTest() throws Exception {

    deepSparkContext = createDeepSparkContext();
    DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
    SQLContext sqlContext = mock(SQLContext.class);
    Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
    Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);

    RDD<String> rdd = mock(RDD.class);
    JavaRDD<String> javaRdd = mock(JavaRDD.class);
    when(deepSparkContextSpy.sc().textFile(anyString(), anyInt())).thenReturn(rdd);
    doReturn(javaRdd).when(deepSparkContextSpy).textFile(anyString());
    when(rdd.toJavaRDD()).thenReturn(javaRdd);
    when(rdd.toJavaRDD().map(any(Function.class))).thenReturn(singleRdd);

    ExtractorConfig<Cells> config = createHDFSDeepJobConfig();

    RDD rddReturn = deepSparkContextSpy.createHDFSRDD(config);

    verify(deepSparkContextSpy.sc(), times(1)).textFile(anyString(), anyInt());

    verify(javaRdd, times(1)).map(any(Function.class));

}
 
Example #3
Source File: NGramBuilder.java    From vn.vitk with GNU General Public License v3.0
/**
 * Creates an n-gram data frame from text lines.
 * @param lines lines of text
 * @return an n-gram data frame.
 */
DataFrame createNGramDataFrame(JavaRDD<String> lines) {
	JavaRDD<Row> rows = lines.map(new Function<String, Row>(){
		private static final long serialVersionUID = -4332903997027358601L;
		
		@Override
		public Row call(String line) throws Exception {
			return RowFactory.create(Arrays.asList(line.split("\\s+")));
		}
	});
	StructType schema = new StructType(new StructField[] {
			new StructField("words",
					DataTypes.createArrayType(DataTypes.StringType), false,
					Metadata.empty()) });
	DataFrame wordDF = new SQLContext(jsc).createDataFrame(rows, schema);
	// build a bigram language model
	NGram transformer = new NGram().setInputCol("words")
			.setOutputCol("ngrams").setN(2);
	DataFrame ngramDF = transformer.transform(wordDF);
	ngramDF.show(10, false);
	return ngramDF;
}
 
Example #4
Source File: DeepSparkContextTest.java    From deep-spark with Apache License 2.0
@Test
public void textFileS3Test() throws Exception {
    deepSparkContext = createDeepSparkContext();
    DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
    SQLContext sqlContext = mock(SQLContext.class);
    Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
    Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);
    RDD<Cells> result = mock(RDD.class);

    ExtractorConfig<Cells> config = createS3DeepJobConfig();
    PowerMockito.doReturn(result).when(deepSparkContextSpy).createS3RDD(config);
    deepSparkContextSpy.textFile(config);

    verify(deepSparkContextSpy, times(1)).createS3RDD(config);

}
 
Example #5
Source File: SubStringCounterDataSource.java    From net.jgp.labs.spark with Apache License 2.0
@Override
public BaseRelation createRelation(SQLContext arg0, Map<String,
    String> arg1) {
  log.debug("-> createRelation()");

  java.util.Map<String, String> javaMap = scala.collection.JavaConverters
      .mapAsJavaMapConverter(arg1).asJava();

  SubStringCounterRelation br = new SubStringCounterRelation();
  br.setSqlContext(arg0);

  for (java.util.Map.Entry<String, String> entry : javaMap.entrySet()) {
    String key = entry.getKey();
    String value = entry.getValue();
    log.debug("[{}] --> [{}]", key, value);
    if (key.compareTo(K.PATH) == 0) {
      br.setFilename(value);
    } else if (key.startsWith(K.COUNT)) {
      br.addCriteria(value);
    }
  }

  return br;
}
 
Example #6
Source File: Tagger.java    From vn.vitk with GNU General Public License v3.0
/**
 * Tags a list of sentences and returns a list of tagged sequences.
 * @param sentences the sentences to tag
 * @return a list of tagged sequences.
 */
public List<String> tag(List<String> sentences) {
	List<Row> rows = new LinkedList<Row>();
	for (String sentence : sentences) {
		rows.add(RowFactory.create(sentence));
	}
	StructType schema = new StructType(new StructField[]{
		new StructField("sentence", DataTypes.StringType, false, Metadata.empty())	
	});
	SQLContext sqlContext = new SQLContext(jsc);
	DataFrame input = sqlContext.createDataFrame(rows, schema);
	if (cmmModel != null) {
		DataFrame output = cmmModel.transform(input).repartition(1);
		return output.javaRDD().map(new RowToStringFunction(1)).collect();
	} else {
		System.err.println("Tagging model is null. You need to create or load a model first.");
		return null;
	}
}
 
Example #7
Source File: HoodieClientTestUtils.java    From hudi with Apache License 2.0
public static Dataset<Row> readCommit(String basePath, SQLContext sqlContext, HoodieTimeline commitTimeline,
                                      String instantTime) {
  HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, instantTime);
  if (!commitTimeline.containsInstant(commitInstant)) {
    throw new HoodieException("No commit exists at " + instantTime);
  }
  try {
    HashMap<String, String> paths =
        getLatestFileIDsToFullPath(basePath, commitTimeline, Arrays.asList(commitInstant));
    LOG.info("Path :" + paths.values());
    return sqlContext.read().parquet(paths.values().toArray(new String[paths.size()]))
        .filter(String.format("%s ='%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, instantTime));
  } catch (Exception e) {
    throw new HoodieException("Error reading commit " + instantTime, e);
  }
}
 
Example #8
Source File: DataStreamLoaderExample.java    From toolbox with Apache License 2.0
public static void main(String[] args) throws Exception {
    SparkConf conf = new SparkConf().setAppName("SLink!").setMaster("local");
    SparkContext sc = new SparkContext(conf);
    SQLContext sqlContext = new SQLContext(sc);

    // Path to the dataset
    String path = "datasets/simulated/WI_samples.json";

    // Create an AMIDST object for managing the data
    DataSpark dataSpark = DataSparkLoader.open(sqlContext, path);

    // Print all the instances in the dataset
    dataSpark.collectDataStream()
            .forEach(dataInstance -> System.out.println(dataInstance));
}
 
Example #9
Source File: BigQuerySparkSQL.java    From spark-on-k8s-gcp-examples with Apache License 2.0
private static BigQuerySQLContext createBigQuerySQLContext(String[] args) {
  String projectId = args[0];
  Preconditions.checkArgument(!Strings.isNullOrEmpty(projectId),
      "GCP project ID must not be empty");
  String gcsBucket = args[1];
  Preconditions.checkArgument(!Strings.isNullOrEmpty(gcsBucket),
      "GCS bucket must not be empty");

  String serviceAccountJsonKeyFilePath = System.getenv(APPLICATION_CREDENTIALS_ENV);
  Preconditions.checkArgument(!Strings.isNullOrEmpty(serviceAccountJsonKeyFilePath),
      APPLICATION_CREDENTIALS_ENV + " must be set");

  SQLContext sqlContext = SQLContext.getOrCreate(SparkContext.getOrCreate());
  BigQuerySQLContext bigQuerySQLContext = new BigQuerySQLContext(sqlContext);
  bigQuerySQLContext.setBigQueryProjectId(projectId);
  bigQuerySQLContext.setBigQueryGcsBucket(gcsBucket);
  bigQuerySQLContext.setGcpJsonKeyFile(serviceAccountJsonKeyFilePath);

  return bigQuerySQLContext;
}
 
Example #10
Source File: HoodieClientTestUtils.java    From hudi with Apache License 2.0
/**
 * Obtain all new data written into the Hoodie table since the given timestamp.
 */
public static Dataset<Row> readSince(String basePath, SQLContext sqlContext,
                                     HoodieTimeline commitTimeline, String lastCommitTime) {
  List<HoodieInstant> commitsToReturn =
      commitTimeline.findInstantsAfter(lastCommitTime, Integer.MAX_VALUE).getInstants().collect(Collectors.toList());
  try {
    // Go over the commit metadata, and obtain the new files that need to be read.
    HashMap<String, String> fileIdToFullPath = getLatestFileIDsToFullPath(basePath, commitTimeline, commitsToReturn);
    String[] paths = fileIdToFullPath.values().toArray(new String[fileIdToFullPath.size()]);
    Dataset<Row> rows = null;
    if (paths[0].endsWith(HoodieFileFormat.PARQUET.getFileExtension())) {
      rows = sqlContext.read().parquet(paths);
    }

    return rows.filter(String.format("%s >'%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTime));
  } catch (IOException e) {
    throw new HoodieException("Error pulling data incrementally from commitTimestamp :" + lastCommitTime, e);
  }
}
 
Example #11
Source File: NeedingHelpGoPackageFinder.java    From spark-on-k8s-gcp-examples with Apache License 2.0
private NeedingHelpGoPackageFinder(
    String projectId,
    String bigQueryDataset,
    String gcsBucket,
    boolean useSampleTables) {
  Preconditions.checkArgument(!Strings.isNullOrEmpty(projectId),
      "GCP project ID must not be empty");
  Preconditions.checkArgument(!Strings.isNullOrEmpty(bigQueryDataset),
      "BigQuery dataset name must not be empty");
  Preconditions.checkArgument(!Strings.isNullOrEmpty(gcsBucket),
      "GCS bucket must not be empty");

  this.projectId = projectId;
  this.bigQueryDataset = bigQueryDataset;

  this.sqlContext = SQLContext.getOrCreate(SparkContext.getOrCreate());
  this.bigQuerySQLContext = new BigQuerySQLContext(this.sqlContext);
  this.bigQuerySQLContext.setBigQueryProjectId(projectId);
  this.bigQuerySQLContext.setBigQueryGcsBucket(gcsBucket);

  this.useSampleTables = useSampleTables;
}
 
Example #12
Source File: MultiExpressionScript.java    From HiveQLUnit with Apache License 2.0
/**
 * Splits the bundled hql script into multiple expressions using the ScriptSplitter utility class.
 * Each expression is run on the provided HiveContext.
 *
 * @param sqlContext an SQLContext, as provided by Spark through the TestHiveServer TestRule, used to run hql expressions
 */
@Override
public void runScript(SQLContext sqlContext) {
    String[] expressions = ScriptSplitter.splitScriptIntoExpressions(script);
    for (String expression : expressions) {
        sqlContext.sql(expression);
    }
}
 
Example #13
Source File: DeepSparkContextTest.java    From deep-spark with Apache License 2.0
@Test(expected = UnsupportedOperationException.class)
public void createJavaSchemaFromEmptyRDDTest() throws Exception {
    deepSparkContext = createDeepSparkContext();
    DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
    SQLContext sqlContext = mock(SQLContext.class);
    ExtractorConfig config = createDeepJobConfig();
    Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
    Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);
    PowerMockito.doReturn(singleRdd).when(deepSparkContextSpy).createJavaRDD(config);
    JavaRDD<Row> rowRDD = mock(JavaRDD.class);
    mockStatic(DeepSparkContext.class);
    when(DeepSparkContext.createJavaRowRDD(singleRdd)).thenReturn(rowRDD);
    when(singleRdd.first()).thenThrow(new UnsupportedOperationException());

    deepSparkContextSpy.createJavaSchemaRDD(config);
}
 
Example #14
Source File: Tagger.java    From vn.vitk with GNU General Public License v3.0
/**
 * Tags a list of sentences and writes the result to an output file in the
 * desired output format.
 * 
 * @param sentences
 * @param outputFileName
 * @param outputFormat
 */
public void tag(List<String> sentences, String outputFileName, OutputFormat outputFormat) {
	List<Row> rows = new LinkedList<Row>();
	for (String sentence : sentences) {
		rows.add(RowFactory.create(sentence));
	}
	StructType schema = new StructType(new StructField[]{
		new StructField("sentence", DataTypes.StringType, false, Metadata.empty())	
	});
	SQLContext sqlContext = new SQLContext(jsc);
	DataFrame input = sqlContext.createDataFrame(rows, schema);
	tag(input, outputFileName, outputFormat);
}
 
Example #15
Source File: SparkSqlInterpreter.java    From Explorer with Apache License 2.0
public int getProgress() {
    SQLContext sqlc = getSparkInterpreter().getSQLContext();
    SparkContext sc = sqlc.sparkContext();
    JobProgressListener sparkListener = getSparkInterpreter().getJobProgressListener();
    int completedTasks = 0;
    int totalTasks = 0;

    DAGScheduler scheduler = sc.dagScheduler();
    HashSet<ActiveJob> jobs = scheduler.activeJobs();
    Iterator<ActiveJob> it = jobs.iterator();
    while (it.hasNext()) {
        ActiveJob job = it.next();
        String g = (String) job.properties().get("spark.jobGroup.id");
        if (jobGroup.equals(g)) {
            int[] progressInfo = null;
            if (sc.version().startsWith("1.0")) {
                progressInfo = getProgressFromStage_1_0x(sparkListener, job.finalStage());
            } else if (sc.version().startsWith("1.1") || sc.version().startsWith("1.2")) {
                progressInfo = getProgressFromStage_1_1x(sparkListener, job.finalStage());
            } else {
                logger.warn("Spark {}: getting progress information is not supported", sc.version());
                continue;
            }
            totalTasks += progressInfo[0];
            completedTasks += progressInfo[1];
        }
    }

    if (totalTasks == 0) {
        return 0;
    }
    return completedTasks * 100 / totalTasks;
}
 
Example #16
Source File: Tagger.java    From vn.vitk with GNU General Public License v3.0
/**
 * Tags a distributed list of sentences and writes the result to an output file in
 * the desired output format.
 * @param sentences
 * @param outputFileName
 * @param outputFormat
 */
public void tag(JavaRDD<Row> sentences, String outputFileName, OutputFormat outputFormat) {
	StructType schema = new StructType(new StructField[]{
		new StructField("sentence", DataTypes.StringType, false, Metadata.empty())	
	});
	SQLContext sqlContext = new SQLContext(jsc);
	DataFrame input = sqlContext.createDataFrame(sentences, schema);
	tag(input, outputFileName, outputFormat);
}
 
Example #17
Source File: MetadataWriter.java    From rdf2x with Apache License 2.0
/**
 * @param sc        spark context to be used
 * @param persistor output persistor
 * @param rdfSchema schema storing information about classes and properties
 */
public MetadataWriter(JavaSparkContext sc, Persistor persistor, RdfSchema rdfSchema) {
    this.sql = new SQLContext(sc);
    this.persistor = persistor;
    this.rdfSchema = rdfSchema;
    this.stats = new ArrayList<>();
}
 
Example #18
Source File: Tagger.java    From vn.vitk with GNU General Public License v3.0
/**
 * Creates a data frame from a list of tagged sentences.
 * @param taggedSentences
 * @return a data frame of two columns: "sentence" and "partOfSpeech".
 */
public DataFrame createDataFrame(List<String> taggedSentences) {
	List<String> wordSequences = new LinkedList<String>();
	List<String> tagSequences = new LinkedList<String>();
	for (String taggedSentence : taggedSentences) {
		StringBuilder wordBuf = new StringBuilder();
		StringBuilder tagBuf = new StringBuilder();
		String[] tokens = taggedSentence.split("\\s+");
		for (String token : tokens) {
			String[] parts = token.split("/");
			if (parts.length == 2) {
				wordBuf.append(parts[0]);
				wordBuf.append(' ');
				tagBuf.append(parts[1]);
				tagBuf.append(' ');
			} else { // this token is "///"  
				wordBuf.append('/');
				wordBuf.append(' ');
				tagBuf.append('/');
				tagBuf.append(' ');
			}
		}
		wordSequences.add(wordBuf.toString().trim());
		tagSequences.add(tagBuf.toString().trim());
	}
	if (verbose) {
		System.out.println("Number of sentences = " + wordSequences.size());
	}
	List<Row> rows = new LinkedList<Row>();
	for (int i = 0; i < wordSequences.size(); i++) {
		rows.add(RowFactory.create(wordSequences.get(i), tagSequences.get(i)));
	}
	JavaRDD<Row> jrdd = jsc.parallelize(rows);
	StructType schema = new StructType(new StructField[]{
			new StructField("sentence", DataTypes.StringType, false, Metadata.empty()),
			new StructField("partOfSpeech", DataTypes.StringType, false, Metadata.empty())
		});
		
	return new SQLContext(jsc).createDataFrame(jrdd, schema);
}
 
Example #19
Source File: MultiExpressionScript.java    From HiveQLUnit with Apache License 2.0
/**
 * Splits the bundled hql script into multiple expressions using the ScriptSplitter utility class.
 * Each expression is run on the provided HiveContext.
 *
 * @param sqlContext an SQLContext, as provided by Spark through the TestHiveServer TestRule, used to run hql expressions
 * @return the row results acquired from the last executed expression
 */
@Override
public List<Row> runScriptReturnResults(SQLContext sqlContext) {
    String[] expressions = ScriptSplitter.splitScriptIntoExpressions(script);
    for (int i = 0; i < expressions.length - 1; i++) {
        String expression = expressions[i];
        sqlContext.sql(expression);
    }

    List<Row> rows = sqlContext.sql(expressions[expressions.length - 1]).collectAsList();
    return rows;
}
 
Example #20
Source File: RelationExtractorTest.java    From rdf2x with Apache License 2.0
/**
 * Test if expected directed relations are collected from an RDD of Instances
 */
@Test
public void testCollectRelations() {
    SQLContext sql = new SQLContext(jsc());

    RelationExtractor collector = new RelationExtractor(
            new RelationConfig(),
            jsc(),
            new ClassGraph()
    );

    List<Row> rdd = new ArrayList<>();

    // cycle one -> two -> three -> one
    rdd.add(RowFactory.create(0, 1, 1L, 1, 2L));
    rdd.add(RowFactory.create(0, 1, 2L, 1, 3L));
    rdd.add(RowFactory.create(0, 1, 3L, 1, 1L));

    // one -> four, four -> one
    rdd.add(RowFactory.create(0, 2, 4L, 1, 1L));
    rdd.add(RowFactory.create(0, 1, 1L, 2, 4L));

    // five -> one
    rdd.add(RowFactory.create(0, 3, 5L, 1, 1L));

    DataFrame expected = sql.createDataFrame(rdd, new StructType()
            .add("predicateIndex", DataTypes.IntegerType, false)
            .add("fromTypeIndex", DataTypes.IntegerType, false)
            .add("fromID", DataTypes.LongType, false)
            .add("toTypeIndex", DataTypes.IntegerType, false)
            .add("toID", DataTypes.LongType, false)
    );

    // (predicateIndex, fromTypeIndex, instanceID, toTypeIndex, relatedID)
    DataFrame result = collector.extractRelations(getTestRDD());

    assertEquals("Expected relation row schema is collected", expected.schema(), result.schema());
    assertRDDEquals("Expected relation rows are collected", expected.javaRDD(), result.javaRDD());
}
 
Example #21
Source File: InstanceRelationWriterTest.java    From rdf2x with Apache License 2.0
@Before
public void setUp() {
    sql = new SQLContext(jsc());
    uriIndex = new IndexMap<>(Arrays.asList(
            "http://example.com/a",
            "http://example.com/b",
            "http://example.com/c",
            "http://example.com/knows",
            "http://example.com/likes",
            "http://example.com/name",
            "http://example.com/age"
    ));

    rdfSchema = new RdfSchema(
            new RdfSchemaCollectorConfig(),
            new ClassGraph(),
            uriIndex,
            uriIndex,
            null
    );

    typeNames = new HashMap<>();
    typeNames.put("http://example.com/a", "a");
    typeNames.put("http://example.com/b", "b");
    typeNames.put("http://example.com/c", "c");

    config = new InstanceRelationWriterConfig();

    persistor = new DataFrameMapPersistor();
    result = persistor.getResultMap();
}
 
Example #22
Source File: TestPerformanceRegression.java    From chronix.spark with Apache License 2.0
public static void main(String[] args) throws SolrServerException, IOException {

        ChronixSparkLoader loader = new ChronixSparkLoader();

        ChronixSparkContext chronixSparkContext = loader.createChronixSparkContext();
        SQLContext sqlContext = new SQLContext(chronixSparkContext.getSparkContext());

        // BENCHMARK START ...............................
        long start = System.currentTimeMillis();
        for (int i = 0; i < LOOPS; i++) {

            //Load data into ChronixRDD
            ChronixRDD rdd = loader.createChronixRDD(chronixSparkContext);

            //Some actions
            double mean = rdd.mean();
            double approxMean = rdd.approxMean();
            long observationCount = rdd.countObservations();
            double max = rdd.max();
            double min = rdd.min();
            Iterator<MetricTimeSeries> it = rdd.iterator();
            while (it.hasNext()) {
                MetricTimeSeries mts = it.next();
                System.out.print(".");
            }

            //DataFrame operations
            Dataset<MetricObservation> ds = rdd.toObservationsDataset(sqlContext);
            ds.count();
        }
        long stop = System.currentTimeMillis();
        // BENCHMARK STOP ...................................
        System.out.println("\nBenchmark duration: " + (stop - start) + " ms");

        chronixSparkContext.getSparkContext().close();
    }
 
Example #23
Source File: TestHDFSParquetImporter.java    From hudi with Apache License 2.0
/**
 * Test successful insert and verify data consistency.
 */
@Test
public void testImportWithInsert() throws IOException, ParseException {
  try (JavaSparkContext jsc = getJavaSparkContext()) {
    insert(jsc);
    SQLContext sqlContext = new SQLContext(jsc);
    Dataset<Row> ds = HoodieClientTestUtils.read(jsc, basePath + "/testTarget", sqlContext, dfs, basePath + "/testTarget/*/*/*/*");

    List<Row> readData = ds.select("timestamp", "_row_key", "rider", "driver", "begin_lat", "begin_lon", "end_lat", "end_lon").collectAsList();
    List<HoodieTripModel> result = readData.stream().map(row ->
        new HoodieTripModel(row.getDouble(0), row.getString(1), row.getString(2), row.getString(3), row.getDouble(4),
            row.getDouble(5), row.getDouble(6), row.getDouble(7)))
        .collect(Collectors.toList());

    List<HoodieTripModel> expected = insertData.stream().map(g ->
        new HoodieTripModel(Double.parseDouble(g.get("timestamp").toString()),
            g.get("_row_key").toString(),
            g.get("rider").toString(),
            g.get("driver").toString(),
            Double.parseDouble(g.get("begin_lat").toString()),
            Double.parseDouble(g.get("begin_lon").toString()),
            Double.parseDouble(g.get("end_lat").toString()),
            Double.parseDouble(g.get("end_lon").toString())))
        .collect(Collectors.toList());

    assertTrue(result.containsAll(expected) && expected.containsAll(result) && result.size() == expected.size());
  }
}
 
Example #24
Source File: DeepSparkContextTest.java    From deep-spark with Apache License 2.0
@Test
public void textFileHDFSTest() throws Exception {
    deepSparkContext = createDeepSparkContext();
    DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
    SQLContext sqlContext = mock(SQLContext.class);
    Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
    Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);
    RDD<Cells> result = mock(RDD.class);

    ExtractorConfig<Cells> config = createHDFSDeepJobConfig();
    PowerMockito.doReturn(result).when(deepSparkContextSpy).createHDFSRDD(config);
    deepSparkContextSpy.textFile(config);

    verify(deepSparkContextSpy, times(1)).createHDFSRDD(config);
}
 
Example #25
Source File: SaveModelDemo.java    From SparkDemo with MIT License
public static void main(String[] args) {
	SparkConf conf = new SparkConf().setAppName("SaveModelDemo").setMaster("local");
	JavaSparkContext sc = new JavaSparkContext(conf);
	// Create a DataFrame by reading JSON
	SQLContext sqlContext = new SQLContext(sc);

	Dataset<Row> dataset = sqlContext.read().format("json").load(Constant.LOCAL_FILE_PREX +"/data/resources/people.json");

	dataset.write().mode(SaveMode.ErrorIfExists).save("tmp/people2.json"); // throws an error if the target already exists
	dataset.write().mode(SaveMode.Append).save("tmp/people2.json"); // append
	dataset.write().mode(SaveMode.Ignore).save("tmp/people2.json"); // silently skip if the target already exists
	dataset.write().mode(SaveMode.Overwrite).save("tmp/people2.json"); // overwrite

	sc.close();
}
 
Example #26
Source File: BatchProcessor.java    From lambda-arch with Apache License 2.0
public static void main(String[] args) throws Exception {
    Properties prop = PropertyFileReader.readPropertyFile("iot-spark.properties");
    String file = prop.getProperty("com.iot.app.hdfs") + "iot-data-parque";
    String[] jars = {prop.getProperty("com.iot.app.jar")};

    JavaSparkContext sparkContext = getSparkContext(prop, jars);
    SQLContext sqlContext = new SQLContext(sparkContext);
    Dataset<Row> dataFrame = getDataFrame(sqlContext, file);
    JavaRDD<IoTData> rdd = dataFrame.javaRDD().map(getRowIoTDataFunction());
    BatchHeatMapProcessor processor = new BatchHeatMapProcessor();
    processor.processHeatMap(rdd);
    sparkContext.close();
    sparkContext.stop();
}
 
Example #27
Source File: DataSparkLoader.java    From toolbox with Apache License 2.0
public static DataSpark open(SQLContext sqlContext, String path, String formatFile) throws Exception {

        //Load the data and store it into an object of class DataFrame
        DataFrame df = sqlContext.read().format(formatFile).load(path);

        //Create an AMIDST object for managing the data
        return loadSparkDataFrame(df);
    }
 
Example #28
Source File: UserVisitAnalyze.java    From UserActionAnalyzePlatform with Apache License 2.0
private static void mock(JavaSparkContext context,SQLContext sc)
{
    boolean local= ConfigurationManager.getBoolean(Constants.SPARK_LOCAL);
    if(local)
    {
        MockData.mock(context,sc);
    }

}
 
Example #29
Source File: UserVisitAnalyze.java    From UserActionAnalyzePlatform with Apache License 2.0
/**
 * Used to determine whether this is a production environment.
 * @param sc the SparkContext to wrap
 * @return a SQLContext in local mode, otherwise a HiveContext
 */
public static SQLContext getSQLContext(SparkContext sc)
{
    boolean local= ConfigurationManager.getBoolean(Constants.SPARK_LOCAL);
    if(local)
    {
        return new SQLContext(sc);
    }
    return new HiveContext(sc);
}
 
Example #30
Source File: AreaTop3ProductSpark.java    From BigDataPlatform with GNU General Public License v3.0
/**
 * Queries click action data within the specified date range.
 * @param sqlContext 
 * @param startDate start date
 * @param endDate end date
 * @return click action data
 */
private static JavaPairRDD<Long, Row> getcityid2ClickActionRDDByDate(
		SQLContext sqlContext, String startDate, String endDate) {
	// Query user visit action data from user_visit_action
	// First constraint: click_product_id must be non-null, which marks the action as a click
	// Second constraint: the data must fall within the user-specified date range
	
	String sql = 
			"SELECT "
				+ "city_id,"
				+ "click_product_id product_id "
			+ "FROM user_visit_action "
			+ "WHERE click_product_id IS NOT NULL "			
			+ "AND day>='" + startDate + "' "
			+ "AND day<='" + endDate + "'";
	
	Dataset<Row> clickActionDF = sqlContext.sql(sql);

	JavaRDD<Row> clickActionRDD = clickActionDF.javaRDD();

	JavaPairRDD<Long, Row> cityid2clickActionRDD = clickActionRDD.mapToPair(
			
			new PairFunction<Row, Long, Row>() {

				private static final long serialVersionUID = 1L;

				@Override
				public Tuple2<Long, Row> call(Row row) throws Exception {
					Long cityid = row.getLong(0);
					return new Tuple2<Long, Row>(cityid, row);
				}
				
			});
	
	return cityid2clickActionRDD;
}