org.apache.spark.sql.Encoders Java Examples

The following examples show how to use org.apache.spark.sql.Encoders. Each example is taken from an open-source project; the project, source file, and license are noted above each snippet.
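
Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below; the Person bean and all other names are illustrative only) of the three kinds of encoders that recur throughout this page: Encoders.STRING() and Encoders.INT() for primitive types, Encoders.bean() for JavaBeans, and Encoders.kryo() for classes that cannot be mapped to SQL types.

import java.util.Arrays;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

public class EncodersSketch {

  // Hypothetical JavaBean used only for this sketch.
  public static class Person implements java.io.Serializable {
    private String name;
    private int age;
    public Person() { }
    public Person(String name, int age) { this.name = name; this.age = age; }
    public String getName() { return name; }
    public void setName(String name) { this.name = name; }
    public int getAge() { return age; }
    public void setAge(int age) { this.age = age; }
  }

  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .master("local[*]")
        .appName("EncodersSketch")
        .getOrCreate();

    // Encoder for a primitive type.
    Dataset<String> strings = spark.createDataset(
        Arrays.asList("a", "b", "c"), Encoders.STRING());
    strings.show();

    // Encoder derived from a JavaBean via reflection; columns are mapped by field name.
    Encoder<Person> personEncoder = Encoders.bean(Person.class);
    Dataset<Person> people = spark.createDataset(
        Arrays.asList(new Person("Ada", 36)), personEncoder);
    people.show();

    // Kryo-based encoder for classes that cannot be mapped to SQL types;
    // the objects are stored as a single binary column.
    Encoder<Person> kryoEncoder = Encoders.kryo(Person.class);
    Dataset<Person> opaque = spark.createDataset(
        Arrays.asList(new Person("Grace", 45)), kryoEncoder);
    opaque.show();

    spark.stop();
  }
}
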
Example #1
Source File: BundlesTest.java    From bunsen with Apache License 2.0
@Test
public void testJsonBundleStrings() {

  JavaRDD<String> jsonBundlesRdd = spark.sparkContext()
      .wholeTextFiles("src/test/resources/json/bundles", 1)
      .toJavaRDD()
      .map(tuple -> tuple._2());

  Dataset<String> jsonBundles = spark.createDataset(jsonBundlesRdd.rdd(),
      Encoders.STRING());

  jsonBundles.write().saveAsTable("json_bundle_table");

  JavaRDD<BundleContainer> bundlesRdd = bundles.fromJson(
      spark.sql("select value from json_bundle_table"), "value");

  Dataset<Patient> patients = BundlesTest.bundles.extractEntry(spark,
      bundlesRdd,
      Patient.class);

  checkPatients(patients);
}
 
Example #2
Source File: BundlesTest.java    From bunsen with Apache License 2.0
@Test
public void testJsonBundleStrings() {

  JavaRDD<String> jsonBundlesRdd = spark.sparkContext()
      .wholeTextFiles("src/test/resources/json/bundles", 1)
      .toJavaRDD()
      .map(tuple -> tuple._2());

  Dataset<String> jsonBundles = spark.createDataset(jsonBundlesRdd.rdd(),
      Encoders.STRING());

  jsonBundles.write().saveAsTable("json_bundle_table");

  JavaRDD<BundleContainer> bundlesRdd = bundles.fromJson(
      spark.sql("select value from json_bundle_table"), "value");

  Dataset<Row> patients = BundlesTest.bundles.extractEntry(spark,
      bundlesRdd,
      Patient.class);

  checkPatients(patients);
}
 
Example #3
Source File: TextEncodedTelemetryReaderTest.java    From metron with Apache License 2.0
@Test
public void testCSV() {
  // re-write the test data as a CSV with a header record
  String pathToCSV = tempFolder.getRoot().getAbsolutePath();
  spark.read()
          .format("text")
          .load("src/test/resources/telemetry.json")
          .as(Encoders.STRING())
          .write()
          .mode("overwrite")
          .option("header", "true")
          .format("csv")
          .save(pathToCSV);

  // tell the profiler to use the CSV input data
  profilerProperties.put(TELEMETRY_INPUT_PATH.getKey(), pathToCSV);
  profilerProperties.put(TELEMETRY_INPUT_FORMAT.getKey(), "csv");

  // set a reader property; tell the reader to expect a header
  readerProperties.put("header", "true");

  // there should be 100 valid JSON records
  Dataset<String> telemetry = TelemetryReaders.TEXT.read(spark, profilerProperties, readerProperties);
  assertEquals(100, telemetry.filter(new IsValidJSON()).count());
}
 
Example #4
Source File: BundlesTest.java    From bunsen with Apache License 2.0
@Test
public void testXmlBundleStrings() {

  JavaRDD<String> xmlBundlesRdd = spark.sparkContext()
      .wholeTextFiles("src/test/resources/xml/bundles", 1)
      .toJavaRDD()
      .map(tuple -> tuple._2());

  Dataset<String> xmlBundles = spark.createDataset(xmlBundlesRdd.rdd(),
      Encoders.STRING());

  xmlBundles.write().saveAsTable("xml_bundle_table");

  JavaRDD<BundleContainer> bundles = BundlesTest.bundles.fromXml(
      spark.sql("select value from xml_bundle_table"), "value");

  Dataset<Row> patients = BundlesTest.bundles.extractEntry(spark,
      bundles,
      Patient.class);

  checkPatients(patients);
}
 
Example #5
Source File: MutationToStructureDemo.java    From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws IOException {
    SparkSession spark = SparkSession.builder().master("local[*]").appName(MutationToStructureDemo.class.getSimpleName())
            .getOrCreate();

    // find missense mutations that map to UniProt ID P15056 (BRAF)
    // that are annotated as pathogenic or likely pathogenic in ClinVar.
    List<String> uniprotIds = Arrays.asList("P15056"); // BRAF: P15056
    String query = "clinvar.rcv.clinical_significance:pathogenic OR clinvar.rcv.clinical_significance:likely pathogenic";
    Dataset<Row> df = MyVariantDataset.getVariations(uniprotIds, query).cache();
    System.out.println("BRAF missense mutations: " + df.count());
    df.show();
    
    // extract the list of variant Ids
    List<String> variantIds = df.select("variationId").as(Encoders.STRING()).collectAsList();
    
    // map to PDB structures
    Dataset<Row> ds = G2SDataset.getPositionDataset(variantIds);
    ds = ds.sort("structureId", "chainId", "pdbPosition");
    ds.show();

    spark.close(); 
}
 
Example #6
Source File: TestSQLDeriver.java    From envelope with Apache License 2.0
@Test
public void testQueryLiteral() throws Exception {
  Contexts.getSparkSession().createDataset(Lists.newArrayList(1), Encoders.INT()).createOrReplaceTempView("literaltable");

  Map<String, Object> configMap = Maps.newHashMap();
  configMap.put(SQLDeriver.QUERY_LITERAL_CONFIG_NAME, "SELECT * FROM literaltable");
  Config config = ConfigFactory.parseMap(configMap);

  SQLDeriver deriver = new SQLDeriver();
  assertNoValidationFailures(deriver, config);
  deriver.configure(config);

  Object result = deriver.derive(Maps.<String, Dataset<Row>>newHashMap()).collectAsList().get(0).get(0);

  assertEquals(1, result);
}
 
Example #7
Source File: CsvToDatasetBookAsJson.java    From net.jgp.labs.spark with Apache License 2.0
private void start() {
  SparkSession spark = SparkSession.builder().appName(
      "CSV to Dataset<Book> as JSON").master("local").getOrCreate();

  String filename = "data/books.csv";
  Dataset<Row> df = spark.read().format("csv").option("inferSchema", "true")
      .option("header", "true")
      .load(filename);
  df.show();

  Dataset<String> bookDf = df.map(new BookMapper(), Encoders.STRING());
  bookDf.show(20, 132);

  Dataset<Row> bookAsJsonDf = spark.read().json(bookDf);
  bookAsJsonDf.show();
}
 
Example #8
Source File: PdbToUniProt.java    From mmtf-spark with Apache License 2.0
/**
 * Returns an up-to-date dataset of PDB to UniProt
 * chain-level mappings for a list of ids.
 * Valid ids are either a list of pdbIds (e.g. 1XYZ) or pdbId.chainIds (e.g., 1XYZ.A).
 * 
 * @param ids list of pdbIds or pdbId.chainIds
 * @return dataset of PDB to UniProt chain-level mappings
 * @throws IOException
 */
public static Dataset<Row> getChainMappings(List<String> ids) throws IOException {
    SparkSession spark = SparkSession.builder().getOrCreate();
    
    // get a dataset of up-to-date UniProt chain mappings
    Dataset<Row> ds = getChainMappings();  
    // create a dataset of ids from the passed-in list
    Dataset<Row> subset = spark.createDataset(ids, Encoders.STRING()).toDF("id");
    
    // create subsets of data
    if (!ids.isEmpty()) {
        if (ids.get(0).length() == 4) {
            // join by pdbId
            ds = ds.join(subset, ds.col("structureId").equalTo(subset.col("id"))).drop("id");    
        } else {
            // join by pdbChainId
            ds = ds.join(subset, ds.col("structureChainId").equalTo(subset.col("id"))).drop("id");    
        }
    }
    
    return ds;
}
 
Example #9
Source File: StructuredNodeLoader.java    From sylph with Apache License 2.0
private static TransForm<Dataset<Row>> loadRealTimeTransForm(RealTimeTransForm realTimeTransForm)
{
    return stream -> {
        // In Spark 2.x, a map over a Dataset requires the type mapping below, i.e. the schema of the result must be declared explicitly
        //implicit val matchError:org.apache.spark.sql.Encoder[Row] = org.apache.spark.sql.Encoders.kryo[Row]
        //      import collection.JavaConverters._
        //      val mapRowSchema = realTimeTransForm.getRowSchema.getFields.asScala.map(filed => {
        //        StructField(filed.getName, SparkRow.SparkRowParser.parserType(filed.getJavaType), true)
        //      })
        //      RowEncoder.apply(StructType(mapRowSchema))

        //implicit val mapenc = RowEncoder.apply(rddSchema)  // cannot be registered here because the fields must be basic SQL types   //Encoders.STRING
        Dataset<Row> transStream = stream.mapPartitions(
                (MapPartitionsFunction<Row, Row>) partition -> StreamNodeLoader.transFunction(partition, realTimeTransForm),
                Encoders.kryo(Row.class));
        // alternatively, use transStream.as()
        return transStream;
    };
}
 
Example #10
Source File: TestIcebergSourceTablesBase.java    From iceberg with Apache License 2.0
@Test
public synchronized void testTablesSupport() {
  TableIdentifier tableIdentifier = TableIdentifier.of("db", "table");
  createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned());

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "1"),
      new SimpleRecord(2, "2"),
      new SimpleRecord(3, "3"));

  Dataset<Row> inputDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  inputDf.select("id", "data").write()
      .format("iceberg")
      .mode(SaveMode.Append)
      .save(loadLocation(tableIdentifier));

  Dataset<Row> resultDf = spark.read()
      .format("iceberg")
      .load(loadLocation(tableIdentifier));
  List<SimpleRecord> actualRecords = resultDf.orderBy("id")
      .as(Encoders.bean(SimpleRecord.class))
      .collectAsList();

  Assert.assertEquals("Records should match", expectedRecords, actualRecords);
}
 
Example #11
Source File: RemoveOrphanFilesAction.java    From iceberg with Apache License 2.0
private Dataset<Row> buildActualFileDF() {
  List<String> subDirs = Lists.newArrayList();
  List<String> matchingFiles = Lists.newArrayList();

  Predicate<FileStatus> predicate = file -> file.getModificationTime() < olderThanTimestamp;

  // list at most 3 levels and only dirs that have less than 10 direct sub dirs on the driver
  listDirRecursively(location, predicate, hadoopConf.value(), 3, 10, subDirs, matchingFiles);

  JavaRDD<String> matchingFileRDD = sparkContext.parallelize(matchingFiles, 1);

  if (subDirs.isEmpty()) {
    return spark.createDataset(matchingFileRDD.rdd(), Encoders.STRING()).toDF("file_path");
  }

  int parallelism = Math.min(subDirs.size(), partitionDiscoveryParallelism);
  JavaRDD<String> subDirRDD = sparkContext.parallelize(subDirs, parallelism);

  Broadcast<SerializableConfiguration> conf = sparkContext.broadcast(hadoopConf);
  JavaRDD<String> matchingLeafFileRDD = subDirRDD.mapPartitions(listDirsRecursively(conf, olderThanTimestamp));

  JavaRDD<String> completeMatchingFileRDD = matchingFileRDD.union(matchingLeafFileRDD);
  return spark.createDataset(completeMatchingFileRDD.rdd(), Encoders.STRING()).toDF("file_path");
}
 
Example #12
Source File: RewriteManifestsAction.java    From iceberg with Apache License 2.0
RewriteManifestsAction(SparkSession spark, Table table) {
  this.spark = spark;
  this.sparkContext = new JavaSparkContext(spark.sparkContext());
  this.manifestEncoder = Encoders.javaSerialization(ManifestFile.class);
  this.table = table;
  this.spec = table.spec();
  this.targetManifestSizeBytes = PropertyUtil.propertyAsLong(
      table.properties(),
      TableProperties.MANIFEST_TARGET_SIZE_BYTES,
      TableProperties.MANIFEST_TARGET_SIZE_BYTES_DEFAULT);
  this.fileIO = SparkUtil.serializableFileIO(table);

  // default the staging location to the metadata location
  TableOperations ops = ((HasTableOperations) table).operations();
  Path metadataFilePath = new Path(ops.metadataFileLocation("file"));
  this.stagingLocation = metadataFilePath.getParent().toString();

  // use the current table format version for new manifests
  this.formatVersion = ops.current().formatVersion();
}
 
Example #13
Source File: RemoveOrphanFilesAction.java    From iceberg with Apache License 2.0
@Override
public List<String> execute() {
  Dataset<Row> validDataFileDF = buildValidDataFileDF();
  Dataset<Row> validMetadataFileDF = buildValidMetadataFileDF();
  Dataset<Row> validFileDF = validDataFileDF.union(validMetadataFileDF);
  Dataset<Row> actualFileDF = buildActualFileDF();

  Column nameEqual = filename.apply(actualFileDF.col("file_path"))
      .equalTo(filename.apply(validFileDF.col("file_path")));
  Column actualContains = actualFileDF.col("file_path").contains(validFileDF.col("file_path"));
  Column joinCond = nameEqual.and(actualContains);
  List<String> orphanFiles = actualFileDF.join(validFileDF, joinCond, "leftanti")
      .as(Encoders.STRING())
      .collectAsList();

  Tasks.foreach(orphanFiles)
      .noRetry()
      .suppressFailureWhenFinished()
      .onFailure((file, exc) -> LOG.warn("Failed to delete file: {}", file, exc))
      .run(deleteFunc::accept);

  return orphanFiles;
}
 
Example #14
Source File: BatchProfilerIntegrationTest.java    From metron with Apache License 2.0
@Test
public void testBatchProfilerWithCSV() throws Exception {
  // re-write the test data as a CSV with a header record
  String pathToCSV = tempFolder.getRoot().getAbsolutePath();
  spark.read()
          .format("text")
          .load("src/test/resources/telemetry.json")
          .as(Encoders.STRING())
          .write()
          .mode("overwrite")
          .option("header", "true")
          .format("csv")
          .save(pathToCSV);

  // tell the profiler to use the CSV input data
  // CSV is an example of needing to define both the reader and the input format
  profilerProperties.put(TELEMETRY_INPUT_PATH.getKey(), pathToCSV);
  profilerProperties.put(TELEMETRY_INPUT_READER.getKey(), "text");
  profilerProperties.put(TELEMETRY_INPUT_FORMAT.getKey(), "csv");

  // set a reader property; tell the reader to expect a header
  readerProperties.put("header", "true");

  BatchProfiler profiler = new BatchProfiler();
  profiler.run(spark, profilerProperties, getGlobals(), readerProperties, fromJSON(profileJson));

  validateProfiles();
}
 
Example #15
Source File: TestSparkDataWrite.java    From iceberg with Apache License 2.0
@Test
public void testUnpartitionedOverwrite() throws IOException {
  File parent = temp.newFolder(format.toString());
  File location = new File(parent, "test");

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Table table = tables.create(SCHEMA, spec, location.toString());

  List<SimpleRecord> expected = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );

  Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);

  df.select("id", "data").write()
      .format("iceberg")
      .option("write-format", format.toString())
      .mode("append")
      .save(location.toString());

  // overwrite with the same data; should not produce two copies
  df.select("id", "data").write()
      .format("iceberg")
      .option("write-format", format.toString())
      .mode("overwrite")
      .save(location.toString());

  table.refresh();

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
  Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
  Assert.assertEquals("Result rows should match", expected, actual);
}
 
Example #16
Source File: JavaSQLDataSourceExample.java    From SparkDemo with MIT License
private static void runBasicParquetExample(SparkSession spark) {
  // $example on:basic_parquet_example$
  Dataset<Row> peopleDF = spark.read().json(Constant.LOCAL_FILE_PREX +"/data/resources/people.json");

  // DataFrames can be saved as Parquet files, maintaining the schema information
  peopleDF.write().parquet("people.parquet");

  // Read in the Parquet file created above.
  // Parquet files are self-describing so the schema is preserved
  // The result of loading a parquet file is also a DataFrame
  Dataset<Row> parquetFileDF = spark.read().parquet("people.parquet");

  // Parquet files can also be used to create a temporary view and then used in SQL statements
  parquetFileDF.createOrReplaceTempView("parquetFile");
  Dataset<Row> namesDF = spark.sql("SELECT name FROM parquetFile WHERE age BETWEEN 13 AND 19");
  Dataset<String> namesDS = namesDF.map(new MapFunction<Row, String>() {
    public String call(Row row) {
      return "Name: " + row.getString(0);
    }
  }, Encoders.STRING());
  namesDS.show();
  // +------------+
  // |       value|
  // +------------+
  // |Name: Justin|
  // +------------+
  // $example off:basic_parquet_example$
}
 
Example #17
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
@Test
public void testUnpartitionedStartsWith() {
  Dataset<Row> df = spark.read()
      .format("iceberg")
      .load(unpartitioned.toString());

  List<String> matchedData = df.select("data")
      .where("data LIKE 'jun%'")
      .as(Encoders.STRING())
      .collectAsList();

  Assert.assertEquals(1, matchedData.size());
  Assert.assertEquals("junction", matchedData.get(0));
}
 
Example #18
Source File: AbstractJavaEsSparkStructuredStreamingTest.java    From elasticsearch-hadoop with Apache License 2.0
@Test
public void test1WriteWithMappingId() throws Exception {
    String target = wrapIndex(resource("test-write-id", "data"));
    String docPath = wrapIndex(docPath("test-write-id", "data"));
    JavaStreamingQueryTestHarness<RecordBean> test = new JavaStreamingQueryTestHarness<>(spark, Encoders.bean(RecordBean.class));

    RecordBean doc1 = new RecordBean();
    doc1.setId(1);
    doc1.setName("Spark");

    RecordBean doc2 = new RecordBean();
    doc2.setId(2);
    doc2.setName("Hadoop");

    RecordBean doc3 = new RecordBean();
    doc3.setId(3);
    doc3.setName("YARN");

    Dataset<RecordBean> dataset = test
            .withInput(doc1)
            .withInput(doc2)
            .withInput(doc3)
            .stream();

    test.run(
            dataset.writeStream()
                    .option("checkpointLocation", checkpoint(target))
                    .option("es.mapping.id", "id")
                    .format("es"),
            target
    );

    assertEquals(3, JavaEsSpark.esRDD(new JavaSparkContext(spark.sparkContext()), target).count());
    assertTrue(RestUtils.exists(docPath + "/1"));
    assertTrue(RestUtils.exists(docPath + "/2"));
    assertTrue(RestUtils.exists(docPath + "/3"));

    assertThat(RestUtils.get(target + "/_search?"), containsString("Spark"));
}
 
Example #19
Source File: TextEncodedTelemetryReader.java    From metron with Apache License 2.0
@Override
public Dataset<String> read(SparkSession spark, Properties profilerProps, Properties readerProps) {
  String inputPath = TELEMETRY_INPUT_PATH.get(profilerProps, String.class);
  if(inputFormat == null) {
    inputFormat = TELEMETRY_INPUT_FORMAT.get(profilerProps, String.class);
  }
  LOG.debug("Loading telemetry; inputPath={}, inputFormat={}", inputPath, inputFormat);

  return spark
          .read()
          .options(Maps.fromProperties(readerProps))
          .format(inputFormat)
          .load(inputPath)
          .as(Encoders.STRING());
}
 
Example #20
Source File: AbstractJavaEsSparkStructuredStreamingTest.java    From elasticsearch-hadoop with Apache License 2.0
@Test
public void test1BasicWrite() throws Exception {
    String target = wrapIndex(resource("test-write", "data"));
    JavaStreamingQueryTestHarness<RecordBean> test = new JavaStreamingQueryTestHarness<>(spark, Encoders.bean(RecordBean.class));

    RecordBean doc1 = new RecordBean();
    doc1.setId(1);
    doc1.setName("Spark");

    RecordBean doc2 = new RecordBean();
    doc2.setId(2);
    doc2.setName("Hadoop");

    RecordBean doc3 = new RecordBean();
    doc3.setId(3);
    doc3.setName("YARN");

    Dataset<RecordBean> dataset = test
            .withInput(doc1)
            .withInput(doc2)
            .withInput(doc3)
            .stream();

    test.run(
            dataset.writeStream()
                    .option("checkpointLocation", checkpoint(target))
                    .format("es"),
            target
    );

    assertTrue(RestUtils.exists(target));
    assertThat(RestUtils.get(target + "/_search?"), containsString("Spark"));
    assertThat(RestUtils.get(target + "/_search?"), containsString("Hadoop"));
    assertThat(RestUtils.get(target + "/_search?"), containsString("YARN"));
}
 
Example #21
Source File: TestOrcWrite.java    From iceberg with Apache License 2.0
@Test
public void testBasicWrite() throws IOException {
  File parent = temp.newFolder("orc");
  File location = new File(parent, "test");
  location.mkdirs();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build();
  Table table = tables.create(SCHEMA, spec, location.toString());
  table.updateProperties()
      .defaultFormat(FileFormat.ORC)
      .set(OrcConf.COMPRESS.getAttribute(), CompressionKind.NONE.name())
      .commit();

  List<SimpleRecord> expected = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );

  Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);

  // TODO: incoming columns must be ordered according to the table's schema
  df.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(location.toString());

  table.refresh();

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<SimpleRecord> actual = result.orderBy("id").as(
      Encoders.bean(SimpleRecord.class)).collectAsList();

  Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
  Assert.assertEquals("Result rows should match", expected, actual);
}
 
Example #22
Source File: TestPartitionValues.java    From iceberg with Apache License 2.0
@Test
public void testReorderedColumns() throws Exception {
  String desc = "reorder_columns";
  File parent = temp.newFolder(desc);
  File location = new File(parent, "test");
  File dataFolder = new File(location, "data");
  Assert.assertTrue("mkdirs should succeed", dataFolder.mkdirs());

  HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf());
  Table table = tables.create(SIMPLE_SCHEMA, SPEC, location.toString());
  table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit();

  List<SimpleRecord> expected = Lists.newArrayList(
          new SimpleRecord(1, "a"),
          new SimpleRecord(2, "b"),
          new SimpleRecord(3, "c")
  );

  Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);

  df.select("data", "id").write()
          .format("iceberg")
          .mode("append")
          .option("check-ordering", "false")
          .save(location.toString());

  Dataset<Row> result = spark.read()
          .format("iceberg")
          .load(location.toString());

  List<SimpleRecord> actual = result
          .orderBy("id")
          .as(Encoders.bean(SimpleRecord.class))
          .collectAsList();

  Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
  Assert.assertEquals("Result rows should match", expected, actual);
}
 
Example #23
Source File: DataSetApplication.java    From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder().master("local")
            .appName("Java Spark SQL")
            .getOrCreate();
    Person person = new Person("spark", 10);
    Encoder<Person> encoder = Encoders.bean(Person.class);
    Dataset<Person> dataset = sparkSession.createDataset(Collections.singletonList(person), encoder);
    dataset.show();
    // final output: {name: spark, age: 10}


    /* Encoders for common types */
    Encoder<Integer> integerEncoder = Encoders.INT();
    Dataset<Integer> integerDataset = sparkSession.createDataset(Arrays.asList(1, 2), integerEncoder);
    Dataset<Integer> result = integerDataset.map(new MapFunction<Integer, Integer>() {
        @Override
        public Integer call(Integer value) {
            return value + 1;
        }
    }, integerEncoder);
    result.collect();
    // final output: [2, 3]

    /* By providing a class, a DataFrame can be converted to a Dataset; columns are mapped to fields by name */
    String url = "/usr/local/text.json";
    Dataset<Person> personDataset = sparkSession.read().json(url).as(encoder);
    personDataset.show();
    // final output: name: ...  age: ...
}
 
Example #24
Source File: TestPartitionValues.java    From iceberg with Apache License 2.0
@Test
public void testNullPartitionValue() throws Exception {
  String desc = "null_part";
  File parent = temp.newFolder(desc);
  File location = new File(parent, "test");
  File dataFolder = new File(location, "data");
  Assert.assertTrue("mkdirs should succeed", dataFolder.mkdirs());

  HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf());
  Table table = tables.create(SIMPLE_SCHEMA, SPEC, location.toString());
  table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit();

  List<SimpleRecord> expected = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c"),
      new SimpleRecord(4, null)
  );

  Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);

  df.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(location.toString());

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<SimpleRecord> actual = result
      .orderBy("id")
      .as(Encoders.bean(SimpleRecord.class))
      .collectAsList();

  Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
  Assert.assertEquals("Result rows should match", expected, actual);
}
 
Example #25
Source File: AbstractJavaEsSparkStructuredStreamingTest.java    From elasticsearch-hadoop with Apache License 2.0
@Test(expected = EsHadoopIllegalArgumentException.class)
public void test0FailOnIndexCreationDisabled() throws Exception {
    String target = wrapIndex(resource("test-nonexisting", "data"));
    JavaStreamingQueryTestHarness<RecordBean> test = new JavaStreamingQueryTestHarness<>(spark, Encoders.bean(RecordBean.class));

    RecordBean doc1 = new RecordBean();
    doc1.setId(1);
    doc1.setName("Spark");

    RecordBean doc2 = new RecordBean();
    doc2.setId(2);
    doc2.setName("Hadoop");

    RecordBean doc3 = new RecordBean();
    doc3.setId(3);
    doc3.setName("YARN");

    Dataset<RecordBean> dataset = test
            .withInput(doc1)
            .withInput(doc2)
            .withInput(doc3)
            .expectingToThrow(EsHadoopIllegalArgumentException.class)
            .stream();

    test.run(
            dataset.writeStream()
                    .option("checkpointLocation", checkpoint(target))
                    .option(ES_INDEX_AUTO_CREATE, "no")
                    .format("es"),
            target
    );

    assertTrue(!RestUtils.exists(target));
}
 
Example #26
Source File: TestDataSourceOptions.java    From iceberg with Apache License 2.0
@Test
public void testHadoopOptions() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();
  Configuration sparkHadoopConf = spark.sessionState().newHadoopConf();
  String originalDefaultFS = sparkHadoopConf.get("fs.default.name");

  try {
    HadoopTables tables = new HadoopTables(CONF);
    PartitionSpec spec = PartitionSpec.unpartitioned();
    Map<String, String> options = Maps.newHashMap();
    tables.create(SCHEMA, spec, options, tableLocation);

    // set an invalid value for 'fs.default.name' in Spark Hadoop config
    // to verify that 'hadoop.' data source options are propagated correctly
    sparkHadoopConf.set("fs.default.name", "hdfs://localhost:9000");

    List<SimpleRecord> expectedRecords = Lists.newArrayList(
        new SimpleRecord(1, "a"),
        new SimpleRecord(2, "b")
    );
    Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
    originalDf.select("id", "data").write()
        .format("iceberg")
        .mode("append")
        .option("hadoop.fs.default.name", "file:///")
        .save(tableLocation);

    Dataset<Row> resultDf = spark.read()
        .format("iceberg")
        .option("hadoop.fs.default.name", "file:///")
        .load(tableLocation);
    List<SimpleRecord> resultRecords = resultDf.orderBy("id")
        .as(Encoders.bean(SimpleRecord.class))
        .collectAsList();

    Assert.assertEquals("Records should match", expectedRecords, resultRecords);
  } finally {
    sparkHadoopConf.set("fs.default.name", originalDefaultFS);
  }
}
 
Example #27
Source File: Functions.java    From bunsen with Apache License 2.0
/**
 * Converts a set of FHIR resources to JSON.
 *
 * @param dataset a dataset containing FHIR resources
 * @param resourceType the FHIR resource type
 * @return a dataset of JSON strings for the FHIR resources
 */
public static Dataset<String> toJson(Dataset<?> dataset, String resourceType) {

  Dataset<IBaseResource> resourceDataset =
      dataset.as(FhirEncoders.forR4()
          .getOrCreate()
          .of(resourceType));

  return resourceDataset.map(new ToJson(), Encoders.STRING());
}
 
Example #28
Source File: RemoveOrphanFilesAction.java    From iceberg with Apache License 2.0
private Dataset<Row> buildValidMetadataFileDF() {
  String allManifestsMetadataTable = metadataTableName(MetadataTableType.ALL_MANIFESTS);
  Dataset<Row> manifestDF = spark.read().format("iceberg")
      .load(allManifestsMetadataTable)
      .selectExpr("path as file_path");

  List<String> otherMetadataFiles = Lists.newArrayList();

  for (Snapshot snapshot : table.snapshots()) {
    String manifestListLocation = snapshot.manifestListLocation();
    if (manifestListLocation != null) {
      otherMetadataFiles.add(manifestListLocation);
    }
  }

  otherMetadataFiles.add(ops.metadataFileLocation("version-hint.text"));

  TableMetadata metadata = ops.current();
  otherMetadataFiles.add(metadata.metadataFileLocation());
  for (TableMetadata.MetadataLogEntry previousMetadataFile : metadata.previousFiles()) {
    otherMetadataFiles.add(previousMetadataFile.file());
  }

  Dataset<Row> otherMetadataFileDF = spark
      .createDataset(otherMetadataFiles, Encoders.STRING())
      .toDF("file_path");

  return manifestDF.union(otherMetadataFileDF);
}
 
Example #29
Source File: JavaUserDefinedTypedAggregation.java    From nemo with Apache License 2.0
/**
 * Main function.
 * @param args arguments.
 */
public static void main(final String[] args) {
  SparkSession spark = SparkSession
      .builder()
      .appName("Java Spark SQL user-defined Datasets aggregation example")
      .getOrCreate();

  Encoder<Employee> employeeEncoder = Encoders.bean(Employee.class);
  String path = args[0];
  Dataset<Employee> ds = spark.read().json(path).as(employeeEncoder);
  ds.show();
  // +-------+------+
  // |   name|salary|
  // +-------+------+
  // |Michael|  3000|
  // |   Andy|  4500|
  // | Justin|  3500|
  // |  Berta|  4000|
  // +-------+------+

  MyAverage myAverage = new MyAverage();
  // Convert the function to a `TypedColumn` and give it a name
  TypedColumn<Employee, Double> averageSalary = myAverage.toColumn().name("average_salary");
  Dataset<Double> result = ds.select(averageSalary);
  result.show();
  // +--------------+
  // |average_salary|
  // +--------------+
  // |        3750.0|
  // +--------------+
  spark.stop();
}
 
Example #30
Source File: TestSparkDataWrite.java    From iceberg with Apache License 2.0
@Test
public void testWriteProjection() throws IOException {
  Assume.assumeTrue(
      "Not supported in Spark 3.0; analysis requires all columns are present",
      spark.version().startsWith("2"));

  File parent = temp.newFolder(format.toString());
  File location = new File(parent, "test");

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Table table = tables.create(SCHEMA, spec, location.toString());

  List<SimpleRecord> expected = Lists.newArrayList(
      new SimpleRecord(1, null),
      new SimpleRecord(2, null),
      new SimpleRecord(3, null)
  );

  Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);

  df.select("id").write() // select only id column
      .format("iceberg")
      .option("write-format", format.toString())
      .mode("append")
      .save(location.toString());

  table.refresh();

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
  Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
  Assert.assertEquals("Result rows should match", expected, actual);
}