Java Code Examples for org.apache.spark.sql.Dataset#map()

The following examples show how to use org.apache.spark.sql.Dataset#map(). Each example is drawn from an open-source project; the source file, project, and license are noted above the code.
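
For reference, the overload used throughout these examples is Dataset.map(MapFunction<T, U> func, Encoder<U> encoder), which produces a typed Dataset<U>. A minimal sketch of the pattern (the file path and column name are illustrative):

// Minimal sketch of the typed map pattern; assumes imports of
// org.apache.spark.api.java.function.MapFunction and org.apache.spark.sql.* (including Encoders).
SparkSession spark = SparkSession.builder().appName("map demo").master("local").getOrCreate();
Dataset<Row> df = spark.read().option("header", "true").csv("data/books.csv"); // illustrative path
Dataset<String> titles = df.map(
    (MapFunction<Row, String>) row -> row.<String>getAs("title"),             // "title" column is assumed
    Encoders.STRING());
titles.show();
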
Example 1
Source File: CsvToDatasetBookAsJson.java    From net.jgp.labs.spark with Apache License 2.0
private void start() {
  SparkSession spark = SparkSession.builder().appName(
      "CSV to Dataset<Book> as JSON").master("local").getOrCreate();

  String filename = "data/books.csv";
  Dataset<Row> df = spark.read().format("csv").option("inferSchema", "true")
      .option("header", "true")
      .load(filename);
  df.show();

  Dataset<String> bookDf = df.map(new BookMapper(), Encoders.STRING());
  bookDf.show(20, 132);

  Dataset<Row> bookAsJsonDf = spark.read().json(bookDf);
  bookAsJsonDf.show();
}
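
The BookMapper class is not shown above; assuming it emits one JSON document per CSV row, a hypothetical sketch might be (the id, authorId, and title columns are guesses):

// Hypothetical sketch of BookMapper (Row -> JSON string); column names are assumptions.
class BookMapper implements MapFunction<Row, String> {
  @Override
  public String call(Row row) throws Exception {
    return "{\"id\": " + row.<Integer>getAs("id")
        + ", \"authorId\": " + row.<Integer>getAs("authorId")
        + ", \"title\": \"" + row.<String>getAs("title") + "\"}";
  }
}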
 
Example 2
Source File: WindowAssignTranslatorBatch.java    From beam with Apache License 2.0
@Override
public void translateTransform(
    PTransform<PCollection<T>, PCollection<T>> transform, TranslationContext context) {

  Window.Assign<T> assignTransform = (Window.Assign<T>) transform;
  @SuppressWarnings("unchecked")
  final PCollection<T> input = (PCollection<T>) context.getInput();
  @SuppressWarnings("unchecked")
  final PCollection<T> output = (PCollection<T>) context.getOutput();

  Dataset<WindowedValue<T>> inputDataset = context.getDataset(input);
  if (WindowingHelpers.skipAssignWindows(assignTransform, context)) {
    context.putDataset(output, inputDataset);
  } else {
    WindowFn<T, ?> windowFn = assignTransform.getWindowFn();
    WindowedValue.FullWindowedValueCoder<T> windowedValueCoder =
        WindowedValue.FullWindowedValueCoder.of(input.getCoder(), windowFn.windowCoder());
    Dataset<WindowedValue<T>> outputDataset =
        inputDataset.map(
            WindowingHelpers.assignWindowsMapFunction(windowFn),
            EncoderHelpers.fromBeamCoder(windowedValueCoder));
    context.putDataset(output, outputDataset);
  }
}
 
Example 3
Source File: CsvToDatasetBookToDataframeApp.java    From net.jgp.labs.spark with Apache License 2.0
private void start() {
  SparkSession spark = SparkSession.builder().appName("CSV to Dataset<Book>")
      .master("local").getOrCreate();

  String filename = "data/books.csv";
  Dataset<Row> df = spark.read().format("csv")
      .option("inferSchema", "true")
      .option("header", "true")
      .load(filename);
  df.show();

  Dataset<Book> bookDs = df.map(new BookMapper(), Encoders.bean(Book.class));
  bookDs.show();
  bookDs.printSchema();

  Dataset<Row> df2 = bookDs.toDF();
  df2.show();
  df2.printSchema();
}
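
Here BookMapper maps each Row to a Book bean; a hypothetical sketch, assuming Book is a plain JavaBean with id, authorId, and title properties:

// Hypothetical Row-to-bean mapper; the Book bean and the column names are assumptions.
class BookMapper implements MapFunction<Row, Book> {
  @Override
  public Book call(Row row) throws Exception {
    Book book = new Book();
    book.setId(row.<Integer>getAs("id"));
    book.setAuthorId(row.<Integer>getAs("authorId"));
    book.setTitle(row.<String>getAs("title"));
    return book;
  }
}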
 
Example 4
Source File: SparkCubingJobTest.java    From kylin-on-parquet-v2 with Apache License 2.0
private Dataset<Row> dsConvertToOriginal(Dataset<Row> layoutDs, LayoutEntity entity) {
    Map<Integer, FunctionDesc> orderedMeasures = entity.getOrderedMeasures();

    for (final Map.Entry<Integer, FunctionDesc> entry : orderedMeasures.entrySet()) {
        FunctionDesc functionDesc = entry.getValue();
        if (functionDesc != null) {
            final String[] columns = layoutDs.columns();
            String functionName = functionDesc.returnType().dataType();

            if ("bitmap".equals(functionName)) {
                final int finalIndex = convertOutSchema(layoutDs, entry.getKey().toString(), DataTypes.LongType);
                PreciseCountDistinct preciseCountDistinct = new PreciseCountDistinct(null);
                layoutDs = layoutDs.map((MapFunction<Row, Row>) value -> {
                    Object[] ret = new Object[value.size()];
                    for (int i = 0; i < columns.length; i++) {
                        if (i == finalIndex) {
                            byte[] bytes = (byte[]) value.get(i);
                            Roaring64NavigableMap bitmapCounter = preciseCountDistinct.deserialize(bytes);
                            ret[i] = bitmapCounter.getLongCardinality();
                        } else {
                            ret[i] = value.get(i);
                        }
                    }
                    return RowFactory.create(ret);
                }, RowEncoder.apply(OUT_SCHEMA));
            }
        }
    }
    return layoutDs;
}
 
Example 5
Source File: BookUrlBuilderApp.java    From net.jgp.labs.spark with Apache License 2.0
private void start() {
  SparkSession spark = SparkSession.builder().appName("Book URL Builder")
      .master("local").getOrCreate();

  String filename = "data/books.csv";
  Dataset<Row> df = spark.read().format("csv").option("inferSchema", "true")
      .option("header", "true")
      .load(filename);
  df.show();

  Dataset<String> ds = df.map(new BookUrlBuilder(), Encoders.STRING());
  ds.printSchema();
  ds.show(20, 80);
}
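
BookUrlBuilder is not shown; assuming it builds a lookup URL from each row's title, a sketch could be (the URL and column name are assumptions):

// Hypothetical sketch of BookUrlBuilder (Row -> URL string).
class BookUrlBuilder implements MapFunction<Row, String> {
  @Override
  public String call(Row row) throws Exception {
    String title = row.<String>getAs("title");                      // "title" column is assumed
    return "https://www.example.com/search?q="
        + java.net.URLEncoder.encode(title, "UTF-8");               // example.com is a placeholder
  }
}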
 
Example 6
Source File: DataSetApplication.java    From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder().master("local")
            .appName("Java Spark SQL")
            .getOrCreate();
    Person person = new Person("spark",10);
    Encoder<Person> encoder = Encoders.bean(Person.class);
    Dataset<Person> dataset = sparkSession.createDataset(Collections.singletonList(person),encoder);
    dataset.show();
    // Final output: {name: spark, age: 10}


    /* Encoders for common types */
    Encoder<Integer> integerEncoder = Encoders.INT();
    Dataset<Integer> integerDataset = sparkSession.createDataset(Arrays.asList(1,2),integerEncoder);
    Dataset<Integer> result = integerDataset.map(new MapFunction<Integer, Integer>() {
        @Override
        public Integer call(Integer value) {
            return value+1;
        }
    },integerEncoder);
    result.collect();
    // Final output: [2, 3]

    /* By providing a class, a DataFrame can be converted to a Dataset; fields are mapped by name. */
    String url = "/usr/local/text.json";
    Dataset<Person> personDataset = sparkSession.read().json(url).as(encoder);
    personDataset.show();
    // Final output: name: ...  age: ...
}
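
The Person bean is not shown; Encoders.bean(Person.class) requires a public no-argument constructor plus getters and setters, so a sketch would be:

// Sketch of the Person bean assumed by Encoders.bean(Person.class).
public class Person implements java.io.Serializable {
  private String name;
  private int age;

  public Person() {}                                                 // required by the bean encoder
  public Person(String name, int age) { this.name = name; this.age = age; }

  public String getName() { return name; }
  public void setName(String name) { this.name = name; }
  public int getAge() { return age; }
  public void setAge(int age) { this.age = age; }
}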
 
Example 7
Source File: HashDeriver.java    From envelope with Apache License 2.0
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) {
  String concatenatedFieldName = "_concatenated";

  Dataset<Row> dependency = getStepDataFrame(dependencies);

  Dataset<Row> concatenated = dependency.map(
      new ConcatenationFunction(delimiter, nullString, includeFields, excludeFields),
      RowEncoder.apply(dependency.schema().add(concatenatedFieldName, DataTypes.BinaryType)));

  return concatenated
      .withColumn(hashFieldName, functions.md5(functions.col(concatenatedFieldName)))
      .drop(concatenatedFieldName);
}
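
ConcatenationFunction is defined elsewhere in envelope; a simplified sketch of what it might do, appending one binary column holding the delimited concatenation of the row's values (the real class also honors includeFields and excludeFields, omitted here):

// Simplified sketch only; not envelope's actual implementation.
class ConcatenationFunction implements MapFunction<Row, Row> {
  private final String delimiter;
  private final String nullString;

  ConcatenationFunction(String delimiter, String nullString) {
    this.delimiter = delimiter;
    this.nullString = nullString;
  }

  @Override
  public Row call(Row row) throws Exception {
    StringBuilder concatenated = new StringBuilder();
    Object[] values = new Object[row.size() + 1];
    for (int i = 0; i < row.size(); i++) {
      values[i] = row.get(i);
      if (i > 0) {
        concatenated.append(delimiter);
      }
      concatenated.append(row.isNullAt(i) ? nullString : row.get(i).toString());
    }
    values[row.size()] = concatenated.toString().getBytes(java.nio.charset.StandardCharsets.UTF_8);
    return RowFactory.create(values);
  }
}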
 
Example 8
Source File: DataQualityDeriver.java    From envelope with Apache License 2.0
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
  if (dependencies.size() > 1 && dataset.isEmpty()) {
    throw new RuntimeException("Must specify dataset on which to conduct data quality tests when more than one dependency");
  }
  Dataset<Row> theDataset;
  Dataset<Row> theResults = null;
  if (dependencies.size() == 1) {
    theDataset = dependencies.values().iterator().next();
  } else {
    theDataset = dependencies.get(dataset);
  }
  if (scope == Scope.DATASET) {
    // The checks are run at a dataset level and we are simply returning a DS of <name, boolean> Rows
    for (DatasetRule rule : datasetRules.values()) {
      if (theResults == null) {
        theResults = rule.check(theDataset, dependencies);
      } else {
        theResults = theResults.union(rule.check(theDataset, dependencies));
      }
    }
  } else {
    if (theDataset.schema().getFieldIndex(resultsField).isDefined()) {
      throw new RuntimeException("The field [" + resultsField + "] already exists in the dataset schema. Use the " +
          RESULTS_FIELD_CONFIG + " configuration parameter to customize the data quality check field name");
    }
    List<StructField> checkField = Lists.newArrayList(
        new StructField(resultsField,
            DataTypes.createMapType(DataTypes.StringType, DataTypes.BooleanType),
            false, Metadata.empty()));
    theResults = theDataset.map(new CheckRowRules(rowRules, resultsField),
        RowEncoder.apply(SchemaUtils.appendFields(theDataset.schema(), checkField)));
  }

  return theResults;
}
 
Example 9
Source File: ValueSets.java    From bunsen with Apache License 2.0
/**
 * Returns a new ValueSets instance that includes the given value sets.
 *
 * @param valueSets the value sets to add to the returned collection.
 * @return a new ValueSets instance with the added value sets.
 */
@Override
public ValueSets withValueSets(Dataset<ValueSet> valueSets) {

  Dataset<UrlAndVersion> newMembers = getUrlAndVersions(valueSets);

  // Ensure that there are no duplicates among the value sets
  if (hasDuplicateUrlAndVersions(newMembers) || valueSets.count() != newMembers.count()) {

    throw new IllegalArgumentException(
        "Cannot add value sets having duplicate valueSetUri and valueSetVersion");
  }

  // The value set concepts will be stored in the values table for persistence, so we remove
  // them from the individual value sets. This can be done most easily by setting concepts to an
  // empty list.
  Dataset<ValueSet> withoutConcepts = valueSets.map((MapFunction<ValueSet,ValueSet>) valueSet -> {
    ValueSet valueSetWithoutConcepts = valueSet.copy();

    List<ConceptSetComponent> updatedInclusions = new ArrayList<>();

    for (ConceptSetComponent inclusion: valueSet.getCompose().getInclude()) {

      ConceptSetComponent inclusionWithoutConcepts = inclusion.copy();

      inclusionWithoutConcepts.setConcept(new ArrayList<>());
      updatedInclusions.add(inclusionWithoutConcepts);
    }

    valueSetWithoutConcepts.getCompose().setInclude(updatedInclusions);

    return valueSetWithoutConcepts;
  }, VALUE_SET_ENCODER);

  Dataset<Value> newValues = valueSets.flatMap(ValueSets::expandValuesIterator,
      getValueEncoder());

  return withValueSets(withoutConcepts, newValues);
}
 
Example 10
Source File: ConceptMaps.java    From bunsen with Apache License 2.0
@Override
public ConceptMaps withConceptMaps(Dataset<ConceptMap> conceptMaps) {

  Dataset<UrlAndVersion> newMembers = getUrlAndVersions(conceptMaps);

  if (hasDuplicateUrlAndVersions(newMembers) || conceptMaps.count() != newMembers.count()) {

    throw new IllegalArgumentException(
        "Cannot add concept maps having duplicate conceptMapUri and conceptMapVersion");
  }

  // Remove the concept contents for persistence. This is most easily done in the ConceptMap
  // object by setting the group to an empty list.
  Dataset<ConceptMap> withoutConcepts = conceptMaps
      .map((MapFunction<ConceptMap,ConceptMap>) conceptMap -> {

        // Remove the elements rather than the groups to preserve the
        // "unmapped" structure in a group that can refer to other
        // concept maps.
        ConceptMap withoutElements = conceptMap.copy();

        List<ConceptMapGroupComponent> updatedGroups = new ArrayList<>();

        for (ConceptMapGroupComponent group: withoutElements.getGroup()) {

          group.setElement(new ArrayList<>());
          updatedGroups.add(group);
        }

        withoutElements.setGroup(updatedGroups);

        return withoutElements;
      }, CONCEPT_MAP_ENCODER);

  Dataset<Mapping> newMappings = conceptMaps.flatMap(ConceptMaps::expandMappingsIterator,
      MAPPING_ENCODER);

  return withConceptMaps(withoutConcepts, newMappings);
}
 
Example 11
Source File: ReadSourceTranslatorBatch.java    From beam with Apache License 2.0
@SuppressWarnings("unchecked")
@Override
public void translateTransform(
    PTransform<PBegin, PCollection<T>> transform, TranslationContext context) {
  AppliedPTransform<PBegin, PCollection<T>, PTransform<PBegin, PCollection<T>>> rootTransform =
      (AppliedPTransform<PBegin, PCollection<T>, PTransform<PBegin, PCollection<T>>>)
          context.getCurrentTransform();

  BoundedSource<T> source;
  try {
    source = ReadTranslation.boundedSourceFromTransform(rootTransform);
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
  SparkSession sparkSession = context.getSparkSession();

  String serializedSource = Base64Serializer.serializeUnchecked(source);
  Dataset<Row> rowDataset =
      sparkSession
          .read()
          .format(sourceProviderClass)
          .option(DatasetSourceBatch.BEAM_SOURCE_OPTION, serializedSource)
          .option(
              DatasetSourceBatch.DEFAULT_PARALLELISM,
              String.valueOf(context.getSparkSession().sparkContext().defaultParallelism()))
          .option(
              DatasetSourceBatch.PIPELINE_OPTIONS, context.getSerializableOptions().toString())
          .load();

  // extract windowedValue from Row
  WindowedValue.FullWindowedValueCoder<T> windowedValueCoder =
      WindowedValue.FullWindowedValueCoder.of(
          source.getOutputCoder(), GlobalWindow.Coder.INSTANCE);

  Dataset<WindowedValue<T>> dataset =
      rowDataset.map(
          RowHelpers.extractWindowedValueFromRowMapFunction(windowedValueCoder),
          EncoderHelpers.fromBeamCoder(windowedValueCoder));

  PCollection<T> output = (PCollection<T>) context.getOutput();
  context.putDataset(output, dataset);
}
 
Example 12
Source File: PiComputeLambdaApp.java    From net.jgp.labs.spark with Apache License 2.0
/**
 * The processing code.
 */
private void start(int slices) {
  int numberOfThrows = 100000 * slices;
  System.out.println("About to throw " + numberOfThrows
      + " darts, ready? Stay away from the target!");

  long t0 = System.currentTimeMillis();
  SparkSession spark = SparkSession
      .builder()
      .appName("Spark Pi with lambdas")
      .master("local[*]")
      .getOrCreate();

  long t1 = System.currentTimeMillis();
  System.out.println("Session initialized in " + (t1 - t0) + " ms");

  List<Integer> l = new ArrayList<>(numberOfThrows);
  for (int i = 0; i < numberOfThrows; i++) {
    l.add(i);
  }
  Dataset<Row> incrementalDf = spark
      .createDataset(l, Encoders.INT())
      .toDF();

  long t2 = System.currentTimeMillis();
  System.out.println("Initial dataframe built in " + (t2 - t1) + " ms");

  Dataset<Integer> dotsDs = incrementalDf
      .map((MapFunction<Row, Integer>) status -> {
        double x = Math.random() * 2 - 1;
        double y = Math.random() * 2 - 1;
        counter++;
        if (counter % 100000 == 0) {
          System.out.println("" + counter + " darts thrown so far");
        }
        return (x * x + y * y <= 1) ? 1 : 0;
      }, Encoders.INT());

  long t3 = System.currentTimeMillis();
  System.out.println("Throwing darts done in " + (t3 - t2) + " ms");

  int dartsInCircle =
      dotsDs.reduce((ReduceFunction<Integer>) (x, y) -> x + y);
  long t4 = System.currentTimeMillis();
  System.out.println("Analyzing result in " + (t4 - t3) + " ms");

  System.out.println(
      "Pi is roughly " + 4.0 * dartsInCircle / numberOfThrows);

  spark.stop();
}
 
Example 13
Source File: ParDoTranslatorBatch.java    From beam with Apache License 2.0
@Override
public void translateTransform(
    PTransform<PCollection<InputT>, PCollectionTuple> transform, TranslationContext context) {
  String stepName = context.getCurrentTransform().getFullName();

  // Check for not supported advanced features
  // TODO: add support of Splittable DoFn
  DoFn<InputT, OutputT> doFn = getDoFn(context);
  checkState(
      !DoFnSignatures.isSplittable(doFn),
      "Not expected to directly translate splittable DoFn, should have been overridden: %s",
      doFn);

  // TODO: add support of states and timers
  checkState(
      !DoFnSignatures.isStateful(doFn), "States and timers are not supported for the moment.");

  checkState(
      !DoFnSignatures.requiresTimeSortedInput(doFn),
      "@RequiresTimeSortedInput is not " + "supported for the moment");

  DoFnSchemaInformation doFnSchemaInformation =
      ParDoTranslation.getSchemaInformation(context.getCurrentTransform());

  // Init main variables
  PValue input = context.getInput();
  Dataset<WindowedValue<InputT>> inputDataSet = context.getDataset(input);
  Map<TupleTag<?>, PValue> outputs = context.getOutputs();
  TupleTag<?> mainOutputTag = getTupleTag(context);
  List<TupleTag<?>> outputTags = new ArrayList<>(outputs.keySet());
  WindowingStrategy<?, ?> windowingStrategy =
      ((PCollection<InputT>) input).getWindowingStrategy();
  Coder<InputT> inputCoder = ((PCollection<InputT>) input).getCoder();
  Coder<? extends BoundedWindow> windowCoder = windowingStrategy.getWindowFn().windowCoder();

  // construct a map from side input to WindowingStrategy so that
  // the DoFn runner can map main-input windows to side input windows
  List<PCollectionView<?>> sideInputs = getSideInputs(context);
  Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputStrategies = new HashMap<>();
  for (PCollectionView<?> sideInput : sideInputs) {
    sideInputStrategies.put(sideInput, sideInput.getPCollection().getWindowingStrategy());
  }

  SideInputBroadcast broadcastStateData = createBroadcastSideInputs(sideInputs, context);

  Map<TupleTag<?>, Coder<?>> outputCoderMap = context.getOutputCoders();
  MetricsContainerStepMapAccumulator metricsAccum = MetricsAccumulator.getInstance();

  List<TupleTag<?>> additionalOutputTags = new ArrayList<>();
  for (TupleTag<?> tag : outputTags) {
    if (!tag.equals(mainOutputTag)) {
      additionalOutputTags.add(tag);
    }
  }

  Map<String, PCollectionView<?>> sideInputMapping =
      ParDoTranslation.getSideInputMapping(context.getCurrentTransform());
  @SuppressWarnings("unchecked")
  DoFnFunction<InputT, OutputT> doFnWrapper =
      new DoFnFunction(
          metricsAccum,
          stepName,
          doFn,
          windowingStrategy,
          sideInputStrategies,
          context.getSerializableOptions(),
          additionalOutputTags,
          mainOutputTag,
          inputCoder,
          outputCoderMap,
          broadcastStateData,
          doFnSchemaInformation,
          sideInputMapping);

  MultiOuputCoder multipleOutputCoder =
      MultiOuputCoder.of(SerializableCoder.of(TupleTag.class), outputCoderMap, windowCoder);
  Dataset<Tuple2<TupleTag<?>, WindowedValue<?>>> allOutputs =
      inputDataSet.mapPartitions(doFnWrapper, EncoderHelpers.fromBeamCoder(multipleOutputCoder));
  if (outputs.entrySet().size() > 1) {
    allOutputs.persist();
    for (Map.Entry<TupleTag<?>, PValue> output : outputs.entrySet()) {
      pruneOutputFilteredByTag(context, allOutputs, output, windowCoder);
    }
  } else {
    Coder<OutputT> outputCoder = ((PCollection<OutputT>) outputs.get(mainOutputTag)).getCoder();
    Coder<WindowedValue<?>> windowedValueCoder =
        (Coder<WindowedValue<?>>) (Coder<?>) WindowedValue.getFullCoder(outputCoder, windowCoder);
    Dataset<WindowedValue<?>> outputDataset =
        allOutputs.map(
            (MapFunction<Tuple2<TupleTag<?>, WindowedValue<?>>, WindowedValue<?>>)
                value -> value._2,
            EncoderHelpers.fromBeamCoder(windowedValueCoder));
    context.putDatasetWildcard(outputs.entrySet().iterator().next().getValue(), outputDataset);
  }
}
 
Example 14
Source File: PiComputeLambdaWithClassApp.java    From net.jgp.labs.spark with Apache License 2.0
/**
 * The processing code.
 */
private void start(int slices) {
  int numberOfThrows = 100000 * slices;
  System.out.println("About to throw " + numberOfThrows
      + " darts, ready? Stay away from the target!");

  long t0 = System.currentTimeMillis();
  SparkSession spark = SparkSession
      .builder()
      .appName("Spark Pi with lambdas")
      .master("local[*]")
      .getOrCreate();

  long t1 = System.currentTimeMillis();
  System.out.println("Session initialized in " + (t1 - t0) + " ms");

  List<Integer> l = new ArrayList<>(numberOfThrows);
  for (int i = 0; i < numberOfThrows; i++) {
    l.add(i);
  }
  Dataset<Row> incrementalDf = spark
      .createDataset(l, Encoders.INT())
      .toDF();

  long t2 = System.currentTimeMillis();
  System.out.println("Initial dataframe built in " + (t2 - t1) + " ms");

  Dataset<Integer> dotsDs = incrementalDf
      .map(new DartMapper(), Encoders.INT());

  long t3 = System.currentTimeMillis();
  System.out.println("Throwing darts done in " + (t3 - t2) + " ms");

  int dartsInCircle = dotsDs.reduce(new DartReducer());
  long t4 = System.currentTimeMillis();
  System.out.println("Analyzing result in " + (t4 - t3) + " ms");

  System.out
      .println("Pi is roughly " + 4.0 * dartsInCircle / numberOfThrows);

  spark.stop();
}
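
DartMapper and DartReducer are not shown; judging from the lambda version in Example 12, they would look roughly like this:

// Sketches mirroring the lambdas in Example 12; not necessarily the project's exact classes.
class DartMapper implements MapFunction<Row, Integer> {
  @Override
  public Integer call(Row row) throws Exception {
    double x = Math.random() * 2 - 1;      // random point in the [-1, 1] x [-1, 1] square
    double y = Math.random() * 2 - 1;
    return (x * x + y * y <= 1) ? 1 : 0;   // 1 if the dart lands inside the unit circle
  }
}

class DartReducer implements ReduceFunction<Integer> {
  @Override
  public Integer call(Integer x, Integer y) throws Exception {
    return x + y;                          // count the darts inside the circle
  }
}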
 
Example 15
Source File: JavaSparkSQLExample.java    From SparkDemo with MIT License
private static void runProgrammaticSchemaExample(SparkSession spark) {
  // $example on:programmatic_schema$
  // Create an RDD
  JavaRDD<String> peopleRDD = spark.sparkContext()
    .textFile(Constant.LOCAL_FILE_PREX +"/data/resources/people.txt", 1)
    .toJavaRDD();

  // The schema is encoded in a string
  String schemaString = "name age";

  // Generate the schema based on the string of schema
  List<StructField> fields = new ArrayList<>();
  for (String fieldName : schemaString.split(" ")) {
    StructField field = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
    fields.add(field);
  }
  StructType schema = DataTypes.createStructType(fields);

  // Convert records of the RDD (people) to Rows
  JavaRDD<Row> rowRDD = peopleRDD.map(new Function<String, Row>() {
    @Override
    public Row call(String record) throws Exception {
      String[] attributes = record.split(",");
      return RowFactory.create(attributes[0], attributes[1].trim());
    }
  });

  // Apply the schema to the RDD
  Dataset<Row> peopleDataFrame = spark.createDataFrame(rowRDD, schema);

  // Creates a temporary view using the DataFrame
  peopleDataFrame.createOrReplaceTempView("people");

  // SQL can be run over a temporary view created using DataFrames
  Dataset<Row> results = spark.sql("SELECT name FROM people");

  // The results of SQL queries are DataFrames and support all the normal RDD operations
  // The columns of a row in the result can be accessed by field index or by field name
  Dataset<String> namesDS = results.map(new MapFunction<Row, String>() {
    @Override
    public String call(Row row) throws Exception {
      return "Name: " + row.getString(0);
    }
  }, Encoders.STRING());
  namesDS.show();
  // +-------------+
  // |        value|
  // +-------------+
  // |Name: Michael|
  // |   Name: Andy|
  // | Name: Justin|
  // +-------------+
  // $example off:programmatic_schema$
}
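
On Java 8 and later, the anonymous MapFunction above can be written as a lambda, cast to MapFunction so the correct map overload is selected; for example:

// Equivalent lambda form of the Row-to-String mapping.
Dataset<String> namesDS = results.map(
    (MapFunction<Row, String>) row -> "Name: " + row.getString(0),
    Encoders.STRING());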
 
Example 16
Source File: JavaSparkSQLExample.java    From SparkDemo with MIT License
private static void runDatasetCreationExample(SparkSession spark) {
  // $example on:create_ds$
  // Create an instance of a Bean class
  Person person = new Person();
  person.setName("Andy");
  person.setAge(32);

  // Encoders are created for Java beans
  Encoder<Person> personEncoder = Encoders.bean(Person.class);
  Dataset<Person> javaBeanDS = spark.createDataset(
    Collections.singletonList(person),
    personEncoder
  );
  javaBeanDS.show();
  // +---+----+
  // |age|name|
  // +---+----+
  // | 32|Andy|
  // +---+----+

  // Encoders for most common types are provided in class Encoders
  Encoder<Integer> integerEncoder = Encoders.INT();
  Dataset<Integer> primitiveDS = spark.createDataset(Arrays.asList(1, 2, 3), integerEncoder);
  Dataset<Integer> transformedDS = primitiveDS.map(new MapFunction<Integer, Integer>() {
    @Override
    public Integer call(Integer value) throws Exception {
      return value + 1;
    }
  }, integerEncoder);
  transformedDS.collect(); // Returns [2, 3, 4]

  // DataFrames can be converted to a Dataset by providing a class. Mapping based on name
  String path = Constant.LOCAL_FILE_PREX +"/data/resources/people.json";
  Dataset<Person> peopleDS = spark.read().json(path).as(personEncoder);
  peopleDS.show();
  // +----+-------+
  // | age|   name|
  // +----+-------+
  // |null|Michael|
  // |  30|   Andy|
  // |  19| Justin|
  // +----+-------+
  // $example off:create_ds$
}
 
Example 17
Source File: JavaSparkHiveExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  // $example on:spark_hive$
  // warehouseLocation points to the default location for managed databases and tables
  String warehouseLocation = "spark-warehouse";
  SparkSession spark = SparkSession
    .builder()
    .appName("Java Spark Hive Example")
    .config("spark.sql.warehouse.dir", warehouseLocation)
    .enableHiveSupport()
    .getOrCreate();

  spark.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)");
  spark.sql("LOAD DATA LOCAL INPATH 'data/resources/kv1.txt' INTO TABLE src");

  // Queries are expressed in HiveQL
  spark.sql("SELECT * FROM src").show();
  // +---+-------+
  // |key|  value|
  // +---+-------+
  // |238|val_238|
  // | 86| val_86|
  // |311|val_311|
  // ...

  // Aggregation queries are also supported.
  spark.sql("SELECT COUNT(*) FROM src").show();
  // +--------+
  // |count(1)|
  // +--------+
  // |    500 |
  // +--------+

  // The results of SQL queries are themselves DataFrames and support all normal functions.
  Dataset<Row> sqlDF = spark.sql("SELECT key, value FROM src WHERE key < 10 ORDER BY key");

  // The items in DataFrames are of type Row, which lets you access each column by ordinal.
  Dataset<String> stringsDS = sqlDF.map(new MapFunction<Row, String>() {
    @Override
    public String call(Row row) throws Exception {
      return "Key: " + row.get(0) + ", Value: " + row.get(1);
    }
  }, Encoders.STRING());
  stringsDS.show();
  // +--------------------+
  // |               value|
  // +--------------------+
  // |Key: 0, Value: val_0|
  // |Key: 0, Value: val_0|
  // |Key: 0, Value: val_0|
  // ...

  // You can also use DataFrames to create temporary views within a SparkSession.
  List<Record> records = new ArrayList<>();
  for (int key = 1; key < 100; key++) {
    Record record = new Record();
    record.setKey(key);
    record.setValue("val_" + key);
    records.add(record);
  }
  Dataset<Row> recordsDF = spark.createDataFrame(records, Record.class);
  recordsDF.createOrReplaceTempView("records");

  // Queries can then join DataFrame data with data stored in Hive.
  spark.sql("SELECT * FROM records r JOIN src s ON r.key = s.key").show();
  // +---+------+---+------+
  // |key| value|key| value|
  // +---+------+---+------+
  // |  2| val_2|  2| val_2|
  // |  2| val_2|  2| val_2|
  // |  4| val_4|  4| val_4|
  // ...
  // $example off:spark_hive$

  spark.stop();
}
 
Example 18
Source File: ReadSourceTranslatorStreaming.java    From beam with Apache License 2.0
@SuppressWarnings("unchecked")
@Override
public void translateTransform(
    PTransform<PBegin, PCollection<T>> transform, TranslationContext context) {
  AppliedPTransform<PBegin, PCollection<T>, PTransform<PBegin, PCollection<T>>> rootTransform =
      (AppliedPTransform<PBegin, PCollection<T>, PTransform<PBegin, PCollection<T>>>)
          context.getCurrentTransform();

  UnboundedSource<T, UnboundedSource.CheckpointMark> source;
  try {
    source = ReadTranslation.unboundedSourceFromTransform(rootTransform);
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
  SparkSession sparkSession = context.getSparkSession();

  String serializedSource = Base64Serializer.serializeUnchecked(source);
  Dataset<Row> rowDataset =
      sparkSession
          .readStream()
          .format(sourceProviderClass)
          .option(DatasetSourceStreaming.BEAM_SOURCE_OPTION, serializedSource)
          .option(
              DatasetSourceStreaming.DEFAULT_PARALLELISM,
              String.valueOf(context.getSparkSession().sparkContext().defaultParallelism()))
          .option(
              DatasetSourceStreaming.PIPELINE_OPTIONS,
              context.getSerializableOptions().toString())
          .load();

  // extract windowedValue from Row
  WindowedValue.FullWindowedValueCoder<T> windowedValueCoder =
      WindowedValue.FullWindowedValueCoder.of(
          source.getOutputCoder(), GlobalWindow.Coder.INSTANCE);
  Dataset<WindowedValue<T>> dataset =
      rowDataset.map(
          RowHelpers.extractWindowedValueFromRowMapFunction(windowedValueCoder),
          EncoderHelpers.fromBeamCoder(windowedValueCoder));

  PCollection<T> output = (PCollection<T>) context.getOutput();
  context.putDataset(output, dataset);
}
 
Example 19
Source File: StructuredKafkaSource08.java    From sylph with Apache License 2.0
public Dataset<Row> createSource(SparkSession spark, KafkaSourceConfig08 config, SourceContext context)
{
    String topics = requireNonNull(config.getTopics(), "topics not setting");
    String brokers = requireNonNull(config.getBrokers(), "brokers not setting"); // the cluster hosts must be resolvable from the machine running this program
    String groupId = requireNonNull(config.getGroupid(), "group.id not setting"); // consumer group name
    String offsetMode = requireNonNull(config.getOffsetMode(), "offsetMode not setting");

    Map<String, String> otherConfig = config.getOtherConfig().entrySet()
            .stream()
            .filter(x -> x.getValue() != null)
            .collect(Collectors.toMap(Map.Entry::getKey, v -> v.getValue().toString()));

    Map<String, String> kafkaParams = new HashMap<>(otherConfig);
    kafkaParams.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers);
    //kafkaParams.put("auto.commit.enable", true); //不自动提交偏移量
    //      "fetch.message.max.bytes" ->
    //      "session.timeout.ms" -> "30000", //session默认是30秒
    //      "heartbeat.interval.ms" -> "5000", //10秒提交一次 心跳周期
    kafkaParams.put(ConsumerConfig.GROUP_ID_CONFIG, groupId); //注意不同的流 group.id必须要不同 否则会出现offect commit提交失败的错误
    kafkaParams.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, offsetMode); //largest   smallest

    Dataset<Row> kafka08 = spark.readStream()
            .format(KafkaDataSource08.class.getName())
            .option("topics", topics)
            .options(kafkaParams)
            .load();

    if ("json".equalsIgnoreCase(config.getValueType())) {
        JsonSchema jsonParser = new JsonSchema(context.getSchema());
        return kafka08
                .map((MapFunction<Row, Row>) record -> {
                    return jsonParser.deserialize(
                            record.getAs("_key"),
                            record.getAs("_message"),
                            record.<String>getAs("_topic"),
                            record.<Integer>getAs("_partition"),
                            record.<Long>getAs("_offset"));
                }, RowEncoder.apply(jsonParser.getProducedType()));
    }
    else {
        StructType structType = schemaToSparkType(context.getSchema());
        String[] columns = Arrays.stream(structType.names()).map(name -> {
            switch (name) {
                case "_key":
                    return "CAST(_key AS STRING) as _key";
                case "_message":
                    return "CAST(_message AS STRING) as _message";
                default:
                    return name;
            }
        }).toArray(String[]::new);
        return kafka08.selectExpr(columns); // cast the input columns to the expected types
    }
}
 
Example 20
Source File: Functions.java    From bunsen with Apache License 2.0
/**
 * Converts a set of FHIR resources to JSON.
 *
 * @param dataset a dataset containing FHIR resources
 * @param resourceTypeUrl the FHIR resource type
 * @return a dataset of JSON strings for the FHIR resources
 */
public static Dataset<String> toJson(Dataset<Row> dataset, String resourceTypeUrl) {

  return dataset.map(new ToJson(resourceTypeUrl), Encoders.STRING());
}
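
A hypothetical invocation, assuming a Dataset<Row> of encoded Patient resources named patientRows:

// Usage sketch; patientRows and the Patient type URL are assumptions.
Dataset<String> patientJson =
    Functions.toJson(patientRows, "http://hl7.org/fhir/StructureDefinition/Patient");
patientJson.show(5, false);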