org.apache.spark.api.java.function.MapFunction Java Examples

The following examples show how to use org.apache.spark.api.java.function.MapFunction. Each example is taken from an open-source project; the source file, originating project, and license are noted above each snippet.
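Because MapFunction is a single-method functional interface, in Java 8+ it is usually supplied as a lambda cast to the target MapFunction type, paired with an Encoder that describes the result type. Below is a minimal, self-contained sketch of that pattern; the class name, app name, and sample data are illustrative assumptions rather than code from any of the projects listed here.

import java.util.Arrays;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

public class MapFunctionSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("MapFunction sketch")   // illustrative name
        .master("local[*]")
        .getOrCreate();

    Dataset<String> words = spark.createDataset(
        Arrays.asList("spark", "map", "function"), Encoders.STRING());

    // The cast tells the compiler which overload of Dataset#map to use;
    // the Encoder tells Spark how to serialize the mapped values.
    Dataset<Integer> lengths = words.map(
        (MapFunction<String, Integer>) value -> value.length(),
        Encoders.INT());

    lengths.show();
    spark.stop();
  }
}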
Example #1
Source File: ParDoTranslatorBatch.java    From beam with Apache License 2.0
private void pruneOutputFilteredByTag(
    TranslationContext context,
    Dataset<Tuple2<TupleTag<?>, WindowedValue<?>>> allOutputs,
    Map.Entry<TupleTag<?>, PValue> output,
    Coder<? extends BoundedWindow> windowCoder) {
  Dataset<Tuple2<TupleTag<?>, WindowedValue<?>>> filteredDataset =
      allOutputs.filter(new DoFnFilterFunction(output.getKey()));
  Coder<WindowedValue<?>> windowedValueCoder =
      (Coder<WindowedValue<?>>)
          (Coder<?>)
              WindowedValue.getFullCoder(
                  ((PCollection<OutputT>) output.getValue()).getCoder(), windowCoder);
  Dataset<WindowedValue<?>> outputDataset =
      filteredDataset.map(
          (MapFunction<Tuple2<TupleTag<?>, WindowedValue<?>>, WindowedValue<?>>)
              value -> value._2,
          EncoderHelpers.fromBeamCoder(windowedValueCoder));
  context.putDatasetWildcard(output.getValue(), outputDataset);
}
 
Example #2
Source File: RewriteManifestsAction.java    From iceberg with Apache License 2.0
private <T, U> U withReusableDS(Dataset<T> ds, Function<Dataset<T>, U> func) {
  Dataset<T> reusableDS;
  if (useCaching) {
    reusableDS = ds.cache();
  } else {
    int parallelism = SQLConf.get().numShufflePartitions();
    reusableDS = ds.repartition(parallelism).map((MapFunction<T, T>) value -> value, ds.exprEnc());
  }

  try {
    return func.apply(reusableDS);
  } finally {
    if (useCaching) {
      reusableDS.unpersist(false);
    }
  }
}
 
Example #3
Source File: Loinc.java    From bunsen with Apache License 2.0
/**
 * Reads the LOINC multiaxial hierarchy file and converts it to a {@link HierarchicalElement}
 * dataset.
 *
 * @param spark the Spark session
 * @param loincHierarchyPath path to the multiaxial hierarchy CSV
 * @return a dataset of {@link HierarchicalElement} representing the hierarchical relationship.
 */
public static Dataset<HierarchicalElement> readMultiaxialHierarchyFile(SparkSession spark,
    String loincHierarchyPath) {

  return spark.read()
      .option("header", true)
      .csv(loincHierarchyPath)
      .select(col("IMMEDIATE_PARENT"), col("CODE"))
      .where(col("IMMEDIATE_PARENT").isNotNull()
          .and(col("IMMEDIATE_PARENT").notEqual(lit(""))))
      .where(col("CODE").isNotNull()
          .and(col("CODE").notEqual(lit(""))))
      .map((MapFunction<Row, HierarchicalElement>) row -> {

        HierarchicalElement element = new HierarchicalElement();

        element.setAncestorSystem(LOINC_CODE_SYSTEM_URI);
        element.setAncestorValue(row.getString(0));

        element.setDescendantSystem(LOINC_CODE_SYSTEM_URI);
        element.setDescendantValue(row.getString(1));

        return element;
      }, Hierarchies.getHierarchicalElementEncoder());
}
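A call site for this method only needs a Spark session and the hierarchy file location; the path below is an illustrative assumption, not part of the Bunsen sources.

  // Hypothetical usage; the file path is an assumption.
  Dataset<HierarchicalElement> loincHierarchy =
      Loinc.readMultiaxialHierarchyFile(spark, "/path/to/loinc_multiaxial_hierarchy.csv");
  loincHierarchy.show(5);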
 
Example #4
Source File: SparkCubingJobTest.java    From kylin-on-parquet-v2 with Apache License 2.0
private Dataset<Row> dsConvertToOriginal(Dataset<Row> layoutDs, LayoutEntity entity) {
    Map<Integer, FunctionDesc> orderedMeasures = entity.getOrderedMeasures();

    for (final Map.Entry<Integer, FunctionDesc> entry : orderedMeasures.entrySet()) {
        FunctionDesc functionDesc = entry.getValue();
        if (functionDesc != null) {
            final String[] columns = layoutDs.columns();
            String functionName = functionDesc.returnType().dataType();

            if ("bitmap".equals(functionName)) {
                final int finalIndex = convertOutSchema(layoutDs, entry.getKey().toString(), DataTypes.LongType);
                PreciseCountDistinct preciseCountDistinct = new PreciseCountDistinct(null);
                layoutDs = layoutDs.map((MapFunction<Row, Row>) value -> {
                    Object[] ret = new Object[value.size()];
                    for (int i = 0; i < columns.length; i++) {
                        if (i == finalIndex) {
                            byte[] bytes = (byte[]) value.get(i);
                            Roaring64NavigableMap bitmapCounter = preciseCountDistinct.deserialize(bytes);
                            ret[i] = bitmapCounter.getLongCardinality();
                        } else {
                            ret[i] = value.get(i);
                        }
                    }
                    return RowFactory.create(ret);
                }, RowEncoder.apply(OUT_SCHEMA));
            }
        }
    }
    return layoutDs;
}
 
Example #5
Source File: WindowingHelpers.java    From beam with Apache License 2.0
public static <T, W extends BoundedWindow>
    MapFunction<WindowedValue<T>, WindowedValue<T>> assignWindowsMapFunction(
        WindowFn<T, W> windowFn) {
  return (MapFunction<WindowedValue<T>, WindowedValue<T>>)
      windowedValue -> {
        final BoundedWindow boundedWindow = Iterables.getOnlyElement(windowedValue.getWindows());
        final T element = windowedValue.getValue();
        final Instant timestamp = windowedValue.getTimestamp();
        Collection<W> windows =
            windowFn.assignWindows(
                windowFn.new AssignContext() {

                  @Override
                  public T element() {
                    return element;
                  }

                  @Override
                  public Instant timestamp() {
                    return timestamp;
                  }

                  @Override
                  public BoundedWindow window() {
                    return boundedWindow;
                  }
                });
        return WindowedValue.of(element, timestamp, windows, windowedValue.getPane());
      };
}
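The returned MapFunction is meant to be passed to Dataset#map together with an encoder for the windowed values; a hedged fragment follows, in which inputDataset, windowFn, and windowedValueEncoder are assumed to be in scope and are not defined in this file.

  // Hypothetical usage; the dataset, window function, and encoder are assumptions.
  Dataset<WindowedValue<T>> windowedDataset =
      inputDataset.map(
          WindowingHelpers.assignWindowsMapFunction(windowFn),
          windowedValueEncoder);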
 
Example #6
Source File: RowHelpers.java    From beam with Apache License 2.0
/**
 * A Spark {@link MapFunction} for extracting a {@link WindowedValue} from a Row in which the
 * {@link WindowedValue} was serialized to bytes using its {@link
 * WindowedValue.WindowedValueCoder}.
 *
 * @param <T> The type of the object.
 * @return A {@link MapFunction} that accepts a {@link Row} and returns its {@link WindowedValue}.
 */
public static <T> MapFunction<Row, WindowedValue<T>> extractWindowedValueFromRowMapFunction(
    WindowedValue.WindowedValueCoder<T> windowedValueCoder) {
  return (MapFunction<Row, WindowedValue<T>>)
      value -> {
        // there is only one value put in each Row by the InputPartitionReader
        byte[] bytes = (byte[]) value.get(0);
        return windowedValueCoder.decode(new ByteArrayInputStream(bytes));
      };
}
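As with the other helpers in this runner, the returned MapFunction is applied via Dataset#map with a Beam-coder-backed encoder (compare Example #1); in the hedged fragment below, rowDataset is an assumed Dataset<Row> standing in for the rows produced by the source.

  // Hypothetical usage; rowDataset and windowedValueCoder are assumed to be in scope.
  Dataset<WindowedValue<T>> windowedValues =
      rowDataset.map(
          RowHelpers.extractWindowedValueFromRowMapFunction(windowedValueCoder),
          EncoderHelpers.fromBeamCoder(windowedValueCoder));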
 
Example #7
Source File: Dataset.java    From nemo with Apache License 2.0
@Override
public <U> Dataset<U> map(final MapFunction<T, U> func, final Encoder<U> encoder) {
  final boolean userTriggered = initializeFunction(func, encoder);
  final Dataset<U> result = from(super.map(func, encoder));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example #8
Source File: JavaSparkSQLExample.java    From nemo with Apache License 2.0
/**
 * Function to run data creation example.
 * @param spark spark session.
 * @param peopleJson path to people json file.
 */
private static void runDatasetCreationExample(final SparkSession spark, final String peopleJson) {
  // Create an instance of a Bean class
  Person person = new Person();
  person.setName("Andy");
  person.setAge(32);

  // Encoders are created for Java beans
  Encoder<Person> personEncoder = Encoders.bean(Person.class);
  Dataset<Person> javaBeanDS = spark.createDataset(
      Collections.singletonList(person),
      personEncoder
  );
  javaBeanDS.show();
  // +---+----+
  // |age|name|
  // +---+----+
  // | 32|Andy|
  // +---+----+

  // Encoders for most common types are provided in class Encoders
  Encoder<Integer> integerEncoder = Encoders.INT();
  Dataset<Integer> primitiveDS = spark.createDataset(Arrays.asList(1, 2, 3), integerEncoder);
  Dataset<Integer> transformedDS = primitiveDS.map(
      (MapFunction<Integer, Integer>) value -> value + 1,
      integerEncoder);
  transformedDS.collect(); // Returns [2, 3, 4]

  // DataFrames can be converted to a Dataset by providing a class. Mapping based on name
  String path = peopleJson;
  Dataset<Person> peopleDS = spark.read().json(path).as(personEncoder);
  peopleDS.show();
  // +----+-------+
  // | age|   name|
  // +----+-------+
  // |null|Michael|
  // |  30|   Andy|
  // |  19| Justin|
  // +----+-------+
}
 
Example #9
Source File: Snomed.java    From bunsen with Apache License 2.0
/**
 * Reads a Snomed relationship file and converts it to a {@link HierarchicalElement} dataset.
 *
 * @param spark the Spark session
 * @param snomedRelationshipPath path to the SNOMED relationship file
 * @return a dataset of {@link HierarchicalElement} representing the hierarchical relationship.
 */
public static Dataset<HierarchicalElement> readRelationshipFile(SparkSession spark,
    String snomedRelationshipPath) {

  return spark.read()
      .option("header", true)
      .option("delimiter", "\t")
      .csv(snomedRelationshipPath)
      .where(col("typeId").equalTo(lit(SNOMED_ISA_RELATIONSHIP_ID)))
      .where(col("active").equalTo(lit("1")))
      .select(col("destinationId"), col("sourceId"))
      .where(col("destinationId").isNotNull()
          .and(col("destinationId").notEqual(lit(""))))
      .where(col("sourceId").isNotNull()
          .and(col("sourceId").notEqual(lit(""))))
      .map((MapFunction<Row, HierarchicalElement>) row -> {

        HierarchicalElement element = new HierarchicalElement();

        element.setAncestorSystem(SNOMED_CODE_SYSTEM_URI);
        element.setAncestorValue(row.getString(0));

        element.setDescendantSystem(SNOMED_CODE_SYSTEM_URI);
        element.setDescendantValue(row.getString(1));

        return element;
      }, Hierarchies.getHierarchicalElementEncoder());
}
 
Example #10
Source File: ConceptMaps.java    From bunsen with Apache License 2.0
@Override
public ConceptMaps withConceptMaps(Dataset<ConceptMap> conceptMaps) {

  Dataset<UrlAndVersion> newMembers = getUrlAndVersions(conceptMaps);

  if (hasDuplicateUrlAndVersions(newMembers) || conceptMaps.count() != newMembers.count()) {

    throw new IllegalArgumentException(
        "Cannot add concept maps having duplicate conceptMapUri and conceptMapVersion");
  }

  // Remove the concept contents for persistence. This is most easily done in the ConceptMap
  // object by setting the group to an empty list.
  Dataset<ConceptMap> withoutConcepts = conceptMaps
      .map((MapFunction<ConceptMap,ConceptMap>) conceptMap -> {

        // Remove the elements rather than the groups to preserve the
        // "unmapped" structure in a group that can refer to other
        // concept maps.
        ConceptMap withoutElements = conceptMap.copy();

        List<ConceptMapGroupComponent> updatedGroups = new ArrayList<>();

        for (ConceptMapGroupComponent group: withoutElements.getGroup()) {

          group.setElement(new ArrayList<>());
          updatedGroups.add(group);
        }

        withoutElements.setGroup(updatedGroups);

        return withoutElements;
      }, CONCEPT_MAP_ENCODER);

  Dataset<Mapping> newMappings = conceptMaps.flatMap(ConceptMaps::expandMappingsIterator,
      MAPPING_ENCODER);

  return withConceptMaps(withoutConcepts, newMappings);
}
 
Example #11
Source File: ValueSets.java    From bunsen with Apache License 2.0
/**
 * Returns a new ValueSets instance that includes the given value sets.
 *
 * @param valueSets the value sets to add to the returned collection.
 * @return a new ValueSets instance with the added value sets.
 */
@Override
public ValueSets withValueSets(Dataset<ValueSet> valueSets) {

  Dataset<UrlAndVersion> newMembers = getUrlAndVersions(valueSets);

  // Ensure that there are no duplicates among the value sets
  if (hasDuplicateUrlAndVersions(newMembers) || valueSets.count() != newMembers.count()) {

    throw new IllegalArgumentException(
        "Cannot add value sets having duplicate valueSetUri and valueSetVersion");
  }

  // The value set concepts will be stored in the values table for persistence, so we remove
  // them from the individual value sets. This can be done most easily by setting concepts to an
  // empty list.
  Dataset<ValueSet> withoutConcepts = valueSets.map((MapFunction<ValueSet,ValueSet>) valueSet -> {
    ValueSet valueSetWithoutConcepts = valueSet.copy();

    List<ConceptSetComponent> updatedInclusions = new ArrayList<>();

    for (ConceptSetComponent inclusion: valueSet.getCompose().getInclude()) {

      ConceptSetComponent inclusionWithoutConcepts = inclusion.copy();

      inclusionWithoutConcepts.setConcept(new ArrayList<>());
      updatedInclusions.add(inclusionWithoutConcepts);
    }

    valueSetWithoutConcepts.getCompose().setInclude(updatedInclusions);

    return valueSetWithoutConcepts;
  }, VALUE_SET_ENCODER);

  Dataset<Value> newValues = valueSets.flatMap(ValueSets::expandValuesIterator,
      getValueEncoder());

  return withValueSets(withoutConcepts, newValues);
}
 
Example #12
Source File: JavaSQLDataSourceExample.java    From SparkDemo with MIT License
private static void runBasicParquetExample(SparkSession spark) {
  // $example on:basic_parquet_example$
  Dataset<Row> peopleDF = spark.read().json(Constant.LOCAL_FILE_PREX +"/data/resources/people.json");

  // DataFrames can be saved as Parquet files, maintaining the schema information
  peopleDF.write().parquet("people.parquet");

  // Read in the Parquet file created above.
  // Parquet files are self-describing so the schema is preserved
  // The result of loading a parquet file is also a DataFrame
  Dataset<Row> parquetFileDF = spark.read().parquet("people.parquet");

  // Parquet files can also be used to create a temporary view and then used in SQL statements
  parquetFileDF.createOrReplaceTempView("parquetFile");
  Dataset<Row> namesDF = spark.sql("SELECT name FROM parquetFile WHERE age BETWEEN 13 AND 19");
  Dataset<String> namesDS = namesDF.map(new MapFunction<Row, String>() {
    @Override
    public String call(Row row) {
      return "Name: " + row.getString(0);
    }
  }, Encoders.STRING());
  namesDS.show();
  // +------------+
  // |       value|
  // +------------+
  // |Name: Justin|
  // +------------+
  // $example off:basic_parquet_example$
}
 
Example #13
Source File: DataSetApplication.java    From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder().master("local")
            .appName("Java Spark SQL")
            .getOrCreate();
    Person person = new Person("spark",10);
    Encoder<Person> encoder = Encoders.bean(Person.class);
    Dataset<Person> dataset = sparkSession.createDataset(Collections.singletonList(person),encoder);
    dataset.show();
    // prints {name:spark;age:10}


    /* Encoders for common types */
    Encoder<Integer> integerEncoder = Encoders.INT();
    Dataset<Integer> integerDataset = sparkSession.createDataset(Arrays.asList(1,2),integerEncoder);
    Dataset<Integer> result = integerDataset.map(new MapFunction<Integer, Integer>() {
        @Override
        public Integer call(Integer value) {
            return value+1;
        }
    },integerEncoder);
    result.collect();
    // result is [2,3]

    /* A DataFrame can be converted to a Dataset by providing a class; the mapping is based on field names */
    String url = "/usr/local/text.json";
    Dataset<Person> personDataset = sparkSession.read().json(url).as(encoder);
    personDataset.show();
    // prints name: ...  age: ...
}
 
Example #14
Source File: NManualBuildAndQueryCuboidTest.java    From kylin-on-parquet-v2 with Apache License 2.0
private Dataset<Row> dsConvertToOriginal(Dataset<Row> layoutDs, LayoutEntity entity) {
    Map<Integer, FunctionDesc> orderedMeasures = entity.getOrderedMeasures();

    for (final Map.Entry<Integer, FunctionDesc> entry : orderedMeasures.entrySet()) {
        FunctionDesc functionDesc = entry.getValue();
        if (functionDesc != null) {
            final String[] columns = layoutDs.columns();
            String functionName = functionDesc.returnType().dataType();

            if ("bitmap".equals(functionName)) {
                final int finalIndex = convertOutSchema(layoutDs, entry.getKey().toString(), DataTypes.LongType);
                PreciseCountDistinct preciseCountDistinct = new PreciseCountDistinct(null);
                layoutDs = layoutDs.map((MapFunction<Row, Row>) value -> {
                    Object[] ret = new Object[value.size()];
                    for (int i = 0; i < columns.length; i++) {
                        if (i == finalIndex) {
                            byte[] bytes = (byte[]) value.get(i);
                            Roaring64NavigableMap bitmapCounter = preciseCountDistinct.deserialize(bytes);
                            ret[i] = bitmapCounter.getLongCardinality();
                        } else {
                            ret[i] = value.get(i);
                        }
                    }
                    return RowFactory.create(ret);
                }, RowEncoder.apply(OUT_SCHEMA));
            }
        }
    }
    return layoutDs;
}
 
Example #15
Source File: JavaSparkSQLExample.java    From incubator-nemo with Apache License 2.0
/**
 * Function to run data creation example.
 *
 * @param spark      spark session.
 * @param peopleJson path to people json file.
 */
private static void runDatasetCreationExample(final SparkSession spark, final String peopleJson) {
  // Create an instance of a Bean class
  Person person = new Person();
  person.setName("Andy");
  person.setAge(32);

  // Encoders are created for Java beans
  Encoder<Person> personEncoder = Encoders.bean(Person.class);
  Dataset<Person> javaBeanDS = spark.createDataset(
    Collections.singletonList(person),
    personEncoder
  );
  javaBeanDS.show();
  // +---+----+
  // |age|name|
  // +---+----+
  // | 32|Andy|
  // +---+----+

  // Encoders for most common types are provided in class Encoders
  Encoder<Integer> integerEncoder = Encoders.INT();
  Dataset<Integer> primitiveDS = spark.createDataset(Arrays.asList(1, 2, 3), integerEncoder);
  Dataset<Integer> transformedDS = primitiveDS.map(
    (MapFunction<Integer, Integer>) value -> value + 1,
    integerEncoder);
  transformedDS.collect(); // Returns [2, 3, 4]

  // DataFrames can be converted to a Dataset by providing a class. Mapping based on name
  String path = peopleJson;
  Dataset<Person> peopleDS = spark.read().json(path).as(personEncoder);
  peopleDS.show();
  // +----+-------+
  // | age|   name|
  // +----+-------+
  // |null|Michael|
  // |  30|   Andy|
  // |  19| Justin|
  // +----+-------+
}
 
Example #16
Source File: Dataset.java    From incubator-nemo with Apache License 2.0
@Override
public <U> Dataset<U> map(final MapFunction<T, U> func, final Encoder<U> encoder) {
  final boolean userTriggered = initializeFunction(func, encoder);
  final Dataset<U> result = from(super.map(func, encoder));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example #17
Source File: JavaSparkSQLExample.java    From incubator-nemo with Apache License 2.0
/**
 * Function to run programmatic schema example.
 *
 * @param spark     spark session.
 * @param peopleTxt path to people txt file.
 */
private static void runProgrammaticSchemaExample(final SparkSession spark, final String peopleTxt) {
  // Create an RDD
  SparkJavaRDD<String> peopleRDD = spark.read()
    .textFile(peopleTxt)
    .toJavaRDD();

  // The schema is encoded in a string
  String schemaString = "name age";

  // Generate the schema based on the string of schema
  List<StructField> fields = new ArrayList<>();
  for (String fieldName : schemaString.split(" ")) {
    StructField field = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
    fields.add(field);
  }
  StructType schema = DataTypes.createStructType(fields);

  // Convert records of the RDD (people) to Rows
  SparkJavaRDD<Row> rowRDD = peopleRDD.map((Function<String, Row>) record -> {
    String[] attributes = record.split(",");
    return RowFactory.create(attributes[0], attributes[1].trim());
  });

  // Apply the schema to the RDD
  Dataset<Row> peopleDataFrame = spark.createDataFrame(rowRDD, schema);

  // Creates a temporary view using the DataFrame
  peopleDataFrame.createOrReplaceTempView(PEOPLE);

  // SQL can be run over a temporary view created using DataFrames
  Dataset<Row> results = spark.sql("SELECT name FROM people");

  // The results of SQL queries are DataFrames and support all the normal RDD operations
  // The columns of a row in the result can be accessed by field index or by field name
  Dataset<String> namesDS = results.map(
    (MapFunction<Row, String>) row -> NAME + row.getString(0),
    Encoders.STRING());
  namesDS.show();
  // +-------------+
  // |        value|
  // +-------------+
  // |Name: Michael|
  // |   Name: Andy|
  // | Name: Justin|
  // +-------------+
}
 
Example #18
Source File: PiComputeLambdaApp.java    From net.jgp.labs.spark with Apache License 2.0
/**
 * The processing code.
 */
private void start(int slices) {
  int numberOfThrows = 100000 * slices;
  System.out.println("About to throw " + numberOfThrows
      + " darts, ready? Stay away from the target!");

  long t0 = System.currentTimeMillis();
  SparkSession spark = SparkSession
      .builder()
      .appName("Spark Pi with lambdas")
      .master("local[*]")
      .getOrCreate();

  long t1 = System.currentTimeMillis();
  System.out.println("Session initialized in " + (t1 - t0) + " ms");

  List<Integer> l = new ArrayList<>(numberOfThrows);
  for (int i = 0; i < numberOfThrows; i++) {
    l.add(i);
  }
  Dataset<Row> incrementalDf = spark
      .createDataset(l, Encoders.INT())
      .toDF();

  long t2 = System.currentTimeMillis();
  System.out.println("Initial dataframe built in " + (t2 - t1) + " ms");

  Dataset<Integer> dotsDs = incrementalDf
      .map((MapFunction<Row, Integer>) status -> {
        double x = Math.random() * 2 - 1;
        double y = Math.random() * 2 - 1;
        counter++;
        if (counter % 100000 == 0) {
          System.out.println("" + counter + " darts thrown so far");
        }
        return (x * x + y * y <= 1) ? 1 : 0;
      }, Encoders.INT());

  long t3 = System.currentTimeMillis();
  System.out.println("Throwing darts done in " + (t3 - t2) + " ms");

  int dartsInCircle =
      dotsDs.reduce((ReduceFunction<Integer>) (x, y) -> x + y);
  long t4 = System.currentTimeMillis();
  System.out.println("Analyzing result in " + (t4 - t3) + " ms");

  System.out.println(
      "Pi is roughly " + 4.0 * dartsInCircle / numberOfThrows);

  spark.stop();
}
 
Example #19
Source File: ParDoTranslatorBatch.java    From beam with Apache License 2.0
@Override
public void translateTransform(
    PTransform<PCollection<InputT>, PCollectionTuple> transform, TranslationContext context) {
  String stepName = context.getCurrentTransform().getFullName();

  // Check for not supported advanced features
  // TODO: add support of Splittable DoFn
  DoFn<InputT, OutputT> doFn = getDoFn(context);
  checkState(
      !DoFnSignatures.isSplittable(doFn),
      "Not expected to directly translate splittable DoFn, should have been overridden: %s",
      doFn);

  // TODO: add support of states and timers
  checkState(
      !DoFnSignatures.isStateful(doFn), "States and timers are not supported for the moment.");

  checkState(
      !DoFnSignatures.requiresTimeSortedInput(doFn),
      "@RequiresTimeSortedInput is not " + "supported for the moment");

  DoFnSchemaInformation doFnSchemaInformation =
      ParDoTranslation.getSchemaInformation(context.getCurrentTransform());

  // Init main variables
  PValue input = context.getInput();
  Dataset<WindowedValue<InputT>> inputDataSet = context.getDataset(input);
  Map<TupleTag<?>, PValue> outputs = context.getOutputs();
  TupleTag<?> mainOutputTag = getTupleTag(context);
  List<TupleTag<?>> outputTags = new ArrayList<>(outputs.keySet());
  WindowingStrategy<?, ?> windowingStrategy =
      ((PCollection<InputT>) input).getWindowingStrategy();
  Coder<InputT> inputCoder = ((PCollection<InputT>) input).getCoder();
  Coder<? extends BoundedWindow> windowCoder = windowingStrategy.getWindowFn().windowCoder();

  // construct a map from side input to WindowingStrategy so that
  // the DoFn runner can map main-input windows to side input windows
  List<PCollectionView<?>> sideInputs = getSideInputs(context);
  Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputStrategies = new HashMap<>();
  for (PCollectionView<?> sideInput : sideInputs) {
    sideInputStrategies.put(sideInput, sideInput.getPCollection().getWindowingStrategy());
  }

  SideInputBroadcast broadcastStateData = createBroadcastSideInputs(sideInputs, context);

  Map<TupleTag<?>, Coder<?>> outputCoderMap = context.getOutputCoders();
  MetricsContainerStepMapAccumulator metricsAccum = MetricsAccumulator.getInstance();

  List<TupleTag<?>> additionalOutputTags = new ArrayList<>();
  for (TupleTag<?> tag : outputTags) {
    if (!tag.equals(mainOutputTag)) {
      additionalOutputTags.add(tag);
    }
  }

  Map<String, PCollectionView<?>> sideInputMapping =
      ParDoTranslation.getSideInputMapping(context.getCurrentTransform());
  @SuppressWarnings("unchecked")
  DoFnFunction<InputT, OutputT> doFnWrapper =
      new DoFnFunction(
          metricsAccum,
          stepName,
          doFn,
          windowingStrategy,
          sideInputStrategies,
          context.getSerializableOptions(),
          additionalOutputTags,
          mainOutputTag,
          inputCoder,
          outputCoderMap,
          broadcastStateData,
          doFnSchemaInformation,
          sideInputMapping);

  MultiOuputCoder multipleOutputCoder =
      MultiOuputCoder.of(SerializableCoder.of(TupleTag.class), outputCoderMap, windowCoder);
  Dataset<Tuple2<TupleTag<?>, WindowedValue<?>>> allOutputs =
      inputDataSet.mapPartitions(doFnWrapper, EncoderHelpers.fromBeamCoder(multipleOutputCoder));
  if (outputs.entrySet().size() > 1) {
    allOutputs.persist();
    for (Map.Entry<TupleTag<?>, PValue> output : outputs.entrySet()) {
      pruneOutputFilteredByTag(context, allOutputs, output, windowCoder);
    }
  } else {
    Coder<OutputT> outputCoder = ((PCollection<OutputT>) outputs.get(mainOutputTag)).getCoder();
    Coder<WindowedValue<?>> windowedValueCoder =
        (Coder<WindowedValue<?>>) (Coder<?>) WindowedValue.getFullCoder(outputCoder, windowCoder);
    Dataset<WindowedValue<?>> outputDataset =
        allOutputs.map(
            (MapFunction<Tuple2<TupleTag<?>, WindowedValue<?>>, WindowedValue<?>>)
                value -> value._2,
            EncoderHelpers.fromBeamCoder(windowedValueCoder));
    context.putDatasetWildcard(outputs.entrySet().iterator().next().getValue(), outputDataset);
  }
}
 
Example #20
Source File: KVHelpers.java    From beam with Apache License 2.0
/** A Spark {@link MapFunction} for extracting the key out of a {@link KV}, e.g. for GBK. */
public static <K, V> MapFunction<WindowedValue<KV<K, V>>, K> extractKey() {
  return wv -> wv.getValue().getKey();
}
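A typical call site pairs this helper with Dataset#groupByKey ahead of a grouped operation; in the hedged fragment below, kvDataset and keyEncoder are assumptions that are not defined in this file.

  // Hypothetical usage; kvDataset and keyEncoder are assumed to be in scope.
  KeyValueGroupedDataset<K, WindowedValue<KV<K, V>>> groupedByKey =
      kvDataset.groupByKey(KVHelpers.extractKey(), keyEncoder);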
 
Example #21
Source File: JavaSparkSQLExample.java    From incubator-nemo with Apache License 2.0
/**
 * Function to run infer schema example.
 *
 * @param spark     spark session.
 * @param peopleTxt path to people txt file.
 */
private static void runInferSchemaExample(final SparkSession spark, final String peopleTxt) {
  // Create an RDD of Person objects from a text file
  SparkJavaRDD<Person> peopleRDD = spark.read()
    .textFile(peopleTxt)
    .javaRDD()
    .map(line -> {
      String[] parts = line.split(",");
      Person person = new Person();
      person.setName(parts[0]);
      person.setAge(Integer.parseInt(parts[1].trim()));
      return person;
    });

  // Apply a schema to an RDD of JavaBeans to get a DataFrame
  Dataset<Row> peopleDF = spark.createDataFrame(peopleRDD, Person.class);
  // Register the DataFrame as a temporary view
  peopleDF.createOrReplaceTempView(PEOPLE);

  // SQL statements can be run by using the sql methods provided by spark
  Dataset<Row> teenagersDF = spark.sql("SELECT name FROM people WHERE age BETWEEN 13 AND 19");

  // The columns of a row in the result can be accessed by field index
  Encoder<String> stringEncoder = Encoders.STRING();
  Dataset<String> teenagerNamesByIndexDF = teenagersDF.map(
    (MapFunction<Row, String>) row -> NAME + row.getString(0),
    stringEncoder);
  teenagerNamesByIndexDF.show();
  // +------------+
  // |       value|
  // +------------+
  // |Name: Justin|
  // +------------+

  // or by field name
  Dataset<String> teenagerNamesByFieldDF = teenagersDF.map(
    (MapFunction<Row, String>) row -> NAME + row.<String>getAs("name"),
    stringEncoder);
  teenagerNamesByFieldDF.show();
  // +------------+
  // |       value|
  // +------------+
  // |Name: Justin|
  // +------------+
}
 
Example #22
Source File: JavaSparkSQLExample.java    From nemo with Apache License 2.0
/**
 * Function to run programmatic schema example.
 * @param spark spark session.
 * @param peopleTxt path to people txt file.
 */
private static void runProgrammaticSchemaExample(final SparkSession spark, final String peopleTxt) {
  // Create an RDD
  JavaRDD<String> peopleRDD = spark.read()
      .textFile(peopleTxt)
      .toJavaRDD();

  // The schema is encoded in a string
  String schemaString = "name age";

  // Generate the schema based on the string of schema
  List<StructField> fields = new ArrayList<>();
  for (String fieldName : schemaString.split(" ")) {
    StructField field = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
    fields.add(field);
  }
  StructType schema = DataTypes.createStructType(fields);

  // Convert records of the RDD (people) to Rows
  JavaRDD<Row> rowRDD = peopleRDD.map((Function<String, Row>) record -> {
    String[] attributes = record.split(",");
    return RowFactory.create(attributes[0], attributes[1].trim());
  });

  // Apply the schema to the RDD
  Dataset<Row> peopleDataFrame = spark.createDataFrame(rowRDD, schema);

  // Creates a temporary view using the DataFrame
  peopleDataFrame.createOrReplaceTempView("people");

  // SQL can be run over a temporary view created using DataFrames
  Dataset<Row> results = spark.sql("SELECT name FROM people");

  // The results of SQL queries are DataFrames and support all the normal RDD operations
  // The columns of a row in the result can be accessed by field index or by field name
  Dataset<String> namesDS = results.map(
      (MapFunction<Row, String>) row -> "Name: " + row.getString(0),
      Encoders.STRING());
  namesDS.show();
  // +-------------+
  // |        value|
  // +-------------+
  // |Name: Michael|
  // |   Name: Andy|
  // | Name: Justin|
  // +-------------+
}
 
Example #23
Source File: JavaSparkSQLExample.java    From nemo with Apache License 2.0
/**
 * Function to run infer schema example.
 * @param spark spark session.
 * @param peopleTxt path to people txt file.
 */
private static void runInferSchemaExample(final SparkSession spark, final String peopleTxt) {
  // Create an RDD of Person objects from a text file
  JavaRDD<Person> peopleRDD = spark.read()
      .textFile(peopleTxt)
      .javaRDD()
      .map(line -> {
        String[] parts = line.split(",");
        Person person = new Person();
        person.setName(parts[0]);
        person.setAge(Integer.parseInt(parts[1].trim()));
        return person;
      });

  // Apply a schema to an RDD of JavaBeans to get a DataFrame
  Dataset<Row> peopleDF = spark.createDataFrame(peopleRDD, Person.class);
  // Register the DataFrame as a temporary view
  peopleDF.createOrReplaceTempView("people");

  // SQL statements can be run by using the sql methods provided by spark
  Dataset<Row> teenagersDF = spark.sql("SELECT name FROM people WHERE age BETWEEN 13 AND 19");

  // The columns of a row in the result can be accessed by field index
  Encoder<String> stringEncoder = Encoders.STRING();
  Dataset<String> teenagerNamesByIndexDF = teenagersDF.map(
      (MapFunction<Row, String>) row -> "Name: " + row.getString(0),
      stringEncoder);
  teenagerNamesByIndexDF.show();
  // +------------+
  // |       value|
  // +------------+
  // |Name: Justin|
  // +------------+

  // or by field name
  Dataset<String> teenagerNamesByFieldDF = teenagersDF.map(
      (MapFunction<Row, String>) row -> "Name: " + row.<String>getAs("name"),
      stringEncoder);
  teenagerNamesByFieldDF.show();
  // +------------+
  // |       value|
  // +------------+
  // |Name: Justin|
  // +------------+
}
 
Example #24
Source File: JavaSparkHiveExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  // $example on:spark_hive$
  // warehouseLocation points to the default location for managed databases and tables
  String warehouseLocation = "spark-warehouse";
  SparkSession spark = SparkSession
    .builder()
    .appName("Java Spark Hive Example")
    .config("spark.sql.warehouse.dir", warehouseLocation)
    .enableHiveSupport()
    .getOrCreate();

  spark.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)");
  spark.sql("LOAD DATA LOCAL INPATH 'data/resources/kv1.txt' INTO TABLE src");

  // Queries are expressed in HiveQL
  spark.sql("SELECT * FROM src").show();
  // +---+-------+
  // |key|  value|
  // +---+-------+
  // |238|val_238|
  // | 86| val_86|
  // |311|val_311|
  // ...

  // Aggregation queries are also supported.
  spark.sql("SELECT COUNT(*) FROM src").show();
  // +--------+
  // |count(1)|
  // +--------+
  // |    500 |
  // +--------+

  // The results of SQL queries are themselves DataFrames and support all normal functions.
  Dataset<Row> sqlDF = spark.sql("SELECT key, value FROM src WHERE key < 10 ORDER BY key");

  // The items in DataFrames are of type Row, which lets you access each column by ordinal.
  Dataset<String> stringsDS = sqlDF.map(new MapFunction<Row, String>() {
    @Override
    public String call(Row row) throws Exception {
      return "Key: " + row.get(0) + ", Value: " + row.get(1);
    }
  }, Encoders.STRING());
  stringsDS.show();
  // +--------------------+
  // |               value|
  // +--------------------+
  // |Key: 0, Value: val_0|
  // |Key: 0, Value: val_0|
  // |Key: 0, Value: val_0|
  // ...

  // You can also use DataFrames to create temporary views within a SparkSession.
  List<Record> records = new ArrayList<>();
  for (int key = 1; key < 100; key++) {
    Record record = new Record();
    record.setKey(key);
    record.setValue("val_" + key);
    records.add(record);
  }
  Dataset<Row> recordsDF = spark.createDataFrame(records, Record.class);
  recordsDF.createOrReplaceTempView("records");

  // Queries can then join DataFrames data with data stored in Hive.
  spark.sql("SELECT * FROM records r JOIN src s ON r.key = s.key").show();
  // +---+------+---+------+
  // |key| value|key| value|
  // +---+------+---+------+
  // |  2| val_2|  2| val_2|
  // |  2| val_2|  2| val_2|
  // |  4| val_4|  4| val_4|
  // ...
  // $example off:spark_hive$

  spark.stop();
}
 
Example #25
Source File: VideoStreamProcessor.java    From video-stream-analytics with Apache License 2.0
public static void main(String[] args) throws Exception {
//Read properties
Properties prop = PropertyFileReader.readPropertyFile();

//SparkSession
SparkSession spark = SparkSession
	      .builder()
	      .appName("VideoStreamProcessor")
	      .master(prop.getProperty("spark.master.url"))
	      .getOrCreate();	

//directory to save image files with motion detected
final String processedImageDir = prop.getProperty("processed.output.dir");
logger.warn("Output directory for saving processed images is set to "+processedImageDir+". This is configured in processed.output.dir key of property file.");

//create schema for json message
StructType schema =  DataTypes.createStructType(new StructField[] { 
		DataTypes.createStructField("cameraId", DataTypes.StringType, true),
		DataTypes.createStructField("timestamp", DataTypes.TimestampType, true),
		DataTypes.createStructField("rows", DataTypes.IntegerType, true),
		DataTypes.createStructField("cols", DataTypes.IntegerType, true),
		DataTypes.createStructField("type", DataTypes.IntegerType, true),
		DataTypes.createStructField("data", DataTypes.StringType, true)
		});


//Create DataSet from stream messages from kafka
   Dataset<VideoEventData> ds = spark
     .readStream()
     .format("kafka")
     .option("kafka.bootstrap.servers", prop.getProperty("kafka.bootstrap.servers"))
     .option("subscribe", prop.getProperty("kafka.topic"))
     .option("kafka.max.partition.fetch.bytes", prop.getProperty("kafka.max.partition.fetch.bytes"))
     .option("kafka.max.poll.records", prop.getProperty("kafka.max.poll.records"))
     .load()
     .selectExpr("CAST(value AS STRING) as message")
     .select(functions.from_json(functions.col("message"),schema).as("json"))
     .select("json.*")
     .as(Encoders.bean(VideoEventData.class)); 
   
   //key-value pair of cameraId-VideoEventData
KeyValueGroupedDataset<String, VideoEventData> kvDataset = ds.groupByKey(new MapFunction<VideoEventData, String>() {
	@Override
	public String call(VideoEventData value) throws Exception {
		return value.getCameraId();
	}
}, Encoders.STRING());
	
//process
Dataset<VideoEventData> processedDataset = kvDataset.mapGroupsWithState(new MapGroupsWithStateFunction<String, VideoEventData, VideoEventData,VideoEventData>(){
	@Override
	public VideoEventData call(String key, Iterator<VideoEventData> values, GroupState<VideoEventData> state) throws Exception {
		logger.warn("CameraId="+key+" PartitionId="+TaskContext.getPartitionId());
		VideoEventData existing = null;
		//check previous state
		if (state.exists()) {
			existing = state.get();
		}
		//detect motion
		VideoEventData processed = VideoMotionDetector.detectMotion(key,values,processedImageDir,existing);
		
		//update last processed
		if(processed != null){
			state.update(processed);
		}
		return processed;
	}}, Encoders.bean(VideoEventData.class), Encoders.bean(VideoEventData.class));

//start
 StreamingQuery query = processedDataset.writeStream()
	      .outputMode("update")
	      .format("console")
	      .start();
 
 //await
    query.awaitTermination();
}
 
Example #26
Source File: SparkTableUtil.java    From iceberg with Apache License 2.0
/**
 * Import files from given partitions to an Iceberg table.
 *
 * @param spark a Spark session
 * @param partitions partitions to import
 * @param targetTable an Iceberg table where to import the data
 * @param spec a partition spec
 * @param stagingDir a staging directory to store temporary manifest files
 */
public static void importSparkPartitions(
    SparkSession spark, List<SparkPartition> partitions, Table targetTable, PartitionSpec spec, String stagingDir) {
  Configuration conf = spark.sessionState().newHadoopConf();
  SerializableConfiguration serializableConf = new SerializableConfiguration(conf);
  int parallelism = Math.min(partitions.size(), spark.sessionState().conf().parallelPartitionDiscoveryParallelism());
  int numShufflePartitions = spark.sessionState().conf().numShufflePartitions();
  MetricsConfig metricsConfig = MetricsConfig.fromProperties(targetTable.properties());

  JavaSparkContext sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext());
  JavaRDD<SparkPartition> partitionRDD = sparkContext.parallelize(partitions, parallelism);

  Dataset<SparkPartition> partitionDS = spark.createDataset(
      partitionRDD.rdd(),
      Encoders.javaSerialization(SparkPartition.class));

  List<ManifestFile> manifests = partitionDS
      .flatMap((FlatMapFunction<SparkPartition, DataFile>) sparkPartition ->
              listPartition(sparkPartition, spec, serializableConf, metricsConfig).iterator(),
          Encoders.javaSerialization(DataFile.class))
      .repartition(numShufflePartitions)
      .map((MapFunction<DataFile, Tuple2<String, DataFile>>) file ->
              Tuple2.apply(file.path().toString(), file),
          Encoders.tuple(Encoders.STRING(), Encoders.javaSerialization(DataFile.class)))
      .orderBy(col("_1"))
      .mapPartitions(
          (MapPartitionsFunction<Tuple2<String, DataFile>, ManifestFile>) fileTuple ->
              buildManifest(serializableConf, spec, stagingDir, fileTuple),
          Encoders.javaSerialization(ManifestFile.class))
      .collectAsList();

  try {
    boolean snapshotIdInheritanceEnabled = PropertyUtil.propertyAsBoolean(
        targetTable.properties(),
        TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED,
        TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT);

    AppendFiles append = targetTable.newAppend();
    manifests.forEach(append::appendManifest);
    append.commit();

    if (!snapshotIdInheritanceEnabled) {
      // delete original manifests as they were rewritten before the commit
      deleteManifests(targetTable.io(), manifests);
    }
  } catch (Throwable e) {
    deleteManifests(targetTable.io(), manifests);
    throw e;
  }
}
 
Example #27
Source File: StructuredKafkaSource08.java    From sylph with Apache License 2.0
public Dataset<Row> createSource(SparkSession spark, KafkaSourceConfig08 config, SourceContext context)
{
    String topics = requireNonNull(config.getTopics(), "topics not setting");
    String brokers = requireNonNull(config.getBrokers(), "brokers not setting"); // the broker hosts must be resolvable from the machine running this program
    String groupId = requireNonNull(config.getGroupid(), "group.id not setting"); // consumer group name
    String offsetMode = requireNonNull(config.getOffsetMode(), "offsetMode not setting");

    Map<String, String> otherConfig = config.getOtherConfig().entrySet()
            .stream()
            .filter(x -> x.getValue() != null)
            .collect(Collectors.toMap(Map.Entry::getKey, v -> v.getValue().toString()));

    Map<String, String> kafkaParams = new HashMap<>(otherConfig);
    kafkaParams.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers);
    //kafkaParams.put("auto.commit.enable", true); //不自动提交偏移量
    //      "fetch.message.max.bytes" ->
    //      "session.timeout.ms" -> "30000", //session默认是30秒
    //      "heartbeat.interval.ms" -> "5000", //10秒提交一次 心跳周期
    kafkaParams.put(ConsumerConfig.GROUP_ID_CONFIG, groupId); //注意不同的流 group.id必须要不同 否则会出现offect commit提交失败的错误
    kafkaParams.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, offsetMode); //largest   smallest

    Dataset<Row> kafka08 = spark.readStream()
            .format(KafkaDataSource08.class.getName())
            .option("topics", topics)
            .options(kafkaParams)
            .load();

    if ("json".equalsIgnoreCase(config.getValueType())) {
        JsonSchema jsonParser = new JsonSchema(context.getSchema());
        return kafka08
                .map((MapFunction<Row, Row>) record -> {
                    return jsonParser.deserialize(
                            record.getAs("_key"),
                            record.getAs("_message"),
                            record.<String>getAs("_topic"),
                            record.<Integer>getAs("_partition"),
                            record.<Long>getAs("_offset"));
                }, RowEncoder.apply(jsonParser.getProducedType()));
    }
    else {
        StructType structType = schemaToSparkType(context.getSchema());
        String[] columns = Arrays.stream(structType.names()).map(name -> {
            switch (name) {
                case "_key":
                    return "CAST(_key AS STRING) as _key";
                case "_message":
                    return "CAST(_message AS STRING) as _message";
                default:
                    return name;
            }
        }).toArray(String[]::new);
        return kafka08.selectExpr(columns); // cast the raw binary columns to readable types
    }
}
 
Example #28
Source File: StructuredKafkaSource.java    From sylph with Apache License 2.0
private static Dataset<Row> createSource(SparkSession spark, KafkaSourceConfig config, SourceContext context)
{
    String topics = config.getTopics();
    String brokers = config.getBrokers(); // the broker hosts must be resolvable from the machine running this program
    String groupId = config.getGroupid(); // consumer group name
    String offsetMode = config.getOffsetMode();

    checkState(!"largest".equals(offsetMode), "kafka 0.10+, use latest");
    checkState(!"smallest".equals(offsetMode), "kafka 0.10+, use earliest");

    Map<String, Object> kafkaParams = new HashMap<>(config.getOtherConfig());
    kafkaParams.put("subscribe", topics);
    kafkaParams.put("kafka.bootstrap.servers", brokers);
    kafkaParams.put("startingOffsets", offsetMode); //latest   earliest

    kafkaParams.put("key.deserializer", ByteArrayDeserializer.class.getName()); //StringDeserializer
    kafkaParams.put("value.deserializer", ByteArrayDeserializer.class.getName()); //StringDeserializer
    //      "fetch.message.max.bytes" ->
    //      "session.timeout.ms" -> "30000", //session默认是30秒
    //      "heartbeat.interval.ms" -> "5000", //10秒提交一次 心跳周期

    Dataset<Row> inputStream = KafkaSourceUtil.getSource(spark, kafkaParams);
    if ("json".equalsIgnoreCase(config.getValueType())) {
        JsonSchema jsonParser = new JsonSchema(context.getSchema());
        return inputStream
                .map((MapFunction<Row, Row>) record -> {
                    return jsonParser.deserialize(record.getAs("key"),
                            record.getAs("value"),
                            record.<String>getAs("topic"),
                            record.<Integer>getAs("partition"),
                            record.<Long>getAs("offset"));
                }, RowEncoder.apply(jsonParser.getProducedType()));
    }
    else {
        StructType structType = schemaToSparkType(context.getSchema());
        return inputStream
                .map((MapFunction<Row, Row>) record -> {
                    String[] names = structType.names();
                    Object[] values = new Object[names.length];
                    for (int i = 0; i < names.length; i++) {
                        switch (names[i]) {
                            case "_topic":
                                values[i] = record.<String>getAs("topic");
                                continue;
                            case "_message":
                                values[i] = new String(record.getAs("value"), UTF_8);
                                continue;
                            case "_key":
                                byte[] key = record.getAs("key");
                                values[i] = key == null ? null : new String(key, UTF_8);
                                continue;
                            case "_partition":
                                values[i] = record.<Integer>getAs("partition");
                                continue;
                            case "_offset":
                                values[i] = record.<Long>getAs("offset");
                            default:
                                values[i] = null;
                        }
                    }
                    return (Row) new GenericRowWithSchema(values, structType);
                }, RowEncoder.apply(structType));
    }
}
 
Example #29
Source File: VideoStreamProcessor.java    From video-stream-classification with Apache License 2.0
public static void main(String[] args) throws Exception {
//Read properties
Properties prop = PropertyFileReader.readPropertyFile();

//SparkSession
SparkSession spark = SparkSession
	      .builder()
	      .appName("VideoStreamProcessor")
	      .master(prop.getProperty("spark.master.url"))
	      .getOrCreate();	

//directory to save image files with motion detected
final String processedImageDir = prop.getProperty("processed.output.dir");
logger.warn("Output directory for saving processed images is set to "+processedImageDir+". This is configured in processed.output.dir key of property file.");

//create schema for json message
StructType schema =  DataTypes.createStructType(new StructField[] { 
		DataTypes.createStructField("cameraId", DataTypes.StringType, true),
		DataTypes.createStructField("timestamp", DataTypes.TimestampType, true),
		DataTypes.createStructField("rows", DataTypes.IntegerType, true),
		DataTypes.createStructField("cols", DataTypes.IntegerType, true),
		DataTypes.createStructField("type", DataTypes.IntegerType, true),
		DataTypes.createStructField("data", DataTypes.StringType, true)
		});


//Create DataSet from stream messages from kafka
   Dataset<VideoEventData> ds = spark
     .readStream()
     .format("kafka")
     .option("kafka.bootstrap.servers", prop.getProperty("kafka.bootstrap.servers"))
     .option("subscribe", prop.getProperty("kafka.topic"))
     .option("kafka.max.partition.fetch.bytes", prop.getProperty("kafka.max.partition.fetch.bytes"))
     .option("kafka.max.poll.records", prop.getProperty("kafka.max.poll.records"))
     .load()
     .selectExpr("CAST(value AS STRING) as message")
     .select(functions.from_json(functions.col("message"),schema).as("json"))
     .select("json.*")
     .as(Encoders.bean(VideoEventData.class)); 
   
   //key-value pair of cameraId-VideoEventData
KeyValueGroupedDataset<String, VideoEventData> kvDataset = ds.groupByKey(new MapFunction<VideoEventData, String>() {
	@Override
	public String call(VideoEventData value) throws Exception {
		return value.getCameraId();
	}
}, Encoders.STRING());
	
//process
Dataset<VideoEventData> processedDataset = kvDataset.mapGroupsWithState(new MapGroupsWithStateFunction<String, VideoEventData, VideoEventData,VideoEventData>(){
	@Override
	public VideoEventData call(String key, Iterator<VideoEventData> values, GroupState<VideoEventData> state) throws Exception {
		logger.warn("CameraId="+key+" PartitionId="+TaskContext.getPartitionId());
		VideoEventData existing = null;
		//check previous state
		if (state.exists()) {
			existing = state.get();
		}
		//classify image
		VideoEventData processed = ImageProcessor.process(key,values,processedImageDir,existing);
		
		//update last processed
		if(processed != null){
			state.update(processed);
		}
		return processed;
	}}, Encoders.bean(VideoEventData.class), Encoders.bean(VideoEventData.class));

//start
 StreamingQuery query = processedDataset.writeStream()
	      .outputMode("update")
	      .format("console")
	      .start();
 
 //await
    query.awaitTermination();
}
 
Example #30
Source File: JavaSparkSQLExample.java    From SparkDemo with MIT License
private static void runProgrammaticSchemaExample(SparkSession spark) {
  // $example on:programmatic_schema$
  // Create an RDD
  JavaRDD<String> peopleRDD = spark.sparkContext()
    .textFile(Constant.LOCAL_FILE_PREX +"/data/resources/people.txt", 1)
    .toJavaRDD();

  // The schema is encoded in a string
  String schemaString = "name age";

  // Generate the schema based on the string of schema
  List<StructField> fields = new ArrayList<>();
  for (String fieldName : schemaString.split(" ")) {
    StructField field = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
    fields.add(field);
  }
  StructType schema = DataTypes.createStructType(fields);

  // Convert records of the RDD (people) to Rows
  JavaRDD<Row> rowRDD = peopleRDD.map(new Function<String, Row>() {
    @Override
    public Row call(String record) throws Exception {
      String[] attributes = record.split(",");
      return RowFactory.create(attributes[0], attributes[1].trim());
    }
  });

  // Apply the schema to the RDD
  Dataset<Row> peopleDataFrame = spark.createDataFrame(rowRDD, schema);

  // Creates a temporary view using the DataFrame
  peopleDataFrame.createOrReplaceTempView("people");

  // SQL can be run over a temporary view created using DataFrames
  Dataset<Row> results = spark.sql("SELECT name FROM people");

  // The results of SQL queries are DataFrames and support all the normal RDD operations
  // The columns of a row in the result can be accessed by field index or by field name
  Dataset<String> namesDS = results.map(new MapFunction<Row, String>() {
    @Override
    public String call(Row row) throws Exception {
      return "Name: " + row.getString(0);
    }
  }, Encoders.STRING());
  namesDS.show();
  // +-------------+
  // |        value|
  // +-------------+
  // |Name: Michael|
  // |   Name: Andy|
  // | Name: Justin|
  // +-------------+
  // $example off:programmatic_schema$
}