Java Code Examples for org.apache.spark.sql.catalyst.encoders.RowEncoder

The following examples show how to use org.apache.spark.sql.catalyst.encoders.RowEncoder. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: envelope   Source File: CountDatasetRule.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Compares the row count of {@code dataset} against the expected count.
 *
 * <p>When this rule is configured with a step dependency, the expected count is first read
 * from that dependency, which must contain exactly one row with a single long field.
 *
 * @param dataset the dataset whose rows are counted
 * @param stepDependencies datasets of other steps, keyed by name, used to look up the dependency
 * @return the result of mapping the grouped count through {@code CheckCount}
 * @throws RuntimeException if the dependency has the wrong shape, or if no non-negative
 *         expected count could be determined
 */
@Override
public Dataset<Row> check(Dataset<Row> dataset, Map<String, Dataset<Row>> stepDependencies) {
  if (isDependency()) {
    Dataset<Row> countSource = stepDependencies.get(dependency);
    // Evaluated left to right: the row count is checked before the schema is inspected.
    boolean singleLongValue = countSource.count() == 1
        && countSource.schema().fields().length == 1
        && countSource.schema().apply(0).dataType() == DataTypes.LongType;
    if (!singleLongValue) {
      throw new RuntimeException("Step dependency for count rule must have one row with a single field of long type");
    }
    expected = countSource.collectAsList().get(0).getLong(0);
  }

  if (expected < 0) {
    throw new RuntimeException("Failed to determine expected count: must be specified either as literal or step dependency");
  }

  Dataset<Row> totalRows = dataset.groupBy().count();
  return totalRows.map(new CheckCount(expected, name), RowEncoder.apply(SCHEMA));
}
 
Example 2
Source Project: envelope   Source File: TestTranslateFunction.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * With append-raw explicitly enabled, the translated schema must have two fields, the
 * second one named {@code _value} and holding the raw input string.
 */
@Test
public void testAppendRaw() {
  Map<String, Object> settings = Maps.newHashMap();
  settings.put(ComponentFactory.TYPE_CONFIG_NAME, DummyTranslator.class.getName());
  settings.put(TranslateFunction.APPEND_RAW_ENABLED_CONFIG, true);

  TranslateFunction translateFunction = new TranslateFunction(ConfigFactory.parseMap(settings));
  translateFunction.receiveProvidedSchema(translateFunction.getExpectingSchema());

  // Single-row input carrying the raw value that should be appended.
  Dataset<Row> input = Contexts.getSparkSession().createDataFrame(
      Lists.newArrayList(RowFactory.create("hello?")),
      translateFunction.getExpectingSchema());
  Dataset<Row> output = input.flatMap(
      translateFunction,
      RowEncoder.apply(translateFunction.getProvidingSchema()));

  assertEquals(2, output.schema().size());
  assertEquals("_value", output.schema().fields()[1].name());
  assertEquals("hello?", output.collectAsList().get(0).getString(1));
}
 
Example 3
Source Project: envelope   Source File: TestTranslateFunction.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * With append-raw explicitly disabled, the translated schema must have a single field
 * that is not named {@code _value}.
 */
@Test
public void testExplicitDontAppendRaw() {
  Map<String, Object> settings = Maps.newHashMap();
  settings.put(ComponentFactory.TYPE_CONFIG_NAME, DummyTranslator.class.getName());
  settings.put(TranslateFunction.APPEND_RAW_ENABLED_CONFIG, false);

  TranslateFunction translateFunction = new TranslateFunction(ConfigFactory.parseMap(settings));
  translateFunction.receiveProvidedSchema(translateFunction.getExpectingSchema());

  Dataset<Row> input = Contexts.getSparkSession().createDataFrame(
      Lists.newArrayList(RowFactory.create("hello?")),
      translateFunction.getExpectingSchema());
  Dataset<Row> output = input.flatMap(
      translateFunction,
      RowEncoder.apply(translateFunction.getProvidingSchema()));

  assertEquals(1, output.schema().size());
  assertNotEquals("_value", output.schema().fields()[0].name());
}
 
Example 4
Source Project: kylin-on-parquet-v2   Source File: SparkCubingJobTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * For every ordered measure of {@code entity} whose return data type is {@code "bitmap"},
 * replaces the serialized bitmap column of {@code layoutDs} with its long cardinality.
 *
 * @param layoutDs the layout dataset to convert
 * @param entity the layout whose ordered measures drive the conversion
 * @return the dataset after all bitmap measures have been converted
 */
private Dataset<Row> dsConvertToOriginal(Dataset<Row> layoutDs, LayoutEntity entity) {
    Map<Integer, FunctionDesc> measures = entity.getOrderedMeasures();

    for (final Map.Entry<Integer, FunctionDesc> measure : measures.entrySet()) {
        final FunctionDesc desc = measure.getValue();
        if (desc == null) {
            continue;
        }

        // Column names are captured before the dataset is re-mapped below.
        final String[] columnNames = layoutDs.columns();
        if (!"bitmap".equals(desc.returnType().dataType())) {
            continue;
        }

        final int bitmapIndex = convertOutSchema(layoutDs, measure.getKey().toString(), DataTypes.LongType);
        final PreciseCountDistinct bitmapCodec = new PreciseCountDistinct(null);
        layoutDs = layoutDs.map((MapFunction<Row, Row>) row -> {
            final Object[] converted = new Object[row.size()];
            for (int col = 0; col < columnNames.length; col++) {
                if (col != bitmapIndex) {
                    converted[col] = row.get(col);
                    continue;
                }
                // Decode the serialized bitmap and keep only its cardinality.
                converted[col] = bitmapCodec.deserialize((byte[]) row.get(col)).getLongCardinality();
            }
            return RowFactory.create(converted);
        }, RowEncoder.apply(OUT_SCHEMA));
    }
    return layoutDs;
}
 
Example 5
/**
 * Walks the ordered measures of {@code entity} and, for each measure whose return data type
 * is {@code "bitmap"}, maps {@code layoutDs} so that the bitmap column holds the bitmap's
 * long cardinality instead of its serialized bytes.
 *
 * @param layoutDs the layout dataset to convert
 * @param entity the layout providing the ordered measures
 * @return the converted dataset
 */
private Dataset<Row> dsConvertToOriginal(Dataset<Row> layoutDs, LayoutEntity entity) {
    Map<Integer, FunctionDesc> measuresById = entity.getOrderedMeasures();

    for (final Map.Entry<Integer, FunctionDesc> measureEntry : measuresById.entrySet()) {
        FunctionDesc measure = measureEntry.getValue();
        if (measure != null) {
            final String[] fieldNames = layoutDs.columns();
            String returnTypeName = measure.returnType().dataType();

            if ("bitmap".equals(returnTypeName)) {
                final int targetIndex = convertOutSchema(layoutDs, measureEntry.getKey().toString(), DataTypes.LongType);
                PreciseCountDistinct counter = new PreciseCountDistinct(null);
                layoutDs = layoutDs.map((MapFunction<Row, Row>) source -> {
                    Object[] cells = new Object[source.size()];
                    for (int pos = 0; pos < fieldNames.length; pos++) {
                        // Only the bitmap column is rewritten; every other cell is copied through.
                        cells[pos] = (pos == targetIndex)
                                ? Long.valueOf(counter.deserialize((byte[]) source.get(pos)).getLongCardinality())
                                : source.get(pos);
                    }
                    return RowFactory.create(cells);
                }, RowEncoder.apply(OUT_SCHEMA));
            }
        }
    }
    return layoutDs;
}
 
Example 6
Source Project: envelope   Source File: DataQualityDeriver.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Runs the configured data quality checks against one of the dependency datasets.
 *
 * <p>With {@code Scope.DATASET} each dataset rule is checked and the individual results are
 * unioned. Otherwise the row rules are applied per row and their outcome is appended to the
 * dataset as a string-to-boolean map field named by {@code resultsField}.
 *
 * @param dependencies the datasets of the dependency steps, keyed by name
 * @return the check results; {@code null} in dataset scope when there are no dataset rules
 * @throws RuntimeException if more than one dependency is given without a configured
 *         dataset, or if the results field already exists in the dataset schema
 */
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
  if (dependencies.size() > 1 && dataset.isEmpty()) {
    throw new RuntimeException("Must specify dataset on which to conduct data quality tests when more than one dependency");
  }

  // A single dependency is used directly; otherwise it is looked up by the configured name.
  Dataset<Row> target = (dependencies.size() == 1)
      ? dependencies.values().iterator().next()
      : dependencies.get(dataset);

  if (scope == Scope.DATASET) {
    // The checks are run at a dataset level and we are simply returning a DS of <name, boolean> Rows
    Dataset<Row> combined = null;
    for (DatasetRule rule : datasetRules.values()) {
      Dataset<Row> outcome = rule.check(target, dependencies);
      combined = (combined == null) ? outcome : combined.union(outcome);
    }
    return combined;
  }

  if (target.schema().getFieldIndex(resultsField).isDefined()) {
    throw new RuntimeException("The field [" + resultsField + "] already exists in the dataset schema. Use the " +
        RESULTS_FIELD_CONFIG + " configuration parameter to customize the data quality check field name");
  }

  List<StructField> checkField = Lists.newArrayList(
      new StructField(
          resultsField,
          DataTypes.createMapType(DataTypes.StringType, DataTypes.BooleanType),
          false,
          Metadata.empty()));
  return target.map(
      new CheckRowRules(rowRules, resultsField),
      RowEncoder.apply(SchemaUtils.appendFields(target.schema(), checkField)));
}
 
Example 7
Source Project: envelope   Source File: HashDeriver.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Adds an MD5 hash column to the step's dependency dataset.
 *
 * <p>The rows are first mapped through a {@code ConcatenationFunction} into a temporary
 * binary field, that field is hashed into {@code hashFieldName}, and the temporary field
 * is dropped again.
 *
 * @param dependencies the datasets of the dependency steps, keyed by name
 * @return the dependency dataset with the hash column added
 */
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) {
  final String tempFieldName = "_concatenated";

  Dataset<Row> input = getStepDataFrame(dependencies);

  Dataset<Row> withConcatenation = input.map(
      new ConcatenationFunction(delimiter, nullString, includeFields, excludeFields),
      RowEncoder.apply(input.schema().add(tempFieldName, DataTypes.BinaryType)));

  Dataset<Row> withHash = withConcatenation.withColumn(
      hashFieldName, functions.md5(functions.col(tempFieldName)));
  return withHash.drop(tempFieldName);
}
 
Example 8
Source Project: envelope   Source File: DummyBatchInput.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Produces the next batch of {@code numRows} rows, starting at the current {@code counter},
 * and advances {@code counter} by {@code numRows}.
 *
 * @return the dataset for the range {@code [counter, counter + numRows)} mapped to rows
 */
@Override
public Dataset<Row> read() throws Exception {
  Dataset<Row> batch = Contexts.getSparkSession()
      .range(counter, counter + numRows)
      .map(new LongToRowFunction(), RowEncoder.apply(schema));
  // Move the window forward so the next call starts where this one ended.
  counter += numRows;
  return batch;
}
 
Example 9
Source Project: envelope   Source File: TestTranslateFunction.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * When the append-raw option is not configured at all, the translated schema must have a
 * single field that is not named {@code _value}.
 */
@Test
public void testImplicitDontAppendRaw() {
  Map<String, Object> settings = Maps.newHashMap();
  settings.put(ComponentFactory.TYPE_CONFIG_NAME, DummyTranslator.class.getName());

  TranslateFunction translateFunction = new TranslateFunction(ConfigFactory.parseMap(settings));
  translateFunction.receiveProvidedSchema(translateFunction.getExpectingSchema());

  Dataset<Row> input = Contexts.getSparkSession().createDataFrame(
      Lists.newArrayList(RowFactory.create("hello?")),
      translateFunction.getExpectingSchema());
  Dataset<Row> output = input.flatMap(
      translateFunction,
      RowEncoder.apply(translateFunction.getProvidingSchema()));

  assertEquals(1, output.schema().size());
  assertNotEquals("_value", output.schema().fields()[0].name());
}
 
Example 10
Source Project: envelope   Source File: DummyInput.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Generates {@code numPartitions * 10} longs, repartitions them into {@code numPartitions}
 * partitions and maps each one to a row with nullable long fields {@code value} and
 * {@code modulo}.
 *
 * @return the generated dataset
 */
@Override
public Dataset<Row> read() throws Exception {
  return Contexts.getSparkSession()
      .range(numPartitions * 10)
      .repartition(numPartitions)
      .map(
          new LongToRowFunction(),
          RowEncoder.apply(
              DataTypes.createStructType(
                  Lists.newArrayList(
                      DataTypes.createStructField("value", DataTypes.LongType, true),
                      DataTypes.createStructField("modulo", DataTypes.LongType, true)))));
}
 
Example 11
Source Project: envelope   Source File: TestLoopStep.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Refactors a parallel loop step whose source is another step's values and checks the
 * unrolled graph.
 *
 * <p>The source step holds the values produced by {@code range(5, 8)}. After refactoring,
 * five steps are expected: the source step, the loop step, and {@code step1_5},
 * {@code step1_6} and {@code step1_7}, each depending on the loop step.
 */
@Test
public void testStepValues() throws Exception {
  Set<Step> steps = Sets.newHashSet();

  // Source step whose data provides the loop values.
  Map<String, Object> sourceStepConfigMap = Maps.newHashMap();
  Config sourceStepConfig = ConfigFactory.parseMap(sourceStepConfigMap);
  BatchStep sourceStep = new BatchStep("source_step");
  sourceStep.configure(sourceStepConfig);
  Dataset<Row> sourceDF = Contexts.getSparkSession().range(5, 8).map(new LongToRowFunction(),
      RowEncoder.apply(DataTypes.createStructType(Lists.newArrayList(DataTypes.createStructField("value", DataTypes.LongType, false)))));
  sourceStep.setData(sourceDF);
  steps.add(sourceStep);

  // Loop step that iterates in parallel over the values of the source step.
  Map<String, Object> loopStepConfigMap = Maps.newHashMap();
  loopStepConfigMap.put(Step.DEPENDENCIES_CONFIG, Lists.newArrayList("source_step"));
  loopStepConfigMap.put(LoopStep.MODE_PROPERTY, LoopStep.MODE_PARALLEL);
  loopStepConfigMap.put(LoopStep.SOURCE_PROPERTY, LoopStep.SOURCE_STEP);
  loopStepConfigMap.put(LoopStep.STEP_PROPERTY, "source_step");
  Config loopStepConfig = ConfigFactory.parseMap(loopStepConfigMap);
  RefactorStep loopStep = new LoopStep("loop_step");
  loopStep.configure(loopStepConfig);
  steps.add(loopStep);

  // Step inside the loop; it is the one expected to be unrolled once per value.
  Map<String, Object> step1ConfigMap = Maps.newHashMap();
  step1ConfigMap.put(Step.DEPENDENCIES_CONFIG, Lists.newArrayList("loop_step"));
  Config step1Config = ConfigFactory.parseMap(step1ConfigMap);
  Step step1 = new BatchStep("step1");
  step1.configure(step1Config);
  steps.add(step1);

  Set<Step> unrolled = loopStep.refactor(steps);

  assertEquals(5, unrolled.size());

  // Expected value first, so a failure message reports expected/actual the right way round.
  assertEquals(Sets.newHashSet(), StepUtils.getStepForName("source_step", unrolled).get().getDependencyNames());
  assertEquals(Sets.newHashSet("source_step"), StepUtils.getStepForName("loop_step", unrolled).get().getDependencyNames());
  assertEquals(Sets.newHashSet("loop_step"), StepUtils.getStepForName("step1_5", unrolled).get().getDependencyNames());
  assertEquals(Sets.newHashSet("loop_step"), StepUtils.getStepForName("step1_6", unrolled).get().getDependencyNames());
  assertEquals(Sets.newHashSet("loop_step"), StepUtils.getStepForName("step1_7", unrolled).get().getDependencyNames());
}
 
Example 12
Source Project: sylph   Source File: StructuredKafkaSource08.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Builds a streaming dataset that reads from Kafka through {@code KafkaDataSource08}.
 *
 * <p>When the configured value type is {@code json} (case-insensitive), each record is
 * deserialized with a {@code JsonSchema} built from the context schema. Otherwise the
 * columns named by the context schema are selected, with {@code _key} and {@code _message}
 * cast to strings.
 *
 * @param spark the session used to open the stream
 * @param config the Kafka source settings; topics, brokers, group id and offset mode must be set
 * @param context provides the schema of the produced rows
 * @return the streaming dataset
 * @throws NullPointerException if topics, brokers, group id or offset mode is missing
 */
public Dataset<Row> createSource(SparkSession spark, KafkaSourceConfig08 config, SourceContext context)
{
    String topics = requireNonNull(config.getTopics(), "topics not setting");
    // The cluster hosts must be configured on the machine running this program.
    String brokers = requireNonNull(config.getBrokers(), "brokers not setting");
    // Name of the consumer.
    String groupId = requireNonNull(config.getGroupid(), "group.id not setting");
    String offsetMode = requireNonNull(config.getOffsetMode(), "offsetMode not setting");

    // User-supplied extra settings; entries with a null value are dropped.
    Map<String, String> extraSettings = config.getOtherConfig().entrySet()
            .stream()
            .filter(entry -> entry.getValue() != null)
            .collect(Collectors.toMap(Map.Entry::getKey, entry -> entry.getValue().toString()));

    Map<String, String> kafkaParams = new HashMap<>(extraSettings);
    kafkaParams.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers);
    //kafkaParams.put("auto.commit.enable", true); // do not auto-commit offsets
    //      "fetch.message.max.bytes" ->
    //      "session.timeout.ms" -> "30000", // session default is 30 seconds
    //      "heartbeat.interval.ms" -> "5000", // heartbeat interval
    // Note: different streams must use different group.id values, otherwise offset commits fail.
    kafkaParams.put(ConsumerConfig.GROUP_ID_CONFIG, groupId);
    // largest / smallest
    kafkaParams.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, offsetMode);

    Dataset<Row> kafka08 = spark.readStream()
            .format(KafkaDataSource08.class.getName())
            .option("topics", topics)
            .options(kafkaParams)
            .load();

    if ("json".equalsIgnoreCase(config.getValueType())) {
        JsonSchema jsonParser = new JsonSchema(context.getSchema());
        return kafka08.map(
                (MapFunction<Row, Row>) record -> jsonParser.deserialize(
                        record.getAs("_key"),
                        record.getAs("_message"),
                        record.<String>getAs("_topic"),
                        record.<Integer>getAs("_partition"),
                        record.<Long>getAs("_offset")),
                RowEncoder.apply(jsonParser.getProducedType()));
    }

    StructType structType = schemaToSparkType(context.getSchema());
    String[] fieldNames = structType.names();
    String[] selectExpressions = new String[fieldNames.length];
    for (int i = 0; i < fieldNames.length; i++) {
        switch (fieldNames[i]) {
            case "_key":
                selectExpressions[i] = "CAST(_key AS STRING) as _key";
                break;
            case "_message":
                selectExpressions[i] = "CAST(_message AS STRING) as _message";
                break;
            default:
                selectExpressions[i] = fieldNames[i];
        }
    }
    // Cast the key and message columns of the input data to strings.
    return kafka08.selectExpr(selectExpressions);
}
 
Example 13
Source Project: sylph   Source File: StructuredKafkaSource.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Builds a dataset from the Kafka source returned by {@code KafkaSourceUtil.getSource}.
 *
 * <p>When the configured value type is {@code json} (case-insensitive), each record is
 * deserialized with a {@code JsonSchema} built from the context schema. Otherwise each
 * record is converted to a row of the context schema, filling the fields {@code _topic},
 * {@code _message}, {@code _key}, {@code _partition} and {@code _offset} from the record
 * and leaving every other field {@code null}.
 *
 * @param spark the session used to create the source
 * @param config the Kafka source settings
 * @param context provides the schema of the produced rows
 * @return the dataset of converted rows
 * @throws IllegalStateException if the offset mode is {@code largest} or {@code smallest}
 */
private static Dataset<Row> createSource(SparkSession spark, KafkaSourceConfig config, SourceContext context)
{
    String topics = config.getTopics();
    // The cluster hosts must be configured on the machine running this program.
    String brokers = config.getBrokers();
    // Name of the consumer.
    String groupId = config.getGroupid();
    String offsetMode = config.getOffsetMode();

    checkState(!"largest".equals(offsetMode), "kafka 0.10+, use latest");
    checkState(!"smallest".equals(offsetMode), "kafka 0.10+, use earliest");

    Map<String, Object> kafkaParams = new HashMap<>(config.getOtherConfig());
    kafkaParams.put("subscribe", topics);
    kafkaParams.put("kafka.bootstrap.servers", brokers);
    kafkaParams.put("startingOffsets", offsetMode); //latest   earliest

    kafkaParams.put("key.deserializer", ByteArrayDeserializer.class.getName()); //StringDeserializer
    kafkaParams.put("value.deserializer", ByteArrayDeserializer.class.getName()); //StringDeserializer
    //      "fetch.message.max.bytes" ->
    //      "session.timeout.ms" -> "30000", // session default is 30 seconds
    //      "heartbeat.interval.ms" -> "5000", // heartbeat interval

    Dataset<Row> inputStream = KafkaSourceUtil.getSource(spark, kafkaParams);
    if ("json".equalsIgnoreCase(config.getValueType())) {
        JsonSchema jsonParser = new JsonSchema(context.getSchema());
        return inputStream
                .map((MapFunction<Row, Row>) record -> {
                    return jsonParser.deserialize(record.getAs("key"),
                            record.getAs("value"),
                            record.<String>getAs("topic"),
                            record.<Integer>getAs("partition"),
                            record.<Long>getAs("offset"));
                }, RowEncoder.apply(jsonParser.getProducedType()));
    }
    else {
        StructType structType = schemaToSparkType(context.getSchema());
        return inputStream
                .map((MapFunction<Row, Row>) record -> {
                    String[] names = structType.names();
                    Object[] values = new Object[names.length];
                    for (int i = 0; i < names.length; i++) {
                        switch (names[i]) {
                            case "_topic":
                                values[i] = record.<String>getAs("topic");
                                break;
                            case "_message": {
                                // A record may carry no value; keep it as null instead of failing.
                                byte[] message = record.getAs("value");
                                values[i] = message == null ? null : new String(message, UTF_8);
                                break;
                            }
                            case "_key": {
                                byte[] key = record.getAs("key");
                                values[i] = key == null ? null : new String(key, UTF_8);
                                break;
                            }
                            case "_partition":
                                values[i] = record.<Integer>getAs("partition");
                                break;
                            case "_offset":
                                // Must not fall through to default, which would overwrite the offset with null.
                                values[i] = record.<Long>getAs("offset");
                                break;
                            default:
                                values[i] = null;
                                break;
                        }
                    }
                    return (Row) new GenericRowWithSchema(values, structType);
                }, RowEncoder.apply(structType));
    }
}
 
Example 14
Source Project: envelope   Source File: DatasetRowRuleWrapper.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Applies the wrapped row rule to every row of {@code dataset} and aggregates the per-row
 * results with a {@code BooleanAggregator}.
 *
 * @param dataset the dataset to check
 * @param stepDependencies datasets of other steps; not used by this implementation
 * @return the aggregated result
 */
@Override
public Dataset<Row> check(Dataset<Row> dataset, Map<String, Dataset<Row>> stepDependencies) {
  Dataset<Row> perRowResults = dataset.map(new CheckRule(rowRule, name), RowEncoder.apply(SCHEMA));
  return perRowResults.select(new BooleanAggregator(name).toColumn());
}
 
Example 15
Source Project: envelope   Source File: DatasetRowRuleWrapper.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Returns a row encoder built from {@code SCHEMA}.
 *
 * @return a new encoder for rows conforming to {@code SCHEMA}
 */
@Override
public Encoder<Row> bufferEncoder() {
  final Encoder<Row> schemaEncoder = RowEncoder.apply(SCHEMA);
  return schemaEncoder;
}
 
Example 16
Source Project: envelope   Source File: DatasetRowRuleWrapper.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Returns a row encoder built from {@code SCHEMA}.
 *
 * @return a new encoder for rows conforming to {@code SCHEMA}
 */
@Override
public Encoder<Row> outputEncoder() {
  final Encoder<Row> schemaEncoder = RowEncoder.apply(SCHEMA);
  return schemaEncoder;
}
 
Example 17
/**
 * Writes the rows of this data set's {@code iterator} through a Parquet record writer.
 *
 * <p>The table schema is derived from the current operation: column names and row
 * definition come from the {@code DMLWriteOperation} or, for an {@code ExportOperation},
 * from its source result column descriptors and the row definition two operations to the
 * left. Each row is encoded with a {@code RowEncoder} for that schema and handed to the
 * writer, which is always closed afterwards.
 *
 * @param dsp not used by this implementation
 * @param partitionBy not used by this implementation
 * @param location passed to the Parquet record writer factory
 * @param compression passed to the Parquet record writer factory
 * @param context supplies the operation and records each written row
 * @return a data set with a single row holding the number of records written
 * @throws RuntimeException wrapping any failure, including an unsupported operation type
 */
@Override
public DataSet<ExecRow> writeParquetFile(DataSetProcessor dsp, int[] partitionBy, String location, String compression, OperationContext context) {

    try {
        //Generate Table Schema
        String[] colNames;
        DataValueDescriptor[] dvds;
        if (context.getOperation() instanceof DMLWriteOperation) {
            dvds  = context.getOperation().getExecRowDefinition().getRowArray();
            colNames = ((DMLWriteOperation) context.getOperation()).getColumnNames();
        } else if (context.getOperation() instanceof ExportOperation) {
            dvds = context.getOperation().getLeftOperation().getLeftOperation().getExecRowDefinition().getRowArray();
            ExportOperation export = (ExportOperation) context.getOperation();
            // Fetch the descriptors once so the array size and the names copied into it always agree.
            ResultColumnDescriptor[] descriptors = export.getSourceResultColumnDescriptors();
            colNames = new String[descriptors.length];
            for (int i = 0; i < descriptors.length; i++) {
                colNames[i] = descriptors[i].getName();
            }
        } else {
            throw new IllegalArgumentException("Unsupported operation type: " + context.getOperation());
        }
        StructField[] fields = new StructField[colNames.length];
        for (int i=0 ; i<colNames.length ; i++){
            fields[i] = dvds[i].getStructField(colNames[i]);
        }
        StructType tableSchema = DataTypes.createStructType(fields);
        RecordWriter<Void, Object> rw = ParquetWriterService.getFactory().getParquetRecordWriter(location, compression, tableSchema);

        try {
            ExpressionEncoder<Row> encoder = RowEncoder.apply(tableSchema);
            while (iterator.hasNext()) {
                ValueRow vr = (ValueRow) iterator.next();
                context.recordWrite();

                rw.write(null, encoder.toRow(vr));
            }
        } finally {
            // Close the writer even when encoding or writing a row fails.
            rw.close(null);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    ValueRow valueRow=new ValueRow(1);
    valueRow.setColumn(1,new SQLLongint(context.getRecordsWritten()));
    return new ControlDataSet(Collections.singletonList(valueRow).iterator());
}