org.apache.spark.sql.Dataset Java Examples

The following examples show how to use org.apache.spark.sql.Dataset. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: SparkDataSet.java    From spliceengine with GNU Affero General Public License v3.0 7 votes vote down vote up
/**
 * Intersects this data set with {@code dataSet} by wrapping the backing {@code rdd} in an
 * untyped Spark dataset and delegating to {@code NativeSparkDataSet}.
 *
 * <p>A scope is pushed via {@code pushScopeIfNeeded} before the work starts and, when
 * {@code pushScope} is set, popped again whether or not the intersection succeeds.
 *
 * @param dataSet     the data set to intersect with
 * @param name        forwarded unchanged to the delegate
 * @param context     operation context supplying the row schema and the scope stack
 * @param pushScope   whether a scope is pushed for this call (and therefore popped afterwards)
 * @param scopeDetail forwarded to {@code pushScopeIfNeeded} and to the delegate
 * @return the result of the delegate's {@code intersect}
 * @throws StandardException declared by the overridden method
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
@Override
public DataSet< V> intersect(DataSet< V> dataSet, String name, OperationContext context, boolean pushScope, String scopeDetail) throws StandardException {
    pushScopeIfNeeded(context, pushScope, scopeDetail);
    try {
        // Map the RDD elements to Spark rows and attach the schema of the operation's row definition.
        Dataset<Row> rowsAsDataFrame = SpliceSpark.getSession().createDataFrame(
                rdd.map(new LocatedRowToRowFunction()),
                context.getOperation().getExecRowDefinition().schema());

        NativeSparkDataSet nativeDataSet = new NativeSparkDataSet(rowsAsDataFrame, context);
        return nativeDataSet.intersect(dataSet, name, context, pushScope, scopeDetail);
    } finally {
        if (pushScope) {
            context.popScope();
        }
    }
}
 
Example #2
Source File: WindowAssignTranslatorBatch.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Translates a {@code Window.Assign} transform: the input dataset is either registered
 * unchanged as the output, or mapped through the transform's window function first.
 *
 * @param transform the transform to translate; cast to {@code Window.Assign<T>}
 * @param context   translation context providing the input/output collections and datasets
 */
@Override
public void translateTransform(
    PTransform<PCollection<T>, PCollection<T>> transform, TranslationContext context) {

  Window.Assign<T> windowAssign = (Window.Assign<T>) transform;
  @SuppressWarnings("unchecked")
  final PCollection<T> inputCollection = (PCollection<T>) context.getInput();
  @SuppressWarnings("unchecked")
  final PCollection<T> outputCollection = (PCollection<T>) context.getOutput();

  Dataset<WindowedValue<T>> sourceDataset = context.getDataset(inputCollection);

  // When the helper says window assignment can be skipped, the input is the output.
  if (WindowingHelpers.skipAssignWindows(windowAssign, context)) {
    context.putDataset(outputCollection, sourceDataset);
    return;
  }

  WindowFn<T, ?> windowFn = windowAssign.getWindowFn();
  // The encoder for the re-windowed values is derived from the input coder and the window coder.
  WindowedValue.FullWindowedValueCoder<T> valueCoder =
      WindowedValue.FullWindowedValueCoder.of(inputCollection.getCoder(), windowFn.windowCoder());
  Dataset<WindowedValue<T>> windowedDataset =
      sourceDataset.map(
          WindowingHelpers.assignWindowsMapFunction(windowFn),
          EncoderHelpers.fromBeamCoder(valueCoder));
  context.putDataset(outputCollection, windowedDataset);
}
 
Example #3
Source File: TestInListDeriver.java    From envelope with Apache License 2.0 6 votes vote down vote up
/**
 * Expects {@code derive} to fail with a {@link RuntimeException} carrying the IN-list
 * filtering error message when the deriver is configured with the field name
 * {@code "non_existing_field"} (presumably absent from the test dataframe).
 */
@Test
public void testWrongField() throws Exception {
  thrown.expect(RuntimeException.class);
  thrown.expectMessage("Error executing IN list filtering");

  Dataset<Row> testData = createTestDataframe();
  List<String> literals = Arrays.asList("1", "2", "3");

  Map<String, Dataset<Row>> stepData = new HashMap<>();
  stepData.put("df1", testData);

  // Build the configuration one setting at a time.
  Config deriverConfig = ConfigFactory.empty();
  deriverConfig = deriverConfig.withValue(
      InListDeriver.INLIST_STEP_CONFIG, ConfigValueFactory.fromAnyRef("df1"));
  deriverConfig = deriverConfig.withValue(
      InListDeriver.INLIST_FIELD_CONFIG, ConfigValueFactory.fromAnyRef("non_existing_field"));
  deriverConfig = deriverConfig.withValue(
      InListDeriver.INLIST_VALUES_CONFIG, ConfigValueFactory.fromIterable(literals));

  InListDeriver inListDeriver = new InListDeriver();

  // The configuration passes validation; the failure is expected from derive() only.
  assertNoValidationFailures(inListDeriver, deriverConfig);
  inListDeriver.configure(deriverConfig);

  inListDeriver.derive(stepData);
}
 
Example #4
Source File: TestImpalaMetadataTask.java    From envelope with Apache License 2.0 6 votes vote down vote up
/**
 * Checks the statement derived for a {@code drop_partition} query that is configured with
 * both a range start and a range end.
 */
@Test
public void testDeriveDropRangePartitionBoundariesQuery() {
  Map<String, Object> settings = new HashMap<>();
  settings.put(HOST_CONFIG, "testhost");
  settings.put(QUERY_TYPE_CONFIG, "drop_partition");
  settings.put(QUERY_TABLE_CONFIG, "testtable");
  settings.put(QUERY_PART_RANGE_START_CONFIG, "20190122");
  settings.put(QUERY_PART_RANGE_END_CONFIG, "20190123");
  settings.put(AUTH_CONFIG, "none");

  ImpalaMetadataTask task = new ImpalaMetadataTask();
  task.configure(ConfigFactory.parseMap(settings));

  // The query is derived from an empty dependency map.
  Map<String, Dataset<Row>> noDependencies = Maps.newHashMap();
  String derivedQuery = task.deriveQuery(noDependencies);

  assertEquals(
      "ALTER TABLE testtable DROP IF EXISTS RANGE PARTITION 20190122 <= VALUES < 20190123",
      derivedQuery);
}
 
Example #5
Source File: RewriteManifestsAction.java    From iceberg with Apache License 2.0 6 votes vote down vote up
/**
 * Writes manifests for an unpartitioned table: repartitions the manifest entries into
 * {@code numManifests} partitions, maps every partition through {@code toManifests}, and
 * collects the resulting manifest files.
 *
 * @param manifestEntryDF manifest entries; must have a struct-typed {@code data_file} column
 * @param numManifests    number of partitions to repartition into
 * @return the collected manifest files
 */
private List<ManifestFile> writeManifestsForUnpartitionedTable(Dataset<Row> manifestEntryDF, int numManifests) {
  Broadcast<FileIO> broadcastIO = sparkContext.broadcast(fileIO);
  StructType dataFileType = (StructType) manifestEntryDF.schema().apply("data_file").dataType();

  // For unpartitioned tables only the target number of manifests matters, because too much
  // metadata per partition is not a concern; the per-manifest entry limit is effectively off.
  long unlimitedEntries = Long.MAX_VALUE;

  Dataset<Row> repartitioned = manifestEntryDF.repartition(numManifests);
  Dataset<ManifestFile> manifests = repartitioned.mapPartitions(
      toManifests(broadcastIO, unlimitedEntries, stagingLocation, formatVersion, spec, dataFileType),
      manifestEncoder);
  return manifests.collectAsList();
}
 
Example #6
Source File: ITTestHDFSParquetImportCommand.java    From hudi with Apache License 2.0 6 votes vote down vote up
/**
 * Verifies that the data read back from the table equals the expected records.
 *
 * <p>Both sides are converted to {@code HoodieTripModel} instances and compared by size and
 * by mutual containment, so ordering is not significant.
 *
 * @param expectData the records the table is expected to contain
 */
private void verifyResultData(List<GenericRecord> expectData) {
  Dataset<Row> tableData = HoodieClientTestUtils.read(jsc, tablePath, sqlContext, fs, tablePath + "/*/*/*/*");

  List<Row> rows = tableData
      .select("timestamp", "_row_key", "rider", "driver", "begin_lat", "begin_lon", "end_lat", "end_lon")
      .collectAsList();

  // Column positions follow the order of the select above.
  List<HoodieTripModel> actual = rows.stream()
      .map(r -> new HoodieTripModel(
          r.getDouble(0),
          r.getString(1),
          r.getString(2),
          r.getString(3),
          r.getDouble(4),
          r.getDouble(5),
          r.getDouble(6),
          r.getDouble(7)))
      .collect(Collectors.toList());

  List<HoodieTripModel> wanted = expectData.stream()
      .map(record -> new HoodieTripModel(
          Double.parseDouble(record.get("timestamp").toString()),
          record.get("_row_key").toString(),
          record.get("rider").toString(),
          record.get("driver").toString(),
          Double.parseDouble(record.get("begin_lat").toString()),
          Double.parseDouble(record.get("begin_lon").toString()),
          Double.parseDouble(record.get("end_lat").toString()),
          Double.parseDouble(record.get("end_lon").toString())))
      .collect(Collectors.toList());

  assertAll("Result list equals",
      () -> assertEquals(wanted.size(), actual.size()),
      () -> assertTrue(actual.containsAll(wanted) && wanted.containsAll(actual)));
}
 
Example #7
Source File: MLContextTest.java    From systemds with Apache License 2.0 6 votes vote down vote up
/**
 * Runs a DML script producing a 2x2 matrix, converts it with {@code toDF()}, sorts by the
 * ID column and checks the first three double columns of the first two rows.
 */
@Test
public void testOutputDataFrameFromMatrixDML() {
	System.out.println("MLContextTest - output DataFrame from matrix DML");

	Script script = dml("M = matrix('1 2 3 4', rows=2, cols=2);").out("M");
	Dataset<Row> matrixDF = ml.execute(script).getMatrix("M").toDF();
	List<Row> rows = matrixDF.sort(RDDConverterUtils.DF_ID_COLUMN).collectAsList();

	// Expected values per row, indexed by column position (column 0 is presumably the ID).
	double[][] expected = { { 1.0, 1.0, 2.0 }, { 2.0, 3.0, 4.0 } };
	for (int r = 0; r < expected.length; r++) {
		Row row = rows.get(r);
		for (int c = 0; c < expected[r].length; c++) {
			Assert.assertEquals(expected[r][c], row.getDouble(c), 0.0);
		}
	}
}
 
Example #8
Source File: TestPassthroughDeriver.java    From envelope with Apache License 2.0 6 votes vote down vote up
/**
 * Feeds the passthrough deriver two single-row dependencies whose schemas differ in column
 * name ({@code col1} vs. {@code col2}) and expects a {@link RuntimeException}.
 */
@Test (expected = RuntimeException.class)
public void testDifferentSchemas() throws Exception {
  StructType firstSchema = DataTypes.createStructType(Lists.<StructField>newArrayList(
      DataTypes.createStructField("col1", DataTypes.StringType, false)));
  StructType secondSchema = DataTypes.createStructType(Lists.<StructField>newArrayList(
      DataTypes.createStructField("col2", DataTypes.StringType, false)));

  Dataset<Row> firstDependency = Contexts.getSparkSession().createDataFrame(
      Lists.newArrayList(RowFactory.create("a")), firstSchema);
  Dataset<Row> secondDependency = Contexts.getSparkSession().createDataFrame(
      Lists.newArrayList(RowFactory.create("b")), secondSchema);

  Map<String, Dataset<Row>> stepData = Maps.newHashMap();
  stepData.put("dep1", firstDependency);
  stepData.put("dep2", secondDependency);

  Deriver passthrough = new PassthroughDeriver();

  // Collecting forces evaluation of the derived dataset.
  passthrough.derive(stepData).collectAsList();
}
 
Example #9
Source File: NExecAndComp.java    From kylin-on-parquet-v2 with Apache License 2.0 6 votes vote down vote up
/**
 * Runs {@code sqlText} through {@code queryCubeAndSkipCompute} and returns the resulting
 * dataset, logging the elapsed time on success.
 *
 * @param prj        project passed through to the query
 * @param sqlText    the SQL to run; must not be {@code null}
 * @param parameters parameters passed through to the query
 * @return the dataset produced by the query
 * @throws RuntimeException if {@code sqlText} is {@code null}, or wrapping any
 *                          {@link Throwable} raised while querying
 */
public static Dataset<Row> sql(String prj, String sqlText, List<String> parameters) {
    if (sqlText == null) {
        throw new RuntimeException("Sorry your SQL is null...");
    }

    try {
        logger.info("Try to query from cube....");
        long begin = System.currentTimeMillis();
        Dataset<Row> result = queryCubeAndSkipCompute(prj, sqlText, parameters);
        logger.info("Cool! This sql hits cube...");
        logger.info("Duration(ms): {}", (System.currentTimeMillis() - begin));
        return result;
    } catch (Throwable failure) {
        // Log the statement and the cause separately, then rethrow with the cause attached.
        logger.error("There is no cube can be used for query [{}]", sqlText);
        logger.error("Reasons:", failure);
        throw new RuntimeException("Error in running query [ " + sqlText.trim() + " ]", failure);
    }
}
 
Example #10
Source File: DataFrameIT.java    From spliceengine with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Selects every row of {@code table}, converts the JDBC result set to a Spark dataframe and
 * returns the dataframe's row count as a one-row, one-column result set in
 * {@code resultSets[0]}.
 *
 * <p>The connection is opened from {@code jdbc:default:connection} and is closed when the
 * method exits, including when an exception is thrown part-way through.
 *
 * <p>NOTE(review): the table name is concatenated into the SQL text because identifiers
 * cannot be bound as statement parameters; callers must only pass trusted table names.
 *
 * @param table      name of the table to read; upper-cased before use
 * @param resultSets output array whose first element receives the result
 * @throws SQLException on JDBC failure, or wrapping the root cause of a
 *                      {@code StandardException}
 */
public static void testResultSetToDF(String table, ResultSet[] resultSets) throws SQLException {
    // try-with-resources closes the connection on the failure paths as well as on success.
    try (Connection conn = DriverManager.getConnection("jdbc:default:connection")) {
        // Locale.ROOT keeps the upper-casing of the identifier independent of the JVM's default locale.
        PreparedStatement pstmt = conn.prepareStatement(
                "select * from " + table.toUpperCase(java.util.Locale.ROOT));
        ResultSet res = pstmt.executeQuery();
        // Convert result set to Dataframe
        Dataset<Row> resultSetDF = SparkUtils.resultSetToDF(res);
        resultSets[0] = res;

        // Construct Stored Procedure Result
        List<ExecRow> rows = Lists.newArrayList();
        ExecRow row = new ValueRow(1);
        row.setColumn(1, new SQLLongint(resultSetDF.count()));
        rows.add(row);
        IteratorNoPutResultSet resultsToWrap = wrapResults((EmbedConnection) conn, rows, DATAFRAME_COUNT_STORED_PROCEDURE_COLUMN_DECSRIPTOR);
        resultSets[0] = new EmbedResultSet40((EmbedConnection) conn, resultsToWrap, false, null, true);
    } catch (StandardException e) {
        throw new SQLException(Throwables.getRootCause(e));
    }
}
 
Example #11
Source File: CommonAddressFeaturesBridgeTest.java    From spark-transformers with Apache License 2.0 6 votes vote down vote up
/**
 * Checks, row by row, that {@code transformer} reproduces the feature values stored in the
 * Spark output.
 *
 * <p>For each row, column 0 and column 1 are fed to the transformer as
 * {@code mergedAddress} and {@code sanitizedAddress}; columns 2 to 7 are then compared with
 * the values the transformer wrote back into the same map.
 *
 * @param rowDataset  the Spark output to verify against
 * @param transformer the transformer under test
 */
private void assertCorrectness(Dataset<Row> rowDataset, Transformer transformer) {
	for (Row sparkRow : rowDataset.collectAsList()) {
		Map<String, Object> features = new HashMap<>();
		features.put("mergedAddress", sparkRow.get(0));

		// The transformer takes the sanitized address as a String[], so copy the list over.
		List<Object> tokens = sparkRow.getList(1);
		String[] sanitizedAddress = new String[tokens.size()];
		int position = 0;
		for (Object token : tokens) {
			sanitizedAddress[position++] = (String) token;
		}
		features.put("sanitizedAddress", sanitizedAddress);

		transformer.transform(features);

		assertEquals("number of words should be equals", sparkRow.get(2), features.get("numWords"));
		assertEquals("number of commas should be equals", sparkRow.get(3), features.get("numCommas"));
		assertEquals("numericPresent should be equals", sparkRow.get(4), features.get("numericPresent"));
		assertEquals("addressLength should be equals", sparkRow.get(5), features.get("addressLength"));
		assertEquals("favouredStart should be equals", sparkRow.get(6), features.get("favouredStart"));
		assertEquals("unfavouredStart should be equals", sparkRow.get(7), features.get("unfavouredStart"));
	}
}
 
Example #12
Source File: Normalization.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
/**
 * Rescales every column of the dataframe, except the skipped ones, into the range
 * {@code [min, max]} based on each column's own minimum and maximum.
 *
 * <p>A column whose minimum equals its maximum is divided by 1 instead of 0, so all of its
 * values map to {@code min}.
 *
 * @param dataFrame   the dataframe to scale
 * @param min         the lower bound of the target range
 * @param max         the upper bound of the target range
 * @param skipColumns names of the columns to leave untouched
 * @return the dataframe with the selected columns rescaled
 */
public static Dataset<Row> normalize(Dataset<Row> dataFrame, double min, double max, List<String> skipColumns) {
    List<String> targetColumns = DataFrames.toList(dataFrame.columns());
    targetColumns.removeAll(skipColumns);
    String[] columnNames = DataFrames.toArray(targetColumns);

    // Row 0 holds the per-column minima and row 1 the maxima, in columnNames order.
    List<Row> minMax = minMaxColumns(dataFrame, columnNames);

    Dataset<Row> scaled = dataFrame;
    for (int i = 0; i < columnNames.length; i++) {
        String columnName = columnNames[i];
        double columnMin = ((Number) minMax.get(0).get(i)).doubleValue();
        double columnMax = ((Number) minMax.get(1).get(i)).doubleValue();

        // Guard against division by zero for constant columns.
        double range = columnMax - columnMin;
        if (range == 0) {
            range = 1;
        }

        Column rescaled = scaled.col(columnName)
                .minus(columnMin)
                .divide(range)
                .multiply(max - min)
                .plus(min);
        scaled = scaled.withColumn(columnName, rescaled);
    }

    return scaled;
}
 
Example #13
Source File: SparkRelationalOperator.java    From spliceengine with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Builds the Spark column expression for this relational operator from the expressions of
 * its left and right children.
 *
 * <p>The null checks ({@code IS_NULL_RELOP}, {@code IS_NOT_NULL_RELOP}) use the left
 * expression only; the right expression is still built in every case.
 *
 * @param leftDF                          forwarded to both children
 * @param rightDF                         forwarded to both children
 * @param convertStringToDataTypeFunction forwarded to both children
 * @return the column expression matching {@code relOpKind}
 * @throws UnsupportedOperationException if {@code relOpKind} is none of the handled kinds
 */
@Override
public Column getColumnExpression(Dataset<Row> leftDF,
                                  Dataset<Row> rightDF,
                                  Function<String, DataType> convertStringToDataTypeFunction) throws UnsupportedOperationException {
    Column lhs = getLeftChild().getColumnExpression(leftDF, rightDF, convertStringToDataTypeFunction);
    Column rhs = getRightChild().getColumnExpression(leftDF, rightDF, convertStringToDataTypeFunction);

    if (relOpKind == EQUALS_RELOP) {
        return lhs.equalTo(rhs);
    }
    if (relOpKind == NOT_EQUALS_RELOP) {
        return lhs.notEqual(rhs);
    }
    if (relOpKind == GREATER_THAN_RELOP) {
        return lhs.gt(rhs);
    }
    if (relOpKind == GREATER_EQUALS_RELOP) {
        return lhs.geq(rhs);
    }
    if (relOpKind == LESS_THAN_RELOP) {
        return lhs.lt(rhs);
    }
    if (relOpKind == LESS_EQUALS_RELOP) {
        return lhs.leq(rhs);
    }
    if (relOpKind == IS_NULL_RELOP) {
        return lhs.isNull();
    }
    if (relOpKind == IS_NOT_NULL_RELOP) {
        return lhs.isNotNull();
    }
    throw new UnsupportedOperationException();
}
 
Example #14
Source File: MLContextTest.java    From systemds with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a dataframe with a double ID column and an mllib vector column, passes it to a DML
 * script as matrix {@code M} and expects the printed sum to be 45.0.
 */
@Test
public void testDataFrameSumDMLMllibVectorWithIDColumn() {
	System.out.println("MLContextTest - DataFrame sum DML, mllib vector with ID column");

	// Three (id, vector) pairs; the vector entries 1..9 add up to the expected sum.
	List<Tuple2<Double, org.apache.spark.mllib.linalg.Vector>> idVectorPairs = new ArrayList<>();
	idVectorPairs.add(new Tuple2<>(1.0, org.apache.spark.mllib.linalg.Vectors.dense(1.0, 2.0, 3.0)));
	idVectorPairs.add(new Tuple2<>(2.0, org.apache.spark.mllib.linalg.Vectors.dense(4.0, 5.0, 6.0)));
	idVectorPairs.add(new Tuple2<>(3.0, org.apache.spark.mllib.linalg.Vectors.dense(7.0, 8.0, 9.0)));
	JavaRDD<Tuple2<Double, org.apache.spark.mllib.linalg.Vector>> pairRdd = sc.parallelize(idVectorPairs);

	JavaRDD<Row> rowRdd = pairRdd.map(new DoubleMllibVectorRow());

	List<StructField> schemaFields = new ArrayList<>();
	schemaFields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
	schemaFields.add(DataTypes.createStructField("C1", new org.apache.spark.mllib.linalg.VectorUDT(), true));
	StructType schema = DataTypes.createStructType(schemaFields);

	Dataset<Row> input = spark.createDataFrame(rowRdd, schema);
	MatrixMetadata metadata = new MatrixMetadata(MatrixFormat.DF_VECTOR_WITH_INDEX);

	Script script = dml("print('sum: ' + sum(M));").in("M", input, metadata);
	setExpectedStdOut("sum: 45.0");
	ml.execute(script);
}
 
Example #15
Source File: TestTranslateFunction.java    From envelope with Apache License 2.0 6 votes vote down vote up
/**
 * With appending of raw fields explicitly disabled, the translated schema is expected to
 * have exactly one field, and that field must not be named {@code _value}.
 */
@Test
public void testExplicitDontAppendRaw() {
  Map<String, Object> settings = Maps.newHashMap();
  settings.put(ComponentFactory.TYPE_CONFIG_NAME, DummyTranslator.class.getName());
  settings.put(TranslateFunction.APPEND_RAW_ENABLED_CONFIG, false);

  TranslateFunction translateFunction = new TranslateFunction(ConfigFactory.parseMap(settings));
  // Hand the function the very schema it declares it expects.
  translateFunction.receiveProvidedSchema(translateFunction.getExpectingSchema());

  Dataset<Row> input = Contexts.getSparkSession().createDataFrame(
      Lists.newArrayList(RowFactory.create("hello?")), translateFunction.getExpectingSchema());
  Dataset<Row> output = input.flatMap(
      translateFunction, RowEncoder.apply(translateFunction.getProvidingSchema()));

  assertEquals(1, output.schema().size());
  assertNotEquals("_value", output.schema().fields()[0].name());
}
 
Example #16
Source File: MLContextTest.java    From systemds with Apache License 2.0 6 votes vote down vote up
/**
 * Runs a DML script producing a 2x2 matrix, converts it with
 * {@code toDFDoubleWithIDColumn()}, sorts by the ID column and checks the first three double
 * columns of the first two rows.
 */
@Test
public void testOutputDataFrameDoublesWithIDColumnFromMatrixDML() {
	System.out.println("MLContextTest - output DataFrame of doubles with ID column from matrix DML");

	Script script = dml("M = matrix('1 2 3 4', rows=2, cols=2);").out("M");
	Dataset<Row> matrixDF = ml.execute(script).getMatrix("M").toDFDoubleWithIDColumn();
	List<Row> rows = matrixDF.sort(RDDConverterUtils.DF_ID_COLUMN).collectAsList();

	// Expected values per row, indexed by column position (column 0 is presumably the ID).
	double[][] expected = { { 1.0, 1.0, 2.0 }, { 2.0, 3.0, 4.0 } };
	for (int r = 0; r < expected.length; r++) {
		Row row = rows.get(r);
		for (int c = 0; c < expected[r].length; c++) {
			Assert.assertEquals(expected[r][c], row.getDouble(c), 0.0);
		}
	}
}
 
Example #17
Source File: AbstractJavaEsSparkStructuredStreamingTest.java    From elasticsearch-hadoop with Apache License 2.0 5 votes vote down vote up
/**
 * Streams three records to the target with the {@code name} field excluded from the
 * mapping, then checks that the target exists and that none of the three names show up in
 * the search output.
 */
@Test
public void test1WriteWithMappingExclude() throws Exception {
    String target = wrapIndex(resource("test-mapping-exclude", "data"));
    JavaStreamingQueryTestHarness<RecordBean> harness =
            new JavaStreamingQueryTestHarness<>(spark, Encoders.bean(RecordBean.class));

    RecordBean sparkRecord = new RecordBean();
    sparkRecord.setId(1);
    sparkRecord.setName("Spark");

    RecordBean hadoopRecord = new RecordBean();
    hadoopRecord.setId(2);
    hadoopRecord.setName("Hadoop");

    RecordBean yarnRecord = new RecordBean();
    yarnRecord.setId(3);
    yarnRecord.setName("YARN");

    Dataset<RecordBean> input = harness
            .withInput(sparkRecord)
            .withInput(hadoopRecord)
            .withInput(yarnRecord)
            .stream();

    harness.run(
            input.writeStream()
                    .option("checkpointLocation", checkpoint(target))
                    .option(ES_MAPPING_EXCLUDE, "name")
                    .format("es"),
            target
    );

    assertTrue(RestUtils.exists(target));

    // The search is issued once per name, exactly as many requests as before.
    String searchUrl = target + "/_search?";
    assertThat(RestUtils.get(searchUrl), not(containsString("Spark")));
    assertThat(RestUtils.get(searchUrl), not(containsString("Hadoop")));
    assertThat(RestUtils.get(searchUrl), not(containsString("YARN")));
}
 
Example #18
Source File: BookUrlBuilderApp.java    From net.jgp.labs.spark with Apache License 2.0 5 votes vote down vote up
/**
 * Loads {@code data/books.csv} with a local Spark session, shows it, maps every row to a
 * string with {@code BookUrlBuilder} and prints the schema and content of the result.
 */
private void start() {
  SparkSession session = SparkSession.builder()
      .appName("Book URL Builder")
      .master("local")
      .getOrCreate();

  String csvPath = "data/books.csv";
  Dataset<Row> books = session.read()
      .format("csv")
      .option("inferSchema", "true")
      .option("header", "true")
      .load(csvPath);
  books.show();

  Dataset<String> urls = books.map(new BookUrlBuilder(), Encoders.STRING());
  urls.printSchema();
  // Show up to 20 rows, truncating values longer than 80 characters.
  urls.show(20, 80);
}
 
Example #19
Source File: SparkEngine.java    From kylin-on-parquet-v2 with Apache License 2.0 5 votes vote down vote up
/**
 * Converts {@code relNode} to a Spark plan, logs its logical plan at debug level and returns
 * the right-hand value of the result obtained for {@code ResultType.SCALA()}.
 *
 * @param dataContext context used when building the Spark plan
 * @param relNode     the relational node to execute
 * @param resultType  the row type passed to {@code ResultPlan.getResult}
 * @return the right-hand value of the result pair
 */
@Override
public Enumerable<Object> computeSCALA(DataContext dataContext, RelNode relNode, RelDataType resultType) {
    Dataset<Row> plan = toSparkPlan(dataContext, relNode);
    log.debug("SPARK LOGICAL PLAN {}", plan.queryExecution().logical());
    return ResultPlan.getResult(plan, resultType, ResultType.SCALA())
            .right()
            .get();
}
 
Example #20
Source File: TextEncodedTelemetryReader.java    From metron with Apache License 2.0 5 votes vote down vote up
/**
 * Loads telemetry as a dataset of strings.
 *
 * <p>The input path always comes from {@code profilerProps}. The input format is read from
 * {@code profilerProps} only while the {@code inputFormat} field is still {@code null}; the
 * value is then kept in the field.
 *
 * @param spark         the Spark session used for reading
 * @param profilerProps properties supplying the input path and, if needed, the input format
 * @param readerProps   properties passed to the reader as options
 * @return the loaded telemetry, one string per record
 */
@Override
public Dataset<String> read(SparkSession spark, Properties profilerProps, Properties readerProps) {
  String telemetryPath = TELEMETRY_INPUT_PATH.get(profilerProps, String.class);
  if (inputFormat == null) {
    inputFormat = TELEMETRY_INPUT_FORMAT.get(profilerProps, String.class);
  }
  LOG.debug("Loading telemetry; inputPath={}, inputFormat={}", telemetryPath, inputFormat);

  return spark.read()
      .options(Maps.fromProperties(readerProps))
      .format(inputFormat)
      .load(telemetryPath)
      .as(Encoders.STRING());
}
 
Example #21
Source File: IcebergSourceFlatAvroDataReadBenchmark.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Benchmark: reads only {@code longCol} from the Avro data through Spark's file source,
 * with the files-open-cost setting overridden to 128 MiB for the duration of the read.
 */
@Benchmark
@Threads(1)
public void readWithProjectionFileSource() {
  int openCostBytes = 128 * 1024 * 1024;

  Map<String, String> sqlConf = Maps.newHashMap();
  sqlConf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(openCostBytes));

  withSQLConf(sqlConf, () -> {
    Dataset<Row> projected = spark().read()
        .format("avro")
        .load(dataLocation())
        .select("longCol");
    materialize(projected);
  });
}
 
Example #22
Source File: MockValueSets.java    From bunsen with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a MockValueSets instance with the given data.
 *
 * <p>All arguments are forwarded unchanged to the superclass constructor, together with
 * {@code FhirVersionEnum.DSTU3}.
 *
 * @param spark the Spark session passed to the superclass
 * @param members dataset of {@code UrlAndVersion} entries passed to the superclass
 * @param valueSets dataset of rows passed to the superclass
 * @param values dataset of {@code Value} entries passed to the superclass
 * @param valueSetRowConverter row converter passed to the superclass
 */
public MockValueSets(SparkSession spark,
    Dataset<UrlAndVersion> members,
    Dataset<Row> valueSets,
    Dataset<Value> values,
    SparkRowConverter valueSetRowConverter) {

  super(spark, FhirVersionEnum.DSTU3, members, valueSets, values, valueSetRowConverter);
}
 
Example #23
Source File: BatchStep.java    From envelope with Apache License 2.0 5 votes vote down vote up
/**
 * Runs this batch step: obtains its data from the input or the deriver, registers an extra
 * finished step for errored data when there is any, optionally repartitions, stores and
 * writes the data, and marks the step as finished.
 *
 * @param dependencySteps the steps whose dataframes are handed to the deriver
 * @throws RuntimeException if the step has neither an input nor a deriver
 * @throws Exception        declared for failures raised by the calls made here
 */
public void submit(Set<Step> dependencySteps) throws Exception {
  Contexts.getSparkSession().sparkContext().setJobDescription("Step: " + getName());

  Dataset<Row> stepData;
  Dataset<Row> erroredData = null;

  if (hasInput()) {
    stepData = ((BatchInput)getInput(true)).read();

    // Errored data is only available from components that opt in through the interface.
    if (getInput(true) instanceof CanReturnErroredData) {
      erroredData = ((CanReturnErroredData) getInput(true)).getErroredData();
    }
  } else if (hasDeriver()) {
    Map<String, Dataset<Row>> dependencyData = StepUtils.getStepDataFrames(dependencySteps);
    stepData = getDeriver(true).derive(dependencyData);

    if (getDeriver(true) instanceof CanReturnErroredData) {
      erroredData = ((CanReturnErroredData) getDeriver(true)).getErroredData();
    }
  } else {
    throw new RuntimeException("Batch step '" + getName() + "' must contain either an input or a deriver.");
  }

  // Publish the errored rows as their own, already finished, batch step.
  if (erroredData != null) {
    BatchStep erroredStep = new BatchStep(getName() + DEFAULT_ERROR_DATAFRAME_SUFFIX);
    erroredStep.configure(ConfigFactory.empty());
    erroredStep.setData(erroredData);
    erroredStep.setState(StepState.FINISHED);
    addNewBatchStep(erroredStep);
  }

  if (doesRepartition()) {
    stepData = repartition(stepData);
  }

  setData(stepData);
  writeData();

  setState(StepState.FINISHED);
}
 
Example #24
Source File: VectorizedReadDictionaryEncodedFlatParquetDataBenchmark.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Returns {@code df} with its {@code "intCol"} column set to a chained {@code when(...)}
 * expression of nine branches: branch {@code i} (0 through 8) pairs the condition
 * {@code modColumn(9, i)} with the literal {@code i}.
 *
 * <p>NOTE(review): {@code modColumn} is defined elsewhere; presumably it tests a column value
 * modulo 9 against {@code i}, which would limit the column to nine distinct values - confirm.
 *
 * @param df the dataframe to add the column to
 * @return the dataframe with the {@code "intCol"} column set
 */
private static Dataset<Row> withIntColumnDictEncoded(Dataset<Row> df) {
  return df.withColumn(
      "intCol",
      when(modColumn(9, 0), lit(0))
          .when(modColumn(9, 1), lit(1))
          .when(modColumn(9, 2), lit(2))
          .when(modColumn(9, 3), lit(3))
          .when(modColumn(9, 4), lit(4))
          .when(modColumn(9, 5), lit(5))
          .when(modColumn(9, 6), lit(6))
          .when(modColumn(9, 7), lit(7))
          .when(modColumn(9, 8), lit(8)));
}
 
Example #25
Source File: Snomed.java    From bunsen with Apache License 2.0 5 votes vote down vote up
/**
 * Reads a SNOMED relationship file and converts it to a {@link HierarchicalElement} dataset.
 *
 * <p>The file is read as tab-delimited CSV with a header. Only active rows of the is-a
 * relationship type whose {@code destinationId} and {@code sourceId} are both non-null and
 * non-empty are kept; the destination becomes the ancestor and the source the descendant.
 *
 * @param spark the Spark session
 * @param snomedRelationshipPath path to the SNOMED relationship file
 * @return a dataset of {@link HierarchicalElement} representing the hierarchical relationship.
 */
public static Dataset<HierarchicalElement> readRelationshipFile(SparkSession spark,
    String snomedRelationshipPath) {

  // Column 0 is destinationId and column 1 is sourceId, per the select below.
  Dataset<Row> isaPairs = spark.read()
      .option("header", true)
      .option("delimiter", "\t")
      .csv(snomedRelationshipPath)
      .where(col("typeId").equalTo(lit(SNOMED_ISA_RELATIONSHIP_ID)))
      .where(col("active").equalTo(lit("1")))
      .select(col("destinationId"), col("sourceId"))
      .where(col("destinationId").isNotNull()
          .and(col("destinationId").notEqual(lit(""))))
      .where(col("sourceId").isNotNull()
          .and(col("sourceId").notEqual(lit(""))));

  return isaPairs.map((MapFunction<Row, HierarchicalElement>) relationship -> {
    HierarchicalElement link = new HierarchicalElement();

    link.setAncestorSystem(SNOMED_CODE_SYSTEM_URI);
    link.setAncestorValue(relationship.getString(0));

    link.setDescendantSystem(SNOMED_CODE_SYSTEM_URI);
    link.setDescendantValue(relationship.getString(1));

    return link;
  }, Hierarchies.getHierarchicalElementEncoder());
}
 
Example #26
Source File: AbstractJavaEsSparkSQLTest.java    From elasticsearch-hadoop with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the artists dataset, checks that it has more than 300 rows and that {@code take(5)}
 * yields five of them, then queries ids 1 to 10 through a temp view and expects ten rows.
 */
@Test
public void testBasicRead() throws Exception {
    Dataset<Row> artists = artistsAsDataset();
    assertTrue(artists.count() > 300);

    // Register the dataset so that it can be queried by name below.
    artists.createOrReplaceTempView("datfile");
    assertEquals(5, ((Object[]) artists.take(5)).length);

    Dataset<Row> firstTen = sqc.sql("SELECT name FROM datfile WHERE id >=1 AND id <=10");
    assertEquals(10, ((Object[]) firstTen.take(10)).length);
}
 
Example #27
Source File: TestWriteMetricsConfig.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * With the default write metrics mode set to {@code none}, data files written as Parquet
 * are expected to carry no null-value counts, value counts, lower bounds or upper bounds.
 */
@Test
public void testNoMetricsCollectionForParquet() throws IOException {
  String location = temp.newFolder("iceberg-table").toString();

  Map<String, String> tableProperties = Maps.newHashMap();
  tableProperties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none");

  HadoopTables hadoopTables = new HadoopTables(CONF);
  PartitionSpec unpartitioned = PartitionSpec.unpartitioned();
  Table table = hadoopTables.create(SIMPLE_SCHEMA, unpartitioned, tableProperties, location);

  List<SimpleRecord> records = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> recordsDF = spark.createDataFrame(records, SimpleRecord.class);
  recordsDF.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(location);

  // Every planned data file must come back with empty column statistics.
  for (FileScanTask scanTask : table.newScan().includeColumnStats().planFiles()) {
    DataFile dataFile = scanTask.file();
    Assert.assertTrue(dataFile.nullValueCounts().isEmpty());
    Assert.assertTrue(dataFile.valueCounts().isEmpty());
    Assert.assertTrue(dataFile.lowerBounds().isEmpty());
    Assert.assertTrue(dataFile.upperBounds().isEmpty());
  }
}
 
Example #28
Source File: NFilePruningTest.java    From kylin-on-parquet-v2 with Apache License 2.0 5 votes vote down vote up
/**
 * Runs {@code sql}, collects the result and asserts that the {@code numFiles} metric of the
 * file source scan in the Spark plan equals {@code numScanFiles}.
 *
 * @param sql          the query to run
 * @param numScanFiles the expected number of scanned files
 * @return the number of files actually reported by the scan
 * @throws Exception declared for failures raised while querying
 */
private long assertResultsAndScanFiles(String sql, long numScanFiles) throws Exception {
    Dataset<Row> result = queryCubeAndSkipCompute(getProject(), sql);
    // Collect before the metric is read from the plan.
    result.collect();

    long scannedFiles = findFileSourceScanExec(result.queryExecution().sparkPlan())
            .metrics()
            .get("numFiles")
            .get()
            .value();
    Assert.assertEquals(numScanFiles, scannedFiles);
    return scannedFiles;
}
 
Example #29
Source File: CsvSourceTest.java    From kylin-on-parquet-v2 with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the flat table for a freshly appended segment, checks that every flat-table field
 * maps to a column of the cube's model, and selects the ordered dimensions of each layout.
 */
@Test
public void testGetFlatTable() throws IOException {
    System.out.println(getTestConfig().getMetadataUrl());
    CubeManager cubeManager = CubeManager.getInstance(getTestConfig());
    CubeInstance cube = cubeManager.getCube(CUBE_NAME);
    cleanupSegments(CUBE_NAME);
    DataModelDesc model = cube.getModel();
    CubeSegment segment = cubeManager.appendSegment(cube,
            new SegmentRange.TSRange(dateToLong("2010-01-01"), dateToLong("2013-01-01")));

    Dataset<Row> flatTable = initFlatTable(segment);
    flatTable.show(10);
    StructType flatSchema = flatTable.schema();

    SegmentInfo segmentInfo = MetadataConverter.getSegmentInfo(segment.getCubeInstance(), segment.getUuid(),
            segment.getName(), segment.getStorageLocationIdentifier());
    scala.collection.immutable.Map<String, String> columnIndex = BuildUtils.getColumnIndexMap(segmentInfo);

    // Every field name must resolve, through the index map, to a column known to the model.
    for (StructField field : flatSchema.fields()) {
        Assert.assertNotNull(model.findColumn(columnIndex.apply(field.name())));
    }

    for (LayoutEntity layout : MetadataConverter.extractEntityList2JavaList(cube)) {
        Set<Integer> dimensionIds = layout.getOrderedDimensions().keySet();
        Column[] dimensionColumns = new Column[dimensionIds.size()];
        int next = 0;
        for (int id : dimensionIds) {
            dimensionColumns[next++] = new Column(String.valueOf(id));
        }
        flatTable.select(dimensionColumns).show(10);
    }
}
 
Example #30
Source File: SparkMLDeriver.java    From envelope with Apache License 2.0 5 votes vote down vote up
/**
 * Applies the pipeline model to the data obtained from the dependencies.
 *
 * <p>The model is loaded from {@code modelPath} on first use, while the {@code model} field
 * is still {@code null}, and kept in the field afterwards.
 *
 * @param dependencies the dataframes of the dependency steps, keyed by name
 * @return the dataset produced by the model's {@code transform}
 */
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) {
  if (model == null) {
    model = PipelineModel.load(modelPath);
  }

  return model.transform(getData(dependencies));
}