Java Code Examples for org.apache.spark.sql.catalyst.InternalRow

The following examples show how to use org.apache.spark.sql.catalyst.InternalRow, the row representation that Spark SQL uses internally in the Catalyst engine. The examples are extracted from open source projects; where available, the originating project, source file, and license are noted above each example.
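Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of what an InternalRow holds and how it is read: values are stored in Spark's internal types, for example UTF8String rather than java.lang.String, and are accessed through ordinal-based, typed getters. The class name InternalRowSketch and the column layout are invented for illustration.

import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.unsafe.types.UTF8String;

public class InternalRowSketch {
  public static void main(String[] args) {
    // GenericInternalRow wraps an Object[] whose elements already use Spark's
    // internal representations (long, UTF8String, null, ...).
    InternalRow row = new GenericInternalRow(new Object[] {
        1L,                              // ordinal 0: bigint
        UTF8String.fromString("hello"),  // ordinal 1: string
        null                             // ordinal 2: null value
    });

    // Fields are read positionally with typed getters; null handling is explicit.
    long id = row.getLong(0);
    UTF8String data = row.getUTF8String(1);
    boolean thirdIsNull = row.isNullAt(2);

    System.out.printf("numFields=%d id=%d data=%s thirdIsNull=%b%n",
        row.numFields(), id, data, thirdIsNull);
  }
}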
Example 1
@Override
public List<InputPartition<InternalRow>> planInputPartitions() {
    if (schema.map(StructType::isEmpty).orElse(false)) {
        // create empty projection
        return createEmptyProjectionPartitions();
    }

    ImmutableList<String> selectedFields = schema
            .map(requiredSchema -> ImmutableList.copyOf(requiredSchema.fieldNames()))
            .orElse(ImmutableList.of());
    Optional<String> filter = emptyIfNeeded(SparkFilterUtils.getCompiledFilter(
            readSessionCreatorConfig.getReadDataFormat(), globalFilter, pushedFilters));
    ReadSessionResponse readSessionResponse = readSessionCreator.create(
            tableId, selectedFields, filter, readSessionCreatorConfig.getMaxParallelism());
    ReadSession readSession = readSessionResponse.getReadSession();
    return readSession.getStreamsList().stream()
            .map(stream -> new BigQueryInputPartition(
                    bigQueryReadClientFactory,
                    stream.getName(),
                    readSessionCreatorConfig.getMaxReadRowsRetries(),
                    createConverter(selectedFields, readSessionResponse)))
            .collect(Collectors.toList());
}
 
Example 2
Source Project: iceberg   Source File: TestDataFileSerialization.java    License: Apache License 2.0
@Test
public void testParquetWriterSplitOffsets() throws IOException {
  Iterable<InternalRow> records = RandomData.generateSpark(DATE_SCHEMA, 1, 33L);
  File parquetFile = new File(
      temp.getRoot(),
      FileFormat.PARQUET.addExtension(UUID.randomUUID().toString()));
  FileAppender<InternalRow> writer =
      Parquet.write(Files.localOutput(parquetFile))
          .schema(DATE_SCHEMA)
          .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(DATE_SCHEMA), msgType))
          .build();
  try {
    writer.addAll(records);
  } finally {
    writer.close();
  }

  Kryo kryo = new KryoSerializer(new SparkConf()).newKryo();
  File dataFile = temp.newFile();
  try (Output out = new Output(new FileOutputStream(dataFile))) {
    kryo.writeClassAndObject(out, writer.splitOffsets());
  }
  try (Input in = new Input(new FileInputStream(dataFile))) {
    kryo.readClassAndObject(in);
  }
}
 
Example 3
Source Project: iceberg   Source File: TestHelpers.java    License: Apache License 2.0
public static void assertEqualsBatch(Types.StructType struct, Iterator<Record> expected, ColumnarBatch batch,
                                     boolean checkArrowValidityVector) {
  for (int rowId = 0; rowId < batch.numRows(); rowId++) {
    List<Types.NestedField> fields = struct.fields();
    InternalRow row = batch.getRow(rowId);
    Record rec = expected.next();
    for (int i = 0; i < fields.size(); i += 1) {
      Type fieldType = fields.get(i).type();
      Object expectedValue = rec.get(i);
      Object actualValue = row.isNullAt(i) ? null : row.get(i, convert(fieldType));
      assertEqualsUnsafe(fieldType, expectedValue, actualValue);

      if (checkArrowValidityVector) {
        ColumnVector columnVector = batch.column(i);
        ValueVector arrowVector = ((IcebergArrowColumnVector) columnVector).vectorAccessor().getVector();
        Assert.assertEquals("Nullability doesn't match", expectedValue == null, arrowVector.isNull(rowId));
      }
    }
  }
}
 
Example 4
@Benchmark
@Threads(1)
public void readUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException {
  try (CloseableIterable<InternalRow> rows = Parquet.read(Files.localInput(dataFile))
      .project(SCHEMA)
      .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type))
      .build()) {

    Iterable<InternalRow> unsafeRows = Iterables.transform(
        rows,
        APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(SCHEMA, SCHEMA))::invoke);

    for (InternalRow row : unsafeRows) {
      blackhole.consume(row);
    }
  }
}
 
Example 5
@Benchmark
@Threads(1)
public void readUsingSparkReader(Blackhole blackhole) throws IOException {
  StructType sparkSchema = SparkSchemaUtil.convert(SCHEMA);
  try (CloseableIterable<InternalRow> rows = Parquet.read(Files.localInput(dataFile))
      .project(SCHEMA)
      .readSupport(new ParquetReadSupport())
      .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json())
      .set("spark.sql.parquet.binaryAsString", "false")
      .set("spark.sql.parquet.int96AsTimestamp", "false")
      .callInit()
      .build()) {

    for (InternalRow row : rows) {
      blackhole.consume(row);
    }
  }
}
 
Example 6
Source Project: iceberg   Source File: Writer.java    License: Apache License 2.0
@Override
public void write(InternalRow row) throws IOException {
  key.partition(row);

  if (!key.equals(currentKey)) {
    closeCurrent();

    if (completedPartitions.contains(key)) {
      // if rows are not correctly grouped, detect and fail the write
      PartitionKey existingKey = Iterables.find(completedPartitions, key::equals, null);
      LOG.warn("Duplicate key: {} == {}", existingKey, key);
      throw new IllegalStateException("Already closed file for partition: " + key.toPath());
    }

    this.currentKey = key.copy();
    this.currentPath = outputPathFunc.apply(currentKey);
    OutputFile file = HadoopOutputFile.fromPath(currentPath, conf);
    this.currentAppender = factory.newAppender(file, format);
  }

  currentAppender.add(row);
}
 
Example 7
Source Project: spark-llap   Source File: HiveStreamingDataWriter.java    License: Apache License 2.0
@Override
public void write(final InternalRow record) throws IOException {
  String delimitedRow = Joiner.on(",").useForNull("")
    .join(scala.collection.JavaConversions.seqAsJavaList(record.toSeq(schema)));
  try {
    streamingConnection.write(delimitedRow.getBytes(Charset.forName("UTF-8")));
    rowsWritten++;
    if (rowsWritten > 0 && commitAfterNRows > 0 && (rowsWritten % commitAfterNRows == 0)) {
      LOG.info("Committing transaction after rows: {}", rowsWritten);
      streamingConnection.commitTransaction();
      streamingConnection.beginTransaction();
    }
  } catch (StreamingException e) {
    throw new IOException(e);
  }
}
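The example above serializes the row by round-tripping through record.toSeq(schema) and scala.collection.JavaConversions. A hedged alternative sketch (not from spark-llap) that stays in Java: read each field with InternalRow.get(ordinal, dataType) and join the values directly, rendering nulls as empty strings to match useForNull(""). The helper name toDelimitedLine is invented.

import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

final class RowFormatting {
  private RowFormatting() {}

  // Builds a comma-delimited line from an InternalRow without converting it to a Scala Seq.
  static String toDelimitedLine(InternalRow record, StructType schema) {
    StructField[] fields = schema.fields();
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < fields.length; i++) {
      if (i > 0) {
        sb.append(',');
      }
      if (!record.isNullAt(i)) {
        // get(ordinal, dataType) returns the Spark-internal value (UTF8String, Long, ...);
        // StringBuilder.append falls back to its toString().
        sb.append(record.get(i, fields[i].dataType()));
      }
    }
    return sb.toString();
  }
}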
 
Example 8
Source Project: iceberg   Source File: TestFilteredScan.java    License: Apache License 2.0
@Test
public void testUnpartitionedIDFilters() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  IcebergSource source = new IcebergSource();

  for (int i = 0; i < 10; i += 1) {
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, EqualTo.apply("id", i));

    List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
    Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i),
        read(unpartitioned.toString(), "id = " + i));
  }
}
 
Example 9
Source Project: iceberg   Source File: PartitionedWriter.java    License: Apache License 2.0
@Override
public void write(InternalRow row) throws IOException {
  key.partition(row);

  PartitionKey currentKey = getCurrentKey();
  if (!key.equals(currentKey)) {
    closeCurrent();
    completedPartitions.add(currentKey);

    if (completedPartitions.contains(key)) {
      // if rows are not correctly grouped, detect and fail the write
      PartitionKey existingKey = Iterables.find(completedPartitions, key::equals, null);
      LOG.warn("Duplicate key: {} == {}", existingKey, key);
      throw new IllegalStateException("Already closed files for partition: " + key.toPath());
    }

    setCurrentKey(key.copy());
    openCurrent();
  }

  writeInternal(row);
}
 
Example 10
Source Project: iceberg   Source File: CodegenExamples.java    License: Apache License 2.0
public UnsafeRow apply(InternalRow i) {
  holder.reset();

  rowWriter.zeroOutNullBytes();


  boolean isNull = i.isNullAt(0);
  long value = isNull ? -1L : (i.getLong(0));
  if (isNull) {
    rowWriter.setNullAt(0);
  } else {
    rowWriter.write(0, value);
  }


  boolean isNull1 = i.isNullAt(1);
  UTF8String value1 = isNull1 ? null : (i.getUTF8String(1));
  if (isNull1) {
    rowWriter.setNullAt(1);
  } else {
    rowWriter.write(1, value1);
  }
  result.setTotalSize(holder.totalSize());
  return result;
}
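The class above mirrors the code Spark's expression codegen emits when projecting a (bigint, string) InternalRow into an UnsafeRow. For comparison, a hedged sketch of getting the same conversion without hand-written generated code; it assumes UnsafeProjection.create(StructType) is reachable as a static factory in the Spark version at hand, so treat it as illustrative rather than exact.

import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.sql.catalyst.expressions.UnsafeProjection;
import org.apache.spark.sql.catalyst.expressions.UnsafeRow;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.unsafe.types.UTF8String;

public class UnsafeProjectionSketch {
  public static void main(String[] args) {
    StructType schema = new StructType()
        .add("id", DataTypes.LongType)
        .add("data", DataTypes.StringType);

    // Builds a projection equivalent to the hand-written apply() above.
    UnsafeProjection projection = UnsafeProjection.create(schema);

    InternalRow input = new GenericInternalRow(
        new Object[] {42L, UTF8String.fromString("value")});
    UnsafeRow output = projection.apply(input);

    System.out.println(output.getLong(0) + " / " + output.getUTF8String(1));
  }
}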
 
Example 11
Source Project: iceberg   Source File: PartitionKey.java    License: Apache License 2.0
@SuppressWarnings("unchecked")
PartitionKey(PartitionSpec spec, Schema inputSchema) {
  this.spec = spec;

  List<PartitionField> fields = spec.fields();
  this.size = fields.size();
  this.partitionTuple = new Object[size];
  this.transforms = new Transform[size];
  this.accessors = (Accessor<InternalRow>[]) Array.newInstance(Accessor.class, size);

  Schema schema = spec.schema();
  Map<Integer, Accessor<InternalRow>> newAccessors = buildAccessors(inputSchema);
  for (int i = 0; i < size; i += 1) {
    PartitionField field = fields.get(i);
    Accessor<InternalRow> accessor = newAccessors.get(field.sourceId());
    if (accessor == null) {
      throw new RuntimeException(
          "Cannot build accessor for field: " + schema.findField(field.sourceId()));
    }
    this.accessors[i] = accessor;
    this.transforms[i] = field.transform();
  }
}
 
Example 12
Source Project: iceberg   Source File: TestSparkAvroReader.java    License: Apache License 2.0
protected void writeAndValidate(Schema schema) throws IOException {
  List<Record> expected = RandomData.generateList(schema, 100, 0L);

  File testFile = temp.newFile();
  Assert.assertTrue("Delete should succeed", testFile.delete());

  try (FileAppender<Record> writer = Avro.write(Files.localOutput(testFile))
      .schema(schema)
      .named("test")
      .build()) {
    for (Record rec : expected) {
      writer.add(rec);
    }
  }

  List<InternalRow> rows;
  try (AvroIterable<InternalRow> reader = Avro.read(Files.localInput(testFile))
      .createReaderFunc(SparkAvroReader::new)
      .project(schema)
      .build()) {
    rows = Lists.newArrayList(reader);
  }

  for (int i = 0; i < expected.size(); i += 1) {
    assertEqualsUnsafe(schema.asStruct(), expected.get(i), rows.get(i));
  }
}
 
Example 13
Source Project: iceberg   Source File: TestSparkOrcReader.java    License: Apache License 2.0
@Test
public void writeAndValidateRepeatingRecords() throws IOException {
  Schema structSchema = new Schema(
      required(100, "id", Types.LongType.get()),
      required(101, "data", Types.StringType.get())
  );
  List<InternalRow> expectedRepeating = Collections.nCopies(100,
      RandomData.generateSpark(structSchema, 1, 0L).iterator().next());

  writeAndValidateRecords(structSchema, expectedRepeating);
}
 
Example 14
Source Project: iceberg   Source File: SparkAvroReader.java    License: Apache License 2.0
@Override
public InternalRow read(InternalRow reuse, Decoder decoder) throws IOException {
  ResolvingDecoder resolver = resolve(decoder);
  InternalRow row = reader.read(resolver, reuse);
  resolver.drain();
  return row;
}
 
Example 15
@Override
public Iterator<InternalRow> convert(ReadRowsResponse response) {
    return new ArrowBinaryIterator(
            columnsInOrder,
            arrowSchema,
            response.getArrowRecordBatch().getSerializedRecordBatch());
}
 
Example 16
@Override
public InternalRow next() {
    try {
        return SchemaConverters.convertToInternalRow(bqSchema,
                columnsInOrder, (GenericRecord) reader.read(null, in));
    } catch (IOException e) {
        throw new UncheckedIOException(e);
    }
}
 
Example 17
public ArrowBinaryIterator(List<String> columnsInOrder, ByteString schema, ByteString rowsInBytes) {
    BufferAllocator allocator = (new RootAllocator(maxAllocation)).newChildAllocator("ArrowBinaryIterator",
            0, maxAllocation);

    SequenceInputStream bytesWithSchemaStream = new SequenceInputStream(
            new ByteArrayInputStream(schema.toByteArray()),
            new ByteArrayInputStream(rowsInBytes.toByteArray()));

    ArrowStreamReader arrowStreamReader = new ArrowStreamReader(bytesWithSchemaStream, allocator);
    arrowReaderIterator = new ArrowReaderIterator(arrowStreamReader);
    currentIterator = ImmutableList.<InternalRow>of().iterator();
    this.columnsInOrder = columnsInOrder;
}
 
Example 18
/**
 * Returns an iterator over the rows in this batch.
 */
public Iterator<InternalRow> rowIterator() {
    final int maxRows = numRows;
    final MutableColumnarRow row = new MutableColumnarRow(columns);
    return new Iterator<InternalRow>() {
        int rowId = 0;

        @Override
        public boolean hasNext() {
            return rowId < maxRows;
        }

        @Override
        public InternalRow next() {
            if (rowId >= maxRows) {
                throw new NoSuchElementException();
            }
            row.rowId = rowId++;
            return row;
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    };
}
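One caveat worth noting: the iterator above hands back the same MutableColumnarRow instance from every next() call and only advances its rowId, so a row that must outlive the current iteration step has to be copied. A small hypothetical helper, assuming the surrounding class is Spark's ColumnarBatch (or exposes an equivalent rowIterator()):

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.vectorized.ColumnarBatch;

final class BatchRows {
  private BatchRows() {}

  // Collects the rows of a batch into an independent list; copy() matters because
  // rowIterator() reuses one mutable row object for every position.
  static List<InternalRow> collectRows(ColumnarBatch batch) {
    List<InternalRow> retained = new ArrayList<>();
    Iterator<InternalRow> it = batch.rowIterator();
    while (it.hasNext()) {
      retained.add(it.next().copy());
    }
    return retained;
  }
}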
 
Example 19
Source Project: beam   Source File: DatasetSourceBatch.java    License: Apache License 2.0
@Override
public InternalRow get() {
  WindowedValue<T> windowedValue =
      WindowedValue.timestampedValueInGlobalWindow(
          reader.getCurrent(), reader.getCurrentTimestamp());
  return RowHelpers.storeWindowedValueInRow(windowedValue, source.getOutputCoder());
}
 
Example 20
@Benchmark
@Threads(1)
public void readUsingIcebergReader(Blackhole blackHole) throws IOException {
  try (CloseableIterable<InternalRow> rows = Parquet.read(Files.localInput(dataFile))
      .project(SCHEMA)
      .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type))
      .build()) {

    for (InternalRow row : rows) {
      blackHole.consume(row);
    }
  }
}
 
Example 21
@Benchmark
@Threads(1)
public void readWithProjectionUsingIcebergReader(Blackhole blackhole) throws IOException {
  try (CloseableIterable<InternalRow> rows = Parquet.read(Files.localInput(dataFile))
      .project(PROJECTED_SCHEMA)
      .createReaderFunc(type -> SparkParquetReaders.buildReader(PROJECTED_SCHEMA, type))
      .build()) {

    for (InternalRow row : rows) {
      blackhole.consume(row);
    }
  }
}
 
Example 22
@Benchmark
@Threads(1)
public void writeUsingIcebergWriter() throws IOException {
  try (FileAppender<InternalRow> writer = Parquet.write(Files.localOutput(dataFile))
      .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(SCHEMA), msgType))
      .schema(SCHEMA)
      .build()) {

    writer.addAll(rows);
  }
}
 
Example 23
Source Project: iceberg   Source File: TestHelpers.java    License: Apache License 2.0
private static void assertEquals(String context, DataType type, Object expected, Object actual) {
  if (expected == null && actual == null) {
    return;
  }

  if (type instanceof StructType) {
    Assert.assertTrue("Expected should be an InternalRow: " + context,
        expected instanceof InternalRow);
    Assert.assertTrue("Actual should be an InternalRow: " + context,
        actual instanceof InternalRow);
    assertEquals(context, (StructType) type, (InternalRow) expected, (InternalRow) actual);

  } else if (type instanceof ArrayType) {
    Assert.assertTrue("Expected should be an ArrayData: " + context,
        expected instanceof ArrayData);
    Assert.assertTrue("Actual should be an ArrayData: " + context,
        actual instanceof ArrayData);
    assertEquals(context, (ArrayType) type, (ArrayData) expected, (ArrayData) actual);

  } else if (type instanceof MapType) {
    Assert.assertTrue("Expected should be a MapData: " + context,
        expected instanceof MapData);
    Assert.assertTrue("Actual should be a MapData: " + context,
        actual instanceof MapData);
    assertEquals(context, (MapType) type, (MapData) expected, (MapData) actual);

  } else if (type instanceof BinaryType) {
    assertEqualBytes(context, (byte[]) expected, (byte[]) actual);
  } else {
    Assert.assertEquals("Value should match expected: " + context, expected, actual);
  }
}
 
Example 24
@Benchmark
@Threads(1)
public void readUsingIcebergReader(Blackhole blackhole) throws IOException {
  try (CloseableIterable<InternalRow> rows = Parquet.read(Files.localInput(dataFile))
      .project(SCHEMA)
      .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type))
      .build()) {

    for (InternalRow row : rows) {
      blackhole.consume(row);
    }
  }
}
 
Example 25
Source Project: iceberg   Source File: TestDataFrameWrites.java    License: Apache License 2.0
private Dataset<Row> createDataset(Iterable<Record> records, Schema schema) throws IOException {
  // this uses the SparkAvroReader to create a DataFrame from the list of records
  // it assumes that SparkAvroReader is correct
  File testFile = temp.newFile();
  Assert.assertTrue("Delete should succeed", testFile.delete());

  try (FileAppender<Record> writer = Avro.write(Files.localOutput(testFile))
      .schema(schema)
      .named("test")
      .build()) {
    for (Record rec : records) {
      writer.add(rec);
    }
  }

  // make sure the dataframe matches the records before moving on
  List<InternalRow> rows = Lists.newArrayList();
  try (AvroIterable<InternalRow> reader = Avro.read(Files.localInput(testFile))
      .createReaderFunc(SparkAvroReader::new)
      .project(schema)
      .build()) {

    Iterator<Record> recordIter = records.iterator();
    Iterator<InternalRow> readIter = reader.iterator();
    while (recordIter.hasNext() && readIter.hasNext()) {
      InternalRow row = readIter.next();
      assertEqualsUnsafe(schema.asStruct(), recordIter.next(), row);
      rows.add(row);
    }
    Assert.assertEquals("Both iterators should be exhausted", recordIter.hasNext(), readIter.hasNext());
  }

  JavaRDD<InternalRow> rdd = sc.parallelize(rows);
  return spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), convert(schema), false);
}
 
Example 26
Source Project: iceberg   Source File: TestFilteredScan.java    License: Apache License 2.0
@Test
public void testUnpartitionedCaseInsensitiveIDFilters() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  // set spark.sql.caseSensitive to false
  String caseSensitivityBeforeTest = TestFilteredScan.spark.conf().get("spark.sql.caseSensitive");
  TestFilteredScan.spark.conf().set("spark.sql.caseSensitive", "false");

  try {
    IcebergSource source = new IcebergSource();

    for (int i = 0; i < 10; i += 1) {
      DataSourceReader reader = source.createReader(options);

      pushFilters(reader, EqualTo.apply("ID", i)); // note lower(ID) == lower(id), so there must be a match

      List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
      Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

      // validate row filtering
      assertEqualsSafe(SCHEMA.asStruct(), expected(i),
          read(unpartitioned.toString(), "id = " + i));
    }
  } finally {
    // return global conf to previous state
    TestFilteredScan.spark.conf().set("spark.sql.caseSensitive", caseSensitivityBeforeTest);
  }
}
 
Example 27
Source Project: iceberg   Source File: TestHelpers.java    License: Apache License 2.0
private static void assertEquals(String context, StructType struct,
                         InternalRow expected, InternalRow actual) {
  Assert.assertEquals("Should have correct number of fields", struct.size(), actual.numFields());
  for (int i = 0; i < actual.numFields(); i += 1) {
    StructField field = struct.fields()[i];
    DataType type = field.dataType();
    assertEquals(context + "." + field.name(), type, expected.get(i, type), actual.get(i, type));
  }
}
 
Example 28
Source Project: iceberg   Source File: SparkValueWriters.java    License: Apache License 2.0
@Override
public void write(InternalRow row, Encoder encoder) throws IOException {
  for (int i = 0; i < types.length; i += 1) {
    if (row.isNullAt(i)) {
      writers[i].write(null, encoder);
    } else {
      write(row, i, writers[i], encoder);
    }
  }
}
 