org.apache.spark.sql.catalyst.InternalRow Java Examples

The following examples show how to use org.apache.spark.sql.catalyst.InternalRow, Spark SQL's internal row representation. Each example is taken from an open source project; the source file, project, and license are listed above its code.
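
Before working through the examples, here is a minimal, self-contained sketch (not taken from any of the projects below; the class name and field values are illustrative) of how an InternalRow can be built and read from Java. Note that values are stored in Spark's internal representation, so strings must be UTF8String rather than java.lang.String.

import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.unsafe.types.UTF8String;

public class InternalRowSketch {
  public static void main(String[] args) {
    // Build a row with three columns: a long, a string, and a null value.
    // GenericInternalRow wraps a plain Object[] holding internal-format values.
    InternalRow row = new GenericInternalRow(new Object[] {
        42L,                               // column 0: long
        UTF8String.fromString("example"),  // column 1: string (internal UTF8String)
        null                               // column 2: null
    });

    // Read fields back with the typed accessors and the null check.
    long id = row.getLong(0);
    String name = row.getUTF8String(1).toString();
    boolean isNull = row.isNullAt(2);

    System.out.printf("id=%d, name=%s, column 2 null=%b, fields=%d%n",
        id, name, isNull, row.numFields());
  }
}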
Example #1
Source File: HiveStreamingDataWriter.java    From spark-llap with Apache License 2.0
@Override
public void write(final InternalRow record) throws IOException {
  String delimitedRow = Joiner.on(",").useForNull("")
    .join(scala.collection.JavaConversions.seqAsJavaList(record.toSeq(schema)));
  try {
    streamingConnection.write(delimitedRow.getBytes(Charset.forName("UTF-8")));
    rowsWritten++;
    if (rowsWritten > 0 && commitAfterNRows > 0 && (rowsWritten % commitAfterNRows == 0)) {
      LOG.info("Committing transaction after rows: {}", rowsWritten);
      streamingConnection.commitTransaction();
      streamingConnection.beginTransaction();
    }
  } catch (StreamingException e) {
    throw new IOException(e);
  }
}
 
Example #2
Source File: SparkParquetReadersFlatDataBenchmark.java    From iceberg with Apache License 2.0
@Benchmark
@Threads(1)
public void readUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException {
  try (CloseableIterable<InternalRow> rows = Parquet.read(Files.localInput(dataFile))
      .project(SCHEMA)
      .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type))
      .build()) {

    Iterable<InternalRow> unsafeRows = Iterables.transform(
        rows,
        APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(SCHEMA, SCHEMA))::invoke);

    for (InternalRow row : unsafeRows) {
      blackhole.consume(row);
    }
  }
}
 
Example #3
Source File: TestHelpers.java    From iceberg with Apache License 2.0
public static void assertEqualsBatch(Types.StructType struct, Iterator<Record> expected, ColumnarBatch batch,
                                     boolean checkArrowValidityVector) {
  for (int rowId = 0; rowId < batch.numRows(); rowId++) {
    List<Types.NestedField> fields = struct.fields();
    InternalRow row = batch.getRow(rowId);
    Record rec = expected.next();
    for (int i = 0; i < fields.size(); i += 1) {
      Type fieldType = fields.get(i).type();
      Object expectedValue = rec.get(i);
      Object actualValue = row.isNullAt(i) ? null : row.get(i, convert(fieldType));
      assertEqualsUnsafe(fieldType, expectedValue, actualValue);

      if (checkArrowValidityVector) {
        ColumnVector columnVector = batch.column(i);
        ValueVector arrowVector = ((IcebergArrowColumnVector) columnVector).vectorAccessor().getVector();
        Assert.assertEquals("Nullability doesn't match", expectedValue == null, arrowVector.isNull(rowId));
      }
    }
  }
}
 
Example #4
Source File: PartitionedWriter.java    From iceberg with Apache License 2.0
@Override
public void write(InternalRow row) throws IOException {
  key.partition(row);

  PartitionKey currentKey = getCurrentKey();
  if (!key.equals(currentKey)) {
    closeCurrent();
    completedPartitions.add(currentKey);

    if (completedPartitions.contains(key)) {
      // if rows are not correctly grouped, detect and fail the write
      PartitionKey existingKey = Iterables.find(completedPartitions, key::equals, null);
      LOG.warn("Duplicate key: {} == {}", existingKey, key);
      throw new IllegalStateException("Already closed files for partition: " + key.toPath());
    }

    setCurrentKey(key.copy());
    openCurrent();
  }

  writeInternal(row);
}
 
Example #5
Source File: TestDataFileSerialization.java    From iceberg with Apache License 2.0
@Test
public void testParquetWriterSplitOffsets() throws IOException {
  Iterable<InternalRow> records = RandomData.generateSpark(DATE_SCHEMA, 1, 33L);
  File parquetFile = new File(
      temp.getRoot(),
      FileFormat.PARQUET.addExtension(UUID.randomUUID().toString()));
  FileAppender<InternalRow> writer =
      Parquet.write(Files.localOutput(parquetFile))
          .schema(DATE_SCHEMA)
          .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(DATE_SCHEMA), msgType))
          .build();
  try {
    writer.addAll(records);
  } finally {
    writer.close();
  }

  Kryo kryo = new KryoSerializer(new SparkConf()).newKryo();
  File dataFile = temp.newFile();
  try (Output out = new Output(new FileOutputStream(dataFile))) {
    kryo.writeClassAndObject(out, writer.splitOffsets());
  }
  try (Input in = new Input(new FileInputStream(dataFile))) {
    kryo.readClassAndObject(in);
  }
}
 
Example #6
Source File: CodegenExamples.java    From iceberg with Apache License 2.0
public UnsafeRow apply(InternalRow i) {
  holder.reset();

  rowWriter.zeroOutNullBytes();


  boolean isNull = i.isNullAt(0);
  long value = isNull ? -1L : (i.getLong(0));
  if (isNull) {
    rowWriter.setNullAt(0);
  } else {
    rowWriter.write(0, value);
  }


  boolean isNull1 = i.isNullAt(1);
  UTF8String value1 = isNull1 ? null : (i.getUTF8String(1));
  if (isNull1) {
    rowWriter.setNullAt(1);
  } else {
    rowWriter.write(1, value1);
  }
  result.setTotalSize(holder.totalSize());
  return result;
}
 
Example #7
Source File: SparkParquetReadersNestedDataBenchmark.java    From iceberg with Apache License 2.0
@Benchmark
@Threads(1)
public void readUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException {
  try (CloseableIterable<InternalRow> rows = Parquet.read(Files.localInput(dataFile))
      .project(SCHEMA)
      .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type))
      .build()) {

    Iterable<InternalRow> unsafeRows = Iterables.transform(
        rows,
        APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(SCHEMA, SCHEMA))::invoke);

    for (InternalRow row : unsafeRows) {
      blackhole.consume(row);
    }
  }
}
 
Example #8
Source File: PartitionKey.java    From iceberg with Apache License 2.0
@SuppressWarnings("unchecked")
PartitionKey(PartitionSpec spec, Schema inputSchema) {
  this.spec = spec;

  List<PartitionField> fields = spec.fields();
  this.size = fields.size();
  this.partitionTuple = new Object[size];
  this.transforms = new Transform[size];
  this.accessors = (Accessor<InternalRow>[]) Array.newInstance(Accessor.class, size);

  Schema schema = spec.schema();
  Map<Integer, Accessor<InternalRow>> newAccessors = buildAccessors(inputSchema);
  for (int i = 0; i < size; i += 1) {
    PartitionField field = fields.get(i);
    Accessor<InternalRow> accessor = newAccessors.get(field.sourceId());
    if (accessor == null) {
      throw new RuntimeException(
          "Cannot build accessor for field: " + schema.findField(field.sourceId()));
    }
    this.accessors[i] = accessor;
    this.transforms[i] = field.transform();
  }
}
 
Example #9
Source File: SparkParquetReadersNestedDataBenchmark.java    From iceberg with Apache License 2.0
@Benchmark
@Threads(1)
public void readUsingSparkReader(Blackhole blackhole) throws IOException {
  StructType sparkSchema = SparkSchemaUtil.convert(SCHEMA);
  try (CloseableIterable<InternalRow> rows = Parquet.read(Files.localInput(dataFile))
      .project(SCHEMA)
      .readSupport(new ParquetReadSupport())
      .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json())
      .set("spark.sql.parquet.binaryAsString", "false")
      .set("spark.sql.parquet.int96AsTimestamp", "false")
      .callInit()
      .build()) {

    for (InternalRow row : rows) {
      blackhole.consume(row);
    }
  }
}
 
Example #10
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
@Test
public void testUnpartitionedIDFilters() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  IcebergSource source = new IcebergSource();

  for (int i = 0; i < 10; i += 1) {
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, EqualTo.apply("id", i));

    List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
    Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i),
        read(unpartitioned.toString(), "id = " + i));
  }
}
 
Example #11
Source File: Writer.java    From iceberg with Apache License 2.0
@Override
public void write(InternalRow row) throws IOException {
  key.partition(row);

  if (!key.equals(currentKey)) {
    closeCurrent();

    if (completedPartitions.contains(key)) {
      // if rows are not correctly grouped, detect and fail the write
      PartitionKey existingKey = Iterables.find(completedPartitions, key::equals, null);
      LOG.warn("Duplicate key: {} == {}", existingKey, key);
      throw new IllegalStateException("Already closed file for partition: " + key.toPath());
    }

    this.currentKey = key.copy();
    this.currentPath = outputPathFunc.apply(currentKey);
    OutputFile file = HadoopOutputFile.fromPath(currentPath, conf);
    this.currentAppender = factory.newAppender(file, format);
  }

  currentAppender.add(row);
}
 
Example #12
Source File: BigQueryDataSourceReader.java    From spark-bigquery-connector with Apache License 2.0
@Override
public List<InputPartition<InternalRow>> planInputPartitions() {
    if (schema.map(StructType::isEmpty).orElse(false)) {
        // create empty projection
        return createEmptyProjectionPartitions();
    }

    ImmutableList<String> selectedFields = schema
            .map(requiredSchema -> ImmutableList.copyOf(requiredSchema.fieldNames()))
            .orElse(ImmutableList.of());
    Optional<String> filter = emptyIfNeeded(SparkFilterUtils.getCompiledFilter(
            readSessionCreatorConfig.getReadDataFormat(), globalFilter, pushedFilters));
    ReadSessionResponse readSessionResponse = readSessionCreator.create(
            tableId, selectedFields, filter, readSessionCreatorConfig.getMaxParallelism());
    ReadSession readSession = readSessionResponse.getReadSession();
    return readSession.getStreamsList().stream()
            .map(stream -> new BigQueryInputPartition(
                    bigQueryReadClientFactory,
                    stream.getName(),
                    readSessionCreatorConfig.getMaxReadRowsRetries(),
                    createConverter(selectedFields, readSessionResponse)))
            .collect(Collectors.toList());
}
 
Example #13
Source File: SparkParquetReaders.java    From iceberg with Apache License 2.0
@Override
protected GenericInternalRow newStructData(InternalRow reuse) {
  if (reuse instanceof GenericInternalRow) {
    return (GenericInternalRow) reuse;
  } else {
    return new GenericInternalRow(numFields);
  }
}
 
Example #14
Source File: PartitionKey.java    From iceberg with Apache License 2.0
@SuppressWarnings("unchecked")
void partition(InternalRow row) {
  for (int i = 0; i < partitionTuple.length; i += 1) {
    Transform<Object, Object> transform = transforms[i];
    partitionTuple[i] = transform.apply(accessors[i].get(row));
  }
}
 
Example #15
Source File: PartitionKey.java    From iceberg with Apache License 2.0
private static Accessor<InternalRow> newAccessor(int position, boolean isOptional, Types.StructType type,
                                                 Accessor<InternalRow> accessor) {
  int size = type.fields().size();
  if (isOptional) {
    // the wrapped position handles null layers
    return new WrappedPositionAccessor(position, size, accessor);
  } else if (accessor.getClass() == PositionAccessor.class) {
    return new Position2Accessor(position, size, (PositionAccessor) accessor);
  } else if (accessor instanceof Position2Accessor) {
    return new Position3Accessor(position, size, (Position2Accessor) accessor);
  } else {
    return new WrappedPositionAccessor(position, size, accessor);
  }
}
 
Example #16
Source File: PartitionKey.java    From iceberg with Apache License 2.0
private static Accessor<InternalRow> newAccessor(int position, Type type) {
  switch (type.typeId()) {
    case STRING:
      return new StringAccessor(position, SparkSchemaUtil.convert(type));
    case DECIMAL:
      return new DecimalAccessor(position, SparkSchemaUtil.convert(type));
    case BINARY:
      return new BytesAccessor(position, SparkSchemaUtil.convert(type));
    default:
      return new PositionAccessor(position, SparkSchemaUtil.convert(type));
  }
}
 
Example #17
Source File: RowDataReader.java    From iceberg with Apache License 2.0
private CloseableIterable<InternalRow> newDataIterable(DataTask task, Schema readSchema) {
  StructInternalRow row = new StructInternalRow(tableSchema.asStruct());
  CloseableIterable<InternalRow> asSparkRows = CloseableIterable.transform(
      task.asDataTask().rows(), row::setStruct);
  return CloseableIterable.transform(
      asSparkRows, APPLY_PROJECTION.bind(projection(readSchema, tableSchema))::invoke);
}
 
Example #18
Source File: RowDataReader.java    From iceberg with Apache License 2.0
private CloseableIterable<InternalRow> newOrcIterable(
    InputFile location,
    FileScanTask task,
    Schema readSchema,
    Map<Integer, ?> idToConstant) {
  return ORC.read(location)
      .project(readSchema)
      .split(task.start(), task.length())
      .createReaderFunc(readOrcSchema -> new SparkOrcReader(readSchema, readOrcSchema, idToConstant))
      .filter(task.residual())
      .caseSensitive(caseSensitive)
      .build();
}
 
Example #19
Source File: PartitionKey.java    From iceberg with Apache License 2.0
@Override
public Object get(InternalRow row) {
  if (row.isNullAt(position())) {
    return null;
  }
  return ByteBuffer.wrap((byte[]) row.get(position(), type()));
}
 
Example #20
Source File: PartitionKey.java    From iceberg with Apache License 2.0
@Override
public Object get(InternalRow row) {
  if (row.isNullAt(p)) {
    return null;
  }
  return row.get(p, type);
}
 
Example #21
Source File: PartitionKey.java    From iceberg with Apache License 2.0
@Override
public Object get(InternalRow row) {
  if (row.isNullAt(position())) {
    return null;
  }
  return row.get(position(), type()).toString();
}
 
Example #22
Source File: SparkValueReaders.java    From iceberg with Apache License 2.0
@Override
protected InternalRow reuseOrCreate(Object reuse) {
  if (reuse instanceof GenericInternalRow && ((GenericInternalRow) reuse).numFields() == numFields) {
    return (InternalRow) reuse;
  }
  return new GenericInternalRow(numFields);
}
 
Example #23
Source File: Reader.java    From iceberg with Apache License 2.0
private CloseableIterable<InternalRow> newAvroIterable(InputFile location,
                                                  FileScanTask task,
                                                  Schema readSchema) {
  return Avro.read(location)
      .reuseContainers()
      .project(readSchema)
      .split(task.start(), task.length())
      .createReaderFunc(SparkAvroReader::new)
      .build();
}
 
Example #24
Source File: SparkParquetReaders.java    From iceberg with Apache License 2.0
@Override
protected GenericInternalRow newStructData(InternalRow reuse) {
  if (reuse instanceof GenericInternalRow) {
    return (GenericInternalRow) reuse;
  } else {
    return new GenericInternalRow(numFields);
  }
}
 
Example #25
Source File: Writer.java    From iceberg with Apache License 2.0
PartitionedWriter(PartitionSpec spec, FileFormat format, Configuration conf,
                  AppenderFactory<InternalRow> factory,
                  Function<PartitionKey, Path> outputPathFunc) {
  this.spec = spec;
  this.format = format;
  this.conf = conf;
  this.factory = factory;
  this.outputPathFunc = outputPathFunc;
  this.key = new PartitionKey(spec);
}
 
Example #26
Source File: PartitionKey.java    From iceberg with Apache License 2.0
private static Accessor<InternalRow> newAccessor(int p, Type type) {
  switch (type.typeId()) {
    case STRING:
      return new StringAccessor(p, convert(type));
    case DECIMAL:
      return new DecimalAccessor(p, convert(type));
    default:
      return new PositionAccessor(p, convert(type));
  }
}
 
Example #27
Source File: TestDataFrameWrites.java    From iceberg with Apache License 2.0
private Dataset<Row> createDataset(List<Record> records, Schema schema) throws IOException {
  // this uses the SparkAvroReader to create a DataFrame from the list of records
  // it assumes that SparkAvroReader is correct
  File testFile = temp.newFile();
  Assert.assertTrue("Delete should succeed", testFile.delete());

  try (FileAppender<Record> writer = Avro.write(Files.localOutput(testFile))
      .schema(schema)
      .named("test")
      .build()) {
    for (Record rec : records) {
      writer.add(rec);
    }
  }

  List<InternalRow> rows;
  try (AvroIterable<InternalRow> reader = Avro.read(Files.localInput(testFile))
      .createReaderFunc(SparkAvroReader::new)
      .project(schema)
      .build()) {
    rows = Lists.newArrayList(reader);
  }

  // make sure the dataframe matches the records before moving on
  for (int i = 0; i < records.size(); i += 1) {
    assertEqualsUnsafe(schema.asStruct(), records.get(i), rows.get(i));
  }

  JavaRDD<InternalRow> rdd = sc.parallelize(rows);
  return spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), convert(schema), false);
}
 
Example #28
Source File: ColumnarBatch.java    From spliceengine with GNU Affero General Public License v3.0
/**
 * Returns an iterator over the rows in this batch.
 */
public Iterator<InternalRow> rowIterator() {
    final int maxRows = numRows;
    final MutableColumnarRow row = new MutableColumnarRow(columns);
    return new Iterator<InternalRow>() {
        int rowId = 0;

        @Override
        public boolean hasNext() {
            return rowId < maxRows;
        }

        @Override
        public InternalRow next() {
            if (rowId >= maxRows) {
                throw new NoSuchElementException();
            }
            row.rowId = rowId++;
            return row;
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    };
}
 
Example #29
Source File: SparkValueWriters.java    From iceberg with Apache License 2.0
@Override
public void write(InternalRow row, Encoder encoder) throws IOException {
  for (int i = 0; i < types.length; i += 1) {
    if (row.isNullAt(i)) {
      writers[i].write(null, encoder);
    } else {
      write(row, i, writers[i], encoder);
    }
  }
}
 
Example #30
Source File: TestFilteredScan.java    From iceberg with Apache License 2.0
@Test
public void testUnpartitionedCaseInsensitiveIDFilters() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  // set spark.sql.caseSensitive to false
  String caseSensitivityBeforeTest = TestFilteredScan.spark.conf().get("spark.sql.caseSensitive");
  TestFilteredScan.spark.conf().set("spark.sql.caseSensitive", "false");

  try {
    IcebergSource source = new IcebergSource();

    for (int i = 0; i < 10; i += 1) {
      DataSourceReader reader = source.createReader(options);

      pushFilters(reader, EqualTo.apply("ID", i)); // note lower(ID) == lower(id), so there must be a match

      List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
      Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

      // validate row filtering
      assertEqualsSafe(SCHEMA.asStruct(), expected(i),
          read(unpartitioned.toString(), "id = " + i));
    }
  } finally {
    // return global conf to previous state
    TestFilteredScan.spark.conf().set("spark.sql.caseSensitive", caseSensitivityBeforeTest);
  }
}