org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch#reset

Source File: HiveORCVectorizedReader.java From dremio-oss with Apache License 2.0

6 votes

/**
 * Helper method that creates {@link VectorizedRowBatch}. For each selected column an input vector is created in the
 * batch. For unselected columns the vector entry is going to be null. The order of input vectors in batch should
 * match the order the columns in ORC file.
 *
 * @param rowOI Used to find the ordinal of the selected column.
 * @return
 */
private VectorizedRowBatch createVectorizedRowBatch(StructObjectInspector rowOI, boolean isOriginal) {
  final List<? extends StructField> fieldRefs = rowOI.getAllStructFieldRefs();
  final List<ColumnVector> vectors = getVectors(rowOI);

  final VectorizedRowBatch result = new VectorizedRowBatch(fieldRefs.size());

  ColumnVector[] vectorArray =  vectors.toArray(new ColumnVector[0]);

  if (!isOriginal) {
    vectorArray = createTransactionalVectors(vectorArray);
  }

  result.cols = vectorArray;
  result.numCols = fieldRefs.size();
  result.reset();
  return result;
}

Source File: HiveORCVectorizedReader.java From dremio-oss with Apache License 2.0

6 votes

/**
 * Helper method that creates {@link VectorizedRowBatch}. For each selected column an input vector is created in the
 * batch. For unselected columns the vector entry is going to be null. The order of input vectors in batch should
 * match the order the columns in ORC file.
 *
 * @param rowOI Used to find the ordinal of the selected column.
 * @return
 */
private VectorizedRowBatch createVectorizedRowBatch(StructObjectInspector rowOI, boolean isOriginal) {
  final List<? extends StructField> fieldRefs = rowOI.getAllStructFieldRefs();
  final List<ColumnVector> vectors = getVectors(rowOI);

  final VectorizedRowBatch result = new VectorizedRowBatch(fieldRefs.size());

  ColumnVector[] vectorArray =  vectors.toArray(new ColumnVector[0]);

  if (!isOriginal) {
    vectorArray = createTransactionalVectors(vectorArray);
  }

  result.cols = vectorArray;
  result.numCols = fieldRefs.size();
  result.reset();
  return result;
}

Source File: AvroToOrcRecordConverter.java From datacollector with Apache License 2.0

6 votes

public static void addAvroRecord(
    VectorizedRowBatch batch,
    GenericRecord record,
    TypeDescription orcSchema,
    int orcBatchSize,
    Writer writer
) throws IOException {

  for (int c = 0; c < batch.numCols; c++) {
    ColumnVector colVector = batch.cols[c];
    final String thisField = orcSchema.getFieldNames().get(c);
    final TypeDescription type = orcSchema.getChildren().get(c);

    Object fieldValue = record.get(thisField);
    Schema.Field avroField = record.getSchema().getField(thisField);
    addToVector(type, colVector, avroField.schema(), fieldValue, batch.size);
  }

  batch.size++;

  if (batch.size % orcBatchSize == 0 || batch.size == batch.getMaxSize()) {
    writer.addRowBatch(batch);
    batch.reset();
    batch.size = 0;
  }
}

Source File: MDSHiveDirectVectorizedReader.java From multiple-dimension-spread with Apache License 2.0

5 votes

@Override
public boolean next( final NullWritable key, final VectorizedRowBatch outputBatch ) throws IOException {
  outputBatch.reset();
  setting.setPartitionValues( outputBatch );

  if( indexSize <= currentIndex ){
    if( ! currentReader.hasNext() ){
      updateCounter( currentReader.getReadStats() );
      outputBatch.endOfFile = true;
      isEnd = true;
      return false;
    }
    while( ! setSpread() ){
      if( ! currentReader.hasNext() ){
        updateCounter( currentReader.getReadStats() );
        outputBatch.endOfFile = true;
        isEnd = true;
        return false;
      }
    }
  }
  int maxSize = outputBatch.getMaxSize();
  if( indexSize < currentIndex + maxSize ){
    maxSize = indexSize - currentIndex;
  }

  for( int colIndex : needColumnIds ){
    assignors[colIndex].setColumnVector( outputBatch.cols[colIndex] , currentIndexList , currentIndex , maxSize );
  }
  outputBatch.size = maxSize;

  currentIndex += maxSize;
  if( indexSize <= currentIndex && ! currentReader.hasNext() ){
    outputBatch.endOfFile = true;
  }

  return outputBatch.size > 0;
}

Source File: SqlInterpreterTest.java From zeppelin with Apache License 2.0

5 votes

public File createORCFile(int[] values) throws IOException {
  File file = File.createTempFile("zeppelin-flink-input", ".orc");
  file.delete();
  Path path = new Path(file.getAbsolutePath());
  Configuration conf = new Configuration();
  conf.set("orc.compress", "snappy");
  TypeDescription schema = TypeDescription.fromString("struct<msg:int>");
  Writer writer = OrcFile.createWriter(path,
          OrcFile.writerOptions(conf)
                  .setSchema(schema));
  VectorizedRowBatch batch = schema.createRowBatch();
  LongColumnVector x = (LongColumnVector) batch.cols[0];
  for (int i = 0; i < values.length; ++i) {
    int row = batch.size++;
    x.vector[row] = values[i];
    // If the batch is full, write it out and start over.
    if (batch.size == batch.getMaxSize()) {
      writer.addRowBatch(batch);
      batch.reset();
    }
  }
  if (batch.size != 0) {
    writer.addRowBatch(batch);
    batch.reset();
  }
  writer.close();
  return file;
}

Source File: TestAvroToOrcRecordConverter.java From datacollector with Apache License 2.0

5 votes

@Test
public void recordConversion() throws IOException {
  Path outputFilePath = new Path(createTempFile());

  Schema.Parser schemaParser = new Schema.Parser();
  Schema schema = schemaParser.parse(
      "{\"type\": \"record\", \"name\": \"MyRecord\", \"fields\": [{\"name\": \"first\", \"type\": \"int\"},{" +
          "\"name\": \"second\", \"type\": {\"type\": \"record\", \"name\": \"MySubRecord\", \"fields\":" +
          " [{\"name\": \"sub1\", \"type\": \"string\"}, {\"name\": \"sub2\", \"type\": \"int\"}] } }, {\"name\":" +
          " \"somedate\", \"type\": { \"type\" : \"int\", \"logicalType\": \"date\"} } ]}"
  );

  TypeDescription orcSchema = AvroToOrcSchemaConverter.getOrcSchema(schema);

  Writer orcWriter = AvroToOrcRecordConverter.createOrcWriter(
      new Properties(),
      new Configuration(),
      outputFilePath,
      orcSchema
  );

  GenericRecord avroRecord = new GenericData.Record(schema);
  avroRecord.put("first", 1);
  avroRecord.put("somedate", 17535);

  GenericData.Record subRecord = new GenericData.Record(schema.getField("second").schema());
  subRecord.put("sub1", new Utf8("value1"));
  subRecord.put("sub2", 42);

  avroRecord.put("second", subRecord);

  VectorizedRowBatch batch = orcSchema.createRowBatch();

  AvroToOrcRecordConverter.addAvroRecord(batch, avroRecord, orcSchema, 1000, orcWriter);
  orcWriter.addRowBatch(batch);
  batch.reset();
  orcWriter.close();

  // TODO: add code to read the ORC file and validate the contents
}

Source File: TestAvroToOrcRecordConverter.java From datacollector with Apache License 2.0

4 votes

@Test
public void unionTypeConversions() throws IOException {
  final Path outputFilePath = new Path(createTempFile());

  final Schema.Parser schemaParser = new Schema.Parser();
  final Schema schema = schemaParser.parse(TestAvroToOrcRecordConverter.class.getResourceAsStream("avro_union_types.json"));

  final TypeDescription orcSchema = AvroToOrcSchemaConverter.getOrcSchema(schema);

  final Writer orcWriter = AvroToOrcRecordConverter.createOrcWriter(
      new Properties(),
      new Configuration(),
      outputFilePath,
      orcSchema
  );

  final GenericRecord avroRecord1 = new GenericData.Record(schema);
  avroRecord1.put("nullableInteger", 87);
  avroRecord1.put("integerOrString", "someString");
  avroRecord1.put("nullableStringOrInteger", "nonNullString");
  avroRecord1.put("justLong", 57844942331l);

  final GenericRecord avroRecord2 = new GenericData.Record(schema);
  avroRecord2.put("nullableInteger", null);
  avroRecord2.put("integerOrString", 16);
  avroRecord2.put("nullableStringOrInteger", null);
  avroRecord2.put("justLong", 758934l);

  final VectorizedRowBatch batch = orcSchema.createRowBatch();

  AvroToOrcRecordConverter.addAvroRecord(batch, avroRecord1, orcSchema, 1000, orcWriter);
  AvroToOrcRecordConverter.addAvroRecord(batch, avroRecord2, orcSchema, 1000, orcWriter);
  orcWriter.addRowBatch(batch);
  batch.reset();
  orcWriter.close();

  try (OrcToSdcRecordConverter sdcRecordConverter = new OrcToSdcRecordConverter(outputFilePath)) {

    final Record record1 = RecordCreator.create();
    boolean populated = sdcRecordConverter.populateRecord(record1);
    assertThat(populated, equalTo(true));
    assertSdcRecordMatchesAvro(record1, avroRecord1, null);

    final Record record2 = RecordCreator.create();
    populated = sdcRecordConverter.populateRecord(record2);
    assertThat(populated, equalTo(true));
    assertSdcRecordMatchesAvro(
        record2,
        avroRecord2,
        ImmutableMap.<String, Matcher<Field>>builder()
            .put("nullableInteger", Matchers.intFieldWithNullValue())
            .put("nullableStringOrInteger", Matchers.stringFieldWithNullValue())
            .build()
    );
  }
}

Source File: OrcColumnarRowSplitReaderTest.java From flink with Apache License 2.0

4 votes

protected void prepareReadFileWithTypes(String file, int rowSize) throws IOException {
	// NOTE: orc has field name information, so name should be same as orc
	TypeDescription schema =
			TypeDescription.fromString(
					"struct<" +
							"f0:float," +
							"f1:double," +
							"f2:timestamp," +
							"f3:tinyint," +
							"f4:smallint" +
							">");

	org.apache.hadoop.fs.Path filePath = new org.apache.hadoop.fs.Path(file);
	Configuration conf = new Configuration();

	Writer writer =
			OrcFile.createWriter(filePath,
					OrcFile.writerOptions(conf).setSchema(schema));

	VectorizedRowBatch batch = schema.createRowBatch(rowSize);
	DoubleColumnVector col0 = (DoubleColumnVector) batch.cols[0];
	DoubleColumnVector col1 = (DoubleColumnVector) batch.cols[1];
	TimestampColumnVector col2 = (TimestampColumnVector) batch.cols[2];
	LongColumnVector col3 = (LongColumnVector) batch.cols[3];
	LongColumnVector col4 = (LongColumnVector) batch.cols[4];

	col0.noNulls = false;
	col1.noNulls = false;
	col2.noNulls = false;
	col3.noNulls = false;
	col4.noNulls = false;
	for (int i = 0; i < rowSize - 1; i++) {
		col0.vector[i] = i;
		col1.vector[i] = i;

		Timestamp timestamp = toTimestamp(i);
		col2.time[i] = timestamp.getTime();
		col2.nanos[i] = timestamp.getNanos();

		col3.vector[i] = i;
		col4.vector[i] = i;
	}

	col0.isNull[rowSize - 1] = true;
	col1.isNull[rowSize - 1] = true;
	col2.isNull[rowSize - 1] = true;
	col3.isNull[rowSize - 1] = true;
	col4.isNull[rowSize - 1] = true;

	batch.size = rowSize;
	writer.addRowBatch(batch);
	batch.reset();
	writer.close();
}

Source File: ORCRecordExtractorTest.java From incubator-pinot with Apache License 2.0

4 votes

/**
 * Create an ORC input file using the input records
 */
@Override
protected void createInputFile()
    throws IOException {
  TypeDescription schema = TypeDescription.fromString(
      "struct<user_id:int,firstName:string,lastName:string,bids:array<int>,campaignInfo:string,cost:double,timestamp:bigint>");
  Writer writer = OrcFile.createWriter(new Path(_dataFile.getAbsolutePath()),
      OrcFile.writerOptions(new Configuration()).setSchema(schema));

  int numRecords = _inputRecords.size();
  VectorizedRowBatch rowBatch = schema.createRowBatch(numRecords);
  LongColumnVector userIdVector = (LongColumnVector) rowBatch.cols[0];
  userIdVector.noNulls = false;
  BytesColumnVector firstNameVector = (BytesColumnVector) rowBatch.cols[1];
  firstNameVector.noNulls = false;
  BytesColumnVector lastNameVector = (BytesColumnVector) rowBatch.cols[2];
  ListColumnVector bidsVector = (ListColumnVector) rowBatch.cols[3];
  bidsVector.noNulls = false;
  LongColumnVector bidsElementVector = (LongColumnVector) bidsVector.child;
  bidsElementVector.ensureSize(6, false);
  BytesColumnVector campaignInfoVector = (BytesColumnVector) rowBatch.cols[4];
  DoubleColumnVector costVector = (DoubleColumnVector) rowBatch.cols[5];
  LongColumnVector timestampVector = (LongColumnVector) rowBatch.cols[6];

  for (int i = 0; i < numRecords; i++) {
    Map<String, Object> record = _inputRecords.get(i);

    Integer userId = (Integer) record.get("user_id");
    if (userId != null) {
      userIdVector.vector[i] = userId;
    } else {
      userIdVector.isNull[i] = true;
    }
    String firstName = (String) record.get("firstName");
    if (firstName != null) {
      firstNameVector.setVal(i, StringUtils.encodeUtf8(firstName));
    } else {
      firstNameVector.isNull[i] = true;
    }
    lastNameVector.setVal(i, StringUtils.encodeUtf8((String) record.get("lastName")));
    List<Integer> bids = (List<Integer>) record.get("bids");
    if (bids != null) {
      bidsVector.offsets[i] = bidsVector.childCount;
      bidsVector.lengths[i] = bids.size();
      for (int bid : bids) {
        bidsElementVector.vector[bidsVector.childCount++] = bid;
      }
    } else {
      bidsVector.isNull[i] = true;
    }
    campaignInfoVector.setVal(i, StringUtils.encodeUtf8((String) record.get("campaignInfo")));
    costVector.vector[i] = (double) record.get("cost");
    timestampVector.vector[i] = (long) record.get("timestamp");

    rowBatch.size++;
  }

  writer.addRowBatch(rowBatch);
  rowBatch.reset();
  writer.close();
}

Java Code Examples for org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch#reset()