Java Code Examples for org.apache.orc.TypeDescription#createRowBatch()

The following examples show how to use org.apache.orc.TypeDescription#createRowBatch(). They are taken from open source projects; the source file and originating project are listed above each example.
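Before the project examples, here is a minimal, self-contained sketch of the same write pattern. It is an illustration only, not drawn from any of the projects below: the output path /tmp/example.orc and the field name x are invented for the example. The no-argument createRowBatch() uses the default capacity (1024 rows); the createRowBatch(int) overload, used in several examples below, lets the caller choose a different size.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

public class CreateRowBatchSketch {
  public static void main(String[] args) throws Exception {
    // Schema with a single BIGINT column; the field name "x" is arbitrary.
    TypeDescription schema = TypeDescription.fromString("struct<x:bigint>");

    Writer writer = OrcFile.createWriter(
        new Path("/tmp/example.orc"),  // hypothetical output path
        OrcFile.writerOptions(new Configuration()).setSchema(schema));

    // createRowBatch() allocates a batch sized to the default of 1024 rows.
    VectorizedRowBatch batch = schema.createRowBatch();
    LongColumnVector x = (LongColumnVector) batch.cols[0];

    for (long value = 0; value < 10_000; value++) {
      int row = batch.size++;
      x.vector[row] = value;
      // When the batch is full, flush it to the writer and start over.
      if (batch.size == batch.getMaxSize()) {
        writer.addRowBatch(batch);
        batch.reset();
      }
    }
    // Flush the final partial batch, if any.
    if (batch.size != 0) {
      writer.addRowBatch(batch);
      batch.reset();
    }
    writer.close();
  }
}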
Example 1
Source File: OrcFileAppender.java    From iceberg with Apache License 2.0
OrcFileAppender(Schema schema, OutputFile file,
                Function<TypeDescription, OrcValueWriter<?>> createWriterFunc,
                Configuration conf, Map<String, byte[]> metadata,
                int batchSize) {
  this.conf = conf;
  this.file = file;
  this.batchSize = batchSize;
  this.schema = schema;

  TypeDescription orcSchema = ORCSchemaUtil.convert(this.schema);
  this.batch = orcSchema.createRowBatch(this.batchSize);

  OrcFile.WriterOptions options = OrcFile.writerOptions(conf).useUTCTimestamp(true);
  if (file instanceof HadoopOutputFile) {
    options.fileSystem(((HadoopOutputFile) file).getFileSystem());
  }
  options.setSchema(orcSchema);
  this.writer = newOrcWriter(file, options, metadata);
  this.valueWriter = newOrcValueWriter(orcSchema, createWriterFunc);
}
 
Example 2
Source File: PentahoOrcRecordWriter.java    From pentaho-hadoop-shims with Apache License 2.0
public PentahoOrcRecordWriter( List<? extends IOrcOutputField> fields, TypeDescription schema, String filePath,
                               Configuration conf ) {
  this.fields = fields;
  this.schema = schema;
  final AtomicInteger fieldNumber = new AtomicInteger();  //Mutable field count
  fields.forEach( field -> setOutputMeta( fieldNumber, field ) );
  outputRowMetaAndData = new RowMetaAndData( outputRowMeta, new Object[ fieldNumber.get() ] );

  try {
    S3NCredentialUtils.applyS3CredentialsToHadoopConfigurationIfNecessary( filePath, conf );
    Path outputFile = new Path( S3NCredentialUtils.scrubFilePathIfNecessary( filePath ) );
    writer = OrcFile.createWriter( outputFile,
      OrcFile.writerOptions( conf )
        .setSchema( schema ) );
    batch = schema.createRowBatch();
  } catch ( IOException e ) {
    logger.error( e );
  }

  //Write the addition metadata for the fields
  // new OrcMetaDataWriter( writer ).write( fields );
}
 
Example 3
Source File: SqlInterpreterTest.java    From zeppelin with Apache License 2.0
public File createORCFile(int[] values) throws IOException {
  File file = File.createTempFile("zeppelin-flink-input", ".orc");
  file.delete();
  Path path = new Path(file.getAbsolutePath());
  Configuration conf = new Configuration();
  conf.set("orc.compress", "snappy");
  TypeDescription schema = TypeDescription.fromString("struct<msg:int>");
  Writer writer = OrcFile.createWriter(path,
          OrcFile.writerOptions(conf)
                  .setSchema(schema));
  VectorizedRowBatch batch = schema.createRowBatch();
  LongColumnVector x = (LongColumnVector) batch.cols[0];
  for (int i = 0; i < values.length; ++i) {
    int row = batch.size++;
    x.vector[row] = values[i];
    // If the batch is full, write it out and start over.
    if (batch.size == batch.getMaxSize()) {
      writer.addRowBatch(batch);
      batch.reset();
    }
  }
  if (batch.size != 0) {
    writer.addRowBatch(batch);
    batch.reset();
  }
  writer.close();
  return file;
}
 
Example 4
Source File: TestAvroToOrcRecordConverter.java    From datacollector with Apache License 2.0
@Test
public void recordConversion() throws IOException {
  Path outputFilePath = new Path(createTempFile());

  Schema.Parser schemaParser = new Schema.Parser();
  Schema schema = schemaParser.parse(
      "{\"type\": \"record\", \"name\": \"MyRecord\", \"fields\": [{\"name\": \"first\", \"type\": \"int\"},{" +
          "\"name\": \"second\", \"type\": {\"type\": \"record\", \"name\": \"MySubRecord\", \"fields\":" +
          " [{\"name\": \"sub1\", \"type\": \"string\"}, {\"name\": \"sub2\", \"type\": \"int\"}] } }, {\"name\":" +
          " \"somedate\", \"type\": { \"type\" : \"int\", \"logicalType\": \"date\"} } ]}"
  );

  TypeDescription orcSchema = AvroToOrcSchemaConverter.getOrcSchema(schema);

  Writer orcWriter = AvroToOrcRecordConverter.createOrcWriter(
      new Properties(),
      new Configuration(),
      outputFilePath,
      orcSchema
  );

  GenericRecord avroRecord = new GenericData.Record(schema);
  avroRecord.put("first", 1);
  avroRecord.put("somedate", 17535);

  GenericData.Record subRecord = new GenericData.Record(schema.getField("second").schema());
  subRecord.put("sub1", new Utf8("value1"));
  subRecord.put("sub2", 42);

  avroRecord.put("second", subRecord);

  VectorizedRowBatch batch = orcSchema.createRowBatch();

  AvroToOrcRecordConverter.addAvroRecord(batch, avroRecord, orcSchema, 1000, orcWriter);
  orcWriter.addRowBatch(batch);
  batch.reset();
  orcWriter.close();

  // TODO: add code to read the ORC file and validate the contents
}
 
Example 5
Source File: OrcNoHiveBulkWriterFactory.java    From flink with Apache License 2.0
@Override
public BulkWriter<RowData> create(FSDataOutputStream out) throws IOException {
	OrcFile.WriterOptions opts = OrcFile.writerOptions(new Properties(), conf);
	TypeDescription description = TypeDescription.fromString(schema);
	opts.setSchema(description);
	opts.physicalWriter(new PhysicalWriterImpl(out, opts));
	WriterImpl writer = new WriterImpl(null, new Path("."), opts);

	VectorizedRowBatch rowBatch = description.createRowBatch();
	return new BulkWriter<RowData>() {
		@Override
		public void addElement(RowData row) throws IOException {
			int rowId = rowBatch.size++;
			for (int i = 0; i < row.getArity(); ++i) {
				setColumn(rowId, rowBatch.cols[i], fieldTypes[i], row, i);
			}
			if (rowBatch.size == rowBatch.getMaxSize()) {
				writer.addRowBatch(rowBatch);
				rowBatch.reset();
			}
		}

		@Override
		public void flush() throws IOException {
			if (rowBatch.size != 0) {
				writer.addRowBatch(rowBatch);
				rowBatch.reset();
			}
		}

		@Override
		public void finish() throws IOException {
			flush();
			writer.close();
		}
	};
}
 
Example 6
Source File: VectorizedRowBatchIterator.java    From iceberg with Apache License 2.0
VectorizedRowBatchIterator(String fileLocation, TypeDescription schema, RecordReader rows) {
  this.fileLocation = fileLocation;
  this.rows = rows;
  this.batch = schema.createRowBatch();
}
 
Example 7
Source File: OrcIterator.java    From iceberg with Apache License 2.0
OrcIterator(Path filename, TypeDescription schema, RecordReader rows) {
  this.filename = filename;
  this.rows = rows;
  this.batch = schema.createRowBatch();
}
 
Example 8
Source File: SparkOrcWriter.java    From iceberg with Apache License 2.0
public SparkOrcWriter(OrcFileAppender writer) {
  TypeDescription schema = writer.getSchema();
  batch = schema.createRowBatch(BATCH_SIZE);
  this.writer = writer;
  converters = buildConverters(schema);
}
 
Example 9
Source File: TestAvroToOrcRecordConverter.java    From datacollector with Apache License 2.0
@Test
public void unionTypeConversions() throws IOException {
  final Path outputFilePath = new Path(createTempFile());

  final Schema.Parser schemaParser = new Schema.Parser();
  final Schema schema = schemaParser.parse(TestAvroToOrcRecordConverter.class.getResourceAsStream("avro_union_types.json"));

  final TypeDescription orcSchema = AvroToOrcSchemaConverter.getOrcSchema(schema);

  final Writer orcWriter = AvroToOrcRecordConverter.createOrcWriter(
      new Properties(),
      new Configuration(),
      outputFilePath,
      orcSchema
  );

  final GenericRecord avroRecord1 = new GenericData.Record(schema);
  avroRecord1.put("nullableInteger", 87);
  avroRecord1.put("integerOrString", "someString");
  avroRecord1.put("nullableStringOrInteger", "nonNullString");
  avroRecord1.put("justLong", 57844942331L);

  final GenericRecord avroRecord2 = new GenericData.Record(schema);
  avroRecord2.put("nullableInteger", null);
  avroRecord2.put("integerOrString", 16);
  avroRecord2.put("nullableStringOrInteger", null);
  avroRecord2.put("justLong", 758934L);

  final VectorizedRowBatch batch = orcSchema.createRowBatch();

  AvroToOrcRecordConverter.addAvroRecord(batch, avroRecord1, orcSchema, 1000, orcWriter);
  AvroToOrcRecordConverter.addAvroRecord(batch, avroRecord2, orcSchema, 1000, orcWriter);
  orcWriter.addRowBatch(batch);
  batch.reset();
  orcWriter.close();

  try (OrcToSdcRecordConverter sdcRecordConverter = new OrcToSdcRecordConverter(outputFilePath)) {

    final Record record1 = RecordCreator.create();
    boolean populated = sdcRecordConverter.populateRecord(record1);
    assertThat(populated, equalTo(true));
    assertSdcRecordMatchesAvro(record1, avroRecord1, null);

    final Record record2 = RecordCreator.create();
    populated = sdcRecordConverter.populateRecord(record2);
    assertThat(populated, equalTo(true));
    assertSdcRecordMatchesAvro(
        record2,
        avroRecord2,
        ImmutableMap.<String, Matcher<Field>>builder()
            .put("nullableInteger", Matchers.intFieldWithNullValue())
            .put("nullableStringOrInteger", Matchers.stringFieldWithNullValue())
            .build()
    );
  }
}
 
Example 10
Source File: OrcNoHiveShim.java    From flink with Apache License 2.0
@Override
public OrcNoHiveBatchWrapper createBatchWrapper(TypeDescription schema, int batchSize) {
	return new OrcNoHiveBatchWrapper(schema.createRowBatch(batchSize));
}
 
Example 11
Source File: OrcColumnarRowSplitReaderNoHiveTest.java    From flink with Apache License 2.0
@Override
protected void prepareReadFileWithTypes(String file, int rowSize) throws IOException {
	// NOTE: orc has field name information, so name should be same as orc
	TypeDescription schema =
			TypeDescription.fromString(
					"struct<" +
							"f0:float," +
							"f1:double," +
							"f2:timestamp," +
							"f3:tinyint," +
							"f4:smallint" +
							">");

	org.apache.hadoop.fs.Path filePath = new org.apache.hadoop.fs.Path(file);
	Configuration conf = new Configuration();

	Writer writer =
			OrcFile.createWriter(filePath,
					OrcFile.writerOptions(conf).setSchema(schema));

	VectorizedRowBatch batch = schema.createRowBatch(rowSize);
	DoubleColumnVector col0 = (DoubleColumnVector) batch.cols[0];
	DoubleColumnVector col1 = (DoubleColumnVector) batch.cols[1];
	TimestampColumnVector col2 = (TimestampColumnVector) batch.cols[2];
	LongColumnVector col3 = (LongColumnVector) batch.cols[3];
	LongColumnVector col4 = (LongColumnVector) batch.cols[4];

	col0.noNulls = false;
	col1.noNulls = false;
	col2.noNulls = false;
	col3.noNulls = false;
	col4.noNulls = false;
	for (int i = 0; i < rowSize - 1; i++) {
		col0.vector[i] = i;
		col1.vector[i] = i;

		Timestamp timestamp = toTimestamp(i);
		col2.time[i] = timestamp.getTime();
		col2.nanos[i] = timestamp.getNanos();

		col3.vector[i] = i;
		col4.vector[i] = i;
	}

	col0.isNull[rowSize - 1] = true;
	col1.isNull[rowSize - 1] = true;
	col2.isNull[rowSize - 1] = true;
	col3.isNull[rowSize - 1] = true;
	col4.isNull[rowSize - 1] = true;

	batch.size = rowSize;
	writer.addRowBatch(batch);
	batch.reset();
	writer.close();
}
 
Example 12
Source File: OrcShimV200.java    From flink with Apache License 2.0
@Override
public HiveOrcBatchWrapper createBatchWrapper(TypeDescription schema, int batchSize) {
	return new HiveOrcBatchWrapper(schema.createRowBatch(batchSize));
}
 
Example 13
Source File: OrcColumnarRowSplitReaderTest.java    From flink with Apache License 2.0
protected void prepareReadFileWithTypes(String file, int rowSize) throws IOException {
	// NOTE: orc has field name information, so name should be same as orc
	TypeDescription schema =
			TypeDescription.fromString(
					"struct<" +
							"f0:float," +
							"f1:double," +
							"f2:timestamp," +
							"f3:tinyint," +
							"f4:smallint" +
							">");

	org.apache.hadoop.fs.Path filePath = new org.apache.hadoop.fs.Path(file);
	Configuration conf = new Configuration();

	Writer writer =
			OrcFile.createWriter(filePath,
					OrcFile.writerOptions(conf).setSchema(schema));

	VectorizedRowBatch batch = schema.createRowBatch(rowSize);
	DoubleColumnVector col0 = (DoubleColumnVector) batch.cols[0];
	DoubleColumnVector col1 = (DoubleColumnVector) batch.cols[1];
	TimestampColumnVector col2 = (TimestampColumnVector) batch.cols[2];
	LongColumnVector col3 = (LongColumnVector) batch.cols[3];
	LongColumnVector col4 = (LongColumnVector) batch.cols[4];

	col0.noNulls = false;
	col1.noNulls = false;
	col2.noNulls = false;
	col3.noNulls = false;
	col4.noNulls = false;
	for (int i = 0; i < rowSize - 1; i++) {
		col0.vector[i] = i;
		col1.vector[i] = i;

		Timestamp timestamp = toTimestamp(i);
		col2.time[i] = timestamp.getTime();
		col2.nanos[i] = timestamp.getNanos();

		col3.vector[i] = i;
		col4.vector[i] = i;
	}

	col0.isNull[rowSize - 1] = true;
	col1.isNull[rowSize - 1] = true;
	col2.isNull[rowSize - 1] = true;
	col3.isNull[rowSize - 1] = true;
	col4.isNull[rowSize - 1] = true;

	batch.size = rowSize;
	writer.addRowBatch(batch);
	batch.reset();
	writer.close();
}
 
Example 14
Source File: ORCRecordExtractorTest.java    From incubator-pinot with Apache License 2.0
/**
 * Create an ORC input file using the input records
 */
@Override
protected void createInputFile()
    throws IOException {
  TypeDescription schema = TypeDescription.fromString(
      "struct<user_id:int,firstName:string,lastName:string,bids:array<int>,campaignInfo:string,cost:double,timestamp:bigint>");
  Writer writer = OrcFile.createWriter(new Path(_dataFile.getAbsolutePath()),
      OrcFile.writerOptions(new Configuration()).setSchema(schema));

  int numRecords = _inputRecords.size();
  VectorizedRowBatch rowBatch = schema.createRowBatch(numRecords);
  LongColumnVector userIdVector = (LongColumnVector) rowBatch.cols[0];
  userIdVector.noNulls = false;
  BytesColumnVector firstNameVector = (BytesColumnVector) rowBatch.cols[1];
  firstNameVector.noNulls = false;
  BytesColumnVector lastNameVector = (BytesColumnVector) rowBatch.cols[2];
  ListColumnVector bidsVector = (ListColumnVector) rowBatch.cols[3];
  bidsVector.noNulls = false;
  LongColumnVector bidsElementVector = (LongColumnVector) bidsVector.child;
  bidsElementVector.ensureSize(6, false);
  BytesColumnVector campaignInfoVector = (BytesColumnVector) rowBatch.cols[4];
  DoubleColumnVector costVector = (DoubleColumnVector) rowBatch.cols[5];
  LongColumnVector timestampVector = (LongColumnVector) rowBatch.cols[6];

  for (int i = 0; i < numRecords; i++) {
    Map<String, Object> record = _inputRecords.get(i);

    Integer userId = (Integer) record.get("user_id");
    if (userId != null) {
      userIdVector.vector[i] = userId;
    } else {
      userIdVector.isNull[i] = true;
    }
    String firstName = (String) record.get("firstName");
    if (firstName != null) {
      firstNameVector.setVal(i, StringUtils.encodeUtf8(firstName));
    } else {
      firstNameVector.isNull[i] = true;
    }
    lastNameVector.setVal(i, StringUtils.encodeUtf8((String) record.get("lastName")));
    List<Integer> bids = (List<Integer>) record.get("bids");
    if (bids != null) {
      bidsVector.offsets[i] = bidsVector.childCount;
      bidsVector.lengths[i] = bids.size();
      for (int bid : bids) {
        bidsElementVector.vector[bidsVector.childCount++] = bid;
      }
    } else {
      bidsVector.isNull[i] = true;
    }
    campaignInfoVector.setVal(i, StringUtils.encodeUtf8((String) record.get("campaignInfo")));
    costVector.vector[i] = (double) record.get("cost");
    timestampVector.vector[i] = (long) record.get("timestamp");

    rowBatch.size++;
  }

  writer.addRowBatch(rowBatch);
  rowBatch.reset();
  writer.close();
}