org.apache.orc.Writer Java Examples

The following examples show how to use org.apache.orc.Writer. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: AvroToOrcRecordConverter.java    From datacollector with Apache License 2.0 6 votes vote down vote up
/**
 * Copies one Avro record into the next free row of an ORC row batch and flushes the batch when due.
 *
 * <p>For every column of {@code batch}, the ORC field name and child type at the same position in
 * {@code orcSchema} are looked up, the matching Avro field is read from {@code record}, and the value
 * is handed to {@code addToVector} together with the current row index. After the row is added the
 * batch is written to {@code writer} and reset when its size is a multiple of {@code orcBatchSize}
 * or equals {@code batch.getMaxSize()}.
 *
 * @param batch the row batch being filled; modified in place
 * @param record the Avro record supplying the values
 * @param orcSchema the ORC schema whose field names are used to look up Avro fields
 * @param orcBatchSize row count at which the batch is flushed to the writer
 * @param writer the ORC writer that receives flushed batches
 * @throws IOException if flushing the batch to the writer fails
 */
public static void addAvroRecord(
    VectorizedRowBatch batch,
    GenericRecord record,
    TypeDescription orcSchema,
    int orcBatchSize,
    Writer writer
) throws IOException {

  // Every column of this record is written at the same row position.
  final int rowIndex = batch.size;

  for (int col = 0; col < batch.numCols; col++) {
    final String fieldName = orcSchema.getFieldNames().get(col);
    final TypeDescription fieldType = orcSchema.getChildren().get(col);
    final Schema.Field avroField = record.getSchema().getField(fieldName);

    addToVector(fieldType, batch.cols[col], avroField.schema(), record.get(fieldName), rowIndex);
  }

  batch.size++;

  final boolean flushDue = batch.size % orcBatchSize == 0 || batch.size == batch.getMaxSize();
  if (!flushDue) {
    return;
  }

  writer.addRowBatch(batch);
  batch.reset();
  batch.size = 0;
}
 
Example #2
Source File: OrcMetrics.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a {@code Metrics} instance from the row count, schema and statistics reported by a writer.
 *
 * @param writer the ORC writer to query
 * @return the result of {@code buildOrcMetrics} for the writer's row count, schema and statistics
 * @throws RuntimeIOException if querying the writer raises an {@link IOException}
 */
static Metrics fromWriter(Writer writer) {
  try {
    final long rowCount = writer.getNumberOfRows();
    return buildOrcMetrics(rowCount, writer.getSchema(), writer.getStatistics());
  } catch (IOException ioe) {
    throw new RuntimeIOException(ioe, "Failed to get statistics from writer");
  }
}
 
Example #3
Source File: OrcFileAppender.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Opens an ORC writer at the location of {@code file} and attaches the given user metadata to it.
 *
 * @param file the output file whose {@code location()} becomes the ORC path
 * @param options the writer options passed to {@code OrcFile.createWriter}
 * @param metadata user metadata entries; each value is wrapped in a {@link ByteBuffer} and added to the writer
 * @return the newly created writer
 * @throws RuntimeIOException if the writer cannot be created
 */
private static Writer newOrcWriter(OutputFile file,
                                   OrcFile.WriterOptions options, Map<String, byte[]> metadata) {
  final Path locPath = new Path(file.location());

  final Writer orcWriter;
  try {
    orcWriter = OrcFile.createWriter(locPath, options);
  } catch (IOException ioe) {
    throw new RuntimeIOException(ioe, "Can't create file " + locPath);
  }

  for (Map.Entry<String, byte[]> entry : metadata.entrySet()) {
    orcWriter.addUserMetadata(entry.getKey(), ByteBuffer.wrap(entry.getValue()));
  }

  return orcWriter;
}
 
Example #4
Source File: SqlInterpreterTest.java    From zeppelin with Apache License 2.0 5 votes vote down vote up
/**
 * Writes the given values into a new ORC file with schema {@code struct<msg:int>}.
 *
 * <p>The file is created under a fresh temporary path with {@code orc.compress} set to
 * {@code snappy}. Values are appended one row at a time; a batch is flushed whenever it
 * reaches its maximum size and once more at the end for any remaining rows.
 *
 * @param values the values to store in the {@code msg} column, one per row
 * @return the ORC file that was written
 * @throws IOException if the temporary file or the ORC file cannot be written
 */
public File createORCFile(int[] values) throws IOException {
  File file = File.createTempFile("zeppelin-flink-input", ".orc");
  // Only the unique path is kept; the empty placeholder is removed before the writer is created.
  // NOTE(review): presumably because the ORC writer will not overwrite an existing file - confirm.
  file.delete();
  Path path = new Path(file.getAbsolutePath());
  Configuration conf = new Configuration();
  conf.set("orc.compress", "snappy");
  TypeDescription schema = TypeDescription.fromString("struct<msg:int>");
  Writer writer = OrcFile.createWriter(path,
          OrcFile.writerOptions(conf)
                  .setSchema(schema));
  // Close the writer even when a batch fails to write, so it is not leaked.
  try {
    VectorizedRowBatch batch = schema.createRowBatch();
    LongColumnVector x = (LongColumnVector) batch.cols[0];
    for (int value : values) {
      int row = batch.size++;
      x.vector[row] = value;
      // If the batch is full, write it out and start over.
      if (batch.size == batch.getMaxSize()) {
        writer.addRowBatch(batch);
        batch.reset();
      }
    }
    // Flush the rows left over in a partially filled batch.
    if (batch.size != 0) {
      writer.addRowBatch(batch);
      batch.reset();
    }
  } finally {
    writer.close();
  }
  return file;
}
 
Example #5
Source File: AvroToOrcRecordConverter.java    From datacollector with Apache License 2.0 5 votes vote down vote up
/**
 * Creates an ORC writer for the given output file and schema.
 *
 * @param orcWriterProperties properties passed to {@code OrcFile.writerOptions}
 * @param configuration Hadoop configuration passed to {@code OrcFile.writerOptions}
 * @param orcOutputFile path of the ORC file to create
 * @param orcSchema schema set on the writer options
 * @return the writer returned by {@code OrcFile.createWriter}
 * @throws IOException if the writer cannot be created
 */
public static Writer createOrcWriter(Properties orcWriterProperties, Configuration configuration, Path orcOutputFile, TypeDescription orcSchema) throws IOException {
  // The {} placeholder defers formatting of the path until debug logging is enabled,
  // so no explicit isDebugEnabled() guard or toString() call is needed.
  LOG.debug("Creating ORC writer at: {}", orcOutputFile);
  return OrcFile.createWriter(
      orcOutputFile,
      OrcFile.writerOptions(orcWriterProperties, configuration).setSchema(orcSchema)
  );
}
 
Example #6
Source File: TestAvroToOrcRecordConverter.java    From datacollector with Apache License 2.0 5 votes vote down vote up
/**
 * Converts one Avro record (an int, a nested record and a date-typed int) to ORC and writes it out.
 * The written file is not read back or verified yet.
 */
@Test
public void recordConversion() throws IOException {
  final Path orcFile = new Path(createTempFile());

  final Schema avroSchema = new Schema.Parser().parse(
      "{\"type\": \"record\", \"name\": \"MyRecord\", \"fields\": [{\"name\": \"first\", \"type\": \"int\"},{" +
          "\"name\": \"second\", \"type\": {\"type\": \"record\", \"name\": \"MySubRecord\", \"fields\":" +
          " [{\"name\": \"sub1\", \"type\": \"string\"}, {\"name\": \"sub2\", \"type\": \"int\"}] } }, {\"name\":" +
          " \"somedate\", \"type\": { \"type\" : \"int\", \"logicalType\": \"date\"} } ]}"
  );
  final TypeDescription orcSchema = AvroToOrcSchemaConverter.getOrcSchema(avroSchema);

  final Writer writer = AvroToOrcRecordConverter.createOrcWriter(
      new Properties(), new Configuration(), orcFile, orcSchema);

  // Build the nested record first, then the top-level record that embeds it.
  final GenericData.Record nested = new GenericData.Record(avroSchema.getField("second").schema());
  nested.put("sub1", new Utf8("value1"));
  nested.put("sub2", 42);

  final GenericRecord topLevel = new GenericData.Record(avroSchema);
  topLevel.put("first", 1);
  topLevel.put("second", nested);
  topLevel.put("somedate", 17535);

  final VectorizedRowBatch rowBatch = orcSchema.createRowBatch();
  AvroToOrcRecordConverter.addAvroRecord(rowBatch, topLevel, orcSchema, 1000, writer);

  // Flush the batch explicitly before closing the writer.
  writer.addRowBatch(rowBatch);
  rowBatch.reset();
  writer.close();

  // TODO: add code to read the ORC file and validate the contents
}
 
Example #7
Source File: OrcBulkWriter.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a bulk writer around the given vectorizer and ORC writer.
 *
 * <p>Both arguments are checked with {@code checkNotNull}; the row batch is created from the
 * vectorizer's schema.
 *
 * @param vectorizer the vectorizer whose schema defines the row batch
 * @param writer the underlying ORC writer
 */
OrcBulkWriter(Vectorizer<T> vectorizer, Writer writer) {
	checkNotNull(vectorizer);
	checkNotNull(writer);

	this.vectorizer = vectorizer;
	this.writer = writer;
	this.rowBatch = vectorizer.getSchema().createRowBatch();

	// Hand the writer to the vectorizer so that users can add metadata
	// on the fly through the Vectorizer#vectorize(...) method.
	vectorizer.setWriter(writer);
}
 
Example #8
Source File: OrcKeyCompactorOutputFormat.java    From incubator-gobblin with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the ORC record writer with a configurable output file extension.
 *
 * <p>Overridden because the super method hard-codes the file extension as ".orc"; here the
 * extension is read from {@code COMPACTION_OUTPUT_EXTENSION}, falling back to "orc".
 *
 * @param taskAttemptContext the source of the configuration that determines the file extension
 * @return the {@link RecordWriter} that writes out ORC objects
 * @throws IOException if the ORC writer cannot be created
 */
@Override
public RecordWriter getRecordWriter(TaskAttemptContext taskAttemptContext) throws IOException {
  final Configuration jobConf = taskAttemptContext.getConfiguration();
  final Path workFile =
      getDefaultWorkFile(taskAttemptContext, "." + jobConf.get(COMPACTION_OUTPUT_EXTENSION, "orc"));

  return new OrcMapreduceRecordWriter(
      OrcFile.createWriter(workFile, org.apache.orc.mapred.OrcOutputFormat.buildOptions(jobConf)));
}
 
Example #9
Source File: OrcCompactionTaskTest.java    From incubator-gobblin with Apache License 2.0 5 votes vote down vote up
/**
 * Writes the given ORC structs to a new ORC file at {@code path}.
 *
 * @param path location of the ORC file to create
 * @param schema schema set on the writer options
 * @param orcStructs the records to write, in order
 * @throws Exception if creating the writer, writing a record or closing the writer fails
 */
public void writeOrcRecordsInFile(Path path, TypeDescription schema, List<OrcStruct> orcStructs) throws Exception {
  Configuration configuration = new Configuration();
  OrcFile.WriterOptions options = OrcFile.writerOptions(configuration).setSchema(schema);

  Writer writer = OrcFile.createWriter(path, options);
  OrcMapreduceRecordWriter recordWriter = new OrcMapreduceRecordWriter(writer);
  // Close the record writer even if a write fails, so it is not leaked.
  try {
    for (OrcStruct orcRecord : orcStructs) {
      recordWriter.write(NullWritable.get(), orcRecord);
    }
  } finally {
    recordWriter.close(new TaskAttemptContextImpl(configuration, new TaskAttemptID()));
  }
}
 
Example #10
Source File: OrcWriter.java    From osm2orc with ISC License 4 votes vote down vote up
/**
 * Stores the given ORC writer and row batch in this processor's fields.
 *
 * @param writer the ORC writer to keep
 * @param batch the row batch to keep
 */
OrcEntityProcessor(Writer writer, VectorizedRowBatch batch) {
    this.batch = batch;
    this.writer = writer;
}
 
Example #11
Source File: TestAvroToOrcRecordConverter.java    From datacollector with Apache License 2.0 4 votes vote down vote up
/**
 * Writes two Avro records based on the {@code avro_union_types.json} schema to an ORC file, reads
 * them back through {@code OrcToSdcRecordConverter} and checks that each matches its Avro source.
 * The second record carries null values for the two nullable fields.
 */
@Test
public void unionTypeConversions() throws IOException {
  final Path outputFilePath = new Path(createTempFile());

  final Schema.Parser schemaParser = new Schema.Parser();
  final Schema schema = schemaParser.parse(TestAvroToOrcRecordConverter.class.getResourceAsStream("avro_union_types.json"));

  final TypeDescription orcSchema = AvroToOrcSchemaConverter.getOrcSchema(schema);

  final Writer orcWriter = AvroToOrcRecordConverter.createOrcWriter(
      new Properties(),
      new Configuration(),
      outputFilePath,
      orcSchema
  );

  // Long literals use the upper-case 'L' suffix; a lower-case 'l' is easily misread as the digit 1.
  final GenericRecord avroRecord1 = new GenericData.Record(schema);
  avroRecord1.put("nullableInteger", 87);
  avroRecord1.put("integerOrString", "someString");
  avroRecord1.put("nullableStringOrInteger", "nonNullString");
  avroRecord1.put("justLong", 57844942331L);

  final GenericRecord avroRecord2 = new GenericData.Record(schema);
  avroRecord2.put("nullableInteger", null);
  avroRecord2.put("integerOrString", 16);
  avroRecord2.put("nullableStringOrInteger", null);
  avroRecord2.put("justLong", 758934L);

  final VectorizedRowBatch batch = orcSchema.createRowBatch();

  AvroToOrcRecordConverter.addAvroRecord(batch, avroRecord1, orcSchema, 1000, orcWriter);
  AvroToOrcRecordConverter.addAvroRecord(batch, avroRecord2, orcSchema, 1000, orcWriter);
  // Flush the batch explicitly before closing the writer.
  orcWriter.addRowBatch(batch);
  batch.reset();
  orcWriter.close();

  try (OrcToSdcRecordConverter sdcRecordConverter = new OrcToSdcRecordConverter(outputFilePath)) {

    final Record record1 = RecordCreator.create();
    boolean populated = sdcRecordConverter.populateRecord(record1);
    assertThat(populated, equalTo(true));
    assertSdcRecordMatchesAvro(record1, avroRecord1, null);

    // The second record has nulls in the nullable fields, so those fields get dedicated matchers.
    final Record record2 = RecordCreator.create();
    populated = sdcRecordConverter.populateRecord(record2);
    assertThat(populated, equalTo(true));
    assertSdcRecordMatchesAvro(
        record2,
        avroRecord2,
        ImmutableMap.<String, Matcher<Field>>builder()
            .put("nullableInteger", Matchers.intFieldWithNullValue())
            .put("nullableStringOrInteger", Matchers.stringFieldWithNullValue())
            .build()
    );
  }
}
 
Example #12
Source File: OrcColumnarRowSplitReaderNoHiveTest.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Writes an ORC file with columns f0..f4 (float, double, timestamp, tinyint, smallint) holding
 * {@code rowSize} rows: row {@code i} stores {@code i} (or {@code toTimestamp(i)}) in every column,
 * except the last row, which is marked null in every column.
 *
 * @param file path of the ORC file to write
 * @param rowSize number of rows in the single batch that is written
 * @throws IOException if writing the file fails
 */
@Override
protected void prepareReadFileWithTypes(String file, int rowSize) throws IOException {
	// NOTE: orc has field name information, so name should be same as orc
	final TypeDescription schema = TypeDescription.fromString(
			"struct<f0:float,f1:double,f2:timestamp,f3:tinyint,f4:smallint>");

	final Writer writer = OrcFile.createWriter(
			new org.apache.hadoop.fs.Path(file),
			OrcFile.writerOptions(new Configuration()).setSchema(schema));

	final VectorizedRowBatch batch = schema.createRowBatch(rowSize);
	final DoubleColumnVector floatCol = (DoubleColumnVector) batch.cols[0];
	final DoubleColumnVector doubleCol = (DoubleColumnVector) batch.cols[1];
	final TimestampColumnVector timestampCol = (TimestampColumnVector) batch.cols[2];
	final LongColumnVector tinyintCol = (LongColumnVector) batch.cols[3];
	final LongColumnVector smallintCol = (LongColumnVector) batch.cols[4];

	// Fill every row but the last with values derived from the row index.
	final int lastRow = rowSize - 1;
	for (int row = 0; row < lastRow; row++) {
		final Timestamp ts = toTimestamp(row);

		floatCol.vector[row] = row;
		doubleCol.vector[row] = row;
		timestampCol.time[row] = ts.getTime();
		timestampCol.nanos[row] = ts.getNanos();
		tinyintCol.vector[row] = row;
		smallintCol.vector[row] = row;
	}

	// The last row is null in every column, so each vector must also declare that it has nulls.
	floatCol.noNulls = false;
	floatCol.isNull[lastRow] = true;
	doubleCol.noNulls = false;
	doubleCol.isNull[lastRow] = true;
	timestampCol.noNulls = false;
	timestampCol.isNull[lastRow] = true;
	tinyintCol.noNulls = false;
	tinyintCol.isNull[lastRow] = true;
	smallintCol.noNulls = false;
	smallintCol.isNull[lastRow] = true;

	batch.size = rowSize;
	writer.addRowBatch(batch);
	batch.reset();
	writer.close();
}
 
Example #13
Source File: OrcColumnarRowSplitReaderTest.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Writes an ORC file with columns f0..f4 (float, double, timestamp, tinyint, smallint) holding
 * {@code rowSize} rows: row {@code i} stores {@code i} (or {@code toTimestamp(i)}) in every column,
 * except the last row, which is marked null in every column.
 *
 * @param file path of the ORC file to write
 * @param rowSize number of rows in the single batch that is written
 * @throws IOException if writing the file fails
 */
protected void prepareReadFileWithTypes(String file, int rowSize) throws IOException {
	// NOTE: orc has field name information, so name should be same as orc
	final TypeDescription schema = TypeDescription.fromString(
			"struct<f0:float,f1:double,f2:timestamp,f3:tinyint,f4:smallint>");

	final Writer writer = OrcFile.createWriter(
			new org.apache.hadoop.fs.Path(file),
			OrcFile.writerOptions(new Configuration()).setSchema(schema));

	final VectorizedRowBatch batch = schema.createRowBatch(rowSize);
	final DoubleColumnVector floatCol = (DoubleColumnVector) batch.cols[0];
	final DoubleColumnVector doubleCol = (DoubleColumnVector) batch.cols[1];
	final TimestampColumnVector timestampCol = (TimestampColumnVector) batch.cols[2];
	final LongColumnVector tinyintCol = (LongColumnVector) batch.cols[3];
	final LongColumnVector smallintCol = (LongColumnVector) batch.cols[4];

	// Fill every row but the last with values derived from the row index.
	final int lastRow = rowSize - 1;
	for (int row = 0; row < lastRow; row++) {
		final Timestamp ts = toTimestamp(row);

		floatCol.vector[row] = row;
		doubleCol.vector[row] = row;
		timestampCol.time[row] = ts.getTime();
		timestampCol.nanos[row] = ts.getNanos();
		tinyintCol.vector[row] = row;
		smallintCol.vector[row] = row;
	}

	// The last row is null in every column, so each vector must also declare that it has nulls.
	floatCol.noNulls = false;
	floatCol.isNull[lastRow] = true;
	doubleCol.noNulls = false;
	doubleCol.isNull[lastRow] = true;
	timestampCol.noNulls = false;
	timestampCol.isNull[lastRow] = true;
	tinyintCol.noNulls = false;
	tinyintCol.isNull[lastRow] = true;
	smallintCol.noNulls = false;
	smallintCol.isNull[lastRow] = true;

	batch.size = rowSize;
	writer.addRowBatch(batch);
	batch.reset();
	writer.close();
}
 
Example #14
Source File: ORCRecordExtractorTest.java    From incubator-pinot with Apache License 2.0 4 votes vote down vote up
/**
 * Creates an ORC input file from {@code _inputRecords}.
 *
 * <p>All records are written as a single row batch. {@code user_id}, {@code firstName} and
 * {@code bids} may be null in a record and are then marked null in their column vectors; the
 * remaining fields are read unconditionally.
 *
 * @throws IOException if the ORC file cannot be written
 */
@Override
protected void createInputFile()
    throws IOException {
  TypeDescription schema = TypeDescription.fromString(
      "struct<user_id:int,firstName:string,lastName:string,bids:array<int>,campaignInfo:string,cost:double,timestamp:bigint>");
  Writer writer = OrcFile.createWriter(new Path(_dataFile.getAbsolutePath()),
      OrcFile.writerOptions(new Configuration()).setSchema(schema));

  int numRecords = _inputRecords.size();
  VectorizedRowBatch rowBatch = schema.createRowBatch(numRecords);
  LongColumnVector userIdVector = (LongColumnVector) rowBatch.cols[0];
  userIdVector.noNulls = false;
  BytesColumnVector firstNameVector = (BytesColumnVector) rowBatch.cols[1];
  firstNameVector.noNulls = false;
  BytesColumnVector lastNameVector = (BytesColumnVector) rowBatch.cols[2];
  ListColumnVector bidsVector = (ListColumnVector) rowBatch.cols[3];
  bidsVector.noNulls = false;
  LongColumnVector bidsElementVector = (LongColumnVector) bidsVector.child;
  // The child vector holds the elements of every list in the batch, so size it from the actual
  // number of bids across all records rather than from a fixed constant.
  int totalBids = 0;
  for (int i = 0; i < numRecords; i++) {
    List<?> recordBids = (List<?>) _inputRecords.get(i).get("bids");
    if (recordBids != null) {
      totalBids += recordBids.size();
    }
  }
  bidsElementVector.ensureSize(totalBids, false);
  BytesColumnVector campaignInfoVector = (BytesColumnVector) rowBatch.cols[4];
  DoubleColumnVector costVector = (DoubleColumnVector) rowBatch.cols[5];
  LongColumnVector timestampVector = (LongColumnVector) rowBatch.cols[6];

  for (int i = 0; i < numRecords; i++) {
    Map<String, Object> record = _inputRecords.get(i);

    Integer userId = (Integer) record.get("user_id");
    if (userId != null) {
      userIdVector.vector[i] = userId;
    } else {
      userIdVector.isNull[i] = true;
    }
    String firstName = (String) record.get("firstName");
    if (firstName != null) {
      firstNameVector.setVal(i, StringUtils.encodeUtf8(firstName));
    } else {
      firstNameVector.isNull[i] = true;
    }
    lastNameVector.setVal(i, StringUtils.encodeUtf8((String) record.get("lastName")));
    List<Integer> bids = (List<Integer>) record.get("bids");
    if (bids != null) {
      // Each row records where its elements start in the shared child vector and how many it has.
      bidsVector.offsets[i] = bidsVector.childCount;
      bidsVector.lengths[i] = bids.size();
      for (int bid : bids) {
        bidsElementVector.vector[bidsVector.childCount++] = bid;
      }
    } else {
      bidsVector.isNull[i] = true;
    }
    campaignInfoVector.setVal(i, StringUtils.encodeUtf8((String) record.get("campaignInfo")));
    costVector.vector[i] = (double) record.get("cost");
    timestampVector.vector[i] = (long) record.get("timestamp");

    rowBatch.size++;
  }

  writer.addRowBatch(rowBatch);
  rowBatch.reset();
  writer.close();
}
 
Example #15
Source File: OrcMetaDataWriter.java    From pentaho-hadoop-shims with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a metadata writer that keeps a reference to the given ORC writer.
 *
 * @param writer the ORC writer stored in this instance's {@code writer} field
 */
public OrcMetaDataWriter( Writer writer ) {
  this.writer = writer;
}
 
Example #16
Source File: Vectorizer.java    From flink with Apache License 2.0 2 votes vote down vote up
/**
 * Sets the ORC writer held by this vectorizer.
 *
 * <p>Users are not supposed to call this method; it is intended to be used only by the
 * {@link OrcBulkWriter}.
 *
 * @param writer the underlying ORC Writer.
 */
public void setWriter(Writer writer) {
	this.writer = writer;
}